├── .gitignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Makefile ├── _static │ ├── cartoons │ │ ├── data_hierarchy.png │ │ ├── overlap.png │ │ └── overlap_reads.png │ ├── enrich2_env.yml │ ├── gui_screenshots │ │ ├── complete_example.png │ │ ├── context_menu.png │ │ ├── empty.png │ │ ├── new_child.png │ │ ├── new_root.png │ │ ├── seqlib.png │ │ └── seqlib_choice.png │ ├── iD_icon.png │ ├── notebook_plots │ │ ├── min_count_plot.png │ │ └── unique_barcodes_plot.png │ ├── plots │ │ ├── barcodes_per_variant.png │ │ ├── diversity.png │ │ ├── overlap_mismatches.png │ │ ├── regression_weights.png │ │ ├── se_pctile.png │ │ ├── selection_counts.png │ │ ├── seqlib_counts.png │ │ ├── sfmap.png │ │ ├── volcano.png │ │ └── wt_shape.png │ └── sfmap_aa_files │ │ ├── aagroup_default.txt │ │ └── aagroup_helical_propensity.txt ├── api.rst ├── conf.py ├── exported_notebooks │ ├── README │ ├── min_count.rst │ └── unique_barcodes.rst ├── gui.rst ├── index.rst ├── installation.rst ├── introduction.rst ├── make.bat ├── notebooks.rst ├── notebooks │ ├── min_count.ipynb │ └── unique_barcodes.ipynb ├── output.rst ├── plots.rst └── seqlib_config.rst ├── enrich2 ├── __init__.py ├── aligner.py ├── barcode.py ├── barcodeid.py ├── barcodemap.py ├── barcodevariant.py ├── basic.py ├── condition.py ├── config_check.py ├── constants.py ├── dataframe.py ├── experiment.py ├── fastqheader.py ├── gui │ ├── __init__.py │ ├── configurator.py │ ├── create_root_dialog.py │ ├── create_seqlib_dialog.py │ ├── delete_dialog.py │ ├── dialog_elements.py │ ├── edit_dialog.py │ ├── runner_window.py │ └── seqlib_apply_dialog.py ├── idonly.py ├── main.py ├── overlap.py ├── plots.py ├── random_effects.py ├── selection.py ├── seqlib.py ├── sfmap.py ├── storemanager.py ├── variant.py ├── wildtype.py └── ztest.py ├── pylintrc └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # Unit test output directory 57 | test/test_output/ 58 | 59 | # Jupyter checkpoint files 60 | docs/notebooks/.ipynb_checkpoints/ 61 | 62 | # Visual Studio Code 63 | .vscode/ 64 | 65 | # PyCharm 66 | .idea/ -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Rubin" 5 | given-names: "Alan F" 6 | orcid: "https://orcid.org/0000-0003-1474-605X" 7 | title: "Enrich2" 8 | version: 1.3.1 9 | doi: 10.5281/zenodo.3742545 10 | date-released: 2020-04-07 11 | url: "https://github.com/FowlerLab/Enrich2" 12 | preferred-citation: 13 | type: article 14 | authors: 15 | - family-names: "Rubin" 16 | given-names: "Alan F" 17 | orcid: "https://orcid.org/0000-0003-1474-605X" 18 | - family-names: "Gelman" 19 | given-names: "Hannah" 20 | - family-names: "Lucas" 21 | given-names: "Nathan" 22 | - family-names: "Bajjalieh" 23 | given-names: "Sandra M" 24 | - family-names: "Papenfuss" 25 | given-names: "Anthony T" 26 | orcid: "https://orcid.org/0000-0002-1102-8506" 27 | - family-names: "Speed" 28 | given-names: "Terence P" 29 | orcid: "https://orcid.org/0000-0002-5403-7998" 30 | - family-names: "Fowler" 31 | given-names: "Douglas M" 32 | orcid: "https://orcid.org/0000-0001-7614-1713" 33 | doi: "10.1186/s13059-017-1272-5" 34 | journal: "Genome Biology" 35 | month: 8 36 | start: 150 37 | title: "A statistical framework for analyzing deep mutational scanning data" 38 | issue: 1 39 | volume: 18 40 | year: 2017 41 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | alan.rubin@wehi.edu.au. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## How to contribute to Enrich2 2 | 3 | **Enrich2 is no longer under active development. For the Python 3-based successor to Enrich2, please see [CountESS](https://github.com/countess-project/countess).** 4 | 5 | All contributors should familiarize themselves with the [Code of Conduct](https://github.com/fowlerlab/enrich2/CODE_OF_CONDUCT.md). 6 | 7 | #### **Reporting a bug** 8 | 9 | * Check and see if the bug has already been reported by searching on GitHub under [Issues](https://github.com/fowlerlab/enrich2/issues). 10 | 11 | * If you're unable to find an open issue addressing the problem, [open a new issue](https://github.com/fowlerlab/enrich2/issues/new). Be sure to include a **title and clear description** with as much relevant information as possible. 12 | 13 | Thank you for reading! 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2016-2020, Alan F Rubin 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14681278.svg)](https://doi.org/10.5281/zenodo.14681278) 2 | [![PyPI version](https://badge.fury.io/py/Enrich2.svg)](https://badge.fury.io/py/Enrich2) 3 | 4 | Enrich2 5 | ======= 6 | 7 | Enrich2 is a general software tool for processing, analyzing, and visualizing data from deep mutational scanning experiments. 8 | For more information or to cite Enrich2, please refer to [A statistical framework for analyzing deep mutational scanning data](https://doi.org/10.1186/s13059-017-1272-5). 9 | 10 | [Enrich2 documentation](https://enrich2.readthedocs.io) is available on [Read the Docs](https://readthedocs.org/). 11 | 12 | An example dataset is available at the [Enrich2-Example GitHub repository](https://github.com/FowlerLab/Enrich2-Example/). 13 | 14 | Thanks to the efforts of [Chris Macdonald](https://github.com/odcambc), Enrich2 is now able to run under modern versions of Python as of v2.0.0! 15 | 16 | Installation and dependencies 17 | ----------------------------- 18 | 19 | Enrich2 runs on Python 3 (v2.0.0 and higher) and requires the following packages: 20 | 21 | * [NumPy](http://www.numpy.org/) 22 | * [SciPy](http://www.scipy.org/) 23 | * [pandas](http://pandas.pydata.org/) 24 | * [PyTables](http://www.pytables.org/) 25 | * [Statsmodels](http://statsmodels.sourceforge.net/) 26 | * [matplotlib](http://matplotlib.org/) 27 | * [fqfa](https://fqfa.readthedocs.io/) 28 | 29 | The configuration GUI requires [Tkinter](https://docs.python.org/2/library/tkinter.html). 30 | Building a local copy of the documentation requires [Sphinx](http://sphinx-doc.org/). 31 | 32 | Enrich2 can be installed in a new virtual environment using pip: 33 | 34 | python3 -m venv e2env 35 | source e2env/bin/activate 36 | pip install enrich2 37 | 38 | You should now be able to launch the Enrich2 graphical user interface by typing `enrich_gui` or the command line interface by typing `enrich_cmd`. 39 | 40 | For additional information consult the [online documentation](https://enrich2.readthedocs.io/). 41 | 42 | Questions? 43 | ---------- 44 | 45 | Please use the [GitHub Issue Tracker](https://github.com/FowlerLab/Enrich2/issues) to file bug reports or request features. 46 | 47 | Enrich2 was written by [Alan F Rubin](mailto:alan.rubin@wehi.edu.au). 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Enrich2.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Enrich2.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Enrich2" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Enrich2" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/_static/cartoons/data_hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/cartoons/data_hierarchy.png -------------------------------------------------------------------------------- /docs/_static/cartoons/overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/cartoons/overlap.png -------------------------------------------------------------------------------- /docs/_static/cartoons/overlap_reads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/cartoons/overlap_reads.png -------------------------------------------------------------------------------- /docs/_static/enrich2_env.yml: -------------------------------------------------------------------------------- 1 | name: enrich2 2 | dependencies: 3 | - python=3 4 | - numpy 5 | - scipy 6 | - pandas 7 | - pytables 8 | - statsmodels 9 | - matplotlib 10 | -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/complete_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/complete_example.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/context_menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/context_menu.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/empty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/empty.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/new_child.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/new_child.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/new_root.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/new_root.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/seqlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/seqlib.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/seqlib_choice.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/seqlib_choice.png -------------------------------------------------------------------------------- /docs/_static/iD_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/iD_icon.png -------------------------------------------------------------------------------- /docs/_static/notebook_plots/min_count_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/notebook_plots/min_count_plot.png -------------------------------------------------------------------------------- /docs/_static/notebook_plots/unique_barcodes_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/notebook_plots/unique_barcodes_plot.png -------------------------------------------------------------------------------- /docs/_static/plots/barcodes_per_variant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/barcodes_per_variant.png -------------------------------------------------------------------------------- /docs/_static/plots/diversity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/diversity.png -------------------------------------------------------------------------------- /docs/_static/plots/overlap_mismatches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/overlap_mismatches.png -------------------------------------------------------------------------------- /docs/_static/plots/regression_weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/regression_weights.png -------------------------------------------------------------------------------- /docs/_static/plots/se_pctile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/se_pctile.png -------------------------------------------------------------------------------- /docs/_static/plots/selection_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/selection_counts.png -------------------------------------------------------------------------------- /docs/_static/plots/seqlib_counts.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/seqlib_counts.png -------------------------------------------------------------------------------- /docs/_static/plots/sfmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/sfmap.png -------------------------------------------------------------------------------- /docs/_static/plots/volcano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/volcano.png -------------------------------------------------------------------------------- /docs/_static/plots/wt_shape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/wt_shape.png -------------------------------------------------------------------------------- /docs/_static/sfmap_aa_files/aagroup_default.txt: -------------------------------------------------------------------------------- 1 | (+) H,K,R 2 | (-) D,E 3 | Polar-neutral C,M,N,Q,S,T 4 | Non-polar A,I,L,V 5 | Aromatic F,W,Y 6 | Unique G,P 7 | * 8 | -------------------------------------------------------------------------------- /docs/_static/sfmap_aa_files/aagroup_helical_propensity.txt: -------------------------------------------------------------------------------- 1 | High helical propensity A,L,R,M,K,Q,E,I,W 2 | Low helical propensity S,Y,F,V,H,N,T,C,D 3 | Disruptive G,P 4 | * 5 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | Appendix: API documentation 2 | ############################################# 3 | 4 | This page contains automatically generated documentation from the Enrich2 codebase. It is intended for developers and advanced users. 5 | 6 | :py:mod:`~enrich2.storemanager` --- Abstract class for Enrich2 data 7 | =================================================================== 8 | 9 | .. py:module:: storemanager 10 | :synopsis: Abstract class for Enrich2 data. 11 | 12 | This module contains the class definition for the :py:class:`~enrich2.storemanager.storemanager.StoreManager` abstract class, the shared base class for most classes in the `Enrich2 `_ project. This class provides general behavior for the GUI and for handling HDF5 data files. 13 | 14 | :py:class:`~enrich2.storemanager.StoreManager` class 15 | ----------------------------------------------------------------- 16 | .. autoclass:: enrich2.storemanager.StoreManager 17 | :members: 18 | 19 | 20 | :py:mod:`~enrich2.seqlib` --- Sequencing library file handling and element counting 21 | =================================================================================== 22 | 23 | .. py:module:: seqlib 24 | :synopsis: Sequencing library file handling and element counting. 25 | 26 | This module provides class definitions for the various types of sequencing library designs usable by `Enrich2 `_. Data for each FASTQ_ file (or pair of overlapping FASTQ_ files for overlapping paired-end data) is read into its own :py:class:`~enrich2.seqlib.SeqLib` object. 
If necessary, FASTQ_ files should be split by index read before being read by a :py:class:`~enrich2.seqlib.SeqLib` object. :py:class:`~enrich2.seqlib.SeqLib` objects are coordinated by :py:mod:`~enrich2.selection.Selection` objects. 27 | 28 | :py:class:`~enrich2.seqlib.SeqLib` and :py:class:`~enrich2.variant.VariantSeqLib` are abstract classes. 29 | 30 | :py:class:`~enrich2.seqlib.SeqLib` class 31 | ------------------------------------------------------- 32 | .. autoclass:: enrich2.seqlib.SeqLib 33 | :members: 34 | 35 | :py:class:`~enrich2.variant.VariantSeqLib` class 36 | ------------------------------------------------------- 37 | .. autoclass:: enrich2.variant.VariantSeqLib 38 | :members: 39 | 40 | :py:class:`~enrich2.barcode.BarcodeSeqLib` class 41 | ------------------------------------------------------- 42 | .. autoclass:: enrich2.barcode.BarcodeSeqLib 43 | :members: 44 | 45 | :py:class:`~enrich2.barcodevariant.BcvSeqLib` class 46 | ------------------------------------------------------------- 47 | .. autoclass:: enrich2.barcodevariant.BcvSeqLib 48 | :members: 49 | 50 | :py:class:`~enrich2.barcodeid.BcidSeqLib` class 51 | ------------------------------------------------------------- 52 | .. autoclass:: enrich2.barcodeid.BcidSeqLib 53 | :members: 54 | 55 | :py:class:`~enrich2.basic.BasicSeqLib` class 56 | ----------------------------------------------------- 57 | .. autoclass:: enrich2.basic.BasicSeqLib 58 | :members: 59 | 60 | :py:class:`~enrich2.overlap.OverlapSeqLib` class 61 | -------------------------------------------------------- 62 | .. autoclass:: enrich2.overlap.OverlapSeqLib 63 | :members: 64 | 65 | :py:class:`~enrich2.idonly.IdOnlySeqLib` class 66 | ------------------------------------------------------------- 67 | .. autoclass:: enrich2.idonly.IdOnlySeqLib 68 | :members: 69 | 70 | :py:class:`~enrich2.seqlib.SeqLib` helper classes 71 | ------------------------------------------------------- 72 | 73 | :py:class:`~enrich2.aligner.Aligner` class 74 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 75 | .. autoclass:: enrich2.aligner.Aligner 76 | :members: 77 | 78 | :py:class:`~enrich2.wildtype.WildTypeSequence` class 79 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 80 | .. autoclass:: enrich2.wildtype.WildTypeSequence 81 | :members: 82 | 83 | :py:class:`~enrich2.barcodemap.BarcodeMap` class 84 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 85 | .. autoclass:: enrich2.barcodemap.BarcodeMap 86 | :members: 87 | 88 | :py:mod:`~enrich2.selection` --- Functional score calculation using SeqLib count data 89 | ===================================================================================== 90 | 91 | .. py:module:: selection 92 | :synopsis: Functional score calculation using :py:class:`~enrich2.seqlib.SeqLib` count data. 93 | 94 | This module provides class definitions for the :py:class:`~enrich2.selection.Selection` class. This is where functional scores are calculated from the :py:class:`~enrich2.seqlib.SeqLib` count data. For time series data, each time point in the selection can have multiple :py:class:`~enrich2.seqlib.SeqLib` assigned to it, in which case the counts for each element will be added together. Each time series selection must have a time point 0 (the "input library"). 95 | 96 | :py:class:`~enrich2.selection.Selection` class 97 | ---------------------------------------------------------- 98 | .. 
autoclass:: enrich2.selection.Selection 99 | :members: 100 | 101 | :py:class:`~enrich2.selection.Selection` helpers 102 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 103 | .. autofunction:: enrich2.selection.regression_apply 104 | 105 | :py:mod:`~enrich2.condition` --- Dummy class for GUI 106 | ======================================================================= 107 | 108 | .. py:module:: condition 109 | :synopsis: Dummy class for GUI. 110 | 111 | This module provides class definitions for the :py:class:`~enrich2.experiment.condition.Condition` classes. This class is required for proper GUI operation. All condition-related behaviors are in the :py:class:`~enrich2.experiment.Experiment` class. 112 | 113 | :py:class:`~enrich2.condition.Condition` class 114 | ----------------------------------------------------------- 115 | .. autoclass:: enrich2.condition.Condition 116 | :members: 117 | 118 | :py:mod:`~enrich2.experiment` --- Aggregation of replicate selections 119 | ======================================================================= 120 | 121 | .. py:module:: experiment 122 | :synopsis: Aggregation of replicate selections. 123 | 124 | This module provides class definitions for the :py:class:`~enrich2.experiment.Experiment`. Functional scores for selections within the same condition are combined to generate a single functional score (and associated error) for each element in each experimental condition. 125 | 126 | :py:class:`~enrich2.experiment.Experiment` class 127 | -------------------------------------------------------------- 128 | .. autoclass:: enrich2.experiment.Experiment 129 | :members: 130 | 131 | Enrich2 plotting 132 | =================================================================== 133 | 134 | .. py:module:: plots 135 | :synopsis: Library for general Enrich2 plotting. 136 | 137 | Text goes here. 138 | 139 | .. automodule:: enrich2.plots 140 | :members: 141 | 142 | Sequence-function map plotting 143 | -------------------------------------------------------------------- 144 | 145 | .. py:module:: sfmap 146 | :synopsis: Library for sequence-function map plotting. 147 | 148 | Text goes here. 149 | 150 | .. automodule:: enrich2.sfmap 151 | :members: 152 | 153 | Utility functions 154 | ==================================================================== 155 | 156 | Configuration object type detection 157 | --------------------------------------------------------------------------- 158 | 159 | .. automodule:: enrich2.config_check 160 | :members: 161 | 162 | Dataframe and index helper functions 163 | ---------------------------------------------------------------------------- 164 | 165 | .. automodule:: enrich2.dataframe 166 | :members: 167 | 168 | .. _api-variant-helper: 169 | 170 | Variant helper functions 171 | ---------------------------------------------------------------------------- 172 | 173 | .. automodule:: enrich2.variant 174 | :members: mutation_count, has_indel, has_unresolvable, protein_variant 175 | 176 | HGVS_ variant regular expressions 177 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 178 | 179 | .. autodata:: enrich2.variant.re_protein 180 | 181 | .. autodata:: enrich2.variant.re_coding 182 | 183 | .. autodata:: enrich2.variant.re_noncoding 184 | 185 | Enrich2 entry points 186 | ==================================================================== 187 | 188 | .. 
automodule:: enrich2.main 189 | :members: 190 | 191 | 192 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Enrich2 documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Dec 23 14:59:50 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | sys.path.insert(0, os.path.abspath('..')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | #needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.imgmath', 30 | 'sphinx.ext.inheritance_diagram', 'sphinx.ext.intersphinx', 31 | 'sphinx.ext.todo', 'sphinx.ext.napoleon'] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ['_templates'] 35 | 36 | # The suffix of source filenames. 37 | source_suffix = '.rst' 38 | 39 | # The encoding of source files. 40 | #source_encoding = 'utf-8-sig' 41 | 42 | # The master toctree document. 43 | master_doc = 'index' 44 | 45 | # General information about the project. 46 | project = u'Enrich2' 47 | copyright = u'2016-2024, Alan F Rubin' 48 | 49 | # The version info for the project you're documenting, acts as replacement for 50 | # |version| and |release|, also used in various other places throughout the 51 | # built documents. 52 | # 53 | # The short X.Y version. 54 | version = '2.0' 55 | # The full version, including alpha/beta/rc tags. 56 | release = '2.0.0' 57 | 58 | # The language for content autogenerated by Sphinx. Refer to documentation 59 | # for a list of supported languages. 60 | #language = None 61 | 62 | # There are two options for replacing |today|: either, you set today to some 63 | # non-false value, then it is used: 64 | #today = '' 65 | # Else, today_fmt is used as the format for a strftime call. 66 | #today_fmt = '%B %d, %Y' 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | exclude_patterns = ['_build', 'exported_notebooks'] 71 | 72 | # The reST default role (used for this markup: `text`) to use for all documents. 73 | #default_role = None 74 | 75 | # If true, '()' will be appended to :func: etc. cross-reference text. 76 | #add_function_parentheses = True 77 | 78 | # If true, the current module name will be prepended to all description 79 | # unit titles (such as .. function::). 80 | #add_module_names = True 81 | 82 | # If true, sectionauthor and moduleauthor directives will be shown in the 83 | # output. They are ignored by default. 
84 | #show_authors = False 85 | 86 | # The name of the Pygments (syntax highlighting) style to use. 87 | pygments_style = 'sphinx' 88 | 89 | # A list of ignored prefixes for module index sorting. 90 | #modindex_common_prefix = [] 91 | 92 | 93 | # -- Options for HTML output --------------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. See the documentation for 96 | # a list of builtin themes. 97 | #html_theme = 'alabaster' 98 | html_theme = 'sphinx_rtd_theme' 99 | 100 | # Theme options are theme-specific and customize the look and feel of a theme 101 | # further. For a list of options available for each theme, see the 102 | # documentation. 103 | html_theme_options = { 104 | # 'github_user': 'FowlerLab', 105 | # 'github_repo': 'Enrich2', 106 | # 'github_button': 'true', 107 | } 108 | 109 | # Add any paths that contain custom themes here, relative to this directory. 110 | #html_theme_path = [] 111 | 112 | # The name for this set of Sphinx documents. If None, it defaults to 113 | # " v documentation". 114 | #html_title = None 115 | 116 | # A shorter title for the navigation bar. Default is the same as html_title. 117 | #html_short_title = None 118 | 119 | # The name of an image file (relative to this directory) to place at the top 120 | # of the sidebar. 121 | #html_logo = None 122 | 123 | # The name of an image file (within the static path) to use as favicon of the 124 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 125 | # pixels large. 126 | #html_favicon = None 127 | 128 | # Add any paths that contain custom static files (such as style sheets) here, 129 | # relative to this directory. They are copied after the builtin static files, 130 | # so a file named "default.css" will overwrite the builtin "default.css". 131 | html_static_path = ['_static'] 132 | 133 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 134 | # using the given strftime format. 135 | #html_last_updated_fmt = '%b %d, %Y' 136 | 137 | # If true, SmartyPants will be used to convert quotes and dashes to 138 | # typographically correct entities. 139 | #html_use_smartypants = True 140 | 141 | # Custom sidebar templates, maps document names to template names. 142 | #html_sidebars = {} 143 | 144 | # Additional templates that should be rendered to pages, maps page names to 145 | # template names. 146 | #html_additional_pages = {} 147 | 148 | # If false, no module index is generated. 149 | #html_domain_indices = True 150 | 151 | # If false, no index is generated. 152 | #html_use_index = True 153 | 154 | # If true, the index is split into individual pages for each letter. 155 | #html_split_index = False 156 | 157 | # If true, links to the reST sources are added to the pages. 158 | #html_show_sourcelink = True 159 | 160 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 161 | #html_show_sphinx = True 162 | 163 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 164 | #html_show_copyright = True 165 | 166 | # If true, an OpenSearch description file will be output, and all pages will 167 | # contain a tag referring to it. The value of this option must be the 168 | # base URL from which the finished HTML is served. 169 | #html_use_opensearch = '' 170 | 171 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 172 | #html_file_suffix = None 173 | 174 | # Output file base name for HTML help builder. 
175 | htmlhelp_basename = 'Enrich2doc' 176 | 177 | 178 | # -- Options for LaTeX output -------------------------------------------------- 179 | 180 | latex_elements = { 181 | # The paper size ('letterpaper' or 'a4paper'). 182 | #'papersize': 'letterpaper', 183 | 184 | # The font size ('10pt', '11pt' or '12pt'). 185 | #'pointsize': '10pt', 186 | 187 | # Additional stuff for the LaTeX preamble. 188 | #'preamble': '', 189 | } 190 | 191 | # Grouping the document tree into LaTeX files. List of tuples 192 | # (source start file, target name, title, author, documentclass [howto/manual]). 193 | latex_documents = [ 194 | ('index', 'Enrich2.tex', u'Enrich2 Documentation', 195 | u'Alan F Rubin', 'manual'), 196 | ] 197 | 198 | # The name of an image file (relative to this directory) to place at the top of 199 | # the title page. 200 | #latex_logo = None 201 | 202 | # For "manual" documents, if this is true, then toplevel headings are parts, 203 | # not chapters. 204 | #latex_use_parts = False 205 | 206 | # If true, show page references after internal links. 207 | #latex_show_pagerefs = False 208 | 209 | # If true, show URL addresses after external links. 210 | #latex_show_urls = False 211 | 212 | # Documents to append as an appendix to all manuals. 213 | #latex_appendices = [] 214 | 215 | # If false, no module index is generated. 216 | #latex_domain_indices = True 217 | 218 | 219 | # -- Options for manual page output -------------------------------------------- 220 | 221 | # One entry per manual page. List of tuples 222 | # (source start file, name, description, authors, manual section). 223 | man_pages = [ 224 | ('index', 'enrich2', u'Enrich2 Documentation', 225 | [u'Alan F Rubin'], 1) 226 | ] 227 | 228 | # If true, show URL addresses after external links. 229 | #man_show_urls = False 230 | 231 | 232 | # -- Options for Texinfo output ------------------------------------------------ 233 | 234 | # Grouping the document tree into Texinfo files. List of tuples 235 | # (source start file, target name, title, author, 236 | # dir menu entry, description, category) 237 | texinfo_documents = [ 238 | ('index', 'Enrich2', u'Enrich2 Documentation', 239 | u'Alan F Rubin', 'Enrich2', 'One line description of project.', 240 | 'Miscellaneous'), 241 | ] 242 | 243 | # Documents to append as an appendix to all manuals. 244 | #texinfo_appendices = [] 245 | 246 | # If false, no module index is generated. 247 | #texinfo_domain_indices = True 248 | 249 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 250 | #texinfo_show_urls = 'footnote' 251 | 252 | ########## 253 | 254 | intersphinx_mapping = {'python': ('http://docs.python.org/3/', None), 255 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None), 256 | 'matplotlib': ('http://matplotlib.org/', None), 257 | 'numpy': ('http://docs.scipy.org/doc/numpy/', None), 258 | 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), 259 | 'statsmodels': ('http://www.statsmodels.org/stable/', None)} 260 | todo_include_todos = True 261 | 262 | rst_epilog = """ 263 | .. Aliases for commonly used web links 264 | 265 | .. _FASTQ: http://en.wikipedia.org/wiki/FASTQ_format 266 | 267 | .. _Araya and Fowler: http://www.pnas.org/content/109/42/16858.abstract 268 | 269 | .. _HGVS: http://www.hgvs.org/mutnomen/recs.html 270 | 271 | .. _matplotlib cmap: http://matplotlib.org/examples/color/colormaps_reference.html 272 | 273 | .. _HDF5: http://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables 274 | 275 | .. 
_Enrich2 manuscript: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1272-5 276 | 277 | .. Replacement aliases for intersphinx library documentation 278 | 279 | .. |mpl_PdfPages| replace:: :py:class:`~matplotlib.backends.backend_pdf.PdfPages` 280 | 281 | .. |mpl_pcolormesh| replace:: :py:func:`~matplotlib.pyplot.pcolormesh` 282 | 283 | .. |pd_DataFrame| replace:: :py:class:`~pandas.DataFrame` 284 | 285 | .. |pd_Series| replace:: :py:class:`~pandas.Series` 286 | """ 287 | -------------------------------------------------------------------------------- /docs/exported_notebooks/README: -------------------------------------------------------------------------------- 1 | This directory contains reST exports of jupyter notebooks, so that the static 2 | notebooks can be included in the documentation. 3 | 4 | reST files in this directory are placed inline using the 'include' directive. 5 | To create these files, download the notebook as reST (requires pandoc). 6 | Image links (for plots) must be hand-edited. Plot images should be renamed and 7 | put in the '_static/notebook_plots/' directory. 8 | 9 | If sphinx generates a lot of errors that look like the following, this may be 10 | caused by C function calls being parsed as reST. Pointer notation looks like 11 | emphasis/strong start-strings. To fix it, change the '.. parsed-literal::' to 12 | the non-parsed version '::'. 13 | WARNING: Inline emphasis start-string without end-string. 14 | WARNING: Inline strong start-string without end-string. 15 | -------------------------------------------------------------------------------- /docs/exported_notebooks/min_count.rst: -------------------------------------------------------------------------------- 1 | 2 | Selecting variants by input library count 3 | ----------------------------------------- 4 | 5 | This notebook gets scores and standard errors for the variants in a 6 | Selection that exceed a minimum count cutoff in the input time point, 7 | and plots the relationship between each variant's score and input count. 8 | 9 | .. code:: python 10 | 11 | % matplotlib inline 12 | 13 | .. code:: python 14 | 15 | from __future__ import print_function 16 | import os.path 17 | import numpy as np 18 | import pandas as pd 19 | import matplotlib.pyplot as plt 20 | from enrich2.variant import WILD_TYPE_VARIANT 21 | import enrich2.plots as enrich_plot 22 | pd.set_option("display.max_rows", 10) # rows shown when pretty-printing 23 | 24 | Modify the ``results_path`` variable in the next cell to match the 25 | output directory of your Enrich2-Example dataset. 26 | 27 | .. code:: python 28 | 29 | results_path = "/path/to/Enrich2-Example/Results/" 30 | 31 | Open the Selection HDF5 file with the variants we are interested in. 32 | 33 | .. code:: python 34 | 35 | my_store = pd.HDFStore(os.path.join(results_path, "Rep1_sel.h5")) 36 | 37 | The ``pd.HDFStore.keys()`` method returns a list of all the tables in 38 | this HDF5 file. 39 | 40 | .. code:: python 41 | 42 | my_store.keys() 43 | 44 | 45 | 46 | 47 | .. 
parsed-literal:: 48 | 49 | ['/main/barcodemap', 50 | '/main/barcodes/counts', 51 | '/main/barcodes/counts_unfiltered', 52 | '/main/barcodes/log_ratios', 53 | '/main/barcodes/scores', 54 | '/main/barcodes/weights', 55 | '/main/synonymous/counts', 56 | '/main/synonymous/counts_unfiltered', 57 | '/main/synonymous/log_ratios', 58 | '/main/synonymous/scores', 59 | '/main/synonymous/weights', 60 | '/main/variants/counts', 61 | '/main/variants/counts_unfiltered', 62 | '/main/variants/log_ratios', 63 | '/main/variants/scores', 64 | '/main/variants/weights'] 65 | 66 | 67 | 68 | We will work with the "/main/variants/counts" table first. Enrich2 69 | names the columns for counts ``c_n`` where ``n`` is the time point, 70 | beginning with ``0`` for the input library. 71 | 72 | We can use a query to extract the subset of variants in the table that 73 | exceed the specified cutoff. Since we're only interested in variants, 74 | we'll explicitly exclude the wild type. We will store the data we 75 | extract in the ``variant_counts`` data frame. 76 | 77 | .. code:: python 78 | 79 | read_cutoff = 10 80 | 81 | .. code:: python 82 | 83 | variant_counts = my_store.select('/main/variants/counts', where='c_0 > read_cutoff and index != WILD_TYPE_VARIANT') 84 | variant_counts 85 | 86 | 87 | 88 | 89 | .. raw:: html 90 | 91 |
    <div>
    <table border="1" class="dataframe">
      <thead>
        <tr><th></th><th>c_0</th><th>c_2</th><th>c_5</th></tr>
      </thead>
      <tbody>
        <tr><th>c.10G>A (p.Ala4Arg), c.11C>G (p.Ala4Arg), c.12T>A (p.Ala4Arg)</th><td>787.0</td><td>106.0</td><td>124.0</td></tr>
        <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn)</th><td>699.0</td><td>80.0</td><td>114.0</td></tr>
        <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn), c.12T>C (p.Ala4Asn)</th><td>94.0</td><td>8.0</td><td>13.0</td></tr>
        <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile)</th><td>1280.0</td><td>137.0</td><td>80.0</td></tr>
        <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>A (p.Ala4Ile)</th><td>717.0</td><td>42.0</td><td>27.0</td></tr>
        <tr><th>...</th><td>...</td><td>...</td><td>...</td></tr>
        <tr><th>c.9T>A (p.=)</th><td>327.0</td><td>217.0</td><td>284.0</td></tr>
        <tr><th>c.9T>C (p.=)</th><td>1947.0</td><td>523.0</td><td>1230.0</td></tr>
        <tr><th>c.9T>C (p.=), c.49A>T (p.Met17Ser), c.50T>C (p.Met17Ser), c.51G>A (p.Met17Ser)</th><td>277.0</td><td>43.0</td><td>5.0</td></tr>
        <tr><th>c.9T>C (p.=), c.62T>C (p.Leu21Ser), c.63A>T (p.Leu21Ser)</th><td>495.0</td><td>138.0</td><td>55.0</td></tr>
        <tr><th>c.9T>G (p.=)</th><td>406.0</td><td>18.0</td><td>20.0</td></tr>
      </tbody>
    </table>
    <p>1440 rows × 3 columns</p>
    </div>
172 | 173 | 174 | 175 | The index of the data frame is the list of variants that exceeded the 176 | cutoff. 177 | 178 | .. code:: python 179 | 180 | variant_counts.index 181 | 182 | 183 | 184 | 185 | .. parsed-literal:: 186 | 187 | Index([u'c.10G>A (p.Ala4Arg), c.11C>G (p.Ala4Arg), c.12T>A (p.Ala4Arg)', 188 | u'c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn)', 189 | u'c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn), c.12T>C (p.Ala4Asn)', 190 | u'c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile)', 191 | u'c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>A (p.Ala4Ile)', 192 | u'c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>C (p.Ala4Ile)', 193 | u'c.10G>A (p.Ala4Lys), c.11C>A (p.Ala4Lys), c.12T>A (p.Ala4Lys)', 194 | u'c.10G>A (p.Ala4Met), c.11C>T (p.Ala4Met), c.12T>G (p.Ala4Met)', 195 | u'c.10G>A (p.Ala4Ser), c.11C>G (p.Ala4Ser)', 196 | u'c.10G>A (p.Ala4Ser), c.11C>G (p.Ala4Ser), c.12T>C (p.Ala4Ser)', 197 | ... 198 | u'c.8C>T (p.Ser3Phe), c.60C>T (p.=)', 199 | u'c.8C>T (p.Ser3Phe), c.9T>C (p.Ser3Phe)', u'c.90C>A (p.=)', 200 | u'c.90C>G (p.Ile30Met)', u'c.90C>T (p.=)', u'c.9T>A (p.=)', 201 | u'c.9T>C (p.=)', 202 | u'c.9T>C (p.=), c.49A>T (p.Met17Ser), c.50T>C (p.Met17Ser), c.51G>A (p.Met17Ser)', 203 | u'c.9T>C (p.=), c.62T>C (p.Leu21Ser), c.63A>T (p.Leu21Ser)', 204 | u'c.9T>G (p.=)'], 205 | dtype='object', length=1440) 206 | 207 | 208 | 209 | We can use this index to get the scores for these variants by querying 210 | the "/main/variants/scores" table. We'll store the result of the query 211 | in a new data frame named ``variant_scores``, and keep only the score 212 | and standard error (SE) columns. 213 | 214 | .. code:: python 215 | 216 | variant_scores = my_store.select('/main/variants/scores', where='index in variant_counts.index') 217 | variant_scores = variant_scores[['score', 'SE']] 218 | variant_scores 219 | 220 | 221 | 222 | 223 | .. raw:: html 224 | 225 |
<table border="1" class="dataframe">
  <thead>
    <tr><th></th><th>score</th><th>SE</th></tr>
  </thead>
  <tbody>
    <tr><th>c.10G>A (p.Ala4Arg), c.11C>G (p.Ala4Arg), c.12T>A (p.Ala4Arg)</th><td>-0.980091</td><td>0.134873</td></tr>
    <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn)</th><td>-0.972035</td><td>0.268962</td></tr>
    <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn), c.12T>C (p.Ala4Asn)</th><td>-1.138667</td><td>0.403767</td></tr>
    <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile)</th><td>-1.875331</td><td>0.014883</td></tr>
    <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>A (p.Ala4Ile)</th><td>-2.552289</td><td>0.421699</td></tr>
    <tr><th>...</th><td>...</td><td>...</td></tr>
    <tr><th>c.9T>A (p.=)</th><td>0.705661</td><td>0.774559</td></tr>
    <tr><th>c.9T>C (p.=)</th><td>0.438654</td><td>0.014857</td></tr>
    <tr><th>c.9T>C (p.=), c.49A>T (p.Met17Ser), c.50T>C (p.Met17Ser), c.51G>A (p.Met17Ser)</th><td>-1.930922</td><td>1.085535</td></tr>
    <tr><th>c.9T>C (p.=), c.62T>C (p.Leu21Ser), c.63A>T (p.Leu21Ser)</th><td>-0.897249</td><td>0.884321</td></tr>
    <tr><th>c.9T>G (p.=)</th><td>-2.314604</td><td>0.671760</td></tr>
  </tbody>
</table>
<p>1440 rows × 2 columns</p>
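As a minor variation on the query above, ``HDFStore.select`` also accepts a ``columns`` argument for table-format stores, so the column selection can be folded into the same call:

.. code:: python

    # select only the score and SE columns directly from the store
    variant_scores = my_store.select('/main/variants/scores',
                                     where='index in variant_counts.index',
                                     columns=['score', 'SE'])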
294 | 295 | 296 | 297 | Now that we're finished getting data out of the HDF5 file, we'll close 298 | it. 299 | 300 | .. code:: python 301 | 302 | my_store.close() 303 | 304 | To more easily explore the relationship between input count and score, 305 | we'll add a column to the ``variant_scores`` data frame that contains 306 | input counts from the ``variant_counts`` data frame. 307 | 308 | .. code:: python 309 | 310 | variant_scores['input_count'] = variant_counts['c_0'] 311 | variant_scores 312 | 313 | 314 | 315 | 316 | .. raw:: html 317 | 318 |
<table border="1" class="dataframe">
  <thead>
    <tr><th></th><th>score</th><th>SE</th><th>input_count</th></tr>
  </thead>
  <tbody>
    <tr><th>c.10G>A (p.Ala4Arg), c.11C>G (p.Ala4Arg), c.12T>A (p.Ala4Arg)</th><td>-0.980091</td><td>0.134873</td><td>787.0</td></tr>
    <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn)</th><td>-0.972035</td><td>0.268962</td><td>699.0</td></tr>
    <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn), c.12T>C (p.Ala4Asn)</th><td>-1.138667</td><td>0.403767</td><td>94.0</td></tr>
    <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile)</th><td>-1.875331</td><td>0.014883</td><td>1280.0</td></tr>
    <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>A (p.Ala4Ile)</th><td>-2.552289</td><td>0.421699</td><td>717.0</td></tr>
    <tr><th>...</th><td>...</td><td>...</td><td>...</td></tr>
    <tr><th>c.9T>A (p.=)</th><td>0.705661</td><td>0.774559</td><td>327.0</td></tr>
    <tr><th>c.9T>C (p.=)</th><td>0.438654</td><td>0.014857</td><td>1947.0</td></tr>
    <tr><th>c.9T>C (p.=), c.49A>T (p.Met17Ser), c.50T>C (p.Met17Ser), c.51G>A (p.Met17Ser)</th><td>-1.930922</td><td>1.085535</td><td>277.0</td></tr>
    <tr><th>c.9T>C (p.=), c.62T>C (p.Leu21Ser), c.63A>T (p.Leu21Ser)</th><td>-0.897249</td><td>0.884321</td><td>495.0</td></tr>
    <tr><th>c.9T>G (p.=)</th><td>-2.314604</td><td>0.671760</td><td>406.0</td></tr>
  </tbody>
</table>
<p>1440 rows × 3 columns</p>
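The combined data frame can also be written to a tab-separated file at this point for use with other tools, such as R or Excel (the file name below is arbitrary):

.. code:: python

    # write the scores and input counts to a TSV file
    variant_scores.to_csv('scores_with_input_counts.tsv', sep='\t')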
399 | 400 | 401 | 402 | Now that all the information is in a single data frame, we can make a 403 | plot of score vs. input count. This example uses functions and colors 404 | from the Enrich2 plotting library. Taking the log10 of the counts makes 405 | the data easier to visualize. 406 | 407 | .. code:: python 408 | 409 | fig, ax = plt.subplots() 410 | enrich_plot.configure_axes(ax, xgrid=True) 411 | ax.plot(np.log10(variant_scores['input_count']), 412 | variant_scores['score'], 413 | linestyle='none', marker='.', alpha=0.6, 414 | color=enrich_plot.plot_colors['bright4']) 415 | ax.set_xlabel("log10(Input Count)") 416 | ax.set_ylabel("Variant Score") 417 | 418 | 419 | 420 | 421 | .. parsed-literal:: 422 | 423 | 424 | 425 | 426 | 427 | 428 | .. image:: _static/notebook_plots/min_count_plot.png 429 | 430 | 431 | -------------------------------------------------------------------------------- /docs/gui.rst: -------------------------------------------------------------------------------- 1 | .. _gui-documentation: 2 | 3 | Using the GUI 4 | ====================== 5 | 6 | The graphical user interface makes it easy to specify an experimental design that Enrich2 can understand. For more information about how these are organized, see :ref:`experimental-designs`. 7 | 8 | Configuring your analysis 9 | ------------------------------------ 10 | 11 | The Enrich2 installer places the graphical user interface (GUI) entry point in your path. Type ``enrich_gui`` from the command line to launch the program. 12 | 13 | .. error:: Mac OS X users running the Enrich2 GUI in a virtualenv may encounter the following error:: 14 | 15 | 2016-10-10 12:34:56.789 python[12345:12345678] -[NSApplication _setup:]: unrecognized selector sent to instance 0x12345abcd 16 | 17 | This is caused by an interaction between Tkinter and the `matplotlib backend `_. To fix the issue, edit (or create) the "~/.matplotlib/matplotlibrc" file and add the line:: 18 | 19 | backend: TkAgg 20 | 21 | .. note:: Once you have created your configuration file, you can also run the program in command line mode. Type ``enrich_cmd --help`` for usage and a list of command line options. 22 | 23 | .. image:: _static/gui_screenshots/empty.png 24 | :alt: The Enrich2 GUI window upon launch 25 | 26 | Click "New..." to create the root object. 27 | 28 | .. image:: _static/gui_screenshots/new_root.png 29 | :alt: The new root object window 30 | 31 | Enter a short but descriptive object name that will not conflict with other objects in the analysis. 32 | 33 | Choose the output directory for the HDF5_, plot, and tab-separated files generated by the analysis. 34 | 35 | Select the appropriate object type: Experiment, Selection if there are no replicates, or SeqLib if you only want to count a single sequencing library. 36 | 37 | If you created a Selection or Experiment root object, select it and click "New..." to add a child object. 38 | 39 | .. image:: _static/gui_screenshots/new_child.png 40 | :alt: Creating a child object 41 | 42 | Conditions and Selections do not have any parameters beyond their names. 43 | 44 | Continue adding child objects until the entire experimental design is represented. 45 | When creating a new SeqLib, choose the appropriate type depending on how the 46 | experiment was performed (see :ref:`intro-seqlibs`). 47 | 48 | .. note:: To avoid re-counting the reads when multiple Selections share the same input library, use the same object name for the input library in each Selection. 
49 | 50 | Most parameters are specified in SeqLib objects, such as the wild type sequence, filtering options, and the location of the sequencing files or counts files (see :ref:`seqlib-configuration`). 51 | 52 | .. note:: Time points can have multiple sequencing libraries, which are added together before scores are calculated. 53 | 54 | .. image:: _static/gui_screenshots/seqlib.png 55 | :alt: Editing a SeqLib object 56 | 57 | Clicking "New..." with a SeqLib object selected will add a sibling SeqLib to the Selection that shares the same FASTQ_ filtering and other options. 58 | 59 | Saving and loading 60 | --------------------------- 61 | 62 | After you have configured the analysis, you can save a configuration file by selecting "Save" or "Save As..." from the File menu. You can also use the File menu to load an existing configuration file by selecting "Open." 63 | 64 | .. note:: If you encounter an error when loading a configuration file, try using a validator such as `JSONLint `_ to identify any issues. 65 | 66 | Context menus 67 | --------------------------- 68 | 69 | Right-clicking on an object will open a context menu with additional actions not covered by the New/Edit/Delete buttons. 70 | 71 | .. image:: _static/gui_screenshots/context_menu.png 72 | :alt: Right-click context menu for a SeqLib 73 | 74 | * Apply FASTQ... 75 | 76 | Copy the FASTQ_ filtering options from the chosen SeqLib to every highlighted SeqLib of the same type. 77 | 78 | .. _analysis-options: 79 | 80 | Analysis options 81 | --------------------- 82 | 83 | These choices are not saved in the configuration file and should be reviewed before running each analysis. For further information about the scoring and normalization methods below, see the `Enrich2 manuscript`_ (a simplified sketch of the log ratio calculation follows these lists). 84 | 85 | Scoring method 86 | +++++++++++++++++++++++ 87 | 88 | * Weighted Least Squares 89 | 90 | Recommended for selections with at least three time points (including the input). 91 | 92 | * Log Ratios (Enrich2) 93 | 94 | Recommended for selections with two time points (input and selected). For selections with more than two time points, the last time point is used as the selected time point. Intermediate time points are not used. 95 | 96 | * Counts Only 97 | 98 | No element scores are calculated. The output contains only element counts. 99 | 100 | * Ordinary Least Squares 101 | 102 | Provided for comparison and legacy support. 103 | 104 | * Log Ratios (Old Enrich) 105 | 106 | Provided for comparison and legacy support. This method is a re-implementation of the previously published `Enrich software `_. Standard errors are not calculated. For selections with more than two time points, the last time point is used as the selected time point. Intermediate time points are not used. 107 | 108 | Normalization method 109 | +++++++++++++++++++++++ 110 | 111 | * Wild Type 112 | 113 | Recommended if your selection has a wild type sequence. Normalizes counts by the wild type count as described in the `Enrich2 manuscript`_. For designs with identifiers instead of variants, the special wild type identifier "_wt" can be used. 114 | 115 | * Library Size (Complete Cases) 116 | 117 | Normalizes counts by the library size. Only elements present in all time points within a selection contribute to the library size. 118 | 119 | * Library Size (All Reads) 120 | 121 | Normalizes counts by the library size. All elements contribute to the library size.
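To make the "Log Ratios (Enrich2)" and "Wild Type" options above concrete, the sketch below computes a two-time-point, wild-type-normalized log ratio with 0.5 added to each count. This is an illustration written for this guide rather than the code Enrich2 runs; the authoritative definitions, including standard errors and the regression-based methods, are in the `Enrich2 manuscript`_.

.. code:: python

    import numpy as np

    def simple_log_ratio_score(c_in, c_sel, wt_in, wt_sel):
        """Illustrative score: change in an element's log ratio relative to
        wild type between the input (c_in, wt_in) and selected (c_sel, wt_sel)
        time points. A sketch only; see the Enrich2 manuscript for the
        published formulas."""
        return (np.log((c_sel + 0.5) / (wt_sel + 0.5))
                - np.log((c_in + 0.5) / (wt_in + 0.5)))

    # an element depleted relative to wild type gets a negative score
    print(simple_log_ratio_score(c_in=787, c_sel=124, wt_in=50000, wt_sel=60000))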
122 | 123 | Other options 124 | +++++++++++++++++++++++ 125 | 126 | * Force Recalculation 127 | 128 | Discards all data that are not raw counts before performing the analysis. See :ref:`output-table-organization` for more about raw counts. 129 | 130 | * Component Outlier Statistics 131 | 132 | Tests whether the score of each barcode differs significantly from that of its assigned variant or identifier. Performs an analogous calculation for variant and synonymous scores. 133 | 134 | .. warning:: Testing for outliers is experimental and very computationally inefficient. 135 | 136 | * Make Plots 137 | 138 | Creates plots for this analysis. 139 | 140 | * Write TSV Files 141 | 142 | Outputs tab-separated files for this analysis. 143 | 144 | Once you've finished selecting your options, click Run Analysis! 145 | 146 | The output directory will contain :ref:`hdf5-files`, :ref:`plots`, and tab-separated files. 147 | 148 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Enrich2: deep mutational scanning data analysis 2 | ============================================================================ 3 | 4 | Enrich2 is a general software tool for processing, analyzing, and visualizing data from deep mutational scanning experiments. 5 | 6 | The software is freely available from https://github.com/FowlerLab/Enrich2/ under the BSD 3-clause license. 7 | 8 | For an example dataset, visit https://github.com/FowlerLab/Enrich2-Example/. 9 | 10 | To cite Enrich2, please reference `A statistical framework for analyzing deep mutational scanning data `_. 11 | 12 | Enrich2 was written by `Alan F Rubin `_ |ORCID_icon| http://orcid.org/0000-0003-1474-605X 13 | with contributions from Chris Macdonald |ORCID_icon| https://orcid.org/0000-0002-0201-8832 for the Python 3-compatible version. 14 | 15 | .. error:: Important notice for users of Enrich2 v1.0 or v1.1 16 | 17 | Enrich2 v1.2.0 corrected an error in the software that, for most datasets, resulted in the standard errors for combined scores being over-estimated. 18 | The counts, scores, and replicate-wise standard errors are unaffected. 19 | 20 | If you have analyzed datasets that contain replicates with a previous version of Enrich2, the easiest way to get the correct standard error values is to delete the experiment HDF5_ file (the file name ends with ``'_exp.h5'``) and re-run the program. 21 | This will recalculate combined scores and standard errors without redoing other parts of the analysis. 22 | 23 | .. |ORCID_icon| image:: _static/iD_icon.png 24 | :target: http://orcid.org 25 | 26 | 27 | .. toctree:: 28 | :hidden: 29 | :maxdepth: 0 30 | 31 | installation 32 | introduction 33 | gui 34 | seqlib_config 35 | output 36 | plots 37 | notebooks 38 | api 39 | 40 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Getting started 2 | ======================================================= 3 | 4 | .. _required packages: 5 | 6 | Required packages 7 | ------------------------------------------------------- 8 | 9 | Enrich2 runs on Python 3 and has the following dependencies: 10 | 11 | * `NumPy `_ 12 | * `SciPy `_ 13 | * `pandas `_ 14 | * `PyTables `_ 15 | * `Statsmodels `_ 16 | * `matplotlib `_ 17 | * `fqfa `_ 18 | 19 | The configuration GUI requires `Tkinter `_. 
Building a local copy of the documentation requires `Sphinx `_. 20 | 21 | .. note:: PyTables may not be installed by your distribution. If you encounter errors, check that the ``tables`` module is present. 22 | 23 | .. note:: Tkinter may not be installed by your distribution. If you encounter errors, try installing ``python3-tk`` or similar using your system package manager. 24 | 25 | Installation and example dataset 26 | ------------------------------------------------------- 27 | 28 | You can install Enrich2 in a new `virtual environment `_ using `pip `_:: 29 | 30 | python3 -m venv e2env 31 | source e2env/bin/activate 32 | pip install enrich2 33 | 34 | To download the example dataset, visit the `Enrich2-Example GitHub repository `_. 35 | Running this preconfigured analysis will create several :ref:`plots`. The :ref:`example-notebooks` demonstrate how to explore the :ref:`hdf5-files`. 36 | 37 | Enrich2 executables 38 | ------------------------------------------------------- 39 | 40 | The Enrich2 installer places two executable scripts into the user's path. Both executables run the same analysis, but through different interfaces. 41 | 42 | * ``enrich_gui`` launches the Enrich2 graphical user interface. This is the recommended way to create a configuration file for Enrich2. See :ref:`gui-documentation` for a step-by-step guide. 43 | 44 | * ``enrich_cmd`` launches the program from the command line. This is recommended for users performing analyses on a remote server who have already created configuration files. For a detailed list of command line options, type ``enrich_cmd --help`` 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Defining experiments 2 | ===================================================== 3 | 4 | .. _experimental-designs: 5 | 6 | Experimental designs 7 | ------------------------------------------------------- 8 | 9 | Enrich2 represents deep mutational scanning experimental designs as a tree of objects. The hierarchy of object types is defined below: 10 | 11 | * Experiment 12 | 13 | The root object for most experimental designs. Parent of at least one experimental condition. 14 | 15 | * Condition 16 | 17 | A single experimental condition. Parent of at least one replicate selection performed under the condition. 18 | 19 | * Selection 20 | 21 | A single deep mutational scanning replicate. Parent of at least two sequencing libraries, one or more for each time point/round/bin of the selection. 22 | 23 | * Sequencing library (SeqLib) 24 | 25 | FASTQ_ output or count data from a deep mutational scanning time point/round/bin. Has no children. 26 | 27 | Each experimental design has a single root object, which can be an Experiment, Selection, or SeqLib. With the exception of Conditions, each experimental design object has its own HDF5 file containing its data. 28 | 29 | .. note:: Conditions do not have their own HDF5 file. If there is only one condition, use an Experiment as the root. 30 | 31 | .. image:: _static/cartoons/data_hierarchy.png 32 | :alt: Hierarchy of objects in an experimental design 33 | 34 | The above diagram illustrates an experimental design with two conditions, each with three replicates sampled at three time points (including the input). 35 | 36 | .. 
_intro-elements: 37 | 38 | Elements 39 | ------------------------------------------------------- 40 | 41 | Enrich2 counts elements to quantify their enrichment or depletion in a complex population. The four element types are defined below: 42 | 43 | * Barcode 44 | 45 | A short DNA barcode sequence often used for tagging variants. Stored as the barcode DNA sequence. Barcodes are counted directly from sequencing data. 46 | 47 | * Variant 48 | 49 | A DNA-level variant of the wild type sequence, which can be coding or non-coding. Stored as an HGVS_ string describing the nucleotide and any amino acid differences from the wild type sequence. Variants can be counted either directly from sequencing data or as the sum of counts for linked barcodes as defined by a barcode-variant map. 50 | 51 | * Synonymous 52 | 53 | A protein-level variant of the wild type sequence. Stored as an HGVS_ string describing the amino acid differences from the wild type sequence. Synonymous elements are counted as the sum of counts for variant elements with the same amino acid sequence. Variant elements with the wild type amino acid sequence but a non-wild type DNA sequence are assigned to a special variant. 54 | 55 | * Identifier 56 | 57 | An arbitrary label (such as a target gene name) for barcode assignment. Stored as the label string. Identifiers are counted as the sum of counts for associated barcodes as defined by a barcode-identifier map or specified as counts. 58 | 59 | .. _intro-seqlibs: 60 | 61 | SeqLibs 62 | ------------------------------------------------------- 63 | 64 | Enrich2 implements six types of SeqLib, each supporting different element types and/or methods of sequencing deep mutational scanning populations. 65 | 66 | .. note:: Synonymous elements are only present if the wild type sequence is protein coding. 67 | 68 | * Barcoded Variant 69 | 70 | Contains barcode, variant, and synonymous elements. Each DNA variant in the experiment is linked to one or more DNA barcode sequences. A barcode-variant map describes which barcodes map to each variant. The FASTQ_ file contains only barcode sequences. 71 | 72 | * Barcoded Identifier 73 | 74 | Contains barcode and identifier elements. Each identifier in the experiment is associated with one or more DNA barcode sequences. A barcode-identifier map describes which barcodes map to each identifier. The FASTQ_ file contains only barcode sequences. 75 | 76 | * Overlap 77 | 78 | Contains variant and synonymous elements. DNA variants are sequenced directly using overlapping paired-end reads. Requires FASTQ_ files for both forward and reverse reads. 79 | 80 | * Basic 81 | 82 | Contains variant and synonymous elements. DNA variants are sequenced directly using single-end reads. 83 | 84 | * Barcodes Only 85 | 86 | Contains barcode elements. The FASTQ_ file contains only barcode sequences. 87 | 88 | * Identifiers Only 89 | 90 | Contains identifier elements. No FASTQ_ file is processed, so the counts must be provided by the user. 91 | 92 | For more information, see :ref:`seqlib-configuration`. 93 | 94 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Enrich2.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Enrich2.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 
113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/notebooks.rst: -------------------------------------------------------------------------------- 1 | .. _example-notebooks: 2 | 3 | Example notebooks 4 | ==================================== 5 | 6 | Begin exploring Enrich2 datasets with the following notebooks. They rely on the `Enrich2 example dataset `_, so please perform that analysis before running any of these notebooks locally. 7 | 8 | The notebooks can be run interactively by using the command line to navigate to the "Enrich2/docs/notebooks" directory and enter ``jupyter notebook `` where ```` is the notebook file name. 9 | 10 | The first two notebooks demonstrate using pandas to open an HDF5 file, extract its contents into a data frame, and perform queries on tables in the HDF5 file. For more information, see the `pandas HDF5 documentation `_. 11 | 12 | .. include:: exported_notebooks/min_count.rst 13 | 14 | .. include:: exported_notebooks/unique_barcodes.rst 15 | 16 | For more information on Enrich2 data tables, see :ref:`hdf5-files`. 17 | -------------------------------------------------------------------------------- /docs/output.rst: -------------------------------------------------------------------------------- 1 | .. 
_hdf5-files: 2 | 3 | Output HDF5 files 4 | ======================================= 5 | 6 | Enrich2 stores data in an HDF5 file for each Experiment, Selection, and SeqLib analysis object. The name of the HDF5 file is the object's name plus the suffix "_<type>.h5", where <type> is the object type ("exp", "sel", or "lib"). 7 | Each file has multiple tables that can be queried and retrieved as pandas data frames (see :ref:`example-notebooks`). 8 | 9 | Each Experiment, Selection, and SeqLib has its own directory inside "Results/tsv/" containing tab-separated value files for users who want to work with other tools, such as R or Excel. 10 | 11 | .. _output-table-organization: 12 | 13 | Table organization 14 | --------------------------------------------------- 15 | 16 | HDF5 files organize tables into groups like directories in a file system. Enrich2 has two top-level groups, "/main" (used for most tables) and "/raw" (used exclusively in SeqLibs to store raw counts). The first subgroup is typically the element type (variant, barcode, etc.), followed by the kind of data (counts, scores, etc.). 17 | 18 | .. note:: When the "Force recalculation" analysis option is chosen, the "/main" tables are deleted from all HDF5 files in this analysis, and regenerated based on the "/raw" count data. 19 | 20 | Enrich2 uses NaN (Not a Number) values to represent missing data, such as zero counts or scores that could not be calculated. 21 | 22 | List of tables by object type 23 | ------------------------------------------------------- 24 | 25 | Experiment 26 | +++++++++++++++++++++++ 27 | 28 | Most experiment tables use a pandas MultiIndex for column names. The MultiIndex levels are: condition, selection (if applicable), and data value. See the `pandas advanced indexing documentation `_ for more information on how to work with these objects. 29 | 30 | * "/main/<element>/counts" 31 | 32 | Counts of elements that appear in at least one time point in the experiment. 33 | 34 | * "/main/<element>/scores" 35 | 36 | Condition-level scores, standard errors, and epsilon (change in the standard error after the last iteration of the random-effects model) for all elements scored in all selections of at least one condition. 37 | 38 | * "/main/<element>/scores_shared" 39 | 40 | Selection-level scores and standard errors for each element with at least one condition-level score. 41 | 42 | * "/main/<element>/scores_shared_full" 43 | 44 | Selection-level scores and standard errors for each element scored in at least one selection. 45 | 46 | * "/main/<element>/scores_pvalues_wt" 47 | 48 | z-scores and p-values for each variant or synonymous element with a condition-level score. The null hypothesis is that the element's score is equal to wild type. 49 | 50 | * "/main/barcodemap" 51 | 52 | Barcode-variant or barcode-identifier map for all barcodes that appear in the Experiment. Only present for Barcoded Variant or Barcoded Identifier SeqLibs. 53 | 54 | Selection 55 | +++++++++++++++++++++++ 56 | 57 | * "/main/<element>/counts" 58 | 59 | Counts of elements that appear in all time points in the selection. 60 | 61 | * "/main/<element>/counts_unfiltered" 62 | 63 | Counts of elements that appear in at least one time point in the selection. 64 | 65 | * "/main/<element>/scores" 66 | 67 | Scores, standard errors, standard error percentiles, and method-specific values (e.g. regression slope and intercept) for all elements counted in all time points in the selection. 68 | 69 | * "/main/<element>/weights" 70 | 71 | Regression weights for each element at each time point in weighted least squares regression.
72 | 73 | * "/main//log_ratios" 74 | 75 | Y-values for each element at each time point in weighted and ordinary least squares regression. 76 | 77 | * "/main/barcodemap" 78 | 79 | Barcode-variant or barcode-identifier map for all barcodes that appear in the Selection. Only present for Barcoded Variant or Barcoded Identifier SeqLibs. 80 | 81 | SeqLib 82 | +++++++++++++++++++++++ 83 | 84 | * "/main//counts" 85 | 86 | Counts of elements after minimum count filtering and barcode mapping. 87 | 88 | * "/raw//counts" 89 | 90 | Counts of elements taken directly from the FASTQ_ data. 91 | 92 | * "/raw/filter" 93 | 94 | Number of reads removed for each FASTQ_ filtering option. 95 | 96 | * "/raw/barcodemap" 97 | 98 | Barcode-variant or barcode-identifier map for barcodes that appear in this SeqLib. Only present for Barcoded Variant or Barcoded Identifier SeqLibs. 99 | -------------------------------------------------------------------------------- /docs/plots.rst: -------------------------------------------------------------------------------- 1 | .. _plots: 2 | 3 | Automatically generated plots 4 | ================================================= 5 | 6 | In addition to providing structured output to allow users to create their own plots, Enrich2 produces default visualizations for each analysis. Experiment, Selection, and SeqLib objects each have their own directory inside "Results/plots/". Plots are saved in PDF format, and many of the files contain multiple pages. 7 | 8 | Experiment plots 9 | ------------------------------------------- 10 | 11 | * Sequence-function map 12 | 13 | .. image:: _static/plots/sfmap.png 14 | 15 | Visualization of scores and standard errors for single changes from wild type. Separate protein- and nucleotide-level sequence-function maps are generated. 16 | 17 | Cell color indicates the score for the single change (row) at the given position (column). Positive scores (in red) indicate better performance in the assay, and negative scores (in blue) indicate worse performance. Grey squares denote changes that were not measured. Diagonal lines in each cell represent the standard error for the score, and are scaled such that the highest standard error on the plot covers the entire diagonal. Standard errors that are less than 2% of this maximum value are not plotted. Cells containing circles have the wild type residue at that position. 18 | 19 | .. _sfmap_aa_file: 20 | 21 | Custom amino acid ordering and groups can be specified by running Enrich2 in command line mode and using the ``--sfmap-aa-file`` option. Each line of the file begins with an optional label followed by a single tab character and then a comma-separated list of single-letter amino acid codes. All amino acid codes must be present exactly once. 22 | 23 | The following amino acid grouping files are provided: 24 | 25 | Default (:download:`click to download <_static/sfmap_aa_files/aagroup_default.txt>`) 26 | 27 | This grouping is used when no file is specified. `Reference `__ 28 | 29 | .. literalinclude:: _static/sfmap_aa_files/aagroup_default.txt 30 | 31 | Helical Propensity (:download:`click to download <_static/sfmap_aa_files/aagroup_helical_propensity.txt>`) 32 | 33 | `Reference `__ 34 | 35 | .. literalinclude:: _static/sfmap_aa_files/aagroup_helical_propensity.txt 36 | 37 | Selection plots 38 | -------------------------------------------- 39 | 40 | * Sequence-function map 41 | 42 | As above. 43 | 44 | * Diversity map 45 | 46 | .. 
image:: _static/plots/diversity.png 47 | 48 | Variant frequencies are visualized in the style of a sequence-function map. Separate protein- and nucleotide-level diversity maps for each time point are generated. 49 | 50 | Custom amino acid ordering and groups can be specified by running Enrich2 in command line mode and using the ``--sfmap-aa-file`` option. :ref:`See above ` for more details. 51 | 52 | * Counts per time point 53 | 54 | .. image:: _static/plots/selection_counts.png 55 | 56 | Bar plots showing the total element count in each time point. One plot for each element type. 57 | 58 | * Representative regression fits 59 | 60 | .. image:: _static/plots/se_pctile.png 61 | 62 | Present for linear regression scoring methods only. Linear fits for the element closest to each 5th percentile (0, 5, 10, ..., 95, 100). Used for diagnostic purposes and setting standard error filtering cutoffs. One plot for each element type. 63 | 64 | * Regression weights 65 | 66 | .. image:: _static/plots/regression_weights.png 67 | 68 | Present for weighted linear regression scoring method only. Boxplot of regression weights for each time point. Dashed line indicates uniform weight. One plot for each element type. 69 | 70 | * Volcano plot 71 | 72 | .. image:: _static/plots/volcano.png 73 | 74 | Present for linear regression scoring methods with variants only. Volcano plot of the raw p-value from a z-test under the null hypothesis that the element behaves the same as wild type vs. the element's score. One plot for each element type. 75 | 76 | * Wild type shape 77 | 78 | .. image:: _static/plots/wt_shape.png 79 | 80 | Present for linear regression scoring methods with variants only. Plot of the non-normalized linear fit of the wild type. Used to assess the effect of wild type correction. 81 | 82 | SeqLib plots 83 | ---------------------------------------------- 84 | 85 | * Counts per element 86 | 87 | .. image:: _static/plots/seqlib_counts.png 88 | 89 | Histogram of element counts. Two plots for each element type, one with log-transformed x-axis and one without. 90 | 91 | * Unique barcodes per element 92 | 93 | .. image:: _static/plots/barcodes_per_variant.png 94 | 95 | Present for Barcoded Variant and Barcoded Identifier SeqLibs only. Histogram of unique barcodes per variant or identifier. 96 | 97 | * Mismatches in overlapping reads 98 | 99 | .. image:: _static/plots/overlap_mismatches.png 100 | 101 | Present for Overlap SeqLibs only. Barplot of the number of resolved and unresolved mismatches at each position in the overlap region, and the number of times the first mismatch in a read pair occured at each position. Used for diagnosing misalignment of overlapping reads. 102 | -------------------------------------------------------------------------------- /docs/seqlib_config.rst: -------------------------------------------------------------------------------- 1 | .. _seqlib-configuration: 2 | 3 | SeqLib configuration details 4 | ================================ 5 | 6 | Most parameters are specified within SeqLib objects. Experiment, Condition, and Selection objects have only a name (and output directory if at the root). :ref:`analysis-options`, such as scoring method, are chosen at run time. 
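The GUI saves these settings as a JSON configuration file. As a rough illustration of how the parameters described in this section are grouped, the sketch below builds the kind of nested dictionary that a Barcoded Identifier SeqLib's ``configure()`` method reads. The ``fastq``, ``barcodes``, and ``identifiers`` key names are taken from ``enrich2/barcode.py`` and ``enrich2/barcodeid.py``; the ``name`` and ``timepoint`` keys, the filter contents, and all values are illustrative assumptions.

.. code:: python

    import json

    # hypothetical Barcoded Identifier SeqLib configuration fragment
    seqlib_cfg = {
        "name": "input_rep1",                          # assumed top-level key
        "timepoint": 0,                                # assumed top-level key
        "fastq": {
            "reads": "input_rep1_barcodes.fq.gz",
            "reverse": False,
            "filters": {},                             # read filtering options
        },
        "barcodes": {
            "min count": 3,
            "map file": "barcode_identifier_map.txt.bz2",
        },
        "identifiers": {
            "min count": 5,
        },
    }

    print(json.dumps(seqlib_cfg, indent=2))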
7 | 8 | Sequencing libraries have :ref:`general-seqlib-parameters`, :ref:`sequence-file-seqlib-parameters`, and other parameter groups depending on the type: 9 | 10 | +----------------------+---------+---------+------------+---------+ 11 | | SeqLib type | Barcode | Variant | Identifier | Overlap | 12 | +======================+=========+=========+============+=========+ 13 | | Barcoded Variant | X | X | | | 14 | +----------------------+---------+---------+------------+---------+ 15 | | Barcoded Identifier | X | | X | | 16 | +----------------------+---------+---------+------------+---------+ 17 | | Overlap | | X | | X | 18 | +----------------------+---------+---------+------------+---------+ 19 | | Basic | | X | | | 20 | +----------------------+---------+---------+------------+---------+ 21 | | Barcodes Only | X | | | | 22 | +----------------------+---------+---------+------------+---------+ 23 | | Identifiers Only | | | X | | 24 | +----------------------+---------+---------+------------+---------+ 25 | 26 | See :ref:`intro-seqlibs` for descriptions of each type. 27 | 28 | .. _general-seqlib-parameters: 29 | 30 | General parameters 31 | -------------------------------- 32 | 33 | * Name 34 | 35 | The object name should be short, descriptive, and not conflict with other object names in the analysis. 36 | 37 | * Output Directory 38 | 39 | Path to the output directory. This field only appears for the root object. 40 | 41 | * Time Point 42 | 43 | The time point must be an integer. All Selections require an input library as time point 0. Time point values may refer to the round of selection or hour of sampling. 44 | 45 | * Counts File 46 | 47 | Required for Counts File Mode. Path to an HDF5 file or tab-separated value file that contains counts for this time point. Raw counts from that file will be used for this SeqLib. If an HDF5 file is provided, all tables in the "raw/" group are copied. Sequence file parameters will be ignored. The file must have the suffix ".h5" for HDF5 or one of ".txt" ".tsv" or ".csv" for tab-separated value files. 48 | 49 | .. note:: Tab-separated value files must have exactly two columns separated by a tab. The first line of the file must have the column heading "counts" preceded by a single tab character. The first column contains the barcode, identifier, or HGVS variant string depending on the type of raw counts required by the SeqLib type. The second column contains the count for that element. 50 | 51 | .. _sequence-file-seqlib-parameters: 52 | 53 | Sequence file parameters 54 | -------------------------------- 55 | 56 | Enrich2 accepts sequence files in FASTQ_ format. These files may be processed while compressed with gzip or bzip2. The file must have the suffix ".fq" or ".fastq" before compression. 57 | 58 | * Reads 59 | 60 | Required for FASTQ_ File Mode. Path to a FASTQ_ file containing the sequencing reads. For overlap SeqLibs, there are fields for Forward Reads and Reverse Reads. 61 | 62 | * Reverse 63 | 64 | Checking this box will reverse-complement reads before analysis. Not present for Overlap SeqLibs. 65 | 66 | Read filtering parameters 67 | ++++++++++++++++++++++++++++++++++ 68 | 69 | Filters are applied after read trimming and any read merging. 70 | 71 | * Minimum Quality 72 | 73 | Minimum single-base quality. If a single base in the read has a quality score below this value, the read will be discarded. 74 | 75 | * Average Quality 76 | 77 | Average read quality. 
If the mean quality score of all bases in the read is below this value, the read will be discarded. 78 | 79 | * Maximum N's 80 | 81 | Maximum number of N nucleotides. If the read contains more than this number of bases called as N, the read will be discarded. This should be set to 0 in most cases. 82 | 83 | * Remove Unresolvable Overlaps 84 | 85 | Present for Overlap SeqLibs only. Checking this box discards merged reads with unresolvable discrepant bases (see :ref:`overlap-seqlib-parameters`). 86 | 87 | * Maximum Mutations 88 | 89 | Present for SeqLibs with variants only. Maximum number of mutations. If the variant contains more than this number of differences from wild type, the variant is discarded (or aligned if that option is enabled under :ref:`variant-seqlib-parameters`). 90 | 91 | .. _barcode-seqlib-parameters: 92 | 93 | Barcode parameters 94 | -------------------------------- 95 | 96 | * Barcode-variant File 97 | 98 | Not present for barcode-only SeqLibs. Path to a tab-separated file in which each line contains a barcode followed by its identifier or linked variant DNA sequence. This file may be processed while compressed with gzip or bzip2. 99 | 100 | * Minimum Count 101 | 102 | Minimum barcode count. If the barcode has fewer counts than this value, it will not be scored and will not contribute to counts of its variant or identifier. 103 | 104 | * Trim Start 105 | 106 | Position of the first base to keep when trimming barcodes. All subsequent bases are kept if Trim Length is not specified. Reverse-complementing occurs before trimming. Bases are numbered starting at 1. 107 | 108 | * Trim Length 109 | 110 | Number of bases to keep when trimming barcodes. Starts at the first base if Trim Start is not specified. Reverse-complementing occurs before trimming. 111 | 112 | .. _variant-seqlib-parameters: 113 | 114 | Variant parameters 115 | -------------------------------- 116 | 117 | * Wild Type Sequence 118 | 119 | The wild type DNA sequence. This sequence will be compared to reads or the barcode-variant map when calling variants. All sequences must have the same length and starting position. 120 | 121 | * Wild Type Offset 122 | 123 | Integer added to every variant nucleotide position. Used to place variants in the context of a larger sequence. 124 | 125 | * Protein Coding 126 | 127 | Checking this box will interpret the wild type sequence as protein coding. The wild type sequence must be in frame. 128 | 129 | * Use Aligner 130 | 131 | Checking this box will enable Needleman-Wunsch alignment. Insertion and deletion events will be called. 132 | 133 | .. warning:: Using the aligner will dramatically increase run time, and is not recommended for most users. 134 | 135 | * Minimum Count 136 | 137 | Minimum variant count. If the variant has fewer counts than this value, it will not be scored and will not contribute to counts of any synonymous elements. 138 | 139 | .. _identifier-seqlib-parameters: 140 | 141 | Identifier parameters 142 | -------------------------------- 143 | 144 | * Minimum Count 145 | 146 | Minimum identifier count. If the identifier has fewer counts than this value, it will not be scored. 147 | 148 | .. _overlap-seqlib-parameters: 149 | 150 | Overlap parameters 151 | -------------------------------- 152 | 153 | Overlapping read pairs reduce the likelihood of calling sequencing errors as variants. Paired-end Illumina reads are generated such that they overlap in the target region. 
154 | 155 | When Enrich2 combines forward and reverse reads into merged reads, base quality values in the overlapping region are defined as the higher quality value at each position. Mismatches are resolved by assuming the base with the higher quality value is correct. If mismatched bases have the same quality value, the position is considered unresolvable and replaced by an 'X' base. 156 | 157 | * Forward Start 158 | 159 | Position of the first overlapping base in the forward read. Bases are numbered starting at 1. 160 | 161 | * Reverse Start 162 | 163 | Position of the first overlapping base in the reverse read before reverse complementing. Bases are numbered starting at 1. 164 | 165 | * Overlap Length 166 | 167 | Number of bases in the overlapping region. 168 | 169 | * Maximum Mismatches 170 | 171 | Maximum number of mismatches in the overlapping region. If a merged read has more than this number of mismatches, the read pair will be discarded. 172 | 173 | * Overlap Only 174 | 175 | Checking this box will trim the merged reads to the overlapping region. 176 | 177 | -------------------------------------------------------------------------------- /enrich2/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "2.0.2" 2 | -------------------------------------------------------------------------------- /enrich2/aligner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for alignment of variants to the wild type sequence. 3 | 4 | This module is optional, and using it will dramatically increase runtime when 5 | counting variants. It is only recommended for users who need to count 6 | insertion and deletion variants (i.e. not coding sequences). 7 | """ 8 | 9 | import numpy as np 10 | 11 | #: Default similarity matrix used by the aligner. 12 | #: User-defined matrices must have this format. 13 | _simple_similarity = { 14 | "A": {"A": 1, "C": -1, "G": -1, "T": -1, "N": 0, "X": 0}, 15 | "C": {"A": -1, "C": 1, "G": -1, "T": -1, "N": 0, "X": 0}, 16 | "G": {"A": -1, "C": -1, "G": 1, "T": -1, "N": 0, "X": 0}, 17 | "T": {"A": -1, "C": -1, "G": -1, "T": 1, "N": 0, "X": 0}, 18 | "N": {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0, "X": 0}, 19 | "X": {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0, "X": 0}, 20 | "gap": -1, 21 | } 22 | 23 | 24 | class Aligner(object): 25 | """ 26 | Class for performing local alignment of two DNA sequences. 27 | 28 | This class implements `Needleman-Wunsch `_ local alignment. 30 | 31 | The :py:class:`~aligner.Aligner` requires a scoring matrix when 32 | created. The format is a nested dictionary, with a special ``'gap'`` entry 33 | for the gap penalty (this value is used for both gap opening and gap 34 | extension). 35 | 36 | The ``'X'`` nucleotide is a special case for unresolvable mismatches in 37 | :py:class:`~overlap.OverlapSeqLib` variant data. 
38 | """ 39 | 40 | _MAT = 1 # match 41 | _INS = 2 # insertion (with respect to wild type) 42 | _DEL = 3 # deletion (with respect to wild type) 43 | _END = 4 # end of traceback 44 | 45 | def __init__(self, similarity=_simple_similarity): 46 | similarity_keys = list(similarity.keys()) 47 | if "gap" in similarity_keys: 48 | similarity_keys.remove("gap") 49 | for key in similarity_keys: 50 | if not all(x in similarity[key] for x in similarity_keys) or len( 51 | similarity[key] 52 | ) != len(similarity_keys): 53 | raise ValueError("Asymmetrical alignment scoring matrix") 54 | 55 | self.similarity = similarity 56 | if "gap" not in self.similarity: 57 | raise ValueError("No gap penalty in alignment scoring matrix.") 58 | 59 | self.matrix = None 60 | self.seq1 = None 61 | self.seq2 = None 62 | self.calls = 0 63 | 64 | def align(self, seq1, seq2): 65 | """ 66 | Aligns the two sequences, *seq1* and *seq2* and returns a list of 67 | tuples describing the differences between the sequences. 68 | 69 | The tuple format is ``(i, j, type, length)``, where ``i`` and ``j`` 70 | are the positions in *seq1* and *seq2*, respectively, and type is one 71 | of ``"match"``, ``"mismatch"``, ``"insertion"``, or ``"deletion"``. 72 | For indels, the ``length`` value is the number of bases inserted or 73 | deleted with respect to *seq1* starting at ``i``. 74 | """ 75 | self.matrix = np.ndarray( 76 | shape=(len(seq1) + 1, len(seq2) + 1), 77 | dtype=np.dtype([("score", int), ("trace", np.byte)]), 78 | ) 79 | seq1 = seq1.upper() 80 | seq2 = seq2.upper() 81 | 82 | # build matrix of scores/traceback information 83 | for i in range(len(seq1) + 1): 84 | self.matrix[i, 0] = (self.similarity["gap"] * i, Aligner._DEL) 85 | for j in range(len(seq2) + 1): 86 | self.matrix[0, j] = (self.similarity["gap"] * j, Aligner._INS) 87 | for i in range(1, len(seq1) + 1): 88 | for j in range(1, len(seq2) + 1): 89 | match = ( 90 | self.matrix[i - 1, j - 1]["score"] 91 | + self.similarity[seq1[i - 1]][seq2[j - 1]], 92 | Aligner._MAT, 93 | ) 94 | delete = ( 95 | self.matrix[i - 1, j]["score"] + self.similarity["gap"], 96 | Aligner._DEL, 97 | ) 98 | insert = ( 99 | self.matrix[i, j - 1]["score"] + self.similarity["gap"], 100 | Aligner._INS, 101 | ) 102 | self.matrix[i, j] = max(delete, insert, match, key=lambda x: x[0]) 103 | self.matrix[0, 0] = (0, Aligner._END) 104 | 105 | # calculate alignment from the traceback 106 | i = len(seq1) 107 | j = len(seq2) 108 | traceback = list() 109 | while i > 0 or j > 0: 110 | if self.matrix[i, j]["trace"] == Aligner._MAT: 111 | if seq1[i - 1] == seq2[j - 1]: 112 | traceback.append((i - 1, j - 1, "match", None)) 113 | else: 114 | traceback.append((i - 1, j - 1, "mismatch", None)) 115 | i -= 1 116 | j -= 1 117 | elif self.matrix[i, j]["trace"] == Aligner._INS: 118 | traceback.append((i - 1, j - 1, "insertion", 1)) 119 | j -= 1 120 | elif self.matrix[i, j]["trace"] == Aligner._DEL: 121 | traceback.append((i - 1, j - 1, "deletion", 1)) 122 | i -= 1 123 | elif self.matrix[i, j]["trace"] == Aligner._END: 124 | pass 125 | else: 126 | raise RuntimeError("Invalid value in alignment traceback.") 127 | traceback.reverse() 128 | 129 | # combine indels 130 | indel = None 131 | traceback_combined = list() 132 | for t in traceback: 133 | if t[2] == "insertion" or t[2] == "deletion": 134 | if indel is not None: 135 | if t[2] == indel[2]: 136 | indel[3] += t[3] 137 | else: 138 | raise RuntimeError( 139 | "Aligner failed to combine indels. " "Check gap penalty." 
140 | ) 141 | else: 142 | indel = list(t) 143 | else: 144 | if indel is not None: 145 | traceback_combined.append(tuple(indel)) 146 | indel = None 147 | traceback_combined.append(t) 148 | if indel is not None: 149 | traceback_combined.append(tuple(indel)) 150 | 151 | self.calls += 1 152 | return traceback_combined 153 | -------------------------------------------------------------------------------- /enrich2/barcode.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | import sys 4 | from .seqlib import SeqLib 5 | from fqfa import open_compressed, parse_fastq_reads, has_fastq_ext 6 | 7 | 8 | class BarcodeSeqLib(SeqLib): 9 | """ 10 | Class for count data from barcoded sequencing libraries. Designed for 11 | barcode-only scoring or as a parent class for 12 | :py:class:`~seqlib.barcodevariant.BcvSeqLib` and 13 | :py:class:`~seqlib.barcodeid.BcidSeqLib`. 14 | """ 15 | 16 | treeview_class_name = "Barcode SeqLib" 17 | 18 | def __init__(self): 19 | # Init step handled by VariantSeqLib's init for Barcode-variant 20 | if type(self).__name__ != "BcvSeqLib": 21 | SeqLib.__init__(self) 22 | self.reads = None 23 | self.reverse_complement_reads = False 24 | self.trim_start = 1 25 | self.trim_length = sys.maxsize 26 | self.barcode_min_count = 0 27 | self.add_label("barcodes") 28 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 29 | 30 | def configure(self, cfg): 31 | """ 32 | Set up the object using the config object *cfg*, usually derived from 33 | a ``.json`` file. 34 | """ 35 | SeqLib.configure(self, cfg) 36 | self.logger = logging.getLogger( 37 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 38 | ) 39 | 40 | # handle non-FASTQ config options 41 | try: 42 | if "min count" in cfg["barcodes"]: 43 | self.barcode_min_count = int(cfg["barcodes"]["min count"]) 44 | except KeyError as key: 45 | raise KeyError("Missing required config value {}".format(key), self.name) 46 | 47 | # if counts are specified, copy them later 48 | # else handle the FASTQ config options and check the files 49 | if self.counts_file is None: 50 | self.configure_fastq(cfg) 51 | try: 52 | if not has_fastq_ext(self.reads): 53 | raise ValueError( 54 | "FASTQ file error: unrecognized file extension", self.name 55 | ) 56 | except IOError as fqerr: 57 | raise IOError("FASTQ file error: {}".format(fqerr), self.name) 58 | 59 | def serialize(self): 60 | """ 61 | Format this object (and its children) as a config object suitable for 62 | dumping to a config file. 63 | """ 64 | cfg = SeqLib.serialize(self) 65 | 66 | cfg["barcodes"] = dict() 67 | if self.barcode_min_count > 0: 68 | cfg["barcodes"]["min count"] = self.barcode_min_count 69 | 70 | cfg["fastq"] = self.serialize_fastq() 71 | 72 | return cfg 73 | 74 | def configure_fastq(self, cfg): 75 | """ 76 | Set up the object's FASTQ_ file handling and filtering options. 77 | """ 78 | try: 79 | self.reads = cfg["fastq"]["reads"] 80 | self.reverse_complement_reads = cfg["fastq"]["reverse"] 81 | 82 | if "start" in cfg["fastq"]: 83 | self.trim_start = cfg["fastq"]["start"] 84 | 85 | if "length" in cfg["fastq"]: 86 | self.trim_length = cfg["fastq"]["length"] 87 | 88 | self.filters = cfg["fastq"]["filters"] 89 | except KeyError as key: 90 | raise KeyError("Missing required config value {}".format(key), self.name) 91 | 92 | def serialize_fastq(self): 93 | """ 94 | Serialize this object's FASTQ_ file handling and filtering options. 
95 | """ 96 | fastq = { 97 | "reads": self.reads, 98 | "reverse": self.reverse_complement_reads, 99 | "filters": self.serialize_filters(), 100 | } 101 | if self.trim_start > 1: 102 | fastq["start"] = self.trim_start 103 | 104 | if self.trim_length < sys.maxsize: 105 | fastq["length"] = self.trim_length 106 | 107 | return fastq 108 | 109 | def counts_from_reads(self): 110 | """ 111 | Reads the forward or reverse FASTQ_ file (reverse reads are 112 | reverse-complemented), performs quality-based filtering, and counts 113 | the barcodes. 114 | 115 | Barcode counts after read-level filtering are stored under 116 | ``"/raw/barcodes/counts"``. 117 | """ 118 | df_dict = dict() 119 | 120 | filter_flags = dict() 121 | for key in self.filters: 122 | filter_flags[key] = False 123 | 124 | # count all the barcodes 125 | self.logger.info("Counting barcodes") 126 | with open_compressed(self.reads) as handle: 127 | for fq in parse_fastq_reads(handle): 128 | fq.trim(start=self.trim_start, end=self.trim_start + self.trim_length -1) 129 | if self.reverse_complement_reads: 130 | fq.reverse_complement() 131 | 132 | if self.read_quality_filter(fq): # passed filtering 133 | try: 134 | df_dict[fq.sequence.upper()] += 1 135 | except KeyError: 136 | df_dict[fq.sequence.upper()] = 1 137 | 138 | self.save_counts("barcodes", df_dict, raw=True) 139 | del df_dict 140 | 141 | def calculate(self): 142 | """ 143 | Counts the barcodes from the FASTQ file or from the provided counts 144 | file depending on the config. 145 | 146 | Barcodes that pass the minimum count 147 | filtering are stored under ``"/main/barcodes/counts"``. 148 | 149 | If ``"/main/barcodes/counts"`` already exists, those will be used 150 | instead of re-counting. 151 | """ 152 | if self.check_store("/main/barcodes/counts"): 153 | return 154 | 155 | # no raw counts present 156 | if not self.check_store("/raw/barcodes/counts"): 157 | if self.counts_file is not None: 158 | self.counts_from_file(self.counts_file) 159 | else: 160 | self.counts_from_reads() 161 | 162 | if len(self.labels) == 1: # only barcodes 163 | self.save_filtered_counts("barcodes", "count >= self.barcode_min_count") 164 | self.save_filter_stats() 165 | -------------------------------------------------------------------------------- /enrich2/barcodeid.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .seqlib import SeqLib 3 | from .barcode import BarcodeSeqLib 4 | from .barcodemap import BarcodeMap 5 | import pandas as pd 6 | from .plots import barcodemap_plot 7 | from matplotlib.backends.backend_pdf import PdfPages 8 | import os.path 9 | 10 | 11 | class BcidSeqLib(BarcodeSeqLib): 12 | """ 13 | Class for counting data from barcoded sequencing libraries with non-variant 14 | identifiers. 15 | Creating a :py:class:`BcidSeqLib` requires a valid *config* 16 | object with an ``'barcodes'`` entry and information. 17 | 18 | The ``barcode_map`` keyword argument can be used to pass an existing 19 | :py:class:`~seqlib.barcodemap.BarcodeMap`. Ensuring this is the 20 | right :py:class:`~seqlib.barcodemap.BarcodeMap` is the responsibility 21 | of the caller. 
22 | """ 23 | 24 | treeview_class_name = "Barcoded ID SeqLib" 25 | 26 | def __init__(self): 27 | BarcodeSeqLib.__init__(self) 28 | self.barcode_map = None 29 | self.identifier_min_count = 0 30 | self.add_label("identifiers") 31 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 32 | 33 | def configure(self, cfg, barcode_map=None): 34 | """ 35 | Set up the object using the config object *cfg*, usually derived from 36 | a ``.json`` file. 37 | """ 38 | BarcodeSeqLib.configure(self, cfg) 39 | self.logger = logging.getLogger( 40 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 41 | ) 42 | try: 43 | if "min count" in cfg["identifiers"]: 44 | self.identifier_min_count = int(cfg["identifiers"]["min count"]) 45 | 46 | if barcode_map is not None: 47 | if barcode_map.filename == cfg["barcodes"]["map file"]: 48 | self.barcode_map = barcode_map 49 | else: 50 | raise ValueError( 51 | "Attempted to assign non-matching barcode map [{}]".format( 52 | self.name 53 | ) 54 | ) 55 | else: 56 | self.barcode_map = BarcodeMap( 57 | cfg["barcodes"]["map file"], is_variant=False 58 | ) 59 | except KeyError as key: 60 | raise KeyError( 61 | "Missing required config value {key} [{name}]".format( 62 | key=key, name=self.name 63 | ) 64 | ) 65 | 66 | def serialize(self): 67 | """ 68 | Format this object (and its children) as a config object suitable for dumping to a config file. 69 | """ 70 | cfg = BarcodeSeqLib.serialize(self) 71 | 72 | cfg["identifiers"] = dict() 73 | if self.identifier_min_count > 0: 74 | cfg["identifiers"]["min count"] = self.identifier_min_count 75 | 76 | if self.barcode_map is not None: # required for creating new objects in GUI 77 | cfg["barcodes"]["map file"] = self.barcode_map.filename 78 | 79 | return cfg 80 | 81 | def calculate(self): 82 | """ 83 | Counts the barcodes using :py:meth:`BarcodeSeqLib.count` and combines them into 84 | identifier counts using the :py:class:`BarcodeMap`. 
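        For example, if two barcodes map to the same identifier and have
        counts of 10 and 5 after barcode-level filtering, that identifier's
        count is 15; identifiers whose totals fall below the configured
        ``min count`` are dropped before the counts are stored (numbers are
        illustrative).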
85 | """ 86 | if not self.check_store("/main/identifiers/counts"): 87 | BarcodeSeqLib.calculate(self) # count the barcodes 88 | df_dict = dict() 89 | barcode_identifiers = dict() 90 | 91 | self.logger.info("Converting barcodes to identifiers") 92 | # store mapped barcodes 93 | self.save_filtered_counts( 94 | "barcodes", 95 | "index in self.barcode_map.keys() & count >= self.barcode_min_count", 96 | ) 97 | 98 | # count identifiers associated with the barcodes 99 | for bc, count in self.store["/main/barcodes/counts"].iterrows(): 100 | count = count["count"] 101 | identifier = self.barcode_map[bc] 102 | try: 103 | df_dict[identifier] += count 104 | except KeyError: 105 | df_dict[identifier] = count 106 | barcode_identifiers[bc] = identifier 107 | 108 | # save counts, filtering based on the min count 109 | self.save_counts( 110 | "identifiers", 111 | { 112 | k: v 113 | for k, v in df_dict.items() 114 | if v >= self.identifier_min_count 115 | }, 116 | raw=False, 117 | ) 118 | del df_dict 119 | 120 | # write the active subset of the BarcodeMap to the store 121 | barcodes = list(barcode_identifiers.keys()) 122 | barcode_identifiers = pd.DataFrame( 123 | {"value": [barcode_identifiers[bc] for bc in barcodes]}, index=barcodes 124 | ) 125 | del barcodes 126 | barcode_identifiers.sort_values("value", inplace=True) 127 | self.store.put( 128 | "/raw/barcodemap", 129 | barcode_identifiers, 130 | data_columns=barcode_identifiers.columns, 131 | format="table", 132 | ) 133 | del barcode_identifiers 134 | 135 | # self.report_filter_stats() 136 | self.save_filter_stats() 137 | 138 | def make_plots(self): 139 | """ 140 | Make plots for :py:class:`~seqlib.seqlib.BcidSeqLib` objects. 141 | 142 | Creates plot of the number of barcodes mapping to each identifier. 143 | """ 144 | if self.plots_requested: 145 | SeqLib.make_plots(self) 146 | # open the PDF file 147 | pdf = PdfPages(os.path.join(self.plot_dir, "barcodes_per_identifier.pdf")) 148 | barcodemap_plot(self, pdf) 149 | pdf.close() 150 | -------------------------------------------------------------------------------- /enrich2/barcodemap.py: -------------------------------------------------------------------------------- 1 | import re 2 | import gzip 3 | import bz2 4 | import os.path 5 | 6 | re_barcode = re.compile("^[ACGT]+$") 7 | re_variant_dna = re.compile("^[ACGTN]+$") 8 | re_identifier = re.compile("^.+$") 9 | 10 | 11 | class BarcodeMap(dict): 12 | """ 13 | Dictionary-derived class for storing the relationship between barcodes 14 | (keys) and variants (values). Requires the path to a *mapfile*, containing 15 | lines in the format ``'barcodevariant'`` for each barcode 16 | expected in the library. This file can be plain text or compressed 17 | (``.bz2`` or ``.gz``). 18 | 19 | Barcodes must only contain the characters ``ACGT`` and variants must only 20 | contain the characters ``ACGTN`` (lowercase characters are converted to 21 | uppercase). 22 | 23 | Blank lines and lines that begin with ``#`` (comments) are ignored. 24 | 25 | *is_variant* is a boolean that is ``True`` if the barcodes are assigned to 26 | variant DNA sequences, or ``False`` if the barcodes are assigned to 27 | arbitrary identifiers. If this is ``True``, additional error checking 28 | is performed on the variant DNA sequences. 
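    For example, a small map file might contain (sequences are illustrative)::

        # comment lines like this one are ignored
        ACGTACGTACGTACGT    GATGCGTAAGCTCGTA
        TTTTACGTACGTACGT    GATGCGTAAGCTCGTC

    where the first column is the barcode and the second column is the variant
    DNA sequence (or an arbitrary identifier when *is_variant* is ``False``).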
29 | 30 | """ 31 | 32 | def __init__(self, mapfile, is_variant=False): 33 | super(BarcodeMap, self).__init__() 34 | self.name = "barcodemap_{}".format(os.path.basename(mapfile)) 35 | self.filename = mapfile 36 | self.is_variant = is_variant 37 | 38 | # open the file 39 | try: 40 | ext = os.path.splitext(mapfile)[-1].lower() 41 | if ext in (".bz2",): 42 | handle = bz2.BZ2File(mapfile, "r") 43 | elif ext in (".gz",): 44 | handle = gzip.GzipFile(mapfile, "r") 45 | else: 46 | handle = open(mapfile, "r") 47 | except IOError: 48 | raise IOError( 49 | "Could not open barcode map file '{}' [{}]".format(mapfile, self.name) 50 | ) 51 | 52 | # handle each line 53 | for line in handle: 54 | line = line.decode("utf-8") 55 | # skip comments and whitespace-only lines 56 | if len(line.strip()) == 0 or line[0] == "#": 57 | continue 58 | 59 | try: 60 | barcode, value = line.strip().split() 61 | except ValueError: 62 | raise ValueError( 63 | "Unexpected barcode map line format " "[{}]".format(self.name) 64 | ) 65 | 66 | barcode = barcode.upper() 67 | if not re_barcode.match(barcode): 68 | raise ValueError( 69 | "Barcode DNA sequence contains unexpected " 70 | "characters [{}]".format(self.name) 71 | ) 72 | if self.is_variant: 73 | value = value.upper() 74 | if not re_variant_dna.match(value): 75 | raise ValueError( 76 | "Variant DNA sequence contains unexpected" 77 | " characters [{}]".format(self.name) 78 | ) 79 | else: 80 | if not re_identifier.match(value): 81 | raise ValueError( 82 | "Identifier contains unexpected " 83 | "characters [{}]".format(self.name) 84 | ) 85 | 86 | if barcode in self: 87 | if self[barcode] != value: 88 | raise ValueError( 89 | "Barcode '{}' assigned to multiple " 90 | "unique values: {}".format(barcode, self.name) 91 | ) 92 | else: 93 | self[barcode] = value 94 | 95 | handle.close() 96 | -------------------------------------------------------------------------------- /enrich2/barcodevariant.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .seqlib import SeqLib 3 | from .variant import VariantSeqLib 4 | from .barcode import BarcodeSeqLib 5 | from .barcodemap import BarcodeMap 6 | import pandas as pd 7 | from .plots import barcodemap_plot 8 | from matplotlib.backends.backend_pdf import PdfPages 9 | import os.path 10 | 11 | 12 | class BcvSeqLib(VariantSeqLib, BarcodeSeqLib): 13 | """ 14 | Class for counting variant data from barcoded sequencing libraries. 15 | Creating a :py:class:`BcvSeqLib` requires a valid *config* 16 | object with an ``'barcodes'`` entry and information about the wild type 17 | sequence. 18 | 19 | The ``barcode_map`` keyword argument can be used to pass an existing 20 | :py:class:`~seqlib.barcodemap.BarcodeMap`. Ensuring this is the 21 | right :py:class:`~seqlib.barcodemap.BarcodeMap` is the responsibility 22 | of the caller. 23 | """ 24 | 25 | treeview_class_name = "Barcoded Variant SeqLib" 26 | 27 | def __init__(self): 28 | VariantSeqLib.__init__(self) 29 | BarcodeSeqLib.__init__(self) 30 | self.barcode_map = None 31 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 32 | 33 | def configure(self, cfg, barcode_map=None): 34 | """ 35 | Set up the object using the config object *cfg*, usually derived from 36 | a ``.json`` file. 
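        If *barcode_map* is supplied, its filename must match
        ``cfg["barcodes"]["map file"]`` (a mismatch raises ``ValueError``);
        if it is not supplied, a new
        :py:class:`~seqlib.barcodemap.BarcodeMap` is built from that file.
        A minimal ``"barcodes"`` section might look like (path is
        illustrative)::

            "barcodes": {"map file": "barcode_variant_map.txt.gz"}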
37 | """ 38 | VariantSeqLib.configure(self, cfg) 39 | BarcodeSeqLib.configure(self, cfg) 40 | self.logger = logging.getLogger( 41 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 42 | ) 43 | try: 44 | if barcode_map is not None: 45 | if barcode_map.filename == cfg["barcodes"]["map file"]: 46 | self.barcode_map = barcode_map 47 | else: 48 | raise ValueError( 49 | "Attempted to assign non-matching barcode map [{}]".format( 50 | self.name 51 | ) 52 | ) 53 | else: 54 | self.barcode_map = BarcodeMap( 55 | cfg["barcodes"]["map file"], is_variant=True 56 | ) 57 | except KeyError as key: 58 | raise KeyError( 59 | "Missing required config value {key} [{name}]".format( 60 | key=key, name=self.name 61 | ) 62 | ) 63 | 64 | def serialize(self): 65 | """ 66 | Format this object (and its children) as a config object suitable for dumping to a config file. 67 | """ 68 | cfg = VariantSeqLib.serialize(self) 69 | cfg.update(BarcodeSeqLib.serialize(self)) 70 | 71 | if self.barcode_map is not None: # required for creating new objects in GUI 72 | cfg["barcodes"]["map file"] = self.barcode_map.filename 73 | 74 | return cfg 75 | 76 | def calculate(self): 77 | """ 78 | Counts the barcodes using :py:meth:`BarcodeSeqLib.count` and combines them into 79 | variant counts using the :py:class:`BarcodeMap`. 80 | """ 81 | if not self.check_store("/main/variants/counts"): 82 | BarcodeSeqLib.calculate(self) # count the barcodes 83 | df_dict = dict() 84 | barcode_variants = dict() 85 | 86 | self.logger.info("Converting barcodes to variants") 87 | # store mapped barcodes 88 | self.save_filtered_counts( 89 | "barcodes", 90 | "index in self.barcode_map.keys() & count >= self.barcode_min_count", 91 | ) 92 | 93 | # count variants associated with the barcodes 94 | max_mut_barcodes = 0 95 | max_mut_variants = 0 96 | for bc, count in self.store["/main/barcodes/counts"].iterrows(): 97 | count = count["count"] 98 | variant = self.barcode_map[bc] 99 | mutations = self.count_variant(variant) 100 | if mutations is None: # variant has too many mutations 101 | max_mut_barcodes += 1 102 | max_mut_variants += count 103 | if self.report_filtered: 104 | self.report_filtered_variant(variant, count) 105 | else: 106 | try: 107 | df_dict[mutations] += count 108 | except KeyError: 109 | df_dict[mutations] = count 110 | barcode_variants[bc] = mutations 111 | 112 | # save counts, filtering based on the min count 113 | self.save_counts( 114 | "variants", 115 | {k: v for k, v in df_dict.items() if v >= self.variant_min_count}, 116 | raw=False, 117 | ) 118 | del df_dict 119 | 120 | # write the active subset of the BarcodeMap to the store 121 | barcodes = list(barcode_variants.keys()) 122 | barcode_variants = pd.DataFrame( 123 | {"value": [barcode_variants[bc] for bc in barcodes]}, index=barcodes 124 | ) 125 | del barcodes 126 | barcode_variants.sort_values("value", inplace=True) 127 | self.store.put( 128 | "/raw/barcodemap", 129 | barcode_variants, 130 | data_columns=barcode_variants.columns, 131 | format="table", 132 | ) 133 | del barcode_variants 134 | 135 | if self.aligner is not None: 136 | self.logger.info("Aligned {} variants".format(self.aligner.calls)) 137 | self.aligner_cache = None 138 | # self.report_filter_stats() 139 | self.logger.info( 140 | "Removed {} unique barcodes ({} total variants) " 141 | "with excess mutations".format(max_mut_barcodes, max_mut_variants) 142 | ) 143 | self.save_filter_stats() 144 | 145 | self.count_synonymous() 146 | 147 | def make_plots(self): 148 | """ 149 | Make plots for 
:py:class:`~seqlib.seqlib.BcvSeqLib` objects. 150 | 151 | Creates plot of the number of barcodes mapping to each variant. 152 | """ 153 | if self.plots_requested: 154 | SeqLib.make_plots(self) 155 | # open the PDF file 156 | pdf = PdfPages(os.path.join(self.plot_dir, "barcodes_per_variant.pdf")) 157 | barcodemap_plot(self, pdf) 158 | pdf.close() 159 | -------------------------------------------------------------------------------- /enrich2/basic.py: -------------------------------------------------------------------------------- 1 | from .variant import VariantSeqLib 2 | from fqfa import open_compressed, parse_fastq_reads, has_fastq_ext 3 | import logging 4 | import sys 5 | 6 | 7 | class BasicSeqLib(VariantSeqLib): 8 | """ 9 | Class for count data from sequencing libraries with a single read for 10 | each variant. Creating a :py:class:`BasicSeqLib` requires a valid 11 | *config* object, usually from a ``.json`` configuration file. 12 | """ 13 | 14 | treeview_class_name = "Basic SeqLib" 15 | 16 | def __init__(self): 17 | VariantSeqLib.__init__(self) 18 | self.reads = None 19 | self.reverse_complement_reads = False 20 | self.trim_start = 1 21 | self.trim_length = sys.maxsize 22 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 23 | 24 | def configure(self, cfg): 25 | """ 26 | Set up the object using the config object *cfg*, usually derived from 27 | a ``.json`` file. 28 | """ 29 | VariantSeqLib.configure(self, cfg) 30 | self.logger = logging.getLogger( 31 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 32 | ) 33 | 34 | # if counts are specified, copy them later 35 | # else handle the FASTQ config options and check the files 36 | if self.counts_file is None: 37 | self.configure_fastq(cfg) 38 | try: 39 | if not has_fastq_ext(self.reads): 40 | raise IOError( 41 | "FASTQ file error: unrecognized extension " 42 | "[{}]".format(self.name) 43 | ) 44 | except IOError as fqerr: 45 | raise IOError("FASTQ file error [{}]: {}".format(self.name, fqerr)) 46 | 47 | def serialize(self): 48 | """ 49 | Format this object (and its children) as a config object suitable for 50 | dumping to a config file. 51 | """ 52 | cfg = VariantSeqLib.serialize(self) 53 | 54 | cfg["fastq"] = self.serialize_fastq() 55 | 56 | return cfg 57 | 58 | def configure_fastq(self, cfg): 59 | """ 60 | Set up the object's FASTQ_ file handling and filtering options. 61 | """ 62 | try: 63 | self.reads = cfg["fastq"]["reads"] 64 | 65 | if "reverse" in cfg["fastq"]: 66 | self.reverse_complement_reads = cfg["fastq"]["reverse"] 67 | 68 | if "start" in cfg["fastq"]: 69 | self.trim_start = cfg["fastq"]["start"] 70 | 71 | if "length" in cfg["fastq"]: 72 | self.trim_length = cfg["fastq"]["length"] 73 | 74 | self.filters = cfg["fastq"]["filters"] 75 | except KeyError as key: 76 | raise KeyError( 77 | "Missing required config value {key} [{name}]" 78 | "".format(key=key, name=self.name) 79 | ) 80 | 81 | def serialize_fastq(self): 82 | """ 83 | Serialize this object's FASTQ_ file handling and filtering options. 
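        When the FASTQ_ file is read, ``"start"`` and ``"length"`` are applied
        as a trim; for example, ``"start": 3`` with ``"length": 10`` keeps
        bases 3 through 12 (1-based, inclusive) of each read (illustrative
        values).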
84 | """ 85 | fastq = {"filters": self.serialize_filters()} 86 | fastq["reads"] = self.reads 87 | 88 | if self.reverse_complement_reads: 89 | fastq["reverse"] = True 90 | else: 91 | fastq["reverse"] = False 92 | 93 | if self.trim_start > 1: 94 | fastq["start"] = self.trim_start 95 | 96 | if self.trim_length < sys.maxsize: 97 | fastq["length"] = self.trim_length 98 | 99 | return fastq 100 | 101 | def counts_from_reads(self): 102 | """ 103 | Reads the forward or reverse FASTQ_ file (reverse reads are 104 | reverse-complemented), performs quality-based filtering, and counts 105 | the variants. 106 | """ 107 | df_dict = dict() 108 | 109 | self.logger.info("Counting variants") 110 | max_mut_variants = 0 111 | with open_compressed(self.reads) as handle: 112 | for fq in parse_fastq_reads(handle): 113 | fq.trim(start=self.trim_start, end=self.trim_start + self.trim_length -1) 114 | if self.reverse_complement_reads: 115 | fq.reverse_complement() 116 | 117 | if self.read_quality_filter(fq): 118 | mutations = self.count_variant(fq.sequence) 119 | if mutations is None: # too many mutations 120 | max_mut_variants += 1 121 | if self.report_filtered: 122 | self.report_filtered_variant(fq.sequence, 1) 123 | else: 124 | try: 125 | df_dict[mutations] += 1 126 | except KeyError: 127 | df_dict[mutations] = 1 128 | 129 | self.save_counts("variants", df_dict, raw=True) 130 | del df_dict 131 | 132 | if self.aligner is not None: 133 | self.logger.info("Aligned {} variants".format(self.aligner.calls)) 134 | self.aligner_cache = None 135 | self.logger.info( 136 | "Removed {} total variants with excess mutations" 137 | "".format(max_mut_variants) 138 | ) 139 | self.save_filter_stats() 140 | 141 | def calculate(self): 142 | """ 143 | Counts variants from counts file or FASTQ. 144 | """ 145 | if not self.check_store("/main/variants/counts"): 146 | if not self.check_store("/raw/variants/counts"): 147 | if self.counts_file is not None: 148 | self.counts_from_file(self.counts_file) 149 | else: 150 | self.counts_from_reads() 151 | self.save_filtered_counts("variants", "count >= self.variant_min_count") 152 | self.count_synonymous() 153 | -------------------------------------------------------------------------------- /enrich2/condition.py: -------------------------------------------------------------------------------- 1 | from .storemanager import StoreManager 2 | from .selection import Selection 3 | 4 | 5 | class Condition(StoreManager): 6 | """ 7 | Dummy class for experimental conditions within an 8 | :py:class:`~experiment.Experiment`. Required for proper GUI behavior. 9 | """ 10 | 11 | has_store = False # don't create an HDF5 for Conditions 12 | treeview_class_name = "Condition" 13 | 14 | def __init__(self): 15 | StoreManager.__init__(self) 16 | self.selections = list() 17 | 18 | def configure(self, cfg, configure_children=True): 19 | StoreManager.configure(self, cfg) 20 | if configure_children: 21 | if "selections" not in cfg: 22 | raise KeyError( 23 | "Missing required config value {} [{}]".format( 24 | "selections", self.name 25 | ) 26 | ) 27 | 28 | for sel_cfg in cfg["selections"]: 29 | sel = Selection() 30 | sel.configure(sel_cfg) 31 | self.add_child(sel) 32 | 33 | def serialize(self): 34 | """ 35 | Format this object (and its children) as a config object suitable for dumping to a config file. 36 | """ 37 | cfg = StoreManager.serialize(self) 38 | cfg["selections"] = [child.serialize() for child in self.children] 39 | return cfg 40 | 41 | def validate(self): 42 | """ 43 | Calls validate on all child Selections. 
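        Any exception raised by a child's ``validate`` propagates to the
        caller.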
44 | """ 45 | for child in self.children: 46 | child.validate() 47 | 48 | def _children(self): 49 | """ 50 | Method bound to the ``children`` property. Returns a list of all 51 | :py:class:`~selection.Selection` objects belonging to this object, 52 | sorted by name. 53 | """ 54 | return sorted(self.selections, key=lambda x: x.name) 55 | 56 | def add_child(self, child): 57 | """ 58 | Add a :py:class:`~selection.Selection`. 59 | """ 60 | if child.name in self.child_names(): 61 | raise ValueError( 62 | "Non-unique selection name '{}' [{}]".format(child.name, self.name) 63 | ) 64 | child.parent = self 65 | self.selections.append(child) 66 | 67 | def remove_child_id(self, tree_id): 68 | """ 69 | Remove the reference to a :py:class:`~selection.Selection` with 70 | Treeview id *tree_id*. 71 | """ 72 | self.selections = [x for x in self.selections if x.treeview_id != tree_id] 73 | -------------------------------------------------------------------------------- /enrich2/config_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for identifying the type of 3 | :py:class:`~enrich2.storemanager.StoreManager` derived object associated with a 4 | given configuration object (decoded from a JSON file as described `here 5 | `_). 6 | 7 | """ 8 | 9 | 10 | def is_experiment(cfg): 11 | """ 12 | Check if the given configuration object specifies an 13 | :py:class:`~enrich2.experiment.Experiment`. 14 | 15 | Args: 16 | cfg (dict): decoded JSON object 17 | 18 | Returns: 19 | bool: True if `cfg` if specifies an 20 | :py:class:`~enrich2.experiment.Experiment`, else False. 21 | 22 | """ 23 | if "conditions" in list(cfg.keys()): 24 | return True 25 | else: 26 | return False 27 | 28 | 29 | def is_condition(cfg): 30 | """ 31 | Check if the given configuration object specifies a 32 | :py:class:`~enrich2.condition.Condition`. 33 | 34 | Args: 35 | cfg (dict): decoded JSON object 36 | 37 | Returns: 38 | bool: True if `cfg` if specifies a 39 | :py:class:`~enrich2.condition.Condition`, else False. 40 | 41 | """ 42 | if "selections" in list(cfg.keys()): 43 | return True 44 | else: 45 | return False 46 | 47 | 48 | def is_selection(cfg): 49 | """ 50 | Check if the given configuration object specifies a 51 | :py:class:`~enrich2.selection.Selection`. 52 | 53 | Args: 54 | cfg (dict): decoded JSON object 55 | 56 | Returns: 57 | bool: True if `cfg` if specifies a 58 | :py:class:`~enrich2.selection.Selection`, else False. 59 | 60 | """ 61 | if "libraries" in list(cfg.keys()): 62 | return True 63 | else: 64 | return False 65 | 66 | 67 | def is_seqlib(cfg): 68 | """ 69 | Check if the given configuration object specifies a 70 | :py:class:`~enrich2.seqlib.SeqLib` derived object. 71 | 72 | Args: 73 | cfg (dict): decoded JSON object 74 | 75 | Returns: 76 | bool: True if `cfg` if specifies a :py:class:`~enrich2.seqlib.SeqLib` 77 | derived object, else False. 78 | 79 | """ 80 | if "fastq" in list(cfg.keys()) or "identifiers" in list(cfg.keys()): 81 | return True 82 | else: 83 | return False 84 | 85 | 86 | def seqlib_type(cfg): 87 | """ 88 | Get the type of :py:class:`~enrich2.seqlib.SeqLib` derived object 89 | specified by the configuration object. 90 | 91 | Args: 92 | cfg (dict): decoded JSON object 93 | 94 | Returns: 95 | str: The class name of the :py:class:`~seqlib.seqlib.SeqLib` derived 96 | object specified by `cfg`. 97 | 98 | Raises: 99 | ValueError: If the class name cannot be determined. 
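    Examples (key values elided; the mapping follows the checks below)::

        {"barcodes": {"map file": ...}, "variants": {...}}  ->  "BcvSeqLib"
        {"fastq": {...}, "variants": {...}}                 ->  "BasicSeqLib"
        {"fastq": {...}, "identifiers": {...}}              ->  "IdOnlySeqLib"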
100 | 101 | """ 102 | if "barcodes" in cfg: 103 | if "map file" in cfg["barcodes"]: 104 | if "variants" in cfg and "identifiers" in cfg: 105 | raise ValueError("Unable to determine SeqLib type.") 106 | elif "variants" in cfg: 107 | return "BcvSeqLib" 108 | elif "identifiers" in cfg: 109 | return "BcidSeqLib" 110 | else: 111 | raise ValueError("Unable to determine SeqLib type.") 112 | else: 113 | return "BarcodeSeqLib" 114 | elif "overlap" in cfg and "variants" in cfg: 115 | return "OverlapSeqLib" 116 | elif "variants" in cfg: 117 | return "BasicSeqLib" 118 | elif "identifiers" in cfg: 119 | return "IdOnlySeqLib" 120 | else: 121 | raise ValueError("Unable to determine SeqLib type for configuration " "object.") 122 | 123 | 124 | def element_type(cfg): 125 | """ 126 | Get the type of :py:class:`~enrich2.storemanager.StoreManager` derived 127 | object specified by the configuration object. 128 | 129 | Args: 130 | cfg (dict): decoded JSON object 131 | 132 | Returns: 133 | str: The class name of the 134 | :py:class:`~enrich2.storemanager.StoreManager` derived object specified 135 | by `cfg`. 136 | 137 | Raises: 138 | ValueError: If the class name cannot be determined. 139 | 140 | """ 141 | if is_experiment(cfg): 142 | return "Experiment" 143 | elif is_condition(cfg): 144 | return "Condition" 145 | elif is_selection(cfg): 146 | return "Selection" 147 | elif is_seqlib(cfg): 148 | return seqlib_type(cfg) 149 | else: 150 | raise ValueError("Unable to determine type for configuration object.") 151 | -------------------------------------------------------------------------------- /enrich2/constants.py: -------------------------------------------------------------------------------- 1 | #: Variant string for counting wild type sequences 2 | WILD_TYPE_VARIANT = "_wt" 3 | 4 | 5 | #: Variant string for synonymous variants in 'synonymous' DataFrame 6 | SYNONYMOUS_VARIANT = "_sy" 7 | 8 | 9 | #: Standard codon table for translating wild type and variant DNA sequences 10 | CODON_TABLE = { 11 | "TTT": "F", 12 | "TCT": "S", 13 | "TAT": "Y", 14 | "TGT": "C", 15 | "TTC": "F", 16 | "TCC": "S", 17 | "TAC": "Y", 18 | "TGC": "C", 19 | "TTA": "L", 20 | "TCA": "S", 21 | "TAA": "*", 22 | "TGA": "*", 23 | "TTG": "L", 24 | "TCG": "S", 25 | "TAG": "*", 26 | "TGG": "W", 27 | "CTT": "L", 28 | "CCT": "P", 29 | "CAT": "H", 30 | "CGT": "R", 31 | "CTC": "L", 32 | "CCC": "P", 33 | "CAC": "H", 34 | "CGC": "R", 35 | "CTA": "L", 36 | "CCA": "P", 37 | "CAA": "Q", 38 | "CGA": "R", 39 | "CTG": "L", 40 | "CCG": "P", 41 | "CAG": "Q", 42 | "CGG": "R", 43 | "ATT": "I", 44 | "ACT": "T", 45 | "AAT": "N", 46 | "AGT": "S", 47 | "ATC": "I", 48 | "ACC": "T", 49 | "AAC": "N", 50 | "AGC": "S", 51 | "ATA": "I", 52 | "ACA": "T", 53 | "AAA": "K", 54 | "AGA": "R", 55 | "ATG": "M", 56 | "ACG": "T", 57 | "AAG": "K", 58 | "AGG": "R", 59 | "GTT": "V", 60 | "GCT": "A", 61 | "GAT": "D", 62 | "GGT": "G", 63 | "GTC": "V", 64 | "GCC": "A", 65 | "GAC": "D", 66 | "GGC": "G", 67 | "GTA": "V", 68 | "GCA": "A", 69 | "GAA": "E", 70 | "GGA": "G", 71 | "GTG": "V", 72 | "GCG": "A", 73 | "GAG": "E", 74 | "GGG": "G", 75 | } 76 | 77 | 78 | #: Conversions between single- and three-letter amino acid codes 79 | AA_CODES = { 80 | "Ala": "A", 81 | "A": "Ala", 82 | "Arg": "R", 83 | "R": "Arg", 84 | "Asn": "N", 85 | "N": "Asn", 86 | "Asp": "D", 87 | "D": "Asp", 88 | "Cys": "C", 89 | "C": "Cys", 90 | "Glu": "E", 91 | "E": "Glu", 92 | "Gln": "Q", 93 | "Q": "Gln", 94 | "Gly": "G", 95 | "G": "Gly", 96 | "His": "H", 97 | "H": "His", 98 | "Ile": "I", 99 | "I": "Ile", 100 | "Leu": "L", 
101 | "L": "Leu", 102 | "Lys": "K", 103 | "K": "Lys", 104 | "Met": "M", 105 | "M": "Met", 106 | "Phe": "F", 107 | "F": "Phe", 108 | "Pro": "P", 109 | "P": "Pro", 110 | "Ser": "S", 111 | "S": "Ser", 112 | "Thr": "T", 113 | "T": "Thr", 114 | "Trp": "W", 115 | "W": "Trp", 116 | "Tyr": "Y", 117 | "Y": "Tyr", 118 | "Val": "V", 119 | "V": "Val", 120 | "Ter": "*", 121 | "*": "Ter", 122 | "???": "?", 123 | "?": "???", 124 | } 125 | -------------------------------------------------------------------------------- /enrich2/dataframe.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import logging 4 | from .constants import WILD_TYPE_VARIANT 5 | import collections 6 | from .variant import mutation_count, re_protein, re_coding, re_noncoding 7 | from .barcodemap import re_barcode, re_identifier 8 | from .constants import AA_CODES 9 | from .storemanager import ELEMENT_LABELS 10 | from .sfmap import AA_LIST, NT_LIST 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | SingleMut = collections.namedtuple("SingleMut", ["pre", "post", "pos", "key"]) 15 | 16 | 17 | def validate_index(index, element): 18 | """ 19 | Return a boolean list for which index values are valid for the given 20 | element type. 21 | """ 22 | if element not in ELEMENT_LABELS: 23 | raise ValueError("Invalid element label '{}'".format(element)) 24 | 25 | if element == "barcodes": 26 | retval = [re_barcode.match(x) is not None for x in index] 27 | elif element == "identifiers": 28 | retval = [re_identifier.match(x) is not None for x in index] 29 | elif element == "variants": 30 | pass 31 | elif element == "synonymous": 32 | pass 33 | else: 34 | raise NotImplementedError("Unimplemented element type '{}'" "".format(element)) 35 | 36 | 37 | def single_mutation_index(index): 38 | """ 39 | Return a filtered pandas Index containing only single mutations. Filtering 40 | also removes unrecognized amino acids (denoted by ``"???"``) caused by 41 | some indels. 42 | 43 | *index* the index to be filtered for single mutations. 44 | """ 45 | return pd.Index(x for x in index if mutation_count(x) == 1) 46 | 47 | 48 | def filter_coding_index(index): 49 | """ 50 | Return a filtered pandas Index with any unrecognized amino acids (denoted 51 | by ``"???"``) removed. These are caused by some frame shift mutations. 52 | 53 | *index* the index to be filtered. 54 | """ 55 | return pd.Index(x for x in index if "???" not in x) 56 | 57 | 58 | def single_mutations_to_tuples(index): 59 | """ 60 | Return a list of SingleMut namedtuples for each single mutation in the 61 | *index*. The type of index (noncoding DNA, coding DNA, or protein) is 62 | automatically detected. 63 | 64 | Position value in the tuple is stored as an integer. 65 | 66 | If the *index* is a protein index, the amino acids are referred to by 67 | single-letter codes not three-letter codes. 68 | 69 | *index* is the index to convert to SingleMut tuples. 70 | 71 | Raises a ValueError if non-single mutations are included in *index*. 72 | 73 | Raises a ValueError if one of the *index* entries cannot be parsed. 74 | 75 | Raises an IndexError if the *index* is empty. 76 | """ 77 | if any(mutation_count(x) != 1 for x in index): 78 | raise ValueError( 79 | "Non-single mutations cannot be converted into " "SingleMut tuples." 
80 | ) 81 | 82 | # identify the type of index 83 | try: 84 | if re_noncoding.match(index[0]): 85 | is_protein = False 86 | expression = re_noncoding 87 | elif re_coding.match(index[0]): 88 | is_protein = False 89 | expression = re_coding 90 | elif re_protein.match(index[0]): 91 | is_protein = True 92 | expression = re_protein 93 | else: 94 | raise ValueError("Unrecognized HGVS string.") 95 | except IndexError: 96 | raise IndexError("Cannot convert empty index to tuples.") 97 | 98 | # perform the regular expression matches and create the SingleMut tuples 99 | tuples = list() 100 | for x in index: 101 | m = expression.match(x) 102 | if m is None: 103 | raise ValueError("Unrecognized HGVS string.") 104 | else: 105 | if is_protein: # convert to single-letter amino acid code 106 | tuples.append( 107 | SingleMut( 108 | AA_CODES[m.group("pre")], 109 | AA_CODES[m.group("post")], 110 | int(m.group("pos")), 111 | m.group("match"), 112 | ) 113 | ) 114 | else: 115 | tuples.append( 116 | SingleMut( 117 | m.group("pre"), 118 | m.group("post"), 119 | int(m.group("pos")), 120 | m.group("match"), 121 | ) 122 | ) 123 | 124 | return tuples 125 | 126 | 127 | def fill_position_gaps(positions, gap_size): 128 | """ 129 | Create a list of integer positions with gaps filled in. Used by 130 | :py:func:`singleton_dataframe`. 131 | 132 | Args: 133 | positions (list): integer positions 134 | gap_size (int): maximum length of gap that will be filled 135 | 136 | Returns: 137 | list: sorted list of unique integer positions with gaps filled 138 | """ 139 | if len(positions) == 0: 140 | raise ValueError("Empty positions list.") 141 | 142 | # uniqify and sort 143 | positions = sorted(list(set(positions))) 144 | 145 | # fill in short gaps 146 | fill = set() 147 | for i in range(len(positions) - 1): 148 | delta = positions[i + 1] - positions[i] 149 | if delta > 1 and delta <= gap_size: 150 | fill.update(positions[i] + n + 1 for n in range(delta)) 151 | fill.update(positions) 152 | 153 | return sorted(list(fill)) 154 | 155 | 156 | def singleton_dataframe( 157 | values, wt, gap_size=5, coding=True, plot_wt_score=True, aa_list=AA_LIST 158 | ): 159 | """ 160 | Prepare data for plotting as a sequence-function map. Returns a data frame 161 | suitable for plotting as heat map data and a wild type sequence extracted 162 | from the variant information. 163 | 164 | The type of variants stored is automatically detected, and the index will 165 | be filtered for single mutations. 166 | 167 | The data frame has amino acids or nucleotides as columns and positions with 168 | rows. If there are no mutations at a given position, it will not appear in 169 | the data frame unless this gap is filled with rows containing no data. The 170 | wild type sequence entry for these rows will be blank. 
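    A minimal sketch of typical use, assuming ``scores`` is a |pd_Series| of
    variant scores and ``wt`` is the corresponding wild type object::

        frame, wt_seq = singleton_dataframe(scores, wt, coding=True)
        # frame: rows are positions, columns are amino acids
        # wt_seq: single-letter wild type sequence for those positions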
171 | 172 | Args: 173 | values (|pd_Series|): data values (typically scores or counts) 174 | 175 | wt (WildTypeSequence): wild type for the data 176 | 177 | gap_size (int): maximum length of missing data gap that will be filled 178 | 179 | coding (bool): True for amino acid data, False for nucleotide 180 | 181 | plot_wt_score (bool): True if the wild type positions should have the 182 | wild type score, False if they should be missing 183 | 184 | Returns: 185 | tuple: two-element tuple containing a |pd_DataFrame| filled with the 186 | data values and a list of single-character wild type values 187 | """ 188 | if len(values.index) == 0: 189 | raise ValueError( 190 | "Cannot process an empty data frame [{}]".format(wt.parent_name) 191 | ) 192 | 193 | # save the wild type score for later 194 | if plot_wt_score: 195 | try: 196 | wt_score = values[WILD_TYPE_VARIANT] 197 | except KeyError: 198 | logger.warning("Wild type score not measured, will be missing in " "plots") 199 | wt_score = np.nan 200 | 201 | # select only rows with singleton mutations 202 | values = values[filter_coding_index(single_mutation_index(values.index))] 203 | 204 | # parse out the information from the index 205 | index_tuples = single_mutations_to_tuples(values.index) 206 | 207 | # create and populate the DataFrame 208 | # get sorted, unique list of positions that have a mutation 209 | positions = fill_position_gaps([x.pos for x in index_tuples], gap_size=gap_size) 210 | # initialize the DataFrame 211 | if coding: 212 | columns = aa_list 213 | else: 214 | columns = NT_LIST 215 | frame = pd.DataFrame(np.nan, columns=columns, index=positions) 216 | # populate the DataFrame 217 | for x in index_tuples: 218 | frame.loc[x.pos, x.post] = values.loc[x.key] 219 | 220 | # create a dictionary of position->nucleotide/amino acid 221 | wt_dict = dict(wt.position_tuples(protein=coding)) 222 | 223 | # convert subset of the wild type dictionary into sequence 224 | try: 225 | wt_sequence = "".join(wt_dict[x] for x in positions) 226 | except KeyError: 227 | raise ValueError("Inconsistent wild type positions [{}]".format(wt.parent_name)) 228 | 229 | # double-check that the wild type is consistent with the data frame 230 | for x in index_tuples: 231 | if x.pos in wt_dict: 232 | if x.pre != wt_dict[x.pos]: 233 | raise ValueError( 234 | "Inconsistent wild type sequence [{}]".format(wt.parent_name) 235 | ) 236 | 237 | # add wild type scores if desired 238 | if plot_wt_score: 239 | for p in positions: 240 | frame.loc[p, wt_dict[p]] = wt_score 241 | 242 | return (frame, wt_sequence) 243 | -------------------------------------------------------------------------------- /enrich2/fastqheader.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # Matches FASTQ headers based on the following pattern (modify as needed): 4 | # @:::::: ::: 5 | 6 | # Example: @M02564:876:000000000-L3775:1:1101:16862:1800 1:N:0:TCACTCGA+TAACGGTT 7 | # Sample number contains indexes if they are present. 
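# In this format, the Y/N field after the read number is the filter (chastity) flag:
# 'Y' means the read was filtered out, 'N' means it passed filtering.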
8 | 9 | # See: https://help.basespace.illumina.com/files-used-by-basespace/fastq-files 10 | 11 | # Note: this regex is currently unused since it is not needed to support the legacy chastity filtering feature 12 | new_header_pattern = re.compile( 13 | r""" 14 | @(?P[^:]+): 15 | (?P\d+): 16 | (?P[^:]+): 17 | (?P\d+): 18 | (?P\d+): 19 | (?P\d+): 20 | (?P\d+) 21 | \s 22 | (?P\d+): 23 | (?P[YN]): 24 | (?P[^:]+): 25 | (?P[^:]+) 26 | """, 27 | re.VERBOSE, 28 | ) 29 | 30 | # Matches FASTQ headers based on the following pattern (modify as needed): 31 | # @:::::#/ 32 | old_header_pattern = re.compile( 33 | r""" 34 | @(?P.+): 35 | (?P\d+): 36 | (?P\d+): 37 | (?P\d+): 38 | (?P\d+): 39 | (?P[01])# 40 | (?P\d)/ 41 | (?P\d) 42 | """, 43 | re.VERBOSE, 44 | ) 45 | 46 | def parse_fastq_header(fq, pattern=old_header_pattern): 47 | """Parse the read's FASTQ_ header and return key-value pairs. 48 | 49 | Parses the first FASTQ_ header (@ header) and returns a dictionary. 50 | Dictionary keys are the named groups in the regular expression 51 | *pattern*. Unnamed matches are ignored. Integer values are converted 52 | from strings to integers. 53 | 54 | The default pattern matches a header in the format:: 55 | 56 | @:::::#/ 57 | 58 | """ 59 | match = pattern.match(fq.header) 60 | if match is None: 61 | return None 62 | else: 63 | header_dict = match.groupdict() 64 | for key in header_dict: 65 | if header_dict[key].isdigit(): 66 | header_dict[key] = int(header_dict[key]) 67 | return header_dict 68 | 69 | 70 | def fastq_read_is_chaste(self, raises=True): 71 | """ 72 | Returns ``True`` if the chastity bit is set in the header. The 73 | regular experession used by :py:meth:`header_information` must 74 | include a ``'Chastity'`` match that equals ``1`` if the read is 75 | chaste. 76 | 77 | If ``raises`` is ``True``, raises an informative error if the 78 | chastity information in the header is not found. Otherwise, a 79 | read without chastity information is treated as unchaste. 
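    For example, an old-style header such as
    ``@HWUSI-EAS100R:6:73:941:1973:1#0/1`` (illustrative) carries a chastity
    field of ``1``, so the read is considered chaste; a value of ``0`` would
    mark it as unchaste.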
80 | """ 81 | try: 82 | if self.header_information()["Chastity"] == 1: 83 | return True 84 | else: 85 | return False 86 | except KeyError: # no 'Chastity' in pattern 87 | if raises: 88 | raise KeyError("No chastity bit in FASTQ header pattern") 89 | else: 90 | return False 91 | except TypeError: # no header match (unexpected format) 92 | if raises: 93 | raise ValueError("Unexpected FASTQ header format") 94 | else: 95 | return False 96 | -------------------------------------------------------------------------------- /enrich2/gui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/enrich2/gui/__init__.py -------------------------------------------------------------------------------- /enrich2/gui/create_root_dialog.py: -------------------------------------------------------------------------------- 1 | 2 | import tkinter as tk 3 | import tkinter.ttk 4 | import tkinter.simpledialog 5 | from .dialog_elements import FileEntry, StringEntry, DEFAULT_COLUMNS 6 | from .create_seqlib_dialog import SEQLIB_LABEL_TEXT 7 | from ..barcode import BarcodeSeqLib 8 | from ..barcodevariant import BcvSeqLib 9 | from ..barcodeid import BcidSeqLib 10 | from ..basic import BasicSeqLib 11 | from ..idonly import IdOnlySeqLib 12 | from ..overlap import OverlapSeqLib 13 | from ..selection import Selection 14 | from ..experiment import Experiment 15 | 16 | 17 | #: map class names to class definitions to avoid use of globals() 18 | ELEMENT_CLASSES = { 19 | "BarcodeSeqLib": BarcodeSeqLib, 20 | "BcvSeqLib": BcvSeqLib, 21 | "BcidSeqLib": BcidSeqLib, 22 | "BasicSeqLib": BasicSeqLib, 23 | "IdOnlySeqLib": IdOnlySeqLib, 24 | "OverlapSeqLib": OverlapSeqLib, 25 | "Selection": Selection, 26 | "Experiment": Experiment, 27 | } 28 | 29 | 30 | class CreateRootDialog(tkinter.simpledialog.Dialog): 31 | """ 32 | Dialog box for creating a new root element. 
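    The user chooses Experiment, Selection, or one of the SeqLib types; the
    corresponding class is looked up in ``ELEMENT_CLASSES`` when the dialog is
    applied, and the new object's name and output directory are taken from the
    dialog fields.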
33 | """ 34 | 35 | def __init__(self, parent_window, title="Create Root Object"): 36 | self.element_tkstring = tk.StringVar() 37 | self.cfg_dict = dict() 38 | self.output_directory_tk = FileEntry( 39 | "Output Directory", 40 | self.cfg_dict, 41 | "output directory", 42 | optional=False, 43 | directory=True, 44 | ) 45 | self.name_tk = StringEntry("Name", self.cfg_dict, "name", optional=False) 46 | self.element = None 47 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 48 | 49 | def body(self, master): 50 | row_no = self.name_tk.body(master, 0) 51 | row_no += self.output_directory_tk.body(master, row_no) 52 | 53 | element_types = tkinter.ttk.Frame(master, padding=(3, 3, 12, 12)) 54 | element_types.grid( 55 | column=0, row=row_no, sticky="nsew", columnspan=DEFAULT_COLUMNS 56 | ) 57 | 58 | message = tkinter.ttk.Label(element_types, text="Root object type:") 59 | message.grid(column=0, row=0) 60 | 61 | label = tkinter.ttk.Label(element_types, text="Experiment") 62 | label.grid(column=0, row=1, sticky="w") 63 | rb = tkinter.ttk.Radiobutton( 64 | element_types, 65 | text="Experiment", 66 | variable=self.element_tkstring, 67 | value="Experiment", 68 | ) 69 | rb.grid(column=0, row=2, sticky="w") 70 | rb.invoke() 71 | 72 | label = tkinter.ttk.Label(element_types, text="Selection") 73 | label.grid(column=0, row=3, sticky="w") 74 | rb = tkinter.ttk.Radiobutton( 75 | element_types, 76 | text="Selection", 77 | variable=self.element_tkstring, 78 | value="Selection", 79 | ) 80 | rb.grid(column=0, row=4, sticky="w") 81 | 82 | label = tkinter.ttk.Label(element_types, text="SeqLib") 83 | label.grid(column=0, row=5, sticky="w") 84 | for i, k in enumerate(SEQLIB_LABEL_TEXT.keys()): 85 | rb = tkinter.ttk.Radiobutton( 86 | element_types, 87 | text=SEQLIB_LABEL_TEXT[k], 88 | variable=self.element_tkstring, 89 | value=k, 90 | ) 91 | rb.grid(column=0, row=(i + 6), sticky="w") 92 | 93 | def buttonbox(self): 94 | """ 95 | Display only one button. 
96 | """ 97 | box = tk.Frame(self) 98 | 99 | w = tk.Button(box, text="OK", width=10, command=self.ok, default="active") 100 | w.pack(side="left", padx=5, pady=5) 101 | 102 | self.bind("", self.ok) 103 | 104 | box.pack() 105 | 106 | def validate(self): 107 | # check the fields 108 | return self.output_directory_tk.validate() and self.name_tk.validate() 109 | 110 | def apply(self): 111 | # apply the fields 112 | self.output_directory_tk.apply() 113 | self.name_tk.apply() 114 | 115 | # create the object 116 | try: 117 | self.element = ELEMENT_CLASSES[self.element_tkstring.get()]() 118 | except KeyError: 119 | raise KeyError( 120 | "Unrecognized element type '{}'".format(self.element_tkstring.get()) 121 | ) 122 | 123 | # set the properties from this dialog 124 | self.element.output_dir_override = False 125 | self.element.output_dir = self.cfg_dict["output directory"] 126 | self.element.name = self.cfg_dict["name"] 127 | -------------------------------------------------------------------------------- /enrich2/gui/create_seqlib_dialog.py: -------------------------------------------------------------------------------- 1 | 2 | import tkinter as tk 3 | import tkinter.ttk 4 | import tkinter.simpledialog 5 | from collections import OrderedDict 6 | from ..barcode import BarcodeSeqLib 7 | from ..barcodevariant import BcvSeqLib 8 | from ..barcodeid import BcidSeqLib 9 | from ..basic import BasicSeqLib 10 | from ..idonly import IdOnlySeqLib 11 | from ..overlap import OverlapSeqLib 12 | 13 | 14 | SEQLIB_LABEL_TEXT = OrderedDict( 15 | [ 16 | ("BcvSeqLib", "Barcoded Variant"), 17 | ("BcidSeqLib", "Barcoded Identifier"), 18 | ("OverlapSeqLib", "Overlap"), 19 | ("BasicSeqLib", "Basic"), 20 | ("BarcodeSeqLib", "Barcodes Only"), 21 | ("IdOnlySeqLib", "Identifiers Only"), 22 | ] 23 | ) 24 | 25 | #: map class names to class definitions to avoid use of globals() 26 | SEQLIB_CLASSES = { 27 | "BarcodeSeqLib": BarcodeSeqLib, 28 | "BcvSeqLib": BcvSeqLib, 29 | "BcidSeqLib": BcidSeqLib, 30 | "BasicSeqLib": BasicSeqLib, 31 | "IdOnlySeqLib": IdOnlySeqLib, 32 | "OverlapSeqLib": OverlapSeqLib, 33 | } 34 | 35 | 36 | class CreateSeqLibDialog(tkinter.simpledialog.Dialog): 37 | """ 38 | Dialog box for creating a new SeqLib. 39 | """ 40 | 41 | def __init__(self, parent_window, title="New SeqLib"): 42 | self.element_tkstring = tk.StringVar() 43 | self.element_type = None 44 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 45 | 46 | def body(self, master): 47 | message = tkinter.ttk.Label(master, text="SeqLib type:") 48 | message.grid(column=0, row=0) 49 | 50 | for i, k in enumerate(SEQLIB_LABEL_TEXT.keys()): 51 | rb = tkinter.ttk.Radiobutton( 52 | master, 53 | text=SEQLIB_LABEL_TEXT[k], 54 | variable=self.element_tkstring, 55 | value=k, 56 | ) 57 | rb.grid(column=0, row=(i + 1), sticky="w") 58 | if i == 0: 59 | rb.invoke() 60 | 61 | def buttonbox(self): 62 | """ 63 | Display only one button. 
64 | """ 65 | box = tk.Frame(self) 66 | 67 | w = tk.Button(box, text="OK", width=10, command=self.ok, default="active") 68 | w.pack(side="left", padx=5, pady=5) 69 | 70 | self.bind("", self.ok) 71 | 72 | box.pack() 73 | 74 | def apply(self): 75 | try: 76 | self.element_type = SEQLIB_CLASSES[self.element_tkstring.get()] 77 | except KeyError: 78 | raise KeyError("Unrecognized element type.") 79 | -------------------------------------------------------------------------------- /enrich2/gui/delete_dialog.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import tkinter.ttk 3 | import tkinter.simpledialog 4 | 5 | 6 | def subtree_ids(treeview, x, level=0): 7 | """ 8 | Return a list of tuples containing the ids and levels for *x* and every element below it in the Treeview *treeview*. 9 | 10 | The level of *x* is 0, children of *x* are 1, and so forth. 11 | """ 12 | id_list = list() 13 | id_list.append((x, level)) 14 | for y in treeview.get_children(x): 15 | id_list.extend(subtree_ids(treeview, y, level + 1)) 16 | return id_list 17 | 18 | 19 | class DeleteDialog(tkinter.simpledialog.Dialog): 20 | """ 21 | Confirmation dialog box for deleting the selected items from the Treeview. 22 | """ 23 | 24 | def __init__(self, parent_window, tree, title="Confirm Deletion"): 25 | self.tree = tree 26 | self.id_tuples = list() 27 | for x in self.tree.treeview.selection(): 28 | if x not in [y[0] for y in self.id_tuples]: 29 | self.id_tuples.extend(subtree_ids(self.tree.treeview, x)) 30 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 31 | 32 | def body(self, master): 33 | """ 34 | Generates the required text listing all elements that will be deleted. 35 | 36 | Displays the "OK" and "Cancel" buttons. 37 | """ 38 | if len(self.id_tuples) == 0: 39 | message_string = "No elements selected." 40 | elif len(self.id_tuples) == 1: 41 | message_string = 'Delete "{}"?'.format( 42 | self.tree.get_element(self.id_tuples[0][0]).name 43 | ) 44 | else: 45 | message_string = "Delete the following items?\n" 46 | for x, level in self.id_tuples: 47 | if level == 0: 48 | bullet = " " + "\u25C6" 49 | else: 50 | bullet = " " * (level + 1) + "\u25C7" 51 | message_string += "{bullet} {name}\n".format( 52 | bullet=bullet, name=self.tree.get_element(x).name 53 | ) 54 | message = tkinter.ttk.Label(master, text=message_string, justify="left") 55 | message.grid(row=0, sticky="w") 56 | 57 | def buttonbox(self): 58 | """ 59 | Display only one button if there's no selection. Otherwise, use the default method to display two buttons. 60 | """ 61 | if len(self.id_tuples) == 0: 62 | box = tk.Frame(self) 63 | 64 | w = tk.Button( 65 | box, text="OK", width=10, command=self.cancel, default="active" 66 | ) 67 | w.pack(side="left", padx=5, pady=5) 68 | 69 | self.bind("", self.cancel) 70 | 71 | box.pack() 72 | else: 73 | tkinter.simpledialog.Dialog.buttonbox(self) 74 | 75 | def apply(self): 76 | """ 77 | Called when the user chooses "OK". Performs the deletion. 
78 | """ 79 | for tree_id, _ in self.id_tuples: 80 | self.tree.delete_element(tree_id) 81 | self.tree.refresh_treeview() 82 | -------------------------------------------------------------------------------- /enrich2/gui/dialog_elements.py: -------------------------------------------------------------------------------- 1 | 2 | import tkinter as tk 3 | import tkinter.ttk 4 | import tkinter.messagebox 5 | import tkinter.filedialog 6 | import os.path 7 | 8 | DEFAULT_COLUMNS = 3 9 | 10 | 11 | class SectionLabel(object): 12 | def __init__(self, text): 13 | self.text = text 14 | 15 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 16 | label = tkinter.ttk.Label(master, text=self.text) 17 | label.grid(row=row, column=0, columnspan=columns, sticky="w") 18 | return 1 19 | 20 | def validate(self): 21 | return True 22 | 23 | def apply(self): 24 | return None 25 | 26 | def enable(self): 27 | pass 28 | 29 | def disable(self): 30 | pass 31 | 32 | 33 | class Checkbox(object): 34 | def __init__(self, text, cfg, key): 35 | self.checkbox = None 36 | self.enabled = True 37 | 38 | self.value = tk.BooleanVar() 39 | self.text = text 40 | self.cfg = cfg 41 | self.key = key 42 | try: 43 | if self.cfg[self.key] not in (True, False): 44 | self.value.set(False) 45 | else: 46 | self.value.set(self.cfg[self.key]) 47 | except KeyError: 48 | self.value.set(False) # default to False 49 | 50 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 51 | """ 52 | Place the required elements using the grid layout method. 53 | 54 | Returns the number of rows taken by this element. 55 | """ 56 | self.checkbox = tkinter.ttk.Checkbutton(master, text=self.text, variable=self.value) 57 | self.checkbox.grid(row=row, column=0, columnspan=columns, sticky="w") 58 | return 1 59 | 60 | def validate(self): 61 | return True 62 | 63 | def apply(self): 64 | if self.enabled: 65 | self.cfg[self.key] = self.value.get() 66 | else: 67 | self.cfg[self.key] = None 68 | 69 | def enable(self): 70 | self.enabled = True 71 | self.checkbox.state(["!disabled"]) 72 | 73 | def disable(self): 74 | self.enabled = False 75 | self.checkbox.state(["disabled"]) 76 | 77 | 78 | class MyEntry(object): 79 | """ 80 | Base class for labeled Entry fields. 81 | 82 | *text* is the Label/error box text. 83 | """ 84 | 85 | def __init__(self, text, cfg, key, optional=False): 86 | self.entry = None 87 | self.enabled = True 88 | 89 | self.value = tk.StringVar() 90 | self.text = text 91 | self.cfg = cfg 92 | self.key = key 93 | self.optional = optional 94 | try: 95 | if self.cfg[self.key] is None: 96 | self.value.set("") 97 | else: 98 | self.value.set(self.cfg[self.key]) 99 | except KeyError: 100 | self.value.set("") 101 | 102 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 103 | """ 104 | Place the required elements using the grid layout method. 105 | 106 | Returns the number of rows taken by this element. 107 | """ 108 | label = tkinter.ttk.Label(master, text=self.text) 109 | label.grid(row=row, column=0, columnspan=1, sticky="e") 110 | self.entry = tkinter.ttk.Entry(master, textvariable=self.value) 111 | self.entry.grid(row=row, column=1, columnspan=columns - 1, sticky="ew") 112 | return 1 113 | 114 | def validate(self): 115 | """ 116 | Validates the input. Returns ``True`` unless the field is blank and 117 | *optional* is ``False``. 
118 | """ 119 | if not self.enabled: 120 | return True 121 | elif not self.optional and len(self.value.get()) == 0: 122 | tkinter.messagebox.showwarning("", "{} not specified.".format(self.text)) 123 | return False 124 | else: 125 | return True 126 | 127 | def apply(self): 128 | if self.enabled and len(self.value.get()) > 0: 129 | self.cfg[self.key] = self.value.get() 130 | else: 131 | self.cfg[self.key] = None 132 | 133 | def enable(self): 134 | self.enabled = True 135 | self.entry.state(["!disabled"]) 136 | 137 | def disable(self): 138 | self.enabled = False 139 | self.entry.state(["disabled"]) 140 | 141 | 142 | class FileEntry(MyEntry): 143 | """ 144 | Creates a labeled Entry field for a file or directory. 145 | 146 | *text* is the Label/error box text. 147 | *directory* is ``True`` if selecting a directory (instead of a file). 148 | *extensions* is a list of valid file endings 149 | 150 | """ 151 | 152 | def __init__( 153 | self, text, cfg, key, optional=False, directory=False, extensions=None 154 | ): 155 | MyEntry.__init__(self, text, cfg, key, optional) 156 | self.choose = None 157 | self.clear = None 158 | 159 | self.directory = directory 160 | if extensions is not None: 161 | self.extensions = [x.lower() for x in extensions] 162 | else: 163 | self.extensions = None 164 | 165 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 166 | """ 167 | Place the required elements using the grid layout method. 168 | 169 | Returns the number of rows taken by this element. 170 | """ 171 | label = tkinter.ttk.Label(master, text=self.text) 172 | label.grid(row=row, column=0, columnspan=1, sticky="e") 173 | self.entry = tkinter.ttk.Entry(master, textvariable=self.value) 174 | self.entry.grid(row=row, column=1, columnspan=columns - 1, sticky="ew") 175 | if self.directory: 176 | self.choose = tkinter.ttk.Button( 177 | master, 178 | text="Choose...", 179 | command=lambda: self.value.set(tkinter.filedialog.askdirectory()), 180 | ) 181 | else: 182 | self.choose = tkinter.ttk.Button( 183 | master, 184 | text="Choose...", 185 | command=lambda: self.value.set(tkinter.filedialog.askopenfilename()), 186 | ) 187 | self.choose.grid(row=row + 1, column=1, sticky="w") 188 | if self.optional: 189 | self.clear = tkinter.ttk.Button( 190 | master, text="Clear", command=lambda: self.value.set("") 191 | ) 192 | self.clear.grid(row=row + 1, column=2, sticky="e") 193 | return 2 194 | 195 | def validate(self): 196 | if not self.enabled: 197 | return True 198 | elif len(self.value.get()) == 0: 199 | if not self.optional: 200 | tkinter.messagebox.showwarning("", "{} not specified.".format(self.text)) 201 | return False 202 | else: 203 | return True 204 | else: 205 | if os.path.exists(self.value.get()): 206 | if self.extensions is not None: 207 | if any( 208 | self.value.get().lower().endswith(x) for x in self.extensions 209 | ): 210 | return True 211 | else: 212 | tkinter.messagebox.showwarning( 213 | "", "Invalid file extension " "for {}.".format(self.text) 214 | ) 215 | return False 216 | else: # no extension restriction 217 | return True 218 | else: 219 | tkinter.messagebox.showwarning( 220 | "", "{} file does not exist." 
"".format(self.text) 221 | ) 222 | return False 223 | 224 | def enable(self): 225 | self.enabled = True 226 | self.entry.state(["!disabled"]) 227 | self.choose.state(["!disabled"]) 228 | if self.optional: 229 | self.clear.state(["!disabled"]) 230 | 231 | def disable(self): 232 | self.enabled = False 233 | self.entry.state(["disabled"]) 234 | self.choose.state(["disabled"]) 235 | if self.optional: 236 | self.clear.state(["disabled"]) 237 | 238 | 239 | class StringEntry(MyEntry): 240 | """ 241 | Creates a labeled Entry field for a string. 242 | 243 | *text* is the Label/error box text. 244 | """ 245 | 246 | def __init__(self, text, cfg, key, optional=False): 247 | MyEntry.__init__(self, text, cfg, key, optional) 248 | 249 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 250 | """ 251 | Place the required elements using the grid layout method. 252 | 253 | Returns the number of rows taken by this element. 254 | """ 255 | label = tkinter.ttk.Label(master, text=self.text) 256 | label.grid(row=row, column=0, columnspan=1, sticky="e") 257 | self.entry = tkinter.ttk.Entry(master, textvariable=self.value) 258 | self.entry.grid(row=row, column=1, columnspan=columns - 1, sticky="ew") 259 | return 1 260 | 261 | 262 | class IntegerEntry(MyEntry): 263 | """ 264 | Creates a labeled Entry field for an integer. 265 | 266 | *text* is the Label/error box text. 267 | """ 268 | 269 | def __init__(self, text, cfg, key, optional=False, minvalue=0): 270 | MyEntry.__init__(self, text, cfg, key, optional) 271 | self.minvalue = minvalue 272 | 273 | def body(self, master, row, columns=DEFAULT_COLUMNS, width=4, left=False, **kwargs): 274 | """ 275 | Add the labeled entry to the Frame *master* using grid at *row*. 276 | 277 | *width* controls the width of the Entry. 278 | *left* is ``True`` if the Entry is to the left of the Label. 279 | *columns* is the number of columns in *master*. 280 | 281 | Returns the number of rows taken by this element. 282 | """ 283 | if left: 284 | entry_column = 0 285 | entry_sticky = "e" 286 | entry_width = 1 287 | label_column = 1 288 | label_sticky = "w" 289 | label_width = columns - 1 290 | else: 291 | entry_column = 1 292 | entry_sticky = "w" 293 | entry_width = columns - 1 294 | label_column = 0 295 | label_sticky = "e" 296 | label_width = 1 297 | 298 | label = tkinter.ttk.Label(master, text=self.text) 299 | label.grid( 300 | row=row, column=label_column, columnspan=label_width, sticky=label_sticky 301 | ) 302 | self.entry = tkinter.ttk.Entry(master, textvariable=self.value, width=width) 303 | self.entry.grid( 304 | row=row, column=entry_column, columnspan=entry_width, sticky=entry_sticky 305 | ) 306 | return 1 307 | 308 | def validate(self): 309 | """ 310 | Returns ``True`` if the value entered validates; else ``False``. 311 | 312 | If *self.optional* is ``True``, the field can be empty. 313 | Checks the *self.minvalue* that was passed on creation. 314 | """ 315 | if not self.enabled: 316 | return True 317 | else: 318 | try: 319 | intvalue = int(self.value.get()) 320 | except ValueError: 321 | if len(self.value.get()) == 0: 322 | if not self.optional: 323 | tkinter.messagebox.showwarning( 324 | "", "{} not specified." "".format(self.text) 325 | ) 326 | return False 327 | else: 328 | return True 329 | else: 330 | tkinter.messagebox.showwarning( 331 | "", "{} is not an integer." 
"".format(self.text) 332 | ) 333 | return False 334 | else: 335 | if intvalue < self.minvalue: 336 | tkinter.messagebox.showwarning( 337 | "", 338 | "{} lower than minimum value " 339 | "({}).".format(self.text, self.minvalue), 340 | ) 341 | return False 342 | else: 343 | return True 344 | 345 | def apply(self): 346 | if self.enabled and len(self.value.get()) > 0: 347 | self.cfg[self.key] = int(self.value.get()) 348 | else: 349 | self.cfg[self.key] = None 350 | -------------------------------------------------------------------------------- /enrich2/gui/runner_window.py: -------------------------------------------------------------------------------- 1 | 2 | import tkinter as tk 3 | import tkinter.ttk 4 | import tkinter.simpledialog 5 | import tkinter.messagebox 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class RunnerSavePrompt(tkinter.simpledialog.Dialog): 12 | """ 13 | Dialog box for prompting the user to save before running. 14 | """ 15 | 16 | def __init__(self, parent_window, title="Enrich2"): 17 | self.pw = parent_window 18 | 19 | self.dialog_text = tk.StringVar() 20 | self.dialog_text.set("Would you like to save your config changes?") 21 | 22 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 23 | 24 | def body(self, master): 25 | frame = tkinter.ttk.Frame(master, padding=(12, 6, 12, 6)) 26 | frame.pack() 27 | 28 | dialog_text_label = tkinter.ttk.Label(frame, textvariable=self.dialog_text) 29 | dialog_text_label.grid(column=0, row=0, sticky="nsew") 30 | 31 | def apply(self): 32 | self.pw.menu_save() 33 | 34 | 35 | class RunnerWindow(tkinter.simpledialog.Dialog): 36 | """ 37 | Dialog box for blocking input while running the analysis. 38 | """ 39 | 40 | def __init__(self, parent_window, title="Enrich2"): 41 | self.pw = parent_window 42 | self.run_button = None 43 | 44 | self.dialog_text = tk.StringVar() 45 | self.dialog_text.set("Ready to start analysis...") 46 | 47 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 48 | 49 | def body(self, master): 50 | frame = tkinter.ttk.Frame(master, padding=(12, 6, 12, 6)) 51 | frame.pack() 52 | 53 | dialog_text_label = tkinter.ttk.Label(frame, textvariable=self.dialog_text) 54 | dialog_text_label.grid(column=0, row=0, sticky="nsew") 55 | 56 | self.run_button = tk.Button( 57 | frame, text="Begin", width=10, command=self.runner, default="active" 58 | ) 59 | self.run_button.grid(column=0, row=1, sticky="nsew") 60 | 61 | def buttonbox(self): 62 | """ 63 | Display no buttons. 
64 | """ 65 | pass 66 | 67 | def runner(self): 68 | # gray out the "Run" button 69 | self.run_button.config(state="disabled") 70 | self.update_idletasks() 71 | 72 | # set the analysis options 73 | self.pw.root_element.force_recalculate = self.pw.force_recalculate.get() 74 | self.pw.root_element.component_outliers = self.pw.component_outliers.get() 75 | self.pw.root_element.scoring_method = self.pw.scoring_method.get() 76 | self.pw.root_element.logr_method = self.pw.logr_method.get() 77 | self.pw.root_element.plots_requested = self.pw.plots_requested.get() 78 | self.pw.root_element.tsv_requested = self.pw.tsv_requested.get() 79 | 80 | # run the analysis, catching any errors to display in a dialog box 81 | try: 82 | # ensure that all objects are valid 83 | self.pw.root_element.validate() 84 | 85 | # open HDF5 files for the root and all child objects 86 | self.pw.root_element.store_open(children=True) 87 | 88 | # perform the analysis 89 | self.pw.root_element.calculate() 90 | 91 | except Exception as e: 92 | # display error 93 | logger.error(e) 94 | tkinter.messagebox.showerror( 95 | "Enrich2 Error", "Enrich2 encountered an error:\n{}".format(e) 96 | ) 97 | 98 | else: 99 | # no exception occurred during calculation and setup 100 | # generate desired output 101 | if self.pw.plots_requested.get(): 102 | try: 103 | self.pw.root_element.make_plots() 104 | except Exception as e: 105 | tkinter.messagebox.showwarning( 106 | None, 107 | "Calculations completed, but plotting failed:\n{}".format(e), 108 | ) 109 | if self.pw.tsv_requested.get(): 110 | try: 111 | self.pw.root_element.write_tsv() 112 | except Exception as e: 113 | tkinter.messagebox.showwarning( 114 | None, 115 | "Calculations completed, but tsv output failed:\n{}".format(e), 116 | ) 117 | 118 | # show the dialog box 119 | tkinter.messagebox.showinfo("", "Analysis completed.") 120 | 121 | finally: 122 | # close the HDF5 files 123 | self.pw.root_element.store_close(children=True) 124 | 125 | # close this window 126 | self.destroy() 127 | -------------------------------------------------------------------------------- /enrich2/gui/seqlib_apply_dialog.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import tkinter.ttk 3 | import tkinter.simpledialog 4 | 5 | 6 | class SeqLibApplyDialog(tkinter.simpledialog.Dialog): 7 | """ 8 | Confirmation dialog box for applying FASTQ filtering options to selected SeqLibs from the Treeview. 9 | """ 10 | 11 | def __init__( 12 | self, parent_window, tree, source_id, title="Confirm Filtering Changes" 13 | ): 14 | self.tree = tree 15 | self.source_id = source_id 16 | self.target_ids = [ 17 | x 18 | for x in self.tree.treeview.selection() 19 | if x != source_id 20 | and type(self.tree.get_element(self.source_id)) 21 | == type(self.tree.get_element(x)) 22 | ] 23 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 24 | 25 | def body(self, master): 26 | """ 27 | Generates the required text listing all SeqLibs that will have their FASTQ options updated. 28 | 29 | Displays the "OK" and "Cancel" buttons. 30 | """ 31 | if len(self.target_ids) == 0: 32 | message_string = "No elegible SeqLibs selected." 
33 | elif len(self.target_ids) == 1: 34 | message_string = 'Apply FASTQ filtering options from "{}" to "{}"?'.format( 35 | self.tree.get_element(self.source_id).name, 36 | self.tree.get_element(self.target_ids[0]).name, 37 | ) 38 | else: 39 | bullet = " " + "\u25C6" 40 | message_string = 'Apply FASTQ filtering options from "{}" to the following?\n'.format( 41 | self.tree.get_element(self.source_id).name 42 | ) 43 | for x in self.target_ids: 44 | message_string += "{bullet} {name}\n".format( 45 | bullet=bullet, name=self.tree.get_element(x).name 46 | ) 47 | message = tkinter.ttk.Label(master, text=message_string, justify="left") 48 | message.grid(row=0, sticky="w") 49 | 50 | def buttonbox(self): 51 | """ 52 | Display only one button if there's no selection. Otherwise, use the default method to display two buttons. 53 | """ 54 | if len(self.target_ids) == 0: 55 | box = tk.Frame(self) 56 | 57 | w = tk.Button( 58 | box, text="OK", width=10, command=self.cancel, default="active" 59 | ) 60 | w.pack(side="left", padx=5, pady=5) 61 | 62 | self.bind("<Return>", self.cancel) 63 | 64 | box.pack() 65 | else: 66 | tkinter.simpledialog.Dialog.buttonbox(self) 67 | 68 | def apply(self): 69 | """ 70 | Called when the user chooses "OK". Performs the FASTQ filtering update. 71 | """ 72 | filter_cfg = self.tree.get_element(self.source_id).serialize_filters() 73 | for x in self.target_ids: 74 | self.tree.get_element(x).filters = filter_cfg 75 | self.tree.refresh_treeview() 76 | -------------------------------------------------------------------------------- /enrich2/idonly.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .seqlib import SeqLib 3 | 4 | 5 | class IdOnlySeqLib(SeqLib): 6 | """ 7 | Class for counting data with non-variant identifiers and no associated 8 | FASTQ_ data. 9 | """ 10 | 11 | treeview_class_name = "ID-only SeqLib" 12 | 13 | def __init__(self): 14 | SeqLib.__init__(self) 15 | self.identifier_min_count = 0 16 | self.add_label("identifiers") 17 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 18 | 19 | def configure(self, cfg): 20 | """ 21 | Set up the object using the config object *cfg*, usually derived from 22 | a ``.json`` file. 23 | """ 24 | SeqLib.configure(self, cfg) 25 | self.logger = logging.getLogger( 26 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 27 | ) 28 | try: 29 | if "min count" in cfg["identifiers"]: 30 | self.identifier_min_count = int(cfg["identifiers"]["min count"]) 31 | except KeyError as key: 32 | raise KeyError( 33 | "Missing required config value {key} [{name}]" 34 | "".format(key=key, name=self.name) 35 | ) 36 | 37 | def serialize(self): 38 | """ 39 | Format this object (and its children) as a config object suitable for 40 | dumping to a config file. 41 | """ 42 | cfg = SeqLib.serialize(self) 43 | 44 | cfg["identifiers"] = dict() 45 | if self.identifier_min_count > 0: 46 | cfg["identifiers"]["min count"] = self.identifier_min_count 47 | 48 | return cfg 49 | 50 | def calculate(self): 51 | """ 52 | Get the identifier counts from the counts file.
53 | """ 54 | if not self.check_store("/main/identifiers/counts"): 55 | if self.counts_file is not None: 56 | self.counts_from_file(self.counts_file) 57 | else: 58 | raise ValueError("Missing counts file [{}]".format(self.name)) 59 | self.save_filtered_counts( 60 | "identifiers", "count >= self.identifier_min_count" 61 | ) 62 | -------------------------------------------------------------------------------- /enrich2/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | 4 | from argparse import ArgumentParser, RawDescriptionHelpFormatter 5 | import logging 6 | import json 7 | import sys 8 | import platform 9 | import os.path 10 | 11 | if platform.system() == "Darwin": 12 | # Explicitly set the backend to avoid the NSInvalidArgumentException when 13 | # running in GUI mode. Advanced users who want to use another matplotlib 14 | # backend when running in MacOS on the command line can modify this section 15 | # accordingly. 16 | import matplotlib 17 | 18 | matplotlib.use("TkAgg") 19 | elif os.path.exists("/.dockerenv"): 20 | # Explicitly set the backend for running inside Docker. This may fail for 21 | # older versions of docker or alternative containerization tools such as 22 | # Singularity. 23 | import matplotlib 24 | 25 | matplotlib.use("Agg") 26 | import enrich2.config_check as config_check 27 | from enrich2.experiment import Experiment 28 | from enrich2.selection import Selection 29 | from enrich2.barcode import BarcodeSeqLib 30 | from enrich2.barcodeid import BcidSeqLib 31 | from enrich2.barcodevariant import BcvSeqLib 32 | from enrich2.basic import BasicSeqLib 33 | from enrich2.overlap import OverlapSeqLib 34 | from enrich2.idonly import IdOnlySeqLib 35 | from enrich2.storemanager import SCORING_METHODS, LOGR_METHODS 36 | from enrich2.gui.configurator import Configurator 37 | from enrich2.sfmap import parse_aa_list 38 | from enrich2 import __version__ 39 | 40 | 41 | #: Name of the driver script. Used for logging output. 42 | DRIVER_NAME = os.path.basename(sys.argv[0]) 43 | 44 | 45 | #: Format string for log entries (console or file). 46 | LOG_FORMAT = "%(asctime)-15s [%(name)s] %(message)s" 47 | 48 | #: Default log level 49 | LOG_LEVEL = logging.INFO 50 | 51 | 52 | def main_gui(): 53 | """ 54 | Entry point for GUI. 55 | 56 | """ 57 | logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) 58 | app = Configurator(__version__) 59 | app.mainloop() 60 | 61 | 62 | def main_cmd(): 63 | """ 64 | Entry point for command line. 
65 | 66 | """ 67 | # build description string based on available methods 68 | desc_string = ( 69 | "Command-line driver for Enrich2 v{}".format(__version__) 70 | + "\n\nscoring methods:\n" 71 | + "\n".join([" {:22}{}".format(k, v) for k, v in list(SCORING_METHODS.items())]) 72 | + "\n\nlog ratio methods:\n" 73 | + "\n".join([" {:22}{}".format(k, v) for k, v in list(LOGR_METHODS.items())]) 74 | ) 75 | 76 | # create parser and add description 77 | parser = ArgumentParser( 78 | prog="Enrich2", 79 | description=desc_string, 80 | formatter_class=RawDescriptionHelpFormatter, 81 | ) 82 | 83 | # add command line arguments 84 | parser.add_argument("config", help="JSON configuration file") 85 | parser.add_argument( 86 | "scoring_method", help="scoring method", choices=list(SCORING_METHODS.keys()) 87 | ) 88 | parser.add_argument( 89 | "logr_method", help="log ratio method", choices=list(LOGR_METHODS.keys()) 90 | ) 91 | 92 | # add support for semantic version checking 93 | parser.add_argument( 94 | "--version", action="version", version="{}".format(__version__) 95 | ) 96 | 97 | # add analysis options 98 | parser.add_argument( 99 | "--log", metavar="FILE", dest="log_file", help="path to log file" 100 | ) 101 | parser.add_argument( 102 | "--no-plots", 103 | dest="plots_requested", 104 | action="store_false", 105 | default=True, 106 | help="don't make plots", 107 | ) 108 | parser.add_argument( 109 | "--no-tsv", 110 | dest="tsv_requested", 111 | action="store_false", 112 | default=True, 113 | help="don't generate tsv files", 114 | ) 115 | parser.add_argument( 116 | "--recalculate", 117 | dest="force_recalculate", 118 | action="store_true", 119 | default=False, 120 | help="force recalculation", 121 | ) 122 | parser.add_argument( 123 | "--component-outliers", 124 | dest="component_outliers", 125 | action="store_true", 126 | default=False, 127 | help="calculate component outlier stats", 128 | ) 129 | parser.add_argument( 130 | "--output-dir", 131 | metavar="DIR", 132 | dest="output_dir_override", 133 | help="override the config file's output directory", 134 | ) 135 | parser.add_argument( 136 | "--sfmap-aa-file", 137 | metavar="FILE", 138 | dest="sfmap_aa_file", 139 | help="amino acid groups for sequence-function maps", 140 | ) 141 | 142 | args = parser.parse_args() 143 | 144 | # start the logs 145 | if args.log_file is not None: 146 | # Create directory if it doesn't exist 147 | log_dir = os.path.dirname(args.log_file) 148 | if not os.path.exists(log_dir): 149 | os.makedirs(log_dir) 150 | logging.basicConfig(filename=args.log_file, encoding='utf-8', level=LOG_LEVEL, format=LOG_FORMAT) 151 | else: 152 | logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) 153 | logger = logging.getLogger(__name__) 154 | 155 | # read the JSON file 156 | try: 157 | cfg = json.load(open(args.config, "r")) 158 | except IOError: 159 | raise IOError("Failed to open '{}' [{}]".format(args.config, DRIVER_NAME)) 160 | except ValueError: 161 | raise ValueError("Improperly formatted .json file [{}]".format(DRIVER_NAME)) 162 | 163 | # identify config file type and create the object 164 | if config_check.is_experiment(cfg): 165 | logger.info("Detected an Experiment config file") 166 | obj = Experiment() 167 | elif config_check.is_selection(cfg): 168 | logger.info("Detected a Selection config file") 169 | obj = Selection() 170 | elif config_check.is_seqlib(cfg): 171 | seqlib_type = config_check.seqlib_type(cfg) 172 | logger.info("Detected a %s config file", seqlib_type) 173 | if seqlib_type == "BarcodeSeqLib": 174 | obj = 
BarcodeSeqLib() 175 | elif seqlib_type == "BcidSeqLib": 176 | obj = BcidSeqLib() 177 | elif seqlib_type == "BcvSeqLib": 178 | obj = BcvSeqLib() 179 | elif seqlib_type == "BasicSeqLib": 180 | obj = BasicSeqLib() 181 | elif seqlib_type == "OverlapSeqLib": 182 | obj = OverlapSeqLib() 183 | elif seqlib_type == "IdOnlySeqLib": 184 | obj = IdOnlySeqLib() 185 | else: 186 | raise ValueError( 187 | "Unrecognized SeqLib type '{}' [{}]".format(seqlib_type, DRIVER_NAME) 188 | ) 189 | else: 190 | raise ValueError("Unrecognized .json config [{}]".format(DRIVER_NAME)) 191 | 192 | # set analysis options 193 | obj.force_recalculate = args.force_recalculate 194 | obj.component_outliers = args.component_outliers 195 | obj.scoring_method = args.scoring_method 196 | obj.logr_method = args.logr_method 197 | obj.plots_requested = args.plots_requested 198 | obj.tsv_requested = args.tsv_requested 199 | 200 | if args.output_dir_override is not None: 201 | obj.output_dir_override = True 202 | obj.output_dir = args.output_dir_override 203 | else: 204 | obj.output_dir_override = False 205 | 206 | if args.sfmap_aa_file is not None: 207 | obj.plot_options = dict() 208 | obj.plot_options["aa_list"], obj.plot_options[ 209 | "aa_label_groups" 210 | ] = parse_aa_list(args.sfmap_aa_file) 211 | 212 | # configure the object 213 | obj.configure(cfg) 214 | 215 | # make sure objects are valid 216 | try: 217 | obj.validate() 218 | except ValueError: 219 | logger.exception("Invalid configuration") 220 | else: 221 | # open HDF5 files for the object and all child objects 222 | obj.store_open(children=True) 223 | 224 | # perform the analysis 225 | obj.calculate() 226 | 227 | # generate desired output 228 | 229 | try: 230 | obj.make_plots() 231 | except Exception: 232 | logger.exception("Calculations completed, but plotting failed.") 233 | try: 234 | obj.write_tsv() 235 | except Exception: 236 | logger.exception("Calculations completed, but TSV output failed.") 237 | 238 | # clean up 239 | obj.store_close(children=True) 240 | 241 | 242 | if __name__ == "__main__": 243 | gui_mode = False 244 | 245 | try: 246 | if sys.argv[1] == "gui": 247 | gui_mode = True 248 | except IndexError: 249 | pass 250 | 251 | if gui_mode: 252 | main_gui() 253 | else: 254 | main_cmd() 255 | -------------------------------------------------------------------------------- /enrich2/overlap.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import logging 4 | 5 | from matplotlib.backends.backend_pdf import PdfPages 6 | import os.path 7 | from .plots import overlap_merge_plot 8 | from .seqlib import SeqLib 9 | from .variant import VariantSeqLib 10 | from fqfa import open_compressed, parse_fastq_pe_reads, has_fastq_ext 11 | from fqfa.fastq.fastqread import FastqRead 12 | from .fastqheader import fastq_read_is_chaste 13 | 14 | 15 | class OverlapSeqLib(VariantSeqLib): 16 | """ 17 | Class for count data from sequencing libraries with overlapping paired-end 18 | reads for each variant. Creating a 19 | :py:class:`~seqlib.overlap.OverlapSeqLib` requires a valid *config* object 20 | with an ``'overlap'`` entry. 21 | 22 | The ``"fastq"`` config entry must contain two read files, with the keys 23 | ``"forward reads"`` and ``"reverse reads"``. Information about how to combine these 24 | reads is in the ``"overlap"`` config entry.
25 | 26 | The ``"overlap"`` config entry contains the following keys: 27 | 28 | * ``"forward start"`` --- position in the forward read where the \ 29 | overlapping region begins 30 | * ``"reverse start"`` --- position in the reverse read where the \ 31 | overlapping region begins (before being reverse-complemented) 32 | * ``"length"`` --- number of bases in the overlapping region 33 | * ``"max mismatches"`` --- maximum number of mismatches tolerated in the \ 34 | overlapping region before discarding the read 35 | * ``"overlap only"`` --- whether to trim the merged read to contain only \ 36 | the overlapping region (optional, defaults to ``False``) 37 | 38 | Here is a schematic of the case in the above JSON example:: 39 | 40 | forward ---> 1 41 | CGACGCAAGGA 42 | ||||||||| 43 | ACTCCTTGCGTCG 44 | 1 <--- reverse 45 | 46 | Note that the merged sequence is identical to the wild type sequence given 47 | in the JSON file. 48 | """ 49 | 50 | treeview_class_name = "Overlap SeqLib" 51 | 52 | def __init__(self): 53 | VariantSeqLib.__init__(self) 54 | self.forward = None 55 | self.reverse = None 56 | self.fwd_start = None 57 | self.rev_start = None 58 | self.overlap_length = None 59 | self.trim = None 60 | self.max_overlap_mismatches = None 61 | self.merge_mismatches = None 62 | self.default_filters.update({"merge failure": True}) 63 | self.default_filters.update({"remove unresolvable": False}) 64 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 65 | 66 | def configure(self, cfg): 67 | """ 68 | Set up the object using the config object *cfg*, usually derived from 69 | a ``.json`` file. 70 | """ 71 | VariantSeqLib.configure(self, cfg) 72 | self.logger = logging.getLogger( 73 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 74 | ) 75 | 76 | # if counts are specified, copy them later 77 | # else handle the FASTQ config options and check the files 78 | if self.counts_file is None: 79 | self.configure_fastq(cfg) 80 | try: 81 | self.fwd_start = int(cfg["overlap"]["forward start"]) 82 | self.rev_start = int(cfg["overlap"]["reverse start"]) 83 | self.overlap_length = int(cfg["overlap"]["length"]) 84 | self.trim = cfg["overlap"]["trim"] 85 | self.max_overlap_mismatches = int(cfg["overlap"]["max mismatches"]) 86 | 87 | forward_error = False 88 | reverse_error = False 89 | if not has_fastq_ext(self.forward): 90 | forward_error = True 91 | if not has_fastq_ext(self.reverse): 92 | reverse_error = True 93 | if forward_error and reverse_error: 94 | raise IOError( 95 | "FASTQ file error: unrecognized extension (forward and reverse) [{}]".format( 96 | self.name 97 | ) 98 | ) 99 | elif forward_error: 100 | raise IOError( 101 | "FASTQ file error: unrecognized extension (forward) [{}]".format( 102 | self.name 103 | ) 104 | ) 105 | elif reverse_error: 106 | raise IOError( 107 | "FASTQ file error: unrecognized extension (reverse) [{}]".format( 108 | self.name 109 | ) 110 | ) 111 | except IOError as fqerr: 112 | raise IOError("FASTQ file error [{}]: {}".format(self.name, fqerr)) 113 | except KeyError as key: 114 | raise KeyError( 115 | "Missing required config value {key} [{name}]".format( 116 | key=key, name=self.name 117 | ) 118 | ) 119 | except ValueError as value: 120 | raise ValueError( 121 | "Invalid parameter value {value} [{name}]".format( 122 | value=value, name=self.name 123 | ) 124 | ) 125 | 126 | def serialize(self): 127 | """ 128 | Format this object (and its children) as a config object suitable for dumping to a config file. 
129 | """ 130 | cfg = VariantSeqLib.serialize(self) 131 | 132 | cfg["fastq"] = self.serialize_fastq() 133 | cfg["overlap"] = { 134 | "forward start": self.fwd_start, 135 | "reverse start": self.rev_start, 136 | "length": self.overlap_length, 137 | "trim": self.trim, 138 | "max mismatches": self.max_overlap_mismatches, 139 | } 140 | 141 | return cfg 142 | 143 | def configure_fastq(self, cfg): 144 | """ 145 | Set up the object's FASTQ_ file handling and filtering options. 146 | """ 147 | try: 148 | self.forward = cfg["fastq"]["forward reads"] 149 | self.reverse = cfg["fastq"]["reverse reads"] 150 | 151 | if "merge failure" in cfg["fastq"]["filters"]: 152 | raise ValueError( 153 | "'merge failure' is not user-configurable [{}]".format(self.name) 154 | ) 155 | self.filters = cfg["fastq"]["filters"] 156 | except KeyError as key: 157 | raise KeyError( 158 | "Missing required config value {key} [{name}]".format( 159 | key=key, name=self.name 160 | ) 161 | ) 162 | 163 | def serialize_fastq(self): 164 | """ 165 | Serialize this object's FASTQ_ file handling and filtering options. 166 | """ 167 | fastq = { 168 | "forward reads": self.forward, 169 | "reverse reads": self.reverse, 170 | "filters": self.serialize_filters(), 171 | } 172 | 173 | return fastq 174 | 175 | def merge_reads(self, fwd, rev): 176 | """ 177 | Combines the *fwd* and *rev* FASTQ read objects into a 178 | single FASTQ read with the same header information as 179 | *fwd*. Mismatches are resolved by taking the highest quality base. If 180 | discrepant bases have the same quality value, this position is 181 | unresolvable and an ``'X'`` is inserted. Quality values in the 182 | resulting FASTQ read are the maximum quality for the 183 | given base at that position. Returns ``None`` if the maximum number of 184 | mismatches in the overlap region is exceded. 
185 | """ 186 | rev.reverse_complement() 187 | 188 | # print(fwd.sequence, "-" * (self.rev_start - 1), sep="") 189 | # print("-" * (self.fwd_start - 1), rev.sequence, sep="") 190 | rev_extra_start = len(rev) - self.rev_start + 1 191 | fwd_end = self.fwd_start + self.overlap_length - 1 192 | merge = FastqRead( 193 | header=fwd.header + "|" + rev.header, 194 | sequence="A", 195 | header2=fwd.header2 + "|" + rev.header2, 196 | quality_string="#", 197 | quality_encoding_value=fwd.quality_encoding_value, 198 | ) 199 | merge.sequence = fwd.sequence[:fwd_end] + rev.sequence[rev_extra_start:] 200 | merge.quality = fwd.quality[:fwd_end] + rev.quality[rev_extra_start:] 201 | 202 | mismatches = 0 203 | first = True 204 | for i in range(self.overlap_length): 205 | a = self.fwd_start - 1 + i 206 | b = len(rev) - self.rev_start - self.overlap_length + i + 1 207 | try: 208 | if fwd.sequence[a] == rev.sequence[b]: 209 | # take the highest quality value 210 | if rev.quality[b] > fwd.quality[a]: 211 | merge.quality[a] = rev.quality[b] 212 | else: 213 | if fwd.quality[a] == rev.quality[b]: 214 | merge.sequence[a] = "X" # unresolvable 215 | self.merge_mismatches.iloc[i]["unresolved"] += 1 216 | elif rev.quality[b] > fwd.quality[a]: 217 | merge.sequence[a] = rev.sequence[b] 218 | merge.quality[a] = rev.quality[b] 219 | self.merge_mismatches.iloc[i]["resolved"] += 1 220 | else: 221 | # overlap region already same as fwd 222 | self.merge_mismatches.iloc[i]["resolved"] += 1 223 | mismatches += 1 224 | if first: 225 | self.merge_mismatches.iloc[i]["first"] += 1 226 | first = False 227 | except IndexError: 228 | raise IndexError( 229 | "Failed to calculate overlap (a={a}, len(a)={lena}, b={b}, len(b)={lenb}) [{name}]".format( 230 | a=a, 231 | b=b, 232 | lena=len(fwd.sequence), 233 | lenb=len(rev.sequence), 234 | name=self.name, 235 | ) 236 | ) 237 | 238 | if mismatches > self.max_overlap_mismatches: 239 | return None # merge failed 240 | 241 | if self.trim: 242 | merge.trim_length(self.overlap_length, self.fwd_start) 243 | return merge 244 | 245 | def counts_from_reads(self): 246 | df_dict = dict() 247 | 248 | self.merge_mismatches = pd.DataFrame( 249 | data=0, 250 | index=[ 251 | x + self.fwd_start + self.wt.dna_offset 252 | for x in range(0, self.overlap_length) 253 | ], 254 | columns=["resolved", "unresolved", "first"], 255 | ) 256 | 257 | self.logger.info("Counting variants") 258 | max_mut_variants = 0 259 | with open_compressed(self.forward) as handle_fwd, open_compressed(self.reverse) as handle_rev: 260 | for fwd, rev in parse_fastq_pe_reads(handle_fwd, handle_rev): 261 | # filter on chastity before merge 262 | chaste = True 263 | if self.filters["chastity"]: 264 | if not fastq_read_is_chaste(fwd): 265 | chaste = False 266 | if self.report_filtered: 267 | self.report_filtered_read(fwd, "chastity") 268 | if not fastq_read_is_chaste(rev): 269 | chaste = False 270 | if self.report_filtered: 271 | self.report_filtered_read(rev, "chastity") 272 | if not chaste: 273 | self.filter_stats["chastity"] += 1 274 | self.filter_stats["total"] += 1 275 | continue 276 | 277 | merge = self.merge_reads(fwd, rev) 278 | if merge is None: # merge failed 279 | self.filter_stats["merge failure"] += 1 280 | self.filter_stats["total"] += 1 281 | if self.report_filtered: 282 | self.report_filtered_read(fwd, {"merge failure": True}) 283 | self.report_filtered_read(rev, {"merge failure": True}) 284 | else: 285 | if self.read_quality_filter(merge): 286 | mutations = self.count_variant(merge.sequence) 287 | if mutations is None: # merge 
read has too many mutations 288 | max_mut_variants += 1 289 | if self.report_filtered: 290 | self.report_filtered_variant(merge.sequence, 1) 291 | else: 292 | try: 293 | df_dict[mutations] += 1 294 | except KeyError: 295 | df_dict[mutations] = 1 296 | 297 | self.store.put( 298 | "/raw/overlap_mismatches", 299 | self.merge_mismatches, 300 | format="table", 301 | data_columns=self.merge_mismatches.columns, 302 | ) 303 | self.merge_mismatches = None 304 | self.save_counts("variants", df_dict, raw=True) 305 | del df_dict 306 | 307 | if self.aligner is not None: 308 | self.logger.info("Aligned {} variants".format(self.aligner.calls)) 309 | self.aligner_cache = None 310 | self.logger.info( 311 | "Removed {} total variants with excess mutations" 312 | "".format(max_mut_variants) 313 | ) 314 | self.save_filter_stats() 315 | 316 | def calculate(self): 317 | """ 318 | Reads the forward and reverse reads, merges them, performs 319 | quality-based filtering, and counts the variants. 320 | """ 321 | if not self.check_store("/main/variants/counts"): 322 | if not self.check_store("/raw/variants/counts"): 323 | if self.counts_file is not None: 324 | self.counts_from_file(self.counts_file) 325 | else: # count everything 326 | self.counts_from_reads() 327 | self.save_filtered_counts("variants", "count >= self.variant_min_count") 328 | 329 | self.count_synonymous() 330 | 331 | def make_plots(self): 332 | """ 333 | Make plots for :py:class:`~seqlib.seqlib.OverlapSeqLib` objects. 334 | 335 | Creates plots of the location of merged read mismatches. 336 | """ 337 | if self.plots_requested: 338 | SeqLib.make_plots(self) 339 | pdf = PdfPages(os.path.join(self.plot_dir, "overlap_mismatches.pdf")) 340 | overlap_merge_plot(self, pdf) 341 | pdf.close() 342 | -------------------------------------------------------------------------------- /enrich2/random_effects.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def rml_estimator(y, sigma2i, iterations=50): 5 | """Implementation of the robust maximum likelihood estimator. 6 | 7 | :: 8 | 9 | @book{demidenko2013mixed, 10 | title={Mixed models: theory and applications with R}, 11 | author={Demidenko, Eugene}, 12 | year={2013}, 13 | publisher={John Wiley & Sons} 14 | } 15 | 16 | """ 17 | w = 1 / sigma2i 18 | sw = np.sum(w, axis=0) 19 | beta0 = np.sum(y * w, axis=0) / sw 20 | sigma2ML = np.sum((y - np.mean(y, axis=0)) ** 2 / (len(beta0) - 1), axis=0) 21 | eps = np.zeros(beta0.shape) 22 | betaML = None 23 | for _ in range(iterations): 24 | w = 1 / (sigma2i + sigma2ML) 25 | sw = np.sum(w, axis=0) 26 | sw2 = np.sum(w ** 2, axis=0) 27 | betaML = np.sum(y * w, axis=0) / sw 28 | sigma2ML_new = ( 29 | sigma2ML 30 | * np.sum(((y - betaML) ** 2) * (w ** 2), axis=0) 31 | / (sw - (sw2 / sw)) 32 | ) 33 | eps = np.abs(sigma2ML - sigma2ML_new) 34 | sigma2ML = sigma2ML_new 35 | var_betaML = 1 / np.sum(1 / (sigma2i + sigma2ML), axis=0) 36 | return betaML, var_betaML, eps 37 | -------------------------------------------------------------------------------- /enrich2/wildtype.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from .constants import CODON_TABLE 4 | 5 | 6 | class WildTypeSequence(object): 7 | """ 8 | Container class for wild type sequence information. 
Used by :py:class:`~seqlib.seqlib.VariantSeqLib` objects and 9 | :py:class:`~enrich2.selection.Selection` or :py:class:`~enrich2.experiment.Experiment` objects that contain 10 | variant information. 11 | 12 | Requires a *parent_name* that associates this object with a StoreManager object for the 13 | purposes of error reporting and logging. 14 | """ 15 | 16 | def __init__(self, parent_name): 17 | self.parent_name = parent_name 18 | self.dna_seq = None 19 | self.protein_seq = None 20 | self.dna_offset = None 21 | self.protein_offset = None 22 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 23 | 24 | def __eq__(self, other): 25 | # note we don't need to check protein_offset, since it depends on dna_offset and protein_seq 26 | return ( 27 | self.dna_seq == other.dna_seq 28 | and self.protein_seq == other.protein_seq 29 | and self.dna_offset == other.dna_offset 30 | ) 31 | 32 | def __ne__(self, other): 33 | return not self == other 34 | 35 | def configure(self, cfg): 36 | try: 37 | # remove whitespace from WT DNA sequence and capitalize 38 | self.dna_seq = "".join(cfg["sequence"].split()).upper() 39 | 40 | # check that only valid characters are included (ACGT) 41 | if not re.match("^[ACGT]+$", self.dna_seq): 42 | raise ValueError( 43 | "WT DNA sequence contains unexpected " 44 | "characters [{}]".format(self.parent_name) 45 | ) 46 | 47 | # set the reference offset 48 | if "reference offset" in cfg: 49 | try: 50 | self.dna_offset = int(cfg["reference offset"]) 51 | except ValueError: 52 | raise ValueError( 53 | "Invalid reference offset value [{}]".format(self.parent_name) 54 | ) 55 | else: 56 | self.dna_offset = 0 57 | 58 | # handle coding sequences 59 | if cfg["coding"]: 60 | # require coding sequences are in-frame 61 | if len(self.dna_seq) % 3 != 0: 62 | raise ValueError( 63 | "WT DNA sequence contains incomplete codons [{}]".format( 64 | self.parent_name 65 | ) 66 | ) 67 | 68 | # perform translation 69 | self.protein_seq = "" 70 | for i in range(0, len(self.dna_seq), 3): 71 | self.protein_seq += CODON_TABLE[self.dna_seq[i: i + 3]] 72 | 73 | # set the reference offset if it's a multiple of three 74 | if self.dna_offset % 3 == 0: 75 | self.protein_offset = int(self.dna_offset / 3) 76 | else: 77 | self.logger.warning( 78 | "Ignoring reference offset for protein changes (not a multiple of three)" 79 | ) 80 | self.protein_offset = 0 81 | else: 82 | self.protein_seq = None 83 | self.protein_offset = None 84 | 85 | except KeyError as key: 86 | raise KeyError( 87 | "Missing required config value {key} [{name}]".format( 88 | key=key, name=self.parent_name 89 | ) 90 | ) 91 | 92 | def serialize(self): 93 | """ 94 | Format this object as a config object suitable for dumping to a config file. 95 | """ 96 | cfg = { 97 | "sequence": self.dna_seq, 98 | "coding": self.is_coding(), 99 | "reference offset": self.dna_offset, 100 | } 101 | return cfg 102 | 103 | def is_coding(self): 104 | return self.protein_seq is not None 105 | 106 | def duplicate(self, new_parent_name): 107 | """ 108 | Create a copy of this object with the *new_parent_name*. 109 | 110 | Uses the configure and serialize methods to perform the copy. 
111 | """ 112 | new = WildTypeSequence(new_parent_name) 113 | new.configure(self.serialize()) 114 | 115 | if new != self: 116 | raise ValueError( 117 | "Failed to duplicate wild type sequence [{}]".format(self.parent_name) 118 | ) 119 | else: 120 | return new 121 | 122 | def position_tuples(self, protein=False): 123 | """ 124 | Return a list of tuples containing the position number (after offset adjustment) and 125 | single-letter symbol (nucleotide or amino acid) for each position the wild type sequence. 126 | """ 127 | if protein: 128 | if not self.is_coding(): 129 | raise AttributeError( 130 | "Cannot return wild type protein position tuples for non-coding wild type [{}]".format( 131 | self.parent_name 132 | ) 133 | ) 134 | else: 135 | seq = self.protein_seq 136 | offset = self.protein_offset 137 | else: 138 | seq = self.dna_seq 139 | offset = self.dna_offset 140 | 141 | return [(i + offset + 1, seq[i]) for i in range(len(seq))] 142 | -------------------------------------------------------------------------------- /enrich2/ztest.py: -------------------------------------------------------------------------------- 1 | """z-score helper functions. 2 | 3 | This module contains functions for calculating z-scores and corresponding 4 | p-values for pairs of scores. 5 | """ 6 | 7 | import numpy as np 8 | import scipy.stats as stats 9 | 10 | 11 | def ztest_pair(df1, df2): 12 | """z-test for elements in two data frames. 13 | 14 | Takes two data frames with ``'score'`` and ``'SE'`` columns and returns 15 | a new data frame containing the scores and standard errors and result for 16 | pairwise comparison of elements in both input data frames. 17 | 18 | Parameters 19 | ---------- 20 | df1 : pandas.DataFrame 21 | The first data frame. Must contain ``'score'`` and ``'SE'`` columns. 22 | 23 | df2 : pandas.DataFrame 24 | The second data frame. Must contain ``'score'`` and ``'SE'`` columns. 25 | 26 | Returns 27 | ------- 28 | pandas.DataFrame 29 | Result data frame containing scores (``'score1'`` and ``'score2'``), 30 | standard errors (``'SE1'``, ``'SE2'``), z-scores (``'z'``), and 31 | p-values (``'pvalue_raw'``) for each element found in both data frames. 32 | 33 | Raises 34 | ------ 35 | To be added. 36 | """ 37 | shared = df1.loc[:, ("score", "SE")].merge( 38 | df2.loc[:, ("score", "SE")], 39 | how="inner", 40 | left_index=True, 41 | right_index=True, 42 | suffixes=("1", "2"), 43 | ) 44 | shared["z"] = np.abs(shared["score1"] - shared["score2"]) / np.sqrt( 45 | shared["SE1"] ** 2 + shared["SE2"] ** 2 46 | ) 47 | shared["pvalue_raw"] = 2 * stats.norm.sf(shared["z"]) 48 | return shared 49 | 50 | 51 | def ztest_single(df, score, se): 52 | """z-test for comparing elements in a data frame to a single score and SE. 53 | 54 | Takes a data frames with ``'score'`` and ``'SE'`` columns and a score and 55 | standard error to compare them to and returns a new data frame containing 56 | the scores, standard errors, and result for the pairwise comparisons. 57 | 58 | Parameters 59 | ---------- 60 | df : pandas.DataFrame 61 | The data frame. Must contain ``'score'`` and ``'SE'`` columns. 62 | 63 | score : float 64 | Score used for comparison to elements in the data frame. 65 | 66 | se : float 67 | Standard error used for comparison to elements in the data frame. 68 | 69 | Returns 70 | ------- 71 | pandas.DataFrame 72 | Result data frame containing score (``'score'``), standard error 73 | (``'SE'``), z-score (``'z'``), and p-value (``'pvalue_raw'``) for each 74 | element in the input data frame. 
75 | 76 | Raises 77 | ------ 78 | To be added. 79 | """ 80 | result = df.loc[:, ("score", "SE")] 81 | result["z"] = np.abs(result["score"] - score) / np.sqrt(result["SE"] ** 2 + se ** 2) 82 | result["pvalue_raw"] = 2 * stats.norm.sf(result["z"]) 83 | return result 84 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | # Variable names that will always be accepted 2 | good-names=df,se,id,_,i,j,k,a,x,y,z 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "Enrich2" 7 | dynamic = ["version"] 8 | description = "Analysis program for calculating variant scores from deep mutational scanning data." 9 | readme = "README.md" 10 | license = "BSD-3-Clause" 11 | authors = [ 12 | { name = "Alan F Rubin", email = "alan.rubin@wehi.edu.au" }, 13 | ] 14 | dependencies = [ 15 | "fqfa", 16 | "matplotlib", 17 | "numpy", 18 | "pandas", 19 | "scipy", 20 | "statsmodels", 21 | "tables", 22 | ] 23 | 24 | [project.scripts] 25 | enrich_cmd = "enrich2.main:main_cmd" 26 | 27 | [project.gui-scripts] 28 | enrich_gui = "enrich2.main:main_gui" 29 | 30 | [project.urls] 31 | Homepage = "https://github.com/FowlerLab/Enrich2/" 32 | 33 | [tool.hatch.version] 34 | path = "enrich2/__init__.py" 35 | 36 | [tool.hatch.build.targets.sdist] 37 | include = [ 38 | "/enrich2", 39 | "/docs", 40 | ] 41 | --------------------------------------------------------------------------------
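A few illustrative usage sketches for the modules above follow; they are not files in the repository, and any file names, directory names, method names, or numeric values they introduce are hypothetical. First, the command-line entry point in /enrich2/main.py (installed as the enrich_cmd script per pyproject.toml) expects a JSON config followed by a scoring method and a log-ratio method drawn from SCORING_METHODS and LOGR_METHODS in enrich2/storemanager.py; those dictionaries are not shown in this snapshot, so the method names below are left as placeholders. Running enrich_cmd --help lists the actual choices.

enrich_cmd my_experiment.json <scoring_method> <logr_method> --no-plots --output-dir results --log results/enrich2.log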
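OverlapSeqLib.configure() and configure_fastq() in /enrich2/overlap.py read the "fastq" and "overlap" sections of the config dictionary. Below is a sketch of just those two sections, written as the Python dict that json.load() would produce; the read file names and coordinates are made up, and the remaining keys required by VariantSeqLib.configure() (name, wild-type sequence, and so on) are defined outside this snapshot and therefore omitted.

overlap_cfg_sections = {
    "fastq": {
        "forward reads": "sample1_R1.fastq.gz",  # hypothetical file name
        "reverse reads": "sample1_R2.fastq.gz",  # hypothetical file name
        "filters": {"chastity": True},  # "merge failure" is set internally, not by the user
    },
    "overlap": {
        "forward start": 1,  # illustrative values; all five keys are required by configure()
        "reverse start": 1,
        "length": 9,
        "max mismatches": 1,
        "trim": True,  # keep only the overlapping region after merging
    },
}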
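WildTypeSequence.configure() in /enrich2/wildtype.py accepts the same keys that its serialize() emits. A small worked example of the offset handling, assuming CODON_TABLE in enrich2/constants.py implements the standard genetic code (ATG to M, GTT to V):

from enrich2.wildtype import WildTypeSequence

wt = WildTypeSequence("example parent")  # parent name is only used in log and error messages
wt.configure({"sequence": "atg gtt", "coding": True, "reference offset": 3})

print(wt.dna_seq)         # "ATGGTT": whitespace is stripped and the sequence upper-cased before validation
print(wt.protein_seq)     # "MV"
print(wt.protein_offset)  # 1, because the DNA offset of 3 is a multiple of three
print(wt.position_tuples())              # [(4, 'A'), (5, 'T'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T')]
print(wt.position_tuples(protein=True))  # [(2, 'M'), (3, 'V')]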
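rml_estimator() in /enrich2/random_effects.py reduces every quantity over axis 0, so the intended input shape appears to be (number of replicates, number of elements); that shape is inferred from the code rather than stated anywhere in this snapshot. A minimal sketch with made-up replicate scores and variances:

import numpy as np
from enrich2.random_effects import rml_estimator

# three replicate scores for four variants, with matching squared standard errors
y = np.array([
    [0.10, -0.50, 1.20, 0.00],
    [0.05, -0.40, 1.10, 0.10],
    [0.20, -0.60, 1.30, -0.05],
])
sigma2i = np.full_like(y, 0.04)

betaML, var_betaML, eps = rml_estimator(y, sigma2i)
print(betaML)       # one combined score per variant, shape (4,)
print(var_betaML)   # variance of each combined score, shape (4,)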
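Finally, the z-test helpers in /enrich2/ztest.py: ztest_pair() computes z = |score1 - score2| / sqrt(SE1^2 + SE2^2) and a two-sided normal p-value for every index shared by both data frames, and ztest_single() does the same against a fixed score and standard error. A short sketch using invented variant labels:

import pandas as pd
from enrich2.ztest import ztest_pair, ztest_single

index = ["p.Val5Ala", "p.Gly6Asp", "_wt"]
rep1 = pd.DataFrame({"score": [0.8, -1.2, 0.1], "SE": [0.2, 0.3, 0.1]}, index=index)
rep2 = pd.DataFrame({"score": [0.7, -0.4, 0.0], "SE": [0.2, 0.3, 0.1]}, index=index)

paired = ztest_pair(rep1, rep2)        # columns: score1, SE1, score2, SE2, z, pvalue_raw
single = ztest_single(rep1, 0.0, 0.1)  # compare each element to score 0.0 with SE 0.1
print(paired.loc["p.Gly6Asp", "pvalue_raw"])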