├── .gitignore ├── .readthedocs.yml ├── CHANGELOG.md ├── CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Makefile ├── changelog.md ├── conduct.md ├── conf.py ├── contributing.md ├── example.ipynb ├── index.md ├── make.bat └── requirements.txt ├── poetry.lock ├── pyproject.toml ├── src └── seqwalk │ ├── __init__.py │ ├── analysis.py │ ├── design.py │ ├── filtering.py │ ├── generation.py │ ├── io.py │ └── prebuilt_libs │ ├── __init__.py │ └── kishi2018.txt └── tests └── test_seqwalk.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | **__pycache__ 131 | 132 | # PyCharm 133 | .idea/ 134 | 135 | # RStudio project files 136 | **.Rproj.user/ 137 | **.Rproj.user* 138 | **.Rproj 139 | **.Rhistory 140 | 141 | # MacOS 142 | .DS_Store -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.9" 12 | # You can also specify other tool versions: 13 | # nodejs: "20" 14 | # rust: "1.70" 15 | # golang: "1.20" 16 | 17 | # Build documentation in the "docs/" directory with Sphinx 18 | sphinx: 19 | configuration: docs/conf.py 20 | # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs 21 | # builder: "dirhtml" 22 | # Fail on all warnings to avoid broken references 23 | # fail_on_warning: true 24 | 25 | # Optionally build your docs in additional formats such as PDF and ePub 26 | # formats: 27 | # - pdf 28 | # - epub 29 | 30 | # Optional but recommended, declare the Python requirements required 31 | # to build your documentation 32 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 33 | python: 34 | install: 35 | - requirements: docs/requirements.txt 36 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v0.3.3 (23/10/2024) 4 | - Verbosity arguments 5 | 6 | ## v0.3.1 (22/03/2024) 7 | 8 | - Fixed odd-k incrementing in max_orthogonality 9 | 10 | ## v0.3.0 (22/03/2024) 11 | 12 | - Added hash-based filtering 13 | 14 | ## v0.2.1 (11/03/2024) 15 | 16 | - Fixed 3 letter filtering indexing 17 | 18 | ## v0.2.0 (10/03/2024) 19 | 20 | - Fixed adapted_hierholzer function for RC free library generation 21 | 22 | 23 | ## v0.1.0 (22/06/2022) 24 | 25 | - First release of `seqwalk`! -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | ## Our Responsibilities 20 | 21 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 22 | 23 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 24 | 25 | ## Scope 26 | 27 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 28 | 29 | ## Enforcement 30 | 31 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 32 | 33 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 34 | 35 | ## Attribution 36 | 37 | This Code of Conduct is adapted from the [Contributor Covenant homepage](http://contributor-covenant.org/version/1/4), version 1.4. 38 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every little bit 4 | helps, and credit will always be given. 5 | 6 | ## Types of Contributions 7 | 8 | ### Report Bugs 9 | 10 | If you are reporting a bug, please include: 11 | 12 | * Your operating system name and version. 13 | * Any details about your local setup that might be helpful in troubleshooting. 14 | * Detailed steps to reproduce the bug. 15 | 16 | ### Fix Bugs 17 | 18 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 19 | wanted" is open to whoever wants to implement it. 20 | 21 | ### Implement Features 22 | 23 | Look through the GitHub issues for features. Anything tagged with "enhancement" 24 | and "help wanted" is open to whoever wants to implement it. 25 | 26 | ### Write Documentation 27 | 28 | You can never have enough documentation! Please feel free to contribute to any 29 | part of the documentation, such as the official docs, docstrings, or even 30 | on the web in blog posts, articles, and such. 31 | 32 | ### Submit Feedback 33 | 34 | If you are proposing a feature: 35 | 36 | * Explain in detail how it would work. 37 | * Keep the scope as narrow as possible, to make it easier to implement. 38 | * Remember that this is a volunteer-driven project, and that contributions 39 | are welcome :) 40 | 41 | ## Get Started! 42 | 43 | Ready to contribute? Here's how to set up `seqwalk` for local development. 44 | 45 | 1. Download a copy of `seqwalk` locally. 46 | 2. Install `seqwalk` using `poetry`: 47 | 48 | ```console 49 | $ poetry install 50 | ``` 51 | 52 | 3. Use `git` (or similar) to create a branch for local development and make your changes: 53 | 54 | ```console 55 | $ git checkout -b name-of-your-bugfix-or-feature 56 | ``` 57 | 58 | 4. When you're done making changes, check that your changes conform to any code formatting requirements and pass any tests. 59 | 60 | 5. Commit your changes and open a pull request. 61 | 62 | ## Pull Request Guidelines 63 | 64 | Before you submit a pull request, check that it meets these guidelines: 65 | 66 | 1. The pull request should include additional tests if appropriate. 67 | 2. If the pull request adds functionality, the docs should be updated. 68 | 3. The pull request should work for all currently supported operating systems and versions of Python. 69 | 70 | ## Code of Conduct 71 | 72 | Please note that the `seqwalk` project is released with a 73 | Code of Conduct. By contributing to this project you agree to abide by its terms. 74 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022, Gokul Gowri 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # seqwalk 2 | 3 | `seqwalk` is a package for designing orthogonal DNA sequence libraries. It can efficiently generate libraries of sequences that satisfy sequence symmetry minimization constraints (i.e. minimizing longest common substrings). `seqwalk` additionally includes off-the-shelf orthogonal sequence libraries, as well as some tools for analyzing orthogonal sequence libraries. 4 | A code-free, interactive version of `seqwalk` can be found [here](https://colab.research.google.com/drive/1eVbcn_b5EE5FcL9NL5EyxeFAqNoImNSa?usp=sharing). 5 | 6 | For more details, see the [paper](https://www.biorxiv.org/content/10.1101/2022.07.11.499592v1.abstract). 7 | 8 | ## Installation 9 | 10 | ```bash 11 | $ pip install seqwalk 12 | ``` 13 | 14 | ## Usage 15 | 16 | ### Designing a set of barcodes with minimal sequence symmetry 17 | 18 | If you want a certain number of barcodes with maximum orthogonality, you can use the `max_orthogonality` function from the `design` module. You must specify the length of desired sequences (L) and the number of desired sequences (N). Optionally, specify the prevention of reverse complementary sequences, GC content limits, allowable alphabet, and specific prevented patterns. By default, reverse complementary sequences are allowed, there are no GC content constraints, a 3 letter (A/C/T, no G) code is used and any 4N sequence is prevented. 19 | 20 | For example, if you want 100 barcodes with length 25, with prevented reverse complements, and a 4 letter alphabet, and between 10 and 15 G/C bases, you can use the following code: 21 | 22 | ```python 23 | from seqwalk import design 24 | 25 | library = design.max_orthogonality(100, 25, alphabet="ACGT", RCfree=True, GClims=(10, 15)) 26 | ``` 27 | 28 | This will generate a library of at least the specified size, with the strongest possible sequence symmetry constraint. 29 | 30 | ### Designing a set of orthogonal barcodes with maximum size 31 | 32 | If you have an orthogonality constraint in mind, you can use the `max_size` function from the `design` module to generate a maximally sized library with the given sequence symmetry minimization k values. That is, the shortest k for which no substring of length k appears twice. 33 | 34 | If you want sequences that satisfy SSM for k=12, and you want barcodes of length 25, without considering reverse complementarity, and using a 4 letter alphabet, with no GC constraints, you can use the following code: 35 | 36 | ```python 37 | from seqwalk import design 38 | 39 | library = design.max_size(25, 12, alphabet="ACGT") 40 | ``` 41 | 42 | ### Importing "off-the-shelf" experimentally characterized libraries 43 | 44 | The `io` module provides the ability to import libraries that have been previously experimentally characterized, using code of the following format. 45 | 46 | ```python 47 | from seqwalk import io 48 | 49 | PERprimers = io.load_library("kishi2018") 50 | ``` 51 | 52 | We provide the following libraries, accessible with the identifier tag. 53 | 54 | | identifier | # of seqs | seq length | original use case | ref | 55 | |------------|-----------|------------|-------------------|-----| 56 | | `kishi2018` | 50 | 9nt | PER primers | [Kishi et al, 2018](https://www.nature.com/articles/nchem.2872) | 57 | 58 | If you have an orthogonal library you would like to add, please submit a PR! 59 | 60 | ### Quality control using pairwise comparisons 61 | 62 | Once you have a library in the form of a list of sequences, you can use the `analysis` module to perform additional quality control. For example, we provide a function to compute pairwise Hamming distances. 63 | 64 | ```python 65 | from seqwalk import analysis 66 | 67 | h_crosstalk = analysis.hamming_matrix(seqs) 68 | ``` 69 | 70 | Since sequence symmetry minimization does not explicitly guarantee low off-target hybridization strength, a simple function for using NUPACK to identify "bad" sequences is included in the `analysis.py` file. However, it is commented out to avoid the NUPACK dependency in the package (problematic due to NUPACK licensing). 71 | 72 | ## License 73 | 74 | `seqwalk` is licensed under the terms of the MIT license. 75 | 76 | ## Credits 77 | 78 | `seqwalk` was created with [`cookiecutter`](https://cookiecutter.readthedocs.io/en/latest/) and the `py-pkgs-cookiecutter` [template](https://github.com/py-pkgs/py-pkgs-cookiecutter). 79 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | 3 | # You can set these variables from the command line. 4 | SPHINXOPTS = 5 | SPHINXBUILD = python -msphinx 6 | SPHINXPROJ = seqwalk 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | ``` -------------------------------------------------------------------------------- /docs/conduct.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CONDUCT.md 2 | ``` -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Project information ----------------------------------------------------- 8 | 9 | project = u"seqwalk" 10 | copyright = u"2022, Gokul Gowri" 11 | author = u"Gokul Gowri" 12 | 13 | # -- General configuration --------------------------------------------------- 14 | 15 | # Add any Sphinx extension module names here, as strings. They can be 16 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 17 | # ones. 18 | extensions = [ 19 | "myst_nb", 20 | "autoapi.extension", 21 | "sphinx.ext.napoleon", 22 | "sphinx.ext.viewcode", 23 | ] 24 | autoapi_dirs = ["../src"] 25 | 26 | # List of patterns, relative to source directory, that match files and 27 | # directories to ignore when looking for source files. 28 | # This pattern also affects html_static_path and html_extra_path. 29 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 30 | 31 | # -- Options for HTML output ------------------------------------------------- 32 | 33 | # The theme to use for HTML and HTML Help pages. See the documentation for 34 | # a list of builtin themes. 35 | # 36 | html_theme = "sphinx_rtd_theme" 37 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CONTRIBUTING.md 2 | ``` -------------------------------------------------------------------------------- /docs/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example usage\n", 8 | "\n", 9 | "In this tutorial, we will demonstrate three core capabilities of the `seqwalk` package.\n", 10 | "\n", 11 | "1. We will use the `seqwalk.design` module to design maximal orthogonal sequence libraries using the novel methods outlined in our preprint (coming soon!). \n", 12 | "2. We will use the `seqwalk.analysis` module to analyze orthogonal sequence libraries. \n", 13 | "3. We will use the `seqwalk.io` module to load \"off-the-shelf\" orthogonal sequence libraries from previous publications." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from seqwalk import io\n", 23 | "from seqwalk import design\n", 24 | "from seqwalk import analysis\n", 25 | "\n", 26 | "import seaborn as sns" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Designing sequences using seqwalk\n", 34 | "\n", 35 | "The algorithms presented in our preprint are the first that allow for the design of maximally orthogonal and maximally sized orthogonal sequence libraries. The `seqwalk` package implements these algorithms in two design functions: a `max_size` function which designs maximally sized libraries for a given set of constraints, and a `max_orthogonality` function, which designs a library of desired size, with the strictest possible orthogonality constraints. \n", 36 | "\n", 37 | "## Maximally-sized orthogonal library design\n", 38 | "\n", 39 | "To design a maximally-sized orthogonal sequence library, we must specify a desired sequence length, and an orthogonality constraint in the form of an SSM k-value. This value is the smallest $k$ for which no substring of length $k$ appears twice in the library. For example, SSM k=3 means that no length 3 string can repeat multiple times in a library.\n", 40 | "\n", 41 | "Optionally, one can specify the prevention of reverse complementary sequences, GC content limits, allowable alphabet, and specific prevented patterns. By default, reverse complementary sequences are allowed, there are no GC content constraints, a 3 letter (A/C/T, no G) code is used and any 4N (AAAA, CCCC, ...) sequence is prevented. To see in detail how to specify each these constraints, see the API reference for `seqwalk.design.max_orthogonality()`\n", 42 | "\n", 43 | "Suppose we want to design the maximal set of 25nt sequences that satisfy SSM for k=12, with a 4 letter alphabet, without considering reverse complementary sequences, and no GC or pattern constraints. **We need to write only a single line of code, which executes in <1 minute!**\n", 44 | "\n", 45 | "(Note that the Python implementation is slightly slower than the Julia implementation, which can be found in the supplement of the preprint.)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "The 25mer library has 1198372 sequences\n", 58 | "CPU times: user 30.6 s, sys: 464 ms, total: 31 s\n", 59 | "Wall time: 31.5 s\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "%%time\n", 65 | "\n", 66 | "huge_25mer_library = design.max_size(25, 12, alphabet=\"ACGT\", prevented_patterns=[])\n", 67 | "\n", 68 | "print(\"The 25mer library has %d sequences\" % (len(huge_25mer_library)))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Maximally orthogonal library design\n", 76 | "\n", 77 | "To design a library of minimum size with maximal orthogonality, we instead need to specify a minimum library size and desired sequence length. The same optional arguments apply, with the addition of an optional `k_init` parameter which serves as an initial lower-bound for the lowest SSM k for which a library of the desired size can be designed.\n", 78 | "\n", 79 | "\n", 80 | "Suppose we need only 200 barcodes with length 25, with prevented reverse complements, and a 4 letter alphabet, and between 10 and 15 G/C bases, and no 4N homopolymers. We can use the following design code:" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "Number of sequences: 204\n", 93 | "SSM k value: 6\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "small_25mer_library = design.max_orthogonality(200, 25, \n", 99 | " alphabet=\"ACGT\", prevented_patterns=[])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Note that the identified minimum k-value is half that used in the large library design problem. Based on sequence symmetry, this library will have significantly less crosstalk than the large 25mer library." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## Quality control using pairwise comparisons\n", 114 | "\n", 115 | "While `seqwalk.design` uses a graph-theoretic representation of sequence symmetry constraints for efficient orthogonal sequence design, the `analysis` module supports pairwise comparisons of sequences in a library for quality control. For example, you can compare the Hamming distances of each pair of sequences in a library as a form of additional quality control. Note that these functions can be applied to any set of sequences, not just those designed using `seqwalk`." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "image/png": "\n", 126 | "text/plain": [ 127 | "
" 128 | ] 129 | }, 130 | "metadata": { 131 | "needs_background": "light" 132 | }, 133 | "output_type": "display_data" 134 | } 135 | ], 136 | "source": [ 137 | "h_crosstalk = analysis.hamming_matrix(small_25mer_library)\n", 138 | "\n", 139 | "p = sns.heatmap(h_crosstalk)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Importing \"off-the-shelf\" sequences from previous publications\n", 147 | "\n", 148 | "If you would like to use orthogonal sequence libraries from previous experimental publications, you can use the `io` module. Simply use the `load_library` function with the appropriate identifier tag. The following libraries are included:\n", 149 | "\n", 150 | "| identifier | # of seqs | seq length | original use case | ref |\n", 151 | "|------------|-----------|------------|-------------------|-----|\n", 152 | "| `kishi2018` | 50 | 9nt | PER primers | [Kishi et al, 2018](https://www.nature.com/articles/nchem.2872) |" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 5, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "Number of PER primers: 50\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "PER_primers = io.load_library(\"kishi2018\")\n", 170 | "\n", 171 | "print(\"Number of PER primers: %d\" % len(PER_primers))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.9.5" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 4 203 | } 204 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ```{include} ../README.md 2 | ``` 3 | 4 | ```{toctree} 5 | :maxdepth: 1 6 | :hidden: 7 | 8 | example.ipynb 9 | changelog.md 10 | contributing.md 11 | conduct.md 12 | autoapi/index 13 | ``` -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=seqwalk 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | myst-nb 2 | sphinx-autoapi 3 | sphinx-rtd-theme 4 | seaborn 5 | matplotlib 6 | seqwalk==0.2.0 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "seqwalk" 3 | version = "0.3.3" 4 | description = "Design orthogonal DNA sequences" 5 | authors = ["Gokul Gowri"] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8" 11 | numpy = "^1.23.0" 12 | 13 | [tool.poetry.dev-dependencies] 14 | myst-nb = {version = "^0.16.0", python = "^3.9"} 15 | sphinx-autoapi = "^1.8.4" 16 | sphinx-rtd-theme = "^1.0.0" 17 | python-semantic-release = "^7.29.4" 18 | pytest = "^8.1.1" 19 | 20 | [build-system] 21 | requires = ["poetry-core>=1.0.0"] 22 | build-backend = "poetry.core.masonry.api" 23 | 24 | [tool.semantic_release] 25 | version_variable = "pyproject.toml:version" 26 | 27 | [tool.pytest.ini_options] 28 | pythonpath = "src" 29 | addopts = [ 30 | "--import-mode=importlib", 31 | ] -------------------------------------------------------------------------------- /src/seqwalk/__init__.py: -------------------------------------------------------------------------------- 1 | # read version from installed package 2 | from importlib.metadata import version 3 | __version__ = version("seqwalk") -------------------------------------------------------------------------------- /src/seqwalk/analysis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # import nupack as nu 3 | 4 | # RT = nu.Model(material="dna", celsius=22, sodium=1.0, magnesium=0.0) 5 | 6 | def hamming(seq1, seq2): 7 | """ 8 | compute hamming distance of two sequences 9 | 10 | Args: 11 | seq1: string 12 | seq2: string 13 | 14 | Returns: 15 | int 16 | hamming distance 17 | """ 18 | assert (len(seq1) == len(seq2)), "Sequences must have equal length" 19 | return sum(seq1[i] != seq2[i] for i in range(len(seq1))) 20 | 21 | def hamming_matrix(library): 22 | """ 23 | matrix where element i, j is H distance between seq i and seq j 24 | 25 | Args: 26 | library: list of strings of equal length 27 | 28 | Returns: 29 | NxN numpy array : hdists 30 | hamming distance "heatmap" 31 | """ 32 | 33 | hdists = np.zeros((len(library), len(library))) 34 | for i in range(len(library)): 35 | for j in range(i+1, len(library)): 36 | d = hamming(library[i], library[j]) 37 | hdists[i, j] = d 38 | hdists[j, i] = d 39 | return hdists 40 | 41 | # def np_crosstalk(seq1, seq2, model=RT, conc=1e-6, RCfree=False): 42 | # """ 43 | # compute thermodynamic binding probability 44 | 45 | # Args: 46 | # seq1: string (DNA seq) 47 | # seq2: string (DNA seq) 48 | # model: nupack conditions (default RT) 49 | # conc: molar concentrations of strands (default 1e-6) 50 | # RCfree: Bool, True if library is to be RC free 51 | 52 | # Returns: 53 | # float 54 | # equilibrium concentration of on target binding 55 | # """ 56 | # A = nu.Strand(seq1, name='A') 57 | # B = nu.Strand(seq2, name='B') 58 | # ct1 = nu.Complex([A, ~A], name="c1") 59 | # tRC = nu.Tube(strands={A: conc, ~B: conc, ~A: conc}, 60 | # name='t1', complexes=nu.SetSpec(max_size=2)) 61 | # tRCfree = nu.Tube(strands={A: conc,~A: conc, B: conc, ~B: conc}, 62 | # name='tRC', complexes=nu.SetSpec(max_size=2)) 63 | # tube_results = nu.tube_analysis(tubes=[[tRC, tRCfree][RCfree]], model=model) 64 | # if RCfree: 65 | # return [tube_results.tubes[tRCfree].complex_concentrations[c] for c in [ct1]] 66 | # return [tube_results.tubes[tRC].complex_concentrations[c] for c in [ct1]] 67 | 68 | # def nupack_matrix(library, model=RT, conc=1e-6, RCfree=False): 69 | # """ 70 | # matrix where element i, j is binding prob between seq i and seq j 71 | 72 | # Args: 73 | # library: list of seqs 74 | # model: nupack conditions (default RT) 75 | # conc: molar concentrations of strands (default 1e-6) 76 | # RCfree: Bool, True if library is to be RC free 77 | 78 | # Returns: 79 | # NxN numpy array : np_probs 80 | # binding probability heatmap 81 | # """ 82 | # np_probs = np.zeros((len(library), len(library))) 83 | # for i in range(len(library)): 84 | # for j in range(i+1, len(library)): 85 | # p = 1 - np_crosstalk(library[i], library[j], model, conc)[0]/conc 86 | # np_probs[i, j] += p 87 | # np_probs[j, i] += p 88 | # for i in range(len(library)): 89 | # np_probs[i, i] += np_crosstalk(library[i], library[j], model, conc)[0]/conc 90 | # return np_probs 91 | 92 | -------------------------------------------------------------------------------- /src/seqwalk/design.py: -------------------------------------------------------------------------------- 1 | from seqwalk.generation import * 2 | from seqwalk.filtering import * 3 | import math 4 | from warnings import warn 5 | 6 | 7 | def max_size(L, k, alphabet="ACT", RCfree=False, GClims=None, 8 | prevented_patterns=["AAAA", "CCCC", "GGGG", "TTTT"], 9 | verbose=True): 10 | """ 11 | design a max size library of length L sequences with SSM k 12 | 13 | Args: 14 | L: integer length of desired seqs 15 | k: SSM k value 16 | alphabet: string of allowable letters (default "ACT") 17 | RCfree: bool, True if orthogonality with RCs is required 18 | GClims: tuple of (GCmin, GCmax), allowable range of number of GC bases 19 | prevented_patterns: list of prevented patterns (default 4N) 20 | verbose: bool, True if print and warning statements are desired 21 | 22 | Returns: 23 | list of strings : seqs 24 | library of orthogonal sequences 25 | """ 26 | 27 | assert (L > k), "L must be greater than k" 28 | 29 | if RCfree and len(alphabet) == 4 and k % 2 == 1: 30 | if verbose: 31 | warn("Falling back to Hierholzer algorithm for odd-k, ACGT, RC free") 32 | seq = adapted_hierholzer(k, alphabet) 33 | seqs = partition_path(seq, L, k) 34 | if len(seqs) == 0: 35 | return seqs 36 | if GClims != None: 37 | GCmin, GCmax = GClims 38 | seqs = filter_gc(seqs, GCmin, GCmax) 39 | for pattern in prevented_patterns: 40 | seqs = filter_pattern(seqs, pattern) 41 | return seqs 42 | 43 | else: 44 | seq = "".join([alphabet[i-1] for i in simple_shift(k, len(alphabet))]) 45 | seqs = partition_path(seq[:-1], L, k) 46 | if RCfree: 47 | seqs = rc_hash_filtering(seqs, k) 48 | if len(seqs) == 0: 49 | return seqs 50 | if GClims != None: 51 | GCmin, GCmax = GClims 52 | seqs = filter_gc(seqs, GCmin, GCmax) 53 | for pattern in prevented_patterns: 54 | seqs = filter_pattern(seqs, pattern) 55 | return seqs 56 | 57 | 58 | def max_orthogonality(N, L, alphabet="ACT", RCfree=False, GClims=None, 59 | prevented_patterns=["AAAA", "CCCC", "GGGG","TTTT"], 60 | k_init=None, verbose=True): 61 | """ 62 | design a maximally orthogonal library of N length L sequences 63 | 64 | Args: 65 | N: minimum number of sequences in library 66 | L: integer length of desired seqs 67 | alphabet: string of allowable letters (default "ACT") 68 | RCfree: bool, True if orthogonality with RCs is required 69 | GClims: tuple of (GCmin, GCmax), allowable range of number of GC bases 70 | prevented_patterns: list of prevented patterns (default 4N) 71 | k_init: initial guess for SSM k value 72 | verbose: bool, True if print and warning statements are desired 73 | 74 | Returns: 75 | list of strings : seqs 76 | library of orthogonal sequences 77 | 78 | """ 79 | if k_init == None: 80 | k_init = max(int(math.log(N)/math.log(len(alphabet))), 2) 81 | 82 | while True: 83 | if verbose: 84 | print("Attempting SSM k=%d" %k_init) 85 | 86 | library = max_size(L, k_init, alphabet, RCfree, GClims, prevented_patterns, verbose) 87 | 88 | if len(library)>N: 89 | if verbose: 90 | print("Number of sequences: %d" % len(library)) 91 | print("SSM k value: %d" % k_init) 92 | return library 93 | 94 | k_init += 1 95 | -------------------------------------------------------------------------------- /src/seqwalk/filtering.py: -------------------------------------------------------------------------------- 1 | def rc(seq): 2 | """ 3 | reverse complement of DNA sequence 4 | 5 | Args: 6 | seq: string with letters in {A, C, G, T} 7 | 8 | Returns: 9 | string corresponding to reverse complement 10 | """ 11 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} 12 | return "".join(complement.get(base, base) for base in reversed(seq)) 13 | 14 | def filter_rc_3letter(library, k): 15 | """ 16 | filter library to be RC free 17 | (Supplementary note X) 18 | 19 | Args: 20 | library: list of sequences 21 | k: SSM k value 22 | 23 | Returns: 24 | list of strings : filtered_library 25 | list of sequences without reverse complementary k-mers 26 | """ 27 | 28 | assert (k % 2 == 1), "SSM k must be odd for RC filtering" 29 | 30 | to_remove = [] 31 | middle = int((k+1)/2) 32 | 33 | for seq in library: 34 | for i in range(len(seq)-k+1): 35 | if sum([(s == "C" or s == "G") for s in seq[i:i+k]]) == 0 : 36 | if seq[i+middle-1] == "A": 37 | to_remove.append(seq) 38 | 39 | return [seq for seq in library if seq not in to_remove] 40 | 41 | def rc_hash_filtering(library, k): 42 | """ 43 | filter any library to be RC free, using simple hash approach 44 | could be slow for large libraries 45 | 46 | Args: 47 | library: list of sequences 48 | k: SSM k value 49 | 50 | Returns: 51 | list of strings : filtered_library 52 | list of sequences without reverse complementary k-mers 53 | """ 54 | seen_kmers = set() 55 | to_remove = set() 56 | 57 | for seq in library: 58 | bad_seq = False 59 | for i in range(len(seq)-k+1): 60 | 61 | if rc(seq[i:i+k]) in seq: 62 | bad_seq = True 63 | if rc(seq[i:i+k]) in seen_kmers: 64 | bad_seq = True 65 | if not bad_seq: 66 | for i in range(len(seq)-k+1): 67 | seen_kmers.add(seq[i:i+k]) 68 | else: 69 | to_remove.add(seq) 70 | 71 | return [seq for seq in library if seq not in to_remove] 72 | 73 | 74 | 75 | def filter_gc(library, gc_min, gc_max): 76 | """ 77 | filters library for sequences that have desired GC content 78 | 79 | Args: 80 | library: list of sequences in string representation 81 | gc_min: minimum number of GC bases (int) 82 | gc_max: maximimum number of GC bases (int) 83 | 84 | Returns: 85 | list of strings : filtered_library 86 | list of sequences in string representation 87 | """ 88 | 89 | assert (gc_min <= gc_max), "gc_min cannot be greater than gc_max" 90 | assert (gc_max <= len(library[0])), "gc_max cannot be greater than seq length" 91 | 92 | filtered_library = [] 93 | 94 | for seq in library: 95 | 96 | gc = sum([(s == "C" or s == "G") for s in seq]) 97 | 98 | if gc >= gc_min: 99 | if gc <= gc_max: 100 | filtered_library.append(seq) 101 | 102 | return filtered_library 103 | 104 | 105 | 106 | def filter_pattern(library, pattern): 107 | """ 108 | filters library to remove specific patterns 109 | 110 | Args: 111 | library: list of sequences in string representation 112 | pattern: sequence pattern to be prevented 113 | 114 | Returns: 115 | list of strings : filtered_library 116 | list of sequences in string representation 117 | """ 118 | 119 | filtered_library = [] 120 | 121 | for seq in library: 122 | if pattern not in seq: 123 | filtered_library.append(seq) 124 | 125 | return filtered_library 126 | 127 | 128 | -------------------------------------------------------------------------------- /src/seqwalk/generation.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | import random 3 | 4 | 5 | 6 | def rc(seq): 7 | """ 8 | reverse complement of DNA sequence 9 | 10 | Args: 11 | seq: string with letters in {A, C, G, T} 12 | 13 | Returns: 14 | string corresponding to reverse complement 15 | """ 16 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} 17 | return "".join(complement.get(base, base) for base in reversed(seq)) 18 | 19 | 20 | 21 | 22 | def partition_path(seq, L, k): 23 | """ 24 | partitions self-avoiding walk into appropriate length sequences 25 | 26 | Args: 27 | seq: self avoiding walk (string or list) 28 | L: length of desired sequences 29 | k: SSM k-value 30 | 31 | Returns: 32 | seqs: list of strings, each a length L seq 33 | """ 34 | seqs = [] 35 | for i in range(0, len(seq)-L+1, L-k+1): 36 | seqs.append(seq[i:i+L]) 37 | return seqs 38 | 39 | 40 | def is_necklace(seq): 41 | """ 42 | computes if a sequence is a necklace (as defined in Wong 2017) 43 | 44 | Args: 45 | seq: typically list of ints 46 | """ 47 | 48 | p = 1 49 | 50 | for i in range(1, len(seq)): 51 | if seq[i-p] < seq[i]: 52 | p = i + 1 53 | elif seq[i-p] > seq[i]: 54 | return False 55 | 56 | if (len(seq) % (p)) == 0: 57 | return True 58 | return False 59 | 60 | 61 | def f(seq, k): 62 | """ 63 | helper function defined in Wong 2017 64 | * note confusing notation switch 65 | 66 | Args: 67 | seq: list of ints 68 | k: alphabet length 69 | 70 | Returns: 71 | int corresponding to next element of seq 72 | """ 73 | 74 | p = 1 75 | 76 | if seq[0] == k: 77 | if sum(seq[1:]) == len(seq) - 1: 78 | return 1 79 | for i in range(0, k-1): 80 | if not is_necklace(seq[1:] + [k-i]): 81 | return k-i 82 | return 1 83 | 84 | elif is_necklace(seq[1:] + [(seq[0] % k) + 1]): 85 | return (seq[0] % k) + 1 86 | 87 | else: 88 | return seq[0] 89 | 90 | def simple_shift(n, k): 91 | """ 92 | simple shift rule from Wong 2017 93 | * note confusing notation switch 94 | 95 | Args: 96 | n: SSM k value 97 | k: alphabet size 98 | 99 | Returns: 100 | list of integers corresponding to H. path 101 | """ 102 | 103 | seq = [1]*n 104 | 105 | while True: 106 | seq.append(f(seq[-n:], k)) 107 | if seq[-n:] == seq[:n]: 108 | return seq 109 | 110 | 111 | 112 | 113 | def out_edges(v, alphabet): 114 | """ 115 | lists outedges for a node in a kmer graph 116 | 117 | Args: 118 | v: string corresponding to node 119 | alphabet: string containing all valid letters 120 | 121 | Returns: 122 | list of outedges represented as strings 123 | """ 124 | return [v + l for l in alphabet] 125 | 126 | 127 | def adapted_hierholzer(k, alphabet): 128 | """ 129 | finds RC-free path through 4-letter kmer graph using modified hierholzer 130 | (see Supplementary Note X) 131 | 132 | Args: 133 | k: SSM k (integer) 134 | alphabet: string containing all valid letters 135 | 136 | Returns: 137 | path: string corresponding to RC-free self-avoiding walk 138 | """ 139 | 140 | assert (k % 2 == 1), "k must be odd for adapted Hierholzer" 141 | 142 | # dictionary to store visited nodes 143 | # list of all nodes 144 | marked = {"".join(l) : 0 for l in product(alphabet, repeat=k)} 145 | nodes = ["".join(l) for l in product(alphabet, repeat=(k-1))] 146 | 147 | # initialize stack with a random starting node 148 | v_stack = [random.choice(nodes)] 149 | path = "" 150 | 151 | while len(v_stack) != 0: 152 | v = v_stack.pop() 153 | unmarked = [s for s in out_edges(v, alphabet) if not marked[s]] 154 | 155 | if unmarked == []: 156 | if path == "": 157 | path = v 158 | else: 159 | path = v[0] + path 160 | else: 161 | n_edge = random.choice(unmarked) 162 | v_stack.append(v) 163 | v_stack.append(n_edge[1:]) 164 | marked[n_edge] = 1 165 | marked[rc(n_edge)] = 1 166 | 167 | return path 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /src/seqwalk/io.py: -------------------------------------------------------------------------------- 1 | from importlib import resources 2 | 3 | def load_library(identifier): 4 | """ 5 | load a library of prebuilt sequences 6 | 7 | Args: 8 | identifier: string identifier of prebuilt library. listed on usage page 9 | 10 | Returns: 11 | list of strings : seqs 12 | library of orthogonal sequences 13 | 14 | """ 15 | 16 | with resources.path("seqwalk.prebuilt_libs", identifier+".txt") as f: 17 | seqs = [s.strip() for s in open(f, "r").readlines()] 18 | return seqs 19 | 20 | def write_library(seqs, filename): 21 | """ 22 | writes a list of sequences to file. contains no information beyond sequence 23 | 24 | Args: 25 | seqs: list of strings 26 | filename: string corresponding to filename to save to 27 | Returns: 28 | None 29 | """ 30 | f = open(filename, "w+") 31 | f.writelines([s + "\n" for s in seqs]) 32 | f.close() 33 | print("File written!") 34 | -------------------------------------------------------------------------------- /src/seqwalk/prebuilt_libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ggdna/seqwalk/6ba27c831268ffd57dd1ea731945095ecdefc9ab/src/seqwalk/prebuilt_libs/__init__.py -------------------------------------------------------------------------------- /src/seqwalk/prebuilt_libs/kishi2018.txt: -------------------------------------------------------------------------------- 1 | CCAATAATA 2 | ATAAACCTA 3 | CATCATCAT 4 | CAACTTAAC 5 | TCTAAAATC 6 | AATACTCTC 7 | TTATTCACT 8 | CTTTTTTTC 9 | CCTTCTATT 10 | CTCTACTAC 11 | TAAAAACTC 12 | AACTAATCT 13 | TTTCTCTTC 14 | AACATACTA 15 | TTCATTTAC 16 | ATCCTACAA 17 | CAATCAAAA 18 | CTTACAAAC 19 | ACAAATAAC 20 | TTTTCTACC 21 | CCCTTATTT 22 | TCTTTCATT 23 | TTCTTACTC 24 | CCATAAATC 25 | CATTTATCC 26 | ATACTTCAC 27 | TACCTCTAA 28 | CTCCTATTT 29 | CTATCCAAA 30 | ATCCCTATC 31 | TCATTACTT 32 | CTAAATCTC 33 | ACTACTTTT 34 | TACTATCTC 35 | ATATCTTCC 36 | ACTAACTAT 37 | TTATCAACT 38 | TAACTTTTC 39 | TCTTTACAT 40 | CCTATACTT 41 | TTCTTCTTT 42 | TCACATAAT 43 | ATCATATCA 44 | TTTCTATCT 45 | TCCTTTTAT 46 | TCTTATACC 47 | CATATTACA 48 | TTCCTAATC 49 | TAATCTACA 50 | TAAAAATCT 51 | -------------------------------------------------------------------------------- /tests/test_seqwalk.py: -------------------------------------------------------------------------------- 1 | from seqwalk import design 2 | 3 | def rc(seq): 4 | """ 5 | reverse complement of DNA sequence 6 | 7 | Args: 8 | seq: string with letters in {A, C, G, T} 9 | 10 | Returns: 11 | string corresponding to reverse complement 12 | """ 13 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} 14 | return "".join(complement.get(base, base) for base in reversed(seq)) 15 | 16 | def check_ssm(library, k, RCfree=False): 17 | """ 18 | check a library to verify SSM is satisfied for length-k 19 | RCfree is boolean that is False if RC-tolerant 20 | returns True if SSM is satisfied 21 | """ 22 | 23 | seen_kmers = set() 24 | 25 | for seq in library: 26 | for i in range(len(seq)-k+1): 27 | 28 | if seq[i:i+k] in seen_kmers: 29 | return False 30 | seen_kmers.add(seq[i:i+k]) 31 | 32 | if RCfree: 33 | if rc(seq[i:i+k]) in seen_kmers: 34 | return False 35 | seen_kmers.add(rc(seq[i:i+k])) 36 | return True 37 | 38 | def check_pattern_free(library, pattern): 39 | 40 | for seq in library: 41 | if pattern in seq: 42 | return False 43 | return True 44 | 45 | def check_GC(library, GClims): 46 | 47 | for seq in library: 48 | GC_count = sum([i in ['G', 'C'] for i in seq]) 49 | if GC_count < GClims[0]: 50 | return False 51 | if GC_count > GClims[1]: 52 | return False 53 | return True 54 | 55 | 56 | def test_3letter_RC_tolerant(): 57 | ## test 3 letter no RCfree 58 | library = design.max_size(25, 6, alphabet="ACT") 59 | assert check_ssm(library, 6, RCfree=False), "SSM failed, 4 letter RC tolerant" 60 | library = design.max_size(25, 6, alphabet="TAC") 61 | assert check_ssm(library, 6, RCfree=False), "SSM failed, 4 letter RC tolerant" 62 | library = design.max_size(25, 6, alphabet="CAT") 63 | assert check_ssm(library, 6, RCfree=False), "SSM failed, 4 letter RC tolerant" 64 | 65 | def test_4letter_RC_tolerant(): 66 | ## test 4 letter no RCfree 67 | library = design.max_size(25, 6, alphabet="ACGT") 68 | assert check_ssm(library, 6, RCfree=False), "SSM failed, 4 letter RC tolerant" 69 | 70 | 71 | def test_3letter_RC_free(): 72 | ## test 3 letter filtered 73 | library = design.max_size(25, 4, alphabet="ACT", RCfree=True) 74 | assert check_ssm(library, 4, RCfree=True), "SSM failed, 3 letter RC free" 75 | library = design.max_size(25, 5, alphabet="ACT", RCfree=True) 76 | assert check_ssm(library, 5, RCfree=True), "SSM failed, 3 letter RC free" 77 | 78 | def test_4letter_RC_free(): 79 | library = design.max_size(25, 5, alphabet="ACTG", RCfree=True) 80 | assert check_ssm(library, 5, RCfree=True), "SSM failed, 3 letter RC free" 81 | library = design.max_size(25, 4, alphabet="ACT", RCfree=True) 82 | assert check_ssm(library, 4, RCfree=True), "SSM failed, 3 letter RC free" 83 | 84 | def test_pattern_free(): 85 | library = design.max_size(25, 5, alphabet="ACTG", RCfree=True) 86 | assert check_pattern_free(library, "AAAA"), "Failed pattern filtering" 87 | 88 | def test_GC_lims(): 89 | library = design.max_size(25, 5, alphabet="ACTG", GClims=(10, 20)) 90 | assert check_GC(library, (10, 20)), "Failed GC filtering" 91 | 92 | def test_max_orthogonality(): 93 | library = design.max_orthogonality(100, 10, RCfree=True) 94 | assert check_ssm(library, 6), "Failed max orthogonality" 95 | --------------------------------------------------------------------------------