├── .codecov.yml
├── .github
│   ├── CONTRIBUTING.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       └── CI.yaml
├── .gitignore
├── .lgtm.yml
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── devtools
│   ├── README.md
│   ├── conda-envs
│   │   └── test_env.yaml
│   ├── legacy-miniconda-setup
│   │   └── before_install.sh
│   └── scripts
│       └── create_conda_env.py
├── docs
│   ├── Makefile
│   ├── README.md
│   ├── _static
│   │   └── README.md
│   ├── _templates
│   │   └── README.md
│   ├── api.rst
│   ├── conf.py
│   ├── getting_started.rst
│   ├── index.rst
│   ├── make.bat
│   ├── predictors.rst
│   ├── requirements.txt
│   └── requirements.yaml
├── examples
│   ├── protein_example_1.py
│   └── sparrow_walk_through.ipynb
├── pyproject.toml
├── readthedocs.yml
├── setup.cfg
├── setup.py
└── sparrow
    ├── __init__.py
    ├── calculate_parameters.py
    ├── data
    │   ├── README.md
    │   ├── __init__.py
    │   ├── amino_acids.py
    │   ├── configs.py
    │   ├── elm_classes.tsv
    │   ├── look_and_say.dat
    │   └── networks
    │       ├── asphericity
    │       │   ├── README
    │       │   ├── asphericity_network_v1.pt
    │       │   └── asphericity_network_v2.pt
    │       ├── dssp
    │       │   ├── dssp_predictor_network_v1.pt
    │       │   └── dssp_predictor_network_v2.pt
    │       ├── mitochondrial_targeting
    │       │   └── mitochondrial_targeting_predictor_network_v1.pt
    │       ├── nuclear_export_signal
    │       │   └── nes_predictor_network_v1.pt
    │       ├── nuclear_import_signal
    │       │   └── nls_predictor_network_v1.pt
    │       ├── phosphorylation
    │       │   ├── ser_phosphorylation_predictor_network_v1.pt
    │       │   ├── thr_phosphorylation_predictor_network_v1.pt
    │       │   └── tyr_phosphorylation_predictor_network_v1.pt
    │       ├── prefactor
    │       │   ├── README
    │       │   ├── prefactor_network_v1.pt
    │       │   └── prefactor_network_v2.pt
    │       ├── pscore
    │       │   ├── pscore_predictor_network_v2.pt
    │       │   ├── pscore_predictor_network_v3.pt
    │       │   └── pscore_predictor_network_v4.pt
    │       ├── re
    │       │   ├── README
    │       │   ├── re_network_v1.pt
    │       │   └── re_network_v2.pt
    │       ├── rg
    │       │   ├── README
    │       │   ├── rg_network_v1.pt
    │       │   └── rg_network_v2.pt
    │       ├── scaled_re
    │       │   ├── README
    │       │   ├── scaled_re_network_v1.pt
    │       │   └── scaled_re_network_v2.pt
    │       ├── scaled_rg
    │       │   ├── README
    │       │   ├── scaled_rg_network_v1.pt
    │       │   └── scaled_rg_network_v2.pt
    │       ├── scaling_exponent
    │       │   ├── README
    │       │   ├── scaling_exponent_network_v1.5.pt
    │       │   ├── scaling_exponent_network_v1.pt
    │       │   └── scaling_exponent_network_v2.pt
    │       ├── transactivation_domains
    │       │   └── tad_predictor_network_v1.pt
    │       └── transmembrane
    │           ├── transmembrane_predictor_network_v1.pt
    │           └── transmembrane_predictor_network_v4.pt
    ├── patterning
    │   ├── __init__.py
    │   ├── iwd.pyx
    │   ├── kappa.pyx
    │   ├── patterning.pyx
    │   └── scd.pyx
    ├── polymer
    │   ├── __init__.py
    │   └── scaling_parameters.py
    ├── predictors
    │   ├── __init__.py
    │   ├── asphericity
    │   │   ├── __init__.py
    │   │   └── asphericity_predictor.py
    │   ├── batch_predict.py
    │   ├── dssp
    │   │   ├── __init__.py
    │   │   └── dssp_predictor.py
    │   ├── e2e
    │   │   ├── __init__.py
    │   │   └── end_to_end_distance_predictor.py
    │   ├── mitochondrial_targeting
    │   │   ├── __init__.py
    │   │   └── mitochondrial_targeting_predictor.py
    │   ├── nes
    │   │   ├── __init__.py
    │   │   └── nuclear_export_signal_predictor.py
    │   ├── nls
    │   │   ├── __init__.py
    │   │   └── nuclear_import_signal_predictor.py
    │   ├── phosphorylation
    │   │   ├── __init__.py
    │   │   ├── phospho_predictor_utils.py
    │   │   ├── ser_phosphorylation_predictor.py
    │   │   ├── thr_phosphorylation_predictor.py
    │   │   └── tyr_phosphorylation_predictor.py
    │   ├── predictor_template.pyXX
    │   ├── prefactor
    │   │   ├── __init__.py
    │   │   └── prefactor_predictor.py
    │   ├── pscore
    │   │   ├── __init__.py
    │   │   └── pscore_predictor.py
    │   ├── rg
    │   │   ├── __init__.py
    │   │   └── radius_of_gyration_predictor.py
    │   ├── scaled_re
    │   │   ├── __init__.py
    │   │   └── scaled_end_to_end_distance_predictor.py
    │   ├── scaled_rg
    │   │   ├── __init__.py
    │   │   └── scaled_radius_of_gyration_predictor.py
    │   ├── scaling_exponent
    │   │   ├── __init__.py
    │   │   └── scaling_exponent_predictor.py
    │   ├── tad
    │   │   ├── __init__.py
    │   │   └── transactivation_domain_predictor.py
    │   └── transmembrane
    │       ├── __init__.py
    │       └── transmembrane_predictor.py
    ├── protein.py
    ├── sequence_analysis
    │   ├── __init__.py
    │   ├── alignment.py
    │   ├── community_plugins
    │   │   └── contributed.py
    │   ├── elm.py
    │   ├── phospho_isoforms.py
    │   ├── physical_properties.py
    │   ├── plugins.py
    │   └── sequence_complexity.py
    ├── sparrow_exceptions.py
    ├── tests
    │   ├── __init__.py
    │   ├── compute_test_data.ipynb
    │   ├── generate_test_data
    │   │   ├── generate_dssp_data.ipynb
    │   │   └── helicity_class_v2_default.pickle
    │   ├── test_albatross.py
    │   ├── test_data
    │   │   ├── coil_class_v2_default_test_seqs_100.pickle
    │   │   ├── coil_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── coil_prob_v2_default_test_seqs_100.pickle
    │   │   ├── extended_class_v2_default_test_seqs_100.pickle
    │   │   ├── extended_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── extended_prob_v2_default_test_seqs_100.pickle
    │   │   ├── helicity_class_v2_default_test_seqs_100.pickle
    │   │   ├── helicity_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── helicity_prob_v2_default_test_seqs_100.pickle
    │   │   ├── test_100_asph.npy
    │   │   ├── test_100_asph_v2.npy
    │   │   ├── test_100_exponent.npy
    │   │   ├── test_100_exponent_v2.npy
    │   │   ├── test_100_prefactor.npy
    │   │   ├── test_100_prefactor_v2.npy
    │   │   ├── test_100_re.npy
    │   │   ├── test_100_re_scaled.npy
    │   │   ├── test_100_re_scaled_v2.npy
    │   │   ├── test_100_re_v2.npy
    │   │   ├── test_100_rg.npy
    │   │   ├── test_100_rg_scaled.npy
    │   │   ├── test_100_rg_scaled_v2.npy
    │   │   ├── test_100_rg_v2.npy
    │   │   ├── test_100_scd.npy
    │   │   ├── test_100_shd.npy
    │   │   ├── test_average_bivariate_inverse_distance_charge.npy
    │   │   ├── test_average_inverse_distance_ali.npy
    │   │   ├── test_average_inverse_distance_charge_neg.npy
    │   │   ├── test_average_inverse_distance_charge_pos.npy
    │   │   └── test_seqs_100.fasta
    │   ├── test_iwd.py
    │   ├── test_kappa.py
    │   ├── test_plugins.py
    │   ├── test_polymeric.py
    │   ├── test_predictor_disorder.py
    │   ├── test_predictor_dssp.py
    │   ├── test_protein.py
    │   ├── test_scd.py
    │   ├── test_sparrow.py
    │   └── test_sparrow_vs_localcider.py
    ├── tools
    │   ├── __init__.py
    │   ├── general_tools.py
    │   ├── io.py
    │   ├── track_tools.py
    │   └── utilities.py
    └── visualize
        ├── __init__.py
        └── sequence_visuals.py

/.codecov.yml:
--------------------------------------------------------------------------------
# Codecov configuration to make it a bit less noisy
coverage:
  status:
    patch: false
    project:
      default:
        threshold: 50%
comment:
  layout: "header"
  require_changes: false
  branches: null
  behavior: default
  flags: null
  paths: null
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# How to contribute

We welcome contributions from external contributors, and this document
describes how to merge code changes into sparrow.

## Getting Started

* Make sure you have a [GitHub account](https://github.com/signup/free).
* [Fork](https://help.github.com/articles/fork-a-repo/) this repository on GitHub.
* On your local machine,
  [clone](https://help.github.com/articles/cloning-a-repository/) your fork of
  the repository.

## Making Changes

* Add some really awesome code to your local fork. It's usually a [good
  idea](http://blog.jasonmeridth.com/posts/do-not-issue-pull-requests-from-your-master-branch/)
  to make changes on a
  [branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/)
  with the branch name relating to the feature you are going to add.
* When you are ready for others to examine and comment on your new feature,
  navigate to your fork of sparrow on GitHub and open a [pull
  request](https://help.github.com/articles/using-pull-requests/) (PR). Note that
  after you launch a PR from one of your fork's branches, all
  subsequent commits to that branch will be added to the open pull request
  automatically. Each commit added to the PR will be validated for
  mergeability, compilation, and test-suite compliance; the results of these tests
  will be visible on the PR page.
* If you're providing a new feature, you must add test cases and documentation.
* When the code is ready to go, make sure you run the test suite using pytest.
* When you're ready to be considered for merging, check the "Ready to go"
  box on the PR page to let the sparrow devs know that the changes are complete.
  The code will not be merged until this box is checked, the continuous
  integration returns checkmarks,
  and multiple core developers give "Approved" reviews.

# Additional Resources

* [General GitHub documentation](https://help.github.com/)
* [PR best practices](http://codeinthehole.com/writing/pull-requests-and-other-good-practices-for-teams-using-github/)
* [A guide to contributing to software packages](http://www.contribution-guide.org)
* [Thinkful PR example](http://www.thinkful.com/learn/github-pull-request-tutorial/#Time-to-Submit-Your-First-PR)
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
## Description
Provide a brief description of the PR's purpose here.

## Todos
Notable points that this PR has either accomplished or will accomplish.
- [ ] TODO 1

## Questions
- [ ] Question1

## Status
- [ ] Ready to go
--------------------------------------------------------------------------------
/.github/workflows/CI.yaml:
--------------------------------------------------------------------------------
name: CI

on:
  # GitHub has started calling new repo's first branch "main" https://github.com/github/renaming
  # Existing codes likely still have "master" as the primary branch
  # Both are tracked here to keep legacy and new codes working
  push:
    branches:
      - "master"
      - "main"
  pull_request:
    branches:
      - "master"
      - "main"
  schedule:
    # Nightly tests run on master by default:
    #   Scheduled workflows run on the latest commit on the default or base branch.
    #   (from https://help.github.com/en/actions/reference/events-that-trigger-workflows#scheduled-events-schedule)
    - cron: "0 0 * * *"

jobs:
  test:
    name: Test on ${{ matrix.os }}, Python ${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [macOS-latest, ubuntu-latest, windows-latest]
        python-version: [3.7, 3.8, 3.9]

    steps:
      - uses: actions/checkout@v1

      - name: Additional info about the build
        shell: bash
        run: |
          uname -a
          df -h
          ulimit -a


      # More info on options: https://github.com/conda-incubator/setup-miniconda
      - uses: conda-incubator/setup-miniconda@v2
        with:
          python-version: ${{ matrix.python-version }}
          environment-file: devtools/conda-envs/test_env.yaml

          channels: conda-forge,defaults

          activate-environment: test
          auto-update-conda: false
          auto-activate-base: false
          show-channel-urls: true

      - name: Install package

        # conda setup requires this special shell
        shell: bash -l {0}
        run: |
          python -m pip install . --no-deps
          conda list


      - name: Run tests

        # conda setup requires this special shell
        shell: bash -l {0}

        run: |
          pytest -v --cov=sparrow --cov-report=xml --color=yes sparrow/tests/

      - name: CodeCov
        uses: codecov/codecov-action@v1
        with:
          file: ./coverage.xml
          flags: unittests
          name: codecov-${{ matrix.os }}-py${{ matrix.python-version }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
sparrow/patterning.html
.DS_Store
sparrow/_version.py

# C extensions
*.so
*~
*.c
\#*
\.#*

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
.pytest_cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# profraw files from LLVM? Unclear exactly what triggers this
# There are reports this comes from LLVM profiling, but also Xcode 9.
*profraw

# pytorch weights
# *pt
--------------------------------------------------------------------------------
/.lgtm.yml:
--------------------------------------------------------------------------------
# Configure LGTM for this package

extraction:
  python:  # Configure Python
    python_setup:  # Configure the setup
      version: 3  # Specify Version 3
path_classifiers:
  library:
    - devtools/*
  generated:
    - sparrow/_version.py
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
# File: .readthedocs.yaml

version: 2

# Specify the Python version and requirements file
python:
  install:
    - requirements: docs/requirements.txt

# Use the "readthedocs" environment to ensure all dependencies are installed before building
build:
  os: ubuntu-20.04
  tools:
    python: "3.9"

# Sphinx configuration
sphinx:
  configuration: docs/conf.py
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age,
body size, disability, ethnicity, gender identity and expression, level of
experience, nationality, personal appearance, race, religion, or sexual
identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

Moreover, project maintainers will strive to offer feedback and advice to
ensure quality and consistency of contributions to the code. Contributions
from outside the group of project maintainers are strongly welcomed but the
final decision as to whether commits are merged into the codebase rests with
the team of project maintainers.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an
appointed representative at an online or offline event. Representation of a
project may be further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at 'alex.holehouse@wustl.edu'. The project team will
review and investigate all complaints, and will respond in a way that it deems
appropriate to the circumstances. The project team is obligated to maintain
confidentiality with regard to the reporter of an incident. Further details of
specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 1.4, available at
[http://contributor-covenant.org/version/1/4][version]

[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

MIT License

Copyright (c) 2023 Jeffrey Lotthammer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include MANIFEST.in
include CODE_OF_CONDUCT.md

graft sparrow
global-exclude *.py[cod] __pycache__ *.so

recursive-include sparrow/data *

--------------------------------------------------------------------------------
/devtools/README.md:
--------------------------------------------------------------------------------
# Development, testing, and deployment tools

This directory contains a collection of tools for running Continuous Integration (CI) tests,
conda installation, and other development tools not directly related to the coding process.


## Manifest

### Continuous Integration

You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and
Windows testing if you only plan to deploy on specific platforms. These are just to help you get started.

The items in this directory have been left for legacy purposes since the change to GitHub Actions;
they will likely be removed in a future version.

* `legacy-miniconda-setup`: A preserved copy of a helper directory which made Linux and OSX based testing through [Travis-CI](https://about.travis-ci.com/) simpler
  * `before_install.sh`: Pip/Miniconda pre-package installation script for Travis. No longer needed thanks to
    [GitHub Actions](https://docs.github.com/en/free-pro-team@latest/actions) and the [conda-incubator/setup-miniconda Action](https://github.com/conda-incubator/setup-miniconda)

### Conda Environment:

This directory contains the files to set up the Conda environment for testing purposes

* `conda-envs`: directory containing the YAML file(s) which fully describe Conda environments, their dependencies, and those dependencies' provenances
  * `test_env.yaml`: Simple test environment file with base dependencies. Channels are not specified here and therefore respect global Conda configuration

### Additional Scripts:

This directory contains OS-agnostic helper scripts which don't fall in any of the previous categories
* `scripts`
  * `create_conda_env.py`: Helper program for spinning up new conda environments based on a starter file, with Python Version and Env. Name command-line options


## How to contribute changes
- Clone the repository if you have write access to the main repo; fork the repository if you are a collaborator.
- Make a new branch with `git checkout -b {your branch name}`
- Make changes and test your code
- Ensure that the test environment dependencies (`conda-envs`) line up with the build and deploy dependencies (`conda-recipe/meta.yaml`)
- Push the branch to the repo (either the main or your fork) with `git push -u origin {your branch name}`
  * Note that `origin` is the default name assigned to the remote; yours may be different
- Make a PR on GitHub with your changes
- We'll review the changes and get your code into the repo after lively discussion!


## Checklist for updates
- [ ] Make sure there is an/are issue(s) opened for your specific update
- [ ] Create the PR, referencing the issue
- [ ] Debug the PR as needed until tests pass
- [ ] Tag the final, debugged version
  * `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags`
- [ ] Get the PR merged in
--------------------------------------------------------------------------------
/devtools/conda-envs/test_env.yaml:
--------------------------------------------------------------------------------
name: test
channels:

  - conda-forge

  - defaults
dependencies:
  # Base depends
  - python
  - pip

  # Testing
  - pytest
  - pytest-cov
  - codecov

  # Pip-only installs
  #- pip:
  #  - codecov

--------------------------------------------------------------------------------
/devtools/legacy-miniconda-setup/before_install.sh:
--------------------------------------------------------------------------------
# Temporarily change directory to $HOME to install software
pushd .
cd $HOME
# Make sure some level of pip is installed
python -m ensurepip

# Install Miniconda
if [ "$TRAVIS_OS_NAME" == "osx" ]; then
    # Make OSX md5 mimic md5sum from linux, alias does not work
    md5sum () {
        command md5 -r "$@"
    }
    MINICONDA=Miniconda3-latest-MacOSX-x86_64.sh
else
    MINICONDA=Miniconda3-latest-Linux-x86_64.sh
fi
MINICONDA_HOME=$HOME/miniconda
MINICONDA_MD5=$(wget -qO- https://repo.anaconda.com/miniconda/ | grep -A3 $MINICONDA | sed -n '4p' | sed -n 's/ *<td>\(.*\)<\/td> */\1/p')
wget -q https://repo.anaconda.com/miniconda/$MINICONDA
if [[ $MINICONDA_MD5 != $(md5sum $MINICONDA | cut -d ' ' -f 1) ]]; then
    echo "Miniconda MD5 mismatch"
    exit 1
fi
bash $MINICONDA -b -p $MINICONDA_HOME

# Configure miniconda
export PIP_ARGS="-U"
# New to conda >=4.4
echo ". $MINICONDA_HOME/etc/profile.d/conda.sh" >> ~/.bashrc  # Source the profile.d file
echo "conda activate" >> ~/.bashrc  # Activate conda
source ~/.bashrc  # source file to get new commands
#export PATH=$MINICONDA_HOME/bin:$PATH  # Old way, should not be needed anymore

conda config --add channels conda-forge

conda config --set always_yes yes
conda install conda conda-build jinja2 anaconda-client
conda update --quiet --all

# Restore original directory
popd
--------------------------------------------------------------------------------
/devtools/scripts/create_conda_env.py:
--------------------------------------------------------------------------------
import argparse
import os
import re
import glob
import shutil
import subprocess as sp
from tempfile import TemporaryDirectory
from contextlib import contextmanager
# YAML imports
try:
    import yaml  # PyYAML
    loader = yaml.load
except ImportError:
    try:
        import ruamel_yaml as yaml  # Ruamel YAML
    except ImportError:
        try:
            # Load Ruamel YAML from the base conda environment
            from importlib import util as import_util
            CONDA_BIN = os.path.dirname(os.environ['CONDA_EXE'])
            ruamel_yaml_path = glob.glob(os.path.join(CONDA_BIN, '..',
                                                      'lib', 'python*.*', 'site-packages',
                                                      'ruamel_yaml', '__init__.py'))[0]
            # Based on importlib example, but only needs to load_module since it's the whole package, not just
            # a module
            spec = import_util.spec_from_file_location('ruamel_yaml', ruamel_yaml_path)
            yaml = spec.loader.load_module()
        except (KeyError, ImportError, IndexError):
            raise ImportError("No YAML parser could be found in this or the conda environment. "
                              "Could not find PyYAML or Ruamel YAML in the current environment, "
                              "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. "
                              "Environment not created!")
    loader = yaml.YAML(typ="safe").load  # typ="safe" avoids odd typing on output


@contextmanager
def temp_cd():
    """Temporary CD Helper"""
    cwd = os.getcwd()
    with TemporaryDirectory() as td:
        try:
            os.chdir(td)
            yield
        finally:
            os.chdir(cwd)


# Args
parser = argparse.ArgumentParser(description='Creates a conda environment from file for a given Python version.')
parser.add_argument('-n', '--name', type=str,
                    help='The name of the created Python environment')
parser.add_argument('-p', '--python', type=str,
                    help='The version of the created Python environment')
parser.add_argument('conda_file',
                    help='The file for the created Python environment')

args = parser.parse_args()

# Open the base file
with open(args.conda_file, "r") as handle:
    yaml_script = loader(handle.read())

python_replacement_string = "python {}*".format(args.python)

try:
    for dep_index, dep_value in enumerate(yaml_script['dependencies']):
        if re.match('python([ ><=*]+[0-9.*]*)?$', dep_value):  # Match explicitly 'python' and its formats
            yaml_script['dependencies'].pop(dep_index)
            break  # Making the assumption there is only one Python entry, also avoids need to enumerate in reverse
except (KeyError, TypeError):
    # Case of no dependencies key, or dependencies: None
    yaml_script['dependencies'] = []
finally:
    # Ensure the python version is added in. Even if the code does not need it, we assume the env does
    yaml_script['dependencies'].insert(0, python_replacement_string)

# Figure out conda path
if "CONDA_EXE" in os.environ:
    conda_path = os.environ["CONDA_EXE"]
else:
    conda_path = shutil.which("conda")
if conda_path is None:
    raise RuntimeError("Could not find a conda binary in CONDA_EXE variable or in executable search path")

print("CONDA ENV NAME  {}".format(args.name))
print("PYTHON VERSION  {}".format(args.python))
print("CONDA FILE NAME {}".format(args.conda_file))
print("CONDA PATH      {}".format(conda_path))

# Write to a temp directory which will always be cleaned up
with temp_cd():
    temp_file_name = "temp_script.yaml"
    with open(temp_file_name, 'w') as f:
        f.write(yaml.dump(yaml_script))
    sp.call("{} env create -n {} -f {}".format(conda_path, args.name, temp_file_name), shell=True)
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = sparrow
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
# Compiling sparrow's Documentation

The docs for this project are built with [Sphinx](http://www.sphinx-doc.org/en/master/).
To compile the docs, first ensure that Sphinx and the ReadTheDocs theme are installed.


```bash
conda install sphinx sphinx_rtd_theme
```


Once installed, you can use the `Makefile` in this directory to compile static HTML pages by
```bash
make html
```

The compiled docs will be in the `_build` directory and can be viewed by opening `index.html` (which may itself
be inside a directory called `html/` depending on what version of Sphinx is installed).


A configuration file for [Read The Docs](https://readthedocs.org/) (`readthedocs.yml`) is included in the top level of the repository. To use Read the Docs to host your documentation, go to https://readthedocs.org/ and connect this repository. You may need to change your default branch to `main` under Advanced Settings for the project.

If you would like to use Read The Docs with `autodoc` (included automatically) and your package has dependencies, you will need to include those dependencies in your documentation yaml file (`docs/requirements.yaml`).

--------------------------------------------------------------------------------
/docs/_static/README.md:
--------------------------------------------------------------------------------
# Static Doc Directory

Add any paths that contain custom static files (such as style sheets) here,
relative to the `conf.py` file's directory.
They are copied after the builtin static files,
so a file named "default.css" will overwrite the builtin "default.css".

The path to this folder is set in the Sphinx `conf.py` file in the line:
```python
html_static_path = ['_static']
```

## Examples of files to add to this directory
* Custom Cascading Style Sheets
* Custom JavaScript code
* Static logo images
--------------------------------------------------------------------------------
/docs/_templates/README.md:
--------------------------------------------------------------------------------
# Templates Doc Directory

Add any paths that contain templates here, relative to
the `conf.py` file's directory.
They are copied after the builtin template files,
so a file named "page.html" will overwrite the builtin "page.html".

The path to this folder is set in the Sphinx `conf.py` file in the line:
```python
templates_path = ['_templates']
```

## Examples of files to add to this directory
* HTML extensions of stock pages like `page.html` or `layout.html`
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
API Documentation
=================

.. autosummary::
   :toctree: autosummary

   sparrow.canvas
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/stable/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

# In case the project was not installed
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

import sparrow


# -- Project information -----------------------------------------------------

project = 'sparrow'
copyright = ("2020, Alex Holehouse. Project structure based on the "
             "Computational Molecular Science Python Cookiecutter version 1.5")
author = 'Alex Holehouse'

# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = ''


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autosummary',
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx.ext.viewcode',
    'sphinx.ext.napoleon',
    'sphinx.ext.intersphinx',
    'sphinx.ext.extlinks',
]

autosummary_generate = True
napoleon_google_docstring = False
napoleon_use_param = False
napoleon_use_ivar = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'default'


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself.  Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'sparrowdoc'


# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'sparrow.tex', 'sparrow Documentation',
     'sparrow', 'manual'),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'sparrow', 'sparrow Documentation',
     [author], 1)
]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'sparrow', 'sparrow Documentation',
     author, 'sparrow', 'Next generation package for sequence parameter calculation',
     'Miscellaneous'),
]


# -- Extension configuration -------------------------------------------------
--------------------------------------------------------------------------------
/docs/getting_started.rst:
--------------------------------------------------------------------------------
Getting Started
===============

This page details how to get started with sparrow.
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
.. sparrow documentation master file, created by
   sphinx-quickstart on Thu Mar 15 13:55:56 2018.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to sparrow's documentation!
=========================================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   getting_started
   api
   predictors



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=sparrow

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd
--------------------------------------------------------------------------------
/docs/predictors.rst:
--------------------------------------------------------------------------------
Predictors
=================

sparrow implements a set of different sequence-based predictors in a modular, extendable way that enables additional predictors to be easily added.


Creating new predictors with PARROT
--------------------------------------
The guide below assumes you have cloned the git repository of sparrow, created a new branch to add your new predictor to, and have switched into that branch to begin work. As a reminder, when adding new features in Git, the general workflow is:

1. Clone the current up-to-date version
2. Create a new branch (this is a separate version where you can work in peace, but if new features are added to the main branch you can update your branch as you go)
3. Add in your snazzy new feature
4. Once complete, make a pull request to merge your branch back into the main branch.

This guide assumes these ideas are clear, and specifically provides insight into the details of implementing a new predictor in sparrow, focusing here on using PARROT to train that predictor.


**Step 1: Train a predictor with PARROT**

The first step in adding a new PARROT-based predictor is to use PARROT to train your model. The details of how one does this go beyond the scope of this documentation, but once trained you should be left with a Torch parameter file (a ``.pt`` file). This is the file we're going to use with SPARROW to add our custom predictor. Let's call this parameter file ``new_predictor.pt`` to make this concrete.

Note that the PARROT network should be trained in ``residues`` mode - i.e. we need to receive one value per residue.


**Step 2: Copy the parameter file into SPARROW**

Next we take ``new_predictor.pt`` and we're going to copy it into sparrow. Specifically, this trained network should be placed under::

    sparrow/data/networks/predictor

and MUST follow the naming convention ``<predictor_name>_network_v<version>.pt``. Note there that:

* ``<predictor_name>`` should be a single word, or words connected by underscores, all lower case, that we will use as the function name to call the predictor. For example, *disorder*, *dssp* or *transmembrane* are good examples. Keep this simple, but it should be clear and unambiguous.
* ``<version>`` here is the specific version of this network. It is possible that your network may be retrained later, and as such we want to enable future sparrow users to select specific network versions, although of course the predictors should default to the most recent version. This ability to select specific network versions is built into the standard predictor template code.

As an example, our transmembrane predictor has the format::

    transmembrane_predictor_network_v4.pt


**Step 3: Build a predictor class which performs the prediction**

The next step is to build a stand-alone predictor class which reads in this network file and enables the return of the per-residue prediction implemented therein.
This file should be created in a new package (i.e. a directory with an ``__init__.py``) in the::

    sparrow/predictors

directory, and this file should be called ``<predictor_name>_predictor.py``.

As a specific example, our transmembrane predictor is implemented in::

    sparrow/predictors/transmembrane

and within this directory there are two files::

    __init__.py                 # this is needed so we can import the predictor
    transmembrane_predictor.py  # this is where the predictor is implemented

The reason to make a separate package (directory) for every predictor is that if someone has a non-PARROT-based predictor they want to incorporate into sparrow, (1) this is absolutely welcome and (2) we want to provide a consistent file ecosystem where they have a directory in which to implement as much/little additional code as they want. As such, the ``__init__.py`` and ``<predictor_name>_predictor.py`` are the **minimum** files needed, but you are free to add anything else as well.

``__init__.py`` should probably just be empty - it's what tells Python that this directory is a package.

``<predictor_name>_predictor.py`` should NOT be empty, but should be based on the template file found under ``sparrow/predictors/predictor_template.py``. The template is RELATIVELY simple, but provides code for reading in a PARROT-trained network and performing a prediction. You could re-implement this yourself if you really wanted, but, assuming you're using one-hot encoding on the trained network, this code should work out of the box. The template itself walks through the various small configuration tweaks needed to make this work with your specific network of interest. Note that for classification vs. regression there are some small differences, but the template file provides code for both, so just delete/comment out the irrelevant lines (these are clearly marked).

Once this is done, it's worth seeing if you can import and run predictions using this class/function as a stand-alone predictor, i.e. you should be able to do::


    from sparrow.predictors.<predictor_name>.<predictor_name>_predictor import Predictor

    sequence = 'MSAAVTAGKLARAPADPGKAGVPGVAAPGAPAAAPPAKEIPEVLVDPRSRRRYVRGRFLG'
    P = Predictor()
    P.predict_<predictor_name>(sequence)


and it returns a set of values.
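
For orientation, the sketch below shows the two ingredients such a predictor class combines: one-hot encoding of the sequence, and a forward pass through a network that emits one value per residue. This is purely illustrative - the class name, the ``predict_my_property`` method, and the randomly initialized LSTM are hypothetical stand-ins, whereas the real template loads your trained PARROT network from its ``.pt`` file::

    import torch

    VALID_AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'

    def one_hot_encode(sequence):
        # build a (1, len(sequence), 20) tensor: one row per residue,
        # one column per amino acid
        idx = torch.tensor([VALID_AMINO_ACIDS.index(r) for r in sequence])
        return torch.nn.functional.one_hot(idx, num_classes=20).float().unsqueeze(0)

    class Predictor:
        def __init__(self, version=1):
            # the real template torch.load()s the versioned network file from
            # sparrow/data/networks/<predictor_name>/; a toy LSTM stands in here
            self.network = torch.nn.LSTM(20, 1, batch_first=True)

        def predict_my_property(self, sequence):
            # returns one float per residue in the input sequence
            with torch.no_grad():
                per_residue, _ = self.network(one_hot_encode(sequence))
            return per_residue.squeeze(-1).squeeze(0).tolist()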


**Step 4: Integrate the predictor into the sparrow.Predictor class**

At this stage we have a working predictor - the last step is to connect this predictor to the sparrow Protein object in a way that incurs minimal computational overhead if not used, is syntactically simple, and offers functionality like other Protein analysis functions and properties.

This is achieved by adding a function to the ``sparrow.predictors.Predictor`` class, a class implemented in ``sparrow/predictors/__init__.py``.

This class generates an object which is accessible in the Protein object under the ``.predictor`` dot operator. As such, functions defined in the ``sparrow.predictors.Predictor`` class are then accessible as::

    seq = 'MSAAVTAGKLARAPADPGKAGVPGVAAPGAPAAAPPAKEIPE'
    p = Protein(seq)

    p.predictor.<predictor_name>()


As such, to finally make a new predictor accessible, the ``sparrow.predictors.Predictor`` class should be edited to add a new function which is simply the name of the prediction (e.g. ``dssp``, ``transmembrane`` etc.). This function should do three things:

1. It should, UPON BEING CALLED, import the predictor package you just created.
2. It should then perform the prediction on the underlying protein sequence.
3. It should (ideally) memoize the outcome into a local dictionary, so that if the same prediction is requested again it is simply looked up rather than recomputed (see the sketch below).

Rather than going into the details here, the underlying code and examples should make this clear. Notably, see ``dssp()`` and ``transmembrane_regions()`` for good examples of PARROT-based predictors. One important thing is to document these predictors clearly.
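
To make the memoization pattern concrete, here is a minimal sketch of what such a function can look like. The predictor name, import path, and attribute names are placeholders rather than sparrow's actual internals - consult the real implementations in ``sparrow/predictors/__init__.py``::

    class Predictor:

        def __init__(self, protein):
            self.__protein = protein
            self.__precomputed = {}

        def my_property(self, recompute=False):
            selector = 'my_property'
            if selector not in self.__precomputed or recompute:
                # deferred import: the torch/network start-up cost is only
                # paid if this predictor is actually used
                from .my_property.my_property_predictor import Predictor as MyPropertyPredictor
                self.__precomputed[selector] = MyPropertyPredictor().predict_my_property(self.__protein.sequence)
            return self.__precomputed[selector]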



--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
versioningit
sphinx_rtd_theme
--------------------------------------------------------------------------------
/docs/requirements.yaml:
--------------------------------------------------------------------------------
name: docs
channels:
dependencies:
  # Base depends
  - python
  - pip



  # Pip-only installs
  #- pip:

--------------------------------------------------------------------------------
/examples/protein_example_1.py:
--------------------------------------------------------------------------------
from sparrow.protein import Protein

P = Protein('MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP')

print('Demo 1')
print(P)
print(f"sparrow makes the most of Python's syntactic sugar - e.g., the len() operator returns the sequence length: {len(P)}")
print(P.predictor.disorder())
print(P.FCR)
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]

# delete cython/numpy if not needed
requires = ["setuptools>=61", "versioningit~=2.0", "cython", "numpy", ]
build-backend = "setuptools.build_meta"


# define project info
[project]
name = "sparrow"
dynamic = ["version"]
description = "Next generation sequence analysis package for working with disordered regions and disordered proteins"
authors = [
    {name = "Alex Holehouse", email = "alex.holehouse@wustl.edu"}
]
license = {text = "CC-NC-ND"}
readme = "README.md"
requires-python = ">=3.7"

# add in as needed
dependencies = [
    "numpy>=1.14.0,<2.0",
    "scipy",
    "cython",
    "protfasta",
    "metapredict>2",
    "ipython",
    "idptools-parrot @ git+https://git@github.com/idptools/parrot.git",
    "afrc",
    "tqdm",
    "pyfamsa",
]

[project.optional-dependencies]
test = [
    "pytest>=6.1.2",
]


[tool.setuptools]
zip-safe = false
include-package-data = true

[tool.setuptools.packages.find]
namespaces = true
where = ["."]
include = ["sparrow", "sparrow.*"]  # Discover all sub-packages inside the main package

[tool.setuptools.package-data]
sparrow = [
    "py.typed"
]

[tool.versioningit]
default-version = "1+unknown"

[tool.versioningit.format]
distance = "{base_version}+{distance}.{vcs}{rev}"
dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"
distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"

[tool.versioningit.vcs]
# The method key:
method = "git"  # <- The method name
# Parameters to pass to the method:
match = ["*"]
default-tag = "1.0.0"

[tool.versioningit.write]
file = "sparrow/_version.py"
--------------------------------------------------------------------------------
/readthedocs.yml:
--------------------------------------------------------------------------------
# readthedocs.yml

version: 2

build:
  image: latest

python:
  version: 3.8
  install:
    - method: pip
      path: .

conda:
  environment: docs/requirements.yaml
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[coverage:run]
# .coveragerc to control coverage.py and pytest-cov
omit =
    # Omit the tests
    */tests/*
    # Omit generated versioningit
    sparrow/_version.py

# define consistent style
[yapf]
COLUMN_LIMIT = 119
INDENT_WIDTH = 4
USE_TABS = False

# define consistent style
[flake8]
max-line-length = 119

# means we can run python setup.py test to
# run tests... maybe...
[aliases]
test = pytest
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""
sparrow
Next generation package for sequence parameter calculation
"""
from setuptools import setup, Extension, find_packages
from Cython.Build import cythonize
import os
import numpy

# defines the absolute path of where your cython files are
cython_dir = os.path.join("sparrow", "patterning")

# build a list of the files
cython_files = [os.path.join(cython_dir, f) for f in os.listdir(cython_dir) if f.endswith('.pyx')]


extensions = [
    Extension(
        name=f"sparrow.patterning.{os.path.splitext(os.path.basename(file))[0]}",
        sources=[file],
        include_dirs=[numpy.get_include()],
    ) for file in cython_files
]

setup(
    ext_modules=cythonize(extensions, compiler_directives={'language_level': "3"}),
    packages=find_packages(),
    include_package_data=True,
)
--------------------------------------------------------------------------------
/sparrow/__init__.py:
--------------------------------------------------------------------------------
"""
sparrow
Next generation package for sequence parameter calculation
"""

import os

# Add imports here
from sparrow.tools import io
from sparrow.protein import Protein
from sparrow.tools.io import read_fasta


# Generate _version.py if missing and in the Read the Docs environment
if os.getenv("READTHEDOCS") == "True" and not os.path.isfile('../sparrow/_version.py'):
    import versioningit
    __version__ = versioningit.get_version('../')
else:
    from ._version import __version__

# code that allows access to the data directory
_ROOT = os.path.abspath(os.path.dirname(__file__))
def get_data(path):
    return os.path.join(_ROOT, 'data', path)

--------------------------------------------------------------------------------
/sparrow/calculate_parameters.py:
--------------------------------------------------------------------------------
from sparrow.data import amino_acids
import numpy as np
import math
from . import sparrow_exceptions

# .................................................................
#
def calculate_aa_fractions(s):
    """
    Standalone function that computes amino-acid fractions for
    a given sequence.

    Parameters:
    --------------
    s : str
        Amino acid sequence

    Returns
    ---------------
    dict
        Returns a dictionary mapping each amino acid to its fraction
        in the sequence

    """

    aa_dict = {}
    for i in amino_acids.VALID_AMINO_ACIDS:
        aa_dict[i] = 0

    for i in s:
        aa_dict[i] = aa_dict[i] + 1


    len_s = len(s)
    for i in amino_acids.VALID_AMINO_ACIDS:
        aa_dict[i] = aa_dict[i]/len_s

    return aa_dict



def calculate_seg_complexity(s, alphabet=amino_acids.VALID_AMINO_ACIDS):
    """
    Function to calculate the Wootton-Federhen complexity of a sequence (also
    called SEG complexity, as this is the theory used in the classic SEG
    algorithm). Complexity is computed as -sum(p * log_N(p)) over the residue
    fractions p, where N is the alphabet size.

    Parameters
    -----------
    s : str
        Amino acid sequence

    alphabet : list
        List of amino acids found in alphabet. Note this does not sanity check in the
        case of non-standard amino acids. Default is the standard 20 amino acids

    Returns
    ----------
    float
        Returns a float that corresponds to the compositional complexity associated with
        the passed sequence.

    """

    alphabet_size = len(alphabet)
    seq_len = len(s)

    complexity = 0
    for a in alphabet:
        p = s.count(a)/seq_len

        if p > 0:
            complexity = p * math.log(p, alphabet_size) + complexity

    return -complexity



# .................................................................
#
def calculate_hydrophobicity(s, mode='KD', normalize=False):
    """
    Standalone function that computes the mean hydrophobicity of a sequence.

    Parameters:
    --------------
    s : str
        Amino acid sequence

    mode : str
        Hydrophobicity mode to be used. Currently only KD supported
        but can be expanded. Allowed values: 'KD'

    normalize : Bool
        If set to True hydrophobicity scales are normalized to be between 0
        and 1. Default = False.

    Returns
    ---------------
    Float
        Returns a floating point value with the mean hydrophobicity
        as defined based on the passed scale

    """
    return np.mean(calculate_linear_hydrophobicity(s, mode, normalize))


# .................................................................
#
def calculate_linear_hydrophobicity(s, mode='KD', normalize=False):
    """
    Compute linear hydrophobicity from sequence using one of several possible
    hydrophobicity scales.

    By default this is Kyte-Doolittle, but we'll add additional scales
    as/when needed.

    Parameters:
    --------------
    s : str
        Amino acid sequence

    mode : str
        Selector for hydrophobicity table. Options available are:

        'KD' | Kyte-Doolittle

    normalize : bool
        Boolean that means hydrophobicity scales operate on a normalized
        dynamic range of 0 to 1

    Returns:
    ------------
    list
        List of values that correspond to per-residue hydrophobicity based on
        a given hydrophobicity scale.
135 | 136 | """ 137 | 138 | if mode == 'KD': 139 | try: 140 | if normalize: 141 | return [amino_acids.AA_hydro_KD_normalized[r] for r in s] 142 | else: 143 | return [amino_acids.AA_hydro_KD[r] for r in s] 144 | except KeyError: 145 | raise sparrow_exceptions.CalculationException('Invalid residue found in %s' %(s)) 146 | else: 147 | raise sparrow_exceptions.CalculationException('Invalid mode passed: %s' %(mode)) 148 | -------------------------------------------------------------------------------- /sparrow/data/README.md: -------------------------------------------------------------------------------- 1 | # Sample Package Data 2 | 3 | This directory contains sample additional data you may want to include with your package. 4 | This is a place where non-code additional information (such as data files, molecular structures, etc.) can 5 | go that you want to ship alongside your code. 6 | 7 | Please note that it is not recommended to place large files in your git directory. If your project requires files larger 8 | than a few megabytes in size it is recommended to host these files elsewhere. This is especially true for binary files, 9 | as `git` cannot store incremental updates to these files and will store a complete copy of every version 10 | in your `git` history, which can quickly add up. As a note, most `git` hosting services like GitHub have a 1 GB per-repository 11 | cap. 12 | 13 | ## Including package data 14 | 15 | Modify your package's `setup.py` file and the `setup()` command. Include the 16 | [`package_data`](http://setuptools.readthedocs.io/en/latest/setuptools.html#basic-use) keyword and point it at the 17 | correct files. 18 | 19 | ## Manifest 20 | 21 | * `look_and_say.dat`: first entries of the "Look and Say" integer series, sequence [A005150](https://oeis.org/A005150) 22 | -------------------------------------------------------------------------------- /sparrow/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import configs # import general configurations 2 | from .
import amino_acids # import residue-specific amino acid data 3 | 4 | -------------------------------------------------------------------------------- /sparrow/data/amino_acids.py: -------------------------------------------------------------------------------- 1 | ## 2 | ## Data on individual amino acids 3 | ## 4 | ## 5 | 6 | VALID_AMINO_ACIDS = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'] 7 | VALID_AMINO_ACIDS_PHYS = ['W','Y','F','H','Q','N','T','S','C','G','A','I','L','V','M','E','D','K','R','P'] 8 | 9 | 10 | ARO = ['Y','W','F'] 11 | ALI = ['A','L','M','I','V'] 12 | POLAR = ['Q','N','S','T','H','G'] 13 | CHARGE = ['E','D','R','K'] 14 | POS = ['R','K'] 15 | NEG = ['E','D'] 16 | 17 | AA_THREE_TO_ONE = {'ALA':'A', 18 | 'CYS':'C', 19 | 'ASP':'D', 20 | 'GLU':'E', 21 | 'PHE':'F', 22 | 'GLY':'G', 23 | 'HIS':'H', 24 | 'ILE':'I', 25 | 'LYS':'K', 26 | 'LEU':'L', 27 | 'MET':'M', 28 | 'ASN':'N', 29 | 'PRO':'P', 30 | 'GLN':'Q', 31 | 'ARG':'R', 32 | 'SER':'S', 33 | 'THR':'T', 34 | 'VAL':'V', 35 | 'TRP':'W', 36 | 'TYR':'Y'} 37 | 38 | AA_ONE_TO_THREE = {} 39 | for x in AA_THREE_TO_ONE: 40 | AA_ONE_TO_THREE[AA_THREE_TO_ONE[x]] = x 41 | 42 | 43 | # acetyl groups have 1C-2O, 4H prior to 44 | # peptide bond formation 45 | # 46 | AA_MOLECULAR_WEIGHT = {'A': 89.1, 47 | 'C': 121.2, 48 | 'D': 133.1, 49 | 'E': 147.1, 50 | 'F': 165.2, 51 | 'G': 75.1, 52 | 'H': 155.2, 53 | 'I': 131.2, 54 | 'K': 146.2, 55 | 'L': 130.2, 56 | 'M': 149.2, 57 | 'N': 132.1, 58 | 'P': 115.1, 59 | 'Q': 146.2, 60 | 'R': 174.2, 61 | 'S': 105.1, 62 | 'T': 119.1, 63 | 'V': 117.1, 64 | 'W': 204.2, 65 | 'Y': 181.2, 66 | '<': 48, 67 | '>': 48} 68 | 69 | 70 | AA_COLOR = {'Y':'#ff9d00', 71 | 'W':'#ff9d00', 72 | 'F':'#ff9d00', 73 | 'A':'#171616', 74 | 'L':'#171616', 75 | 'M':'#171616', 76 | 'I':'#171616', 77 | 'V':'#171616', 78 | 'Q':'#04700d', 79 | 'N':'#04700d', 80 | 'S':'#04700d', 81 | 'T':'#04700d', 82 | 'H':'#04700d', 83 | 'G':'#04700d', 84 | 'E':'#ff0d0d', 85 | 'D':'#ff0d0d', 86 | 'R':'#2900f5', 87 | 'K':'#2900f5', 88 | 'C':'#ffe70d', 89 | 'P':'#cf30b7'} 90 | 91 | 92 | # KYTE-DOOLITTLE SCALES 93 | # References 94 | # A simple method for displaying the hydropathic character of a protein. 95 | # Kyte J, Doolittle RF. J Mol Biol. 1982 May 5;157(1):105-32. 96 | # Why are "natively unfolded" proteins unstructured under physiological conditions? 97 | # Vladimir N. Uversky, Joel R. Gillespie, and Anthony L.
Fink 98 | Proteins: Structure, Function, and Genetics 41:415-427 (2000) 99 | # Main hydrophobicity scale 100 | 101 | AA_hydro_KD = {"A": 6.3, 102 | "R": 0.0, 103 | "N": 1.0, 104 | "D": 1.0, 105 | "C": 7.0, 106 | "Q": 1.0, 107 | "E": 1.0, 108 | "G": 4.1, 109 | "H": 1.3, 110 | "I": 9.0, 111 | "L": 8.3, 112 | "K": 0.6, 113 | "M": 6.4, 114 | "F": 7.3, 115 | "P": 2.9, 116 | "S": 3.7, 117 | "T": 3.8, 118 | "W": 3.6, 119 | "Y": 3.2, 120 | "V": 8.7} 121 | 122 | AA_hydro_KD_normalized = {'A': 0.7, 123 | 'R': 0.0, 124 | 'N': 0.111, 125 | 'D': 0.111, 126 | 'C': 0.778, 127 | 'Q': 0.111, 128 | 'E': 0.111, 129 | 'G': 0.456, 130 | 'H': 0.144, 131 | 'I': 1.0, 132 | 'L': 0.922, 133 | 'K': 0.067, 134 | 'M': 0.711, 135 | 'F': 0.811, 136 | 'P': 0.322, 137 | 'S': 0.411, 138 | 'T': 0.422, 139 | 'W': 0.4, 140 | 'Y': 0.356, 141 | 'V': 0.967} 142 | 143 | -------------------------------------------------------------------------------- /sparrow/data/configs.py: -------------------------------------------------------------------------------- 1 | DISORDER_THRESHOLD = 0.7 2 | MIN_LENGTH_ALBATROSS_RE_RG = 35 3 | -------------------------------------------------------------------------------- /sparrow/data/look_and_say.dat: -------------------------------------------------------------------------------- 1 | 1 2 | 11 3 | 21 4 | 1211 5 | 111221 6 | 312211 7 | 13112221 8 | 1113213211 9 | 31131211131221 10 | 13211311123113112211 11 | 11131221133112132113212221 12 | 3113112221232112111312211312113211 13 | 1321132132111213122112311311222113111221131221 14 | 11131221131211131231121113112221121321132132211331222113112211 15 | 311311222113111231131112132112311321322112111312211312111322212311322113212221 -------------------------------------------------------------------------------- /sparrow/data/networks/asphericity/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/asphericity/asphericity_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/asphericity/asphericity_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/asphericity/asphericity_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/asphericity/asphericity_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/dssp/dssp_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/dssp/dssp_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/dssp/dssp_predictor_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/dssp/dssp_predictor_network_v2.pt --------------------------------------------------------------------------------
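A minimal, hedged usage sketch of how these packaged network weights can be resolved and inspected — assuming torch is installed, and using the dssp v2 network purely as an example; get_data() is the helper defined in sparrow/__init__.py above:

import torch

import sparrow

# Resolve the absolute path of a packaged weights file via get_data()
weights_path = sparrow.get_data('networks/dssp/dssp_predictor_network_v2.pt')

# The .pt files are PARROT-trained state dicts; loading onto CPU matches how
# the predictor classes later in this package load them
state_dict = torch.load(weights_path, map_location=torch.device('cpu'))

# Keys such as 'lstm.weight_ih_l0' and 'fc.bias' are what those predictor
# classes probe to recover the network hyperparameters
print(sorted(state_dict.keys()))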
/sparrow/data/networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/nuclear_export_signal/nes_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/nuclear_export_signal/nes_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/nuclear_import_signal/nls_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/nuclear_import_signal/nls_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/phosphorylation/ser_phosphorylation_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/ser_phosphorylation_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/phosphorylation/thr_phosphorylation_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/thr_phosphorylation_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/phosphorylation/tyr_phosphorylation_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/tyr_phosphorylation_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/prefactor/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/prefactor/prefactor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/prefactor/prefactor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/prefactor/prefactor_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/prefactor/prefactor_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/pscore/pscore_predictor_network_v2.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/pscore/pscore_predictor_network_v3.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v3.pt -------------------------------------------------------------------------------- /sparrow/data/networks/pscore/pscore_predictor_network_v4.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v4.pt -------------------------------------------------------------------------------- /sparrow/data/networks/re/README: -------------------------------------------------------------------------------- 1 | ## The end-to-end networks are defined as re, although the actual predictor class is e2e. This is a rare exception where there's a mismatch in network name and predictor class name, because if the predictor class name were 're' then this would clash with the Python regular expression package ('re'), such that for code sanity purposes the predictor class and module is e2e even though the networks are re. 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/re/re_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/re/re_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/re/re_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/re/re_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/rg/README: -------------------------------------------------------------------------------- 1 | # Proof of concept network 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/rg/rg_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/rg/rg_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/rg/rg_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/rg/rg_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_re/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_re/scaled_re_network_v1.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_re/scaled_re_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_re/scaled_re_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_re/scaled_re_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_rg/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_rg/scaled_rg_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_rg/scaled_rg_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_rg/scaled_rg_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_rg/scaled_rg_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaling_exponent/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | 3 | v1.5 was never assessed or validated and shouldn't be used 4 | -------------------------------------------------------------------------------- /sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.5.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.5.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaling_exponent/scaling_exponent_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/transactivation_domains/tad_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transactivation_domains/tad_predictor_network_v1.pt -------------------------------------------------------------------------------- 
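A short, hedged sketch of the polymer-scaling estimators implemented in sparrow/polymer/scaling_parameters.py further below; it assumes the package and its Cython extensions are built, and the sequence shown is an arbitrary (but valid) example:

from sparrow.polymer.scaling_parameters import compute_nu_zheng2020, compute_rg_zheng2020

# arbitrary valid amino acid sequence, for illustration only
seq = 'MEGDDKSPSEQQAAKKRLLGE'

# nu = -0.0423*SHD + 0.0074*SCD + 0.701, per Zheng et al. 2020
nu = compute_nu_zheng2020(seq)

# nu-dependent radius of gyration, returned in Angstroms
rg = compute_rg_zheng2020(seq)

print(f'nu = {nu:.3f}, Rg = {rg:.1f} A')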
/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/transmembrane/transmembrane_predictor_network_v4.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v4.pt -------------------------------------------------------------------------------- /sparrow/patterning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/patterning/__init__.py -------------------------------------------------------------------------------- /sparrow/patterning/scd.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False 2 | import numpy as np 3 | cimport numpy as np 4 | from cython.view cimport array 5 | from libc.math cimport sqrt,abs, fabs 6 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS 7 | from sparrow.sparrow_exceptions import SparrowException 8 | 9 | # Define a typed memoryview for efficient access to numpy arrays 10 | ctypedef np.float64_t DOUBLE_t 11 | ctypedef np.int64_t INT64_t 12 | 13 | cdef dict DEFAULT_HYDRO_DICT = {'A': 0.730, 'R': 0.000, 'N': 0.432, 'D': 0.378, 'C': 0.595, 'Q': 0.514, 'E': 0.459, 14 | 'G': 0.649, 'H': 0.514, 'I': 0.973, 'L': 0.973, 'K': 0.514, 'M': 0.838, 'F': 1.000, 15 | 'P': 1.000, 'S': 0.595, 'T': 0.676, 'W': 0.946, 'Y': 0.865, 'V': 0.892} 16 | 17 | 18 | cpdef double compute_scd_x(str sequence, group1=['E','D'], group2=['R','K']): 19 | cdef int m, n, seqlen 20 | cdef double total, m_val, n_val, charge_val, final_val 21 | cdef int cur_m_charge, cur_n_charge 22 | cdef char cur_m_res, cur_n_res 23 | 24 | # Pre-calculate group membership 25 | cdef int[:] group_membership = np.zeros(256, dtype=np.int32) 26 | for residue in group1: 27 | group_membership[ord(residue)] = -1 28 | for residue in group2: 29 | group_membership[ord(residue)] = 1 30 | 31 | total = 0 32 | seqlen = len(sequence) 33 | 34 | # Convert sequence to array of integers 35 | cdef int[:] sequence_array = np.array([ord(char) for char in sequence], dtype=np.int32) 36 | 37 | for m in range(1, seqlen): 38 | m_val = m + 1 39 | 40 | for n in range(0, m-1): 41 | n_val = n + 1 42 | 43 | # Access residues using array indexing 44 | cur_m_res = sequence_array[m] 45 | cur_n_res = sequence_array[n] 46 | 47 | # Retrieve group charge 48 | cur_m_charge = group_membership[cur_m_res] 49 | cur_n_charge = group_membership[cur_n_res] 50 | 51 | charge_val = cur_m_charge * cur_n_charge 52 | final_val = charge_val * sqrt(m_val - n_val) 53 | total += final_val 54 | 55 | return total / seqlen 56 | 57 | cdef validate_sequence(str seq, dict hydro_dict): 58 | cdef set all_res = set(seq) 59 | for res in all_res: 60 | if res not in hydro_dict: 61 | raise ValueError(f'When calculating SHD the hydrophobicity dictionary lacked the residue {res}') 62 | 63 | cpdef double compute_shd(str seq, dict hydro_dict=None): 64 | """ 65 | Function 
takes in a sequence and returns the Sequence 66 | Hydropathy Decoration (SHD), i.e. the patterning of hydrophobic 67 | residues in the sequence. This is computed as defined in ref [1]. 68 | 69 | As an optional parameter this function can take in a predefined 70 | hydropathy conversion dictionary for the amino acids, where the keys 71 | are amino acids and values are floats. 72 | 73 | If a conversion dict is not provided the following conversion is used: 74 | 75 | 'A': 0.730, 76 | 'R': 0.000, 77 | 'N': 0.432, 78 | 'D': 0.378, 79 | 'C': 0.595, 80 | 'Q': 0.514, 81 | 'E': 0.459, 82 | 'G': 0.649, 83 | 'H': 0.514, 84 | 'I': 0.973, 85 | 'L': 0.973, 86 | 'K': 0.514, 87 | 'M': 0.838, 88 | 'F': 1.000, 89 | 'P': 1.000, 90 | 'S': 0.595, 91 | 'T': 0.676, 92 | 'W': 0.946, 93 | 'Y': 0.865, 94 | 'V': 0.892, 95 | 96 | These are the Kyte-Doolittle normalized hydrophobicities. 97 | 98 | Parameters 99 | ------------ 100 | seq : str 101 | Amino acid sequence passed as string 102 | 103 | hydro_dict : dict 104 | Dictionary that maps amino acid to hydrophobicity score 105 | (optional). 106 | 107 | Returns 108 | ----------- 109 | float 110 | Returns a floating point value that reports on the sequence 111 | hydropathy decoration. This in principle should be a positive 112 | number. 113 | 114 | References 115 | -------------- 116 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J. Hydropathy Patterning 117 | Complements Charge Patterning to Describe Conformational Preferences of Disordered 118 | Proteins. J. Phys. Chem. Lett. (2020). doi:10.1021/acs.jpclett.0c00288 119 | """ 120 | if hydro_dict is None: 121 | hydro_dict = DEFAULT_HYDRO_DICT 122 | 123 | validate_sequence(seq, hydro_dict) 124 | 125 | cdef Py_ssize_t N = len(seq) 126 | cdef double[:] h = np.array([hydro_dict[res] for res in seq], dtype=np.double) 127 | cdef double t = 0.0 128 | cdef Py_ssize_t m, n 129 | 130 | for m in range(1, N): 131 | for n in range(m-1): 132 | t += (h[m] + h[n]) / abs(m - n) 133 | 134 | return t / N 135 | 136 | -------------------------------------------------------------------------------- /sparrow/polymer/scaling_parameters.py: -------------------------------------------------------------------------------- 1 | from sparrow.patterning import scd 2 | import numpy as np 3 | 4 | def compute_nu_zheng2020(seq): 5 | """ 6 | Function takes in a sequence and returns a calculated nu scaling value 7 | from the Sequence Hydropathy Decoration (SHD) and Sequence Charge Decoration (SCD): 8 | 9 | Nu = -0.0423×SHD + 0.0074×SCD + 0.701 10 | 11 | This equation for predicting nu is adopted from Zheng et al. [1]. 12 | 13 | Parameters 14 | ------------------ 15 | seq : str 16 | Amino acid sequence (must be valid amino acids only) 17 | 18 | Returns 19 | ------------------ 20 | float 21 | Returns the predicted scaling exponent (nu), a dimensionless 22 | parameter which should (in theory) fall between 0.33 and 0.6. 23 | 24 | References 25 | --------------- 26 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J. 27 | Hydropathy Patterning Complements Charge Patterning to Describe 28 | Conformational Preferences of Disordered Proteins. J. Phys. 29 | Chem. Lett. (2020).
doi:10.1021/acs.jpclett.0c00288 30 | 31 | """ 32 | 33 | SHD = scd.compute_shd(seq) 34 | SCD = scd.compute_scd_x(seq) 35 | 36 | # calculate Nu from SHD and SCD 37 | nu = (-0.0423*SHD)+(0.0074*SCD)+0.701 38 | 39 | return nu 40 | 41 | 42 | 43 | def compute_rg_zheng2020(seq): 44 | """ 45 | Function that takes in an amino acid sequence and computes the 46 | expected radius of gyration using the nu-dependent Rg as developed by 47 | Zheng et al. 48 | 49 | Parameters 50 | ------------------ 51 | seq : str 52 | Amino acid sequence (must be valid amino acids only) 53 | 54 | Returns 55 | ------------------ 56 | float 57 | Returns the predicted radius of gyration in Angstroms 58 | 59 | References 60 | --------------- 61 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J. 62 | Hydropathy Patterning Complements Charge Patterning to Describe 63 | Conformational Preferences of Disordered Proteins. J. Phys. 64 | Chem. Lett. (2020). doi:10.1021/acs.jpclett.0c00288 65 | """ 66 | nu = compute_nu_zheng2020(seq) 67 | 68 | gamma = 1.1615 69 | b = 5.5 # note in Angstroms instead of nanometers 70 | N = len(seq) 71 | 72 | numerator = gamma*(gamma+1) 73 | 74 | denominator = 2*(gamma+2*nu)*(gamma+2*nu+1) 75 | 76 | return np.sqrt(numerator/denominator)*b*np.power(N,nu) 77 | 78 | -------------------------------------------------------------------------------- /sparrow/predictors/asphericity/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/asphericity/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/dssp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/dssp/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/e2e/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/mitochondrial_targeting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/mitochondrial_targeting/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/mitochondrial_targeting/mitochondrial_targeting_predictor.py: -------------------------------------------------------------------------------- 1 | from parrot import brnn_architecture 2 | from parrot import encode_sequence 3 | 4 | import sparrow 5 | 6 | import torch 7 | import numpy as np 8 | import os 9 | from sparrow.sparrow_exceptions import SparrowException 10 | 11 | 12 | 13 | 14 | DEFAULT_VERSION="1" 15 | 16 | 17 | class MitochondrialTargetingPredictor(): 18 | """ 19 | 20 | Class that loads in a network such that predict_mitochondrial_targeting() can be called to predict 21 | mitochondrial targeting for a sequence. 22 | 23 | """ 24 | def __init__(self, version=None): 25 | """ 26 | Constructor for building a MitochondrialTargetingPredictor object.
The version keyword allows specific 27 | version(s) of the trained network associated with the underlying predictor to be defined. 28 | By default, it's set to None, which leads to the current best/default network being selected 29 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide 30 | the ability to pass a string as version. This string is inserted at position {version} in the filename 31 | 32 | mitochondrial_targeting_predictor_network_v{version}.pt 33 | 34 | i.e. no need to include the "v" part or the .pt extension 35 | 36 | """ 37 | 38 | if version is None: 39 | version = DEFAULT_VERSION 40 | 41 | saved_weights = sparrow.get_data(f'networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v{version}.pt') 42 | 43 | if not os.path.isfile(saved_weights): 44 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__)) 45 | 46 | 47 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu')) 48 | 49 | 50 | # Dynamically read in correct hyperparameters: 51 | num_layers = 0 52 | while True: 53 | s = f'lstm.weight_ih_l{num_layers}' 54 | try: 55 | temp = loaded_model[s] 56 | num_layers += 1 57 | except KeyError: 58 | break 59 | 60 | 61 | ## determine the number of classes; note you may need to change the key names here if there's a leading 62 | # 'module.' prefix in there 63 | number_of_classes = np.shape(loaded_model['fc.bias'])[0] 64 | input_size = 20 # (hardcoded at 20 for 20 amino acids) 65 | 66 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4) 67 | 68 | 69 | # set these here so we can sanity check if needed 70 | self.number_of_classes = number_of_classes 71 | self.input_size = input_size 72 | self.number_of_layers = num_layers 73 | self.hidden_vector_size = hidden_vector_size 74 | 75 | # Instantiate network weights into object 76 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu') 77 | 78 | self.network.load_state_dict(loaded_model) 79 | 80 | 81 | 82 | def predict_mitochondrial_targeting(self, seq): 83 | """ 84 | Prediction function. seq should be a valid amino acid sequence. 85 | 86 | NOTE that this assumes mitochondrial targeting sequences (MTSs) are 87 | N-terminal, so truncates anything over 168 residues. This threshold 88 | was empirically determined based on the set of annotated MTSs. 89 | 90 | Parameters 91 | ------------ 92 | seq : str 93 | Valid amino acid sequence 94 | 95 | Returns 96 | ---------- 97 | np.ndarray 98 | Returns a 1D np.ndarray the length of the sequence where each position 99 | is the predicted mitochondrial targeting class at that position.
100 | 101 | """ 102 | 103 | # convert sequence to uppercase 104 | seq = seq.upper() 105 | 106 | # truncate all but 168 - if shorter than this just gets everything 107 | sub_seq = seq[0:168] 108 | 109 | # Convert to one-hot sequence vector 110 | seq_vector = encode_sequence.one_hot(sub_seq) 111 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting 112 | 113 | # Forward pass - this is specific for classification 114 | prediction = self.network(seq_vector.float()).detach().numpy() 115 | int_vals = [] 116 | for row in prediction[0]: 117 | int_vals.append(np.argmax(row)) 118 | 119 | prediction = int_vals 120 | 121 | # append empty 0s for remainder of sequence 122 | extra = [0]*(len(seq)-len(sub_seq)) 123 | 124 | prediction.extend(extra) 125 | # return prediction + extra zeros 126 | return prediction 127 | -------------------------------------------------------------------------------- /sparrow/predictors/nes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/nes/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/nes/nuclear_export_signal_predictor.py: -------------------------------------------------------------------------------- 1 | from parrot import brnn_architecture 2 | from parrot import encode_sequence 3 | 4 | import sparrow 5 | 6 | import torch 7 | import numpy as np 8 | import os 9 | from sparrow.sparrow_exceptions import SparrowException 10 | 11 | 12 | """ 13 | NB: This network and predictor was imported from GOOSE, so is subtly different internally to how 14 | some of the other predictors work. Notably it includes a softmax projection and a loop 15 | (the loop below) to define probabilities - this may be because these networks have 2 layers 16 | whereas the others only have one. Anyway, just making a note of this in case we need to debug in 17 | the future. 18 | 19 | score = [] 20 | for val in prediction: 21 | score.append(round(val[1],5)) 22 | 23 | 24 | """ 25 | 26 | # NOTE - this is where you can define the version number that is read by default. If you add a new network MAKE SURE you update 27 | # this default if you want that new network to be used by default 28 | DEFAULT_VERSION="1" 29 | 30 | def softmax(v): 31 | return (np.e ** v) / np.sum(np.e ** v) 32 | 33 | 34 | ## CHANGE class name 35 | class NESPredictor(): 36 | """ 37 | 38 | Class that loads in a network such that nuclear export signals can be predicted. 39 | 40 | """ 41 | def __init__(self, version=None): 42 | """ 43 | Constructor for building a predictor object. The version keyword allows specific 44 | version(s) of the trained network associated with the predictor to be defined. 45 | 46 | By default, it's set to None, which leads to the current best/default network being selected 47 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide 48 | the ability to pass a string as version. This string is inserted at position <version> in the filename 49 | 50 | <predictor>_network_v<version>.pt 51 | 52 | i.e. no need to include the "v" part or the .pt extension 53 | 54 | """ 55 | 56 | 57 | 58 | # if no version provided use default, then grab path and check that file actually exists! 59 | if version is None: 60 | version = DEFAULT_VERSION 61 | 62 | # CHANGE THIS!! Make sure you change the <path> and <filename> to the appropriate 63 | # paths.
Keep the network_v{version}.pt because this is how a version-specific string is selected 64 | saved_weights = sparrow.get_data(f'networks/nuclear_export_signal/nes_predictor_network_v{version}.pt') 65 | 66 | if not os.path.isfile(saved_weights): 67 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__)) 68 | 69 | 70 | # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU mainly because 71 | # we know everyone has a CPU... 72 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu')) 73 | 74 | ## DELETE ME PROBABLY 75 | # this block of code is relevant ONLY if the trained network has this strange 76 | # appended 'module.' text at the start of every keyword. This may happen in older 77 | # versions of PARROT (see the DSSP predictor as an example of where it's needed) but in 78 | # 2022 trained networks didn't need this. As such, this can PROBABLY be deleted but 79 | # in case you're using an older network we've kept this to make things simple 80 | 81 | #for i in range(len(loaded_model)): 82 | # key, value = loaded_model.popitem(last=False) 83 | # new_key = key[7:] 84 | # loaded_model[new_key] = value 85 | ## END OF DELETE ME PROBABLY 86 | 87 | 88 | # Dynamically calculate the hyperparameters used to train the network. 89 | ## NOTE: 90 | # 91 | # The code here works on networks trained using the current version of PARROT (2022), HOWEVER, it's possible 92 | # that in previous versions the keys into the parameter file may be different or may have a prefix. Best example 93 | # of this is that for the DSSP predictor the word `module.` randomly appears in front of each keyword. If you 94 | # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this 95 | # keyword. 96 | 97 | # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords 98 | # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to 99 | # reach out to Alex or Dan about this! 100 | 101 | num_layers = 0 102 | while True: 103 | s = f'lstm.weight_ih_l{num_layers}' 104 | try: 105 | temp = loaded_model[s] 106 | num_layers += 1 107 | except KeyError: 108 | break 109 | 110 | number_of_classes = np.shape(loaded_model['fc.bias'])[0] 111 | 112 | # Hard coded because we always use one-hot encoding, note that if you trained a specific 113 | # predictor on a different encoding scheme you could, of course, here simply define that 114 | # encoding scheme 115 | input_size = 20 116 | 117 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4) 118 | 119 | # set these here so we can sanity check if needed 120 | self.number_of_classes = number_of_classes 121 | self.input_size = input_size 122 | self.number_of_layers = num_layers 123 | self.hidden_vector_size = hidden_vector_size 124 | 125 | # Instantiate network weights into object 126 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu') 127 | 128 | # load parameters into model 129 | self.network.load_state_dict(loaded_model) 130 | 131 | 132 | 133 | def predict_nuclear_export_signal(self, seq): 134 | """ 135 | Function to predict the presence of nuclear export signals. Returns a per
Returns a per 136 | residue probability score of a residue being in an NES or not 137 | 138 | Parameters 139 | ------------ 140 | seq : str 141 | Valid amino acid sequence 142 | 143 | Returns 144 | ---------- 145 | np.ndarray 146 | Returns a 1D np.ndarray the length of the sequence where each position 147 | gives the prediction of that residue being an NES 148 | 149 | """ 150 | 151 | # convert sequence to uppercase 152 | seq = seq.upper() 153 | 154 | # Convert to one-hot sequence vector - note, as mentioned above if you 155 | # did't use one-hot in the original training you could just edit this here 156 | seq_vector = encode_sequence.one_hot(seq) 157 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting 158 | 159 | 160 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!! 161 | ## CHANGE CODE BELOW HERE ## 162 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!! 163 | 164 | 165 | ## CLASSIFICATION CODE BLOCK 166 | # The block below should be kept if we're doing a classification 167 | # based prediction! if not, comment this out or delete it 168 | #prediction = self.network(seq_vector.float()).detach().numpy() 169 | #int_vals = [] 170 | #for row in prediction[0]: 171 | # int_vals.append(np.argmax(row)) 172 | 173 | #prediction = int_vals 174 | 175 | ## REGRESSION CODE BLOCK 176 | # This block should be kept if we're doing a regression-based 177 | # prediction. If not, comment this out or delete it 178 | prediction = self.network(seq_vector.float()).detach().numpy().flatten() 179 | 180 | prediction = prediction.reshape(-1, self.number_of_classes) 181 | prediction = np.array(list(map(softmax, prediction))) 182 | 183 | # finally we extract out local probabilities 184 | score = [] 185 | for val in prediction: 186 | score.append(round(val[1],5)) 187 | 188 | return score 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /sparrow/predictors/nls/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/nls/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/nls/nuclear_import_signal_predictor.py: -------------------------------------------------------------------------------- 1 | from parrot import brnn_architecture 2 | from parrot import encode_sequence 3 | 4 | import sparrow 5 | 6 | import torch 7 | import numpy as np 8 | import os 9 | from sparrow.sparrow_exceptions import SparrowException 10 | 11 | 12 | """ 13 | NB: This network and predictor was imported from GOOSE, so is subtly different internally to how 14 | some of the other predictors work. Notably it includes a softmax project and a loop 15 | this loop below to define probabilities - this may be because these networks have 2 layers 16 | whereas the others only have one? Anyway, just making a note of this in case we need to debug in 17 | the future. 18 | 19 | score = [] 20 | for val in prediction: 21 | score.append(round(val[1],5)) 22 | 23 | 24 | """ 25 | 26 | # NOTE - this is where you can define the version number that is read by default. 
If you add a new network MAKE SURE you update 27 | # this default if you want that new network to be used by default 28 | DEFAULT_VERSION="1" 29 | 30 | def softmax(v): 31 | return (np.e ** v) / np.sum(np.e ** v) 32 | 33 | 34 | ## CHANGE class name 35 | class NLSPredictor(): 36 | """ 37 | 38 | Class that loads in a network such that predict_nuclear_import_signal() can be called to predict 39 | nuclear import signals from a sequence. 40 | 41 | """ 42 | def __init__(self, version=None): 43 | """ 44 | Constructor for building a predictor object. The version keyword allows specific 45 | version(s) of the trained network associated with the predictor to be defined. 46 | 47 | By default, it's set to None, which leads to the current best/default network being selected 48 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide 49 | the ability to pass a string as version. This string is inserted at position <version> in the filename 50 | 51 | <predictor>_network_v<version>.pt 52 | 53 | i.e. no need to include the "v" part or the .pt extension 54 | 55 | """ 56 | 57 | 58 | 59 | # if no version provided use default, then grab path and check that file actually exists! 60 | if version is None: 61 | version = DEFAULT_VERSION 62 | 63 | # CHANGE THIS!! Make sure you change the <path> and <filename> to the appropriate 64 | # paths. Keep the network_v{version}.pt because this is how a version-specific string is selected 65 | saved_weights = sparrow.get_data(f'networks/nuclear_import_signal/nls_predictor_network_v{version}.pt') 66 | 67 | if not os.path.isfile(saved_weights): 68 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__)) 69 | 70 | 71 | # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU mainly because 72 | # we know everyone has a CPU... 73 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu')) 74 | 75 | ## DELETE ME PROBABLY 76 | # this block of code is relevant ONLY if the trained network has this strange 77 | # appended 'module.' text at the start of every keyword. This may happen in older 78 | # versions of PARROT (see the DSSP predictor as an example of where it's needed) but in 79 | # 2022 trained networks didn't need this. As such, this can PROBABLY be deleted but 80 | # in case you're using an older network we've kept this to make things simple 81 | 82 | #for i in range(len(loaded_model)): 83 | # key, value = loaded_model.popitem(last=False) 84 | # new_key = key[7:] 85 | # loaded_model[new_key] = value 86 | ## END OF DELETE ME PROBABLY 87 | 88 | 89 | # Dynamically calculate the hyperparameters used to train the network. 90 | ## NOTE: 91 | # 92 | # The code here works on networks trained using the current version of PARROT (2022), HOWEVER, it's possible 93 | # that in previous versions the keys into the parameter file may be different or may have a prefix. Best example 94 | # of this is that for the DSSP predictor the word `module.` randomly appears in front of each keyword. If you 95 | # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this 96 | # keyword. 97 | 98 | # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords 99 | # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to 100 | # reach out to Alex or Dan about this!
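# Why the probing below works: a PyTorch LSTM state dict stores one
# 'lstm.weight_ih_l{k}' tensor per layer, and each has shape
# (4*hidden_size, input_size) because the input/forget/cell/output gate
# matrices are stacked row-wise. num_layers is therefore found by probing
# successive l{k} keys until one is missing, and hidden_vector_size is
# recovered further down as the row count of 'lstm.weight_ih_l0' divided by 4.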
101 | 102 | num_layers = 0 103 | while True: 104 | s = f'lstm.weight_ih_l{num_layers}' 105 | try: 106 | temp = loaded_model[s] 107 | num_layers += 1 108 | except KeyError: 109 | break 110 | 111 | number_of_classes = np.shape(loaded_model['fc.bias'])[0] 112 | 113 | # Hard coded because we always use one-hot encoding, note that if you trained a specific 114 | # predictor on a different encoding scheme you could, of course, here simply define that 115 | # encoding scheme 116 | input_size = 20 117 | 118 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4) 119 | 120 | # set these here so we can sanity check if needed 121 | self.number_of_classes = number_of_classes 122 | self.input_size = input_size 123 | self.number_of_layers = num_layers 124 | self.hidden_vector_size = hidden_vector_size 125 | 126 | # Instantiate network weights into object 127 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu') 128 | 129 | # load parameters into model 130 | self.network.load_state_dict(loaded_model) 131 | 132 | 133 | 134 | def predict_nuclear_import_signal(self, seq): 135 | """ 136 | Function to predict the presence of nuclear import signals. Returns a per 137 | residue probability score of a residue being in an NLS or not 138 | 139 | Parameters 140 | ------------ 141 | seq : str 142 | Valid amino acid sequence 143 | 144 | Returns 145 | ---------- 146 | np.ndarray 147 | Returns a 1D np.ndarray the length of the sequence where each position 148 | gives the prediction of that residue being an NLS 149 | 150 | """ 151 | 152 | # convert sequence to uppercase 153 | seq = seq.upper() 154 | 155 | # Convert to one-hot sequence vector - note, as mentioned above if you 156 | # didn't use one-hot in the original training you could just edit this here 157 | seq_vector = encode_sequence.one_hot(seq) 158 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting 159 | 160 | 161 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!! 162 | ## CHANGE CODE BELOW HERE ## 163 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!! 164 | 165 | 166 | ## CLASSIFICATION CODE BLOCK 167 | # The block below should be kept if we're doing a classification 168 | # based prediction! if not, comment this out or delete it 169 | #prediction = self.network(seq_vector.float()).detach().numpy() 170 | #int_vals = [] 171 | #for row in prediction[0]: 172 | # int_vals.append(np.argmax(row)) 173 | 174 | #prediction = int_vals 175 | 176 | ## REGRESSION CODE BLOCK 177 | # This block should be kept if we're doing a regression-based 178 | # prediction. If not, comment this out or delete it 179 | prediction = self.network(seq_vector.float()).detach().numpy().flatten() 180 | 181 | prediction = prediction.reshape(-1, self.number_of_classes) 182 | prediction = np.array(list(map(softmax, prediction))) 183 | 184 | 185 | ## CLIP 186 | # If we want to ensure we have a value between 0 and 1, the clipping here 187 | # will do that.
If not, leave it commented 188 | #prediction = np.clip(prediction, 0.0, 1.0) 189 | 190 | # finally we extract out local probabilities 191 | score = [] 192 | for val in prediction: 193 | score.append(round(val[1],5)) 194 | 195 | return score 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | -------------------------------------------------------------------------------- /sparrow/predictors/phosphorylation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/phosphorylation/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/phosphorylation/phospho_predictor_utils.py: -------------------------------------------------------------------------------- 1 | from sparrow.sparrow_exceptions import SparrowException 2 | 3 | def return_hits(seq, phospho_probability, target_res, windowsize=4, threshold=0.6, return_sites_only=False): 4 | """ 5 | Function that parses through a sequence and annotated phosphosite 6 | probabilities to extract out specific positions or a per-residue 7 | binary mask of phosphorylation or non-phosphorylation. 8 | 9 | This function works by sliding a +/- windowsize window across the 10 | sequence and if the central residue in that window has a probability 11 | > threshold then all the target_res in that window are set to be 12 | putative phosphosites. 13 | 14 | Parameters 15 | -------------- 16 | seq : str 17 | Amino acid sequence 18 | 19 | phospho_probability : list 20 | A list with per-residue probabilities for a residue to have been 21 | phosphorylated or not. 22 | 23 | windowsize : int 24 | Define the size of the window this algorithm uses to extend the 25 | influence of a local phosphosite probability. Note the windowsize 26 | gets applied +/- a central position 27 | 28 | target_res : str 29 | A string with a single residue which each residue in the sequence 30 | is compared against. 31 | 32 | threshold : float 33 | A threshold value used to delineate between phosphosites for masking. 34 | Default is 0.6. 35 | 36 | return_sites_only : bool 37 | A flag which, if set to True, means the function returns only the positions 38 | found in a list. If set to False the function returns a binary mask 39 | list equal in length to the sequence, where '1's mean the residue 40 | is predicted to be a phosphosite and '0' mean they're not. Default 41 | is False. 42 | 43 | Returns 44 | ----------- 45 | list 46 | Returns EITHER a list (len == seq) if return_sites_only = False which 47 | contains a per-residue phosphomask (i.e. 1 = phospho 0 if not) OR 48 | returns a list of index positions that correspond to phosphosites.
49 | 50 | If return_sites_only is True, the function guarantees the 51 | indices returned will be in numerical order 52 | 53 | """ 54 | 55 | ## sanity checking first 56 | if len(target_res) != 1: 57 | raise SparrowException('Target res must be a single amino acid') 58 | 59 | if threshold > 1 or threshold < 0: 60 | raise SparrowException('Probability threshold used in phosphosite masking must be between 0 and 1') 61 | 62 | if windowsize < 1: 63 | raise SparrowException('Window size must be a positive integer') 64 | 65 | if len(seq) != len(phospho_probability): 66 | raise SparrowException('Sequence length and probability vector must be the same length') 67 | 68 | 69 | seqlen = len(seq) 70 | 71 | potential_hits = set([]) 72 | 73 | if seqlen < (2*windowsize)+1: 74 | raise SparrowException(f'Cannot predict phosphosites when the sequence length is less than 1+{2*windowsize}. NB: length = {seqlen}') 75 | 76 | # for each residue 77 | for idx, res in enumerate(seq): 78 | 79 | # if this is a low-probability residue skip and move on 80 | if phospho_probability[idx] < threshold: 81 | continue 82 | 83 | # if we're in the N-terminal residues just excise out a fragment of 84 | # varying size until we get into the sequence 85 | if idx < windowsize: 86 | slice_start = 0 87 | current_slice = seq[slice_start:idx+windowsize] 88 | 89 | # while in the 'middle' of the sequence 90 | elif idx >= windowsize and idx <= (seqlen - (windowsize+1)): 91 | slice_start = idx-windowsize 92 | current_slice = seq[slice_start:idx+windowsize] 93 | 94 | # at the C-terminus 95 | else: 96 | slice_start = idx-windowsize 97 | current_slice = seq[slice_start:] 98 | 99 | # for each residue in the current slice 100 | for local_idx, aa in enumerate(current_slice): 101 | if aa == target_res: 102 | global_pos = local_idx + slice_start 103 | 104 | if global_pos not in potential_hits: 105 | potential_hits.add(global_pos) 106 | 107 | 108 | # if we just want to return the phosphoindices. Note 109 | # we sort these to guarantee the order of return. 110 | if return_sites_only: 111 | return sorted(list(potential_hits)) 112 | else: 113 | 114 | return_list = [] 115 | for i in range(0,len(seq)): 116 | if i in potential_hits: 117 | return_list.append(1) 118 | else: 119 | return_list.append(0) 120 | 121 | return return_list 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /sparrow/predictors/prefactor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/prefactor/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/pscore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/pscore/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/pscore/pscore_predictor.py: -------------------------------------------------------------------------------- 1 | from parrot import brnn_architecture 2 | from parrot import encode_sequence 3 | 4 | import sparrow 5 | 6 | import torch 7 | import numpy as np 8 | import os 9 | from sparrow.sparrow_exceptions import SparrowException 10 | 11 | 12 | 13 | """ 14 | Predictor template file.
This template file should, in principle, require 15 | minimal editing to convert into a specific predictor based on a copied 16 | network file found in sparrow/data/networks/<name>. Some general 17 | guidelines below (also included in the predictor documentation) and inline 18 | comments on things you will want to change. This code WILL NOT RUN as is and 19 | requires you to update missing variables to customize the predictor!! 20 | 21 | Missing values will be enclosed in < > to indicate this is where you (the 22 | software developer) must add some content 23 | 24 | 25 | ## Nomenclature 26 | 27 | 1. The predictor file should be called <name>_predictor.py 28 | 2. This should be inside a module in the /predictors/ directory called <name> 29 | 3. The single class this module implements should be called <Name>Predictor 30 | 31 | 32 | ## Class structure 33 | 34 | The class should have (at least) two functions: 35 | 36 | 1. A constructor (__init__()) which PRE LOADS the network from sparrow/data/networks/relevant_name - the get_data() function 37 | is defined in sparrow/__init__.py and allows absolute-path access to the /data directory. The constructor should 38 | FULLY load the network along with standard PARROT-style options, as shown here. Trained networks should be versioned and 39 | implemented so previous versions can be chosen even if the default version changes 40 | 41 | 2. Define a function called predict_<name>(self, seq) where <name> is a convenient name that obviously means this is 42 | what the function does. 43 | 44 | The idea is that this class should actually be completely standalone, independent of sparrow - i.e. one should be able to run 45 | 46 | >> from sparrow.predictors.<name> import <Name>Predictor 47 | >> 48 | >> P = <Name>Predictor() 49 | >> P.predict_<name>('myvalidsequence') 50 | 51 | And have it work! 52 | 53 | 54 | 55 | 56 | 57 | """ 58 | 59 | # NOTE - this is where you can define the version number that is read by default. If you add a new network MAKE SURE you update 60 | # this default if you want that new network to be used by default 61 | DEFAULT_VERSION="4" 62 | 63 | 64 | ## CHANGE class name 65 | class PScorePredictor(): 66 | """ 67 | 68 | Class that loads in a network such that predict_pscore() can be called to predict 69 | PScore propensity from a sequence. 70 | 71 | """ 72 | def __init__(self, version=None): 73 | """ 74 | Constructor for building a predictor object. The version keyword allows specific 75 | version(s) of the trained network associated with the predictor to be defined. 76 | 77 | By default, it's set to None, which leads to the current best/default network being selected 78 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide 79 | the ability to pass a string as version. This string is inserted at position <version> in the filename 80 | 81 | pscore_predictor_network_v<version>.pt 82 | 83 | i.e. no need to include the "v" part or the .pt extension 84 | 85 | """ 86 | 87 | 88 | 89 | # if no version provided use default, then grab path and check that file actually exists! 90 | if version is None: 91 | version = DEFAULT_VERSION 92 | 93 | # CHANGE THIS!! Make sure you change the <path> and <filename> to the appropriate 94 | # paths.
58 | 
59 | # NOTE - this is where you can define the version number that is read by default. If you add a new network, MAKE SURE you update
60 | # this default if you want that new network to be used by default.
61 | DEFAULT_VERSION="4"
62 | 
63 | 
64 | ## CHANGE class name
65 | class PScorePredictor():
66 |     """
67 | 
68 |     Class that loads in a network such that predict_pscore() can be called to predict
69 |     PScore propensity from a sequence.
70 | 
71 |     """
72 |     def __init__(self, version=None):
73 |         """
74 |         Constructor for building a predictor object. The version keyword allows specific
75 |         version(s) of the trained network associated with the predictor to be defined.
76 | 
77 |         By default, it's set to None, which leads to the current best/default network being selected,
78 |         and this is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
79 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
80 | 
81 |             pscore_predictor_network_v<version>.pt
82 | 
83 |         i.e. no need to include the "v" part or the .pt extension.
84 | 
85 |         """
86 | 
87 | 
88 | 
89 |         # if no version provided use the default, then grab the path and check that the file actually exists!
90 |         if version is None:
91 |             version = DEFAULT_VERSION
92 | 
93 |         # CHANGE THIS!! Make sure you change the <module> and <network filename> to the appropriate
94 |         # paths. Keep the network_v{version}.pt because this is how a version-specific file is selected.
95 |         saved_weights = sparrow.get_data(f'networks/pscore/pscore_predictor_network_v{version}.pt')
96 | 
97 |         if not os.path.isfile(saved_weights):
98 |             raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' % (saved_weights, type(self).__name__))
99 | 
100 | 
101 |         # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU, mainly because
102 |         # we know everyone has a CPU...
103 |         loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
104 | 
105 |         ## DELETE ME PROBABLY
106 |         # this block of code is relevant ONLY if the trained network has the strange
107 |         # 'module.' text appended at the start of every keyword. This may happen in older
108 |         # versions of PARROT (see the DSSP predictor as an example of where it's needed), but
109 |         # networks trained in 2022 didn't need this. As such, this can PROBABLY be deleted, but
110 |         # in case you're using an older network we've kept this to make things simple.
111 | 
112 |         for i in range(len(loaded_model)):
113 |             key, value = loaded_model.popitem(last=False)
114 |             new_key = key[7:]
115 |             loaded_model[new_key] = value
116 |         ## END OF DELETE ME PROBABLY
117 | 
118 | 
119 |         # Dynamically calculate the hyperparameters used to train the network.
120 |         ## NOTE:
121 |         #
122 |         # The code here works on networks trained using the current version of PARROT (2022). HOWEVER, it's possible
123 |         # that in previous versions the keys into the parameter file may be different or may have a prefix. The best example
124 |         # of this is that for the DSSP predictor the word `module.` appears in front of each keyword. If you
125 |         # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this
126 |         # keyword.
127 | 
128 |         # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords
129 |         # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to
130 |         # reach out to Alex or Dan about this!
131 | 
132 |         num_layers = 0
133 |         while True:
134 |             s = f'lstm.weight_ih_l{num_layers}'
135 |             try:
136 |                 temp = loaded_model[s]
137 |                 num_layers += 1
138 |             except KeyError:
139 |                 break
140 | 
141 | 
142 |         number_of_classes = np.shape(loaded_model['fc.bias'])[0]
143 | 
144 |         # hard coded because we always use one-hot encoding; note that if you trained a specific
145 |         # predictor on a different encoding scheme you could, of course, simply define that
146 |         # encoding scheme here
147 |         input_size = 20
148 | 
149 |         hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
150 | 
151 |         # set these here so we can sanity check if needed
152 |         self.number_of_classes = number_of_classes
153 |         self.input_size = input_size
154 |         self.number_of_layers = num_layers
155 |         self.hidden_vector_size = hidden_vector_size
156 | 
157 |         # Instantiate network weights into object
158 |         self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
159 | 
160 |         # load parameters into model
161 |         self.network.load_state_dict(loaded_model)
162 | 
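The shape-sniffing above relies on how torch stores LSTM parameters; a standalone sketch with a toy network (not a sparrow API - the saved PARROT networks prefix these keys with 'lstm.'):

import torch

state = torch.nn.LSTM(input_size=20, hidden_size=32, num_layers=2,
                      bidirectional=True).state_dict()

num_layers = 0
while f'weight_ih_l{num_layers}' in state:
    num_layers += 1

# the i/f/g/o gate weights are stacked along dim 0, hence the divide-by-4
hidden = state['weight_ih_l0'].shape[0] // 4

assert (num_layers, hidden) == (2, 32)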
163 | 
164 |     ## CHANGE FUNCTION NAME
165 |     def predict_pscore(self, seq):
166 |         """
167 | 
168 |         Prediction function. seq should be a valid amino acid sequence.
169 | 
170 |         Parameters
171 |         ------------
172 |         seq : str
173 |             Valid amino acid sequence
174 | 
175 |         Returns
176 |         ----------
177 |         np.ndarray
178 |             Returns a 1D np.ndarray the length of the sequence where each position
179 |             is the predicted value
180 | 
181 |         """
182 | 
183 |         # convert sequence to uppercase
184 |         seq = seq.upper()
185 | 
186 |         # Convert to a one-hot sequence vector - note, as mentioned above, if you
187 |         # didn't use one-hot in the original training you could just edit this here
188 |         seq_vector = encode_sequence.one_hot(seq)
189 |         seq_vector = seq_vector.view(1, len(seq_vector), -1)  # formatting
190 | 
191 | 
192 |         ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
193 |         ##   CHANGE CODE BELOW HERE   ##
194 |         ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
195 | 
196 | 
197 |         ## CLASSIFICATION CODE BLOCK
198 |         # The block below should be kept if we're doing a classification-
199 |         # based prediction! If not, comment this out or delete it.
200 |         #prediction = self.network(seq_vector.float()).detach().numpy()
201 |         #int_vals = []
202 |         #for row in prediction[0]:
203 |         #    int_vals.append(np.argmax(row))
204 | 
205 |         #prediction = int_vals
206 | 
207 |         ## REGRESSION CODE BLOCK
208 |         # This block should be kept if we're doing a regression-based
209 |         # prediction. If not, comment this out or delete it.
210 |         prediction = self.network(seq_vector.float()).detach().numpy().flatten()
211 | 
212 | 
213 |         return prediction
214 | 
--------------------------------------------------------------------------------
/sparrow/predictors/rg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/rg/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaled_re/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaled_re/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaled_rg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaled_rg/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaling_exponent/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaling_exponent/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/tad/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/tad/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/tad/transactivation_domain_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 | 
4 | import sparrow
5 | 
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 | 
11 | 
12 | """
13 | NB: This network and predictor were imported from GOOSE, so they are subtly different internally from how
14 | some of the other predictors work. Notably, this predictor includes a softmax projection and a loop
15 | (the loop below) to convert the raw output into probabilities - this may be because these networks have 2 layers
16 | whereas the others only have one? Anyway, just making a note of this in case we need to debug in
17 | the future.
18 | 
19 |     score = []
20 |     for val in prediction:
21 |         score.append(round(val[1],5))
22 | 
23 | 
24 | """
25 | 
26 | # NOTE - this is where you can define the version number that is read by default. If you add a new network, MAKE SURE you update
27 | # this default if you want that new network to be used by default.
28 | DEFAULT_VERSION="1"
29 | 
30 | def softmax(v):
31 |     return (np.e ** v) / np.sum(np.e ** v)
32 | 
33 | 
34 | ## CHANGE class name
35 | class TADPredictor():
36 |     """
37 | 
38 |     Class that loads in a network such that predict_transactivation_domains() can be called to predict
39 |     transactivation domain propensity from a sequence.
40 | 
41 |     """
42 |     def __init__(self, version=None):
43 |         """
44 |         Constructor for building a predictor object. The version keyword allows specific
45 |         version(s) of the trained network associated with the predictor to be defined.
46 | 
47 |         By default, it's set to None, which leads to the current best/default network being selected,
48 |         and this is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
49 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
50 | 
51 |             tad_predictor_network_v<version>.pt
52 | 
53 |         i.e. no need to include the "v" part or the .pt extension.
54 | 
55 |         """
56 | 
57 | 
58 | 
59 |         # if no version provided use the default, then grab the path and check that the file actually exists!
60 |         if version is None:
61 |             version = DEFAULT_VERSION
62 | 
63 |         # CHANGE THIS!! Make sure you change the <module> and <network filename> to the appropriate
64 |         # paths. Keep the network_v{version}.pt because this is how a version-specific file is selected.
65 |         saved_weights = sparrow.get_data(f'networks/transactivation_domains/tad_predictor_network_v{version}.pt')
66 | 
67 |         if not os.path.isfile(saved_weights):
68 |             raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' % (saved_weights, type(self).__name__))
69 | 
70 | 
71 |         # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU, mainly because
72 |         # we know everyone has a CPU...
73 |         loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
74 | 
75 |         ## DELETE ME PROBABLY
76 |         # this block of code is relevant ONLY if the trained network has the strange
77 |         # 'module.' text appended at the start of every keyword. This may happen in older
78 |         # versions of PARROT (see the DSSP predictor as an example of where it's needed), but
79 |         # networks trained in 2022 didn't need this. As such, this can PROBABLY be deleted, but
80 |         # in case you're using an older network we've kept this to make things simple.
81 | 
82 |         #for i in range(len(loaded_model)):
83 |         #    key, value = loaded_model.popitem(last=False)
84 |         #    new_key = key[7:]
85 |         #    loaded_model[new_key] = value
86 |         ## END OF DELETE ME PROBABLY
87 | 
88 | 
89 |         # Dynamically calculate the hyperparameters used to train the network.
90 |         ## NOTE:
91 |         #
92 |         # The code here works on networks trained using the current version of PARROT (2022). HOWEVER, it's possible
93 |         # that in previous versions the keys into the parameter file may be different or may have a prefix. The best example
94 |         # of this is that for the DSSP predictor the word `module.` appears in front of each keyword. If you
95 |         # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this
96 |         # keyword.
97 | 
98 |         # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords
99 |         # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to
100 |         # reach out to Alex or Dan about this!
101 | 
102 |         num_layers = 0
103 |         while True:
104 |             s = f'lstm.weight_ih_l{num_layers}'
105 |             try:
106 |                 temp = loaded_model[s]
107 |                 num_layers += 1
108 |             except KeyError:
109 |                 break
110 | 
111 |         number_of_classes = np.shape(loaded_model['fc.bias'])[0]
112 | 
113 |         # Hard coded because we always use one-hot encoding; note that if you trained a specific
114 |         # predictor on a different encoding scheme you could, of course, simply define that
115 |         # encoding scheme here
116 |         input_size = 20
117 | 
118 |         hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
119 | 
120 |         # set these here so we can sanity check if needed
121 |         self.number_of_classes = number_of_classes
122 |         self.input_size = input_size
123 |         self.number_of_layers = num_layers
124 |         self.hidden_vector_size = hidden_vector_size
125 | 
126 |         # Instantiate network weights into object
127 |         self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
128 | 
129 |         # load parameters into model
130 |         self.network.load_state_dict(loaded_model)
131 | 
132 | 
133 | 
134 |     def predict_transactivation_domains(self, seq):
135 |         """
136 |         Function to predict the presence of transactivation domains (TADs). Returns a per-
137 |         residue probability score of a residue being in a TAD or not.
138 | 
139 |         Parameters
140 |         ------------
141 |         seq : str
142 |             Valid amino acid sequence
143 | 
144 |         Returns
145 |         ----------
146 |         list
147 |             Returns a list the length of the sequence where each position
148 |             gives the probability of that residue being in a TAD
149 | 
150 |         """
151 | 
152 |         # convert sequence to uppercase
153 |         seq = seq.upper()
154 | 
155 |         # Convert to a one-hot sequence vector - note, as mentioned above, if you
156 |         # didn't use one-hot in the original training you could just edit this here
157 |         seq_vector = encode_sequence.one_hot(seq)
158 |         seq_vector = seq_vector.view(1, len(seq_vector), -1)  # formatting
159 | 
160 | 
161 |         ## FORWARD PASS
162 |         # run the forward pass and flatten the raw network output; it is
163 |         # reshaped into per-class scores and softmaxed below
164 |         prediction = self.network(seq_vector.float()).detach().numpy().flatten()
165 | 
166 |         prediction = prediction.reshape(-1, self.number_of_classes)
167 |         prediction = np.array(list(map(softmax, prediction)))
168 | 
169 |         ## CLIP
170 |         # IF we want to ensure we have a value between 0 and 1, the clipping here
171 |         # will do that. If not, leave it commented.
172 |         #prediction = np.clip(prediction, 0.0, 1.0)
173 | 
174 |         # finally we extract out the local probabilities (probability of the positive class at each position)
175 |         score = []
176 |         for val in prediction:
177 |             score.append(round(val[1], 5))
178 | 
179 |         return score
180 | 
181 | 
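The two-class softmax-then-extract pattern noted at the top of this file, as a self-contained sketch with a numerically stable softmax (fabricated logits; the predictor itself uses the simpler exponential form above):

import numpy as np

def stable_softmax(v):
    e = np.exp(v - np.max(v))   # subtracting the max avoids overflow for large logits
    return e / e.sum()

logits = np.array([2.0, -1.0])  # hypothetical 2-class output for one residue
probs = stable_softmax(logits)
print(round(probs[1], 5))       # probability of the positive (TAD) class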
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
--------------------------------------------------------------------------------
/sparrow/predictors/transmembrane/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/transmembrane/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/transmembrane/transmembrane_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 | 
4 | import sparrow
5 | 
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 | 
11 | 
12 | 
13 | """
14 | Predictor of transmembrane regions from sequence.
15 | 
16 | This is an example of how to implement a system-specific predictor
17 | in sparrow and could/should be used as a template for adding in
18 | additional predictors.
19 | 
20 | 
21 | ## Nomenclature
22 | 
23 | 1. The predictor file should be called <predictor_name>_predictor.py
24 | 2. This should be inside a module in the /predictor/ directory called <predictor_name>
25 | 3. The single class this module implements should be called <PredictorName>Predictor
26 | 
27 | e.g. here we have
28 | 
29 | 1. transmembrane/
30 | 2. transmembrane_predictor.py
31 | 3. TransmembranePredictor
32 | 
33 | 
34 | ## Class structure
35 | 
36 | The class should have (at least) two functions:
37 | 
38 | 1. A constructor (__init__()) which PRE-LOADS the network from sparrow/data/networks/<relevant_name> - the get_data() function
39 |    is defined in sparrow/__init__.py and allows absolute-path access to the /data directory. The constructor should
40 |    FULLY load the network along with standard PARROT-style options, as shown here. Trained networks should be versioned and
41 |    implemented so previous versions can be chosen even if the default version changes.
42 | 
43 | 2. Define a function called predict_<predictor_name>(self, seq) where <predictor_name> is a convenient name that obviously means this is
44 |    what the function does.
45 | 
46 | 
47 | """
48 | 
49 | DEFAULT_VERSION="4"
50 | 
51 | 
52 | class TransmembranePredictor():
53 |     """
54 | 
55 |     Class that loads in a network such that predict_transmebrane_regions() can be called to predict
56 |     transmembrane regions in a sequence.
57 | 
58 |     """
59 |     def __init__(self, version=None):
60 |         """
61 |         Constructor for building a TransmembranePredictor object. The version keyword allows specific
62 |         version(s) of the trained network associated with the TransmembranePredictor to be defined.
63 |         By default, it's set to None, which leads to the current best/default network being selected,
64 |         and this is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
65 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
66 | 
67 |             transmembrane_predictor_network_v<version>.pt
68 | 
69 |         i.e. no need to include the "v" part or the .pt extension.
70 | 
71 |         """
72 | 
73 |         if version is None:
74 |             version = DEFAULT_VERSION
75 | 
76 |         saved_weights = sparrow.get_data(f'networks/transmembrane/transmembrane_predictor_network_v{version}.pt')
77 | 
78 |         if not os.path.isfile(saved_weights):
79 |             raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' % (saved_weights, type(self).__name__))
80 | 
81 | 
82 |         # read in the saved network weights (forced to CPU)
83 |         loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
84 | 
85 |         # Dynamically read in the correct hyperparameters:
86 |         num_layers = 0
87 |         while True:
88 |             s = f'lstm.weight_ih_l{num_layers}'
89 |             try:
90 |                 temp = loaded_model[s]
91 |                 num_layers += 1
92 |             except KeyError:
93 |                 break
94 | 
95 |         number_of_classes = np.shape(loaded_model['fc.bias'])[0]
96 |         input_size = 20  # hard coded because we always use one-hot encoding
97 | 
98 |         hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
99 | 
100 |         # set these here so we can sanity check if needed
101 |         self.number_of_classes = number_of_classes
102 |         self.input_size = input_size
103 |         self.number_of_layers = num_layers
104 |         self.hidden_vector_size = hidden_vector_size
105 | 
106 |         # Instantiate network weights into object
107 |         self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
108 | 
109 |         self.network.load_state_dict(loaded_model)
110 | 
111 | 
112 | 
113 |     def predict_transmebrane_regions(self, seq):
114 |         """
115 |         Prediction function. seq should be a valid amino acid sequence.
116 | 
117 |         Parameters
118 |         ------------
119 |         seq : str
120 |             Valid amino acid sequence
121 | 
122 |         Returns
123 |         ----------
124 |         list
125 |             Returns a list the length of the sequence where each position
126 |             is the predicted class label (via argmax) at that position.
127 | 
128 |         """
129 | 
130 |         # convert sequence to uppercase
131 |         seq = seq.upper()
132 | 
133 |         # Convert to a one-hot sequence vector
134 |         seq_vector = encode_sequence.one_hot(seq)
135 |         seq_vector = seq_vector.view(1, len(seq_vector), -1)  # formatting
136 | 
137 |         # Forward pass - this is specific for classification
138 |         prediction = self.network(seq_vector.float()).detach().numpy()
139 |         int_vals = []
140 |         for row in prediction[0]:
141 |             int_vals.append(np.argmax(row))
142 | 
143 |         prediction = int_vals
144 | 
145 | 
146 |         # for regression use the lines below instead - included here so this
147 |         # file can be easily copied over for future predictors
148 |         # prediction = self.network(seq_vector.float()).detach().numpy().flatten()
149 |         # prediction = np.clip(prediction, 0.0, 1.0)
150 | 
151 |         return prediction
152 | 
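The per-residue class decode above is just an argmax over the class dimension; a sketch with a fabricated (1 batch, 4 residues, 2 classes) output array:

import numpy as np

raw = np.array([[[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.6, 0.4]]])
labels = [int(np.argmax(row)) for row in raw[0]]
print(labels)  # [1, 0, 1, 0]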
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/sequence_analysis/__init__.py
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/alignment.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Union
2 | 
3 | from protfasta import read_fasta, write_fasta
4 | from pyfamsa import Aligner, Sequence
5 | 
6 | from sparrow import Protein
7 | from sparrow.visualize.sequence_visuals import show_sequence
8 | 
9 | 
10 | class SequenceAlignment:
11 |     def __init__(
12 |         self,
13 |         input_data: Union[str, Dict[str, Protein]],
14 |         threads: int = 0,
15 |         scoring_matrix: str = "BLOSUM62",
16 |         guide_tree: str = "upgma",
17 |         tree_heuristic: Union[str, None] = None,
18 |         medoid_threshold: int = 0,
19 |         n_refinements: int = 200,
20 |         keep_duplicates: bool = False,
21 |         refine: Union[bool, None] = None,
22 |     ):
23 |         """
24 |         Initialize the SequenceAlignment object.
25 | 
26 |         Parameters
27 |         ----------
28 |         input_data : Union[str, Dict[str, Protein]]
29 |             A path to a FASTA file, or a dictionary mapping FASTA headers to
30 |             Protein objects. The remaining keyword arguments are passed to pyfamsa's Aligner.
31 |         """
32 |         self.input_data = input_data
33 |         self.threads = threads
34 |         self.guide_tree = guide_tree
35 |         self.tree_heuristic = tree_heuristic
36 |         self.medoid_threshold = medoid_threshold
37 |         self.n_refinements = n_refinements
38 |         self.keep_duplicates = keep_duplicates
39 |         self.refine = refine
40 |         self.scoring_matrix = scoring_matrix
41 |         self.aligner = self._initialize_aligner()
42 |         self._cached_msa = None  # Cache for the computed MSA
43 | 
44 |     def _initialize_aligner(self) -> Aligner:
45 |         """
46 |         Initialize the Aligner object with the given parameters.
47 |         """
48 |         return Aligner(
49 |             threads=self.threads,
50 |             guide_tree=self.guide_tree,
51 |             tree_heuristic=self.tree_heuristic,
52 |             medoid_threshold=self.medoid_threshold,
53 |             n_refinements=self.n_refinements,
54 |             keep_duplicates=self.keep_duplicates,
55 |             refine=self.refine,
56 |             scoring_matrix=self.scoring_matrix,
57 |         )
58 | 
59 |     @staticmethod
60 |     def _encode_string(string_to_encode: str, encoding: str = "utf-8") -> bytes:
61 |         """
62 |         Encode a string to bytes using the specified encoding.
63 |         """
64 |         return string_to_encode.encode(encoding)
65 | 
66 |     def _load_sequences(self) -> List[Sequence]:
67 |         """
68 |         Load sequences from either a FASTA file or a dictionary of
69 |         header-to-Protein mappings.
70 | 
71 |         Returns
72 |         -------
73 |         List[Sequence]
74 |             A list of pyfamsa.Sequence objects for alignment.
75 |         """
76 |         if isinstance(self.input_data, str):
77 |             # Assume input_data is a path to a FASTA file
78 |             fasta_data = read_fasta(self.input_data)
79 |             sequences = [
80 |                 Sequence(self._encode_string(header), self._encode_string(seq))
81 |                 for header, seq in fasta_data.items()
82 |             ]
83 |         elif isinstance(self.input_data, dict):
84 |             # Assume input_data is a dictionary of header-to-Protein mappings
85 |             sequences = [
86 |                 Sequence(self._encode_string(header), self._encode_string(seq.sequence))
87 |                 for header, seq in self.input_data.items()
88 |             ]
89 |         else:
90 |             raise ValueError(
91 |                 "Invalid input_data format. Must be either a path to a FASTA "
92 |                 "file or a dictionary of header-to-Protein mappings."
93 |             )
94 | 
95 |         return sequences
96 | 
97 |     def construct_msa(self):
98 |         """
99 |         Construct a multiple sequence alignment with pyFAMSA.
100 | 
101 |         Returns
102 |         -------
103 |         Alignment
104 |             Returns the constructed MSA as a pyfamsa Alignment.
105 |         """
106 |         if self._cached_msa is not None:
107 |             # Return cached MSA if it exists
108 |             return self._cached_msa
109 | 
110 |         sequences = self._load_sequences()
111 |         self._cached_msa = self.aligner.align(sequences)  # Cache the computed MSA
112 |         return self._cached_msa
113 | 
114 |     @property
115 |     def alignment(self):
116 |         """
117 |         Property to access the cached MSA result.
118 | 
119 |         Returns
120 |         -------
121 |         Alignment
122 |             Returns the cached MSA if available, otherwise computes it.
123 |         """
124 |         if self._cached_msa is None:
125 |             # Compute MSA if it hasn't been computed yet
126 |             self.construct_msa()
127 |         return self._cached_msa
128 | 
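A minimal usage sketch of the class so far, assuming pyfamsa is installed (the two sequences are fabricated):

from sparrow import Protein
from sparrow.sequence_analysis.alignment import SequenceAlignment

proteins = {"seq1": Protein("MKVLAEGHKQD"), "seq2": Protein("MKVLAEHKQD")}
msa = SequenceAlignment(proteins).alignment   # computed once, cached thereafter
for s in msa:
    print(s.id.decode(), s.sequence.decode())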
129 |     def save_msa(
130 |         self, filename: str, linelength: int = 60, append_to_fasta: bool = False
131 |     ):
132 |         """
133 |         Save the multiple sequence alignment to a FASTA file.
134 | 
135 |         Parameters
136 |         ----------
137 |         filename : str
138 |             The filename to save the MSA. Should end with .fasta or .fa.
139 | 
140 |         linelength : int, optional
141 |             Length of lines in the output file, by default 60.
142 | 
143 |         append_to_fasta : bool, optional
144 |             Whether to append to an existing FASTA file, by default False.
145 |         """
146 |         msa = self.alignment
147 |         fasta_data = {seq.id.decode(): seq.sequence.decode() for seq in msa}
148 |         write_fasta(
149 |             fasta_data, filename, linelength=linelength, append_to_fasta=append_to_fasta
150 |         )
151 | 
152 |     # NB: not a @property - this method takes arguments, so it must be called.
153 |     def display_msa(self, ljust: int = 10, html: bool = False):
154 |         """
155 |         Print the multiple sequence alignment using the cached MSA.
156 | 
157 |         Parameters
158 |         ----------
159 |         ljust : int, optional
160 |             The number of spaces to pad the sequence ID, by default 10
161 | 
162 |         html : bool, optional
163 |             Set to True to print the alignment in HTML format, by default False
164 |         """
165 |         msa = self.alignment
166 | 
167 |         for seq in msa:
168 |             if html:
169 |                 print(seq.id.decode().ljust(ljust))
170 |                 show_sequence(seq.sequence.decode())
171 |             else:
172 |                 print(seq.id.decode().ljust(ljust), seq.sequence.decode())
173 | 
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/community_plugins/contributed.py:
--------------------------------------------------------------------------------
1 | from sparrow.sequence_analysis.plugins import BasePlugin
2 | 
3 | 
4 | class MultiplicativeFCR(BasePlugin):
5 |     def __init__(self, protein):
6 |         super().__init__(protein)
7 | 
8 |     def calculate(self, factor=2.0):
9 |         """
10 |         This analysis doubles (by default) the FCR (fraction of charged residues) of the protein.
11 |         This is a simple example of a contributed plugin.
12 | 
13 |         Parameters
14 |         -------------
15 |         factor : float
16 |             The factor by which the FCR will be multiplied (default is 2.0)
17 | 
18 |         Returns
19 |         -------------
20 |         float
21 |             Returns the result of the contributed analysis
22 |         """
23 |         return factor * self.protein.FCR
24 | 
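A hypothetical round-trip through the plugin machinery defined in plugins.py later in this module (assumes sparrow's Protein exposes the FCR property the plugin relies on; the sequence is made up):

from sparrow import Protein
from sparrow.sequence_analysis.plugins import PluginManager

p = Protein("MKRRDDEEAGSNNTT")
mgr = PluginManager(p)
print(mgr.MultiplicativeFCR(factor=2.0))  # 2 x FCR; result is cached per argument set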
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/elm.py:
--------------------------------------------------------------------------------
1 | import re
2 | from dataclasses import dataclass
3 | from typing import List, Set, Tuple, Union
4 | import pandas as pd
5 | 
6 | 
7 | import sparrow
8 | from sparrow.sparrow_exceptions import SparrowException
9 | 
10 | 
11 | @dataclass(frozen=True)
12 | class ELM:
13 |     regex: str
14 |     identifier: str
15 |     functional_site_name: str
16 |     description: str
17 |     probability: float
18 |     start: int
19 |     end: int
20 |     sequence: str
21 | 
22 |     def __eq__(self, other):
23 |         if self.start > other.end or self.end < other.start:
24 |             return False
25 | 
26 |         # Only compare regex patterns for equality - all regexes for ELMs are unique - we could also check functional site names?
27 |         return self.regex == other.regex
28 | 
29 |     def __hash__(self):
30 |         # I THINK this works since we're basically saying we don't CARE whether sequences are the same or not;
31 |         # this will let us do set differences and intersections.
32 |         # It does restrict a motif to starting at the same position, though, which we know could be at different spots after indels.
33 |         # This is fine for point-mutation comparison, but it could probably be generalized.
34 |         # We don't want to just look for "in the sequence" because there might be
35 |         # multiple occurrences of the same motif, and motif positioning may matter.
36 |         return hash((self.regex, self.functional_site_name, self.start))
37 | 
38 | 
39 | def parse_hgvs(hgvs_notation : str) -> Tuple:
40 |     """This function takes an HGVS notation and returns a tuple of the form (position, mutation),
41 |     where position is the position of the mutation and mutation is the amino acid change.
42 | 
43 |     Parameters
44 |     ----------
45 |     hgvs_notation : str
46 |         HGVS notation of the form p.XXXX
47 | 
48 |     Returns
49 |     -------
50 |     Tuple[int,str]
51 |         Tuple containing the 0-indexed position of the mutation and the amino acid change.
52 |     """
53 |     if not hgvs_notation.startswith("p."):
54 |         raise SparrowException("Invalid HGVS notation. Must start with 'p.'")
55 | 
56 |     parts = hgvs_notation.split('p.')
57 |     if len(parts) < 2:
58 |         raise SparrowException("Invalid HGVS notation. Must be in the form p.xxx")
59 | 
60 |     # Extract the (1-indexed) position and the amino acid change
61 |     position = int(''.join(filter(str.isdigit, parts[1])))
62 |     if position < 1: raise SparrowException(f"Invalid position in HGVS notation; must be a 1-indexed integer greater than 0. Received {position}")
63 |     mutation = parts[1][-1]
64 | 
65 |     # position shifted to 0 index
66 |     return position-1, mutation.upper()
67 | 
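A worked example of the one-to-zero index shift above (runs as-is alongside the function):

position, mutation = parse_hgvs("p.R47K")
print(position, mutation)   # -> 46 K  (1-indexed 47 becomes 0-indexed 46)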
68 | def generate_elm_df(file : str) -> pd.DataFrame:
69 |     """Generates a pandas DataFrame object containing all the information
70 |     annotated as an ELM.
71 | 
72 |     Parameters
73 |     ----------
74 |     file : str
75 |         Path to an ELM class file; this generates a dataframe from the elm_classes.tsv in the data directory.
76 |         The latest ELM class list can be found at http://elm.eu.org/downloads.html
77 | 
78 |     Returns
79 |     -------
80 |     pandas.DataFrame
81 |         DataFrame containing the ELM annotations.
82 | 
83 |     """
84 |     elm_data = []
85 |     with open(file, "r", encoding="utf-8") as f:
86 |         for line in f:
87 |             if line.startswith("#"):
88 |                 continue
89 |             if line.startswith('"Accession"'):
90 |                 columns = line.strip().split("\t")
91 |                 columns = [col.replace('"', '') for col in columns]
92 |             else:
93 |                 elm_data.append(line.replace('"', '').strip().split("\t"))
94 |     df = pd.DataFrame(elm_data, columns=columns)
95 |     return df
96 | 
97 | def find_all_elms(sequence : str) -> Set[ELM]:
98 |     """This function takes an input sequence and returns a set of ELM objects, each
99 |     containing the regex used to find the ELM in the sequence, its functional annotation,
100 |     the start and stop positions, and the sequence of the ELM itself.
101 | 
102 |     Parameters
103 |     ----------
104 |     sequence : str
105 |         Amino acid sequence
106 | 
107 |     Returns
108 |     -------
109 |     Set[sparrow.sequence_analysis.elm.ELM]
110 |         A set of ELM dataclass instances covering all possible ELMs in the given sequence.
111 |     """
112 |     elm_file = sparrow.get_data("elm_classes.tsv")
113 |     df = generate_elm_df(elm_file)
114 |     elms = []
115 |     for _, row in df.iterrows():
116 |         regex = row["Regex"]
117 |         elm_class = row["ELMIdentifier"]
118 |         site = row["FunctionalSiteName"]
119 |         elm_description = row["Description"]
120 |         elm_probability = row["Probability"]
121 | 
122 |         match_indices = [(m.start(0), m.end(0)) for m in re.finditer(regex, sequence)]
123 |         for (start, end) in match_indices:
124 |             elm = ELM(regex, elm_class, site, elm_description, elm_probability, start, end, sequence[start:end])
125 |             elms.append(elm)
126 |     return set(elms)
127 | 
128 | def compute_lost_elms(target_protein, query : Union[Tuple[int,str], str]) -> Set:
129 |     """This function takes a protein sequence and a target query and returns
130 |     the set of ELMs that were lost due to the mutation. The query can either be
131 |     a tuple of the form (position, mutant), where position is the position of
132 |     the mutation, or a string in the HGVS format.
133 | 
134 |     Parameters
135 |     ----------
136 |     target_protein : Union[sparrow.Protein, str]
137 |         sparrow.Protein or amino acid sequence
138 |     query : Union[str, Tuple[int,str]]
139 |         Tuple of the form (position, mutant) or an HGVS string.
140 | 
141 |     Returns
142 |     -------
143 |     Set
144 |         A set of ELMs containing the functional site name, the start and stop
145 |         positions, and the sequence of the ELM.
146 |     """
147 | 
148 |     if isinstance(target_protein, str):
149 |         target_protein = sparrow.Protein(target_protein)
150 | 
151 |     if isinstance(query, str):
152 |         position, mutation = parse_hgvs(query)
153 |     else:
154 |         position, mutation = query
155 | 
156 |     mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
157 | 
158 |     wt_elms = target_protein.elms
159 |     mutant_elms = mutant_protein.elms
160 |     lost_elms = wt_elms - mutant_elms
161 | 
162 |     return lost_elms
163 | 
164 | def compute_gained_elms(target_protein, query : Union[Tuple[int,str], str]) -> Set:
165 |     """This function takes a protein sequence and a target query and returns
166 |     the set of ELMs that were gained due to the mutation. The query can either be
167 |     a tuple of the form (position, mutant), where position is the position of
168 |     the mutation, or a string in the HGVS format.
169 | 
170 |     Parameters
171 |     ----------
172 |     target_protein : Union[sparrow.Protein, str]
173 |         sparrow.Protein or amino acid sequence
174 |     query : Union[str, Tuple[int,str]]
175 |         Tuple of the form (position, mutant) or an HGVS string.
176 | 
177 |     Returns
178 |     -------
179 |     Set
180 |         A set of ELMs containing the functional site name, the start and stop
181 |         positions, and the sequence of the ELM.
182 |     """
183 | 
184 |     if isinstance(target_protein, str):
185 |         target_protein = sparrow.Protein(target_protein)
186 | 
187 |     if isinstance(query, str):
188 |         position, mutation = parse_hgvs(query)
189 |     else:
190 |         position, mutation = query
191 | 
192 |     mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
193 | 
194 |     wt_elms = target_protein.elms
195 |     mutant_elms = mutant_protein.elms
196 |     gained_elms = mutant_elms - wt_elms
197 | 
198 |     return gained_elms
199 | 
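A hypothetical comparison run for the two helpers above (toy sequence; requires sparrow's bundled elm_classes.tsv and a Protein object exposing .elms):

wt = "MKSRRSPSPRRSKSRSP"                    # fabricated sequence
lost = compute_lost_elms(wt, "p.S3A")       # HGVS form
gained = compute_gained_elms(wt, (2, "A"))  # same mutation, (position, mutant) form
print(len(lost), len(gained))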
200 | def compute_retained_elms(target_protein, query : Union[Tuple[int,str], str]) -> Set:
201 |     """This function takes a protein sequence and a target query and returns
202 |     the set of ELMs that were retained (no change) after the mutation. The query
203 |     can either be a tuple of the form (position, mutant), where position is the
204 |     position of the mutation, or a string in the HGVS format.
205 | 
206 |     Parameters
207 |     ----------
208 |     target_protein : Union[sparrow.Protein, str]
209 |         sparrow.Protein or amino acid sequence
210 |     query : Union[str, Tuple[int,str]]
211 |         Tuple of the form (position, mutant) or an HGVS string.
212 | 
213 |     Returns
214 |     -------
215 |     Set
216 |         A set of ELMs containing the functional site name, the start and stop
217 |         positions, and the sequence of the ELM.
218 |     """
219 |     if isinstance(target_protein, str):
220 |         target_protein = sparrow.Protein(target_protein)
221 | 
222 |     if isinstance(query, str):
223 |         position, mutation = parse_hgvs(query)
224 |     else:
225 |         position, mutation = query
226 | 
227 |     mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
228 | 
229 |     wt_elms = target_protein.elms
230 |     mutant_elms = mutant_protein.elms
231 |     retained_elms = wt_elms & mutant_elms
232 | 
233 |     return retained_elms
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/phospho_isoforms.py:
--------------------------------------------------------------------------------
1 | """
2 | Snippet built and adapted from localcider to get all phosphoisoforms
3 | of an amino acid sequence; it is integrated into sparrow.
4 | 
5 | By : Garrett M. Ginell
6 | 2023-02-08
7 | 
8 | The BASIC workflow is as follows:
9 | 
10 | To get a list of phosphoisoforms, run get_phosphoisoforms:
11 | 
12 |     phosphoSeqome = get_phosphoisoforms(sequence, mode='predict')
13 |     # for options see the various run variations in the function header
14 | 
15 | Then, once you have the phosphoisoforms from the list above, you can iterate over
16 | the list, calculate a sequence parameter of choice, and build a distribution:
17 | 
18 |     parameter_list = []
19 |     for s in phosphoSeqome:
20 |         parameter_list.append(Protein(s).my_parameter_of_choice)
21 | 
22 | This distribution can then be compared back to the value of the original sequence:
23 | 
24 |     Protein(sequence).my_parameter_of_choice
25 | """
26 | import itertools
27 | 
28 | ## -----------------------------------------
29 | ##
30 | def _predict_all_phosphosites(protein):
31 |     """
32 |     Gets a list of predicted phosphosites.
33 | 
34 |     BASED OFF OF the phosphorylation predictors in sparrow:
35 |     https://github.com/idptools/sparrow/tree/main/sparrow/predictors/phosphorylation
36 | 
37 | 
38 |     Parameters
39 |     ------------
40 |     protein : sparrow.Protein
41 |         sparrow Protein object
42 | 
43 | 
44 |     Returns:
45 |     ----------
46 |     list
47 |         list of predicted positions of phosphorylated T, S, and Y sites.
48 |         Note positions are returned as indexed from 0.
49 | 
50 |     """
51 | 
52 |     # predict phosphosites
53 |     pS = protein.predictor.serine_phosphorylation(return_sites_only=True)
54 |     pT = protein.predictor.threonine_phosphorylation(return_sites_only=True)
55 |     pY = protein.predictor.tyrosine_phosphorylation(return_sites_only=True)
56 | 
57 |     return list(pS + pT + pY)
58 | 
59 | ## ----------------------------------------
60 | ##
61 | def _get_all_phosphosites(sequence):
62 |     """
63 |     Function which returns a list of all the positions which *could* be
64 |     phosphorylated (i.e. are T/S/Y). NOTE this does not use any kind of
65 |     smart lookup, metadata, or analysis. It's literally: where are the Y/T/S residues?
66 | 
67 |     Note positions are returned as indexed from 0.
68 | 
69 |     Parameters
70 |     ------------
71 |     sequence : str
72 |         Valid amino acid sequence
73 | 
74 |     Returns:
75 |     ----------
76 |     list
77 |         list of integers corresponding to S/T/Y positions in your sequence
78 | 
79 |     """
80 |     sites = []
81 |     idx = 0
82 |     for i in sequence:
83 |         if i in ["Y", "S", "T"]:
84 |             sites.append(idx)
85 |         idx = idx + 1
86 |     return sites
87 | 
88 | ## -----------------------------------
89 | ##
90 | def _build_phosphoSeqome(sequence, phosphosites, phospho_rate=1):
91 |     """
92 |     Build all phospho-isoforms based on the provided phosphosites.
93 | 
94 |     Parameters
95 |     ------------
96 |     sequence : str
97 |         Valid amino acid sequence
98 | 
99 |     phosphosites : list
100 |         List of valid phosphosite positions
101 | 
102 |     phospho_rate : float
103 |         Value between 0 and 1 which defines the maximum fraction of phosphosites
104 |         that can be 'phosphorylated' in each sequence. Default is 1 (i.e. all sites
105 |         can be phosphorylated).
106 | 
107 |     Returns
108 |     ----------
109 |     list
110 |         list of sequences for all possible phospho-isoforms,
111 |         based on the provided list of phosphosites.
112 | 
113 |         When phospho_rate = 1 (100%),
114 |         the length of the output list = 2^n where n=len(phosphosites)
115 |     """
116 | 
117 |     _max_phospho_number = int(len(phosphosites)*phospho_rate)
118 |     ## GET ALL phospho-sequence combinations
119 |     phosphoSeqome = []
120 |     for phosphostatus in itertools.product("01", repeat=len(phosphosites)):
121 | 
122 |         if phosphostatus.count('1') > _max_phospho_number:
123 |             continue
124 |         newseq = list(sequence)
125 | 
126 |         indx = 0
127 |         # loop over each element in our phosphosite on/off list
128 |         for i in phosphostatus:
129 |             # if that element is ON
130 |             if int(i) == 1:
131 |                 # set the AA at that position to a negative residue (we use E but
132 |                 # it could be D)
133 |                 newseq[phosphosites[indx]] = "E"
134 |             indx = indx + 1
135 | 
136 |         # now we've replaced some number of T/Y/S with E, representing a different
137 |         # phosphostate
138 |         newseq = "".join(newseq)
139 |         phosphoSeqome.append(newseq)
140 | 
141 |     return phosphoSeqome
142 | 
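A toy enumeration mirroring the itertools.product logic above: two candidate sites yield 2^2 isoforms (sequence and site indices fabricated):

import itertools

seq, sites = "ASTA", [1, 2]
isoforms = []
for status in itertools.product("01", repeat=len(sites)):
    s = list(seq)
    for flag, pos in zip(status, sites):
        if flag == "1":
            s[pos] = "E"          # phosphomimetic substitution
    isoforms.append("".join(s))
print(isoforms)  # ['ASTA', 'ASEA', 'AETA', 'AEEA']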
143 | ## -----------------------------------
144 | ##
145 | def get_phosphoisoforms(protein, mode="all", phospho_rate=1, phosphosites=None):
146 |     """Phosphosites are replaced with the phosphomimetic 'E', enabling approximate calculation
147 |     of charge-based sequence features in the presence of phosphorylated residues.
148 | 
149 |     Parameters
150 |     ----------
151 |     protein : sparrow.Protein
152 |         sparrow Protein object
153 | 
154 |     mode : str, optional
155 |         Definition for how the phosphosites should be determined, by default "all"
156 | 
157 |         'all' : Assumes all S/T/Y residues are potential phosphosites
158 | 
159 |         'predict' : Leverages PARROT-trained predictors via _predict_all_phosphosites
160 |         to predict phosphorylated sites based on sequence.
161 | 
162 |         'custom' : uses the 'phosphosites' parameter as indices for phosphosites.
163 | 
164 |     phospho_rate : float, optional
165 |         Value between 0 and 1 which defines the maximum fraction of phosphosites that
166 |         can be 'phosphorylated' in each sequence, by default 1 (i.e. all sites can be
167 |         phosphorylated)
168 | 
169 |     phosphosites : list, optional
170 |         Custom list of indices for valid phosphosite positions, by default None
171 | 
172 |     Returns
173 |     -------
174 |     list
175 |         list of sequences for the possible phosphoisoforms based on the selected method.
176 |         Phosphorylatable amino acids are replaced with 'E'.
177 |     """
178 | 
179 |     # get phosphosite positions
180 |     if mode == 'all':
181 |         _phosphosites = _get_all_phosphosites(protein.sequence)
182 |     elif mode == 'predict':
183 |         _phosphosites = _predict_all_phosphosites(protein)
184 |     elif mode == 'custom':
185 |         if phosphosites is not None:
186 |             _phosphosites = phosphosites
187 |         else:
188 |             raise Exception("To use mode 'custom', phosphosites must be defined")
189 |     else:
190 |         raise Exception("Please specify a valid mode ('all', 'predict', or 'custom')")
191 | 
192 |     # generate all phospho-isoforms
193 |     return _build_phosphoSeqome(protein.sequence, _phosphosites, phospho_rate=phospho_rate)
194 | 
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/physical_properties.py:
--------------------------------------------------------------------------------
1 | from sparrow.data import amino_acids
2 | 
3 | ## The physical properties module contains stateless functions that compute sequence-dependent
4 | ## physical properties. See the "calculate_molecular_weight" function as a template for how
5 | ## these functions should work.
6 | ##
7 | ##
8 | 
9 | def calculate_molecular_weight(sequence):
10 |     """
11 |     Function that returns the molecular weight of a protein sequence assuming standard
12 |     amino acid molecular weights.
13 | 
14 |     Parameters
15 |     -------------
16 |     sequence : str
17 |         String containing the amino acid sequence (upper case one-letter residue codes)
18 | 
19 |     Returns
20 |     -----------
21 |     float
22 |         Returns the residue or polypeptide molecular weight.
23 | 
24 |     """
25 | 
26 |     # compute the naive MW (sum of free amino acid weights)
27 |     MW = 0
28 |     for i in sequence:
29 |         MW = MW + amino_acids.AA_MOLECULAR_WEIGHT[i]
30 | 
31 |     if len(sequence) == 1:
32 |         return MW
33 | 
34 |     else:
35 |         return MW - 18*(len(sequence)-1)   # subtract one water (18 Da) per peptide bond
36 | 
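A quick check of the bookkeeping above (a sketch; the exact masses depend on the values in amino_acids.AA_MOLECULAR_WEIGHT): a dipeptide's weight is the two residue weights minus one water.

from sparrow.sequence_analysis.physical_properties import calculate_molecular_weight

mw_g = calculate_molecular_weight("G")    # single residue: free glycine
mw_gg = calculate_molecular_weight("GG")  # dipeptide: 2*G - 18
assert abs((2 * mw_g - 18) - mw_gg) < 1e-6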
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/plugins.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import inspect
3 | import pkgutil
4 | from abc import ABC, abstractmethod
5 | from collections import defaultdict
6 | from typing import Any
7 | 
8 | 
9 | class PluginWrapper:
10 |     """
11 |     A wrapper class for plugins that integrates with the plugin manager.
12 | 
13 |     This class is responsible for managing the execution of plugin instances
14 |     and caching their results to avoid redundant computations. It uses a
15 |     combination of the plugin name and the arguments passed to the plugin's
16 |     `calculate` method to create a unique cache key for storing results.
17 | 
18 |     Attributes:
19 |         name (str): The name of the plugin.
20 |         cache_dict (dict): A dictionary used to store cached results.
21 |         plugin_instance (object): An instance of the plugin to be wrapped.
22 | 
23 |     Methods:
24 |         __call__(*args, **kwargs):
25 |             Executes the plugin's `calculate` method with the provided arguments.
26 |             Caches the result to avoid recomputation on subsequent calls with
27 |             the same arguments.
28 |     """
29 | 
30 |     def __init__(self, name, cache_dict, plugin_instance):
31 |         self.name = name
32 |         self.cache_dict = cache_dict
33 |         self.plugin_instance = plugin_instance
34 | 
35 |     def __call__(self, *args, **kwargs):
36 |         """
37 |         Call calculate() with or without arguments.
38 |         Implements caching to avoid recomputation.
39 |         """
40 |         # Create a hashable cache key from args and kwargs
41 |         cache_key = (args, frozenset(kwargs.items()))
42 | 
43 |         # Check if the result is cached
44 |         if cache_key not in self.cache_dict[self.name]:
45 |             self.cache_dict[self.name][cache_key] = self.plugin_instance.calculate(
46 |                 *args, **kwargs
47 |             )
48 | 
49 |         return self.cache_dict[self.name][cache_key]
50 | 
51 | 
52 | class PluginManager:
53 |     def __init__(self, protein: "sparrow.Protein"):
54 |         self.__protein_obj = protein
55 |         # Memoization for both args and no-args results
56 |         self.__precomputed = defaultdict(dict)
57 |         self.__plugins = {}
58 | 
59 |         self._available_plugins = self._discover_plugins()
60 | 
61 |     def _discover_plugins(self):
62 |         """
63 |         Discover all plugins available in the contributed plugin module.
64 |         """
65 |         plugin_module = "sparrow.sequence_analysis.community_plugins.contributed"
66 |         try:
67 |             module = importlib.import_module(plugin_module)
68 |             return [
69 |                 name
70 |                 for name, obj in inspect.getmembers(module, inspect.isclass)
71 |                 if issubclass(obj, BasePlugin) and obj.__module__ == plugin_module
72 |             ]
73 |         except ModuleNotFoundError:
74 |             return []
75 | 
76 |     def __getattr__(self, name: str):
77 |         """
78 |         Dynamically load and return the plugin's calculate method result
79 |         as if it were a property when accessed without arguments.
80 |         """
81 |         if name not in self.__plugins:
82 |             try:
83 |                 module = importlib.import_module(
84 |                     "sparrow.sequence_analysis.community_plugins.contributed"
85 |                 )
86 |                 plugin_class = getattr(module, name)
87 |                 if not issubclass(plugin_class, BasePlugin):
88 |                     raise AttributeError(f"{name} is not a valid plugin.")
89 |                 self.__plugins[name] = plugin_class(protein=self.__protein_obj)
90 |             except (ModuleNotFoundError, AttributeError):
91 |                 raise AttributeError(
92 |                     f"Plugin '{name}' not found. Available plugins are: {list(self._available_plugins)}"
93 |                 )
94 | 
95 |         plugin_instance = self.__plugins[name]
96 | 
97 |         return PluginWrapper(name, self.__precomputed, plugin_instance)
98 | 
99 |     def __dir__(self):
100 |         """
101 |         Return the list of dynamically available plugins, for autocompletion quality of life.
102 |         """
103 |         return super().__dir__() + self._available_plugins
104 | 
105 | 
106 | class BasePlugin(ABC):
107 |     """Base class for all community contributed plugins."""
108 | 
109 |     def __init__(self, protein: "sparrow.Protein"):
110 |         """Constructor for all plugins. This must provide a protein object or sequence."""
111 |         self.__protein_obj = protein
112 | 
113 |     @abstractmethod
114 |     def calculate(self) -> Any:
115 |         """
116 |         This method must operate on the sequence attribute of the protein object.
117 |         The method must return the result of the contributed analysis.
118 | 
119 |         Returns
120 |         -------------
121 |         float
122 |             Returns the result of the contributed analysis
123 |         """
124 |         pass
125 | 
126 |     @property
127 |     def protein(self):
128 |         return self.__protein_obj
129 | 
--------------------------------------------------------------------------------
/sparrow/sparrow_exceptions.py:
--------------------------------------------------------------------------------
1 | class SparrowException(Exception):
2 |     pass
3 | 
4 | 
5 | class ProteinException(Exception):
6 |     pass
7 | 
8 | 
9 | class PatterningException(Exception):
10 |     pass
11 | 
12 | 
13 | class CalculationException(Exception):
14 |     pass
15 | 
--------------------------------------------------------------------------------
/sparrow/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Empty init file in case you choose a test package besides pytest (such as nose) which may look for such a file.
3 | """
4 | 
5 | import numpy as np
6 | 
7 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS
8 | 
9 | 
10 | def build_seq(min_count=10, max_count=50):
11 | 
12 |     # how many residues
13 |     n_res = np.random.randint(4, 20)
14 | 
15 |     s = ''
16 |     for i in range(n_res):
17 |         aa_idx = np.random.randint(0, 20)
18 |         s = s + VALID_AMINO_ACIDS[aa_idx]*np.random.randint(min_count, max_count)
19 | 
20 |     s = list(s)
21 |     np.random.shuffle(s)
22 |     s = "".join(s)
23 |     return s
24 | 
--------------------------------------------------------------------------------
/sparrow/tests/compute_test_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "725f76f7",
6 |    "metadata": {},
7 |    "source": [
8 |     "### Dictionary that recomputes the test_data "
9 |    ]
10 |   },
11 |   {
12 |    "cell_type": "code",
13 |    "execution_count": 1,
14 |    "id": "92383cc0",
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": []
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 41,
22 |    "id": "5494688b",
23 |    "metadata": {},
24 |    "outputs": [
25 |     {
26 |      "name": "stdout",
27 |      "output_type": "stream",
28 |      "text": [
29 |       "0.43746367\n",
30 |       "32.81683\n",
31 |       "33.175683185378716\n",
32 |       "81.15200236853273\n",
33 |       "75.92058\n",
34 |       "0.5773746\n",
35 |       "5.815894\n"
36 |      ]
37 |     }
38 |    ],
39 |    "source": [
40 |     "from sparrow import Protein\n",
41 |     "\n",
42 |     "P = Protein('MKYLAAYLLLNAAGNTPDATKIKAILESVGIEIEDEKVSSVLSALEGKSVDELITEGNEKLAAVPAAGPASAGGAAAASGDAAAEEEKEEEAAEESDDDMGFGLFD')\n",
43 |     "\n",
44 |     "print(P.predictor.asphericity())\n",
45 |     "\n",
46 |     "print(P.predictor.radius_of_gyration())\n",
47 |     "print(P.predictor.radius_of_gyration(use_scaled=True))\n",
48 |     "\n",
49 |     "print(P.predictor.end_to_end_distance(use_scaled=True))\n",
50 |     "print(P.predictor.end_to_end_distance(use_scaled=False))\n",
51 |     "\n",
52 |     "print(P.predictor.scaling_exponent())\n",
53 |     "print(P.predictor.prefactor())\n",
54 |     "from sparrow.data.amino_acids import VALID_AMINO_ACIDS\n"
55 |    ]
56 |   },
57 |   {
58 |    "cell_type": "code",
59 |    "execution_count": 47,
60 |    "id": "4e59d0a1",
61 |    "metadata": {},
62 |    "outputs": [],
63 |    "source": [
64 |     "from sparrow import Protein\n",
65 |     "import pytest\n",
66 |     "import protfasta\n",
67 |     "import os\n", "import numpy as np\n",
68 |     "\n",
69 |     "current_filepath = os.getcwd()\n",
70 |     "onehundred_seqs = \"{}/test_data/test_seqs_100.fasta\".format(current_filepath)\n",
71 |     "\n",
72 |     "seqs = protfasta.read_fasta(onehundred_seqs)\n"
73 |    ]
74 |   },
75 |   {
76 |    "cell_type": "code",
77 |    "execution_count": 4,
78 | "id": "3ed4d1e5", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "k2rg = {}\n", 83 | "for k in seqs:\n", 84 | " k2rg[k] = Protein(seqs[k]).predictor.radius_of_gyration()\n", 85 | "\n", 86 | "np.save('test_data/test_100_rg_v2.npy', np.array(k2rg, dtype=dict)) " 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 48, 92 | "id": "e71f57dd", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "k2rg = {}\n", 97 | "for k in seqs:\n", 98 | " k2rg[k] = Protein(seqs[k]).predictor.radius_of_gyration(use_scaled=True)\n", 99 | "\n", 100 | "np.save('test_data/test_100_rg_scaled_v2.npy', np.array(k2rg, dtype=dict)) " 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 49, 106 | "id": "12872bec", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "k2re = {}\n", 111 | "for k in seqs:\n", 112 | " k2re[k] = Protein(seqs[k]).predictor.end_to_end_distance()\n", 113 | "\n", 114 | "np.save('test_data/test_100_re_v2.npy', np.array(k2re, dtype=dict)) " 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 50, 120 | "id": "3bc0cd1a", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "k2re = {}\n", 125 | "for k in seqs:\n", 126 | " k2re[k] = Protein(seqs[k]).predictor.end_to_end_distance(use_scaled=True)\n", 127 | "\n", 128 | "np.save('test_data/test_100_re_scaled_v2.npy', np.array(k2re, dtype=dict)) " 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 51, 134 | "id": "47f17564", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "k2asph = {}\n", 139 | "for k in seqs:\n", 140 | " k2asph[k] = Protein(seqs[k]).predictor.asphericity()\n", 141 | "\n", 142 | "np.save('test_data/test_100_asph_v2.npy', np.array(k2asph, dtype=dict)) " 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 52, 148 | "id": "202cdc34", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "k2scal_exp = {}\n", 153 | "for k in seqs:\n", 154 | " k2scal_exp[k] = Protein(seqs[k]).predictor.scaling_exponent()\n", 155 | "\n", 156 | "np.save('test_data/test_100_exponent_v2.npy', np.array(k2scal_exp, dtype=dict)) " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 53, 162 | "id": "39eb54c0", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "k2prefact = {}\n", 167 | "for k in seqs:\n", 168 | " k2prefact[k] = Protein(seqs[k]).predictor.prefactor()\n", 169 | "\n", 170 | "np.save('test_data/test_100_prefactor_v2.npy', np.array(k2prefact, dtype=dict)) " 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 3, 176 | "id": "f4d6bf39", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "from sparrow.patterning import iwd" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 21, 186 | "id": "868afb92", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "k2_average_bivariate_inverse_distance_charge = {}\n", 191 | "k2_average_inverse_distance_charge_neg = {}\n", 192 | "k2_average_inverse_distance_charge_pos = {}\n", 193 | "k2_average_inverse_distance_ali = {}\n", 194 | "\n", 195 | "for k in seqs:\n", 196 | "\n", 197 | " local_seq = seqs[k]\n", 198 | " \n", 199 | " ncpr = Protein(seqs[k]).linear_sequence_profile('NCPR')\n", 200 | " \n", 201 | " k2_average_bivariate_inverse_distance_charge[k] = iwd.calculate_average_bivariate_inverse_distance_charge(ncpr, local_seq)\n", 202 | " k2_average_inverse_distance_charge_neg[k] = 
iwd.calculate_average_inverse_distance_charge(ncpr, local_seq, '-')\n", 203 | " k2_average_inverse_distance_charge_pos[k] = iwd.calculate_average_inverse_distance_charge(ncpr, local_seq, '+')\n", 204 | " k2_average_inverse_distance_ali[k] = iwd.calculate_average_inverse_distance_from_sequence(local_seq, 'ILVAM')\n", 205 | " \n", 206 | " \n", 207 | "np.save('test_data/test_average_bivariate_inverse_distance_charge.npy', np.array(k2_average_bivariate_inverse_distance_charge, dtype=dict)) \n", 208 | "np.save('test_data/test_average_inverse_distance_charge_neg.npy', np.array(k2_average_inverse_distance_charge_neg, dtype=dict)) \n", 209 | "np.save('test_data/test_average_inverse_distance_charge_pos.npy', np.array(k2_average_inverse_distance_charge_pos, dtype=dict)) \n", 210 | "np.save('test_data/test_average_inverse_distance_ali.npy', np.array(k2_average_inverse_distance_ali, dtype=dict)) \n", 211 | " \n", 212 | " \n", 213 | " " 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 26, 219 | "id": "54a12190", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "0.27504330372096264" 226 | ] 227 | }, 228 | "execution_count": 26, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "Protein('ALEPLEALELASEPLALELAEPDEKKAEPLAEPLAEKAKEPALE').compute_iwd" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "52e332aa", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3 (ipykernel)", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.8.12" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 5 267 | } 268 | -------------------------------------------------------------------------------- /sparrow/tests/generate_test_data/generate_dssp_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "c81ae04b", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from sparrow.predictors.dssp.dssp_predictor import DSSPPredictor\n", 11 | "import numpy as np\n", 12 | "import protfasta\n", 13 | "import pickle\n", 14 | "\n", 15 | "natural_proteins = protfasta.read_fasta('../test_data/test_seqs_100.fasta')\n", 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "503d8163", 22 | "metadata": {}, 23 | "source": [ 24 | "### Helicity predictions" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "ab18972d", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "helicity_class = {}\n", 35 | "helicity_prob = {}\n", 36 | "\n", 37 | "X2 = DSSPPredictor(version=2)\n", 38 | "\n", 39 | "for k in natural_proteins:\n", 40 | " s = natural_proteins[k]\n", 41 | " helicity_class[k] = X2.predict_helicity_smart(s)\n", 42 | " helicity_prob[k] = X2.predict_helical_probability(s)\n", 43 | "\n", 44 | "with open('../test_data/helicity_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 45 | " pickle.dump(helicity_class, f) \n", 46 | " \n", 47 | "with 
open('../test_data/helicity_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 48 | " pickle.dump(helicity_prob, f) " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 11, 54 | "id": "cf90aec4", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "extended_class = {}\n", 59 | "extended_prob = {}\n", 60 | "\n", 61 | "X2 = DSSPPredictor(version=2)\n", 62 | "\n", 63 | "for k in natural_proteins:\n", 64 | " s = natural_proteins[k]\n", 65 | " extended_class[k] = X2.predict_extended_smart(s)\n", 66 | " extended_prob[k] = X2.predict_extended_probability(s)\n", 67 | "\n", 68 | "with open('../test_data/extended_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 69 | " pickle.dump(extended_class, f) \n", 70 | " \n", 71 | "with open('../test_data/extended_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 72 | " pickle.dump(extended_prob, f) " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 12, 78 | "id": "8ae2e5c6", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "coil_class = {}\n", 83 | "coil_prob = {}\n", 84 | "\n", 85 | "X2 = DSSPPredictor(version=2)\n", 86 | "\n", 87 | "for k in natural_proteins:\n", 88 | " s = natural_proteins[k]\n", 89 | " coil_class[k] = X2.predict_coil_smart(s)\n", 90 | " coil_prob[k] = X2.predict_coil_probability(s)\n", 91 | "\n", 92 | "with open('../test_data/coil_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 93 | " pickle.dump(coil_class, f) \n", 94 | " \n", 95 | "with open('../test_data/coil_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 96 | " pickle.dump(coil_prob, f) " 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "243f8c54", 102 | "metadata": {}, 103 | "source": [ 104 | "## Non-default data\n", 105 | "The code below generates sequences with non-default settings for the threshold and minimum length to vary this value and ensure all works well there" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 27, 111 | "id": "7d703b3d", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "helicity_class = {}\n", 116 | "\n", 117 | "X2 = DSSPPredictor(version=2)\n", 118 | "\n", 119 | "for k in natural_proteins:\n", 120 | " s = natural_proteins[k]\n", 121 | " \n", 122 | " thresh = np.random.random()\n", 123 | " minlen = np.random.randint(1,13)\n", 124 | " \n", 125 | " tmp = X2.predict_helicity_smart(s, threshold=thresh, minlen=minlen)\n", 126 | " \n", 127 | " helicity_class[k] = [thresh, minlen, tmp]\n", 128 | "\n", 129 | "with open('../test_data/helicity_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n", 130 | " pickle.dump(helicity_class, f) \n", 131 | " \n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 29, 137 | "id": "09d2bdac", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "extended_class = {}\n", 142 | "\n", 143 | "X2 = DSSPPredictor(version=2)\n", 144 | "\n", 145 | "for k in natural_proteins:\n", 146 | " s = natural_proteins[k]\n", 147 | " \n", 148 | " thresh = np.random.random()\n", 149 | " minlen = np.random.randint(1,13)\n", 150 | " \n", 151 | " tmp = X2.predict_extended_smart(s, threshold=thresh, minlen=minlen)\n", 152 | " \n", 153 | " extended_class[k] = [thresh, minlen, tmp]\n", 154 | "\n", 155 | "with open('../test_data/extended_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n", 156 | " pickle.dump(extended_class, f) \n", 157 | " \n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 30, 163 | "id": "187ae833", 
164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "coil_class = {}\n", 168 | "\n", 169 | "X2 = DSSPPredictor(version=2)\n", 170 | "\n", 171 | "for k in natural_proteins:\n", 172 | " s = natural_proteins[k]\n", 173 | " \n", 174 | " thresh = np.random.random()\n", 175 | " minlen = np.random.randint(1,13)\n", 176 | " \n", 177 | " tmp = X2.predict_coil_smart(s, threshold=thresh, minlen=minlen)\n", 178 | " \n", 179 | " coil_class[k] = [thresh, minlen, tmp]\n", 180 | "\n", 181 | "with open('../test_data/coil_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n", 182 | " pickle.dump(coil_class, f) \n", 183 | " \n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "b42dee92", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3 (ipykernel)", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.8.12" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 5 216 | } 217 | -------------------------------------------------------------------------------- /sparrow/tests/generate_test_data/helicity_class_v2_default.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/generate_test_data/helicity_class_v2_default.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/coil_class_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_class_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/coil_class_v2_non_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_class_v2_non_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/coil_prob_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_prob_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/extended_class_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_class_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/extended_class_v2_non_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_class_v2_non_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/extended_prob_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_prob_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/helicity_class_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_class_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/helicity_class_v2_non_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_class_v2_non_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/helicity_prob_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_prob_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_asph.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_asph.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_asph_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_asph_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_exponent.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_exponent.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_exponent_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_exponent_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_prefactor.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_prefactor.npy 
-------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_prefactor_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_prefactor_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_re.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_re_scaled.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_scaled.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_re_scaled_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_scaled_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_re_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_rg.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_rg_scaled.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_scaled.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_rg_scaled_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_scaled_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_rg_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_scd.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_scd.npy -------------------------------------------------------------------------------- 
/sparrow/tests/test_data/test_100_shd.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_shd.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_average_bivariate_inverse_distance_charge.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_bivariate_inverse_distance_charge.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_average_inverse_distance_ali.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_ali.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_average_inverse_distance_charge_neg.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_charge_neg.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_average_inverse_distance_charge_pos.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_charge_pos.npy -------------------------------------------------------------------------------- /sparrow/tests/test_iwd.py: -------------------------------------------------------------------------------- 1 | from sparrow.patterning import iwd 2 | import os 3 | import protfasta 4 | import numpy as np 5 | from sparrow import Protein 6 | 7 | current_filepath = os.getcwd() 8 | onehundred_seqs = "{}/test_data/test_seqs_100.fasta".format(current_filepath) 9 | 10 | seqs = protfasta.read_fasta(onehundred_seqs) 11 | 12 | 13 | def test_average_bivariate_inverse_distance_charge(): 14 | 15 | k2val = np.load('test_data/test_average_bivariate_inverse_distance_charge.npy', allow_pickle=True).item() 16 | for k in seqs: 17 | assert np.isclose(Protein(seqs[k]).compute_bivariate_iwd_charged_weighted(), k2val[k]) 18 | 19 | 20 | def test_average_inverse_distance_charge_neg(): 21 | 22 | k2val = np.load('test_data/test_average_inverse_distance_charge_neg.npy', allow_pickle=True).item() 23 | for k in seqs: 24 | assert np.isclose(Protein(seqs[k]).compute_iwd_charged_weighted('-'), k2val[k]) 25 | 26 | def test_average_inverse_distance_charge_pos(): 27 | 28 | k2val = np.load('test_data/test_average_inverse_distance_charge_pos.npy', allow_pickle=True).item() 29 | for k in seqs: 30 | assert np.isclose(Protein(seqs[k]).compute_iwd_charged_weighted('+'), k2val[k]) 31 | 32 | def test_average_inverse_distance_ali(): 33 | 34 | k2val = np.load('test_data/test_average_inverse_distance_ali.npy', allow_pickle=True).item() 35 | for k in seqs: 36 | assert np.isclose(Protein(seqs[k]).compute_iwd('ILVAM'), k2val[k]) 37 | 38 | 39 | -------------------------------------------------------------------------------- /sparrow/tests/test_kappa.py: 
-------------------------------------------------------------------------------- 1 | # Import package, test suite, and other packages as needed 2 | import sparrow 3 | import pytest 4 | import sys 5 | import numpy as np 6 | from sparrow.protein import Protein 7 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS 8 | import random 9 | 10 | 11 | 12 | USE_LOCALCIDER = True 13 | 14 | 15 | def test_kappa(): 16 | 17 | das = [ 18 | 'EKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEK', 19 | 'EEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEK', 20 | 'KEKKKEKKEEKKEEKEKEKEKEEKKKEEKEKEKEKKKEEKEKEEKKEEEE', 21 | 'KEKEEKEKKKEEEEKEKKKKEEKEKEKEKEEKKEEKKKKEEKEEKEKEKE', 22 | 'KEKEKKEEKEKKEEEKKEKEKEKKKEEKKKEEKEEKKEEKKKEEKEEEKE', 23 | 'EEEKKEKKEEKEEKKEKKEKEEEKKKEKEEKKEEEKKKEKEEEEKKKKEK', 24 | 'EEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEK', 25 | 'KKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKE', 26 | 'EEKKEEEKEKEKEEEEEKKEKKEKKEKKKEEKEKEKKKEKKKKEKEEEKE', 27 | 'EKKKKKKEEKKKEEEEEKKKEEEKKKEKKEEKEKEEKEKKEKKEEKEEEE', 28 | 'EKEKKKKKEEEKKEKEEEEKEEEEKKKKKEKEEEKEEKKEEKEKKKEEKK', 29 | 'EKKEEEEEEKEKKEEEEKEKEKKEKEEKEKKEKKKEKKEEEKEKKKKEKK', 30 | 'KEKKKEKEKKEKKKEEEKKKEEEKEKKKEEKKEKKEKKEEEEEEEKEEKE', 31 | 'EKKEKEEKEEEEKKKKKEEKEKKEKKKKEKKKKKEEEEEEKEEKEKEKEE', 32 | 'KKEKKEKKKEKKEKKEEEKEKEKKEKKKKEKEKKEEEEEEEEKEEKKEEE', 33 | 'EKEKEEKKKEEKKKKEKKEKEEKKEKEKEKKEEEEEEEEEKEKKEKKKKE', 34 | 'EKEKKKKKKEKEKKKKEKEKKEKKEKEEEKEEKEKEKKEEKKEEEEEEEE', 35 | 'KEEKKEEEEEEEKEEKKKKKEKKKEKKEEEKKKEEKKKEEEEEEKKKKEK', 36 | 'EEEEEKKKKKEEEEEKKKKKEEEEEKKKKKEEEEEKKKKKEEEEEKKKKK', 37 | 'EEKEEEEEEKEEEKEEKKEEEKEKKEKKEKEEKKEKKKKKKKKKKKKEEE', 38 | 'EEEEEEEEEKEKKKKKEKEEKKKKKKEKKEKKKKEKKEEEEEEKEEEKKK', 39 | 'KEEEEKEEKEEKKKKEKEEKEKKKKKKKKKKKKEKKEEEEEEEEKEKEEE', 40 | 'EEEEEKEEEEEEEEEEEKEEKEKKKKKKEKKKKKKKEKEKKKKEKKEEKK', 41 | 'EEEEKEEEEEKEEEEEEEEEEEEKKKEEKKKKKEKKKKKKKEKKKKKKKK', 42 | 'EEEEEEEEEEEKEEEEKEEKEEKEKKKKKKKKKKKKKKKKKKEEKKEEKE', 43 | 'KEEEEEEEKEEKEEEEEEEEEKEEEEKEEKKKKKKKKKKKKKKKKKKKKE', 44 | 'KKEKKKEKKEEEEEEEEEEEEEEEEEEEEKEEKKKKKKKKKKKKKKKEKK', 45 | 'EKKKKKKKKKKKKKKKKKKKKKEEEEEEEEEEEEEEEEEEKKEEEEEKEK', 46 | 'KEEEEKEEEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKK', 47 | 'EEEEEEEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKKKK'] 48 | 49 | das_kappa_vals = [0.000963782329781065, 50 | 0.006849987601594839, 51 | 0.02510380091732725, 52 | 0.023779919834168346, 53 | 0.014793830994527891, 54 | 0.030699929748093432, 55 | 0.055155094748869704, 56 | 0.055155094748869704, 57 | 0.06207283537900597, 58 | 0.09244645817707578, 59 | 0.08182457866549872, 60 | 0.08535584477384989, 61 | 0.09376754013641903, 62 | 0.12779464725771064, 63 | 0.13589023055307498, 64 | 0.14253932524913954, 65 | 0.17465693111603184, 66 | 0.16361063576296123, 67 | 0.2184643791753562, 68 | 0.2683678441326591, 69 | 0.2836833506008589, 70 | 0.3168464032629612, 71 | 0.35941633427624997, 72 | 0.45755189798526164, 73 | 0.5278595348152701, 74 | 0.5935761144891406, 75 | 0.6553235220661426, 76 | 0.7440558474562516, 77 | 0.8658988417475169, 78 | 1.0] 79 | 80 | for p in range(len(das)): 81 | assert np.isclose(das_kappa_vals[p], Protein(das[p]).kappa, atol=0.03) 82 | 83 | if USE_LOCALCIDER: 84 | from localcider.sequenceParameters import SequenceParameters 85 | nseqs = 100 86 | max_count = 100 87 | n_diff_res = 10 88 | 89 | res_set = VALID_AMINO_ACIDS.copy() 90 | 91 | for i in range(nseqs): 92 | random.shuffle(res_set) 93 | local_res = res_set[:n_diff_res] 94 | seq = '' 95 | for aa in local_res: 96 | seq = seq + aa*random.randint(1,max_count) 97 | 98 | seq = list(seq) 99 | random.shuffle(seq) 100 | seq = 
"".join(seq) 101 | 102 | P = Protein(seq) 103 | 104 | # skip sequences 105 | if P.fraction_negative == 0 or P.fraction_positive == 0: 106 | continue 107 | 108 | SO = SequenceParameters(seq) 109 | assert np.isclose(P.NCPR, SO.get_NCPR()) 110 | assert np.isclose(P.FCR, SO.get_FCR()) 111 | 112 | # note, this will stochastically fial from time to time.. 113 | assert np.isclose(P.kappa, SO.get_kappa(), atol=0.03) 114 | 115 | 116 | def test_kappa_range(): 117 | 118 | for i in range(100): 119 | 120 | Es = 'E'*random.randint(1,60) 121 | Ks = 'K'*random.randint(1,60) 122 | Gs = 'G'*random.randint(1,100) 123 | 124 | tmp = Es+Ks+Gs 125 | if len(tmp) < 7: 126 | continue 127 | 128 | tmp_list = list(tmp) 129 | random.shuffle(tmp_list) 130 | tmp = "".join(tmp_list) 131 | 132 | p = Protein(tmp) 133 | k = p.kappa 134 | 135 | assert k > 0 136 | assert k < 1 137 | 138 | -------------------------------------------------------------------------------- /sparrow/tests/test_plugins.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sparrow.protein import Protein 4 | from sparrow.sequence_analysis.community_plugins.contributed import MultiplicativeFCR 5 | from sparrow.sequence_analysis.plugins import BasePlugin 6 | 7 | 8 | @pytest.fixture 9 | def protein(): 10 | sequence = "LLERYIPKHQKCLTSAQRSSIDPLDIEDVYQHKKPKFSSKSHIWHVYNENSNRQKLEHVKVNKGSKASLFINKEDVYEYYQKDPKNTKFGKSKHKQSTLDQIYSTGLRKGNLHNVKDPNTNVPKGIGRRKTQHKRTQVDDVDCNPRKILAVSPSRRINRLVTYQQHIPETHNDLPEELCEPSSLTLSSLRNGLDSSTEACSVSKEKHIQNLDLSDSQEVQCLELESVDQTEAVSFPGLLLHKEIKLPVVTTDKQPHTLQEQHHVLYKSHENSNLV" 11 | return Protein(sequence) 12 | 13 | 14 | def test_multiplicative_fcr_plugin(protein): 15 | plugin_manager = protein.plugin 16 | double_fcr_result = plugin_manager.MultiplicativeFCR() 17 | expected_result = 2.0 * protein.FCR 18 | assert pytest.approx(double_fcr_result, 0.000001) == expected_result 19 | 20 | 21 | def test_plugin_manager_cache(protein): 22 | plugin_manager = protein.plugin 23 | first_result = plugin_manager.MultiplicativeFCR() 24 | second_result = plugin_manager.MultiplicativeFCR() 25 | assert first_result == second_result 26 | 27 | 28 | def test_invalid_plugin(protein): 29 | plugin_manager = protein.plugin 30 | with pytest.raises(AttributeError): 31 | plugin_manager.NonExistentPlugin 32 | 33 | 34 | def test_multiple_plugins(protein): 35 | class TripleFCR(BasePlugin): 36 | def calculate(self, factor=3.0): 37 | return factor * self.protein.FCR 38 | 39 | class QuadrupleFCR(BasePlugin): 40 | def calculate(self, factor=4.0): 41 | return factor * self.protein.FCR 42 | 43 | plugin_manager = protein.plugin 44 | # plugin_manager._PluginManager__plugins is a dictionary that stores plugins. 45 | # we can add a new plugin to it by assigning a new key-value pair to it. 
46 | plugin_manager._PluginManager__plugins["TripleFCR"] = TripleFCR(protein) 47 | plugin_manager._PluginManager__plugins["QuadrupleFCR"] = QuadrupleFCR(protein) 48 | 49 | # Testing TripleFCR plugin 50 | triple_fcr_result = plugin_manager.TripleFCR(factor=3.0) 51 | expected_triple_result = 3.0 * protein.FCR 52 | assert pytest.approx(triple_fcr_result, 0.000001) == expected_triple_result 53 | 54 | # Testing QuadrupleFCR plugin 55 | quadruple_fcr_result = plugin_manager.QuadrupleFCR(factor=4.0) 56 | expected_quadruple_result = 4.0 * protein.FCR 57 | assert pytest.approx(quadruple_fcr_result, 0.000001) == expected_quadruple_result 58 | 59 | 60 | def test_base_plugin_initialization(protein): 61 | class TestPlugin(BasePlugin): 62 | def calculate(self): 63 | return protein.FCR 64 | 65 | plugin = TestPlugin(protein) 66 | assert plugin.protein == protein 67 | 68 | 69 | def test_base_plugin_abstract_method(protein): 70 | with pytest.raises(TypeError): 71 | BasePlugin(protein) 72 | -------------------------------------------------------------------------------- /sparrow/tests/test_polymeric.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_polymeric.py -------------------------------------------------------------------------------- /sparrow/tests/test_predictor_disorder.py: -------------------------------------------------------------------------------- 1 | # Import package, test suite, and other packages as needed 2 | import sparrow 3 | import pytest 4 | import sys 5 | import numpy as np 6 | from sparrow.protein import Protein 7 | 8 | def test_protein_code_coverage(): 9 | 10 | P = Protein('MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQ') 11 | 12 | 13 | # V2 14 | # assert np.isclose(np.mean(P.predictor.disorder()), 0.8636131147540983) 15 | 16 | assert np.isclose(np.mean(P.predictor.disorder()), 0.92875415) 17 | -------------------------------------------------------------------------------- /sparrow/tests/test_protein.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit and regression test for the sparrow package. 
3 | """ 4 | 5 | # Import package, test suite, and other packages as needed 6 | import random 7 | import sys 8 | 9 | import numpy as np 10 | import pytest 11 | 12 | import sparrow 13 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS 14 | from sparrow.protein import Protein 15 | from sparrow.sequence_analysis.elm import ( 16 | ELM, 17 | compute_gained_elms, 18 | compute_lost_elms, 19 | compute_retained_elms, 20 | ) 21 | 22 | 23 | def test_sparrow_imported(): 24 | """Sample test, will always pass so long as import statement worked""" 25 | assert "sparrow" in sys.modules 26 | 27 | 28 | def test_protein_code_coverage(): 29 | 30 | 31 | s = 'MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQ' 32 | # constructor 33 | P = Protein(s) 34 | assert len(P) == 61 35 | 36 | P = Protein(s, validate=True) 37 | assert len(P) == 61 38 | 39 | s_broken = 'MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSXYGQSSYSSYXQ' 40 | P = Protein(s_broken, validate=True) 41 | assert len(P) == 61 42 | assert s == P.sequence 43 | 44 | 45 | 46 | assert len(P.amino_acid_fractions) == 20 47 | assert P.FCR == 0.04918032786885246 48 | assert P.fraction_positive == 0.01639344262295082 49 | assert P.fraction_negative == 0.03278688524590164 50 | assert P.NCPR == -0.01639344262295082 51 | assert P.fraction_aromatic == 0.16393442622950818 52 | assert P.fraction_aliphatic == 0.06557377049180328 53 | assert P.fraction_polar == 0.6721311475409836 54 | assert P.fraction_proline == 0.04918032786885246 55 | 56 | # V2 57 | # assert np.mean(P.predictor.disorder()) == 0.8636131147540983 58 | 59 | assert np.isclose(np.mean(P.predictor.disorder()), 0.92875415) 60 | assert P.hydrophobicity == 3.052459016393442 61 | assert P.compute_residue_fractions(['P','E','K','R','D']) == 0.09836065573770492 62 | 63 | assert np.mean(P.linear_sequence_profile('FCR')) == 0.04918032786885246 64 | assert np.mean(P.linear_sequence_profile('NCPR')) == -0.02459016393442623 65 | assert np.mean(P.linear_sequence_profile('aromatic')) == 0.1680327868852459 66 | assert np.mean(P.linear_sequence_profile('aliphatic')) == 0.05737704918032787 67 | assert np.mean(P.linear_sequence_profile('polar')) == 0.6762295081967213 68 | assert np.mean(P.linear_sequence_profile('proline')) == 0.04918032786885246 69 | assert np.mean(P.linear_sequence_profile('positive')) == 0.012295081967213115 70 | assert np.mean(P.linear_sequence_profile('negative')) == 0.036885245901639344 71 | assert np.isclose(np.mean(P.linear_sequence_profile('hydrophobicity')),3.0450819672131146) 72 | assert np.mean(P.linear_composition_profile(['E','K'])) == 0.012295081967213115 73 | 74 | P = Protein("KRRARKRRARKRRARKRRAR") 75 | elms = P.elms 76 | func_sites = [] 77 | elm_sequences = [] 78 | start, end = [],[] 79 | for elm in elms: 80 | start.append(elm.start) 81 | end.append(elm.end) 82 | elm_sequences.append(elm.sequence) 83 | func_sites.append(elm.functional_site_name) 84 | func_sites = list(set(func_sites)) 85 | for func_site in func_sites: 86 | assert func_site in ['di Arginine retention/retrieving signal', 87 | 'CendR Motif Binding to Neuropilin Receptors', 88 | 'NLS classical Nuclear Localization Signals', 89 | 'N-degron', 90 | 'NRD cleavage site', 91 | 'PCSK cleavage site'] 92 | assert sorted(start) == sorted([1, 6, 11, 16, 4, 9, 14, 0, 5, 10, 15, 0, 5, 10, 15, 1, 11, 0, 16, 1, 6, 11, 16, 0, 3, 13, 4, 14, 1, 9]) 93 | assert sorted(end) == sorted([4, 9, 14, 19, 9, 14, 19, 3, 8, 13, 18, 3, 8, 13, 18, 8, 18, 3, 20, 5, 10, 15, 20, 20, 9, 19, 10, 20, 9, 15]) 94 | assert sorted(elm_sequences) == 
sorted(['RRA', 95 | 'RRA', 96 | 'RRA', 97 | 'RRA', 98 | 'RKRRA', 99 | 'RKRRA', 100 | 'RKRRA', 101 | 'KRR', 102 | 'KRR', 103 | 'KRR', 104 | 'KRR', 105 | 'KRR', 106 | 'KRR', 107 | 'KRR', 108 | 'KRR', 109 | 'RRARKRR', 110 | 'RRARKRR', 111 | 'KRR', 112 | 'RRAR', 113 | 'RRAR', 114 | 'RRAR', 115 | 'RRAR', 116 | 'RRAR', 117 | 'KRRARKRRARKRRARKRRAR', 118 | 'ARKRRA', 119 | 'ARKRRA', 120 | 'RKRRAR', 121 | 'RKRRAR', 122 | 'RRARKRRA', 123 | 'RKRRAR']) 124 | 125 | def test_elm_comparisons(): 126 | wt = sparrow.Protein("MKKK") 127 | mut = sparrow.Protein("MRKK") 128 | 129 | wt_elms = wt.elms 130 | mut_elms = mut.elms 131 | 132 | assert wt.elms == { 133 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=3, sequence='KKK') 134 | } 135 | assert mut.elms == { 136 | ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK'), 137 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK') 138 | } 139 | 140 | assert wt.elms - mut.elms == set() 141 | assert wt.elms & mut.elms == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MKKK')} 142 | 143 | assert compute_lost_elms(wt,[2,"K"]) == set() 144 | assert compute_retained_elms(wt,"p.K1R") == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=3, sequence='RKK')} 145 | assert compute_gained_elms(wt,"p.K2R") == {ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK')} 146 | 147 | assert compute_retained_elms(mut,"p.M1K") == {ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK'), 148 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK')} 149 | 150 | assert compute_gained_elms(mut,"p.M1K") == {ELM(regex='KR.', functional_site_name='PCSK cleavage site', start=0, end=3, sequence='KRK'), 151 | ELM(regex='[KR]R.', functional_site_name='PCSK cleavage site', start=0, end=3, sequence='KRK')} 152 | assert compute_lost_elms(mut, "p.M1G") == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK')} 153 | 154 | 155 | -------------------------------------------------------------------------------- /sparrow/tests/test_scd.py: -------------------------------------------------------------------------------- 1 | from sparrow.patterning import scd 2 | import os 3 | import protfasta 4 | import numpy as np 5 | from sparrow import Protein 6 | from IPython import embed 7 | 8 | current_filepath = os.getcwd() 9 | onehundred_seqs = "{}/test_data/test_seqs_100.fasta".format(current_filepath) 10 | 11 | seqs = protfasta.read_fasta(onehundred_seqs) 12 | 13 | def test_scd(): 14 | 15 | k2val = np.load('test_data/test_100_scd.npy', allow_pickle=True).item() 16 | for k in seqs: 17 | s = seqs[k] 18 | cython_SCD = getattr(Protein(s),"SCD") 19 | no_cython_SCD = k2val[k] 20 | assert np.isclose(cython_SCD, no_cython_SCD) 21 | 22 | def test_shd(): 23 | k2val = np.load('test_data/test_100_shd.npy', allow_pickle=True).item() 24 | for k in seqs: 25 | s = seqs[k] 26 | assert np.isclose(getattr(Protein(s),"SHD"), k2val[k]) 27 | 28 | -------------------------------------------------------------------------------- /sparrow/tests/test_sparrow.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit and regression test for the sparrow package. 
3 | """ 4 | 5 | # Import package, test suite, and other packages as needed 6 | import sparrow 7 | import pytest 8 | import sys 9 | 10 | def test_sparrow_imported(): 11 | """Sample test, will always pass so long as import statement worked""" 12 | assert "sparrow" in sys.modules 13 | -------------------------------------------------------------------------------- /sparrow/tests/test_sparrow_vs_localcider.py: -------------------------------------------------------------------------------- 1 | from localcider.sequenceParameters import SequenceParameters 2 | from sparrow import Protein 3 | 4 | from . import build_seq 5 | 6 | import numpy as np 7 | 8 | NSEQS=100 9 | 10 | def test_FCR(): 11 | 12 | for i in range(NSEQS): 13 | s = build_seq() 14 | assert np.isclose(SequenceParameters(s).get_FCR(), Protein(s).FCR, atol=1e-8) 15 | 16 | 17 | def test_NCPR(): 18 | 19 | for i in range(NSEQS): 20 | s = build_seq() 21 | assert np.isclose(SequenceParameters(s).get_NCPR(), Protein(s).NCPR, atol=1e-8) 22 | 23 | 24 | def test_fraction_neg_fraction_pos(): 25 | 26 | for i in range(NSEQS): 27 | s = build_seq() 28 | assert np.isclose(SequenceParameters(s).get_countNeg()/len(s), Protein(s).fraction_negative, atol=1e-8) 29 | assert np.isclose(SequenceParameters(s).get_countPos()/len(s), Protein(s).fraction_positive, atol=1e-8) 30 | 31 | def test_hydrophobiciyty(): 32 | 33 | for i in range(NSEQS): 34 | s = build_seq() 35 | assert np.isclose(SequenceParameters(s).get_uversky_hydropathy(), Protein(s).hydrophobicity/9, atol=1e-8) 36 | 37 | -------------------------------------------------------------------------------- /sparrow/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tools/__init__.py -------------------------------------------------------------------------------- /sparrow/tools/general_tools.py: -------------------------------------------------------------------------------- 1 | from sparrow.data import amino_acids 2 | 3 | 4 | def is_valid_protein_sequence(sequence): 5 | """ 6 | Function that tests if a passed sequence contains non-standard ammino acids 7 | 8 | Parameters 9 | ---------------- 10 | sequence : str 11 | Protein sequence 12 | 13 | Returns 14 | --------------- 15 | bool 16 | If sequences contains non-standard amino acids returns False, else returns 17 | True 18 | """ 19 | 20 | for i in sequence: 21 | if i not in amino_acids.VALID_AMINO_ACIDS: 22 | return False 23 | 24 | return True 25 | 26 | 27 | 28 | def compare_sequence(s1, s2, verbose=False, ignore_gaps=False, return_positions=False): 29 | """ 30 | Function that compares two sequences of the same length and returns 31 | either the set of positions where the sequences are different (indxed at 0) or 32 | the number of differences between them, depending on the status of the flag 33 | return_position. This function Will also print the differences if verbose is 34 | set to True. 35 | 36 | If ignore_gaps is set to True, will ignore gaps in the comparison (i.e. 37 | will ignore '-' characters in either sequence). This is useful when running 38 | analyses for aligned sequences. 
26 | 27 | 28 | def compare_sequence(s1, s2, verbose=False, ignore_gaps=False, return_positions=False): 29 | """ 30 | Function that compares two sequences of the same length and returns 31 | either the list of positions where the sequences are different (indexed at 0) or 32 | the number of differences between them, depending on the status of the flag 33 | return_positions. This function will also print the differences if verbose is 34 | set to True. 35 | 36 | If ignore_gaps is set to True, will ignore gaps in the comparison (i.e. 37 | will ignore '-' characters in either sequence). This is useful when running 38 | analyses for aligned sequences. 39 | 40 | WARNING: Sequences must have the same length - if two passed sequences are not 41 | identical in terms of length then this function throws a ValueError 42 | 43 | Parameters 44 | ---------------- 45 | s1 : str 46 | First sequence to compare 47 | 48 | s2 : str 49 | Second sequence to compare 50 | 51 | verbose : bool 52 | If True, will print the differences between the two sequences. 53 | Default is False 54 | 55 | return_positions : bool 56 | If True, will return a list of positions where the two sequences 57 | differ. If False, return the count only. 58 | 59 | Returns 60 | --------------- 61 | int or list 62 | Number of differences between the two sequences, or the list of differing positions if return_positions is True 63 | 64 | Raises 65 | --------------- 66 | ValueError 67 | If sequences are not the same length. 68 | 69 | """ 70 | 71 | # first things first check if sequences are the same length and 72 | # freak out if not! 73 | if len(s1) != len(s2): 74 | raise ValueError("Sequences must have the same length") 75 | 76 | # define comparison function based on ignore_gaps 77 | if ignore_gaps: 78 | def _compare(p1,p2): 79 | if p1 == "-" or p2 == "-": 80 | return False 81 | elif p1 == p2: 82 | return False 83 | else: 84 | return True 85 | else: 86 | def _compare(p1,p2): 87 | if p1 == p2: 88 | return False 89 | else: 90 | return True 91 | 92 | 93 | # cycle through each position in the sequence 94 | positions = [] 95 | for i in range(len(s1)): 96 | if _compare(s1[i],s2[i]): 97 | positions.append(i) 98 | if verbose: 99 | print(f"{i+1}: {s1[i]} vs. {s2[i]}") 100 | 101 | 102 | if return_positions: 103 | return positions 104 | else: 105 | return len(positions) 106 | -------------------------------------------------------------------------------- /sparrow/tools/utilities.py: -------------------------------------------------------------------------------- 1 | from sparrow.sparrow_exceptions import SparrowException 2 | 3 | def validate_keyword_option(keyword, allowed_vals, keyword_name, error_message=None): 4 | """ 5 | Helper function that checks a passed keyword is only one of a set of possible 6 | valid keywords 7 | 8 | Parameters 9 | ----------- 10 | keyword : str 11 | The actual passed keyword value 12 | 13 | allowed_vals : list of str 14 | A list of possible keywords 15 | 16 | keyword_name : str 17 | the name of the keyword as the user would select it in the function call 18 | 19 | error_message : str 20 | Allows the user to pass a custom error message to be used if the keyword is invalid 21 | 22 | 23 | Returns 24 | -------- 25 | None 26 | 27 | No return value, but raises sparrow_exceptions.SparrowException if ``keyword`` is not 28 | found in the allowed_vals list 29 | 30 | """ 31 | 32 | 33 | if keyword not in allowed_vals: 34 | if error_message is None: 35 | raise SparrowException(f'Keyword {keyword_name} passed value [{keyword}], but this is not valid.\nMust be one of: {str(allowed_vals)}') 36 | else: 37 | raise SparrowException(error_message) 38 | --------------------------------------------------------------------------------
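A brief, hypothetical usage sketch for the two tools modules above (the sequences and keyword values are invented for illustration; only functions defined in these files are used):

    from sparrow.tools.general_tools import compare_sequence
    from sparrow.tools.utilities import validate_keyword_option

    # count mismatches between two aligned sequences, skipping gap ('-') positions
    compare_sequence('MKA-SND', 'MKT-SND', ignore_gaps=True)      # -> 1

    # 0-indexed positions where the sequences differ
    compare_sequence('MKASND', 'MKTSND', return_positions=True)   # -> [2]

    # raises SparrowException because 'mean' is not one of the allowed values
    validate_keyword_option('mean', ['median', 'max'], 'mode')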
/sparrow/visualize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/visualize/__init__.py -------------------------------------------------------------------------------- /sparrow/visualize/sequence_visuals.py: -------------------------------------------------------------------------------- 1 | #from IPython import display 2 | from IPython.display import display 3 | from IPython.display import HTML 4 | 5 | from sparrow.data.amino_acids import AA_COLOR 6 | from sparrow.sparrow_exceptions import SparrowException 7 | 8 | def show_sequence(seq, 9 | blocksize=10, 10 | newline=50, 11 | fontsize=14, 12 | font_family='Courier', 13 | colors={}, 14 | header=None, 15 | bold_positions=[], 16 | bold_residues=[], 17 | opaque_positions=[], 18 | return_raw_string=False, 19 | warnings = True): 20 | 21 | """ 22 | Function that generates an HTML colored string that either renders in the browser or returns the 23 | html string. Contains various customizable components. 24 | 25 | Parameters 26 | ------------- 27 | 28 | blocksize : int 29 | Defines how big blocks of residues are. Blocks are equal to blocksize or the newline parameter, whichever is smaller. 30 | Default=10. If set to -1 uses length of the sequence. 31 | 32 | newline : int 33 | Defines how many residues are shown before a newline is printed. Default is 50. If set to -1 uses the length of 34 | the sequence. 35 | 36 | fontsize : int 37 | Fontsize used. Default is 14 38 | 39 | font_family : str 40 | Which font family (from HTML fonts) is used. Using a non-monospace font makes no sense as columns will be 41 | unaligned. Default is Courier. 42 | 43 | colors : dict 44 | Dictionary that allows overriding of default color scheme. Should be of format key-value as 'residue'-'color' where 45 | residue is a residue in the string and color is a valid HTML color (which can be a Hexcode, standard HTML color name). 46 | Note that this also lets you define colors for non-standard amino acids should these be useful. Default is an empty 47 | dictionary. Note also that the standard amino acid colorings are defined at sparrow.data.amino_acids.AA_COLOR 48 | 49 | header : str 50 | If provided, this is a string that provides a FASTA-style header (with a leading caret included). Default None. 51 | 52 | bold_positions : list 53 | List of positions (indexing from 1 onwards) which will be bolded. Useful for highlighting specific regions. Note that this 54 | defines individual residues so (for example) to bold residues 10 to 15 would require bold_positions=[10,11,12,13,14,15]. 55 | Default is an empty list. 56 | 57 | bold_residues : list 58 | List of residue types that can be bolded. Useful for highlighting specific residue groups. Default is an empty list. 59 | 60 | opaque_positions : list 61 | List of positions (indexing from 1 onwards) which will be grey and slightly opaque. Useful for highlighting specific regions. 62 | Note that this defines individual residues so (for example) to make residues 10 to 15 opaque would require 63 | opaque_positions=[10,11,12,13,14,15]. Default is an empty list. 64 | 65 | return_raw_string : bool 66 | If set to true, the function returns the actual raw HTML string, as opposed to an in-notebook rendering. 67 | Default is False 68 | 69 | warnings : bool 70 | If set to true, the function will print warnings if invalid amino acids are found. Default is True. 71 | 72 | 73 | Returns 74 | ---------- 75 | None or str 76 | If return_raw_string is set to true then an HTML-compatible string is returned. 77 | 78 | 79 | Raises 80 | ------- 81 | sparrow.sparrow_exceptions.SparrowException 82 | Raises a sparrow exception if invalid input is provided (within reason). 83 | 84 | """ 85 | 86 | if blocksize > newline: 87 | newline = blocksize 88 | 89 | if blocksize == -1: 90 | blocksize = len(seq) 91 | newline = len(seq) 92 | 93 | 94 | if blocksize < 1: 95 | raise SparrowException('blocksize must be a positive integer (or -1)') 96 | 97 | 98 | colorString = '<p style="font-family:%s; font-size: %ipx">'%(font_family, fontsize) 99 | 100 | if header: 101 | colorString = colorString + ">%s<br>"%(str(header)) 102 | 103 | 104 | count = -1 105 | for residue in seq: 106 | 107 | count = count + 1 108 | 109 | if count > 0: 110 | if count % newline == 0: 111 | colorString = colorString + "<br>" 112 | 113 | elif count % blocksize == 0: 114 | colorString = colorString + " " 115 | 116 | 117 | if residue not in AA_COLOR and residue not in colors: 118 | if warnings: 119 | print('Warning: found invalid amino acid (%s) at position %i'%(residue, count+1)) 120 | colorString = colorString + '<span style="color:%s">%s</span>' % ('black', residue) 121 | else: 122 | 123 | # override with user-supplied palette if present 124 | if residue in colors: 125 | c = colors[residue] 126 | 127 | # else fall back on the standard palette 128 | else: 129 | c = AA_COLOR[residue] 130 | 131 | # check if residue should be light grey and opaque 132 | # This overrides other coloring 133 | if count+1 in opaque_positions: 134 | c = '#a9a9a9' 135 | 136 | # if the residue type OR residue position is to be bolded... 137 | if residue in bold_residues or (count+1) in bold_positions: 138 | colorString = colorString + '<span style="color:%s"><b>%s</b></span>' % (c, residue) 139 | else: 140 | colorString = colorString + '<span style="color:%s">%s</span>' % (c, residue) 141 | 142 | 143 | 144 | colorString = colorString +"</p>" 145 | 146 | if return_raw_string: 147 | return colorString 148 | else: 149 | display(HTML(colorString)) 150 | #HTML(colorString) 151 | --------------------------------------------------------------------------------