├── .codecov.yml
├── .github
│   ├── CONTRIBUTING.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       └── CI.yaml
├── .gitignore
├── .lgtm.yml
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── devtools
│   ├── README.md
│   ├── conda-envs
│   │   └── test_env.yaml
│   ├── legacy-miniconda-setup
│   │   └── before_install.sh
│   └── scripts
│       └── create_conda_env.py
├── docs
│   ├── Makefile
│   ├── README.md
│   ├── _static
│   │   └── README.md
│   ├── _templates
│   │   └── README.md
│   ├── api.rst
│   ├── conf.py
│   ├── getting_started.rst
│   ├── index.rst
│   ├── make.bat
│   ├── predictors.rst
│   ├── requirements.txt
│   └── requirements.yaml
├── examples
│   ├── protein_example_1.py
│   └── sparrow_walk_through.ipynb
├── pyproject.toml
├── readthedocs.yml
├── setup.cfg
├── setup.py
└── sparrow
    ├── __init__.py
    ├── calculate_parameters.py
    ├── data
    │   ├── README.md
    │   ├── __init__.py
    │   ├── amino_acids.py
    │   ├── configs.py
    │   ├── elm_classes.tsv
    │   ├── look_and_say.dat
    │   └── networks
    │       ├── asphericity
    │       │   ├── README
    │       │   ├── asphericity_network_v1.pt
    │       │   └── asphericity_network_v2.pt
    │       ├── dssp
    │       │   ├── dssp_predictor_network_v1.pt
    │       │   └── dssp_predictor_network_v2.pt
    │       ├── mitochondrial_targeting
    │       │   └── mitochondrial_targeting_predictor_network_v1.pt
    │       ├── nuclear_export_signal
    │       │   └── nes_predictor_network_v1.pt
    │       ├── nuclear_import_signal
    │       │   └── nls_predictor_network_v1.pt
    │       ├── phosphorylation
    │       │   ├── ser_phosphorylation_predictor_network_v1.pt
    │       │   ├── thr_phosphorylation_predictor_network_v1.pt
    │       │   └── tyr_phosphorylation_predictor_network_v1.pt
    │       ├── prefactor
    │       │   ├── README
    │       │   ├── prefactor_network_v1.pt
    │       │   └── prefactor_network_v2.pt
    │       ├── pscore
    │       │   ├── pscore_predictor_network_v2.pt
    │       │   ├── pscore_predictor_network_v3.pt
    │       │   └── pscore_predictor_network_v4.pt
    │       ├── re
    │       │   ├── README
    │       │   ├── re_network_v1.pt
    │       │   └── re_network_v2.pt
    │       ├── rg
    │       │   ├── README
    │       │   ├── rg_network_v1.pt
    │       │   └── rg_network_v2.pt
    │       ├── scaled_re
    │       │   ├── README
    │       │   ├── scaled_re_network_v1.pt
    │       │   └── scaled_re_network_v2.pt
    │       ├── scaled_rg
    │       │   ├── README
    │       │   ├── scaled_rg_network_v1.pt
    │       │   └── scaled_rg_network_v2.pt
    │       ├── scaling_exponent
    │       │   ├── README
    │       │   ├── scaling_exponent_network_v1.5.pt
    │       │   ├── scaling_exponent_network_v1.pt
    │       │   └── scaling_exponent_network_v2.pt
    │       ├── transactivation_domains
    │       │   └── tad_predictor_network_v1.pt
    │       └── transmembrane
    │           ├── transmembrane_predictor_network_v1.pt
    │           └── transmembrane_predictor_network_v4.pt
    ├── patterning
    │   ├── __init__.py
    │   ├── iwd.pyx
    │   ├── kappa.pyx
    │   ├── patterning.pyx
    │   └── scd.pyx
    ├── polymer
    │   ├── __init__.py
    │   └── scaling_parameters.py
    ├── predictors
    │   ├── __init__.py
    │   ├── asphericity
    │   │   ├── __init__.py
    │   │   └── asphericity_predictor.py
    │   ├── batch_predict.py
    │   ├── dssp
    │   │   ├── __init__.py
    │   │   └── dssp_predictor.py
    │   ├── e2e
    │   │   ├── __init__.py
    │   │   └── end_to_end_distance_predictor.py
    │   ├── mitochondrial_targeting
    │   │   ├── __init__.py
    │   │   └── mitochondrial_targeting_predictor.py
    │   ├── nes
    │   │   ├── __init__.py
    │   │   └── nuclear_export_signal_predictor.py
    │   ├── nls
    │   │   ├── __init__.py
    │   │   └── nuclear_import_signal_predictor.py
    │   ├── phosphorylation
    │   │   ├── __init__.py
    │   │   ├── phospho_predictor_utils.py
    │   │   ├── ser_phosphorylation_predictor.py
    │   │   ├── thr_phosphorylation_predictor.py
    │   │   └── tyr_phosphorylation_predictor.py
    │   ├── predictor_template.pyXX
    │   ├── prefactor
    │   │   ├── __init__.py
    │   │   └── prefactor_predictor.py
    │   ├── pscore
    │   │   ├── __init__.py
    │   │   └── pscore_predictor.py
    │   ├── rg
    │   │   ├── __init__.py
    │   │   └── radius_of_gyration_predictor.py
    │   ├── scaled_re
    │   │   ├── __init__.py
    │   │   └── scaled_end_to_end_distance_predictor.py
    │   ├── scaled_rg
    │   │   ├── __init__.py
    │   │   └── scaled_radius_of_gyration_predictor.py
    │   ├── scaling_exponent
    │   │   ├── __init__.py
    │   │   └── scaling_exponent_predictor.py
    │   ├── tad
    │   │   ├── __init__.py
    │   │   └── transactivation_domain_predictor.py
    │   └── transmembrane
    │       ├── __init__.py
    │       └── transmembrane_predictor.py
    ├── protein.py
    ├── sequence_analysis
    │   ├── __init__.py
    │   ├── alignment.py
    │   ├── community_plugins
    │   │   └── contributed.py
    │   ├── elm.py
    │   ├── phospho_isoforms.py
    │   ├── physical_properties.py
    │   ├── plugins.py
    │   └── sequence_complexity.py
    ├── sparrow_exceptions.py
    ├── tests
    │   ├── __init__.py
    │   ├── compute_test_data.ipynb
    │   ├── generate_test_data
    │   │   ├── generate_dssp_data.ipynb
    │   │   └── helicity_class_v2_default.pickle
    │   ├── test_albatross.py
    │   ├── test_data
    │   │   ├── coil_class_v2_default_test_seqs_100.pickle
    │   │   ├── coil_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── coil_prob_v2_default_test_seqs_100.pickle
    │   │   ├── extended_class_v2_default_test_seqs_100.pickle
    │   │   ├── extended_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── extended_prob_v2_default_test_seqs_100.pickle
    │   │   ├── helicity_class_v2_default_test_seqs_100.pickle
    │   │   ├── helicity_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── helicity_prob_v2_default_test_seqs_100.pickle
    │   │   ├── test_100_asph.npy
    │   │   ├── test_100_asph_v2.npy
    │   │   ├── test_100_exponent.npy
    │   │   ├── test_100_exponent_v2.npy
    │   │   ├── test_100_prefactor.npy
    │   │   ├── test_100_prefactor_v2.npy
    │   │   ├── test_100_re.npy
    │   │   ├── test_100_re_scaled.npy
    │   │   ├── test_100_re_scaled_v2.npy
    │   │   ├── test_100_re_v2.npy
    │   │   ├── test_100_rg.npy
    │   │   ├── test_100_rg_scaled.npy
    │   │   ├── test_100_rg_scaled_v2.npy
    │   │   ├── test_100_rg_v2.npy
    │   │   ├── test_100_scd.npy
    │   │   ├── test_100_shd.npy
    │   │   ├── test_average_bivariate_inverse_distance_charge.npy
    │   │   ├── test_average_inverse_distance_ali.npy
    │   │   ├── test_average_inverse_distance_charge_neg.npy
    │   │   ├── test_average_inverse_distance_charge_pos.npy
    │   │   └── test_seqs_100.fasta
    │   ├── test_iwd.py
    │   ├── test_kappa.py
    │   ├── test_plugins.py
    │   ├── test_polymeric.py
    │   ├── test_predictor_disorder.py
    │   ├── test_predictor_dssp.py
    │   ├── test_protein.py
    │   ├── test_scd.py
    │   ├── test_sparrow.py
    │   └── test_sparrow_vs_localcider.py
    ├── tools
    │   ├── __init__.py
    │   ├── general_tools.py
    │   ├── io.py
    │   ├── track_tools.py
    │   └── utilities.py
    └── visualize
        ├── __init__.py
        └── sequence_visuals.py
/.codecov.yml:
--------------------------------------------------------------------------------
1 | # Codecov configuration to make it a bit less noisy
2 | coverage:
3 |   status:
4 |     patch: false
5 |     project:
6 |       default:
7 |         threshold: 50%
8 | comment:
9 |   layout: "header"
10 |   require_changes: false
11 |   branches: null
12 |   behavior: default
13 |   flags: null
14 |   paths: null
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | We welcome contributions from external contributors, and this document
4 | describes how to merge code changes into sparrow.
5 |
6 | ## Getting Started
7 |
8 | * Make sure you have a [GitHub account](https://github.com/signup/free).
9 | * [Fork](https://help.github.com/articles/fork-a-repo/) this repository on GitHub.
10 | * On your local machine,
11 | [clone](https://help.github.com/articles/cloning-a-repository/) your fork of
12 | the repository.
13 |
14 | ## Making Changes
15 |
16 | * Add some really awesome code to your local fork. It's usually a [good
17 | idea](http://blog.jasonmeridth.com/posts/do-not-issue-pull-requests-from-your-master-branch/)
18 | to make changes on a
19 | [branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/)
20 | with the branch name relating to the feature you are going to add.
21 | * When you are ready for others to examine and comment on your new feature,
22 | navigate to your fork of sparrow on GitHub and open a [pull
23 | request](https://help.github.com/articles/using-pull-requests/) (PR). Note that
24 | after you launch a PR from one of your fork's branches, all
25 | subsequent commits to that branch will be added to the open pull request
26 | automatically. Each commit added to the PR will be validated for
27 | mergeability, compilation, and test suite compliance; the results of these tests
28 | will be visible on the PR page.
29 | * If you're providing a new feature, you must add test cases and documentation.
30 | * When the code is ready to go, make sure you run the test suite using pytest.
31 | * When you're ready to be considered for merging, check the "Ready to go"
32 | box on the PR page to let the sparrow devs know that the changes are complete.
33 | The code will not be merged until this box is checked, the continuous
34 | integration returns checkmarks,
35 | and multiple core developers give "Approved" reviews.
36 |
37 | # Additional Resources
38 |
39 | * [General GitHub documentation](https://help.github.com/)
40 | * [PR best practices](http://codeinthehole.com/writing/pull-requests-and-other-good-practices-for-teams-using-github/)
41 | * [A guide to contributing to software packages](http://www.contribution-guide.org)
42 | * [Thinkful PR example](http://www.thinkful.com/learn/github-pull-request-tutorial/#Time-to-Submit-Your-First-PR)
43 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | Provide a brief description of the PR's purpose here.
3 |
4 | ## Todos
5 | Notable points that this PR has either accomplished or will accomplish.
6 | - [ ] TODO 1
7 |
8 | ## Questions
9 | - [ ] Question1
10 |
11 | ## Status
12 | - [ ] Ready to go
--------------------------------------------------------------------------------
/.github/workflows/CI.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on:
4 |   # GitHub has started calling new repo's first branch "main" https://github.com/github/renaming
5 |   # Existing codes likely still have "master" as the primary branch
6 |   # Both are tracked here to keep legacy and new codes working
7 |   push:
8 |     branches:
9 |       - "master"
10 |       - "main"
11 |   pull_request:
12 |     branches:
13 |       - "master"
14 |       - "main"
15 |   schedule:
16 |     # Nightly tests run on master by default:
17 |     #   Scheduled workflows run on the latest commit on the default or base branch.
18 |     #   (from https://help.github.com/en/actions/reference/events-that-trigger-workflows#scheduled-events-schedule)
19 |     - cron: "0 0 * * *"
20 | 
21 | jobs:
22 |   test:
23 |     name: Test on ${{ matrix.os }}, Python ${{ matrix.python-version }}
24 |     runs-on: ${{ matrix.os }}
25 |     strategy:
26 |       matrix:
27 |         os: [macOS-latest, ubuntu-latest, windows-latest]
28 |         python-version: [3.7, 3.8, 3.9]
29 | 
30 |     steps:
31 |     - uses: actions/checkout@v1
32 | 
33 |     - name: Additional info about the build
34 |       shell: bash
35 |       run: |
36 |         uname -a
37 |         df -h
38 |         ulimit -a
39 | 
40 | 
41 |     # More info on options: https://github.com/conda-incubator/setup-miniconda
42 |     - uses: conda-incubator/setup-miniconda@v2
43 |       with:
44 |         python-version: ${{ matrix.python-version }}
45 |         environment-file: devtools/conda-envs/test_env.yaml
46 | 
47 |         channels: conda-forge,defaults
48 | 
49 |         activate-environment: test
50 |         auto-update-conda: false
51 |         auto-activate-base: false
52 |         show-channel-urls: true
53 | 
54 |     - name: Install package
55 | 
56 |       # conda setup requires this special shell
57 |       shell: bash -l {0}
58 |       run: |
59 |         python -m pip install . --no-deps
60 |         conda list
61 | 
62 | 
63 |     - name: Run tests
64 | 
65 |       # conda setup requires this special shell
66 |       shell: bash -l {0}
67 | 
68 |       run: |
69 |         pytest -v --cov=sparrow --cov-report=xml --color=yes sparrow/tests/
70 | 
71 |     - name: CodeCov
72 |       uses: codecov/codecov-action@v1
73 |       with:
74 |         file: ./coverage.xml
75 |         flags: unittests
76 |         name: codecov-${{ matrix.os }}-py${{ matrix.python-version }}
77 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | sparrow/patterning.html
6 | .DS_Store
7 | sparrow/_version.py
8 |
9 | # C extensions
10 | *.so
11 | *~
12 | *.c
13 | \#*
14 | \.#*
15 |
16 | # Distribution / packaging
17 | .Python
18 | env/
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | .pytest_cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | .hypothesis/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | # SageMath parsed files
88 | *.sage.py
89 |
90 | # dotenv
91 | .env
92 |
93 | # virtualenv
94 | .venv
95 | venv/
96 | ENV/
97 |
98 | # Spyder project settings
99 | .spyderproject
100 | .spyproject
101 |
102 | # Rope project settings
103 | .ropeproject
104 |
105 | # mkdocs documentation
106 | /site
107 |
108 | # mypy
109 | .mypy_cache/
110 |
111 | # profraw files from LLVM? Unclear exactly what triggers this
112 | # There are reports this comes from LLVM profiling, but also Xcode 9.
113 | *profraw
114 |
115 | # pytorch weights
116 | # *pt
117 |
--------------------------------------------------------------------------------
/.lgtm.yml:
--------------------------------------------------------------------------------
1 | # Configure LGTM for this package
2 |
3 | extraction:
4 |   python:  # Configure Python
5 |     python_setup:  # Configure the setup
6 |       version: 3  # Specify Version 3
7 | path_classifiers:
8 |   library:
9 |     - devtools/*
10 |   generated:
11 |     - sparrow/_version.py
12 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # File: .readthedocs.yaml
2 |
3 | version: 2
4 |
5 | # Specify the Python version and requirements file
6 | python:
7 |   install:
8 |     - requirements: docs/requirements.txt
9 | 
10 | # Use the "readthedocs" environment to ensure all dependencies are installed before building
11 | build:
12 |   os: ubuntu-20.04
13 |   tools:
14 |     python: "3.9"
15 | 
16 | # Sphinx configuration
17 | sphinx:
18 |   configuration: docs/conf.py
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age,
8 | body size, disability, ethnicity, gender identity and expression, level of
9 | experience, nationality, personal appearance, race, religion, or sexual
10 | identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment include:
15 |
16 | * Using welcoming and inclusive language
17 | * Being respectful of differing viewpoints and experiences
18 | * Gracefully accepting constructive criticism
19 | * Focusing on what is best for the community
20 | * Showing empathy towards other community members
21 |
22 | Examples of unacceptable behavior by participants include:
23 |
24 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
25 | * Trolling, insulting/derogatory comments, and personal or political attacks
26 | * Public or private harassment
27 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
28 | * Other conduct which could reasonably be considered inappropriate in a professional setting
29 |
30 | ## Our Responsibilities
31 |
32 | Project maintainers are responsible for clarifying the standards of acceptable
33 | behavior and are expected to take appropriate and fair corrective action in
34 | response to any instances of unacceptable behavior.
35 |
36 | Project maintainers have the right and responsibility to remove, edit, or
37 | reject comments, commits, code, wiki edits, issues, and other contributions
38 | that are not aligned to this Code of Conduct, or to ban temporarily or
39 | permanently any contributor for other behaviors that they deem inappropriate,
40 | threatening, offensive, or harmful.
41 |
42 | Moreover, project maintainers will strive to offer feedback and advice to
43 | ensure quality and consistency of contributions to the code. Contributions
44 | from outside the group of project maintainers are strongly welcomed but the
45 | final decision as to whether commits are merged into the codebase rests with
46 | the team of project maintainers.
47 |
48 | ## Scope
49 |
50 | This Code of Conduct applies both within project spaces and in public spaces
51 | when an individual is representing the project or its community. Examples of
52 | representing a project or community include using an official project e-mail
53 | address, posting via an official social media account, or acting as an
54 | appointed representative at an online or offline event. Representation of a
55 | project may be further defined and clarified by project maintainers.
56 |
57 | ## Enforcement
58 |
59 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
60 | reported by contacting the project team at 'alex.holehouse@wustl.edu'. The project team will
61 | review and investigate all complaints, and will respond in a way that it deems
62 | appropriate to the circumstances. The project team is obligated to maintain
63 | confidentiality with regard to the reporter of an incident. Further details of
64 | specific enforcement policies may be posted separately.
65 |
66 | Project maintainers who do not follow or enforce the Code of Conduct in good
67 | faith may face temporary or permanent repercussions as determined by other
68 | members of the project's leadership.
69 |
70 | ## Attribution
71 |
72 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
73 | version 1.4, available at
74 | [http://contributor-covenant.org/version/1/4][version]
75 |
76 | [homepage]: http://contributor-covenant.org
77 | [version]: http://contributor-covenant.org/version/1/4/
78 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | MIT License
3 |
4 | Copyright (c) 2023 Jeffrey Lotthammer
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include MANIFEST.in
3 | include CODE_OF_CONDUCT.md
4 |
5 | graft sparrow
6 | global-exclude *.py[cod] __pycache__ *.so
7 |
8 | recursive-include sparrow/data *
9 |
10 |
--------------------------------------------------------------------------------
/devtools/README.md:
--------------------------------------------------------------------------------
1 | # Development, testing, and deployment tools
2 |
3 | This directory contains a collection of tools for running Continuous Integration (CI) tests,
4 | handling conda installation, and other development tasks not directly related to the coding process.
5 |
6 |
7 | ## Manifest
8 |
9 | ### Continuous Integration
10 |
11 | You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and
12 | Windows testing if you only plan to deploy on specific platforms. These are just to help you get started.
13 |
14 | The items in this directory have been left for legacy purposes since the change to GitHub Actions.
15 | They will likely be removed in a future version.
16 | 
17 | * `legacy-miniconda-setup`: A preserved copy of a helper directory which made Linux and OSX based testing through [Travis-CI](https://about.travis-ci.com/) simpler
18 |   * `before_install.sh`: Pip/Miniconda pre-package installation script for Travis. No longer needed thanks to
19 |     [GitHub Actions](https://docs.github.com/en/free-pro-team@latest/actions) and the [conda-incubator/setup-miniconda Action](https://github.com/conda-incubator/setup-miniconda)
20 | 
21 | ### Conda Environment:
22 | 
23 | This directory contains the files to set up the Conda environment for testing purposes.
24 | 
25 | * `conda-envs`: directory containing the YAML file(s) which fully describe Conda environments, their dependencies, and the provenance of those dependencies
26 |   * `test_env.yaml`: Simple test environment file with base dependencies. Channels are not specified here and therefore respect global Conda configuration
27 | 
28 | ### Additional Scripts:
29 | 
30 | This directory contains OS-agnostic helper scripts which don't fall into any of the previous categories.
31 | * `scripts`
32 |   * `create_conda_env.py`: Helper program for spinning up new conda environments based on a starter file, with Python version and environment name command-line options
33 |
34 |
35 | ## How to contribute changes
36 | - Clone the repository if you have write access to the main repo, fork the repository if you are a collaborator.
37 | - Make a new branch with `git checkout -b {your branch name}`
38 | - Make changes and test your code
39 | - Ensure that the test environment dependencies (`conda-envs`) line up with the build and deploy dependencies (`conda-recipe/meta.yaml`)
40 | - Push the branch to the repo (either the main or your fork) with `git push -u origin {your branch name}`
41 | * Note that `origin` is the default name assigned to the remote, yours may be different
42 | - Make a PR on GitHub with your changes
43 | - We'll review the changes and get your code into the repo after lively discussion!
44 |
45 |
46 | ## Checklist for updates
47 | - [ ] Make sure there is an/are issue(s) opened for your specific update
48 | - [ ] Create the PR, referencing the issue
49 | - [ ] Debug the PR as needed until tests pass
50 | - [ ] Tag the final, debugged version
51 | * `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags`
52 | - [ ] Get the PR merged in
53 |
--------------------------------------------------------------------------------
/devtools/conda-envs/test_env.yaml:
--------------------------------------------------------------------------------
1 | name: test
2 | channels:
3 |
4 |   - conda-forge
5 | 
6 |   - defaults
7 | dependencies:
8 |   # Base depends
9 |   - python
10 |   - pip
11 | 
12 |   # Testing
13 |   - pytest
14 |   - pytest-cov
15 |   - codecov
16 | 
17 |   # Pip-only installs
18 |   #- pip:
19 |   #  - codecov
20 |
21 |
--------------------------------------------------------------------------------
/devtools/legacy-miniconda-setup/before_install.sh:
--------------------------------------------------------------------------------
1 | # Temporarily change directory to $HOME to install software
2 | pushd .
3 | cd $HOME
4 | # Make sure some level of pip is installed
5 | python -m ensurepip
6 | 
7 | # Install Miniconda
8 | if [ "$TRAVIS_OS_NAME" == "osx" ]; then
9 |     # Make OSX md5 mimic md5sum from linux, alias does not work
10 |     md5sum () {
11 |         command md5 -r "$@"
12 |     }
13 |     MINICONDA=Miniconda3-latest-MacOSX-x86_64.sh
14 | else
15 |     MINICONDA=Miniconda3-latest-Linux-x86_64.sh
16 | fi
17 | MINICONDA_HOME=$HOME/miniconda
18 | MINICONDA_MD5=$(wget -qO- https://repo.anaconda.com/miniconda/ | grep -A3 $MINICONDA | sed -n '4p' | sed -n 's/ *<td>\(.*\)<\/td> */\1/p')
19 | wget -q https://repo.anaconda.com/miniconda/$MINICONDA
20 | if [[ $MINICONDA_MD5 != $(md5sum $MINICONDA | cut -d ' ' -f 1) ]]; then
21 |     echo "Miniconda MD5 mismatch"
22 |     exit 1
23 | fi
24 | bash $MINICONDA -b -p $MINICONDA_HOME
25 | 
26 | # Configure miniconda
27 | export PIP_ARGS="-U"
28 | # New to conda >=4.4
29 | echo ". $MINICONDA_HOME/etc/profile.d/conda.sh" >> ~/.bashrc  # Source the profile.d file
30 | echo "conda activate" >> ~/.bashrc  # Activate conda
31 | source ~/.bashrc  # source file to get new commands
32 | #export PATH=$MINICONDA_HOME/bin:$PATH  # Old way, should not be needed anymore
33 | 
34 | conda config --add channels conda-forge
35 | 
36 | conda config --set always_yes yes
37 | conda install conda conda-build jinja2 anaconda-client
38 | conda update --quiet --all
39 | 
40 | # Restore original directory
41 | popd
42 |
--------------------------------------------------------------------------------
/devtools/scripts/create_conda_env.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import re
4 | import glob
5 | import shutil
6 | import subprocess as sp
7 | from tempfile import TemporaryDirectory
8 | from contextlib import contextmanager
9 | # YAML imports
10 | try:
11 |     import yaml  # PyYAML
12 |     loader = yaml.load
13 | except ImportError:
14 |     try:
15 |         import ruamel_yaml as yaml  # Ruamel YAML
16 |     except ImportError:
17 |         try:
18 |             # Load Ruamel YAML from the base conda environment
19 |             from importlib import util as import_util
20 |             CONDA_BIN = os.path.dirname(os.environ['CONDA_EXE'])
21 |             ruamel_yaml_path = glob.glob(os.path.join(CONDA_BIN, '..',
22 |                                                       'lib', 'python*.*', 'site-packages',
23 |                                                       'ruamel_yaml', '__init__.py'))[0]
24 |             # Based on importlib example, but only needs to load_module since it's the whole package, not just
25 |             # a module
26 |             spec = import_util.spec_from_file_location('ruamel_yaml', ruamel_yaml_path)
27 |             yaml = spec.loader.load_module()
28 |         except (KeyError, ImportError, IndexError):
29 |             raise ImportError("No YAML parser could be found in this or the conda environment. "
30 |                               "Could not find PyYAML or Ruamel YAML in the current environment, "
31 |                               "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. "
32 |                               "Environment not created!")
33 |     loader = yaml.YAML(typ="safe").load  # typ="safe" avoids odd typing on output
34 | 
35 | 
36 | @contextmanager
37 | def temp_cd():
38 |     """Temporary CD Helper"""
39 |     cwd = os.getcwd()
40 |     with TemporaryDirectory() as td:
41 |         try:
42 |             os.chdir(td)
43 |             yield
44 |         finally:
45 |             os.chdir(cwd)
46 | 
47 | 
48 | # Args
49 | parser = argparse.ArgumentParser(description='Creates a conda environment from file for a given Python version.')
50 | parser.add_argument('-n', '--name', type=str,
51 |                     help='The name of the created Python environment')
52 | parser.add_argument('-p', '--python', type=str,
53 |                     help='The version of the created Python environment')
54 | parser.add_argument('conda_file',
55 |                     help='The file for the created Python environment')
56 | 
57 | args = parser.parse_args()
58 | 
59 | # Open the base file
60 | with open(args.conda_file, "r") as handle:
61 |     yaml_script = loader(handle.read())
62 | 
63 | python_replacement_string = "python {}*".format(args.python)
64 | 
65 | try:
66 |     for dep_index, dep_value in enumerate(yaml_script['dependencies']):
67 |         if re.match('python([ ><=*]+[0-9.*]*)?$', dep_value):  # Match explicitly 'python' and its formats
68 |             yaml_script['dependencies'].pop(dep_index)
69 |             break  # Making the assumption there is only one Python entry, also avoids need to enumerate in reverse
70 | except (KeyError, TypeError):
71 |     # Case of no dependencies key, or dependencies: None
72 |     yaml_script['dependencies'] = []
73 | finally:
74 |     # Ensure the python version is added in. Even if the code does not need it, we assume the env does
75 |     yaml_script['dependencies'].insert(0, python_replacement_string)
76 | 
77 | # Figure out conda path
78 | if "CONDA_EXE" in os.environ:
79 |     conda_path = os.environ["CONDA_EXE"]
80 | else:
81 |     conda_path = shutil.which("conda")
82 | if conda_path is None:
83 |     raise RuntimeError("Could not find a conda binary in CONDA_EXE variable or in executable search path")
84 | 
85 | print("CONDA ENV NAME {}".format(args.name))
86 | print("PYTHON VERSION {}".format(args.python))
87 | print("CONDA FILE NAME {}".format(args.conda_file))
88 | print("CONDA PATH {}".format(conda_path))
89 | 
90 | # Write to a temp directory which will always be cleaned up
91 | with temp_cd():
92 |     temp_file_name = "temp_script.yaml"
93 |     with open(temp_file_name, 'w') as f:
94 |         f.write(yaml.dump(yaml_script))
95 |     sp.call("{} env create -n {} -f {}".format(conda_path, args.name, temp_file_name), shell=True)
--------------------------------------------------------------------------------
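
A quick way to see what the dependency-rewriting logic in `create_conda_env.py` does - a minimal sketch, where the dependency list and pinned version are illustrative rather than taken from the repo:

```python
import re

# Mimic the script: drop any existing 'python' entry, then pin the requested version
deps = ["python >=3.7", "pip", "pytest"]
deps = [d for d in deps if not re.match(r'python([ ><=*]+[0-9.*]*)?$', d)]
deps.insert(0, "python 3.9*")  # the python_replacement_string for --python 3.9
print(deps)  # ['python 3.9*', 'pip', 'pytest']
```
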
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = sparrow
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Compiling sparrow's Documentation
2 |
3 | The docs for this project are built with [Sphinx](http://www.sphinx-doc.org/en/master/).
4 | To compile the docs, first ensure that Sphinx and the ReadTheDocs theme are installed.
5 |
6 |
7 | ```bash
8 | conda install sphinx sphinx_rtd_theme
9 | ```
10 |
11 |
12 | Once installed, you can use the `Makefile` in this directory to compile static HTML pages by
13 | ```bash
14 | make html
15 | ```
16 |
17 | The compiled docs will be in the `_build` directory and can be viewed by opening `index.html` (which may itself
18 | be inside a directory called `html/` depending on what version of Sphinx is installed).
19 |
20 |
21 | A configuration file for [Read The Docs](https://readthedocs.org/) (`readthedocs.yml`) is included in the top level of the repository. To use Read the Docs to host your documentation, go to https://readthedocs.org/ and connect this repository. You may need to change your default branch to `main` under Advanced Settings for the project.
22 |
23 | If you would like to use Read The Docs with `autodoc` (included automatically) and your package has dependencies, you will need to include those dependencies in your documentation yaml file (`docs/requirements.yaml`).
24 |
25 |
--------------------------------------------------------------------------------
/docs/_static/README.md:
--------------------------------------------------------------------------------
1 | # Static Doc Directory
2 |
3 | Add any paths that contain custom static files (such as style sheets) here,
4 | relative to the `conf.py` file's directory.
5 | They are copied after the builtin static files,
6 | so a file named "default.css" will overwrite the builtin "default.css".
7 |
8 | The path to this folder is set in the Sphinx `conf.py` file in the line:
9 | ```python
10 | html_static_path = ['_static']
11 | ```
12 |
13 | ## Examples of files to add to this directory
14 | * Custom Cascading Style Sheets
15 | * Custom JavaScript code
16 | * Static logo images
17 |
--------------------------------------------------------------------------------
/docs/_templates/README.md:
--------------------------------------------------------------------------------
1 | # Templates Doc Directory
2 |
3 | Add any paths that contain templates here, relative to
4 | the `conf.py` file's directory.
5 | They are copied after the builtin template files,
6 | so a file named "page.html" will overwrite the builtin "page.html".
7 |
8 | The path to this folder is set in the Sphinx `conf.py` file in the line:
9 | ```python
10 | templates_path = ['_templates']
11 | ```
12 |
13 | ## Examples of files to add to this directory
14 | * HTML extensions of stock pages like `page.html` or `layout.html`
15 |
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | API Documentation
2 | =================
3 |
4 | .. autosummary::
5 |    :toctree: autosummary
6 | 
7 |    sparrow.canvas
8 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file does only contain a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/stable/config
8 |
9 | # -- Path setup --------------------------------------------------------------
10 |
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 |
15 | # In case the project was not installed
16 | import os
17 | import sys
18 | sys.path.insert(0, os.path.abspath('..'))
19 |
20 | import sparrow
21 |
22 |
23 | # -- Project information -----------------------------------------------------
24 |
25 | project = 'sparrow'
26 | copyright = ("2020, Alex Holehouse. Project structure based on the "
27 | "Computational Molecular Science Python Cookiecutter version 1.5")
28 | author = 'Alex Holehouse'
29 |
30 | # The short X.Y version
31 | version = ''
32 | # The full version, including alpha/beta/rc tags
33 | release = ''
34 |
35 |
36 | # -- General configuration ---------------------------------------------------
37 |
38 | # If your documentation needs a minimal Sphinx version, state it here.
39 | #
40 | # needs_sphinx = '1.0'
41 |
42 | # Add any Sphinx extension module names here, as strings. They can be
43 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
44 | # ones.
45 | extensions = [
46 |     'sphinx.ext.autosummary',
47 |     'sphinx.ext.autodoc',
48 |     'sphinx.ext.mathjax',
49 |     'sphinx.ext.viewcode',
50 |     'sphinx.ext.napoleon',
51 |     'sphinx.ext.intersphinx',
52 |     'sphinx.ext.extlinks',
53 | ]
54 |
55 | autosummary_generate = True
56 | napoleon_google_docstring = False
57 | napoleon_use_param = False
58 | napoleon_use_ivar = True
59 |
60 | # Add any paths that contain templates here, relative to this directory.
61 | templates_path = ['_templates']
62 |
63 | # The suffix(es) of source filenames.
64 | # You can specify multiple suffix as a list of string:
65 | #
66 | # source_suffix = ['.rst', '.md']
67 | source_suffix = '.rst'
68 |
69 | # The master toctree document.
70 | master_doc = 'index'
71 |
72 | # The language for content autogenerated by Sphinx. Refer to documentation
73 | # for a list of supported languages.
74 | #
75 | # This is also used if you do content translation via gettext catalogs.
76 | # Usually you set "language" from the command line for these cases.
77 | language = None
78 |
79 | # List of patterns, relative to source directory, that match files and
80 | # directories to ignore when looking for source files.
81 | # This pattern also affects html_static_path and html_extra_path .
82 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
83 |
84 | # The name of the Pygments (syntax highlighting) style to use.
85 | pygments_style = 'default'
86 |
87 |
88 | # -- Options for HTML output -------------------------------------------------
89 |
90 | # The theme to use for HTML and HTML Help pages. See the documentation for
91 | # a list of builtin themes.
92 | #
93 | html_theme = 'sphinx_rtd_theme'
94 |
95 | # Theme options are theme-specific and customize the look and feel of a theme
96 | # further. For a list of options available for each theme, see the
97 | # documentation.
98 | #
99 | # html_theme_options = {}
100 |
101 | # Add any paths that contain custom static files (such as style sheets) here,
102 | # relative to this directory. They are copied after the builtin static files,
103 | # so a file named "default.css" will overwrite the builtin "default.css".
104 | html_static_path = ['_static']
105 |
106 | # Custom sidebar templates, must be a dictionary that maps document names
107 | # to template names.
108 | #
109 | # The default sidebars (for documents that don't match any pattern) are
110 | # defined by theme itself. Builtin themes are using these templates by
111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
112 | # 'searchbox.html']``.
113 | #
114 | # html_sidebars = {}
115 |
116 |
117 | # -- Options for HTMLHelp output ---------------------------------------------
118 |
119 | # Output file base name for HTML help builder.
120 | htmlhelp_basename = 'sparrowdoc'
121 |
122 |
123 | # -- Options for LaTeX output ------------------------------------------------
124 |
125 | latex_elements = {
126 |     # The paper size ('letterpaper' or 'a4paper').
127 |     #
128 |     # 'papersize': 'letterpaper',
129 | 
130 |     # The font size ('10pt', '11pt' or '12pt').
131 |     #
132 |     # 'pointsize': '10pt',
133 | 
134 |     # Additional stuff for the LaTeX preamble.
135 |     #
136 |     # 'preamble': '',
137 | 
138 |     # Latex figure (float) alignment
139 |     #
140 |     # 'figure_align': 'htbp',
141 | }
142 |
143 | # Grouping the document tree into LaTeX files. List of tuples
144 | # (source start file, target name, title,
145 | # author, documentclass [howto, manual, or own class]).
146 | latex_documents = [
147 |     (master_doc, 'sparrow.tex', 'sparrow Documentation',
148 |      'sparrow', 'manual'),
149 | ]
150 |
151 |
152 | # -- Options for manual page output ------------------------------------------
153 |
154 | # One entry per manual page. List of tuples
155 | # (source start file, name, description, authors, manual section).
156 | man_pages = [
157 |     (master_doc, 'sparrow', 'sparrow Documentation',
158 |      [author], 1)
159 | ]
160 |
161 |
162 | # -- Options for Texinfo output ----------------------------------------------
163 |
164 | # Grouping the document tree into Texinfo files. List of tuples
165 | # (source start file, target name, title, author,
166 | # dir menu entry, description, category)
167 | texinfo_documents = [
168 |     (master_doc, 'sparrow', 'sparrow Documentation',
169 |      author, 'sparrow', 'Next generation package for sequence parameter calculation',
170 |      'Miscellaneous'),
171 | ]
172 |
173 |
174 | # -- Extension configuration -------------------------------------------------
175 |
--------------------------------------------------------------------------------
/docs/getting_started.rst:
--------------------------------------------------------------------------------
1 | Getting Started
2 | ===============
3 |
4 | This page details how to get started with sparrow.
5 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. sparrow documentation master file, created by
2 |    sphinx-quickstart on Thu Mar 15 13:55:56 2018.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | Welcome to sparrow's documentation!
7 | =========================================================
8 | 
9 | .. toctree::
10 |    :maxdepth: 2
11 |    :caption: Contents:
12 | 
13 |    getting_started
14 |    api
15 |    predictors
16 |
17 |
18 |
19 | Indices and tables
20 | ==================
21 |
22 | * :ref:`genindex`
23 | * :ref:`modindex`
24 | * :ref:`search`
25 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=sparrow
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | 	echo.installed, then set the SPHINXBUILD environment variable to point
21 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | 	echo.may add the Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/predictors.rst:
--------------------------------------------------------------------------------
1 | Predictors
2 | =================
3 |
4 | sparrow implements a set of different sequence-based predictors in a modular, extendable way that enables additional predictors to be easily added.
5 |
6 |
7 | Creating new predictors with PARROT
8 | --------------------------------------
9 | The guide below assumes you have cloned the git repository of sparrow, created a new branch to add your new predictor to, and have switched into that branch to begin work. As a reminder, when adding new features in Git, the general workflow is:
10 | 
11 | 1. Clone the current up-to-date version
12 | 2. Create a new branch (this is a separate version where you can work in peace, but if new features are added to the main branch you can update your branch as you go)
13 | 3. Add in your snazzy new feature
14 | 4. Once complete, make a pull request to merge your branch back into the main branch.
15 | 
16 | This guide assumes these ideas are clear, and specifically provides insight into the details of implementing a new predictor in sparrow, focusing here on using PARROT to train that predictor.
17 | 
18 | 
19 | **Step 1: Train a predictor with PARROT**
20 | 
21 | The first step in adding a new PARROT-based predictor is to use PARROT to train your model. The details of how one does this go beyond the scope of this documentation, but once trained you should be left with a Torch parameter file (a ``.pt`` file). This is the file we're going to use with SPARROW to add our custom predictor. Let's call this parameter file ``new_predictor.pt`` to make this concrete.
22 | 
23 | Note that the PARROT predictor should be trained in ``residues`` mode - i.e. we need to receive one value per residue.
24 | 
25 | 
26 | **Step 2: Copy the parameter file into SPARROW**
27 | 
28 | Next we take ``new_predictor.pt`` and we're going to copy it into sparrow. Specifically, this trained network should be placed under::
29 | 
30 |     sparrow/data/networks/<name>
31 | 
32 | and MUST follow the naming convention ``<name>_network_v<version>.pt``. Note there that:
33 | 
34 | * ``<name>`` should be a single word, or words connected by underscores, all lower case, that we will use as the function name to call the predictor. For example, *disorder*, *dssp* or *transmembrane* are good examples. Keep this simple, but it should be clear and unambiguous.
35 | * ``<version>`` here is the specific version of this network. It is possible that your network may be retrained later, and as such we want to enable future sparrow users to select specific network versions, although of course the predictors should default to the most recent version. This ability to select specific network versions is built into the standard predictor template code.
36 | 
37 | As an example, our transmembrane predictor has the format::
38 | 
39 |     transmembrane_predictor_network_v4.pt
40 | 
41 | 
42 | **Step 3: Build a predictor class which performs the prediction**
43 | 
44 | The next step is to build a stand-alone predictor class which reads in this network file and enables the return of the per-residue prediction implemented therein. This file should be created in a new package (i.e. a directory with an ``__init__.py``) under::
45 | 
46 |     sparrow/predictors
47 | 
48 | and this file should be called ``<name>_predictor.py``.
49 | 
50 | As a specific example, our transmembrane predictor is implemented in::
51 | 
52 |     sparrow/predictors/transmembrane
53 | 
54 | and within this directory there are two files::
55 | 
56 |     __init__.py                 # this is needed so we can import the predictor
57 |     transmembrane_predictor.py  # this is where the predictor is implemented
58 | 
59 | The reason to make a separate package (directory) for every predictor is that if someone has a non-PARROT-based predictor they want to incorporate into sparrow, (1) this is absolutely welcome and (2) we want to provide a consistent file ecosystem where they have a directory to implement as much/little additional code as they want. As such, the ``__init__.py`` and ``<name>_predictor.py`` are the **minimum** files needed, but you are free to add anything else as well.
60 | 
61 | ``__init__.py`` should probably just be empty - it's what tells Python that this directory is a package.
62 | 
63 | ``<name>_predictor.py`` should NOT be empty, but should be based on the template file found under ``sparrow/predictors/predictor_template.py``. The template is RELATIVELY simple, but provides code for reading in a PARROT-trained network and performing a prediction. You could re-implement this yourself if you really wanted, but, assuming you're using one-hot encoding on the trained network, this code should work out of the box. The template itself walks through the various small configuration tweaks needed to make this work with your specific network of interest. Note that for classification vs. regression there are some small differences, but the template file provides code for both, so just delete/comment out the irrelevant lines (these are clearly marked).
64 | 
65 | Once this is done, it's worth seeing if you can import and run predictions using this class/function as a stand-alone predictor, i.e. you should be able to do::
66 | 
67 | 
68 |     from sparrow.predictors.<name>.<name>_predictor import Predictor
69 | 
70 |     sequence = 'MSAAVTAGKLARAPADPGKAGVPGVAAPGAPAAAPPAKEIPEVLVDPRSRRRYVRGRFLG'
71 |     P = Predictor()
72 |     P.predict_<name>(sequence)
73 | 
74 | 
75 | and have it return a set of values.
76 | 
77 | 
78 | **Step 4: Integrate the predictor into the sparrow.Predictor class**
79 | 
80 | At this stage we have a working predictor - the last step is to connect this predictor to the sparrow Protein object in a way that incurs minimal computational overhead if not used, is syntactically simple, and offers functionality like other Protein analysis functions and properties.
81 | 
82 | This is achieved by adding a function to the ``sparrow.predictors.Predictor`` class, a class implemented in ``sparrow/predictors/__init__.py``.
83 | 
84 | This class generates an object which is accessible in the Protein object under the ``.predictor`` dot operator. As such, functions defined in the ``sparrow.predictors.Predictor`` class are then accessible as::
85 | 
86 |     seq = 'MSAAVTAGKLARAPADPGKAGVPGVAAPGAPAAAPPAKEIPE'
87 |     p = Protein(seq)
88 | 
89 |     p.predictor.<name>()
90 | 
91 | 
92 | As such, to finally make a new predictor accessible, the ``sparrow.predictors.Predictor`` class should be edited to add a new function which is simply the name of the prediction (e.g. ``dssp``, ``transmembrane`` etc.). This function should do three things:
93 | 
94 | 1. It should, UPON BEING CALLED, import the predictor package you just created.
95 | 2. It should then perform the prediction on the underlying protein sequence.
96 | 3. It should (ideally) memoize the outcome into a local dictionary so that if the same prediction is requested again it is simply referenced rather than recomputed.
97 | 
98 | Rather than going into the details here, the underlying code and examples should make this clear. Notably, see ``dssp()`` and ``transmembrane_regions()`` for good examples of PARROT-based predictors. One important thing is to document these predictors clearly.
99 | 
--------------------------------------------------------------------------------
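
To make Steps 3 and 4 of the guide above concrete, here is a minimal, hypothetical sketch of the call-time-import-plus-memoization pattern it describes. The names `MyFeaturePredictor` and `my_feature` are placeholders, and real code should start from `sparrow/predictors/predictor_template.py` rather than this stub:

```python
class MyFeaturePredictor:
    """Step 3: stand-alone predictor that would load a trained network once
    and return one value per residue (stubbed here for illustration)."""

    def predict_my_feature(self, sequence):
        # a real implementation would one-hot encode `sequence` and run it
        # through the versioned PARROT network loaded in __init__
        return [0.0] * len(sequence)


class Predictor:
    """Step 4: per-Protein accessor object, reachable as p.predictor.my_feature()."""

    def __init__(self, sequence):
        self.__seq = sequence
        self.__precomputed = {}  # memoization cache keyed by prediction name

    def my_feature(self):
        if 'my_feature' not in self.__precomputed:
            # construct (and, in sparrow, import) the predictor only when
            # called, so unused predictors incur no overhead
            predictor = MyFeaturePredictor()
            self.__precomputed['my_feature'] = predictor.predict_my_feature(self.__seq)
        return self.__precomputed['my_feature']
```
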
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | versioningit
2 | sphinx_rtd_theme
--------------------------------------------------------------------------------
/docs/requirements.yaml:
--------------------------------------------------------------------------------
1 | name: docs
2 | channels:
3 | dependencies:
4 |   # Base depends
5 |   - python
6 |   - pip
7 | 
8 | 
9 | 
10 |   # Pip-only installs
11 |   #- pip:
12 |
13 |
--------------------------------------------------------------------------------
/examples/protein_example_1.py:
--------------------------------------------------------------------------------
1 | from sparrow.protein import Protein
2 |
3 | P = Protein('MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP')
4 |
5 | print('Demo 1')
6 | print(P)
7 | print(f"sparrow makes the most of Python's syntactic sugar e.g. we can use len() operator - e.g., len(P) will show the sequence length: {len(P)}")
8 | print(P.predictor.disorder())
9 | print(P.FCR)
10 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 |
3 | # delete cython/numpy if not needed
4 | requires = ["setuptools>=61", "versioningit~=2.0", "cython", "numpy", ]
5 | build-backend = "setuptools.build_meta"
6 |
7 |
8 | # define project info
9 | [project]
10 | name = "sparrow"
11 | dynamic = ["version"]
12 | description = "Next generation sequence analysis package for working with disordered regions and disordered proteins"
13 | authors = [
14 | {name = "Alex Holehouse", email = "alex.holehouse@wustl.edu"}
15 | ]
16 | license = {text = "CC-NC-ND"}
17 | readme = "README.md"
18 | requires-python = ">=3.7"
19 |
20 | # add in as needed
21 | dependencies = [
22 | "numpy>=1.14.0,<2.0",
23 | "scipy",
24 | "cython",
25 | "protfasta",
26 | "metapredict>2",
27 | "ipython",
28 | "idptools-parrot @ git+https://git@github.com/idptools/parrot.git",
29 | "afrc",
30 | "tqdm",
31 | "pyfamsa",
32 | ]
33 |
34 | [project.optional-dependencies]
35 | test = [
36 | "pytest>=6.1.2",
37 | ]
38 |
39 |
40 | [tool.setuptools]
41 | zip-safe = false
42 | include-package-data = true
43 |
44 | [tool.setuptools.packages.find]
45 | namespaces = true
46 | where = ["."]
47 | include = ["sparrow", "sparrow.*"] # Discover all sub-packages inside the main package
48 |
49 | [tool.setuptools.package-data]
50 | sparrow = [
51 | "py.typed"
52 | ]
53 |
54 | [tool.versioningit]
55 | default-version = "1+unknown"
56 |
57 | [tool.versioningit.format]
58 | distance = "{base_version}+{distance}.{vcs}{rev}"
59 | dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"
60 | distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"
61 |
62 | [tool.versioningit.vcs]
63 | # The method key:
64 | method = "git" # <- The method name
65 | # Parameters to pass to the method:
66 | match = ["*"]
67 | default-tag = "1.0.0"
68 |
69 | [tool.versioningit.write]
70 | file = "sparrow/_version.py"
71 |
--------------------------------------------------------------------------------
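
For reference, the versioningit format strings above mean that a build made, for example, three commits after a tag `1.2.0` at git revision `abc1234` would be versioned `1.2.0+3.gabc1234`, with a `.dirty` suffix appended when the working tree has uncommitted changes (the tag `1.2.0` here is purely illustrative).
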
/readthedocs.yml:
--------------------------------------------------------------------------------
1 | # readthedocs.yml
2 |
3 | version: 2
4 |
5 | build:
6 |   image: latest
7 | 
8 | python:
9 |   version: 3.8
10 |   install:
11 |     - method: pip
12 |       path: .
13 | 
14 | conda:
15 |   environment: docs/requirements.yaml
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [coverage:run]
2 | # .coveragerc to control coverage.py and pytest-cov
3 | omit =
4 |     # Omit the tests
5 |     */tests/*
6 |     # Omit generated versioningit
7 |     sparrow/_version.py
8 |
9 | # define consistent style
10 | [yapf]
11 | COLUMN_LIMIT = 119
12 | INDENT_WIDTH = 4
13 | USE_TABS = False
14 |
15 | # define consistent style
16 | [flake8]
17 | max-line-length = 119
18 |
19 | # means we can run python setup.py test to
20 | # run tests... maybe...
21 | [aliases]
22 | test = pytest
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | sparrow
3 | Next generation package for sequence parameter calculation
4 | """
5 | from setuptools import setup, Extension, find_packages
6 | from Cython.Build import cythonize
7 | import os
8 | import numpy
9 |
10 | # defines the absolute path of where your cython files are
11 | cython_dir = os.path.join("sparrow", "patterning")
12 |
13 | # build a list of the files
14 | cython_files = [os.path.join(cython_dir, f) for f in os.listdir(cython_dir) if f.endswith('.pyx')]
15 |
16 |
17 | extensions = [
18 |     Extension(
19 |         name=f"sparrow.patterning.{os.path.splitext(os.path.basename(file))[0]}",
20 |         sources=[file],
21 |         include_dirs=[numpy.get_include()],
22 |     ) for file in cython_files
23 | ]
24 | 
25 | setup(
26 |     ext_modules=cythonize(extensions, compiler_directives={'language_level': "3"}),
27 |     packages=find_packages(),
28 |     include_package_data=True,
29 | )
30 |
--------------------------------------------------------------------------------
/sparrow/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | sparrow
3 | Next generation package for sequence parameter calculation
4 | """
5 |
6 | import os
7 |
8 | # Add imports here
9 | from sparrow.tools import io
10 | from sparrow.protein import Protein
11 | from sparrow.tools.io import read_fasta
12 |
13 |
14 | # Generate _version.py if missing and in the Read the Docs environment
15 | if os.getenv("READTHEDOCS") == "True" and not os.path.isfile('../sparrow/_version.py'):
16 |     import versioningit
17 |     __version__ = versioningit.get_version('../')
18 | else:
19 |     from ._version import __version__
20 | 
21 | # code that allows access to the data directory
22 | _ROOT = os.path.abspath(os.path.dirname(__file__))
23 | def get_data(path):
24 |     return os.path.join(_ROOT, 'data', path)
25 |
26 |
--------------------------------------------------------------------------------
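
As a usage sketch of the `get_data()` helper above (assuming an installed copy of sparrow), resolving one of the bundled network files from the tree looks like:

```python
import sparrow

# absolute path to a bundled network weight file under sparrow/data/
weights = sparrow.get_data('networks/dssp/dssp_predictor_network_v2.pt')
print(weights)
```
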
/sparrow/calculate_parameters.py:
--------------------------------------------------------------------------------
1 | from sparrow.data import amino_acids
2 | import numpy as np
3 | import math
4 | from . import sparrow_exceptions
5 |
6 | # .................................................................
7 | #
8 | def calculate_aa_fractions(s):
9 | """
10 | Standalone function that computes amino-acid fractions for
11 | a given sequence.
12 |
13 | Parameters:
14 | --------------
15 | s : str
16 | Amino acid sequence
17 |
18 | Returns
19 | ---------------
20 | dict
21 | Returns dictionary with per-residue amino acid fraction
22 |
23 | """
24 |
25 | aa_dict = {}
26 | for i in amino_acids.VALID_AMINO_ACIDS:
27 | aa_dict[i] = 0
28 |
29 | for i in s:
30 | aa_dict[i] = aa_dict[i] + 1
31 |
32 |
33 | len_s = len(s)
34 | for i in amino_acids.VALID_AMINO_ACIDS:
35 | aa_dict[i] = aa_dict[i]/len_s
36 |
37 | return aa_dict
38 |
39 |
40 |
41 | def calculate_seg_complexity(s, alphabet=amino_acids.VALID_AMINO_ACIDS):
42 | """
43 | Function to calculate the Wootton-Federhen complexity of a sequence (also called
44 | seg complexity, as this the theory used in the classic SEG algorithm.
45 |
46 | Parameters
47 | -----------
48 | s : str
49 | Amino acid sequence
50 |
51 | alphabet : list
52 | List of amino acids that defines the alphabet. Note this does not sanity-check
53 | for non-standard amino acids. Default is the standard 20 amino acids.
54 |
55 | Returns
56 | ----------
57 | float
58 | Returns a float that corresponds to the compositional complexity associated with
59 | the passed sequence.
60 |
61 | """
62 |
63 | alphabet_size = len(alphabet)
64 | seq_len = len(s)
65 |
66 | complexity = 0
67 | for a in alphabet:
68 | p = s.count(a)/seq_len
69 |
70 | if p > 0:
71 | complexity = p * math.log(p, alphabet_size) + complexity
72 |
73 | return -complexity
74 |
75 |
76 |
77 | # .................................................................
78 | #
79 | def calculate_hydrophobicity(s, mode='KD', normalize=False):
80 | """
81 | Standalone function that computes hydrophobicity
82 |
83 | Parameters:
84 | --------------
85 | s : str
86 | Amino acid sequence
87 |
88 | mode : str
89 | Hydrophobicity mode to be used. Currently only KD supported
90 | but can be expanded. Allowed values: 'KD'
91 |
92 | normalize : bool
93 | If set to True, the hydrophobicity scale is normalized to run between 0
94 | and 1. Default = False.
95 |
96 | Returns
97 | ---------------
98 | float
99 | Returns a floating point value with the mean hydrophobicity
100 | as defined based on the passed scale
101 |
102 | """
103 | return np.mean(calculate_linear_hydrophobicity(s, mode, normalize))
104 |
105 |
106 | # .................................................................
107 | #
108 | def calculate_linear_hydrophobicity(s, mode='KD', normalize=False):
109 | """
110 | Compute linear hydrophobicity from sequence using one of several possible
111 | hydrophobicity scales.
112 |
113 | By default this is Kyte-Doolittle, but additional scales can be
114 | added as needed.
115 |
116 | Parameters:
117 | --------------
118 | s : str
119 | Amino acid sequence
120 |
121 | mode : str
122 | Selector for hydrophobicity table. Options available are
123 |
124 | 'KD' | Kyte-Doolittle
125 |
126 | normalize : bool
127 | If True, the hydrophobicity scale operates on a normalized
128 | dynamic range of 0 to 1.
129 |
130 | Returns:
131 | ------------
132 | list
133 | List of values that correspond to per-residue hydrophobicity based on
134 | a given hydrophobicity scale.
135 |
136 | """
137 |
138 | if mode == 'KD':
139 | try:
140 | if normalize:
141 | return [amino_acids.AA_hydro_KD_normalized[r] for r in s]
142 | else:
143 | return [amino_acids.AA_hydro_KD[r] for r in s]
144 | except KeyError:
145 | raise sparrow_exceptions.CalculationException('Invalid residue found in %s' %(s))
146 | else:
147 | raise sparrow_exceptions.CalculationException('Invalid mode passed: %s' %(mode))
148 |
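149 | # A minimal usage sketch (illustrative; the expected values in the comments
150 | # follow from the definitions above):
151 | #
152 | #   from sparrow.calculate_parameters import (calculate_aa_fractions,
153 | #                                             calculate_seg_complexity,
154 | #                                             calculate_hydrophobicity)
155 | #
156 | #   calculate_aa_fractions('ACDC')         # {'A': 0.25, 'C': 0.5, 'D': 0.25, ...}
157 | #   calculate_seg_complexity('AAAA')       # 0.0 - a homopolymer has minimal complexity
158 | #   calculate_seg_complexity('ACDEFGHIKLMNPQRSTVWY')  # 1.0 - a uniform composition is maximally complex
159 | #   calculate_hydrophobicity('ILV', normalize=True)   # ~0.96, mean of normalized KD values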
--------------------------------------------------------------------------------
/sparrow/data/README.md:
--------------------------------------------------------------------------------
1 | # Sample Package Data
2 |
3 | This directory contains sample additional data you may want to include with your package.
4 | This is a place where non-code related additional information (such as data files, molecular structures, etc.) can
5 | go that you want to ship alongside your code.
6 |
7 | Please note that it is not recommended to place large files in your git directory. If your project requires files larger
8 | than a few megabytes, it is recommended to host them elsewhere. This is especially true for binary files,
9 | as `git` cannot store incremental updates to binary files and instead keeps a complete copy of every version
10 | in your `git` history, which can quickly add up. As a note, most `git` hosting services such as GitHub have a cap of
11 | roughly 1 GB per repository.
12 |
13 | ## Including package data
14 |
15 | Modify your package's `setup.py` file and the `setup()` command. Include the
16 | [`package_data`](http://setuptools.readthedocs.io/en/latest/setuptools.html#basic-use) keyword and point it at the
17 | correct files; a minimal sketch is shown at the end of this README.
18 |
19 | ## Manifest
20 |
21 | * `look_and_say.dat`: first entries of the "Look and Say" integer series, sequence [A005150](https://oeis.org/A005150)
22 |
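23 | A minimal sketch of the `package_data` keyword (the glob patterns below are examples,
24 | not an exhaustive list of this package's data files):
25 | 
26 | ```python
27 | from setuptools import setup, find_packages
28 | 
29 | setup(
30 |     packages=find_packages(),
31 |     include_package_data=True,
32 |     # ship non-code files that live inside the package tree
33 |     package_data={"sparrow": ["data/*.dat", "data/*.tsv"]},
34 | )
35 | ```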
--------------------------------------------------------------------------------
/sparrow/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import configs # import general configurations
2 | from . import amino_acids # import residue-specific amino acid data
3 |
4 |
--------------------------------------------------------------------------------
/sparrow/data/amino_acids.py:
--------------------------------------------------------------------------------
1 | ##
2 | ## Data on individual amino acids
3 | ##
4 | ##
5 |
6 | VALID_AMINO_ACIDS = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
7 | VALID_AMINO_ACIDS_PHYS = ['W','Y','F','H','Q','N','T','S','C','G','A','I','L','V','M','E','D','K','R','P']
8 |
9 |
10 | ARO = ['Y','W','F']
11 | ALI = ['A','L','M','I','V']
12 | POLAR = ['Q','N','S','T','H','G']
13 | CHARGE = ['E','D','R','K']
14 | POS = ['R','K']
15 | NEG = ['E','D']
16 |
17 | AA_THREE_TO_ONE = {'ALA':'A',
18 | 'CYS':'C',
19 | 'ASP':'D',
20 | 'GLU':'E',
21 | 'PHE':'F',
22 | 'GLY':'G',
23 | 'HIS':'H',
24 | 'ILE':'I',
25 | 'LYS':'K',
26 | 'LEU':'L',
27 | 'MET':'M',
28 | 'ASN':'N',
29 | 'PRO':'P',
30 | 'GLN':'Q',
31 | 'ARG':'R',
32 | 'SER':'S',
33 | 'THR':'T',
34 | 'VAL':'V',
35 | 'TRP':'W',
36 | 'TYR':'Y'}
37 |
38 | AA_ONE_TO_THREE = {}
39 | for x in AA_THREE_TO_ONE:
40 | AA_ONE_TO_THREE[AA_THREE_TO_ONE[x]] = x
41 |
42 |
43 | # acetyl groups have 1C-2O, 4H prior to
44 | # peptide bond formation
45 | #
46 | AA_MOLECULAR_WEIGHT = {'A': 89.1,
47 | 'C': 121.2,
48 | 'D': 133.1,
49 | 'E': 147.1,
50 | 'F': 165.2,
51 | 'G': 75.1,
52 | 'H': 155.2,
53 | 'I': 131.2,
54 | 'K': 146.2,
55 | 'L': 130.2,
56 | 'M': 149.2,
57 | 'N': 132.1,
58 | 'P': 115.1,
59 | 'Q': 146.2,
60 | 'R': 174.2,
61 | 'S': 105.1,
62 | 'T': 119.1,
63 | 'V': 117.1,
64 | 'W': 204.2,
65 | 'Y': 181.2,
66 | '<': 48,
67 | '>': 48}
68 |
69 |
70 | AA_COLOR = {'Y':'#ff9d00',
71 | 'W':'#ff9d00',
72 | 'F':'#ff9d00',
73 | 'A':'#171616',
74 | 'L':'#171616',
75 | 'M':'#171616',
76 | 'I':'#171616',
77 | 'V':'#171616',
78 | 'Q':'#04700d',
79 | 'N':'#04700d',
80 | 'S':'#04700d',
81 | 'T':'#04700d',
82 | 'H':'#04700d',
83 | 'G':'#04700d',
84 | 'E':'#ff0d0d',
85 | 'D':'#ff0d0d',
86 | 'R':'#2900f5',
87 | 'K':'#2900f5',
88 | 'C':'#ffe70d',
89 | 'P':'#cf30b7'}
90 |
91 |
92 | # KYTE-DOOLITTLE SCALES
93 | # References
94 | # A simple method for displaying the hydropathic character of a protein.
95 | # Kyte J, Doolittle RF. J Mol Biol. 1982 May 5;157(1):105-32.
96 | # Why are "natively unfolded" proteins unstructured under physiological conditions?
97 | # Vladimir N. Uversky, Joel R. Gillespie, and Anthony L. Fink
98 | # Proteins: Structure, Function, and Genetics 41:415-427 (2000)
99 | # Main hydrophobicity scale: Kyte-Doolittle values shifted by +4.5 to run from 0 to 9;
100 | # the normalized scale below divides these values by 9 to run from 0 to 1
101 | AA_hydro_KD = {"A": 6.3,
102 | "R": 0.0,
103 | "N": 1.0,
104 | "D": 1.0,
105 | "C": 7.0,
106 | "Q": 1.0,
107 | "E": 1.0,
108 | "G": 4.1,
109 | "H": 1.3,
110 | "I": 9.0,
111 | "L": 8.3,
112 | "K": 0.6,
113 | "M": 6.4,
114 | "F": 7.3,
115 | "P": 2.9,
116 | "S": 3.7,
117 | "T": 3.8,
118 | "W": 3.6,
119 | "Y": 3.2,
120 | "V": 8.7}
121 |
122 | AA_hydro_KD_normalized = {'A': 0.7,
123 | 'R': 0.0,
124 | 'N': 0.111,
125 | 'D': 0.111,
126 | 'C': 0.778,
127 | 'Q': 0.111,
128 | 'E': 0.111,
129 | 'G': 0.456,
130 | 'H': 0.144,
131 | 'I': 1.0,
132 | 'L': 0.922,
133 | 'K': 0.067,
134 | 'M': 0.711,
135 | 'F': 0.811,
136 | 'P': 0.322,
137 | 'S': 0.411,
138 | 'T': 0.422,
139 | 'W': 0.4,
140 | 'Y': 0.356,
141 | 'V': 0.967}
142 |
143 |
--------------------------------------------------------------------------------
/sparrow/data/configs.py:
--------------------------------------------------------------------------------
1 | DISORDER_THRESHOLD = 0.7
2 | MIN_LENGTH_ALBATROSS_RE_RG = 35
3 |
--------------------------------------------------------------------------------
/sparrow/data/look_and_say.dat:
--------------------------------------------------------------------------------
1 | 1
2 | 11
3 | 21
4 | 1211
5 | 111221
6 | 312211
7 | 13112221
8 | 1113213211
9 | 31131211131221
10 | 13211311123113112211
11 | 11131221133112132113212221
12 | 3113112221232112111312211312113211
13 | 1321132132111213122112311311222113111221131221
14 | 11131221131211131231121113112221121321132132211331222113112211
15 | 311311222113111231131112132112311321322112111312211312111322212311322113212221
--------------------------------------------------------------------------------
/sparrow/data/networks/asphericity/README:
--------------------------------------------------------------------------------
1 | # To Be Trained
2 |
--------------------------------------------------------------------------------
/sparrow/data/networks/asphericity/asphericity_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/asphericity/asphericity_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/asphericity/asphericity_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/asphericity/asphericity_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/dssp/dssp_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/dssp/dssp_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/dssp/dssp_predictor_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/dssp/dssp_predictor_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/nuclear_export_signal/nes_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/nuclear_export_signal/nes_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/nuclear_import_signal/nls_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/nuclear_import_signal/nls_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/phosphorylation/ser_phosphorylation_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/ser_phosphorylation_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/phosphorylation/thr_phosphorylation_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/thr_phosphorylation_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/phosphorylation/tyr_phosphorylation_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/tyr_phosphorylation_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/prefactor/README:
--------------------------------------------------------------------------------
1 | # To Be Trained
2 |
--------------------------------------------------------------------------------
/sparrow/data/networks/prefactor/prefactor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/prefactor/prefactor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/prefactor/prefactor_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/prefactor/prefactor_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/pscore/pscore_predictor_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/pscore/pscore_predictor_network_v3.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v3.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/pscore/pscore_predictor_network_v4.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v4.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/re/README:
--------------------------------------------------------------------------------
1 | ## The end-to-end networks are named 're', although the actual predictor class is 'e2e'. This is a rare exception where the network name and predictor class name do not match: if the predictor class/module were named 're' it would clash with Python's built-in regular expression module ('re'), so for code sanity the predictor class and module are 'e2e' even though the networks are 're'.
2 |
--------------------------------------------------------------------------------
/sparrow/data/networks/re/re_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/re/re_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/re/re_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/re/re_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/rg/README:
--------------------------------------------------------------------------------
1 | # Proof of concept network
2 |
--------------------------------------------------------------------------------
/sparrow/data/networks/rg/rg_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/rg/rg_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/rg/rg_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/rg/rg_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/scaled_re/README:
--------------------------------------------------------------------------------
1 | # To Be Trained
2 |
--------------------------------------------------------------------------------
/sparrow/data/networks/scaled_re/scaled_re_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_re/scaled_re_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/scaled_re/scaled_re_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_re/scaled_re_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/scaled_rg/README:
--------------------------------------------------------------------------------
1 | # To Be Trained
2 |
--------------------------------------------------------------------------------
/sparrow/data/networks/scaled_rg/scaled_rg_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_rg/scaled_rg_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/scaled_rg/scaled_rg_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_rg/scaled_rg_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/scaling_exponent/README:
--------------------------------------------------------------------------------
1 | # To Be Trained
2 |
3 | v1.5 was never assessed or validated and shouldn't be used
4 |
--------------------------------------------------------------------------------
/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.5.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.5.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v2.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/transactivation_domains/tad_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transactivation_domains/tad_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v1.pt
--------------------------------------------------------------------------------
/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v4.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v4.pt
--------------------------------------------------------------------------------
/sparrow/patterning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/patterning/__init__.py
--------------------------------------------------------------------------------
/sparrow/patterning/scd.pyx:
--------------------------------------------------------------------------------
1 | # cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False
2 | import numpy as np
3 | cimport numpy as np
4 | from cython.view cimport array
5 | from libc.math cimport sqrt,abs, fabs
6 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS
7 | from sparrow.sparrow_exceptions import SparrowException
8 |
9 | # Define a typed memoryview for efficient access to numpy arrays
10 | ctypedef np.float64_t DOUBLE_t
11 | ctypedef np.int64_t INT64_t
12 |
13 | cdef dict DEFAULT_HYDRO_DICT = {'A': 0.730, 'R': 0.000, 'N': 0.432, 'D': 0.378, 'C': 0.595, 'Q': 0.514, 'E': 0.459,
14 | 'G': 0.649, 'H': 0.514, 'I': 0.973, 'L': 0.973, 'K': 0.514, 'M': 0.838, 'F': 1.000,
15 | 'P': 1.000, 'S': 0.595, 'T': 0.676, 'W': 0.946, 'Y': 0.865, 'V': 0.892}
16 | # Compute the sequence charge decoration (SCD): a length-normalized sum over residue
17 | # pairs of q_m * q_n * sqrt(m - n), with q = -1 for group1 and q = +1 for group2.
18 | cpdef double compute_scd_x(str sequence, group1=['E','D'], group2=['R','K']):
19 | cdef int m, n, seqlen
20 | cdef double total, m_val, n_val, charge_val, final_val
21 | cdef int cur_m_charge, cur_n_charge
22 | cdef char cur_m_res, cur_n_res
23 |
24 | # Pre-calculate group membership
25 | cdef int[:] group_membership = np.zeros(256, dtype=np.int32)
26 | for residue in group1:
27 | group_membership[ord(residue)] = -1
28 | for residue in group2:
29 | group_membership[ord(residue)] = 1
30 |
31 | total = 0
32 | seqlen = len(sequence)
33 |
34 | # Convert sequence to array of integers
35 | cdef int[:] sequence_array = np.array([ord(char) for char in sequence], dtype=np.int32)
36 |
37 | for m in range(1, seqlen):
38 | m_val = m + 1
39 |
40 | for n in range(0, m-1):
41 | n_val = n + 1
42 |
43 | # Access residues using array indexing
44 | cur_m_res = sequence_array[m]
45 | cur_n_res = sequence_array[n]
46 |
47 | # Retrieve group charge
48 | cur_m_charge = group_membership[cur_m_res]
49 | cur_n_charge = group_membership[cur_n_res]
50 |
51 | charge_val = cur_m_charge * cur_n_charge
52 | final_val = charge_val * sqrt(m_val - n_val)
53 | total += final_val
54 |
55 | return total / seqlen
56 |
57 | cdef validate_sequence(str seq, dict hydro_dict):
58 | cdef set all_res = set(seq)
59 | for res in all_res:
60 | if res not in hydro_dict:
61 | raise ValueError(f'When calculating SHD the hydrophobicity dictionary lacked the residue {res}')
62 |
63 | cpdef double compute_shd(str seq, dict hydro_dict=None):
64 | """
65 | Function takes in a sequence and returns the Sequence
66 | Hydropathy Decoration (SHD), i.e. the patterning of hydrophobic
67 | residues in the sequence. This is computed as defined in ref [1].
68 |
69 | As an optional parameter this function can take in a predefined
70 | hydropathy conversion dictionary for the amino acids, where the keys
71 | are Amino acids and values are floats.
72 |
73 | If a conversion dict is not provided the following conversion is used:
74 |
75 | 'A': 0.730,
76 | 'R': 0.000,
77 | 'N': 0.432,
78 | 'D': 0.378,
79 | 'C': 0.595,
80 | 'Q': 0.514,
81 | 'E': 0.459,
82 | 'G': 0.649,
83 | 'H': 0.514,
84 | 'I': 0.973,
85 | 'L': 0.973,
86 | 'K': 0.514,
87 | 'M': 0.838,
88 | 'F': 1.000,
89 | 'P': 1.000,
90 | 'S': 0.595,
91 | 'T': 0.676,
92 | 'W': 0.946,
93 | 'Y': 0.865,
94 | 'V': 0.892,
95 |
96 | These are the normalized hydrophobicity values used in ref [1].
97 |
98 | Parameters
99 | ------------
100 | seq : str
101 | Amino acid sequence passed as string
102 |
103 | hydro_dict : dict
104 | Dictionary that maps amino acid to hydrophobicity score
105 | (optional).
106 |
107 | Returns
108 | -----------
109 | float
110 | Returns a floating point value that reports on the sequence
111 | hydropathy decoration. This in principle should be a positive
112 | number.
113 |
114 | References
115 | --------------
116 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J. Hydropathy Patterning
117 | Complements Charge Patterning to Describe Conformational Preferences of Disordered
118 | Proteins. J. Phys. Chem. Lett. (2020). doi:10.1021/acs.jpclett.0c00288
119 | """
120 | if hydro_dict is None:
121 | hydro_dict = DEFAULT_HYDRO_DICT
122 |
123 | validate_sequence(seq, hydro_dict)
124 |
125 | cdef Py_ssize_t N = len(seq)
126 | cdef double[:] h = np.array([hydro_dict[res] for res in seq], dtype=np.double)
127 | cdef double t = 0.0
128 | cdef Py_ssize_t m, n
129 |
130 | for m in range(1, N):
131 | for n in range(m-1):
132 | t += (h[m] + h[n]) / abs(m - n)
133 |
134 | return t / N
135 |
136 |
--------------------------------------------------------------------------------
/sparrow/polymer/scaling_parameters.py:
--------------------------------------------------------------------------------
1 | from sparrow.patterning import scd
2 | import numpy as np
3 |
4 | def compute_nu_zheng2020(seq):
5 | """
6 | Function takes in a sequence and returns a calculated nu scaling value
7 | from the Sequence Hydropathy Decoration (SHD) and Sequence Charge Decoration (SCD):
8 | 
9 | nu = -0.0423*SHD + 0.0074*SCD + 0.701
10 | 
11 | This equation for predicting nu is adopted from Zheng et al. [1].
12 |
13 | Parameters
14 | ------------------
15 | seq : str
16 | Amino acid sequence (must be valid amino acids only)
17 |
18 | Returns
19 | ------------------
20 | float
21 | Returns the predicted scaling exponent (nu), a dimensionless
22 | parameter which should fall between 0.33 and 0.6 (in theory).
23 |
24 | References
25 | ---------------
26 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J.
27 | Hydropathy Patterning Complements Charge Patterning to Describe
28 | Conformational Preferences of Disordered Proteins. J. Phys.
29 | Chem. Lett. (2020). doi:10.1021/acs.jpclett.0c00288
30 |
31 | """
32 |
33 | SHD = scd.compute_shd(seq)
34 | SCD = scd.compute_scd_x(seq)
35 |
36 | # calculate Nu from SHD and SCD
37 | nu = (-0.0423*SHD)+(0.0074*SCD)+0.701
38 |
39 | return nu
40 |
41 |
42 |
43 | def compute_rg_zheng2020(seq):
44 | """
45 | Function that takes in an amino acid sequence and computes the
46 | expected radius of gyration using the nu-dependent Rg as developed by
47 | Zheng et al.
48 |
49 | Parameters
50 | ------------------
51 | seq : str
52 | Amino acid sequence (must be valid amino acids only)
53 |
54 | Returns
55 | ------------------
56 | float
57 | Returns the predicted radius of gyration in Angstroms
58 |
59 | References
60 | ---------------
61 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J.
62 | Hydropathy Patterning Complements Charge Patterning to Describe
63 | Conformational Preferences of Disordered Proteins. J. Phys.
64 | Chem. Lett. (2020). doi:10.1021/acs.jpclett.0c00288
65 | """
66 | nu = compute_nu_zheng2020(seq)
67 |
68 | gamma = 1.1615
69 | b = 5.5 # note in Angstroms instead of nanometers
70 | N = len(seq)
71 |
72 | numerator = gamma*(gamma+1)
73 |
74 | denominator = 2*(gamma+2*nu)*(gamma+2*nu+1)
75 |
76 | return np.sqrt(numerator/denominator)*b*np.power(N,nu)
77 |
78 |
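79 | # A minimal usage sketch (illustrative; the sequence is hypothetical and the
80 | # comments describe expected behaviour, not reference values). Note that
81 | # compute_shd/compute_scd_x live in the compiled Cython module
82 | # sparrow.patterning.scd.
83 | #
84 | #   from sparrow.polymer.scaling_parameters import (compute_nu_zheng2020,
85 | #                                                   compute_rg_zheng2020)
86 | #
87 | #   seq = 'GSGSGSGSGSEDEDKRKRGSGS'   # hypothetical sequence
88 | #   nu = compute_nu_zheng2020(seq)   # should fall roughly between 0.33 and 0.6
89 | #   rg = compute_rg_zheng2020(seq)   # Angstroms; scales as b * N**nu with b = 5.5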
--------------------------------------------------------------------------------
/sparrow/predictors/asphericity/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/asphericity/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/dssp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/dssp/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/e2e/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/mitochondrial_targeting/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/mitochondrial_targeting/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/mitochondrial_targeting/mitochondrial_targeting_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 |
4 | import sparrow
5 |
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 |
11 |
12 |
13 |
14 | DEFAULT_VERSION="1"
15 |
16 |
17 | class MitochondrialTargetingPredictor():
18 | """
19 |
20 | Class that loads in a network such that predict_mitochondrial_targeting() can be called to predict
21 | mitochondrial targeting for a sequence.
22 |
23 | """
24 | def __init__(self, version=None):
25 | """
26 | Constructor for building a MitochondrialTargetingPredictor object. The version keyword allows specific
27 | version(s) of the trained network associated with the underlying predictor to be defined.
28 | By default, it's set to None, which leads to the current best/default network being selected
29 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
30 | the ability to pass a string as version. This string is inserted at the {version} position in the filename
31 |
32 | mitochondrial_targeting_predictor_network_v{version}.pt
33 |
34 | i.e. no need to include the "v" part or the .pt extension
35 |
36 | """
37 |
38 | if version is None:
39 | version = DEFAULT_VERSION
40 |
41 | saved_weights = sparrow.get_data(f'networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v{version}.pt')
42 |
43 | if not os.path.isfile(saved_weights):
44 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__))
45 |
46 |
47 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
48 |
49 |
50 | # Dynamically read in correct hyperparameters:
51 | num_layers = 0
52 | while True:
53 | s = f'lstm.weight_ih_l{num_layers}'
54 | try:
55 | temp = loaded_model[s]
56 | num_layers += 1
57 | except KeyError:
58 | break
59 |
60 |
61 | ## determine the number of classes; note you may need to change the key names here
62 | # if the saved network has a leading 'module.' prefix on them
63 | number_of_classes = np.shape(loaded_model['fc.bias'])[0]
64 | input_size = 20 # (hardcoded at 20 for 20 amino acids)
65 |
66 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
67 |
68 |
69 | # set these here so we can sanity check if needed
70 | self.number_of_classes = number_of_classes
71 | self.input_size = input_size
72 | self.number_of_layers = num_layers
73 | self.hidden_vector_size = hidden_vector_size
74 |
75 | # Instantiate network weights into object
76 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
77 |
78 | self.network.load_state_dict(loaded_model)
79 |
80 |
81 |
82 | def predict_mitochondrial_targeting(self, seq):
83 | """
84 | Prediction function. seq should be a valid amino acid sequence.
85 |
86 | NOTE that this assumes mitochondrial targeting sequences (MTSs) are
87 | N-terminal, so truncates anything over 168 residues. This threshold
88 | was empirically determined based on the set of annotated MTSs.
89 |
90 | Parameters
91 | ------------
92 | seq : str
93 | Valid amino acid sequence
94 |
95 | Returns
96 | ----------
97 | list
98 | Returns a list the length of the sequence where each position holds the
99 | predicted mitochondrial-targeting class for that residue (0 = no targeting).
100 |
101 | """
102 |
103 | # convert sequence to uppercase
104 | seq = seq.upper()
105 |
106 | # truncate to the first 168 residues - shorter sequences are used in full
107 | sub_seq = seq[0:168]
108 |
109 | # Convert to one-hot sequence vector
110 | seq_vector = encode_sequence.one_hot(sub_seq)
111 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting
112 |
113 | # Forward pass - this is specific to classification
114 | prediction = self.network(seq_vector.float()).detach().numpy()
115 | int_vals = []
116 | for row in prediction[0]:
117 | int_vals.append(np.argmax(row))
118 |
119 | prediction = int_vals
120 |
121 | # append 0s for the remainder of the sequence
122 | extra = [0]*(len(seq)-len(sub_seq))
123 |
124 | prediction.extend(extra)
125 | # return prediction + extra zeros
126 | return prediction
127 |
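128 | # A minimal usage sketch (illustrative; requires the parrot dependency and the
129 | # bundled network weights; the sequence below is hypothetical):
130 | #
131 | #   from sparrow.predictors.mitochondrial_targeting.mitochondrial_targeting_predictor import MitochondrialTargetingPredictor
132 | #
133 | #   mtp = MitochondrialTargetingPredictor()   # loads the default (v1) network
134 | #   mask = mtp.predict_mitochondrial_targeting('MLSRAVCGTSRQLAPALGYLGSRQ')
135 | #   # mask is a per-residue list; nonzero entries mark predicted MTS residues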
--------------------------------------------------------------------------------
/sparrow/predictors/nes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/nes/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/nes/nuclear_export_signal_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 |
4 | import sparrow
5 |
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 |
11 |
12 | """
13 | NB: This network and predictor were imported from GOOSE, so they are subtly different internally from how
14 | some of the other predictors work. Notably this one includes a softmax projection and the loop
15 | below to convert raw outputs into probabilities - this may be because these networks have 2 layers
16 | whereas the others only have one? Anyway, just making a note of this in case we need to debug in
17 | the future.
18 |
19 | score = []
20 | for val in prediction:
21 | score.append(round(val[1],5))
22 |
23 |
24 | """
25 |
26 | # NOTE - this is where you can define the version number that is read by default. If you add a new network MAKE SURE you update
27 | # this default if you want that new network to be used by default
28 | DEFAULT_VERSION="1"
29 |
30 | def softmax(v):
31 | return (np.e ** v) / np.sum(np.e ** v)
32 |
33 |
34 | ## CHANGE class name
35 | class NESPredictor():
36 | """
37 |
38 | Class that loads in a network such that nuclear export signals can be predicted.
39 |
40 | """
41 | def __init__(self, version=None):
42 | """
43 | Constructor for building a predictor object. The version keyword allows specific
44 | version(s) of the trained network associated with the predictor to be defined.
45 |
46 | By default, it's set to None, which leads to the current best/default network being selected
47 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
48 | the ability to pass a string as version. This string is inserted at the {version} position in the filename
49 | 
50 | nes_predictor_network_v{version}.pt
51 |
52 | i.e. no need to include the "v" part or the .pt extension
53 |
54 | """
55 |
56 |
57 |
58 | # if no version provided use default, then grab path and check that file actually exists!
59 | if version is None:
60 | version = DEFAULT_VERSION
61 |
62 | # Build the path to the version-specific network file. Keep the
63 | # network_v{version}.pt suffix because this is how a version-specific file is selected
64 | saved_weights = sparrow.get_data(f'networks/nuclear_export_signal/nes_predictor_network_v{version}.pt')
65 |
66 | if not os.path.isfile(saved_weights):
67 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__))
68 |
69 |
70 | # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU mainly because
71 | # we know everyone has a CPU...
72 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
73 |
74 | ## DELETE ME PROBABLY
75 | # this block of code is relevant ONLY if the trained network has a strange
76 | # 'module.' prefix appended to the start of every keyword. This may happen in older
77 | # versions of PARROT (see DSSP predictor as an example of where it's needed) but in
78 | # 2022 trained networks didn't need this. As such, this can PROBABLY be deleted but
79 | # in case you're using an older network we've kept this to make things simple
80 |
81 | #for i in range(len(loaded_model)):
82 | # key, value = loaded_model.popitem(last=False)
83 | # new_key = key[7:]
84 | # loaded_model[new_key] = value
85 | ## END OF DELETE ME PROBABLY
86 |
87 |
88 | # Dynamically calculate the hyperparameters used to train the network.
89 | ## NOTE:
90 | #
91 | # The code here works on networks trained using the current version of PARROT (2022), HOWEVER, it's possible
92 | # that in previous versions the keys into the parameter file may be different or may have a prefix. The best example
93 | # of this is that for the DSSP predictor the word 'module.' appears in front of each keyword. If you
94 | # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this
95 | # keyword.
96 |
97 | # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords
98 | # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to
99 | # reach out to Alex or Dan about this!
100 |
101 | num_layers = 0
102 | while True:
103 | s = f'lstm.weight_ih_l{num_layers}'
104 | try:
105 | temp = loaded_model[s]
106 | num_layers += 1
107 | except KeyError:
108 | break
109 |
110 | number_of_classes = np.shape(loaded_model['fc.bias'])[0]
111 |
112 | # Hard coded because we always use one-hot encoding, note that if you trained a specific
113 | # predictor on a different encoding scheme you could, of course, here simply define that
114 | # encoding scheme
115 | input_size = 20
116 |
117 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
118 |
119 | # set these here so we can sanity check if needed
120 | self.number_of_classes = number_of_classes
121 | self.input_size = input_size
122 | self.number_of_layers = num_layers
123 | self.hidden_vector_size = hidden_vector_size
124 |
125 | # Instantiate network weights into object
126 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
127 |
128 | # load parameters into model
129 | self.network.load_state_dict(loaded_model)
130 |
131 |
132 |
133 | def predict_nuclear_export_signal(self, seq):
134 | """
135 | Function to predict the presence of nuclear export signals. Returns a per
136 | residue probability score of a residue being in an NES or not
137 |
138 | Parameters
139 | ------------
140 | seq : str
141 | Valid amino acid sequence
142 |
143 | Returns
144 | ----------
145 | list
146 | Returns a list the length of the sequence where each position
147 | gives the probability of that residue being part of an NES (rounded to 5 decimal places)
148 |
149 | """
150 |
151 | # convert sequence to uppercase
152 | seq = seq.upper()
153 |
154 | # Convert to one-hot sequence vector - note, as mentioned above if you
155 | # didn't use one-hot in the original training you could just edit this here
156 | seq_vector = encode_sequence.one_hot(seq)
157 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting
158 |
159 |
160 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
161 | ## CHANGE CODE BELOW HERE ##
162 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
163 |
164 |
165 | ## CLASSIFICATION CODE BLOCK
166 | # The block below should be kept if we're doing a classification
167 | # based prediction! if not, comment this out or delete it
168 | #prediction = self.network(seq_vector.float()).detach().numpy()
169 | #int_vals = []
170 | #for row in prediction[0]:
171 | # int_vals.append(np.argmax(row))
172 |
173 | #prediction = int_vals
174 |
175 | ## REGRESSION CODE BLOCK
176 | # This block should be kept if we're doing a regression-based
177 | # prediction. If not, comment this out or delete it
178 | prediction = self.network(seq_vector.float()).detach().numpy().flatten()
179 |
180 | prediction = prediction.reshape(-1, self.number_of_classes)
181 | prediction = np.array(list(map(softmax, prediction)))
182 |
183 | # finally we extract out local probabilities
184 | score = []
185 | for val in prediction:
186 | score.append(round(val[1],5))
187 |
188 | return score
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
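202 | # A minimal usage sketch (illustrative; requires the parrot dependency and the
203 | # bundled network weights; the sequence below is hypothetical):
204 | #
205 | #   from sparrow.predictors.nes.nuclear_export_signal_predictor import NESPredictor
206 | #
207 | #   nes = NESPredictor()                      # loads the default (v1) network
208 | #   probs = nes.predict_nuclear_export_signal('MSELQQKLAELDLELFDLMV')
209 | #   # probs[i] is the softmax-derived probability that residue i lies in an NES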
--------------------------------------------------------------------------------
/sparrow/predictors/nls/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/nls/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/nls/nuclear_import_signal_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 |
4 | import sparrow
5 |
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 |
11 |
12 | """
13 | NB: This network and predictor were imported from GOOSE, so they are subtly different internally from how
14 | some of the other predictors work. Notably this one includes a softmax projection and the loop
15 | below to convert raw outputs into probabilities - this may be because these networks have 2 layers
16 | whereas the others only have one? Anyway, just making a note of this in case we need to debug in
17 | the future.
18 |
19 | score = []
20 | for val in prediction:
21 | score.append(round(val[1],5))
22 |
23 |
24 | """
25 |
26 | # NOTE - this is where you can define the version number that is read by default. If you add a new network MAKE SURE you update
27 | # this default if you want that new network to be used by default
28 | DEFAULT_VERSION="1"
29 |
30 | def softmax(v):
31 | return (np.e ** v) / np.sum(np.e ** v)
32 |
33 |
34 | ## CHANGE class name
35 | class NLSPredictor():
36 | """
37 |
38 | Class that loads in a network such that predict_nuclear_import_signal() can be called to predict
39 | nuclear import signals from a sequence.
40 |
41 | """
42 | def __init__(self, version=None):
43 | """
44 | Constructor for building a predictor object. The version keyword allows specific
45 | version(s) of the trained network associated with the predictor to be defined.
46 |
47 | By default, it's set to None, which leads to the current best/default network being selected
48 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
49 | the ability to pass a string as version. This string is inserted at the {version} position in the filename
50 | 
51 | nls_predictor_network_v{version}.pt
52 |
53 | i.e. no need to include the "v" part or the .pt extension
54 |
55 | """
56 |
57 |
58 |
59 | # if no version provided use default, then grab path and check that file actually exists!
60 | if version is None:
61 | version = DEFAULT_VERSION
62 |
63 | # Build the path to the version-specific network file. Keep the
64 | # network_v{version}.pt suffix because this is how a version-specific file is selected
65 | saved_weights = sparrow.get_data(f'networks/nuclear_import_signal/nls_predictor_network_v{version}.pt')
66 |
67 | if not os.path.isfile(saved_weights):
68 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__))
69 |
70 |
71 | # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU mainly because
72 | # we know everyone has a CPU...
73 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
74 |
75 | ## DELETE ME PROBABLY
76 | # this block of code is relevant ONLY if the trained network has a strange
77 | # 'module.' prefix appended to the start of every keyword. This may happen in older
78 | # versions of PARROT (see DSSP predictor as an example of where it's needed) but in
79 | # 2022 trained networks didn't need this. As such, this can PROBABLY be deleted but
80 | # in case you're using an older network we've kept this to make things simple
81 |
82 | #for i in range(len(loaded_model)):
83 | # key, value = loaded_model.popitem(last=False)
84 | # new_key = key[7:]
85 | # loaded_model[new_key] = value
86 | ## END OF DELETE ME PROBABLY
87 |
88 |
89 | # Dynamically calculate the hyperparameters used to train the network.
90 | ## NOTE:
91 | #
92 | # The code here works on networks trained using the current version of PARROT (2022), HOWEVER, it's possible
93 | # that in previous versions the keys into the parameter file may be different or may have a prefix. The best example
94 | # of this is that for the DSSP predictor the word 'module.' appears in front of each keyword. If you
95 | # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this
96 | # keyword.
97 |
98 | # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords
99 | # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to
100 | # reach out to Alex or Dan about this!
101 |
102 | num_layers = 0
103 | while True:
104 | s = f'lstm.weight_ih_l{num_layers}'
105 | try:
106 | temp = loaded_model[s]
107 | num_layers += 1
108 | except KeyError:
109 | break
110 |
111 | number_of_classes = np.shape(loaded_model['fc.bias'])[0]
112 |
113 | # Hard coded because we always use one-hot encoding, note that if you trained a specific
114 | # predictor on a different encoding scheme you could, of course, here simply define that
115 | # encoding scheme
116 | input_size = 20
117 |
118 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
119 |
120 | # set these here so we can sanity check if needed
121 | self.number_of_classes = number_of_classes
122 | self.input_size = input_size
123 | self.number_of_layers = num_layers
124 | self.hidden_vector_size = hidden_vector_size
125 |
126 | # Instantiate network weights into object
127 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
128 |
129 | # load parameters into model
130 | self.network.load_state_dict(loaded_model)
131 |
132 |
133 |
134 | def predict_nuclear_import_signal(self, seq):
135 | """
136 | Function to predict the presence of nuclear import signals. Returns a per
137 | residue probability score of a residue being in an NLS or not
138 |
139 | Parameters
140 | ------------
141 | seq : str
142 | Valid amino acid sequence
143 |
144 | Returns
145 | ----------
146 | list
147 | Returns a list the length of the sequence where each position
148 | gives the probability of that residue being part of an NLS (rounded to 5 decimal places)
149 |
150 | """
151 |
152 | # convert sequence to uppercase
153 | seq = seq.upper()
154 |
155 | # Convert to one-hot sequence vector - note, as mentioned above if you
156 | # didn't use one-hot in the original training you could just edit this here
157 | seq_vector = encode_sequence.one_hot(seq)
158 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting
159 |
160 |
161 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
162 | ## CHANGE CODE BELOW HERE ##
163 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
164 |
165 |
166 | ## CLASSIFICATION CODE BLOCK
167 | # The block below should be kept if we're doing a classification
168 | # based prediction! if not, comment this out or delete it
169 | #prediction = self.network(seq_vector.float()).detach().numpy()
170 | #int_vals = []
171 | #for row in prediction[0]:
172 | # int_vals.append(np.argmax(row))
173 |
174 | #prediction = int_vals
175 |
176 | ## REGRESSION CODE BLOCK
177 | # This block should be kept if we're doing a regression-based
178 | # prediction. If not, comment this out or delete it
179 | prediction = self.network(seq_vector.float()).detach().numpy().flatten()
180 |
181 | prediction = prediction.reshape(-1, self.number_of_classes)
182 | prediction = np.array(list(map(softmax, prediction)))
183 |
184 |
185 | ## CLIP
186 | # IF we want to ensure we have a value between 0 and 1 the clipping here
187 | # will do that. If not leave commented
188 | #prediction = np.clip(prediction, 0.0, 1.0)
189 |
190 | # finally we extract out local probabilities
191 | score = []
192 | for val in prediction:
193 | score.append(round(val[1],5))
194 |
195 | return score
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
--------------------------------------------------------------------------------
/sparrow/predictors/phosphorylation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/phosphorylation/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/phosphorylation/phospho_predictor_utils.py:
--------------------------------------------------------------------------------
1 | from sparrow.sparrow_exceptions import SparrowException
2 |
3 | def return_hits(seq, phospho_probability, target_res, windowsize=4, threshold=0.6, return_sites_only=False):
4 | """
5 | Function that parses through a sequence and annotated phosphosite
6 | probabilities to extract out specific positions or a per-residue
7 | binary mask of phosphorylation or non-phosphorylation.
8 |
9 | This function works by sliding a +/- windowsize window across the
10 | sequence and if the central residue in that window has a probability
11 | > threshold then all the target_res in that window are set to be
12 | putative phosphosites.
13 |
14 | Parameters
15 | --------------
16 | seq : str
17 | Amino acid sequence
18 |
19 | phospho_probability : list
20 | A list with per-residue probabilities for a residue to have been
21 | phosphorylated or not.
22 |
23 | windowsize : int
24 | Define the size of the window this algorithm uses to extend the
25 | influence of a local phosphosite probability. Note the windowsize
26 | gets applied +/- a central position
27 |
28 | target_res : str
29 | A string with a single residue which each residue in the sequence
30 | is compared against.
31 |
32 | threshold : float
33 | A threshold value used to delineate phosphosites for masking.
34 | Default is 0.6.
35 |
36 | return_sites_only : bool
37 | A flag which, if set to True, means the function returns only the positions
38 | found in a list. If set to False the function returns a binary mask
39 | list equal in length to the sequence, where '1's mean the residue
40 | is predicted to be a phosphosite and '0' mean they're not. Default
41 | is False.
42 |
43 | Returns
44 | -----------
45 | list
46 | Returns EITHER a list (len == seq) if return_sites_only = False which
47 | contains a per-residue phosphomask (i.e. 1 = phosphosite, 0 = not) OR
48 | returns a list of index positions that correspond to phosphosites.
49 | 
50 | If return_sites_only is True, the function guarantees the indices
51 | returned are in ascending numerical order.
52 |
53 | """
54 |
55 | ## sanity checking first
56 | if len(target_res) != 1:
57 | raise SparrowException('Target res must be a single amino acid')
58 |
59 | if threshold > 1 or threshold < 0:
60 | raise SparrowException('Probability threshold used in phosphosite masking must be between 0 and 1')
61 |
62 | if windowsize < 1:
63 | raise SparrowException('Window size must be a positive integer')
64 |
65 | if len(seq) != len(phospho_probability):
66 | raise SparrowException('Sequence length and probability vector must be the same length')
67 |
68 |
69 | seqlen = len(seq)
70 |
71 | potential_hits = set([])
72 |
73 | if seqlen < (2*windowsize)+1:
74 | raise SparrowException(f'Cannot predict phosphosites when the sequence length is less than 1+{2*windowsize}. NB: length = {seqlen}')
75 |
76 | # for each residue
77 | for idx, res in enumerate(seq):
78 |
79 | # if this is a low-probability residue skip and move on
80 | if phospho_probability[idx] < threshold:
81 | continue
82 |
83 | # if we're in the N-terminal residues just excise out a fragment of
84 | # varying size until we get into the sequence
85 | if idx < windowsize:
86 | slice_start = 0
87 | current_slice = seq[slice_start:idx+windowsize]
88 |
89 | # while in the 'middle' of the sequence
90 | elif idx >= windowsize and idx <= (seqlen - (windowsize+1)):
91 | slice_start = idx-windowsize
92 | current_slice = seq[slice_start:idx+windowsize]
93 |
94 | # at the C-terminus
95 | else:
96 | slice_start = idx-windowsize
97 | current_slice = seq[slice_start:]
98 |
99 | # for each residue in the
100 | for local_idx, aa in enumerate(current_slice):
101 | if aa == target_res:
102 | global_pos = local_idx + slice_start
103 |
104 | if global_pos not in potential_hits:
105 | potential_hits.add(global_pos)
106 |
107 |
108 | # if we just want to return the phosphoindices. Note
109 | # we sort these to guarantee the order of return.
110 | if return_sites_only:
111 | return sorted(list(potential_hits))
112 | else:
113 |
114 | return_list = []
115 | for i in range(0,len(seq)):
116 | if i in potential_hits:
117 | return_list.append(1)
118 | else:
119 | return_list.append(0)
120 |
121 | return return_list
122 |
123 |
124 |
125 |
126 |
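127 | # A minimal usage sketch (illustrative; the probability vector is hypothetical,
128 | # standing in for the output of a phosphorylation network):
129 | #
130 | #   seq   = 'AASAAAAAASAAA'
131 | #   probs = [0.1, 0.1, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1]
132 | #
133 | #   return_hits(seq, probs, 'S')                          # mask with a 1 at index 2 only
134 | #   return_hits(seq, probs, 'S', return_sites_only=True)  # [2]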
--------------------------------------------------------------------------------
/sparrow/predictors/prefactor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/prefactor/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/pscore/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/pscore/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/pscore/pscore_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 |
4 | import sparrow
5 |
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 |
11 |
12 |
13 | """
14 | Predictor template file. This data file should, in principle, require
15 | minimal editing to convert into a specific predictor based on a copied
16 | network file found in sparrow/data/networks/. Some general
17 | guidelines below (also included in the predictor documentation) and inline
18 | comments on things you will want to change. This code WILL NOT RUN as is and
19 | requires you to update missing variables to customize the predictor!!
20 |
21 | Missing values will be enclosed in < > to indicate this is where you (the
22 | software developer) must add some content
23 |
24 |
25 | ## Nomenclature
26 |
27 | 1. The predictor file should be called <name>_predictor.py
28 | 2. This should be inside a module in the /predictors/ directory called <name>
29 | 3. The single class this module implements should be called <Name>Predictor
30 |
31 |
32 | ## Class structure
33 |
34 | The class should have (at least) two functions:
35 |
36 | 1. A constructor (__init__()) which PRE LOADS the network from sparrow/data/networks/<relevant_name> - the get_data() function
37 | is defined in sparrow/__init__.py and allows absolute-path access to the /data directory. The constructor should
38 | FULLY load the network along with standard PARROT-style options, as shown here. Trained networks should be versioned and
39 | implemented so previous versions can be chosen even if the default version changes
40 |
41 | 2. Define a function called predict_<name>(self, seq) where <name> is a convenient name that obviously means this is
42 | what the function does.
43 |
44 | The idea is that this class should actually be completely standalone, independent of sparrow - i.e. one should be able to run
45 |
46 | >> from sparrow.predictors.<name> import <Name>Predictor
47 | >>
48 | >> P = <Name>Predictor()
49 | >> P.predict_<name>('myvalidseqnce')
50 |
51 | And have it work!
52 |
53 |
54 |
55 |
56 |
57 | """
58 |
59 | # NOTE - this is where you can define the version number that is read by default. If you add a new network MAKE SURE you update
60 | # this default if you want that new network to be used by default
61 | DEFAULT_VERSION="4"
62 |
63 |
64 | ## CHANGE class name
65 | class PScorePredictor():
66 | """
67 |
68 | Class that loads in a network such that predict_pscore() can be called to predict
69 | PScore propensity from a sequence.
70 |
71 | """
72 | def __init__(self, version=None):
73 | """
74 |         Constructor for building a predictor object. The version keyword allows specific
75 | version(s) of the trained network associated with the predictor to be defined.
76 |
77 | By default, it's set to None, which leads to the current best/default network being selected
78 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
79 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
80 |
81 |             pscore_predictor_network_v<version>.pt
82 |
83 |         i.e. no need to include the "v" part or the .pt extension
84 |
85 | """
86 |
87 |
88 |
89 | # if no version provided use default, then grab path and check that file actually exists!
90 | if version is None:
91 | version = DEFAULT_VERSION
92 |
93 |         # CHANGE THIS!! Make sure you change the <directory> and <filename> to the appropriate
94 |         # paths. Keep the network_v{version}.pt suffix because this is how a version-specific file is selected
95 | saved_weights = sparrow.get_data(f'networks/pscore/pscore_predictor_network_v{version}.pt')
96 |
97 | if not os.path.isfile(saved_weights):
98 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__))
99 |
100 |
101 |         # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU mainly because
102 |         # we know everyone has a CPU...
103 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
104 |
105 | ## DELETE ME PROBABLY
106 |         # this block of code is relevant ONLY if the trained network has the strange
107 |         # 'module.' text appended to the start of every keyword. This may happen in older
108 |         # versions of PARROT (see DSSP predictor as an example of where it's needed) but in
109 |         # 2022 trained networks didn't need this. As such, this can PROBABLY be deleted but
110 |         # in case you're using an older network we've kept this to make things simple
111 |
112 | for i in range(len(loaded_model)):
113 | key, value = loaded_model.popitem(last=False)
114 | new_key = key[7:]
115 | loaded_model[new_key] = value
116 | ## END OF DELETE ME PROBABLY
117 |
118 |
119 | # Dynamically calculate the hyperparameters used to train the network.
120 | ## NOTE:
121 | #
122 | # The code here works on networks trained using the current version of PARROT (2022), HOWEVER, it's possible
123 |         # that in previous versions the keys into the parameter file may be different or may have a prefix. Best example
124 |         # of this is that for the DSSP predictor the word `module.` appears in front of each keyword. If you
125 |         # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this
126 |         # prefix.
127 |
128 |         # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords
129 | # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to
130 | # reach out to Alex or Dan about this!
131 |
132 | num_layers = 0
133 | while True:
134 | s = f'lstm.weight_ih_l{num_layers}'
135 | try:
136 | temp = loaded_model[s]
137 | num_layers += 1
138 | except KeyError:
139 | break
140 |
141 |
142 | number_of_classes = np.shape(loaded_model['fc.bias'])[0]
143 |
144 | # hard coded because we always use one-hot encoding, note that if you trained a specific
145 | # predictor on a different encoding scheme you could, of course, here simply define that
146 | # encoding scheme
147 | input_size = 20
148 |
149 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
150 |
151 | # set these here so we can sanity check if needed
152 | self.number_of_classes = number_of_classes
153 | self.input_size = input_size
154 | self.number_of_layers = num_layers
155 | self.hidden_vector_size = hidden_vector_size
156 |
157 | # Instantiate network weights into object
158 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
159 |
160 | # load parameters into model
161 | self.network.load_state_dict(loaded_model)
162 |
163 |
164 | ## CHANGE FUNCTION NAME
165 | def predict_pscore(self, seq):
166 | """
167 |
168 | Prediction function. seq should be a valid amino acid sequence.
169 |
170 | Parameters
171 | ------------
172 | seq : str
173 | Valid amino acid sequence
174 |
175 | Returns
176 | ----------
177 | np.ndarray
178 | Returns a 1D np.ndarray the length of the sequence where each position
179 | is the predicted value
180 |
181 | """
182 |
183 | # convert sequence to uppercase
184 | seq = seq.upper()
185 |
186 |         # Convert to one-hot sequence vector - note, as mentioned above if you
187 |         # didn't use one-hot in the original training you could just edit this here
188 | seq_vector = encode_sequence.one_hot(seq)
189 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting
190 |
191 |
192 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
193 | ## CHANGE CODE BELOW HERE ##
194 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
195 |
196 |
197 | ## CLASSIFICATION CODE BLOCK
198 | # The block below should be kept if we're doing a classification
199 | # based prediction! if not, comment this out or delete it
200 | #prediction = self.network(seq_vector.float()).detach().numpy()
201 | #int_vals = []
202 | #for row in prediction[0]:
203 | # int_vals.append(np.argmax(row))
204 |
205 | #prediction = int_vals
206 |
207 | ## REGRESSION CODE BLOCK
208 | # This block should be kept if we're doing a regression-based
209 | # prediction. If not, comment this out or delete it
210 | prediction = self.network(seq_vector.float()).detach().numpy().flatten()
211 |
212 |
213 | return prediction
214 |
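A minimal usage sketch for the class above, following the standalone pattern described in the template docstring (assumes parrot is installed and the bundled v4 network file is present; the sequence is an arbitrary example):

    from sparrow.predictors.pscore.pscore_predictor import PScorePredictor

    predictor = PScorePredictor()    # loads the default (v4) network on CPU
    scores = predictor.predict_pscore('MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ')
    print(len(scores))               # one value per residue, assuming a single-output regression network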
--------------------------------------------------------------------------------
/sparrow/predictors/rg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/rg/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaled_re/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaled_re/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaled_rg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaled_rg/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaling_exponent/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaling_exponent/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/tad/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/tad/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/tad/transactivation_domain_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 |
4 | import sparrow
5 |
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 |
11 |
12 | """
13 | NB: This network and predictor were imported from GOOSE, so are subtly different internally to how
14 | some of the other predictors work. Notably this predictor includes a softmax projection and the loop
15 | shown below to define probabilities - this may be because these networks have 2 layers
16 | whereas the others only have one? Anyway, just making a note of this in case we need to debug in
17 | the future.
18 |
19 | score = []
20 | for val in prediction:
21 | score.append(round(val[1],5))
22 |
23 |
24 | """
25 |
26 | # NOTE - this is where you can define the version number that is read by default. If you add a new network MAKE SURE you update
27 | # this default if you want that new network to be used by default
28 | DEFAULT_VERSION="1"
29 |
30 | def softmax(v):
31 | return (np.e ** v) / np.sum(np.e ** v)
32 |
33 |
34 | ## CHANGE class name
35 | class TADPredictor():
36 | """
37 |
38 |     Class that loads in a network such that predict_transactivation_domains() can be called to predict
39 |     transactivation domains from a sequence.
40 |
41 | """
42 | def __init__(self, version=None):
43 | """
44 |         Constructor for building a predictor object. The version keyword allows specific
45 | version(s) of the trained network associated with the predictor to be defined.
46 |
47 | By default, it's set to None, which leads to the current best/default network being selected
48 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
49 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
50 |
51 |             tad_predictor_network_v<version>.pt
52 |
53 | i.e. no need to include the "v" part or the .pt extension
54 |
55 | """
56 |
57 |
58 |
59 | # if no version provided use default, then grab path and check that file actually exists!
60 | if version is None:
61 | version = DEFAULT_VERSION
62 |
63 |         # CHANGE THIS!! Make sure you change the <directory> and <filename> to the appropriate
64 |         # paths. Keep the network_v{version}.pt suffix because this is how a version-specific file is selected
65 | saved_weights = sparrow.get_data(f'networks/transactivation_domains/tad_predictor_network_v{version}.pt')
66 |
67 | if not os.path.isfile(saved_weights):
68 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__))
69 |
70 |
71 |         # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU mainly because
72 |         # we know everyone has a CPU...
73 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
74 |
75 | ## DELETE ME PROBABLY
76 |         # this block of code is relevant ONLY if the trained network has the strange
77 |         # 'module.' text appended to the start of every keyword. This may happen in older
78 |         # versions of PARROT (see DSSP predictor as an example of where it's needed) but in
79 |         # 2022 trained networks didn't need this. As such, this can PROBABLY be deleted but
80 |         # in case you're using an older network we've kept this to make things simple
81 |
82 | #for i in range(len(loaded_model)):
83 | # key, value = loaded_model.popitem(last=False)
84 | # new_key = key[7:]
85 | # loaded_model[new_key] = value
86 | ## END OF DELETE ME PROBABLY
87 |
88 |
89 | # Dynamically calculate the hyperparameters used to train the network.
90 | ## NOTE:
91 | #
92 | # The code here works on networks trained using the current version of PARROT (2022), HOWEVER, it's possible
93 |         # that in previous versions the keys into the parameter file may be different or may have a prefix. Best example
94 |         # of this is that for the DSSP predictor the word `module.` appears in front of each keyword. If you
95 |         # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this
96 |         # prefix.
97 |
98 |         # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords
99 | # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to
100 | # reach out to Alex or Dan about this!
101 |
102 | num_layers = 0
103 | while True:
104 | s = f'lstm.weight_ih_l{num_layers}'
105 | try:
106 | temp = loaded_model[s]
107 | num_layers += 1
108 | except KeyError:
109 | break
110 |
111 | number_of_classes = np.shape(loaded_model['fc.bias'])[0]
112 |
113 | # Hard coded because we always use one-hot encoding, note that if you trained a specific
114 | # predictor on a different encoding scheme you could, of course, here simply define that
115 | # encoding scheme
116 | input_size = 20
117 |
118 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
119 |
120 | # set these here so we can sanity check if needed
121 | self.number_of_classes = number_of_classes
122 | self.input_size = input_size
123 | self.number_of_layers = num_layers
124 | self.hidden_vector_size = hidden_vector_size
125 |
126 | # Instantiate network weights into object
127 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
128 |
129 | # load parameters into model
130 | self.network.load_state_dict(loaded_model)
131 |
132 |
133 |
134 | def predict_transactivation_domains(self, seq):
135 | """
136 |         Function to predict the presence of transactivation domains. Returns a per-
137 |         residue probability score of a residue being in a transactivation domain or not
138 |
139 | Parameters
140 | ------------
141 | seq : str
142 | Valid amino acid sequence
143 |
144 | Returns
145 | ----------
146 |         list
147 |             Returns a list the length of the sequence where each position
148 |             gives the probability of that residue being in a transactivation domain
149 |
150 | """
151 |
152 | # convert sequence to uppercase
153 | seq = seq.upper()
154 |
155 |         # Convert to one-hot sequence vector - note, as mentioned above if you
156 |         # didn't use one-hot in the original training you could just edit this here
157 | seq_vector = encode_sequence.one_hot(seq)
158 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting
159 |
160 |
161 |         ## FORWARD PASS
162 |         # run the network and flatten the raw per-residue outputs; per-class
163 |         # probabilities are then extracted via the softmax below
164 | prediction = self.network(seq_vector.float()).detach().numpy().flatten()
165 |
166 | prediction = prediction.reshape(-1, self.number_of_classes)
167 | prediction = np.array(list(map(softmax, prediction)))
168 |
169 | ## CLIP
170 | # IF we want to ensure we have a value between 0 and 1 the clipping here
171 | # will do that. If not leave commented
172 | #prediction = np.clip(prediction, 0.0, 1.0)
173 |
174 | # finally we extract out local probabilities
175 | score = []
176 | for val in prediction:
177 | score.append(round(val[1],5))
178 |
179 | return score
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
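One note on the module-level softmax() above: computing np.e ** v directly can overflow for large activations. A numerically stable variant (a sketch only, not what the file currently uses) subtracts the maximum first, which leaves the result mathematically unchanged:

    import numpy as np

    def stable_softmax(v):
        # exp(v - max(v)) avoids overflow; the shift cancels in the ratio
        shifted = np.exp(v - np.max(v))
        return shifted / np.sum(shifted)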
--------------------------------------------------------------------------------
/sparrow/predictors/transmembrane/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/transmembrane/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/transmembrane/transmembrane_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 |
4 | import sparrow
5 |
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 |
11 |
12 |
13 | """
14 | Predictor of transmembrane regions from sequence.
15 |
16 | This is an example of how to implement a system-specific predictor
17 | in sparrow and could/should be used as a template for adding in
18 | additional predictors.
19 |
20 |
21 | ## Nomenclature
22 |
23 | 1. The predictor file should be called <name>_predictor.py
24 | 2. This should be inside a module in the /predictors/ directory called <name>
25 | 3. The single class this module implements should be called <Name>Predictor
26 |
27 | e.g. here we have
28 |
29 | 1. transmembrane_predictor.py
30 | 2. transmembrane/
31 | 3. TransmembranePredictor
32 |
33 |
34 | ## Class structure
35 |
36 | The class should have (at least) two functions:
37 |
38 | 1. A constructor (__init__()) which PRE LOADS the network from sparrow/data/networks/<relevant_name> - the get_data() function
39 | is defined in sparrow/__init__.py and allows absolute-path access to the /data directory. The constructor should
40 | FULLY load the network along with standard PARROT-style options, as shown here. Trained networks should be versioned and
41 | implemented so previous versions can be chosen even if the default version changes
42 |
43 | 2. Define a function called predict_<name>(self, seq) where <name> is a convenient name that obviously means this is
44 | what the function does.
45 |
46 |
47 | """
48 |
49 | DEFAULT_VERSION="4"
50 |
51 |
52 | class TransmembranePredictor():
53 | """
54 |
55 | Class that loads in a network such that predict_transmebrane_regions() can be called to predict
56 | transmembrane regions in a sequence.
57 |
58 | """
59 | def __init__(self, version=None):
60 | """
61 |         Constructor for building a TransmembranePredictor object. The version keyword allows specific
62 |         version(s) of the trained network associated with the TransmembranePredictor to be defined.
63 |         By default, it's set to None, which leads to the current best/default network being selected
64 |         and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
65 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
66 |
67 |             transmembrane_predictor_network_v<version>.pt
68 |
69 | i.e. no need to include the "v" part or the .pt extension
70 |
71 | """
72 |
73 | if version is None:
74 | version = DEFAULT_VERSION
75 |
76 | saved_weights = sparrow.get_data(f'networks/transmembrane/transmembrane_predictor_network_v{version}.pt')
77 |
78 | if not os.path.isfile(saved_weights):
79 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__))
80 |
81 |
82 |         # load the saved network weights, mapped to CPU
83 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
84 |
85 | # Dynamically read in correct hyperparameters:
86 | num_layers = 0
87 | while True:
88 | s = f'lstm.weight_ih_l{num_layers}'
89 | try:
90 | temp = loaded_model[s]
91 | num_layers += 1
92 | except KeyError:
93 | break
94 |
95 | number_of_classes = np.shape(loaded_model['fc.bias'])[0]
96 | input_size = 20 # hard coded because we always use one-hot encoding
97 |
98 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
99 |
100 | # set these here so we can sanity check if needed
101 | self.number_of_classes = number_of_classes
102 | self.input_size = input_size
103 | self.number_of_layers = num_layers
104 | self.hidden_vector_size = hidden_vector_size
105 |
106 | # Instantiate network weights into object
107 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
108 |
109 | self.network.load_state_dict(loaded_model)
110 |
111 |
112 |
113 | def predict_transmebrane_regions(self, seq):
114 | """
115 | Prediction function. seq should be a valid amino acid sequence.
116 |
117 | Parameters
118 | ------------
119 | seq : str
120 | Valid amino acid sequence
121 |
122 | Returns
123 | ----------
124 |         list
125 |             Returns a list the length of the sequence where each position
126 |             is the predicted transmembrane class label (via argmax) at that position.
127 |
128 | """
129 |
130 | # convert sequence to uppercase
131 | seq = seq.upper()
132 |
133 | # Convert to one-hot sequence vector
134 | seq_vector = encode_sequence.one_hot(seq)
135 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting
136 |
137 |         # Forward pass - this is specific to classification
138 | prediction = self.network(seq_vector.float()).detach().numpy()
139 | int_vals = []
140 | for row in prediction[0]:
141 | int_vals.append(np.argmax(row))
142 |
143 | prediction = int_vals
144 |
145 |
146 | # for regression use the line below instead - included here so this
147 | # file can be easily copied over for future predictors
148 | # prediction = self.network(seq_vector.float()).detach().numpy().flatten()
149 | # prediction = np.clip(prediction, 0.0, 1.0)
150 |
151 | return prediction
152 |
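Because predict_transmebrane_regions() returns one class label per residue, a small post-processing step (a sketch, not part of the file above; it assumes label 1 marks a transmembrane residue) can convert the labels into contiguous half-open (start, end) spans:

    def labels_to_regions(labels):
        # collapse a per-residue 0/1 label list into half-open (start, end) spans
        regions, start = [], None
        for i, lab in enumerate(labels):
            if lab == 1 and start is None:
                start = i
            elif lab != 1 and start is not None:
                regions.append((start, i))
                start = None
        if start is not None:
            regions.append((start, len(labels)))
        return regions

    # labels_to_regions([0, 1, 1, 1, 0, 0, 1, 1]) -> [(1, 4), (6, 8)]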
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/sequence_analysis/__init__.py
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/alignment.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Union
2 |
3 | from protfasta import read_fasta, write_fasta
4 | from pyfamsa import Aligner, Sequence
5 |
6 | from sparrow import Protein
7 | from sparrow.visualize.sequence_visuals import show_sequence
8 |
9 |
10 | class SequenceAlignment:
11 | def __init__(
12 | self,
13 | input_data: Union[str, Dict[str, Protein]],
14 | threads: int = 0,
15 | scoring_matrix: str = "BLOSUM62",
16 | guide_tree: str = "upgma",
17 | tree_heuristic: Union[str, None] = None,
18 | medoid_threshold: int = 0,
19 | n_refinements: int = 200,
20 | keep_duplicates: bool = False,
21 | refine: Union[bool, None] = None,
22 | ):
23 | """
24 | Initialize the SequenceAlignment object.
25 |
26 |         Parameters
27 |         ----------
28 |         input_data : Union[str, Dict[str, Protein]]
29 |             A path to a FASTA file, or a dictionary mapping FASTA headers
30 |             to sparrow Protein objects.
31 | """
32 | self.input_data = input_data
33 | self.threads = threads
34 | self.guide_tree = guide_tree
35 | self.tree_heuristic = tree_heuristic
36 | self.medoid_threshold = medoid_threshold
37 | self.n_refinements = n_refinements
38 | self.keep_duplicates = keep_duplicates
39 | self.refine = refine
40 | self.scoring_matrix = scoring_matrix
41 | self.aligner = self._initialize_aligner()
42 | self._cached_msa = None # Cache for the computed MSA
43 |
44 | def _initialize_aligner(self) -> Aligner:
45 | """
46 | Initialize the Aligner object with the given parameters.
47 | """
48 | return Aligner(
49 | threads=self.threads,
50 | guide_tree=self.guide_tree,
51 | tree_heuristic=self.tree_heuristic,
52 | medoid_threshold=self.medoid_threshold,
53 | n_refinements=self.n_refinements,
54 | keep_duplicates=self.keep_duplicates,
55 | refine=self.refine,
56 | scoring_matrix=self.scoring_matrix,
57 | )
58 |
59 | @staticmethod
60 | def _encode_string(string_to_encode: str, encoding: str = "utf-8") -> bytes:
61 | """
62 | Encode a string to bytes using the specified encoding.
63 | """
64 | return string_to_encode.encode(encoding)
65 |
66 | def _load_sequences(self) -> List[Sequence]:
67 | """
68 | Load sequences from either a list of Protein objects, a FASTA file, or
69 | a dictionary of header-sequence mappings.
70 |
71 | Returns
72 | -------
73 | List[Sequence]
74 | A list of pyfamsa.Sequence objects for alignment.
75 | """
76 | if isinstance(self.input_data, str):
77 | # Assume input_data is a path to a FASTA file
78 | fasta_data = read_fasta(self.input_data)
79 | sequences = [
80 | Sequence(self._encode_string(header), self._encode_string(seq))
81 | for header, seq in fasta_data.items()
82 | ]
83 | elif isinstance(self.input_data, dict):
84 | # Assume input_data is a dictionary of header-sequence mappings
85 | sequences = [
86 | Sequence(self._encode_string(header), self._encode_string(seq.sequence))
87 | for header, seq in self.input_data.items()
88 | ]
89 | else:
90 | raise ValueError(
91 |                 "Invalid input_data format. Must be either a path to a FASTA file "
92 |                 "or a dictionary mapping headers to Protein objects."
93 | )
94 |
95 | return sequences
96 |
97 |     def construct_msa(self) -> "Alignment":
98 | """
99 | Construct a multiple sequence alignment with pyFAMSA.
100 |
101 | Returns
102 | -------
103 |         Alignment
104 | Returns the constructed MSA as a pyfamsa._famsa.Alignment.
105 | """
106 | if self._cached_msa is not None:
107 | # Return cached MSA if it exists
108 | return self._cached_msa
109 |
110 | sequences = self._load_sequences()
111 | self._cached_msa = self.aligner.align(sequences) # Cache the computed MSA
112 | return self._cached_msa
113 |
114 | @property
115 |     def alignment(self) -> "Alignment":
116 | """
117 | Property to access the cached MSA result.
118 |
119 | Returns
120 | -------
121 |         Alignment
122 | Returns the cached MSA if available, otherwise computes it.
123 | """
124 | if self._cached_msa is None:
125 | # Compute MSA if it hasn't been computed yet
126 | self.construct_msa()
127 | return self._cached_msa
128 |
129 | def save_msa(
130 | self, filename: str, linelength: int = 60, append_to_fasta: bool = False
131 | ):
132 | """
133 | Save the multiple sequence alignment to a FASTA file.
134 |
135 | Parameters
136 | ----------
137 | filename : str
138 | The filename to save the MSA. Should end with .fasta or .fa.
139 |
140 | linelength : int, optional
141 | Length of lines in the output file, by default 60.
142 |
143 | append_to_fasta : bool, optional
144 | Whether to append to an existing FASTA file, by default False.
145 | """
146 | msa = self.alignment
147 | fasta_data = {seq.id.decode(): seq.sequence.decode() for seq in msa}
148 | write_fasta(
149 | fasta_data, filename, linelength=linelength, append_to_fasta=append_to_fasta
150 | )
151 |
152 |     # NB: not a @property - it takes formatting arguments, so call it as a method
153 |     def display_msa(self, ljust: int = 10, html: bool = False):
154 | """
155 | Print the multiple sequence alignment using the cached MSA.
156 |
157 | Parameters
158 | ----------
159 | ljust : int, optional
160 | The number of spaces to pad the sequence ID, by default 10
161 |
162 | html : bool, optional
163 | Set to True to print the alignment in HTML format, by default False
164 | """
165 | msa = self.alignment
166 |
167 | for seq in msa:
168 | if html:
169 | print(seq.id.decode().ljust(ljust), end=None)
170 | show_sequence(seq.sequence.decode())
171 | else:
172 | print(seq.id.decode().ljust(ljust), seq.sequence.decode())
173 |
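A minimal usage sketch for SequenceAlignment (assumes pyfamsa and protfasta are installed; the headers and sequences are arbitrary examples):

    from sparrow import Protein
    from sparrow.sequence_analysis.alignment import SequenceAlignment

    proteins = {
        'seq1': Protein('MKVLAETSYGKKDEPLE'),
        'seq2': Protein('MKVLAETGKKDEPLE'),
    }

    msa = SequenceAlignment(proteins)
    alignment = msa.alignment        # computed once, then cached
    msa.save_msa('aligned.fasta')    # writes the gapped sequences to FASTA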
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/community_plugins/contributed.py:
--------------------------------------------------------------------------------
1 | from sparrow.sequence_analysis.plugins import BasePlugin
2 |
3 |
4 | class MultiplicativeFCR(BasePlugin):
5 | def __init__(self, protein):
6 | super().__init__(protein)
7 |
8 | def calculate(self, factor=2.0):
9 | """
10 | This analysis doubles the FCR (fraction of charged residues) of the protein.
11 | This is a simple example of a contributed plugin.
12 |
13 |         Parameters
14 |         -------------
15 |         factor : float
16 |             The factor by which the FCR will be multiplied (default is 2.0)
17 |
18 | Returns
19 | -------------
20 | float
21 | Returns the result of the contributed analysis
22 | """
23 | return factor * self.protein.FCR
24 |
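A minimal sketch of calling the contributed plugin directly (the same class is also discoverable through the PluginManager in sparrow/sequence_analysis/plugins.py; the sequence is an arbitrary example):

    from sparrow import Protein
    from sparrow.sequence_analysis.community_plugins.contributed import MultiplicativeFCR

    protein = Protein('MKEEDDRKAASLLS')
    plugin = MultiplicativeFCR(protein)
    print(plugin.calculate(factor=3.0))   # 3x the protein's FCR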
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/elm.py:
--------------------------------------------------------------------------------
1 | import re
2 | from dataclasses import dataclass
3 | from typing import List, Set, Tuple, Union
4 | import pandas as pd
5 |
6 |
7 | import sparrow
8 | from sparrow.sparrow_exceptions import SparrowException
9 |
10 |
11 | @dataclass(frozen=True)
12 | class ELM:
13 | regex: str
14 | identifier: str
15 | functional_site_name: str
16 | description: str
17 | probability: float
18 | start: int
19 | end: int
20 | sequence: str
21 |
22 | def __eq__(self, other):
23 | if self.start > other.end or self.end < other.start:
24 | return False
25 |
26 | # Only compare regex patterns for equality - all regexes for ELMs are unique - we could also check functional site names?
27 | return self.regex == other.regex
28 |
29 | def __hash__(self):
30 |         # I THINK this works since we're basically saying we don't CARE whether the sequences are the same or not
31 |         # this will let us do set differences and intersections
32 |         # Does restrict a motif to starting at the same position though, which we know could be a different spot after indels
33 |         # This is fine for point mutation comparison, but this could probably be generalized.
34 |         # We don't want to just look for "in the sequence" because there might be
35 |         # multiple occurrences of the same motif, and motif positioning may matter.
36 | return hash((self.regex, self.functional_site_name, self.start))
37 |
38 |
39 | def parse_hgvs(hgvs_notation : str) -> Tuple:
40 | """This function takes an HGVS notation and returns a tuple of the form (position, mutation)
41 | where position is the position of the mutation and mutation is the amino acid change.
42 |
43 | Parameters
44 | ----------
45 | hgvs_notation : str
46 | HGVS notation of the form p.XXXX
47 |
48 | Returns
49 | -------
50 | Tuple[int,str]
51 | Tuple containing the position of the mutation and the amino acid change.
52 | """
53 | if not hgvs_notation.startswith("p."):
54 | raise SparrowException("Invalid HGVS notation. Must start with 'p.'")
55 |
56 | parts = hgvs_notation.split('p.')
57 | if len(parts) < 2:
58 | raise SparrowException("Invalid HGVS notation. Must be in the form p.xxx")
59 |
60 | # Extract the position and amino acids
61 |     position = int(''.join(filter(str.isdigit, parts[1])))
62 |     if position < 1: raise SparrowException(f"Invalid position in HGVS notation, must be a 1-indexed integer greater than 0. Received {position}")
63 | mutation = parts[1][-1]
64 |
65 | # position shifted to 0 index
66 | return position-1, mutation.upper()
67 |
68 | def generate_elm_df(file : str) -> pd.DataFrame:
69 | """Generates a pandas DataFrame object containing all the information
70 | annotated as an elm.
71 |
72 | Parameters
73 | ----------
74 | file : str
75 | This generates a dataframe from the elm_classes.tsv in the data directory.
76 | The latest elm class list can be found at http://elm.eu.org/downloads.html
77 |
78 | Returns
79 | -------
80 | pandas.DataFrame
81 | DataFrame containing the elm annotations.
82 |
83 | """
84 | elm_data = []
85 | with open(f"{file}", "r", encoding="utf-8") as f:
86 | for line in f:
87 | if line.startswith("#"):
88 | continue
89 | if line.startswith('"Accession"'):
90 | columns = line.strip().split("\t")
91 | columns = [col.replace('"','') for col in columns]
92 | else:
93 | elm_data.append(line.replace('"','').strip().split("\t"))
94 | df = pd.DataFrame(elm_data,columns=columns)
95 | return df
96 |
97 | def find_all_elms(sequence : str) -> Set[ELM]:
98 |     """This function takes an input sequence and returns a set of ELM objects,
99 |     each containing the regex used to find the ELM in the sequence, its functional annotation,
100 |     the start and stop positions, and the matching subsequence.
101 |
102 | Parameters
103 | ----------
104 | sequence : str
105 | Amino Acid Sequence
106 |
107 | Returns
108 | -------
109 |     Set[sparrow.sequence_analysis.elm.ELM]
110 |         A set of ELM objects for all possible ELMs in a given sequence.
111 | """
112 | elm_file = sparrow.get_data("elm_classes.tsv")
113 | df = generate_elm_df(elm_file)
114 | elms = []
115 | for _, row in df.iterrows():
116 | regex = row["Regex"]
117 | elm_class = row["ELMIdentifier"]
118 | site = row["FunctionalSiteName"]
119 | elm_description = row["Description"]
120 | elm_probability = row["Probability"]
121 |
122 | match_indices = [(m.start(0), m.end(0)) for m in re.finditer(regex, sequence)]
123 | for (start,end) in match_indices:
124 | elm = ELM(regex, elm_class, site, elm_description, elm_probability, start, end, sequence[start:end])
125 | elms.append(elm)
126 | return set(elms)
127 |
128 | def compute_lost_elms(target_protein, query : Union[Tuple[int,str],str]) -> Set:
129 |     """This function takes a protein sequence and a target query and returns
130 |     the set of ELMs that were lost due to the mutation. The query can either be
131 |     a tuple of the form (position, mutant), where position is the 0-indexed position
132 |     of the mutation, or a string in HGVS format.
133 | Parameters
134 | ----------
135 | target_protein : Union[sparrow.Protein, str]
136 | sparrow.Protein or amino acid sequence
137 |     query : Union[str, Tuple[int,str]]
138 |         Tuple of the form (position, mutant) where position is the 0-indexed position of the mutation, or an HGVS string.
139 | Returns
140 | -------
141 | Set
142 | A set of ELMs containing the functional site name, the start and stop position,
143 | the sequence of the elm.
144 | """
145 |
146 | if isinstance(target_protein, str):
147 | target_protein = sparrow.Protein(target_protein)
148 |
149 | if isinstance(query, str):
150 | position, mutation = parse_hgvs(query)
151 | else:
152 | position, mutation = query
153 |
154 | mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
155 |
156 | wt_elms = target_protein.elms
157 | mutant_elms = mutant_protein.elms
158 | lost_elms = wt_elms - mutant_elms
159 |
160 | return lost_elms
161 |
162 | def compute_gained_elms(target_protein, query : Union[Tuple[int,str],str]) -> Set:
163 |     """This function takes a protein sequence and a target query and returns
164 |     the set of ELMs that were gained due to the mutation. The query can either be
165 |     a tuple of the form (position, mutant), where position is the 0-indexed position
166 |     of the mutation, or a string in HGVS format.
167 |
168 | Parameters
169 | ----------
170 | target_protein : Union[sparrow.Protein, str]
171 | sparrow.Protein or amino acid sequence
172 |     query : Union[str, Tuple[int,str]]
173 |         Tuple of the form (position, mutant) where position is the 0-indexed position of the mutation, or an HGVS string.
174 | Returns
175 | -------
176 | Set
177 | A set of ELMs containing the functional site name, the start and stop position,
178 | the sequence of the elm.
179 | """
180 |
181 | if isinstance(target_protein, str):
182 | target_protein = sparrow.Protein(target_protein)
183 |
184 | if isinstance(query, str):
185 | position, mutation = parse_hgvs(query)
186 | else:
187 | position, mutation = query
188 |
189 | mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
190 |
191 |
192 | wt_elms = target_protein.elms
193 | mutant_elms = mutant_protein.elms
194 | gained_elms = mutant_elms - wt_elms
195 |
196 | return gained_elms
197 |
198 | def compute_retained_elms(target_protein, query : Union[Tuple[int,str],str]) -> Set:
199 |     """This function takes a protein sequence and a target query and returns
200 |     the set of ELMs that were retained (no change) after mutation. The query can
201 |     either be a tuple of the form (position, mutant), where position is the
202 |     0-indexed position of the mutation, or a string in HGVS format.
203 |
204 | Parameters
205 | ----------
206 | target_protein : Union[sparrow.Protein, str]
207 | sparrow.Protein or amino acid sequence
208 |     query : Union[str, Tuple[int,str]]
209 |         Tuple of the form (position, mutant) where position is the 0-indexed position of the mutation, or an HGVS string.
210 | Returns
211 | -------
212 | Set
213 | A set of ELMs containing the functional site name, the start and stop position,
214 | the sequence of the elm.
215 | """
216 | if isinstance(target_protein, str):
217 | target_protein = sparrow.Protein(target_protein)
218 |
219 | if isinstance(query, str):
220 | position, mutation = parse_hgvs(query)
221 | else:
222 | position, mutation = query
223 |
224 | mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
225 |
226 | wt_elms = target_protein.elms
227 | mutant_elms = mutant_protein.elms
228 | retained_elms = wt_elms & mutant_elms
229 |
230 | return retained_elms
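A minimal usage sketch of the mutation-comparison helpers above (the sequence is an arbitrary example; 'p.T6A' mutates 1-indexed position 6 to alanine, which parse_hgvs converts to the 0-indexed tuple (5, 'A')):

    from sparrow.sequence_analysis.elm import compute_lost_elms, compute_gained_elms

    wt = 'MKTPSTLERTSPTRSP'
    print(compute_lost_elms(wt, 'p.T6A'))     # ELMs destroyed by the substitution
    print(compute_gained_elms(wt, (5, 'A')))  # the same query as a 0-indexed tuple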
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/phospho_isoforms.py:
--------------------------------------------------------------------------------
1 | """
2 | Snippet built and adapted from localcider to get all phosphoisoforms
3 | of an amino acid sequence, to be integrated into sparrow
4 |
5 | By : Garrett M. Ginell
6 | 2023-02-08
7 |
8 | The BASIC workflow is as follows:
9 |
10 | To get a list of phosphoisoforms, run get_phosphoisoforms:
11 |
12 |    phosphoSeqome = get_phosphoisoforms(protein, mode='predict')
13 | # for options see various run variations in function header
14 |
15 | Then once you have the phosphoisoforms from the list above you can iterate
16 | over the list and calculate a sequence parameter of choice to build a distribution:
17 |
18 | parameter_list = []
19 | for s in phosphoSeqome:
20 | parameter_list.append(Protein(s).my_parameter_of_choice)
21 |
22 | This distribution can then be compared back to the value of the original sequence:
23 |
24 | Protein(sequence).my_parameter_of_choice
25 | """
26 | import itertools
27 |
28 | ## -----------------------------------------
29 | ##
30 | def _predict_all_phosphosites(protein):
31 | """
32 | Gets list of predicted phosphosites
33 |
34 | BASED OFF OF:
35 | predictors in sparrow:
36 | https://github.com/idptools/sparrow/tree/main/sparrow/predictors/phosphorylation
37 |
38 | Parameters
39 | ------------
40 | protein : sparrow.Protein
41 | sparrow Protein object
42 |
43 |
44 | Returns:
45 | ----------
46 | list
47 | list of predicted positions of sites of phosphorylated T, S, and Y
48 | Note positions are returned as indexed from 0
49 |
50 | """
51 |
52 | # predict phosphosites
53 | pS = protein.predictor.serine_phosphorylation(return_sites_only=True)
54 | pT = protein.predictor.threonine_phosphorylation(return_sites_only=True)
55 | pY = protein.predictor.tyrosine_phosphorylation(return_sites_only=True)
56 |
57 | return list(pS + pT + pY)
58 |
59 | ## ----------------------------------------
60 | ##
61 | def _get_all_phosphosites(sequence):
62 | """
63 | Function which returns a list of all the positions which *could* be
64 | phosphorylated (i.e. are T/S/Y). NOTE this does not use any kind of
65 | smart lookup, metadata, or analysis. It's literally, where are the Y/T/S
66 | residues.
67 | Note positions are returned as indexed from 0
68 |
69 | Parameters
70 | ------------
71 | sequence : str
72 | Valid amino acid sequence
73 |
74 | Returns:
75 | ----------
76 | list
77 | list of integers corresponding to S/T/Y positions in your sequence
78 |
79 | """
80 | sites = []
81 | idx = 0
82 | for i in sequence:
83 | if i in ["Y", "S", "T"]:
84 | sites.append(idx)
85 | idx = idx + 1
86 | return sites
87 |
88 | ## -----------------------------------
89 | ##
90 | def _build_phosphoSeqome(sequence, phosphosites, phospho_rate=1):
91 | """
92 | Build all phospho-isoforms based on provided phosphosites
93 |
94 | Parameters
95 | ------------
96 | sequence : str
97 | Valid amino acid sequence
98 |
99 | phosphosites : list
100 | List of valid phosphosite positions
101 |
102 |     phospho_rate : float
103 |         Value between 0 and 1 which defines the maximum fraction of phosphosites
104 |         that can be 'phosphorylated' in each sequence. Default is 1 (i.e. all sites can be
105 |         phosphorylated)
106 |
107 | Returns
108 | ----------
109 |     list
110 |         list of sequences for all possible phospho-isoforms
111 |         based on the provided input list of phosphosites
112 |
113 | When phospho_rate = 1 (100%)
114 | the length of output list = 2^n where n=len(phosphosites)
115 | """
116 |
117 | _max_phospho_number = int(len(phosphosites)*phospho_rate)
118 | ## GET ALL phospho-sequence combinations
119 | phosphoSeqome = []
120 | phosphoSeqome_info = []
121 | for phosphostatus in itertools.product("01", repeat=len(phosphosites)):
122 |
123 | if phosphostatus.count('1') > _max_phospho_number:
124 | continue
125 | newseq = list(sequence)
126 |
127 | count = 0
128 | indx = 0
129 |         # loop over each element in our phosphosite on/off list
130 | for i in phosphostatus:
131 | # if that element is ON
132 | if int(i) == 1:
133 | # set the AA at that position to a negative residue (we use E but
134 | # could be D)
135 | newseq[phosphosites[indx]] = "E"
136 | count+=1
137 | indx = indx + 1
138 |
139 | # now we've replaced some number of T/Y/S with E representing a different
140 | # phosphostate
141 | newseq = "".join(newseq)
142 | phosphoSeqome.append(newseq)
143 |
144 | return phosphoSeqome
145 |
146 | ## -----------------------------------
147 | ##
148 | def get_phosphoisoforms(protein, mode="all", phospho_rate=1, phosphosites=None):
149 |     """Phosphosites are replaced with the phosphomimetic 'E', enabling approximate calculation
150 |     of charge-based sequence features in the presence of phosphorylated residues.
151 |
152 | Parameters
153 | ----------
154 | protein : sparrow.Protein
155 | sparrow Protein object
156 |
157 |     mode : str, optional
158 |         Definition of how the phosphosites should be determined, by default "all"
159 |
160 | 'all' : Assumes all S/T/Y residues are potential phosphosites
161 |
162 | 'predict' : Leverages PARROT trained predictors via _predict_all_phosphosites
163 | to predict phosphorylated sites based on sequence.
164 |
165 | 'custom' : uses the 'phosphosites' parameter as indices for phosphosites.
166 |
167 |     phospho_rate : float, optional
168 |         Value between 0 and 1 which defines the maximum fraction of phosphosites
169 |         that can be 'phosphorylated' in each sequence, by default 1 (i.e. all sites can be
170 |         phosphorylated)
171 |
172 | phosphosites : list, optional
173 | Custom list of indices for valid phosphosite positions, by default None
174 |
175 | Returns
176 | -------
177 | list
178 | list of sequences for the possible phosphoisoforms based off the selected method.
179 | Phosphorylatable amino acids are replaced with 'E'.
180 | """
181 |
182 | # get phosphosite positions
183 | if mode == 'all':
184 | _phosphosites = _get_all_phosphosites(protein.sequence)
185 | elif mode == 'predict':
186 | _phosphosites = _predict_all_phosphosites(protein)
187 | elif mode == 'custom':
188 |         if phosphosites is not None:
189 | _phosphosites = phosphosites
190 | else:
191 |             raise Exception("To use mode 'custom', phosphosites must be defined")
192 | else:
193 |         raise Exception("Please specify a valid mode ('all', 'predict', or 'custom') to compute phosphosites")
194 |
195 | # generate all phospho-Isoforms
196 | return _build_phosphoSeqome(protein.sequence, _phosphosites, phospho_rate=phospho_rate)
197 |
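A minimal sketch of the distribution workflow the module docstring describes, using FCR as the example parameter (any Protein property works; the sequence is an arbitrary example):

    from sparrow import Protein
    from sparrow.sequence_analysis.phospho_isoforms import get_phosphoisoforms

    protein = Protein('MKTSYSPERK')
    isoforms = get_phosphoisoforms(protein, mode='all')   # 2^n isoforms for n S/T/Y sites

    fcr_distribution = [Protein(s).FCR for s in isoforms]
    print(protein.FCR, min(fcr_distribution), max(fcr_distribution))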
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/physical_properties.py:
--------------------------------------------------------------------------------
1 | from sparrow.data import amino_acids
2 |
3 | ## The physical properties module contains stateless functions that compute sequence-dependent
4 | ## physical properties. See the "calculate_molecular_weight" function as a template for how
5 | ## these functions should work.
6 | ##
7 | ##
8 |
9 | def calculate_molecular_weight(sequence):
10 | """
11 | Function that returns the molecular weight of a protein sequence assuming standard
12 | amino acid molecular weights.
13 |
14 | Parameters
15 | -------------
16 | sequence : str
17 | String containing the amino acid sequence (upper case one-letter residue codes)
18 |
19 | Returns
20 | -----------
21 | float
22 | Returns the residue or polypeptide molecular weight.
23 |
24 | """
25 |
26 |     # compute naive MW as the sum of residue molecular weights
27 | MW = 0
28 | for i in sequence:
29 | MW = MW + amino_acids.AA_MOLECULAR_WEIGHT[i]
30 |
31 | if len(sequence) == 1:
32 | return MW
33 |
34 | else:
35 |         return MW - 18*(len(sequence)-1)   # subtract one water per peptide bond
36 |
37 |
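A worked example of the formula above: the dipeptide 'GG' contains two glycines (roughly 75.07 Da each), and forming the single peptide bond releases one water, so the expected result is about 2 x 75.07 - 18 = 132.14 Da:

    from sparrow.sequence_analysis.physical_properties import calculate_molecular_weight

    print(calculate_molecular_weight('GG'))   # ~132, depending on the table's precision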
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/plugins.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import inspect
3 | import pkgutil
4 | from abc import ABC, abstractmethod
5 | from collections import defaultdict
6 | from typing import Any
7 |
8 |
9 | class PluginWrapper:
10 | """
11 | A wrapper class for plugins that integrates with the plugin manager.
12 |
13 | This class is responsible for managing the execution of plugin instances
14 | and caching their results to avoid redundant computations. It uses a
15 | combination of the plugin name and the arguments passed to the plugin's
16 | `calculate` method to create a unique cache key for storing results.
17 |
18 | Attributes:
19 | name (str): The name of the plugin.
20 | cache_dict (dict): A dictionary used to store cached results.
21 | plugin_instance (object): An instance of the plugin to be wrapped.
22 |
23 | Methods:
24 | __call__(*args, **kwargs):
25 | Executes the plugin's `calculate` method with the provided arguments.
26 | Caches the result to avoid recomputation on subsequent calls with
27 | the same arguments.
28 | """
29 |
30 | def __init__(self, name, cache_dict, plugin_instance):
31 | self.name = name
32 | self.cache_dict = cache_dict
33 | self.plugin_instance = plugin_instance
34 |
35 | def __call__(self, *args, **kwargs):
36 | """
37 | Call calculate() with or without arguments.
38 | Implement caching to avoid recomputation.
39 | """
40 | # Create hashable cache key for args and kwargs
41 | cache_key = (args, frozenset(kwargs.items()))
42 |
43 | # Check if the result is cached
44 | if cache_key not in self.cache_dict[self.name]:
45 | self.cache_dict[self.name][cache_key] = self.plugin_instance.calculate(
46 | *args, **kwargs
47 | )
48 |
49 | return self.cache_dict[self.name][cache_key]
50 |
51 |
52 | class PluginManager:
53 | def __init__(self, protein: "sparrow.Protein"):
54 | self.__protein_obj = protein
55 | # Memoization for both args and no-args results
56 | self.__precomputed = defaultdict(dict)
57 | self.__plugins = {}
58 |
59 | self._available_plugins = self._discover_plugins()
60 |
61 | def _discover_plugins(self):
62 | """
63 | Discover all plugins available in the contributed plugin module.
64 | """
65 | plugin_module = "sparrow.sequence_analysis.community_plugins.contributed"
66 | try:
67 | module = importlib.import_module(plugin_module)
68 | return [
69 | name
70 | for name, obj in inspect.getmembers(module, inspect.isclass)
71 | if issubclass(obj, BasePlugin) and obj.__module__ == plugin_module
72 | ]
73 | except ModuleNotFoundError:
74 | return []
75 |
76 | def __getattr__(self, name: str):
77 | """
78 | Dynamically load and return the plugin's calculate method result
79 | as if it were a property when accessed without arguments.
80 | """
81 | if name not in self.__plugins:
82 | try:
83 |                 module = importlib.import_module(
84 |                     "sparrow.sequence_analysis.community_plugins.contributed"
85 |                 )
86 | plugin_class = getattr(module, name)
87 | if not issubclass(plugin_class, BasePlugin):
88 | raise AttributeError(f"{name} is not a valid plugin.")
89 | self.__plugins[name] = plugin_class(protein=self.__protein_obj)
90 | except (ModuleNotFoundError, AttributeError):
91 | raise AttributeError(
92 | f"Plugin '{name}' not found. Available plugins are: {list(self._available_plugins)}"
93 | )
94 |
95 | plugin_instance = self.__plugins[name]
96 |
97 | return PluginWrapper(name, self.__precomputed, plugin_instance)
98 |
99 | def __dir__(self):
100 | """
101 | Return the list of dynamically available plugins for autocompletion QoL.
102 | """
103 | return super().__dir__() + self._available_plugins
104 |
105 |
106 | class BasePlugin(ABC):
107 | """Base class for all community contributed plugins."""
108 |
109 | def __init__(self, protein: "sparrow.Protein"):
110 | """Constructor for all plugins. This must provide a protein object or sequence."""
111 | self.__protein_obj = protein
112 |
113 | @abstractmethod
114 | def calculate(self) -> Any:
115 | """
116 | This method must operate on the sequence attribute of the protein object.
117 | The method must return the result of the contributed analysis.
118 |
119 | Returns
120 | -------------
121 | float
122 | Returns the result of the contributed analysis
123 | """
124 | pass
125 |
126 | @property
127 | def protein(self):
128 | return self.__protein_obj
129 |
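A short sketch of how PluginWrapper builds its cache key: positional args go in as a tuple and kwargs as a frozenset of items, so the key is hashable and repeated calls with the same arguments hit the cache (note this assumes all arguments are themselves hashable; a list argument would raise a TypeError):

    kwargs = {'factor': 3.0}
    cache_key = ((), frozenset(kwargs.items()))
    print(hash(cache_key))   # hashable, so it can index the per-plugin cache dict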
--------------------------------------------------------------------------------
/sparrow/sparrow_exceptions.py:
--------------------------------------------------------------------------------
1 | class SparrowException(Exception):
2 | pass
3 |
4 |
5 | class ProteinException(Exception):
6 | pass
7 |
8 |
9 | class PatterningException(Exception):
10 | pass
11 |
12 |
13 | class CalculationException(Exception):
14 | pass
15 |
--------------------------------------------------------------------------------
/sparrow/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Init file for the test package; some test runners besides pytest (e.g. nose) may look for such a file. Also provides a helper for building random test sequences.
3 | """
4 |
5 | import numpy as np
6 |
7 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS
8 |
9 |
10 | def build_seq(min_count=10,max_count=50):
11 |
12 |     # how many distinct residue types to include (4-19)
13 | n_res = np.random.randint(4,20)
14 |
15 | s = ''
16 | for i in range(n_res):
17 | aa_idx = np.random.randint(0,20)
18 | s = s + VALID_AMINO_ACIDS[aa_idx]*np.random.randint(min_count, max_count)
19 |
20 | s = list(s)
21 | np.random.shuffle(s)
22 | s = "".join(s)
23 | return s
24 |
--------------------------------------------------------------------------------
/sparrow/tests/compute_test_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "725f76f7",
6 | "metadata": {},
7 | "source": [
8 | "### Dictionary that recomputes the test_data "
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "92383cc0",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": []
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 41,
22 | "id": "5494688b",
23 | "metadata": {},
24 | "outputs": [
25 | {
26 | "name": "stdout",
27 | "output_type": "stream",
28 | "text": [
29 | "0.43746367\n",
30 | "32.81683\n",
31 | "33.175683185378716\n",
32 | "81.15200236853273\n",
33 | "75.92058\n",
34 | "0.5773746\n",
35 | "5.815894\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "from sparrow import Protein\n",
41 | "\n",
42 | "P = Protein('MKYLAAYLLLNAAGNTPDATKIKAILESVGIEIEDEKVSSVLSALEGKSVDELITEGNEKLAAVPAAGPASAGGAAAASGDAAAEEEKEEEAAEESDDDMGFGLFD')\n",
43 | "\n",
44 | "print(P.predictor.asphericity())\n",
45 | "\n",
46 | "print(P.predictor.radius_of_gyration())\n",
47 | "print(P.predictor.radius_of_gyration(use_scaled=True))\n",
48 | "\n",
49 | "print(P.predictor.end_to_end_distance(use_scaled=True))\n",
50 | "print(P.predictor.end_to_end_distance(use_scaled=False))\n",
51 | "\n",
52 | "print(P.predictor.scaling_exponent())\n",
53 | "print(P.predictor.prefactor())\n",
54 | "from sparrow.data.amino_acids import VALID_AMINO_ACIDS\n"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 47,
60 | "id": "4e59d0a1",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "from sparrow import Protein\n",
65 |     "import numpy as np\n",
66 | "import protfasta\n",
67 | "import os\n",
68 | "\n",
69 | "current_filepath = os.getcwd()\n",
70 | "onehundred_seqs = \"{}/test_data/test_seqs_100.fasta\".format(current_filepath)\n",
71 | "\n",
72 | "seqs = protfasta.read_fasta(onehundred_seqs)\n"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "id": "3ed4d1e5",
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "k2rg = {}\n",
83 | "for k in seqs:\n",
84 | " k2rg[k] = Protein(seqs[k]).predictor.radius_of_gyration()\n",
85 | "\n",
86 | "np.save('test_data/test_100_rg_v2.npy', np.array(k2rg, dtype=dict)) "
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 48,
92 | "id": "e71f57dd",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "k2rg = {}\n",
97 | "for k in seqs:\n",
98 | " k2rg[k] = Protein(seqs[k]).predictor.radius_of_gyration(use_scaled=True)\n",
99 | "\n",
100 | "np.save('test_data/test_100_rg_scaled_v2.npy', np.array(k2rg, dtype=dict)) "
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 49,
106 | "id": "12872bec",
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "k2re = {}\n",
111 | "for k in seqs:\n",
112 | " k2re[k] = Protein(seqs[k]).predictor.end_to_end_distance()\n",
113 | "\n",
114 | "np.save('test_data/test_100_re_v2.npy', np.array(k2re, dtype=dict)) "
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 50,
120 | "id": "3bc0cd1a",
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "k2re = {}\n",
125 | "for k in seqs:\n",
126 | " k2re[k] = Protein(seqs[k]).predictor.end_to_end_distance(use_scaled=True)\n",
127 | "\n",
128 | "np.save('test_data/test_100_re_scaled_v2.npy', np.array(k2re, dtype=dict)) "
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 51,
134 | "id": "47f17564",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "k2asph = {}\n",
139 | "for k in seqs:\n",
140 | " k2asph[k] = Protein(seqs[k]).predictor.asphericity()\n",
141 | "\n",
142 | "np.save('test_data/test_100_asph_v2.npy', np.array(k2asph, dtype=dict)) "
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 52,
148 | "id": "202cdc34",
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "k2scal_exp = {}\n",
153 | "for k in seqs:\n",
154 | " k2scal_exp[k] = Protein(seqs[k]).predictor.scaling_exponent()\n",
155 | "\n",
156 | "np.save('test_data/test_100_exponent_v2.npy', np.array(k2scal_exp, dtype=dict)) "
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 53,
162 | "id": "39eb54c0",
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "k2prefact = {}\n",
167 | "for k in seqs:\n",
168 | " k2prefact[k] = Protein(seqs[k]).predictor.prefactor()\n",
169 | "\n",
170 | "np.save('test_data/test_100_prefactor_v2.npy', np.array(k2prefact, dtype=dict)) "
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 3,
176 | "id": "f4d6bf39",
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "from sparrow.patterning import iwd"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 21,
186 | "id": "868afb92",
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "k2_average_bivariate_inverse_distance_charge = {}\n",
191 | "k2_average_inverse_distance_charge_neg = {}\n",
192 | "k2_average_inverse_distance_charge_pos = {}\n",
193 | "k2_average_inverse_distance_ali = {}\n",
194 | "\n",
195 | "for k in seqs:\n",
196 | "\n",
197 | " local_seq = seqs[k]\n",
198 | " \n",
199 | " ncpr = Protein(seqs[k]).linear_sequence_profile('NCPR')\n",
200 | " \n",
201 | " k2_average_bivariate_inverse_distance_charge[k] = iwd.calculate_average_bivariate_inverse_distance_charge(ncpr, local_seq)\n",
202 | " k2_average_inverse_distance_charge_neg[k] = iwd.calculate_average_inverse_distance_charge(ncpr, local_seq, '-')\n",
203 | " k2_average_inverse_distance_charge_pos[k] = iwd.calculate_average_inverse_distance_charge(ncpr, local_seq, '+')\n",
204 | " k2_average_inverse_distance_ali[k] = iwd.calculate_average_inverse_distance_from_sequence(local_seq, 'ILVAM')\n",
205 | " \n",
206 | " \n",
207 | "np.save('test_data/test_average_bivariate_inverse_distance_charge.npy', np.array(k2_average_bivariate_inverse_distance_charge, dtype=dict)) \n",
208 | "np.save('test_data/test_average_inverse_distance_charge_neg.npy', np.array(k2_average_inverse_distance_charge_neg, dtype=dict)) \n",
209 | "np.save('test_data/test_average_inverse_distance_charge_pos.npy', np.array(k2_average_inverse_distance_charge_pos, dtype=dict)) \n",
210 | "np.save('test_data/test_average_inverse_distance_ali.npy', np.array(k2_average_inverse_distance_ali, dtype=dict)) \n",
211 | " \n",
212 | " \n",
213 | " "
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 26,
219 | "id": "54a12190",
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "data": {
224 | "text/plain": [
225 | "0.27504330372096264"
226 | ]
227 | },
228 | "execution_count": 26,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "Protein('ALEPLEALELASEPLALELAEPDEKKAEPLAEPLAEKAKEPALE').compute_iwd"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "id": "52e332aa",
241 | "metadata": {},
242 | "outputs": [],
243 | "source": []
244 | }
245 | ],
246 | "metadata": {
247 | "kernelspec": {
248 | "display_name": "Python 3 (ipykernel)",
249 | "language": "python",
250 | "name": "python3"
251 | },
252 | "language_info": {
253 | "codemirror_mode": {
254 | "name": "ipython",
255 | "version": 3
256 | },
257 | "file_extension": ".py",
258 | "mimetype": "text/x-python",
259 | "name": "python",
260 | "nbconvert_exporter": "python",
261 | "pygments_lexer": "ipython3",
262 | "version": "3.8.12"
263 | }
264 | },
265 | "nbformat": 4,
266 | "nbformat_minor": 5
267 | }
268 |
--------------------------------------------------------------------------------
/sparrow/tests/generate_test_data/generate_dssp_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "id": "c81ae04b",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from sparrow.predictors.dssp.dssp_predictor import DSSPPredictor\n",
11 | "import numpy as np\n",
12 | "import protfasta\n",
13 | "import pickle\n",
14 | "\n",
15 | "natural_proteins = protfasta.read_fasta('../test_data/test_seqs_100.fasta')\n",
16 | "\n"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "503d8163",
22 | "metadata": {},
23 | "source": [
24 | "### Helicity predictions"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "id": "ab18972d",
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "helicity_class = {}\n",
35 | "helicity_prob = {}\n",
36 | "\n",
37 | "X2 = DSSPPredictor(version=2)\n",
38 | "\n",
39 | "for k in natural_proteins:\n",
40 | " s = natural_proteins[k]\n",
41 | " helicity_class[k] = X2.predict_helicity_smart(s)\n",
42 | " helicity_prob[k] = X2.predict_helical_probability(s)\n",
43 | "\n",
44 | "with open('../test_data/helicity_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n",
45 | " pickle.dump(helicity_class, f) \n",
46 | " \n",
47 | "with open('../test_data/helicity_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n",
48 | " pickle.dump(helicity_prob, f) "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 11,
54 | "id": "cf90aec4",
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "extended_class = {}\n",
59 | "extended_prob = {}\n",
60 | "\n",
61 | "X2 = DSSPPredictor(version=2)\n",
62 | "\n",
63 | "for k in natural_proteins:\n",
64 | " s = natural_proteins[k]\n",
65 | " extended_class[k] = X2.predict_extended_smart(s)\n",
66 | " extended_prob[k] = X2.predict_extended_probability(s)\n",
67 | "\n",
68 | "with open('../test_data/extended_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n",
69 | " pickle.dump(extended_class, f) \n",
70 | " \n",
71 | "with open('../test_data/extended_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n",
72 | " pickle.dump(extended_prob, f) "
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 12,
78 | "id": "8ae2e5c6",
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "coil_class = {}\n",
83 | "coil_prob = {}\n",
84 | "\n",
85 | "X2 = DSSPPredictor(version=2)\n",
86 | "\n",
87 | "for k in natural_proteins:\n",
88 | " s = natural_proteins[k]\n",
89 | " coil_class[k] = X2.predict_coil_smart(s)\n",
90 | " coil_prob[k] = X2.predict_coil_probability(s)\n",
91 | "\n",
92 | "with open('../test_data/coil_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n",
93 | " pickle.dump(coil_class, f) \n",
94 | " \n",
95 | "with open('../test_data/coil_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n",
96 | " pickle.dump(coil_prob, f) "
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "id": "243f8c54",
102 | "metadata": {},
103 | "source": [
104 | "## Non-default data\n",
105 | "The code below generates sequences with non-default settings for the threshold and minimum length to vary this value and ensure all works well there"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 27,
111 | "id": "7d703b3d",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "helicity_class = {}\n",
116 | "\n",
117 | "X2 = DSSPPredictor(version=2)\n",
118 | "\n",
119 | "for k in natural_proteins:\n",
120 | " s = natural_proteins[k]\n",
121 | " \n",
122 | " thresh = np.random.random()\n",
123 | " minlen = np.random.randint(1,13)\n",
124 | " \n",
125 | " tmp = X2.predict_helicity_smart(s, threshold=thresh, minlen=minlen)\n",
126 | " \n",
127 | " helicity_class[k] = [thresh, minlen, tmp]\n",
128 | "\n",
129 | "with open('../test_data/helicity_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n",
130 | " pickle.dump(helicity_class, f) \n",
131 | " \n"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 29,
137 | "id": "09d2bdac",
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "extended_class = {}\n",
142 | "\n",
143 | "X2 = DSSPPredictor(version=2)\n",
144 | "\n",
145 | "for k in natural_proteins:\n",
146 | " s = natural_proteins[k]\n",
147 | " \n",
148 | " thresh = np.random.random()\n",
149 | " minlen = np.random.randint(1,13)\n",
150 | " \n",
151 | " tmp = X2.predict_extended_smart(s, threshold=thresh, minlen=minlen)\n",
152 | " \n",
153 | " extended_class[k] = [thresh, minlen, tmp]\n",
154 | "\n",
155 | "with open('../test_data/extended_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n",
156 | " pickle.dump(extended_class, f) \n",
157 | " \n"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 30,
163 | "id": "187ae833",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "coil_class = {}\n",
168 | "\n",
169 | "X2 = DSSPPredictor(version=2)\n",
170 | "\n",
171 | "for k in natural_proteins:\n",
172 | " s = natural_proteins[k]\n",
173 | " \n",
174 | " thresh = np.random.random()\n",
175 | " minlen = np.random.randint(1,13)\n",
176 | " \n",
177 | " tmp = X2.predict_coil_smart(s, threshold=thresh, minlen=minlen)\n",
178 | " \n",
179 | " coil_class[k] = [thresh, minlen, tmp]\n",
180 | "\n",
181 | "with open('../test_data/coil_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n",
182 | " pickle.dump(coil_class, f) \n",
183 | " \n"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "id": "b42dee92",
190 | "metadata": {},
191 | "outputs": [],
192 | "source": []
193 | }
194 | ],
195 | "metadata": {
196 | "kernelspec": {
197 | "display_name": "Python 3 (ipykernel)",
198 | "language": "python",
199 | "name": "python3"
200 | },
201 | "language_info": {
202 | "codemirror_mode": {
203 | "name": "ipython",
204 | "version": 3
205 | },
206 | "file_extension": ".py",
207 | "mimetype": "text/x-python",
208 | "name": "python",
209 | "nbconvert_exporter": "python",
210 | "pygments_lexer": "ipython3",
211 | "version": "3.8.12"
212 | }
213 | },
214 | "nbformat": 4,
215 | "nbformat_minor": 5
216 | }
217 |
--------------------------------------------------------------------------------
/sparrow/tests/generate_test_data/helicity_class_v2_default.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/generate_test_data/helicity_class_v2_default.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/coil_class_v2_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_class_v2_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/coil_class_v2_non_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_class_v2_non_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/coil_prob_v2_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_prob_v2_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/extended_class_v2_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_class_v2_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/extended_class_v2_non_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_class_v2_non_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/extended_prob_v2_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_prob_v2_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/helicity_class_v2_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_class_v2_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/helicity_class_v2_non_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_class_v2_non_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/helicity_prob_v2_default_test_seqs_100.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_prob_v2_default_test_seqs_100.pickle
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_asph.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_asph.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_asph_v2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_asph_v2.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_exponent.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_exponent.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_exponent_v2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_exponent_v2.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_prefactor.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_prefactor.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_prefactor_v2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_prefactor_v2.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_re.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_re_scaled.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_scaled.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_re_scaled_v2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_scaled_v2.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_re_v2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_v2.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_rg.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_rg_scaled.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_scaled.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_rg_scaled_v2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_scaled_v2.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_rg_v2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_v2.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_scd.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_scd.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_100_shd.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_shd.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_average_bivariate_inverse_distance_charge.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_bivariate_inverse_distance_charge.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_average_inverse_distance_ali.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_ali.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_average_inverse_distance_charge_neg.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_charge_neg.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_data/test_average_inverse_distance_charge_pos.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_charge_pos.npy
--------------------------------------------------------------------------------
/sparrow/tests/test_iwd.py:
--------------------------------------------------------------------------------
1 | from sparrow.patterning import iwd
2 | import os
3 | import protfasta
4 | import numpy as np
5 | from sparrow import Protein
6 |
7 | current_filepath = os.getcwd()
8 | onehundred_seqs = "{}/test_data/test_seqs_100.fasta".format(current_filepath)
9 |
10 | seqs = protfasta.read_fasta(onehundred_seqs)
11 |
12 |
13 | def test_average_bivariate_inverse_distance_charge():
14 |
15 | k2val = np.load('test_data/test_average_bivariate_inverse_distance_charge.npy', allow_pickle=True).item()
16 | for k in seqs:
17 | assert np.isclose(Protein(seqs[k]).compute_bivariate_iwd_charged_weighted(), k2val[k])
18 |
19 |
20 | def test_average_inverse_distance_charge_neg():
21 |
22 | k2val = np.load('test_data/test_average_inverse_distance_charge_neg.npy', allow_pickle=True).item()
23 | for k in seqs:
24 | assert np.isclose(Protein(seqs[k]).compute_iwd_charged_weighted('-'), k2val[k])
25 |
26 | def test_average_inverse_distance_charge_pos():
27 |
28 | k2val = np.load('test_data/test_average_inverse_distance_charge_pos.npy', allow_pickle=True).item()
29 | for k in seqs:
30 | assert np.isclose(Protein(seqs[k]).compute_iwd_charged_weighted('+'), k2val[k])
31 |
32 | def test_average_inverse_distance_ali():
33 |
34 | k2val = np.load('test_data/test_average_inverse_distance_ali.npy', allow_pickle=True).item()
35 | for k in seqs:
36 | assert np.isclose(Protein(seqs[k]).compute_iwd('ILVAM'), k2val[k])
37 |
38 |
39 |
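 40 | # Minimal usage sketch (illustrative, not part of the tests): compute_iwd
 41 | # reports the average inverse-weighted distance clustering for a chosen
 42 | # residue set; the toy sequence below is made up purely for demonstration.
 43 | if __name__ == "__main__":
 44 |     print(Protein('ILVAMGGGGGGGGGGILVAM').compute_iwd('ILVAM'))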
--------------------------------------------------------------------------------
/sparrow/tests/test_kappa.py:
--------------------------------------------------------------------------------
1 | # Import package, test suite, and other packages as needed
2 | import sparrow
3 | import pytest
4 | import sys
5 | import numpy as np
6 | from sparrow.protein import Protein
7 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS
8 | import random
9 |
10 |
11 |
12 | USE_LOCALCIDER = True
13 |
14 |
15 | def test_kappa():
16 |
17 | das = [
18 | 'EKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEK',
19 | 'EEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEK',
20 | 'KEKKKEKKEEKKEEKEKEKEKEEKKKEEKEKEKEKKKEEKEKEEKKEEEE',
21 | 'KEKEEKEKKKEEEEKEKKKKEEKEKEKEKEEKKEEKKKKEEKEEKEKEKE',
22 | 'KEKEKKEEKEKKEEEKKEKEKEKKKEEKKKEEKEEKKEEKKKEEKEEEKE',
23 | 'EEEKKEKKEEKEEKKEKKEKEEEKKKEKEEKKEEEKKKEKEEEEKKKKEK',
24 | 'EEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEK',
25 | 'KKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKE',
26 | 'EEKKEEEKEKEKEEEEEKKEKKEKKEKKKEEKEKEKKKEKKKKEKEEEKE',
27 | 'EKKKKKKEEKKKEEEEEKKKEEEKKKEKKEEKEKEEKEKKEKKEEKEEEE',
28 | 'EKEKKKKKEEEKKEKEEEEKEEEEKKKKKEKEEEKEEKKEEKEKKKEEKK',
29 | 'EKKEEEEEEKEKKEEEEKEKEKKEKEEKEKKEKKKEKKEEEKEKKKKEKK',
30 | 'KEKKKEKEKKEKKKEEEKKKEEEKEKKKEEKKEKKEKKEEEEEEEKEEKE',
31 | 'EKKEKEEKEEEEKKKKKEEKEKKEKKKKEKKKKKEEEEEEKEEKEKEKEE',
32 | 'KKEKKEKKKEKKEKKEEEKEKEKKEKKKKEKEKKEEEEEEEEKEEKKEEE',
33 | 'EKEKEEKKKEEKKKKEKKEKEEKKEKEKEKKEEEEEEEEEKEKKEKKKKE',
34 | 'EKEKKKKKKEKEKKKKEKEKKEKKEKEEEKEEKEKEKKEEKKEEEEEEEE',
35 | 'KEEKKEEEEEEEKEEKKKKKEKKKEKKEEEKKKEEKKKEEEEEEKKKKEK',
36 | 'EEEEEKKKKKEEEEEKKKKKEEEEEKKKKKEEEEEKKKKKEEEEEKKKKK',
37 | 'EEKEEEEEEKEEEKEEKKEEEKEKKEKKEKEEKKEKKKKKKKKKKKKEEE',
38 | 'EEEEEEEEEKEKKKKKEKEEKKKKKKEKKEKKKKEKKEEEEEEKEEEKKK',
39 | 'KEEEEKEEKEEKKKKEKEEKEKKKKKKKKKKKKEKKEEEEEEEEKEKEEE',
40 | 'EEEEEKEEEEEEEEEEEKEEKEKKKKKKEKKKKKKKEKEKKKKEKKEEKK',
41 | 'EEEEKEEEEEKEEEEEEEEEEEEKKKEEKKKKKEKKKKKKKEKKKKKKKK',
42 | 'EEEEEEEEEEEKEEEEKEEKEEKEKKKKKKKKKKKKKKKKKKEEKKEEKE',
43 | 'KEEEEEEEKEEKEEEEEEEEEKEEEEKEEKKKKKKKKKKKKKKKKKKKKE',
44 | 'KKEKKKEKKEEEEEEEEEEEEEEEEEEEEKEEKKKKKKKKKKKKKKKEKK',
45 | 'EKKKKKKKKKKKKKKKKKKKKKEEEEEEEEEEEEEEEEEEKKEEEEEKEK',
46 | 'KEEEEKEEEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKK',
47 | 'EEEEEEEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKKKK']
48 |
49 | das_kappa_vals = [0.000963782329781065,
50 | 0.006849987601594839,
51 | 0.02510380091732725,
52 | 0.023779919834168346,
53 | 0.014793830994527891,
54 | 0.030699929748093432,
55 | 0.055155094748869704,
56 | 0.055155094748869704,
57 | 0.06207283537900597,
58 | 0.09244645817707578,
59 | 0.08182457866549872,
60 | 0.08535584477384989,
61 | 0.09376754013641903,
62 | 0.12779464725771064,
63 | 0.13589023055307498,
64 | 0.14253932524913954,
65 | 0.17465693111603184,
66 | 0.16361063576296123,
67 | 0.2184643791753562,
68 | 0.2683678441326591,
69 | 0.2836833506008589,
70 | 0.3168464032629612,
71 | 0.35941633427624997,
72 | 0.45755189798526164,
73 | 0.5278595348152701,
74 | 0.5935761144891406,
75 | 0.6553235220661426,
76 | 0.7440558474562516,
77 | 0.8658988417475169,
78 | 1.0]
79 |
80 | for p in range(len(das)):
81 | assert np.isclose(das_kappa_vals[p], Protein(das[p]).kappa, atol=0.03)
82 |
83 | if USE_LOCALCIDER:
84 | from localcider.sequenceParameters import SequenceParameters
85 | nseqs = 100
86 | max_count = 100
87 | n_diff_res = 10
88 |
89 | res_set = VALID_AMINO_ACIDS.copy()
90 |
91 | for i in range(nseqs):
92 | random.shuffle(res_set)
93 | local_res = res_set[:n_diff_res]
94 | seq = ''
95 | for aa in local_res:
96 | seq = seq + aa*random.randint(1,max_count)
97 |
98 | seq = list(seq)
99 | random.shuffle(seq)
100 | seq = "".join(seq)
101 |
102 | P = Protein(seq)
103 |
104 | # skip sequences
105 | if P.fraction_negative == 0 or P.fraction_positive == 0:
106 | continue
107 |
108 | SO = SequenceParameters(seq)
109 | assert np.isclose(P.NCPR, SO.get_NCPR())
110 | assert np.isclose(P.FCR, SO.get_FCR())
111 |
112 |             # note: given the tolerance, this comparison can stochastically fail from time to time
113 | assert np.isclose(P.kappa, SO.get_kappa(), atol=0.03)
114 |
115 |
116 | def test_kappa_range():
117 |
118 | for i in range(100):
119 |
120 | Es = 'E'*random.randint(1,60)
121 | Ks = 'K'*random.randint(1,60)
122 | Gs = 'G'*random.randint(1,100)
123 |
124 | tmp = Es+Ks+Gs
125 | if len(tmp) < 7:
126 | continue
127 |
128 | tmp_list = list(tmp)
129 | random.shuffle(tmp_list)
130 | tmp = "".join(tmp_list)
131 |
132 | p = Protein(tmp)
133 | k = p.kappa
134 |
135 | assert k > 0
136 | assert k < 1
137 |
138 |
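139 | # Minimal illustration (not part of the tests): kappa tracks charge patterning,
140 | # running from near 0 for well-mixed charge up to 1.0 for fully segregated
141 | # charge blocks, matching the ordering of the das sequences above.
142 | if __name__ == "__main__":
143 |     print(Protein('EKEKEKEKEKEKEKEKEKEK').kappa)   # well-mixed, near 0
144 |     print(Protein('EEEEEEEEEEKKKKKKKKKK').kappa)   # fully segregated, near 1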
--------------------------------------------------------------------------------
/sparrow/tests/test_plugins.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from sparrow.protein import Protein
4 | from sparrow.sequence_analysis.community_plugins.contributed import MultiplicativeFCR
5 | from sparrow.sequence_analysis.plugins import BasePlugin
6 |
7 |
8 | @pytest.fixture
9 | def protein():
10 | sequence = "LLERYIPKHQKCLTSAQRSSIDPLDIEDVYQHKKPKFSSKSHIWHVYNENSNRQKLEHVKVNKGSKASLFINKEDVYEYYQKDPKNTKFGKSKHKQSTLDQIYSTGLRKGNLHNVKDPNTNVPKGIGRRKTQHKRTQVDDVDCNPRKILAVSPSRRINRLVTYQQHIPETHNDLPEELCEPSSLTLSSLRNGLDSSTEACSVSKEKHIQNLDLSDSQEVQCLELESVDQTEAVSFPGLLLHKEIKLPVVTTDKQPHTLQEQHHVLYKSHENSNLV"
11 | return Protein(sequence)
12 |
13 |
14 | def test_multiplicative_fcr_plugin(protein):
15 | plugin_manager = protein.plugin
16 | double_fcr_result = plugin_manager.MultiplicativeFCR()
17 | expected_result = 2.0 * protein.FCR
18 | assert pytest.approx(double_fcr_result, 0.000001) == expected_result
19 |
20 |
21 | def test_plugin_manager_cache(protein):
22 | plugin_manager = protein.plugin
23 | first_result = plugin_manager.MultiplicativeFCR()
24 | second_result = plugin_manager.MultiplicativeFCR()
25 | assert first_result == second_result
26 |
27 |
28 | def test_invalid_plugin(protein):
29 | plugin_manager = protein.plugin
30 | with pytest.raises(AttributeError):
31 | plugin_manager.NonExistentPlugin
32 |
33 |
34 | def test_multiple_plugins(protein):
35 | class TripleFCR(BasePlugin):
36 | def calculate(self, factor=3.0):
37 | return factor * self.protein.FCR
38 |
39 | class QuadrupleFCR(BasePlugin):
40 | def calculate(self, factor=4.0):
41 | return factor * self.protein.FCR
42 |
43 | plugin_manager = protein.plugin
44 | # plugin_manager._PluginManager__plugins is a dictionary that stores plugins.
45 | # we can add a new plugin to it by assigning a new key-value pair to it.
46 | plugin_manager._PluginManager__plugins["TripleFCR"] = TripleFCR(protein)
47 | plugin_manager._PluginManager__plugins["QuadrupleFCR"] = QuadrupleFCR(protein)
48 |
49 | # Testing TripleFCR plugin
50 | triple_fcr_result = plugin_manager.TripleFCR(factor=3.0)
51 | expected_triple_result = 3.0 * protein.FCR
52 | assert pytest.approx(triple_fcr_result, 0.000001) == expected_triple_result
53 |
54 | # Testing QuadrupleFCR plugin
55 | quadruple_fcr_result = plugin_manager.QuadrupleFCR(factor=4.0)
56 | expected_quadruple_result = 4.0 * protein.FCR
57 | assert pytest.approx(quadruple_fcr_result, 0.000001) == expected_quadruple_result
58 |
59 |
60 | def test_base_plugin_initialization(protein):
61 | class TestPlugin(BasePlugin):
62 | def calculate(self):
63 | return protein.FCR
64 |
65 | plugin = TestPlugin(protein)
66 | assert plugin.protein == protein
67 |
68 |
69 | def test_base_plugin_abstract_method(protein):
70 | with pytest.raises(TypeError):
71 | BasePlugin(protein)
72 |
--------------------------------------------------------------------------------
/sparrow/tests/test_polymeric.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_polymeric.py
--------------------------------------------------------------------------------
/sparrow/tests/test_predictor_disorder.py:
--------------------------------------------------------------------------------
1 | # Import package, test suite, and other packages as needed
2 | import sparrow
3 | import pytest
4 | import sys
5 | import numpy as np
6 | from sparrow.protein import Protein
7 |
8 | def test_protein_code_coverage():
9 |
10 | P = Protein('MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQ')
11 |
12 |
13 | # V2
14 | # assert np.isclose(np.mean(P.predictor.disorder()), 0.8636131147540983)
15 |
16 | assert np.isclose(np.mean(P.predictor.disorder()), 0.92875415)
17 |
--------------------------------------------------------------------------------
/sparrow/tests/test_protein.py:
--------------------------------------------------------------------------------
1 | """
2 | Unit and regression test for the sparrow package.
3 | """
4 |
5 | # Import package, test suite, and other packages as needed
6 | import random
7 | import sys
8 |
9 | import numpy as np
10 | import pytest
11 |
12 | import sparrow
13 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS
14 | from sparrow.protein import Protein
15 | from sparrow.sequence_analysis.elm import (
16 | ELM,
17 | compute_gained_elms,
18 | compute_lost_elms,
19 | compute_retained_elms,
20 | )
21 |
22 |
23 | def test_sparrow_imported():
24 | """Sample test, will always pass so long as import statement worked"""
25 | assert "sparrow" in sys.modules
26 |
27 |
28 | def test_protein_code_coverage():
29 |
30 |
31 | s = 'MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQ'
32 | # constructor
33 | P = Protein(s)
34 | assert len(P) == 61
35 |
36 | P = Protein(s, validate=True)
37 | assert len(P) == 61
38 |
39 | s_broken = 'MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSXYGQSSYSSYXQ'
40 | P = Protein(s_broken, validate=True)
41 | assert len(P) == 61
42 | assert s == P.sequence
43 |
44 |
45 |
46 | assert len(P.amino_acid_fractions) == 20
47 | assert P.FCR == 0.04918032786885246
48 | assert P.fraction_positive == 0.01639344262295082
49 | assert P.fraction_negative == 0.03278688524590164
50 | assert P.NCPR == -0.01639344262295082
51 | assert P.fraction_aromatic == 0.16393442622950818
52 | assert P.fraction_aliphatic == 0.06557377049180328
53 | assert P.fraction_polar == 0.6721311475409836
54 | assert P.fraction_proline == 0.04918032786885246
55 |
56 | # V2
57 | # assert np.mean(P.predictor.disorder()) == 0.8636131147540983
58 |
59 | assert np.isclose(np.mean(P.predictor.disorder()), 0.92875415)
60 | assert P.hydrophobicity == 3.052459016393442
61 | assert P.compute_residue_fractions(['P','E','K','R','D']) == 0.09836065573770492
62 |
63 | assert np.mean(P.linear_sequence_profile('FCR')) == 0.04918032786885246
64 | assert np.mean(P.linear_sequence_profile('NCPR')) == -0.02459016393442623
65 | assert np.mean(P.linear_sequence_profile('aromatic')) == 0.1680327868852459
66 | assert np.mean(P.linear_sequence_profile('aliphatic')) == 0.05737704918032787
67 | assert np.mean(P.linear_sequence_profile('polar')) == 0.6762295081967213
68 | assert np.mean(P.linear_sequence_profile('proline')) == 0.04918032786885246
69 | assert np.mean(P.linear_sequence_profile('positive')) == 0.012295081967213115
70 | assert np.mean(P.linear_sequence_profile('negative')) == 0.036885245901639344
71 | assert np.isclose(np.mean(P.linear_sequence_profile('hydrophobicity')),3.0450819672131146)
72 | assert np.mean(P.linear_composition_profile(['E','K'])) == 0.012295081967213115
73 |
74 | P = Protein("KRRARKRRARKRRARKRRAR")
75 | elms = P.elms
76 | func_sites = []
77 | elm_sequences = []
78 | start, end = [],[]
79 | for elm in elms:
80 | start.append(elm.start)
81 | end.append(elm.end)
82 | elm_sequences.append(elm.sequence)
83 | func_sites.append(elm.functional_site_name)
84 | func_sites = list(set(func_sites))
85 | for func_site in func_sites:
86 | assert func_site in ['di Arginine retention/retrieving signal',
87 | 'CendR Motif Binding to Neuropilin Receptors',
88 | 'NLS classical Nuclear Localization Signals',
89 | 'N-degron',
90 | 'NRD cleavage site',
91 | 'PCSK cleavage site']
92 | assert sorted(start) == sorted([1, 6, 11, 16, 4, 9, 14, 0, 5, 10, 15, 0, 5, 10, 15, 1, 11, 0, 16, 1, 6, 11, 16, 0, 3, 13, 4, 14, 1, 9])
93 | assert sorted(end) == sorted([4, 9, 14, 19, 9, 14, 19, 3, 8, 13, 18, 3, 8, 13, 18, 8, 18, 3, 20, 5, 10, 15, 20, 20, 9, 19, 10, 20, 9, 15])
94 | assert sorted(elm_sequences) == sorted(['RRA',
95 | 'RRA',
96 | 'RRA',
97 | 'RRA',
98 | 'RKRRA',
99 | 'RKRRA',
100 | 'RKRRA',
101 | 'KRR',
102 | 'KRR',
103 | 'KRR',
104 | 'KRR',
105 | 'KRR',
106 | 'KRR',
107 | 'KRR',
108 | 'KRR',
109 | 'RRARKRR',
110 | 'RRARKRR',
111 | 'KRR',
112 | 'RRAR',
113 | 'RRAR',
114 | 'RRAR',
115 | 'RRAR',
116 | 'RRAR',
117 | 'KRRARKRRARKRRARKRRAR',
118 | 'ARKRRA',
119 | 'ARKRRA',
120 | 'RKRRAR',
121 | 'RKRRAR',
122 | 'RRARKRRA',
123 | 'RKRRAR'])
124 |
125 | def test_elm_comparisons():
126 | wt = sparrow.Protein("MKKK")
127 | mut = sparrow.Protein("MRKK")
128 |
129 | wt_elms = wt.elms
130 | mut_elms = mut.elms
131 |
132 | assert wt.elms == {
133 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=3, sequence='KKK')
134 | }
135 | assert mut.elms == {
136 | ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK'),
137 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK')
138 | }
139 |
140 | assert wt.elms - mut.elms == set()
141 | assert wt.elms & mut.elms == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MKKK')}
142 |
143 | assert compute_lost_elms(wt,[2,"K"]) == set()
144 | assert compute_retained_elms(wt,"p.K1R") == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=3, sequence='RKK')}
145 | assert compute_gained_elms(wt,"p.K2R") == {ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK')}
146 |
147 | assert compute_retained_elms(mut,"p.M1K") == {ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK'),
148 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK')}
149 |
150 | assert compute_gained_elms(mut,"p.M1K") == {ELM(regex='KR.', functional_site_name='PCSK cleavage site', start=0, end=3, sequence='KRK'),
151 | ELM(regex='[KR]R.', functional_site_name='PCSK cleavage site', start=0, end=3, sequence='KRK')}
152 | assert compute_lost_elms(mut, "p.M1G") == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK')}
153 |
154 |
155 |
--------------------------------------------------------------------------------
/sparrow/tests/test_scd.py:
--------------------------------------------------------------------------------
1 | from sparrow.patterning import scd
2 | import os
3 | import protfasta
4 | import numpy as np
5 | from sparrow import Protein
 6 |
7 |
8 | current_filepath = os.getcwd()
9 | onehundred_seqs = "{}/test_data/test_seqs_100.fasta".format(current_filepath)
10 |
11 | seqs = protfasta.read_fasta(onehundred_seqs)
12 |
13 | def test_scd():
14 |
15 | k2val = np.load('test_data/test_100_scd.npy', allow_pickle=True).item()
16 | for k in seqs:
17 | s = seqs[k]
18 | cython_SCD = getattr(Protein(s),"SCD")
19 | no_cython_SCD = k2val[k]
20 | assert np.isclose(cython_SCD, no_cython_SCD)
21 |
22 | def test_shd():
23 | k2val = np.load('test_data/test_100_shd.npy', allow_pickle=True).item()
24 | for k in seqs:
25 | s = seqs[k]
26 | assert np.isclose(getattr(Protein(s),"SHD"), k2val[k])
27 |
28 |
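29 | # Minimal illustration (not part of the tests): SCD and SHD are exposed as
30 | # Protein properties; the alternating-charge sequence below is made up
31 | # purely for demonstration.
32 | if __name__ == "__main__":
33 |     demo = Protein('EKEKEKEKEKEKEKEK')
34 |     print(demo.SCD)   # sequence charge decoration
35 |     print(demo.SHD)   # sequence hydropathy decoration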
--------------------------------------------------------------------------------
/sparrow/tests/test_sparrow.py:
--------------------------------------------------------------------------------
1 | """
2 | Unit and regression test for the sparrow package.
3 | """
4 |
5 | # Import package, test suite, and other packages as needed
6 | import sparrow
7 | import pytest
8 | import sys
9 |
10 | def test_sparrow_imported():
11 | """Sample test, will always pass so long as import statement worked"""
12 | assert "sparrow" in sys.modules
13 |
--------------------------------------------------------------------------------
/sparrow/tests/test_sparrow_vs_localcider.py:
--------------------------------------------------------------------------------
1 | from localcider.sequenceParameters import SequenceParameters
2 | from sparrow import Protein
3 |
4 | from . import build_seq
5 |
6 | import numpy as np
7 |
8 | NSEQS=100
9 |
10 | def test_FCR():
11 |
12 | for i in range(NSEQS):
13 | s = build_seq()
14 | assert np.isclose(SequenceParameters(s).get_FCR(), Protein(s).FCR, atol=1e-8)
15 |
16 |
17 | def test_NCPR():
18 |
19 | for i in range(NSEQS):
20 | s = build_seq()
21 | assert np.isclose(SequenceParameters(s).get_NCPR(), Protein(s).NCPR, atol=1e-8)
22 |
23 |
24 | def test_fraction_neg_fraction_pos():
25 |
26 | for i in range(NSEQS):
27 | s = build_seq()
28 | assert np.isclose(SequenceParameters(s).get_countNeg()/len(s), Protein(s).fraction_negative, atol=1e-8)
29 | assert np.isclose(SequenceParameters(s).get_countPos()/len(s), Protein(s).fraction_positive, atol=1e-8)
30 |
31 | def test_hydrophobicity():
32 |
33 |     for i in range(NSEQS):
34 |         s = build_seq()
35 |         # sparrow reports Kyte-Doolittle hydrophobicity rescaled to [0, 9], so
36 |         # dividing by 9 recovers localcider's normalized Uversky hydropathy
37 |         assert np.isclose(SequenceParameters(s).get_uversky_hydropathy(), Protein(s).hydrophobicity/9, atol=1e-8)
38 |
39 |
--------------------------------------------------------------------------------
/sparrow/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tools/__init__.py
--------------------------------------------------------------------------------
/sparrow/tools/general_tools.py:
--------------------------------------------------------------------------------
1 | from sparrow.data import amino_acids
2 |
3 |
4 | def is_valid_protein_sequence(sequence):
5 | """
 6 |     Function that tests if a passed sequence contains non-standard amino acids.
7 |
8 | Parameters
9 | ----------------
10 | sequence : str
11 | Protein sequence
12 |
13 | Returns
14 | ---------------
15 | bool
16 | If sequences contains non-standard amino acids returns False, else returns
17 | True
18 | """
19 |
20 | for i in sequence:
21 | if i not in amino_acids.VALID_AMINO_ACIDS:
22 | return False
23 |
24 | return True
25 |
26 |
27 |
28 | def compare_sequence(s1, s2, verbose=False, ignore_gaps=False, return_positions=False):
29 | """
30 | Function that compares two sequences of the same length and returns
31 | either the set of positions where the sequences are different (indxed at 0) or
32 | the number of differences between them, depending on the status of the flag
33 | return_position. This function Will also print the differences if verbose is
34 | set to True.
35 |
36 | If ignore_gaps is set to True, will ignore gaps in the comparison (i.e.
37 | will ignore '-' characters in either sequence). This is useful when running
38 | analyses for aligned sequences.
39 |
40 |     WARNING: Sequences must have the same length - if the two passed sequences
41 |     differ in length then this function raises a ValueError.
42 |
43 | Parameters
44 | ----------------
45 | s1 : str
46 | First sequence to compare
47 |
48 | s2 : str
49 | Second sequence to compare
50 |
51 |     verbose : bool
52 |         If True, will print the differences between the two sequences. Default is False.
53 |
54 |     ignore_gaps : bool
55 |         If True, positions where either sequence contains a gap ('-') are skipped in the comparison. Default is False.
56 |
57 |     return_positions : bool
58 |         If True, returns the list of positions where the two sequences differ; if False, returns only the count. Default is False.
59 | Returns
60 | ---------------
61 |     int or list
62 |         Number of differences between the two sequences or, if return_positions is True, the list of differing positions (see the usage sketch at the end of this module).
63 |
64 | Raises
65 | ---------------
66 | ValueError
67 | If sequences are not the same length.
68 |
69 | """
70 |
71 | # first things first check if sequences are the same length and
72 | # freak out if not!
73 | if len(s1) != len(s2):
74 | raise ValueError("Sequences must have the same length")
75 |
76 | # define comparison function based on ignore_gaps
77 | if ignore_gaps:
78 | def _compare(p1,p2):
79 | if p1 == "-" or p2 == "-":
80 | return False
81 | elif p1 == p2:
82 | return False
83 | else:
84 | return True
85 | else:
86 | def _compare(p1,p2):
87 | if p1 == p2:
88 | return False
89 | else:
90 | return True
91 |
92 |
93 | # cycle through each position in the sequence
94 | positions = []
95 | for i in range(len(s1)):
96 | if _compare(s1[i],s2[i]):
97 | positions.append(i)
98 | if verbose:
99 | print(f"{i+1}: {s1[i]} vs. {s2[i]}")
100 |
101 |
102 | if return_positions:
103 | return positions
104 | else:
105 | return len(positions)
106 |
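107 | # Minimal usage sketch (illustrative, not part of the original module):
108 | # compare two aligned sequences while ignoring gap positions; the sequences
109 | # below are made up purely for demonstration.
110 | if __name__ == "__main__":
111 |     a = "MKV-QLLDE"
112 |     b = "MKVAQLLNE"
113 |     print(compare_sequence(a, b, ignore_gaps=True))                         # 1
114 |     print(compare_sequence(a, b, ignore_gaps=True, return_positions=True))  # [7]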
--------------------------------------------------------------------------------
/sparrow/tools/utilities.py:
--------------------------------------------------------------------------------
1 | from sparrow.sparrow_exceptions import SparrowException
2 |
3 | def validate_keyword_option(keyword, allowed_vals, keyword_name, error_message=None):
4 | """
5 | Helper function that checks a passed keyword is only one of a set of possible
6 | valid keywords
7 |
8 | Parameters
9 | -----------
10 | keyword : str
11 | The actual passed keyword value
12 |
13 | allowed_vals : list of str
14 | A list of possible keywords
15 |
16 | keyword_name : str
17 | the name of the keyword as the user would select it in the function call
18 |
19 |     error_message : str
20 |         Allows the user to pass a custom error message that is used for the raised exception instead of the default. Default is None.
21 |
22 |
23 | Returns
24 | --------
25 | None
26 |
27 |     No return value, but raises a sparrow_exceptions.SparrowException if ``keyword`` is not
28 |     found in the allowed_vals list
29 |
30 | """
31 |
32 |
33 |     if keyword not in allowed_vals:
34 |         if error_message is None:
35 |             raise SparrowException(f'Keyword {keyword_name} passed value [{keyword}], but this is not valid.\nMust be one of: {str(allowed_vals)}')
36 |         else:
37 |             raise SparrowException(error_message)
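38 |
39 | # Minimal usage sketch (illustrative): a valid keyword passes silently, while
40 | # an invalid one raises a SparrowException carrying the default (or custom)
41 | # message.
42 | if __name__ == "__main__":
43 |     validate_keyword_option('mean', ['mean', 'median'], 'mode')   # returns None
44 |     try:
45 |         validate_keyword_option('max', ['mean', 'median'], 'mode')
46 |     except SparrowException as e:
47 |         print(e)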
36 |
--------------------------------------------------------------------------------
/sparrow/visualize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/visualize/__init__.py
--------------------------------------------------------------------------------
/sparrow/visualize/sequence_visuals.py:
--------------------------------------------------------------------------------
1 | #from IPython import display
2 | from IPython.display import display
3 | from IPython.display import HTML
4 |
5 | from sparrow.data.amino_acids import AA_COLOR
6 | from sparrow.sparrow_exceptions import SparrowException
7 |
8 | def show_sequence(seq,
9 | blocksize=10,
10 | newline=50,
11 | fontsize=14,
12 | font_family='Courier',
13 | colors={},
14 | header=None,
15 | bold_positions=[],
16 | bold_residues=[],
17 | opaque_positions=[],
18 | return_raw_string=False,
19 | warnings = True):
20 |
21 | """
22 | Function that generates an HTML colored string that either renders in the browser or returns the
23 | html string. Contains various customizable components.
24 |
25 | Parameters
26 | -------------
27 |
28 | blocksize : int
29 |         Defines how big blocks of residues are. Blocks are equal to blocksize or the newline parameter, whichever is smaller.
30 | Default=10. If set to -1 uses length of the sequence.
31 |
32 | newline : int
33 | Defines how many residues are shown before a newline is printed. Default is 50. If set to -1 uses the length of
34 | the sequence.
35 |
36 | fontsize : int
37 | Fontsize used. Default is 14
38 |
39 | font_family : str
40 | Which font family (from HTML fonts) is used. Using a non-monospace font makes no sense as columns will be
41 | unaligned. Default is Courier.
42 |
43 | colors : dict
44 |         Dictionary that allows overriding of the default color scheme. Should be of format key-value as 'residue'-'color' where
45 | residue is a residue in the string and color is a valid HTML color (which can be a Hexcode, standard HTML color name).
46 | Note that this also lets you define colors for non-standard amino acids should these be useful. Default is an empty
47 | dictionary. Note also that the standard amino acid colorings are defined at sparrow.data.amino_acids.AA_COLOR
48 |
49 | header : str
50 |         If provided, this is a string that provides a FASTA-style header (a leading '>' is added automatically). Default is None.
51 |
52 | bold_positions : list
53 | List of positions (indexing from 1 onwards) which will be bolded. Useful for highlighting specific regions. Note that this
54 | defines individual residues so (for example) to bold residues 10 to 15 would require bold_positions=[10,11,12,13,14,15].
55 | Default is an empty list.
56 |
57 | bold_residues : list
58 | List of residue types that can be bolded. Useful for highlighting specific residue groups. Default is an empty list.
59 |
60 | opaque_positions : list
61 |         List of positions (indexing from 1 onwards) which will be grey and slightly opaque. Useful for highlighting specific regions.
62 |         Note that this defines individual residues so (for example) to mark residues 10 to 15 would require
63 |         opaque_positions=[10,11,12,13,14,15]. Default is an empty list.
64 |
65 | return_raw_string : bool
66 | If set to true, the function returns the actual raw HTML string, as opposed to an in-notebook rendering.
67 | Default is False
68 |
69 | warnings : bool
70 | If set to true, the function will print warnings if invalid amino acids are found. Default is True.
71 |
72 |
73 | Returns
74 | ----------
75 | None or str
76 | If return_raw_string is set to true then an HTML-compatible string is returned.
77 |
78 |
79 | Raises
80 | -------
81 | sparrow.sparrow_exceptions.SparrowException
82 | Raises a sparrow exception if invalid input is provided (within reason).
83 |
84 | """
85 |
86 | if blocksize > newline:
87 | newline = blocksize
88 |
89 | if blocksize == -1:
90 | blocksize = len(seq)
91 | newline = len(seq)
92 |
93 |
94 |     if blocksize < 1:
95 |         raise SparrowException('blocksize must be a positive integer (or -1 to use the full sequence length)')
96 |
97 |
98 |     colorString = '<p style="font-family: %s; font-size: %ipx">' % (font_family, fontsize)
99 |
100 | if header:
101 | colorString = colorString + ">%s "%(str(header))
102 |
103 |
104 | count = -1
105 | for residue in seq:
106 |
107 | count = count + 1
108 |
109 | if count > 0:
110 | if count % newline == 0:
111 | colorString = colorString + " "
112 |
113 | elif count % blocksize == 0:
114 | colorString = colorString + " "
115 |
116 |
117 | if residue not in AA_COLOR and residue not in colors:
118 | if warnings:
119 |                 print('Warning: found invalid amino acid (%s) at position %i' % (residue, count+1))
120 |             colorString = colorString + '<span style="color:%s">%s</span>' % ('black', residue)
121 | else:
122 |
123 |             # override with user-supplied palette if present
124 | if residue in colors:
125 | c = colors[residue]
126 |
127 |             # else fall back on the standard palette
128 | else:
129 | c = AA_COLOR[residue]
130 |
131 | # check if residue should be light grey and opaque
132 | # This overrides other coloring
133 | if count+1 in opaque_positions:
134 | c = '#a9a9a9'
135 |
136 | # if the residue type OR residue position is to be bolded...
137 |             if residue in bold_residues or (count+1) in bold_positions:
138 |                 colorString = colorString + '<span style="color:%s"><b>%s</b></span>' % (c, residue)
139 |             else:
140 |                 colorString = colorString + '<span style="color:%s">%s</span>' % (c, residue)
141 |
142 |
143 |
144 | colorString = colorString +" "
145 |
146 | if return_raw_string:
147 | return colorString
148 | else:
149 | display(HTML(colorString))
150 |
151 |
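152 | # Minimal usage sketch (illustrative, not part of the original module): build
153 | # the raw HTML string for a short made-up sequence, with a custom color for X
154 | # and residues 5-8 bolded; return_raw_string=True avoids the IPython display
155 | # call outside a notebook.
156 | if __name__ == "__main__":
157 |     html = show_sequence('MKASNDYTQQXTQSY',
158 |                          header='demo',
159 |                          colors={'X': 'purple'},
160 |                          bold_positions=[5, 6, 7, 8],
161 |                          return_raw_string=True)
162 |     print(html)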
--------------------------------------------------------------------------------