├── .codecov.yml
├── .github
│   ├── CONTRIBUTING.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       └── CI.yaml
├── .gitignore
├── .lgtm.yml
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── devtools
│   ├── README.md
│   ├── conda-envs
│   │   └── test_env.yaml
│   ├── legacy-miniconda-setup
│   │   └── before_install.sh
│   └── scripts
│       └── create_conda_env.py
├── docs
│   ├── Makefile
│   ├── README.md
│   ├── _static
│   │   └── README.md
│   ├── _templates
│   │   └── README.md
│   ├── api.rst
│   ├── conf.py
│   ├── getting_started.rst
│   ├── index.rst
│   ├── make.bat
│   ├── predictors.rst
│   ├── requirements.txt
│   └── requirements.yaml
├── examples
│   ├── protein_example_1.py
│   └── sparrow_walk_through.ipynb
├── pyproject.toml
├── readthedocs.yml
├── setup.cfg
├── setup.py
└── sparrow
    ├── __init__.py
    ├── calculate_parameters.py
    ├── data
    │   ├── README.md
    │   ├── __init__.py
    │   ├── amino_acids.py
    │   ├── configs.py
    │   ├── elm_classes.tsv
    │   ├── look_and_say.dat
    │   └── networks
    │       ├── asphericity
    │       │   ├── README
    │       │   ├── asphericity_network_v1.pt
    │       │   └── asphericity_network_v2.pt
    │       ├── dssp
    │       │   ├── dssp_predictor_network_v1.pt
    │       │   └── dssp_predictor_network_v2.pt
    │       ├── mitochondrial_targeting
    │       │   └── mitochondrial_targeting_predictor_network_v1.pt
    │       ├── nuclear_export_signal
    │       │   └── nes_predictor_network_v1.pt
    │       ├── nuclear_import_signal
    │       │   └── nls_predictor_network_v1.pt
    │       ├── phosphorylation
    │       │   ├── ser_phosphorylation_predictor_network_v1.pt
    │       │   ├── thr_phosphorylation_predictor_network_v1.pt
    │       │   └── tyr_phosphorylation_predictor_network_v1.pt
    │       ├── prefactor
    │       │   ├── README
    │       │   ├── prefactor_network_v1.pt
    │       │   └── prefactor_network_v2.pt
    │       ├── pscore
    │       │   ├── pscore_predictor_network_v2.pt
    │       │   ├── pscore_predictor_network_v3.pt
    │       │   └── pscore_predictor_network_v4.pt
    │       ├── re
    │       │   ├── README
    │       │   ├── re_network_v1.pt
    │       │   └── re_network_v2.pt
    │       ├── rg
    │       │   ├── README
    │       │   ├── rg_network_v1.pt
    │       │   └── rg_network_v2.pt
    │       ├── scaled_re
    │       │   ├── README
    │       │   ├── scaled_re_network_v1.pt
    │       │   └── scaled_re_network_v2.pt
    │       ├── scaled_rg
    │       │   ├── README
    │       │   ├── scaled_rg_network_v1.pt
    │       │   └── scaled_rg_network_v2.pt
    │       ├── scaling_exponent
    │       │   ├── README
    │       │   ├── scaling_exponent_network_v1.5.pt
    │       │   ├── scaling_exponent_network_v1.pt
    │       │   └── scaling_exponent_network_v2.pt
    │       ├── transactivation_domains
    │       │   └── tad_predictor_network_v1.pt
    │       └── transmembrane
    │           ├── transmembrane_predictor_network_v1.pt
    │           └── transmembrane_predictor_network_v4.pt
    ├── patterning
    │   ├── __init__.py
    │   ├── iwd.pyx
    │   ├── kappa.pyx
    │   ├── patterning.pyx
    │   └── scd.pyx
    ├── polymer
    │   ├── __init__.py
    │   └── scaling_parameters.py
    ├── predictors
    │   ├── __init__.py
    │   ├── asphericity
    │   │   ├── __init__.py
    │   │   └── asphericity_predictor.py
    │   ├── batch_predict.py
    │   ├── dssp
    │   │   ├── __init__.py
    │   │   └── dssp_predictor.py
    │   ├── e2e
    │   │   ├── __init__.py
    │   │   └── end_to_end_distance_predictor.py
    │   ├── mitochondrial_targeting
    │   │   ├── __init__.py
    │   │   └── mitochondrial_targeting_predictor.py
    │   ├── nes
    │   │   ├── __init__.py
    │   │   └── nuclear_export_signal_predictor.py
    │   ├── nls
    │   │   ├── __init__.py
    │   │   └── nuclear_import_signal_predictor.py
    │   ├── phosphorylation
    │   │   ├── __init__.py
    │   │   ├── phospho_predictor_utils.py
    │   │   ├── ser_phosphorylation_predictor.py
    │   │   ├── thr_phosphorylation_predictor.py
    │   │   └── tyr_phosphorylation_predictor.py
    │   ├── predictor_template.pyXX
    │   ├── prefactor
    │   │   ├── __init__.py
    │   │   └── prefactor_predictor.py
    │   ├── pscore
    │   │   ├── __init__.py
    │   │   └── pscore_predictor.py
    │   ├── rg
    │   │   ├── __init__.py
    │   │   └── radius_of_gyration_predictor.py
    │   ├── scaled_re
    │   │   ├── __init__.py
    │   │   └── scaled_end_to_end_distance_predictor.py
    │   ├── scaled_rg
    │   │   ├── __init__.py
    │   │   └── scaled_radius_of_gyration_predictor.py
    │   ├── scaling_exponent
    │   │   ├── __init__.py
    │   │   └── scaling_exponent_predictor.py
    │   ├── tad
    │   │   ├── __init__.py
    │   │   └── transactivation_domain_predictor.py
    │   └── transmembrane
    │       ├── __init__.py
    │       └── transmembrane_predictor.py
    ├── protein.py
    ├── sequence_analysis
    │   ├── __init__.py
    │   ├── alignment.py
    │   ├── community_plugins
    │   │   └── contributed.py
    │   ├── elm.py
    │   ├── phospho_isoforms.py
    │   ├── physical_properties.py
    │   ├── plugins.py
    │   └── sequence_complexity.py
    ├── sparrow_exceptions.py
    ├── tests
    │   ├── __init__.py
    │   ├── compute_test_data.ipynb
    │   ├── generate_test_data
    │   │   ├── generate_dssp_data.ipynb
    │   │   └── helicity_class_v2_default.pickle
    │   ├── test_albatross.py
    │   ├── test_data
    │   │   ├── coil_class_v2_default_test_seqs_100.pickle
    │   │   ├── coil_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── coil_prob_v2_default_test_seqs_100.pickle
    │   │   ├── extended_class_v2_default_test_seqs_100.pickle
    │   │   ├── extended_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── extended_prob_v2_default_test_seqs_100.pickle
    │   │   ├── helicity_class_v2_default_test_seqs_100.pickle
    │   │   ├── helicity_class_v2_non_default_test_seqs_100.pickle
    │   │   ├── helicity_prob_v2_default_test_seqs_100.pickle
    │   │   ├── test_100_asph.npy
    │   │   ├── test_100_asph_v2.npy
    │   │   ├── test_100_exponent.npy
    │   │   ├── test_100_exponent_v2.npy
    │   │   ├── test_100_prefactor.npy
    │   │   ├── test_100_prefactor_v2.npy
    │   │   ├── test_100_re.npy
    │   │   ├── test_100_re_scaled.npy
    │   │   ├── test_100_re_scaled_v2.npy
    │   │   ├── test_100_re_v2.npy
    │   │   ├── test_100_rg.npy
    │   │   ├── test_100_rg_scaled.npy
    │   │   ├── test_100_rg_scaled_v2.npy
    │   │   ├── test_100_rg_v2.npy
    │   │   ├── test_100_scd.npy
    │   │   ├── test_100_shd.npy
    │   │   ├── test_average_bivariate_inverse_distance_charge.npy
    │   │   ├── test_average_inverse_distance_ali.npy
    │   │   ├── test_average_inverse_distance_charge_neg.npy
    │   │   ├── test_average_inverse_distance_charge_pos.npy
    │   │   └── test_seqs_100.fasta
    │   ├── test_iwd.py
    │   ├── test_kappa.py
    │   ├── test_plugins.py
    │   ├── test_polymeric.py
    │   ├── test_predictor_disorder.py
    │   ├── test_predictor_dssp.py
    │   ├── test_protein.py
    │   ├── test_scd.py
    │   ├── test_sparrow.py
    │   └── test_sparrow_vs_localcider.py
    ├── tools
    │   ├── __init__.py
    │   ├── general_tools.py
    │   ├── io.py
    │   ├── track_tools.py
    │   └── utilities.py
    └── visualize
        ├── __init__.py
        └── sequence_visuals.py

/.codecov.yml:
--------------------------------------------------------------------------------
# Codecov configuration to make it a bit less noisy
coverage:
  status:
    patch: false
    project:
      default:
        threshold: 50%
comment:
  layout: "header"
  require_changes: false
  branches: null
  behavior: default
  flags: null
  paths: null
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# How to contribute

We welcome contributions from external contributors, and this document
describes how to merge code changes into sparrow.

## Getting Started

* Make sure you have a [GitHub account](https://github.com/signup/free).
* [Fork](https://help.github.com/articles/fork-a-repo/) this repository on GitHub.
* On your local machine,
  [clone](https://help.github.com/articles/cloning-a-repository/) your fork of
  the repository.

## Making Changes

* Add some really awesome code to your local fork. It's usually a [good
  idea](http://blog.jasonmeridth.com/posts/do-not-issue-pull-requests-from-your-master-branch/)
  to make changes on a
  [branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/)
  with the branch name relating to the feature you are going to add.
* When you are ready for others to examine and comment on your new feature,
  navigate to your fork of sparrow on GitHub and open a [pull
  request](https://help.github.com/articles/using-pull-requests/) (PR). Note that
  after you launch a PR from one of your fork's branches, all
  subsequent commits to that branch will be added to the open pull request
  automatically. Each commit added to the PR will be validated for
  mergeability, compilation, and test-suite compliance; the results of these tests
  will be visible on the PR page.
* If you're providing a new feature, you must add test cases and documentation.
* When the code is ready to go, make sure you run the test suite using pytest.
* When you're ready to be considered for merging, check the "Ready to go"
  box on the PR page to let the sparrow devs know that the changes are complete.
  The code will not be merged until this box is checked, the continuous
  integration returns checkmarks,
  and multiple core developers give "Approved" reviews.

# Additional Resources

* [General GitHub documentation](https://help.github.com/)
* [PR best practices](http://codeinthehole.com/writing/pull-requests-and-other-good-practices-for-teams-using-github/)
* [A guide to contributing to software packages](http://www.contribution-guide.org)
* [Thinkful PR example](http://www.thinkful.com/learn/github-pull-request-tutorial/#Time-to-Submit-Your-First-PR)
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
## Description
Provide a brief description of the PR's purpose here.

## Todos
Notable points that this PR has either accomplished or will accomplish.
- [ ] TODO 1

## Questions
- [ ] Question1

## Status
- [ ] Ready to go
--------------------------------------------------------------------------------
/.github/workflows/CI.yaml:
--------------------------------------------------------------------------------
name: CI

on:
  # GitHub has started calling new repo's first branch "main" https://github.com/github/renaming
  # Existing codes likely still have "master" as the primary branch
  # Both are tracked here to keep legacy and new codes working
  push:
    branches:
      - "master"
      - "main"
  pull_request:
    branches:
      - "master"
      - "main"
  schedule:
    # Nightly tests run on master by default:
    #   Scheduled workflows run on the latest commit on the default or base branch.
    #   (from https://help.github.com/en/actions/reference/events-that-trigger-workflows#scheduled-events-schedule)
    - cron: "0 0 * * *"

jobs:
  test:
    name: Test on ${{ matrix.os }}, Python ${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [macOS-latest, ubuntu-latest, windows-latest]
        python-version: [3.7, 3.8, 3.9]

    steps:
      - uses: actions/checkout@v1

      - name: Additional info about the build
        shell: bash
        run: |
          uname -a
          df -h
          ulimit -a


      # More info on options: https://github.com/conda-incubator/setup-miniconda
      - uses: conda-incubator/setup-miniconda@v2
        with:
          python-version: ${{ matrix.python-version }}
          environment-file: devtools/conda-envs/test_env.yaml

          channels: conda-forge,defaults

          activate-environment: test
          auto-update-conda: false
          auto-activate-base: false
          show-channel-urls: true

      - name: Install package

        # conda setup requires this special shell
        shell: bash -l {0}
        run: |
          python -m pip install . --no-deps
          conda list


      - name: Run tests

        # conda setup requires this special shell
        shell: bash -l {0}

        run: |
          pytest -v --cov=sparrow --cov-report=xml --color=yes sparrow/tests/

      - name: CodeCov
        uses: codecov/codecov-action@v1
        with:
          file: ./coverage.xml
          flags: unittests
          name: codecov-${{ matrix.os }}-py${{ matrix.python-version }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
sparrow/patterning.html
.DS_Store
sparrow/_version.py

# C extensions
*.so
*~
*.c
\#*
\.#*

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
.pytest_cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# profraw files from LLVM? Unclear exactly what triggers this
# There are reports this comes from LLVM profiling, but also Xcode 9.
*profraw

# pytorch weights
# *pt
--------------------------------------------------------------------------------
/.lgtm.yml:
--------------------------------------------------------------------------------
# Configure LGTM for this package

extraction:
  python:  # Configure Python
    python_setup:  # Configure the setup
      version: 3  # Specify Version 3
path_classifiers:
  library:
    - devtools/*
  generated:
    - sparrow/_version.py
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
# File: .readthedocs.yaml

version: 2

# Specify the Python version and requirements file
python:
  install:
    - requirements: docs/requirements.txt

# Use the "readthedocs" environment to ensure all dependencies are installed before building
build:
  os: ubuntu-20.04
  tools:
    python: "3.9"

# Sphinx configuration
sphinx:
  configuration: docs/conf.py
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age,
body size, disability, ethnicity, gender identity and expression, level of
experience, nationality, personal appearance, race, religion, or sexual
identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

Moreover, project maintainers will strive to offer feedback and advice to
ensure quality and consistency of contributions to the code. Contributions
from outside the group of project maintainers are strongly welcomed but the
final decision as to whether commits are merged into the codebase rests with
the team of project maintainers.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an
appointed representative at an online or offline event. Representation of a
project may be further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at 'alex.holehouse@wustl.edu'. The project team will
review and investigate all complaints, and will respond in a way that it deems
appropriate to the circumstances. The project team is obligated to maintain
confidentiality with regard to the reporter of an incident. Further details of
specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 1.4, available at
[http://contributor-covenant.org/version/1/4][version]

[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

MIT License

Copyright (c) 2023 Jeffrey Lotthammer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include MANIFEST.in
include CODE_OF_CONDUCT.md

graft sparrow
global-exclude *.py[cod] __pycache__ *.so

recursive-include sparrow/data *

--------------------------------------------------------------------------------
/devtools/README.md:
--------------------------------------------------------------------------------
# Development, testing, and deployment tools

This directory contains a collection of tools for running Continuous Integration (CI) tests,
conda installation, and other development tools not directly related to the coding process.


## Manifest

### Continuous Integration

You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and
Windows testing if you only plan to deploy on specific platforms. These are just to help you get started.

The items in this directory have been left for legacy purposes since the change to GitHub Actions;
they will likely be removed in a future version.

* `legacy-miniconda-setup`: A preserved copy of a helper directory which made Linux and OSX based testing through [Travis-CI](https://about.travis-ci.com/) simpler
  * `before_install.sh`: Pip/Miniconda pre-package installation script for Travis. No longer needed thanks to
    [GitHub Actions](https://docs.github.com/en/free-pro-team@latest/actions) and the [conda-incubator/setup-miniconda Action](https://github.com/conda-incubator/setup-miniconda)

### Conda Environment:

This directory contains the files to set up the Conda environment for testing purposes

* `conda-envs`: directory containing the YAML file(s) which fully describe Conda environments, their dependencies, and those dependencies' provenances
  * `test_env.yaml`: Simple test environment file with base dependencies. Channels are not specified here and therefore respect global Conda configuration

### Additional Scripts:

This directory contains OS-agnostic helper scripts which don't fall in any of the previous categories
* `scripts`
  * `create_conda_env.py`: Helper program for spinning up new conda environments based on a starter file, with Python Version and Env. Name command-line options


## How to contribute changes
- Clone the repository if you have write access to the main repo; fork the repository if you are a collaborator.
- Make a new branch with `git checkout -b {your branch name}`
- Make changes and test your code
- Ensure that the test environment dependencies (`conda-envs`) line up with the build and deploy dependencies (`conda-recipe/meta.yaml`)
- Push the branch to the repo (either the main or your fork) with `git push -u origin {your branch name}`
  * Note that `origin` is the default name assigned to the remote; yours may be different
- Make a PR on GitHub with your changes
- We'll review the changes and get your code into the repo after lively discussion!


## Checklist for updates
- [ ] Make sure there is an/are issue(s) opened for your specific update
- [ ] Create the PR, referencing the issue
- [ ] Debug the PR as needed until tests pass
- [ ] Tag the final, debugged version
  * `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags`
- [ ] Get the PR merged in
--------------------------------------------------------------------------------
/devtools/conda-envs/test_env.yaml:
--------------------------------------------------------------------------------
name: test
channels:

  - conda-forge

  - defaults
dependencies:
  # Base depends
  - python
  - pip

  # Testing
  - pytest
  - pytest-cov
  - codecov

  # Pip-only installs
  #- pip:
  #  - codecov

--------------------------------------------------------------------------------
/devtools/legacy-miniconda-setup/before_install.sh:
--------------------------------------------------------------------------------
# Temporarily change directory to $HOME to install software
pushd .
cd $HOME
# Make sure some level of pip is installed
python -m ensurepip

# Install Miniconda
if [ "$TRAVIS_OS_NAME" == "osx" ]; then
    # Make OSX md5 mimic md5sum from linux, alias does not work
    md5sum () {
        command md5 -r "$@"
    }
    MINICONDA=Miniconda3-latest-MacOSX-x86_64.sh
else
    MINICONDA=Miniconda3-latest-Linux-x86_64.sh
fi
MINICONDA_HOME=$HOME/miniconda
MINICONDA_MD5=$(wget -qO- https://repo.anaconda.com/miniconda/ | grep -A3 $MINICONDA | sed -n '4p' | sed -n 's/ *<td>\(.*\)<\/td> */\1/p')
wget -q https://repo.anaconda.com/miniconda/$MINICONDA
if [[ $MINICONDA_MD5 != $(md5sum $MINICONDA | cut -d ' ' -f 1) ]]; then
    echo "Miniconda MD5 mismatch"
    exit 1
fi
bash $MINICONDA -b -p $MINICONDA_HOME

# Configure miniconda
export PIP_ARGS="-U"
# New to conda >=4.4
echo ". $MINICONDA_HOME/etc/profile.d/conda.sh" >> ~/.bashrc  # Source the profile.d file
echo "conda activate" >> ~/.bashrc  # Activate conda
source ~/.bashrc  # source file to get new commands
#export PATH=$MINICONDA_HOME/bin:$PATH  # Old way, should not be needed anymore

conda config --add channels conda-forge

conda config --set always_yes yes
conda install conda conda-build jinja2 anaconda-client
conda update --quiet --all

# Restore original directory
popd
--------------------------------------------------------------------------------
/devtools/scripts/create_conda_env.py:
--------------------------------------------------------------------------------
import argparse
import os
import re
import glob
import shutil
import subprocess as sp
from tempfile import TemporaryDirectory
from contextlib import contextmanager
# YAML imports
try:
    import yaml  # PyYAML
    loader = yaml.load
except ImportError:
    try:
        import ruamel_yaml as yaml  # Ruamel YAML
    except ImportError:
        try:
            # Load Ruamel YAML from the base conda environment
            from importlib import util as import_util
            CONDA_BIN = os.path.dirname(os.environ['CONDA_EXE'])
            ruamel_yaml_path = glob.glob(os.path.join(CONDA_BIN, '..',
                                                      'lib', 'python*.*', 'site-packages',
                                                      'ruamel_yaml', '__init__.py'))[0]
            # Based on importlib example, but only needs to load_module since it's the whole package, not just
            # a module
            spec = import_util.spec_from_file_location('ruamel_yaml', ruamel_yaml_path)
            yaml = spec.loader.load_module()
        except (KeyError, ImportError, IndexError):
            raise ImportError("No YAML parser could be found in this or the conda environment. "
                              "Could not find PyYAML or Ruamel YAML in the current environment, "
                              "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. "
                              "Environment not created!")
    loader = yaml.YAML(typ="safe").load  # typ="safe" avoids odd typing on output


@contextmanager
def temp_cd():
    """Temporary CD Helper"""
    cwd = os.getcwd()
    with TemporaryDirectory() as td:
        try:
            os.chdir(td)
            yield
        finally:
            os.chdir(cwd)


# Args
parser = argparse.ArgumentParser(description='Creates a conda environment from file for a given Python version.')
parser.add_argument('-n', '--name', type=str,
                    help='The name of the created Python environment')
parser.add_argument('-p', '--python', type=str,
                    help='The version of the created Python environment')
parser.add_argument('conda_file',
                    help='The file for the created Python environment')

args = parser.parse_args()

# Open the base file
with open(args.conda_file, "r") as handle:
    yaml_script = loader(handle.read())

python_replacement_string = "python {}*".format(args.python)

try:
    for dep_index, dep_value in enumerate(yaml_script['dependencies']):
        if re.match('python([ ><=*]+[0-9.*]*)?$', dep_value):  # Match explicitly 'python' and its formats
            yaml_script['dependencies'].pop(dep_index)
            break  # Making the assumption there is only one Python entry, also avoids need to enumerate in reverse
except (KeyError, TypeError):
    # Case of no dependencies key, or dependencies: None
    yaml_script['dependencies'] = []
finally:
    # Ensure the python version is added in. Even if the code does not need it, we assume the env does
    yaml_script['dependencies'].insert(0, python_replacement_string)

# Figure out conda path
if "CONDA_EXE" in os.environ:
    conda_path = os.environ["CONDA_EXE"]
else:
    conda_path = shutil.which("conda")
if conda_path is None:
    raise RuntimeError("Could not find a conda binary in CONDA_EXE variable or in executable search path")

print("CONDA ENV NAME  {}".format(args.name))
print("PYTHON VERSION  {}".format(args.python))
print("CONDA FILE NAME {}".format(args.conda_file))
print("CONDA PATH      {}".format(conda_path))

# Write to a temp directory which will always be cleaned up
with temp_cd():
    temp_file_name = "temp_script.yaml"
    with open(temp_file_name, 'w') as f:
        f.write(yaml.dump(yaml_script))
    sp.call("{} env create -n {} -f {}".format(conda_path, args.name, temp_file_name), shell=True)
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = sparrow
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
# Compiling sparrow's Documentation

The docs for this project are built with [Sphinx](http://www.sphinx-doc.org/en/master/).
To compile the docs, first ensure that Sphinx and the ReadTheDocs theme are installed.


```bash
conda install sphinx sphinx_rtd_theme
```


Once installed, you can use the `Makefile` in this directory to compile static HTML pages by
```bash
make html
```

The compiled docs will be in the `_build` directory and can be viewed by opening `index.html` (which may itself
be inside a directory called `html/` depending on what version of Sphinx is installed).


A configuration file for [Read The Docs](https://readthedocs.org/) (`readthedocs.yml`) is included in the top level of the repository. To use Read the Docs to host your documentation, go to https://readthedocs.org/ and connect this repository. You may need to change your default branch to `main` under Advanced Settings for the project.

If you would like to use Read The Docs with `autodoc` (included automatically) and your package has dependencies, you will need to include those dependencies in your documentation yaml file (`docs/requirements.yaml`).

--------------------------------------------------------------------------------
/docs/_static/README.md:
--------------------------------------------------------------------------------
# Static Doc Directory

Add any paths that contain custom static files (such as style sheets) here,
relative to the `conf.py` file's directory.
They are copied after the builtin static files,
so a file named "default.css" will overwrite the builtin "default.css".

The path to this folder is set in the Sphinx `conf.py` file in the line:
```python
html_static_path = ['_static']
```

## Examples of files to add to this directory
* Custom Cascading Style Sheets
* Custom JavaScript code
* Static logo images
--------------------------------------------------------------------------------
/docs/_templates/README.md:
--------------------------------------------------------------------------------
# Templates Doc Directory

Add any paths that contain templates here, relative to
the `conf.py` file's directory.
They are copied after the builtin template files,
so a file named "page.html" will overwrite the builtin "page.html".

The path to this folder is set in the Sphinx `conf.py` file in the line:
```python
templates_path = ['_templates']
```

## Examples of files to add to this directory
* HTML extensions of stock pages like `page.html` or `layout.html`
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
API Documentation
=================

.. autosummary::
   :toctree: autosummary

   sparrow.canvas
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/stable/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

# In case the project was not installed
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

import sparrow


# -- Project information -----------------------------------------------------

project = 'sparrow'
copyright = ("2020, Alex Holehouse. Project structure based on the "
             "Computational Molecular Science Python Cookiecutter version 1.5")
author = 'Alex Holehouse'

# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = ''


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autosummary',
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx.ext.viewcode',
    'sphinx.ext.napoleon',
    'sphinx.ext.intersphinx',
    'sphinx.ext.extlinks',
]

autosummary_generate = True
napoleon_google_docstring = False
napoleon_use_param = False
napoleon_use_ivar = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'default'


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself.  Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'sparrowdoc'


# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'sparrow.tex', 'sparrow Documentation',
     'sparrow', 'manual'),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'sparrow', 'sparrow Documentation',
     [author], 1)
]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'sparrow', 'sparrow Documentation',
     author, 'sparrow', 'Next generation package for sequence parameter calculation',
     'Miscellaneous'),
]


# -- Extension configuration -------------------------------------------------
--------------------------------------------------------------------------------
/docs/getting_started.rst:
--------------------------------------------------------------------------------
Getting Started
===============

This page details how to get started with sparrow.
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
.. sparrow documentation master file, created by
   sphinx-quickstart on Thu Mar 15 13:55:56 2018.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to sparrow's documentation!
=========================================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   getting_started
   api
   predictors



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=sparrow

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd
--------------------------------------------------------------------------------
/docs/predictors.rst:
--------------------------------------------------------------------------------
Predictors
=================

sparrow implements a set of different sequence-based predictors in a modular, extendable way that enables additional predictors to be easily added.


Creating new predictors with PARROT
--------------------------------------
The guide below assumes you have cloned the git repository of sparrow, created a new branch to add your new predictor to, and have switched into that branch to begin work. As a reminder, when adding new features in Git, the general workflow is:

1. Clone the current up-to-date version
2. Create a new branch (this is a separate version where you can work in peace, but if new features are added to the main branch you can update your branch as you go)
3. Add in your snazzy new feature
4. Once complete, make a pull request to merge your branch back into the main branch.

This guide assumes these ideas are clear, and specifically provides insight into the details of implementing a new predictor in sparrow, focusing here on using PARROT to train that predictor.


**Step 1: Train a predictor with PARROT**

The first step in adding a new PARROT-based predictor is to use PARROT to train your model. The details of how one does this go beyond the scope of this documentation, but once trained you should be left with a Torch parameter file (a ``.pt`` file). This is the file we're going to use with SPARROW to add our custom predictor. Let's call this parameter file ``new_predictor.pt`` to make this concrete.

Note that the PARROT network should be trained in ``residues`` mode - i.e. we need to receive one value per residue.


**Step 2: Copy the parameter file into SPARROW**

Next we take ``new_predictor.pt`` and we're going to copy it into sparrow. Specifically, this trained network should be placed under::

    sparrow/data/networks/predictor

and MUST follow the naming convention ``<predictor_name>_network_v<version>.pt``. Note there that:

* ``<predictor_name>`` should be a single word, or words connected by underscores, all lower case, that we will use as the function name to call the predictor. For example, *disorder*, *dssp* or *transmembrane* are good examples. Keep this simple, but it should be clear and unambiguous.
* ``<version>`` here is the specific version of this network. It is possible that your network may be retrained later, and as such we want to enable future sparrow users to select specific network versions, although of course the predictors should default to the most recent version. This ability to select specific network versions is built into the standard predictor template code.

As an example, our transmembrane predictor has the format::

    transmembrane_predictor_network_v4.pt


**Step 3: Build a predictor class which performs the prediction**

The next step is to build a stand-alone predictor class which reads in this network file and enables the return of the per-residue prediction implemented therein.
This file should be created in a new package (i.e. a directory with an ``__init__.py``) in the::

    sparrow/predictors

directory, and this file should be called ``<predictor_name>_predictor.py``.

As a specific example, our transmembrane predictor is implemented in::

    sparrow/predictors/transmembrane

and within this directory there are two files::

    __init__.py                 # this is needed so we can import the predictor
    transmembrane_predictor.py  # this is where the predictor is implemented

The reason to make a separate package (directory) for every predictor is that if someone has a non-PARROT-based predictor they want to incorporate into sparrow, (1) this is absolutely welcome and (2) we want to provide a consistent file ecosystem where they have a directory in which to implement as much/little additional code as they want. As such, the ``__init__.py`` and ``<predictor_name>_predictor.py`` are the **minimum** files needed, but you are free to add anything else as well.

``__init__.py`` should probably just be empty - it's what tells Python that this directory is a package.

``<predictor_name>_predictor.py`` should NOT be empty, but should be based on the template file found under ``sparrow/predictors/predictor_template.py``. The template is RELATIVELY simple, but provides code for reading in a PARROT-trained network and performing a prediction. You could re-implement this yourself if you really wanted, but, assuming you're using one-hot encoding on the trained network, this code should work out of the box. The template itself walks through the various small configuration tweaks needed to make this work with your specific network of interest. Note that for classification vs. regression there are some small differences, but the template file provides code for both, so just delete/comment out the irrelevant lines (these are clearly marked).

Once this is done, it's worth seeing if you can import and run predictions using this class/function as a stand-alone predictor, i.e. you should be able to do::


    from sparrow.predictors.<predictor_name>.<predictor_name>_predictor import Predictor

    sequence = 'MSAAVTAGKLARAPADPGKAGVPGVAAPGAPAAAPPAKEIPEVLVDPRSRRRYVRGRFLG'
    P = Predictor()
    P.predict_<predictor_name>(sequence)


and it returns a set of values.
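
For orientation, the sketch below shows the two ingredients such a predictor class combines: one-hot encoding of the sequence, and a forward pass through a network that emits one value per residue. This is purely illustrative - the class name, the ``predict_my_property`` method, and the randomly initialized LSTM are hypothetical stand-ins, whereas the real template loads your trained PARROT network from its ``.pt`` file::

    import torch

    VALID_AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'

    def one_hot_encode(sequence):
        # build a (1, len(sequence), 20) tensor: one row per residue,
        # one column per amino acid
        idx = torch.tensor([VALID_AMINO_ACIDS.index(r) for r in sequence])
        return torch.nn.functional.one_hot(idx, num_classes=20).float().unsqueeze(0)

    class Predictor:
        def __init__(self, version=1):
            # the real template torch.load()s the versioned network file from
            # sparrow/data/networks/<predictor_name>/; a toy LSTM stands in here
            self.network = torch.nn.LSTM(20, 1, batch_first=True)

        def predict_my_property(self, sequence):
            # returns one float per residue in the input sequence
            with torch.no_grad():
                per_residue, _ = self.network(one_hot_encode(sequence))
            return per_residue.squeeze(-1).squeeze(0).tolist()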


**Step 4: Integrate the predictor into the sparrow.Predictor class**

At this stage we have a working predictor - the last step is to connect this predictor to the sparrow Protein object in a way that incurs minimal computational overhead if not used, is syntactically simple, and offers functionality like other Protein analysis functions and properties.

This is achieved by adding a function to the ``sparrow.predictors.Predictor`` class, a class implemented in ``sparrow/predictors/__init__.py``.

This class generates an object which is accessible in the Protein object under the ``.predictor`` dot operator. As such, functions defined in the ``sparrow.predictors.Predictor`` class are then accessible as::

    seq = 'MSAAVTAGKLARAPADPGKAGVPGVAAPGAPAAAPPAKEIPE'
    p = Protein(seq)

    p.predictor.<predictor_name>()


As such, to finally make a new predictor accessible, the ``sparrow.predictors.Predictor`` class should be edited to add a new function which is simply the name of the prediction (e.g. ``dssp``, ``transmembrane`` etc.). This function should do three things:

1. It should, UPON BEING CALLED, import the predictor package you just created.
2. It should then perform the prediction on the underlying protein sequence.
3. It should (ideally) memoize the outcome into a local dictionary, so that if the same prediction is requested again it is simply looked up rather than recomputed (see the sketch below).

Rather than going into the details here, the underlying code and examples should make this clear. Notably, see ``dssp()`` and ``transmembrane_regions()`` for good examples of PARROT-based predictors. One important thing is to document these predictors clearly.
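
To make the memoization pattern concrete, here is a minimal sketch of what such a function can look like. The predictor name, import path, and attribute names are placeholders rather than sparrow's actual internals - consult the real implementations in ``sparrow/predictors/__init__.py``::

    class Predictor:

        def __init__(self, protein):
            self.__protein = protein
            self.__precomputed = {}

        def my_property(self, recompute=False):
            selector = 'my_property'
            if selector not in self.__precomputed or recompute:
                # deferred import: the torch/network start-up cost is only
                # paid if this predictor is actually used
                from .my_property.my_property_predictor import Predictor as MyPropertyPredictor
                self.__precomputed[selector] = MyPropertyPredictor().predict_my_property(self.__protein.sequence)
            return self.__precomputed[selector]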



--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
versioningit
sphinx_rtd_theme
--------------------------------------------------------------------------------
/docs/requirements.yaml:
--------------------------------------------------------------------------------
name: docs
channels:
dependencies:
  # Base depends
  - python
  - pip



  # Pip-only installs
  #- pip:

--------------------------------------------------------------------------------
/examples/protein_example_1.py:
--------------------------------------------------------------------------------
from sparrow.protein import Protein

P = Protein('MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP')

print('Demo 1')
print(P)
print(f"sparrow makes the most of Python's syntactic sugar - e.g., the len() operator returns the sequence length: {len(P)}")
print(P.predictor.disorder())
print(P.FCR)
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]

# delete cython/numpy if not needed
requires = ["setuptools>=61", "versioningit~=2.0", "cython", "numpy", ]
build-backend = "setuptools.build_meta"


# define project info
[project]
name = "sparrow"
dynamic = ["version"]
description = "Next generation sequence analysis package for working with disordered regions and disordered proteins"
authors = [
    {name = "Alex Holehouse", email = "alex.holehouse@wustl.edu"}
]
license = {text = "CC-NC-ND"}
readme = "README.md"
requires-python = ">=3.7"

# add in as needed
dependencies = [
    "numpy>=1.14.0,<2.0",
    "scipy",
    "cython",
    "protfasta",
    "metapredict>2",
    "ipython",
    "idptools-parrot @ git+https://git@github.com/idptools/parrot.git",
    "afrc",
    "tqdm",
    "pyfamsa",
]

[project.optional-dependencies]
test = [
    "pytest>=6.1.2",
]


[tool.setuptools]
zip-safe = false
include-package-data = true

[tool.setuptools.packages.find]
namespaces = true
where = ["."]
include = ["sparrow", "sparrow.*"]  # Discover all sub-packages inside the main package

[tool.setuptools.package-data]
sparrow = [
    "py.typed"
]

[tool.versioningit]
default-version = "1+unknown"

[tool.versioningit.format]
distance = "{base_version}+{distance}.{vcs}{rev}"
dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"
distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"

[tool.versioningit.vcs]
# The method key:
method = "git"  # <- The method name
# Parameters to pass to the method:
match = ["*"]
default-tag = "1.0.0"

[tool.versioningit.write]
file = "sparrow/_version.py"
--------------------------------------------------------------------------------
/readthedocs.yml:
--------------------------------------------------------------------------------
# readthedocs.yml

version: 2

build:
  image: latest

python:
  version: 3.8
  install:
    - method: pip
      path: .

conda:
  environment: docs/requirements.yaml
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[coverage:run]
# .coveragerc to control coverage.py and pytest-cov
omit =
    # Omit the tests
    */tests/*
    # Omit generated versioningit
    sparrow/_version.py

# define consistent style
[yapf]
COLUMN_LIMIT = 119
INDENT_WIDTH = 4
USE_TABS = False

# define consistent style
[flake8]
max-line-length = 119

# means we can run python setup.py test to
# run tests... maybe...
[aliases]
test = pytest
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""
sparrow
Next generation package for sequence parameter calculation
"""
from setuptools import setup, Extension, find_packages
from Cython.Build import cythonize
import os
import numpy

# defines the absolute path of where your cython files are
cython_dir = os.path.join("sparrow", "patterning")

# build a list of the files
cython_files = [os.path.join(cython_dir, f) for f in os.listdir(cython_dir) if f.endswith('.pyx')]


extensions = [
    Extension(
        name=f"sparrow.patterning.{os.path.splitext(os.path.basename(file))[0]}",
        sources=[file],
        include_dirs=[numpy.get_include()],
    ) for file in cython_files
]

setup(
    ext_modules=cythonize(extensions, compiler_directives={'language_level': "3"}),
    packages=find_packages(),
    include_package_data=True,
)
--------------------------------------------------------------------------------
/sparrow/__init__.py:
--------------------------------------------------------------------------------
"""
sparrow
Next generation package for sequence parameter calculation
"""

import os

# Add imports here
from sparrow.tools import io
from sparrow.protein import Protein
from sparrow.tools.io import read_fasta


# Generate _version.py if missing and in the Read the Docs environment
if os.getenv("READTHEDOCS") == "True" and not os.path.isfile('../sparrow/_version.py'):
    import versioningit
    __version__ = versioningit.get_version('../')
else:
    from ._version import __version__

# code that allows access to the data directory
_ROOT = os.path.abspath(os.path.dirname(__file__))
def get_data(path):
    return os.path.join(_ROOT, 'data', path)

--------------------------------------------------------------------------------
/sparrow/calculate_parameters.py:
--------------------------------------------------------------------------------
from sparrow.data import amino_acids
import numpy as np
import math
from . import sparrow_exceptions

# .................................................................
#
def calculate_aa_fractions(s):
    """
    Standalone function that computes amino-acid fractions for
    a given sequence.

    Parameters:
    --------------
    s : str
        Amino acid sequence

    Returns
    ---------------
    dict
        Returns a dictionary mapping each amino acid to its fraction
        in the sequence

    """

    aa_dict = {}
    for i in amino_acids.VALID_AMINO_ACIDS:
        aa_dict[i] = 0

    for i in s:
        aa_dict[i] = aa_dict[i] + 1


    len_s = len(s)
    for i in amino_acids.VALID_AMINO_ACIDS:
        aa_dict[i] = aa_dict[i]/len_s

    return aa_dict



def calculate_seg_complexity(s, alphabet=amino_acids.VALID_AMINO_ACIDS):
    """
    Function to calculate the Wootton-Federhen complexity of a sequence (also
    called SEG complexity, as this is the theory used in the classic SEG
    algorithm). Complexity is computed as -sum(p * log_N(p)) over the residue
    fractions p, where N is the alphabet size.

    Parameters
    -----------
    s : str
        Amino acid sequence

    alphabet : list
        List of amino acids found in alphabet. Note this does not sanity check in the
        case of non-standard amino acids. Default is the standard 20 amino acids

    Returns
    ----------
    float
        Returns a float that corresponds to the compositional complexity associated with
        the passed sequence.

    """

    alphabet_size = len(alphabet)
    seq_len = len(s)

    complexity = 0
    for a in alphabet:
        p = s.count(a)/seq_len

        if p > 0:
            complexity = p * math.log(p, alphabet_size) + complexity

    return -complexity



# .................................................................
#
def calculate_hydrophobicity(s, mode='KD', normalize=False):
    """
    Standalone function that computes the mean hydrophobicity of a sequence.

    Parameters:
    --------------
    s : str
        Amino acid sequence

    mode : str
        Hydrophobicity mode to be used. Currently only KD supported
        but can be expanded. Allowed values: 'KD'

    normalize : Bool
        If set to True hydrophobicity scales are normalized to be between 0
        and 1. Default = False.

    Returns
    ---------------
    Float
        Returns a floating point value with the mean hydrophobicity
        as defined based on the passed scale

    """
    return np.mean(calculate_linear_hydrophobicity(s, mode, normalize))


# .................................................................
#
def calculate_linear_hydrophobicity(s, mode='KD', normalize=False):
    """
    Compute linear hydrophobicity from sequence using one of several possible
    hydrophobicity scales.

    By default this is Kyte-Doolittle, but we'll add additional scales
    as/when needed.

    Parameters:
    --------------
    s : str
        Amino acid sequence

    mode : str
        Selector for hydrophobicity table. Options available are:

        'KD' | Kyte-Doolittle

    normalize : bool
        Boolean that means hydrophobicity scales operate on a normalized
        dynamic range of 0 to 1

    Returns:
    ------------
    list
        List of values that correspond to per-residue hydrophobicity based on
        a given hydrophobicity scale.
135 | 136 | """ 137 | 138 | if mode == 'KD': 139 | try: 140 | if normalize: 141 | return [amino_acids.AA_hydro_KD_normalized[r] for r in s] 142 | else: 143 | return [amino_acids.AA_hydro_KD[r] for r in s] 144 | except KeyError: 145 | raise sparrow_exceptions.CalculationException('Invalid residue found in %s' %(s)) 146 | else: 147 | raise sparrow_exceptions.CalculationException('Invalid mode passed: %s' %(mode)) 148 | -------------------------------------------------------------------------------- /sparrow/data/README.md: -------------------------------------------------------------------------------- 1 | # Sample Package Data 2 | 3 | This directory contains sample additional data you may want to include with your package. 4 | This is a place where non-code additional information (such as data files, molecular structures, etc.) can 5 | go that you want to ship alongside your code. 6 | 7 | Please note that it is not recommended to place large files in your git directory. If your project requires files larger 8 | than a few megabytes in size it is recommended to host these files elsewhere. This is especially true for binary files, 9 | as `git` cannot store incremental updates to these files and will store a complete copy of every version 10 | in your `git` history, which can quickly add up. As a note, most `git` hosting services like GitHub have a 1 GB per-repository 11 | cap. 12 | 13 | ## Including package data 14 | 15 | Modify your package's `setup.py` file and the `setup()` command. Include the 16 | [`package_data`](http://setuptools.readthedocs.io/en/latest/setuptools.html#basic-use) keyword and point it at the 17 | correct files. 18 | 19 | ## Manifest 20 | 21 | * `look_and_say.dat`: first entries of the "Look and Say" integer series, sequence [A005150](https://oeis.org/A005150) 22 | -------------------------------------------------------------------------------- /sparrow/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import configs # import general configurations 2 | from .
import amino_acids # import residue-specific amino acid data 3 | 4 | -------------------------------------------------------------------------------- /sparrow/data/amino_acids.py: -------------------------------------------------------------------------------- 1 | ## 2 | ## Data on individual amino acids 3 | ## 4 | ## 5 | 6 | VALID_AMINO_ACIDS = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'] 7 | VALID_AMINO_ACIDS_PHYS = ['W','Y','F','H','Q','N','T','S','C','G','A','I','L','V','M','E','D','K','R','P'] 8 | 9 | 10 | ARO = ['Y','W','F'] 11 | ALI = ['A','L','M','I','V'] 12 | POLAR = ['Q','N','S','T','H','G'] 13 | CHARGE = ['E','D','R','K'] 14 | POS = ['R','K'] 15 | NEG = ['E','D'] 16 | 17 | AA_THREE_TO_ONE = {'ALA':'A', 18 | 'CYS':'C', 19 | 'ASP':'D', 20 | 'GLU':'E', 21 | 'PHE':'F', 22 | 'GLY':'G', 23 | 'HIS':'H', 24 | 'ILE':'I', 25 | 'LYS':'K', 26 | 'LEU':'L', 27 | 'MET':'M', 28 | 'ASN':'N', 29 | 'PRO':'P', 30 | 'GLN':'Q', 31 | 'ARG':'R', 32 | 'SER':'S', 33 | 'THR':'T', 34 | 'VAL':'V', 35 | 'TRP':'W', 36 | 'TYR':'Y'} 37 | 38 | AA_ONE_TO_THREE = {} 39 | for x in AA_THREE_TO_ONE: 40 | AA_ONE_TO_THREE[AA_THREE_TO_ONE[x]] = x 41 | 42 | 43 | # acetyl groups have 1C-2O, 4H prior to 44 | # peptide bond formation 45 | # 46 | AA_MOLECULAR_WEIGHT = {'A': 89.1, 47 | 'C': 121.2, 48 | 'D': 133.1, 49 | 'E': 147.1, 50 | 'F': 165.2, 51 | 'G': 75.1, 52 | 'H': 155.2, 53 | 'I': 131.2, 54 | 'K': 146.2, 55 | 'L': 130.2, 56 | 'M': 149.2, 57 | 'N': 132.1, 58 | 'P': 115.1, 59 | 'Q': 146.2, 60 | 'R': 174.2, 61 | 'S': 105.1, 62 | 'T': 119.1, 63 | 'V': 117.1, 64 | 'W': 204.2, 65 | 'Y': 181.2, 66 | '<': 48, 67 | '>': 48} 68 | 69 | 70 | AA_COLOR = {'Y':'#ff9d00', 71 | 'W':'#ff9d00', 72 | 'F':'#ff9d00', 73 | 'A':'#171616', 74 | 'L':'#171616', 75 | 'M':'#171616', 76 | 'I':'#171616', 77 | 'V':'#171616', 78 | 'Q':'#04700d', 79 | 'N':'#04700d', 80 | 'S':'#04700d', 81 | 'T':'#04700d', 82 | 'H':'#04700d', 83 | 'G':'#04700d', 84 | 'E':'#ff0d0d', 85 | 'D':'#ff0d0d', 86 | 'R':'#2900f5', 87 | 'K':'#2900f5', 88 | 'C':'#ffe70d', 89 | 'P':'#cf30b7'} 90 | 91 | 92 | # KYTE-DOOLITTLE SCALES 93 | # References 94 | # A simple method for displaying the hydropathic character of a protein. 95 | # Kyte J, Doolittle RF. J Mol Biol. 1982 May 5;157(1):105-32. 96 | # Why are "natively unfolded" proteins unstructured under physiological conditions? 97 | # Vladimir N. Uversky, Joel R. Gillespie, and Anthony L.
Fink 98 | Proteins: Structure, Function, and Genetics 41:415-427 (2000) 99 | # Main hydrophobicity scale 100 | 101 | AA_hydro_KD = {"A": 6.3, 102 | "R": 0.0, 103 | "N": 1.0, 104 | "D": 1.0, 105 | "C": 7.0, 106 | "Q": 1.0, 107 | "E": 1.0, 108 | "G": 4.1, 109 | "H": 1.3, 110 | "I": 9.0, 111 | "L": 8.3, 112 | "K": 0.6, 113 | "M": 6.4, 114 | "F": 7.3, 115 | "P": 2.9, 116 | "S": 3.7, 117 | "T": 3.8, 118 | "W": 3.6, 119 | "Y": 3.2, 120 | "V": 8.7} 121 | 122 | AA_hydro_KD_normalized = {'A': 0.7, 123 | 'R': 0.0, 124 | 'N': 0.111, 125 | 'D': 0.111, 126 | 'C': 0.778, 127 | 'Q': 0.111, 128 | 'E': 0.111, 129 | 'G': 0.456, 130 | 'H': 0.144, 131 | 'I': 1.0, 132 | 'L': 0.922, 133 | 'K': 0.067, 134 | 'M': 0.711, 135 | 'F': 0.811, 136 | 'P': 0.322, 137 | 'S': 0.411, 138 | 'T': 0.422, 139 | 'W': 0.4, 140 | 'Y': 0.356, 141 | 'V': 0.967} 142 | 143 | -------------------------------------------------------------------------------- /sparrow/data/configs.py: -------------------------------------------------------------------------------- 1 | DISORDER_THRESHOLD = 0.7 2 | MIN_LENGTH_ALBATROSS_RE_RG = 35 3 | -------------------------------------------------------------------------------- /sparrow/data/look_and_say.dat: -------------------------------------------------------------------------------- 1 | 1 2 | 11 3 | 21 4 | 1211 5 | 111221 6 | 312211 7 | 13112221 8 | 1113213211 9 | 31131211131221 10 | 13211311123113112211 11 | 11131221133112132113212221 12 | 3113112221232112111312211312113211 13 | 1321132132111213122112311311222113111221131221 14 | 11131221131211131231121113112221121321132132211331222113112211 15 | 311311222113111231131112132112311321322112111312211312111322212311322113212221 -------------------------------------------------------------------------------- /sparrow/data/networks/asphericity/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/asphericity/asphericity_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/asphericity/asphericity_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/asphericity/asphericity_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/asphericity/asphericity_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/dssp/dssp_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/dssp/dssp_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/dssp/dssp_predictor_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/dssp/dssp_predictor_network_v2.pt --------------------------------------------------------------------------------
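A minimal, hedged usage sketch of how these packaged network weights can be resolved and inspected — assuming torch is installed, and using the dssp v2 network purely as an example; get_data() is the helper defined in sparrow/__init__.py above:

import torch

import sparrow

# Resolve the absolute path of a packaged weights file via get_data()
weights_path = sparrow.get_data('networks/dssp/dssp_predictor_network_v2.pt')

# The .pt files are PARROT-trained state dicts; loading onto CPU matches how
# the predictor classes later in this package load them
state_dict = torch.load(weights_path, map_location=torch.device('cpu'))

# Keys such as 'lstm.weight_ih_l0' and 'fc.bias' are what those predictor
# classes probe to recover the network hyperparameters
print(sorted(state_dict.keys()))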
/sparrow/data/networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/nuclear_export_signal/nes_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/nuclear_export_signal/nes_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/nuclear_import_signal/nls_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/nuclear_import_signal/nls_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/phosphorylation/ser_phosphorylation_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/ser_phosphorylation_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/phosphorylation/thr_phosphorylation_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/thr_phosphorylation_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/phosphorylation/tyr_phosphorylation_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/phosphorylation/tyr_phosphorylation_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/prefactor/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/prefactor/prefactor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/prefactor/prefactor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/prefactor/prefactor_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/prefactor/prefactor_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/pscore/pscore_predictor_network_v2.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/pscore/pscore_predictor_network_v3.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v3.pt -------------------------------------------------------------------------------- /sparrow/data/networks/pscore/pscore_predictor_network_v4.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/pscore/pscore_predictor_network_v4.pt -------------------------------------------------------------------------------- /sparrow/data/networks/re/README: -------------------------------------------------------------------------------- 1 | ## The end-to-end networks are defined as re, although the actual predictor class is e2e. This is a rare exception where there's a mismatch in network name and predictor class name, because if the predictor class name were 're' then this would clash with the Python regular expression package ('re'), such that for code sanity purposes the predictor class and module is e2e even though the networks are re. 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/re/re_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/re/re_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/re/re_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/re/re_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/rg/README: -------------------------------------------------------------------------------- 1 | # Proof of concept network 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/rg/rg_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/rg/rg_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/rg/rg_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/rg/rg_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_re/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_re/scaled_re_network_v1.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_re/scaled_re_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_re/scaled_re_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_re/scaled_re_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_rg/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_rg/scaled_rg_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_rg/scaled_rg_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaled_rg/scaled_rg_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaled_rg/scaled_rg_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaling_exponent/README: -------------------------------------------------------------------------------- 1 | # To Be Trained 2 | 3 | v1.5 was never assessed or validated and shouldn't be used 4 | -------------------------------------------------------------------------------- /sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.5.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.5.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/scaling_exponent/scaling_exponent_network_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/scaling_exponent/scaling_exponent_network_v2.pt -------------------------------------------------------------------------------- /sparrow/data/networks/transactivation_domains/tad_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transactivation_domains/tad_predictor_network_v1.pt -------------------------------------------------------------------------------- 
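A short, hedged sketch of the polymer-scaling estimators implemented in sparrow/polymer/scaling_parameters.py further below; it assumes the package and its Cython extensions are built, and the sequence shown is an arbitrary (but valid) example:

from sparrow.polymer.scaling_parameters import compute_nu_zheng2020, compute_rg_zheng2020

# arbitrary valid amino acid sequence, for illustration only
seq = 'MEGDDKSPSEQQAAKKRLLGE'

# nu = -0.0423*SHD + 0.0074*SCD + 0.701, per Zheng et al. 2020
nu = compute_nu_zheng2020(seq)

# nu-dependent radius of gyration, returned in Angstroms
rg = compute_rg_zheng2020(seq)

print(f'nu = {nu:.3f}, Rg = {rg:.1f} A')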
/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v1.pt -------------------------------------------------------------------------------- /sparrow/data/networks/transmembrane/transmembrane_predictor_network_v4.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/data/networks/transmembrane/transmembrane_predictor_network_v4.pt -------------------------------------------------------------------------------- /sparrow/patterning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/patterning/__init__.py -------------------------------------------------------------------------------- /sparrow/patterning/scd.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False 2 | import numpy as np 3 | cimport numpy as np 4 | from cython.view cimport array 5 | from libc.math cimport sqrt,abs, fabs 6 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS 7 | from sparrow.sparrow_exceptions import SparrowException 8 | 9 | # Define a typed memoryview for efficient access to numpy arrays 10 | ctypedef np.float64_t DOUBLE_t 11 | ctypedef np.int64_t INT64_t 12 | 13 | cdef dict DEFAULT_HYDRO_DICT = {'A': 0.730, 'R': 0.000, 'N': 0.432, 'D': 0.378, 'C': 0.595, 'Q': 0.514, 'E': 0.459, 14 | 'G': 0.649, 'H': 0.514, 'I': 0.973, 'L': 0.973, 'K': 0.514, 'M': 0.838, 'F': 1.000, 15 | 'P': 1.000, 'S': 0.595, 'T': 0.676, 'W': 0.946, 'Y': 0.865, 'V': 0.892} 16 | 17 | 18 | cpdef double compute_scd_x(str sequence, group1=['E','D'], group2=['R','K']): 19 | cdef int m, n, seqlen 20 | cdef double total, m_val, n_val, charge_val, final_val 21 | cdef int cur_m_charge, cur_n_charge 22 | cdef char cur_m_res, cur_n_res 23 | 24 | # Pre-calculate group membership 25 | cdef int[:] group_membership = np.zeros(256, dtype=np.int32) 26 | for residue in group1: 27 | group_membership[ord(residue)] = -1 28 | for residue in group2: 29 | group_membership[ord(residue)] = 1 30 | 31 | total = 0 32 | seqlen = len(sequence) 33 | 34 | # Convert sequence to array of integers 35 | cdef int[:] sequence_array = np.array([ord(char) for char in sequence], dtype=np.int32) 36 | 37 | for m in range(1, seqlen): 38 | m_val = m + 1 39 | 40 | for n in range(0, m-1): 41 | n_val = n + 1 42 | 43 | # Access residues using array indexing 44 | cur_m_res = sequence_array[m] 45 | cur_n_res = sequence_array[n] 46 | 47 | # Retrieve group charge 48 | cur_m_charge = group_membership[cur_m_res] 49 | cur_n_charge = group_membership[cur_n_res] 50 | 51 | charge_val = cur_m_charge * cur_n_charge 52 | final_val = charge_val * sqrt(m_val - n_val) 53 | total += final_val 54 | 55 | return total / seqlen 56 | 57 | cdef validate_sequence(str seq, dict hydro_dict): 58 | cdef set all_res = set(seq) 59 | for res in all_res: 60 | if res not in hydro_dict: 61 | raise ValueError(f'When calculating SHD the hydrophobicity dictionary lacked the residue {res}') 62 | 63 | cpdef double compute_shd(str seq, dict hydro_dict=None): 64 | """ 65 | Function 
takes in a sequence and returns the Sequence 66 | Hydropathy Decoration (SHD), i.e. the patterning of hydrophobic 67 | residues in the sequence. This is computed as defined in ref [1]. 68 | 69 | As an optional parameter this function can take in a predefined 70 | hydropathy conversion dictionary for the amino acids, where the keys 71 | are amino acids and values are floats. 72 | 73 | If a conversion dict is not provided the following conversion is used: 74 | 75 | 'A': 0.730, 76 | 'R': 0.000, 77 | 'N': 0.432, 78 | 'D': 0.378, 79 | 'C': 0.595, 80 | 'Q': 0.514, 81 | 'E': 0.459, 82 | 'G': 0.649, 83 | 'H': 0.514, 84 | 'I': 0.973, 85 | 'L': 0.973, 86 | 'K': 0.514, 87 | 'M': 0.838, 88 | 'F': 1.000, 89 | 'P': 1.000, 90 | 'S': 0.595, 91 | 'T': 0.676, 92 | 'W': 0.946, 93 | 'Y': 0.865, 94 | 'V': 0.892, 95 | 96 | These are the Kyte-Doolittle normalized hydrophobicities. 97 | 98 | Parameters 99 | ------------ 100 | seq : str 101 | Amino acid sequence passed as string 102 | 103 | hydro_dict : dict 104 | Dictionary that maps amino acid to hydrophobicity score 105 | (optional). 106 | 107 | Returns 108 | ----------- 109 | float 110 | Returns a floating point value that reports on the sequence 111 | hydropathy decoration. This in principle should be a positive 112 | number. 113 | 114 | References 115 | -------------- 116 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J. Hydropathy Patterning 117 | Complements Charge Patterning to Describe Conformational Preferences of Disordered 118 | Proteins. J. Phys. Chem. Lett. (2020). doi:10.1021/acs.jpclett.0c00288 119 | """ 120 | if hydro_dict is None: 121 | hydro_dict = DEFAULT_HYDRO_DICT 122 | 123 | validate_sequence(seq, hydro_dict) 124 | 125 | cdef Py_ssize_t N = len(seq) 126 | cdef double[:] h = np.array([hydro_dict[res] for res in seq], dtype=np.double) 127 | cdef double t = 0.0 128 | cdef Py_ssize_t m, n 129 | 130 | for m in range(1, N): 131 | for n in range(m-1): 132 | t += (h[m] + h[n]) / abs(m - n) 133 | 134 | return t / N 135 | 136 | -------------------------------------------------------------------------------- /sparrow/polymer/scaling_parameters.py: -------------------------------------------------------------------------------- 1 | from sparrow.patterning import scd 2 | import numpy as np 3 | 4 | def compute_nu_zheng2020(seq): 5 | """ 6 | Function takes in a sequence and returns a calculated nu scaling value 7 | from the Sequence Hydropathy Decoration (SHD) and Sequence Charge Decoration (SCD): 8 | 9 | Nu = -0.0423×SHD + 0.0074×SCD + 0.701 10 | 11 | This equation for predicting nu is adopted from Zheng et al. [1]. 12 | 13 | Parameters 14 | ------------------ 15 | seq : str 16 | Amino acid sequence (must be valid amino acids only) 17 | 18 | Returns 19 | ------------------ 20 | float 21 | Returns the predicted scaling exponent (nu), a dimensionless 22 | parameter which should (in theory) fall between 0.33 and 0.6. 23 | 24 | References 25 | --------------- 26 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J. 27 | Hydropathy Patterning Complements Charge Patterning to Describe 28 | Conformational Preferences of Disordered Proteins. J. Phys. 29 | Chem. Lett. (2020).
doi:10.1021/acs.jpclett.0c00288 30 | 31 | """ 32 | 33 | SHD = scd.compute_shd(seq) 34 | SCD = scd.compute_scd_x(seq) 35 | 36 | # calculate Nu from SHD and SCD 37 | nu = (-0.0423*SHD)+(0.0074*SCD)+0.701 38 | 39 | return nu 40 | 41 | 42 | 43 | def compute_rg_zheng2020(seq): 44 | """ 45 | Function that takes in an amino acid sequence and computes the 46 | expected radius of gyration using the nu-dependent Rg as developed by 47 | Zheng et al. 48 | 49 | Parameters 50 | ------------------ 51 | seq : str 52 | Amino acid sequence (must be valid amino acids only) 53 | 54 | Returns 55 | ------------------ 56 | float 57 | Returns the predicted radius of gyration in Angstroms 58 | 59 | References 60 | --------------- 61 | [1] Zheng, W., Dignon, G. L., Brown, M., Kim, Y. C. & Mittal, J. 62 | Hydropathy Patterning Complements Charge Patterning to Describe 63 | Conformational Preferences of Disordered Proteins. J. Phys. 64 | Chem. Lett. (2020). doi:10.1021/acs.jpclett.0c00288 65 | """ 66 | nu = compute_nu_zheng2020(seq) 67 | 68 | gamma = 1.1615 69 | b = 5.5 # note in Angstroms instead of nanometers 70 | N = len(seq) 71 | 72 | numerator = gamma*(gamma+1) 73 | 74 | denominator = 2*(gamma+2*nu)*(gamma+2*nu+1) 75 | 76 | return np.sqrt(numerator/denominator)*b*np.power(N,nu) 77 | 78 | -------------------------------------------------------------------------------- /sparrow/predictors/asphericity/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/asphericity/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/dssp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/dssp/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/e2e/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/mitochondrial_targeting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/mitochondrial_targeting/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/mitochondrial_targeting/mitochondrial_targeting_predictor.py: -------------------------------------------------------------------------------- 1 | from parrot import brnn_architecture 2 | from parrot import encode_sequence 3 | 4 | import sparrow 5 | 6 | import torch 7 | import numpy as np 8 | import os 9 | from sparrow.sparrow_exceptions import SparrowException 10 | 11 | 12 | 13 | 14 | DEFAULT_VERSION="1" 15 | 16 | 17 | class MitochondrialTargetingPredictor(): 18 | """ 19 | 20 | Class that loads in a network such that predict_mitochondrial_targeting() can be called to predict 21 | mitochondrial targeting for a sequence. 22 | 23 | """ 24 | def __init__(self, version=None): 25 | """ 26 | Constructor for building a MitochondrialTargetingPredictor object.
The version keyword allows specific 27 | version(s) of the trained network associated with the underlying predictor to be defined. 28 | By default, it's set to None, which leads to the current best/default network being selected 29 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide 30 | the ability to pass a string as version. This string is inserted at position {version} in the filename 31 | 32 | mitochondrial_targeting_predictor_network_v{version}.pt 33 | 34 | i.e. no need to include the "v" part or the .pt extension 35 | 36 | """ 37 | 38 | if version is None: 39 | version = DEFAULT_VERSION 40 | 41 | saved_weights = sparrow.get_data(f'networks/mitochondrial_targeting/mitochondrial_targeting_predictor_network_v{version}.pt') 42 | 43 | if not os.path.isfile(saved_weights): 44 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__)) 45 | 46 | 47 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu')) 48 | 49 | 50 | # Dynamically read in correct hyperparameters: 51 | num_layers = 0 52 | while True: 53 | s = f'lstm.weight_ih_l{num_layers}' 54 | try: 55 | temp = loaded_model[s] 56 | num_layers += 1 57 | except KeyError: 58 | break 59 | 60 | 61 | ## determine the number of classes; note you may need to change the key names here if there's a leading 62 | # 'module.' prefix in there 63 | number_of_classes = np.shape(loaded_model['fc.bias'])[0] 64 | input_size = 20 # (hardcoded at 20 for 20 amino acids) 65 | 66 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4) 67 | 68 | 69 | # set these here so we can sanity check if needed 70 | self.number_of_classes = number_of_classes 71 | self.input_size = input_size 72 | self.number_of_layers = num_layers 73 | self.hidden_vector_size = hidden_vector_size 74 | 75 | # Instantiate network weights into object 76 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu') 77 | 78 | self.network.load_state_dict(loaded_model) 79 | 80 | 81 | 82 | def predict_mitochondrial_targeting(self, seq): 83 | """ 84 | Prediction function. seq should be a valid amino acid sequence. 85 | 86 | NOTE that this assumes mitochondrial targeting sequences (MTSs) are 87 | N-terminal, so truncates anything over 168 residues. This threshold 88 | was empirically determined based on the set of annotated MTSs. 89 | 90 | Parameters 91 | ------------ 92 | seq : str 93 | Valid amino acid sequence 94 | 95 | Returns 96 | ---------- 97 | np.ndarray 98 | Returns a 1D np.ndarray the length of the sequence where each position 99 | is the predicted mitochondrial targeting class at that position.
100 | 101 | """ 102 | 103 | # convert sequence to uppercase 104 | seq = seq.upper() 105 | 106 | # truncate all but 168 - if shorter than this just gets everything 107 | sub_seq = seq[0:168] 108 | 109 | # Convert to one-hot sequence vector 110 | seq_vector = encode_sequence.one_hot(sub_seq) 111 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting 112 | 113 | # Forward pass - this is specific for classification 114 | prediction = self.network(seq_vector.float()).detach().numpy() 115 | int_vals = [] 116 | for row in prediction[0]: 117 | int_vals.append(np.argmax(row)) 118 | 119 | prediction = int_vals 120 | 121 | # append empty 0s for remainder of sequence 122 | extra = [0]*(len(seq)-len(sub_seq)) 123 | 124 | prediction.extend(extra) 125 | # return prediction + extra zeros 126 | return prediction 127 | -------------------------------------------------------------------------------- /sparrow/predictors/nes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/nes/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/nes/nuclear_export_signal_predictor.py: -------------------------------------------------------------------------------- 1 | from parrot import brnn_architecture 2 | from parrot import encode_sequence 3 | 4 | import sparrow 5 | 6 | import torch 7 | import numpy as np 8 | import os 9 | from sparrow.sparrow_exceptions import SparrowException 10 | 11 | 12 | """ 13 | NB: This network and predictor was imported from GOOSE, so is subtly different internally to how 14 | some of the other predictors work. Notably it includes a softmax projection and a loop 15 | (the loop below) to define probabilities - this may be because these networks have 2 layers 16 | whereas the others only have one. Anyway, just making a note of this in case we need to debug in 17 | the future. 18 | 19 | score = [] 20 | for val in prediction: 21 | score.append(round(val[1],5)) 22 | 23 | 24 | """ 25 | 26 | # NOTE - this is where you can define the version number that is read by default. If you add a new network MAKE SURE you update 27 | # this default if you want that new network to be used by default 28 | DEFAULT_VERSION="1" 29 | 30 | def softmax(v): 31 | return (np.e ** v) / np.sum(np.e ** v) 32 | 33 | 34 | ## CHANGE class name 35 | class NESPredictor(): 36 | """ 37 | 38 | Class that loads in a network such that nuclear export signals can be predicted. 39 | 40 | """ 41 | def __init__(self, version=None): 42 | """ 43 | Constructor for building a predictor object. The version keyword allows specific 44 | version(s) of the trained network associated with the predictor to be defined. 45 | 46 | By default, it's set to None, which leads to the current best/default network being selected 47 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide 48 | the ability to pass a string as version. This string is inserted at position <version> in the filename 49 | 50 | <predictor>_network_v<version>.pt 51 | 52 | i.e. no need to include the "v" part or the .pt extension 53 | 54 | """ 55 | 56 | 57 | 58 | # if no version provided use default, then grab path and check that file actually exists! 59 | if version is None: 60 | version = DEFAULT_VERSION 61 | 62 | # CHANGE THIS!! Make sure you change the <path> and <filename> to the appropriate 63 | # paths.
Keep the network_v{version}.pt because this is how a version-specific string is selected 64 | saved_weights = sparrow.get_data(f'networks/nuclear_export_signal/nes_predictor_network_v{version}.pt') 65 | 66 | if not os.path.isfile(saved_weights): 67 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__)) 68 | 69 | 70 | # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU mainly because 71 | # we know everyone has a CPU... 72 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu')) 73 | 74 | ## DELETE ME PROBABLY 75 | # this block of code is relevant ONLY if the trained network has this strange 76 | # appended 'module.' text at the start of every keyword. This may happen in older 77 | # versions of PARROT (see the DSSP predictor as an example of where it's needed) but in 78 | # 2022 trained networks didn't need this. As such, this can PROBABLY be deleted but 79 | # in case you're using an older network we've kept this to make things simple 80 | 81 | #for i in range(len(loaded_model)): 82 | # key, value = loaded_model.popitem(last=False) 83 | # new_key = key[7:] 84 | # loaded_model[new_key] = value 85 | ## END OF DELETE ME PROBABLY 86 | 87 | 88 | # Dynamically calculate the hyperparameters used to train the network. 89 | ## NOTE: 90 | # 91 | # The code here works on networks trained using the current version of PARROT (2022), HOWEVER, it's possible 92 | # that in previous versions the keys into the parameter file may be different or may have a prefix. Best example 93 | # of this is that for the DSSP predictor the word `module.` randomly appears in front of each keyword. If you 94 | # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this 95 | # keyword. 96 | 97 | # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords 98 | # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to 99 | # reach out to Alex or Dan about this! 100 | 101 | num_layers = 0 102 | while True: 103 | s = f'lstm.weight_ih_l{num_layers}' 104 | try: 105 | temp = loaded_model[s] 106 | num_layers += 1 107 | except KeyError: 108 | break 109 | 110 | number_of_classes = np.shape(loaded_model['fc.bias'])[0] 111 | 112 | # Hard coded because we always use one-hot encoding, note that if you trained a specific 113 | # predictor on a different encoding scheme you could, of course, here simply define that 114 | # encoding scheme 115 | input_size = 20 116 | 117 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4) 118 | 119 | # set these here so we can sanity check if needed 120 | self.number_of_classes = number_of_classes 121 | self.input_size = input_size 122 | self.number_of_layers = num_layers 123 | self.hidden_vector_size = hidden_vector_size 124 | 125 | # Instantiate network weights into object 126 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu') 127 | 128 | # load parameters into model 129 | self.network.load_state_dict(loaded_model) 130 | 131 | 132 | 133 | def predict_nuclear_export_signal(self, seq): 134 | """ 135 | Function to predict the presence of nuclear export signals. Returns a per
Returns a per 136 | residue probability score of a residue being in an NES or not 137 | 138 | Parameters 139 | ------------ 140 | seq : str 141 | Valid amino acid sequence 142 | 143 | Returns 144 | ---------- 145 | np.ndarray 146 | Returns a 1D np.ndarray the length of the sequence where each position 147 | gives the prediction of that residue being an NES 148 | 149 | """ 150 | 151 | # convert sequence to uppercase 152 | seq = seq.upper() 153 | 154 | # Convert to one-hot sequence vector - note, as mentioned above if you 155 | # did't use one-hot in the original training you could just edit this here 156 | seq_vector = encode_sequence.one_hot(seq) 157 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting 158 | 159 | 160 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!! 161 | ## CHANGE CODE BELOW HERE ## 162 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!! 163 | 164 | 165 | ## CLASSIFICATION CODE BLOCK 166 | # The block below should be kept if we're doing a classification 167 | # based prediction! if not, comment this out or delete it 168 | #prediction = self.network(seq_vector.float()).detach().numpy() 169 | #int_vals = [] 170 | #for row in prediction[0]: 171 | # int_vals.append(np.argmax(row)) 172 | 173 | #prediction = int_vals 174 | 175 | ## REGRESSION CODE BLOCK 176 | # This block should be kept if we're doing a regression-based 177 | # prediction. If not, comment this out or delete it 178 | prediction = self.network(seq_vector.float()).detach().numpy().flatten() 179 | 180 | prediction = prediction.reshape(-1, self.number_of_classes) 181 | prediction = np.array(list(map(softmax, prediction))) 182 | 183 | # finally we extract out local probabilities 184 | score = [] 185 | for val in prediction: 186 | score.append(round(val[1],5)) 187 | 188 | return score 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /sparrow/predictors/nls/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/nls/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/nls/nuclear_import_signal_predictor.py: -------------------------------------------------------------------------------- 1 | from parrot import brnn_architecture 2 | from parrot import encode_sequence 3 | 4 | import sparrow 5 | 6 | import torch 7 | import numpy as np 8 | import os 9 | from sparrow.sparrow_exceptions import SparrowException 10 | 11 | 12 | """ 13 | NB: This network and predictor was imported from GOOSE, so is subtly different internally to how 14 | some of the other predictors work. Notably it includes a softmax project and a loop 15 | this loop below to define probabilities - this may be because these networks have 2 layers 16 | whereas the others only have one? Anyway, just making a note of this in case we need to debug in 17 | the future. 18 | 19 | score = [] 20 | for val in prediction: 21 | score.append(round(val[1],5)) 22 | 23 | 24 | """ 25 | 26 | # NOTE - this is where you can define the version number that is read by default. 
If you add a new network MAKE SURE you update 27 | # this default if you want that new network to be used by default 28 | DEFAULT_VERSION="1" 29 | 30 | def softmax(v): 31 | return (np.e ** v) / np.sum(np.e ** v) 32 | 33 | 34 | ## CHANGE class name 35 | class NLSPredictor(): 36 | """ 37 | 38 | Class that loads in a network such that predict_nuclear_import_signal() can be called to predict 39 | nuclear import signals from a sequence. 40 | 41 | """ 42 | def __init__(self, version=None): 43 | """ 44 | Constructor for building a predictor object. The version keyword allows specific 45 | version(s) of the trained network associated with the predictor to be defined. 46 | 47 | By default, it's set to None, which leads to the current best/default network being selected 48 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide 49 | the ability to pass a string as version. This string is inserted at position <version> in the filename 50 | 51 | <predictor>_network_v<version>.pt 52 | 53 | i.e. no need to include the "v" part or the .pt extension 54 | 55 | """ 56 | 57 | 58 | 59 | # if no version provided use default, then grab path and check that file actually exists! 60 | if version is None: 61 | version = DEFAULT_VERSION 62 | 63 | # CHANGE THIS!! Make sure you change the <path> and <filename> to the appropriate 64 | # paths. Keep the network_v{version}.pt because this is how a version-specific string is selected 65 | saved_weights = sparrow.get_data(f'networks/nuclear_import_signal/nls_predictor_network_v{version}.pt') 66 | 67 | if not os.path.isfile(saved_weights): 68 | raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' %( saved_weights, type(self).__name__)) 69 | 70 | 71 | # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU mainly because 72 | # we know everyone has a CPU... 73 | loaded_model = torch.load(saved_weights, map_location=torch.device('cpu')) 74 | 75 | ## DELETE ME PROBABLY 76 | # this block of code is relevant ONLY if the trained network has this strange 77 | # appended 'module.' text at the start of every keyword. This may happen in older 78 | # versions of PARROT (see the DSSP predictor as an example of where it's needed) but in 79 | # 2022 trained networks didn't need this. As such, this can PROBABLY be deleted but 80 | # in case you're using an older network we've kept this to make things simple 81 | 82 | #for i in range(len(loaded_model)): 83 | # key, value = loaded_model.popitem(last=False) 84 | # new_key = key[7:] 85 | # loaded_model[new_key] = value 86 | ## END OF DELETE ME PROBABLY 87 | 88 | 89 | # Dynamically calculate the hyperparameters used to train the network. 90 | ## NOTE: 91 | # 92 | # The code here works on networks trained using the current version of PARROT (2022), HOWEVER, it's possible 93 | # that in previous versions the keys into the parameter file may be different or may have a prefix. Best example 94 | # of this is that for the DSSP predictor the word `module.` randomly appears in front of each keyword. If you 95 | # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this 96 | # keyword. 97 | 98 | # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords 99 | # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to 100 | # reach out to Alex or Dan about this!
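# Why the probing below works: a PyTorch LSTM state dict stores one
# 'lstm.weight_ih_l{k}' tensor per layer, and each has shape
# (4*hidden_size, input_size) because the input/forget/cell/output gate
# matrices are stacked row-wise. num_layers is therefore found by probing
# successive l{k} keys until one is missing, and hidden_vector_size is
# recovered further down as the row count of 'lstm.weight_ih_l0' divided by 4.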
101 | 102 | num_layers = 0 103 | while True: 104 | s = f'lstm.weight_ih_l{num_layers}' 105 | try: 106 | temp = loaded_model[s] 107 | num_layers += 1 108 | except KeyError: 109 | break 110 | 111 | number_of_classes = np.shape(loaded_model['fc.bias'])[0] 112 | 113 | # Hard coded because we always use one-hot encoding, note that if you trained a specific 114 | # predictor on a different encoding scheme you could, of course, here simply define that 115 | # encoding scheme 116 | input_size = 20 117 | 118 | hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4) 119 | 120 | # set these here so we can sanity check if needed 121 | self.number_of_classes = number_of_classes 122 | self.input_size = input_size 123 | self.number_of_layers = num_layers 124 | self.hidden_vector_size = hidden_vector_size 125 | 126 | # Instantiate network weights into object 127 | self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu') 128 | 129 | # load parameters into model 130 | self.network.load_state_dict(loaded_model) 131 | 132 | 133 | 134 | def predict_nuclear_import_signal(self, seq): 135 | """ 136 | Function to predict the presence of nuclear import signals. Returns a per 137 | residue probability score of a residue being in an NLS or not 138 | 139 | Parameters 140 | ------------ 141 | seq : str 142 | Valid amino acid sequence 143 | 144 | Returns 145 | ---------- 146 | np.ndarray 147 | Returns a 1D np.ndarray the length of the sequence where each position 148 | gives the prediction of that residue being an NLS 149 | 150 | """ 151 | 152 | # convert sequence to uppercase 153 | seq = seq.upper() 154 | 155 | # Convert to one-hot sequence vector - note, as mentioned above if you 156 | # didn't use one-hot in the original training you could just edit this here 157 | seq_vector = encode_sequence.one_hot(seq) 158 | seq_vector = seq_vector.view(1, len(seq_vector), -1) # formatting 159 | 160 | 161 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!! 162 | ## CHANGE CODE BELOW HERE ## 163 | ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!! 164 | 165 | 166 | ## CLASSIFICATION CODE BLOCK 167 | # The block below should be kept if we're doing a classification 168 | # based prediction! if not, comment this out or delete it 169 | #prediction = self.network(seq_vector.float()).detach().numpy() 170 | #int_vals = [] 171 | #for row in prediction[0]: 172 | # int_vals.append(np.argmax(row)) 173 | 174 | #prediction = int_vals 175 | 176 | ## REGRESSION CODE BLOCK 177 | # This block should be kept if we're doing a regression-based 178 | # prediction. If not, comment this out or delete it 179 | prediction = self.network(seq_vector.float()).detach().numpy().flatten() 180 | 181 | prediction = prediction.reshape(-1, self.number_of_classes) 182 | prediction = np.array(list(map(softmax, prediction))) 183 | 184 | 185 | ## CLIP 186 | # If we want to ensure we have a value between 0 and 1, the clipping here 187 | # will do that.
If not, leave it commented 188 | #prediction = np.clip(prediction, 0.0, 1.0) 189 | 190 | # finally we extract out local probabilities 191 | score = [] 192 | for val in prediction: 193 | score.append(round(val[1],5)) 194 | 195 | return score 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | -------------------------------------------------------------------------------- /sparrow/predictors/phosphorylation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/phosphorylation/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/phosphorylation/phospho_predictor_utils.py: -------------------------------------------------------------------------------- 1 | from sparrow.sparrow_exceptions import SparrowException 2 | 3 | def return_hits(seq, phospho_probability, target_res, windowsize=4, threshold=0.6, return_sites_only=False): 4 | """ 5 | Function that parses through a sequence and annotated phosphosite 6 | probabilities to extract out specific positions or a per-residue 7 | binary mask of phosphorylation or non-phosphorylation. 8 | 9 | This function works by sliding a +/- windowsize window across the 10 | sequence and if the central residue in that window has a probability 11 | > threshold then all the target_res in that window are set to be 12 | putative phosphosites. 13 | 14 | Parameters 15 | -------------- 16 | seq : str 17 | Amino acid sequence 18 | 19 | phospho_probability : list 20 | A list with per-residue probabilities for a residue to have been 21 | phosphorylated or not. 22 | 23 | windowsize : int 24 | Define the size of the window this algorithm uses to extend the 25 | influence of a local phosphosite probability. Note the windowsize 26 | gets applied +/- a central position 27 | 28 | target_res : str 29 | A string with a single residue which each residue in the sequence 30 | is compared against. 31 | 32 | threshold : float 33 | A threshold value used to delineate between phosphosites for masking. 34 | Default is 0.6. 35 | 36 | return_sites_only : bool 37 | A flag which, if set to True, means the function returns only the positions 38 | found in a list. If set to False the function returns a binary mask 39 | list equal in length to the sequence, where '1's mean the residue 40 | is predicted to be a phosphosite and '0' mean they're not. Default 41 | is False. 42 | 43 | Returns 44 | ----------- 45 | list 46 | Returns EITHER a list (len == seq) if return_sites_only = False which 47 | contains a per-residue phosphomask (i.e. 1 = phospho 0 if not) OR 48 | returns a list of index positions that correspond to phosphosites.
49 | 50 | If return_sites_only is True, the function guarantees the 51 | indices returned will be in numerical order 52 | 53 | """ 54 | 55 | ## sanity checking first 56 | if len(target_res) != 1: 57 | raise SparrowException('Target res must be a single amino acid') 58 | 59 | if threshold > 1 or threshold < 0: 60 | raise SparrowException('Probability threshold used in phosphosite masking must be between 0 and 1') 61 | 62 | if windowsize < 1: 63 | raise SparrowException('Window size must be a positive integer') 64 | 65 | if len(seq) != len(phospho_probability): 66 | raise SparrowException('Sequence length and probability vector must be the same length') 67 | 68 | 69 | seqlen = len(seq) 70 | 71 | potential_hits = set([]) 72 | 73 | if seqlen < (2*windowsize)+1: 74 | raise SparrowException(f'Cannot predict phosphosites when the sequence length is less than 1+{2*windowsize}. NB: length = {seqlen}') 75 | 76 | # for each residue 77 | for idx, res in enumerate(seq): 78 | 79 | # if this is a low-probability residue skip and move on 80 | if phospho_probability[idx] < threshold: 81 | continue 82 | 83 | # if we're in the N-terminal residues just excise out a fragment of 84 | # varying size until we get into the sequence 85 | if idx < windowsize: 86 | slice_start = 0 87 | current_slice = seq[slice_start:idx+windowsize] 88 | 89 | # while in the 'middle' of the sequence 90 | elif idx >= windowsize and idx <= (seqlen - (windowsize+1)): 91 | slice_start = idx-windowsize 92 | current_slice = seq[slice_start:idx+windowsize] 93 | 94 | # at the C-terminus 95 | else: 96 | slice_start = idx-windowsize 97 | current_slice = seq[slice_start:] 98 | 99 | # for each residue in the current slice 100 | for local_idx, aa in enumerate(current_slice): 101 | if aa == target_res: 102 | global_pos = local_idx + slice_start 103 | 104 | if global_pos not in potential_hits: 105 | potential_hits.add(global_pos) 106 | 107 | 108 | # if we just want to return the phosphoindices. Note 109 | # we sort these to guarantee the order of return. 110 | if return_sites_only: 111 | return sorted(list(potential_hits)) 112 | else: 113 | 114 | return_list = [] 115 | for i in range(0,len(seq)): 116 | if i in potential_hits: 117 | return_list.append(1) 118 | else: 119 | return_list.append(0) 120 | 121 | return return_list 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /sparrow/predictors/prefactor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/prefactor/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/pscore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/pscore/__init__.py -------------------------------------------------------------------------------- /sparrow/predictors/pscore/pscore_predictor.py: -------------------------------------------------------------------------------- 1 | from parrot import brnn_architecture 2 | from parrot import encode_sequence 3 | 4 | import sparrow 5 | 6 | import torch 7 | import numpy as np 8 | import os 9 | from sparrow.sparrow_exceptions import SparrowException 10 | 11 | 12 | 13 | """ 14 | Predictor template file.
This template file should, in principle, require 15 | minimal editing to convert into a specific predictor based on a copied 16 | network file found in sparrow/data/networks/<name>. Some general 17 | guidelines below (also included in the predictor documentation) and inline 18 | comments on things you will want to change. This code WILL NOT RUN as is and 19 | requires you to update missing variables to customize the predictor!! 20 | 21 | Missing values will be enclosed in < > to indicate this is where you (the 22 | software developer) must add some content 23 | 24 | 25 | ## Nomenclature 26 | 27 | 1. The predictor file should be called <name>_predictor.py 28 | 2. This should be inside a module in the /predictors/ directory called <name> 29 | 3. The single class this module implements should be called <Name>Predictor 30 | 31 | 32 | ## Class structure 33 | 34 | The class should have (at least) two functions: 35 | 36 | 1. A constructor (__init__()) which PRE LOADS the network from sparrow/data/networks/relevant_name - the get_data() function 37 | is defined in sparrow/__init__.py and allows absolute-path access to the /data directory. The constructor should 38 | FULLY load the network along with standard PARROT-style options, as shown here. Trained networks should be versioned and 39 | implemented so previous versions can be chosen even if the default version changes 40 | 41 | 2. Define a function called predict_<name>(self, seq) where <name> is a convenient name that obviously means this is 42 | what the function does. 43 | 44 | The idea is that this class should actually be completely standalone, independent of sparrow - i.e. one should be able to run 45 | 46 | >> from sparrow.predictors.<name> import <Name>Predictor 47 | >> 48 | >> P = <Name>Predictor() 49 | >> P.predict_<name>('myvalidsequence') 50 | 51 | And have it work! 52 | 53 | 54 | 55 | 56 | 57 | """ 58 | 59 | # NOTE - this is where you can define the version number that is read by default. If you add a new network MAKE SURE you update 60 | # this default if you want that new network to be used by default 61 | DEFAULT_VERSION="4" 62 | 63 | 64 | ## CHANGE class name 65 | class PScorePredictor(): 66 | """ 67 | 68 | Class that loads in a network such that predict_pscore() can be called to predict 69 | PScore propensity from a sequence. 70 | 71 | """ 72 | def __init__(self, version=None): 73 | """ 74 | Constructor for building a predictor object. The version keyword allows specific 75 | version(s) of the trained network associated with the predictor to be defined. 76 | 77 | By default, it's set to None, which leads to the current best/default network being selected 78 | and is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide 79 | the ability to pass a string as version. This string is inserted at position <version> in the filename 80 | 81 | pscore_predictor_network_v<version>.pt 82 | 83 | i.e. no need to include the "v" part or the .pt extension 84 | 85 | """ 86 | 87 | 88 | 89 | # if no version provided use default, then grab path and check that file actually exists! 90 | if version is None: 91 | version = DEFAULT_VERSION 92 | 93 | # CHANGE THIS!! Make sure you change the <path> and <filename> to the appropriate 94 | # paths.
58 | 
59 | # NOTE - this is where you can define the version number that is read by default. If you add a new network, MAKE SURE you update
60 | # this default if you want that new network to be used by default.
61 | DEFAULT_VERSION="4"
62 | 
63 | 
64 | ## CHANGE class name
65 | class PScorePredictor():
66 |     """
67 | 
68 |     Class that loads in a network such that predict_pscore() can be called to predict
69 |     PScore propensity from a sequence.
70 | 
71 |     """
72 |     def __init__(self, version=None):
73 |         """
74 |         Constructor for building a predictor object. The version keyword allows specific
75 |         version(s) of the trained network associated with the predictor to be defined.
76 | 
77 |         By default, it's set to None, which leads to the current best/default network being selected,
78 |         and this is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
79 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
80 | 
81 |             pscore_predictor_network_v<version>.pt
82 | 
83 |         i.e. no need to include the "v" part or the .pt extension.
84 | 
85 |         """
86 | 
87 | 
88 | 
89 |         # if no version provided use the default, then grab the path and check that the file actually exists!
90 |         if version is None:
91 |             version = DEFAULT_VERSION
92 | 
93 |         # CHANGE THIS!! Make sure you change the <module> and <network filename> to the appropriate
94 |         # paths. Keep the network_v{version}.pt because this is how a version-specific file is selected.
95 |         saved_weights = sparrow.get_data(f'networks/pscore/pscore_predictor_network_v{version}.pt')
96 | 
97 |         if not os.path.isfile(saved_weights):
98 |             raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' % (saved_weights, type(self).__name__))
99 | 
100 | 
101 |         # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU, mainly because
102 |         # we know everyone has a CPU...
103 |         loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
104 | 
105 |         ## DELETE ME PROBABLY
106 |         # this block of code is relevant ONLY if the trained network has the strange
107 |         # 'module.' text appended at the start of every keyword. This may happen in older
108 |         # versions of PARROT (see the DSSP predictor as an example of where it's needed), but
109 |         # networks trained in 2022 didn't need this. As such, this can PROBABLY be deleted, but
110 |         # in case you're using an older network we've kept this to make things simple.
111 | 
112 |         for i in range(len(loaded_model)):
113 |             key, value = loaded_model.popitem(last=False)
114 |             new_key = key[7:]
115 |             loaded_model[new_key] = value
116 |         ## END OF DELETE ME PROBABLY
117 | 
118 | 
119 |         # Dynamically calculate the hyperparameters used to train the network.
120 |         ## NOTE:
121 |         #
122 |         # The code here works on networks trained using the current version of PARROT (2022). HOWEVER, it's possible
123 |         # that in previous versions the keys into the parameter file may be different or may have a prefix. The best example
124 |         # of this is that for the DSSP predictor the word `module.` appears in front of each keyword. If you
125 |         # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this
126 |         # keyword.
127 | 
128 |         # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords
129 |         # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to
130 |         # reach out to Alex or Dan about this!
131 | 
132 |         num_layers = 0
133 |         while True:
134 |             s = f'lstm.weight_ih_l{num_layers}'
135 |             try:
136 |                 temp = loaded_model[s]
137 |                 num_layers += 1
138 |             except KeyError:
139 |                 break
140 | 
141 | 
142 |         number_of_classes = np.shape(loaded_model['fc.bias'])[0]
143 | 
144 |         # hard coded because we always use one-hot encoding; note that if you trained a specific
145 |         # predictor on a different encoding scheme you could, of course, simply define that
146 |         # encoding scheme here
147 |         input_size = 20
148 | 
149 |         hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
150 | 
151 |         # set these here so we can sanity check if needed
152 |         self.number_of_classes = number_of_classes
153 |         self.input_size = input_size
154 |         self.number_of_layers = num_layers
155 |         self.hidden_vector_size = hidden_vector_size
156 | 
157 |         # Instantiate network weights into object
158 |         self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
159 | 
160 |         # load parameters into model
161 |         self.network.load_state_dict(loaded_model)
162 | 
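The shape-sniffing above relies on how torch stores LSTM parameters; a standalone sketch with a toy network (not a sparrow API - the saved PARROT networks prefix these keys with 'lstm.'):

import torch

state = torch.nn.LSTM(input_size=20, hidden_size=32, num_layers=2,
                      bidirectional=True).state_dict()

num_layers = 0
while f'weight_ih_l{num_layers}' in state:
    num_layers += 1

# the i/f/g/o gate weights are stacked along dim 0, hence the divide-by-4
hidden = state['weight_ih_l0'].shape[0] // 4

assert (num_layers, hidden) == (2, 32)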
163 | 
164 |     ## CHANGE FUNCTION NAME
165 |     def predict_pscore(self, seq):
166 |         """
167 | 
168 |         Prediction function. seq should be a valid amino acid sequence.
169 | 
170 |         Parameters
171 |         ------------
172 |         seq : str
173 |             Valid amino acid sequence
174 | 
175 |         Returns
176 |         ----------
177 |         np.ndarray
178 |             Returns a 1D np.ndarray the length of the sequence where each position
179 |             is the predicted value
180 | 
181 |         """
182 | 
183 |         # convert sequence to uppercase
184 |         seq = seq.upper()
185 | 
186 |         # Convert to a one-hot sequence vector - note, as mentioned above, if you
187 |         # didn't use one-hot in the original training you could just edit this here
188 |         seq_vector = encode_sequence.one_hot(seq)
189 |         seq_vector = seq_vector.view(1, len(seq_vector), -1)  # formatting
190 | 
191 | 
192 |         ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
193 |         ##   CHANGE CODE BELOW HERE   ##
194 |         ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
195 | 
196 | 
197 |         ## CLASSIFICATION CODE BLOCK
198 |         # The block below should be kept if we're doing a classification-
199 |         # based prediction! If not, comment this out or delete it.
200 |         #prediction = self.network(seq_vector.float()).detach().numpy()
201 |         #int_vals = []
202 |         #for row in prediction[0]:
203 |         #    int_vals.append(np.argmax(row))
204 | 
205 |         #prediction = int_vals
206 | 
207 |         ## REGRESSION CODE BLOCK
208 |         # This block should be kept if we're doing a regression-based
209 |         # prediction. If not, comment this out or delete it.
210 |         prediction = self.network(seq_vector.float()).detach().numpy().flatten()
211 | 
212 | 
213 |         return prediction
214 | 
--------------------------------------------------------------------------------
/sparrow/predictors/rg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/rg/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaled_re/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaled_re/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaled_rg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaled_rg/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/scaling_exponent/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/scaling_exponent/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/tad/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/tad/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/tad/transactivation_domain_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 | 
4 | import sparrow
5 | 
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 | 
11 | 
12 | """
13 | NB: This network and predictor were imported from GOOSE, so they are subtly different internally from how
14 | some of the other predictors work. Notably, this predictor includes a softmax projection and a loop
15 | (the loop below) to convert the raw output into probabilities - this may be because these networks have 2 layers
16 | whereas the others only have one? Anyway, just making a note of this in case we need to debug in
17 | the future.
18 | 
19 |     score = []
20 |     for val in prediction:
21 |         score.append(round(val[1],5))
22 | 
23 | 
24 | """
25 | 
26 | # NOTE - this is where you can define the version number that is read by default. If you add a new network, MAKE SURE you update
27 | # this default if you want that new network to be used by default.
28 | DEFAULT_VERSION="1"
29 | 
30 | def softmax(v):
31 |     return (np.e ** v) / np.sum(np.e ** v)
32 | 
33 | 
34 | ## CHANGE class name
35 | class TADPredictor():
36 |     """
37 | 
38 |     Class that loads in a network such that predict_transactivation_domains() can be called to predict
39 |     transactivation domain propensity from a sequence.
40 | 
41 |     """
42 |     def __init__(self, version=None):
43 |         """
44 |         Constructor for building a predictor object. The version keyword allows specific
45 |         version(s) of the trained network associated with the predictor to be defined.
46 | 
47 |         By default, it's set to None, which leads to the current best/default network being selected,
48 |         and this is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
49 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
50 | 
51 |             tad_predictor_network_v<version>.pt
52 | 
53 |         i.e. no need to include the "v" part or the .pt extension.
54 | 
55 |         """
56 | 
57 | 
58 | 
59 |         # if no version provided use the default, then grab the path and check that the file actually exists!
60 |         if version is None:
61 |             version = DEFAULT_VERSION
62 | 
63 |         # CHANGE THIS!! Make sure you change the <module> and <network filename> to the appropriate
64 |         # paths. Keep the network_v{version}.pt because this is how a version-specific file is selected.
65 |         saved_weights = sparrow.get_data(f'networks/transactivation_domains/tad_predictor_network_v{version}.pt')
66 | 
67 |         if not os.path.isfile(saved_weights):
68 |             raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' % (saved_weights, type(self).__name__))
69 | 
70 | 
71 |         # assuming the file is there, we next read in the parameter file. Note that we force this to be CPU, mainly because
72 |         # we know everyone has a CPU...
73 |         loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
74 | 
75 |         ## DELETE ME PROBABLY
76 |         # this block of code is relevant ONLY if the trained network has the strange
77 |         # 'module.' text appended at the start of every keyword. This may happen in older
78 |         # versions of PARROT (see the DSSP predictor as an example of where it's needed), but
79 |         # networks trained in 2022 didn't need this. As such, this can PROBABLY be deleted, but
80 |         # in case you're using an older network we've kept this to make things simple.
81 | 
82 |         #for i in range(len(loaded_model)):
83 |         #    key, value = loaded_model.popitem(last=False)
84 |         #    new_key = key[7:]
85 |         #    loaded_model[new_key] = value
86 |         ## END OF DELETE ME PROBABLY
87 | 
88 | 
89 |         # Dynamically calculate the hyperparameters used to train the network.
90 |         ## NOTE:
91 |         #
92 |         # The code here works on networks trained using the current version of PARROT (2022). HOWEVER, it's possible
93 |         # that in previous versions the keys into the parameter file may be different or may have a prefix. The best example
94 |         # of this is that for the DSSP predictor the word `module.` appears in front of each keyword. If you
95 |         # look at dssp/dssp_predictor.py you can see at this point in the code there's a re-assignment to remove this
96 |         # keyword.
97 | 
98 |         # When PARROT runs its predictions it REQUIRES the keywords in the parameter file to match the expected keywords
99 |         # in PARROT, so it's imperative that these keywords are right. If you run into weird issues here feel free to
100 |         # reach out to Alex or Dan about this!
101 | 
102 |         num_layers = 0
103 |         while True:
104 |             s = f'lstm.weight_ih_l{num_layers}'
105 |             try:
106 |                 temp = loaded_model[s]
107 |                 num_layers += 1
108 |             except KeyError:
109 |                 break
110 | 
111 |         number_of_classes = np.shape(loaded_model['fc.bias'])[0]
112 | 
113 |         # Hard coded because we always use one-hot encoding; note that if you trained a specific
114 |         # predictor on a different encoding scheme you could, of course, simply define that
115 |         # encoding scheme here
116 |         input_size = 20
117 | 
118 |         hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
119 | 
120 |         # set these here so we can sanity check if needed
121 |         self.number_of_classes = number_of_classes
122 |         self.input_size = input_size
123 |         self.number_of_layers = num_layers
124 |         self.hidden_vector_size = hidden_vector_size
125 | 
126 |         # Instantiate network weights into object
127 |         self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
128 | 
129 |         # load parameters into model
130 |         self.network.load_state_dict(loaded_model)
131 | 
132 | 
133 | 
134 |     def predict_transactivation_domains(self, seq):
135 |         """
136 |         Function to predict the presence of transactivation domains (TADs). Returns a per-
137 |         residue probability score of a residue being in a TAD or not.
138 | 
139 |         Parameters
140 |         ------------
141 |         seq : str
142 |             Valid amino acid sequence
143 | 
144 |         Returns
145 |         ----------
146 |         list
147 |             Returns a list the length of the sequence where each position
148 |             gives the probability of that residue being in a TAD
149 | 
150 |         """
151 | 
152 |         # convert sequence to uppercase
153 |         seq = seq.upper()
154 | 
155 |         # Convert to a one-hot sequence vector - note, as mentioned above, if you
156 |         # didn't use one-hot in the original training you could just edit this here
157 |         seq_vector = encode_sequence.one_hot(seq)
158 |         seq_vector = seq_vector.view(1, len(seq_vector), -1)  # formatting
159 | 
160 | 
161 |         ## FORWARD PASS
162 |         # run the forward pass and flatten the raw network output; it is
163 |         # reshaped into per-class scores and softmaxed below
164 |         prediction = self.network(seq_vector.float()).detach().numpy().flatten()
165 | 
166 |         prediction = prediction.reshape(-1, self.number_of_classes)
167 |         prediction = np.array(list(map(softmax, prediction)))
168 | 
169 |         ## CLIP
170 |         # IF we want to ensure we have a value between 0 and 1, the clipping here
171 |         # will do that. If not, leave it commented.
172 |         #prediction = np.clip(prediction, 0.0, 1.0)
173 | 
174 |         # finally we extract out the local probabilities (probability of the positive class at each position)
175 |         score = []
176 |         for val in prediction:
177 |             score.append(round(val[1], 5))
178 | 
179 |         return score
180 | 
181 | 
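The two-class softmax-then-extract pattern noted at the top of this file, as a self-contained sketch with a numerically stable softmax (fabricated logits; the predictor itself uses the simpler exponential form above):

import numpy as np

def stable_softmax(v):
    e = np.exp(v - np.max(v))   # subtracting the max avoids overflow for large logits
    return e / e.sum()

logits = np.array([2.0, -1.0])  # hypothetical 2-class output for one residue
probs = stable_softmax(logits)
print(round(probs[1], 5))       # probability of the positive (TAD) class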
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
--------------------------------------------------------------------------------
/sparrow/predictors/transmembrane/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/predictors/transmembrane/__init__.py
--------------------------------------------------------------------------------
/sparrow/predictors/transmembrane/transmembrane_predictor.py:
--------------------------------------------------------------------------------
1 | from parrot import brnn_architecture
2 | from parrot import encode_sequence
3 | 
4 | import sparrow
5 | 
6 | import torch
7 | import numpy as np
8 | import os
9 | from sparrow.sparrow_exceptions import SparrowException
10 | 
11 | 
12 | 
13 | """
14 | Predictor of transmembrane regions from sequence.
15 | 
16 | This is an example of how to implement a system-specific predictor
17 | in sparrow and could/should be used as a template for adding in
18 | additional predictors.
19 | 
20 | 
21 | ## Nomenclature
22 | 
23 | 1. The predictor file should be called <predictor_name>_predictor.py
24 | 2. This should be inside a module in the /predictor/ directory called <predictor_name>
25 | 3. The single class this module implements should be called <PredictorName>Predictor
26 | 
27 | e.g. here we have
28 | 
29 | 1. transmembrane/
30 | 2. transmembrane_predictor.py
31 | 3. TransmembranePredictor
32 | 
33 | 
34 | ## Class structure
35 | 
36 | The class should have (at least) two functions:
37 | 
38 | 1. A constructor (__init__()) which PRE-LOADS the network from sparrow/data/networks/<relevant_name> - the get_data() function
39 |    is defined in sparrow/__init__.py and allows absolute-path access to the /data directory. The constructor should
40 |    FULLY load the network along with standard PARROT-style options, as shown here. Trained networks should be versioned and
41 |    implemented so previous versions can be chosen even if the default version changes.
42 | 
43 | 2. Define a function called predict_<predictor_name>(self, seq) where <predictor_name> is a convenient name that obviously means this is
44 |    what the function does.
45 | 
46 | 
47 | """
48 | 
49 | DEFAULT_VERSION="4"
50 | 
51 | 
52 | class TransmembranePredictor():
53 |     """
54 | 
55 |     Class that loads in a network such that predict_transmebrane_regions() can be called to predict
56 |     transmembrane regions in a sequence.
57 | 
58 |     """
59 |     def __init__(self, version=None):
60 |         """
61 |         Constructor for building a TransmembranePredictor object. The version keyword allows specific
62 |         version(s) of the trained network associated with the TransmembranePredictor to be defined.
63 |         By default, it's set to None, which leads to the current best/default network being selected,
64 |         and this is MOSTLY going to be the right option. However, to preserve backwards compatibility we provide
65 |         the ability to pass a string as version. This string is inserted at position <version> in the filename
66 | 
67 |             transmembrane_predictor_network_v<version>.pt
68 | 
69 |         i.e. no need to include the "v" part or the .pt extension.
70 | 
71 |         """
72 | 
73 |         if version is None:
74 |             version = DEFAULT_VERSION
75 | 
76 |         saved_weights = sparrow.get_data(f'networks/transmembrane/transmembrane_predictor_network_v{version}.pt')
77 | 
78 |         if not os.path.isfile(saved_weights):
79 |             raise SparrowException('Error: could not find saved weights file [%s] for %s predictor' % (saved_weights, type(self).__name__))
80 | 
81 | 
82 |         # read in the saved network weights (forced to CPU)
83 |         loaded_model = torch.load(saved_weights, map_location=torch.device('cpu'))
84 | 
85 |         # Dynamically read in the correct hyperparameters:
86 |         num_layers = 0
87 |         while True:
88 |             s = f'lstm.weight_ih_l{num_layers}'
89 |             try:
90 |                 temp = loaded_model[s]
91 |                 num_layers += 1
92 |             except KeyError:
93 |                 break
94 | 
95 |         number_of_classes = np.shape(loaded_model['fc.bias'])[0]
96 |         input_size = 20  # hard coded because we always use one-hot encoding
97 | 
98 |         hidden_vector_size = int(np.shape(loaded_model['lstm.weight_ih_l0'])[0] / 4)
99 | 
100 |         # set these here so we can sanity check if needed
101 |         self.number_of_classes = number_of_classes
102 |         self.input_size = input_size
103 |         self.number_of_layers = num_layers
104 |         self.hidden_vector_size = hidden_vector_size
105 | 
106 |         # Instantiate network weights into object
107 |         self.network = brnn_architecture.BRNN_MtM(input_size, hidden_vector_size, num_layers, number_of_classes, 'cpu')
108 | 
109 |         self.network.load_state_dict(loaded_model)
110 | 
111 | 
112 | 
113 |     def predict_transmebrane_regions(self, seq):
114 |         """
115 |         Prediction function. seq should be a valid amino acid sequence.
116 | 
117 |         Parameters
118 |         ------------
119 |         seq : str
120 |             Valid amino acid sequence
121 | 
122 |         Returns
123 |         ----------
124 |         list
125 |             Returns a list the length of the sequence where each position
126 |             is the predicted class label (via argmax) at that position.
127 | 
128 |         """
129 | 
130 |         # convert sequence to uppercase
131 |         seq = seq.upper()
132 | 
133 |         # Convert to a one-hot sequence vector
134 |         seq_vector = encode_sequence.one_hot(seq)
135 |         seq_vector = seq_vector.view(1, len(seq_vector), -1)  # formatting
136 | 
137 |         # Forward pass - this is specific for classification
138 |         prediction = self.network(seq_vector.float()).detach().numpy()
139 |         int_vals = []
140 |         for row in prediction[0]:
141 |             int_vals.append(np.argmax(row))
142 | 
143 |         prediction = int_vals
144 | 
145 | 
146 |         # for regression use the lines below instead - included here so this
147 |         # file can be easily copied over for future predictors
148 |         # prediction = self.network(seq_vector.float()).detach().numpy().flatten()
149 |         # prediction = np.clip(prediction, 0.0, 1.0)
150 | 
151 |         return prediction
152 | 
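The per-residue class decode above is just an argmax over the class dimension; a sketch with a fabricated (1 batch, 4 residues, 2 classes) output array:

import numpy as np

raw = np.array([[[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.6, 0.4]]])
labels = [int(np.argmax(row)) for row in raw[0]]
print(labels)  # [1, 0, 1, 0]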
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/sequence_analysis/__init__.py
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/alignment.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Union
2 | 
3 | from protfasta import read_fasta, write_fasta
4 | from pyfamsa import Aligner, Sequence
5 | 
6 | from sparrow import Protein
7 | from sparrow.visualize.sequence_visuals import show_sequence
8 | 
9 | 
10 | class SequenceAlignment:
11 |     def __init__(
12 |         self,
13 |         input_data: Union[str, Dict[str, Protein]],
14 |         threads: int = 0,
15 |         scoring_matrix: str = "BLOSUM62",
16 |         guide_tree: str = "upgma",
17 |         tree_heuristic: Union[str, None] = None,
18 |         medoid_threshold: int = 0,
19 |         n_refinements: int = 200,
20 |         keep_duplicates: bool = False,
21 |         refine: Union[bool, None] = None,
22 |     ):
23 |         """
24 |         Initialize the SequenceAlignment object.
25 | 
26 |         Parameters
27 |         ----------
28 |         input_data : Union[str, Dict[str, Protein]]
29 |             A path to a FASTA file, or a dictionary mapping FASTA headers to
30 |             Protein objects. The remaining keyword arguments are passed to pyfamsa's Aligner.
31 |         """
32 |         self.input_data = input_data
33 |         self.threads = threads
34 |         self.guide_tree = guide_tree
35 |         self.tree_heuristic = tree_heuristic
36 |         self.medoid_threshold = medoid_threshold
37 |         self.n_refinements = n_refinements
38 |         self.keep_duplicates = keep_duplicates
39 |         self.refine = refine
40 |         self.scoring_matrix = scoring_matrix
41 |         self.aligner = self._initialize_aligner()
42 |         self._cached_msa = None  # Cache for the computed MSA
43 | 
44 |     def _initialize_aligner(self) -> Aligner:
45 |         """
46 |         Initialize the Aligner object with the given parameters.
47 |         """
48 |         return Aligner(
49 |             threads=self.threads,
50 |             guide_tree=self.guide_tree,
51 |             tree_heuristic=self.tree_heuristic,
52 |             medoid_threshold=self.medoid_threshold,
53 |             n_refinements=self.n_refinements,
54 |             keep_duplicates=self.keep_duplicates,
55 |             refine=self.refine,
56 |             scoring_matrix=self.scoring_matrix,
57 |         )
58 | 
59 |     @staticmethod
60 |     def _encode_string(string_to_encode: str, encoding: str = "utf-8") -> bytes:
61 |         """
62 |         Encode a string to bytes using the specified encoding.
63 |         """
64 |         return string_to_encode.encode(encoding)
65 | 
66 |     def _load_sequences(self) -> List[Sequence]:
67 |         """
68 |         Load sequences from either a FASTA file or a dictionary of
69 |         header-to-Protein mappings.
70 | 
71 |         Returns
72 |         -------
73 |         List[Sequence]
74 |             A list of pyfamsa.Sequence objects for alignment.
75 |         """
76 |         if isinstance(self.input_data, str):
77 |             # Assume input_data is a path to a FASTA file
78 |             fasta_data = read_fasta(self.input_data)
79 |             sequences = [
80 |                 Sequence(self._encode_string(header), self._encode_string(seq))
81 |                 for header, seq in fasta_data.items()
82 |             ]
83 |         elif isinstance(self.input_data, dict):
84 |             # Assume input_data is a dictionary of header-to-Protein mappings
85 |             sequences = [
86 |                 Sequence(self._encode_string(header), self._encode_string(seq.sequence))
87 |                 for header, seq in self.input_data.items()
88 |             ]
89 |         else:
90 |             raise ValueError(
91 |                 "Invalid input_data format. Must be either a path to a FASTA "
92 |                 "file or a dictionary of header-to-Protein mappings."
93 |             )
94 | 
95 |         return sequences
96 | 
97 |     def construct_msa(self):
98 |         """
99 |         Construct a multiple sequence alignment with pyFAMSA.
100 | 
101 |         Returns
102 |         -------
103 |         Alignment
104 |             Returns the constructed MSA as a pyfamsa Alignment.
105 |         """
106 |         if self._cached_msa is not None:
107 |             # Return cached MSA if it exists
108 |             return self._cached_msa
109 | 
110 |         sequences = self._load_sequences()
111 |         self._cached_msa = self.aligner.align(sequences)  # Cache the computed MSA
112 |         return self._cached_msa
113 | 
114 |     @property
115 |     def alignment(self):
116 |         """
117 |         Property to access the cached MSA result.
118 | 
119 |         Returns
120 |         -------
121 |         Alignment
122 |             Returns the cached MSA if available, otherwise computes it.
123 |         """
124 |         if self._cached_msa is None:
125 |             # Compute MSA if it hasn't been computed yet
126 |             self.construct_msa()
127 |         return self._cached_msa
128 | 
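A minimal usage sketch of the class so far, assuming pyfamsa is installed (the two sequences are fabricated):

from sparrow import Protein
from sparrow.sequence_analysis.alignment import SequenceAlignment

proteins = {"seq1": Protein("MKVLAEGHKQD"), "seq2": Protein("MKVLAEHKQD")}
msa = SequenceAlignment(proteins).alignment   # computed once, cached thereafter
for s in msa:
    print(s.id.decode(), s.sequence.decode())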
129 |     def save_msa(
130 |         self, filename: str, linelength: int = 60, append_to_fasta: bool = False
131 |     ):
132 |         """
133 |         Save the multiple sequence alignment to a FASTA file.
134 | 
135 |         Parameters
136 |         ----------
137 |         filename : str
138 |             The filename to save the MSA. Should end with .fasta or .fa.
139 | 
140 |         linelength : int, optional
141 |             Length of lines in the output file, by default 60.
142 | 
143 |         append_to_fasta : bool, optional
144 |             Whether to append to an existing FASTA file, by default False.
145 |         """
146 |         msa = self.alignment
147 |         fasta_data = {seq.id.decode(): seq.sequence.decode() for seq in msa}
148 |         write_fasta(
149 |             fasta_data, filename, linelength=linelength, append_to_fasta=append_to_fasta
150 |         )
151 | 
152 |     # NB: not a @property - this method takes arguments, so it must be called.
153 |     def display_msa(self, ljust: int = 10, html: bool = False):
154 |         """
155 |         Print the multiple sequence alignment using the cached MSA.
156 | 
157 |         Parameters
158 |         ----------
159 |         ljust : int, optional
160 |             The number of spaces to pad the sequence ID, by default 10
161 | 
162 |         html : bool, optional
163 |             Set to True to print the alignment in HTML format, by default False
164 |         """
165 |         msa = self.alignment
166 | 
167 |         for seq in msa:
168 |             if html:
169 |                 print(seq.id.decode().ljust(ljust))
170 |                 show_sequence(seq.sequence.decode())
171 |             else:
172 |                 print(seq.id.decode().ljust(ljust), seq.sequence.decode())
173 | 
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/community_plugins/contributed.py:
--------------------------------------------------------------------------------
1 | from sparrow.sequence_analysis.plugins import BasePlugin
2 | 
3 | 
4 | class MultiplicativeFCR(BasePlugin):
5 |     def __init__(self, protein):
6 |         super().__init__(protein)
7 | 
8 |     def calculate(self, factor=2.0):
9 |         """
10 |         This analysis doubles (by default) the FCR (fraction of charged residues) of the protein.
11 |         This is a simple example of a contributed plugin.
12 | 
13 |         Parameters
14 |         -------------
15 |         factor : float
16 |             The factor by which the FCR will be multiplied (default is 2.0)
17 | 
18 |         Returns
19 |         -------------
20 |         float
21 |             Returns the result of the contributed analysis
22 |         """
23 |         return factor * self.protein.FCR
24 | 
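A hypothetical round-trip through the plugin machinery defined in plugins.py later in this module (assumes sparrow's Protein exposes the FCR property the plugin relies on; the sequence is made up):

from sparrow import Protein
from sparrow.sequence_analysis.plugins import PluginManager

p = Protein("MKRRDDEEAGSNNTT")
mgr = PluginManager(p)
print(mgr.MultiplicativeFCR(factor=2.0))  # 2 x FCR; result is cached per argument set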
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/elm.py:
--------------------------------------------------------------------------------
1 | import re
2 | from dataclasses import dataclass
3 | from typing import List, Set, Tuple, Union
4 | import pandas as pd
5 | 
6 | 
7 | import sparrow
8 | from sparrow.sparrow_exceptions import SparrowException
9 | 
10 | 
11 | @dataclass(frozen=True)
12 | class ELM:
13 |     regex: str
14 |     identifier: str
15 |     functional_site_name: str
16 |     description: str
17 |     probability: float
18 |     start: int
19 |     end: int
20 |     sequence: str
21 | 
22 |     def __eq__(self, other):
23 |         if self.start > other.end or self.end < other.start:
24 |             return False
25 | 
26 |         # Only compare regex patterns for equality - all regexes for ELMs are unique - we could also check functional site names?
27 |         return self.regex == other.regex
28 | 
29 |     def __hash__(self):
30 |         # I THINK this works since we're basically saying we don't CARE whether sequences are the same or not;
31 |         # this will let us do set differences and intersections.
32 |         # It does restrict a motif to starting at the same position, though, which we know could be at different spots after indels.
33 |         # This is fine for point-mutation comparison, but it could probably be generalized.
34 |         # We don't want to just look for "in the sequence" because there might be
35 |         # multiple occurrences of the same motif, and motif positioning may matter.
36 |         return hash((self.regex, self.functional_site_name, self.start))
37 | 
38 | 
39 | def parse_hgvs(hgvs_notation : str) -> Tuple:
40 |     """This function takes an HGVS notation and returns a tuple of the form (position, mutation),
41 |     where position is the position of the mutation and mutation is the amino acid change.
42 | 
43 |     Parameters
44 |     ----------
45 |     hgvs_notation : str
46 |         HGVS notation of the form p.XXXX
47 | 
48 |     Returns
49 |     -------
50 |     Tuple[int,str]
51 |         Tuple containing the 0-indexed position of the mutation and the amino acid change.
52 |     """
53 |     if not hgvs_notation.startswith("p."):
54 |         raise SparrowException("Invalid HGVS notation. Must start with 'p.'")
55 | 
56 |     parts = hgvs_notation.split('p.')
57 |     if len(parts) < 2:
58 |         raise SparrowException("Invalid HGVS notation. Must be in the form p.xxx")
59 | 
60 |     # Extract the (1-indexed) position and the amino acid change
61 |     position = int(''.join(filter(str.isdigit, parts[1])))
62 |     if position < 1: raise SparrowException(f"Invalid position in HGVS notation; must be a 1-indexed integer greater than 0. Received {position}")
63 |     mutation = parts[1][-1]
64 | 
65 |     # position shifted to 0 index
66 |     return position-1, mutation.upper()
67 | 
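A worked example of the one-to-zero index shift above (runs as-is alongside the function):

position, mutation = parse_hgvs("p.R47K")
print(position, mutation)   # -> 46 K  (1-indexed 47 becomes 0-indexed 46)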
68 | def generate_elm_df(file : str) -> pd.DataFrame:
69 |     """Generates a pandas DataFrame object containing all the information
70 |     annotated as an ELM.
71 | 
72 |     Parameters
73 |     ----------
74 |     file : str
75 |         Path to an ELM class file; this generates a dataframe from the elm_classes.tsv in the data directory.
76 |         The latest ELM class list can be found at http://elm.eu.org/downloads.html
77 | 
78 |     Returns
79 |     -------
80 |     pandas.DataFrame
81 |         DataFrame containing the ELM annotations.
82 | 
83 |     """
84 |     elm_data = []
85 |     with open(file, "r", encoding="utf-8") as f:
86 |         for line in f:
87 |             if line.startswith("#"):
88 |                 continue
89 |             if line.startswith('"Accession"'):
90 |                 columns = line.strip().split("\t")
91 |                 columns = [col.replace('"', '') for col in columns]
92 |             else:
93 |                 elm_data.append(line.replace('"', '').strip().split("\t"))
94 |     df = pd.DataFrame(elm_data, columns=columns)
95 |     return df
96 | 
97 | def find_all_elms(sequence : str) -> Set[ELM]:
98 |     """This function takes an input sequence and returns a set of ELM objects, each
99 |     containing the regex used to find the ELM in the sequence, its functional annotation,
100 |     the start and stop positions, and the sequence of the ELM itself.
101 | 
102 |     Parameters
103 |     ----------
104 |     sequence : str
105 |         Amino acid sequence
106 | 
107 |     Returns
108 |     -------
109 |     Set[sparrow.sequence_analysis.elm.ELM]
110 |         A set of ELM dataclass instances covering all possible ELMs in the given sequence.
111 |     """
112 |     elm_file = sparrow.get_data("elm_classes.tsv")
113 |     df = generate_elm_df(elm_file)
114 |     elms = []
115 |     for _, row in df.iterrows():
116 |         regex = row["Regex"]
117 |         elm_class = row["ELMIdentifier"]
118 |         site = row["FunctionalSiteName"]
119 |         elm_description = row["Description"]
120 |         elm_probability = row["Probability"]
121 | 
122 |         match_indices = [(m.start(0), m.end(0)) for m in re.finditer(regex, sequence)]
123 |         for (start, end) in match_indices:
124 |             elm = ELM(regex, elm_class, site, elm_description, elm_probability, start, end, sequence[start:end])
125 |             elms.append(elm)
126 |     return set(elms)
127 | 
128 | def compute_lost_elms(target_protein, query : Union[Tuple[int,str], str]) -> Set:
129 |     """This function takes a protein sequence and a target query and returns
130 |     the set of ELMs that were lost due to the mutation. The query can either be
131 |     a tuple of the form (position, mutant), where position is the position of
132 |     the mutation, or a string in the HGVS format.
133 | 
134 |     Parameters
135 |     ----------
136 |     target_protein : Union[sparrow.Protein, str]
137 |         sparrow.Protein or amino acid sequence
138 |     query : Union[str, Tuple[int,str]]
139 |         Tuple of the form (position, mutant) or an HGVS string.
140 | 
141 |     Returns
142 |     -------
143 |     Set
144 |         A set of ELMs containing the functional site name, the start and stop
145 |         positions, and the sequence of the ELM.
146 |     """
147 | 
148 |     if isinstance(target_protein, str):
149 |         target_protein = sparrow.Protein(target_protein)
150 | 
151 |     if isinstance(query, str):
152 |         position, mutation = parse_hgvs(query)
153 |     else:
154 |         position, mutation = query
155 | 
156 |     mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
157 | 
158 |     wt_elms = target_protein.elms
159 |     mutant_elms = mutant_protein.elms
160 |     lost_elms = wt_elms - mutant_elms
161 | 
162 |     return lost_elms
163 | 
164 | def compute_gained_elms(target_protein, query : Union[Tuple[int,str], str]) -> Set:
165 |     """This function takes a protein sequence and a target query and returns
166 |     the set of ELMs that were gained due to the mutation. The query can either be
167 |     a tuple of the form (position, mutant), where position is the position of
168 |     the mutation, or a string in the HGVS format.
169 | 
170 |     Parameters
171 |     ----------
172 |     target_protein : Union[sparrow.Protein, str]
173 |         sparrow.Protein or amino acid sequence
174 |     query : Union[str, Tuple[int,str]]
175 |         Tuple of the form (position, mutant) or an HGVS string.
176 | 
177 |     Returns
178 |     -------
179 |     Set
180 |         A set of ELMs containing the functional site name, the start and stop
181 |         positions, and the sequence of the ELM.
182 |     """
183 | 
184 |     if isinstance(target_protein, str):
185 |         target_protein = sparrow.Protein(target_protein)
186 | 
187 |     if isinstance(query, str):
188 |         position, mutation = parse_hgvs(query)
189 |     else:
190 |         position, mutation = query
191 | 
192 |     mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
193 | 
194 |     wt_elms = target_protein.elms
195 |     mutant_elms = mutant_protein.elms
196 |     gained_elms = mutant_elms - wt_elms
197 | 
198 |     return gained_elms
199 | 
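A hypothetical comparison run for the two helpers above (toy sequence; requires sparrow's bundled elm_classes.tsv and a Protein object exposing .elms):

wt = "MKSRRSPSPRRSKSRSP"                    # fabricated sequence
lost = compute_lost_elms(wt, "p.S3A")       # HGVS form
gained = compute_gained_elms(wt, (2, "A"))  # same mutation, (position, mutant) form
print(len(lost), len(gained))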
200 | def compute_retained_elms(target_protein, query : Union[Tuple[int,str], str]) -> Set:
201 |     """This function takes a protein sequence and a target query and returns
202 |     the set of ELMs that were retained (no change) after the mutation. The query
203 |     can either be a tuple of the form (position, mutant), where position is the
204 |     position of the mutation, or a string in the HGVS format.
205 | 
206 |     Parameters
207 |     ----------
208 |     target_protein : Union[sparrow.Protein, str]
209 |         sparrow.Protein or amino acid sequence
210 |     query : Union[str, Tuple[int,str]]
211 |         Tuple of the form (position, mutant) or an HGVS string.
212 | 
213 |     Returns
214 |     -------
215 |     Set
216 |         A set of ELMs containing the functional site name, the start and stop
217 |         positions, and the sequence of the ELM.
218 |     """
219 |     if isinstance(target_protein, str):
220 |         target_protein = sparrow.Protein(target_protein)
221 | 
222 |     if isinstance(query, str):
223 |         position, mutation = parse_hgvs(query)
224 |     else:
225 |         position, mutation = query
226 | 
227 |     mutant_protein = sparrow.Protein(target_protein.sequence[:position] + mutation + target_protein.sequence[position+1:])
228 | 
229 |     wt_elms = target_protein.elms
230 |     mutant_elms = mutant_protein.elms
231 |     retained_elms = wt_elms & mutant_elms
232 | 
233 |     return retained_elms
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/phospho_isoforms.py:
--------------------------------------------------------------------------------
1 | """
2 | Snippet built and adapted from localcider to get all phosphoisoforms
3 | of an amino acid sequence; it is integrated into sparrow.
4 | 
5 | By : Garrett M. Ginell
6 | 2023-02-08
7 | 
8 | The BASIC workflow is as follows:
9 | 
10 | To get a list of phosphoisoforms, run get_phosphoisoforms:
11 | 
12 |     phosphoSeqome = get_phosphoisoforms(sequence, mode='predict')
13 |     # for options see the various run variations in the function header
14 | 
15 | Then, once you have the phosphoisoforms from the list above, you can iterate over
16 | the list, calculate a sequence parameter of choice, and build a distribution:
17 | 
18 |     parameter_list = []
19 |     for s in phosphoSeqome:
20 |         parameter_list.append(Protein(s).my_parameter_of_choice)
21 | 
22 | This distribution can then be compared back to the value of the original sequence:
23 | 
24 |     Protein(sequence).my_parameter_of_choice
25 | """
26 | import itertools
27 | 
28 | ## -----------------------------------------
29 | ##
30 | def _predict_all_phosphosites(protein):
31 |     """
32 |     Gets a list of predicted phosphosites.
33 | 
34 |     BASED OFF OF the phosphorylation predictors in sparrow:
35 |     https://github.com/idptools/sparrow/tree/main/sparrow/predictors/phosphorylation
36 | 
37 | 
38 |     Parameters
39 |     ------------
40 |     protein : sparrow.Protein
41 |         sparrow Protein object
42 | 
43 | 
44 |     Returns:
45 |     ----------
46 |     list
47 |         list of predicted positions of phosphorylated T, S, and Y sites.
48 |         Note positions are returned as indexed from 0.
49 | 
50 |     """
51 | 
52 |     # predict phosphosites
53 |     pS = protein.predictor.serine_phosphorylation(return_sites_only=True)
54 |     pT = protein.predictor.threonine_phosphorylation(return_sites_only=True)
55 |     pY = protein.predictor.tyrosine_phosphorylation(return_sites_only=True)
56 | 
57 |     return list(pS + pT + pY)
58 | 
59 | ## ----------------------------------------
60 | ##
61 | def _get_all_phosphosites(sequence):
62 |     """
63 |     Function which returns a list of all the positions which *could* be
64 |     phosphorylated (i.e. are T/S/Y). NOTE this does not use any kind of
65 |     smart lookup, metadata, or analysis. It's literally: where are the Y/T/S residues?
66 | 
67 |     Note positions are returned as indexed from 0.
68 | 
69 |     Parameters
70 |     ------------
71 |     sequence : str
72 |         Valid amino acid sequence
73 | 
74 |     Returns:
75 |     ----------
76 |     list
77 |         list of integers corresponding to S/T/Y positions in your sequence
78 | 
79 |     """
80 |     sites = []
81 |     idx = 0
82 |     for i in sequence:
83 |         if i in ["Y", "S", "T"]:
84 |             sites.append(idx)
85 |         idx = idx + 1
86 |     return sites
87 | 
88 | ## -----------------------------------
89 | ##
90 | def _build_phosphoSeqome(sequence, phosphosites, phospho_rate=1):
91 |     """
92 |     Build all phospho-isoforms based on the provided phosphosites.
93 | 
94 |     Parameters
95 |     ------------
96 |     sequence : str
97 |         Valid amino acid sequence
98 | 
99 |     phosphosites : list
100 |         List of valid phosphosite positions
101 | 
102 |     phospho_rate : float
103 |         Value between 0 and 1 which defines the maximum fraction of phosphosites
104 |         that can be 'phosphorylated' in each sequence. Default is 1 (i.e. all sites
105 |         can be phosphorylated).
106 | 
107 |     Returns
108 |     ----------
109 |     list
110 |         list of sequences for all possible phospho-isoforms,
111 |         based on the provided list of phosphosites.
112 | 
113 |         When phospho_rate = 1 (100%),
114 |         the length of the output list = 2^n where n=len(phosphosites)
115 |     """
116 | 
117 |     _max_phospho_number = int(len(phosphosites)*phospho_rate)
118 |     ## GET ALL phospho-sequence combinations
119 |     phosphoSeqome = []
120 |     for phosphostatus in itertools.product("01", repeat=len(phosphosites)):
121 | 
122 |         if phosphostatus.count('1') > _max_phospho_number:
123 |             continue
124 |         newseq = list(sequence)
125 | 
126 |         indx = 0
127 |         # loop over each element in our phosphosite on/off list
128 |         for i in phosphostatus:
129 |             # if that element is ON
130 |             if int(i) == 1:
131 |                 # set the AA at that position to a negative residue (we use E but
132 |                 # it could be D)
133 |                 newseq[phosphosites[indx]] = "E"
134 |             indx = indx + 1
135 | 
136 |         # now we've replaced some number of T/Y/S with E, representing a different
137 |         # phosphostate
138 |         newseq = "".join(newseq)
139 |         phosphoSeqome.append(newseq)
140 | 
141 |     return phosphoSeqome
142 | 
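A toy enumeration mirroring the itertools.product logic above: two candidate sites yield 2^2 isoforms (sequence and site indices fabricated):

import itertools

seq, sites = "ASTA", [1, 2]
isoforms = []
for status in itertools.product("01", repeat=len(sites)):
    s = list(seq)
    for flag, pos in zip(status, sites):
        if flag == "1":
            s[pos] = "E"          # phosphomimetic substitution
    isoforms.append("".join(s))
print(isoforms)  # ['ASTA', 'ASEA', 'AETA', 'AEEA']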
143 | ## -----------------------------------
144 | ##
145 | def get_phosphoisoforms(protein, mode="all", phospho_rate=1, phosphosites=None):
146 |     """Phosphosites are replaced with the phosphomimetic 'E', enabling approximate calculation
147 |     of charge-based sequence features in the presence of phosphorylated residues.
148 | 
149 |     Parameters
150 |     ----------
151 |     protein : sparrow.Protein
152 |         sparrow Protein object
153 | 
154 |     mode : str, optional
155 |         Definition for how the phosphosites should be determined, by default "all"
156 | 
157 |         'all' : Assumes all S/T/Y residues are potential phosphosites
158 | 
159 |         'predict' : Leverages PARROT-trained predictors via _predict_all_phosphosites
160 |         to predict phosphorylated sites based on sequence.
161 | 
162 |         'custom' : uses the 'phosphosites' parameter as indices for phosphosites.
163 | 
164 |     phospho_rate : float, optional
165 |         Value between 0 and 1 which defines the maximum fraction of phosphosites that
166 |         can be 'phosphorylated' in each sequence, by default 1 (i.e. all sites can be
167 |         phosphorylated)
168 | 
169 |     phosphosites : list, optional
170 |         Custom list of indices for valid phosphosite positions, by default None
171 | 
172 |     Returns
173 |     -------
174 |     list
175 |         list of sequences for the possible phosphoisoforms based on the selected method.
176 |         Phosphorylatable amino acids are replaced with 'E'.
177 |     """
178 | 
179 |     # get phosphosite positions
180 |     if mode == 'all':
181 |         _phosphosites = _get_all_phosphosites(protein.sequence)
182 |     elif mode == 'predict':
183 |         _phosphosites = _predict_all_phosphosites(protein)
184 |     elif mode == 'custom':
185 |         if phosphosites is not None:
186 |             _phosphosites = phosphosites
187 |         else:
188 |             raise Exception("To use mode 'custom', phosphosites must be defined")
189 |     else:
190 |         raise Exception("Please specify a valid mode ('all', 'predict', or 'custom')")
191 | 
192 |     # generate all phospho-isoforms
193 |     return _build_phosphoSeqome(protein.sequence, _phosphosites, phospho_rate=phospho_rate)
194 | 
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/physical_properties.py:
--------------------------------------------------------------------------------
1 | from sparrow.data import amino_acids
2 | 
3 | ## The physical properties module contains stateless functions that compute sequence-dependent
4 | ## physical properties. See the "calculate_molecular_weight" function as a template for how
5 | ## these functions should work.
6 | ##
7 | ##
8 | 
9 | def calculate_molecular_weight(sequence):
10 |     """
11 |     Function that returns the molecular weight of a protein sequence assuming standard
12 |     amino acid molecular weights.
13 | 
14 |     Parameters
15 |     -------------
16 |     sequence : str
17 |         String containing the amino acid sequence (upper case one-letter residue codes)
18 | 
19 |     Returns
20 |     -----------
21 |     float
22 |         Returns the residue or polypeptide molecular weight.
23 | 
24 |     """
25 | 
26 |     # compute the naive MW (sum of free amino acid weights)
27 |     MW = 0
28 |     for i in sequence:
29 |         MW = MW + amino_acids.AA_MOLECULAR_WEIGHT[i]
30 | 
31 |     if len(sequence) == 1:
32 |         return MW
33 | 
34 |     else:
35 |         return MW - 18*(len(sequence)-1)   # subtract one water (18 Da) per peptide bond
36 | 
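A quick check of the bookkeeping above (a sketch; the exact masses depend on the values in amino_acids.AA_MOLECULAR_WEIGHT): a dipeptide's weight is the two residue weights minus one water.

from sparrow.sequence_analysis.physical_properties import calculate_molecular_weight

mw_g = calculate_molecular_weight("G")    # single residue: free glycine
mw_gg = calculate_molecular_weight("GG")  # dipeptide: 2*G - 18
assert abs((2 * mw_g - 18) - mw_gg) < 1e-6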
--------------------------------------------------------------------------------
/sparrow/sequence_analysis/plugins.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import inspect
3 | import pkgutil
4 | from abc import ABC, abstractmethod
5 | from collections import defaultdict
6 | from typing import Any
7 | 
8 | 
9 | class PluginWrapper:
10 |     """
11 |     A wrapper class for plugins that integrates with the plugin manager.
12 | 
13 |     This class is responsible for managing the execution of plugin instances
14 |     and caching their results to avoid redundant computations. It uses a
15 |     combination of the plugin name and the arguments passed to the plugin's
16 |     `calculate` method to create a unique cache key for storing results.
17 | 
18 |     Attributes:
19 |         name (str): The name of the plugin.
20 |         cache_dict (dict): A dictionary used to store cached results.
21 |         plugin_instance (object): An instance of the plugin to be wrapped.
22 | 
23 |     Methods:
24 |         __call__(*args, **kwargs):
25 |             Executes the plugin's `calculate` method with the provided arguments.
26 |             Caches the result to avoid recomputation on subsequent calls with
27 |             the same arguments.
28 |     """
29 | 
30 |     def __init__(self, name, cache_dict, plugin_instance):
31 |         self.name = name
32 |         self.cache_dict = cache_dict
33 |         self.plugin_instance = plugin_instance
34 | 
35 |     def __call__(self, *args, **kwargs):
36 |         """
37 |         Call calculate() with or without arguments.
38 |         Implements caching to avoid recomputation.
39 |         """
40 |         # Create a hashable cache key from args and kwargs
41 |         cache_key = (args, frozenset(kwargs.items()))
42 | 
43 |         # Check if the result is cached
44 |         if cache_key not in self.cache_dict[self.name]:
45 |             self.cache_dict[self.name][cache_key] = self.plugin_instance.calculate(
46 |                 *args, **kwargs
47 |             )
48 | 
49 |         return self.cache_dict[self.name][cache_key]
50 | 
51 | 
52 | class PluginManager:
53 |     def __init__(self, protein: "sparrow.Protein"):
54 |         self.__protein_obj = protein
55 |         # Memoization for both args and no-args results
56 |         self.__precomputed = defaultdict(dict)
57 |         self.__plugins = {}
58 | 
59 |         self._available_plugins = self._discover_plugins()
60 | 
61 |     def _discover_plugins(self):
62 |         """
63 |         Discover all plugins available in the contributed plugin module.
64 |         """
65 |         plugin_module = "sparrow.sequence_analysis.community_plugins.contributed"
66 |         try:
67 |             module = importlib.import_module(plugin_module)
68 |             return [
69 |                 name
70 |                 for name, obj in inspect.getmembers(module, inspect.isclass)
71 |                 if issubclass(obj, BasePlugin) and obj.__module__ == plugin_module
72 |             ]
73 |         except ModuleNotFoundError:
74 |             return []
75 | 
76 |     def __getattr__(self, name: str):
77 |         """
78 |         Dynamically load and return the plugin's calculate method result
79 |         as if it were a property when accessed without arguments.
80 |         """
81 |         if name not in self.__plugins:
82 |             try:
83 |                 module = importlib.import_module(
84 |                     "sparrow.sequence_analysis.community_plugins.contributed"
85 |                 )
86 |                 plugin_class = getattr(module, name)
87 |                 if not issubclass(plugin_class, BasePlugin):
88 |                     raise AttributeError(f"{name} is not a valid plugin.")
89 |                 self.__plugins[name] = plugin_class(protein=self.__protein_obj)
90 |             except (ModuleNotFoundError, AttributeError):
91 |                 raise AttributeError(
92 |                     f"Plugin '{name}' not found. Available plugins are: {list(self._available_plugins)}"
93 |                 )
94 | 
95 |         plugin_instance = self.__plugins[name]
96 | 
97 |         return PluginWrapper(name, self.__precomputed, plugin_instance)
98 | 
99 |     def __dir__(self):
100 |         """
101 |         Return the list of dynamically available plugins, for autocompletion quality of life.
102 |         """
103 |         return super().__dir__() + self._available_plugins
104 | 
105 | 
106 | class BasePlugin(ABC):
107 |     """Base class for all community contributed plugins."""
108 | 
109 |     def __init__(self, protein: "sparrow.Protein"):
110 |         """Constructor for all plugins. This must provide a protein object or sequence."""
111 |         self.__protein_obj = protein
112 | 
113 |     @abstractmethod
114 |     def calculate(self) -> Any:
115 |         """
116 |         This method must operate on the sequence attribute of the protein object.
117 |         The method must return the result of the contributed analysis.
118 | 
119 |         Returns
120 |         -------------
121 |         float
122 |             Returns the result of the contributed analysis
123 |         """
124 |         pass
125 | 
126 |     @property
127 |     def protein(self):
128 |         return self.__protein_obj
129 | 
--------------------------------------------------------------------------------
/sparrow/sparrow_exceptions.py:
--------------------------------------------------------------------------------
1 | class SparrowException(Exception):
2 |     pass
3 | 
4 | 
5 | class ProteinException(Exception):
6 |     pass
7 | 
8 | 
9 | class PatterningException(Exception):
10 |     pass
11 | 
12 | 
13 | class CalculationException(Exception):
14 |     pass
15 | 
--------------------------------------------------------------------------------
/sparrow/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Empty init file in case you choose a test package besides pytest (such as nose) which may look for such a file.
3 | """
4 | 
5 | import numpy as np
6 | 
7 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS
8 | 
9 | 
10 | def build_seq(min_count=10, max_count=50):
11 | 
12 |     # how many residues
13 |     n_res = np.random.randint(4, 20)
14 | 
15 |     s = ''
16 |     for i in range(n_res):
17 |         aa_idx = np.random.randint(0, 20)
18 |         s = s + VALID_AMINO_ACIDS[aa_idx]*np.random.randint(min_count, max_count)
19 | 
20 |     s = list(s)
21 |     np.random.shuffle(s)
22 |     s = "".join(s)
23 |     return s
24 | 
--------------------------------------------------------------------------------
/sparrow/tests/compute_test_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "725f76f7",
6 |    "metadata": {},
7 |    "source": [
8 |     "### Dictionary that recomputes the test_data "
9 |    ]
10 |   },
11 |   {
12 |    "cell_type": "code",
13 |    "execution_count": 1,
14 |    "id": "92383cc0",
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": []
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 41,
22 |    "id": "5494688b",
23 |    "metadata": {},
24 |    "outputs": [
25 |     {
26 |      "name": "stdout",
27 |      "output_type": "stream",
28 |      "text": [
29 |       "0.43746367\n",
30 |       "32.81683\n",
31 |       "33.175683185378716\n",
32 |       "81.15200236853273\n",
33 |       "75.92058\n",
34 |       "0.5773746\n",
35 |       "5.815894\n"
36 |      ]
37 |     }
38 |    ],
39 |    "source": [
40 |     "from sparrow import Protein\n",
41 |     "\n",
42 |     "P = Protein('MKYLAAYLLLNAAGNTPDATKIKAILESVGIEIEDEKVSSVLSALEGKSVDELITEGNEKLAAVPAAGPASAGGAAAASGDAAAEEEKEEEAAEESDDDMGFGLFD')\n",
43 |     "\n",
44 |     "print(P.predictor.asphericity())\n",
45 |     "\n",
46 |     "print(P.predictor.radius_of_gyration())\n",
47 |     "print(P.predictor.radius_of_gyration(use_scaled=True))\n",
48 |     "\n",
49 |     "print(P.predictor.end_to_end_distance(use_scaled=True))\n",
50 |     "print(P.predictor.end_to_end_distance(use_scaled=False))\n",
51 |     "\n",
52 |     "print(P.predictor.scaling_exponent())\n",
53 |     "print(P.predictor.prefactor())\n",
54 |     "from sparrow.data.amino_acids import VALID_AMINO_ACIDS\n"
55 |    ]
56 |   },
57 |   {
58 |    "cell_type": "code",
59 |    "execution_count": 47,
60 |    "id": "4e59d0a1",
61 |    "metadata": {},
62 |    "outputs": [],
63 |    "source": [
64 |     "from sparrow import Protein\n",
65 |     "import pytest\n",
66 |     "import protfasta\n",
67 |     "import os\n", "import numpy as np\n",
68 |     "\n",
69 |     "current_filepath = os.getcwd()\n",
70 |     "onehundred_seqs = \"{}/test_data/test_seqs_100.fasta\".format(current_filepath)\n",
71 |     "\n",
72 |     "seqs = protfasta.read_fasta(onehundred_seqs)\n"
73 |    ]
74 |   },
75 |   {
76 |    "cell_type": "code",
77 |    "execution_count": 4,
78 | "id": "3ed4d1e5", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "k2rg = {}\n", 83 | "for k in seqs:\n", 84 | " k2rg[k] = Protein(seqs[k]).predictor.radius_of_gyration()\n", 85 | "\n", 86 | "np.save('test_data/test_100_rg_v2.npy', np.array(k2rg, dtype=dict)) " 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 48, 92 | "id": "e71f57dd", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "k2rg = {}\n", 97 | "for k in seqs:\n", 98 | " k2rg[k] = Protein(seqs[k]).predictor.radius_of_gyration(use_scaled=True)\n", 99 | "\n", 100 | "np.save('test_data/test_100_rg_scaled_v2.npy', np.array(k2rg, dtype=dict)) " 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 49, 106 | "id": "12872bec", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "k2re = {}\n", 111 | "for k in seqs:\n", 112 | " k2re[k] = Protein(seqs[k]).predictor.end_to_end_distance()\n", 113 | "\n", 114 | "np.save('test_data/test_100_re_v2.npy', np.array(k2re, dtype=dict)) " 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 50, 120 | "id": "3bc0cd1a", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "k2re = {}\n", 125 | "for k in seqs:\n", 126 | " k2re[k] = Protein(seqs[k]).predictor.end_to_end_distance(use_scaled=True)\n", 127 | "\n", 128 | "np.save('test_data/test_100_re_scaled_v2.npy', np.array(k2re, dtype=dict)) " 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 51, 134 | "id": "47f17564", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "k2asph = {}\n", 139 | "for k in seqs:\n", 140 | " k2asph[k] = Protein(seqs[k]).predictor.asphericity()\n", 141 | "\n", 142 | "np.save('test_data/test_100_asph_v2.npy', np.array(k2asph, dtype=dict)) " 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 52, 148 | "id": "202cdc34", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "k2scal_exp = {}\n", 153 | "for k in seqs:\n", 154 | " k2scal_exp[k] = Protein(seqs[k]).predictor.scaling_exponent()\n", 155 | "\n", 156 | "np.save('test_data/test_100_exponent_v2.npy', np.array(k2scal_exp, dtype=dict)) " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 53, 162 | "id": "39eb54c0", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "k2prefact = {}\n", 167 | "for k in seqs:\n", 168 | " k2prefact[k] = Protein(seqs[k]).predictor.prefactor()\n", 169 | "\n", 170 | "np.save('test_data/test_100_prefactor_v2.npy', np.array(k2prefact, dtype=dict)) " 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 3, 176 | "id": "f4d6bf39", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "from sparrow.patterning import iwd" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 21, 186 | "id": "868afb92", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "k2_average_bivariate_inverse_distance_charge = {}\n", 191 | "k2_average_inverse_distance_charge_neg = {}\n", 192 | "k2_average_inverse_distance_charge_pos = {}\n", 193 | "k2_average_inverse_distance_ali = {}\n", 194 | "\n", 195 | "for k in seqs:\n", 196 | "\n", 197 | " local_seq = seqs[k]\n", 198 | " \n", 199 | " ncpr = Protein(seqs[k]).linear_sequence_profile('NCPR')\n", 200 | " \n", 201 | " k2_average_bivariate_inverse_distance_charge[k] = iwd.calculate_average_bivariate_inverse_distance_charge(ncpr, local_seq)\n", 202 | " k2_average_inverse_distance_charge_neg[k] = 
iwd.calculate_average_inverse_distance_charge(ncpr, local_seq, '-')\n", 203 | " k2_average_inverse_distance_charge_pos[k] = iwd.calculate_average_inverse_distance_charge(ncpr, local_seq, '+')\n", 204 | " k2_average_inverse_distance_ali[k] = iwd.calculate_average_inverse_distance_from_sequence(local_seq, 'ILVAM')\n", 205 | " \n", 206 | " \n", 207 | "np.save('test_data/test_average_bivariate_inverse_distance_charge.npy', np.array(k2_average_bivariate_inverse_distance_charge, dtype=dict)) \n", 208 | "np.save('test_data/test_average_inverse_distance_charge_neg.npy', np.array(k2_average_inverse_distance_charge_neg, dtype=dict)) \n", 209 | "np.save('test_data/test_average_inverse_distance_charge_pos.npy', np.array(k2_average_inverse_distance_charge_pos, dtype=dict)) \n", 210 | "np.save('test_data/test_average_inverse_distance_ali.npy', np.array(k2_average_inverse_distance_ali, dtype=dict)) \n", 211 | " \n", 212 | " \n", 213 | " " 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 26, 219 | "id": "54a12190", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "0.27504330372096264" 226 | ] 227 | }, 228 | "execution_count": 26, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "Protein('ALEPLEALELASEPLALELAEPDEKKAEPLAEPLAEKAKEPALE').compute_iwd" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "52e332aa", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3 (ipykernel)", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.8.12" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 5 267 | } 268 | -------------------------------------------------------------------------------- /sparrow/tests/generate_test_data/generate_dssp_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "c81ae04b", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from sparrow.predictors.dssp.dssp_predictor import DSSPPredictor\n", 11 | "import numpy as np\n", 12 | "import protfasta\n", 13 | "import pickle\n", 14 | "\n", 15 | "natural_proteins = protfasta.read_fasta('../test_data/test_seqs_100.fasta')\n", 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "503d8163", 22 | "metadata": {}, 23 | "source": [ 24 | "### Helicity predictions" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "ab18972d", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "helicity_class = {}\n", 35 | "helicity_prob = {}\n", 36 | "\n", 37 | "X2 = DSSPPredictor(version=2)\n", 38 | "\n", 39 | "for k in natural_proteins:\n", 40 | " s = natural_proteins[k]\n", 41 | " helicity_class[k] = X2.predict_helicity_smart(s)\n", 42 | " helicity_prob[k] = X2.predict_helical_probability(s)\n", 43 | "\n", 44 | "with open('../test_data/helicity_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 45 | " pickle.dump(helicity_class, f) \n", 46 | " \n", 47 | "with 
open('../test_data/helicity_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 48 | " pickle.dump(helicity_prob, f) " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 11, 54 | "id": "cf90aec4", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "extended_class = {}\n", 59 | "extended_prob = {}\n", 60 | "\n", 61 | "X2 = DSSPPredictor(version=2)\n", 62 | "\n", 63 | "for k in natural_proteins:\n", 64 | " s = natural_proteins[k]\n", 65 | " extended_class[k] = X2.predict_extended_smart(s)\n", 66 | " extended_prob[k] = X2.predict_extended_probability(s)\n", 67 | "\n", 68 | "with open('../test_data/extended_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 69 | " pickle.dump(extended_class, f) \n", 70 | " \n", 71 | "with open('../test_data/extended_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 72 | " pickle.dump(extended_prob, f) " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 12, 78 | "id": "8ae2e5c6", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "coil_class = {}\n", 83 | "coil_prob = {}\n", 84 | "\n", 85 | "X2 = DSSPPredictor(version=2)\n", 86 | "\n", 87 | "for k in natural_proteins:\n", 88 | " s = natural_proteins[k]\n", 89 | " coil_class[k] = X2.predict_coil_smart(s)\n", 90 | " coil_prob[k] = X2.predict_coil_probability(s)\n", 91 | "\n", 92 | "with open('../test_data/coil_class_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 93 | " pickle.dump(coil_class, f) \n", 94 | " \n", 95 | "with open('../test_data/coil_prob_v2_default_test_seqs_100.pickle', 'wb') as f:\n", 96 | " pickle.dump(coil_prob, f) " 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "243f8c54", 102 | "metadata": {}, 103 | "source": [ 104 | "## Non-default data\n", 105 | "The code below generates sequences with non-default settings for the threshold and minimum length to vary this value and ensure all works well there" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 27, 111 | "id": "7d703b3d", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "helicity_class = {}\n", 116 | "\n", 117 | "X2 = DSSPPredictor(version=2)\n", 118 | "\n", 119 | "for k in natural_proteins:\n", 120 | " s = natural_proteins[k]\n", 121 | " \n", 122 | " thresh = np.random.random()\n", 123 | " minlen = np.random.randint(1,13)\n", 124 | " \n", 125 | " tmp = X2.predict_helicity_smart(s, threshold=thresh, minlen=minlen)\n", 126 | " \n", 127 | " helicity_class[k] = [thresh, minlen, tmp]\n", 128 | "\n", 129 | "with open('../test_data/helicity_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n", 130 | " pickle.dump(helicity_class, f) \n", 131 | " \n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 29, 137 | "id": "09d2bdac", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "extended_class = {}\n", 142 | "\n", 143 | "X2 = DSSPPredictor(version=2)\n", 144 | "\n", 145 | "for k in natural_proteins:\n", 146 | " s = natural_proteins[k]\n", 147 | " \n", 148 | " thresh = np.random.random()\n", 149 | " minlen = np.random.randint(1,13)\n", 150 | " \n", 151 | " tmp = X2.predict_extended_smart(s, threshold=thresh, minlen=minlen)\n", 152 | " \n", 153 | " extended_class[k] = [thresh, minlen, tmp]\n", 154 | "\n", 155 | "with open('../test_data/extended_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n", 156 | " pickle.dump(extended_class, f) \n", 157 | " \n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 30, 163 | "id": "187ae833", 
164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "coil_class = {}\n", 168 | "\n", 169 | "X2 = DSSPPredictor(version=2)\n", 170 | "\n", 171 | "for k in natural_proteins:\n", 172 | " s = natural_proteins[k]\n", 173 | " \n", 174 | " thresh = np.random.random()\n", 175 | " minlen = np.random.randint(1,13)\n", 176 | " \n", 177 | " tmp = X2.predict_coil_smart(s, threshold=thresh, minlen=minlen)\n", 178 | " \n", 179 | " coil_class[k] = [thresh, minlen, tmp]\n", 180 | "\n", 181 | "with open('../test_data/coil_class_v2_non_default_test_seqs_100.pickle', 'wb') as f:\n", 182 | " pickle.dump(coil_class, f) \n", 183 | " \n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "b42dee92", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3 (ipykernel)", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.8.12" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 5 216 | } 217 | -------------------------------------------------------------------------------- /sparrow/tests/generate_test_data/helicity_class_v2_default.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/generate_test_data/helicity_class_v2_default.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/coil_class_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_class_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/coil_class_v2_non_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_class_v2_non_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/coil_prob_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/coil_prob_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/extended_class_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_class_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/extended_class_v2_non_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_class_v2_non_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/extended_prob_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/extended_prob_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/helicity_class_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_class_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/helicity_class_v2_non_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_class_v2_non_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/helicity_prob_v2_default_test_seqs_100.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/helicity_prob_v2_default_test_seqs_100.pickle -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_asph.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_asph.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_asph_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_asph_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_exponent.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_exponent.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_exponent_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_exponent_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_prefactor.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_prefactor.npy 
-------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_prefactor_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_prefactor_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_re.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_re_scaled.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_scaled.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_re_scaled_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_scaled_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_re_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_re_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_rg.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_rg_scaled.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_scaled.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_rg_scaled_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_scaled_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_rg_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_rg_v2.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_100_scd.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_scd.npy -------------------------------------------------------------------------------- 
/sparrow/tests/test_data/test_100_shd.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_100_shd.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_average_bivariate_inverse_distance_charge.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_bivariate_inverse_distance_charge.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_average_inverse_distance_ali.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_ali.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_average_inverse_distance_charge_neg.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_charge_neg.npy -------------------------------------------------------------------------------- /sparrow/tests/test_data/test_average_inverse_distance_charge_pos.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_data/test_average_inverse_distance_charge_pos.npy -------------------------------------------------------------------------------- /sparrow/tests/test_iwd.py: -------------------------------------------------------------------------------- 1 | from sparrow.patterning import iwd 2 | import os 3 | import protfasta 4 | import numpy as np 5 | from sparrow import Protein 6 | 7 | current_filepath = os.getcwd() 8 | onehundred_seqs = "{}/test_data/test_seqs_100.fasta".format(current_filepath) 9 | 10 | seqs = protfasta.read_fasta(onehundred_seqs) 11 | 12 | 13 | def test_average_bivariate_inverse_distance_charge(): 14 | 15 | k2val = np.load('test_data/test_average_bivariate_inverse_distance_charge.npy', allow_pickle=True).item() 16 | for k in seqs: 17 | assert np.isclose(Protein(seqs[k]).compute_bivariate_iwd_charged_weighted(), k2val[k]) 18 | 19 | 20 | def test_average_inverse_distance_charge_neg(): 21 | 22 | k2val = np.load('test_data/test_average_inverse_distance_charge_neg.npy', allow_pickle=True).item() 23 | for k in seqs: 24 | assert np.isclose(Protein(seqs[k]).compute_iwd_charged_weighted('-'), k2val[k]) 25 | 26 | def test_average_inverse_distance_charge_pos(): 27 | 28 | k2val = np.load('test_data/test_average_inverse_distance_charge_pos.npy', allow_pickle=True).item() 29 | for k in seqs: 30 | assert np.isclose(Protein(seqs[k]).compute_iwd_charged_weighted('+'), k2val[k]) 31 | 32 | def test_average_inverse_distance_ali(): 33 | 34 | k2val = np.load('test_data/test_average_inverse_distance_ali.npy', allow_pickle=True).item() 35 | for k in seqs: 36 | assert np.isclose(Protein(seqs[k]).compute_iwd('ILVAM'), k2val[k]) 37 | 38 | 39 | -------------------------------------------------------------------------------- /sparrow/tests/test_kappa.py: 
-------------------------------------------------------------------------------- 1 | # Import package, test suite, and other packages as needed 2 | import sparrow 3 | import pytest 4 | import sys 5 | import numpy as np 6 | from sparrow.protein import Protein 7 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS 8 | import random 9 | 10 | 11 | 12 | USE_LOCALCIDER = True 13 | 14 | 15 | def test_kappa(): 16 | 17 | das = [ 18 | 'EKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEK', 19 | 'EEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEK', 20 | 'KEKKKEKKEEKKEEKEKEKEKEEKKKEEKEKEKEKKKEEKEKEEKKEEEE', 21 | 'KEKEEKEKKKEEEEKEKKKKEEKEKEKEKEEKKEEKKKKEEKEEKEKEKE', 22 | 'KEKEKKEEKEKKEEEKKEKEKEKKKEEKKKEEKEEKKEEKKKEEKEEEKE', 23 | 'EEEKKEKKEEKEEKKEKKEKEEEKKKEKEEKKEEEKKKEKEEEEKKKKEK', 24 | 'EEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEK', 25 | 'KKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKE', 26 | 'EEKKEEEKEKEKEEEEEKKEKKEKKEKKKEEKEKEKKKEKKKKEKEEEKE', 27 | 'EKKKKKKEEKKKEEEEEKKKEEEKKKEKKEEKEKEEKEKKEKKEEKEEEE', 28 | 'EKEKKKKKEEEKKEKEEEEKEEEEKKKKKEKEEEKEEKKEEKEKKKEEKK', 29 | 'EKKEEEEEEKEKKEEEEKEKEKKEKEEKEKKEKKKEKKEEEKEKKKKEKK', 30 | 'KEKKKEKEKKEKKKEEEKKKEEEKEKKKEEKKEKKEKKEEEEEEEKEEKE', 31 | 'EKKEKEEKEEEEKKKKKEEKEKKEKKKKEKKKKKEEEEEEKEEKEKEKEE', 32 | 'KKEKKEKKKEKKEKKEEEKEKEKKEKKKKEKEKKEEEEEEEEKEEKKEEE', 33 | 'EKEKEEKKKEEKKKKEKKEKEEKKEKEKEKKEEEEEEEEEKEKKEKKKKE', 34 | 'EKEKKKKKKEKEKKKKEKEKKEKKEKEEEKEEKEKEKKEEKKEEEEEEEE', 35 | 'KEEKKEEEEEEEKEEKKKKKEKKKEKKEEEKKKEEKKKEEEEEEKKKKEK', 36 | 'EEEEEKKKKKEEEEEKKKKKEEEEEKKKKKEEEEEKKKKKEEEEEKKKKK', 37 | 'EEKEEEEEEKEEEKEEKKEEEKEKKEKKEKEEKKEKKKKKKKKKKKKEEE', 38 | 'EEEEEEEEEKEKKKKKEKEEKKKKKKEKKEKKKKEKKEEEEEEKEEEKKK', 39 | 'KEEEEKEEKEEKKKKEKEEKEKKKKKKKKKKKKEKKEEEEEEEEKEKEEE', 40 | 'EEEEEKEEEEEEEEEEEKEEKEKKKKKKEKKKKKKKEKEKKKKEKKEEKK', 41 | 'EEEEKEEEEEKEEEEEEEEEEEEKKKEEKKKKKEKKKKKKKEKKKKKKKK', 42 | 'EEEEEEEEEEEKEEEEKEEKEEKEKKKKKKKKKKKKKKKKKKEEKKEEKE', 43 | 'KEEEEEEEKEEKEEEEEEEEEKEEEEKEEKKKKKKKKKKKKKKKKKKKKE', 44 | 'KKEKKKEKKEEEEEEEEEEEEEEEEEEEEKEEKKKKKKKKKKKKKKKEKK', 45 | 'EKKKKKKKKKKKKKKKKKKKKKEEEEEEEEEEEEEEEEEEKKEEEEEKEK', 46 | 'KEEEEKEEEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKK', 47 | 'EEEEEEEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKKKK'] 48 | 49 | das_kappa_vals = [0.000963782329781065, 50 | 0.006849987601594839, 51 | 0.02510380091732725, 52 | 0.023779919834168346, 53 | 0.014793830994527891, 54 | 0.030699929748093432, 55 | 0.055155094748869704, 56 | 0.055155094748869704, 57 | 0.06207283537900597, 58 | 0.09244645817707578, 59 | 0.08182457866549872, 60 | 0.08535584477384989, 61 | 0.09376754013641903, 62 | 0.12779464725771064, 63 | 0.13589023055307498, 64 | 0.14253932524913954, 65 | 0.17465693111603184, 66 | 0.16361063576296123, 67 | 0.2184643791753562, 68 | 0.2683678441326591, 69 | 0.2836833506008589, 70 | 0.3168464032629612, 71 | 0.35941633427624997, 72 | 0.45755189798526164, 73 | 0.5278595348152701, 74 | 0.5935761144891406, 75 | 0.6553235220661426, 76 | 0.7440558474562516, 77 | 0.8658988417475169, 78 | 1.0] 79 | 80 | for p in range(len(das)): 81 | assert np.isclose(das_kappa_vals[p], Protein(das[p]).kappa, atol=0.03) 82 | 83 | if USE_LOCALCIDER: 84 | from localcider.sequenceParameters import SequenceParameters 85 | nseqs = 100 86 | max_count = 100 87 | n_diff_res = 10 88 | 89 | res_set = VALID_AMINO_ACIDS.copy() 90 | 91 | for i in range(nseqs): 92 | random.shuffle(res_set) 93 | local_res = res_set[:n_diff_res] 94 | seq = '' 95 | for aa in local_res: 96 | seq = seq + aa*random.randint(1,max_count) 97 | 98 | seq = list(seq) 99 | random.shuffle(seq) 100 | seq = 
"".join(seq) 101 | 102 | P = Protein(seq) 103 | 104 | # skip sequences 105 | if P.fraction_negative == 0 or P.fraction_positive == 0: 106 | continue 107 | 108 | SO = SequenceParameters(seq) 109 | assert np.isclose(P.NCPR, SO.get_NCPR()) 110 | assert np.isclose(P.FCR, SO.get_FCR()) 111 | 112 | # note, this will stochastically fial from time to time.. 113 | assert np.isclose(P.kappa, SO.get_kappa(), atol=0.03) 114 | 115 | 116 | def test_kappa_range(): 117 | 118 | for i in range(100): 119 | 120 | Es = 'E'*random.randint(1,60) 121 | Ks = 'K'*random.randint(1,60) 122 | Gs = 'G'*random.randint(1,100) 123 | 124 | tmp = Es+Ks+Gs 125 | if len(tmp) < 7: 126 | continue 127 | 128 | tmp_list = list(tmp) 129 | random.shuffle(tmp_list) 130 | tmp = "".join(tmp_list) 131 | 132 | p = Protein(tmp) 133 | k = p.kappa 134 | 135 | assert k > 0 136 | assert k < 1 137 | 138 | -------------------------------------------------------------------------------- /sparrow/tests/test_plugins.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sparrow.protein import Protein 4 | from sparrow.sequence_analysis.community_plugins.contributed import MultiplicativeFCR 5 | from sparrow.sequence_analysis.plugins import BasePlugin 6 | 7 | 8 | @pytest.fixture 9 | def protein(): 10 | sequence = "LLERYIPKHQKCLTSAQRSSIDPLDIEDVYQHKKPKFSSKSHIWHVYNENSNRQKLEHVKVNKGSKASLFINKEDVYEYYQKDPKNTKFGKSKHKQSTLDQIYSTGLRKGNLHNVKDPNTNVPKGIGRRKTQHKRTQVDDVDCNPRKILAVSPSRRINRLVTYQQHIPETHNDLPEELCEPSSLTLSSLRNGLDSSTEACSVSKEKHIQNLDLSDSQEVQCLELESVDQTEAVSFPGLLLHKEIKLPVVTTDKQPHTLQEQHHVLYKSHENSNLV" 11 | return Protein(sequence) 12 | 13 | 14 | def test_multiplicative_fcr_plugin(protein): 15 | plugin_manager = protein.plugin 16 | double_fcr_result = plugin_manager.MultiplicativeFCR() 17 | expected_result = 2.0 * protein.FCR 18 | assert pytest.approx(double_fcr_result, 0.000001) == expected_result 19 | 20 | 21 | def test_plugin_manager_cache(protein): 22 | plugin_manager = protein.plugin 23 | first_result = plugin_manager.MultiplicativeFCR() 24 | second_result = plugin_manager.MultiplicativeFCR() 25 | assert first_result == second_result 26 | 27 | 28 | def test_invalid_plugin(protein): 29 | plugin_manager = protein.plugin 30 | with pytest.raises(AttributeError): 31 | plugin_manager.NonExistentPlugin 32 | 33 | 34 | def test_multiple_plugins(protein): 35 | class TripleFCR(BasePlugin): 36 | def calculate(self, factor=3.0): 37 | return factor * self.protein.FCR 38 | 39 | class QuadrupleFCR(BasePlugin): 40 | def calculate(self, factor=4.0): 41 | return factor * self.protein.FCR 42 | 43 | plugin_manager = protein.plugin 44 | # plugin_manager._PluginManager__plugins is a dictionary that stores plugins. 45 | # we can add a new plugin to it by assigning a new key-value pair to it. 
46 | plugin_manager._PluginManager__plugins["TripleFCR"] = TripleFCR(protein) 47 | plugin_manager._PluginManager__plugins["QuadrupleFCR"] = QuadrupleFCR(protein) 48 | 49 | # Testing TripleFCR plugin 50 | triple_fcr_result = plugin_manager.TripleFCR(factor=3.0) 51 | expected_triple_result = 3.0 * protein.FCR 52 | assert pytest.approx(triple_fcr_result, 0.000001) == expected_triple_result 53 | 54 | # Testing QuadrupleFCR plugin 55 | quadruple_fcr_result = plugin_manager.QuadrupleFCR(factor=4.0) 56 | expected_quadruple_result = 4.0 * protein.FCR 57 | assert pytest.approx(quadruple_fcr_result, 0.000001) == expected_quadruple_result 58 | 59 | 60 | def test_base_plugin_initialization(protein): 61 | class TestPlugin(BasePlugin): 62 | def calculate(self): 63 | return protein.FCR 64 | 65 | plugin = TestPlugin(protein) 66 | assert plugin.protein == protein 67 | 68 | 69 | def test_base_plugin_abstract_method(protein): 70 | with pytest.raises(TypeError): 71 | BasePlugin(protein) 72 | -------------------------------------------------------------------------------- /sparrow/tests/test_polymeric.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tests/test_polymeric.py -------------------------------------------------------------------------------- /sparrow/tests/test_predictor_disorder.py: -------------------------------------------------------------------------------- 1 | # Import package, test suite, and other packages as needed 2 | import sparrow 3 | import pytest 4 | import sys 5 | import numpy as np 6 | from sparrow.protein import Protein 7 | 8 | def test_protein_code_coverage(): 9 | 10 | P = Protein('MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQ') 11 | 12 | 13 | # V2 14 | # assert np.isclose(np.mean(P.predictor.disorder()), 0.8636131147540983) 15 | 16 | assert np.isclose(np.mean(P.predictor.disorder()), 0.92875415) 17 | -------------------------------------------------------------------------------- /sparrow/tests/test_protein.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit and regression test for the sparrow package. 
3 | """ 4 | 5 | # Import package, test suite, and other packages as needed 6 | import random 7 | import sys 8 | 9 | import numpy as np 10 | import pytest 11 | 12 | import sparrow 13 | from sparrow.data.amino_acids import VALID_AMINO_ACIDS 14 | from sparrow.protein import Protein 15 | from sparrow.sequence_analysis.elm import ( 16 | ELM, 17 | compute_gained_elms, 18 | compute_lost_elms, 19 | compute_retained_elms, 20 | ) 21 | 22 | 23 | def test_sparrow_imported(): 24 | """Sample test, will always pass so long as import statement worked""" 25 | assert "sparrow" in sys.modules 26 | 27 | 28 | def test_protein_code_coverage(): 29 | 30 | 31 | s = 'MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQ' 32 | # constructor 33 | P = Protein(s) 34 | assert len(P) == 61 35 | 36 | P = Protein(s, validate=True) 37 | assert len(P) == 61 38 | 39 | s_broken = 'MKASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSXYGQSSYSSYXQ' 40 | P = Protein(s_broken, validate=True) 41 | assert len(P) == 61 42 | assert s == P.sequence 43 | 44 | 45 | 46 | assert len(P.amino_acid_fractions) == 20 47 | assert P.FCR == 0.04918032786885246 48 | assert P.fraction_positive == 0.01639344262295082 49 | assert P.fraction_negative == 0.03278688524590164 50 | assert P.NCPR == -0.01639344262295082 51 | assert P.fraction_aromatic == 0.16393442622950818 52 | assert P.fraction_aliphatic == 0.06557377049180328 53 | assert P.fraction_polar == 0.6721311475409836 54 | assert P.fraction_proline == 0.04918032786885246 55 | 56 | # V2 57 | # assert np.mean(P.predictor.disorder()) == 0.8636131147540983 58 | 59 | assert np.isclose(np.mean(P.predictor.disorder()), 0.92875415) 60 | assert P.hydrophobicity == 3.052459016393442 61 | assert P.compute_residue_fractions(['P','E','K','R','D']) == 0.09836065573770492 62 | 63 | assert np.mean(P.linear_sequence_profile('FCR')) == 0.04918032786885246 64 | assert np.mean(P.linear_sequence_profile('NCPR')) == -0.02459016393442623 65 | assert np.mean(P.linear_sequence_profile('aromatic')) == 0.1680327868852459 66 | assert np.mean(P.linear_sequence_profile('aliphatic')) == 0.05737704918032787 67 | assert np.mean(P.linear_sequence_profile('polar')) == 0.6762295081967213 68 | assert np.mean(P.linear_sequence_profile('proline')) == 0.04918032786885246 69 | assert np.mean(P.linear_sequence_profile('positive')) == 0.012295081967213115 70 | assert np.mean(P.linear_sequence_profile('negative')) == 0.036885245901639344 71 | assert np.isclose(np.mean(P.linear_sequence_profile('hydrophobicity')),3.0450819672131146) 72 | assert np.mean(P.linear_composition_profile(['E','K'])) == 0.012295081967213115 73 | 74 | P = Protein("KRRARKRRARKRRARKRRAR") 75 | elms = P.elms 76 | func_sites = [] 77 | elm_sequences = [] 78 | start, end = [],[] 79 | for elm in elms: 80 | start.append(elm.start) 81 | end.append(elm.end) 82 | elm_sequences.append(elm.sequence) 83 | func_sites.append(elm.functional_site_name) 84 | func_sites = list(set(func_sites)) 85 | for func_site in func_sites: 86 | assert func_site in ['di Arginine retention/retrieving signal', 87 | 'CendR Motif Binding to Neuropilin Receptors', 88 | 'NLS classical Nuclear Localization Signals', 89 | 'N-degron', 90 | 'NRD cleavage site', 91 | 'PCSK cleavage site'] 92 | assert sorted(start) == sorted([1, 6, 11, 16, 4, 9, 14, 0, 5, 10, 15, 0, 5, 10, 15, 1, 11, 0, 16, 1, 6, 11, 16, 0, 3, 13, 4, 14, 1, 9]) 93 | assert sorted(end) == sorted([4, 9, 14, 19, 9, 14, 19, 3, 8, 13, 18, 3, 8, 13, 18, 8, 18, 3, 20, 5, 10, 15, 20, 20, 9, 19, 10, 20, 9, 15]) 94 | assert sorted(elm_sequences) == 
sorted(['RRA', 95 | 'RRA', 96 | 'RRA', 97 | 'RRA', 98 | 'RKRRA', 99 | 'RKRRA', 100 | 'RKRRA', 101 | 'KRR', 102 | 'KRR', 103 | 'KRR', 104 | 'KRR', 105 | 'KRR', 106 | 'KRR', 107 | 'KRR', 108 | 'KRR', 109 | 'RRARKRR', 110 | 'RRARKRR', 111 | 'KRR', 112 | 'RRAR', 113 | 'RRAR', 114 | 'RRAR', 115 | 'RRAR', 116 | 'RRAR', 117 | 'KRRARKRRARKRRARKRRAR', 118 | 'ARKRRA', 119 | 'ARKRRA', 120 | 'RKRRAR', 121 | 'RKRRAR', 122 | 'RRARKRRA', 123 | 'RKRRAR']) 124 | 125 | def test_elm_comparisons(): 126 | wt = sparrow.Protein("MKKK") 127 | mut = sparrow.Protein("MRKK") 128 | 129 | wt_elms = wt.elms 130 | mut_elms = mut.elms 131 | 132 | assert wt.elms == { 133 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=3, sequence='KKK') 134 | } 135 | assert mut.elms == { 136 | ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK'), 137 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK') 138 | } 139 | 140 | assert wt.elms - mut.elms == set() 141 | assert wt.elms & mut.elms == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MKKK')} 142 | 143 | assert compute_lost_elms(wt,[2,"K"]) == set() 144 | assert compute_retained_elms(wt,"p.K1R") == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=3, sequence='RKK')} 145 | assert compute_gained_elms(wt,"p.K2R") == {ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK')} 146 | 147 | assert compute_retained_elms(mut,"p.M1K") == {ELM(regex='(.RK)|(RR[^KR])', functional_site_name='NRD cleavage site', start=0, end=3, sequence='MRK'), 148 | ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK')} 149 | 150 | assert compute_gained_elms(mut,"p.M1K") == {ELM(regex='KR.', functional_site_name='PCSK cleavage site', start=0, end=3, sequence='KRK'), 151 | ELM(regex='[KR]R.', functional_site_name='PCSK cleavage site', start=0, end=3, sequence='KRK')} 152 | assert compute_lost_elms(mut, "p.M1G") == {ELM(regex='^M{0,1}[RK][^P].', functional_site_name='N-degron', start=0, end=4, sequence='MRKK')} 153 | 154 | 155 | -------------------------------------------------------------------------------- /sparrow/tests/test_scd.py: -------------------------------------------------------------------------------- 1 | from sparrow.patterning import scd 2 | import os 3 | import protfasta 4 | import numpy as np 5 | from sparrow import Protein 6 | from IPython import embed 7 | 8 | current_filepath = os.getcwd() 9 | onehundred_seqs = "{}/test_data/test_seqs_100.fasta".format(current_filepath) 10 | 11 | seqs = protfasta.read_fasta(onehundred_seqs) 12 | 13 | def test_scd(): 14 | 15 | k2val = np.load('test_data/test_100_scd.npy', allow_pickle=True).item() 16 | for k in seqs: 17 | s = seqs[k] 18 | cython_SCD = getattr(Protein(s),"SCD") 19 | no_cython_SCD = k2val[k] 20 | assert np.isclose(cython_SCD, no_cython_SCD) 21 | 22 | def test_shd(): 23 | k2val = np.load('test_data/test_100_shd.npy', allow_pickle=True).item() 24 | for k in seqs: 25 | s = seqs[k] 26 | assert np.isclose(getattr(Protein(s),"SHD"), k2val[k]) 27 | 28 | -------------------------------------------------------------------------------- /sparrow/tests/test_sparrow.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit and regression test for the sparrow package. 
3 | """ 4 | 5 | # Import package, test suite, and other packages as needed 6 | import sparrow 7 | import pytest 8 | import sys 9 | 10 | def test_sparrow_imported(): 11 | """Sample test, will always pass so long as import statement worked""" 12 | assert "sparrow" in sys.modules 13 | -------------------------------------------------------------------------------- /sparrow/tests/test_sparrow_vs_localcider.py: -------------------------------------------------------------------------------- 1 | from localcider.sequenceParameters import SequenceParameters 2 | from sparrow import Protein 3 | 4 | from . import build_seq 5 | 6 | import numpy as np 7 | 8 | NSEQS=100 9 | 10 | def test_FCR(): 11 | 12 | for i in range(NSEQS): 13 | s = build_seq() 14 | assert np.isclose(SequenceParameters(s).get_FCR(), Protein(s).FCR, atol=1e-8) 15 | 16 | 17 | def test_NCPR(): 18 | 19 | for i in range(NSEQS): 20 | s = build_seq() 21 | assert np.isclose(SequenceParameters(s).get_NCPR(), Protein(s).NCPR, atol=1e-8) 22 | 23 | 24 | def test_fraction_neg_fraction_pos(): 25 | 26 | for i in range(NSEQS): 27 | s = build_seq() 28 | assert np.isclose(SequenceParameters(s).get_countNeg()/len(s), Protein(s).fraction_negative, atol=1e-8) 29 | assert np.isclose(SequenceParameters(s).get_countPos()/len(s), Protein(s).fraction_positive, atol=1e-8) 30 | 31 | def test_hydrophobiciyty(): 32 | 33 | for i in range(NSEQS): 34 | s = build_seq() 35 | assert np.isclose(SequenceParameters(s).get_uversky_hydropathy(), Protein(s).hydrophobicity/9, atol=1e-8) 36 | 37 | -------------------------------------------------------------------------------- /sparrow/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/tools/__init__.py -------------------------------------------------------------------------------- /sparrow/tools/general_tools.py: -------------------------------------------------------------------------------- 1 | from sparrow.data import amino_acids 2 | 3 | 4 | def is_valid_protein_sequence(sequence): 5 | """ 6 | Function that tests if a passed sequence contains non-standard ammino acids 7 | 8 | Parameters 9 | ---------------- 10 | sequence : str 11 | Protein sequence 12 | 13 | Returns 14 | --------------- 15 | bool 16 | If sequences contains non-standard amino acids returns False, else returns 17 | True 18 | """ 19 | 20 | for i in sequence: 21 | if i not in amino_acids.VALID_AMINO_ACIDS: 22 | return False 23 | 24 | return True 25 | 26 | 27 | 28 | def compare_sequence(s1, s2, verbose=False, ignore_gaps=False, return_positions=False): 29 | """ 30 | Function that compares two sequences of the same length and returns 31 | either the set of positions where the sequences are different (indxed at 0) or 32 | the number of differences between them, depending on the status of the flag 33 | return_position. This function Will also print the differences if verbose is 34 | set to True. 35 | 36 | If ignore_gaps is set to True, will ignore gaps in the comparison (i.e. 37 | will ignore '-' characters in either sequence). This is useful when running 38 | analyses for aligned sequences. 
26 | 27 | 28 | def compare_sequence(s1, s2, verbose=False, ignore_gaps=False, return_positions=False): 29 | """ 30 | Function that compares two sequences of the same length and returns 31 | either the list of positions where the sequences are different (indexed at 0) or 32 | the number of differences between them, depending on the status of the flag 33 | return_positions. This function will also print the differences if verbose is 34 | set to True. 35 | 36 | If ignore_gaps is set to True, will ignore gaps in the comparison (i.e. 37 | will ignore '-' characters in either sequence). This is useful when running 38 | analyses for aligned sequences. 39 | 40 | WARNING: Sequences must have the same length - if two passed sequences are not 41 | identical in terms of length then this function throws a ValueError 42 | 43 | Parameters 44 | ---------------- 45 | s1 : str 46 | First sequence to compare 47 | 48 | s2 : str 49 | Second sequence to compare 50 | 51 | verbose : bool 52 | If True, will print the differences between the two sequences. 53 | Default is False 54 | 55 | return_positions : bool 56 | If True, will return a list of positions where the two sequences 57 | differ. If False, return the count only. 58 | 59 | Returns 60 | --------------- 61 | int or list 62 | Number of differences between the two sequences, or the list of differing positions if return_positions is True 63 | 64 | Raises 65 | --------------- 66 | ValueError 67 | If sequences are not the same length. 68 | 69 | """ 70 | 71 | # first things first check if sequences are the same length and 72 | # freak out if not! 73 | if len(s1) != len(s2): 74 | raise ValueError("Sequences must have the same length") 75 | 76 | # define comparison function based on ignore_gaps 77 | if ignore_gaps: 78 | def _compare(p1,p2): 79 | if p1 == "-" or p2 == "-": 80 | return False 81 | elif p1 == p2: 82 | return False 83 | else: 84 | return True 85 | else: 86 | def _compare(p1,p2): 87 | if p1 == p2: 88 | return False 89 | else: 90 | return True 91 | 92 | 93 | # cycle through each position in the sequence 94 | positions = [] 95 | for i in range(len(s1)): 96 | if _compare(s1[i],s2[i]): 97 | positions.append(i) 98 | if verbose: 99 | print(f"{i+1}: {s1[i]} vs. {s2[i]}") 100 | 101 | 102 | if return_positions: 103 | return positions 104 | else: 105 | return len(positions) 106 | -------------------------------------------------------------------------------- /sparrow/tools/utilities.py: -------------------------------------------------------------------------------- 1 | from sparrow.sparrow_exceptions import SparrowException 2 | 3 | def validate_keyword_option(keyword, allowed_vals, keyword_name, error_message=None): 4 | """ 5 | Helper function that checks a passed keyword is only one of a set of possible 6 | valid keywords 7 | 8 | Parameters 9 | ----------- 10 | keyword : str 11 | The actual passed keyword value 12 | 13 | allowed_vals : list of str 14 | A list of possible keywords 15 | 16 | keyword_name : str 17 | the name of the keyword as the user would select it in the function call 18 | 19 | error_message : str 20 | Allows the user to pass a custom error message to be used if the keyword is invalid 21 | 22 | 23 | Returns 24 | -------- 25 | None 26 | 27 | No return value, but raises sparrow_exceptions.SparrowException if ``keyword`` is not 28 | found in the allowed_vals list 29 | 30 | """ 31 | 32 | 33 | if keyword not in allowed_vals: 34 | if error_message is None: 35 | raise SparrowException(f'Keyword {keyword_name} passed value [{keyword}], but this is not valid.\nMust be one of: {str(allowed_vals)}') 36 | else: 37 | raise SparrowException(error_message) 38 | --------------------------------------------------------------------------------
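A brief, hypothetical usage sketch for the two tools modules above (the sequences and keyword values are invented for illustration; only functions defined in these files are used):

    from sparrow.tools.general_tools import compare_sequence
    from sparrow.tools.utilities import validate_keyword_option

    # count mismatches between two aligned sequences, skipping gap ('-') positions
    compare_sequence('MKA-SND', 'MKT-SND', ignore_gaps=True)      # -> 1

    # 0-indexed positions where the sequences differ
    compare_sequence('MKASND', 'MKTSND', return_positions=True)   # -> [2]

    # raises SparrowException because 'mean' is not one of the allowed values
    validate_keyword_option('mean', ['median', 'max'], 'mode')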
/sparrow/visualize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idptools/sparrow/4afdef8c640bd878cd5632f8f452db580f5fa574/sparrow/visualize/__init__.py -------------------------------------------------------------------------------- /sparrow/visualize/sequence_visuals.py: -------------------------------------------------------------------------------- 1 | #from IPython import display 2 | from IPython.display import display 3 | from IPython.display import HTML 4 | 5 | from sparrow.data.amino_acids import AA_COLOR 6 | from sparrow.sparrow_exceptions import SparrowException 7 | 8 | def show_sequence(seq, 9 | blocksize=10, 10 | newline=50, 11 | fontsize=14, 12 | font_family='Courier', 13 | colors={}, 14 | header=None, 15 | bold_positions=[], 16 | bold_residues=[], 17 | opaque_positions=[], 18 | return_raw_string=False, 19 | warnings = True): 20 | 21 | """ 22 | Function that generates an HTML colored string that either renders in the browser or returns the 23 | html string. Contains various customizable components. 24 | 25 | Parameters 26 | ------------- 27 | 28 | blocksize : int 29 | Defines how big blocks of residues are. Blocks are equal to blocksize or the newline parameter, whichever is smaller. 30 | Default=10. If set to -1 uses length of the sequence. 31 | 32 | newline : int 33 | Defines how many residues are shown before a newline is printed. Default is 50. If set to -1 uses the length of 34 | the sequence. 35 | 36 | fontsize : int 37 | Fontsize used. Default is 14 38 | 39 | font_family : str 40 | Which font family (from HTML fonts) is used. Using a non-monospace font makes no sense as columns will be 41 | unaligned. Default is Courier. 42 | 43 | colors : dict 44 | Dictionary that allows overriding of default color scheme. Should be of format key-value as 'residue'-'color' where 45 | residue is a residue in the string and color is a valid HTML color (which can be a Hexcode, standard HTML color name). 46 | Note that this also lets you define colors for non-standard amino acids should these be useful. Default is an empty 47 | dictionary. Note also that the standard amino acid colorings are defined at sparrow.data.amino_acids.AA_COLOR 48 | 49 | header : str 50 | If provided, this is a string that provides a FASTA-style header (with a leading caret included). Default None. 51 | 52 | bold_positions : list 53 | List of positions (indexing from 1 onwards) which will be bolded. Useful for highlighting specific regions. Note that this 54 | defines individual residues so (for example) to bold residues 10 to 15 would require bold_positions=[10,11,12,13,14,15]. 55 | Default is an empty list. 56 | 57 | bold_residues : list 58 | List of residue types that can be bolded. Useful for highlighting specific residue groups. Default is an empty list. 59 | 60 | opaque_positions : list 61 | List of positions (indexing from 1 onwards) which will be grey and slightly opaque. Useful for highlighting specific regions. 62 | Note that this defines individual residues so (for example) to make residues 10 to 15 opaque would require 63 | opaque_positions=[10,11,12,13,14,15]. Default is an empty list. 64 | 65 | return_raw_string : bool 66 | If set to true, the function returns the actual raw HTML string, as opposed to an in-notebook rendering. 67 | Default is False 68 | 69 | warnings : bool 70 | If set to true, the function will print warnings if invalid amino acids are found. Default is True. 71 | 72 | 73 | Returns 74 | ---------- 75 | None or str 76 | If return_raw_string is set to true then an HTML-compatible string is returned. 77 | 78 | 79 | Raises 80 | ------- 81 | sparrow.sparrow_exceptions.SparrowException 82 | Raises a sparrow exception if invalid input is provided (within reason). 83 | 84 | """ 85 | 86 | if blocksize > newline: 87 | newline = blocksize 88 | 89 | if blocksize == -1: 90 | blocksize = len(seq) 91 | newline = len(seq) 92 | 93 | 94 | if blocksize < 1: 95 | raise SparrowException('blocksize must be a positive integer (or -1)') 96 | 97 | 98 | colorString = '<p style="font-family:%s; font-size: %ipx">'%(font_family, fontsize) 99 | 100 | if header: 101 | colorString = colorString + ">%s<br>"%(str(header)) 102 | 103 | 104 | count = -1 105 | for residue in seq: 106 | 107 | count = count + 1 108 | 109 | if count > 0: 110 | if count % newline == 0: 111 | colorString = colorString + "<br>" 112 | 113 | elif count % blocksize == 0: 114 | colorString = colorString + " " 115 | 116 | 117 | if residue not in AA_COLOR and residue not in colors: 118 | if warnings: 119 | print('Warning: found invalid amino acid (%s) at position %i'%(residue, count+1)) 120 | colorString = colorString + '<span style="color:%s">%s</span>' % ('black', residue) 121 | else: 122 | 123 | # override with user-supplied palette if present 124 | if residue in colors: 125 | c = colors[residue] 126 | 127 | # else fall back on the standard palette 128 | else: 129 | c = AA_COLOR[residue] 130 | 131 | # check if residue should be light grey and opaque 132 | # This overrides other coloring 133 | if count+1 in opaque_positions: 134 | c = '#a9a9a9' 135 | 136 | # if the residue type OR residue position is to be bolded... 137 | if residue in bold_residues or (count+1) in bold_positions: 138 | colorString = colorString + '<span style="color:%s"><b>%s</b></span>' % (c, residue) 139 | else: 140 | colorString = colorString + '<span style="color:%s">%s</span>' % (c, residue) 141 | 142 | 143 | 144 | colorString = colorString +"</p>" 145 | 146 | if return_raw_string: 147 | return colorString 148 | else: 149 | display(HTML(colorString)) 150 | #HTML(colorString) 151 | --------------------------------------------------------------------------------