├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── publish-pages-doc.yml │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── HISTORY.rst ├── LICENSE ├── README.md ├── autopeptideml ├── __init__.py ├── autopeptideml.py ├── config.py ├── data │ ├── __init__.py │ └── readme_ex.md ├── db │ ├── __init__.py │ └── db.py ├── main.py ├── pipeline │ ├── __init__.py │ ├── pipeline.py │ ├── sequence.py │ └── smiles.py ├── reps │ ├── __init__.py │ ├── engine.py │ ├── fps.py │ ├── lms.py │ ├── seq_based.py │ └── utils │ │ ├── __init__.py │ │ └── peptideclm_tokenizer.py └── train │ ├── __init__.py │ ├── architectures.py │ ├── deep_learning │ ├── __init__.py │ ├── dataset.py │ ├── loss.py │ └── model.py │ ├── metrics.py │ └── trainer.py ├── docs ├── autopeptideml.md ├── imgs │ ├── APML_dark.png │ └── APML_light.png ├── index.md ├── repenginebase.md ├── repenginefp.md ├── repenginelm.md └── repengineseqbased.md ├── examples ├── AB_positives.csv ├── API_docs.ipynb └── AutoPeptideML_Collab.ipynb ├── mkdocs.yml ├── renovate.json ├── setup.py └── tests ├── sample ├── example.csv └── example2.csv ├── test_apml.py ├── test_db.py ├── test_pipeline.py └── test_reps.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/publish-pages-doc.yml: -------------------------------------------------------------------------------- 1 | name: Publish docs via GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: actions/setup-python@v5 14 | with: 15 | python-version: 3.x 16 | - run: pip install mkdocs mkdocs-material mkdocstrings[python] mkdocs-markdownextradata-plugin mdx_include mkdocs-include-markdown-plugin 17 | - run: mkdocs gh-deploy --force 18 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9", "3.11", "3.13"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | sudo apt-get install libprotobuf-dev protobuf-compiler 30 | python -m pip install --upgrade pip 31 | python -m pip install flake8 pytest sentencepiece rdkit 32 | python -m pip install . 33 | python -m pip install biopython 34 | python -m pip install git+https://github.com/novonordisk-research/pepfunn.git --no-deps 35 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with pytest 43 | run: | 44 | pytest 45 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | # Allows to run this workflow manually from the Actions tab 16 | workflow_dispatch: 17 | 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | deploy: 23 | 24 | runs-on: ubuntu-latest 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: '3.x' 32 | - name: Install dependencies 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install build 36 | - name: Build package 37 | run: python -m build 38 | - name: Publish package 39 | uses: pypa/gh-action-pypi-publish@e9ccbe5a211ba3e8363f472cae362b56b104e796 40 | with: 41 | user: __token__ 42 | password: ${{ secrets.PYPI_API_TOKEN }} 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | .DS_Store 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | #.idea/ 162 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | raul.fernandezdiaz@ucdconnect.ie. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.1.0 (2023-04-27) 6 | ------------------ 7 | 8 | * First release on PyPI. 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 International Business Machines 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | ![logo](docs/imgs/APML_light.png)
3 | 
4 | # AutoPeptideML
5 | 
6 | AutoML system for building trustworthy peptide bioactivity predictors
7 | 
8 | [Tutorials](https://ibm.github.io/AutoPeptideML) · [GitHub](https://github.com/IBM/AutoPeptideML) · [Open In Colab](examples/AutoPeptideML_Collab.ipynb)
28 | 
29 | - **Documentation:** https://ibm.github.io/AutoPeptideML
30 | - **Source Code:** https://github.com/IBM/AutoPeptideML
31 | - **Webserver:** http://peptide.ucd.ie/AutoPeptideML
32 | - **Google Colaboratory Notebook:** AutoPeptideML_Collab.ipynb
33 | - **Blog post:** Portal - AutoPeptideML v. 1.0 Tutorial
34 | - **Papers:**
35 |   - [AutoPeptideML (v. 1.0)](https://doi.org/10.1093/bioinformatics/btae555)
36 |   - [ML Generalization from canonical to non-canonical peptides](https://doi.org/10.26434/chemrxiv-2025-ggp8n)
37 | 
38 | AutoPeptideML allows researchers without prior knowledge of machine learning to build models that are:
39 | 
40 | - **Trustworthy:** Robust evaluation following the [DOME](https://www.nature.com/articles/s41592-021-01205-4) community guidelines for reporting ML evaluations in the life sciences.
41 | - **Interpretable:** Output contains a PDF summary of the model evaluation explaining how to interpret the results and how reliable the model is.
42 | - **Reproducible:** Output contains all necessary information for other researchers to reproduce the training and verify the results.
43 | - **State-of-the-art:** Models generated with this system are competitive with state-of-the-art handcrafted approaches.
44 | 
45 | To use version 1.0, which may be necessary for backward compatibility with previously built models, please refer to the branch [AutoPeptideML v.1.0.6](https://github.com/IBM/AutoPeptideML/tree/apml-1.0.6).
46 | 
47 | ## Contents
48 | 
49 | 
Table of Contents
50 | 
51 | - [Model builder](#helper)
52 | - [Prediction](#prediction)
53 | - [Benchmark Data](#benchmark)
54 | - [Installation Guide](#installation)
55 | - [Documentation](#documentation)
56 | - [License](#license)
57 | - [Acknowledgements](#acknowledgements)
58 | 
59 | 
60 | 
61 | ## Model builder
62 | 
63 | To build a new model, AutoPeptideML (v.2.0) introduces a new utility that automatically prepares an experiment configuration file, to i) improve the reproducibility of the pipeline and ii) keep a user-friendly interface despite the greatly increased flexibility.
64 | 
65 | ```bash
66 | autopeptideml prepare-config
67 | ```
68 | This launches an interactive CLI that walks you through:
69 | 
70 | - Choosing a modeling task (classification or regression)
71 | - Selecting input modality (macromolecules or sequences)
72 | - Loading and parsing datasets (csv, tsv, or fasta)
73 | - Defining evaluation strategy
74 | - Picking models and representations
75 | - Setting hyperparameter search strategy and training parameters
76 | 
77 | 
78 | You’ll be prompted to answer various questions like:
79 | 
80 | ```
81 | - What is the modelling problem you're facing? (Classification or Regression)
82 | 
83 | - How do you want to define your peptides? (Macromolecules or Sequences)
84 | 
85 | - What models would you like to consider? (knn, adaboost, rf, etc.)
86 | ```
87 | 
88 | And so on. The final config is written to:
89 | 
90 | ```
91 | <outputdir>/config.yml
92 | ```
93 | 
94 | This config file allows for easy reproducibility of the results, so that anyone can repeat the training process. You can check the configuration file and make any changes you deem necessary. Finally, you can build the model by simply running:
95 | 
96 | ```
97 | autopeptideml build-model --config-path <outputdir>/config.yml
98 | ```
99 | 
100 | ## Prediction
101 | 
102 | To use a model that has already been built, you can run:
103 | 
104 | ```bash
105 | autopeptideml predict <features_path> --output-path <output_path>
106 | ```
107 | 
108 | Where `<features_path>` is the path to a `CSV` file with a column `features_field` that contains the peptide sequences/SMILES. The output file `<output_path>` will contain the original data with two additional columns: `score` (the model predictions) and `std` (the standard deviation between the predictions of the models in the ensemble), which can be used as a measure of the uncertainty of the prediction. A minimal pandas sketch for consuming this output is shown after the benchmark list below.
109 | 
110 | ## Benchmark data
111 | 
112 | Data used to benchmark our approach has been selected from the benchmarks collected by [Du et al, 2023](https://academic.oup.com/bib/article-abstract/24/3/bbad135/7107929). A new set of benchmarks was constructed from the original set following the new data acquisition and dataset partitioning methods within AutoPeptideML. To download the datasets:
113 | 
114 | - **Original UniDL4BioPep Benchmarks:** Please check the project [Github Repository](https://github.com/dzjxzyd/UniDL4BioPep/tree/main).
115 | - **⚠️ New AutoPeptideML Benchmarks (Amended version):** Can be downloaded from this [link](https://drive.google.com/u/0/uc?id=1UmDu773CdkBFqkitK550uO6zoxhU1bUB&export=download). Please note that these are not exactly the same benchmarks as used in the paper (see [Issue #24](https://github.com/IBM/AutoPeptideML/issues/24) for more details).
116 | - **PeptideGeneralizationBenchmarks:** Benchmarks evaluating how peptide representation methods generalize from canonical peptides (composed of the 20 standard amino acids) to non-canonical ones (with non-standard amino acids or other chemical modifications). Check out the [paper pre-print](https://chemrxiv.org/engage/chemrxiv/article-details/67d2f3ae81d2151a023d64f8). They have their own dedicated repository: [PeptideGeneralizationBenchmarks Github repository](https://github.com/IBM/PeptideGeneralizationBenchmarks).
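
As an illustration of the `predict` output described in the Prediction section, here is a minimal pandas sketch for consuming it. The file name `predictions.csv` and the cut-off values are arbitrary placeholders for this example; only the `score` and `std` column names come from the description above:

```python
import pandas as pd

# Load the CSV produced by `autopeptideml predict` (hypothetical path).
preds = pd.read_csv("predictions.csv")

# `score` holds the ensemble predictions; `std` measures the disagreement
# between ensemble members and can serve as an uncertainty estimate.
confident_hits = preds[(preds["score"] > 0.5) & (preds["std"] < 0.1)]
print(confident_hits.sort_values("score", ascending=False).head())
```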
117 | 
118 | ## Installation
119 | 
120 | Installing in a conda environment is recommended. For creating the environment, please run:
121 | 
122 | ```bash
123 | conda create -n autopeptideml python
124 | conda activate autopeptideml
125 | ```
126 | 
127 | ### 1. Python Package
128 | 
129 | #### 1.1. From PyPI
130 | 
131 | 
132 | ```bash
133 | pip install autopeptideml
134 | ```
135 | 
136 | #### 1.2. Directly from source
137 | 
138 | ```bash
139 | pip install git+https://github.com/IBM/AutoPeptideML
140 | ```
141 | 
142 | ### 2. Third-party dependencies
143 | 
144 | To use MMseqs2 [https://github.com/steineggerlab/mmseqs2](https://github.com/steineggerlab/mmseqs2):
145 | 
146 | ```bash
147 | # static build with AVX2 (fastest) (check using: cat /proc/cpuinfo | grep avx2)
148 | wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz; tar xvfz mmseqs-linux-avx2.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
149 | 
150 | # static build with SSE4.1 (check using: cat /proc/cpuinfo | grep sse4)
151 | wget https://mmseqs.com/latest/mmseqs-linux-sse41.tar.gz; tar xvfz mmseqs-linux-sse41.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
152 | 
153 | # static build with SSE2 (slowest, for very old systems) (check using: cat /proc/cpuinfo | grep sse2)
154 | wget https://mmseqs.com/latest/mmseqs-linux-sse2.tar.gz; tar xvfz mmseqs-linux-sse2.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
155 | 
156 | # macOS
157 | brew install mmseqs2
158 | ```
159 | 
160 | To use Needleman-Wunsch, either:
161 | 
162 | ```bash
163 | conda install -c bioconda emboss
164 | ```
165 | or
166 | 
167 | ```bash
168 | sudo apt install emboss
169 | ```
170 | 
171 | To use ECFP fingerprints:
172 | 
173 | ```bash
174 | pip install rdkit
175 | ```
176 | 
177 | To use MAPc fingerprints:
178 | 
179 | ```bash
180 | pip install mapchiral
181 | ```
182 | 
183 | To use PepFuNN fingerprints:
184 | 
185 | ```bash
186 | pip install git+https://github.com/novonordisk-research/pepfunn
187 | ```
188 | 
189 | To use PeptideCLM:
190 | 
191 | ```bash
192 | pip install smilesPE
193 | ```
194 | 
195 | ## Documentation
196 | 
197 | ### Configuration file
198 | 
199 | #### Top-level structure
200 | 
201 | ```yaml
202 | pipeline: {...}
203 | databases: {...}
204 | test: {...}
205 | val: {...}
206 | train: {...}
207 | representation: {...}
208 | outputdir: "path/to/experiment_results"
209 | ```
210 | 
211 | #### `pipeline`
212 | Defines the preprocessing pipeline depending on the modality (`mol` or `seqs`). It includes data cleaning and transformations, such as:
213 | 
214 | - `filter-smiles`
215 | - `canonical-cleaner`
216 | - `sequence-to-smiles`
217 | - `smiles-to-sequences`
218 | 
219 | The name of a pipeline object has to include the word `pipe`. Pipelines can themselves be elements within a pipeline. Here is an example: `aggregate` will combine the output from the different elements. In this case, the two elements process SMILES and sequences independently and then combine them into a single data stream.
220 | 
221 | 
222 | ```yaml
223 | pipeline:
224 |   name: "macromolecules_pipe"
225 |   aggregate: true
226 |   verbose: false
227 |   elements:
228 |     - pipe-smiles-input: {...}
229 |     - pipe-seq-input: {...}
230 | 
231 | ```
232 | 
233 | ### `databases`
234 | 
235 | Defines dataset paths and how to interpret them.
236 | 
237 | **Required:**
238 | - `path`: Path to main dataset.
239 | - `feat_fields`: Column name with SMILES or sequences.
240 | - `label_field`: Column with classification/regression labels.
241 | - `verbose`: Logging flag.
242 | 
243 | **Optional:**
244 | - `neg_database`: If using negative sampling.
245 |   - `path`: Path to negative dataset.
246 |   - `feat_fields`: Feature column.
247 |   - `columns_to_exclude`: Bioactivity columns to ignore.
248 | 
249 | ```yaml
250 | databases:
251 |   dataset:
252 |     path: "data/main.csv"
253 |     feat_fields: "sequence"
254 |     label_field: "activity"
255 |     verbose: false
256 |   neg_database:
257 |     path: "data/negatives.csv"
258 |     feat_fields: "sequence"
259 |     columns_to_exclude: ["to_exclude"]
260 |     verbose: false
261 | ```
262 | 
263 | ### `test`
264 | 
265 | Defines evaluation and similarity filtering settings.
266 | 
267 | - `min_threshold`: Identity threshold for filtering.
268 | - `sim_arguments`: Similarity computation details.
269 | 
270 | For sequences:
271 | 
272 | - `alignment_algorithm`: `mmseqs`, `mmseqs+prefilter`, `needle`
273 | - `denominator`: How identity is normalized: `longest`, `shortest`, `n_aligned`
274 | - `prefilter`: Whether to use a prefilter.
275 | - `field_name`: Name of the column with the peptide sequences/SMILES.
276 | - `verbose`: Logging flag.
277 | 
278 | For molecules:
279 | 
280 | - `sim_function`: e.g., `tanimoto`, `jaccard`
281 | - `radius`: Radius defining the substructures used to compute the fingerprint.
282 | - `bits`: Size of the fingerprint; more bits give more resolution but demand more computational resources.
283 | 
284 | - `partitions`: `min`, `all`, or a specific `<threshold>`
285 | - `algorithm`: `ccpart`, `ccpart_random`, `graph_part`
286 | - `threshold_step`: Step size for threshold evaluation.
287 | - `filter`: Minimum proportion of data in the test set that is acceptable (e.g., with a target test set proportion of 20% and `filter: 0.185`, partitions that place less than 18.5% of the data in the test set are not considered).
288 | - `verbose`: Logging level.
289 | 
290 | Example:
291 | 
292 | ```yaml
293 | test:
294 |   min_threshold: 0.1
295 |   sim_arguments:
296 |     data_type: "sequence"
297 |     alignment_algorithm: "mmseqs"
298 |     denominator: "shortest"
299 |     prefilter: true
300 |     min_threshold: 0.1
301 |     field_name: "sequence"
302 |     verbose: 2
303 |   partitions: "all"
304 |   algorithm: "ccpart"
305 |   threshold_step: 0.1
306 |   filter: 0.185
307 |   verbose: 2
308 | ```
309 | 
310 | ### `val`
311 | 
312 | Cross-validation strategy:
313 | 
314 | - `type`: `kfold` or `single`
315 | - `k`: Number of folds.
316 | - `random_state`: Seed for reproducibility.
317 | 
318 | ### `train`
319 | Training configuration.
320 | 
321 | Required:
322 | 
323 | - `task`: `class` or `reg`
324 | - `optim_strategy`: Optimization strategy.
325 |   - `trainer`: `grid` or `optuna`
326 |   - `n_steps`: Number of trials (Optuna only).
327 |   - `direction`: `maximize` or `minimize`
328 |   - `metric`: `mcc` or `mse`
329 |   - `partition`: Partitioning type.
330 |   - `n_jobs`: Parallel jobs.
331 |   - `patience`: Early stopping patience.
332 | - `hspace`: Search space.
333 |   - `representations`: List of representations to try.
334 |   - `models`:
335 |     - `type`: `select` or `ensemble`
336 |     - `elements`: model names and their hyperparameter space.
337 | 
338 | Example:
339 | 
340 | ```yaml
341 | train:
342 |   task: "class"
343 |   optim_strategy:
344 |     trainer: "optuna"
345 |     n_steps: 100
346 |     direction: "maximize"
347 |     task: "class"
348 |     metric: "mcc"
349 |     partition: "random"
350 |     n_jobs: 8
351 |     patience: 20
352 |   hspace:
353 |     representations: ["chemberta-2", "ecfp-4"]
354 |     models:
355 |       type: "select"
356 |       elements:
357 |         knn:
358 |           n_neighbors:
359 |             type: int
360 |             min: 1
361 |             max: 20
362 |             log: false
363 |           weights:
364 |             type: categorical
365 |             values: ["uniform", "distance"]
366 | ```
367 | 
368 | ### `representation`
369 | Specifies molecular or sequence representations.
370 | 
371 | Each element includes:
372 | 
373 | - `engine`: `lm` (language model) or `fp` (fingerprint)
374 | - `model`: Model name (e.g., chemberta-2, esm2-150m)
375 | - `device`: `cpu`, `gpu`, or `mps`
376 | - `batch_size`: Size per batch
377 | - `average_pooling`: Whether to average token representations (only for `lm`)
378 | 
379 | ```yaml
380 | representation:
381 |   verbose: true
382 |   elements:
383 |     - chemberta-2:
384 |         engine: "lm"
385 |         model: "chemberta-2"
386 |         device: "gpu"
387 |         batch_size: 32
388 |         average_pooling: true
389 |     - ecfp-4:
390 |         engine: "fp"
391 |         fp: "ecfp"
392 |         radius: 2
393 |         nbits: 2048
394 | ```
395 | 
396 | ### More details about the API
397 | 
398 | Please check the [Code reference documentation](https://ibm.github.io/AutoPeptideML/autopeptideml/)
399 | 
400 | 
401 | 
402 | License
403 | -------
404 | AutoPeptideML is open-source software licensed under the MIT License. Check the details in the [LICENSE](https://github.com/IBM/AutoPeptideML/blob/master/LICENSE) file.
405 | 
406 | Credits
407 | -------
408 | 
409 | Special thanks to [Silvia González López](https://www.linkedin.com/in/silvia-gonz%C3%A1lez-l%C3%B3pez-717558221/) for designing the AutoPeptideML logo and to [Marcos Martínez Galindo](https://www.linkedin.com/in/marcosmartinezgalindo) for his aid in setting up the AutoPeptideML webserver.
--------------------------------------------------------------------------------
/autopeptideml/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for AutoPeptideML."""
2 | 
3 | __author__ = """Raul Fernandez-Diaz"""
4 | __email__ = 'raul.fernandezdiaz@ucdconnect.ie'
5 | # __all__ = ['AutoPeptideML', '__version__', '']
6 | 
7 | from .autopeptideml import AutoPeptideML, __version__
--------------------------------------------------------------------------------
/autopeptideml/config.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os
3 | import os.path as osp
4 | 
5 | from multiprocessing import cpu_count
6 | 
7 | import pandas as pd
8 | 
9 | from ItsPrompt.prompt import Prompt
10 | 
11 | 
12 | HP_SPACES = {
13 | "knn": {
14 | "n_neighbors": {
15 | "type": "int",
16 | "min": 1,
17 | "max": 20,
18 | "log": False
19 | },
20 | "weights": {
21 | "type": "categorical",
22 | "values": ["uniform", "distance"]
23 | }
24 | },
25 | "adaboost": {
26 | "n_estimators": {
27 | "type": "int",
28 | "min": 10,
29 | "max": 1000,
30 | "log": False
31 | },
32 | "learning_rate": {
33 | "type": "float",
34 | "min": 1e-7,
35 | "max": 1e-1,
36 | "log": True
37 | }
38 | },
39 | "gradboost": {
40 | "learning_rate": {
41 | "type": "float",
42 | "min": 1e-5,
43 | "max": 1e-1,
44 | "log": True
45 | },
46 | "n_estimators": {
47 | "type": "int",
48 | "min": 10,
49 | "max": 1000,
50 | "log": False
51 | },
52 | "min_samples_split": {
53 | "type": "int",
54 | "min": 2,
55 | "max": 100,
56 | "log": False
57 | }
58 | },
59 | "rf": {
60 | "n_estimators": {
61 | "type": "int",
62 | "min": 10,
63 | "max": 1000,
64 | "log": False
65 | },
66 | "ccp_alpha": {
67 | "type": "float",
68 | "min": 1e-10,
69 | "max": 1e-3,
70 | "log": True
71 | },
72 | "min_samples_split": {
73 | "type": "int",
74 | "min": 2,
75 | "max": 100,
76 | "log": False
77 | }
78 | },
79 | "lightgbm": {
80 | "n_estimators": {
81 | "type": "int",
82 | "min": 10,
83 | "max": 1000,
84 | "log": False
85 | },
86 | "num_leaves": {
87 | "type": "int",
88 | "min": 8,
89 | "max": 1024,
90 | "log": False
91 | },
92 | 
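# NOTE: every hyperparameter entry below follows the same schema as those above:
# "type" is one of "int"/"float"/"categorical"/"fixed", combined with "min"/"max"/"log"
# bounds, a "values" list, or a fixed "value" (plus an optional "condition", as in svm's "degree").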
"max_depth": { 93 | "type": "int", 94 | "min": 3, 95 | "max": 10, 96 | "log": False 97 | }, 98 | "subsample": { 99 | "type": "float", 100 | "min": 0.5, 101 | "max": 1.0, 102 | "log": False 103 | }, 104 | "colsample_bytree": { 105 | "type": "float", 106 | "min": 0, 107 | "max": 1.0, 108 | "log": False 109 | }, 110 | "min_split_gain": { 111 | "type": "float", 112 | "min": 1e-10, 113 | "max": 1e-3, 114 | "log": True 115 | }, 116 | "reg_alpha": { 117 | "type": "float", 118 | "min": 1e-10, 119 | "max": 1e-3, 120 | "log": True 121 | }, 122 | "learning_rate": { 123 | "type": "float", 124 | "min": 1e-7, 125 | "max": 1e-1, 126 | "log": True 127 | }, 128 | "verbose": { 129 | "type": "fixed", 130 | "value": -1 131 | } 132 | }, 133 | "xgboost": { 134 | "n_estimators": { 135 | "type": "int", 136 | "min": 10, 137 | "max": 1000, 138 | "log": False 139 | }, 140 | "min_split_alpha": { 141 | "type": "float", 142 | "min": 1e-10, 143 | "max": 1e-3, 144 | "log": True 145 | }, 146 | "reg_alpha": { 147 | "type": "float", 148 | "min": 1e-10, 149 | "max": 1e-3, 150 | "log": True 151 | }, 152 | "learning_rate": { 153 | "type": "float", 154 | "min": 1e-7, 155 | "max": 1e-1, 156 | "log": True 157 | }, 158 | "verbose": { 159 | "type": "fixed", 160 | "value": -1 161 | } 162 | }, 163 | "svm": { 164 | "C": { 165 | "type": "float", 166 | "min": 1e-7, 167 | "max": 0.1, 168 | "log": True 169 | }, 170 | "probability": { 171 | "type": "fixed", 172 | "value": True 173 | }, 174 | "kernel": { 175 | "values": [ 176 | "linear", 177 | "poly", 178 | "rbf", 179 | "sigmoid" 180 | ], 181 | "type": "categorical" 182 | }, 183 | "max_iter": { 184 | "type": "fixed", 185 | "value": int(1e4) 186 | }, 187 | "degree": { 188 | "condition": "kernel-poly", 189 | "log": False, 190 | "max": 7, 191 | "min": 2, 192 | "type": "int" 193 | } 194 | } 195 | } 196 | MACROMOLECULES_PIPELINE = { 197 | "name": "macromolecules_pipe", 198 | "aggregate": True, 199 | "verbose": False, 200 | "elements": [ 201 | { 202 | "pipe-smiles-input": { 203 | "name": "smiles-input", 204 | "aggregate": False, 205 | "verbose": False, 206 | "elements": [ 207 | {"filter-smiles": {}} 208 | ] 209 | } 210 | }, 211 | { 212 | "pipe-seq-input": { 213 | "name": "seq-input", 214 | "aggregate": False, 215 | "verbose": False, 216 | "elements": [ 217 | {"filter-smiles": {'keep_smiles': False}}, 218 | {"canonical-cleaner": {"substitution": "G"}}, 219 | {"sequence-to-smiles": {}} 220 | ] 221 | } 222 | } 223 | ] 224 | } 225 | SEQUENCE_PIPELINE = { 226 | "name": 'sequences-pipe', 227 | "aggregate": True, 228 | "verbose": False, 229 | "elements": [ 230 | { 231 | "clean-seqs-pipe": { 232 | "name": "clean-seqs-pipe", 233 | "aggregate": False, 234 | "verbose": False, 235 | "elements": [ 236 | {"filter-smiles": {"keep_smiles": False}}, 237 | {"canonical-cleaner": {"substitution": "X"}}, 238 | ] 239 | } 240 | }, 241 | { 242 | "smiles-to-seqs-pipe": { 243 | "name": "smiles-to-seqs-pipe", 244 | "aggregate": False, 245 | "verbose": False, 246 | "elements": [ 247 | {"filter-smiles": {"keep_smiles": True}}, 248 | {"smiles-to-sequences": {}}, 249 | {"canonical-cleaner": {"substitution": "X"}} 250 | ] 251 | } 252 | } 253 | ] 254 | } 255 | MOL_REPS = { 256 | "chemberta-2": { 257 | "engine": "lm", 258 | "device": "cpu", 259 | "batch_size": 32, 260 | "average_pooling": True, 261 | 'model': 'chemberta-2' 262 | }, 263 | "molformer-xl": { 264 | 'engine': "lm", 265 | "device": "cpu", 266 | "batch_size": 32, 267 | "average_pooling": True, 268 | 'model': 'molformer-xl' 269 | }, 270 | "peptideclm": { 271 | 'engine': 
'lm',
272 | 'device': 'cpu',
273 | 'batch_size': 32,
274 | 'average_pooling': True,
275 | 'model': "peptideclm"
276 | },
277 | "ecfp-16": {
278 | "engine": "fp",
279 | "nbits": 2048,
280 | "radius": 8,
281 | 'fp': 'ecfp'
282 | },
283 | 
284 | }
285 | MOL_REPS.update(
286 | {f'ecfp-{int(radius*2)}': {
287 | 'engine': "fp",
288 | 'nbits': 2048,
289 | 'radius': radius,
290 | "fp": "ecfp"
291 | } for radius in range(1, 10, 1)}
292 | )
293 | MOL_REPS.update(
294 | {f'fcfp-{int(radius*2)}': {
295 | 'engine': "fp",
296 | 'nbits': 2048,
297 | 'radius': radius,
298 | "fp": "fcfp"
299 | } for radius in range(1, 10, 1)}
300 | )
301 | MOL_REPS.update(
302 | {f'ecfp-counts-{int(radius*2)}': {
303 | 'engine': "fp",
304 | 'nbits': 2048,
305 | 'radius': radius,
306 | "fp": "ecfp-count"
307 | } for radius in range(1, 10, 1)}
308 | )
309 | SEQ_REPS = {
310 | "esm2-8m": {
311 | 'engine': 'lm',
312 | 'device': "cpu",
313 | 'batch_size': 32,
314 | "average_pooling": True,
315 | 'model': 'esm2-8m'
316 | },
317 | "esm2-150m": {
318 | 'engine': 'lm',
319 | 'device': 'cpu',
320 | 'batch_size': 32,
321 | 'average_pooling': True,
322 | 'model': 'esm2-150m'
323 | },
324 | "esm2-650m": {
325 | 'engine': 'lm',
326 | 'device': 'cpu',
327 | 'batch_size': 16,
328 | 'average_pooling': True,
329 | 'model': "esm2-650m"
330 | },
331 | 'prot-t5-xl': {
332 | 'engine': 'lm',
333 | 'device': 'cpu',
334 | 'batch_size': 16,
335 | 'average_pooling': True,
336 | 'model': 'prot-t5-xl'
337 | },
338 | 'prost-t5': {
339 | 'engine': 'lm',
340 | 'device': 'cpu',
341 | 'batch_size': 8,
342 | 'average_pooling': True,
343 | 'model': 'prost-t5'
344 | }
345 | }
346 | 
347 | 
348 | def _is_int(text: str) -> bool:
349 | try:
350 | int(text)
351 | return True
352 | except ValueError:
353 | return False
354 | 
355 | 
356 | def define_dataset(dataset: str, task: str, modality: str, neg: bool = False):
357 | if dataset.endswith('.csv') or dataset.endswith('.tsv'):
358 | df = pd.read_csv(dataset, sep='\t' if dataset.endswith('.tsv') else ',')  # tab separator for TSV files; the default comma would collapse them into a single column
359 | print("These are the contents of the file you selected\n")
360 | print(df.head())
361 | print()
362 | columns = df.columns.tolist()
363 | feat_field = Prompt().select(
364 | "Which column contains the sequences/SMILES?",
365 | options=columns
366 | )
367 | columns.remove(feat_field)
368 | if neg:
369 | columns_to_exclude = Prompt().checkbox(
370 | "What columns describe a bioactivity you would like to exclude from the negative class?",
371 | options=columns,
372 | min_selections=0
373 | )
374 | return feat_field, columns_to_exclude
375 | 
376 | if task == 'class':
377 | label_field = Prompt().select(
378 | "What is the column containing the labels?",
379 | options=columns + ['Assume all entries are positive']
380 | )
381 | else:
382 | label_field = Prompt().select(
383 | "What is the column containing the labels?",
384 | options=columns
385 | )
386 | elif dataset.endswith('.fasta'):
387 | if modality != 'seqs':
388 | raise ValueError("FASTA is not an acceptable format for Macromolecules. Options: `csv`, `tsv`, `smi`.")
389 | feat_field, label_field = 'sequences', None
390 | return feat_field, label_field
391 | 
392 | 
393 | def config_helper() -> dict:
394 | print()
395 | print("Part 1 - Define the data and preprocessing steps")
396 | config = {}
397 | task = Prompt().select(
398 | "What is the modelling problem you're facing?",
399 | options=['Classification (returning categorical value)',
400 | "Regression (returning continuous value)"]
401 | )
402 | modality = Prompt().select(
403 | "How do you want to define your peptides?",
404 | options=['Macromolecules - allows for canonical, non-canonical, and peptidomimetics',
405 | 'Sequences - only canonical peptides, slightly better performance']
406 | )
407 | if 'macromolecule' in modality.lower():
408 | modality = 'mol'
409 | config['pipeline'] = MACROMOLECULES_PIPELINE
410 | else:
411 | modality = 'seqs'
412 | config['pipeline'] = SEQUENCE_PIPELINE
413 | if 'class' in task.lower():
414 | task = 'class'
415 | else:
416 | task = 'reg'
417 | 
418 | dataset = Prompt().input(
419 | "What is the path to the dataset with your data?",
420 | validate=lambda x: osp.exists(x)
421 | )
422 | feat_field, label_field = define_dataset(dataset, task, modality)
423 | 
424 | if task == 'class':
425 | print("Part 1.5 - Negative sampling")
426 | neg_db = Prompt().select(
427 | "What negative sampling strategy do you prefer?",
428 | options=[
429 | "DB of bioactive canonical peptides",
430 | "DB of bioactive non-canonical peptides",
431 | "DB of both bioactive and non-bioactive peptides",
432 | "Personalised DB",
433 | "No negative sampling"
434 | ]
435 | )
436 | if neg_db == 'Personalised DB':
437 | neg_path = Prompt().input(
438 | "What is the path to the dataset with your data?",
439 | validate=lambda x: osp.exists(x)
440 | )
441 | neg_feat_field, columns_to_exclude = define_dataset(
442 | neg_path, task, modality, neg=True
443 | )
444 | neg_db = {
445 | 'path': neg_path,
446 | 'feat_fields': neg_feat_field,
447 | 'columns_to_exclude': columns_to_exclude,
448 | "verbose": False
449 | }
450 | config['databases'] = {
451 | 'dataset': {
452 | 'path': dataset,
453 | 'feat_fields': feat_field,
454 | 'label_field': label_field,
455 | 'verbose': False
456 | }
457 | }
458 | if task == 'class' and neg_db != 'No negative sampling':
459 | config['databases']['neg_database'] = neg_db
460 | 
461 | print("Part 2 - Define evaluation strategy")
462 | config['test'] = {'min_threshold': 0.1}
463 | 
464 | if modality == 'seqs':
465 | sim_functions = ['needle (recommended)', 'mmseqs', 'mmseqs+prefilter (for huge datasets)']
466 | denominators = ['shortest', 'longest', 'n_aligned']
467 | sim_function = Prompt().select(
468 | "What alignment algorithm would you like to use?",
469 | options=sim_functions
470 | )
471 | denominator = Prompt().select(
472 | "What denominator would you like to use to compute the sequence identity?",
473 | options=denominators
474 | )
475 | config['test']['sim_arguments'] = {
476 | 'data_type': 'sequence',
477 | 'alignment_algorithm': sim_function if '+' not in sim_function else sim_function.split('+')[0],
478 | 'denominator': denominator,
479 | 'prefilter': 'prefilter' in sim_function,
480 | 'min_threshold': 0.1,
481 | 'field_name': feat_field,
482 | 'verbose': 2
483 | }
484 | else:
485 | fps = ['mapc', 'ecfp', 'fcfp']
486 | bits = [str(int(2**v)) for v in range(8, 12, 1)]
487 | radii = [str(int(i)) for i in range(2, 12)]
488 | fp = Prompt().select(
489 | "What fingerprint would you like to use?",
490 | options=fps
491 | )
492 | bit = Prompt().select(
493 | "How many bits would you like the fingerprints to have? (Greater is better, but more expensive)",
494 | options=bits
495 | )
496 | radius = Prompt().select(
497 | "What radius would you like to use?",
498 | options=radii
499 | )
500 | config['test']['sim_arguments'] = {
501 | 'data_type': 'molecule',
502 | 'min_threshold': 0.1,
503 | 'sim_function': 'tanimoto' if fp == 'ecfp' else 'jaccard',
504 | 'field_name': feat_field,
505 | 'radius': int(radius),
506 | 'bits': int(bit),
507 | 'verbose': 2
508 | }
509 | partition = Prompt().select(
510 | "What thresholds would you like to evaluate at?",
511 | options=['min (AutoPeptideML v.1.0)', 'all']
512 | )
513 | part_alg = Prompt().select(
514 | "What partitioning algorithm would you like to use?",
515 | options=['ccpart', 'ccpart_random', 'graph_part'],
516 | default='ccpart'
517 | )
518 | config['test']['partitions'] = partition
519 | config['test']['algorithm'] = part_alg
520 | config['test']['threshold_step'] = 0.1
521 | config['test']['verbose'] = 2
522 | config['test']['filter'] = 0.185
523 | config['val'] = {
524 | 'type': 'kfold',
525 | "k": 10,
526 | "random_state": 1
527 | }
528 | 
529 | print("Part 3 - Define model training")
530 | config['train'] = {}
531 | 
532 | learning_alg = Prompt().checkbox(
533 | "What models would you like to consider?",
534 | options=list(HP_SPACES.keys()),
535 | min_selections=1
536 | )
537 | model_selection = Prompt().select(
538 | "What model selection would you like to use?",
539 | options=['select', "ensemble"]
540 | )
541 | hp_search = Prompt().select(
542 | "What type of search for optimal hyperparameters would you like to use?",
543 | options=['grid', 'bayesian'],
544 | )
545 | reps = Prompt().checkbox("What representations would you like to use?",
546 | options=list(MOL_REPS.keys()) if modality == 'mol'
547 | else list(SEQ_REPS.keys()), min_selections=1)
548 | acc = Prompt().select("Which accelerator would you like to use to compute the representations?",
549 | options=['cpu', "cuda", "mps"])
550 | hp_search = hp_search if hp_search != 'bayesian' else 'optuna'
551 | if hp_search == 'optuna':
552 | n_steps = Prompt().input(
553 | "How many steps for optimisation would you like to conduct?",
554 | default=100,
555 | validate=_is_int
556 | )
557 | patience = Prompt().input(
558 | "What patience would you like EarlyStopping to have?",
559 | validate=_is_int
560 | )
561 | n_jobs = Prompt().input(
562 | "How many parallel jobs do you want to run?",
563 | default=cpu_count(),
564 | validate=_is_int
565 | )
566 | config['train']['task'] = task
567 | config['train']['optim_strategy'] = {
568 | 'trainer': hp_search,
569 | 'n_steps': int(n_steps) if hp_search == 'optuna' else None,
570 | 'direction': "maximize",
571 | 'task': task,
572 | 'metric': 'pcc' if task == 'reg' else 'mcc',
573 | 'partition': 'random',
574 | 'n_jobs': int(n_jobs),
575 | 'patience': int(patience)
576 | }
577 | config['train']['hspace'] = {'representations': reps}
578 | config['train']['hspace']['models'] = {
579 | 'type': model_selection,
580 | 'elements': {model: HP_SPACES[model] for model in learning_alg},
581 | }
582 | config['representation'] = {
583 | 'verbose': True,
584 | 'elements': [
585 | {
586 | r: MOL_REPS[r] if modality == 'mol' else SEQ_REPS[r]
587 | } for r in reps
588 | ]
589 | }
590 | for idx, element in enumerate(config['representation']['elements']):
591 | name = list(element.keys())[0]
592 | if config['representation']['elements'][idx][name]['engine'] != 'lm':
593 | continue
594 | 
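# Only language-model ('lm') engines take an accelerator; fingerprint
# representations always run on CPU, hence the `continue` above.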
config['representation']['elements'][idx][name]['device'] = acc
595 | path = Prompt().input(
596 | "Where do you want to save the experiment results?",
597 | validate=lambda x: not osp.isdir(x)
598 | )
599 | config['outputdir'] = path
600 | os.makedirs(path, exist_ok=True)
601 | path = osp.join(path, 'config.yml')
602 | 
603 | yaml.safe_dump(config, open(path, 'w'), indent=2)
604 | return path
605 | 
--------------------------------------------------------------------------------
/autopeptideml/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/AutoPeptideML/adb18578f145d1c3a78d6860ad5f7d35c726159a/autopeptideml/data/__init__.py
--------------------------------------------------------------------------------
/autopeptideml/data/readme_ex.md:
--------------------------------------------------------------------------------
1 | # AutoPeptideML output summary
2 | 
3 | ## 1. Introduction
4 | 
5 | This document is automatically generated by each run of the AutoPeptideML software and is meant to provide an easy guide to the interpretation of the results obtained. General qualitative comments (e.g., "the MCC score obtained is between 0.25-0.5, which indicates a low correlation and model predictions will not be very reliable") are given as a common-sense guideline; the actual criteria for considering a model acceptable may depend on the target application and the current state-of-the-art.
6 | 
7 | ## 2. Confusion matrix and main performance metrics
8 | 
9 | ### 2.1. Confusion matrix
10 | 
11 | The confusion matrix is the simplest way to visualize the behaviour of the model. The rows describe the true labels of the samples, which can be Positive or Negative; the columns describe the labels predicted by the ensemble.
12 | 
13 | 
14 | - **First quadrant (upper-left corner):** describes the True Negative predictions (TN), that is, negative samples that are predicted as negative by the model.
15 | - **Second quadrant (upper-right corner):** describes the False Positive predictions (FP), which are negative samples that are erroneously predicted as positive. If this error is high, it usually indicates an over-sensitive predictor.
16 | - **Third quadrant (lower-left corner):** describes the False Negative predictions (FN), which are positive samples that are erroneously predicted as negative. If this error is high, it usually indicates a highly specific predictor.
17 | - **Fourth quadrant (lower-right corner):** describes the True Positive predictions (TP), which are positive samples predicted as positive.
18 | 
19 | ![Confusion matrix](./figures/confusion_matrix.png)
20 | 
21 | ### 2.2. Model performance metrics
22 | 
23 | The confusion matrix can be analysed in different ways depending on what properties of the predictor we are most interested in. The following list contains the main performance metrics used when describing ML predictors, a formula relating each of them to the confusion matrix above, and an explanation of what they tell us about the model.
24 | 
25 | - **Accuracy:**
26 |   - *What is it?* proportion of correct predictions among all predictions.
27 |   - *How is it calculated?* `(TP+TN)/(TP+TN+FP+FN)`
28 |   - *What does it say about the model?* How often it is right.
29 |   - *When to use?* Only when working with a balanced dataset (same number of positive and negative samples, default AutoPeptideML run with search for bioactive negatives and homology partitioning). If the dataset is not balanced, check `evaluation_data/test_scores.csv` for `balanced_accuracy`, which is a variation that takes into account the imbalance between the labels.
30 |   - *Value:*
31 |   - *Interpretation of value:*
32 |     - Worse than random: `0-0.45`
33 |     - Random model: `0.45-0.55`
34 |     - Bad model: `0.55-0.7`
35 |     - Acceptable model: `0.7-0.8`
36 |     - Good model: `0.8-0.9`
37 |     - Really good model: `0.9-0.97`
38 |     - Too good a model (please make sure training and evaluation sets are independent): `>0.97`
39 | 
40 | - **Sensitivity or recall:**
41 |   - *What is it?* proportion of positive samples predicted as positive among all positive samples.
42 |   - *How is it calculated?* `(TP)/(TP+FN)`
43 |   - *What does it say about the model?* How likely it is to misclassify a positive sample as negative. May be relevant when the consequence of missing positives is important (e.g., a cancer diagnostics tool).
44 |   - *When to use?* Only when working with a balanced dataset (same number of positive and negative samples, default AutoPeptideML run with search for bioactive negatives and homology partitioning). If the dataset is not balanced, check `evaluation_data/test_scores.csv` for `recall_weighted`, which is a variation that takes into account the imbalance between the labels.
45 |   - *Value:*
46 |   - *Interpretation of value:*
47 |     - Worse than random: `0-0.45`
48 |     - Random model: `0.45-0.55`
49 |     - Bad model: `0.55-0.7`
50 |     - Acceptable model: `0.7-0.8`
51 |     - Good model: `0.8-0.9`
52 |     - Really good model (check that precision is, at least, good): `0.9-0.97`
53 |     - Too good a model (please make sure training and evaluation sets are independent, also check that precision is, at least, good): `>0.97`
54 | 
55 | - **Precision:**
56 |   - *What is it?* proportion of positive predictions that were actually true.
57 |   - *How is it calculated?* `(TP)/(TP+FP)`
58 |   - *What does it say about the model?* How likely a positive prediction is to be correct. May be relevant when the aim is to reduce the number of samples to further analyse (e.g., when conducting virtual screening on large databases).
59 |   - *When to use?* Only when working with a balanced dataset (same number of positive and negative samples, default AutoPeptideML run with search for bioactive negatives and homology partitioning). If the dataset is not balanced, check `evaluation_data/test_scores.csv` for `precision_weighted`, which is a variation that takes into account the imbalance between the labels.
60 |   - *Value:*
61 |   - *Interpretation of value:*
62 |     - Worse than random: `0-0.45`
63 |     - Random model: `0.45-0.55`
64 |     - Bad model: `0.55-0.7`
65 |     - Acceptable model: `0.7-0.8`
66 |     - Good model: `0.8-0.9`
67 |     - Really good model (check that sensitivity is, at least, good): `0.9-0.97`
68 |     - Too good a model (please make sure training and evaluation sets are independent, also check that sensitivity is, at least, good): `>0.97`
69 | 
70 | - **F1:**
71 |   - *What is it?* harmonic mean between sensitivity and precision.
72 |   - *How is it calculated?* `2*TP / (2*TP + FP + FN)`
73 |   - *What does it say about the model?* Overall model performance, conceptually similar to accuracy.
74 |   - *When to use?* Only when working with a balanced dataset (same number of positive and negative samples, default AutoPeptideML run with search for bioactive negatives and homology partitioning). If the dataset is not balanced, check `evaluation_data/test_scores.csv` for `f1_weighted`, which is a variation that takes into account the imbalance between the labels.
75 |   - *Value:*
76 |   - *Interpretation of value:*
77 |     - Worse than random: `0-0.45`
78 |     - Random model: `0.45-0.55`
79 |     - Bad model: `0.55-0.7`
80 |     - Acceptable model: `0.7-0.8`
81 |     - Good model: `0.8-0.9`
82 |     - Really good model: `0.9-0.97`
83 |     - Too good a model (please make sure training and evaluation sets are independent): `>0.97`
84 | 
85 | - **Matthews correlation coefficient:**
86 |   - *What is it?* correlation between the predictions of the model and the actual true labels.
87 |   - *How is it calculated?* `(TP*TN - FP*FN) / √[(TP+FP)(TP+FN)(TN+FP)(TN+FN)]`
88 |   - *What does it say about the model?* Overall model performance, conceptually similar to accuracy.
89 |   - *When to use?* Any case, particularly with binary classification.
90 |   - *Value:*
91 |   - *Interpretation of value:*
92 |     - Worse than random: `< -0.2`
93 |     - Random model: `-0.2-0.2`
94 |     - Bad model: `0.2-0.3`
95 |     - Acceptable model: `0.3-0.4`
96 |     - Good model: `0.4-0.7`
97 |     - Really good model: `0.7-0.95`
98 |     - Too good a model (please make sure training and evaluation sets are independent): `>0.95`
99 | 
100 | ## 3. Calibration curve
101 | 
102 | The calibration curve indicates whether the `score` obtained from the predictions of the ensemble can be considered a probability of the sample being positive, i.e., whether a higher `score` represents a greater likelihood of the sample being positive.
103 | 
104 | If the `Classifier 1` curve follows the dotted diagonal curve (`Perfectly calibrated`), then the `score` values can be considered a probability. Otherwise, they cannot. If the curve approximates the diagonal in one region but not in another (e.g., it is well calibrated below 0.5 and not above it), the `score` can only be considered a probability when it falls within the well-calibrated region.
105 | 
106 | ![Calibration curve](./figures/calibration_curve.png)
107 | 
108 | ## 4. Receiver-operating characteristic curve (ROC)
109 | 
110 | The ROC curve represents the sensitivity (see **sensitivity or recall** above) of the predictor against the false positive rate (the proportion of observations that are incorrectly predicted to be positive out of all negative observations: `FP/(TN+FP)`). The closer the curve is to the upper-left corner, the better the model. There is also a metric associated with this curve, the **AUROC** (area under the ROC curve), which is often used in the ML community.
111 | 
112 | ![AUROC](./figures/roc_curve.png)
113 | 
114 | ## 5. Precision-recall curve
115 | 
116 | The precision-recall curve represents the precision against the sensitivity/recall and provides an idea of the trade-offs the model makes between the two measurements. The area under the curve is also a common evaluation metric in the ML community.
117 | 
118 | ![Precision-recall curve](./figures/precision_recall_curve.png)
119 | 
120 | # Credit
121 | 
122 | AutoPeptideML has been developed and is maintained by [Raul Fernandez-Diaz](https://www.linkedin.com/in/raul-fernandez-diaz-939440203/), PhD Student at UCD and IBM Research, under the supervision of Denis C. Shields (UCD Conway Institute and School of Medicine) and Thanh Lam Hoang (IBM Research).
123 | 
124 | If you have found the tool useful, consider citing our paper:
125 | 
--------------------------------------------------------------------------------
/autopeptideml/db/__init__.py:
--------------------------------------------------------------------------------
1 | from .db import Database
--------------------------------------------------------------------------------
/autopeptideml/db/db.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 | 
3 | import pandas as pd
4 | import numpy as np
5 | 
6 | from ..pipeline import Pipeline
7 | from ..pipeline.smiles import is_smiles
8 | 
9 | from tqdm import tqdm
10 | 
11 | 
12 | class Database:
13 |     """
14 |     Class that handles dataset operations within AutoPeptideML.
15 |     """
16 |     df: pd.DataFrame
17 |     # `pipe` can be a single Pipeline or a dictionary mapping field to Pipeline
18 |     pipe: Union[Pipeline, Dict[str, Pipeline]]
19 |     # `feat_fields` can be a single field or a list of fields (e.g., ['seq', 'smiles'])
20 |     feat_fields: Union[str, List[str]]
21 |     label_field: Optional[str]
22 | 
23 |     def __init__(
24 |         self,
25 |         path: Optional[str] = None,
26 |         df: Optional[pd.DataFrame] = None,
27 |         feat_fields: Union[str, List[str]] = None,
28 |         pipe: Optional[Union[Pipeline, Dict[str, Pipeline]]] = None,
29 |         label_field: Optional[str] = None,
30 |         verbose: bool = False,
31 |         seed: int = 1
32 |     ):
33 |         """Initializes a Database instance.
34 | 
35 |         :type path: Optional[str]
36 |         :param path: Path to the CSV file containing the dataset. If provided, the dataset will be loaded from this path.
37 | 
38 |         :type df: Optional[pd.DataFrame]
39 |         :param df: The dataset represented as a pandas DataFrame. If `path` is provided, this will be ignored.
40 | 
41 |         :type pipe: Union[Pipeline, Dict[str, Pipeline]]
42 |         :param pipe: A preprocessing pipeline or a dictionary of feature fields mapped to their respective pipelines.
43 |             If not provided, no preprocessing is applied.
44 | 
45 |         :type feat_fields: Union[str, List[str]]
46 |         :param feat_fields: A single feature field or a list of feature fields (e.g., `['seq', 'smiles']`)
47 |             used for processing and model input. This parameter is required.
48 | 
49 |         :type label_field: Optional[str]
50 |         :param label_field: The name of the column representing labels in the dataset. If `None`, no label column is specified.
51 | 
52 |         :type verbose: bool
53 |         :param verbose: Enables verbose output if set to `True`. Logs detailed preprocessing steps. Default is `False`.
54 | 
55 |         """
56 |         if path is not None:
57 |             self.df = pd.read_csv(path)
58 |         else:
59 |             self.df = df
60 |         if feat_fields is None:
61 |             raise ValueError('`feat_fields` cannot be left empty')
62 |         if isinstance(feat_fields, str):
63 |             feat_fields = [feat_fields]
64 |         if (not isinstance(pipe, dict) and pipe is not None):
65 |             self.pipe = {field: pipe for field in feat_fields}
66 |         else:
67 |             self.pipe = pipe
68 |         self.seed = seed
69 |         self.label_field = label_field
70 |         self.feat_fields = feat_fields
71 |         self.verbose = verbose
72 |         self._preprocess(verbose)
73 | 
74 |     def draw_samples(
75 |         self,
76 |         target_db: "Database",
77 |         columns_to_exclude: Optional[Union[List[str], str]] = None
78 |     ) -> pd.DataFrame:
79 |         """
80 |         Draws samples from the current database to match the distribution of the target database.
81 |         Excludes specified columns if provided.
82 | 
83 |         :type target_db: Database
84 |         :param target_db: The target `Database` whose distribution is used to sample data.
85 | 86 | :type columns_to_exclude: Optional[Union[List[str], str]] 87 | :param columns_to_exclude: A single column or list of columns to exclude from sampling. If `None`, no columns are excluded. 88 | 89 | :rtype: pd.DataFrame 90 | :return: A DataFrame containing the sampled data matching the target database distribution. 91 | """ 92 | if columns_to_exclude is not None: 93 | self._filter(columns_to_exclude) 94 | 95 | target_hist = target_db._hist() 96 | hist = self._hist() 97 | 98 | entries = {field: [] for field in self.feat_fields} 99 | left_out = 0 100 | for idx, h in enumerate(target_hist): 101 | if idx > len(hist): 102 | break 103 | tmp_df = self.df.iloc[hist[idx]] 104 | tgt_df = target_db.df.iloc[h] 105 | 106 | if len(tmp_df) < len(tgt_df): 107 | left_out += len(tgt_df) - len(tmp_df) 108 | elif len(tmp_df) > len(tgt_df) + np.abs(left_out): 109 | if left_out < 0: 110 | smp = len(tgt_df) 111 | else: 112 | smp = len(tgt_df) + left_out 113 | tmp_df = tmp_df.sample(smp, replace=False, random_state=self.seed) 114 | left_out = 0 115 | else: 116 | smp = len(tmp_df) - len(tgt_df) 117 | tmp_df = tmp_df.sample(smp, replace=False, random_state=self.seed) 118 | for field in self.feat_fields: 119 | entries[field].extend(tmp_df[field].tolist()) 120 | 121 | entries_df = pd.DataFrame(entries) 122 | for field in self.feat_fields: 123 | entries_df.drop_duplicates(field, inplace=True) 124 | return entries_df 125 | 126 | def add_negatives( 127 | self, other: "Database", 128 | columns_to_exclude: Optional[Union[List[str], str]] = None 129 | ): 130 | """ 131 | Adds negative samples to the current database using another database. 132 | The label for negative samples is set to `0`. 133 | 134 | :type other: Database 135 | :param other: The source `Database` from which negative samples are drawn. 136 | 137 | :type columns_to_exclude: Optional[Union[List[str], str]] 138 | :param columns_to_exclude: A single column or list of columns to exclude during sampling. If `None`, no columns are excluded. 139 | 140 | :rtype: None 141 | :return: Updates the current database with the added negative samples. 142 | """ 143 | other.df = other.draw_samples(self, columns_to_exclude) 144 | if self.label_field is None: 145 | self.label_field = "Y" 146 | self.df[self.label_field] = 1 147 | 148 | other.df[self.label_field] = 0 149 | if other.feat_fields[0] != self.feat_fields[0]: 150 | other.df[self.feat_fields[0]] = other.df[other.feat_fields[0]] 151 | self.df = pd.concat([self.df, other.df]) 152 | self.df = self.df[[self.label_field, *self.feat_fields]] 153 | 154 | def _check_fields(self): 155 | """ 156 | Validates that all feature fields exist in the dataset. 157 | 158 | :rtype: None 159 | :return: Raises a `KeyError` if any feature field is missing from the dataset. 160 | """ 161 | for field in self.feat_fields: 162 | if field not in self.df.columns: 163 | raise KeyError( 164 | f"Field: {field} is not in df", 165 | f"df columns are: {', '.join(self.df.columns.tolist())}" 166 | ) 167 | 168 | def _get_mw(self): 169 | """ 170 | Computes the molecular weight (MW) for each entry in the dataset using RDKit. 171 | 172 | :rtype: None 173 | :return: Adds a `tmp_mw` column to the dataset with computed molecular weights. 
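        Example (illustrative): for a sequence entry `'AG'`,
        `Descriptors.ExactMolWt(Chem.MolFromFASTA('AG'))` evaluates to
        approximately `146.07`.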
174 | """ 175 | try: 176 | from rdkit import Chem 177 | from rdkit.Chem import Descriptors 178 | except ImportError: 179 | raise ImportError("Rdkit is required for this function", 180 | "Please install: `pip install rdkit`") 181 | item = self.df.iloc[0, :] 182 | for field in self.feat_fields: 183 | if is_smiles(item[field]): 184 | self.df['tmp_mw'] = self.df[field].map( 185 | lambda x: Descriptors.ExactMolWt( 186 | Chem.MolFromSmiles(x) 187 | ) 188 | ) 189 | else: 190 | self.df['tmp_mw'] = self.df[field].map( 191 | lambda x: Descriptors.ExactMolWt( 192 | Chem.MolFromFASTA(x) 193 | ) 194 | ) 195 | 196 | def _preprocess(self, verbose): 197 | """ 198 | Applies preprocessing steps to the dataset, including field validation and pipeline execution. 199 | 200 | :type verbose: bool 201 | :param verbose: Enables verbose output if set to `True`. 202 | 203 | :rtype: None 204 | :return: Updates the dataset with preprocessed feature fields. 205 | """ 206 | self._check_fields() 207 | if verbose: 208 | print("Preprocessing database") 209 | if self.pipe is not None: 210 | for field in self.feat_fields: 211 | self.df[field] = self.pipe[field](self.df[field], verbose=verbose) 212 | self._get_mw() 213 | 214 | def _filter(self, columns: Union[List[str], str]): 215 | """ 216 | Filters out rows where specified columns contain the value `1`. 217 | 218 | :type columns: Union[List[str], str] 219 | :param columns: A single column or list of columns to filter. 220 | 221 | :rtype: None 222 | :return: Updates the dataset after filtering. 223 | """ 224 | if isinstance(columns, str): 225 | columns = [columns] 226 | for column in columns: 227 | self.df = self.df[self.df[column] != 1].copy().reset_index(drop=True) 228 | 229 | def _hist(self) -> List[np.ndarray]: 230 | """ 231 | Creates histograms based on molecular weight ranges for the dataset. 232 | 233 | :rtype: List[np.ndarray] 234 | :return: A list of boolean arrays indicating the molecular weight bins. 235 | """ 236 | av_mw_aa = 110 237 | step = 5 * av_mw_aa 238 | max_mw = int(self.df['tmp_mw'].max()) 239 | out = [] 240 | if self.verbose: 241 | pbar = tqdm(range(0, max_mw, step), desc='Computing MW') 242 | else: 243 | pbar = range(0, max_mw, step) 244 | for mw in pbar: 245 | cond = ((self.df.tmp_mw > mw) & (self.df.tmp_mw <= mw + step)).to_numpy() 246 | cond = cond.astype(np.bool_) 247 | out.append(cond) 248 | return out 249 | 250 | def __len__(self) -> int: 251 | """ 252 | Returns the number of rows in the dataset. 253 | 254 | :rtype: int 255 | :return: The number of rows in the dataset. 256 | """ 257 | return len(self.df) 258 | 259 | def __getitem__(self, idx: int) -> pd.Series: 260 | """ 261 | Retrieves a row from the dataset by index, returning only the feature fields and the label field, if specified. 262 | 263 | :type idx: int 264 | :param idx: The index of the row to retrieve. 265 | 266 | :rtype: pd.Series 267 | :return: A series containing the feature fields and the label field if specified for the specified row. 
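        Example (illustrative; assumes `rdkit` is installed and a CSV file
        `peptides.csv` with a `sequence` column exists):

            >>> db = Database(path='peptides.csv', feat_fields='sequence')
            >>> db[0]  # pd.Series with the feature fields of the first row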
268 | """ 269 | item = self.df.iloc[idx] 270 | if self.label_field is None: 271 | return item[self.feat_fields] 272 | else: 273 | return item[self.feat_fields + self.label_field] 274 | 275 | def __str__(self) -> str: 276 | return str(self.df.head()) 277 | -------------------------------------------------------------------------------- /autopeptideml/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import time 4 | import yaml 5 | 6 | from typing import * 7 | 8 | import pandas as pd 9 | import typer 10 | 11 | from .autopeptideml import AutoPeptideML, __version__ 12 | from .config import config_helper 13 | 14 | 15 | app = typer.Typer() 16 | 17 | 18 | @app.command() 19 | def build_model(config_path: Optional[str] = None): 20 | """ 21 | Build a machine learning model based on the provided configuration. If no configuration is provided 22 | the configuration helper will prompt you for more details about the job you want to run. 23 | 24 | Args: 25 | config_path (str, optional): Path to the configuration file. Defaults to None. 26 | 27 | Returns: 28 | None 29 | """ 30 | if config_path is not None: 31 | config = yaml.safe_load(open(config_path)) 32 | mssg = f"| AutoPeptideML v.{__version__} |" 33 | print("-"*(len(mssg))) 34 | print(mssg) 35 | print("-"*(len(mssg))) 36 | 37 | else: 38 | config_path = prepare_config() 39 | config = yaml.safe_load(open(config_path)) 40 | print("** Model Builder **") 41 | apml = AutoPeptideML(config) 42 | db = apml.get_database() 43 | reps = apml.get_reps() 44 | test = apml.get_test() 45 | models = apml.run_hpo() 46 | r_df = apml.run_evaluation(models) 47 | apml.save_experiment(save_reps=True, save_test=False) 48 | print(r_df) 49 | 50 | 51 | @app.command() 52 | def prepare_config() -> dict: 53 | mssg = f"| AutoPeptideML v.{__version__} |" 54 | print("-"*(len(mssg))) 55 | print(mssg) 56 | print("-"*(len(mssg))) 57 | print("** Experiment Builder **") 58 | print("Please, answer the following questions to design your experiment.") 59 | 60 | config_path = config_helper() 61 | return config_path 62 | 63 | 64 | @app.command() 65 | def predict(experiment_dir: str, features_path: str, feature_field: str, 66 | output_path: str = 'apml_predictions.csv'): 67 | config_path = osp.join(experiment_dir, 'config.yml') 68 | if not osp.exists(config_path): 69 | raise FileNotFoundError("Configuration file was not found in experiment dir.") 70 | config = yaml.safe_load(open(config_path)) 71 | apml = AutoPeptideML(config) 72 | df = pd.read_csv(features_path) 73 | results_df = apml.predict( 74 | df, feature_field=feature_field, 75 | experiment_dir=experiment_dir, backend='onnx' 76 | ) 77 | results_df.to_csv(output_path, index=False, float_format="%.3g") 78 | 79 | 80 | def _main(): 81 | app() 82 | 83 | 84 | if __name__ == "__main__": 85 | _main() 86 | -------------------------------------------------------------------------------- /autopeptideml/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline import Pipeline, BaseElement 2 | from .sequence import CanonicalCleaner, CanonicalFilter 3 | from .smiles import SequenceToSMILES, FilterSMILES, SmilesToSequence 4 | -------------------------------------------------------------------------------- /autopeptideml/pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | import yaml 3 | from typing import * 4 | 5 | from concurrent.futures 
import ThreadPoolExecutor 6 | from multiprocessing import cpu_count 7 | from tqdm import tqdm 8 | 9 | 10 | class BaseElement: 11 | """ 12 | Class `BaseElement` provides a foundation for implementing molecular processing elements. 13 | It supports both single and parallel processing of molecular data, making it suitable for operations 14 | that can be applied to molecular representations such as SMILES strings. 15 | 16 | Attributes: 17 | :type name: str 18 | :param name: The name of the processing element. 19 | 20 | :type properties: Dict[str, Any] 21 | :param properties: A dictionary of additional properties for the processing element. 22 | Default is an empty dictionary. 23 | """ 24 | name: str 25 | properties: Dict[str, Any] = {} 26 | 27 | def __call__(self, mol: Union[str, List[str]], 28 | n_jobs: int = cpu_count(), 29 | verbose: bool = False) -> Union[str, List[str]]: 30 | """ 31 | Processes molecular data, either as a single molecule or a list of molecules. 32 | Automatically selects single or parallel processing based on the input type. 33 | 34 | :type mol: Union[str, List[str]] 35 | :param mol: A single molecular representation (e.g., SMILES string) or a list of such representations. 36 | 37 | :type n_jobs: int 38 | :param n_jobs: The number of parallel jobs to use for processing. Default is the number of CPU cores. 39 | 40 | :type verbose: bool 41 | :param verbose: Enables verbose output if set to `True`, displaying a progress bar for parallel processing. 42 | Default is `False`. 43 | 44 | :rtype: Union[str, List[str]] 45 | :return: The processed molecular representation(s). 46 | """ 47 | if isinstance(mol, str): 48 | return self._single_call(mol) 49 | elif len(mol) == 0: 50 | return mol 51 | else: 52 | return self._parallel_call(mol, n_jobs=n_jobs, 53 | verbose=verbose) 54 | 55 | def _single_call(self, mol: str) -> str: 56 | """ 57 | Processes a single molecular representation. 58 | Must be implemented in a subclass. 59 | 60 | :type mol: str 61 | :param mol: A single molecular representation (e.g., SMILES string). 62 | 63 | :rtype: str 64 | :return: The processed molecular representation. 65 | 66 | :raises NotImplementedError: If the method is not implemented in a subclass. 67 | """ 68 | raise NotImplementedError 69 | 70 | def _clean(self, mol: List[Optional[str]]) -> List[str]: 71 | """ 72 | Cleans the processed molecular data by removing `None` values. 73 | 74 | :type mol: List[Optional[str]] 75 | :param mol: A list of processed molecular representations, some of which may be `None`. 76 | 77 | :rtype: List[str] 78 | :return: A cleaned list of molecular representations without `None` values. 79 | """ 80 | return [m for m in mol if m is not None] 81 | 82 | def _parallel_call(self, mol: List[str], n_jobs: int, 83 | verbose: bool) -> List[str]: 84 | """ 85 | Processes a list of molecular representations in parallel using a thread pool. 86 | 87 | :type mol: List[str] 88 | :param mol: A list of molecular representations (e.g., SMILES strings) to process. 89 | 90 | :type n_jobs: int 91 | :param n_jobs: The number of parallel jobs to use for processing. If set to `1`, processes sequentially. 92 | 93 | :type verbose: bool 94 | :param verbose: Enables verbose output if set to `True`, displaying a progress bar for parallel processing. 95 | 96 | :rtype: List[str] 97 | :return: A list of processed molecular representations. 98 | 99 | :raises RuntimeError: If any parallel job raises an exception. 
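        Example (illustrative; `Upper` is a hypothetical subclass whose
        `_single_call` returns `mol.upper()`):

            >>> Upper()._parallel_call(['ac', 'gt'], n_jobs=2, verbose=False)
            ['AC', 'GT']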
100 | """ 101 | if n_jobs > 1: 102 | jobs, out = [], [] 103 | with ThreadPoolExecutor(n_jobs) as exec: 104 | for item in mol: 105 | job = exec.submit(self._single_call, item) 106 | jobs.append(job) 107 | 108 | if verbose: 109 | pbar = tqdm(jobs, unit_scale=True) 110 | else: 111 | pbar = jobs 112 | 113 | for job in pbar: 114 | if job.exception() is not None: 115 | raise RuntimeError(job.exception()) 116 | out.append(job.result()) 117 | else: 118 | out = [] 119 | for item in mol: 120 | out.append(self._single_call(item)) 121 | return self._clean(out) 122 | 123 | 124 | class Pipeline: 125 | """ 126 | Class `Pipeline` represents a sequence of molecular processing steps, where each step is defined by an element 127 | (`BaseElement` or another `Pipeline`). The pipeline can process molecular data sequentially and optionally 128 | aggregate results across all steps. 129 | 130 | Attributes: 131 | :type elements: Union[List[BaseElement], List[Pipeline]] 132 | :param elements: A list of `BaseElement` or `Pipeline` instances that define the processing steps. 133 | 134 | :type name: str 135 | :param name: The name of the pipeline. Default is `'pipeline'`. 136 | 137 | :type aggregate: bool 138 | :param aggregate: If `True`, the pipeline aggregates results from all steps. 139 | If `False`, the results of one step are passed to the next. Default is `False`. 140 | """ 141 | def __init__(self, elements: Union[List[BaseElement], List["Pipeline"]], 142 | name: str = 'pipeline', 143 | aggregate: bool = False): 144 | """ 145 | Initializes the pipeline with a sequence of processing elements and configuration. 146 | 147 | :type elements: Union[List[BaseElement], List[Pipeline]] 148 | :param elements: A list of `BaseElement` or `Pipeline` instances to define the processing steps. 149 | 150 | :type name: str 151 | :param name: The name of the pipeline. Default is `'pipeline'`. 152 | 153 | :type aggregate: bool 154 | :param aggregate: If `True`, results from all steps are aggregated. If `False`, results of one step 155 | are passed to the next. Default is `False`. 156 | 157 | :rtype: None 158 | """ 159 | self.elements = elements 160 | self.name = name 161 | self.properties = {name: { 162 | 'name': name, 163 | 'aggregate': aggregate, 164 | 'elements': [{e.name: e.properties} for e in elements]} 165 | } 166 | self.properties['aggregate'] = aggregate 167 | self.aggregate = aggregate 168 | 169 | def __str__(self) -> str: 170 | """ 171 | Returns a JSON string representation of the pipeline's properties. 172 | 173 | :rtype: str 174 | :return: A JSON string representing the pipeline's configuration and properties. 175 | """ 176 | return json.dumps(self.properties, indent=3) 177 | 178 | def __call__(self, mols: List[str], 179 | n_jobs: int = cpu_count(), 180 | verbose: bool = False): 181 | """ 182 | Processes a list of molecular representations using the pipeline. 183 | 184 | :type mols: List[str] 185 | :param mols: A list of molecular representations (e.g., SMILES strings) to process. 186 | 187 | :type n_jobs: int 188 | :param n_jobs: The number of parallel jobs to use for processing. Default is the number of CPU cores. 189 | 190 | :type verbose: bool 191 | :param verbose: Enables verbose output if set to `True`. Displays progress and step information. 192 | 193 | :rtype: Union[List[str], List[List[str]]] 194 | :return: Processed molecular data. If `aggregate` is `True`, returns aggregated results from all steps. 195 | Otherwise, returns the final processed molecular data. 
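        Example (a minimal sketch; `CanonicalCleaner` is the element defined
        in `pipeline/sequence.py` and assumes the package and its optional
        dependencies are installed):

            >>> from autopeptideml.pipeline import Pipeline, CanonicalCleaner
            >>> pipe = Pipeline([CanonicalCleaner(substitution='X')], name='clean')
            >>> pipe(['ACDF', 'AC1F'], n_jobs=1)
            ['ACDF', 'ACXF']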
196 | """ 197 | original_mols = mols 198 | aggregation = [] 199 | for idx, e in enumerate(self.elements): 200 | if verbose: 201 | print(f"Executing preprocessing step {idx+1} of", 202 | f"{len(self.elements)}: {e.name}") 203 | if self.aggregate: 204 | mols = e(original_mols, n_jobs=n_jobs, verbose=verbose) 205 | aggregation.extend(mols) 206 | else: 207 | mols = e(mols, n_jobs=n_jobs, verbose=verbose) 208 | 209 | if verbose and not self.aggregate: 210 | print(f'Total molecules removed: {len(original_mols)-len(mols):,}') 211 | 212 | if self.aggregate: 213 | return aggregation 214 | else: 215 | return mols 216 | 217 | def save(self, filename: str): 218 | """ 219 | Saves the pipeline's properties to a YAML file. 220 | 221 | :type filename: str 222 | :param filename: The name of the file to save the pipeline's properties. 223 | 224 | :rtype: None 225 | """ 226 | yaml.safe_dump(self.properties, open(filename, 'w')) 227 | 228 | @classmethod 229 | def load(self, filename: str, element_registry: dict): 230 | """ 231 | Loads a pipeline from a YAML file and reconstructs its elements using a registry. 232 | 233 | :type filename: str 234 | :param filename: The name of the file containing the saved pipeline properties. 235 | 236 | :type element_registry: Dict[str, Callable] 237 | :param element_registry: A dictionary mapping element names to their constructor functions. 238 | 239 | :rtype: Pipeline 240 | :return: A reconstructed `Pipeline` instance based on the saved properties. 241 | """ 242 | self.properties = json.load(open(filename)) 243 | elements = [] 244 | for e, e_prop in self.config.items(): 245 | elements.append(element_registry[e](**e_prop)) 246 | return Pipeline(elements) 247 | -------------------------------------------------------------------------------- /autopeptideml/pipeline/sequence.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | from .pipeline import BaseElement 4 | 5 | 6 | RESIDUES = { 7 | 'V': 'VAL', 'I': 'ILE', 'L': 'LEU', 'E': 'GLU', 'Q': 'GLN', 8 | 'D': 'ASP', 'N': 'ASN', 'H': 'HIS', 'W': 'TRP', 'F': 'PHE', 9 | 'Y': 'TYR', 'R': 'ARG', 'K': 'LYS', 'S': 'SER', 'T': 'THR', 10 | 'M': 'MET', 'A': 'ALA', 'G': 'GLY', 'P': 'PRO', 'C': 'CYS' 11 | } 12 | 13 | 14 | def is_canonical(sequence: str): 15 | if not (len(sequence) > 0): 16 | return False 17 | for char in sequence: 18 | if char not in RESIDUES: 19 | return False 20 | return True 21 | 22 | 23 | class CanonicalCleaner(BaseElement): 24 | """ 25 | Class `CanonicalCleaner` is a molecular processing element that standardizes molecular representations 26 | by replacing non-canonical residues with a specified substitution character. 27 | 28 | Attributes: 29 | :type name: str 30 | :param name: The name of the element. Default is `'canonical-cleaner'`. 31 | 32 | :type substitution: str 33 | :param substitution: The character used to replace non-canonical residues. Default is `'X'`. 34 | """ 35 | name = 'canonical-cleaner' 36 | 37 | def __init__(self, substitution: str = 'X'): 38 | """ 39 | Initializes the `CanonicalCleaner` with a substitution character. 40 | 41 | :type substitution: str 42 | :param substitution: The character used to replace non-canonical residues. Default is `'X'`. 43 | 44 | :rtype: None 45 | """ 46 | self.sub = substitution 47 | self.properties = {'substitution': substitution} 48 | 49 | def _single_call(self, mol: str) -> str: 50 | """ 51 | Cleans a single molecular representation by replacing non-canonical residues. 
52 | 53 | :type mol: str 54 | :param mol: A single molecular representation (e.g., a sequence of residues). 55 | 56 | :rtype: str 57 | :return: The cleaned molecular representation with non-canonical residues replaced by the substitution. 58 | """ 59 | return ''.join([c if c in RESIDUES else self.sub for c in mol]) 60 | 61 | 62 | class CanonicalFilter(BaseElement): 63 | """ 64 | Class `CanonicalFilter` is a molecular processing element that filters molecular representations based on 65 | their canonicality. It can either keep or discard canonical molecules based on the configuration. 66 | 67 | Attributes: 68 | :type name: str 69 | :param name: The name of the element. Default is `'canonical-filter'`. 70 | 71 | :type keep_canonical: bool 72 | :param keep_canonical: Determines whether to keep canonical molecules (`True`) or discard them (`False`). 73 | Default is `True`. 74 | """ 75 | name = 'canonical-filter' 76 | 77 | def __init__(self, keep_canonical: bool = True): 78 | """ 79 | Initializes the `CanonicalFilter` with a configuration to keep or discard canonical molecules. 80 | 81 | :type keep_canonical: bool 82 | :param keep_canonical: Determines whether to keep canonical molecules (`True`) or discard them (`False`). 83 | Default is `True`. 84 | 85 | :rtype: None 86 | """ 87 | self.keep_canonical = keep_canonical 88 | 89 | def _single_call(self, mol: str) -> Union[str, None]: 90 | """ 91 | Filters a single molecular representation based on its canonicality. 92 | 93 | :type mol: str 94 | :param mol: A single molecular representation (e.g., a sequence of residues). 95 | 96 | :rtype: Union[str, None] 97 | :return: The molecule if it meets the canonicality condition, or `None` otherwise. 98 | """ 99 | if not (len(mol) > 0): 100 | return None 101 | if ((is_canonical(mol) and self.keep_canonical) or 102 | (not is_canonical(mol) and not self.keep_canonical)): 103 | return mol 104 | else: 105 | return None 106 | -------------------------------------------------------------------------------- /autopeptideml/pipeline/smiles.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | from .pipeline import BaseElement 4 | try: 5 | import rdkit.Chem.rdmolfiles as rdm 6 | except ImportError: 7 | raise ImportError("You need to install rdkit to use this method.", 8 | " Try: `pip install rdkit`") 9 | 10 | 11 | def is_smiles(mol: str): 12 | return ( 13 | '(' in mol or ')' in mol or 14 | '[' in mol or ']' in mol or 15 | '@' in mol or 'O' in mol 16 | ) 17 | 18 | 19 | class SequenceToSMILES(BaseElement): 20 | """ 21 | Class `SequenceToSMILES` converts peptide sequences (e.g., FASTA format) into SMILES (Simplified Molecular Input Line Entry System) representations using RDKit. 22 | 23 | Attributes: 24 | :type name: str 25 | :param name: The name of the element. Default is `'sequence-to-smiles'`. 26 | """ 27 | name = 'sequence-to-smiles' 28 | 29 | def _single_call(self, mol): 30 | """ 31 | Converts a single peptide sequence into a SMILES representation. 32 | 33 | :type mol: str 34 | :param mol: A peptide sequence (e.g., FASTA format). 35 | 36 | :rtype: str 37 | :return: The SMILES representation of the molecule. 38 | 39 | :raises RuntimeError: If the molecule cannot be read by RDKit. 
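        Example (illustrative; requires `rdkit`):

            >>> SequenceToSMILES()('G')
            'NCC(=O)O'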
40 | """ 41 | rd_mol = rdm.MolFromFASTA(mol) 42 | if rd_mol is None: 43 | raise RuntimeError(f'Molecule: {mol} could not be read by RDKit.', 44 | 'Maybe introduce a filtering step in your pipeline') 45 | return rdm.MolToSmiles(rd_mol, canonical=True, isomericSmiles=True) 46 | 47 | 48 | class SmilesToSequence(BaseElement): 49 | try: 50 | from pepfunn.sequence import peptideFromSMILES 51 | except ImportError: 52 | raise ImportError("This class requires PepFuNN to be installed. Please try: `pip install git+https://github.com/novonordisk-research/pepfunn.git`") 53 | 54 | name = 'smiles-to-sequence' 55 | fun = peptideFromSMILES 56 | 57 | def _single_call(self, mol): 58 | return ''.join(self.fun(mol).split('-')) 59 | 60 | 61 | class FilterSMILES(BaseElement): 62 | """ 63 | Class `FilterSMILES` filters molecular representations based on whether they are valid SMILES strings. 64 | It can either retain or discard SMILES strings based on the configuration. 65 | 66 | Attributes: 67 | :type name: str 68 | :param name: The name of the element. Default is `'filter-smiles'`. 69 | 70 | :type keep_smiles: Optional[bool] 71 | :param keep_smiles: Determines whether to retain valid SMILES strings (`True`) or discard them (`False`). 72 | Default is `True`. 73 | """ 74 | name = 'filter-smiles' 75 | 76 | def __init__(self, keep_smiles: Optional[bool] = True): 77 | """ 78 | Initializes the `FilterSMILES` element with a configuration to retain or discard SMILES strings. 79 | 80 | :type keep_smiles: Optional[bool] 81 | :param keep_smiles: Determines whether to retain valid SMILES strings (`True`) or discard them (`False`). 82 | Default is `True`. 83 | 84 | :rtype: None 85 | """ 86 | self.properties['keep_smiles'] = keep_smiles 87 | self.keep_smiles = keep_smiles 88 | 89 | def _single_call(self, mol: str): 90 | """ 91 | Filters a single molecular representation based on its validity as a SMILES string. 92 | 93 | :type mol: str 94 | :param mol: A molecular representation to evaluate. 95 | 96 | :rtype: Union[str, None] 97 | :return: The molecule if it meets the SMILES validity condition, or `None` otherwise. 98 | """ 99 | if ((is_smiles(mol) and self.keep_smiles) or 100 | (not is_smiles(mol) and not self.keep_smiles)): 101 | return mol 102 | else: 103 | return None 104 | 105 | 106 | class CanonicalizeSmiles(BaseElement): 107 | """ 108 | Class `CanonicalizeSmiles` converts SMILES (Simplified Molecular Input Line Entry System) strings into their canonical forms using RDKit. 109 | 110 | Attributes: 111 | :type name: str 112 | :param name: The name of the element. Default is `'canonicalize-smiles'`. 113 | """ 114 | name = 'canonicalize-smiles' 115 | 116 | def _single_call(self, mol): 117 | """ 118 | Converts a SMILES string into its canonical representation. 119 | 120 | :type mol: str 121 | :param mol: A SMILES string representing a molecule. 122 | 123 | :rtype: str 124 | :return: The canonical SMILES representation of the molecule. 125 | 126 | :raises RuntimeError: If the molecule cannot be read by RDKit. 
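        Example (illustrative; requires `rdkit`):

            >>> CanonicalizeSmiles()('C(C(=O)O)N')
            'NCC(=O)O'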
127 | """ 128 | rd_mol = rdm.MolFromSmiles(mol) 129 | if rd_mol is None: 130 | raise RuntimeError(f'Molecule: {mol} could not be read by RDKit.', 131 | 'Maybe introduce a filtering step in your pipeline') 132 | return rdm.MolToSmiles(rd_mol, canonical=True, isomericSmiles=True) 133 | -------------------------------------------------------------------------------- /autopeptideml/reps/__init__.py: -------------------------------------------------------------------------------- 1 | from .engine import RepEngineBase 2 | from .seq_based import RepEngineOnehot 3 | -------------------------------------------------------------------------------- /autopeptideml/reps/engine.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import yaml 4 | from typing import * 5 | 6 | import numpy as np 7 | from tqdm import tqdm 8 | 9 | try: 10 | from itertools import batched 11 | except ImportError: 12 | from itertools import islice 13 | 14 | def batched(iterable, n, *, strict=False): 15 | # batched('ABCDEFG', 3) → ABC DEF G 16 | if n < 1: 17 | raise ValueError('n must be at least one') 18 | iterator = iter(iterable) 19 | while batch := tuple(islice(iterator, n)): 20 | if strict and len(batch) != n: 21 | raise ValueError('batched(): incomplete batch') 22 | yield batch 23 | 24 | 25 | class RepEngineBase: 26 | """ 27 | Class `RepEngineBase` is an abstract base class for implementing molecular representation engines. 28 | It defines a framework for computing molecular representations in batches and includes utilities for 29 | serialization and property management. 30 | 31 | Attributes: 32 | :type engine: str 33 | :param engine: The name of the representation engine. 34 | 35 | :type rep: str 36 | :param rep: The type of molecular representation (e.g., fingerprint, embedding). 37 | 38 | :type properties: dict 39 | :param properties: A dictionary containing the engine's properties, including configuration arguments passed during initialization. 40 | """ 41 | engine: str 42 | 43 | def __init__(self, rep: str, **args): 44 | """ 45 | Initializes the `RepEngineBase` with the specified representation type and additional configuration arguments. 46 | 47 | :type rep: str 48 | :param rep: The type of molecular representation (e.g., fingerprint, embedding). 49 | 50 | :type **args: dict 51 | :param **args: Additional arguments for configuring the representation engine. 52 | 53 | :rtype: None 54 | """ 55 | self.rep = rep 56 | self.__dict__.update(args) 57 | self.properties = copy.deepcopy(self.__dict__) 58 | 59 | def compute_reps(self, mols: List[str], 60 | verbose: Optional[bool] = False, 61 | batch_size: Optional[int] = 12) -> Union[np.ndarray, List[np.ndarray]]: 62 | """ 63 | Computes molecular representations for a list of molecules in batches. 64 | 65 | :type mols: List[str] 66 | :param mols: A list of molecular representations (e.g., SMILES strings). 67 | 68 | :type verbose: Optional[bool] 69 | :param verbose: If `True`, displays a progress bar during batch processing. Default is `False`. 70 | 71 | :type batch_size: Optional[int] 72 | :param batch_size: The size of each batch for processing. Default is `12`. 73 | 74 | :rtype: Union[np.ndarray, List[np.ndarray]] 75 | :return: A stacked NumPy array of computed representations, or a list of arrays if pooling is disabled. 
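        Example (a sketch using the one-hot subclass from `reps/seq_based.py`):

            >>> from autopeptideml.reps import RepEngineOnehot
            >>> engine = RepEngineOnehot(max_length=10)
            >>> engine.compute_reps(['ACDF', 'KLM'], batch_size=2).shape
            (2, 210)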
76 | """ 77 | batches = batched(mols, batch_size) 78 | out = [] 79 | 80 | if verbose: 81 | pbar = tqdm(list(batches)) 82 | else: 83 | pbar = batches 84 | 85 | for batch in pbar: 86 | batch = self._preprocess_batch(batch) 87 | rep = self._rep_batch(batch) 88 | out.extend(rep) 89 | 90 | if 'average_pooling' in self.__dict__: 91 | if not self.__dict__['average_pooling']: 92 | return out 93 | return np.stack(out) 94 | 95 | def dim(self) -> int: 96 | """ 97 | Returns the dimensionality of the molecular representations. 98 | 99 | :rtype: int 100 | :return: The dimensionality of the computed representations. 101 | 102 | :raises NotImplementedError: This method must be implemented by subclasses. 103 | """ 104 | raise NotImplementedError 105 | 106 | def _rep_batch(self, batch: List[str]) -> np.ndarray: 107 | """ 108 | Computes representations for a batch of molecules. Must be implemented by subclasses. 109 | 110 | :type batch: List[str] 111 | :param batch: A batch of molecular representations (e.g., SMILES strings). 112 | 113 | :rtype: np.ndarray 114 | :return: A NumPy array of computed representations for the batch. 115 | 116 | :raises NotImplementedError: This method must be implemented by subclasses. 117 | """ 118 | raise NotImplementedError 119 | 120 | def _preprocess_batch(self, batch: List[str]) -> List[str]: 121 | """ 122 | Preprocesses a batch of molecules before computing representations. Must be implemented by subclasses. 123 | 124 | :type batch: List[str] 125 | :param batch: A batch of molecular representations (e.g., SMILES strings). 126 | 127 | :rtype: List[str] 128 | :return: A preprocessed list of molecular representations. 129 | 130 | :raises NotImplementedError: This method must be implemented by subclasses. 131 | """ 132 | raise NotImplementedError 133 | 134 | def save(self, filename: str): 135 | """ 136 | Saves the engine's properties to a file in YAML format. 137 | 138 | :type filename: str 139 | :param filename: The path to the file where the properties will be saved. 140 | 141 | :rtype: None 142 | """ 143 | yaml.safe_dump(self.properties, open(filename, 'w')) 144 | 145 | def __str__(self) -> str: 146 | """ 147 | Returns a string representation of the engine's properties in JSON format. 148 | 149 | :rtype: str 150 | :return: A JSON string representation of the engine's properties. 151 | """ 152 | return str(json.dumps(self.properties)) 153 | -------------------------------------------------------------------------------- /autopeptideml/reps/fps.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | import numpy as np 4 | 5 | from .engine import RepEngineBase 6 | try: 7 | import rdkit.Chem.rdmolfiles as rdm 8 | from rdkit.Chem import rdFingerprintGenerator as rfp 9 | except ImportError: 10 | raise ImportError("You need to install rdkit to use this method.", 11 | " Try: `pip install rdkit`") 12 | 13 | 14 | class RepEngineFP(RepEngineBase): 15 | """ 16 | Class `RepEngineFP` is a subclass of `RepEngineBase` designed for computing molecular fingerprints (FPs) 17 | using popular fingerprinting algorithms such as ECFP or FCFP. This engine generates fixed-length bit vectors 18 | representing molecular structures based on their topological features. 19 | 20 | Attributes: 21 | :type engine: str 22 | :param engine: The name of the engine. Default is `'fp'`, indicating a fingerprint-based representation. 23 | 24 | :type nbits: int 25 | :param nbits: The length of the fingerprint bit vector. 
This determines the number of bits in the fingerprint. 26 | 27 | :type radius: int 28 | :param radius: The radius parameter used for fingerprint generation, determining the neighborhood size around each atom. 29 | 30 | :type name: str 31 | :param name: The name of the fingerprint generator, which includes the engine type, `nbits`, and `radius`. 32 | 33 | :type generator: object 34 | :param generator: The fingerprint generator object, loaded based on the specified `rep` type. 35 | """ 36 | engine = 'fp' 37 | 38 | def __init__(self, rep: str, nbits: int, radius: int): 39 | """ 40 | Initializes the `RepEngineFP` with the specified representation type, fingerprint size, and radius. 41 | 42 | :type rep: str 43 | :param rep: The type of fingerprint to generate (e.g., 'ecfp', 'fcfp'). 44 | 45 | :type nbits: int 46 | :param nbits: The length of the fingerprint bit vector. 47 | 48 | :type radius: int 49 | :param radius: The radius of the neighborhood around each atom to consider when generating the fingerprint. 50 | 51 | :rtype: None 52 | """ 53 | super().__init__(rep, nbits=nbits, radius=radius) 54 | self.nbits = nbits 55 | self.radius = radius 56 | self.name = f'{self.engine}-{rep}-{self.nbits}-{self.radius}' 57 | self.generator = self._load_generator(rep) 58 | 59 | def _preprocess_batch(self, batch: List[str]) -> List[str]: 60 | """ 61 | Preprocesses a batch of molecular representations. For this class, no preprocessing is required. 62 | 63 | :type batch: List[str] 64 | :param batch: A list of molecular representations (e.g., SMILES strings). 65 | 66 | :rtype: List[str] 67 | :return: The same batch of molecular representations as input. 68 | """ 69 | return batch 70 | 71 | def _rep_batch(self, batch: List[str]) -> List[np.ndarray]: 72 | """ 73 | Computes the fingerprint for each molecule in a batch and returns the results as a list of NumPy arrays. 74 | 75 | :type batch: List[str] 76 | :param batch: A list of molecular representations (e.g., SMILES strings). 77 | 78 | :rtype: List[np.ndarray] 79 | :return: A list of NumPy arrays representing the molecular fingerprints. 80 | """ 81 | out = [] 82 | for i in batch: 83 | mol = rdm.MolFromSmiles(i) 84 | if mol is None: 85 | fp = np.zeros((1, self.nbits)) 86 | else: 87 | fp = self.generator.GetCountFingerprintAsNumPy(mol) 88 | out.append(fp) 89 | return out 90 | 91 | def _load_generator(self, rep: str): 92 | """ 93 | Loads the appropriate fingerprint generator based on the specified representation type. 94 | 95 | :type rep: str 96 | :param rep: The type of fingerprint to generate (e.g., 'ecfp', 'fcfp'). 97 | 98 | :rtype: object 99 | :return: The fingerprint generator object based on the `rep` type. 100 | 101 | :raises NotImplementedError: If the specified `rep` type is not supported. 
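        Example (illustrative): `RepEngineFP('ecfp', nbits=2048, radius=2)`
        uses a plain Morgan generator, `'fcfp'` adds feature-based atom
        invariants, and including `'count'` in `rep` (e.g., `'ecfp-count'`)
        enables count simulation.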
102 | """ 103 | if 'ecfp' in rep or 'morgan' in rep: 104 | return rfp.GetMorganGenerator(radius=self.radius, 105 | includeChirality=True, 106 | fpSize=self.nbits, 107 | countSimulation='count' in rep) 108 | elif 'fcfp' in rep: 109 | invgen = rfp.GetMorganFeatureAtomInvGen() 110 | return rfp.GetMorganGenerator(radius=self.radius, 111 | fpSize=self.nbits, 112 | includeChirality=True, 113 | atomInvariantsGenerator=invgen, 114 | countSimulation='count' in rep) 115 | else: 116 | raise NotImplementedError( 117 | f'Representation: {rep} is not currently implemented.', 118 | 'Please, request this new feature in the Issues page of the', 119 | 'github repo: https://IBM/AutoPeptideML' 120 | ) 121 | 122 | def dim(self) -> int: 123 | """ 124 | Returns the dimensionality (bit size) of the generated fingerprint. 125 | 126 | :rtype: int 127 | :return: The number of bits in the fingerprint (i.e., `nbits`). 128 | """ 129 | return self.nbits 130 | -------------------------------------------------------------------------------- /autopeptideml/reps/lms.py: -------------------------------------------------------------------------------- 1 | from contextlib import nullcontext 2 | 3 | import numpy as np 4 | import torch 5 | import transformers 6 | from transformers import AutoModel, AutoTokenizer, T5Tokenizer, T5EncoderModel 7 | from typing import * 8 | 9 | from .engine import RepEngineBase 10 | 11 | transformers.logging.set_verbosity(transformers.logging.ERROR) 12 | 13 | 14 | AVAILABLE_MODELS = { 15 | 'esm2_t48_15B_UR50D': 5120, 16 | 'esm2_t36_3B_UR50D': 2560, 17 | 'esm2_t33_650M_UR50D': 1280, 18 | 'esm1b_t33_650M_UR50S': 1280, 19 | 'esm2_t30_150M_UR50D': 640, 20 | 'esm2_t12_35M_UR50D': 480, 21 | 'esm2_t6_8M_UR50D': 320, 22 | 'ESMplusplus_small': 960, 23 | 'ESMplusplus_large': 1152, 24 | 'prot_t5_xxl_uniref50': 1024, 25 | 'prot_t5_xl_half_uniref50-enc': 1024, 26 | 'prot_bert': 1024, 27 | 'ProstT5': 1024, 28 | 'ankh-base': 768, 29 | 'ankh-large': 1536, 30 | 'MoLFormer-XL-both-10pct': 768, 31 | 'ChemBERTa-77M-MLM': 384, 32 | 'PeptideCLM-23M-all': 768 33 | } 34 | 35 | SYNONYMS = { 36 | 'prot-t5-xl': 'prot_t5_xl_half_uniref50-enc', 37 | 'prot-t5-xxl': 'prot_t5_xxl_uniref50', 38 | 'protbert': 'prot_bert', 39 | 'prost-t5': 'ProstT5', 40 | 'esm2-15b': 'esm2_t48_15B_UR50D', 41 | 'esm2-3b': 'esm2_t36_3B_UR50D', 42 | 'esm2-650m': 'esm2_t33_650M_UR50D', 43 | 'esm1b': 'esm1b_t33_650M_UR50S', 44 | 'esm2-150m': 'esm2_t30_150M_UR50D', 45 | 'esm2-35m': 'esm2_t12_35M_UR50D', 46 | 'esm2-8m': 'esm2_t6_8M_UR50D', 47 | 'esmc-300m': 'ESMplusplus_small', 48 | 'esmc-600m': 'ESMplusplus_large', 49 | 'ankh-base': 'ankh-base', 50 | 'ankh-large': 'ankh-large', 51 | 'molformer-xl': 'MoLFormer-XL-both-10pct', 52 | 'chemberta-2': 'ChemBERTa-77M-MLM', 53 | 'peptideclm': 'PeptideCLM-23M-all' 54 | 55 | } 56 | 57 | 58 | class RepEngineLM(RepEngineBase): 59 | """ 60 | Class `RepEngineLM` is a subclass of `RepEngineBase` designed to compute molecular representations 61 | using pre-trained language models (LMs) such as T5, ESM, or ChemBERTa. This engine generates vector-based 62 | embeddings for input sequences, typically protein or peptide sequences, by leveraging transformer-based models. 63 | 64 | Attributes: 65 | :type engine: str 66 | :param engine: The name of the engine. Default is `'lm'`, indicating a language model-based representation. 67 | 68 | :type device: str 69 | :param device: The device on which the model runs, either `'cuda'` for GPU or `'cpu'`. 
70 | 71 | :type model: object 72 | :param model: The pre-trained model used for generating representations. The model is loaded from a repository 73 | based on the `model` parameter. 74 | 75 | :type name: str 76 | :param name: The name of the model engine combined with the model type. 77 | 78 | :type dimension: int 79 | :param dimension: The dimensionality of the output representation, corresponding to the model's embedding size. 80 | 81 | :type model_name: str 82 | :param model_name: The specific model name used for generating representations. 83 | 84 | :type tokenizer: object 85 | :param tokenizer: The tokenizer associated with the model, used for converting sequences into tokenized input. 86 | 87 | :type lab: str 88 | :param lab: The laboratory or organization associated with the model (e.g., 'Rostlab', 'facebook', etc.). 89 | """ 90 | engine = 'lm' 91 | 92 | def __init__(self, model: str, average_pooling: Optional[bool] = True, 93 | cls_token: Optional[bool] = False, fp16: bool = True): 94 | """ 95 | Initializes the `RepEngineLM` with the specified model and pooling options. The model is loaded based on 96 | the given `model` name and its associated tokenizer. 97 | 98 | :type model: str 99 | :param model: The pre-trained model to use for generating representations (e.g., 'esm2_t48_15B_UR50D'). 100 | 101 | :type average_pooling: Optional[bool] 102 | :param average_pooling: If `True`, the embeddings are averaged across all tokens. Default is `True`. 103 | 104 | :type cls_token: Optional[bool] 105 | :param cls_token: If `True`, only the representation of the [CLS] token is used. Default is `False`. 106 | 107 | :rtype: None 108 | """ 109 | super().__init__(model, average_pooling=average_pooling, 110 | cls_token=cls_token) 111 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 112 | self.model = None 113 | self.name = f'{self.engine}-{model}' 114 | self.fp16 = fp16 115 | self._load_model(model) 116 | 117 | def move_to_device(self, device: str): 118 | """ 119 | Moves the model to the specified device (e.g., 'cuda' or 'cpu'). 120 | 121 | :type device: str 122 | :param device: The target device to move the model to. 123 | 124 | :rtype: None 125 | """ 126 | self.device = device 127 | self.model.to(self.device) 128 | 129 | def dim(self) -> int: 130 | """ 131 | Returns the dimensionality of the output representation generated by the model. 132 | 133 | :rtype: int 134 | :return: The dimensionality (embedding size) of the model's output. 135 | """ 136 | return self.dimension 137 | 138 | def max_len(self) -> int: 139 | """ 140 | Returns the maximum allowed sequence length for the model. Some models have a specific maximum sequence length. 141 | 142 | :rtype: int 143 | :return: The maximum sequence length for the model. 144 | """ 145 | if self.lab == 'facebook': 146 | return 1022 147 | elif self.lab.lower() == 'evolutionaryscale': 148 | return 2046 149 | else: 150 | return 2046 151 | 152 | def get_num_params(self) -> int: 153 | """ 154 | Returns the total number of parameters in the model. 155 | 156 | :rtype: int 157 | :return: The number of parameters in the model. 158 | """ 159 | return sum(p.numel() for p in self.model.parameters()) 160 | 161 | def _load_model(self, model: str): 162 | """ 163 | Loads the specified pre-trained model and tokenizer based on the provided model name. 164 | The model is selected from the available models in the `AVAILABLE_MODELS` dictionary. 165 | 166 | :type model: str 167 | :param model: The model name or synonym to load (e.g., 'esm2_t48_15B_UR50D'). 
168 | 169 | :raises NotImplementedError: If the specified model is not found in `AVAILABLE_MODELS` or `SYNONYMS`. 170 | :rtype: None 171 | """ 172 | if model not in AVAILABLE_MODELS and SYNONYMS[model.lower()] not in AVAILABLE_MODELS: 173 | raise NotImplementedError( 174 | f"Model: {model} not implemented.", 175 | f"Available models: {', '.join(AVAILABLE_MODELS)}" 176 | ) 177 | if model not in AVAILABLE_MODELS: 178 | model = SYNONYMS[model.lower()] 179 | if model.lower().startswith('pro'): 180 | self.lab = 'Rostlab' 181 | elif 'plusplus' in model.lower(): 182 | self.lab = 'Synthyra' 183 | elif 'esmc' in model.lower(): 184 | self.lab = 'EvolutionaryScale' 185 | elif 'esm' in model.lower(): 186 | self.lab = 'facebook' 187 | elif 'lobster' in model.lower(): 188 | self.lab = 'asalam91' 189 | elif 'ankh' in model.lower(): 190 | self.lab = 'ElnaggarLab' 191 | elif 'molformer' in model.lower(): 192 | self.lab = 'ibm' 193 | elif 'chemberta' in model.lower(): 194 | self.lab = 'DeepChem' 195 | elif 'clm' in model.lower(): 196 | self.lab = 'aaronfeller' 197 | if 't5' in model.lower(): 198 | self.tokenizer = T5Tokenizer.from_pretrained(f'Rostlab/{model}', 199 | do_lower_case=False) 200 | self.model = T5EncoderModel.from_pretrained(f"Rostlab/{model}") 201 | elif 'feller' in self.lab.lower(): 202 | import os 203 | import urllib 204 | import urllib.request as request 205 | try: 206 | from .utils.peptideclm_tokenizer import SMILES_SPE_Tokenizer 207 | except ImportError: 208 | raise ImportError("This function requires smilespe. Please install: `pip install smilespe`") 209 | if os.getenv('HF_HOME') is None: 210 | hf_home = os.path.abspath('~/.cache/huggingface/hub/') 211 | else: 212 | hf_home = os.path.abspath(os.getenv('HF_HOME')) 213 | path = os.path.join(hf_home, 'peptideclm_tokenizer') 214 | vocab = os.path.join(path, 'new_vocab.txt') 215 | splits = os.path.join(path, 'new_splits.txt') 216 | 217 | if not (os.path.exists(vocab) and os.path.exists(splits)): 218 | os.makedirs(path, exist_ok=True) 219 | try: 220 | url1 = 'https://raw.githubusercontent.com/AaronFeller/PeptideCLM/refs/heads/master/tokenizer/new_vocab.txt' 221 | url2 = 'https://raw.githubusercontent.com/AaronFeller/PeptideCLM/refs/heads/master/tokenizer/new_splits.txt' 222 | request.urlretrieve(url1, vocab) 223 | request.urlretrieve(url2, splits) 224 | except urllib.error.URLError: 225 | raise RuntimeError("Tokenizer could not be downloaded. Please try again later and if the problem persists,", 226 | "raise an issue in on the AutoPeptideML github so that the issue can be", 227 | "investigated: https://github.com/IBM/AutoPeptideML/issues") 228 | self.tokenizer = SMILES_SPE_Tokenizer(vocab_file=vocab, 229 | spe_file=splits) 230 | self.model = AutoModel.from_pretrained(f'{self.lab}/{model}', 231 | trust_remote_code=True) 232 | else: 233 | self.model = AutoModel.from_pretrained(f'{self.lab}/{model}', 234 | trust_remote_code=True) 235 | if 'plusplus' in model.lower(): 236 | self.tokenizer = self.model.tokenizer 237 | else: 238 | self.tokenizer = AutoTokenizer.from_pretrained( 239 | f'{self.lab}/{model}', trust_remote_code=True 240 | ) 241 | 242 | self.dimension = AVAILABLE_MODELS[model] 243 | self.model_name = model 244 | self.model.to(self.device) 245 | 246 | def _preprocess_batch(self, sequences: List[str]) -> List[List[str]]: 247 | """ 248 | Preprocesses a batch of input sequences by adjusting formatting, truncating, and applying special tokens 249 | based on the model type. 
250 | 251 | :type sequences: List[str] 252 | :param sequences: A list of input sequences (e.g., protein sequences in FASTA format). 253 | 254 | :rtype: List[List[str]] 255 | :return: A list of preprocessed sequences. 256 | """ 257 | if self.lab == 'Rostlab': 258 | sequences = [' '.join([char for char in seq]) for seq in sequences] 259 | if self.model_name == 'ProstT5': 260 | sequences = [" " + seq for seq in sequences] 261 | sequences = [seq[:self.max_len()] for seq in sequences] 262 | return sequences 263 | 264 | def _rep_batch( 265 | self, batch: List[str], 266 | ) -> List[np.ndarray]: 267 | """ 268 | Generates representations for a batch of sequences using the loaded pre-trained model. The representations 269 | are extracted from the model's output and returned based on the specified pooling strategy. 270 | 271 | :type batch: List[str] 272 | :param batch: A list of input sequences (e.g., protein sequences in FASTA format). 273 | 274 | :rtype: List[np.ndarray] 275 | :return: A list of numpy arrays representing the embeddings of each input sequence. 276 | """ 277 | inputs = self.tokenizer(batch, add_special_tokens=True, 278 | truncation=True, 279 | padding="longest", return_tensors="pt") 280 | inputs = inputs.to(self.device) 281 | mps_autocast = int(torch.__version__.split('.')[1]) >= 6 282 | autocast = self.fp16 and (self.device == 'cuda' or 283 | (self.device == 'mps' and mps_autocast) or 284 | self.device == 'cpu') 285 | if autocast: 286 | autocast = torch.autocast( 287 | device_type=self.device, 288 | dtype=torch.bfloat16 289 | ) 290 | else: 291 | autocast = nullcontext() 292 | 293 | with torch.no_grad(): 294 | with autocast: 295 | if self.lab == 'ElnaggarLab': 296 | embd_rpr = self.model( 297 | input_ids=inputs['input_ids'], 298 | attention_mask=inputs['attention_mask'], 299 | decoder_input_ids=inputs['input_ids'] 300 | ).last_hidden_state 301 | else: 302 | embd_rpr = self.model(**inputs).last_hidden_state 303 | output = [] 304 | for idx in range(len(batch)): 305 | if self.lab == 'facebook' or self.lab == 'EvolutionaryScale': 306 | initial = 1 307 | final = len(batch[idx]) + 1 308 | elif self.lab == 'RostLab': 309 | initial = 0 310 | final = len(batch[idx].replace(' ', '')) 311 | else: 312 | initial = 0 313 | final = len(batch[idx]) 314 | 315 | if self.average_pooling: 316 | output.append(embd_rpr[idx, initial:final].mean(0).float().detach().cpu().numpy()) 317 | elif self.cls_token: 318 | output.append(embd_rpr[idx, 0].float().detach().cpu().numpy()) 319 | else: 320 | output.append(embd_rpr[idx, initial:final].detach().cpu().numpy()) 321 | 322 | if autocast: 323 | output[-1] = output[-1].astype(np.float16) 324 | return output 325 | -------------------------------------------------------------------------------- /autopeptideml/reps/seq_based.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | import numpy as np 4 | 5 | from .engine import RepEngineBase 6 | 7 | 8 | RESIDUES = { 9 | 'V': 0, 'I': 1, 'L': 2, 'E': 3, 'Q': 4, 10 | 'D': 5, 'N': 6, 'H': 7, 'W': 8, 'F': 9, 11 | 'Y': 10, 'R': 11, 'K': 12, 'S': 13, 'T': 14, 12 | 'M': 15, 'A': 16, 'G': 17, 'P': 18, 'C': 19, 13 | 'X': 20 14 | } 15 | 16 | 17 | class RepEngineOnehot(RepEngineBase): 18 | """ 19 | Class `RepEngineOnehot` is a subclass of `RepEngineBase` that generates one-hot encoded representations 20 | for input sequences. 
This representation is commonly used for tasks in machine learning and bioinformatics, 21 | such as protein sequence classification, where each amino acid in the sequence is represented by a binary vector. 22 | 23 | Attributes: 24 | :type engine: str 25 | :param engine: The name of the engine. Default is `'one-hot'`, indicating one-hot encoding representation. 26 | 27 | :type max_length: int 28 | :param max_length: The maximum length of the input sequences. Sequences longer than this length will be truncated. 29 | 30 | :type name: str 31 | :param name: The name of the representation engine, which is set to `'one-hot'`. 32 | """ 33 | engine = 'one-hot' 34 | 35 | def __init__(self, max_length: int): 36 | """ 37 | Initializes the `RepEngineOnehot` with the specified maximum sequence length. The one-hot encoding will 38 | use this length to determine the size of the output vectors. 39 | 40 | :type max_length: int 41 | :param max_length: The maximum length of the input sequences. Sequences longer than this will be truncated. 42 | 43 | :rtype: None 44 | """ 45 | super().__init__('one-hot', max_length=max_length) 46 | self.max_length = max_length 47 | self.name = f'{self.engine}' 48 | 49 | def _preprocess_batch(self, batch: List[str]): 50 | """ 51 | Preprocesses a batch of input sequences by truncating them to the specified maximum length. 52 | 53 | :type batch: List[str] 54 | :param batch: A list of input sequences (e.g., protein sequences in FASTA format). 55 | 56 | :rtype: List[str] 57 | :return: A list of preprocessed sequences truncated to the maximum length. 58 | """ 59 | return [s[:self.max_length] for s in batch] 60 | 61 | def _rep_batch(self, batch: List[str]) -> np.ndarray: 62 | """ 63 | Converts a batch of input sequences into one-hot encoded representations. Each amino acid in the sequence 64 | is represented by a binary vector where the position corresponding to the amino acid is set to 1, and 65 | all other positions are set to 0. 66 | 67 | :type batch: List[str] 68 | :param batch: A list of input sequences (e.g., protein sequences in FASTA format). 69 | 70 | :rtype: np.ndarray 71 | :return: A 2D numpy array where each row corresponds to a one-hot encoded representation of a sequence. 72 | """ 73 | out = np.zeros((len(batch), self.max_length * len(RESIDUES)), 74 | dtype=np.int8) 75 | for idx, s in enumerate(batch): 76 | for idx2, c in enumerate(s): 77 | out[idx, idx2 * len(RESIDUES) + RESIDUES[c]] = 1 78 | return out 79 | 80 | def dim(self) -> int: 81 | """ 82 | Returns the dimensionality of the one-hot encoded representation, which is the product of the 83 | maximum sequence length and the number of possible amino acids. 84 | 85 | :rtype: int 86 | :return: The dimensionality of the one-hot encoded representation. 
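        For example, with `max_length=50` and the 21 symbols in `RESIDUES`,
        `dim()` returns `21 * 50 = 1050`.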
87 | """ 88 | return int(len(RESIDUES) * self.max_length) 89 | -------------------------------------------------------------------------------- /autopeptideml/reps/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/AutoPeptideML/adb18578f145d1c3a78d6860ad5f7d35c726159a/autopeptideml/reps/utils/__init__.py -------------------------------------------------------------------------------- /autopeptideml/reps/utils/peptideclm_tokenizer.py: -------------------------------------------------------------------------------- 1 | """Code adapted from PeptideCLM 2 | https://github.com/AaronFeller/PeptideCLM 3 | Github repository under MIT license 4 | 5 | Copyright (c) 2024 Aaron Feller 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | """ 25 | import collections 26 | import os 27 | import re 28 | from typing import List, Optional 29 | from transformers import PreTrainedTokenizer 30 | from SmilesPE.tokenizer import SPE_Tokenizer 31 | 32 | 33 | def load_vocab(vocab_file): 34 | """Loads a vocabulary file into a dictionary.""" 35 | vocab = collections.OrderedDict() 36 | with open(vocab_file, "r", encoding="utf-8") as reader: 37 | tokens = reader.readlines() 38 | for index, token in enumerate(tokens): 39 | token = token.rstrip("\n") 40 | vocab[token] = index 41 | return vocab 42 | 43 | 44 | class Atomwise_Tokenizer(object): 45 | """Run atom-level SMILES tokenization""" 46 | 47 | def __init__(self): 48 | """ Constructs a atom-level Tokenizer. 49 | """ 50 | # self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" 51 | self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" 52 | 53 | self.regex = re.compile(self.regex_pattern) 54 | 55 | def tokenize(self, text): 56 | """ Basic Tokenization of a SMILES. 57 | """ 58 | tokens = [token for token in self.regex.findall(text)] 59 | return tokens 60 | 61 | 62 | class SMILES_SPE_Tokenizer(PreTrainedTokenizer): 63 | r""" 64 | Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE). 65 | This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users 66 | should refer to the superclass for more information regarding methods. 
67 | Args: 68 | vocab_file (:obj:`string`): 69 | File containing the vocabulary. 70 | spe_file (:obj:`string`): 71 | File containing the trained SMILES Pair Encoding vocabulary. 72 | unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): 73 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 74 | token instead. 75 | sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): 76 | The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences 77 | for sequence classification or for a text and a question for question answering. 78 | It is also used as the last token of a sequence built with special tokens. 79 | pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): 80 | The token used for padding, for example when batching sequences of different lengths. 81 | cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): 82 | The classifier token which is used when doing sequence classification (classification of the whole 83 | sequence instead of per-token classification). It is the first token of the sequence when built with 84 | special tokens. 85 | mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): 86 | The token used for masking values. This is the token used when training this model with masked language 87 | modeling. This is the token which the model will try to predict. 88 | """ 89 | 90 | def __init__(self, vocab_file, spe_file, 91 | unk_token="[UNK]", 92 | sep_token="[SEP]", 93 | pad_token="[PAD]", 94 | cls_token="[CLS]", 95 | mask_token="[MASK]", 96 | **kwargs): 97 | if not os.path.isfile(vocab_file): 98 | raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file)) 99 | if not os.path.isfile(spe_file): 100 | raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file)) 101 | 102 | self.vocab = load_vocab(vocab_file) 103 | self.spe_vocab = open(spe_file, 'r', encoding='utf-8') 104 | self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) 105 | self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab) 106 | 107 | super().__init__( 108 | unk_token=unk_token, 109 | sep_token=sep_token, 110 | pad_token=pad_token, 111 | cls_token=cls_token, 112 | mask_token=mask_token, 113 | **kwargs) 114 | 115 | @property 116 | def vocab_size(self): 117 | return len(self.vocab) 118 | 119 | def get_vocab(self): 120 | return dict(self.vocab, **self.added_tokens_encoder) 121 | 122 | def _tokenize(self, text): 123 | return self.spe_tokenizer.tokenize(text).split(' ') 124 | 125 | def _convert_token_to_id(self, token): 126 | """ Converts a token (str) in an id using the vocab. """ 127 | return self.vocab.get(token, self.vocab.get(self.unk_token)) 128 | 129 | def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): 130 | text = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) 131 | return self.convert_tokens_to_string(text) 132 | 133 | def _convert_id_to_token(self, index): 134 | """Converts an index (integer) in a token (str) using the vocab.""" 135 | return self.ids_to_tokens.get(index, self.unk_token) 136 | 137 | def convert_tokens_to_string(self, tokens): 138 | """ Converts a sequence of tokens (string) in a single string. 
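        For example (illustration added for clarity), ``['C', 'C', '(=O)']`` becomes ``'C C (=O)'``; any ``' ##'`` continuation markers are stripped in the process.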
""" 139 | out_string = " ".join(tokens).replace(" ##", "").strip() 140 | return out_string 141 | 142 | def build_inputs_with_special_tokens( 143 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 144 | ) -> List[int]: 145 | """ 146 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 147 | by concatenating and adding special tokens. 148 | A BERT sequence has the following format: 149 | - single sequence: ``[CLS] X [SEP]`` 150 | - pair of sequences: ``[CLS] A [SEP] B [SEP]`` 151 | Args: 152 | token_ids_0 (:obj:`List[int]`): 153 | List of IDs to which the special tokens will be added 154 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 155 | Optional second list of IDs for sequence pairs. 156 | Returns: 157 | :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 158 | """ 159 | if token_ids_1 is None: 160 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 161 | cls = [self.cls_token_id] 162 | sep = [self.sep_token_id] 163 | return cls + token_ids_0 + sep + token_ids_1 + sep 164 | 165 | def get_special_tokens_mask( 166 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False 167 | ) -> List[int]: 168 | """ 169 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 170 | special tokens using the tokenizer ``prepare_for_model`` method. 171 | Args: 172 | token_ids_0 (:obj:`List[int]`): 173 | List of ids. 174 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 175 | Optional second list of IDs for sequence pairs. 176 | already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): 177 | Set to True if the token list is already formatted with special tokens for the model 178 | Returns: 179 | :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 180 | """ 181 | 182 | if already_has_special_tokens: 183 | if token_ids_1 is not None: 184 | raise ValueError( 185 | "You should not supply a second sequence if the provided sequence of " 186 | "ids is already formated with special tokens for the model." 187 | ) 188 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 189 | 190 | if token_ids_1 is not None: 191 | return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] 192 | return [1] + ([0] * len(token_ids_0)) + [1] 193 | 194 | def create_token_type_ids_from_sequences( 195 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 196 | ) -> List[int]: 197 | """ 198 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 199 | A BERT sequence pair mask has the following format: 200 | :: 201 | 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 202 | | first sequence | second sequence | 203 | if token_ids_1 is None, only returns the first portion of the mask (0's). 204 | Args: 205 | token_ids_0 (:obj:`List[int]`): 206 | List of ids. 207 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 208 | Optional second list of IDs for sequence pairs. 209 | Returns: 210 | :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given 211 | sequence(s). 
212 | """ 213 | sep = [self.sep_token_id] 214 | cls = [self.cls_token_id] 215 | if token_ids_1 is None: 216 | return len(cls + token_ids_0 + sep) * [0] 217 | return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] 218 | 219 | def save_vocabulary(self, vocab_path): 220 | """ 221 | Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. 222 | Args: 223 | vocab_path (:obj:`str`): 224 | The directory in which to save the vocabulary. 225 | Returns: 226 | :obj:`Tuple(str)`: Paths to the files saved. 227 | """ 228 | index = 0 229 | 230 | vocab_file = vocab_path 231 | with open(vocab_file, "w", encoding="utf-8") as writer: 232 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 233 | if index != token_index: 234 | index = token_index 235 | writer.write(token + "\n") 236 | index += 1 237 | return (vocab_file,) 238 | 239 | class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer): 240 | r""" 241 | Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE). 242 | This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users 243 | should refer to the superclass for more information regarding methods. 244 | Args: 245 | vocab_file (:obj:`string`): 246 | File containing the vocabulary. 247 | unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): 248 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 249 | token instead. 250 | sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): 251 | The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences 252 | for sequence classification or for a text and a question for question answering. 253 | It is also used as the last token of a sequence built with special tokens. 254 | pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): 255 | The token used for padding, for example when batching sequences of different lengths. 256 | cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): 257 | The classifier token which is used when doing sequence classification (classification of the whole 258 | sequence instead of per-token classification). It is the first token of the sequence when built with 259 | special tokens. 260 | mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): 261 | The token used for masking values. This is the token used when training this model with masked language 262 | modeling. This is the token which the model will try to predict. 
263 | """ 264 | 265 | def __init__( 266 | self, 267 | vocab_file, 268 | unk_token="[UNK]", 269 | sep_token="[SEP]", 270 | pad_token="[PAD]", 271 | cls_token="[CLS]", 272 | mask_token="[MASK]", 273 | **kwargs 274 | ): 275 | super().__init__( 276 | unk_token=unk_token, 277 | sep_token=sep_token, 278 | pad_token=pad_token, 279 | cls_token=cls_token, 280 | mask_token=mask_token, 281 | **kwargs, 282 | ) 283 | 284 | if not os.path.isfile(vocab_file): 285 | raise ValueError( 286 | "Can't find a vocabulary file at path '{}'.".format(vocab_file) 287 | ) 288 | self.vocab = load_vocab(vocab_file) 289 | self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) 290 | self.tokenizer = Atomwise_Tokenizer() 291 | 292 | @property 293 | def vocab_size(self): 294 | return len(self.vocab) 295 | 296 | def get_vocab(self): 297 | return dict(self.vocab, **self.added_tokens_encoder) 298 | 299 | def _tokenize(self, text): 300 | return self.tokenizer.tokenize(text) 301 | 302 | def _convert_token_to_id(self, token): 303 | """ Converts a token (str) in an id using the vocab. """ 304 | return self.vocab.get(token, self.vocab.get(self.unk_token)) 305 | 306 | def _convert_id_to_token(self, index): 307 | """Converts an index (integer) in a token (str) using the vocab.""" 308 | return self.ids_to_tokens.get(index, self.unk_token) 309 | 310 | def convert_tokens_to_string(self, tokens): 311 | """ Converts a sequence of tokens (string) in a single string. """ 312 | out_string = " ".join(tokens).replace(" ##", "").strip() 313 | return out_string 314 | 315 | def build_inputs_with_special_tokens( 316 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 317 | ) -> List[int]: 318 | """ 319 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 320 | by concatenating and adding special tokens. 321 | A BERT sequence has the following format: 322 | - single sequence: ``[CLS] X [SEP]`` 323 | - pair of sequences: ``[CLS] A [SEP] B [SEP]`` 324 | Args: 325 | token_ids_0 (:obj:`List[int]`): 326 | List of IDs to which the special tokens will be added 327 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 328 | Optional second list of IDs for sequence pairs. 329 | Returns: 330 | :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 331 | """ 332 | if token_ids_1 is None: 333 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 334 | cls = [self.cls_token_id] 335 | sep = [self.sep_token_id] 336 | return cls + token_ids_0 + sep + token_ids_1 + sep 337 | 338 | def get_special_tokens_mask( 339 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False 340 | ) -> List[int]: 341 | """ 342 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 343 | special tokens using the tokenizer ``prepare_for_model`` method. 344 | Args: 345 | token_ids_0 (:obj:`List[int]`): 346 | List of ids. 347 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 348 | Optional second list of IDs for sequence pairs. 349 | already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): 350 | Set to True if the token list is already formatted with special tokens for the model 351 | Returns: 352 | :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
353 | """ 354 | 355 | if already_has_special_tokens: 356 | if token_ids_1 is not None: 357 | raise ValueError( 358 | "You should not supply a second sequence if the provided sequence of " 359 | "ids is already formated with special tokens for the model." 360 | ) 361 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 362 | 363 | if token_ids_1 is not None: 364 | return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] 365 | return [1] + ([0] * len(token_ids_0)) + [1] 366 | 367 | def create_token_type_ids_from_sequences( 368 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 369 | ) -> List[int]: 370 | """ 371 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 372 | A BERT sequence pair mask has the following format: 373 | :: 374 | 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 375 | | first sequence | second sequence | 376 | if token_ids_1 is None, only returns the first portion of the mask (0's). 377 | Args: 378 | token_ids_0 (:obj:`List[int]`): 379 | List of ids. 380 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 381 | Optional second list of IDs for sequence pairs. 382 | Returns: 383 | :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given 384 | sequence(s). 385 | """ 386 | sep = [self.sep_token_id] 387 | cls = [self.cls_token_id] 388 | if token_ids_1 is None: 389 | return len(cls + token_ids_0 + sep) * [0] 390 | return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] 391 | 392 | def save_vocabulary(self, vocab_path): 393 | """ 394 | Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. 395 | Args: 396 | vocab_path (:obj:`str`): 397 | The directory in which to save the vocabulary. 398 | Returns: 399 | :obj:`Tuple(str)`: Paths to the files saved. 
400 | """ 401 | index = 0 402 | vocab_file = vocab_path 403 | with open(vocab_file, "w", encoding="utf-8") as writer: 404 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 405 | if index != token_index: 406 | index = token_index 407 | writer.write(token + "\n") 408 | index += 1 409 | return (vocab_file,) 410 | -------------------------------------------------------------------------------- /autopeptideml/train/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer import BaseTrainer, OptunaTrainer, GridTrainer, NoHpoTrainer 2 | -------------------------------------------------------------------------------- /autopeptideml/train/architectures.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | 4 | SKLEARN_MODELS = ['knn', 'svm', 'rf', 'adaboost', 'gradboost'] 5 | ALL_MODELS = SKLEARN_MODELS + ['lightgbm', 'xgboost'] 6 | 7 | 8 | def load_sklearn_models(task: str) -> Dict[str, Callable]: 9 | try: 10 | import sklearn as sk 11 | except ImportError: 12 | raise ImportError("This function requires scikit-learn", 13 | "Please try: `pip install scikit-learn`") 14 | 15 | from sklearn import (svm, ensemble, neighbors) 16 | if 'class' in task: 17 | arch = { 18 | 'knn': neighbors.KNeighborsClassifier, 19 | 'svm': svm.SVC, 20 | 'rf': ensemble.RandomForestClassifier, 21 | 'adaboost': ensemble.AdaBoostClassifier, 22 | 'gradboost': ensemble.GradientBoostingClassifier, 23 | 24 | } 25 | elif 'reg' in task: 26 | arch = { 27 | 'knn': neighbors.KNeighborsRegressor, 28 | 'svm': svm.SVR, 29 | 'rf': ensemble.RandomForestRegressor, 30 | 'adaboost': ensemble.AdaBoostRegressor, 31 | 'gradboost': ensemble.GradientBoostingRegressor 32 | } 33 | else: 34 | raise NotImplementedError( 35 | f"Task type: {task} not implemented." 36 | ) 37 | return arch 38 | 39 | 40 | def load_lightgbm(task: str) -> Dict[str, Callable]: 41 | try: 42 | import lightgbm 43 | except ImportError: 44 | raise ImportError("This function requires lightgbm", 45 | "Please try: `pip install lightgbm`") 46 | if 'class' in task: 47 | arch = {'lightgbm': lightgbm.LGBMClassifier} 48 | elif 'reg' in task: 49 | arch = {'lightgbm': lightgbm.LGBMRegressor} 50 | else: 51 | raise NotImplementedError( 52 | f"Task type: {task} not implemented." 53 | ) 54 | return arch 55 | 56 | 57 | def load_xgboost(task: str) -> Dict[str, Callable]: 58 | try: 59 | import xgboost 60 | except ImportError: 61 | raise ImportError("This function requires lightgbm", 62 | "Please try: `pip install lightgbm`") 63 | if 'class' in task: 64 | arch = {'xgboost': xgboost.XGBClassifier} 65 | elif 'reg' in task: 66 | arch = {'xgboost': xgboost.XGBRegressor} 67 | else: 68 | raise NotImplementedError( 69 | f"Task type: {task} not implemented." 
70 | ) 71 | return arch 72 | 73 | 74 | def load_torch(task: str) -> Dict[str, Callable]: 75 | try: 76 | from .deep_learning import Cnn 77 | except ImportError: 78 | raise ImportError("This function requires torch", 79 | "Please try: `pip install torch`") 80 | 81 | return {"cnn": Cnn} 82 | -------------------------------------------------------------------------------- /autopeptideml/train/deep_learning/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Cnn -------------------------------------------------------------------------------- /autopeptideml/train/deep_learning/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code Adapted from the UniDL4BioPep 3 | implementation of their model for PyTorch 4 | in the GitHub Repository: 5 | https://github.com/David-Dingle/UniDL4BioPep_ASL_PyTorch/ 6 | """ 7 | 8 | import torch 9 | from torch.utils.data import Dataset 10 | 11 | 12 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 13 | 14 | 15 | class UniDL4BioPep_Dataset(Dataset): 16 | def __init__(self, x, y): 17 | super().__init__() 18 | self.data = torch.from_numpy(x).float().to(device) 19 | self.labels = torch.from_numpy(y).float().to(device) 20 | 21 | def __len__(self): 22 | return len(self.labels) 23 | 24 | def __getitem__(self, idx): 25 | return self.data[idx], self.labels[idx] 26 | 27 | def get_labels(self): 28 | return self.labels 29 | 30 | def get_data(self): 31 | return self.data 32 | 33 | 34 | class UniDL4BioPep_Inference(Dataset): 35 | def __init__(self, x): 36 | super().__init__() 37 | self.data = torch.from_numpy(x).float().to(device) 38 | 39 | def __len__(self): 40 | return self.data.shape[0] 41 | 42 | def __getitem__(self, index): 43 | return self.data[index] 44 | -------------------------------------------------------------------------------- /autopeptideml/train/deep_learning/loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code Adapted from the UniDL4BioPep 3 | implementation of their model for PyTorch 4 | in the GitHub Repository: 5 | https://github.com/David-Dingle/UniDL4BioPep_ASL_PyTorch/ 6 | """ 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | 12 | class ASLSingleLabel(nn.Module): 13 | ''' 14 | This loss is intended for single-label classification problems 15 | ''' 16 | def __init__(self, gamma_pos=0, gamma_neg=4, eps: float = 0.1, reduction='mean'): 17 | super(ASLSingleLabel, self).__init__() 18 | 19 | self.eps = eps 20 | self.logsoftmax = nn.LogSoftmax(dim=-1) 21 | self.targets_classes = [] 22 | self.gamma_pos = gamma_pos 23 | self.gamma_neg = gamma_neg 24 | self.reduction = reduction 25 | 26 | def forward(self, inputs, target): 27 | ''' 28 | "input" dimensions: - (batch_size,number_classes) 29 | "target" dimensions: - (batch_size) 30 | ''' 31 | num_classes = inputs.size()[-1] 32 | log_preds = self.logsoftmax(inputs) 33 | self.targets_classes = torch.zeros_like(inputs).scatter_(1, target.long().unsqueeze(1), 1) 34 | 35 | # ASL weights 36 | targets = self.targets_classes 37 | anti_targets = 1 - targets 38 | xs_pos = torch.exp(log_preds) 39 | xs_neg = 1 - xs_pos 40 | xs_pos = xs_pos * targets 41 | xs_neg = xs_neg * anti_targets 42 | asymmetric_w = torch.pow(1 - xs_pos - xs_neg, 43 | self.gamma_pos * targets + self.gamma_neg * anti_targets) 44 | log_preds = log_preds * asymmetric_w 45 | 46 | if self.eps > 0: # label smoothing 47 | self.targets_classes = self.targets_classes.mul(1 - 
self.eps).add(self.eps / num_classes)

        # loss calculation
        loss = - self.targets_classes.mul(log_preds)

        loss = loss.sum(dim=-1)
        if self.reduction == 'mean':
            loss = loss.mean()

        return loss
-------------------------------------------------------------------------------- /autopeptideml/train/deep_learning/model.py: --------------------------------------------------------------------------------
"""
Code Adapted from the UniDL4BioPep
implementation of their model for PyTorch
in the GitHub Repository:
https://github.com/David-Dingle/UniDL4BioPep_ASL_PyTorch/
"""

import copy
import os

import numpy as np
import sklearn.metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from ..metrics import evaluate
from .dataset import UniDL4BioPep_Dataset, UniDL4BioPep_Inference
from .loss import ASLSingleLabel

device = 'cuda' if torch.cuda.is_available() else 'cpu'


class BaseModel(nn.Module):
    def get_params(self):
        return self.params

    def predict_proba(
        self,
        x: np.array
    ):
        self.eval()
        x_dataloader = self._prepare_data(x, y=None)
        outputs = []
        for batch in x_dataloader:
            batch = batch.to(self.device_name)
            output = self(batch).cpu().detach().numpy()
            outputs.append(output)

        return np.concatenate(outputs)

    def predict(
        self,
        x: np.array,
        device: str
    ):
        self.to('cpu')
        self.load_state_dict(self.best_model)
        self.to(device)
        self.device_name = device
        if 'reg' not in self.task:
            return (self.predict_proba(x)[:, 1] > 0.5).astype(int)
        else:
            return self.predict_proba(x)

    def evaluate(self, x, y):
        self.eval()
        # The inference dataloader uses a single full-size batch (see _prepare_data),
        # so the report computed on the first batch covers all of x.
        x_dataloader = self._prepare_data(x, y=None)
        for batch in x_dataloader:
            output = self(batch).cpu().detach()
            report = self._scores(output, torch.Tensor(y))
            return report

    def fit(
        self,
        train_x: np.array,
        train_y: np.array,
        valid_x: np.array,
        valid_y: np.array,
        device: str
    ):
        if not os.path.exists(self.logger):
            os.makedirs(self.logger)
        logger_training = os.path.join(self.logger, 'train.log')
        logger_validation = os.path.join(self.logger, 'valid.log')
        logger_checkpoint = os.path.join(self.logger, 'best_chckpt.pt')

        train_set = self._prepare_data(train_x, train_y)
        valid_set = self._prepare_data(valid_x, valid_y)

        self = self.to(device)
        self.device_name = device
        min_valid_loss = float("inf")
        for epoch in range(self.epochs):
            running_loss = 0.0
            train_acc = []
            valid_loss = 0.0
            self.train()
            counter = 0
            for i, (inputs, labels) in enumerate(train_set):
                self.optimizer.zero_grad()
                inputs = inputs.to(device)
                labels = labels.to(device)
                if 'class' in self.task:
                    new_labels = torch.zeros((labels.shape[0], 2))
                    new_labels[labels == 0, 0] = 1
                    new_labels[labels == 1, 1] = 1
                    labels = new_labels.to(device)
                outputs = self(inputs)
                if 'multi' in self.task:
                    # `Tensor.unravel` does not exist; `ravel` flattens the tensors for the loss
                    loss = self.loss(outputs.ravel(), labels.ravel())
                else:
                    loss = self.loss(outputs.float(), labels.float())

                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
                if 'reg' in self.task:
                    train_acc.append(self._scores(outputs.to("cpu"), labels.to("cpu"))["mse"])
                else:
                    train_acc.append(self._scores(outputs.to("cpu"), labels.to("cpu"))["f1_weighted"])

            self.eval()
            acc = 0
            for j, (valid_inputs, valid_labels) in enumerate(valid_set):
                valid_labels = valid_labels.to(device)
                valid_inputs = valid_inputs.to(device)
                if 'class' in self.task:
                    # Mirror the one-hot encoding applied to the training labels
                    new_labels = torch.zeros((valid_labels.shape[0], 2))
                    new_labels[valid_labels == 0, 0] = 1
                    new_labels[valid_labels == 1, 1] = 1
                    valid_labels = new_labels.to(device)
                with torch.no_grad():
                    valid_outputs = self(valid_inputs)
                # Use the validation tensors here, not the last training batch
                if 'multi' in self.task:
                    valid_loss = self.loss(valid_outputs.ravel(), valid_labels.ravel())
                else:
                    valid_loss = self.loss(valid_outputs.float(), valid_labels.float())

                if 'reg' in self.task:
                    acc = self._scores(valid_outputs.to('cpu'), valid_labels.to('cpu'))["mse"]
                else:
                    acc = self._scores(valid_outputs.to('cpu'), valid_labels.to('cpu'))["f1_weighted"]

            if valid_loss < min_valid_loss:
                min_valid_loss = valid_loss
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                }, logger_checkpoint)
                self.cpu()
                self.best_model = copy.deepcopy(self.state_dict())
                self.to(device)

    def _get_confusion_matrix(self, y_pred: torch.Tensor, y_test: torch.Tensor):
        predictions = torch.argmax(y_pred, dim=-1).numpy()
        labels = torch.argmax(y_test, dim=-1).numpy()  # A:0, B:1, C:2, [D:3]
        confusion_matrix = sklearn.metrics.confusion_matrix(labels, predictions)
        return confusion_matrix

    def _scores(self, y_pred: torch.Tensor, y_test: torch.Tensor):
        y_pred = y_pred.detach()
        if 'reg' in self.task:
            # For regression, score the raw outputs instead of an argmax over them
            predictions = y_pred.squeeze(-1).numpy()
        else:
            predictions = torch.argmax(y_pred, dim=-1).numpy()
        labels = y_test.numpy()
        if labels.ndim > 1:
            # Collapse one-hot encoded labels back to class indices
            labels = labels.argmax(-1)
        task = 'reg' if 'reg' in self.task else 'class'
        return evaluate(predictions, labels, task)

    def _prepare_data(self, x, y, shuffle: bool = False):
        if y is None:
            dataset = UniDL4BioPep_Inference(x)
        else:
            dataset = UniDL4BioPep_Dataset(x, y)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=x.shape[0] if y is None else 64, shuffle=shuffle)
        return dataloader

    def _get_optimizer(self, optim_algorithm: str = 'adam', lr: float = 0.0001, weight_decay: float = 0):
        OPTIMIZERS = {
            'adam': torch.optim.Adam
        }
        return OPTIMIZERS[optim_algorithm](self.parameters(), lr=lr, weight_decay=weight_decay)

    def _get_criteria(self, **kwargs):
        return ASLSingleLabel(**kwargs)


class Cnn(BaseModel):
    """
    CNN model
    """
    def __init__(
        self,
        optimizer: dict,
        logger: str,
        labels: int,
        task: str,
        epochs: int = 200,
    ):
        super().__init__()
        self.output_dim = labels
        self.input_dim = 320
        self.dropout = 0.3
        self.stride = 2
        self.kernel_1 = 3
        self.channel_1 = 32

        self.conv_1 = nn.Conv1d(kernel_size=self.kernel_1,
                                out_channels=self.channel_1,
                                in_channels=1, stride=1)
        self.normalizer_1 = nn.BatchNorm1d(self.channel_1)
        self.pooling_1 = nn.MaxPool1d(kernel_size=self.kernel_1,
                                      stride=self.stride)

        self.dropout = nn.Dropout(p=self.dropout)
        self.fc1 = nn.LazyLinear(128)
        self.normalizer_2 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, self.output_dim)
        self.device_name = 'cpu'
        self.epochs = epochs
        self.optimizer = self._get_optimizer(**optimizer)
        # self.criteria = self._get_criteria(**criteria)
        self.logger = logger
        if 'multi' in task:
            self.loss = nn.BCELoss()
        elif 'class' in task:
            self.loss = nn.CrossEntropyLoss()
else: 213 | self.loss = nn.MSELoss() 214 | self.task = task 215 | self.params = { 216 | 'epochs': self.epochs, 217 | 'optimizer': optimizer, 218 | } 219 | 220 | def forward(self, x): 221 | x = torch.unsqueeze(x, dim=1) # (batch, embedding_dim) -> (batch, 1, embedding_dim) 222 | x = self.conv_1(x) 223 | if x.shape[0] > 1: 224 | x = self.normalizer_1(x) 225 | c_1 = self.pooling_1(F.relu(x)) 226 | 227 | c_2 = torch.flatten(c_1, start_dim=1) 228 | c_2 = self.dropout(c_2) 229 | c_2 = self.fc1(c_2) 230 | if x.shape[0] > 1: 231 | c_2 = self.normalizer_2(c_2) 232 | out = F.relu(c_2) 233 | out = self.fc2(out) 234 | if 'class' in self.task or 'multi' in self.task: 235 | out = torch.softmax(out, dim=-1) 236 | return out 237 | 238 | 239 | class MLP(BaseModel, nn.Module): 240 | def __init__( 241 | self, 242 | optimizer: dict, 243 | logger: str, 244 | labels: int, 245 | task: str, 246 | epochs: int = 200, 247 | ): 248 | super().__init__() 249 | self.output_dim = labels 250 | self.input_dim = 320 251 | self.dropout = 0.3 252 | self.stride = 2 253 | self.kernel_1 = 3 254 | self.channel_1 = 32 255 | 256 | self.mlp = nn.Sequential( 257 | nn.LazyLinear(self.input_dim), 258 | nn.LeakyReLU(), 259 | nn.Linear(self.input_dim, self.input_dim), 260 | nn.LeakyReLU(), 261 | nn.Linear(self.input_dim, self.output_dim) 262 | ) 263 | self.device_name = 'cpu' 264 | self.epochs = epochs 265 | self.optimizer = self._get_optimizer(**optimizer) 266 | # self.criteria = self._get_criteria(**criteria) 267 | self.logger = logger 268 | if 'multi' in task: 269 | self.loss = nn.BCELoss() 270 | elif 'class' in task: 271 | self.loss = nn.CrossEntropyLoss() 272 | else: 273 | self.loss = nn.MSELoss() 274 | self.task = task 275 | self.params = { 276 | 'epochs': self.epochs, 277 | 'optimizer': optimizer, 278 | } 279 | 280 | def forward(self, x): 281 | # x = torch.unsqueeze(x, dim=1) # (batch, embedding_dim) -> (batch, 1, embedding_dim) 282 | out = self.mlp(x) 283 | if 'class' in self.task or 'multi' in self.task: 284 | out = torch.softmax(out, dim=-1) 285 | # else: 286 | # out = out.squeeze(1) 287 | return out 288 | -------------------------------------------------------------------------------- /autopeptideml/train/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | import numpy as np 3 | from scipy.stats import pearsonr, spearmanr 4 | from sklearn.metrics import (matthews_corrcoef, 5 | accuracy_score, f1_score, 6 | precision_score, recall_score, mean_squared_error, 7 | mean_absolute_error, roc_auc_score) 8 | 9 | 10 | def _pcc(preds, truths): 11 | return pearsonr(preds, truths)[0] 12 | 13 | 14 | def _spcc(preds, truths): 15 | return spearmanr(preds, truths)[0] 16 | 17 | 18 | def _f1_weighted(preds, truths): 19 | return f1_score(preds, truths, average='weighted') 20 | 21 | 22 | def _recall(preds, truths): 23 | return recall_score(preds, truths, zero_division=True) 24 | 25 | 26 | CLASSIFICATION_METRICS = { 27 | 'mcc': matthews_corrcoef, 28 | 'acc': accuracy_score, 29 | 'f1': f1_score, 30 | 'f1_weighted': _f1_weighted, 31 | 'precision': precision_score, 32 | 'recall': _recall, 33 | 'auroc': roc_auc_score 34 | } 35 | 36 | REGRESSION_METRICS = { 37 | 'mse': mean_squared_error, 38 | 'mae': mean_absolute_error, 39 | 'pcc': _pcc, 40 | 'spcc': _spcc 41 | } 42 | 43 | 44 | def evaluate(preds, truth, pred_task) -> Dict[str, float]: 45 | result = {} 46 | if pred_task == 'reg': 47 | metrics = REGRESSION_METRICS 48 | else: 49 | preds = preds > 0.5 50 | metrics = 
CLASSIFICATION_METRICS

    for key, value in metrics.items():
        try:
            # sklearn metric signatures are (y_true, y_pred)
            result[key] = value(truth, preds)
        except ValueError:
            result[key] = np.nan
    return result
-------------------------------------------------------------------------------- /docs/autopeptideml.md: --------------------------------------------------------------------------------
# Class AutoPeptideML

## Overview

`AutoPeptideML` is a configurable machine learning workflow class designed for peptide modeling. It integrates data pipelines, representations, model training (with HPO), evaluation, and export.

---

## Class: `AutoPeptideML`

### Constructor

```python
AutoPeptideML(config: dict)
```

* Initializes the AutoPeptideML workflow with a provided configuration dictionary.
* Creates output directories and stores pipeline, representation, training, and database settings.

---

### Public Methods

#### `get_pipeline`

```python
get_pipeline(pipe_config: Optional[dict] = None) -> Pipeline
```

Load or construct the preprocessing pipeline.

#### `get_database`

```python
get_database(db_config: Optional[dict] = None) -> Database
```

Create or load the peptide database with optional negative data support.

#### `get_reps`

```python
get_reps(rep_config: Optional[dict] = None) -> Tuple[Dict[str, RepEngineBase], Dict[str, np.ndarray]]
```

Load or compute representations for the data.

#### `get_test`

```python
get_test(test_config: Optional[Dict] = None) -> HestiaGenerator
```

Partition the dataset into training/validation/test using `HestiaGenerator`.

#### `get_train`

```python
get_train(train_config: Optional[Dict] = None) -> BaseTrainer
```

Load and return the trainer based on the configuration (supports Optuna and Grid).

#### `run_hpo`

```python
run_hpo() -> Dict
```

Perform hyperparameter optimization across dataset partitions.

#### `run_evaluation`

```python
run_evaluation(models) -> pd.DataFrame
```

Run evaluation on the trained models and return a DataFrame of results.

#### `save_experiment`

```python
save_experiment(model_backend: str = 'onnx', save_reps: bool = False, save_test: bool = True, save_all_models: bool = True)
```

Save the full experiment including models, test partitions, and configuration.

#### `save_database`

```python
save_database()
```

Export the database to CSV.

#### `save_models`

```python
save_models(ensemble_path: str, backend: str = 'onnx', save_all: bool = True)
```

Save models using `onnx` or `joblib` backends.

#### `save_reps`

```python
save_reps(rep_dir: str)
```

Save precomputed representations to disk.

#### `predict`

```python
predict(df: pd.DataFrame, feature_field: str, experiment_dir: str, backend: str = 'onnx') -> np.ndarray
```

Load a saved experiment and predict using the trained ensemble on new data.
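A minimal sketch of scoring new data with a previously saved experiment (the CSV path and the `sequence` column name are illustrative placeholders, not part of the API):

```python
import pandas as pd

# `runner` is an AutoPeptideML instance built as in the example usage below.
new_df = pd.read_csv('new_peptides.csv')  # hypothetical input file
predictions = runner.predict(
    new_df,
    feature_field='sequence',     # column holding the peptide sequences
    experiment_dir='outputdir',   # directory written by `save_experiment`
    backend='onnx'
)
```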

---

### Configuration Keys

The `config` dictionary passed to the constructor must include the following keys:

* `outputdir`: str
* `pipeline`: dict or str
* `representation`: dict or str
* `train`: dict or str
* `databases`: dict
* `test`: dict

---

### Dependencies

* pandas, numpy
* yaml, json
* hestia
* sklearn
* skl2onnx, onnxmltools, joblib (optional)

---

## Example Usage

```python
import yaml

from autopeptideml.autopeptideml import AutoPeptideML

config = yaml.safe_load(open('config.yml'))
runner = AutoPeptideML(config)
pipeline = runner.get_pipeline()
db = runner.get_database()
reps, x = runner.get_reps()
test = runner.get_test()
trainer = runner.get_train()
models = runner.run_hpo()
evaluation = runner.run_evaluation(models)
runner.save_experiment()
```

---

For detailed config templates and supported options, see the corresponding YAML schema documentation.
-------------------------------------------------------------------------------- /docs/imgs/APML_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/AutoPeptideML/adb18578f145d1c3a78d6860ad5f7d35c726159a/docs/imgs/APML_dark.png -------------------------------------------------------------------------------- /docs/imgs/APML_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/AutoPeptideML/adb18578f145d1c3a78d6860ad5f7d35c726159a/docs/imgs/APML_light.png -------------------------------------------------------------------------------- /docs/index.md: --------------------------------------------------------------------------------
{%
   include-markdown "../README.md"
%}
-------------------------------------------------------------------------------- /docs/repenginebase.md: --------------------------------------------------------------------------------


# `RepEngineBase` Class Documentation

**Module:** `rep_engine_base`

## Purpose
`RepEngineBase` is an abstract base class for molecular representation engines. It defines a standard interface and utilities for computing molecular representations from a list of molecules (e.g., SMILES strings), particularly in batched processing. This class is intended to be subclassed, with core functionality like preprocessing and representation computation implemented in derived classes.

---

## Attributes

- **`engine`** (`str`):
  Name of the representation engine. Typically defined in a subclass or passed during instantiation.

- **`rep`** (`str`):
  Type of molecular representation (e.g., `'fingerprint'`, `'embedding'`).

- **`properties`** (`dict`):
  A deep copy of the instance's dictionary at initialization. Captures configuration state.

---

## Constructor

```python
def __init__(self, rep: str, **args)
```

**Parameters:**
- `rep` (`str`): Type of molecular representation.
- `**args` (`dict`): Additional configuration options stored as attributes.

**Effect:**
Initializes the object, stores `rep`, and adds all additional keyword arguments to the instance.
Also creates a deep copy of all these attributes in `self.properties` for serialization. 37 | 38 | --- 39 | 40 | ## Public Methods 41 | 42 | ### `compute_reps` 43 | 44 | ```python 45 | def compute_reps(self, mols: List[str], verbose: Optional[bool] = False, batch_size: Optional[int] = 12) -> Union[np.ndarray, List[np.ndarray]] 46 | ``` 47 | 48 | **Description:** 49 | Computes molecular representations in batches using `_preprocess_batch` and `_rep_batch`. 50 | 51 | **Parameters:** 52 | - `mols` (`List[str]`): List of molecular inputs (e.g., SMILES strings). 53 | - `verbose` (`bool`, optional): If `True`, shows a progress bar. 54 | - `batch_size` (`int`, optional): Number of molecules per batch. 55 | 56 | **Returns:** 57 | - `np.ndarray` if `average_pooling` is `True` or unset. 58 | - `List[np.ndarray]` if `average_pooling` is explicitly set to `False`. 59 | 60 | --- 61 | 62 | ### `dim` 63 | 64 | ```python 65 | def dim(self) -> int 66 | ``` 67 | 68 | **Description:** 69 | Abstract method. Must return the dimensionality of the computed representation. 70 | 71 | **Raises:** 72 | - `NotImplementedError` 73 | 74 | --- 75 | 76 | ### `_rep_batch` 77 | 78 | ```python 79 | def _rep_batch(self, batch: List[str]) -> np.ndarray 80 | ``` 81 | 82 | **Description:** 83 | Abstract method. Must compute and return the representation for a batch of molecules. 84 | 85 | **Raises:** 86 | - `NotImplementedError` 87 | 88 | --- 89 | 90 | ### `_preprocess_batch` 91 | 92 | ```python 93 | def _preprocess_batch(self, batch: List[str]) -> List[str] 94 | ``` 95 | 96 | **Description:** 97 | Abstract method. Must return a preprocessed version of the batch for representation. 98 | 99 | **Raises:** 100 | - `NotImplementedError` 101 | 102 | --- 103 | 104 | ### `save` 105 | 106 | ```python 107 | def save(self, filename: str) 108 | ``` 109 | 110 | **Description:** 111 | Serializes and saves the engine’s properties to a YAML file. 112 | 113 | **Parameters:** 114 | - `filename` (`str`): Destination path for the YAML file. 115 | 116 | ## Design Notes 117 | 118 | - This class provides **batch processing** support and optional **average pooling** control. 119 | - The use of `batched` from `itertools` supports Python 3.10+ but also includes a fallback implementation for older versions. 120 | - Intended for extension: Subclasses must implement `_rep_batch`, `_preprocess_batch`, and `dim`. 121 | -------------------------------------------------------------------------------- /docs/repenginefp.md: -------------------------------------------------------------------------------- 1 | # RepEngineFP 2 | 3 | ::: autopeptideml.reps.fps.RepEngineFP 4 | 5 | -------------------------------------------------------------------------------- /docs/repenginelm.md: -------------------------------------------------------------------------------- 1 | # RepEngineLM 2 | 3 | ::: autopeptideml.reps.lms.RepEngineLM 4 | 5 | -------------------------------------------------------------------------------- /docs/repengineseqbased.md: -------------------------------------------------------------------------------- 1 | # RepEngineOneHotEncoding 2 | 3 | ::: autopeptideml.reps.seq_based.RepEngineOnehot 4 | 5 | -------------------------------------------------------------------------------- /examples/API_docs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AutoPeptideML API Python\n", 8 | "\n", 9 | "## 1. 
Introduction\n", 10 | "\n", 11 | "The functionalities of AutoPeptideML Python API is focused in a single class, `AutoPeptideML`. Initialization of the class includes 3 possible arguments:\n", 12 | "\n", 13 | "- `verbose`: boolean value. Default: `True`.\n", 14 | "- `threads`: number of threads to use for multithreading. By default it uses all available CPU cores.\n", 15 | "- `seed`: seed for pseudo-random number generator for all stochastic processes. Default: `42`." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "vscode": { 23 | "languageId": "plaintext" 24 | } 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "from autopeptideml.autopeptideml import AutoPeptideML\n", 29 | "\n", 30 | "apml = AutoPeptideML(\n", 31 | " verbose=True,\n", 32 | " threads=8,\n", 33 | " seed=42\n", 34 | ")" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## 2. Dataset preparation\n", 42 | "\n", 43 | "There are 3 methods to handle dataset preparation:\n", 44 | "\n", 45 | "- `autosearch_negatives`: Searches for negative bioactive peptides\n", 46 | " - `df_pos`: `pd.DataFrame` with positive samples\n", 47 | " - `positive_tags`: `List[str]` with all bioactivities that may overlap with the positive class\n", 48 | " - `proportion`: `float` number. Target negative:positive ratio. Default: `1.0`.\n", 49 | "- `balance_samples`: Balances labels in the dataset by oversampling the underepresented classes.\n", 50 | " - `df`: `pd.DataFrame`. Dataframe with `Y` column, for which labels will be balanced.\n", 51 | "- `curate_dataset`: Load the dataset, remove non-canonical and empty sequences.\n", 52 | " - `dataset`: `Union[str, pd.DataFrame]`. The input can be either the path to a `.fasta`, `.csv`, or `.tsv` file or a `pd.DataFrame`.\n", 53 | " - `outputdir`: `str`. Path to a directory where to save the curated dataset.\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "vscode": { 61 | "languageId": "plaintext" 62 | } 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# Dataset curation\n", 67 | "df_negs = apml.curate_dataset(\n", 68 | " dataset='example_dataset_with_negatives.fasta',\n", 69 | " output='output_dir'\n", 70 | ")\n", 71 | "df_pos = apml.curate_dataset(\n", 72 | " dataset='example_dataset_with_positives.fasta',\n", 73 | " output='output_dir_2'\n", 74 | ")\n", 75 | "\n", 76 | "# Balance samples_to_draw (only if df contains negative samples)\n", 77 | "df_negs_balanced = apml.balance_samples(df_negs)\n", 78 | "\n", 79 | "# Autosearch for negatives\n", 80 | "df = apml.autosearch_negatives(\n", 81 | " df_pos=df_pos,\n", 82 | " positive_tags=['Neuropeptides'],\n", 83 | " proportion=1.0\n", 84 | ")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## 3. Dataset partitioning\n", 92 | "\n", 93 | "There are two steps of dataset partitioning: training/evaluation and training/validation folds.\n", 94 | "\n", 95 | "- `train_test_partition`: Creates training/evaluation sets using novel homology partitioning algorithm\n", 96 | " - `df`: `pd.DataFrame`\n", 97 | " - `threshold`: `float`. Maximum sequence identity value between sequences in training and evaluation sets. Default: `0.3`\n", 98 | " - `test_size`: `float`. Proportion of samples that should comprise the evaluation set. Default: `0.2`\n", 99 | " - `alignment`: `str`. Alignment method to be used. Options: `needle`, `mmseqs` and `mmseqs+prefilter`. 
Default: `mmseqs+prefilter`\n", 100 | " - `outputdir`: `str`. Path to a directory where to save the generated datasets.\n", 101 | "- `train_val_partition`: Creates n training/validation folds\n", 102 | " - `df`: `pd.DataFrame`. Should be the training dataset generated with the previous step.\n", 103 | " - `method`: `str`. Method for partitioning. Options: `random` and `graph-part`. `random` refers to `StratifiedKFold` from `sklearn.model_selection` and `graph-part` to `stratified_k_fold` from the GraphPart algorithm. For more details see the [Project Github Repository](https://github.com/graph-part/graph-part).\n", 104 | " - `threshold`: `float`. Maximum sequence identity value between sequences in training and valdation folds. Only valid if method is `graph-part`. Default: `0.5`.\n", 105 | " - `alignment`: `str`. Alignment method to be used. Options: `needle`, `mmseqs` and `mmseqs+prefilter`. Only valid if method is `graph-part`. Default: `mmseqs+prefilter`.\n", 106 | " - `n_folds`: `int`. Number of folds to be generated. Default: `10`.\n", 107 | " - `outputdir`: `str`. Path to a directory where to save the generated datasets.\n", 108 | " " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "vscode": { 116 | "languageId": "plaintext" 117 | } 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "datasets = apml.train_test_partition(\n", 122 | " df=df,\n", 123 | " threshold=0.3,\n", 124 | " test_size=0.2,\n", 125 | " alignment='mmseqs+prefilter',\n", 126 | " outputdir='outputdir/splits'\n", 127 | ")\n", 128 | "folds = apml.train_val_partition(\n", 129 | " df=datasets['train'],\n", 130 | " method='random',\n", 131 | " n_folds=10,\n", 132 | " outputdir='outputdir/folds'\n", 133 | ")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## 4. Peptide Representation\n", 141 | "\n", 142 | "The Peptide Representation step requires an additional class within the AutoPeptideML package, `RepresentationEngine`, that loads the Protein Language Model (PLM) of choice.\n", 143 | "\n", 144 | "- `RepresentationEngine`:\n", 145 | " - `model`: `str`. Protein Language Model, see Github Repo `README.md` file. Default: `esm2-8m`\n", 146 | " - `batch_size`: Number of peptide sequences to compute in each batch, depends on the RAM memory either in the CPU or the GPU. Default: `64`.\n", 147 | "- `AutoPeptideML`:\n", 148 | " - `compute_representation`: Uses the `RepresentationEngine` class to compute the representations in the dataset.\n", 149 | " - `datasets`: `Dict[str, pd.DataFrame]` dictionary with the dataset partitions\n", 150 | " - `re`: `RepresentationEngine`\n", 151 | "\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "vscode": { 159 | "languageId": "plaintext" 160 | } 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "from autopeptideml.utils.embeddings import RepresentationEngine\n", 165 | "\n", 166 | "re = RepresentationEngine(\n", 167 | " model='esm2-8m',\n", 168 | " batch_size=64\n", 169 | ")\n", 170 | "id2rep = apml.compute_representations(\n", 171 | " datasets=datasets,\n", 172 | " re=re\n", 173 | ")" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## 5. Hyperparameter Optimisation and Model Training\n", 181 | "\n", 182 | "- `hpo_train`\n", 183 | " - `config`: `dict`. 
`JSON` file with the hyperparameter search space; for examples of the format, please refer to the files in `autopeptideml/data/configs`.\n",
   "  - `train_df`: `pd.DataFrame` with the training dataset.\n",
   "  - `id2rep`: `dict`. Result from running `apml.compute_representations`.\n",
   "  - `folds`: `list`. List of training/validation folds.\n",
   "  - `outputdir`: `str`. Path to a directory where to save the results."
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
   "vscode": {
    "languageId": "plaintext"
   }
  },
  "outputs": [],
  "source": [
   "import json\n",
   "\n",
   "model = apml.hpo_train(\n",
   "    config=json.load(open('../autopeptideml/data/config/default_config.json')),\n",
   "    train_df=datasets['train'],\n",
   "    id2rep=id2rep,\n",
   "    folds=folds,\n",
   "    outputdir='outputdir/ensemble'\n",
   ")"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "## 6. Ensemble Evaluation\n",
   "\n",
   "- `evaluate_model`\n",
   "  - `best_model`: Ensemble generated in the previous step.\n",
   "  - `test_df`: `pd.DataFrame` with the evaluation set.\n",
   "  - `id2rep`: `dict`. Representations generated in Step 4.\n",
   "  - `outputdir`: `str`.\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
   "vscode": {
    "languageId": "plaintext"
   }
  },
  "outputs": [],
  "source": [
   "results = apml.evaluate_model(\n",
   "    best_model=model,\n",
   "    test_df=datasets['test'],\n",
   "    id2rep=id2rep,\n",
   "    outputdir='outputdir/results'\n",
   ")"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "## 7. 
Prediction\n", 245 | "\n", 246 | "- `predict`: Predict the bioactivity of a set of peptide sequences given an ensemble already trained.\n", 247 | " - `df`: `pd.DataFrame` with the peptide sequences.\n", 248 | " - `re`: `RepresentationEngine`\n", 249 | " - `ensemble_path`: Path where the ensemble files were saved.\n", 250 | " - `outputdir`" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "vscode": { 258 | "languageId": "plaintext" 259 | } 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "apml.predict(\n", 264 | " df=pd.read_csv('New_samples.csv'),\n", 265 | " re=re,\n", 266 | " ensemble_path='outputdir/ensemble',\n", 267 | " outputdir='prediction'\n", 268 | ")" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "language_info": { 274 | "name": "python" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | -------------------------------------------------------------------------------- /examples/AutoPeptideML_Collab.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"private_outputs":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["# @title Install AutoPeptideML\n","%%capture\n","!pip install autopeptideml\n","!wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz\n","!tar xvfz /content/mmseqs-linux-avx2.tar.gz\n","!cp /content/mmseqs/bin/mmseqs /bin/\n","%env mmseqs=/bin/mmseqs\n"],"metadata":{"id":"Ssp28JzPWsWD"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"id":"jRIN9Z3jxtnp","cellView":"form"},"outputs":[],"source":["# @title Import AutoPeptideML\n","import pandas as pd\n","\n","from autopeptideml import AutoPeptideML, RepresentationEngine\n","from autopeptideml.utils.embeddings import AVAILABLE_MODELS, SYNONYMS\n","\n","apml = AutoPeptideML(verbose=True)"]},{"cell_type":"markdown","source":["# Hyperparameter Space\n","\n","You can define the hyperparameter search space for a single model (`hpo_single`), for an ensemble of models (`hpo_ensemble`), the UniDL4BioPep architecture can also be used. Both options are added below and you can execute whichever you prefer to use. 
The search spaces can be modified at will, more information in the project documentation: https://ibm.github.io/AutoPeptideML/."],"metadata":{"id":"LLhFmK9NmEaw"}},{"cell_type":"code","source":["# @title HPO single (model selection)\n","hpo_space = {\n"," \"trials\": 100,\n"," \"model_selection\": [\n","{\n"," \"model\": \"K-Nearest Neighbours\",\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"n_neighbors\",\n"," \"type\": \"int\",\n"," \"min\": 1,\n"," \"max\": 30,\n"," \"log\": \"False\"\n"," },\n"," {\n"," \"name\": \"weights\",\n"," \"type\": \"categorical\",\n"," \"values\": [\"uniform\", \"distance\"]\n"," }\n"," ]\n"," },\n"," # {\n"," # \"model\": \"mlp\",\n"," # \"optimization_metric\": \"test_matthews_corrcoef\",\n"," # \"hyperparameter-space\": [\n"," # {\n"," # \"name\": \"learning_rate\",\n"," # \"type\": \"float\",\n"," # \"min\": 1e-7,\n"," # \"max\": 1,\n"," # \"log\": \"True\"\n"," # },\n"," # {\n"," # \"name\": \"activation\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"identity\", \"logistic\", \"tanh\", \"relu\"]\n"," # },\n"," # {\n"," # \"name\": \"solver\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"adam\", \"sgd\"]\n"," # },\n"," # {\n"," # \"name\": \"hidden_layer_sizes\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [[12, 12], [120, 120], [12, 12, 12], [120, 120, 120], [12, 12, 12, 12]]\n"," # }\n"," # ]\n"," # },\n"," # {\n"," # \"model\": \"XGBoost\",\n"," # \"optimization_metric\": \"test_matthews_corrcoef\",\n"," # \"hyperparameter-space\": [\n"," # {\n"," # \"name\": \"learning_rate\",\n"," # \"type\": \"float\",\n"," # \"min\": 1e-5,\n"," # \"max\": 1,\n"," # \"log\": \"True\"\n"," # },\n"," # {\n"," # \"name\": \"n_estimators\",\n"," # \"type\": \"int\",\n"," # \"min\": 1,\n"," # \"max\": 100,\n"," # \"log\": \"False\"\n"," # },\n"," # {\n"," # \"name\": \"max_depth\",\n"," # \"type\": \"int\",\n"," # \"min\": 1,\n"," # \"max\": 10,\n"," # \"log\": \"False\"\n"," # },\n"," # ]\n"," # },\n"," {\n"," \"model\": \"RFC\",\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"max_depth\",\n"," \"type\": \"int\",\n"," \"min\": 2,\n"," \"max\": 20,\n"," \"log\": \"False\"\n"," },\n"," {\n"," \"name\": \"n_estimators\",\n"," \"type\": \"int\",\n"," \"min\": 10,\n"," \"max\": 100,\n"," \"log\": \"False\"\n"," }\n"," ]\n"," },\n"," {\n"," \"model\": \"LightGBM\",\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"max_depth\",\n"," \"type\": \"int\",\n"," \"min\": 1,\n"," \"max\": 30,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"num_leaves\",\n"," \"type\": \"int\",\n"," \"min\": 5,\n"," \"max\": 50,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"learning_rate\",\n"," \"type\": \"float\",\n"," \"min\": 0.001,\n"," \"max\": 0.3,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"verbose\",\n"," \"type\": \"fixed\",\n"," \"value\": -1\n"," }\n"," ]\n"," }\n"," ]\n","}"],"metadata":{"id":"RPtMhCzvlW1D","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title HPO UniDL4BioPep\n","hpo_space = {\n"," \"ensemble\":\n"," [\n"," {\n"," \"model\": \"unidl4biopep\",\n"," \"trials\": 100,\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": {\n"," \"epochs\": 20,\n"," \"optimizer\": [\n"," {\n"," \"name\": \"lr\",\n"," \"type\": \"float\",\n"," \"min\": 1e-7,\n"," 
\"max\": 0.1\n"," }\n"," ]\n"," }\n"," }\n"," ]\n","}"],"metadata":{"cellView":"form","id":"vOEdRcfknX2H"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title HPO ensemble\n","hpo_space = {\n"," \"ensemble\": [\n"," {\n"," \"model\": \"K-Nearest Neighbours\",\n"," \"trials\": 30,\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"n_neighbors\",\n"," \"type\": \"int\",\n"," \"min\": 1,\n"," \"max\": 30,\n"," \"log\": \"False\"\n"," },\n"," {\n"," \"name\": \"weights\",\n"," \"type\": \"categorical\",\n"," \"values\": [\"uniform\", \"distance\"]\n"," }\n"," ]\n"," },\n"," # {\n"," # \"model\": \"mlp\",\n"," # \"trials\": 30,\n"," # \"optimization_metric\": \"test_matthews_corrcoef\",\n"," # \"hyperparameter-space\": [\n"," # {\n"," # \"name\": \"learning_rate\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"constant\", \"invscaling\", \"adaptive\"]\n"," # },\n"," # {\n"," # \"name\": \"activation\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"identity\", \"logistic\", \"tanh\", \"relu\"]\n"," # },\n"," # {\n"," # \"name\": \"learning_rate_init\",\n"," # \"type\": \"float\",\n"," # \"min\": 1e-7,\n"," # \"max\": 1e-1,\n"," # \"log\": True\n"," # },\n"," # {\n"," # \"name\": \"solver\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"adam\", \"sgd\"]\n"," # },\n"," # {\n"," # \"name\": \"hidden_layer_sizes\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [[12, 12], [120, 120], [12, 12, 12], [120, 120, 120], [12, 12, 12, 12]]\n"," # }\n"," # ]\n"," # },\n"," # {\n"," # \"model\": \"XGBoost\",\n"," # \"trials\": 30,\n"," # \"optimization_metric\": \"test_matthews_corrcoef\",\n"," # \"hyperparameter-space\": [\n"," # {\n"," # \"name\": \"learning_rate\",\n"," # \"type\": \"float\",\n"," # \"min\": 1e-5,\n"," # \"max\": 1,\n"," # \"log\": \"True\"\n"," # },\n"," # {\n"," # \"name\": \"n_estimators\",\n"," # \"type\": \"int\",\n"," # \"min\": 1,\n"," # \"max\": 100,\n"," # \"log\": \"False\"\n"," # },\n"," # {\n"," # \"name\": \"max_depth\",\n"," # \"type\": \"int\",\n"," # \"min\": 1,\n"," # \"max\": 10,\n"," # \"log\": \"False\"\n"," # },\n"," # ]\n"," # },\n"," {\n"," \"model\": \"RFC\",\n"," \"trials\": 30,\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"max_depth\",\n"," \"type\": \"int\",\n"," \"min\": 2,\n"," \"max\": 20,\n"," \"log\": \"False\"\n"," },\n"," {\n"," \"name\": \"n_estimators\",\n"," \"type\": \"int\",\n"," \"min\": 10,\n"," \"max\": 100,\n"," \"log\": \"False\"\n"," }\n"," ]\n"," },\n"," {\n"," \"model\": \"LightGBM\",\n"," \"trials\": 30,\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"max_depth\",\n"," \"type\": \"int\",\n"," \"min\": 1,\n"," \"max\": 30,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"num_leaves\",\n"," \"type\": \"int\",\n"," \"min\": 5,\n"," \"max\": 50,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"learning_rate\",\n"," \"type\": \"float\",\n"," \"min\": 0.001,\n"," \"max\": 0.3,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"verbose\",\n"," \"type\": \"fixed\",\n"," \"value\": -1\n"," }\n"," ]\n"," }\n"," ]\n","}\n"],"metadata":{"id":"FNr1wMMlm6ys","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Load your data\n"],"metadata":{"id":"Cjzpf_QZnzgH"}},{"cell_type":"code","source":["# @title Upload dataset\n","from google.colab import 
files\n","import io\n","\n","uploaded = files.upload()\n","df = pd.read_csv(io.StringIO(uploaded[list(uploaded.keys())[0]].decode('utf-8')))\n","df.head()"],"metadata":{"id":"sHKkOfGGYEyi","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Inputs\n","\n","field_name = 'sequence' # @param {type: 'string'}\n","id_field = None # @param{type: 'raw'}\n","label_name = 'bioactivity' #@param{type: 'string'}\n","alignment_algorithm = 'mmseqs' #@param{type: 'string'}\n","threshold = 0.3 #@param\n","plm_model = 'esm2-8m' #@param {type: 'string'}\n","\n","if plm_model not in AVAILABLE_MODELS and plm_model not in SYNONYMS:\n"," print(f'Model: {plm_model} is not supported, please use one of the following: {list(SYNONYMS.keys())}')\n","\n","if id_field is None:\n"," df['id'] = df.index\n","else:\n"," df['id'] = df[id_field]\n","\n","df['sequence'] = df[field_name]\n","df['labels'] = df[label_name]\n","\n","df.head()"],"metadata":{"id":"AHMxISg2WT85","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Split dataset\n","from hestia.partition import ccpart, random_partition\n","\n","datasets = apml.train_test_partition(df, threshold=threshold, alignment=alignment_algorithm, denominator='n_aligned')\n"],"metadata":{"id":"S_6699wxVFtX"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Preparing cross-validation folds\n","\n","folds = apml.train_val_partition(\n"," datasets['train'], method='random',\n"," threshold=0.4, alignment='mmseqs+prefilter',\n"," n_folds=10, outputdir='results/folds',\n","\n",")"],"metadata":{"id":"o3oYbY-6p7j5","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title What Representation Model do you want to use?\n","# @markdown It is recommended to set the runtime to GPU in order to accelerate embedding computation.\n","\n","re = RepresentationEngine(plm_model, 12)\n","id2rep = apml.compute_representations(datasets, re)\n","id2rep = {id: rep.numpy() for id, rep in id2rep.items()}"],"metadata":{"id":"g9iid82xoXAP","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Train models\n","\n","model = apml.hpo_train(\n"," hpo_space, datasets['train'], id2rep, folds, 'results'\n",")"],"metadata":{"id":"oUlBUkKIljFW","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Evaluate models\n","\n","results = apml.evaluate_model(\n"," model, datasets['test'], id2rep, 'results'\n",")\n","print(results)"],"metadata":{"id":"0XLgate7lxBr","cellView":"form"},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: AutoPeptideML 2 | site_description: Computational Pipeline for the Automatised Development of Peptide Bioactivity Prediction Models 3 | watch: [autopeptideml] 4 | 5 | nav: 6 | - Home: 7 | - Overview: index.md 8 | - Code reference: autopeptideml/ 9 | - Code reference: 10 | - AutoPeptideML: 11 | - autopeptideml.md 12 | - RepEngineBase: 13 | - repenginebase.md 14 | - RepEngineFP: 15 | - repenginefp.md 16 | - RepEngineLM: 17 | - repenginelm.md 18 | - RepEngineSeqBased: 19 | - repengineseqbased.md 20 | markdown_extensions: 21 | - attr_list 22 | theme: 23 | name: material 24 | features: 25 | - content.code.annotate 26 | - navigation.tabs 27 | - navigation.top 28 | palette: 29 | - media: 
"(prefers-color-scheme: light)" 30 | scheme: default 31 | primary: black 32 | accent: purple 33 | # toggle: 34 | # icon: material/weather-sunny 35 | # name: Switch to light mode 36 | # - media: "(prefers-color-scheme: dark)" 37 | # scheme: slate 38 | # primary: black 39 | # accent: lime 40 | # toggle: 41 | # icon: material/weather-night 42 | # name: Switch to dark mode 43 | features: 44 | - search.suggest 45 | - search.highlight 46 | - content.tabs.link 47 | icon: 48 | repo: fontawesome/brands/github-alt 49 | language: en 50 | repo_name: IBM/AutoPeptideML 51 | repo_url: https://github.com/IBM/AutoPeptideML 52 | edit_uri: '' 53 | plugins: 54 | - search 55 | - include-markdown 56 | - mkdocstrings: 57 | handlers: 58 | python: 59 | import: 60 | - https://docs.python.org/3/objects.inv 61 | - https://installer.readthedocs.io/en/stable/objects.inv # demonstration purpose in the docs 62 | - https://mkdocstrings.github.io/autorefs/objects.inv 63 | options: 64 | show_source: false 65 | docstring_style: sphinx 66 | docstring_options: 67 | # ignore_init_summary: yes 68 | merge_init_into_class: yes 69 | show_submodules: yes 70 | - markdownextradata: 71 | data: data 72 | markdown_extensions: 73 | - toc: 74 | permalink: true 75 | - markdown.extensions.codehilite: 76 | guess_lang: false 77 | - mdx_include: 78 | base_path: docs 79 | - admonition 80 | - codehilite 81 | - extra 82 | - pymdownx.superfences: 83 | custom_fences: 84 | - name: mermaid 85 | class: mermaid 86 | format: !!python/name:pymdownx.superfences.fence_code_format '' 87 | - pymdownx.tabbed: 88 | alternate_style: true 89 | - attr_list 90 | - md_in_html 91 | extra: 92 | social: 93 | - icon: fontawesome/brands/github-alt 94 | link: https://github.com/IBM/AutoPeptideML 95 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """The setup script.""" 4 | import os 5 | from setuptools import setup, find_packages 6 | from pathlib import Path 7 | 8 | 9 | this_directory = Path(__file__).parent 10 | readme = (this_directory / "README.md").read_text() 11 | 12 | requirements = [ 13 | 'optuna', 14 | 'scikit-learn', 15 | 'typer', 16 | 'mljar-scikit-plot', 17 | 'tokenizers', 18 | 'torch', 19 | 'transformers', 20 | 'lightgbm', 21 | 'xgboost', 22 | 'mdpdf', 23 | 'hestia-good', 24 | 'onnxmltools', 25 | 'skl2onnx', 26 | 'onnxruntime', 27 | ] 28 | 29 | test_requirements = requirements 30 | files = ['autopeptideml/data/readme_ex.md'] 31 | setup( 32 | author="Raul Fernandez-Diaz", 33 | author_email='raulfd@ibm.com', 34 | python_requires='>=3.9', 35 | classifiers=[ 36 | ], 37 | description="AutoML system for building trustworthy peptide bioactivity predictors", 38 | entry_points={ 39 | 'console_scripts': [ 40 | 'apml=autopeptideml.main:_main', 41 | 'autopeptideml=autopeptideml.main:_main' 42 | ], 43 | }, 44 | install_requires=requirements, 45 | license="MIT", 46 | long_description=readme, 47 | long_description_content_type='text/markdown', 48 | data_files=[('', files)], 49 | include_package_data=True, 50 | keywords='autopeptideml', 51 | name='autopeptideml', 52 | packages=find_packages(exclude=['examples']), 
53 |     url='https://ibm.github.io/AutoPeptideML/',
54 |     version='2.0.0',
55 |     zip_safe=False,
56 | )
57 | 
--------------------------------------------------------------------------------
/tests/test_apml.py:
--------------------------------------------------------------------------------
1 | from autopeptideml import AutoPeptideML
2 | 
3 | 
4 | # def test_load():
5 | #     apml = AutoPeptideML()
6 | #     df = apml.curate_dataset('examples/AB_positives.csv')
7 | #     assert len(df) == 6_583
8 | 
--------------------------------------------------------------------------------
/tests/test_db.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | 
3 | import numpy as np
4 | 
5 | from autopeptideml.db import Database
6 | from autopeptideml.pipeline import Pipeline, CanonicalFilter
7 | 
8 | 
9 | def test_database():
10 |     dir_path = osp.abspath(osp.dirname(__file__))
11 |     path = osp.join(dir_path, 'sample', 'example.csv')
12 |     db = Database(path, feat_fields=['sequence'],
13 |                   pipe=Pipeline([CanonicalFilter()]))
14 |     assert len(db) == 500
15 |     path2 = osp.join(dir_path, 'sample', 'example2.csv')
16 |     db2 = Database(path2, feat_fields=['sequence'],
17 |                    pipe=Pipeline([CanonicalFilter()]),
18 |                    label_field='Y')
19 |     db2.df['Y'] = 1
20 |     db2.add_negatives(db, columns_to_exclude=['Allergen', 'Toxic'])
21 |     labels, counts = np.unique(db2.df.Y, return_counts=True)
22 |     assert labels.tolist() == [0, 1]
23 |     assert counts.tolist() == [272, 300]
24 | 
--------------------------------------------------------------------------------
/tests/test_pipeline.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from autopeptideml.pipeline import (Pipeline, CanonicalCleaner,
4 |                                     CanonicalFilter, SequenceToSMILES,
5 |                                     FilterSMILES, SmilesToSequence)
6 | 
7 | 
8 | def test_canonical_filter():
9 |     seqs = ['AAACCTWSFB', 'AAACCTWF', 'AAACCTWaF']
10 |     pipe = Pipeline([CanonicalFilter()])
11 |     seqs_out = pipe(seqs)
12 |     assert seqs_out == ['AAACCTWF']
13 | 
14 | 
15 | @pytest.mark.parametrize("smiles, seq_out",
16 |                          [
17 |                              ('C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]', "AAACCTWSFB"),
18 |                              ('C[C@@H](O)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O', "AAACCTWF"),
19 |                          ])
20 | def to_sequence(smiles, seq_out):  # not collected by pytest: name lacks the 'test_' prefix
21 |     pipe = Pipeline([SmilesToSequence()])
22 |     seq_pred = pipe(smiles)
23 |     assert seq_pred == seq_out
24 | 
25 | 
26 | def test_canonical_cleaner():
27 |     seqs = ['AAACCTWSFB', 'AAACCTWF', 'AAACCTWaF']
28 |     pipe = Pipeline([CanonicalCleaner()])
29 |     seqs_out = pipe(seqs)
30 |     assert seqs_out == ['AAACCTWSFX', 'AAACCTWF', 'AAACCTWXF']
31 | 
32 | 
33 | def test_to_smiles():
34 |     seqs = ['BRTWSF', 'ARTWF', 'aRTWSF', 'C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]']
35 |     pipe1 = Pipeline([FilterSMILES()], name='pipe_smiles')
36 |     pipe2 = Pipeline([FilterSMILES(keep_smiles=False),
37 |                       CanonicalCleaner(substitution='G'),
38 |                       SequenceToSMILES()], name='pipe_seqs')
39 |     pipe = Pipeline([pipe1, pipe2], name='main_pipeline', aggregate=True)
40 |     seqs_out = pipe(seqs, verbose=True)
41 |     assert seqs_out == [
42 |         'C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]',
43 |         'C[C@@H](O)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O',
44 |         'C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)O',
45 |         'C[C@@H](O)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O'
46 |     ]
47 | 
48 | 
49 | if __name__ == '__main__':
50 |     test_canonical_cleaner()
51 |     test_canonical_filter()
52 |     test_to_smiles()
53 | 
--------------------------------------------------------------------------------
/tests/test_reps.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | import numpy as np
4 | 
5 | from autopeptideml.reps.lms import RepEngineLM
6 | from autopeptideml.reps.seq_based import RepEngineOnehot
7 | from autopeptideml.reps.fps import RepEngineFP
8 | 
9 | 
10 | def test_esm_family():
11 |     re = RepEngineLM('esm2-8m', average_pooling=True)
12 |     a = re.compute_reps(['AACFFF'], batch_size=12)
13 |     b = re.compute_reps(['AACFFF', 'AACCF'], batch_size=12)
14 |     re = RepEngineLM('esm2-8m', average_pooling=False)
15 |     c = re.compute_reps(['AACFFF'], batch_size=12)
16 | 
17 |     assert re.dim() == 320
18 |     assert a.shape == (1, 320)
19 |     assert b.shape == (2, 320)
20 |     assert np.array(c).shape == (1, 6, 320)
21 | 
22 | 
23 | def test_elnaggar_family():
24 |     re = RepEngineLM('ankh-base')
25 |     a = re.compute_reps(['AACFFF'], batch_size=12)
26 |     assert re.dim() == 768
27 |     assert np.array(a).shape == (1, re.dim())
28 | 
29 | 
30 | def test_one_hot():
31 |     re = RepEngineOnehot(19)
32 |     a = re.compute_reps(['AACFFF', 'AACCF'], batch_size=4)
33 |     dict_re = json.loads(str(re))
34 |     assert dict_re == {'rep': 'one-hot', 'max_length': 19}
35 |     assert re.dim() == 19 * 21
36 |     assert a.shape == (2, 19 * 21)
37 | 
38 | 
39 | def test_fps():
40 |     re1 = RepEngineFP('ecfp', 512, 8)
41 |     a = re1.compute_reps(['C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]'], batch_size=1)
42 |     re2 = RepEngineFP('fcfp', 256, 12)
43 |     b = re2.compute_reps(['C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]'], batch_size=1)
44 |     dict_1, dict_2 = json.loads(str(re1)), json.loads(str(re2))
45 |     assert dict_1 == {'rep': 'ecfp', 'nbits': 512, 'radius': 8}
46 |     assert dict_2 == {'rep': 'fcfp', 'nbits': 256, 'radius': 12}
47 |     assert re1.dim() == 512
48 |     assert re2.dim() == 256
49 |     assert a.shape == (1, 512)
50 |     assert b.shape == (1, 256)
51 | 
52 | 
53 | def test_rostlab_family():
54 |     re = RepEngineLM('prot-t5-xl')
55 |     a = re.compute_reps(['AACFFF'], batch_size=12)
56 |     assert re.dim() == 1024
57 |     assert np.array(a).shape == (1, re.dim())
58 | 
59 | 
60 | if __name__ == '__main__':
61 |     test_esm_family()
62 |     print('ESM OK')
63 |     test_elnaggar_family()
64 |     print('Elnaggar OK')
65 |     test_one_hot()
66 |     print('Onehot OK')
67 |     test_fps()
68 |     print("FPs OK")
69 |     test_rostlab_family()
70 |     print("Rostlab OK")
71 | 
--------------------------------------------------------------------------------
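
Usage note: the Colab notebook above (examples/AutoPeptideML_Collab.ipynb) chains dataset partitioning, representation computation, hyperparameter-optimized training, and evaluation. The sketch below strings the same calls together as a plain Python script, with a single-model search space following the schema of the notebook's "HPO single" cell. Treat it as an illustrative outline under stated assumptions, not a file from the repository: the import path for RepresentationEngine and the input file 'peptides.csv' (any CSV with a 'sequence' column and a binary 'bioactivity' column) are assumptions, since the notebook's setup cells are not reproduced in this listing.

import pandas as pd

from autopeptideml import AutoPeptideML
# Assumption: RepresentationEngine is importable from the package top level,
# as suggested by the notebook's usage; adjust to your installed version.
from autopeptideml import RepresentationEngine

apml = AutoPeptideML()

# Minimal search space following the "HPO single" schema: a trial budget,
# a list of candidate models, an optimization metric, and per-hyperparameter
# ranges (int/float ranges or categorical choices).
hpo_space = {
    "trials": 100,
    "model_selection": [{
        "model": "K-Nearest Neighbours",
        "optimization_metric": "test_matthews_corrcoef",
        "hyperparameter-space": [
            {"name": "n_neighbors", "type": "int",
             "min": 1, "max": 30, "log": "False"},
            {"name": "weights", "type": "categorical",
             "values": ["uniform", "distance"]}
        ]
    }]
}

# 'peptides.csv' is a placeholder; rename columns as in the notebook's
# Inputs cell so that 'id', 'sequence', and 'labels' exist.
df = pd.read_csv('peptides.csv')
df['id'] = df.index
df['labels'] = df['bioactivity']

# Same partitioning, folding, representation, training, and evaluation
# calls as in the notebook cells above.
datasets = apml.train_test_partition(df, threshold=0.3, alignment='mmseqs',
                                     denominator='n_aligned')
folds = apml.train_val_partition(datasets['train'], method='random',
                                 threshold=0.4, alignment='mmseqs+prefilter',
                                 n_folds=10, outputdir='results/folds')

rep_engine = RepresentationEngine('esm2-8m', 12)
id2rep = apml.compute_representations(datasets, rep_engine)
id2rep = {i: rep.numpy() for i, rep in id2rep.items()}

model = apml.hpo_train(hpo_space, datasets['train'], id2rep, folds, 'results')
results = apml.evaluate_model(model, datasets['test'], id2rep, 'results')
print(results)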