├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── publish-pages-doc.yml │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── HISTORY.rst ├── LICENSE ├── README.md ├── autopeptideml ├── __init__.py ├── autopeptideml.py ├── config.py ├── data │ ├── __init__.py │ └── readme_ex.md ├── db │ ├── __init__.py │ └── db.py ├── main.py ├── pipeline │ ├── __init__.py │ ├── pipeline.py │ ├── sequence.py │ └── smiles.py ├── reps │ ├── __init__.py │ ├── engine.py │ ├── fps.py │ ├── lms.py │ ├── seq_based.py │ └── utils │ │ ├── __init__.py │ │ └── peptideclm_tokenizer.py └── train │ ├── __init__.py │ ├── architectures.py │ ├── deep_learning │ ├── __init__.py │ ├── dataset.py │ ├── loss.py │ └── model.py │ ├── metrics.py │ └── trainer.py ├── docs ├── autopeptideml.md ├── imgs │ ├── APML_dark.png │ └── APML_light.png ├── index.md ├── repenginebase.md ├── repenginefp.md ├── repenginelm.md └── repengineseqbased.md ├── examples ├── AB_positives.csv ├── API_docs.ipynb └── AutoPeptideML_Collab.ipynb ├── mkdocs.yml ├── renovate.json ├── setup.py └── tests ├── sample ├── example.csv └── example2.csv ├── test_apml.py ├── test_db.py ├── test_pipeline.py └── test_reps.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/publish-pages-doc.yml: -------------------------------------------------------------------------------- 1 | name: Publish docs via GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: actions/setup-python@v5 14 | with: 15 | python-version: 3.x 16 | - run: pip install mkdocs mkdocs-material mkdocstrings[python] mkdocs-markdownextradata-plugin mdx_include mkdocs-include-markdown-plugin 17 | - run: mkdocs gh-deploy --force 18 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9", "3.11", "3.13"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | sudo apt-get install libprotobuf-dev protobuf-compiler 30 | python -m pip install --upgrade pip 31 | python -m pip install flake8 pytest sentencepiece rdkit 32 | python -m pip install . 33 | python -m pip install biopython 34 | python -m pip install git+https://github.com/novonordisk-research/pepfunn.git --no-deps 35 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with pytest 43 | run: | 44 | pytest 45 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | # Allows to run this workflow manually from the Actions tab 16 | workflow_dispatch: 17 | 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | deploy: 23 | 24 | runs-on: ubuntu-latest 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: '3.x' 32 | - name: Install dependencies 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install build 36 | - name: Build package 37 | run: python -m build 38 | - name: Publish package 39 | uses: pypa/gh-action-pypi-publish@e9ccbe5a211ba3e8363f472cae362b56b104e796 40 | with: 41 | user: __token__ 42 | password: ${{ secrets.PYPI_API_TOKEN }} 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | .DS_Store 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | #.idea/ 162 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | raul.fernandezdiaz@ucdconnect.ie. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.1.0 (2023-04-27) 6 | ------------------ 7 | 8 | * First release on PyPI. 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 International Business Machines 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | ![logo](docs/imgs/APML_light.png)
3 | 
4 | # AutoPeptideML
5 | 
6 | AutoML system for building trustworthy peptide bioactivity predictors
7 | 
8 | [Tutorials](https://ibm.github.io/AutoPeptideML) · [GitHub](https://github.com/IBM/AutoPeptideML) · [Open In Colab](examples/AutoPeptideML_Collab.ipynb)
28 | 
29 | - **Documentation:** https://ibm.github.io/AutoPeptideML
30 | - **Source Code:** https://github.com/IBM/AutoPeptideML
31 | - **Webserver:** http://peptide.ucd.ie/AutoPeptideML
32 | - **Google Colaboratory Notebook:** AutoPeptideML_Collab.ipynb
33 | - **Blog post:** Portal - AutoPeptideML v. 1.0 Tutorial
34 | - **Papers:**
35 |   - [AutoPeptideML (v. 1.0)](https://doi.org/10.1093/bioinformatics/btae555)
36 |   - [ML Generalization from canonical to non-canonical peptides](https://doi.org/10.26434/chemrxiv-2025-ggp8n)
37 | 
38 | AutoPeptideML allows researchers without prior knowledge of machine learning to build models that are:
39 | 
40 | - **Trustworthy:** Robust evaluation following the [DOME](https://www.nature.com/articles/s41592-021-01205-4) community guidelines for reporting ML evaluations in the life sciences.
41 | - **Interpretable:** Output contains a PDF summary of the model evaluation explaining how to interpret the results and how reliable the model is.
42 | - **Reproducible:** Output contains all necessary information for other researchers to reproduce the training and verify the results.
43 | - **State-of-the-art:** Models generated with this system are competitive with state-of-the-art handcrafted approaches.
44 | 
45 | To use version 1.0, which may be necessary for backward compatibility with previously built models, please refer to the branch [AutoPeptideML v.1.0.6](https://github.com/IBM/AutoPeptideML/tree/apml-1.0.6).
46 | 
47 | ## Contents
48 | 
49 | 
Table of Contents
50 | 
51 | - [Model builder](#helper)
52 | - [Prediction](#prediction)
53 | - [Benchmark Data](#benchmark)
54 | - [Installation Guide](#installation)
55 | - [Documentation](#documentation)
56 | - [License](#license)
57 | - [Acknowledgements](#acknowledgements)
58 | 
59 | 
60 | 
61 | ## Model builder
62 | 
63 | To build a new model, AutoPeptideML (v.2.0) introduces a new utility that automatically prepares an experiment configuration file, to i) improve the reproducibility of the pipeline and ii) keep a user-friendly interface despite the greatly increased flexibility.
64 | 
65 | ```bash
66 | autopeptideml prepare-config
67 | ```
68 | This launches an interactive CLI that walks you through:
69 | 
70 | - Choosing a modeling task (classification or regression)
71 | - Selecting input modality (macromolecules or sequences)
72 | - Loading and parsing datasets (csv, tsv, or fasta)
73 | - Defining evaluation strategy
74 | - Picking models and representations
75 | - Setting hyperparameter search strategy and training parameters
76 | 
77 | 
78 | You’ll be prompted to answer various questions like:
79 | 
80 | ```
81 | - What is the modelling problem you're facing? (Classification or Regression)
82 | 
83 | - How do you want to define your peptides? (Macromolecules or Sequences)
84 | 
85 | - What models would you like to consider? (knn, adaboost, rf, etc.)
86 | ```
87 | 
88 | And so on. The final config is written to:
89 | 
90 | ```
91 | <outputdir>/config.yml
92 | ```
93 | 
94 | This config file allows for easy reproducibility of the results, so that anyone can repeat the training process. You can check the configuration file and make any changes you deem necessary. Finally, you can build the model by simply running:
95 | 
96 | ```
97 | autopeptideml build-model --config-path <outputdir>/config.yml
98 | ```
99 | 
100 | ## Prediction
101 | 
102 | To use a model that has already been built, you can run:
103 | 
104 | ```bash
105 | autopeptideml predict <features_path> --output-path <output_path>
106 | ```
107 | 
108 | Where `<features_path>` is the path to a `CSV` file with a column `features_field` that contains the peptide sequences/SMILES. The output file `<output_path>` will contain the original data with two additional columns: `score` (the model predictions) and `std` (the standard deviation between the predictions of the models in the ensemble), which can be used as a measure of the uncertainty of the prediction. A minimal pandas sketch for consuming this output is shown after the benchmark list below.
109 | 
110 | ## Benchmark data
111 | 
112 | Data used to benchmark our approach has been selected from the benchmarks collected by [Du et al, 2023](https://academic.oup.com/bib/article-abstract/24/3/bbad135/7107929). A new set of benchmarks was constructed from the original set following the new data acquisition and dataset partitioning methods within AutoPeptideML. To download the datasets:
113 | 
114 | - **Original UniDL4BioPep Benchmarks:** Please check the project [Github Repository](https://github.com/dzjxzyd/UniDL4BioPep/tree/main).
115 | - **⚠️ New AutoPeptideML Benchmarks (Amended version):** Can be downloaded from this [link](https://drive.google.com/u/0/uc?id=1UmDu773CdkBFqkitK550uO6zoxhU1bUB&export=download). Please note that these are not exactly the same benchmarks as used in the paper (see [Issue #24](https://github.com/IBM/AutoPeptideML/issues/24) for more details).
116 | - **PeptideGeneralizationBenchmarks:** Benchmarks evaluating how peptide representation methods generalize from canonical peptides (composed of the 20 standard amino acids) to non-canonical ones (with non-standard amino acids or other chemical modifications). Check out the [paper pre-print](https://chemrxiv.org/engage/chemrxiv/article-details/67d2f3ae81d2151a023d64f8). They have their own dedicated repository: [PeptideGeneralizationBenchmarks Github repository](https://github.com/IBM/PeptideGeneralizationBenchmarks).
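
As an illustration of the `predict` output described in the Prediction section, here is a minimal pandas sketch for consuming it. The file name `predictions.csv` and the cut-off values are arbitrary placeholders for this example; only the `score` and `std` column names come from the description above:

```python
import pandas as pd

# Load the CSV produced by `autopeptideml predict` (hypothetical path).
preds = pd.read_csv("predictions.csv")

# `score` holds the ensemble predictions; `std` measures the disagreement
# between ensemble members and can serve as an uncertainty estimate.
confident_hits = preds[(preds["score"] > 0.5) & (preds["std"] < 0.1)]
print(confident_hits.sort_values("score", ascending=False).head())
```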
117 | 
118 | ## Installation
119 | 
120 | Installing in a conda environment is recommended. For creating the environment, please run:
121 | 
122 | ```bash
123 | conda create -n autopeptideml python
124 | conda activate autopeptideml
125 | ```
126 | 
127 | ### 1. Python Package
128 | 
129 | #### 1.1. From PyPI
130 | 
131 | 
132 | ```bash
133 | pip install autopeptideml
134 | ```
135 | 
136 | #### 1.2. Directly from source
137 | 
138 | ```bash
139 | pip install git+https://github.com/IBM/AutoPeptideML
140 | ```
141 | 
142 | ### 2. Third-party dependencies
143 | 
144 | To use MMseqs2 [https://github.com/steineggerlab/mmseqs2](https://github.com/steineggerlab/mmseqs2):
145 | 
146 | ```bash
147 | # static build with AVX2 (fastest) (check using: cat /proc/cpuinfo | grep avx2)
148 | wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz; tar xvfz mmseqs-linux-avx2.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
149 | 
150 | # static build with SSE4.1 (check using: cat /proc/cpuinfo | grep sse4)
151 | wget https://mmseqs.com/latest/mmseqs-linux-sse41.tar.gz; tar xvfz mmseqs-linux-sse41.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
152 | 
153 | # static build with SSE2 (slowest, for very old systems) (check using: cat /proc/cpuinfo | grep sse2)
154 | wget https://mmseqs.com/latest/mmseqs-linux-sse2.tar.gz; tar xvfz mmseqs-linux-sse2.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
155 | 
156 | # macOS
157 | brew install mmseqs2
158 | ```
159 | 
160 | To use Needleman-Wunsch, either:
161 | 
162 | ```bash
163 | conda install -c bioconda emboss
164 | ```
165 | or
166 | 
167 | ```bash
168 | sudo apt install emboss
169 | ```
170 | 
171 | To use ECFP fingerprints:
172 | 
173 | ```bash
174 | pip install rdkit
175 | ```
176 | 
177 | To use MAPc fingerprints:
178 | 
179 | ```bash
180 | pip install mapchiral
181 | ```
182 | 
183 | To use PepFuNN fingerprints:
184 | 
185 | ```bash
186 | pip install git+https://github.com/novonordisk-research/pepfunn
187 | ```
188 | 
189 | To use PeptideCLM:
190 | 
191 | ```bash
192 | pip install smilesPE
193 | ```
194 | 
195 | ## Documentation
196 | 
197 | ### Configuration file
198 | 
199 | #### Top-level structure
200 | 
201 | ```yaml
202 | pipeline: {...}
203 | databases: {...}
204 | test: {...}
205 | val: {...}
206 | train: {...}
207 | representation: {...}
208 | outputdir: "path/to/experiment_results"
209 | ```
210 | 
211 | #### `pipeline`
212 | Defines the preprocessing pipeline depending on the modality (`mol` or `seqs`). It includes data cleaning and transformations, such as:
213 | 
214 | - `filter-smiles`
215 | - `canonical-cleaner`
216 | - `sequence-to-smiles`
217 | - `smiles-to-sequences`
218 | 
219 | The name of a pipeline object has to include the word `pipe`. Pipelines can themselves be elements within a pipeline. Here is an example: `aggregate` will combine the output from the different elements. In this case, the two elements process SMILES and sequences independently and then combine them into a single data stream.
220 | 
221 | 
222 | ```yaml
223 | pipeline:
224 |   name: "macromolecules_pipe"
225 |   aggregate: true
226 |   verbose: false
227 |   elements:
228 |     - pipe-smiles-input: {...}
229 |     - pipe-seq-input: {...}
230 | 
231 | ```
232 | 
233 | ### `databases`
234 | 
235 | Defines dataset paths and how to interpret them.
236 | 
237 | **Required:**
238 | - `path`: Path to main dataset.
239 | - `feat_fields`: Column name with SMILES or sequences.
240 | - `label_field`: Column with classification/regression labels.
241 | - `verbose`: Logging flag.
242 | 
243 | **Optional:**
244 | - `neg_database`: If using negative sampling.
245 |   - `path`: Path to negative dataset.
246 |   - `feat_fields`: Feature column.
247 |   - `columns_to_exclude`: Bioactivity columns to ignore.
248 | 
249 | ```yaml
250 | databases:
251 |   dataset:
252 |     path: "data/main.csv"
253 |     feat_fields: "sequence"
254 |     label_field: "activity"
255 |     verbose: false
256 |   neg_database:
257 |     path: "data/negatives.csv"
258 |     feat_fields: "sequence"
259 |     columns_to_exclude: ["to_exclude"]
260 |     verbose: false
261 | ```
262 | 
263 | ### `test`
264 | 
265 | Defines evaluation and similarity filtering settings.
266 | 
267 | - `min_threshold`: Identity threshold for filtering.
268 | - `sim_arguments`: Similarity computation details.
269 | 
270 | For sequences:
271 | 
272 | - `alignment_algorithm`: `mmseqs`, `mmseqs+prefilter`, `needle`
273 | - `denominator`: How identity is normalized: `longest`, `shortest`, `n_aligned`
274 | - `prefilter`: Whether to use a prefilter.
275 | - `field_name`: Name of the column with the peptide sequences/SMILES.
276 | - `verbose`: Logging flag.
277 | 
278 | For molecules:
279 | 
280 | - `sim_function`: e.g., `tanimoto`, `jaccard`
281 | - `radius`: Radius defining the substructures used to compute the fingerprint.
282 | - `bits`: Size of the fingerprint; more bits give more resolution but demand more computational resources.
283 | 
284 | - `partitions`: `min`, `all`, or a specific `<threshold>`
285 | - `algorithm`: `ccpart`, `ccpart_random`, `graph_part`
286 | - `threshold_step`: Step size for threshold evaluation.
287 | - `filter`: Minimum proportion of data in the test set that is acceptable (e.g., with a target test set proportion of 20% and `filter: 0.185`, partitions that place less than 18.5% of the data in the test set are not considered).
288 | - `verbose`: Logging level.
289 | 
290 | Example:
291 | 
292 | ```yaml
293 | test:
294 |   min_threshold: 0.1
295 |   sim_arguments:
296 |     data_type: "sequence"
297 |     alignment_algorithm: "mmseqs"
298 |     denominator: "shortest"
299 |     prefilter: true
300 |     min_threshold: 0.1
301 |     field_name: "sequence"
302 |     verbose: 2
303 |   partitions: "all"
304 |   algorithm: "ccpart"
305 |   threshold_step: 0.1
306 |   filter: 0.185
307 |   verbose: 2
308 | ```
309 | 
310 | ### `val`
311 | 
312 | Cross-validation strategy:
313 | 
314 | - `type`: `kfold` or `single`
315 | - `k`: Number of folds.
316 | - `random_state`: Seed for reproducibility.
317 | 
318 | ### `train`
319 | Training configuration.
320 | 
321 | Required:
322 | 
323 | - `task`: `class` or `reg`
324 | - `optim_strategy`: Optimization strategy.
325 |   - `trainer`: `grid` or `optuna`
326 |   - `n_steps`: Number of trials (Optuna only).
327 |   - `direction`: `maximize` or `minimize`
328 |   - `metric`: `mcc` or `mse`
329 |   - `partition`: Partitioning type.
330 |   - `n_jobs`: Parallel jobs.
331 |   - `patience`: Early stopping patience.
332 | - `hspace`: Search space.
333 |   - `representations`: List of representations to try.
334 |   - `models`:
335 |     - `type`: `select` or `ensemble`
336 |     - `elements`: model names and their hyperparameter space.
337 | 
338 | Example:
339 | 
340 | ```yaml
341 | train:
342 |   task: "class"
343 |   optim_strategy:
344 |     trainer: "optuna"
345 |     n_steps: 100
346 |     direction: "maximize"
347 |     task: "class"
348 |     metric: "mcc"
349 |     partition: "random"
350 |     n_jobs: 8
351 |     patience: 20
352 |   hspace:
353 |     representations: ["chemberta-2", "ecfp-4"]
354 |     models:
355 |       type: "select"
356 |       elements:
357 |         knn:
358 |           n_neighbors:
359 |             type: int
360 |             min: 1
361 |             max: 20
362 |             log: false
363 |           weights:
364 |             type: categorical
365 |             values: ["uniform", "distance"]
366 | ```
367 | 
368 | ### `representation`
369 | Specifies molecular or sequence representations.
370 | 
371 | Each element includes:
372 | 
373 | - `engine`: `lm` (language model) or `fp` (fingerprint)
374 | - `model`: Model name (e.g., chemberta-2, esm2-150m)
375 | - `device`: `cpu`, `gpu`, or `mps`
376 | - `batch_size`: Size per batch
377 | - `average_pooling`: Whether to average token representations (only for `lm`)
378 | 
379 | ```yaml
380 | representation:
381 |   verbose: true
382 |   elements:
383 |     - chemberta-2:
384 |         engine: "lm"
385 |         model: "chemberta-2"
386 |         device: "gpu"
387 |         batch_size: 32
388 |         average_pooling: true
389 |     - ecfp-4:
390 |         engine: "fp"
391 |         fp: "ecfp"
392 |         radius: 2
393 |         nbits: 2048
394 | ```
395 | 
396 | ### More details about the API
397 | 
398 | Please check the [Code reference documentation](https://ibm.github.io/AutoPeptideML/autopeptideml/)
399 | 
400 | 
401 | 
402 | License
403 | -------
404 | AutoPeptideML is open-source software licensed under the MIT License. Check the details in the [LICENSE](https://github.com/IBM/AutoPeptideML/blob/master/LICENSE) file.
405 | 
406 | Credits
407 | -------
408 | 
409 | Special thanks to [Silvia González López](https://www.linkedin.com/in/silvia-gonz%C3%A1lez-l%C3%B3pez-717558221/) for designing the AutoPeptideML logo and to [Marcos Martínez Galindo](https://www.linkedin.com/in/marcosmartinezgalindo) for his aid in setting up the AutoPeptideML webserver.
--------------------------------------------------------------------------------
/autopeptideml/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for AutoPeptideML."""
2 | 
3 | __author__ = """Raul Fernandez-Diaz"""
4 | __email__ = 'raul.fernandezdiaz@ucdconnect.ie'
5 | # __all__ = ['AutoPeptideML', '__version__', '']
6 | 
7 | from .autopeptideml import AutoPeptideML, __version__
--------------------------------------------------------------------------------
/autopeptideml/config.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os
3 | import os.path as osp
4 | 
5 | from multiprocessing import cpu_count
6 | 
7 | import pandas as pd
8 | 
9 | from ItsPrompt.prompt import Prompt
10 | 
11 | 
12 | HP_SPACES = {
13 | "knn": {
14 | "n_neighbors": {
15 | "type": "int",
16 | "min": 1,
17 | "max": 20,
18 | "log": False
19 | },
20 | "weights": {
21 | "type": "categorical",
22 | "values": ["uniform", "distance"]
23 | }
24 | },
25 | "adaboost": {
26 | "n_estimators": {
27 | "type": "int",
28 | "min": 10,
29 | "max": 1000,
30 | "log": False
31 | },
32 | "learning_rate": {
33 | "type": "float",
34 | "min": 1e-7,
35 | "max": 1e-1,
36 | "log": True
37 | }
38 | },
39 | "gradboost": {
40 | "learning_rate": {
41 | "type": "float",
42 | "min": 1e-5,
43 | "max": 1e-1,
44 | "log": True
45 | },
46 | "n_estimators": {
47 | "type": "int",
48 | "min": 10,
49 | "max": 1000,
50 | "log": False
51 | },
52 | "min_samples_split": {
53 | "type": "int",
54 | "min": 2,
55 | "max": 100,
56 | "log": False
57 | }
58 | },
59 | "rf": {
60 | "n_estimators": {
61 | "type": "int",
62 | "min": 10,
63 | "max": 1000,
64 | "log": False
65 | },
66 | "ccp_alpha": {
67 | "type": "float",
68 | "min": 1e-10,
69 | "max": 1e-3,
70 | "log": True
71 | },
72 | "min_samples_split": {
73 | "type": "int",
74 | "min": 2,
75 | "max": 100,
76 | "log": False
77 | }
78 | },
79 | "lightgbm": {
80 | "n_estimators": {
81 | "type": "int",
82 | "min": 10,
83 | "max": 1000,
84 | "log": False
85 | },
86 | "num_leaves": {
87 | "type": "int",
88 | "min": 8,
89 | "max": 1024,
90 | "log": False
91 | },
92 | 
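# NOTE: every hyperparameter entry below follows the same schema as those above:
# "type" is one of "int"/"float"/"categorical"/"fixed", combined with "min"/"max"/"log"
# bounds, a "values" list, or a fixed "value" (plus an optional "condition", as in svm's "degree").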
"max_depth": { 93 | "type": "int", 94 | "min": 3, 95 | "max": 10, 96 | "log": False 97 | }, 98 | "subsample": { 99 | "type": "float", 100 | "min": 0.5, 101 | "max": 1.0, 102 | "log": False 103 | }, 104 | "colsample_bytree": { 105 | "type": "float", 106 | "min": 0, 107 | "max": 1.0, 108 | "log": False 109 | }, 110 | "min_split_gain": { 111 | "type": "float", 112 | "min": 1e-10, 113 | "max": 1e-3, 114 | "log": True 115 | }, 116 | "reg_alpha": { 117 | "type": "float", 118 | "min": 1e-10, 119 | "max": 1e-3, 120 | "log": True 121 | }, 122 | "learning_rate": { 123 | "type": "float", 124 | "min": 1e-7, 125 | "max": 1e-1, 126 | "log": True 127 | }, 128 | "verbose": { 129 | "type": "fixed", 130 | "value": -1 131 | } 132 | }, 133 | "xgboost": { 134 | "n_estimators": { 135 | "type": "int", 136 | "min": 10, 137 | "max": 1000, 138 | "log": False 139 | }, 140 | "min_split_alpha": { 141 | "type": "float", 142 | "min": 1e-10, 143 | "max": 1e-3, 144 | "log": True 145 | }, 146 | "reg_alpha": { 147 | "type": "float", 148 | "min": 1e-10, 149 | "max": 1e-3, 150 | "log": True 151 | }, 152 | "learning_rate": { 153 | "type": "float", 154 | "min": 1e-7, 155 | "max": 1e-1, 156 | "log": True 157 | }, 158 | "verbose": { 159 | "type": "fixed", 160 | "value": -1 161 | } 162 | }, 163 | "svm": { 164 | "C": { 165 | "type": "float", 166 | "min": 1e-7, 167 | "max": 0.1, 168 | "log": True 169 | }, 170 | "probability": { 171 | "type": "fixed", 172 | "value": True 173 | }, 174 | "kernel": { 175 | "values": [ 176 | "linear", 177 | "poly", 178 | "rbf", 179 | "sigmoid" 180 | ], 181 | "type": "categorical" 182 | }, 183 | "max_iter": { 184 | "type": "fixed", 185 | "value": int(1e4) 186 | }, 187 | "degree": { 188 | "condition": "kernel-poly", 189 | "log": False, 190 | "max": 7, 191 | "min": 2, 192 | "type": "int" 193 | } 194 | } 195 | } 196 | MACROMOLECULES_PIPELINE = { 197 | "name": "macromolecules_pipe", 198 | "aggregate": True, 199 | "verbose": False, 200 | "elements": [ 201 | { 202 | "pipe-smiles-input": { 203 | "name": "smiles-input", 204 | "aggregate": False, 205 | "verbose": False, 206 | "elements": [ 207 | {"filter-smiles": {}} 208 | ] 209 | } 210 | }, 211 | { 212 | "pipe-seq-input": { 213 | "name": "seq-input", 214 | "aggregate": False, 215 | "verbose": False, 216 | "elements": [ 217 | {"filter-smiles": {'keep_smiles': False}}, 218 | {"canonical-cleaner": {"substitution": "G"}}, 219 | {"sequence-to-smiles": {}} 220 | ] 221 | } 222 | } 223 | ] 224 | } 225 | SEQUENCE_PIPELINE = { 226 | "name": 'sequences-pipe', 227 | "aggregate": True, 228 | "verbose": False, 229 | "elements": [ 230 | { 231 | "clean-seqs-pipe": { 232 | "name": "clean-seqs-pipe", 233 | "aggregate": False, 234 | "verbose": False, 235 | "elements": [ 236 | {"filter-smiles": {"keep_smiles": False}}, 237 | {"canonical-cleaner": {"substitution": "X"}}, 238 | ] 239 | } 240 | }, 241 | { 242 | "smiles-to-seqs-pipe": { 243 | "name": "smiles-to-seqs-pipe", 244 | "aggregate": False, 245 | "verbose": False, 246 | "elements": [ 247 | {"filter-smiles": {"keep_smiles": True}}, 248 | {"smiles-to-sequences": {}}, 249 | {"canonical-cleaner": {"substitution": "X"}} 250 | ] 251 | } 252 | } 253 | ] 254 | } 255 | MOL_REPS = { 256 | "chemberta-2": { 257 | "engine": "lm", 258 | "device": "cpu", 259 | "batch_size": 32, 260 | "average_pooling": True, 261 | 'model': 'chemberta-2' 262 | }, 263 | "molformer-xl": { 264 | 'engine': "lm", 265 | "device": "cpu", 266 | "batch_size": 32, 267 | "average_pooling": True, 268 | 'model': 'molformer-xl' 269 | }, 270 | "peptideclm": { 271 | 'engine': 
'lm',
272 | 'device': 'cpu',
273 | 'batch_size': 32,
274 | 'average_pooling': True,
275 | 'model': "peptideclm"
276 | },
277 | "ecfp-16": {
278 | "engine": "fp",
279 | "nbits": 2048,
280 | "radius": 8,
281 | 'fp': 'ecfp'
282 | },
283 | 
284 | }
285 | MOL_REPS.update(
286 | {f'ecfp-{int(radius*2)}': {
287 | 'engine': "fp",
288 | 'nbits': 2048,
289 | 'radius': radius,
290 | "fp": "ecfp"
291 | } for radius in range(1, 10, 1)}
292 | )
293 | MOL_REPS.update(
294 | {f'fcfp-{int(radius*2)}': {
295 | 'engine': "fp",
296 | 'nbits': 2048,
297 | 'radius': radius,
298 | "fp": "fcfp"
299 | } for radius in range(1, 10, 1)}
300 | )
301 | MOL_REPS.update(
302 | {f'ecfp-counts-{int(radius*2)}': {
303 | 'engine': "fp",
304 | 'nbits': 2048,
305 | 'radius': radius,
306 | "fp": "ecfp-count"
307 | } for radius in range(1, 10, 1)}
308 | )
309 | SEQ_REPS = {
310 | "esm2-8m": {
311 | 'engine': 'lm',
312 | 'device': "cpu",
313 | 'batch_size': 32,
314 | "average_pooling": True,
315 | 'model': 'esm2-8m'
316 | },
317 | "esm2-150m": {
318 | 'engine': 'lm',
319 | 'device': 'cpu',
320 | 'batch_size': 32,
321 | 'average_pooling': True,
322 | 'model': 'esm2-150m'
323 | },
324 | "esm2-650m": {
325 | 'engine': 'lm',
326 | 'device': 'cpu',
327 | 'batch_size': 16,
328 | 'average_pooling': True,
329 | 'model': "esm2-650m"
330 | },
331 | 'prot-t5-xl': {
332 | 'engine': 'lm',
333 | 'device': 'cpu',
334 | 'batch_size': 16,
335 | 'average_pooling': True,
336 | 'model': 'prot-t5-xl'
337 | },
338 | 'prost-t5': {
339 | 'engine': 'lm',
340 | 'device': 'cpu',
341 | 'batch_size': 8,
342 | 'average_pooling': True,
343 | 'model': 'prost-t5'
344 | }
345 | }
346 | 
347 | 
348 | def _is_int(text: str) -> bool:
349 | try:
350 | int(text)
351 | return True
352 | except ValueError:
353 | return False
354 | 
355 | 
356 | def define_dataset(dataset: str, task: str, modality: str, neg: bool = False):
357 | if dataset.endswith('.csv') or dataset.endswith('.tsv'):
358 | df = pd.read_csv(dataset, sep='\t' if dataset.endswith('.tsv') else ',')  # tab separator for TSV files; the default comma would collapse them into a single column
359 | print("These are the contents of the file you selected\n")
360 | print(df.head())
361 | print()
362 | columns = df.columns.tolist()
363 | feat_field = Prompt().select(
364 | "Which column contains the sequences/SMILES?",
365 | options=columns
366 | )
367 | columns.remove(feat_field)
368 | if neg:
369 | columns_to_exclude = Prompt().checkbox(
370 | "What columns describe a bioactivity you would like to exclude from the negative class?",
371 | options=columns,
372 | min_selections=0
373 | )
374 | return feat_field, columns_to_exclude
375 | 
376 | if task == 'class':
377 | label_field = Prompt().select(
378 | "What is the column containing the labels?",
379 | options=columns + ['Assume all entries are positive']
380 | )
381 | else:
382 | label_field = Prompt().select(
383 | "What is the column containing the labels?",
384 | options=columns
385 | )
386 | elif dataset.endswith('.fasta'):
387 | if modality != 'seqs':
388 | raise ValueError("FASTA is not an acceptable format for Macromolecules. Options: `csv`, `tsv`, `smi`.")
389 | feat_field, label_field = 'sequences', None
390 | return feat_field, label_field
391 | 
392 | 
393 | def config_helper() -> dict:
394 | print()
395 | print("Part 1 - Define the data and preprocessing steps")
396 | config = {}
397 | task = Prompt().select(
398 | "What is the modelling problem you're facing?",
399 | options=['Classification (returning categorical value)',
400 | "Regression (returning continuous value)"]
401 | )
402 | modality = Prompt().select(
403 | "How do you want to define your peptides?",
404 | options=['Macromolecules - allows for canonical, non-canonical, and peptidomimetics',
405 | 'Sequences - only canonical peptides, slightly better performance']
406 | )
407 | if 'macromolecule' in modality.lower():
408 | modality = 'mol'
409 | config['pipeline'] = MACROMOLECULES_PIPELINE
410 | else:
411 | modality = 'seqs'
412 | config['pipeline'] = SEQUENCE_PIPELINE
413 | if 'class' in task.lower():
414 | task = 'class'
415 | else:
416 | task = 'reg'
417 | 
418 | dataset = Prompt().input(
419 | "What is the path to the dataset with your data?",
420 | validate=lambda x: osp.exists(x)
421 | )
422 | feat_field, label_field = define_dataset(dataset, task, modality)
423 | 
424 | if task == 'class':
425 | print("Part 1.5 - Negative sampling")
426 | neg_db = Prompt().select(
427 | "What negative sampling strategy do you prefer?",
428 | options=[
429 | "DB of bioactive canonical peptides",
430 | "DB of bioactive non-canonical peptides",
431 | "DB of both bioactive and non-bioactive peptides",
432 | "Personalised DB",
433 | "No negative sampling"
434 | ]
435 | )
436 | if neg_db == 'Personalised DB':
437 | neg_path = Prompt().input(
438 | "What is the path to the dataset with your data?",
439 | validate=lambda x: osp.exists(x)
440 | )
441 | neg_feat_field, columns_to_exclude = define_dataset(
442 | neg_path, task, modality, neg=True
443 | )
444 | neg_db = {
445 | 'path': neg_path,
446 | 'feat_fields': neg_feat_field,
447 | 'columns_to_exclude': columns_to_exclude,
448 | "verbose": False
449 | }
450 | config['databases'] = {
451 | 'dataset': {
452 | 'path': dataset,
453 | 'feat_fields': feat_field,
454 | 'label_field': label_field,
455 | 'verbose': False
456 | }
457 | }
458 | if task == 'class' and neg_db != 'No negative sampling':
459 | config['databases']['neg_database'] = neg_db
460 | 
461 | print("Part 2 - Define evaluation strategy")
462 | config['test'] = {'min_threshold': 0.1}
463 | 
464 | if modality == 'seqs':
465 | sim_functions = ['needle (recommended)', 'mmseqs', 'mmseqs+prefilter (for huge datasets)']
466 | denominators = ['shortest', 'longest', 'n_aligned']
467 | sim_function = Prompt().select(
468 | "What alignment algorithm would you like to use?",
469 | options=sim_functions
470 | )
471 | denominator = Prompt().select(
472 | "What denominator would you like to use to compute the sequence identity?",
473 | options=denominators
474 | )
475 | config['test']['sim_arguments'] = {
476 | 'data_type': 'sequence',
477 | 'alignment_algorithm': sim_function if '+' not in sim_function else sim_function.split('+')[0],
478 | 'denominator': denominator,
479 | 'prefilter': 'prefilter' in sim_function,
480 | 'min_threshold': 0.1,
481 | 'field_name': feat_field,
482 | 'verbose': 2
483 | }
484 | else:
485 | fps = ['mapc', 'ecfp', 'fcfp']
486 | bits = [str(int(2**v)) for v in range(8, 12, 1)]
487 | radii = [str(int(i)) for i in range(2, 12)]
488 | fp = Prompt().select(
489 | "What fingerprint would you like to use?",
490 | options=fps
491 | )
492 | bit = Prompt().select(
493 | "How many bits would you like the fingerprints to have? (Greater is better, but more expensive)",
494 | options=bits
495 | )
496 | radius = Prompt().select(
497 | "What radius would you like to use?",
498 | options=radii
499 | )
500 | config['test']['sim_arguments'] = {
501 | 'data_type': 'molecule',
502 | 'min_threshold': 0.1,
503 | 'sim_function': 'tanimoto' if fp == 'ecfp' else 'jaccard',
504 | 'field_name': feat_field,
505 | 'radius': int(radius),
506 | 'bits': int(bit),
507 | 'verbose': 2
508 | }
509 | partition = Prompt().select(
510 | "What thresholds would you like to evaluate at?",
511 | options=['min (AutoPeptideML v.1.0)', 'all']
512 | )
513 | part_alg = Prompt().select(
514 | "What partitioning algorithm would you like to use?",
515 | options=['ccpart', 'ccpart_random', 'graph_part'],
516 | default='ccpart'
517 | )
518 | config['test']['partitions'] = partition
519 | config['test']['algorithm'] = part_alg
520 | config['test']['threshold_step'] = 0.1
521 | config['test']['verbose'] = 2
522 | config['test']['filter'] = 0.185
523 | config['val'] = {
524 | 'type': 'kfold',
525 | "k": 10,
526 | "random_state": 1
527 | }
528 | 
529 | print("Part 3 - Define model training")
530 | config['train'] = {}
531 | 
532 | learning_alg = Prompt().checkbox(
533 | "What models would you like to consider?",
534 | options=list(HP_SPACES.keys()),
535 | min_selections=1
536 | )
537 | model_selection = Prompt().select(
538 | "What model selection would you like to use?",
539 | options=['select', "ensemble"]
540 | )
541 | hp_search = Prompt().select(
542 | "What type of search for optimal hyperparameters would you like to use?",
543 | options=['grid', 'bayesian'],
544 | )
545 | reps = Prompt().checkbox("What representations would you like to use?",
546 | options=list(MOL_REPS.keys()) if modality == 'mol'
547 | else list(SEQ_REPS.keys()), min_selections=1)
548 | acc = Prompt().select("Which accelerator would you like to use to compute the representations?",
549 | options=['cpu', "cuda", "mps"])
550 | hp_search = hp_search if hp_search != 'bayesian' else 'optuna'
551 | if hp_search == 'optuna':
552 | n_steps = Prompt().input(
553 | "How many steps for optimisation would you like to conduct?",
554 | default=100,
555 | validate=_is_int
556 | )
557 | patience = Prompt().input(
558 | "What patience would you like EarlyStopping to have?",
559 | validate=_is_int
560 | )
561 | n_jobs = Prompt().input(
562 | "How many parallel jobs do you want to run?",
563 | default=cpu_count(),
564 | validate=_is_int
565 | )
566 | config['train']['task'] = task
567 | config['train']['optim_strategy'] = {
568 | 'trainer': hp_search,
569 | 'n_steps': int(n_steps) if hp_search == 'optuna' else None,
570 | 'direction': "maximize",
571 | 'task': task,
572 | 'metric': 'pcc' if task == 'reg' else 'mcc',
573 | 'partition': 'random',
574 | 'n_jobs': int(n_jobs),
575 | 'patience': int(patience)
576 | }
577 | config['train']['hspace'] = {'representations': reps}
578 | config['train']['hspace']['models'] = {
579 | 'type': model_selection,
580 | 'elements': {model: HP_SPACES[model] for model in learning_alg},
581 | }
582 | config['representation'] = {
583 | 'verbose': True,
584 | 'elements': [
585 | {
586 | r: MOL_REPS[r] if modality == 'mol' else SEQ_REPS[r]
587 | } for r in reps
588 | ]
589 | }
590 | for idx, element in enumerate(config['representation']['elements']):
591 | name = list(element.keys())[0]
592 | if config['representation']['elements'][idx][name]['engine'] != 'lm':
593 | continue
594 | 
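# Only language-model ('lm') engines take an accelerator; fingerprint
# representations always run on CPU, hence the `continue` above.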
config['representation']['elements'][idx][name]['device'] = acc
595 | path = Prompt().input(
596 | "Where do you want to save the experiment results?",
597 | validate=lambda x: not osp.isdir(x)
598 | )
599 | config['outputdir'] = path
600 | os.makedirs(path, exist_ok=True)
601 | path = osp.join(path, 'config.yml')
602 | 
603 | yaml.safe_dump(config, open(path, 'w'), indent=2)
604 | return path
605 | 
--------------------------------------------------------------------------------
/autopeptideml/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/AutoPeptideML/adb18578f145d1c3a78d6860ad5f7d35c726159a/autopeptideml/data/__init__.py
--------------------------------------------------------------------------------
/autopeptideml/data/readme_ex.md:
--------------------------------------------------------------------------------
1 | # AutoPeptideML output summary
2 | 
3 | ## 1. Introduction
4 | 
5 | This document is automatically generated by each run of the AutoPeptideML software and is meant to provide an easy guide to the interpretation of the results obtained. General qualitative comments (e.g., "the MCC score obtained is between 0.25-0.5, which indicates a low correlation and model predictions will not be very reliable") are given as a common-sense guideline; the actual criteria for considering a model acceptable may depend on the target application and the current state-of-the-art.
6 | 
7 | ## 2. Confusion matrix and main performance metrics
8 | 
9 | ### 2.1. Confusion matrix
10 | 
11 | The confusion matrix is the simplest way to visualize the behaviour of the model. The rows describe the true labels of the samples, which can be Positive or Negative; the columns describe the labels predicted by the ensemble.
12 | 
13 | 
14 | - **First quadrant (upper-left corner):** describes the True Negative predictions (TN), that is, negative samples that are predicted as negative by the model.
15 | - **Second quadrant (upper-right corner):** describes the False Positive predictions (FP), which are negative samples that are erroneously predicted as positive. If this error is high, it usually indicates an over-sensitive predictor.
16 | - **Third quadrant (lower-left corner):** describes the False Negative predictions (FN), which are positive samples that are erroneously predicted as negative. If this error is high, it usually indicates a highly specific predictor.
17 | - **Fourth quadrant (lower-right corner):** describes the True Positive predictions (TP), which are positive samples predicted as positive.
18 | 
19 | ![Confusion matrix](./figures/confusion_matrix.png)
20 | 
21 | ### 2.2. Model performance metrics
22 | 
23 | The confusion matrix can be analysed in different ways depending on what properties of the predictor we are most interested in. The following list contains the main performance metrics used when describing ML predictors, a formula relating each of them to the confusion matrix above, and an explanation of what they tell us about the model.
24 | 
25 | - **Accuracy:**
26 |   - *What is it?* proportion of correct predictions among all predictions.
27 |   - *How is it calculated?* `(TP+TN)/(TP+TN+FP+FN)`
28 |   - *What does it say about the model?* How often it is right.
29 |   - *When to use?* Only when working with a balanced dataset (same number of positive and negative samples, default AutoPeptideML run with search for bioactive negatives and homology partitioning). If the dataset is not balanced, check `evaluation_data/test_scores.csv` for `balanced_accuracy`, which is a variation that takes into account the imbalance between the labels.
30 |   - *Value:*
31 |   - *Interpretation of value:*
32 |     - Worse than random: `0-0.45`
33 |     - Random model: `0.45-0.55`
34 |     - Bad model: `0.55-0.7`
35 |     - Acceptable model: `0.7-0.8`
36 |     - Good model: `0.8-0.9`
37 |     - Really good model: `0.9-0.97`
38 |     - Too good a model (please make sure training and evaluation sets are independent): `>0.97`
39 | 
40 | - **Sensitivity or recall:**
41 |   - *What is it?* proportion of positive samples predicted as positive among all positive samples.
42 |   - *How is it calculated?* `(TP)/(TP+FN)`
43 |   - *What does it say about the model?* How likely it is to misclassify a positive sample as negative. May be relevant when the consequence of missing positives is important (e.g., a cancer diagnostics tool).
44 |   - *When to use?* Only when working with a balanced dataset (same number of positive and negative samples, default AutoPeptideML run with search for bioactive negatives and homology partitioning). If the dataset is not balanced, check `evaluation_data/test_scores.csv` for `recall_weighted`, which is a variation that takes into account the imbalance between the labels.
45 |   - *Value:*
46 |   - *Interpretation of value:*
47 |     - Worse than random: `0-0.45`
48 |     - Random model: `0.45-0.55`
49 |     - Bad model: `0.55-0.7`
50 |     - Acceptable model: `0.7-0.8`
51 |     - Good model: `0.8-0.9`
52 |     - Really good model (check that precision is, at least, good): `0.9-0.97`
53 |     - Too good a model (please make sure training and evaluation sets are independent, also check that precision is, at least, good): `>0.97`
54 | 
55 | - **Precision:**
56 |   - *What is it?* proportion of positive predictions that were actually true.
57 |   - *How is it calculated?* `(TP)/(TP+FP)`
58 |   - *What does it say about the model?* How likely a positive prediction is to be correct. May be relevant when the aim is to reduce the number of samples to further analyse (e.g., when conducting virtual screening on large databases).
59 |   - *When to use?* Only when working with a balanced dataset (same number of positive and negative samples, default AutoPeptideML run with search for bioactive negatives and homology partitioning). If the dataset is not balanced, check `evaluation_data/test_scores.csv` for `precision_weighted`, which is a variation that takes into account the imbalance between the labels.
60 |   - *Value:*
61 |   - *Interpretation of value:*
62 |     - Worse than random: `0-0.45`
63 |     - Random model: `0.45-0.55`
64 |     - Bad model: `0.55-0.7`
65 |     - Acceptable model: `0.7-0.8`
66 |     - Good model: `0.8-0.9`
67 |     - Really good model (check that sensitivity is, at least, good): `0.9-0.97`
68 |     - Too good a model (please make sure training and evaluation sets are independent, also check that sensitivity is, at least, good): `>0.97`
69 | 
70 | - **F1:**
71 |   - *What is it?* harmonic mean between sensitivity and precision.
72 |   - *How is it calculated?* `2*TP / (2*TP + FP + FN)`
73 |   - *What does it say about the model?* Overall model performance, conceptually similar to accuracy.
74 |   - *When to use?* Only when working with a balanced dataset (same number of positive and negative samples, default AutoPeptideML run with search for bioactive negatives and homology partitioning). If the dataset is not balanced, check `evaluation_data/test_scores.csv` for `f1_weighted`, which is a variation that takes into account the imbalance between the labels.
75 |   - *Value:*
76 |   - *Interpretation of value:*
77 |     - Worse than random: `0-0.45`
78 |     - Random model: `0.45-0.55`
79 |     - Bad model: `0.55-0.7`
80 |     - Acceptable model: `0.7-0.8`
81 |     - Good model: `0.8-0.9`
82 |     - Really good model: `0.9-0.97`
83 |     - Too good a model (please make sure training and evaluation sets are independent): `>0.97`
84 | 
85 | - **Matthews correlation coefficient:**
86 |   - *What is it?* correlation between the predictions of the model and the actual true labels.
87 |   - *How is it calculated?* `(TP*TN - FP*FN) / √[(TP+FP)(TP+FN)(TN+FP)(TN+FN)]`
88 |   - *What does it say about the model?* Overall model performance, conceptually similar to accuracy.
89 |   - *When to use?* Any case, particularly with binary classification.
90 |   - *Value:*
91 |   - *Interpretation of value:*
92 |     - Worse than random: `< -0.2`
93 |     - Random model: `-0.2-0.2`
94 |     - Bad model: `0.2-0.3`
95 |     - Acceptable model: `0.3-0.4`
96 |     - Good model: `0.4-0.7`
97 |     - Really good model: `0.7-0.95`
98 |     - Too good a model (please make sure training and evaluation sets are independent): `>0.95`
99 | 
100 | ## 3. Calibration curve
101 | 
102 | The calibration curve indicates whether the `score` obtained from the predictions of the ensemble can be considered a probability of the sample being positive, i.e., whether a higher `score` represents a greater likelihood of the sample being positive.
103 | 
104 | If the `Classifier 1` curve follows the dotted diagonal curve (`Perfectly calibrated`), then the `score` values can be considered a probability. Otherwise, they cannot. If the curve approximates the diagonal in one region but not in another (e.g., it is well calibrated below 0.5 and not above it), the `score` can only be considered a probability when it falls within the well-calibrated region.
105 | 
106 | ![Calibration curve](./figures/calibration_curve.png)
107 | 
108 | ## 4. Receiver-operating characteristic curve (ROC)
109 | 
110 | The ROC curve represents the sensitivity (see **sensitivity or recall** above) of the predictor against the false positive rate (the proportion of observations that are incorrectly predicted to be positive out of all negative observations: `FP/(TN+FP)`). The closer the curve is to the upper-left corner, the better the model. There is also a metric associated with this curve, the **AUROC** (area under the ROC curve), which is often used in the ML community.
111 | 
112 | ![AUROC](./figures/roc_curve.png)
113 | 
114 | ## 5. Precision-recall curve
115 | 
116 | The precision-recall curve represents the precision against the sensitivity/recall and provides an idea of the trade-offs the model makes between the two measurements. The area under the curve is also a common evaluation metric in the ML community.
117 | 
118 | ![Precision-recall curve](./figures/precision_recall_curve.png)
119 | 
120 | # Credit
121 | 
122 | AutoPeptideML has been developed and is maintained by [Raul Fernandez-Diaz](https://www.linkedin.com/in/raul-fernandez-diaz-939440203/), PhD Student at UCD and IBM Research, under the supervision of Denis C. Shields (UCD Conway Institute and School of Medicine) and Thanh Lam Hoang (IBM Research).
123 | 
124 | If you have found the tool useful, consider citing our paper:
125 | 
--------------------------------------------------------------------------------
/autopeptideml/db/__init__.py:
--------------------------------------------------------------------------------
1 | from .db import Database
--------------------------------------------------------------------------------
/autopeptideml/db/db.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 | 
3 | import pandas as pd
4 | import numpy as np
5 | 
6 | from ..pipeline import Pipeline
7 | from ..pipeline.smiles import is_smiles
8 | 
9 | from tqdm import tqdm
10 | 
11 | 
12 | class Database:
13 |     """
14 |     Class that handles dataset operations within AutoPeptideML.
15 |     """
16 |     df: pd.DataFrame
17 |     # `pipe` can be a single Pipeline or a dictionary mapping field to Pipeline
18 |     pipe: Union[Pipeline, Dict[str, Pipeline]]
19 |     # `feat_fields` can be a single field or a list of fields (e.g., ['seq', 'smiles'])
20 |     feat_fields: Union[str, List[str]]
21 |     label_field: Optional[str]
22 | 
23 |     def __init__(
24 |         self,
25 |         path: Optional[str] = None,
26 |         df: Optional[pd.DataFrame] = None,
27 |         feat_fields: Union[str, List[str]] = None,
28 |         pipe: Optional[Union[Pipeline, Dict[str, Pipeline]]] = None,
29 |         label_field: Optional[str] = None,
30 |         verbose: bool = False,
31 |         seed: int = 1
32 |     ):
33 |         """Initializes a Database instance.
34 | 
35 |         :type path: Optional[str]
36 |         :param path: Path to the CSV file containing the dataset. If provided, the dataset will be loaded from this path.
37 | 
38 |         :type df: Optional[pd.DataFrame]
39 |         :param df: The dataset represented as a pandas DataFrame. If `path` is provided, this will be ignored.
40 | 
41 |         :type pipe: Union[Pipeline, Dict[str, Pipeline]]
42 |         :param pipe: A preprocessing pipeline or a dictionary of feature fields mapped to their respective pipelines.
43 |             If not provided, no preprocessing is applied.
44 | 
45 |         :type feat_fields: Union[str, List[str]]
46 |         :param feat_fields: A single feature field or a list of feature fields (e.g., `['seq', 'smiles']`)
47 |             used for processing and model input. This parameter is required.
48 | 
49 |         :type label_field: Optional[str]
50 |         :param label_field: The name of the column representing labels in the dataset. If `None`, no label column is specified.
51 | 
52 |         :type verbose: bool
53 |         :param verbose: Enables verbose output if set to `True`. Logs detailed preprocessing steps. Default is `False`.
54 | 
55 |         """
56 |         if path is not None:
57 |             self.df = pd.read_csv(path)
58 |         else:
59 |             self.df = df
60 |         if feat_fields is None:
61 |             raise ValueError('`feat_fields` cannot be left empty')
62 |         if isinstance(feat_fields, str):
63 |             feat_fields = [feat_fields]
64 |         if (not isinstance(pipe, dict) and pipe is not None):
65 |             self.pipe = {field: pipe for field in feat_fields}
66 |         else:
67 |             self.pipe = pipe
68 |         self.seed = seed
69 |         self.label_field = label_field
70 |         self.feat_fields = feat_fields
71 |         self.verbose = verbose
72 |         self._preprocess(verbose)
73 | 
74 |     def draw_samples(
75 |         self,
76 |         target_db: "Database",
77 |         columns_to_exclude: Optional[Union[List[str], str]] = None
78 |     ) -> pd.DataFrame:
79 |         """
80 |         Draws samples from the current database to match the distribution of the target database.
81 |         Excludes specified columns if provided.
82 | 
83 |         :type target_db: Database
84 |         :param target_db: The target `Database` whose distribution is used to sample data.
85 | 86 | :type columns_to_exclude: Optional[Union[List[str], str]] 87 | :param columns_to_exclude: A single column or list of columns to exclude from sampling. If `None`, no columns are excluded. 88 | 89 | :rtype: pd.DataFrame 90 | :return: A DataFrame containing the sampled data matching the target database distribution. 91 | """ 92 | if columns_to_exclude is not None: 93 | self._filter(columns_to_exclude) 94 | 95 | target_hist = target_db._hist() 96 | hist = self._hist() 97 | 98 | entries = {field: [] for field in self.feat_fields} 99 | left_out = 0 100 | for idx, h in enumerate(target_hist): 101 | if idx > len(hist): 102 | break 103 | tmp_df = self.df.iloc[hist[idx]] 104 | tgt_df = target_db.df.iloc[h] 105 | 106 | if len(tmp_df) < len(tgt_df): 107 | left_out += len(tgt_df) - len(tmp_df) 108 | elif len(tmp_df) > len(tgt_df) + np.abs(left_out): 109 | if left_out < 0: 110 | smp = len(tgt_df) 111 | else: 112 | smp = len(tgt_df) + left_out 113 | tmp_df = tmp_df.sample(smp, replace=False, random_state=self.seed) 114 | left_out = 0 115 | else: 116 | smp = len(tmp_df) - len(tgt_df) 117 | tmp_df = tmp_df.sample(smp, replace=False, random_state=self.seed) 118 | for field in self.feat_fields: 119 | entries[field].extend(tmp_df[field].tolist()) 120 | 121 | entries_df = pd.DataFrame(entries) 122 | for field in self.feat_fields: 123 | entries_df.drop_duplicates(field, inplace=True) 124 | return entries_df 125 | 126 | def add_negatives( 127 | self, other: "Database", 128 | columns_to_exclude: Optional[Union[List[str], str]] = None 129 | ): 130 | """ 131 | Adds negative samples to the current database using another database. 132 | The label for negative samples is set to `0`. 133 | 134 | :type other: Database 135 | :param other: The source `Database` from which negative samples are drawn. 136 | 137 | :type columns_to_exclude: Optional[Union[List[str], str]] 138 | :param columns_to_exclude: A single column or list of columns to exclude during sampling. If `None`, no columns are excluded. 139 | 140 | :rtype: None 141 | :return: Updates the current database with the added negative samples. 142 | """ 143 | other.df = other.draw_samples(self, columns_to_exclude) 144 | if self.label_field is None: 145 | self.label_field = "Y" 146 | self.df[self.label_field] = 1 147 | 148 | other.df[self.label_field] = 0 149 | if other.feat_fields[0] != self.feat_fields[0]: 150 | other.df[self.feat_fields[0]] = other.df[other.feat_fields[0]] 151 | self.df = pd.concat([self.df, other.df]) 152 | self.df = self.df[[self.label_field, *self.feat_fields]] 153 | 154 | def _check_fields(self): 155 | """ 156 | Validates that all feature fields exist in the dataset. 157 | 158 | :rtype: None 159 | :return: Raises a `KeyError` if any feature field is missing from the dataset. 160 | """ 161 | for field in self.feat_fields: 162 | if field not in self.df.columns: 163 | raise KeyError( 164 | f"Field: {field} is not in df", 165 | f"df columns are: {', '.join(self.df.columns.tolist())}" 166 | ) 167 | 168 | def _get_mw(self): 169 | """ 170 | Computes the molecular weight (MW) for each entry in the dataset using RDKit. 171 | 172 | :rtype: None 173 | :return: Adds a `tmp_mw` column to the dataset with computed molecular weights. 
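        Example (illustrative): for a sequence entry `'AG'`,
        `Descriptors.ExactMolWt(Chem.MolFromFASTA('AG'))` evaluates to
        approximately `146.07`.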
174 | """ 175 | try: 176 | from rdkit import Chem 177 | from rdkit.Chem import Descriptors 178 | except ImportError: 179 | raise ImportError("Rdkit is required for this function", 180 | "Please install: `pip install rdkit`") 181 | item = self.df.iloc[0, :] 182 | for field in self.feat_fields: 183 | if is_smiles(item[field]): 184 | self.df['tmp_mw'] = self.df[field].map( 185 | lambda x: Descriptors.ExactMolWt( 186 | Chem.MolFromSmiles(x) 187 | ) 188 | ) 189 | else: 190 | self.df['tmp_mw'] = self.df[field].map( 191 | lambda x: Descriptors.ExactMolWt( 192 | Chem.MolFromFASTA(x) 193 | ) 194 | ) 195 | 196 | def _preprocess(self, verbose): 197 | """ 198 | Applies preprocessing steps to the dataset, including field validation and pipeline execution. 199 | 200 | :type verbose: bool 201 | :param verbose: Enables verbose output if set to `True`. 202 | 203 | :rtype: None 204 | :return: Updates the dataset with preprocessed feature fields. 205 | """ 206 | self._check_fields() 207 | if verbose: 208 | print("Preprocessing database") 209 | if self.pipe is not None: 210 | for field in self.feat_fields: 211 | self.df[field] = self.pipe[field](self.df[field], verbose=verbose) 212 | self._get_mw() 213 | 214 | def _filter(self, columns: Union[List[str], str]): 215 | """ 216 | Filters out rows where specified columns contain the value `1`. 217 | 218 | :type columns: Union[List[str], str] 219 | :param columns: A single column or list of columns to filter. 220 | 221 | :rtype: None 222 | :return: Updates the dataset after filtering. 223 | """ 224 | if isinstance(columns, str): 225 | columns = [columns] 226 | for column in columns: 227 | self.df = self.df[self.df[column] != 1].copy().reset_index(drop=True) 228 | 229 | def _hist(self) -> List[np.ndarray]: 230 | """ 231 | Creates histograms based on molecular weight ranges for the dataset. 232 | 233 | :rtype: List[np.ndarray] 234 | :return: A list of boolean arrays indicating the molecular weight bins. 235 | """ 236 | av_mw_aa = 110 237 | step = 5 * av_mw_aa 238 | max_mw = int(self.df['tmp_mw'].max()) 239 | out = [] 240 | if self.verbose: 241 | pbar = tqdm(range(0, max_mw, step), desc='Computing MW') 242 | else: 243 | pbar = range(0, max_mw, step) 244 | for mw in pbar: 245 | cond = ((self.df.tmp_mw > mw) & (self.df.tmp_mw <= mw + step)).to_numpy() 246 | cond = cond.astype(np.bool_) 247 | out.append(cond) 248 | return out 249 | 250 | def __len__(self) -> int: 251 | """ 252 | Returns the number of rows in the dataset. 253 | 254 | :rtype: int 255 | :return: The number of rows in the dataset. 256 | """ 257 | return len(self.df) 258 | 259 | def __getitem__(self, idx: int) -> pd.Series: 260 | """ 261 | Retrieves a row from the dataset by index, returning only the feature fields and the label field, if specified. 262 | 263 | :type idx: int 264 | :param idx: The index of the row to retrieve. 265 | 266 | :rtype: pd.Series 267 | :return: A series containing the feature fields and the label field if specified for the specified row. 
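        Example (illustrative; assumes `rdkit` is installed and a CSV file
        `peptides.csv` with a `sequence` column exists):

            >>> db = Database(path='peptides.csv', feat_fields='sequence')
            >>> db[0]  # pd.Series with the feature fields of the first row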
268 | """ 269 | item = self.df.iloc[idx] 270 | if self.label_field is None: 271 | return item[self.feat_fields] 272 | else: 273 | return item[self.feat_fields + self.label_field] 274 | 275 | def __str__(self) -> str: 276 | return str(self.df.head()) 277 | -------------------------------------------------------------------------------- /autopeptideml/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import time 4 | import yaml 5 | 6 | from typing import * 7 | 8 | import pandas as pd 9 | import typer 10 | 11 | from .autopeptideml import AutoPeptideML, __version__ 12 | from .config import config_helper 13 | 14 | 15 | app = typer.Typer() 16 | 17 | 18 | @app.command() 19 | def build_model(config_path: Optional[str] = None): 20 | """ 21 | Build a machine learning model based on the provided configuration. If no configuration is provided 22 | the configuration helper will prompt you for more details about the job you want to run. 23 | 24 | Args: 25 | config_path (str, optional): Path to the configuration file. Defaults to None. 26 | 27 | Returns: 28 | None 29 | """ 30 | if config_path is not None: 31 | config = yaml.safe_load(open(config_path)) 32 | mssg = f"| AutoPeptideML v.{__version__} |" 33 | print("-"*(len(mssg))) 34 | print(mssg) 35 | print("-"*(len(mssg))) 36 | 37 | else: 38 | config_path = prepare_config() 39 | config = yaml.safe_load(open(config_path)) 40 | print("** Model Builder **") 41 | apml = AutoPeptideML(config) 42 | db = apml.get_database() 43 | reps = apml.get_reps() 44 | test = apml.get_test() 45 | models = apml.run_hpo() 46 | r_df = apml.run_evaluation(models) 47 | apml.save_experiment(save_reps=True, save_test=False) 48 | print(r_df) 49 | 50 | 51 | @app.command() 52 | def prepare_config() -> dict: 53 | mssg = f"| AutoPeptideML v.{__version__} |" 54 | print("-"*(len(mssg))) 55 | print(mssg) 56 | print("-"*(len(mssg))) 57 | print("** Experiment Builder **") 58 | print("Please, answer the following questions to design your experiment.") 59 | 60 | config_path = config_helper() 61 | return config_path 62 | 63 | 64 | @app.command() 65 | def predict(experiment_dir: str, features_path: str, feature_field: str, 66 | output_path: str = 'apml_predictions.csv'): 67 | config_path = osp.join(experiment_dir, 'config.yml') 68 | if not osp.exists(config_path): 69 | raise FileNotFoundError("Configuration file was not found in experiment dir.") 70 | config = yaml.safe_load(open(config_path)) 71 | apml = AutoPeptideML(config) 72 | df = pd.read_csv(features_path) 73 | results_df = apml.predict( 74 | df, feature_field=feature_field, 75 | experiment_dir=experiment_dir, backend='onnx' 76 | ) 77 | results_df.to_csv(output_path, index=False, float_format="%.3g") 78 | 79 | 80 | def _main(): 81 | app() 82 | 83 | 84 | if __name__ == "__main__": 85 | _main() 86 | -------------------------------------------------------------------------------- /autopeptideml/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline import Pipeline, BaseElement 2 | from .sequence import CanonicalCleaner, CanonicalFilter 3 | from .smiles import SequenceToSMILES, FilterSMILES, SmilesToSequence 4 | -------------------------------------------------------------------------------- /autopeptideml/pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | import json 2 | import yaml 3 | from typing import * 4 | 5 | from concurrent.futures 
import ThreadPoolExecutor 6 | from multiprocessing import cpu_count 7 | from tqdm import tqdm 8 | 9 | 10 | class BaseElement: 11 | """ 12 | Class `BaseElement` provides a foundation for implementing molecular processing elements. 13 | It supports both single and parallel processing of molecular data, making it suitable for operations 14 | that can be applied to molecular representations such as SMILES strings. 15 | 16 | Attributes: 17 | :type name: str 18 | :param name: The name of the processing element. 19 | 20 | :type properties: Dict[str, Any] 21 | :param properties: A dictionary of additional properties for the processing element. 22 | Default is an empty dictionary. 23 | """ 24 | name: str 25 | properties: Dict[str, Any] = {} 26 | 27 | def __call__(self, mol: Union[str, List[str]], 28 | n_jobs: int = cpu_count(), 29 | verbose: bool = False) -> Union[str, List[str]]: 30 | """ 31 | Processes molecular data, either as a single molecule or a list of molecules. 32 | Automatically selects single or parallel processing based on the input type. 33 | 34 | :type mol: Union[str, List[str]] 35 | :param mol: A single molecular representation (e.g., SMILES string) or a list of such representations. 36 | 37 | :type n_jobs: int 38 | :param n_jobs: The number of parallel jobs to use for processing. Default is the number of CPU cores. 39 | 40 | :type verbose: bool 41 | :param verbose: Enables verbose output if set to `True`, displaying a progress bar for parallel processing. 42 | Default is `False`. 43 | 44 | :rtype: Union[str, List[str]] 45 | :return: The processed molecular representation(s). 46 | """ 47 | if isinstance(mol, str): 48 | return self._single_call(mol) 49 | elif len(mol) == 0: 50 | return mol 51 | else: 52 | return self._parallel_call(mol, n_jobs=n_jobs, 53 | verbose=verbose) 54 | 55 | def _single_call(self, mol: str) -> str: 56 | """ 57 | Processes a single molecular representation. 58 | Must be implemented in a subclass. 59 | 60 | :type mol: str 61 | :param mol: A single molecular representation (e.g., SMILES string). 62 | 63 | :rtype: str 64 | :return: The processed molecular representation. 65 | 66 | :raises NotImplementedError: If the method is not implemented in a subclass. 67 | """ 68 | raise NotImplementedError 69 | 70 | def _clean(self, mol: List[Optional[str]]) -> List[str]: 71 | """ 72 | Cleans the processed molecular data by removing `None` values. 73 | 74 | :type mol: List[Optional[str]] 75 | :param mol: A list of processed molecular representations, some of which may be `None`. 76 | 77 | :rtype: List[str] 78 | :return: A cleaned list of molecular representations without `None` values. 79 | """ 80 | return [m for m in mol if m is not None] 81 | 82 | def _parallel_call(self, mol: List[str], n_jobs: int, 83 | verbose: bool) -> List[str]: 84 | """ 85 | Processes a list of molecular representations in parallel using a thread pool. 86 | 87 | :type mol: List[str] 88 | :param mol: A list of molecular representations (e.g., SMILES strings) to process. 89 | 90 | :type n_jobs: int 91 | :param n_jobs: The number of parallel jobs to use for processing. If set to `1`, processes sequentially. 92 | 93 | :type verbose: bool 94 | :param verbose: Enables verbose output if set to `True`, displaying a progress bar for parallel processing. 95 | 96 | :rtype: List[str] 97 | :return: A list of processed molecular representations. 98 | 99 | :raises RuntimeError: If any parallel job raises an exception. 
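        Example (illustrative; `Upper` is a hypothetical subclass whose
        `_single_call` returns `mol.upper()`):

            >>> Upper()._parallel_call(['ac', 'gt'], n_jobs=2, verbose=False)
            ['AC', 'GT']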
100 | """ 101 | if n_jobs > 1: 102 | jobs, out = [], [] 103 | with ThreadPoolExecutor(n_jobs) as exec: 104 | for item in mol: 105 | job = exec.submit(self._single_call, item) 106 | jobs.append(job) 107 | 108 | if verbose: 109 | pbar = tqdm(jobs, unit_scale=True) 110 | else: 111 | pbar = jobs 112 | 113 | for job in pbar: 114 | if job.exception() is not None: 115 | raise RuntimeError(job.exception()) 116 | out.append(job.result()) 117 | else: 118 | out = [] 119 | for item in mol: 120 | out.append(self._single_call(item)) 121 | return self._clean(out) 122 | 123 | 124 | class Pipeline: 125 | """ 126 | Class `Pipeline` represents a sequence of molecular processing steps, where each step is defined by an element 127 | (`BaseElement` or another `Pipeline`). The pipeline can process molecular data sequentially and optionally 128 | aggregate results across all steps. 129 | 130 | Attributes: 131 | :type elements: Union[List[BaseElement], List[Pipeline]] 132 | :param elements: A list of `BaseElement` or `Pipeline` instances that define the processing steps. 133 | 134 | :type name: str 135 | :param name: The name of the pipeline. Default is `'pipeline'`. 136 | 137 | :type aggregate: bool 138 | :param aggregate: If `True`, the pipeline aggregates results from all steps. 139 | If `False`, the results of one step are passed to the next. Default is `False`. 140 | """ 141 | def __init__(self, elements: Union[List[BaseElement], List["Pipeline"]], 142 | name: str = 'pipeline', 143 | aggregate: bool = False): 144 | """ 145 | Initializes the pipeline with a sequence of processing elements and configuration. 146 | 147 | :type elements: Union[List[BaseElement], List[Pipeline]] 148 | :param elements: A list of `BaseElement` or `Pipeline` instances to define the processing steps. 149 | 150 | :type name: str 151 | :param name: The name of the pipeline. Default is `'pipeline'`. 152 | 153 | :type aggregate: bool 154 | :param aggregate: If `True`, results from all steps are aggregated. If `False`, results of one step 155 | are passed to the next. Default is `False`. 156 | 157 | :rtype: None 158 | """ 159 | self.elements = elements 160 | self.name = name 161 | self.properties = {name: { 162 | 'name': name, 163 | 'aggregate': aggregate, 164 | 'elements': [{e.name: e.properties} for e in elements]} 165 | } 166 | self.properties['aggregate'] = aggregate 167 | self.aggregate = aggregate 168 | 169 | def __str__(self) -> str: 170 | """ 171 | Returns a JSON string representation of the pipeline's properties. 172 | 173 | :rtype: str 174 | :return: A JSON string representing the pipeline's configuration and properties. 175 | """ 176 | return json.dumps(self.properties, indent=3) 177 | 178 | def __call__(self, mols: List[str], 179 | n_jobs: int = cpu_count(), 180 | verbose: bool = False): 181 | """ 182 | Processes a list of molecular representations using the pipeline. 183 | 184 | :type mols: List[str] 185 | :param mols: A list of molecular representations (e.g., SMILES strings) to process. 186 | 187 | :type n_jobs: int 188 | :param n_jobs: The number of parallel jobs to use for processing. Default is the number of CPU cores. 189 | 190 | :type verbose: bool 191 | :param verbose: Enables verbose output if set to `True`. Displays progress and step information. 192 | 193 | :rtype: Union[List[str], List[List[str]]] 194 | :return: Processed molecular data. If `aggregate` is `True`, returns aggregated results from all steps. 195 | Otherwise, returns the final processed molecular data. 
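        Example (a minimal sketch; `CanonicalCleaner` is the element defined
        in `pipeline/sequence.py` and assumes the package and its optional
        dependencies are installed):

            >>> from autopeptideml.pipeline import Pipeline, CanonicalCleaner
            >>> pipe = Pipeline([CanonicalCleaner(substitution='X')], name='clean')
            >>> pipe(['ACDF', 'AC1F'], n_jobs=1)
            ['ACDF', 'ACXF']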
196 | """ 197 | original_mols = mols 198 | aggregation = [] 199 | for idx, e in enumerate(self.elements): 200 | if verbose: 201 | print(f"Executing preprocessing step {idx+1} of", 202 | f"{len(self.elements)}: {e.name}") 203 | if self.aggregate: 204 | mols = e(original_mols, n_jobs=n_jobs, verbose=verbose) 205 | aggregation.extend(mols) 206 | else: 207 | mols = e(mols, n_jobs=n_jobs, verbose=verbose) 208 | 209 | if verbose and not self.aggregate: 210 | print(f'Total molecules removed: {len(original_mols)-len(mols):,}') 211 | 212 | if self.aggregate: 213 | return aggregation 214 | else: 215 | return mols 216 | 217 | def save(self, filename: str): 218 | """ 219 | Saves the pipeline's properties to a YAML file. 220 | 221 | :type filename: str 222 | :param filename: The name of the file to save the pipeline's properties. 223 | 224 | :rtype: None 225 | """ 226 | yaml.safe_dump(self.properties, open(filename, 'w')) 227 | 228 | @classmethod 229 | def load(self, filename: str, element_registry: dict): 230 | """ 231 | Loads a pipeline from a YAML file and reconstructs its elements using a registry. 232 | 233 | :type filename: str 234 | :param filename: The name of the file containing the saved pipeline properties. 235 | 236 | :type element_registry: Dict[str, Callable] 237 | :param element_registry: A dictionary mapping element names to their constructor functions. 238 | 239 | :rtype: Pipeline 240 | :return: A reconstructed `Pipeline` instance based on the saved properties. 241 | """ 242 | self.properties = json.load(open(filename)) 243 | elements = [] 244 | for e, e_prop in self.config.items(): 245 | elements.append(element_registry[e](**e_prop)) 246 | return Pipeline(elements) 247 | -------------------------------------------------------------------------------- /autopeptideml/pipeline/sequence.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | from .pipeline import BaseElement 4 | 5 | 6 | RESIDUES = { 7 | 'V': 'VAL', 'I': 'ILE', 'L': 'LEU', 'E': 'GLU', 'Q': 'GLN', 8 | 'D': 'ASP', 'N': 'ASN', 'H': 'HIS', 'W': 'TRP', 'F': 'PHE', 9 | 'Y': 'TYR', 'R': 'ARG', 'K': 'LYS', 'S': 'SER', 'T': 'THR', 10 | 'M': 'MET', 'A': 'ALA', 'G': 'GLY', 'P': 'PRO', 'C': 'CYS' 11 | } 12 | 13 | 14 | def is_canonical(sequence: str): 15 | if not (len(sequence) > 0): 16 | return False 17 | for char in sequence: 18 | if char not in RESIDUES: 19 | return False 20 | return True 21 | 22 | 23 | class CanonicalCleaner(BaseElement): 24 | """ 25 | Class `CanonicalCleaner` is a molecular processing element that standardizes molecular representations 26 | by replacing non-canonical residues with a specified substitution character. 27 | 28 | Attributes: 29 | :type name: str 30 | :param name: The name of the element. Default is `'canonical-cleaner'`. 31 | 32 | :type substitution: str 33 | :param substitution: The character used to replace non-canonical residues. Default is `'X'`. 34 | """ 35 | name = 'canonical-cleaner' 36 | 37 | def __init__(self, substitution: str = 'X'): 38 | """ 39 | Initializes the `CanonicalCleaner` with a substitution character. 40 | 41 | :type substitution: str 42 | :param substitution: The character used to replace non-canonical residues. Default is `'X'`. 43 | 44 | :rtype: None 45 | """ 46 | self.sub = substitution 47 | self.properties = {'substitution': substitution} 48 | 49 | def _single_call(self, mol: str) -> str: 50 | """ 51 | Cleans a single molecular representation by replacing non-canonical residues. 
52 | 53 | :type mol: str 54 | :param mol: A single molecular representation (e.g., a sequence of residues). 55 | 56 | :rtype: str 57 | :return: The cleaned molecular representation with non-canonical residues replaced by the substitution. 58 | """ 59 | return ''.join([c if c in RESIDUES else self.sub for c in mol]) 60 | 61 | 62 | class CanonicalFilter(BaseElement): 63 | """ 64 | Class `CanonicalFilter` is a molecular processing element that filters molecular representations based on 65 | their canonicality. It can either keep or discard canonical molecules based on the configuration. 66 | 67 | Attributes: 68 | :type name: str 69 | :param name: The name of the element. Default is `'canonical-filter'`. 70 | 71 | :type keep_canonical: bool 72 | :param keep_canonical: Determines whether to keep canonical molecules (`True`) or discard them (`False`). 73 | Default is `True`. 74 | """ 75 | name = 'canonical-filter' 76 | 77 | def __init__(self, keep_canonical: bool = True): 78 | """ 79 | Initializes the `CanonicalFilter` with a configuration to keep or discard canonical molecules. 80 | 81 | :type keep_canonical: bool 82 | :param keep_canonical: Determines whether to keep canonical molecules (`True`) or discard them (`False`). 83 | Default is `True`. 84 | 85 | :rtype: None 86 | """ 87 | self.keep_canonical = keep_canonical 88 | 89 | def _single_call(self, mol: str) -> Union[str, None]: 90 | """ 91 | Filters a single molecular representation based on its canonicality. 92 | 93 | :type mol: str 94 | :param mol: A single molecular representation (e.g., a sequence of residues). 95 | 96 | :rtype: Union[str, None] 97 | :return: The molecule if it meets the canonicality condition, or `None` otherwise. 98 | """ 99 | if not (len(mol) > 0): 100 | return None 101 | if ((is_canonical(mol) and self.keep_canonical) or 102 | (not is_canonical(mol) and not self.keep_canonical)): 103 | return mol 104 | else: 105 | return None 106 | -------------------------------------------------------------------------------- /autopeptideml/pipeline/smiles.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | from .pipeline import BaseElement 4 | try: 5 | import rdkit.Chem.rdmolfiles as rdm 6 | except ImportError: 7 | raise ImportError("You need to install rdkit to use this method.", 8 | " Try: `pip install rdkit`") 9 | 10 | 11 | def is_smiles(mol: str): 12 | return ( 13 | '(' in mol or ')' in mol or 14 | '[' in mol or ']' in mol or 15 | '@' in mol or 'O' in mol 16 | ) 17 | 18 | 19 | class SequenceToSMILES(BaseElement): 20 | """ 21 | Class `SequenceToSMILES` converts peptide sequences (e.g., FASTA format) into SMILES (Simplified Molecular Input Line Entry System) representations using RDKit. 22 | 23 | Attributes: 24 | :type name: str 25 | :param name: The name of the element. Default is `'sequence-to-smiles'`. 26 | """ 27 | name = 'sequence-to-smiles' 28 | 29 | def _single_call(self, mol): 30 | """ 31 | Converts a single peptide sequence into a SMILES representation. 32 | 33 | :type mol: str 34 | :param mol: A peptide sequence (e.g., FASTA format). 35 | 36 | :rtype: str 37 | :return: The SMILES representation of the molecule. 38 | 39 | :raises RuntimeError: If the molecule cannot be read by RDKit. 
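        Example (illustrative; requires `rdkit`):

            >>> SequenceToSMILES()('G')
            'NCC(=O)O'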
40 | """ 41 | rd_mol = rdm.MolFromFASTA(mol) 42 | if rd_mol is None: 43 | raise RuntimeError(f'Molecule: {mol} could not be read by RDKit.', 44 | 'Maybe introduce a filtering step in your pipeline') 45 | return rdm.MolToSmiles(rd_mol, canonical=True, isomericSmiles=True) 46 | 47 | 48 | class SmilesToSequence(BaseElement): 49 | try: 50 | from pepfunn.sequence import peptideFromSMILES 51 | except ImportError: 52 | raise ImportError("This class requires PepFuNN to be installed. Please try: `pip install git+https://github.com/novonordisk-research/pepfunn.git`") 53 | 54 | name = 'smiles-to-sequence' 55 | fun = peptideFromSMILES 56 | 57 | def _single_call(self, mol): 58 | return ''.join(self.fun(mol).split('-')) 59 | 60 | 61 | class FilterSMILES(BaseElement): 62 | """ 63 | Class `FilterSMILES` filters molecular representations based on whether they are valid SMILES strings. 64 | It can either retain or discard SMILES strings based on the configuration. 65 | 66 | Attributes: 67 | :type name: str 68 | :param name: The name of the element. Default is `'filter-smiles'`. 69 | 70 | :type keep_smiles: Optional[bool] 71 | :param keep_smiles: Determines whether to retain valid SMILES strings (`True`) or discard them (`False`). 72 | Default is `True`. 73 | """ 74 | name = 'filter-smiles' 75 | 76 | def __init__(self, keep_smiles: Optional[bool] = True): 77 | """ 78 | Initializes the `FilterSMILES` element with a configuration to retain or discard SMILES strings. 79 | 80 | :type keep_smiles: Optional[bool] 81 | :param keep_smiles: Determines whether to retain valid SMILES strings (`True`) or discard them (`False`). 82 | Default is `True`. 83 | 84 | :rtype: None 85 | """ 86 | self.properties['keep_smiles'] = keep_smiles 87 | self.keep_smiles = keep_smiles 88 | 89 | def _single_call(self, mol: str): 90 | """ 91 | Filters a single molecular representation based on its validity as a SMILES string. 92 | 93 | :type mol: str 94 | :param mol: A molecular representation to evaluate. 95 | 96 | :rtype: Union[str, None] 97 | :return: The molecule if it meets the SMILES validity condition, or `None` otherwise. 98 | """ 99 | if ((is_smiles(mol) and self.keep_smiles) or 100 | (not is_smiles(mol) and not self.keep_smiles)): 101 | return mol 102 | else: 103 | return None 104 | 105 | 106 | class CanonicalizeSmiles(BaseElement): 107 | """ 108 | Class `CanonicalizeSmiles` converts SMILES (Simplified Molecular Input Line Entry System) strings into their canonical forms using RDKit. 109 | 110 | Attributes: 111 | :type name: str 112 | :param name: The name of the element. Default is `'canonicalize-smiles'`. 113 | """ 114 | name = 'canonicalize-smiles' 115 | 116 | def _single_call(self, mol): 117 | """ 118 | Converts a SMILES string into its canonical representation. 119 | 120 | :type mol: str 121 | :param mol: A SMILES string representing a molecule. 122 | 123 | :rtype: str 124 | :return: The canonical SMILES representation of the molecule. 125 | 126 | :raises RuntimeError: If the molecule cannot be read by RDKit. 
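        Example (illustrative; requires `rdkit`):

            >>> CanonicalizeSmiles()('C(C(=O)O)N')
            'NCC(=O)O'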
127 | """ 128 | rd_mol = rdm.MolFromSmiles(mol) 129 | if rd_mol is None: 130 | raise RuntimeError(f'Molecule: {mol} could not be read by RDKit.', 131 | 'Maybe introduce a filtering step in your pipeline') 132 | return rdm.MolToSmiles(rd_mol, canonical=True, isomericSmiles=True) 133 | -------------------------------------------------------------------------------- /autopeptideml/reps/__init__.py: -------------------------------------------------------------------------------- 1 | from .engine import RepEngineBase 2 | from .seq_based import RepEngineOnehot 3 | -------------------------------------------------------------------------------- /autopeptideml/reps/engine.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import yaml 4 | from typing import * 5 | 6 | import numpy as np 7 | from tqdm import tqdm 8 | 9 | try: 10 | from itertools import batched 11 | except ImportError: 12 | from itertools import islice 13 | 14 | def batched(iterable, n, *, strict=False): 15 | # batched('ABCDEFG', 3) → ABC DEF G 16 | if n < 1: 17 | raise ValueError('n must be at least one') 18 | iterator = iter(iterable) 19 | while batch := tuple(islice(iterator, n)): 20 | if strict and len(batch) != n: 21 | raise ValueError('batched(): incomplete batch') 22 | yield batch 23 | 24 | 25 | class RepEngineBase: 26 | """ 27 | Class `RepEngineBase` is an abstract base class for implementing molecular representation engines. 28 | It defines a framework for computing molecular representations in batches and includes utilities for 29 | serialization and property management. 30 | 31 | Attributes: 32 | :type engine: str 33 | :param engine: The name of the representation engine. 34 | 35 | :type rep: str 36 | :param rep: The type of molecular representation (e.g., fingerprint, embedding). 37 | 38 | :type properties: dict 39 | :param properties: A dictionary containing the engine's properties, including configuration arguments passed during initialization. 40 | """ 41 | engine: str 42 | 43 | def __init__(self, rep: str, **args): 44 | """ 45 | Initializes the `RepEngineBase` with the specified representation type and additional configuration arguments. 46 | 47 | :type rep: str 48 | :param rep: The type of molecular representation (e.g., fingerprint, embedding). 49 | 50 | :type **args: dict 51 | :param **args: Additional arguments for configuring the representation engine. 52 | 53 | :rtype: None 54 | """ 55 | self.rep = rep 56 | self.__dict__.update(args) 57 | self.properties = copy.deepcopy(self.__dict__) 58 | 59 | def compute_reps(self, mols: List[str], 60 | verbose: Optional[bool] = False, 61 | batch_size: Optional[int] = 12) -> Union[np.ndarray, List[np.ndarray]]: 62 | """ 63 | Computes molecular representations for a list of molecules in batches. 64 | 65 | :type mols: List[str] 66 | :param mols: A list of molecular representations (e.g., SMILES strings). 67 | 68 | :type verbose: Optional[bool] 69 | :param verbose: If `True`, displays a progress bar during batch processing. Default is `False`. 70 | 71 | :type batch_size: Optional[int] 72 | :param batch_size: The size of each batch for processing. Default is `12`. 73 | 74 | :rtype: Union[np.ndarray, List[np.ndarray]] 75 | :return: A stacked NumPy array of computed representations, or a list of arrays if pooling is disabled. 
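        Example (a sketch using the one-hot subclass from `reps/seq_based.py`):

            >>> from autopeptideml.reps import RepEngineOnehot
            >>> engine = RepEngineOnehot(max_length=10)
            >>> engine.compute_reps(['ACDF', 'KLM'], batch_size=2).shape
            (2, 210)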
76 | """ 77 | batches = batched(mols, batch_size) 78 | out = [] 79 | 80 | if verbose: 81 | pbar = tqdm(list(batches)) 82 | else: 83 | pbar = batches 84 | 85 | for batch in pbar: 86 | batch = self._preprocess_batch(batch) 87 | rep = self._rep_batch(batch) 88 | out.extend(rep) 89 | 90 | if 'average_pooling' in self.__dict__: 91 | if not self.__dict__['average_pooling']: 92 | return out 93 | return np.stack(out) 94 | 95 | def dim(self) -> int: 96 | """ 97 | Returns the dimensionality of the molecular representations. 98 | 99 | :rtype: int 100 | :return: The dimensionality of the computed representations. 101 | 102 | :raises NotImplementedError: This method must be implemented by subclasses. 103 | """ 104 | raise NotImplementedError 105 | 106 | def _rep_batch(self, batch: List[str]) -> np.ndarray: 107 | """ 108 | Computes representations for a batch of molecules. Must be implemented by subclasses. 109 | 110 | :type batch: List[str] 111 | :param batch: A batch of molecular representations (e.g., SMILES strings). 112 | 113 | :rtype: np.ndarray 114 | :return: A NumPy array of computed representations for the batch. 115 | 116 | :raises NotImplementedError: This method must be implemented by subclasses. 117 | """ 118 | raise NotImplementedError 119 | 120 | def _preprocess_batch(self, batch: List[str]) -> List[str]: 121 | """ 122 | Preprocesses a batch of molecules before computing representations. Must be implemented by subclasses. 123 | 124 | :type batch: List[str] 125 | :param batch: A batch of molecular representations (e.g., SMILES strings). 126 | 127 | :rtype: List[str] 128 | :return: A preprocessed list of molecular representations. 129 | 130 | :raises NotImplementedError: This method must be implemented by subclasses. 131 | """ 132 | raise NotImplementedError 133 | 134 | def save(self, filename: str): 135 | """ 136 | Saves the engine's properties to a file in YAML format. 137 | 138 | :type filename: str 139 | :param filename: The path to the file where the properties will be saved. 140 | 141 | :rtype: None 142 | """ 143 | yaml.safe_dump(self.properties, open(filename, 'w')) 144 | 145 | def __str__(self) -> str: 146 | """ 147 | Returns a string representation of the engine's properties in JSON format. 148 | 149 | :rtype: str 150 | :return: A JSON string representation of the engine's properties. 151 | """ 152 | return str(json.dumps(self.properties)) 153 | -------------------------------------------------------------------------------- /autopeptideml/reps/fps.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | import numpy as np 4 | 5 | from .engine import RepEngineBase 6 | try: 7 | import rdkit.Chem.rdmolfiles as rdm 8 | from rdkit.Chem import rdFingerprintGenerator as rfp 9 | except ImportError: 10 | raise ImportError("You need to install rdkit to use this method.", 11 | " Try: `pip install rdkit`") 12 | 13 | 14 | class RepEngineFP(RepEngineBase): 15 | """ 16 | Class `RepEngineFP` is a subclass of `RepEngineBase` designed for computing molecular fingerprints (FPs) 17 | using popular fingerprinting algorithms such as ECFP or FCFP. This engine generates fixed-length bit vectors 18 | representing molecular structures based on their topological features. 19 | 20 | Attributes: 21 | :type engine: str 22 | :param engine: The name of the engine. Default is `'fp'`, indicating a fingerprint-based representation. 23 | 24 | :type nbits: int 25 | :param nbits: The length of the fingerprint bit vector. 
This determines the number of bits in the fingerprint. 26 | 27 | :type radius: int 28 | :param radius: The radius parameter used for fingerprint generation, determining the neighborhood size around each atom. 29 | 30 | :type name: str 31 | :param name: The name of the fingerprint generator, which includes the engine type, `nbits`, and `radius`. 32 | 33 | :type generator: object 34 | :param generator: The fingerprint generator object, loaded based on the specified `rep` type. 35 | """ 36 | engine = 'fp' 37 | 38 | def __init__(self, rep: str, nbits: int, radius: int): 39 | """ 40 | Initializes the `RepEngineFP` with the specified representation type, fingerprint size, and radius. 41 | 42 | :type rep: str 43 | :param rep: The type of fingerprint to generate (e.g., 'ecfp', 'fcfp'). 44 | 45 | :type nbits: int 46 | :param nbits: The length of the fingerprint bit vector. 47 | 48 | :type radius: int 49 | :param radius: The radius of the neighborhood around each atom to consider when generating the fingerprint. 50 | 51 | :rtype: None 52 | """ 53 | super().__init__(rep, nbits=nbits, radius=radius) 54 | self.nbits = nbits 55 | self.radius = radius 56 | self.name = f'{self.engine}-{rep}-{self.nbits}-{self.radius}' 57 | self.generator = self._load_generator(rep) 58 | 59 | def _preprocess_batch(self, batch: List[str]) -> List[str]: 60 | """ 61 | Preprocesses a batch of molecular representations. For this class, no preprocessing is required. 62 | 63 | :type batch: List[str] 64 | :param batch: A list of molecular representations (e.g., SMILES strings). 65 | 66 | :rtype: List[str] 67 | :return: The same batch of molecular representations as input. 68 | """ 69 | return batch 70 | 71 | def _rep_batch(self, batch: List[str]) -> List[np.ndarray]: 72 | """ 73 | Computes the fingerprint for each molecule in a batch and returns the results as a list of NumPy arrays. 74 | 75 | :type batch: List[str] 76 | :param batch: A list of molecular representations (e.g., SMILES strings). 77 | 78 | :rtype: List[np.ndarray] 79 | :return: A list of NumPy arrays representing the molecular fingerprints. 80 | """ 81 | out = [] 82 | for i in batch: 83 | mol = rdm.MolFromSmiles(i) 84 | if mol is None: 85 | fp = np.zeros((1, self.nbits)) 86 | else: 87 | fp = self.generator.GetCountFingerprintAsNumPy(mol) 88 | out.append(fp) 89 | return out 90 | 91 | def _load_generator(self, rep: str): 92 | """ 93 | Loads the appropriate fingerprint generator based on the specified representation type. 94 | 95 | :type rep: str 96 | :param rep: The type of fingerprint to generate (e.g., 'ecfp', 'fcfp'). 97 | 98 | :rtype: object 99 | :return: The fingerprint generator object based on the `rep` type. 100 | 101 | :raises NotImplementedError: If the specified `rep` type is not supported. 
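        Example (illustrative): `RepEngineFP('ecfp', nbits=2048, radius=2)`
        uses a plain Morgan generator, `'fcfp'` adds feature-based atom
        invariants, and including `'count'` in `rep` (e.g., `'ecfp-count'`)
        enables count simulation.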
102 | """ 103 | if 'ecfp' in rep or 'morgan' in rep: 104 | return rfp.GetMorganGenerator(radius=self.radius, 105 | includeChirality=True, 106 | fpSize=self.nbits, 107 | countSimulation='count' in rep) 108 | elif 'fcfp' in rep: 109 | invgen = rfp.GetMorganFeatureAtomInvGen() 110 | return rfp.GetMorganGenerator(radius=self.radius, 111 | fpSize=self.nbits, 112 | includeChirality=True, 113 | atomInvariantsGenerator=invgen, 114 | countSimulation='count' in rep) 115 | else: 116 | raise NotImplementedError( 117 | f'Representation: {rep} is not currently implemented.', 118 | 'Please, request this new feature in the Issues page of the', 119 | 'github repo: https://IBM/AutoPeptideML' 120 | ) 121 | 122 | def dim(self) -> int: 123 | """ 124 | Returns the dimensionality (bit size) of the generated fingerprint. 125 | 126 | :rtype: int 127 | :return: The number of bits in the fingerprint (i.e., `nbits`). 128 | """ 129 | return self.nbits 130 | -------------------------------------------------------------------------------- /autopeptideml/reps/lms.py: -------------------------------------------------------------------------------- 1 | from contextlib import nullcontext 2 | 3 | import numpy as np 4 | import torch 5 | import transformers 6 | from transformers import AutoModel, AutoTokenizer, T5Tokenizer, T5EncoderModel 7 | from typing import * 8 | 9 | from .engine import RepEngineBase 10 | 11 | transformers.logging.set_verbosity(transformers.logging.ERROR) 12 | 13 | 14 | AVAILABLE_MODELS = { 15 | 'esm2_t48_15B_UR50D': 5120, 16 | 'esm2_t36_3B_UR50D': 2560, 17 | 'esm2_t33_650M_UR50D': 1280, 18 | 'esm1b_t33_650M_UR50S': 1280, 19 | 'esm2_t30_150M_UR50D': 640, 20 | 'esm2_t12_35M_UR50D': 480, 21 | 'esm2_t6_8M_UR50D': 320, 22 | 'ESMplusplus_small': 960, 23 | 'ESMplusplus_large': 1152, 24 | 'prot_t5_xxl_uniref50': 1024, 25 | 'prot_t5_xl_half_uniref50-enc': 1024, 26 | 'prot_bert': 1024, 27 | 'ProstT5': 1024, 28 | 'ankh-base': 768, 29 | 'ankh-large': 1536, 30 | 'MoLFormer-XL-both-10pct': 768, 31 | 'ChemBERTa-77M-MLM': 384, 32 | 'PeptideCLM-23M-all': 768 33 | } 34 | 35 | SYNONYMS = { 36 | 'prot-t5-xl': 'prot_t5_xl_half_uniref50-enc', 37 | 'prot-t5-xxl': 'prot_t5_xxl_uniref50', 38 | 'protbert': 'prot_bert', 39 | 'prost-t5': 'ProstT5', 40 | 'esm2-15b': 'esm2_t48_15B_UR50D', 41 | 'esm2-3b': 'esm2_t36_3B_UR50D', 42 | 'esm2-650m': 'esm2_t33_650M_UR50D', 43 | 'esm1b': 'esm1b_t33_650M_UR50S', 44 | 'esm2-150m': 'esm2_t30_150M_UR50D', 45 | 'esm2-35m': 'esm2_t12_35M_UR50D', 46 | 'esm2-8m': 'esm2_t6_8M_UR50D', 47 | 'esmc-300m': 'ESMplusplus_small', 48 | 'esmc-600m': 'ESMplusplus_large', 49 | 'ankh-base': 'ankh-base', 50 | 'ankh-large': 'ankh-large', 51 | 'molformer-xl': 'MoLFormer-XL-both-10pct', 52 | 'chemberta-2': 'ChemBERTa-77M-MLM', 53 | 'peptideclm': 'PeptideCLM-23M-all' 54 | 55 | } 56 | 57 | 58 | class RepEngineLM(RepEngineBase): 59 | """ 60 | Class `RepEngineLM` is a subclass of `RepEngineBase` designed to compute molecular representations 61 | using pre-trained language models (LMs) such as T5, ESM, or ChemBERTa. This engine generates vector-based 62 | embeddings for input sequences, typically protein or peptide sequences, by leveraging transformer-based models. 63 | 64 | Attributes: 65 | :type engine: str 66 | :param engine: The name of the engine. Default is `'lm'`, indicating a language model-based representation. 67 | 68 | :type device: str 69 | :param device: The device on which the model runs, either `'cuda'` for GPU or `'cpu'`. 
70 | 71 | :type model: object 72 | :param model: The pre-trained model used for generating representations. The model is loaded from a repository 73 | based on the `model` parameter. 74 | 75 | :type name: str 76 | :param name: The name of the model engine combined with the model type. 77 | 78 | :type dimension: int 79 | :param dimension: The dimensionality of the output representation, corresponding to the model's embedding size. 80 | 81 | :type model_name: str 82 | :param model_name: The specific model name used for generating representations. 83 | 84 | :type tokenizer: object 85 | :param tokenizer: The tokenizer associated with the model, used for converting sequences into tokenized input. 86 | 87 | :type lab: str 88 | :param lab: The laboratory or organization associated with the model (e.g., 'Rostlab', 'facebook', etc.). 89 | """ 90 | engine = 'lm' 91 | 92 | def __init__(self, model: str, average_pooling: Optional[bool] = True, 93 | cls_token: Optional[bool] = False, fp16: bool = True): 94 | """ 95 | Initializes the `RepEngineLM` with the specified model and pooling options. The model is loaded based on 96 | the given `model` name and its associated tokenizer. 97 | 98 | :type model: str 99 | :param model: The pre-trained model to use for generating representations (e.g., 'esm2_t48_15B_UR50D'). 100 | 101 | :type average_pooling: Optional[bool] 102 | :param average_pooling: If `True`, the embeddings are averaged across all tokens. Default is `True`. 103 | 104 | :type cls_token: Optional[bool] 105 | :param cls_token: If `True`, only the representation of the [CLS] token is used. Default is `False`. 106 | 107 | :rtype: None 108 | """ 109 | super().__init__(model, average_pooling=average_pooling, 110 | cls_token=cls_token) 111 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 112 | self.model = None 113 | self.name = f'{self.engine}-{model}' 114 | self.fp16 = fp16 115 | self._load_model(model) 116 | 117 | def move_to_device(self, device: str): 118 | """ 119 | Moves the model to the specified device (e.g., 'cuda' or 'cpu'). 120 | 121 | :type device: str 122 | :param device: The target device to move the model to. 123 | 124 | :rtype: None 125 | """ 126 | self.device = device 127 | self.model.to(self.device) 128 | 129 | def dim(self) -> int: 130 | """ 131 | Returns the dimensionality of the output representation generated by the model. 132 | 133 | :rtype: int 134 | :return: The dimensionality (embedding size) of the model's output. 135 | """ 136 | return self.dimension 137 | 138 | def max_len(self) -> int: 139 | """ 140 | Returns the maximum allowed sequence length for the model. Some models have a specific maximum sequence length. 141 | 142 | :rtype: int 143 | :return: The maximum sequence length for the model. 144 | """ 145 | if self.lab == 'facebook': 146 | return 1022 147 | elif self.lab.lower() == 'evolutionaryscale': 148 | return 2046 149 | else: 150 | return 2046 151 | 152 | def get_num_params(self) -> int: 153 | """ 154 | Returns the total number of parameters in the model. 155 | 156 | :rtype: int 157 | :return: The number of parameters in the model. 158 | """ 159 | return sum(p.numel() for p in self.model.parameters()) 160 | 161 | def _load_model(self, model: str): 162 | """ 163 | Loads the specified pre-trained model and tokenizer based on the provided model name. 164 | The model is selected from the available models in the `AVAILABLE_MODELS` dictionary. 165 | 166 | :type model: str 167 | :param model: The model name or synonym to load (e.g., 'esm2_t48_15B_UR50D'). 
168 | 169 | :raises NotImplementedError: If the specified model is not found in `AVAILABLE_MODELS` or `SYNONYMS`. 170 | :rtype: None 171 | """ 172 | if model not in AVAILABLE_MODELS and SYNONYMS[model.lower()] not in AVAILABLE_MODELS: 173 | raise NotImplementedError( 174 | f"Model: {model} not implemented.", 175 | f"Available models: {', '.join(AVAILABLE_MODELS)}" 176 | ) 177 | if model not in AVAILABLE_MODELS: 178 | model = SYNONYMS[model.lower()] 179 | if model.lower().startswith('pro'): 180 | self.lab = 'Rostlab' 181 | elif 'plusplus' in model.lower(): 182 | self.lab = 'Synthyra' 183 | elif 'esmc' in model.lower(): 184 | self.lab = 'EvolutionaryScale' 185 | elif 'esm' in model.lower(): 186 | self.lab = 'facebook' 187 | elif 'lobster' in model.lower(): 188 | self.lab = 'asalam91' 189 | elif 'ankh' in model.lower(): 190 | self.lab = 'ElnaggarLab' 191 | elif 'molformer' in model.lower(): 192 | self.lab = 'ibm' 193 | elif 'chemberta' in model.lower(): 194 | self.lab = 'DeepChem' 195 | elif 'clm' in model.lower(): 196 | self.lab = 'aaronfeller' 197 | if 't5' in model.lower(): 198 | self.tokenizer = T5Tokenizer.from_pretrained(f'Rostlab/{model}', 199 | do_lower_case=False) 200 | self.model = T5EncoderModel.from_pretrained(f"Rostlab/{model}") 201 | elif 'feller' in self.lab.lower(): 202 | import os 203 | import urllib 204 | import urllib.request as request 205 | try: 206 | from .utils.peptideclm_tokenizer import SMILES_SPE_Tokenizer 207 | except ImportError: 208 | raise ImportError("This function requires smilespe. Please install: `pip install smilespe`") 209 | if os.getenv('HF_HOME') is None: 210 | hf_home = os.path.abspath('~/.cache/huggingface/hub/') 211 | else: 212 | hf_home = os.path.abspath(os.getenv('HF_HOME')) 213 | path = os.path.join(hf_home, 'peptideclm_tokenizer') 214 | vocab = os.path.join(path, 'new_vocab.txt') 215 | splits = os.path.join(path, 'new_splits.txt') 216 | 217 | if not (os.path.exists(vocab) and os.path.exists(splits)): 218 | os.makedirs(path, exist_ok=True) 219 | try: 220 | url1 = 'https://raw.githubusercontent.com/AaronFeller/PeptideCLM/refs/heads/master/tokenizer/new_vocab.txt' 221 | url2 = 'https://raw.githubusercontent.com/AaronFeller/PeptideCLM/refs/heads/master/tokenizer/new_splits.txt' 222 | request.urlretrieve(url1, vocab) 223 | request.urlretrieve(url2, splits) 224 | except urllib.error.URLError: 225 | raise RuntimeError("Tokenizer could not be downloaded. Please try again later and if the problem persists,", 226 | "raise an issue in on the AutoPeptideML github so that the issue can be", 227 | "investigated: https://github.com/IBM/AutoPeptideML/issues") 228 | self.tokenizer = SMILES_SPE_Tokenizer(vocab_file=vocab, 229 | spe_file=splits) 230 | self.model = AutoModel.from_pretrained(f'{self.lab}/{model}', 231 | trust_remote_code=True) 232 | else: 233 | self.model = AutoModel.from_pretrained(f'{self.lab}/{model}', 234 | trust_remote_code=True) 235 | if 'plusplus' in model.lower(): 236 | self.tokenizer = self.model.tokenizer 237 | else: 238 | self.tokenizer = AutoTokenizer.from_pretrained( 239 | f'{self.lab}/{model}', trust_remote_code=True 240 | ) 241 | 242 | self.dimension = AVAILABLE_MODELS[model] 243 | self.model_name = model 244 | self.model.to(self.device) 245 | 246 | def _preprocess_batch(self, sequences: List[str]) -> List[List[str]]: 247 | """ 248 | Preprocesses a batch of input sequences by adjusting formatting, truncating, and applying special tokens 249 | based on the model type. 
250 | 251 | :type sequences: List[str] 252 | :param sequences: A list of input sequences (e.g., protein sequences in FASTA format). 253 | 254 | :rtype: List[List[str]] 255 | :return: A list of preprocessed sequences. 256 | """ 257 | if self.lab == 'Rostlab': 258 | sequences = [' '.join([char for char in seq]) for seq in sequences] 259 | if self.model_name == 'ProstT5': 260 | sequences = [" " + seq for seq in sequences] 261 | sequences = [seq[:self.max_len()] for seq in sequences] 262 | return sequences 263 | 264 | def _rep_batch( 265 | self, batch: List[str], 266 | ) -> List[np.ndarray]: 267 | """ 268 | Generates representations for a batch of sequences using the loaded pre-trained model. The representations 269 | are extracted from the model's output and returned based on the specified pooling strategy. 270 | 271 | :type batch: List[str] 272 | :param batch: A list of input sequences (e.g., protein sequences in FASTA format). 273 | 274 | :rtype: List[np.ndarray] 275 | :return: A list of numpy arrays representing the embeddings of each input sequence. 276 | """ 277 | inputs = self.tokenizer(batch, add_special_tokens=True, 278 | truncation=True, 279 | padding="longest", return_tensors="pt") 280 | inputs = inputs.to(self.device) 281 | mps_autocast = int(torch.__version__.split('.')[1]) >= 6 282 | autocast = self.fp16 and (self.device == 'cuda' or 283 | (self.device == 'mps' and mps_autocast) or 284 | self.device == 'cpu') 285 | if autocast: 286 | autocast = torch.autocast( 287 | device_type=self.device, 288 | dtype=torch.bfloat16 289 | ) 290 | else: 291 | autocast = nullcontext() 292 | 293 | with torch.no_grad(): 294 | with autocast: 295 | if self.lab == 'ElnaggarLab': 296 | embd_rpr = self.model( 297 | input_ids=inputs['input_ids'], 298 | attention_mask=inputs['attention_mask'], 299 | decoder_input_ids=inputs['input_ids'] 300 | ).last_hidden_state 301 | else: 302 | embd_rpr = self.model(**inputs).last_hidden_state 303 | output = [] 304 | for idx in range(len(batch)): 305 | if self.lab == 'facebook' or self.lab == 'EvolutionaryScale': 306 | initial = 1 307 | final = len(batch[idx]) + 1 308 | elif self.lab == 'RostLab': 309 | initial = 0 310 | final = len(batch[idx].replace(' ', '')) 311 | else: 312 | initial = 0 313 | final = len(batch[idx]) 314 | 315 | if self.average_pooling: 316 | output.append(embd_rpr[idx, initial:final].mean(0).float().detach().cpu().numpy()) 317 | elif self.cls_token: 318 | output.append(embd_rpr[idx, 0].float().detach().cpu().numpy()) 319 | else: 320 | output.append(embd_rpr[idx, initial:final].detach().cpu().numpy()) 321 | 322 | if autocast: 323 | output[-1] = output[-1].astype(np.float16) 324 | return output 325 | -------------------------------------------------------------------------------- /autopeptideml/reps/seq_based.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | import numpy as np 4 | 5 | from .engine import RepEngineBase 6 | 7 | 8 | RESIDUES = { 9 | 'V': 0, 'I': 1, 'L': 2, 'E': 3, 'Q': 4, 10 | 'D': 5, 'N': 6, 'H': 7, 'W': 8, 'F': 9, 11 | 'Y': 10, 'R': 11, 'K': 12, 'S': 13, 'T': 14, 12 | 'M': 15, 'A': 16, 'G': 17, 'P': 18, 'C': 19, 13 | 'X': 20 14 | } 15 | 16 | 17 | class RepEngineOnehot(RepEngineBase): 18 | """ 19 | Class `RepEngineOnehot` is a subclass of `RepEngineBase` that generates one-hot encoded representations 20 | for input sequences. 
This representation is commonly used for tasks in machine learning and bioinformatics, 21 | such as protein sequence classification, where each amino acid in the sequence is represented by a binary vector. 22 | 23 | Attributes: 24 | :type engine: str 25 | :param engine: The name of the engine. Default is `'one-hot'`, indicating one-hot encoding representation. 26 | 27 | :type max_length: int 28 | :param max_length: The maximum length of the input sequences. Sequences longer than this length will be truncated. 29 | 30 | :type name: str 31 | :param name: The name of the representation engine, which is set to `'one-hot'`. 32 | """ 33 | engine = 'one-hot' 34 | 35 | def __init__(self, max_length: int): 36 | """ 37 | Initializes the `RepEngineOnehot` with the specified maximum sequence length. The one-hot encoding will 38 | use this length to determine the size of the output vectors. 39 | 40 | :type max_length: int 41 | :param max_length: The maximum length of the input sequences. Sequences longer than this will be truncated. 42 | 43 | :rtype: None 44 | """ 45 | super().__init__('one-hot', max_length=max_length) 46 | self.max_length = max_length 47 | self.name = f'{self.engine}' 48 | 49 | def _preprocess_batch(self, batch: List[str]): 50 | """ 51 | Preprocesses a batch of input sequences by truncating them to the specified maximum length. 52 | 53 | :type batch: List[str] 54 | :param batch: A list of input sequences (e.g., protein sequences in FASTA format). 55 | 56 | :rtype: List[str] 57 | :return: A list of preprocessed sequences truncated to the maximum length. 58 | """ 59 | return [s[:self.max_length] for s in batch] 60 | 61 | def _rep_batch(self, batch: List[str]) -> np.ndarray: 62 | """ 63 | Converts a batch of input sequences into one-hot encoded representations. Each amino acid in the sequence 64 | is represented by a binary vector where the position corresponding to the amino acid is set to 1, and 65 | all other positions are set to 0. 66 | 67 | :type batch: List[str] 68 | :param batch: A list of input sequences (e.g., protein sequences in FASTA format). 69 | 70 | :rtype: np.ndarray 71 | :return: A 2D numpy array where each row corresponds to a one-hot encoded representation of a sequence. 72 | """ 73 | out = np.zeros((len(batch), self.max_length * len(RESIDUES)), 74 | dtype=np.int8) 75 | for idx, s in enumerate(batch): 76 | for idx2, c in enumerate(s): 77 | out[idx, idx2 * len(RESIDUES) + RESIDUES[c]] = 1 78 | return out 79 | 80 | def dim(self) -> int: 81 | """ 82 | Returns the dimensionality of the one-hot encoded representation, which is the product of the 83 | maximum sequence length and the number of possible amino acids. 84 | 85 | :rtype: int 86 | :return: The dimensionality of the one-hot encoded representation. 
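        For example, with `max_length=50` and the 21 symbols in `RESIDUES`,
        `dim()` returns `21 * 50 = 1050`.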
87 | """ 88 | return int(len(RESIDUES) * self.max_length) 89 | -------------------------------------------------------------------------------- /autopeptideml/reps/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/AutoPeptideML/adb18578f145d1c3a78d6860ad5f7d35c726159a/autopeptideml/reps/utils/__init__.py -------------------------------------------------------------------------------- /autopeptideml/reps/utils/peptideclm_tokenizer.py: -------------------------------------------------------------------------------- 1 | """Code adapted from PeptideCLM 2 | https://github.com/AaronFeller/PeptideCLM 3 | Github repository under MIT license 4 | 5 | Copyright (c) 2024 Aaron Feller 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | """ 25 | import collections 26 | import os 27 | import re 28 | from typing import List, Optional 29 | from transformers import PreTrainedTokenizer 30 | from SmilesPE.tokenizer import SPE_Tokenizer 31 | 32 | 33 | def load_vocab(vocab_file): 34 | """Loads a vocabulary file into a dictionary.""" 35 | vocab = collections.OrderedDict() 36 | with open(vocab_file, "r", encoding="utf-8") as reader: 37 | tokens = reader.readlines() 38 | for index, token in enumerate(tokens): 39 | token = token.rstrip("\n") 40 | vocab[token] = index 41 | return vocab 42 | 43 | 44 | class Atomwise_Tokenizer(object): 45 | """Run atom-level SMILES tokenization""" 46 | 47 | def __init__(self): 48 | """ Constructs a atom-level Tokenizer. 49 | """ 50 | # self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" 51 | self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" 52 | 53 | self.regex = re.compile(self.regex_pattern) 54 | 55 | def tokenize(self, text): 56 | """ Basic Tokenization of a SMILES. 57 | """ 58 | tokens = [token for token in self.regex.findall(text)] 59 | return tokens 60 | 61 | 62 | class SMILES_SPE_Tokenizer(PreTrainedTokenizer): 63 | r""" 64 | Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE). 65 | This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users 66 | should refer to the superclass for more information regarding methods. 
67 | Args: 68 | vocab_file (:obj:`string`): 69 | File containing the vocabulary. 70 | spe_file (:obj:`string`): 71 | File containing the trained SMILES Pair Encoding vocabulary. 72 | unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): 73 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 74 | token instead. 75 | sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): 76 | The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences 77 | for sequence classification or for a text and a question for question answering. 78 | It is also used as the last token of a sequence built with special tokens. 79 | pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): 80 | The token used for padding, for example when batching sequences of different lengths. 81 | cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): 82 | The classifier token which is used when doing sequence classification (classification of the whole 83 | sequence instead of per-token classification). It is the first token of the sequence when built with 84 | special tokens. 85 | mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): 86 | The token used for masking values. This is the token used when training this model with masked language 87 | modeling. This is the token which the model will try to predict. 88 | """ 89 | 90 | def __init__(self, vocab_file, spe_file, 91 | unk_token="[UNK]", 92 | sep_token="[SEP]", 93 | pad_token="[PAD]", 94 | cls_token="[CLS]", 95 | mask_token="[MASK]", 96 | **kwargs): 97 | if not os.path.isfile(vocab_file): 98 | raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file)) 99 | if not os.path.isfile(spe_file): 100 | raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file)) 101 | 102 | self.vocab = load_vocab(vocab_file) 103 | self.spe_vocab = open(spe_file, 'r', encoding='utf-8') 104 | self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) 105 | self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab) 106 | 107 | super().__init__( 108 | unk_token=unk_token, 109 | sep_token=sep_token, 110 | pad_token=pad_token, 111 | cls_token=cls_token, 112 | mask_token=mask_token, 113 | **kwargs) 114 | 115 | @property 116 | def vocab_size(self): 117 | return len(self.vocab) 118 | 119 | def get_vocab(self): 120 | return dict(self.vocab, **self.added_tokens_encoder) 121 | 122 | def _tokenize(self, text): 123 | return self.spe_tokenizer.tokenize(text).split(' ') 124 | 125 | def _convert_token_to_id(self, token): 126 | """ Converts a token (str) in an id using the vocab. """ 127 | return self.vocab.get(token, self.vocab.get(self.unk_token)) 128 | 129 | def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): 130 | text = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) 131 | return self.convert_tokens_to_string(text) 132 | 133 | def _convert_id_to_token(self, index): 134 | """Converts an index (integer) in a token (str) using the vocab.""" 135 | return self.ids_to_tokens.get(index, self.unk_token) 136 | 137 | def convert_tokens_to_string(self, tokens): 138 | """ Converts a sequence of tokens (string) in a single string. 
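        For example (illustration added for clarity), ``['C', 'C', '(=O)']`` becomes ``'C C (=O)'``; any ``' ##'`` continuation markers are stripped in the process.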
""" 139 | out_string = " ".join(tokens).replace(" ##", "").strip() 140 | return out_string 141 | 142 | def build_inputs_with_special_tokens( 143 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 144 | ) -> List[int]: 145 | """ 146 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 147 | by concatenating and adding special tokens. 148 | A BERT sequence has the following format: 149 | - single sequence: ``[CLS] X [SEP]`` 150 | - pair of sequences: ``[CLS] A [SEP] B [SEP]`` 151 | Args: 152 | token_ids_0 (:obj:`List[int]`): 153 | List of IDs to which the special tokens will be added 154 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 155 | Optional second list of IDs for sequence pairs. 156 | Returns: 157 | :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 158 | """ 159 | if token_ids_1 is None: 160 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 161 | cls = [self.cls_token_id] 162 | sep = [self.sep_token_id] 163 | return cls + token_ids_0 + sep + token_ids_1 + sep 164 | 165 | def get_special_tokens_mask( 166 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False 167 | ) -> List[int]: 168 | """ 169 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 170 | special tokens using the tokenizer ``prepare_for_model`` method. 171 | Args: 172 | token_ids_0 (:obj:`List[int]`): 173 | List of ids. 174 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 175 | Optional second list of IDs for sequence pairs. 176 | already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): 177 | Set to True if the token list is already formatted with special tokens for the model 178 | Returns: 179 | :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 180 | """ 181 | 182 | if already_has_special_tokens: 183 | if token_ids_1 is not None: 184 | raise ValueError( 185 | "You should not supply a second sequence if the provided sequence of " 186 | "ids is already formated with special tokens for the model." 187 | ) 188 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 189 | 190 | if token_ids_1 is not None: 191 | return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] 192 | return [1] + ([0] * len(token_ids_0)) + [1] 193 | 194 | def create_token_type_ids_from_sequences( 195 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 196 | ) -> List[int]: 197 | """ 198 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 199 | A BERT sequence pair mask has the following format: 200 | :: 201 | 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 202 | | first sequence | second sequence | 203 | if token_ids_1 is None, only returns the first portion of the mask (0's). 204 | Args: 205 | token_ids_0 (:obj:`List[int]`): 206 | List of ids. 207 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 208 | Optional second list of IDs for sequence pairs. 209 | Returns: 210 | :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given 211 | sequence(s). 
212 | """ 213 | sep = [self.sep_token_id] 214 | cls = [self.cls_token_id] 215 | if token_ids_1 is None: 216 | return len(cls + token_ids_0 + sep) * [0] 217 | return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] 218 | 219 | def save_vocabulary(self, vocab_path): 220 | """ 221 | Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. 222 | Args: 223 | vocab_path (:obj:`str`): 224 | The directory in which to save the vocabulary. 225 | Returns: 226 | :obj:`Tuple(str)`: Paths to the files saved. 227 | """ 228 | index = 0 229 | 230 | vocab_file = vocab_path 231 | with open(vocab_file, "w", encoding="utf-8") as writer: 232 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 233 | if index != token_index: 234 | index = token_index 235 | writer.write(token + "\n") 236 | index += 1 237 | return (vocab_file,) 238 | 239 | class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer): 240 | r""" 241 | Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE). 242 | This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users 243 | should refer to the superclass for more information regarding methods. 244 | Args: 245 | vocab_file (:obj:`string`): 246 | File containing the vocabulary. 247 | unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): 248 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 249 | token instead. 250 | sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): 251 | The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences 252 | for sequence classification or for a text and a question for question answering. 253 | It is also used as the last token of a sequence built with special tokens. 254 | pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): 255 | The token used for padding, for example when batching sequences of different lengths. 256 | cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): 257 | The classifier token which is used when doing sequence classification (classification of the whole 258 | sequence instead of per-token classification). It is the first token of the sequence when built with 259 | special tokens. 260 | mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): 261 | The token used for masking values. This is the token used when training this model with masked language 262 | modeling. This is the token which the model will try to predict. 
263 | """ 264 | 265 | def __init__( 266 | self, 267 | vocab_file, 268 | unk_token="[UNK]", 269 | sep_token="[SEP]", 270 | pad_token="[PAD]", 271 | cls_token="[CLS]", 272 | mask_token="[MASK]", 273 | **kwargs 274 | ): 275 | super().__init__( 276 | unk_token=unk_token, 277 | sep_token=sep_token, 278 | pad_token=pad_token, 279 | cls_token=cls_token, 280 | mask_token=mask_token, 281 | **kwargs, 282 | ) 283 | 284 | if not os.path.isfile(vocab_file): 285 | raise ValueError( 286 | "Can't find a vocabulary file at path '{}'.".format(vocab_file) 287 | ) 288 | self.vocab = load_vocab(vocab_file) 289 | self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) 290 | self.tokenizer = Atomwise_Tokenizer() 291 | 292 | @property 293 | def vocab_size(self): 294 | return len(self.vocab) 295 | 296 | def get_vocab(self): 297 | return dict(self.vocab, **self.added_tokens_encoder) 298 | 299 | def _tokenize(self, text): 300 | return self.tokenizer.tokenize(text) 301 | 302 | def _convert_token_to_id(self, token): 303 | """ Converts a token (str) in an id using the vocab. """ 304 | return self.vocab.get(token, self.vocab.get(self.unk_token)) 305 | 306 | def _convert_id_to_token(self, index): 307 | """Converts an index (integer) in a token (str) using the vocab.""" 308 | return self.ids_to_tokens.get(index, self.unk_token) 309 | 310 | def convert_tokens_to_string(self, tokens): 311 | """ Converts a sequence of tokens (string) in a single string. """ 312 | out_string = " ".join(tokens).replace(" ##", "").strip() 313 | return out_string 314 | 315 | def build_inputs_with_special_tokens( 316 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 317 | ) -> List[int]: 318 | """ 319 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 320 | by concatenating and adding special tokens. 321 | A BERT sequence has the following format: 322 | - single sequence: ``[CLS] X [SEP]`` 323 | - pair of sequences: ``[CLS] A [SEP] B [SEP]`` 324 | Args: 325 | token_ids_0 (:obj:`List[int]`): 326 | List of IDs to which the special tokens will be added 327 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 328 | Optional second list of IDs for sequence pairs. 329 | Returns: 330 | :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 331 | """ 332 | if token_ids_1 is None: 333 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 334 | cls = [self.cls_token_id] 335 | sep = [self.sep_token_id] 336 | return cls + token_ids_0 + sep + token_ids_1 + sep 337 | 338 | def get_special_tokens_mask( 339 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False 340 | ) -> List[int]: 341 | """ 342 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 343 | special tokens using the tokenizer ``prepare_for_model`` method. 344 | Args: 345 | token_ids_0 (:obj:`List[int]`): 346 | List of ids. 347 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 348 | Optional second list of IDs for sequence pairs. 349 | already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): 350 | Set to True if the token list is already formatted with special tokens for the model 351 | Returns: 352 | :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
353 | """ 354 | 355 | if already_has_special_tokens: 356 | if token_ids_1 is not None: 357 | raise ValueError( 358 | "You should not supply a second sequence if the provided sequence of " 359 | "ids is already formated with special tokens for the model." 360 | ) 361 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 362 | 363 | if token_ids_1 is not None: 364 | return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] 365 | return [1] + ([0] * len(token_ids_0)) + [1] 366 | 367 | def create_token_type_ids_from_sequences( 368 | self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None 369 | ) -> List[int]: 370 | """ 371 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 372 | A BERT sequence pair mask has the following format: 373 | :: 374 | 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 375 | | first sequence | second sequence | 376 | if token_ids_1 is None, only returns the first portion of the mask (0's). 377 | Args: 378 | token_ids_0 (:obj:`List[int]`): 379 | List of ids. 380 | token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): 381 | Optional second list of IDs for sequence pairs. 382 | Returns: 383 | :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given 384 | sequence(s). 385 | """ 386 | sep = [self.sep_token_id] 387 | cls = [self.cls_token_id] 388 | if token_ids_1 is None: 389 | return len(cls + token_ids_0 + sep) * [0] 390 | return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] 391 | 392 | def save_vocabulary(self, vocab_path): 393 | """ 394 | Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. 395 | Args: 396 | vocab_path (:obj:`str`): 397 | The directory in which to save the vocabulary. 398 | Returns: 399 | :obj:`Tuple(str)`: Paths to the files saved. 
400 | """ 401 | index = 0 402 | vocab_file = vocab_path 403 | with open(vocab_file, "w", encoding="utf-8") as writer: 404 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 405 | if index != token_index: 406 | index = token_index 407 | writer.write(token + "\n") 408 | index += 1 409 | return (vocab_file,) 410 | -------------------------------------------------------------------------------- /autopeptideml/train/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer import BaseTrainer, OptunaTrainer, GridTrainer, NoHpoTrainer 2 | -------------------------------------------------------------------------------- /autopeptideml/train/architectures.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | 4 | SKLEARN_MODELS = ['knn', 'svm', 'rf', 'adaboost', 'gradboost'] 5 | ALL_MODELS = SKLEARN_MODELS + ['lightgbm', 'xgboost'] 6 | 7 | 8 | def load_sklearn_models(task: str) -> Dict[str, Callable]: 9 | try: 10 | import sklearn as sk 11 | except ImportError: 12 | raise ImportError("This function requires scikit-learn", 13 | "Please try: `pip install scikit-learn`") 14 | 15 | from sklearn import (svm, ensemble, neighbors) 16 | if 'class' in task: 17 | arch = { 18 | 'knn': neighbors.KNeighborsClassifier, 19 | 'svm': svm.SVC, 20 | 'rf': ensemble.RandomForestClassifier, 21 | 'adaboost': ensemble.AdaBoostClassifier, 22 | 'gradboost': ensemble.GradientBoostingClassifier, 23 | 24 | } 25 | elif 'reg' in task: 26 | arch = { 27 | 'knn': neighbors.KNeighborsRegressor, 28 | 'svm': svm.SVR, 29 | 'rf': ensemble.RandomForestRegressor, 30 | 'adaboost': ensemble.AdaBoostRegressor, 31 | 'gradboost': ensemble.GradientBoostingRegressor 32 | } 33 | else: 34 | raise NotImplementedError( 35 | f"Task type: {task} not implemented." 36 | ) 37 | return arch 38 | 39 | 40 | def load_lightgbm(task: str) -> Dict[str, Callable]: 41 | try: 42 | import lightgbm 43 | except ImportError: 44 | raise ImportError("This function requires lightgbm", 45 | "Please try: `pip install lightgbm`") 46 | if 'class' in task: 47 | arch = {'lightgbm': lightgbm.LGBMClassifier} 48 | elif 'reg' in task: 49 | arch = {'lightgbm': lightgbm.LGBMRegressor} 50 | else: 51 | raise NotImplementedError( 52 | f"Task type: {task} not implemented." 53 | ) 54 | return arch 55 | 56 | 57 | def load_xgboost(task: str) -> Dict[str, Callable]: 58 | try: 59 | import xgboost 60 | except ImportError: 61 | raise ImportError("This function requires lightgbm", 62 | "Please try: `pip install lightgbm`") 63 | if 'class' in task: 64 | arch = {'xgboost': xgboost.XGBClassifier} 65 | elif 'reg' in task: 66 | arch = {'xgboost': xgboost.XGBRegressor} 67 | else: 68 | raise NotImplementedError( 69 | f"Task type: {task} not implemented." 
70 | ) 71 | return arch 72 | 73 | 74 | def load_torch(task: str) -> Dict[str, Callable]: 75 | try: 76 | from .deep_learning import Cnn 77 | except ImportError: 78 | raise ImportError("This function requires torch", 79 | "Please try: `pip install torch`") 80 | 81 | return {"cnn": Cnn} 82 | -------------------------------------------------------------------------------- /autopeptideml/train/deep_learning/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Cnn -------------------------------------------------------------------------------- /autopeptideml/train/deep_learning/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code Adapted from the UniDL4BioPep 3 | implementation of their model for PyTorch 4 | in the GitHub Repository: 5 | https://github.com/David-Dingle/UniDL4BioPep_ASL_PyTorch/ 6 | """ 7 | 8 | import torch 9 | from torch.utils.data import Dataset 10 | 11 | 12 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 13 | 14 | 15 | class UniDL4BioPep_Dataset(Dataset): 16 | def __init__(self, x, y): 17 | super().__init__() 18 | self.data = torch.from_numpy(x).float().to(device) 19 | self.labels = torch.from_numpy(y).float().to(device) 20 | 21 | def __len__(self): 22 | return len(self.labels) 23 | 24 | def __getitem__(self, idx): 25 | return self.data[idx], self.labels[idx] 26 | 27 | def get_labels(self): 28 | return self.labels 29 | 30 | def get_data(self): 31 | return self.data 32 | 33 | 34 | class UniDL4BioPep_Inference(Dataset): 35 | def __init__(self, x): 36 | super().__init__() 37 | self.data = torch.from_numpy(x).float().to(device) 38 | 39 | def __len__(self): 40 | return self.data.shape[0] 41 | 42 | def __getitem__(self, index): 43 | return self.data[index] 44 | -------------------------------------------------------------------------------- /autopeptideml/train/deep_learning/loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code Adapted from the UniDL4BioPep 3 | implementation of their model for PyTorch 4 | in the GitHub Repository: 5 | https://github.com/David-Dingle/UniDL4BioPep_ASL_PyTorch/ 6 | """ 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | 12 | class ASLSingleLabel(nn.Module): 13 | ''' 14 | This loss is intended for single-label classification problems 15 | ''' 16 | def __init__(self, gamma_pos=0, gamma_neg=4, eps: float = 0.1, reduction='mean'): 17 | super(ASLSingleLabel, self).__init__() 18 | 19 | self.eps = eps 20 | self.logsoftmax = nn.LogSoftmax(dim=-1) 21 | self.targets_classes = [] 22 | self.gamma_pos = gamma_pos 23 | self.gamma_neg = gamma_neg 24 | self.reduction = reduction 25 | 26 | def forward(self, inputs, target): 27 | ''' 28 | "input" dimensions: - (batch_size,number_classes) 29 | "target" dimensions: - (batch_size) 30 | ''' 31 | num_classes = inputs.size()[-1] 32 | log_preds = self.logsoftmax(inputs) 33 | self.targets_classes = torch.zeros_like(inputs).scatter_(1, target.long().unsqueeze(1), 1) 34 | 35 | # ASL weights 36 | targets = self.targets_classes 37 | anti_targets = 1 - targets 38 | xs_pos = torch.exp(log_preds) 39 | xs_neg = 1 - xs_pos 40 | xs_pos = xs_pos * targets 41 | xs_neg = xs_neg * anti_targets 42 | asymmetric_w = torch.pow(1 - xs_pos - xs_neg, 43 | self.gamma_pos * targets + self.gamma_neg * anti_targets) 44 | log_preds = log_preds * asymmetric_w 45 | 46 | if self.eps > 0: # label smoothing 47 | self.targets_classes = self.targets_classes.mul(1 - 
self.eps).add(self.eps / num_classes)

        # loss calculation
        loss = - self.targets_classes.mul(log_preds)

        loss = loss.sum(dim=-1)
        if self.reduction == 'mean':
            loss = loss.mean()

        return loss
-------------------------------------------------------------------------------- /autopeptideml/train/deep_learning/model.py: --------------------------------------------------------------------------------
"""
Code Adapted from the UniDL4BioPep
implementation of their model for PyTorch
in the GitHub Repository:
https://github.com/David-Dingle/UniDL4BioPep_ASL_PyTorch/
"""

import copy
import os

import numpy as np
import sklearn.metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from ..metrics import evaluate
from .dataset import UniDL4BioPep_Dataset, UniDL4BioPep_Inference
from .loss import ASLSingleLabel

device = 'cuda' if torch.cuda.is_available() else 'cpu'


class BaseModel(nn.Module):
    def get_params(self):
        return self.params

    def predict_proba(
        self,
        x: np.array
    ):
        self.eval()
        x_dataloader = self._prepare_data(x, y=None)
        outputs = []
        for batch in x_dataloader:
            batch = batch.to(self.device_name)
            output = self(batch).cpu().detach().numpy()
            outputs.append(output)

        return np.concatenate(outputs)

    def predict(
        self,
        x: np.array,
        device: str
    ):
        self.to('cpu')
        self.load_state_dict(self.best_model)
        self.to(device)
        self.device_name = device
        if 'reg' not in self.task:
            return (self.predict_proba(x)[:, 1] > 0.5).astype(int)
        else:
            return self.predict_proba(x)

    def evaluate(self, x, y):
        self.eval()
        # The inference dataloader uses a single full-size batch (see _prepare_data),
        # so the report computed on the first batch covers all of x.
        x_dataloader = self._prepare_data(x, y=None)
        for batch in x_dataloader:
            output = self(batch).cpu().detach()
            report = self._scores(output, torch.Tensor(y))
            return report

    def fit(
        self,
        train_x: np.array,
        train_y: np.array,
        valid_x: np.array,
        valid_y: np.array,
        device: str
    ):
        if not os.path.exists(self.logger):
            os.makedirs(self.logger)
        logger_training = os.path.join(self.logger, 'train.log')
        logger_validation = os.path.join(self.logger, 'valid.log')
        logger_checkpoint = os.path.join(self.logger, 'best_chckpt.pt')

        train_set = self._prepare_data(train_x, train_y)
        valid_set = self._prepare_data(valid_x, valid_y)

        self = self.to(device)
        self.device_name = device
        min_valid_loss = float("inf")
        for epoch in range(self.epochs):
            running_loss = 0.0
            train_acc = []
            valid_loss = 0.0
            self.train()
            counter = 0
            for i, (inputs, labels) in enumerate(train_set):
                self.optimizer.zero_grad()
                inputs = inputs.to(device)
                labels = labels.to(device)
                if 'class' in self.task:
                    new_labels = torch.zeros((labels.shape[0], 2))
                    new_labels[labels == 0, 0] = 1
                    new_labels[labels == 1, 1] = 1
                    labels = new_labels.to(device)
                outputs = self(inputs)
                if 'multi' in self.task:
                    # `Tensor.unravel` does not exist; `ravel` flattens the tensors for the loss
                    loss = self.loss(outputs.ravel(), labels.ravel())
                else:
                    loss = self.loss(outputs.float(), labels.float())

                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
                if 'reg' in self.task:
                    train_acc.append(self._scores(outputs.to("cpu"), labels.to("cpu"))["mse"])
                else:
                    train_acc.append(self._scores(outputs.to("cpu"), labels.to("cpu"))["f1_weighted"])

            self.eval()
            acc = 0
            for j, (valid_inputs, valid_labels) in enumerate(valid_set):
                valid_labels = valid_labels.to(device)
                valid_inputs = valid_inputs.to(device)
                if 'class' in self.task:
                    # Mirror the one-hot encoding applied to the training labels
                    new_labels = torch.zeros((valid_labels.shape[0], 2))
                    new_labels[valid_labels == 0, 0] = 1
                    new_labels[valid_labels == 1, 1] = 1
                    valid_labels = new_labels.to(device)
                with torch.no_grad():
                    valid_outputs = self(valid_inputs)
                # Use the validation tensors here, not the last training batch
                if 'multi' in self.task:
                    valid_loss = self.loss(valid_outputs.ravel(), valid_labels.ravel())
                else:
                    valid_loss = self.loss(valid_outputs.float(), valid_labels.float())

                if 'reg' in self.task:
                    acc = self._scores(valid_outputs.to('cpu'), valid_labels.to('cpu'))["mse"]
                else:
                    acc = self._scores(valid_outputs.to('cpu'), valid_labels.to('cpu'))["f1_weighted"]

            if valid_loss < min_valid_loss:
                min_valid_loss = valid_loss
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                }, logger_checkpoint)
                self.cpu()
                self.best_model = copy.deepcopy(self.state_dict())
                self.to(device)

    def _get_confusion_matrix(self, y_pred: torch.Tensor, y_test: torch.Tensor):
        predictions = torch.argmax(y_pred, dim=-1).numpy()
        labels = torch.argmax(y_test, dim=-1).numpy()  # A:0, B:1, C:2, [D:3]
        confusion_matrix = sklearn.metrics.confusion_matrix(labels, predictions)
        return confusion_matrix

    def _scores(self, y_pred: torch.Tensor, y_test: torch.Tensor):
        y_pred = y_pred.detach()
        if 'reg' in self.task:
            # For regression, score the raw outputs instead of an argmax over them
            predictions = y_pred.squeeze(-1).numpy()
        else:
            predictions = torch.argmax(y_pred, dim=-1).numpy()
        labels = y_test.numpy()
        if labels.ndim > 1:
            # Collapse one-hot encoded labels back to class indices
            labels = labels.argmax(-1)
        task = 'reg' if 'reg' in self.task else 'class'
        return evaluate(predictions, labels, task)

    def _prepare_data(self, x, y, shuffle: bool = False):
        if y is None:
            dataset = UniDL4BioPep_Inference(x)
        else:
            dataset = UniDL4BioPep_Dataset(x, y)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=x.shape[0] if y is None else 64, shuffle=shuffle)
        return dataloader

    def _get_optimizer(self, optim_algorithm: str = 'adam', lr: float = 0.0001, weight_decay: float = 0):
        OPTIMIZERS = {
            'adam': torch.optim.Adam
        }
        return OPTIMIZERS[optim_algorithm](self.parameters(), lr=lr, weight_decay=weight_decay)

    def _get_criteria(self, **kwargs):
        return ASLSingleLabel(**kwargs)


class Cnn(BaseModel):
    """
    CNN model
    """
    def __init__(
        self,
        optimizer: dict,
        logger: str,
        labels: int,
        task: str,
        epochs: int = 200,
    ):
        super().__init__()
        self.output_dim = labels
        self.input_dim = 320
        self.dropout = 0.3
        self.stride = 2
        self.kernel_1 = 3
        self.channel_1 = 32

        self.conv_1 = nn.Conv1d(kernel_size=self.kernel_1,
                                out_channels=self.channel_1,
                                in_channels=1, stride=1)
        self.normalizer_1 = nn.BatchNorm1d(self.channel_1)
        self.pooling_1 = nn.MaxPool1d(kernel_size=self.kernel_1,
                                      stride=self.stride)

        self.dropout = nn.Dropout(p=self.dropout)
        self.fc1 = nn.LazyLinear(128)
        self.normalizer_2 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, self.output_dim)
        self.device_name = 'cpu'
        self.epochs = epochs
        self.optimizer = self._get_optimizer(**optimizer)
        # self.criteria = self._get_criteria(**criteria)
        self.logger = logger
        if 'multi' in task:
            self.loss = nn.BCELoss()
        elif 'class' in task:
            self.loss = nn.CrossEntropyLoss()
else: 213 | self.loss = nn.MSELoss() 214 | self.task = task 215 | self.params = { 216 | 'epochs': self.epochs, 217 | 'optimizer': optimizer, 218 | } 219 | 220 | def forward(self, x): 221 | x = torch.unsqueeze(x, dim=1) # (batch, embedding_dim) -> (batch, 1, embedding_dim) 222 | x = self.conv_1(x) 223 | if x.shape[0] > 1: 224 | x = self.normalizer_1(x) 225 | c_1 = self.pooling_1(F.relu(x)) 226 | 227 | c_2 = torch.flatten(c_1, start_dim=1) 228 | c_2 = self.dropout(c_2) 229 | c_2 = self.fc1(c_2) 230 | if x.shape[0] > 1: 231 | c_2 = self.normalizer_2(c_2) 232 | out = F.relu(c_2) 233 | out = self.fc2(out) 234 | if 'class' in self.task or 'multi' in self.task: 235 | out = torch.softmax(out, dim=-1) 236 | return out 237 | 238 | 239 | class MLP(BaseModel, nn.Module): 240 | def __init__( 241 | self, 242 | optimizer: dict, 243 | logger: str, 244 | labels: int, 245 | task: str, 246 | epochs: int = 200, 247 | ): 248 | super().__init__() 249 | self.output_dim = labels 250 | self.input_dim = 320 251 | self.dropout = 0.3 252 | self.stride = 2 253 | self.kernel_1 = 3 254 | self.channel_1 = 32 255 | 256 | self.mlp = nn.Sequential( 257 | nn.LazyLinear(self.input_dim), 258 | nn.LeakyReLU(), 259 | nn.Linear(self.input_dim, self.input_dim), 260 | nn.LeakyReLU(), 261 | nn.Linear(self.input_dim, self.output_dim) 262 | ) 263 | self.device_name = 'cpu' 264 | self.epochs = epochs 265 | self.optimizer = self._get_optimizer(**optimizer) 266 | # self.criteria = self._get_criteria(**criteria) 267 | self.logger = logger 268 | if 'multi' in task: 269 | self.loss = nn.BCELoss() 270 | elif 'class' in task: 271 | self.loss = nn.CrossEntropyLoss() 272 | else: 273 | self.loss = nn.MSELoss() 274 | self.task = task 275 | self.params = { 276 | 'epochs': self.epochs, 277 | 'optimizer': optimizer, 278 | } 279 | 280 | def forward(self, x): 281 | # x = torch.unsqueeze(x, dim=1) # (batch, embedding_dim) -> (batch, 1, embedding_dim) 282 | out = self.mlp(x) 283 | if 'class' in self.task or 'multi' in self.task: 284 | out = torch.softmax(out, dim=-1) 285 | # else: 286 | # out = out.squeeze(1) 287 | return out 288 | -------------------------------------------------------------------------------- /autopeptideml/train/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | import numpy as np 3 | from scipy.stats import pearsonr, spearmanr 4 | from sklearn.metrics import (matthews_corrcoef, 5 | accuracy_score, f1_score, 6 | precision_score, recall_score, mean_squared_error, 7 | mean_absolute_error, roc_auc_score) 8 | 9 | 10 | def _pcc(preds, truths): 11 | return pearsonr(preds, truths)[0] 12 | 13 | 14 | def _spcc(preds, truths): 15 | return spearmanr(preds, truths)[0] 16 | 17 | 18 | def _f1_weighted(preds, truths): 19 | return f1_score(preds, truths, average='weighted') 20 | 21 | 22 | def _recall(preds, truths): 23 | return recall_score(preds, truths, zero_division=True) 24 | 25 | 26 | CLASSIFICATION_METRICS = { 27 | 'mcc': matthews_corrcoef, 28 | 'acc': accuracy_score, 29 | 'f1': f1_score, 30 | 'f1_weighted': _f1_weighted, 31 | 'precision': precision_score, 32 | 'recall': _recall, 33 | 'auroc': roc_auc_score 34 | } 35 | 36 | REGRESSION_METRICS = { 37 | 'mse': mean_squared_error, 38 | 'mae': mean_absolute_error, 39 | 'pcc': _pcc, 40 | 'spcc': _spcc 41 | } 42 | 43 | 44 | def evaluate(preds, truth, pred_task) -> Dict[str, float]: 45 | result = {} 46 | if pred_task == 'reg': 47 | metrics = REGRESSION_METRICS 48 | else: 49 | preds = preds > 0.5 50 | metrics = 
CLASSIFICATION_METRICS

    for key, value in metrics.items():
        try:
            # sklearn metric signatures are (y_true, y_pred)
            result[key] = value(truth, preds)
        except ValueError:
            result[key] = np.nan
    return result
-------------------------------------------------------------------------------- /docs/autopeptideml.md: --------------------------------------------------------------------------------
# Class AutoPeptideML

## Overview

`AutoPeptideML` is a configurable machine learning workflow class designed for peptide modeling. It integrates data pipelines, representations, model training (with HPO), evaluation, and export.

---

## Class: `AutoPeptideML`

### Constructor

```python
AutoPeptideML(config: dict)
```

* Initializes the AutoPeptideML workflow with a provided configuration dictionary.
* Creates output directories and stores pipeline, representation, training, and database settings.

---

### Public Methods

#### `get_pipeline`

```python
get_pipeline(pipe_config: Optional[dict] = None) -> Pipeline
```

Load or construct the preprocessing pipeline.

#### `get_database`

```python
get_database(db_config: Optional[dict] = None) -> Database
```

Create or load the peptide database with optional negative data support.

#### `get_reps`

```python
get_reps(rep_config: Optional[dict] = None) -> Tuple[Dict[str, RepEngineBase], Dict[str, np.ndarray]]
```

Load or compute representations for the data.

#### `get_test`

```python
get_test(test_config: Optional[Dict] = None) -> HestiaGenerator
```

Partition the dataset into training/validation/test using `HestiaGenerator`.

#### `get_train`

```python
get_train(train_config: Optional[Dict] = None) -> BaseTrainer
```

Load and return the trainer based on the configuration (supports Optuna and Grid).

#### `run_hpo`

```python
run_hpo() -> Dict
```

Perform hyperparameter optimization across dataset partitions.

#### `run_evaluation`

```python
run_evaluation(models) -> pd.DataFrame
```

Run evaluation on the trained models and return a DataFrame of results.

#### `save_experiment`

```python
save_experiment(model_backend: str = 'onnx', save_reps: bool = False, save_test: bool = True, save_all_models: bool = True)
```

Save the full experiment including models, test partitions, and configuration.

#### `save_database`

```python
save_database()
```

Export the database to CSV.

#### `save_models`

```python
save_models(ensemble_path: str, backend: str = 'onnx', save_all: bool = True)
```

Save models using `onnx` or `joblib` backends.

#### `save_reps`

```python
save_reps(rep_dir: str)
```

Save precomputed representations to disk.

#### `predict`

```python
predict(df: pd.DataFrame, feature_field: str, experiment_dir: str, backend: str = 'onnx') -> np.ndarray
```

Load a saved experiment and predict using the trained ensemble on new data.
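A minimal sketch of scoring new data with a previously saved experiment (the CSV path and the `sequence` column name are illustrative placeholders, not part of the API):

```python
import pandas as pd

# `runner` is an AutoPeptideML instance built as in the example usage below.
new_df = pd.read_csv('new_peptides.csv')  # hypothetical input file
predictions = runner.predict(
    new_df,
    feature_field='sequence',     # column holding the peptide sequences
    experiment_dir='outputdir',   # directory written by `save_experiment`
    backend='onnx'
)
```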

---

### Configuration Keys

The `config` dictionary passed to the constructor must include the following keys:

* `outputdir`: str
* `pipeline`: dict or str
* `representation`: dict or str
* `train`: dict or str
* `databases`: dict
* `test`: dict

---

### Dependencies

* pandas, numpy
* yaml, json
* hestia
* sklearn
* skl2onnx, onnxmltools, joblib (optional)

---

## Example Usage

```python
import yaml

from autopeptideml.autopeptideml import AutoPeptideML

config = yaml.safe_load(open('config.yml'))
runner = AutoPeptideML(config)
pipeline = runner.get_pipeline()
db = runner.get_database()
reps, x = runner.get_reps()
test = runner.get_test()
trainer = runner.get_train()
models = runner.run_hpo()
evaluation = runner.run_evaluation(models)
runner.save_experiment()
```

---

For detailed config templates and supported options, see the corresponding YAML schema documentation.
-------------------------------------------------------------------------------- /docs/imgs/APML_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/AutoPeptideML/adb18578f145d1c3a78d6860ad5f7d35c726159a/docs/imgs/APML_dark.png -------------------------------------------------------------------------------- /docs/imgs/APML_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/AutoPeptideML/adb18578f145d1c3a78d6860ad5f7d35c726159a/docs/imgs/APML_light.png -------------------------------------------------------------------------------- /docs/index.md: --------------------------------------------------------------------------------
{%
   include-markdown "../README.md"
%}
-------------------------------------------------------------------------------- /docs/repenginebase.md: --------------------------------------------------------------------------------


# `RepEngineBase` Class Documentation

**Module:** `rep_engine_base`

## Purpose
`RepEngineBase` is an abstract base class for molecular representation engines. It defines a standard interface and utilities for computing molecular representations from a list of molecules (e.g., SMILES strings), particularly in batched processing. This class is intended to be subclassed, with core functionality like preprocessing and representation computation implemented in derived classes.

---

## Attributes

- **`engine`** (`str`):
  Name of the representation engine. Typically defined in a subclass or passed during instantiation.

- **`rep`** (`str`):
  Type of molecular representation (e.g., `'fingerprint'`, `'embedding'`).

- **`properties`** (`dict`):
  A deep copy of the instance's dictionary at initialization. Captures configuration state.

---

## Constructor

```python
def __init__(self, rep: str, **args)
```

**Parameters:**
- `rep` (`str`): Type of molecular representation.
- `**args` (`dict`): Additional configuration options stored as attributes.

**Effect:**
Initializes the object, stores `rep`, and adds all additional keyword arguments to the instance.
Also creates a deep copy of all these attributes in `self.properties` for serialization. 37 | 38 | --- 39 | 40 | ## Public Methods 41 | 42 | ### `compute_reps` 43 | 44 | ```python 45 | def compute_reps(self, mols: List[str], verbose: Optional[bool] = False, batch_size: Optional[int] = 12) -> Union[np.ndarray, List[np.ndarray]] 46 | ``` 47 | 48 | **Description:** 49 | Computes molecular representations in batches using `_preprocess_batch` and `_rep_batch`. 50 | 51 | **Parameters:** 52 | - `mols` (`List[str]`): List of molecular inputs (e.g., SMILES strings). 53 | - `verbose` (`bool`, optional): If `True`, shows a progress bar. 54 | - `batch_size` (`int`, optional): Number of molecules per batch. 55 | 56 | **Returns:** 57 | - `np.ndarray` if `average_pooling` is `True` or unset. 58 | - `List[np.ndarray]` if `average_pooling` is explicitly set to `False`. 59 | 60 | --- 61 | 62 | ### `dim` 63 | 64 | ```python 65 | def dim(self) -> int 66 | ``` 67 | 68 | **Description:** 69 | Abstract method. Must return the dimensionality of the computed representation. 70 | 71 | **Raises:** 72 | - `NotImplementedError` 73 | 74 | --- 75 | 76 | ### `_rep_batch` 77 | 78 | ```python 79 | def _rep_batch(self, batch: List[str]) -> np.ndarray 80 | ``` 81 | 82 | **Description:** 83 | Abstract method. Must compute and return the representation for a batch of molecules. 84 | 85 | **Raises:** 86 | - `NotImplementedError` 87 | 88 | --- 89 | 90 | ### `_preprocess_batch` 91 | 92 | ```python 93 | def _preprocess_batch(self, batch: List[str]) -> List[str] 94 | ``` 95 | 96 | **Description:** 97 | Abstract method. Must return a preprocessed version of the batch for representation. 98 | 99 | **Raises:** 100 | - `NotImplementedError` 101 | 102 | --- 103 | 104 | ### `save` 105 | 106 | ```python 107 | def save(self, filename: str) 108 | ``` 109 | 110 | **Description:** 111 | Serializes and saves the engine’s properties to a YAML file. 112 | 113 | **Parameters:** 114 | - `filename` (`str`): Destination path for the YAML file. 115 | 116 | ## Design Notes 117 | 118 | - This class provides **batch processing** support and optional **average pooling** control. 119 | - The use of `batched` from `itertools` supports Python 3.10+ but also includes a fallback implementation for older versions. 120 | - Intended for extension: Subclasses must implement `_rep_batch`, `_preprocess_batch`, and `dim`. 121 | -------------------------------------------------------------------------------- /docs/repenginefp.md: -------------------------------------------------------------------------------- 1 | # RepEngineFP 2 | 3 | ::: autopeptideml.reps.fps.RepEngineFP 4 | 5 | -------------------------------------------------------------------------------- /docs/repenginelm.md: -------------------------------------------------------------------------------- 1 | # RepEngineLM 2 | 3 | ::: autopeptideml.reps.lms.RepEngineLM 4 | 5 | -------------------------------------------------------------------------------- /docs/repengineseqbased.md: -------------------------------------------------------------------------------- 1 | # RepEngineOneHotEncoding 2 | 3 | ::: autopeptideml.reps.seq_based.RepEngineOnehot 4 | 5 | -------------------------------------------------------------------------------- /examples/API_docs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AutoPeptideML API Python\n", 8 | "\n", 9 | "## 1. 
Introduction\n", 10 | "\n", 11 | "The functionalities of AutoPeptideML Python API is focused in a single class, `AutoPeptideML`. Initialization of the class includes 3 possible arguments:\n", 12 | "\n", 13 | "- `verbose`: boolean value. Default: `True`.\n", 14 | "- `threads`: number of threads to use for multithreading. By default it uses all available CPU cores.\n", 15 | "- `seed`: seed for pseudo-random number generator for all stochastic processes. Default: `42`." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "vscode": { 23 | "languageId": "plaintext" 24 | } 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "from autopeptideml.autopeptideml import AutoPeptideML\n", 29 | "\n", 30 | "apml = AutoPeptideML(\n", 31 | " verbose=True,\n", 32 | " threads=8,\n", 33 | " seed=42\n", 34 | ")" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## 2. Dataset preparation\n", 42 | "\n", 43 | "There are 3 methods to handle dataset preparation:\n", 44 | "\n", 45 | "- `autosearch_negatives`: Searches for negative bioactive peptides\n", 46 | " - `df_pos`: `pd.DataFrame` with positive samples\n", 47 | " - `positive_tags`: `List[str]` with all bioactivities that may overlap with the positive class\n", 48 | " - `proportion`: `float` number. Target negative:positive ratio. Default: `1.0`.\n", 49 | "- `balance_samples`: Balances labels in the dataset by oversampling the underepresented classes.\n", 50 | " - `df`: `pd.DataFrame`. Dataframe with `Y` column, for which labels will be balanced.\n", 51 | "- `curate_dataset`: Load the dataset, remove non-canonical and empty sequences.\n", 52 | " - `dataset`: `Union[str, pd.DataFrame]`. The input can be either the path to a `.fasta`, `.csv`, or `.tsv` file or a `pd.DataFrame`.\n", 53 | " - `outputdir`: `str`. Path to a directory where to save the curated dataset.\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "vscode": { 61 | "languageId": "plaintext" 62 | } 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# Dataset curation\n", 67 | "df_negs = apml.curate_dataset(\n", 68 | " dataset='example_dataset_with_negatives.fasta',\n", 69 | " output='output_dir'\n", 70 | ")\n", 71 | "df_pos = apml.curate_dataset(\n", 72 | " dataset='example_dataset_with_positives.fasta',\n", 73 | " output='output_dir_2'\n", 74 | ")\n", 75 | "\n", 76 | "# Balance samples_to_draw (only if df contains negative samples)\n", 77 | "df_negs_balanced = apml.balance_samples(df_negs)\n", 78 | "\n", 79 | "# Autosearch for negatives\n", 80 | "df = apml.autosearch_negatives(\n", 81 | " df_pos=df_pos,\n", 82 | " positive_tags=['Neuropeptides'],\n", 83 | " proportion=1.0\n", 84 | ")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## 3. Dataset partitioning\n", 92 | "\n", 93 | "There are two steps of dataset partitioning: training/evaluation and training/validation folds.\n", 94 | "\n", 95 | "- `train_test_partition`: Creates training/evaluation sets using novel homology partitioning algorithm\n", 96 | " - `df`: `pd.DataFrame`\n", 97 | " - `threshold`: `float`. Maximum sequence identity value between sequences in training and evaluation sets. Default: `0.3`\n", 98 | " - `test_size`: `float`. Proportion of samples that should comprise the evaluation set. Default: `0.2`\n", 99 | " - `alignment`: `str`. Alignment method to be used. Options: `needle`, `mmseqs` and `mmseqs+prefilter`. 
Default: `mmseqs+prefilter`\n", 100 | " - `outputdir`: `str`. Path to a directory where to save the generated datasets.\n", 101 | "- `train_val_partition`: Creates n training/validation folds\n", 102 | " - `df`: `pd.DataFrame`. Should be the training dataset generated with the previous step.\n", 103 | " - `method`: `str`. Method for partitioning. Options: `random` and `graph-part`. `random` refers to `StratifiedKFold` from `sklearn.model_selection` and `graph-part` to `stratified_k_fold` from the GraphPart algorithm. For more details see the [Project Github Repository](https://github.com/graph-part/graph-part).\n", 104 | " - `threshold`: `float`. Maximum sequence identity value between sequences in training and valdation folds. Only valid if method is `graph-part`. Default: `0.5`.\n", 105 | " - `alignment`: `str`. Alignment method to be used. Options: `needle`, `mmseqs` and `mmseqs+prefilter`. Only valid if method is `graph-part`. Default: `mmseqs+prefilter`.\n", 106 | " - `n_folds`: `int`. Number of folds to be generated. Default: `10`.\n", 107 | " - `outputdir`: `str`. Path to a directory where to save the generated datasets.\n", 108 | " " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "vscode": { 116 | "languageId": "plaintext" 117 | } 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "datasets = apml.train_test_partition(\n", 122 | " df=df,\n", 123 | " threshold=0.3,\n", 124 | " test_size=0.2,\n", 125 | " alignment='mmseqs+prefilter',\n", 126 | " outputdir='outputdir/splits'\n", 127 | ")\n", 128 | "folds = apml.train_val_partition(\n", 129 | " df=datasets['train'],\n", 130 | " method='random',\n", 131 | " n_folds=10,\n", 132 | " outputdir='outputdir/folds'\n", 133 | ")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## 4. Peptide Representation\n", 141 | "\n", 142 | "The Peptide Representation step requires an additional class within the AutoPeptideML package, `RepresentationEngine`, that loads the Protein Language Model (PLM) of choice.\n", 143 | "\n", 144 | "- `RepresentationEngine`:\n", 145 | " - `model`: `str`. Protein Language Model, see Github Repo `README.md` file. Default: `esm2-8m`\n", 146 | " - `batch_size`: Number of peptide sequences to compute in each batch, depends on the RAM memory either in the CPU or the GPU. Default: `64`.\n", 147 | "- `AutoPeptideML`:\n", 148 | " - `compute_representation`: Uses the `RepresentationEngine` class to compute the representations in the dataset.\n", 149 | " - `datasets`: `Dict[str, pd.DataFrame]` dictionary with the dataset partitions\n", 150 | " - `re`: `RepresentationEngine`\n", 151 | "\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "vscode": { 159 | "languageId": "plaintext" 160 | } 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "from autopeptideml.utils.embeddings import RepresentationEngine\n", 165 | "\n", 166 | "re = RepresentationEngine(\n", 167 | " model='esm2-8m',\n", 168 | " batch_size=64\n", 169 | ")\n", 170 | "id2rep = apml.compute_representations(\n", 171 | " datasets=datasets,\n", 172 | " re=re\n", 173 | ")" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## 5. Hyperparameter Optimisation and Model Training\n", 181 | "\n", 182 | "- `hpo_train`\n", 183 | " - `config`: `dict`. 
`JSON` file with the hyperparameter search space; for examples of the format, please refer to the files in `autopeptideml/data/configs`.\n",
   "  - `train_df`: `pd.DataFrame` with the training dataset.\n",
   "  - `id2rep`: `dict`. Result from running `apml.compute_representations`.\n",
   "  - `folds`: `list`. List of training/validation folds.\n",
   "  - `outputdir`: `str`. Path to a directory where to save the results."
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
   "vscode": {
    "languageId": "plaintext"
   }
  },
  "outputs": [],
  "source": [
   "import json\n",
   "\n",
   "model = apml.hpo_train(\n",
   "    config=json.load(open('../autopeptideml/data/config/default_config.json')),\n",
   "    train_df=datasets['train'],\n",
   "    id2rep=id2rep,\n",
   "    folds=folds,\n",
   "    outputdir='outputdir/ensemble'\n",
   ")"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "## 6. Ensemble Evaluation\n",
   "\n",
   "- `evaluate_model`\n",
   "  - `best_model`: Ensemble generated in the previous step.\n",
   "  - `test_df`: `pd.DataFrame` with the evaluation set.\n",
   "  - `id2rep`: `dict`. Representations generated in Step 4.\n",
   "  - `outputdir`: `str`.\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
   "vscode": {
    "languageId": "plaintext"
   }
  },
  "outputs": [],
  "source": [
   "results = apml.evaluate_model(\n",
   "    best_model=model,\n",
   "    test_df=datasets['test'],\n",
   "    id2rep=id2rep,\n",
   "    outputdir='outputdir/results'\n",
   ")"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "## 7. 
Prediction\n", 245 | "\n", 246 | "- `predict`: Predict the bioactivity of a set of peptide sequences given an ensemble already trained.\n", 247 | " - `df`: `pd.DataFrame` with the peptide sequences.\n", 248 | " - `re`: `RepresentationEngine`\n", 249 | " - `ensemble_path`: Path where the ensemble files were saved.\n", 250 | " - `outputdir`" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "vscode": { 258 | "languageId": "plaintext" 259 | } 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "apml.predict(\n", 264 | " df=pd.read_csv('New_samples.csv'),\n", 265 | " re=re,\n", 266 | " ensemble_path='outputdir/ensemble',\n", 267 | " outputdir='prediction'\n", 268 | ")" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "language_info": { 274 | "name": "python" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | -------------------------------------------------------------------------------- /examples/AutoPeptideML_Collab.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"private_outputs":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["# @title Install AutoPeptideML\n","%%capture\n","!pip install autopeptideml\n","!wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz\n","!tar xvfz /content/mmseqs-linux-avx2.tar.gz\n","!cp /content/mmseqs/bin/mmseqs /bin/\n","%env mmseqs=/bin/mmseqs\n"],"metadata":{"id":"Ssp28JzPWsWD"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"id":"jRIN9Z3jxtnp","cellView":"form"},"outputs":[],"source":["# @title Import AutoPeptideML\n","import pandas as pd\n","\n","from autopeptideml import AutoPeptideML, RepresentationEngine\n","from autopeptideml.utils.embeddings import AVAILABLE_MODELS, SYNONYMS\n","\n","apml = AutoPeptideML(verbose=True)"]},{"cell_type":"markdown","source":["# Hyperparameter Space\n","\n","You can define the hyperparameter search space for a single model (`hpo_single`), for an ensemble of models (`hpo_ensemble`), the UniDL4BioPep architecture can also be used. Both options are added below and you can execute whichever you prefer to use. 
The search spaces can be modified at will, more information in the project documentation: https://ibm.github.io/AutoPeptideML/."],"metadata":{"id":"LLhFmK9NmEaw"}},{"cell_type":"code","source":["# @title HPO single (model selection)\n","hpo_space = {\n"," \"trials\": 100,\n"," \"model_selection\": [\n","{\n"," \"model\": \"K-Nearest Neighbours\",\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"n_neighbors\",\n"," \"type\": \"int\",\n"," \"min\": 1,\n"," \"max\": 30,\n"," \"log\": \"False\"\n"," },\n"," {\n"," \"name\": \"weights\",\n"," \"type\": \"categorical\",\n"," \"values\": [\"uniform\", \"distance\"]\n"," }\n"," ]\n"," },\n"," # {\n"," # \"model\": \"mlp\",\n"," # \"optimization_metric\": \"test_matthews_corrcoef\",\n"," # \"hyperparameter-space\": [\n"," # {\n"," # \"name\": \"learning_rate\",\n"," # \"type\": \"float\",\n"," # \"min\": 1e-7,\n"," # \"max\": 1,\n"," # \"log\": \"True\"\n"," # },\n"," # {\n"," # \"name\": \"activation\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"identity\", \"logistic\", \"tanh\", \"relu\"]\n"," # },\n"," # {\n"," # \"name\": \"solver\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"adam\", \"sgd\"]\n"," # },\n"," # {\n"," # \"name\": \"hidden_layer_sizes\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [[12, 12], [120, 120], [12, 12, 12], [120, 120, 120], [12, 12, 12, 12]]\n"," # }\n"," # ]\n"," # },\n"," # {\n"," # \"model\": \"XGBoost\",\n"," # \"optimization_metric\": \"test_matthews_corrcoef\",\n"," # \"hyperparameter-space\": [\n"," # {\n"," # \"name\": \"learning_rate\",\n"," # \"type\": \"float\",\n"," # \"min\": 1e-5,\n"," # \"max\": 1,\n"," # \"log\": \"True\"\n"," # },\n"," # {\n"," # \"name\": \"n_estimators\",\n"," # \"type\": \"int\",\n"," # \"min\": 1,\n"," # \"max\": 100,\n"," # \"log\": \"False\"\n"," # },\n"," # {\n"," # \"name\": \"max_depth\",\n"," # \"type\": \"int\",\n"," # \"min\": 1,\n"," # \"max\": 10,\n"," # \"log\": \"False\"\n"," # },\n"," # ]\n"," # },\n"," {\n"," \"model\": \"RFC\",\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"max_depth\",\n"," \"type\": \"int\",\n"," \"min\": 2,\n"," \"max\": 20,\n"," \"log\": \"False\"\n"," },\n"," {\n"," \"name\": \"n_estimators\",\n"," \"type\": \"int\",\n"," \"min\": 10,\n"," \"max\": 100,\n"," \"log\": \"False\"\n"," }\n"," ]\n"," },\n"," {\n"," \"model\": \"LightGBM\",\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"max_depth\",\n"," \"type\": \"int\",\n"," \"min\": 1,\n"," \"max\": 30,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"num_leaves\",\n"," \"type\": \"int\",\n"," \"min\": 5,\n"," \"max\": 50,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"learning_rate\",\n"," \"type\": \"float\",\n"," \"min\": 0.001,\n"," \"max\": 0.3,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"verbose\",\n"," \"type\": \"fixed\",\n"," \"value\": -1\n"," }\n"," ]\n"," }\n"," ]\n","}"],"metadata":{"id":"RPtMhCzvlW1D","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title HPO UniDL4BioPep\n","hpo_space = {\n"," \"ensemble\":\n"," [\n"," {\n"," \"model\": \"unidl4biopep\",\n"," \"trials\": 100,\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": {\n"," \"epochs\": 20,\n"," \"optimizer\": [\n"," {\n"," \"name\": \"lr\",\n"," \"type\": \"float\",\n"," \"min\": 1e-7,\n"," 
\"max\": 0.1\n"," }\n"," ]\n"," }\n"," }\n"," ]\n","}"],"metadata":{"cellView":"form","id":"vOEdRcfknX2H"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title HPO ensemble\n","hpo_space = {\n"," \"ensemble\": [\n"," {\n"," \"model\": \"K-Nearest Neighbours\",\n"," \"trials\": 30,\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"n_neighbors\",\n"," \"type\": \"int\",\n"," \"min\": 1,\n"," \"max\": 30,\n"," \"log\": \"False\"\n"," },\n"," {\n"," \"name\": \"weights\",\n"," \"type\": \"categorical\",\n"," \"values\": [\"uniform\", \"distance\"]\n"," }\n"," ]\n"," },\n"," # {\n"," # \"model\": \"mlp\",\n"," # \"trials\": 30,\n"," # \"optimization_metric\": \"test_matthews_corrcoef\",\n"," # \"hyperparameter-space\": [\n"," # {\n"," # \"name\": \"learning_rate\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"constant\", \"invscaling\", \"adaptive\"]\n"," # },\n"," # {\n"," # \"name\": \"activation\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"identity\", \"logistic\", \"tanh\", \"relu\"]\n"," # },\n"," # {\n"," # \"name\": \"learning_rate_init\",\n"," # \"type\": \"float\",\n"," # \"min\": 1e-7,\n"," # \"max\": 1e-1,\n"," # \"log\": True\n"," # },\n"," # {\n"," # \"name\": \"solver\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [\"adam\", \"sgd\"]\n"," # },\n"," # {\n"," # \"name\": \"hidden_layer_sizes\",\n"," # \"type\": \"categorical\",\n"," # \"values\": [[12, 12], [120, 120], [12, 12, 12], [120, 120, 120], [12, 12, 12, 12]]\n"," # }\n"," # ]\n"," # },\n"," # {\n"," # \"model\": \"XGBoost\",\n"," # \"trials\": 30,\n"," # \"optimization_metric\": \"test_matthews_corrcoef\",\n"," # \"hyperparameter-space\": [\n"," # {\n"," # \"name\": \"learning_rate\",\n"," # \"type\": \"float\",\n"," # \"min\": 1e-5,\n"," # \"max\": 1,\n"," # \"log\": \"True\"\n"," # },\n"," # {\n"," # \"name\": \"n_estimators\",\n"," # \"type\": \"int\",\n"," # \"min\": 1,\n"," # \"max\": 100,\n"," # \"log\": \"False\"\n"," # },\n"," # {\n"," # \"name\": \"max_depth\",\n"," # \"type\": \"int\",\n"," # \"min\": 1,\n"," # \"max\": 10,\n"," # \"log\": \"False\"\n"," # },\n"," # ]\n"," # },\n"," {\n"," \"model\": \"RFC\",\n"," \"trials\": 30,\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"max_depth\",\n"," \"type\": \"int\",\n"," \"min\": 2,\n"," \"max\": 20,\n"," \"log\": \"False\"\n"," },\n"," {\n"," \"name\": \"n_estimators\",\n"," \"type\": \"int\",\n"," \"min\": 10,\n"," \"max\": 100,\n"," \"log\": \"False\"\n"," }\n"," ]\n"," },\n"," {\n"," \"model\": \"LightGBM\",\n"," \"trials\": 30,\n"," \"optimization_metric\": \"test_matthews_corrcoef\",\n"," \"hyperparameter-space\": [\n"," {\n"," \"name\": \"max_depth\",\n"," \"type\": \"int\",\n"," \"min\": 1,\n"," \"max\": 30,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"num_leaves\",\n"," \"type\": \"int\",\n"," \"min\": 5,\n"," \"max\": 50,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"learning_rate\",\n"," \"type\": \"float\",\n"," \"min\": 0.001,\n"," \"max\": 0.3,\n"," \"log\": \"True\"\n"," },\n"," {\n"," \"name\": \"verbose\",\n"," \"type\": \"fixed\",\n"," \"value\": -1\n"," }\n"," ]\n"," }\n"," ]\n","}\n"],"metadata":{"id":"FNr1wMMlm6ys","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Load your data\n"],"metadata":{"id":"Cjzpf_QZnzgH"}},{"cell_type":"code","source":["# @title Upload dataset\n","from google.colab import 
files\n","import io\n","\n","uploaded = files.upload()\n","df = pd.read_csv(io.StringIO(uploaded[list(uploaded.keys())[0]].decode('utf-8')))\n","df.head()"],"metadata":{"id":"sHKkOfGGYEyi","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Inputs\n","\n","field_name = 'sequence' # @param {type: 'string'}\n","id_field = None # @param{type: 'raw'}\n","label_name = 'bioactivity' #@param{type: 'string'}\n","alignment_algorithm = 'mmseqs' #@param{type: 'string'}\n","threshold = 0.3 #@param\n","plm_model = 'esm2-8m' #@param {type: 'string'}\n","\n","if plm_model not in AVAILABLE_MODELS and plm_model not in SYNONYMS:\n"," print(f'Model: {plm_model} is not supported, please use one of the following: {list(SYNONYMS.keys())}')\n","\n","if id_field is None:\n"," df['id'] = df.index\n","else:\n"," df['id'] = df[id_field]\n","\n","df['sequence'] = df[field_name]\n","df['labels'] = df[label_name]\n","\n","df.head()"],"metadata":{"id":"AHMxISg2WT85","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Split dataset\n","from hestia.partition import ccpart, random_partition\n","\n","datasets = apml.train_test_partition(df, threshold=threshold, alignment=alignment_algorithm, denominator='n_aligned')\n"],"metadata":{"id":"S_6699wxVFtX"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Preparing cross-validation folds\n","\n","folds = apml.train_val_partition(\n"," datasets['train'], method='random',\n"," threshold=0.4, alignment='mmseqs+prefilter',\n"," n_folds=10, outputdir='results/folds',\n","\n",")"],"metadata":{"id":"o3oYbY-6p7j5","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title What Representation Model do you want to use?\n","# @markdown It is recommended to set the runtime to GPU in order to accelerate embedding computation.\n","\n","re = RepresentationEngine(plm_model, 12)\n","id2rep = apml.compute_representations(datasets, re)\n","id2rep = {id: rep.numpy() for id, rep in id2rep.items()}"],"metadata":{"id":"g9iid82xoXAP","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Train models\n","\n","model = apml.hpo_train(\n"," hpo_space, datasets['train'], id2rep, folds, 'results'\n",")"],"metadata":{"id":"oUlBUkKIljFW","cellView":"form"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Evaluate models\n","\n","results = apml.evaluate_model(\n"," model, datasets['test'], id2rep, 'results'\n",")\n","print(results)"],"metadata":{"id":"0XLgate7lxBr","cellView":"form"},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: AutoPeptideML 2 | site_description: Computational Pipeline for the Automatised Development of Peptide Bioactivity Prediction Models 3 | watch: [autopeptideml] 4 | 5 | nav: 6 | - Home: 7 | - Overview: index.md 8 | - Code reference: autopeptideml/ 9 | - Code reference: 10 | - AutoPeptideML: 11 | - autopeptideml.md 12 | - RepEngineBase: 13 | - repenginebase.md 14 | - RepEngineFP: 15 | - repenginefp.md 16 | - RepEngineLM: 17 | - repenginelm.md 18 | - RepEngineSeqBased: 19 | - repengineseqbased.md 20 | markdown_extensions: 21 | - attr_list 22 | theme: 23 | name: material 24 | features: 25 | - content.code.annotate 26 | - navigation.tabs 27 | - navigation.top 28 | palette: 29 | - media: 
"(prefers-color-scheme: light)" 30 | scheme: default 31 | primary: black 32 | accent: purple 33 | # toggle: 34 | # icon: material/weather-sunny 35 | # name: Switch to light mode 36 | # - media: "(prefers-color-scheme: dark)" 37 | # scheme: slate 38 | # primary: black 39 | # accent: lime 40 | # toggle: 41 | # icon: material/weather-night 42 | # name: Switch to dark mode 43 | features: 44 | - search.suggest 45 | - search.highlight 46 | - content.tabs.link 47 | icon: 48 | repo: fontawesome/brands/github-alt 49 | language: en 50 | repo_name: IBM/AutoPeptideML 51 | repo_url: https://github.com/IBM/AutoPeptideML 52 | edit_uri: '' 53 | plugins: 54 | - search 55 | - include-markdown 56 | - mkdocstrings: 57 | handlers: 58 | python: 59 | import: 60 | - https://docs.python.org/3/objects.inv 61 | - https://installer.readthedocs.io/en/stable/objects.inv # demonstration purpose in the docs 62 | - https://mkdocstrings.github.io/autorefs/objects.inv 63 | options: 64 | show_source: false 65 | docstring_style: sphinx 66 | docstring_options: 67 | # ignore_init_summary: yes 68 | merge_init_into_class: yes 69 | show_submodules: yes 70 | - markdownextradata: 71 | data: data 72 | markdown_extensions: 73 | - toc: 74 | permalink: true 75 | - markdown.extensions.codehilite: 76 | guess_lang: false 77 | - mdx_include: 78 | base_path: docs 79 | - admonition 80 | - codehilite 81 | - extra 82 | - pymdownx.superfences: 83 | custom_fences: 84 | - name: mermaid 85 | class: mermaid 86 | format: !!python/name:pymdownx.superfences.fence_code_format '' 87 | - pymdownx.tabbed: 88 | alternate_style: true 89 | - attr_list 90 | - md_in_html 91 | extra: 92 | social: 93 | - icon: fontawesome/brands/github-alt 94 | link: https://github.com/IBM/AutoPeptideML 95 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """The setup script.""" 4 | import os 5 | from setuptools import setup, find_packages 6 | from pathlib import Path 7 | 8 | 9 | this_directory = Path(__file__).parent 10 | readme = (this_directory / "README.md").read_text() 11 | 12 | requirements = [ 13 | 'optuna', 14 | 'scikit-learn', 15 | 'typer', 16 | 'mljar-scikit-plot', 17 | 'tokenizers', 18 | 'torch', 19 | 'transformers', 20 | 'lightgbm', 21 | 'xgboost', 22 | 'mdpdf', 23 | 'hestia-good', 24 | 'onnxmltools', 25 | 'skl2onnx', 26 | 'onnxruntime', 27 | ] 28 | 29 | test_requirements = requirements 30 | files = ['autopeptideml/data/readme_ex.md'] 31 | setup( 32 | author="Raul Fernandez-Diaz", 33 | author_email='raulfd@ibm.com', 34 | python_requires='>=3.9', 35 | classifiers=[ 36 | ], 37 | description="AutoML system for building trustworthy peptide bioactivity predictors", 38 | entry_points={ 39 | 'console_scripts': [ 40 | 'apml=autopeptideml.main:_main', 41 | 'autopeptideml=autopeptideml.main:_main' 42 | ], 43 | }, 44 | install_requires=requirements, 45 | license="MIT", 46 | long_description=readme, 47 | long_description_content_type='text/markdown', 48 | data_files=[('', files)], 49 | include_package_data=True, 50 | keywords='autopeptideml', 51 | name='autopeptideml', 52 | packages=find_packages(exclude=['examples']), 
53 |     url='https://ibm.github.io/AutoPeptideML/',
54 |     version='2.0.0',
55 |     zip_safe=False,
56 | )
57 | 
--------------------------------------------------------------------------------
/tests/test_apml.py:
--------------------------------------------------------------------------------
1 | from autopeptideml import AutoPeptideML
2 | 
3 | 
4 | # def test_load():
5 | #     apml = AutoPeptideML()
6 | #     df = apml.curate_dataset('examples/AB_positives.csv')
7 | #     assert len(df) == 6_583
8 | 
--------------------------------------------------------------------------------
/tests/test_db.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | 
3 | import numpy as np
4 | 
5 | from autopeptideml.db import Database
6 | from autopeptideml.pipeline import Pipeline, CanonicalFilter
7 | 
8 | 
9 | def test_database():
10 |     dir_path = osp.abspath(osp.dirname(__file__))
11 |     path = osp.join(dir_path, 'sample', 'example.csv')
12 |     db = Database(path, feat_fields=['sequence'],
13 |                   pipe=Pipeline([CanonicalFilter()]))
14 |     assert len(db) == 500
15 |     path2 = osp.join(dir_path, 'sample', 'example2.csv')
16 |     db2 = Database(path2, feat_fields=['sequence'],
17 |                    pipe=Pipeline([CanonicalFilter()]),
18 |                    label_field='Y')
19 |     db2.df['Y'] = 1
20 |     db2.add_negatives(db, columns_to_exclude=['Allergen', 'Toxic'])
21 |     labels, counts = np.unique(db2.df.Y, return_counts=True)
22 |     assert labels.tolist() == [0, 1]
23 |     assert counts.tolist() == [272, 300]
24 | 
--------------------------------------------------------------------------------
/tests/test_pipeline.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from autopeptideml.pipeline import (Pipeline, CanonicalCleaner,
4 |                                     CanonicalFilter, SequenceToSMILES,
5 |                                     FilterSMILES, SmilesToSequence)
6 | 
7 | 
8 | def test_canonical_filter():
9 |     seqs = ['AAACCTWSFB', 'AAACCTWF', 'AAACCTWaF']
10 |     pipe = Pipeline([CanonicalFilter()])
11 |     seqs_out = pipe(seqs)
12 |     assert seqs_out == ['AAACCTWF']
13 | 
14 | 
15 | @pytest.mark.parametrize("smiles, seq_out",
16 |                          [
17 |                              ('C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]', "AAACCTWSFB"),
18 |                              ('C[C@@H](O)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O', "AAACCTWF"),
19 |                          ])
20 | def to_sequence(smiles, seq_out):  # not collected by pytest: name lacks the 'test_' prefix
21 |     pipe = Pipeline([SmilesToSequence()])
22 |     seq_pred = pipe(smiles)
23 |     assert seq_pred == seq_out
24 | 
25 | 
26 | def test_canonical_cleaner():
27 |     seqs = ['AAACCTWSFB', 'AAACCTWF', 'AAACCTWaF']
28 |     pipe = Pipeline([CanonicalCleaner()])
29 |     seqs_out = pipe(seqs)
30 |     assert seqs_out == ['AAACCTWSFX', 'AAACCTWF', 'AAACCTWXF']
31 | 
32 | 
33 | def test_to_smiles():
34 |     seqs = ['BRTWSF', 'ARTWF', 'aRTWSF', 'C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]']
35 |     pipe1 = Pipeline([FilterSMILES()], name='pipe_smiles')
36 |     pipe2 = Pipeline([FilterSMILES(keep_smiles=False),
37 |                       CanonicalCleaner(substitution='G'),
38 |                       SequenceToSMILES()], name='pipe_seqs')
39 |     pipe = Pipeline([pipe1, pipe2], name='main_pipeline', aggregate=True)
40 |     seqs_out = pipe(seqs, verbose=True)
41 |     assert seqs_out == [
42 |         'C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]',
43 |         'C[C@@H](O)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O',
44 |         'C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)[C@@H](C)O',
45 |         'C[C@@H](O)[C@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O'
46 |     ]
47 | 
48 | 
49 | if __name__ == '__main__':
50 |     test_canonical_cleaner()
51 |     test_canonical_filter()
52 |     test_to_smiles()
53 | 
--------------------------------------------------------------------------------
/tests/test_reps.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | import numpy as np
4 | 
5 | from autopeptideml.reps.lms import RepEngineLM
6 | from autopeptideml.reps.seq_based import RepEngineOnehot
7 | from autopeptideml.reps.fps import RepEngineFP
8 | 
9 | 
10 | def test_esm_family():
11 |     re = RepEngineLM('esm2-8m', average_pooling=True)
12 |     a = re.compute_reps(['AACFFF'], batch_size=12)
13 |     b = re.compute_reps(['AACFFF', 'AACCF'], batch_size=12)
14 |     re = RepEngineLM('esm2-8m', average_pooling=False)
15 |     c = re.compute_reps(['AACFFF'], batch_size=12)
16 | 
17 |     assert re.dim() == 320
18 |     assert a.shape == (1, 320)
19 |     assert b.shape == (2, 320)
20 |     assert np.array(c).shape == (1, 6, 320)
21 | 
22 | 
23 | def test_elnaggar_family():
24 |     re = RepEngineLM('ankh-base')
25 |     a = re.compute_reps(['AACFFF'], batch_size=12)
26 |     assert re.dim() == 768
27 |     assert np.array(a).shape == (1, re.dim())
28 | 
29 | 
30 | def test_one_hot():
31 |     re = RepEngineOnehot(19)
32 |     a = re.compute_reps(['AACFFF', 'AACCF'], batch_size=4)
33 |     dict_re = json.loads(str(re))
34 |     assert dict_re == {'rep': 'one-hot', 'max_length': 19}
35 |     assert re.dim() == 19 * 21
36 |     assert a.shape == (2, 19 * 21)
37 | 
38 | 
39 | def test_fps():
40 |     re1 = RepEngineFP('ecfp', 512, 8)
41 |     a = re1.compute_reps(['C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]'], batch_size=1)
42 |     re2 = RepEngineFP('fcfp', 256, 12)
43 |     b = re2.compute_reps(['C[C@H](N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H]'], batch_size=1)
44 |     dict_1, dict_2 = json.loads(str(re1)), json.loads(str(re2))
45 |     assert dict_1 == {'rep': 'ecfp', 'nbits': 512, 'radius': 8}
46 |     assert dict_2 == {'rep': 'fcfp', 'nbits': 256, 'radius': 12}
47 |     assert re1.dim() == 512
48 |     assert re2.dim() == 256
49 |     assert a.shape == (1, 512)
50 |     assert b.shape == (1, 256)
51 | 
52 | 
53 | def test_rostlab_family():
54 |     re = RepEngineLM('prot-t5-xl')
55 |     a = re.compute_reps(['AACFFF'], batch_size=12)
56 |     assert re.dim() == 1024
57 |     assert np.array(a).shape == (1, re.dim())
58 | 
59 | 
60 | if __name__ == '__main__':
61 |     test_esm_family()
62 |     print('ESM OK')
63 |     test_elnaggar_family()
64 |     print('Elnaggar OK')
65 |     test_one_hot()
66 |     print('Onehot OK')
67 |     test_fps()
68 |     print("FPs OK")
69 |     test_rostlab_family()
70 |     print("Rostlab OK")
71 | 
--------------------------------------------------------------------------------
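
Usage note: the Colab notebook above (examples/AutoPeptideML_Collab.ipynb) chains dataset partitioning, representation computation, hyperparameter-optimized training, and evaluation. The sketch below strings the same calls together as a plain Python script, with a single-model search space following the schema of the notebook's "HPO single" cell. Treat it as an illustrative outline under stated assumptions, not a file from the repository: the import path for RepresentationEngine and the input file 'peptides.csv' (any CSV with a 'sequence' column and a binary 'bioactivity' column) are assumptions, since the notebook's setup cells are not reproduced in this listing.

import pandas as pd

from autopeptideml import AutoPeptideML
# Assumption: RepresentationEngine is importable from the package top level,
# as suggested by the notebook's usage; adjust to your installed version.
from autopeptideml import RepresentationEngine

apml = AutoPeptideML()

# Minimal search space following the "HPO single" schema: a trial budget,
# a list of candidate models, an optimization metric, and per-hyperparameter
# ranges (int/float ranges or categorical choices).
hpo_space = {
    "trials": 100,
    "model_selection": [{
        "model": "K-Nearest Neighbours",
        "optimization_metric": "test_matthews_corrcoef",
        "hyperparameter-space": [
            {"name": "n_neighbors", "type": "int",
             "min": 1, "max": 30, "log": "False"},
            {"name": "weights", "type": "categorical",
             "values": ["uniform", "distance"]}
        ]
    }]
}

# 'peptides.csv' is a placeholder; rename columns as in the notebook's
# Inputs cell so that 'id', 'sequence', and 'labels' exist.
df = pd.read_csv('peptides.csv')
df['id'] = df.index
df['labels'] = df['bioactivity']

# Same partitioning, folding, representation, training, and evaluation
# calls as in the notebook cells above.
datasets = apml.train_test_partition(df, threshold=0.3, alignment='mmseqs',
                                     denominator='n_aligned')
folds = apml.train_val_partition(datasets['train'], method='random',
                                 threshold=0.4, alignment='mmseqs+prefilter',
                                 n_folds=10, outputdir='results/folds')

rep_engine = RepresentationEngine('esm2-8m', 12)
id2rep = apml.compute_representations(datasets, rep_engine)
id2rep = {i: rep.numpy() for i, rep in id2rep.items()}

model = apml.hpo_train(hpo_space, datasets['train'], id2rep, folds, 'results')
results = apml.evaluate_model(model, datasets['test'], id2rep, 'results')
print(results)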