├── .github
    └── workflows
    │   ├── python-app.yml
    │   └── python-publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── .source
    └── _static
    │   ├── bio.png
    │   ├── deepchain.png
    │   ├── protein.png
    │   ├── score_mutation.png
    │   ├── sequence.png
    │   └── transformers.png
├── CHANGELOG.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── biotransformers
    ├── __init__.py
    ├── bio_transformers.py
    ├── lightning_utils
    │   ├── __init__.py
    │   ├── data.py
    │   ├── models.py
    │   └── optimizer.py
    ├── tests
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_accuracy.py
    │   ├── test_embeddings.py
    │   ├── test_logits.py
    │   ├── test_loglikelihoods.py
    │   └── test_mutation_score.py
    ├── utils
    │   ├── __init__.py
    │   ├── compute_utils.py
    │   ├── constant.py
    │   ├── deprecated.py
    │   ├── logger.py
    │   ├── msa_utils.py
    │   ├── tqdm_utils.py
    │   └── utils.py
    ├── version.py
    └── wrappers
    │   ├── __init__.py
    │   ├── esm_wrappers.py
    │   ├── language_model.py
    │   ├── rostlab_wrapper.py
    │   └── transformers_wrappers.py
├── data
    ├── fasta
    │   └── example_fasta.fasta
    └── msa
    │   ├── seq0_swissprot.a3m
    │   ├── seq10_swissprot.a3m
    │   ├── seq11_swissprot.a3m
    │   ├── seq12_swissprot.a3m
    │   └── seq1_swissprot.a3m
├── docs
    ├── Makefile
    ├── environment_docs.yaml
    ├── make.bat
    └── source
    │   ├── _build
    │       ├── .buildinfo
    │       ├── .doctrees
    │       │   ├── api
    │       │   │   └── biotransformers.doctree
    │       │   ├── autoapi
    │       │   │   ├── biotransformers
    │       │   │   │   ├── bio_transformers
    │       │   │   │   │   └── index.doctree
    │       │   │   │   ├── index.doctree
    │       │   │   │   ├── lightning_utils
    │       │   │   │   │   ├── data
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── index.doctree
    │       │   │   │   │   ├── models
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   └── optimizer
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   ├── tests
    │       │   │   │   │   ├── conftest
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── index.doctree
    │       │   │   │   │   ├── test_accuracy
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── test_embeddings
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── test_logits
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── test_loglikelihoods
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   └── test_msa
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   ├── utils
    │       │   │   │   │   ├── constant
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── deprecated
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── index.doctree
    │       │   │   │   │   ├── logger
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── msa_utils
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   ├── tqdm_utils
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   │   └── utils
    │       │   │   │   │   │   └── index.doctree
    │       │   │   │   ├── version
    │       │   │   │   │   └── index.doctree
    │       │   │   │   └── wrappers
    │       │   │   │   │   ├── esm_wrappers
    │       │   │   │   │       └── index.doctree
    │       │   │   │   │   ├── index.doctree
    │       │   │   │   │   ├── language_model
    │       │   │   │   │       └── index.doctree
    │       │   │   │   │   ├── rostlab_wrapper
    │       │   │   │   │       └── index.doctree
    │       │   │   │   │   └── transformers_wrappers
    │       │   │   │   │       └── index.doctree
    │       │   │   └── index.doctree
    │       │   ├── contributing
    │       │   │   ├── CHANGELOG.doctree
    │       │   │   └── CONTRIBUTING.doctree
    │       │   ├── documentation
    │       │   │   ├── course.doctree
    │       │   │   ├── logging.doctree
    │       │   │   ├── msa.doctree
    │       │   │   └── multi_gpus.doctree
    │       │   ├── environment.pickle
    │       │   ├── getting_started
    │       │   │   ├── install.doctree
    │       │   │   └── quick_start.doctree
    │       │   ├── index.doctree
    │       │   └── tutorial
    │       │   │   ├── embeddings.doctree
    │       │   │   ├── finetuning.doctree
    │       │   │   └── loglikelihood.doctree
    │       ├── 404.html
    │       ├── _sources
    │       │   ├── api
    │       │   │   └── biotransformers.rst.txt
    │       │   ├── autoapi
    │       │   │   ├── biotransformers
    │       │   │   │   ├── bio_transformers
    │       │   │   │   │   └── index.rst.txt
    │       │   │   │   ├── index.rst.txt
    │       │   │   │   ├── lightning_utils
    │       │   │   │   │   ├── data
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── index.rst.txt
    │       │   │   │   │   ├── models
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   └── optimizer
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   ├── tests
    │       │   │   │   │   ├── conftest
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── index.rst.txt
    │       │   │   │   │   ├── test_accuracy
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── test_embeddings
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── test_logits
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── test_loglikelihoods
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   └── test_msa
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   ├── utils
    │       │   │   │   │   ├── constant
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── deprecated
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── index.rst.txt
    │       │   │   │   │   ├── logger
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── msa_utils
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   ├── tqdm_utils
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   │   └── utils
    │       │   │   │   │   │   └── index.rst.txt
    │       │   │   │   ├── version
    │       │   │   │   │   └── index.rst.txt
    │       │   │   │   └── wrappers
    │       │   │   │   │   ├── esm_wrappers
    │       │   │   │   │       └── index.rst.txt
    │       │   │   │   │   ├── index.rst.txt
    │       │   │   │   │   ├── language_model
    │       │   │   │   │       └── index.rst.txt
    │       │   │   │   │   ├── rostlab_wrapper
    │       │   │   │   │       └── index.rst.txt
    │       │   │   │   │   └── transformers_wrappers
    │       │   │   │   │       └── index.rst.txt
    │       │   │   └── index.rst.txt
    │       │   ├── contributing
    │       │   │   ├── CHANGELOG.md.txt
    │       │   │   └── CONTRIBUTING.md.txt
    │       │   ├── documentation
    │       │   │   ├── course.md.txt
    │       │   │   ├── logging.md.txt
    │       │   │   ├── msa.md.txt
    │       │   │   └── multi_gpus.md.txt
    │       │   ├── getting_started
    │       │   │   ├── install.rst.txt
    │       │   │   └── quick_start.md.txt
    │       │   ├── index.rst.txt
    │       │   └── tutorial
    │       │   │   ├── embeddings.md.txt
    │       │   │   ├── finetuning.md.txt
    │       │   │   └── loglikelihood.md.txt
    │       ├── _static
    │       │   ├── __init__.py
    │       │   ├── basic.css
    │       │   ├── css
    │       │   │   ├── index.c5995385ac14fb8791e8eb36b4908be2.css
    │       │   │   └── theme.css
    │       │   ├── deepchain-small.png
    │       │   ├── doctools.js
    │       │   ├── documentation_options.js
    │       │   ├── file.png
    │       │   ├── graphviz.css
    │       │   ├── images
    │       │   │   ├── logo_binder.svg
    │       │   │   ├── logo_colab.png
    │       │   │   └── logo_jupyterhub.svg
    │       │   ├── jquery-3.5.1.js
    │       │   ├── jquery.js
    │       │   ├── js
    │       │   │   └── index.1c5a1a01449ed65a7b51.js
    │       │   ├── language_data.js
    │       │   ├── minus.png
    │       │   ├── plus.png
    │       │   ├── pygments.css
    │       │   ├── searchtools.js
    │       │   ├── sphinx-book-theme.12a9622fbb08dcb3a2a40b2c02b83a57.js
    │       │   ├── sphinx-book-theme.acff12b8f9c144ce68a297486a2fa670.css
    │       │   ├── sphinx-book-theme.css
    │       │   ├── tabs.css
    │       │   ├── tabs.js
    │       │   ├── underscore-1.12.0.js
    │       │   ├── underscore.js
    │       │   ├── vendor
    │       │   │   └── fontawesome
    │       │   │   │   └── 5.13.0
    │       │   │   │       ├── LICENSE.txt
    │       │   │   │       ├── css
    │       │   │   │           └── all.min.css
    │       │   │   │       └── webfonts
    │       │   │   │           ├── fa-brands-400.eot
    │       │   │   │           ├── fa-brands-400.svg
    │       │   │   │           ├── fa-brands-400.ttf
    │       │   │   │           ├── fa-brands-400.woff
    │       │   │   │           ├── fa-brands-400.woff2
    │       │   │   │           ├── fa-regular-400.eot
    │       │   │   │           ├── fa-regular-400.svg
    │       │   │   │           ├── fa-regular-400.ttf
    │       │   │   │           ├── fa-regular-400.woff
    │       │   │   │           ├── fa-regular-400.woff2
    │       │   │   │           ├── fa-solid-900.eot
    │       │   │   │           ├── fa-solid-900.svg
    │       │   │   │           ├── fa-solid-900.ttf
    │       │   │   │           ├── fa-solid-900.woff
    │       │   │   │           └── fa-solid-900.woff2
    │       │   └── webpack-macros.html
    │       ├── api
    │       │   └── biotransformers.html
    │       ├── autoapi
    │       │   ├── biotransformers
    │       │   │   ├── bio_transformers
    │       │   │   │   └── index.html
    │       │   │   ├── index.html
    │       │   │   ├── lightning_utils
    │       │   │   │   ├── data
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── index.html
    │       │   │   │   ├── models
    │       │   │   │   │   └── index.html
    │       │   │   │   └── optimizer
    │       │   │   │   │   └── index.html
    │       │   │   ├── tests
    │       │   │   │   ├── conftest
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── index.html
    │       │   │   │   ├── test_accuracy
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── test_embeddings
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── test_logits
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── test_loglikelihoods
    │       │   │   │   │   └── index.html
    │       │   │   │   └── test_msa
    │       │   │   │   │   └── index.html
    │       │   │   ├── utils
    │       │   │   │   ├── constant
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── deprecated
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── index.html
    │       │   │   │   ├── logger
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── msa_utils
    │       │   │   │   │   └── index.html
    │       │   │   │   ├── tqdm_utils
    │       │   │   │   │   └── index.html
    │       │   │   │   └── utils
    │       │   │   │   │   └── index.html
    │       │   │   ├── version
    │       │   │   │   └── index.html
    │       │   │   └── wrappers
    │       │   │   │   ├── esm_wrappers
    │       │   │   │       └── index.html
    │       │   │   │   ├── index.html
    │       │   │   │   ├── language_model
    │       │   │   │       └── index.html
    │       │   │   │   ├── rostlab_wrapper
    │       │   │   │       └── index.html
    │       │   │   │   └── transformers_wrappers
    │       │   │   │       └── index.html
    │       │   └── index.html
    │       ├── contributing
    │       │   ├── CHANGELOG.html
    │       │   └── CONTRIBUTING.html
    │       ├── documentation
    │       │   ├── course.html
    │       │   ├── logging.html
    │       │   ├── msa.html
    │       │   └── multi_gpus.html
    │       ├── genindex.html
    │       ├── getting_started
    │       │   ├── install.html
    │       │   └── quick_start.html
    │       ├── index.html
    │       ├── objects.inv
    │       ├── py-modindex.html
    │       ├── search.html
    │       ├── searchindex.js
    │       └── tutorial
    │       │   ├── embeddings.html
    │       │   ├── finetuning.html
    │       │   └── loglikelihood.html
    │   ├── _static
    │       └── deepchain-small.png
    │   ├── api
    │       └── biotransformers.rst
    │   ├── conf.py
    │   ├── contributing
    │       ├── CHANGELOG.md
    │       └── CONTRIBUTING.md
    │   ├── documentation
    │       ├── course.md
    │       ├── logging.md
    │       ├── msa.md
    │       └── multi_gpus.md
    │   ├── getting_started
    │       ├── install.rst
    │       └── quick_start.md
    │   ├── images
    │       ├── bio.png
    │       └── score_mutation.jpeg
    │   ├── index.rst
    │   └── tutorial
    │       ├── embeddings.md
    │       ├── finetuning.md
    │       ├── loglikelihood.md
    │       └── mutations_score.rst
├── environment_dev.yaml
├── requirements.txt
└── setup.py


/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Python application
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ develop ]
 9 |   pull_request:
10 |     branches: [ develop ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - name: Set up Python 3.7
20 |       uses: actions/setup-python@v2
21 |       with:
22 |         python-version: 3.7
23 |     - name: Install dependencies
24 |       run: |
25 |         python -m pip install --upgrade pip
26 |         pip install flake8 pytest pytest-cov
27 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
28 |     - name: Lint with flake8
29 |       run: |
30 |         # stop the build if there are Python syntax errors or undefined names
31 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
32 |         # exit-zero treats all errors as warnings; lines longer than 100 characters are reported
33 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=100 --statistics
34 |     - name: Test with pytest
35 |       run: |
36 |         pip install .
37 |         pytest --cov=./ --cov-report=xml
38 |     - name: "Upload coverage to Codecov"
39 |       uses: codecov/codecov-action@v1
40 |       with:
41 |         token: ${{ secrets.CODECOV_TOKEN }}
42 |         fail_ci_if_error: true
43 |         files: ./coverage.xml
44 |         path_to_write_report: ./coverage/codecov_report.txt
45 |         verbose: true
46 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Upload Python Package
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [created,edited]
 9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.7'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
26 |     - name: Build and publish
27 |       env:
28 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
29 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
30 |       run: |
31 |         python setup.py sdist bdist_wheel
32 |         twine upload dist/*
33 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # OS generated files
  2 | .DS_Store
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | pip-wheel-metadata/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | deploy.md
 33 | notebooks/
 34 | 
 35 | # PyInstaller
 36 | #  Usually these files are written by a python script from a template
 37 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 38 | *.manifest
 39 | *.spec
 40 | 
 41 | # Installer logs
 42 | pip-log.txt
 43 | pip-delete-this-directory.txt
 44 | 
 45 | # Unit test / coverage reports
 46 | htmlcov/
 47 | .tox/
 48 | .nox/
 49 | .coverage
 50 | .coverage.*
 51 | .cache
 52 | nosetests.xml
 53 | coverage.xml
 54 | *.cover
 55 | *.py,cover
 56 | .hypothesis/
 57 | .pytest_cache/
 58 | 
 59 | # Translations
 60 | *.mo
 61 | *.pot
 62 | 
 63 | # Django stuff:
 64 | *.log
 65 | local_settings.py
 66 | db.sqlite3
 67 | db.sqlite3-journal
 68 | 
 69 | # Flask stuff:
 70 | instance/
 71 | .webassets-cache
 72 | 
 73 | # Scrapy stuff:
 74 | .scrapy
 75 | 
 76 | # Sphinx documentation
 77 | docs/_build/
 78 | docs/source/_build/html
 79 | 
 80 | # PyBuilder
 81 | target/
 82 | 
 83 | # Jupyter Notebook
 84 | .ipynb_checkpoints
 85 | 
 86 | # IPython
 87 | profile_default/
 88 | ipython_config.py
 89 | 
 90 | # pyenv
 91 | .python-version
 92 | .vscode/
 93 | logs/
 94 | 
 95 | # pipenv
 96 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 97 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 98 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 99 | #   install all needed dependencies.
100 | #Pipfile.lock
101 | 
102 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
103 | __pypackages__/
104 | 
105 | # Celery stuff
106 | celerybeat-schedule
107 | celerybeat.pid
108 | 
109 | # SageMath parsed files
110 | *.sage.py
111 | 
112 | # Environments
113 | .env
114 | .venv
115 | env/
116 | venv/
117 | ENV/
118 | env.bak/
119 | venv.bak/
120 | 
121 | # Spyder project settings
122 | .spyderproject
123 | .spyproject
124 | 
125 | # Rope project settings
126 | .ropeproject
127 | 
128 | # mkdocs documentation
129 | /site
130 | 
131 | # mypy
132 | .mypy_cache/
133 | .dmypy.json
134 | dmypy.json
135 | 
136 | # Pyre type checker
137 | .pyre/
138 | 
139 | #model checkpoint
140 | *.pt
141 | 
142 | #personal test script
143 | multigpus_embeddings.py
144 | multigpus_training.py
145 | multigpus_accuracy.py
146 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | default_language_version:
 2 |   python: python3.7
 3 | 
 4 | repos:
 5 |   - repo: https://github.com/ambv/black
 6 |     rev: 20.8b1
 7 |     hooks:
 8 |       - id: black
 9 |         args:
10 |           - --line-length=88
11 | 
12 |   - repo: https://github.com/pre-commit/pre-commit-hooks
13 |     rev: v3.4.0
14 |     hooks:
15 |       - id: debug-statements
16 |       - id: requirements-txt-fixer
17 |       - id: check-ast # Simply check whether the files parse as valid python
18 |       - id: check-case-conflict # Check for files that would conflict in case-insensitive filesystems
19 |       - id: check-builtin-literals # Require literal syntax when initializing empty or zero Python builtin types
20 |       - id: check-docstring-first # Check a common error of defining a docstring after code
21 |       - id: check-merge-conflict # Check for files that contain merge conflict strings
22 |       - id: check-yaml # Check yaml files
23 |       - id: end-of-file-fixer # Ensure that a file is either empty, or ends with one newline
24 |       - id: mixed-line-ending # Replace or checks mixed line ending
25 |       - id: trailing-whitespace # This hook trims trailing whitespace
26 | 
27 |   - repo: https://github.com/pre-commit/mirrors-mypy
28 |     rev: v0.800
29 |     hooks:
30 |       - id: mypy
31 |         args:
32 |           - --no-strict-optional
33 |           - --ignore-missing-imports
34 | 
35 |   - repo: https://gitlab.com/pycqa/flake8
36 |     rev: 3.8.4
37 |     hooks:
38 |       - id: flake8
39 |         args:
40 |           - --max-line-length=88
41 |           - --max-cognitive-complexity=15
42 |           - --ignore=E203,E266,E501,W503
43 |         additional_dependencies:
44 |           - pep8-naming
45 |           - flake8-builtins
46 |           - flake8-comprehensions
47 |           - flake8-bugbear
48 |           - flake8-pytest-style
49 |           - flake8-cognitive-complexity
50 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |    configuration: docs/source/conf.py
11 | 
12 | # Optionally build your docs in additional formats such as PDF
13 | formats:
14 |    - pdf
15 | 
16 | # Optionally set the version of Python and requirements required to build your docs
17 | python:
18 |    version: 3.7
19 | 
20 | conda:
21 |   environment: docs/environment_docs.yaml
22 | 


--------------------------------------------------------------------------------
/.source/_static/bio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/.source/_static/bio.png


--------------------------------------------------------------------------------
/.source/_static/deepchain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/.source/_static/deepchain.png


--------------------------------------------------------------------------------
/.source/_static/protein.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/.source/_static/protein.png


--------------------------------------------------------------------------------
/.source/_static/score_mutation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/.source/_static/score_mutation.png


--------------------------------------------------------------------------------
/.source/_static/sequence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/.source/_static/sequence.png


--------------------------------------------------------------------------------
/.source/_static/transformers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/.source/_static/transformers.png


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Change log
  2 | 
  3 | # [0.1.3] - 2021-07-09
  4 | 
  5 | Fixed:
  6 |  - Fix filtering of logits which impacts loglikelihood computation
  7 |  - Fix fasta file reading in compute_loglikelihood
  8 | 
  9 | Features:
 10 |   - Add `normalize` mode in compute_loglikelihood.
 11 | 
 12 | 
 13 | # [0.1.3] - 2021-07-01
 14 | 
 15 | Features:
 16 |  - Add msa-transformers for methods:
 17 |     - compute_logits
 18 |     - compute_embeddings
 19 |     - compute_probabilities
 20 |     - compute_accuracy
 21 | 
 22 | Fixed:
 23 |  - Remove torch DataParallel wrapper.
 24 | 
 25 | # [0.1.0] - 2021-07-01
 26 | 
 27 | Features:
 28 |  - Add ray worker for multi-gpus inference
 29 | 
 30 | Removed:
 31 |  - Remove torch DataParallel wrapper.
 32 | 
 33 | # [0.0.10] - 2021-06-14
 34 | Note on the release
 35 | 
 36 | Features:
 37 |  - Add BIO_LOG_LEVEL environment variable to control logging messages (logger)
 38 |  - Check that every unique amino acid in sequences is in tokens_list (compute_probabilities)
 39 | 
 40 | Fixed:
 41 |  - Add shuffling in batch_sampler (lightning_utils)
 42 |  - Fix tokens argument for dataloader (lightning_utils)
 43 |  - Fix rtd CI to separate docs and package environments.
 44 | 
 45 | Changed:
 46 |  - Modified the signature of some functions to improve clarity (transformers_wrappers)
 47 |  - Update `train_masked` method to `finetune` (transformers_wrappers)
 48 |  - `compute_embeddings` with option `full` returns a list of embeddings, no matter the size (transformers_wrappers)
 49 | 
 50 | Removed:
 51 |  - Remove the tokens_list argument when not necessary and make its usage clearer (transformers_wrappers)
 52 |  - Remove functions (transformers_wrappers):
 53 |     - _filter_and_pool_embeddings
 54 |     - _split_logits
 55 |     -  _slabels_remaping
 56 |     - _filter_logits
 57 |     -  _filter_loglikelihood
 58 |     - _compute_accuracy
 59 |     - _compute_calibration
 60 | 
 61 | 
 62 | # [0.0.9] - 2021-06-04
 63 | 
 64 | Fixed:
 65 |  - Batch_sampler issue
 66 | 
 67 | # [0.0.8] - 2021-06-03
 68 | Note on the release
 69 | 
 70 | Features:
 71 |  - Merge ESM/protbert for finetuning model with pytorch-lightning
 72 |  - Possibility to restore a training session.
 73 | 
 74 | Fixed:
 75 |  - Fix conflicts when saving model with DDP
 76 |  - Fix loading checkpoint created by pytorch-lightning
 77 | 
 78 | 
 79 | # [0.0.7] - 2021-05-12
 80 | Note on the release
 81 | 
 82 | Features:
 83 |  - Add fasta files support for each compute function.
 84 |  - Add train_masked function to finetune model on custom dataset. (Only ESM for the moment, protbert is coming.)
 85 | 
 86 | Docs:
 87 |  - Update documentation to add tutorial on training.
 88 | 
 89 | Changed:
 90 |  - GPU is used by default if found, even if not specified.
 91 | 
 92 | # [0.0.6] - 2021-05-24
 93 | Note on the release
 94 | 
 95 | Fixed:
 96 |  - Update torch dependencies to be less restrictive. Create conflict with other packages.
 97 | 
 98 | # [0.0.5] - 2021-05-12
 99 | 
100 | Note on the release
101 | 
102 | Added
103 |  - added multi-gpu support for inference
104 |  - added function to finetune a model on a specific dataset on multi-gpu
105 | 
106 | Changed
107 | 
108 | Fixed
109 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing
  2 | 
  3 | In order to contribute to this repository you will need developer access to this repo. To know more about the project go to the [README](README.md) first.
  4 | 
  5 | 
  6 | ## Install Dev environment
  7 | 
  8 | From the root of this repo, run
  9 | `conda env create -f environment_dev.yaml`
 10 | 
 11 | ## Pre-commit hooks
 12 | 
 13 | Pre-commit hooks have been configured for this project using the [pre-commit](https://pre-commit.com/) library:
 14 | 
 15 | - [black](https://github.com/psf/black) python formatter
 16 | - [flake8](https://flake8.pycqa.org/en/latest/) python linter
 17 | - [isort](https://pypi.org/project/isort/) sorts imports
 18 | 
 19 | To get them going on your side, make sure to have python installed, and run the following
 20 | commands from the root directory of this repository:
 21 | 
 22 | ```bash
 23 | pip install pre-commit
 24 | pre-commit install
 25 | pre-commit run --all-files
 26 | ```
 27 | 
 28 | # Git conventions
 29 | 
 30 | - This section relies on the [Commit Message Guidelines](https://github.com/angular/angular/blob/master/CONTRIBUTING.md#commit)
 31 | - It provides conventions to write commits messages based on the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/)
 32 | 
 33 | - It aims to :
 34 |     - Get a well-structured and easily understandable git history
 35 |     - Generate changelogs easily for each release since we can use scripts that parse the commit messages
 36 | 
 37 | 
 38 | The commit messages must have the following structure :
 39 | 
 40 | ```
 41 | <type>(<scope>): <subject>
 42 | <BLANK LINE>
 43 | <body>
 44 | <BLANK LINE>
 45 | <footer>
 46 | ```
 47 | 
 48 | - `<type>` section :
 49 |     - It is mandatory
 50 |     - It must be one of the following :
 51 |         - build: Changes to our deployment configuration (e.g. docker, requirements)
 52 |         - ci : Changes to our CI configuration files and scripts
 53 |         - chore: Changes not linked to CI / build or the code (e.g. add issue templates)
 54 |         - docs : Documentation changes
 55 |         - feat : A new feature
 56 |         - fix : A bug fix
 57 |         - perf : A code change that improves performance
 58 |         - revert: Linked to a revert of a commit
 59 |         - refactor : A code change that neither fixes a bug nor adds a feature
 60 | 
 61 |         - style : Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc)
 62 |         - test : Adding missing tests or correcting existing tests
 63 | 
 64 | - `<scope>` section:
 65 |     - optional
 66 |     - describes module affected by changes
 67 | 
 68 | - `<subject>` section :
 69 |     - It is mandatory
 70 |     - It contains a succinct description of the change
 71 |     - Few recommendations about the subject :
 72 |         - use the imperative, present tense: "change" not "changed" nor "changes"
 73 |         - don't capitalize the first letter
 74 |         - no dot (.) at the end
 75 | 
 76 | - `<body>` section :
 77 |     - It is optional
 78 |     - It is an extension of the <subject> section used to add a longer description about the changes if relevant
 79 | 
 80 | ## Coding conventions
 81 | 
 82 | Please respect the following conventions to contribute to the code:
 83 | 
 84 | - Use hard wrap at 88
 85 | - Respect black, isort and flake8 conventions
 86 | - Class names use CamelCase (example: MyClass)
 87 | - Functions and variables are in lower case with _ as separator (example: my_function, my_var)
 88 | - Names are explicit: avoid mathematical notations, functions' names start with a verb
 89 | - Use python typing library: each class and method should be typed (both for inputs and outputs)
 90 | - Create custom types if needed
 91 | - All classes and functions should have a docstring
 92 | - Avoid repeating arguments and returns in docstring (should be explicit with the types) except when it is truly necessary
 93 | - A function (or a class) does not take more than 5 arguments, if you need more create a data class
 94 | - Avoid dictionaries to pass arguments when possible and prefer dataclasses instead
 95 | - Repeat inputs names when calling a function: ex: compute_custom(arg1=arg1, arg2=my_arg2)
 96 | - Use list comprehension when it is possible
 97 | - Use f strings to add variables in strings: ex: print(f'my var value is {my_var}')
 98 | - Use pathlib to handle paths
 99 | - Prefer shutil to os to manage files/ folders creations and deletions
100 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Base image: NVIDIA CUDA 11.1 runtime on Ubuntu 18.04.
FROM nvidia/cuda:11.1-runtime-ubuntu18.04

# CONDA_DIR is the Miniconda install prefix, CONDA_PYTHON_VERSION selects the
# "Miniconda3" installer below, and PYTHONDONTWRITEBYTECODE stops Python from
# writing .pyc files.
ENV CONDA_DIR=/opt/conda
ENV CONDA_PYTHON_VERSION=3
ENV PYTHONDONTWRITEBYTECODE=true

# System packages (compilers, MPI, git/wget, X libraries); the apt caches are
# removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    libgomp1 liblapack3 openmpi-bin openmpi-common jq git wget gcc libmpich-dev unzip bzip2 build-essential ca-certificates uuid-runtime libxrender1 libxext6 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Install Miniconda: download the latest installer, register conda on PATH for
# login shells via /etc/profile.d, run the installer in batch mode into
# $CONDA_DIR, then delete the installer and temporary files.
RUN wget --quiet https://repo.continuum.io/miniconda/Miniconda$CONDA_PYTHON_VERSION-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
    echo 'export PATH=$CONDA_DIR/bin:$PATH' > /etc/profile.d/conda.sh && \
    /bin/bash /tmp/miniconda.sh -b -p $CONDA_DIR && \
    rm -rf /tmp/* && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Make conda available to every subsequent instruction and to the container.
ENV PATH=$CONDA_DIR/bin:$PATH

WORKDIR /app

# NOTE(review): no branch/tag/commit is pinned, so the image contains whatever
# the default branch holds at build time.
RUN git clone https://github.com/DeepChainBio/bio-transformers

WORKDIR /app/bio-transformers

# Create the conda environment described by the cloned repository.
RUN conda env create -f environment_dev.yaml

# Switch RUN to bash so that `source` works in the instructions below.
SHELL ["/bin/bash", "-c"]

# Install the CUDA 11.1 builds of torch/torchvision (plus torchaudio 0.8.0)
# into the dev environment from the PyTorch wheel index.
RUN source activate bio-transformers-dev && pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
# Activate the environment from root's .bashrc.
RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> /root/.bashrc && \
    echo "source activate bio-transformers-dev" >> /root/.bashrc
# Editable install of the cloned package using the environment's own pip.
RUN ${CONDA_DIR}/envs/bio-transformers-dev/bin/pip install -e .

WORKDIR /app
37 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | 


--------------------------------------------------------------------------------
/biotransformers/__init__.py:
--------------------------------------------------------------------------------
1 | from biotransformers.bio_transformers import BioTransformers  # noqa
2 | from biotransformers.utils.logger import logger  # noqa
3 | 
4 | from .version import VERSION  # noqa
5 | 
# Package-level logger created through the project's `logger` helper.
log = logger("biotransformers")

# Expose the package version under the conventional dunder name.
__version__ = VERSION
9 | 


--------------------------------------------------------------------------------
/biotransformers/bio_transformers.py:
--------------------------------------------------------------------------------
 1 | """Main module to build either ESM or protbert model"""
 2 | 
 3 | from biotransformers.utils.constant import BACKEND_LIST, MAPPING_PROTBERT
 4 | from biotransformers.utils.deprecated import deprecated_alias
 5 | from biotransformers.utils.utils import format_backend
 6 | from biotransformers.wrappers.esm_wrappers import ESMWrapper
 7 | from biotransformers.wrappers.rostlab_wrapper import RostlabWrapper
 8 | from biotransformers.wrappers.transformers_wrappers import TransformersWrapper
 9 | 
10 | 
class BioTransformers(TransformersWrapper):
    """
    Entry point selecting an ESM or ProtBert backend.

    Calling ``BioTransformers(...)`` does not return a ``BioTransformers``
    instance: ``__new__`` builds and returns a ``TransformersWrapper``
    configured with the language model class matching the requested backend.
    """

    def __init__(
        self,
        backend: str = "esm1_t6_43M_UR50S",
        num_gpus: int = 0,
    ):
        """Document the constructor signature; the real work is done in ``__new__``.

        Because ``__new__`` returns an object that is not an instance of this
        class, Python does not call this method.

        To restrict which GPUs can be used, e.g. to expose only GPUs 0 and 3:
        os.environ["CUDA_VISIBLE_DEVICES"]="0,3" or export CUDA_VISIBLE_DEVICES="0,3"

        Args:
            backend (str, optional): name of the backend displayed with `list_backend()` . Defaults to "esm1_t6_43M_UR50S".
            num_gpus (int, optional): number of gpu to use. Defaults to 0.
        """
        pass

    @deprecated_alias(device="num_gpus")
    @deprecated_alias(multi_gpu="num_gpus")
    def __new__(
        cls,
        backend: str = "esm1_t6_43M_UR50S",
        num_gpus: int = 0,
    ):
        """Validate the arguments and build the matching ``TransformersWrapper``.

        The deprecated keyword names ``device`` and ``multi_gpu`` are routed to
        ``num_gpus`` by the decorators.

        Raises:
            AssertionError: if ``backend`` is not in ``BACKEND_LIST``.
            TypeError: if ``num_gpus`` is not exactly of type ``int``.
        """
        if backend not in BACKEND_LIST:
            # Explicit raise instead of `assert`: the check must survive
            # `python -O`. AssertionError is kept so existing callers that
            # catch it keep working.
            format_list = "\n".join(format_backend(BACKEND_LIST))
            raise AssertionError(f"Choose backend in \n\n{format_list}")
        # Exact type check on purpose: `isinstance` would also accept bool.
        if type(num_gpus) is not int:
            raise TypeError(f"num_gpus should be of type int, not {type(num_gpus)}.")

        if "esm" in backend:
            # ESM backends use the backend name directly as the model directory.
            return TransformersWrapper(
                model_dir=backend, language_model_cls=ESMWrapper, num_gpus=num_gpus
            )

        # Remaining backends are ProtBert short names mapped to their model id.
        return TransformersWrapper(
            model_dir=MAPPING_PROTBERT[backend],
            language_model_cls=RostlabWrapper,
            num_gpus=num_gpus,
        )

    @staticmethod
    def list_backend() -> None:
        """Print every backend name accepted by the constructor."""
        print(
            "Use backend in this list :\n\n",
            "\n".join(format_backend(BACKEND_LIST)),
            sep="",
        )
66 | 


--------------------------------------------------------------------------------
/biotransformers/lightning_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/biotransformers/lightning_utils/__init__.py


--------------------------------------------------------------------------------
/biotransformers/lightning_utils/models.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, List, Tuple
  2 | 
  3 | import pytorch_lightning as pl
  4 | import torch
  5 | import torchmetrics
  6 | from torch.nn import functional as F  # noqa: N812 pylint: disable=wrong-import-order
  7 | 
  8 | from .optimizer import lr_update
  9 | 
 10 | 
 11 | class LightningModule(pl.LightningModule):
 12 |     """Create lightning model to use ddp"""
 13 | 
 14 |     def __init__(
 15 |         self,
 16 |         model,
 17 |         alphabet,
 18 |         lr: float,
 19 |         warmup_end_lr: float,
 20 |         warmup_updates: int = 10,
 21 |         warmup_init_lr: float = 1e-7,
 22 |     ):
 23 |         super().__init__()
 24 |         self.model = model
 25 |         self.alphabet = alphabet
 26 |         self.lr = lr
 27 |         self.automatic_optimization = True
 28 |         self.warmup_updates = warmup_updates
 29 |         self.warmup_init_lr = min(warmup_init_lr, lr)
 30 |         self.lr_step = (warmup_end_lr - self.warmup_init_lr) / warmup_updates
 31 |         self.decay_factor = warmup_end_lr * warmup_updates ** 0.5
 32 |         self.train_acc = torchmetrics.Accuracy()
 33 |         self.val_acc = torchmetrics.Accuracy()
 34 | 
 35 |     def forward(self, x):
 36 |         return self.model(x)["logits"]
 37 | 
 38 |     def configure_optimizers(self) -> Tuple[List[torch.optim.Optimizer], List[Dict]]:
 39 |         """Configure the optimizer and learning rate scheduler.
 40 | 
 41 |         Returns:
 42 |             - list of optimizers.
 43 |             - list of lr schedulers.
 44 |         """
 45 |         optimizer = torch.optim.Adam(self.model.parameters(), self.lr)
 46 | 
 47 |         lr_scheduler = {
 48 |             "scheduler": torch.optim.lr_scheduler.LambdaLR(
 49 |                 optimizer,
 50 |                 lr_lambda=lambda x: lr_update(
 51 |                     num_updates=x,
 52 |                     warmup_updates=self.warmup_updates,
 53 |                     warmup_init_lr=self.warmup_init_lr,
 54 |                     lr_step=self.lr_step,
 55 |                     decay_factor=self.decay_factor,
 56 |                 ),
 57 |             ),
 58 |             "name": "learning_rate",
 59 |             "interval": "step",
 60 |             "frequency": 1,
 61 |         }
 62 | 
 63 |         return [optimizer], [lr_scheduler]
 64 | 
 65 |     def cross_entropy_loss(self, logits, targets):
 66 |         return F.cross_entropy(
 67 |             logits.reshape(-1, logits.size(-1)),
 68 |             targets.reshape(-1),
 69 |             reduction="sum",
 70 |             ignore_index=self.alphabet.padding_idx,
 71 |         )
 72 | 
 73 |     def training_step(self, train_batch, batch_idx):
 74 |         tokens, target = train_batch
 75 |         logits = self.forward(tokens)
 76 |         loss = self.cross_entropy_loss(logits, target)
 77 | 
 78 |         masked_preds, masked_targets = self.get_tensor_accuracy(logits, target)
 79 |         self.train_acc(masked_preds, masked_targets)
 80 | 
 81 |         masked_tokens = target.ne(self.alphabet.padding_idx)
 82 |         sample_size = masked_tokens.int().sum()
 83 |         loss = loss / sample_size
 84 | 
 85 |         self.log_dict(
 86 |             {"train_loss": loss, "train_acc": self.train_acc},
 87 |             on_step=False,
 88 |             on_epoch=True,
 89 |             prog_bar=True,
 90 |             logger=True,
 91 |         )
 92 | 
 93 |         return loss
 94 | 
 95 |     def validation_step(self, val_batch, batch_idx):
 96 |         """Log the loss and metrics for a batch.
 97 | 
 98 |         Args:
 99 |             batch: batch input.
100 |             batch_idx: index of the batch.
101 |         """
102 |         tokens, target = val_batch
103 |         logits = self.forward(tokens)
104 |         loss = self.cross_entropy_loss(logits, target)
105 | 
106 |         masked_preds, masked_targets = self.get_tensor_accuracy(logits, target)
107 |         self.val_acc(masked_preds, masked_targets)
108 | 
109 |         masked_tokens = target.ne(self.alphabet.padding_idx)
110 |         sample_size = masked_tokens.int().sum()
111 |         loss = loss / sample_size
112 | 
113 |         self.log_dict(
114 |             {"val_loss": loss, "val_acc": self.val_acc},
115 |             on_step=False,
116 |             on_epoch=True,
117 |             prog_bar=True,
118 |             logger=True,
119 |         )
120 | 
121 |         return loss
122 | 
123 |     def get_tensor_accuracy(
124 |         self, logits: torch.Tensor, targets: torch.Tensor
125 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
126 |         """Calculate accuracy for multi-masking, summed over batch.
127 | 
128 |         Args:
129 |             logits: prediction from the model, shape = (batch, len_tokens, len_vocab)
130 |             targets: ground truth, shape = (batch, len_tokens)
131 | 
132 |         Returns:
133 |             accuracy value.
134 |         """
135 |         preds = torch.argmax(logits, dim=-1)  # (batch, len_tokens)
136 |         masked_tokens = targets.ne(self.alphabet.padding_idx)
137 | 
138 |         masked_preds = torch.masked_select(preds, masked_tokens)
139 |         masked_targets = torch.masked_select(targets, masked_tokens)
140 | 
141 |         return masked_preds.detach().cpu(), masked_targets.detach().cpu()
142 | 


--------------------------------------------------------------------------------
/biotransformers/lightning_utils/optimizer.py:
--------------------------------------------------------------------------------
 1 | def lr_update(
 2 |     num_updates: int,
 3 |     warmup_updates: int,
 4 |     warmup_init_lr: float,
 5 |     lr_step: float,
 6 |     decay_factor: float,
 7 | ) -> float:
 8 |     """InverseSquareRootSchedule.
 9 | 
10 |     https://github.com/pytorch/fairseq/blob/master/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py#L32
11 | 
12 |     Args:
13 |         num_updates: number of batches already used.
14 |         warmup_updates: number of batch steps for warm up.
15 |         warmup_init_lr: initial learning rate.
16 |         lr_step: step for increasing learning rate during warm up.
17 |         decay_factor: factor for decreasing learning rate after warm up.
18 | 
19 |     Returns:
20 |         learning rate multiplicate factor
21 |     """
22 |     if num_updates < warmup_updates:
23 |         lr = warmup_init_lr + num_updates * lr_step
24 |     else:
25 |         lr = decay_factor * num_updates ** -0.5
26 |     if warmup_init_lr > 0:
27 |         return lr / warmup_init_lr
28 | 
29 |     return 0
30 | 


--------------------------------------------------------------------------------
/biotransformers/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/biotransformers/tests/__init__.py


--------------------------------------------------------------------------------
/biotransformers/tests/conftest.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from biotransformers import BioTransformers
  3 | 
# Backends the `init_model` fixture is parametrized over; the commented-out
# entries are not exercised.
test_models = [
    # "esm1_t34_670M_UR100",
    "esm1_t6_43M_UR50S",
    # "esm1b_t33_650M_UR50S",
    "protbert",
    # "protbert_bfd",
]
 11 | 
 12 | 
 13 | @pytest.fixture(scope="session", params=test_models)
 14 | def init_model(request):
 15 |     print(request.param)
 16 |     # thanks to fixture class instance (which takes time) will be reused over tests.
 17 |     return BioTransformers(request.param)
 18 | 
 19 | 
@pytest.fixture(scope="session")
def fasta_path():
    """Path of the example FASTA file (relative, so it is resolved against the
    current working directory)."""
    return "data/fasta/example_fasta.fasta"
 24 | 
 25 | 
@pytest.fixture(scope="session")
def sequences():
    """Four short in-memory sequences shared by the tests."""
    return ["AAAA", "AKKF", "AHHFK", "KKKKKKKLLL"]
 30 | 
 31 | 
 32 | @pytest.fixture(scope="session")
 33 | def lengths_sequence_fasta():
 34 |     # thanks to fixture class instance (which takes time) will be reused over tests.
 35 |     lengths = [476, 201, 60, 35, 284]
 36 |     return lengths
 37 | 
 38 | 
@pytest.fixture(scope="session")
def loglikelihoods_results():
    """Reference log-likelihoods for the in-memory sequences.

    Keyed first by the model's ``_model_dir`` and then by the parameter-set
    name used in ``test_loglikelihoods.py``; each list holds one value per
    sequence.
    """
    results = {
        "esm1_t6_43M_UR50S": {
            "params1": [
                -1.0990107895886871,
                -2.622793987825229,
                -2.9771586269887687,
                -0.9673027336521154,
            ],
            "params2": [
                -1.025939130590268,
                -3.251339913625416,
                -3.471589807380847,
                -0.7787956539618508,
            ],
            "params3": [
                -1.099011026298114,
                -2.6227955882668903,
                -2.9771586069197626,
                -0.9673018411528049,
            ],
        },
        "Rostlab/prot_bert": {
            "params1": [
                -1.2851557324014358,
                -1.6284870947647039,
                -0.3962127573832269,
                -5.623104014419566,
            ],
            "params2": [
                -3.314856902784298,
                -3.7017992105942645,
                -3.5632398628301134,
                -4.201843982611495,
            ],
            "params3": [
                -1.2851545893520187,
                -1.6284828574671406,
                -0.3962134847024349,
                -5.623108921843686,
            ],
        },
    }
    return results
 84 | 
 85 | 
@pytest.fixture(scope="session")
def loglikelihoods_fasta_results():
    """Reference log-likelihoods for the FASTA input, keyed by the model's
    ``_model_dir``; each list holds five values."""
    results = {
        "esm1_t6_43M_UR50S": [
            -3.088591489364207,
            -2.258382942624286,
            -1.8204519701969795,
            -1.712802577224046,
            -1.0209009845359696,
        ],
        "Rostlab/prot_bert": [
            -0.13746537830140257,
            -0.2248445733096602,
            -0.3628546008213477,
            -0.2788796518420453,
            -0.32384316791422224,
        ],
    }
    return results
105 | 
106 | 
@pytest.fixture(scope="session")
def mutations_score_results():
    """Reference mutation scores, keyed by the model's ``_model_dir`` and then
    by the parameter-set name used in ``test_mutation_score.py``."""
    results = {
        "esm1_t6_43M_UR50S": {
            "params1": [
                -2.522218942642212,
                0.9405336380004883,
                0.2436962127685547,
                -17.050978302955627,
            ],
        },
        "Rostlab/prot_bert": {
            "params1": [
                -0.496506929397583,
                1.3073345646262169,
                0.24045005440711975,
                -2.4403315782547,
            ]
        },
    }
    return results
128 | 


--------------------------------------------------------------------------------
/biotransformers/tests/test_accuracy.py:
--------------------------------------------------------------------------------
 1 | """Test module for testing accuracy function"""
 2 | import pytest
 3 | 
# (batch_size, pass_mode) combinations for the in-memory sequences.
test_params = [
    (1, "forward"),
    (2, "masked"),
    (10, "forward"),
]

# (batch_size, pass_mode) combinations for the FASTA input.
test_params_fasta = [(2, "forward")]
11 | 
12 | 
@pytest.mark.parametrize("batch_size, pass_mode", test_params)
def test_accuracy_type_and_range(init_model, sequences, batch_size, pass_mode):
    """Accuracy computed on in-memory sequences is a float within [0, 1]."""
    accuracy = init_model.compute_accuracy(
        sequences, batch_size=batch_size, pass_mode=pass_mode
    )
    assert isinstance(accuracy, float)
    assert 0.0 <= accuracy <= 1.0
23 | 
24 | 
@pytest.mark.parametrize("batch_size, pass_mode", test_params_fasta)
def test_accuracy_type_and_range_fasta(init_model, fasta_path, batch_size, pass_mode):
    """Accuracy computed from a FASTA file is a float within [0, 1]."""
    accuracy_fasta = init_model.compute_accuracy(
        fasta_path, batch_size=batch_size, pass_mode=pass_mode
    )
    assert isinstance(accuracy_fasta, float)
    assert 0.0 <= accuracy_fasta <= 1.0
36 | 


--------------------------------------------------------------------------------
/biotransformers/tests/test_embeddings.py:
--------------------------------------------------------------------------------
 1 | """Test module for testing embeddings function"""
 2 | import numpy as np
 3 | import pytest
 4 | 
# (batch_size, pool_mode) combinations; pool_mode lists the requested poolings.
test_params = [
    (1, ["cls", "mean"]),
    (2, ["full", "mean", "cls"]),
    (10, ["cls", "full"]),
]
10 | 
11 | 
@pytest.mark.parametrize("batch_size, pool_mode", test_params)
def test_embeddings_type_and_shape(init_model, sequences, batch_size, pool_mode):
    """Embeddings come back as a dict with the expected types and lengths."""
    embeddings = init_model.compute_embeddings(
        sequences, batch_size=batch_size, pool_mode=pool_mode
    )

    assert isinstance(embeddings, dict)
    if "full" in pool_mode:
        # One row per residue for every sequence.
        for full_embedding, sequence in zip(embeddings["full"], sequences):
            assert full_embedding.shape[0] == len(sequence)
    for pooled in ("cls", "mean"):
        if pooled in pool_mode:
            assert isinstance(embeddings[pooled], np.ndarray)
30 | 
31 | 
@pytest.mark.parametrize("batch_size, pool_mode", test_params)
def test_embeddings_type_and_shape_fasta(
    init_model, fasta_path, lengths_sequence_fasta, batch_size, pool_mode
):
    """Embeddings computed from a FASTA file have the expected types and lengths."""
    embeddings = init_model.compute_embeddings(
        fasta_path, batch_size=batch_size, pool_mode=pool_mode
    )
    if "full" in pool_mode:
        # One row per residue for every FASTA record.
        for full_embedding, expected_length in zip(
            embeddings["full"], lengths_sequence_fasta
        ):
            assert full_embedding.shape[0] == expected_length

    assert isinstance(embeddings, dict)
    for pooled in ("cls", "mean"):
        if pooled in pool_mode:
            assert isinstance(embeddings[pooled], np.ndarray)
52 | 


--------------------------------------------------------------------------------
/biotransformers/tests/test_logits.py:
--------------------------------------------------------------------------------
 1 | """Test module for testing logits function"""
 2 | import pytest
 3 | 
# (batch_size, pass_mode) combinations for the in-memory sequences.
test_params = [
    (1, "forward"),
    (2, "masked"),
    (10, "forward"),
]

# (batch_size, pass_mode) combinations for the FASTA input.
test_params_fasta = [(2, "forward")]
11 | 
12 | 
@pytest.mark.parametrize("batch_size, pass_mode", test_params)
def test_logits_type(init_model, batch_size, sequences, pass_mode):
    """One logits array per sequence, with one row per residue."""
    logits = init_model.compute_logits(
        sequences, batch_size=batch_size, pass_mode=pass_mode
    )
    assert len(logits) == len(sequences)
    for sequence_logits, sequence in zip(logits, sequences):
        assert sequence_logits.shape[0] == len(sequence)
24 | 
25 | 
@pytest.mark.parametrize("batch_size, pass_mode", test_params_fasta)
def test_logits_type_fasta(
    init_model, batch_size, fasta_path, lengths_sequence_fasta, pass_mode
):
    """Logits computed from a FASTA file have one row per residue."""
    logits = init_model.compute_logits(
        fasta_path, batch_size=batch_size, pass_mode=pass_mode
    )
    for sequence_logits, expected_length in zip(logits, lengths_sequence_fasta):
        assert sequence_logits.shape[0] == expected_length
38 | 


--------------------------------------------------------------------------------
/biotransformers/tests/test_loglikelihoods.py:
--------------------------------------------------------------------------------
 1 | """Test module for testing loglikelihoods function"""
 2 | import pytest
 3 | from numpy.testing import assert_allclose
 4 | 
# (batch_size, tokens_list, pass_mode, params); `params` is the key of the
# matching expected values in the `loglikelihoods_results` fixture.
test_params = [
    (1, list("ACDEFGHIKLMNPQRSTVWY"), "forward", "params1"),
    (2, list("ACDEFGHIKLMNPQRSTVWY") + ["MASK"], "masked", "params2"),
    (10, list("ACDEFGHIKLMNPQRSTVWY") + ["MASK"], "forward", "params3"),
]

# (batch_size, tokens_list, pass_mode) for the FASTA input; this token list
# also contains "X".
test_params_fasta = [(1, list("KFQRVACEXWIHYPNGSMTDL"), "forward")]
12 | 
13 | 
@pytest.mark.parametrize("batch_size, tokens_list, pass_mode, params", test_params)
def test_loglikelihoods_type_shape_and_range(
    init_model,
    sequences,
    loglikelihoods_results,
    batch_size,
    tokens_list,
    pass_mode,
    params,
):
    """Normalized log-likelihoods match the stored references within 1 %."""
    model = init_model
    loglikelihoods = model.compute_loglikelihood(
        sequences,
        batch_size=batch_size,
        tokens_list=tokens_list,
        pass_mode=pass_mode,
        normalize=True,
    )
    assert len(loglikelihoods) == len(sequences)

    # Reference values only exist for some models; others are length-checked only.
    model_dir = model._model_dir
    if model_dir in loglikelihoods_results:
        expected = loglikelihoods_results[model_dir][params]
        assert_allclose(loglikelihoods, expected, rtol=0.01)
36 | 
37 | 
@pytest.mark.parametrize("batch_size, tokens_list, pass_mode", test_params_fasta)
def test_loglikelihoods_type_shape_and_range_fasta(
    init_model,
    fasta_path,
    lengths_sequence_fasta,
    loglikelihoods_fasta_results,
    batch_size,
    tokens_list,
    pass_mode,
):
    """Log-likelihoods from a FASTA file match the stored references within 1 %."""
    model = init_model
    loglikelihoods = model.compute_loglikelihood(
        fasta_path,
        batch_size=batch_size,
        tokens_list=tokens_list,
        pass_mode=pass_mode,
        normalize=True,
    )
    assert len(loglikelihoods) == len(lengths_sequence_fasta)

    # Reference values only exist for some models; others are length-checked only.
    model_dir = model._model_dir
    if model_dir in loglikelihoods_fasta_results:
        expected = loglikelihoods_fasta_results[model_dir]
        assert_allclose(loglikelihoods, expected, rtol=0.01)
60 | 


--------------------------------------------------------------------------------
/biotransformers/tests/test_mutation_score.py:
--------------------------------------------------------------------------------
 1 | """Test module for testing mutation score function"""
 2 | import pytest
 3 | from numpy.testing import assert_allclose
 4 | 
# (mutations, params): four lists of mutation strings (presumably one per entry
# of the `sequences` fixture - confirm against `compute_mutation_score`) and
# the key of the matching expected values in `mutations_score_results`.
test_params = [
    ([["A1Q"], ["A1K", "K2A"], ["A1H"], ["K3W", "K2D", "L9H"]], "params1"),
]
 8 | 
 9 | 
@pytest.mark.parametrize("mutations, params", test_params)
def test_mutation_score_type_shape_and_range(
    init_model, sequences, mutations_score_results, mutations, params
):
    """Mutation scores match the stored references within 1 %."""
    model = init_model
    mutations_scores = model.compute_mutation_score(sequences, mutations)
    assert len(mutations_scores) == len(sequences)

    # Reference values only exist for some models; others are length-checked only.
    model_dir = model._model_dir
    if model_dir in mutations_score_results:
        expected = mutations_score_results[model_dir][params]
        assert_allclose(mutations_scores, expected, rtol=0.01)
20 | 


--------------------------------------------------------------------------------
/biotransformers/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/biotransformers/utils/__init__.py


--------------------------------------------------------------------------------
/biotransformers/utils/compute_utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import Dict, List, Tuple
  3 | 
  4 | import numpy as np
  5 | from biotransformers.utils.constant import NATURAL_AAS_LIST
  6 | 
# Pair of probabilities.
ProbTuple = Tuple[float, float]
# Per-position probabilities: `get_list_probs` indexes it as
# probs[position - 1][amino_acid_letter].
TokenProbsDict = Dict[int, Dict[str, float]]
# One TokenProbsDict per scored item.
SequenceProbsList = List[TokenProbsDict]
 10 | 
 11 | 
 12 | class InvalidPositionStringError(Exception):
 13 |     """Raised when a position string is passed with incorrect format"""
 14 | 
 15 | 
 16 | def validate_position_str(position_str: str):
 17 |     """Checks positions str format"""
 18 |     regex = re.compile(r"[A-Z]{1}[0-9]{1,4}[A-Z]{1}", re.I)
 19 |     if not regex.match(position_str):
 20 |         raise InvalidPositionStringError(
 21 |             f"'{position_str}' is not a valid position string"
 22 |         )
 23 | 
 24 | 
 25 | class Mutation:
 26 |     """register a mutation from a string
 27 | 
 28 |     Args:
 29 |         mutation (str): string mutation format "A8U" -> "NativeIdMutant"
 30 |     """
 31 | 
 32 |     def __init__(self, mutation_str: str) -> None:
 33 |         validate_position_str(mutation_str)
 34 |         self.mutation_str = mutation_str
 35 |         self.mutation = mutation_str[-1]
 36 |         self.native = mutation_str[0]
 37 |         self.position = int(mutation_str[1:-1])
 38 | 
 39 |     def __repr__(self) -> str:
 40 |         return f"Mutation >> Native: {self.native} New: {self.mutation} at position {self.position}"
 41 | 
 42 |     def is_valid_mutation(self, sequence: str):
 43 |         """Check if mutation is valid for the sequence of AA
 44 |         Args:
 45 |             sequence (str): protein sequence string
 46 |         """
 47 |         if len(sequence) < self.position:
 48 |             raise ValueError(
 49 |                 f"Sequence smaller than position {self.position} for mutation {self.mutation_str}"
 50 |             )
 51 |         if self.native != sequence[self.position - 1]:
 52 |             raise InvalidPositionStringError(
 53 |                 f"'{self.native}' is not a valid native position for mutation {self.mutation_str}"
 54 |             )
 55 |         if self.mutation not in NATURAL_AAS_LIST:
 56 |             raise ValueError(
 57 |                 f"New amino acid {self.mutation} is not a valid mutation for {self.mutation_str}"
 58 |             )
 59 |         return True
 60 | 
 61 | def get_list_probs(
 62 |     mutation_list: List[Tuple[Mutation]],
 63 |     mutate_probs: SequenceProbsList,
 64 |     length_mutations: List[int],
 65 | ) -> Tuple[List[List[float]], List[List[float]]]:
 66 |     """This function build a list of mutate and native probabilities to compute
 67 |     the mutate_score. For each position in the mutate list, we catch the native probability
 68 |     and the mutate probability of this position. We do this for each sequence and return two
 69 |     lists : native_probs and mutate probs.
 70 | 
 71 |     Args:
 72 |         mutation_list (List[Mutation]): list with integer which are mutations
 73 |         mutate_probs (List[Dict[Any]]): probabilities for mutate sequence
 74 |         length_mutations (List[int]):  length of indivual mutation for each sequence
 75 |     """
 76 |     flat_mutation = [mut for tup in mutation_list for mut in tup]
 77 |     native_probs_list, mutate_probs_list = [], []
 78 |     for prob, mut in zip(mutate_probs, flat_mutation):
 79 |         native_probs_list.append(prob[mut.position - 1][mut.native])
 80 |         mutate_probs_list.append(prob[mut.position - 1][mut.mutation])
 81 |     return split_list(native_probs_list, length_mutations), split_list(
 82 |         mutate_probs_list, length_mutations
 83 |     )
 84 | 
 85 | 
 86 | def mutation_score(native_probs: List[float], mutate_probs: List[float]) -> float:
 87 |     """
 88 |     Compute mutate score based on Masked marginal probability
 89 |     Sum(log(p(xi=xi_mutate|x-M))-log(p(xi=xi_native|x-M))) over M (M s a mutation set)
 90 | 
 91 |     Args:
 92 |         native_probs (List[ProbTuple]): [description]
 93 |         mutate_probs (List[ProbTuple]): [description]
 94 | 
 95 |     Returns:
 96 |         List[float]: [description]
 97 |     """
 98 |     return np.sum(
 99 |         [np.log(m_p) - np.log(n_p) for m_p, n_p in zip(mutate_probs, native_probs)]
100 |     )
101 | 
102 | 
def split_list(list_to_split: List, lengths_list: List) -> List[List]:  # type: ignore
    """Split a list into consecutive sublists of the given lengths.

    Args:
        list_to_split (List): native list
        lengths_list (List): length of each sublist

    Returns:
        List[List]: sublists, in order, one per entry of ``lengths_list``

    Raises:
        AssertionError: if the lengths do not sum to ``len(list_to_split)``.
    """
    # Explicit raise instead of `assert`: the check must survive `python -O`,
    # otherwise a mismatch would silently yield truncated sublists.
    # AssertionError is kept so existing callers that catch it keep working.
    if len(list_to_split) != sum(lengths_list):
        raise AssertionError("Sum of sublist length is not valid.")

    splitted_list = []
    start = 0
    for length in lengths_list:
        stop = start + length
        splitted_list.append(list_to_split[start:stop])
        start = stop
    return splitted_list
122 | 


--------------------------------------------------------------------------------
/biotransformers/utils/constant.py:
--------------------------------------------------------------------------------
 1 | ESM_LIST = [
 2 |     # "esm1_t34_670M_UR50S",
 3 |     # "esm1_t34_670M_UR50D",
 4 |     "esm1_t34_670M_UR100",
 5 |     "esm1_t12_85M_UR50S",
 6 |     "esm1_t6_43M_UR50S",
 7 |     "esm1b_t33_650M_UR50S",
 8 |     "esm_msa1_t12_100M_UR50S",
 9 |     "esm_msa1b_t12_100M_UR50S",
10 |     "esm1v_t33_650M_UR90S_1",
11 | ]
12 | 
13 | ROSTLAB_LIST = ["Rostlab/prot_bert", "Rostlab/prot_bert_bfd"]
14 | 
15 | MAPPING_PROTBERT = {
16 |     "protbert": "Rostlab/prot_bert",
17 |     "protbert_bfd": "Rostlab/prot_bert_bfd",
18 | }
19 | 
20 | DEFAULT_ESM_MODEL = "esm1_t34_670M_UR100"
21 | DEFAULT_ROSTLAB_MODEL = "Rostlab/prot_bert"
22 | 
23 | BACKEND_LIST = ESM_LIST + list(MAPPING_PROTBERT.keys())
24 | 
25 | NATURAL_AAS_LIST = list("ACDEFGHIKLMNPQRSTVWY")
26 | 


--------------------------------------------------------------------------------
/biotransformers/utils/deprecated.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | import warnings
 3 | 
 4 | 
 5 | def deprecated_alias(**aliases):
 6 |     def deco(f):
 7 |         @functools.wraps(f)
 8 |         def wrapper(*args, **kwargs):
 9 |             rename_kwargs(f.__name__, kwargs, aliases)
10 |             return f(*args, **kwargs)
11 | 
12 |         return wrapper
13 | 
14 |     return deco
15 | 
16 | 
def rename_kwargs(func_name, kwargs, aliases):  # noqa
    """Rename deprecated keyword arguments in place and warn about them.

    Args:
        func_name: name of the called function, used in the error message.
        kwargs: keyword-argument dict of the call; modified in place.
        aliases: mapping from deprecated keyword name to its replacement.

    Raises:
        TypeError: if both a deprecated name and its replacement are present.

    Special cases:
        * ``device``: a value containing ``"cuda"`` becomes ``1``, a value
          containing ``"cpu"`` becomes ``0``, anything else is moved as is.
        * ``multi_gpu``: the argument is dropped without a replacement.
    """
    for old_name, new_name in aliases.items():
        if old_name not in kwargs:
            continue
        if new_name in kwargs:
            raise TypeError(
                "{} received both {} and {}".format(func_name, old_name, new_name)
            )
        warnings.warn(
            "{} is deprecated; use {}".format(old_name, new_name), DeprecationWarning, 3
        )

        if old_name == "multi_gpu":
            kwargs.pop(old_name)
            continue

        # Inspect the value before removing it so a failing membership test
        # leaves kwargs untouched.
        value = kwargs[old_name]
        if old_name == "device":
            if "cuda" in value:
                value = 1
            elif "cpu" in value:
                value = 0
        kwargs.pop(old_name)
        kwargs[new_name] = value
42 | 


--------------------------------------------------------------------------------
/biotransformers/utils/logger.py:
--------------------------------------------------------------------------------
 1 | """This module build a general logger module"""
 2 | import logging
 3 | import os
 4 | 
 5 | 
 6 | def logger(module_name: str) -> logging.Logger:
 7 |     """Configure the logger with formatter and handlers.
 8 | 
 9 |     The log level depends on the environment variable `BIO_LOG_LEVEL`.
10 | 
11 |     - 0: NOTSET, will be set to DEBUG
12 |     - 1: DEBUG
13 |     - 2: INFO (default)
14 |     - 3: WARNING
15 |     - 4: ERROR
16 |     - 5: CRITICAL
17 |     https://docs.python.org/3/library/logging.html#levels
18 | 
19 |     Args:
20 |         module_name (str): module name
21 | 
22 |     Returns:
23 |         [Logger]: instantiate logger object
24 |     """
25 |     if module_name.endswith("py"):
26 |         module_name = os.path.splitext(module_name)[0]
27 | 
28 |     logger_ = logging.getLogger(module_name)
29 |     logger_.propagate = False
30 |     log_level = os.environ.get("BIO_LOG_LEVEL", "2")
31 |     log_level_int = max(int(log_level) * 10, 10)
32 |     logger_.setLevel(log_level_int)
33 | 
34 |     handler = logging.StreamHandler()
35 |     formatter = logging.Formatter("%(levelname)s: %(message)s")
36 |     handler.setFormatter(formatter)
37 |     handler.setLevel(log_level_int)
38 |     logger_.addHandler(handler)
39 | 
40 |     return logger_
41 | 


--------------------------------------------------------------------------------
/biotransformers/utils/msa_utils.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import os
  3 | import string
  4 | from glob import glob
  5 | from pathlib import Path
  6 | from typing import Any, Dict, List, Optional, Tuple
  7 | 
  8 | from Bio import SeqIO
  9 | 
 10 | 
 11 | def get_translation() -> Dict[int, Any]:
 12 |     """
 13 |     get translation dict to convert unused character in MSA
 14 |     """
 15 |     delete_keys = dict.fromkeys(string.ascii_lowercase)
 16 |     delete_keys["."] = None  # gap character  '-' alignement character
 17 |     delete_keys["*"] = None
 18 |     translation = str.maketrans(delete_keys)
 19 |     return translation
 20 | 
 21 | 
 22 | def read_sequence(filename: str) -> Tuple[str, str]:
 23 |     """Reads the first (reference) sequences from a fasta or MSA file."""
 24 |     record = next(SeqIO.parse(filename, "fasta"))
 25 |     return record.description, str(record.seq)
 26 | 
 27 | 
 28 | def remove_insertions(sequence: str) -> str:
 29 |     """Removes any insertions into the sequence.
 30 |     Needed to load aligned sequences in an MSA."""
 31 |     translation = get_translation()
 32 |     return sequence.translate(translation)
 33 | 
 34 | 
 35 | def read_msa(filename: str, nseq: int) -> List[Tuple[str, str]]:
 36 |     """Reads the first nseq sequences from an MSA file,
 37 |     automatically removes insertions."""
 38 |     return [
 39 |         (record.description, remove_insertions(str(record.seq)))
 40 |         for record in itertools.islice(SeqIO.parse(filename, "fasta"), nseq)
 41 |     ]
 42 | 
 43 | 
 44 | def get_msa_list(path_msa: Optional[str]) -> List[str]:
 45 |     """Get all files of the msa folder and check file format
 46 | 
 47 |     Args:
 48 |         path_msa (Optional[str]): path of the folder with a3m file
 49 |     """
 50 |     if path_msa is None:
 51 |         raise ValueError("The path of the msa folder could not be None with msa model.")
 52 |     if not os.path.isdir(path_msa):
 53 |         raise FileExistsError(f"{path_msa} is not a valid directory")
 54 | 
 55 |     list_msa = glob(path_msa + "/*.a3m")
 56 |     all_a3m_file = all([msa.endswith("a3m") for msa in list_msa])
 57 |     if len(list_msa) == 0:
 58 |         raise FileNotFoundError(
 59 |             "Can't find any msa files with .a3m format in this folder."
 60 |         )
 61 |     if not all_a3m_file:
 62 |         raise ValueError("All files in msa folder should have a3m format.")
 63 | 
 64 |     return list_msa
 65 | 
 66 | 
 67 | def get_msa_lengths(list_msa: List[List[Tuple[str, str]]], nseq: int) -> List[int]:
 68 |     """Get length of an MSA list
 69 | 
 70 |     All MSA must have at least nseq in msa
 71 | 
 72 |     Args:
 73 |         list_msa (List[List[Tuple[str,str]]]): list of MSA. MSA is a list of tuple
 74 |         nseq
 75 |     Returns:
 76 |         List[int]: [description]
 77 |     """
 78 | 
 79 |     def _msa_length(msa: List[Tuple[str, str]]) -> List[int]:
 80 |         """get length of each sequence in msa
 81 | 
 82 |         Example: >> input = ['AAAB','AAAA','AAA-']
 83 |                  >> _msa_length(input)
 84 |                  >> [4,4,4]
 85 |         Raises:
 86 |             ValueError if number of seq in the MSA is less than nseq
 87 |         Args:
 88 |             msa (List[Tuple[str, str]]): List of sequence
 89 | 
 90 |         Returns:
 91 |             List[int]: List of length of each msa
 92 |         """
 93 |         return [len(seq[1]) for seq in msa]
 94 | 
 95 |     lengths = [_msa_length(msa) for msa in list_msa]
 96 |     n_different_seq = sum([len(length) != nseq for length in lengths])
 97 |     if n_different_seq > 0:
 98 |         msg = (
 99 |             f"Find {n_different_seq} files with less than {nseq} sequences in the msa. "
100 |             f"All msa files must have at least {nseq} sequences. "
101 |             f"Use `from biotransformers.utils.msa_utils.msa_to_remove` to get the file to remove."
102 |         )
103 |         raise ValueError(msg)
104 |     unique_length = [max(length) for length in lengths]
105 |     return unique_length
106 | 
107 | 
def msa_to_remove(path_msa: str, n_seq) -> List[str]:
    """List the MSA files of a folder that do not contain ``n_seq`` sequences.

    The folder path is resolved, its ``.a3m`` files are listed with
    ``get_msa_list`` and each one is read with ``read_msa``. A summary line
    is printed before returning.

    Args:
        path_msa (str): path of the folder holding the a3m files.
        n_seq: number of sequences each MSA is expected to provide.

    Returns:
        List of msa filepath that don't have enough sequences.
    """
    resolved_dir = str(Path(path_msa).resolve())
    filepaths = get_msa_list(resolved_dir)
    msas = [read_msa(filepath, n_seq) for filepath in filepaths]

    rejected = [
        filepath for filepath, msa in zip(filepaths, msas) if len(msa) != n_seq
    ]
    print(
        f"{len(rejected)}/{len(msas)} have insufficient number of sequences in MSA."
    )
    return rejected
133 | 


--------------------------------------------------------------------------------
/biotransformers/utils/tqdm_utils.py:
--------------------------------------------------------------------------------
 1 | """This module provides a ProgressBar that works with ray and tqdm.
 2 | Each Ray workers update the progress bar remotly.
 3 | """
 4 | from asyncio import Event
 5 | from typing import Tuple
 6 | 
 7 | import ray
 8 | 
 9 | # For typing purposes
10 | from ray.actor import ActorHandle
11 | from tqdm import tqdm
12 | 
13 | 
@ray.remote
class ProgressBarActor:
    """Actor that accumulates the progress reported through ``update``."""

    # Total number of items reported so far.
    counter: int
    # Items reported since the last ``wait_for_update`` returned.
    delta: int
    # Set by ``update`` to wake up a pending ``wait_for_update``.
    event: Event

    def __init__(self) -> None:
        self.counter = 0
        self.delta = 0
        self.event = Event()

    def update(self, num_items_completed: int) -> None:
        """Record ``num_items_completed`` newly finished items and signal
        the event so a waiting ``wait_for_update`` can return.
        """
        self.delta += num_items_completed
        self.counter += num_items_completed
        self.event.set()

    async def wait_for_update(self) -> Tuple[int, int]:
        """Blocking call.

        Waits until somebody calls `update`, then returns a tuple of
        the number of items reported since the last call to
        `wait_for_update`, and the total number of completed items.
        """
        await self.event.wait()
        self.event.clear()
        # Hand out the pending delta and reset it for the next round.
        pending, self.delta = self.delta, 0
        return pending, self.counter

    def get_counter(self) -> int:
        """Return the total number of completed items."""
        return self.counter
51 | 
52 | 
53 | # Back on the local node, once you launch your remote Ray tasks, call
54 | # `print_until_done`, which will feed everything back into a `tqdm` counter.
55 | 
56 | 
class ProgressBar:
    """Local side of the progress bar: owns the remote actor and renders
    its updates with ``tqdm``.
    """

    progress_actor: ActorHandle
    total: int
    description: str
    pbar: tqdm

    def __init__(self, total: int, description: str = ""):
        self.total = total
        self.description = description
        # Ray actors don't seem to play nice with mypy, generating
        # a spurious warning for the following line,
        # which we need to suppress. The code is fine.
        self.progress_actor = ProgressBarActor.remote()  # type: ignore

    @property
    def actor(self) -> ActorHandle:
        """Handle of the remote `ProgressBarActor`.

        Pass it to the remote tasks; they call `update` on it when they
        complete items.
        """
        return self.progress_actor

    def print_until_done(self) -> None:
        """Blocking call.

        Call this after starting the remote Ray tasks that received the
        actor handle. It repeatedly waits for an update from the actor,
        advances the bar by the reported delta, and returns once the
        reported total reaches ``self.total``.
        """
        progress = tqdm(desc=self.description, total=self.total)
        finished = False
        while not finished:
            delta, counter = ray.get(self.actor.wait_for_update.remote())
            progress.update(delta)
            finished = counter >= self.total
        progress.close()
93 | 


--------------------------------------------------------------------------------
/biotransformers/version.py:
--------------------------------------------------------------------------------
1 | VERSION = "0.1.17"
2 | 


--------------------------------------------------------------------------------
/biotransformers/wrappers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/biotransformers/wrappers/__init__.py


--------------------------------------------------------------------------------
/biotransformers/wrappers/language_model.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This script defines a generic template class for any language model.
  3 | Both ESM and Rostlab language models should implement this class.
  4 | """
  5 | from abc import ABC, abstractmethod
  6 | from typing import Dict, List, Tuple
  7 | 
  8 | import torch
  9 | from ray.actor import ActorHandle
 10 | 
 11 | 
 12 | class LanguageModel(ABC):
 13 |     """
 14 |     Class that implements a language model.
 15 |     """
 16 | 
 17 |     def __init__(self, model_dir: str, device):
 18 |         self._model_dir = model_dir
 19 |         self._device = device
 20 |         self.is_msa = False
 21 | 
 22 |     @property
 23 |     def model_id(self) -> str:
 24 |         """Model ID, as specified in the model directory"""
 25 |         return self._model_dir.lower()
 26 | 
 27 |     @property
 28 |     @abstractmethod
 29 |     def clean_model_id(self) -> str:
 30 |         """Clean model ID (in case the model directory is not)"""
 31 |         pass
 32 | 
 33 |     @property
 34 |     @abstractmethod
 35 |     def model_vocabulary(self) -> List[str]:
 36 |         """Returns the whole vocabulary list"""
 37 |         pass
 38 | 
 39 |     @property
 40 |     @abstractmethod
 41 |     def vocab_size(self) -> int:
 42 |         """Returns the whole vocabulary size"""
 43 |         pass
 44 | 
 45 |     @property
 46 |     @abstractmethod
 47 |     def mask_token(self) -> str:
 48 |         """Representation of the mask token (as a string)"""
 49 |         pass
 50 | 
 51 |     @property
 52 |     @abstractmethod
 53 |     def pad_token(self) -> str:
 54 |         """Representation of the pad token (as a string)"""
 55 |         pass
 56 | 
 57 |     @property
 58 |     @abstractmethod
 59 |     def begin_token(self) -> str:
 60 |         """Representation of the beginning of sentence token (as a string)"""
 61 |         pass
 62 | 
 63 |     @property
 64 |     @abstractmethod
 65 |     def end_token(self) -> str:
 66 |         """Representation of the end of sentence token (as a string)."""
 67 |         pass
 68 | 
 69 |     @property
 70 |     @abstractmethod
 71 |     def does_end_token_exist(self) -> bool:
 72 |         """Returns true if a end of sequence token exists"""
 73 |         pass
 74 | 
 75 |     @property
 76 |     @abstractmethod
 77 |     def token_to_id(self):
 78 |         """Returns a function which maps tokens to IDs"""
 79 |         pass
 80 | 
 81 |     @property
 82 |     @abstractmethod
 83 |     def embeddings_size(self) -> int:
 84 |         """Returns size of the embeddings"""
 85 |         pass
 86 | 
 87 |     @abstractmethod
 88 |     def process_sequences_and_tokens(
 89 |         self,
 90 |         sequences_list: List[str],
 91 |     ) -> Dict[str, torch.Tensor]:
 92 |         """Function to transform tokens string to IDs; it depends on the model used"""
 93 |         pass
 94 | 
 95 |     @property
 96 |     @abstractmethod
 97 |     def model(self) -> torch.nn.Module:
 98 |         """Return torch model."""
 99 |         pass
100 | 
101 |     @abstractmethod
102 |     def set_model(self, model: torch.nn.Module):
103 |         """Set torch model."""
104 |         pass
105 | 
106 |     @abstractmethod
107 |     def model_pass(
108 |         self,
109 |         model_inputs: Dict[str, torch.tensor],
110 |         batch_size: int,
111 |         silent: bool = False,
112 |         pba: ActorHandle = None,
113 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
114 |         """
115 |         Function which computes logits and embeddings based on a dict of sequences
116 |         tensors, a provided batch size and an inference configuration. The output is
117 |         obtained by computing a forward pass through the model ("forward inference")
118 | 
119 |         Args:
120 |             model_inputs (Dict[str, torch.tensor]): [description]
121 |             batch_size (int): size of the batch
122 |             silent : display or not progress bar
123 |             pba : tqdm progress bar for ray actor
124 |         Returns:
125 |             Tuple[torch.tensor, torch.tensor]:
126 |                     * logits [num_seqs, max_len_seqs, vocab_size]
127 |                     * embeddings [num_seqs, max_len_seqs+1, embedding_size]
128 |         """
129 |         pass
130 | 
131 |     @abstractmethod
132 |     def get_alphabet_dataloader(self):
133 |         """Define an alphabet mapping for common method between
134 |         protbert and ESM
135 |         """
136 |         pass
137 | 


--------------------------------------------------------------------------------
/data/fasta/example_fasta.fasta:
--------------------------------------------------------------------------------
 1 | >sp|O24396|PURA_WHEAT Adenylosuccinate synthetase, chloroplastic (Fragment) OS=Triticum aestivum OX=4565 PE=1 SV=1
 2 | AAAAAGRGRSFSPAAPAPSSVRLPGRQAPAPAAASALAVEADPAADRVSSLSQVSGVLGS
 3 | QWGDEGKGKLVDVLAPRFDIVARCQGGANAGHTIYNSEGKKFALHLVPSGILHEGTLCVV
 4 | GNGAVIHVPGFFGEIDGLQSNGVSCDGRILVSDRAHLLFDLHQTVDGLREAELANSFIGT
 5 | TKRGIGPCYSSKVTRNGLRVCDLRHMDTFGDKLDVLFEDAAARFEGFKYSKGMLKEEVER
 6 | YKRFAERLEPFIADTVHVLNESIRQKKKILVEGGQATMLDIDFGTYPFVTSSSPSAGGIC
 7 | TGLGIAPRVIGDLIGVVKAYTTRVGSGPFPTELLGEEGDVLRKAGMEFGTTTGRPRRCGW
 8 | LDIVALKYCCDINGFSSLNLTKLDVLSGLPEIKLGVSYNQMDGEKLQSFPGDLDTLEQVQ
 9 | VNYEVLPGWDSDISSVRSYSELPQAARRYVERIEELAGVPVHYIGVGPGRDALIYK
10 | >sp|P80405|MTHFS_RABIT 5-formyltetrahydrofolate cyclo-ligase OS=Oryctolagus cuniculus OX=9986 GN=MTHFS PE=1 SV=1
11 | AAAAAVSGAKRSLRAELKQRLRAISAEERLRCQRLLTQKVIAHRQYQKSQRISIFLSMPD
12 | EIETEEIIKDIFQQGKVCFIPRYRLQSNHMDMVKLASADEISSLPKTSWNIHQPSESDTR
13 | EEALATGGLDLIFMPGLGFDRNGNRLGRGRGYYDTYLQRCLQQQGAKPYTIALAFREQIC
14 | PQVPVDDTDVSVDEVLYVDAA
15 | >sp|P21624|UTS1_PLAFE UI (Fragments) OS=Platichthys flesus OX=8260 PE=1 SV=1
16 | AAAAGDSAASDLLGDNILRSEDPPMSIDLTFHMLRNMIHMAKMEGEREQAQINRNLLDEV
17 | >sp|Q9PRR0|SMS_LAMFL Somatostatin (Fragment) OS=Lampetra fluviatilis OX=7748 GN=sst PE=1 SV=1
18 | AAAAPGAAGGAQLPLGNRERKAGCKNFFWKTFSSC
19 | >sp|Q40963|MPA5B_PHLPR Pollen allergen Phl p 5b (Fragment) OS=Phleum pratense OX=15957 PE=1 SV=2
20 | AAAAVPRRGPRGGPGRSYTADAGYAPATPAAAGAAAGKATTEEQKLIEDINVGFKAAVAA
21 | AASVPAADKFKTFEAAFTSSSKAAAAKAPGLVPKLDAAYSVAYKAAVGATPEAKFDSFVA
22 | SLTEALRVIAGALEVHAVKPVTEEPGMAKIPAGELQIIDKIDAAFKVAATAAATAPADDK
23 | FTVFEAAFNKAIKESTGGAYDTYKCIPSLEAAVKQAYAATVAAAPQVKYAVFEAALTKAI
24 | TAMSEVQKVSQPATGAATVAAGAATTAAGAASGAATVAAGGYKV
25 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/environment_docs.yaml:
--------------------------------------------------------------------------------
 1 | name: bio-transformers-dev
 2 | channels:
 3 |   #- conda-forge
 4 |   - defaults
 5 | dependencies:
 6 |   - python=3.7
 7 |   - pip=20.0.2
 8 |   - pip:
 9 |     - sphinx==4.0.2
10 |     - sphinx-notfound-page==0.6
11 |     #- sphinx_rtd_theme==0.5.2
12 |     - sphinx-book-theme
13 |     - sphinx_tabs==2.1.0
14 |     - pre-commit==2.2.0
15 |     - biopython==1.78
16 |     #- ray==1.4.0
17 |     - fair-esm==0.3.1
18 |     - numpy>=1.16
19 |     - pandas>=1.2.3
20 |     - pytest==6.2.4
21 |     - pytest-cov==2.12.0
22 |     - tqdm>=4.60.0
23 |     #- transformers>=4.6.1,<4.7
24 |     - myst-parser==0.15.1
25 |     #- myst-nb==0.12.3
26 |     - sphinx-autoapi==1.8.1
27 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/source/_build/.buildinfo:
--------------------------------------------------------------------------------
1 | # Sphinx build info version 1
2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3 | config: 9fac7781d42123977978002993cc7640
4 | tags: 645f666f9bcd5a90fca523b33c5a78b7
5 | 


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/api/biotransformers.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/api/biotransformers.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/bio_transformers/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/bio_transformers/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/lightning_utils/data/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/lightning_utils/data/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/lightning_utils/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/lightning_utils/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/lightning_utils/models/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/lightning_utils/models/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/lightning_utils/optimizer/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/lightning_utils/optimizer/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/tests/conftest/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/tests/conftest/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/tests/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/tests/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_accuracy/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_accuracy/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_embeddings/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_embeddings/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_logits/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_logits/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_loglikelihoods/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_loglikelihoods/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_msa/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/tests/test_msa/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/utils/constant/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/utils/constant/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/utils/deprecated/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/utils/deprecated/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/utils/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/utils/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/utils/logger/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/utils/logger/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/utils/msa_utils/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/utils/msa_utils/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/utils/tqdm_utils/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/utils/tqdm_utils/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/utils/utils/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/utils/utils/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/version/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/version/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/esm_wrappers/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/esm_wrappers/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/language_model/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/language_model/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/rostlab_wrapper/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/rostlab_wrapper/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/transformers_wrappers/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/biotransformers/wrappers/transformers_wrappers/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/autoapi/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/autoapi/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/contributing/CHANGELOG.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/contributing/CHANGELOG.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/contributing/CONTRIBUTING.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/contributing/CONTRIBUTING.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/documentation/course.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/documentation/course.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/documentation/logging.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/documentation/logging.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/documentation/msa.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/documentation/msa.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/documentation/multi_gpus.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/documentation/multi_gpus.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/environment.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/environment.pickle


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/getting_started/install.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/getting_started/install.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/getting_started/quick_start.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/getting_started/quick_start.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/index.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/tutorial/embeddings.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/tutorial/embeddings.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/tutorial/finetuning.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/tutorial/finetuning.doctree


--------------------------------------------------------------------------------
/docs/source/_build/.doctrees/tutorial/loglikelihood.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/.doctrees/tutorial/loglikelihood.doctree


--------------------------------------------------------------------------------
/docs/source/_build/_sources/api/biotransformers.rst.txt:
--------------------------------------------------------------------------------
1 | Bio-transformers method
2 | =======================
3 | 
4 | .. automodule:: biotransformers.wrappers.transformers_wrappers
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/bio_transformers/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.bio_transformers`
 2 | =======================================
 3 | 
 4 | .. py:module:: biotransformers.bio_transformers
 5 | 
 6 | .. autoapi-nested-parse::
 7 | 
 8 |    Main module to build either ESM or protbert model
 9 | 
10 | 
11 | 
12 | Module Contents
13 | ---------------
14 | 
15 | Classes
16 | ~~~~~~~
17 | 
18 | .. autoapisummary::
19 | 
20 |    biotransformers.bio_transformers.BioTransformers
21 | 
22 | 
23 | 
24 | 
25 | .. class:: BioTransformers(backend: str = 'esm1_t6_43M_UR50S', num_gpus: int = 0)
26 | 
27 | 
28 |    Bases: :py:obj:`biotransformers.wrappers.transformers_wrappers.TransformersWrapper`
29 | 
30 |    General class to choose an ESM or ProtBert backend
31 |    Abstract methods are implemented in transformers
32 | 
33 |    .. method:: list_backend() -> None
34 |       :staticmethod:
35 | 
36 |       Get all possible backends for the model
37 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/index.rst.txt:
--------------------------------------------------------------------------------
  1 | :mod:`biotransformers`
  2 | ======================
  3 | 
  4 | .. py:module:: biotransformers
  5 | 
  6 | 
  7 | Subpackages
  8 | -----------
  9 | .. toctree::
 10 |    :titlesonly:
 11 |    :maxdepth: 3
 12 | 
 13 |    lightning_utils/index.rst
 14 |    tests/index.rst
 15 |    utils/index.rst
 16 |    wrappers/index.rst
 17 | 
 18 | 
 19 | Submodules
 20 | ----------
 21 | .. toctree::
 22 |    :titlesonly:
 23 |    :maxdepth: 1
 24 | 
 25 |    bio_transformers/index.rst
 26 |    version/index.rst
 27 | 
 28 | 
 29 | Package Contents
 30 | ----------------
 31 | 
 32 | Classes
 33 | ~~~~~~~
 34 | 
 35 | .. autoapisummary::
 36 | 
 37 |    biotransformers.BioTransformers
 38 | 
 39 | 
 40 | 
 41 | Functions
 42 | ~~~~~~~~~
 43 | 
 44 | .. autoapisummary::
 45 | 
 46 |    biotransformers.logger
 47 | 
 48 | 
 49 | 
 50 | Attributes
 51 | ~~~~~~~~~~
 52 | 
 53 | .. autoapisummary::
 54 | 
 55 |    biotransformers.VERSION
 56 |    biotransformers.log
 57 |    biotransformers.__version__
 58 | 
 59 | 
 60 | .. class:: BioTransformers(backend: str = 'esm1_t6_43M_UR50S', num_gpus: int = 0)
 61 | 
 62 | 
 63 |    Bases: :py:obj:`biotransformers.wrappers.transformers_wrappers.TransformersWrapper`
 64 | 
 65 |    General class to choose an ESM or ProtBert backend
 66 |    Abstract methods are implemented in transformers
 67 | 
 68 |    .. method:: list_backend() -> None
 69 |       :staticmethod:
 70 | 
 71 |       Get all possible backends for the model
 72 | 
 73 | 
 74 | 
 75 | .. function:: logger(module_name: str) -> logging.Logger
 76 | 
 77 |    Configure the logger with formatter and handlers.
 78 | 
 79 |    The log level depends on the environment variable `BIO_LOG_LEVEL`.
 80 | 
 81 |    - 0: NOTSET, will be set to DEBUG
 82 |    - 1: DEBUG
 83 |    - 2: INFO (default)
 84 |    - 3: WARNING
 85 |    - 4: ERROR
 86 |    - 5: CRITICAL
 87 |    https://docs.python.org/3/library/logging.html#levels
 88 | 
 89 |    :param module_name: module name
 90 |    :type module_name: str
 91 | 
 92 |    :returns: instantiated logger object
 93 |    :rtype: [Logger]
 94 | 
 95 | 
 96 | .. data:: VERSION
 97 |    :annotation: = 0.1.3
 98 | 
 99 | 
100 | 
101 | .. data:: log
102 | 
103 | 
104 | 
105 | 
106 | .. data:: __version__
107 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/lightning_utils/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.lightning_utils`
 2 | ======================================
 3 | 
 4 | .. py:module:: biotransformers.lightning_utils
 5 | 
 6 | 
 7 | Submodules
 8 | ----------
 9 | .. toctree::
10 |    :titlesonly:
11 |    :maxdepth: 1
12 | 
13 |    data/index.rst
14 |    models/index.rst
15 |    optimizer/index.rst
16 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/lightning_utils/models/index.rst.txt:
--------------------------------------------------------------------------------
  1 | :mod:`biotransformers.lightning_utils.models`
  2 | =============================================
  3 | 
  4 | .. py:module:: biotransformers.lightning_utils.models
  5 | 
  6 | 
  7 | Module Contents
  8 | ---------------
  9 | 
 10 | Classes
 11 | ~~~~~~~
 12 | 
 13 | .. autoapisummary::
 14 | 
 15 |    biotransformers.lightning_utils.models.LightningModule
 16 | 
 17 | 
 18 | 
 19 | 
 20 | .. class:: LightningModule(model, alphabet, lr: float, warmup_end_lr: float, warmup_updates: int = 10, warmup_init_lr: float = 1e-07)
 21 | 
 22 | 
 23 |    Bases: :py:obj:`pytorch_lightning.LightningModule`
 24 | 
 25 |    Create lightning model to use ddp
 26 | 
 27 |    .. method:: forward(self, x)
 28 | 
 29 |       Same as :meth:`torch.nn.Module.forward()`.
 30 | 
 31 |       :param \*args: Whatever you decide to pass into the forward method.
 32 |       :param \*\*kwargs: Keyword arguments are also possible.
 33 | 
 34 |       :returns: Your model's output
 35 | 
 36 | 
 37 |    .. method:: configure_optimizers(self) -> Tuple[List[torch.optim.Optimizer], List[Dict]]
 38 | 
 39 |       Configure the optimizer and learning rate scheduler.
 40 | 
 41 |       :returns:
 42 | 
 43 |                 - list of optimizers.
 44 |                 - list of lr schedulers.
 45 | 
 46 | 
 47 |    .. method:: cross_entropy_loss(self, logits, targets)
 48 | 
 49 | 
 50 |    .. method:: training_step(self, train_batch, batch_idx)
 51 | 
 52 |       Here you compute and return the training loss and some additional metrics for e.g.
 53 |       the progress bar or logger.
 54 | 
 55 |       :param batch: The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
 56 |       :type batch: :class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]
 57 |       :param batch_idx: Integer displaying index of this batch
 58 |       :type batch_idx: int
 59 |       :param optimizer_idx: When using multiple optimizers, this argument will also be present.
 60 |       :type optimizer_idx: int
 61 |       :param hiddens: Passed in if
 62 |                       :paramref:`~pytorch_lightning.core.lightning.LightningModule.truncated_bptt_steps` > 0.
 63 |       :type hiddens: :class:`~torch.Tensor`
 64 | 
 65 |       :returns: Any of.
 66 | 
 67 |                 - :class:`~torch.Tensor` - The loss tensor
 68 |                 - ``dict`` - A dictionary. Can include any keys, but must include the key ``'loss'``
 69 |                 - ``None`` - Training will skip to the next batch
 70 | 
 71 |       .. note:: Returning ``None`` is currently not supported for multi-GPU or TPU, or with 16-bit precision enabled.
 72 | 
 73 |       In this step you'd normally do the forward pass and calculate the loss for a batch.
 74 |       You can also do fancier things like multiple forward passes or something model specific.
 75 | 
 76 |       Example::
 77 | 
 78 |           def training_step(self, batch, batch_idx):
 79 |               x, y, z = batch
 80 |               out = self.encoder(x)
 81 |               loss = self.loss(out, x)
 82 |               return loss
 83 | 
 84 |       If you define multiple optimizers, this step will be called with an additional
 85 |       ``optimizer_idx`` parameter.
 86 | 
 87 |       .. code-block:: python
 88 | 
 89 |           # Multiple optimizers (e.g.: GANs)
 90 |           def training_step(self, batch, batch_idx, optimizer_idx):
 91 |               if optimizer_idx == 0:
 92 |                   # do training_step with encoder
 93 |               if optimizer_idx == 1:
 94 |                   # do training_step with decoder
 95 | 
 96 | 
 97 |       If you add truncated back propagation through time you will also get an additional
 98 |       argument with the hidden states of the previous step.
 99 | 
100 |       .. code-block:: python
101 | 
102 |           # Truncated back-propagation through time
103 |           def training_step(self, batch, batch_idx, hiddens):
104 |               # hiddens are the hidden states from the previous truncated backprop step
105 |               ...
106 |               out, hiddens = self.lstm(data, hiddens)
107 |               ...
108 |               return {'loss': loss, 'hiddens': hiddens}
109 | 
110 |       .. note::
111 | 
112 |          The loss value shown in the progress bar is smoothed (averaged) over the last values,
113 |          so it differs from the actual loss returned in train/validation step.
114 | 
115 | 
116 |    .. method:: validation_step(self, val_batch, batch_idx)
117 | 
118 |       Log the loss and metrics for a batch.
119 | 
120 |       :param batch: batch input.
121 |       :param batch_idx: index of the batch.
122 | 
123 | 
124 |    .. method:: get_tensor_accuracy(self, logits: torch.Tensor, targets: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
125 | 
126 |       Calculate accuracy for multi-masking, summed over batch.
127 | 
128 |       :param logits: prediction from the model, shape = (batch, len_tokens, len_vocab)
129 |       :param targets: ground truth, shape = (batch, len_tokens)
130 | 
131 |       :returns: accuracy value.
132 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/lightning_utils/optimizer/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.lightning_utils.optimizer`
 2 | ================================================
 3 | 
 4 | .. py:module:: biotransformers.lightning_utils.optimizer
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | 
11 | Functions
12 | ~~~~~~~~~
13 | 
14 | .. autoapisummary::
15 | 
16 |    biotransformers.lightning_utils.optimizer.lr_update
17 | 
18 | 
19 | 
20 | .. function:: lr_update(num_updates: int, warmup_updates: int, warmup_init_lr: float, lr_step: float, decay_factor: float) -> float
21 | 
22 |    InverseSquareRootSchedule.
23 | 
24 |    https://github.com/pytorch/fairseq/blob/master/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py#L32
25 | 
26 |    :param num_updates: number of batches already used.
27 |    :param warmup_updates: number of batch steps for warm up.
28 |    :param warmup_init_lr: initial learning rate.
29 |    :param lr_step: step for increasing learning rate during warm up.
30 |    :param decay_factor: factor for decreasing learning rate after warm up.
31 | 
32 |    :returns: learning rate multiplicative factor
33 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/tests/conftest/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.tests.conftest`
 2 | =====================================
 3 | 
 4 | .. py:module:: biotransformers.tests.conftest
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | 
11 | Functions
12 | ~~~~~~~~~
13 | 
14 | .. autoapisummary::
15 | 
16 |    biotransformers.tests.conftest.init_model
17 | 
18 | 
19 | 
20 | Attributes
21 | ~~~~~~~~~~
22 | 
23 | .. autoapisummary::
24 | 
25 |    biotransformers.tests.conftest.test_models
26 | 
27 | 
28 | .. data:: test_models
29 |    :annotation: = ['esm1_t6_43M_UR50S', 'esm1b_t33_650M_UR50S', 'protbert']
30 | 
31 | 
32 | 
33 | .. function:: init_model(request)
34 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/tests/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.tests`
 2 | ============================
 3 | 
 4 | .. py:module:: biotransformers.tests
 5 | 
 6 | 
 7 | Submodules
 8 | ----------
 9 | .. toctree::
10 |    :titlesonly:
11 |    :maxdepth: 1
12 | 
13 |    conftest/index.rst
14 |    test_accuracy/index.rst
15 |    test_embeddings/index.rst
16 |    test_logits/index.rst
17 |    test_loglikelihoods/index.rst
18 |    test_msa/index.rst
19 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/tests/test_accuracy/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.tests.test_accuracy`
 2 | ==========================================
 3 | 
 4 | .. py:module:: biotransformers.tests.test_accuracy
 5 | 
 6 | .. autoapi-nested-parse::
 7 | 
 8 |    Test module for testing accuracy function
 9 | 
10 | 
11 | 
12 | Module Contents
13 | ---------------
14 | 
15 | 
16 | Functions
17 | ~~~~~~~~~
18 | 
19 | .. autoapisummary::
20 | 
21 |    biotransformers.tests.test_accuracy.test_accuracy_type_and_range
22 | 
23 | 
24 | 
25 | Attributes
26 | ~~~~~~~~~~
27 | 
28 | .. autoapisummary::
29 | 
30 |    biotransformers.tests.test_accuracy.test_sequences
31 |    biotransformers.tests.test_accuracy.test_params
32 | 
33 | 
34 | .. data:: test_sequences
35 |    :annotation: = ['AAAA', 'AKKF', 'AHHFK', 'KKKKKKKLLL']
36 | 
37 | 
38 | 
39 | .. data:: test_params
40 |    :annotation: = [[1, 'forward'], [2, 'masked'], [10, 'forward']]
41 | 
42 | 
43 | 
44 | .. function:: test_accuracy_type_and_range(init_model, batch_size, pass_mode)
45 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/tests/test_embeddings/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.tests.test_embeddings`
 2 | ============================================
 3 | 
 4 | .. py:module:: biotransformers.tests.test_embeddings
 5 | 
 6 | .. autoapi-nested-parse::
 7 | 
 8 |    Test module for testing embeddings function
 9 | 
10 | 
11 | 
12 | Module Contents
13 | ---------------
14 | 
15 | 
16 | Functions
17 | ~~~~~~~~~
18 | 
19 | .. autoapisummary::
20 | 
21 |    biotransformers.tests.test_embeddings.test_embeddings_type_and_shape
22 | 
23 | 
24 | 
25 | Attributes
26 | ~~~~~~~~~~
27 | 
28 | .. autoapisummary::
29 | 
30 |    biotransformers.tests.test_embeddings.test_sequences
31 |    biotransformers.tests.test_embeddings.test_params
32 | 
33 | 
34 | .. data:: test_sequences
35 |    :annotation: = ['AAAA', 'AKKF', 'AHHFK', 'KKKKKKKLLL']
36 | 
37 | 
38 | 
39 | .. data:: test_params
40 |    :annotation: = [[1, ['cls', 'mean']], [2, ['full', 'mean', 'cls']], [10, ['cls', 'full']]]
41 | 
42 | 
43 | 
44 | .. function:: test_embeddings_type_and_shape(init_model, batch_size, pool_mode)
45 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/tests/test_logits/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.tests.test_logits`
 2 | ========================================
 3 | 
 4 | .. py:module:: biotransformers.tests.test_logits
 5 | 
 6 | .. autoapi-nested-parse::
 7 | 
 8 |    Test module for testing logits function
 9 | 
10 | 
11 | 
12 | Module Contents
13 | ---------------
14 | 
15 | 
16 | Functions
17 | ~~~~~~~~~
18 | 
19 | .. autoapisummary::
20 | 
21 |    biotransformers.tests.test_logits.test_logits_type
22 | 
23 | 
24 | 
25 | Attributes
26 | ~~~~~~~~~~
27 | 
28 | .. autoapisummary::
29 | 
30 |    biotransformers.tests.test_logits.test_sequences
31 |    biotransformers.tests.test_logits.test_params
32 | 
33 | 
34 | .. data:: test_sequences
35 |    :annotation: = ['AAAA', 'AKKF', 'AHHFK', 'KKKKKKKLLL']
36 | 
37 | 
38 | 
39 | .. data:: test_params
40 |    :annotation: = [[1, 'forward'], [2, 'masked'], [10, 'forward']]
41 | 
42 | 
43 | 
44 | .. function:: test_logits_type(init_model, batch_size, pass_mode)
45 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/tests/test_loglikelihoods/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.tests.test_loglikelihoods`
 2 | ================================================
 3 | 
 4 | .. py:module:: biotransformers.tests.test_loglikelihoods
 5 | 
 6 | .. autoapi-nested-parse::
 7 | 
 8 |    Test module for testing loglikelihoods function
 9 | 
10 | 
11 | 
12 | Module Contents
13 | ---------------
14 | 
15 | 
16 | Functions
17 | ~~~~~~~~~
18 | 
19 | .. autoapisummary::
20 | 
21 |    biotransformers.tests.test_loglikelihoods.test_loglikelihoods_type_shape_and_range
22 | 
23 | 
24 | 
25 | Attributes
26 | ~~~~~~~~~~
27 | 
28 | .. autoapisummary::
29 | 
30 |    biotransformers.tests.test_loglikelihoods.test_sequences
31 |    biotransformers.tests.test_loglikelihoods.test_params
32 | 
33 | 
34 | .. data:: test_sequences
35 |    :annotation: = ['AAAA', 'AKKF', 'AHHFK', 'KKKKKKKLLL']
36 | 
37 | 
38 | 
39 | .. data:: test_params
40 |    :annotation: = [None, None, None]
41 | 
42 | 
43 | 
44 | .. function:: test_loglikelihoods_type_shape_and_range(init_model, batch_size, tokens_list, pass_mode)
45 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/tests/test_msa/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.tests.test_msa`
 2 | =====================================
 3 | 
 4 | .. py:module:: biotransformers.tests.test_msa
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | 
11 | Functions
12 | ~~~~~~~~~
13 | 
14 | .. autoapisummary::
15 | 
16 |    biotransformers.tests.test_msa.test_msa_embeddings_type_and_shape
17 |    biotransformers.tests.test_msa.test_msa_logits_type
18 | 
19 | 
20 | 
21 | Attributes
22 | ~~~~~~~~~~
23 | 
24 | .. autoapisummary::
25 | 
26 |    biotransformers.tests.test_msa.path_msa
27 |    biotransformers.tests.test_msa.model
28 | 
29 | 
30 | .. data:: path_msa
31 |    :annotation: = biotransformers/data/msa
32 | 
33 | 
34 | 
35 | .. data:: model
36 | 
37 | 
38 | 
39 | 
40 | .. function:: test_msa_embeddings_type_and_shape()
41 | 
42 | 
43 | .. function:: test_msa_logits_type()
44 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/utils/constant/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.utils.constant`
 2 | =====================================
 3 | 
 4 | .. py:module:: biotransformers.utils.constant
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | .. data:: ESM_LIST
11 |    :annotation: = ['esm1_t34_670M_UR100', 'esm1_t12_85M_UR50S', 'esm1_t6_43M_UR50S', 'esm1b_t33_650M_UR50S',...
12 | 
13 | 
14 | 
15 | .. data:: ROSTLAB_LIST
16 |    :annotation: = ['Rostlab/prot_bert', 'Rostlab/prot_bert_bfd']
17 | 
18 | 
19 | 
20 | .. data:: MAPPING_PROTBERT
21 | 
22 | 
23 | 
24 | 
25 | .. data:: DEFAULT_ESM_MODEL
26 |    :annotation: = esm1_t34_670M_UR100
27 | 
28 | 
29 | 
30 | .. data:: DEFAULT_ROSTLAB_MODEL
31 |    :annotation: = Rostlab/prot_bert
32 | 
33 | 
34 | 
35 | .. data:: BACKEND_LIST
36 | 
37 | 
38 | 
39 | 
40 | .. data:: NATURAL_AAS_LIST
41 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/utils/deprecated/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.utils.deprecated`
 2 | =======================================
 3 | 
 4 | .. py:module:: biotransformers.utils.deprecated
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | 
11 | Functions
12 | ~~~~~~~~~
13 | 
14 | .. autoapisummary::
15 | 
16 |    biotransformers.utils.deprecated.deprecated_alias
17 |    biotransformers.utils.deprecated.rename_kwargs
18 | 
19 | 
20 | 
21 | .. function:: deprecated_alias(**aliases)
22 | 
23 | 
24 | .. function:: rename_kwargs(func_name, kwargs, aliases)
25 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/utils/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.utils`
 2 | ============================
 3 | 
 4 | .. py:module:: biotransformers.utils
 5 | 
 6 | 
 7 | Submodules
 8 | ----------
 9 | .. toctree::
10 |    :titlesonly:
11 |    :maxdepth: 1
12 | 
13 |    constant/index.rst
14 |    deprecated/index.rst
15 |    logger/index.rst
16 |    msa_utils/index.rst
17 |    tqdm_utils/index.rst
18 |    utils/index.rst
19 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/utils/logger/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.utils.logger`
 2 | ===================================
 3 | 
 4 | .. py:module:: biotransformers.utils.logger
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | 
11 | Functions
12 | ~~~~~~~~~
13 | 
14 | .. autoapisummary::
15 | 
16 |    biotransformers.utils.logger.logger
17 | 
18 | 
19 | 
20 | .. function:: logger(module_name: str) -> logging.Logger
21 | 
22 |    Configure the logger with formatter and handlers.
23 | 
24 |    The log level depends on the environment variable `BIO_LOG_LEVEL`.
25 | 
26 |    - 0: NOTSET, will be set to DEBUG
27 |    - 1: DEBUG
28 |    - 2: INFO (default)
29 |    - 3: WARNING
30 |    - 4: ERROR
31 |    - 5: CRITICAL
32 |    https://docs.python.org/3/library/logging.html#levels
33 | 
34 |    :param module_name: module name
35 |    :type module_name: str
36 | 
37 |    :returns: instantiated logger object
38 |    :rtype: [Logger]
39 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/utils/msa_utils/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.utils.msa_utils`
 2 | ======================================
 3 | 
 4 | .. py:module:: biotransformers.utils.msa_utils
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | 
11 | Functions
12 | ~~~~~~~~~
13 | 
14 | .. autoapisummary::
15 | 
16 |    biotransformers.utils.msa_utils.get_translation
17 |    biotransformers.utils.msa_utils.read_sequence
18 |    biotransformers.utils.msa_utils.remove_insertions
19 |    biotransformers.utils.msa_utils.read_msa
20 |    biotransformers.utils.msa_utils.get_msa_list
21 |    biotransformers.utils.msa_utils.get_msa_lengths
22 |    biotransformers.utils.msa_utils.msa_to_remove
23 | 
24 | 
25 | 
26 | .. function:: get_translation() -> Dict[int, Any]
27 | 
28 |    get translation dict to convert unused character in MSA
29 | 
30 | 
31 | .. function:: read_sequence(filename: str) -> Tuple[str, str]
32 | 
33 |    Reads the first (reference) sequences from a fasta or MSA file.
34 | 
35 | 
36 | .. function:: remove_insertions(sequence: str) -> str
37 | 
38 |    Removes any insertions into the sequence.
39 |    Needed to load aligned sequences in an MSA.
40 | 
41 | 
42 | .. function:: read_msa(filename: str, nseq: int) -> List[Tuple[str, str]]
43 | 
44 |    Reads the first nseq sequences from an MSA file,
45 |    automatically removes insertions.
46 | 
47 | 
48 | .. function:: get_msa_list(path_msa: Optional[str]) -> List[str]
49 | 
50 |    Get all files of the msa folder and check file format
51 | 
52 |    :param path_msa: path of the folder with a3m file
53 |    :type path_msa: Optional[str]
54 | 
55 | 
56 | .. function:: get_msa_lengths(list_msa: List[List[Tuple[str, str]]], nseq: int) -> List[int]
57 | 
58 |    Get length of an MSA list
59 | 
60 |    All MSA must have at least nseq in msa
61 | 
62 |    :param list_msa: list of MSA. MSA is a list of tuple
63 |    :type list_msa: List[List[Tuple[str,str]]]
64 |    :param nseq:
65 | 
66 |    :returns: [description]
67 |    :rtype: List[int]
68 | 
69 | 
70 | .. function:: msa_to_remove(path_msa: str, n_seq) -> List[str]
71 | 
72 |    Get list of MSA files with fewer than n_seq sequences
73 | 
74 |    :param path_msa: [description]
75 |    :type path_msa: str
76 | 
77 |    :returns: List of MSA file paths that don't have enough sequences.
78 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/utils/tqdm_utils/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.utils.tqdm_utils`
 2 | =======================================
 3 | 
 4 | .. py:module:: biotransformers.utils.tqdm_utils
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | Classes
11 | ~~~~~~~
12 | 
13 | .. autoapisummary::
14 | 
15 |    biotransformers.utils.tqdm_utils.ProgressBarActor
16 |    biotransformers.utils.tqdm_utils.ProgressBar
17 | 
18 | 
19 | 
20 | 
21 | .. class:: ProgressBarActor
22 | 
23 | 
24 |    .. attribute:: counter
25 |       :annotation: :int
26 | 
27 | 
28 | 
29 |    .. attribute:: delta
30 |       :annotation: :int
31 | 
32 | 
33 | 
34 |    .. attribute:: event
35 |       :annotation: :asyncio.Event
36 | 
37 | 
38 | 
39 |    .. method:: update(self, num_items_completed: int) -> None
40 | 
41 |       Updates the ProgressBar with the incremental
42 |       number of items that were just completed.
43 | 
44 | 
45 |    .. method:: wait_for_update(self) -> Tuple[int, int]
46 |       :async:
47 | 
48 |       Blocking call.
49 | 
50 |       Waits until somebody calls `update`, then returns a tuple of
51 |       the number of updates since the last call to
52 |       `wait_for_update`, and the total number of completed items.
53 | 
54 | 
55 |    .. method:: get_counter(self) -> int
56 | 
57 |       Returns the total number of complete items.
58 | 
59 | 
60 | 
61 | .. class:: ProgressBar(total: int, description: str = '')
62 | 
63 | 
64 |    .. attribute:: progress_actor
65 |       :annotation: :ray.actor.ActorHandle
66 | 
67 | 
68 | 
69 |    .. attribute:: total
70 |       :annotation: :int
71 | 
72 | 
73 | 
74 |    .. attribute:: description
75 |       :annotation: :str
76 | 
77 | 
78 | 
79 |    .. attribute:: pbar
80 |       :annotation: :tqdm.tqdm
81 | 
82 | 
83 | 
84 |    .. method:: actor(self) -> ray.actor.ActorHandle
85 |       :property:
86 | 
87 |       Returns a reference to the remote `ProgressBarActor`.
88 | 
89 |       When you complete tasks, call `update` on the actor.
90 | 
91 | 
92 |    .. method:: print_until_done(self) -> None
93 | 
94 |       Blocking call.
95 | 
96 |       Do this after starting a series of remote Ray tasks, to which you've
97 |       passed the actor handle. Each of them calls `update` on the actor.
98 |       When the progress meter reaches 100%, this method returns.
99 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/version/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.version`
 2 | ==============================
 3 | 
 4 | .. py:module:: biotransformers.version
 5 | 
 6 | 
 7 | Module Contents
 8 | ---------------
 9 | 
10 | .. data:: VERSION
11 |    :annotation: = 0.1.3
12 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/wrappers/esm_wrappers/index.rst.txt:
--------------------------------------------------------------------------------
  1 | :mod:`biotransformers.wrappers.esm_wrappers`
  2 | ============================================
  3 | 
  4 | .. py:module:: biotransformers.wrappers.esm_wrappers
  5 | 
  6 | .. autoapi-nested-parse::
  7 | 
  8 |    This script defines a class which inherits from the LanguageModel class, and is
  9 |    specific to the ESM model developed by FAIR (https://github.com/facebookresearch/esm).
 10 | 
 11 | 
 12 | 
 13 | Module Contents
 14 | ---------------
 15 | 
 16 | Classes
 17 | ~~~~~~~
 18 | 
 19 | .. autoapisummary::
 20 | 
 21 |    biotransformers.wrappers.esm_wrappers.ESMWrapper
 22 | 
 23 | 
 24 | 
 25 | 
 26 | Attributes
 27 | ~~~~~~~~~~
 28 | 
 29 | .. autoapisummary::
 30 | 
 31 |    biotransformers.wrappers.esm_wrappers.log
 32 |    biotransformers.wrappers.esm_wrappers.path_msa_folder
 33 | 
 34 | 
 35 | .. data:: log
 36 | 
 37 | 
 38 | 
 39 | 
 40 | .. data:: path_msa_folder
 41 | 
 42 | 
 43 | 
 44 | 
 45 | .. class:: ESMWrapper(model_dir: str, device: str)
 46 | 
 47 | 
 48 |    Bases: :py:obj:`biotransformers.wrappers.language_model.LanguageModel`
 49 | 
 50 |    Class that uses an ESM type of pretrained transformers model to evaluate
 51 |    a protein likelihood as well as other insights.
 52 | 
 53 |    .. method:: model(self) -> torch.nn.Module
 54 |       :property:
 55 | 
 56 |       Return torch model.
 57 | 
 58 | 
 59 |    .. method:: clean_model_id(self) -> str
 60 |       :property:
 61 | 
 62 |       Clean model ID (in case the model directory is not)
 63 | 
 64 | 
 65 |    .. method:: model_vocabulary(self) -> List[str]
 66 |       :property:
 67 | 
 68 |       Returns the whole vocabulary list
 69 | 
 70 | 
 71 |    .. method:: vocab_size(self) -> int
 72 |       :property:
 73 | 
 74 |       Returns the whole vocabulary size
 75 | 
 76 | 
 77 |    .. method:: mask_token(self) -> str
 78 |       :property:
 79 | 
 80 |       Representation of the mask token (as a string)
 81 | 
 82 | 
 83 |    .. method:: pad_token(self) -> str
 84 |       :property:
 85 | 
 86 |       Representation of the pad token (as a string)
 87 | 
 88 | 
 89 |    .. method:: begin_token(self) -> str
 90 |       :property:
 91 | 
 92 |       Representation of the beginning of sentence token (as a string)
 93 | 
 94 | 
 95 |    .. method:: end_token(self) -> str
 96 |       :property:
 97 | 
 98 |       Representation of the end of sentence token (as a string)
 99 | 
100 | 
101 |    .. method:: does_end_token_exist(self) -> bool
102 |       :property:
103 | 
104 |       Returns true if an end of sequence token exists
105 | 
106 | 
107 |    .. method:: token_to_id(self)
108 |       :property:
109 | 
110 |       Returns a function which maps tokens to IDs
111 | 
112 | 
113 |    .. method:: embeddings_size(self)
114 |       :property:
115 | 
116 |       Returns size of the embeddings
117 | 
118 | 
119 |    .. method:: process_sequences_and_tokens(self, sequences_list: List[str]) -> Dict[str, torch.Tensor]
120 | 
121 |       Function to transform tokens string to IDs; it depends on the model used
122 | 
123 | 
124 |    .. method:: _load_model(self, path_model: str, map_location=None)
125 | 
126 |       Load model.
127 | 
128 | 
129 |    .. method:: model_pass(self, model_inputs: Dict[str, torch.Tensor], batch_size: int, silent: bool = False, pba: ray.actor.ActorHandle = None) -> Tuple[torch.Tensor, torch.Tensor]
130 | 
131 |       Function which computes logits and embeddings based on a list of sequences,
132 |       a provided batch size and an inference configuration. The output is obtained
133 |       by computing a forward pass through the model ("forward inference")
134 | 
135 |       The data generator is not the same for the multi_gpus inference. We use a tqdm progress bar
136 |       that is updated by the worker. The progress bar is instantiated before ray.remote
137 | 
138 |       :param model_inputs: [description]
139 |       :type model_inputs: Dict[str, torch.tensor]
140 |       :param batch_size: size of the batch
141 |       :type batch_size: int
142 |       :param silent: display or not progress bar
143 |       :param pba: tqdm progress bar for ray actor
144 | 
145 |       :returns:         * logits [num_seqs, max_len_seqs, vocab_size]
146 |                         * embeddings [num_seqs, max_len_seqs+1, embedding_size]
147 |       :rtype: Tuple[torch.tensor, torch.tensor]
148 | 
149 | 
150 |    .. method:: get_alphabet_dataloader(self)
151 | 
152 |       Define an alphabet mapping for common method between
153 |       protbert and ESM
154 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/wrappers/index.rst.txt:
--------------------------------------------------------------------------------
 1 | :mod:`biotransformers.wrappers`
 2 | ===============================
 3 | 
 4 | .. py:module:: biotransformers.wrappers
 5 | 
 6 | 
 7 | Submodules
 8 | ----------
 9 | .. toctree::
10 |    :titlesonly:
11 |    :maxdepth: 1
12 | 
13 |    esm_wrappers/index.rst
14 |    language_model/index.rst
15 |    rostlab_wrapper/index.rst
16 |    transformers_wrappers/index.rst
17 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/wrappers/language_model/index.rst.txt:
--------------------------------------------------------------------------------
  1 | :mod:`biotransformers.wrappers.language_model`
  2 | ==============================================
  3 | 
  4 | .. py:module:: biotransformers.wrappers.language_model
  5 | 
  6 | .. autoapi-nested-parse::
  7 | 
  8 |    This script defines a generic template class for any language model.
  9 |    Both ESM and Rostlab language models should implement this class.
 10 | 
 11 | 
 12 | 
 13 | Module Contents
 14 | ---------------
 15 | 
 16 | Classes
 17 | ~~~~~~~
 18 | 
 19 | .. autoapisummary::
 20 | 
 21 |    biotransformers.wrappers.language_model.LanguageModel
 22 | 
 23 | 
 24 | 
 25 | 
 26 | .. class:: LanguageModel(model_dir: str, device)
 27 | 
 28 | 
 29 |    Bases: :py:obj:`abc.ABC`
 30 | 
 31 |    Class that implements a language model.
 32 | 
 33 |    .. method:: model_id(self) -> str
 34 |       :property:
 35 | 
 36 |       Model ID, as specified in the model directory
 37 | 
 38 | 
 39 |    .. method:: clean_model_id(self) -> str
 40 |       :property:
 41 | 
 42 |       Clean model ID (in case the model directory is not)
 43 | 
 44 | 
 45 |    .. method:: model_vocabulary(self) -> List[str]
 46 |       :property:
 47 | 
 48 |       Returns the whole vocabulary list
 49 | 
 50 | 
 51 |    .. method:: vocab_size(self) -> int
 52 |       :property:
 53 | 
 54 |       Returns the whole vocabulary size
 55 | 
 56 | 
 57 |    .. method:: mask_token(self) -> str
 58 |       :property:
 59 | 
 60 |       Representation of the mask token (as a string)
 61 | 
 62 | 
 63 |    .. method:: pad_token(self) -> str
 64 |       :property:
 65 | 
 66 |       Representation of the pad token (as a string)
 67 | 
 68 | 
 69 |    .. method:: begin_token(self) -> str
 70 |       :property:
 71 | 
 72 |       Representation of the beginning of sentence token (as a string)
 73 | 
 74 | 
 75 |    .. method:: end_token(self) -> str
 76 |       :property:
 77 | 
 78 |       Representation of the end of sentence token (as a string).
 79 | 
 80 | 
 81 |    .. method:: does_end_token_exist(self) -> bool
 82 |       :property:
 83 | 
 84 |       Returns true if an end of sequence token exists
 85 | 
 86 | 
 87 |    .. method:: token_to_id(self)
 88 |       :property:
 89 | 
 90 |       Returns a function which maps tokens to IDs
 91 | 
 92 | 
 93 |    .. method:: embeddings_size(self) -> int
 94 |       :property:
 95 | 
 96 |       Returns size of the embeddings
 97 | 
 98 | 
 99 |    .. method:: process_sequences_and_tokens(self, sequences_list: List[str]) -> Dict[str, torch.Tensor]
100 |       :abstractmethod:
101 | 
102 |       Function to transform tokens string to IDs; it depends on the model used
103 | 
104 | 
105 |    .. method:: model(self) -> torch.nn.Module
106 |       :property:
107 | 
108 |       Return torch model.
109 | 
110 | 
111 |    .. method:: _load_model(self, path: str)
112 |       :abstractmethod:
113 | 
114 |       Load model.
115 | 
116 | 
117 |    .. method:: model_pass(self, model_inputs: Dict[str, torch.tensor], batch_size: int, silent: bool = False, pba: ray.actor.ActorHandle = None) -> Tuple[torch.Tensor, torch.Tensor]
118 |       :abstractmethod:
119 | 
120 |       Function which computes logits and embeddings based on a dict of sequences
121 |       tensors, a provided batch size and an inference configuration. The output is
122 |       obtained by computing a forward pass through the model ("forward inference")
123 | 
124 |       :param model_inputs: [description]
125 |       :type model_inputs: Dict[str, torch.tensor]
126 |       :param batch_size: size of the batch
127 |       :type batch_size: int
128 |       :param silent: display or not progress bar
129 |       :param pba: tqdm progress bar for ray actor
130 | 
131 |       :returns:         * logits [num_seqs, max_len_seqs, vocab_size]
132 |                         * embeddings [num_seqs, max_len_seqs+1, embedding_size]
133 |       :rtype: Tuple[torch.tensor, torch.tensor]
134 | 
135 | 
136 |    .. method:: get_alphabet_dataloader(self)
137 |       :abstractmethod:
138 | 
139 |       Define an alphabet mapping for common method between
140 |       protbert and ESM
141 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/biotransformers/wrappers/rostlab_wrapper/index.rst.txt:
--------------------------------------------------------------------------------
  1 | :mod:`biotransformers.wrappers.rostlab_wrapper`
  2 | ===============================================
  3 | 
  4 | .. py:module:: biotransformers.wrappers.rostlab_wrapper
  5 | 
  6 | .. autoapi-nested-parse::
  7 | 
  8 |    This script defines a class which inherits from the LanguageModel class, and is
  9 |    specific to the Rostlab models (eg ProtBert and ProtBert-BFD) developed by
 10 |    hugging face
 11 |    - ProtBert: https://huggingface.co/Rostlab/prot_bert
 12 |    - ProtBert BFD: https://huggingface.co/Rostlab/prot_bert_bfd
 13 | 
 14 | 
 15 | 
 16 | Module Contents
 17 | ---------------
 18 | 
 19 | Classes
 20 | ~~~~~~~
 21 | 
 22 | .. autoapisummary::
 23 | 
 24 |    biotransformers.wrappers.rostlab_wrapper.RostlabWrapper
 25 | 
 26 | 
 27 | 
 28 | 
 29 | Attributes
 30 | ~~~~~~~~~~
 31 | 
 32 | .. autoapisummary::
 33 | 
 34 |    biotransformers.wrappers.rostlab_wrapper.log
 35 | 
 36 | 
 37 | .. data:: log
 38 | 
 39 | 
 40 | 
 41 | 
 42 | .. class:: RostlabWrapper(model_dir: str, device)
 43 | 
 44 | 
 45 |    Bases: :py:obj:`biotransformers.wrappers.language_model.LanguageModel`
 46 | 
 47 |    Class that uses a rostlab type of pretrained transformers model to evaluate
 48 |    a protein likelihood as well as other insights.
 49 | 
 50 |    .. method:: model(self) -> torch.nn.Module
 51 |       :property:
 52 | 
 53 |       Return torch model.
 54 | 
 55 | 
 56 |    .. method:: clean_model_id(self) -> str
 57 |       :property:
 58 | 
 59 |       Clean model ID (in case the model directory is not)
 60 | 
 61 | 
 62 |    .. method:: model_vocabulary(self) -> List[str]
 63 |       :property:
 64 | 
 65 |       Returns the whole vocabulary list
 66 | 
 67 | 
 68 |    .. method:: vocab_size(self) -> int
 69 |       :property:
 70 | 
 71 |       Returns the whole vocabulary size
 72 | 
 73 | 
 74 |    .. method:: mask_token(self) -> str
 75 |       :property:
 76 | 
 77 |       Representation of the mask token (as a string)
 78 | 
 79 | 
 80 |    .. method:: pad_token(self) -> str
 81 |       :property:
 82 | 
 83 |       Representation of the pad token (as a string)
 84 | 
 85 | 
 86 |    .. method:: begin_token(self) -> str
 87 |       :property:
 88 | 
 89 |       Representation of the beginning of sentence token (as a string)
 90 | 
 91 | 
 92 |    .. method:: end_token(self) -> str
 93 |       :property:
 94 | 
 95 |       Representation of the end of sentence token (as a string).
 96 | 
 97 | 
 98 |    .. method:: does_end_token_exist(self) -> bool
 99 |       :property:
100 | 
101 |       Returns true if an end of sequence token exists
102 | 
103 | 
104 |    .. method:: token_to_id(self)
105 |       :property:
106 | 
107 |       Returns a function which maps tokens to IDs
108 | 
109 | 
110 |    .. method:: embeddings_size(self) -> int
111 |       :property:
112 | 
113 |       Returns size of the embeddings
114 | 
115 | 
116 |    .. method:: _load_model(self, path_model: str, map_location=None)
117 | 
118 |       Load model.
119 | 
120 | 
121 |    .. method:: process_sequences_and_tokens(self, sequences_list: List[str]) -> Dict[str, torch.tensor]
122 | 
123 |       Function to transform tokens string to IDs; it depends on the model used
124 | 
125 | 
126 |    .. method:: model_pass(self, model_inputs: Dict[str, torch.tensor], batch_size: int, silent: bool = False, pba: ray.actor.ActorHandle = None) -> Tuple[torch.Tensor, torch.Tensor]
127 | 
128 |       Function which computes logits and embeddings based on a dict of sequences
129 |       tensors, a provided batch size and an inference configuration. The output is
130 |       obtained by computing a forward pass through the model ("forward inference")
131 | 
132 |       :param model_inputs: [description]
133 |       :type model_inputs: Dict[str, torch.tensor]
134 |       :param batch_size: size of the batch
135 |       :type batch_size: int
136 |       :param silent: display or not progress bar
137 |       :param pba: tqdm progress bar for ray actor
138 | 
139 |       :returns:         * logits [num_seqs, max_len_seqs, vocab_size]
140 |                         * embeddings [num_seqs, max_len_seqs+1, embedding_size]
141 |       :rtype: Tuple[torch.tensor, torch.tensor]
142 | 
143 | 
144 |    .. method:: get_alphabet_dataloader(self)
145 | 
146 |       Define an alphabet mapping for common method between
147 |       protbert and ESM
148 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/autoapi/index.rst.txt:
--------------------------------------------------------------------------------
 1 | API Reference
 2 | =============
 3 | 
 4 | This page contains auto-generated API reference documentation [#f1]_.
 5 | 
 6 | .. toctree::
 7 |    :titlesonly:
 8 | 
 9 |    /autoapi/biotransformers/index
10 | 
11 | .. [#f1] Created with `sphinx-autoapi <https://github.com/readthedocs/sphinx-autoapi>`_
12 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/contributing/CHANGELOG.md.txt:
--------------------------------------------------------------------------------
  1 | # Change log
  2 | 
  3 | # [0.1.0] - 2021-06-22
  4 | 
  5 | Note on the release
  6 | 
  7 | Breaking change:
  8 | 
  9 | - Remove `multi_gpu` and `device` arguments. Replace by n_gpus.
 10 | 
 11 | Features:
 12 | 
 13 | - Add Ray parallelization to shorten inference time
 14 | - Add Dockerfile
 15 | 
 16 | Docs:
 17 | 
 18 |  - Improve documentation
 19 | 
 20 | Fixed:
 21 | 
 22 | - fix `tokens_list` argument: all sequences' token must be in tokens_list.
 23 | 
 24 | # [0.0.10] - 2021-06-14
 25 | 
 26 | Note on the release
 27 | 
 28 | Features:
 29 | 
 30 | - Add BIO_LOG_LEVEL environment variable to control logging messages (logger)
 31 | - Check if every unique amino acids in sequences are in tokens_list (compute_probabilities)
 32 | 
 33 | Fixed:
 34 | 
 35 | - Add shuffling in batch_sampler (lightning_utils)
 36 | - Fix tokens argument for dataloader (lightning_utils)
 37 | 
 38 | Changed:
 39 | 
 40 | - Modified the signature of some functions to improve clarity (transformers_wrappers)
 41 | - Update `train_masked` method to `finetune` (transformers_wrappers)
 42 | - `compute_embeddings` with option `full` returns a list of embeddings, no matter the size (transformers_wrappers)
 43 | 
 44 | Removed:
 45 | 
 46 | - Remove the tokens_list argument when not necessary and try to make its usage clearer (transformers_wrappers)
 47 | 
 48 | - Remove functions (transformers_wrappers):
 49 | 
 50 |     - _filter_and_pool_embeddings
 51 |     - _split_logits
 52 |     -  _slabels_remaping
 53 |     - _filter_logits
 54 |     -  _filter_loglikelihood
 55 |     - _compute_accuracy
 56 |     - _compute_calibration
 57 | 
 58 | 
 59 | # [0.0.9] - 2021-06-04
 60 | 
 61 | Fixed:
 62 |  - Batch_sampler issue
 63 | 
 64 | # [0.0.8] - 2021-06-03
 65 | Note on the release
 66 | 
 67 | Features:
 68 |  - Merge ESM/protbert for finetuning model with pytorch-lightning
 69 |  - Possibility to restore a training session.
 70 | 
 71 | Fixed:
 72 |  - Fix conflicts when saving model with DDP
 73 |  - Fix loading checkpoint created by pytorch-lightning
 74 | 
 75 | 
 76 | # [0.0.7] - 2021-05-12
 77 | Note on the release
 78 | 
 79 | Features:
 80 |  - Add fasta files support for each compute function.
 81 |  - Add train_masked function to finetune model on custom dataset. (Only ESM for the moment, protbert is coming.)
 82 | 
 83 | Docs:
 84 |  - Update documentation to add tutorial on training.
 85 | 
 86 | Changed:
 87 |  - GPU is used by default if found, even if not specified.
 88 | 
 89 | # [0.0.6] - 2021-05-24
 90 | Note on the release
 91 | 
 92 | Fixed:
 93 |  - Update torch dependencies to be less restrictive. Create conflict with other packages.
 94 | 
 95 | # [0.0.5] - 2021-05-12
 96 | 
 97 | Note on the release
 98 | 
 99 | Added
100 |  - added multi-gpu support for inference
101 |  - added function to finetune a model on a specific dataset on multi-gpu
102 | 
103 | Changed
104 | 
105 | Fixed
106 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/contributing/CONTRIBUTING.md.txt:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | In order to contribute to this repository you will need developer access to this repo. To know more about the project go to the [README](README.md) first.
 4 | 
 5 | 
 6 | ## Install Dev environment
 7 | 
 8 | From the root of this repo, run
 9 | `conda env create -f environment_dev.yaml`
10 | 
11 | ## Pre-commit hooks
12 | 
13 | Pre-commits hooks have been configured for this project using the [pre-commit](https://pre-commit.com/) library:
14 | 
15 | - [black](https://github.com/psf/black) python formatter
16 | - [flake8](https://flake8.pycqa.org/en/latest/) python linter
17 | - [isort](https://pypi.org/project/isort/) sorts imports
18 | 
19 | To get them going on your side, make sure to have python installed, and run the following
20 | commands from the root directory of this repository:
21 | 
22 | ```bash
23 | pip install pre-commit
24 | pre-commit install
25 | pre-commit run --all-files
26 | ```
27 | 
28 | ## Coding conventions
29 | 
30 | Please respect the following conventions to contribute to the code:
31 | 
32 | - Use hard wrap at 88
33 | - Respect black, isort and flake8 conventions
34 | - Classes' names are Camel case (example: MyClass)
35 | - Functions and variables are in lower case with _ as separator (example: my_function, my_var)
36 | - Names are explicit: avoid mathematical notations, functions' names start with a verb
37 | - Use python typing library: each class and method should be typed (both for inputs and outputs)
38 | - Create custom types if needed
39 | - All classes and functions should have a docstring
40 | - Avoid repeating arguments and returns in docstring (should be explicit with the types) except when it is truly necessary
41 | - A function (or a class) does not take more than 5 arguments, if you need more create a data class
42 | - Avoid dictionaries to pass arguments when possible and prefer dataclasses instead
43 | - Repeat inputs names when calling a function: ex: compute_custom(arg1=arg1, arg2=my_arg2)
44 | - Use list comprehension when it is possible
45 | - Use f strings to add variables in strings: ex: print(f'my var value is {my_var}')
46 | - Use pathlib to handle paths
47 | - Prefer shutil to os to manage files/ folders creations and deletions
48 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/documentation/course.md.txt:
--------------------------------------------------------------------------------
 1 | # Getting started with transformers
 2 | 
 3 | If you want to know more about the transformers architecture, have a look at:
 4 | - [The illustrated transformers](http://jalammar.github.io/illustrated-transformer/)
 5 | - [The annotated transformers](https://nlp.seas.harvard.edu/2018/04/03/attention.html?s=09)
 6 | 
 7 | ## Bio-transformers
 8 | 
 9 | If you want to understand the secret of how to train deep-learning models on proteins, the two best repos are:
10 | - [ProtTrans repo](https://github.com/agemagician/ProtTrans)
11 | - [ESM repo](https://github.com/facebookresearch/esm)
12 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/documentation/logging.md.txt:
--------------------------------------------------------------------------------
 1 | # Logging
 2 | 
 3 | When running bio-transformers, printed messages are entirely controlled by bio-transformers code.
 4 | The log level controls which types of log messages would be printed.
 5 | 
 6 | bio-transformers uses the Python module [`logging`](https://docs.python.org/3/library/logging.html) to log the messages. The log level is controlled by the environment variable BIO_LOG_LEVEL. The levels are given in the table below. The default level is “2”.
 7 | 
 8 | To adjust the logging level, you can export the environment variable:
 9 | 
10 |   ```bash
11 |   export BIO_LOG_LEVEL=1
12 |   ```
13 | 
14 | | BIO_LOG_LEVEL     | Behavior                                                                                   |
15 | | ----------------- | ------------------------------------------------------------------------------------------ |
16 | | "0"               | Log all messages, equivalent to `logging.DEBUG`. Same as log level "1".                    |
17 | | "1"               | Log all messages, equivalent to `logging.DEBUG`.                                           |
18 | | "2"               | Log all messages except DEBUG, equivalent to `logging.INFO`. (default)                     |
19 | | "3"               | Log all messages except DEBUG and INFO, equivalent to `logging.WARNING`.                   |
20 | | "4"               | Log all messages except DEBUG, INFO, and WARNING, equivalent to `logging.ERROR`.           |
21 | | "5"               | Log all messages except DEBUG, INFO, WARNING, and ERROR, equivalent to `logging.CRITICAL`. |
22 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/documentation/msa.md.txt:
--------------------------------------------------------------------------------
 1 | # MSA
 2 | 
 3 | ## What is an MSA?
 4 | 
 5 | An MSA (multiple sequence alignment) is a file that contains a sequence of amino acids and a number of aligned variant sequences. The aim is to store information about the evolution of the main sequence. Using this for training makes it possible to reduce the number of model parameters, and to use evolutionary information in the model, as explained in the [MSA Transformers paper](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1.full.pdf).
 6 | 
 7 | The MSA are generated using [hh-suite](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3019-7) tools, searching similar sequences in the UniClust30 database.
 8 | 
 9 | ```{note}
10 | The generated MSA must have the .a3m extension to be used in `bio-transformers`
11 | ```
12 | 
13 | Below, you can see an example of an MSA file. The first line is the main sequence. The following sequences are the most probable variant sequences. The `-` character in the MSA sequences corresponds to the alignment character.
14 | 
15 | ```bash
16 | >> head msa_example.a3m
17 | ```
18 | 
19 | ```bash
20 | >sp|O24396|PURA_WHEAT Adenylosuccinate synthetase, chloroplastic (Fragment) OS=Triticum aestivum OX=4565 PE=1 SV=1
21 | AAAAAGRGRSFSPAAPAPSSVRLPGRQAPAPAAASALAVEADPAADRVSSLSQVSGVLGSQWGDEGKGKLVDVLAPRFDIVARCQGGANAGHTIYNSEGKKFALHLVPSGILHEGTLCVVGNGAVIHVPGFFGEIDGLQSNGVSCDGRILVSDRAHLLFDLHQTVDGLREAELANSFIGTTKRGIGPCYSSKVTRNGLRVCDLRHMDTFGDKLDVLFEDAAARFEGFKYSKGMLKEEVERYKRFAERLEPFIADTVHVLNESIRQKKKILVEGGQATMLDIDFGTYPFVTSSSPSAGGICTGLGIAPRVIGDLIGVVKAYTTRVGSGPFPTELLGEEGDVLRKAGMEFGTTTGRPRRCGWLDIVALKYCCDINGFSSLNLTKLDVLSGLPEIKLGVSYNQMDGEKLQSFPGDLDTLEQVQVNYEVLPGWDSDISSVRSYSELPQAARRYVERIEELAGVPVHYIGVGPGRDALIYK
22 | >UniRef100_A0A0N4UWB0 Adenylosuccinate synthetase n=1 Tax=Enterobius vermicularis TaxID=51028 RepID=A0A0N4UWB0_ENTVE
23 | --------------------------------------------MNDQKRKAPVIVILGAQFGDEGKGKIVDFLIEKekIQLTARCQGGNNAGHTVV-VNGRKSDFHLLPTGIINEDCYNIIGNGVVVNLDALFKEIEHNEIDKLNgWEKRLMISELAHLVTSMHMQADGQQEKSLSSEKIGTTSKGIGPTYSTKCFRNGIRVGELlGDFEAFSAKFRSLAAFYLKQFPGIEVN---VEEELDNYKKHAVCLKRLgiVGDTITYLDEMRAQGKAILVEGANGAMLDIDFGsflytffchsgTYPFVTSSNATVGGAVTGLGIPPTAITEIIGVVKAYETRVGSGPFPTEQQGKIGEDLQSIGHEVGVTTGRKRRCGWLDLFLLKRSSVINGFTALALTKLDILDNFDEIKVATGYR-IDGKSLKAPPSCAADWSRIELEYKTFSGWKDDVSKIRSFNELPENCKTYVKFIEGFVGVPIKWIGVGEDREALIVM
24 | >UniRef100_A0A139AVD1 Adenylosuccinate synthetase n=1 Tax=Gonapodya prolifera (strain JEL478) TaxID=1344416 RepID=A0A139AVD1_GONPJ
25 | -----------------------------------------ATG-------NKAVVVLGAQWGDEGKGKLVDILTQQADLVARCQGGNNAGHTIV-VDGVKFDFHMLPSGLLGaPSTVSLVGSGVVLHLPSFFEEVKKTESKGVSCANRLFVSDRCHLVFDLHQIVDGLKEGELAShkQEIGTTKKGIGPAYSSKASRGGVRVHHLiaPDFAEFESRFRQMAANKKRRYGDFPYD---VDAEVERYRQYRDLIRPYVVDSVTYVHKALQEGKRVLVEGANAVMLDIDFGTFPYVTSSNTTIGGVCTGLGLPPKSIGKVIGVVKAYTTRVGAGPFPTEQLNEVGEHLQTVGAEFGVTTGRKRRCGWLDAAVLRWSHMINGYDSINLTKLDILDGLPTLRIGIAYKHrATGQVYETFPADLHLLEECDVIYEELPGWKESIGGCKSWDALPENARKYVERIEQLVGVNVEYIGVGVSRDSMITK
26 | ```
27 | 
28 | ```{caution}
29 | `finetune` method for MSA transformers and `pass_mode=masked` are not available.
30 | ```
31 | 
32 | ## How to use msa-transformers?
33 | 
34 | The library supports `esm_msa1_t12_100M_UR50S` backend which is based on the [MSA Transformers papers](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1.full.pdf).
35 | 
36 | Instead of passing a list of sequences or the path of a fasta file, you can pass the path to the folder where the .a3m files are stored.
37 | 
38 | ```python
39 | from biotransformers import BioTransformers
40 | 
41 | msa_folder = "msa_folder"
42 | bio_trans = BioTransformers("esm_msa1_t12_100M_UR50S",num_gpus=1)
43 | msa_embeddings = bio_trans.compute_embeddings(sequences=msa_folder, pool_mode=("cls","mean"), n_seqs_msa=128)
44 | ```
45 | 
46 | As an MSA is composed of multiple sequences, the results have an extra-dimension, which corresponds to all the aligned sequences.
47 | 
48 | ```python
49 | msa_embeddings['cls'].shape
50 | ```
51 | 
52 | If 100 msa files are in the folder, we have the following dimension for the embeddings. We take the `<CLS>` token of each of the 128 sequences in the files.
53 | 
54 | ```python
55 | >> (100, 128, 768)
56 | ```
57 | 
58 | ```{caution}
59 | All files in the msa folder must have at least `n_seqs_msa` sequences in each MSA. If you want to remove files with fewer than `n_seqs_msa` sequences, you can use the `biotransformers.utils.msa_utils.msa_to_remove` function.
60 | ```
61 | 
62 | ```python
63 | msa_to_remove("data_msa_sample/", n_seq=128)
64 | ```
65 | 
66 | ```python
67 | >>
68 | 
69 | 3/8 have insufficient number of sequences in MSA.
70 | ['data/data_msa_sample/seq130_swissprot.a3m',
71 |  'data/data_msa_sample/seq109_swissprot.a3m',
72 |  'data/data_msa_sample/seq124_swissprot.a3m']
73 | ```
74 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/documentation/multi_gpus.md.txt:
--------------------------------------------------------------------------------
 1 | # Multi-gpus
 2 | 
 3 | ```{note}
 4 | These changes have been introduced in ``bio-transformers`` v0.0.11.
 5 | ```
 6 | 
 7 | The use of ``torch.nn.DataParallel`` is strongly [discouraged](https://pytorch.org/docs/stable/notes/cuda.html#cuda-nn-ddp-instead); as a consequence, ``bio-transformers`` relies on [ray](https://docs.ray.io/en/master/?badge=master#) to distribute the compute on multiple GPUs. This parallelization scales far better, with performance increasing with the number of GPUs.
 8 | 
 9 | Ray is used only when the ``num_gpus>1``. See the difference below:
10 | 
11 | ```{important}
12 | Note that ray parallelization is only used for inference functions. The `finetune` method uses pytorch-lightning with its built-in functions to train a model.
13 | ```
14 | 
15 | ```python
16 | from biotransformers import BioTransformers
17 | import ray
18 | 
19 | ray.init()
20 | 
21 | sequences = [...]
22 | bio_trans = BioTransformers("esm1b_t33_650M_UR50S",num_gpus=4)
23 | embeddings = bio_trans.compute_embeddings(sequences, pool_mode=("cls","mean"), batch_size=8)
24 | ```
25 | 
26 | ```{note}
27 | You don't have to use ``ray.init()`` when num_gpus=1
28 | ```
29 | 
30 | ## Configure GPU environment variable
31 | 
32 | Sometimes it can be useful to specify which GPU you want to use. It can be done in the terminal or at the beginning of the script. You just have to export the GPU index you want to use.
33 | 
34 | For example, if you have 8 GPUs but you just want to use 3 of them (0,5,6):
35 | 
36 | ```bash
37 | export CUDA_VISIBLE_DEVICES="0,5,6"
38 | ```
39 | 
40 | or
41 | 
42 | ```python
43 | import os
44 | os.environ["CUDA_VISIBLE_DEVICES"]="0,5,6"
45 | ```
46 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/getting_started/install.rst.txt:
--------------------------------------------------------------------------------
 1 | ============
 2 | Installation
 3 | ============
 4 | 
 5 | Bio-transformers can be installed in Python 3.7 and external python dependencies are mainly defined in `requirements`_.
 6 | There are multiple different methods to install Bio-transformers:
 7 | 
 8 | 1. Clone `Bio-transformers`_ and create a virtual environment using `Anaconda`_ / `Miniconda`_ (**recommended**).
 9 | 2. Clone `Bio-transformers`_ and build a docker image using the provided docker file. (**not implemented**)
10 | 3. Install directly from PyPI release without cloning `Bio-transformers`_.
11 | 
12 | 
13 | 
14 | Install torch/cuda
15 | ------------------
16 | 
17 | .. WARNING:: ``bio-transformers`` doesn't manage the installation of cuda toolkit and torch gpu version.
18 | 
19 | If you want to find a specific version of torch based on your CUDA setup, please refer to this `page <https://pytorch.org/get-started/previous-versions/>`_
20 | 
21 | The Dockerfile provided in the `github repository <https://github.com/DeepChainBio/bio-transformers>`_ relies on :
22 |     - `pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html`
23 | 
24 | Install in conda environment
25 | ----------------------------
26 | The recommended method is to install Bio-transformers in a dedicated virtual
27 | environment using `Anaconda`_ / `Miniconda`_.
28 | 
29 | 
30 | .. code:: bash
31 | 
32 |     conda create --name bio-transformers python=3.7 -y
33 |     conda activate bio-transformers
34 |     pip install bio-transformers
35 | 
36 | .. _Quick Start: quick_start.html
37 | .. _Anaconda: https://docs.anaconda.com/anaconda/install
38 | .. _Miniconda: https://docs.conda.io/en/latest/miniconda.html
39 | .. _Bio-transformers: https://github.com/DeepChainBio/bio-transformers
40 | .. _requirements: https://github.com/DeepChainBio/bio-transformers/blob/main/requirements.txt
41 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/getting_started/quick_start.md.txt:
--------------------------------------------------------------------------------
 1 | # Quick Start
 2 | 
 3 | ## Display available backend
 4 | 
 5 | ```python
 6 | from biotransformers import BioTransformers
 7 | BioTransformers.list_backend()
 8 | 
 9 | >>
10 |     *   esm1_t34_670M_UR100
11 |     *   esm1_t6_43M_UR50S
12 |     *   esm1b_t33_650M_UR50S
13 |     *   esm_msa1_t12_100M_UR50S
14 |     *   protbert
15 |     *   protbert_bfd
16 | ```
17 | 
18 | ## Compute embeddings on gpu
19 | 
20 | Please refer to the [multi-gpus section](https://bio-transformers.readthedocs.io/en/develop/documentation/multi_gpus.html) to have a full understanding of the functionality.
21 | 
22 | ```python
23 | import ray
24 | 
25 | sequences = [
26 |         "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
27 |         "RSKEPVSGFDLIRDHISQTGMPPTRAEIARSKEPVSGRKGVIEIVSGASRGIRLLQEE",
28 |         "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
29 |         "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
30 |     ]
31 | 
32 | ray.init()
33 | bio_trans = BioTransformers(backend="protbert", num_gpus=4)
34 | embeddings = bio_trans.compute_embeddings(sequences, pool_mode=('cls','mean'))
35 | 
36 | cls_emb = embeddings['cls']
37 | mean_emb = embeddings['mean']
38 | ```
39 | 
40 | where:
41 | 
42 | - pool_mode: kind of aggregation functions to be used. 'cls' returns the `<CLS>` token embedding used for classification. 'mean' computes the mean of all the tokens of a sequence.
43 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/index.rst.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | Bio-transformers : Documentation and Tutorial
 3 | =============================================
 4 | 
 5 | .. Caution:: Bio-transformers introduces breaking changes replacing ``device`` and ``multi_gpu`` arguments by ``num_gpus``. Multi-GPU inference is now managed with ``ray``, which leverages the full computational capacity of each GPU in contrast to ``torch.DataParallel``
 6 | 
 7 | bio-transformers is a python wrapper on top of the ESM/Protbert models,
 8 | which are Transformer protein language models, trained on millions of proteins and used to predict embeddings.
 9 | This package provides other functionalities that you can use to build apps thanks to `deepchain-apps <https://deepchain-apps.readthedocs.io/en/latest/index.html>`_
10 | 
11 | Features
12 | --------
13 | 
14 | .. Note:: Bio-transformers now uses `Ray <https://docs.ray.io/en/master/?badge=master#>`_ to manage multi-GPU inference.
15 | 
16 | Bio-transformers extends and simplifies workflows for manipulating amino acids sequences with Pytorch, and can be
17 | used to test several pre-trained transformers models without taking into account the syntax specificity of different models.
18 | 
19 | The main features are:
20 |    - ``compute_loglikelihood``
21 |    - ``compute_probabilities``
22 |    - ``compute_embeddings``
23 |    - ``compute_accuracy``
24 |    - ``finetune``
25 | 
26 | Our development and all related work involved in the project is public,
27 | and released under the Apache 2.0 license.
28 | 
29 | 
30 | Contributors
31 | ------------
32 | 
33 | Bio-transformers is a package belonging to the DeepChainBio repository, maintained by a team of
34 | developers and researchers at Instadeep.
35 | 
36 | 
37 | .. toctree::
38 |    :hidden:
39 |    :maxdepth: 2
40 |    :caption: Getting Started
41 | 
42 |    getting_started/install
43 |    getting_started/quick_start
44 | 
45 | .. toctree::
46 |    :hidden:
47 |    :maxdepth: 0
48 |    :caption: Documentation
49 | 
50 |    documentation/course
51 |    documentation/logging
52 |    documentation/multi_gpus
53 |    documentation/msa
54 | 
55 | .. toctree::
56 |    :hidden:
57 |    :maxdepth: 0
58 |    :caption: Tutorial
59 | 
60 |    tutorial/loglikelihood
61 |    tutorial/embeddings
62 |    tutorial/finetuning
63 | 
64 | .. toctree::
65 |    :hidden:
66 |    :maxdepth: 2
67 |    :caption: Api reference
68 | 
69 |    api/biotransformers
70 | 
71 | .. toctree::
72 |    :hidden:
73 |    :maxdepth: 0
74 |    :caption: Contributing
75 | 
76 |    contributing/CONTRIBUTING
77 | 
78 | .. _documentation: documentation/course.html
79 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/tutorial/embeddings.md.txt:
--------------------------------------------------------------------------------
 1 | # Embeddings
 2 | 
 3 | The library allows you to easily compute embeddings with a specific model in the backend.
 4 | 
 5 | ```python
 6 | from biotransformers import BioTransformers
 7 | 
 8 | sequences = [...]
 9 | bio_trans = BioTransformers("esm1b_t33_650M_UR50S",num_gpus=1)
10 | embeddings = bio_trans.compute_embeddings(sequences, pool_mode=("cls","mean"), batch_size=8)
11 | ```
12 | 
13 | By default, the `pool_mode` argument contains 3 modes:
14 | 
15 | - `cls` : return the `<CLS>` token embedding in the sequence.
16 | - `mean` : if a sequence has shape (num_token, embedding_size), the num_token dimension is averaged and the embedding has shape (embedding_size,)
17 | - `full` : no pooling function applied, all the embeddings for each sequence are returned.
18 | 
19 | ## Multi-gpu inference
20 | 
21 | If you want to run the inference on several GPUs, you have to initialize ray as below to instantiate multiple workers.
22 | 
23 | ```{tip}
24 | batch_size corresponds to the number of sequences that you want to distribute on each GPU.
25 | ```
26 | 
27 | ```python
28 | from biotransformers import BioTransformers
29 | import ray
30 | 
31 | ray.init()
32 | sequences = [...]
33 | bio_trans = BioTransformers("esm1b_t33_650M_UR50S",num_gpus=4)
34 | embeddings = bio_trans.compute_embeddings(sequences, pool_mode=("cls","mean"), batch_size=8)
35 | ```
36 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/tutorial/finetuning.md.txt:
--------------------------------------------------------------------------------
  1 | # Finetuning
  2 | 
  3 | ## How to finetune a model?
  4 | 
  5 | `bio-transformers` uses pytorch-lightning to easily load a pre-trained model and finetune it on your own datasets. The `finetune` method automatically scales on your visible GPUs to train in parallel thanks to the different accelerators.
  6 | 
  7 | It is strongly recommended to use the `DDP` accelerator for training : [ddp](https://pytorch.org/docs/stable/notes/ddp.html). You should know that `DDP` will launch several python instances; as a consequence, a model should be finetuned in a separate script, and not be mixed with inference functions like `compute_loglikelihood` or `compute_embeddings`, to avoid GPU conflicts.
  8 | 
  9 | The model will be finetuned by randomly masking a proportion of the amino acids in a sequence, as is commonly done in most state-of-the-art papers. By default, 15% of amino acids will be masked.
 10 | 
 11 | ```{caution}
 12 | This method is developed to be run on GPU, please take care to have the proper CUDA installation. Refer to this section for more information.
 13 | ```
 14 | 
 15 | Do not train a model with the `DDP` **accelerator** in a notebook. Do not mix training and inference functions like `compute_accuracy` or `compute_loglikelihood` in the same script, except with the `DP` accelerator.
 16 |  With `DDP`, load the finetuned model in a separate script like below.
 17 | 
 18 | ```python
 19 | from biotransformers import BioTransformers
 20 | 
 21 | bio_trans = BioTransformers("esm1_t6_43M_UR50S", num_gpus=1)
 22 | bio_trans.load_model("logs/finetune_masked/version_X/esm1_t6_43M_UR50S_finetuned.pt")
 23 | acc_after = bio_trans.compute_accuracy(..., batch_size=32)
 24 | ```
 25 | 
 26 | ## Parameters
 27 | 
 28 | The function can handle a fasta file or a list of sequences directly:
 29 | 
 30 | - **train_sequences**: Can be a list of sequences or the path of a fasta file with SeqRecords.
 31 | 
 32 | The following arguments are important for the training:
 33 | 
 34 | - **lr**: the default learning rate (keep it low : < 5e-4)
 35 | - **warmup_updates**:  the number of steps (optimizer steps, not epochs) to do while increasing the learning rate from **warmup_init_lr** to **lr**.
 36 | - **epochs** :  number of epoch for training. Defaults to 10.
 37 | - **batch_size** :  This size is only used internally to compute the **accumulate_grad_batches** for gradient accumulation (TO BE UPDATED). The **toks_per_batch** will dynamically determine the number of sequences in a batch, in order to avoid GPU saturation.
 38 | - **acc_batch_size** : Number of batches to consider before computing the gradient.
 39 | 
 40 | Three arguments allow you to customize the masking function used for building the training dataset:
 41 | 
 42 | - **masking_ratio** : ratio of tokens to be masked. Defaults to 0.025.
 43 | - **random_token_prob** : the probability that the chosen token is replaced with a random token.
 44 | - **masking_prob**: the probability that the chosen token is replaced with a mask token.
 45 | 
 46 | All the results will be saved in logs directory:
 47 | 
 48 | - **logs_save_dir**: Directory where the logs are saved. Defaults to logs.
 49 | - **logs_name_exp**: Name of the experiment in the logs.
 50 | - **checkpoint**: Path to a checkpoint file to restore training session.
 51 | - **save_last_checkpoint**: Save the last checkpoint and the 2 best trained models
 52 | to restore the training session. Takes a large amount of time and memory.
 53 | 
 54 | ## Example : training script
 55 | 
 56 | Training on some swissprot sequences. Training only works on GPU.
 57 | 
 58 | ```python
 59 | import biodatasets
 60 | import numpy as np
 61 | from biotransformers import BioTransformers
 62 | import ray
 63 | 
 64 | data = biodatasets.load_dataset("swissProt")
 65 | X, y = data.to_npy_arrays(input_names=["sequence"])
 66 | X = X[0]
 67 | 
 68 | # Train on small sequence
 69 | length = np.array(list(map(len, X))) < 200
 70 | train_seq = X[length][:15000]
 71 | 
 72 | ray.init()
 73 | bio_trans = BioTransformers("esm1_t6_43M_UR50S", num_gpus=4)
 74 | 
 75 | bio_trans.finetune(
 76 |     train_seq,
 77 |     lr=1.0e-5,
 78 |     warmup_init_lr=1e-7,
 79 |     toks_per_batch=2000,
 80 |     epochs=20,
 81 |     batch_size=16,
 82 |     acc_batch_size=256,
 83 |     warmup_updates=1024,
 84 |     accelerator="ddp",
 85 |     checkpoint=None,
 86 |     save_last_checkpoint=False,
 87 | )
 88 | ```
 89 | 
 90 | ## Example : evaluation script
 91 | 
 92 | You can easily assess the quality of your finetuning by using the provided functions such as `compute_accuracy`.
 93 | 
 94 | ```python
 95 | import biodatasets
 96 | import numpy as np
 97 | from biotransformers import BioTransformers
 98 | import ray
 99 | 
100 | 
101 | data = biodatasets.load_dataset("swissProt")
102 | X, y = data.to_npy_arrays(input_names=["sequence"])
103 | X = X[0]
104 | 
105 | # Train sequence with length less than 200 AA
106 | # Test on sequence that was not used for training.
107 | length = np.array(list(map(len, X))) < 200
108 | train_seq = X[length][15000:20000]
109 | 
110 | ray.init()
111 | bio_trans = BioTransformers("esm1_t6_43M_UR50S", num_gpus=4)
112 | acc_before = bio_trans.compute_accuracy(train_seq, batch_size=32)
113 | print(f"Accuracy before finetuning : {acc_before}")
114 | ```
115 | 
116 | ```python
117 | >> Accuracy before finetuning : 0.46
118 | ```
119 | 
120 | ```python
121 | bio_trans.load_model("logs/finetune_masked/version_X/esm1_t6_43M_UR50S_finetuned.pt")
122 | acc_after = bio_trans.compute_accuracy(train_seq, batch_size=32)
123 | print(f"Accuracy after finetuning : {acc_after}")
124 | ```
125 | 
126 | ```python
127 | >> Accuracy after finetuning : 0.76
128 | ```
129 | 


--------------------------------------------------------------------------------
/docs/source/_build/_sources/tutorial/loglikelihood.md.txt:
--------------------------------------------------------------------------------
 1 | # Loglikelihood
 2 | 
 3 | The protein loglikelihood is a metric which estimates the joint probability of
 4 | observing a given sequence of amino-acids. The idea behind such an estimator is to approximate the
 5 | probability that a mutated protein will be “natural”, and can effectively be produced by a cell.
 6 | 
 7 | These metrics rely on transformer language models. These models are trained to predict a “masked” amino-acid in a sequence.
 8 | As a consequence, they can provide us an estimate of the probability of observing an amino-acid given the “context” (the surrounding amino-acids).
 9 | By multiplying individual probabilities computed for a given amino-acid given its context, we obtain a pseudo-likelihood, which can be a candidate estimator to approximate a sequence stability.
10 | 
11 | ```python
12 | from biotransformers import BioTransformers
13 | import ray
14 | 
15 | ray.init()
16 | bio_trans = BioTransformers(backend="protbert",num_gpus=2)
17 | 
18 | sequences = [
19 |         "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
20 |         "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE",
21 |         "RQQEVFDLIQQEVFDLIQQEVFIRDAQRLGFRQQEVFDLIRDHISQTGMPPTRAALARKGVIEIVSGASRGIRLLQEE",
22 |         "QEEVFDLIQQEVFDLIRDHISQTGMPPTRAMPPTRAEIAQQARKGVIEIVSGASRGIRLLQEE"
23 |     ]
24 | 
25 | loglikelihood = bio_trans.compute_loglikelihood(sequences, batch_size=2)
26 | ```
27 | 
28 | ## Different pass mode
29 | 
30 | For each provided method, you can run the computation in a ``forward`` mode or in a ``masked`` mode. The latter is
31 | slower as we have to mask and compute the probabilities for each masked amino acid.
32 | 
33 | ```python
34 | embeddings = bio_trans.compute_loglikelihood(sequences, pass_mode="masked", batch_size=2)
35 | ```
36 | 
37 | ## Tokens list
38 | 
39 | The method gives the ability to compute the loglikelihood for only a provided list of amino acids, which will be considered.
40 | 
41 | ```python
42 | UNNATURAL = list("ACDEFGHIKLMNPQRSTVWY") + ["-"]
43 | loglikelihood = bio_trans.compute_loglikelihood(sequences, tokens_list=UNNATURAL)
44 | ```
45 | 
46 | ## Probabilities
47 | 
48 | The ``compute_loglikelihood`` method relies on the ``compute_probabilities`` function.
49 | 
50 | This last function will compute, for each amino acid position in the sequence, a dictionary where keys represent the natural amino acids, and values the probabilities to be at the position.
51 | 
52 | For example:
53 | 
54 | ```python
55 | from biotransformers import BioTransformers
56 | 
57 | bio_trans = BioTransformers(backend="protbert",num_gpus=1)
58 | 
59 | sequence = ["MKT"]
60 | probabilities = bio_trans.compute_probabilities(sequence, batch_size=1)
61 | 
62 | print(probabilities)
63 | ```
64 | 
65 | ```python
66 | >>
67 | [{0: {'L': 0.06550145316598321, 'A': 0.021559458419220974, 'G': 0.029741129950678777, 'V': 0.0329506745800003, 'E': 0.03389950500319548, 'S': 0.10401323529266542, 'I': 0.04399518228657259, 'K': 0.1534323153578508, 'R': 0.08616676439914424, 'D': 0.010983572050921635, 'T': 0.04474224433539647, 'P': 0.01569993609938641, 'N': 0.027836286891774507, 'Q': 0.037557728840479546, 'F': 0.020606235301203788, 'Y': 0.01243454224917041, 'M': 0.21207524064947852, 'H': 0.015025274369047291, 'C': 0.013031914446968728, 'W': 0.018747306310860856},
68 | 
69 |  1: {'L': 0.03176897920072879, 'A': 0.013685848027567242, 'G': 0.01709074216275199, 'V': 0.018786360542915624, 'E': 0.016411511761942357, 'S': 0.02157161007259761, 'I': 0.019570515195473124, 'K': 0.026416232407458887, 'R': 0.021930249525274396, 'D': 0.008674132240173953, 'T': 0.018818536773492975, 'P': 0.010970933229272459, 'N': 0.01349720693939123, 'Q': 0.014703372924399499, 'F': 0.010715260172378251, 'Y': 0.00931640096204737, 'M': 0.7010288899792522, 'H': 0.009361870192728095, 'C': 0.007965577806480653, 'W': 0.007715769883673336},
70 | 
71 |   2: {'L': 0.07383247230045219, 'A': 0.03555995965068629, 'G': 0.03454727111803637, 'V': 0.043748770514437235, 'E': 0.04069625263096508, 'S': 0.06924489597284503, 'I': 0.046173613390643166, 'K': 0.2299759248798167, 'R': 0.06749564661032614, 'D': 0.0224069594369746, 'T': 0.03940009938504622, 'P': 0.02301058203142933, 'N': 0.03441775848661052, 'Q': 0.04373499771477881, 'F': 0.028093375324345762, 'Y': 0.02461900744880924, 'M': 0.025029056199102815, 'H': 0.0818692944874724, 'C': 0.016498739542946495, 'W': 0.01964532287427556}}]
72 | ```
73 | 
74 | For each position (0, 1, 2, which correspond to amino acids M, K, T), we have a dictionary of probabilities for each natural amino acid.
75 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/__init__.py


--------------------------------------------------------------------------------
/docs/source/_build/_static/css/theme.css:
--------------------------------------------------------------------------------
  1 | :root {
  2 |   /*****************************************************************************
  3 |   * Theme config
  4 |   **/
  5 |   --pst-header-height: 60px;
  6 | 
  7 |   /*****************************************************************************
  8 |   * Font size
  9 |   **/
 10 |   --pst-font-size-base: 15px; /* base font size - applied at body / html level */
 11 | 
 12 |   /* heading font sizes */
 13 |   --pst-font-size-h1: 36px;
 14 |   --pst-font-size-h2: 32px;
 15 |   --pst-font-size-h3: 26px;
 16 |   --pst-font-size-h4: 21px;
 17 |   --pst-font-size-h5: 18px;
 18 |   --pst-font-size-h6: 16px;
 19 | 
 20 |   /* smaller then heading font sizes*/
 21 |   --pst-font-size-milli: 12px;
 22 | 
 23 |   --pst-sidebar-font-size: .9em;
 24 |   --pst-sidebar-caption-font-size: .9em;
 25 | 
 26 |   /*****************************************************************************
 27 |   * Font family
 28 |   **/
 29 |   /* These are adapted from https://systemfontstack.com/ */
 30 |   --pst-font-family-base-system: -apple-system, BlinkMacSystemFont, Segoe UI, "Helvetica Neue",
 31 |     Arial, sans-serif, Apple Color Emoji, Segoe UI Emoji, Segoe UI Symbol;
 32 |   --pst-font-family-monospace-system: "SFMono-Regular", Menlo, Consolas, Monaco,
 33 |     Liberation Mono, Lucida Console, monospace;
 34 | 
 35 |   --pst-font-family-base: var(--pst-font-family-base-system);
 36 |   --pst-font-family-heading: var(--pst-font-family-base);
 37 |   --pst-font-family-monospace: var(--pst-font-family-monospace-system);
 38 | 
 39 |   /*****************************************************************************
 40 |   * Color
 41 |   *
 42 |   * Colors are defined in rgb string way, "red, green, blue"
 43 |   **/
 44 |   --pst-color-primary: 19, 6, 84;
 45 |   --pst-color-success: 40, 167, 69;
 46 |   --pst-color-info: 0, 123, 255;  /*23, 162, 184;*/
 47 |   --pst-color-warning: 255, 193, 7;
 48 |   --pst-color-danger: 220, 53, 69;
 49 |   --pst-color-text-base: 51, 51, 51;
 50 | 
 51 |   --pst-color-h1: var(--pst-color-primary);
 52 |   --pst-color-h2: var(--pst-color-primary);
 53 |   --pst-color-h3: var(--pst-color-text-base);
 54 |   --pst-color-h4: var(--pst-color-text-base);
 55 |   --pst-color-h5: var(--pst-color-text-base);
 56 |   --pst-color-h6: var(--pst-color-text-base);
 57 |   --pst-color-paragraph: var(--pst-color-text-base);
 58 |   --pst-color-link: 0, 91, 129;
 59 |   --pst-color-link-hover: 227, 46, 0;
 60 |   --pst-color-headerlink: 198, 15, 15;
 61 |   --pst-color-headerlink-hover: 255, 255, 255;
 62 |   --pst-color-preformatted-text: 34, 34, 34;
 63 |   --pst-color-preformatted-background: 250, 250, 250;
 64 |   --pst-color-inline-code: 232, 62, 140;
 65 | 
 66 |   --pst-color-active-navigation: 19, 6, 84;
 67 |   --pst-color-navbar-link: 77, 77, 77;
 68 |   --pst-color-navbar-link-hover: var(--pst-color-active-navigation);
 69 |   --pst-color-navbar-link-active: var(--pst-color-active-navigation);
 70 |   --pst-color-sidebar-link: 77, 77, 77;
 71 |   --pst-color-sidebar-link-hover: var(--pst-color-active-navigation);
 72 |   --pst-color-sidebar-link-active: var(--pst-color-active-navigation);
 73 |   --pst-color-sidebar-expander-background-hover: 244, 244, 244;
 74 |   --pst-color-sidebar-caption: 77, 77, 77;
 75 |   --pst-color-toc-link: 119, 117, 122;
 76 |   --pst-color-toc-link-hover: var(--pst-color-active-navigation);
 77 |   --pst-color-toc-link-active: var(--pst-color-active-navigation);
 78 | 
 79 |   /*****************************************************************************
 80 |   * Icon
 81 |   **/
 82 | 
 83 |   /* font awesome icons*/
 84 |   --pst-icon-check-circle: '\f058';
 85 |   --pst-icon-info-circle: '\f05a';
 86 |   --pst-icon-exclamation-triangle: '\f071';
 87 |   --pst-icon-exclamation-circle: '\f06a';
 88 |   --pst-icon-times-circle: '\f057';
 89 |   --pst-icon-lightbulb: '\f0eb';
 90 | 
 91 |   /*****************************************************************************
 92 |   * Admonitions
 93 |   **/
 94 | 
 95 |   --pst-color-admonition-default: var(--pst-color-info);
 96 |   --pst-color-admonition-note: var(--pst-color-info);
 97 |   --pst-color-admonition-attention: var(--pst-color-warning);
 98 |   --pst-color-admonition-caution: var(--pst-color-warning);
 99 |   --pst-color-admonition-warning: var(--pst-color-warning);
100 |   --pst-color-admonition-danger: var(--pst-color-danger);
101 |   --pst-color-admonition-error: var(--pst-color-danger);
102 |   --pst-color-admonition-hint: var(--pst-color-success);
103 |   --pst-color-admonition-tip: var(--pst-color-success);
104 |   --pst-color-admonition-important: var(--pst-color-success);
105 | 
106 |   --pst-icon-admonition-default: var(--pst-icon-info-circle);
107 |   --pst-icon-admonition-note: var(--pst-icon-info-circle);
108 |   --pst-icon-admonition-attention: var(--pst-icon-exclamation-circle);
109 |   --pst-icon-admonition-caution: var(--pst-icon-exclamation-triangle);
110 |   --pst-icon-admonition-warning: var(--pst-icon-exclamation-triangle);
111 |   --pst-icon-admonition-danger: var(--pst-icon-exclamation-triangle);
112 |   --pst-icon-admonition-error: var(--pst-icon-times-circle);
113 |   --pst-icon-admonition-hint: var(--pst-icon-lightbulb);
114 |   --pst-icon-admonition-tip: var(--pst-icon-lightbulb);
115 |   --pst-icon-admonition-important: var(--pst-icon-exclamation-circle);
116 | 
117 | }
118 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/deepchain-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/deepchain-small.png


--------------------------------------------------------------------------------
/docs/source/_build/_static/documentation_options.js:
--------------------------------------------------------------------------------
 1 | var DOCUMENTATION_OPTIONS = {
 2 |     URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'),
 3 |     VERSION: '0.1.0',
 4 |     LANGUAGE: 'None',
 5 |     COLLAPSE_INDEX: false,
 6 |     BUILDER: 'html',
 7 |     FILE_SUFFIX: '.html',
 8 |     LINK_SUFFIX: '.html',
 9 |     HAS_SOURCE: true,
10 |     SOURCELINK_SUFFIX: '.txt',
11 |     NAVIGATION_WITH_KEYS: true
12 | };
13 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/file.png


--------------------------------------------------------------------------------
/docs/source/_build/_static/graphviz.css:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * graphviz.css
 3 |  * ~~~~~~~~~~~~
 4 |  *
 5 |  * Sphinx stylesheet -- graphviz extension.
 6 |  *
 7 |  * :copyright: Copyright 2007-2021 by the Sphinx team, see AUTHORS.
 8 |  * :license: BSD, see LICENSE for details.
 9 |  *
10 |  */
11 | 
12 | img.graphviz {
13 |     border: 0;
14 |     max-width: 100%;
15 | }
16 | 
17 | object.graphviz {
18 |     max-width: 100%;
19 | }
20 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/images/logo_binder.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <!-- Generator: Adobe Illustrator 23.0.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
 4 | 	 viewBox="0 0 44.4 44.4" style="enable-background:new 0 0 44.4 44.4;" xml:space="preserve">
 5 | <style type="text/css">
 6 | 	.st0{fill:none;stroke:#F5A252;stroke-width:5;stroke-miterlimit:10;}
 7 | 	.st1{fill:none;stroke:#579ACA;stroke-width:5;stroke-miterlimit:10;}
 8 | 	.st2{fill:none;stroke:#E66581;stroke-width:5;stroke-miterlimit:10;}
 9 | </style>
10 | <title>logo</title>
11 | <g>
12 | 	<path class="st0" d="M33.9,6.4c3.6,3.9,3.4,9.9-0.5,13.5s-9.9,3.4-13.5-0.5s-3.4-9.9,0.5-13.5l0,0C24.2,2.4,30.2,2.6,33.9,6.4z"/>
13 | 	<path class="st1" d="M35.1,27.3c2.6,4.6,1.1,10.4-3.5,13c-4.6,2.6-10.4,1.1-13-3.5s-1.1-10.4,3.5-13l0,0
14 | 		C26.6,21.2,32.4,22.7,35.1,27.3z"/>
15 | 	<path class="st2" d="M25.9,17.8c2.6,4.6,1.1,10.4-3.5,13s-10.4,1.1-13-3.5s-1.1-10.4,3.5-13l0,0C17.5,11.7,23.3,13.2,25.9,17.8z"/>
16 | 	<path class="st1" d="M19.2,26.4c3.1-4.3,9.1-5.2,13.3-2.1c1.1,0.8,2,1.8,2.7,3"/>
17 | 	<path class="st0" d="M19.9,19.4c-3.6-3.9-3.4-9.9,0.5-13.5s9.9-3.4,13.5,0.5"/>
18 | </g>
19 | </svg>
20 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/images/logo_colab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/images/logo_colab.png


--------------------------------------------------------------------------------
/docs/source/_build/_static/images/logo_jupyterhub.svg:
--------------------------------------------------------------------------------
1 | <svg id="Layer_1" data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" width="38.73" height="50" viewBox="0 0 38.73 50"><defs><style>.cls-1{fill:#767677;}.cls-2{fill:#f37726;}.cls-3{fill:#9e9e9e;}.cls-4{fill:#616262;}.cls-5{font-size:17.07px;fill:#fff;font-family:Roboto-Regular, Roboto;}</style></defs><title>logo_jupyterhub</title><g id="Canvas"><path id="path7_fill" data-name="path7 fill" class="cls-1" d="M39.51,3.53a3,3,0,0,1-1.7,2.9A3,3,0,0,1,34.48,6a3,3,0,0,1-.82-3.26,3,3,0,0,1,1.05-1.41A3,3,0,0,1,37.52.86a2.88,2.88,0,0,1,1,.6,3,3,0,0,1,.7.93,3.18,3.18,0,0,1,.28,1.14Z" transform="translate(-1.87 -0.69)"/><path id="path8_fill" data-name="path8 fill" class="cls-2" d="M21.91,38.39c-8,0-15.06-2.87-18.7-7.12a19.93,19.93,0,0,0,37.39,0C37,35.52,30,38.39,21.91,38.39Z" transform="translate(-1.87 -0.69)"/><path id="path9_fill" data-name="path9 fill" class="cls-2" d="M21.91,10.78c8,0,15.05,2.87,18.69,7.12a19.93,19.93,0,0,0-37.39,0C6.85,13.64,13.86,10.78,21.91,10.78Z" transform="translate(-1.87 -0.69)"/><path id="path10_fill" data-name="path10 fill" class="cls-3" d="M10.88,46.66a3.86,3.86,0,0,1-.52,2.15,3.81,3.81,0,0,1-1.62,1.51,3.93,3.93,0,0,1-2.19.34,3.79,3.79,0,0,1-2-.94,3.73,3.73,0,0,1-1.14-1.9,3.79,3.79,0,0,1,.1-2.21,3.86,3.86,0,0,1,1.33-1.78,3.92,3.92,0,0,1,3.54-.53,3.85,3.85,0,0,1,2.14,1.93,3.74,3.74,0,0,1,.37,1.43Z" transform="translate(-1.87 -0.69)"/><path id="path11_fill" data-name="path11 fill" class="cls-4" d="M4.12,9.81A2.18,2.18,0,0,1,2.9,9.48a2.23,2.23,0,0,1-.84-1A2.26,2.26,0,0,1,1.9,7.26a2.13,2.13,0,0,1,.56-1.13,2.18,2.18,0,0,1,2.36-.56,2.13,2.13,0,0,1,1,.76,2.18,2.18,0,0,1,.42,1.2A2.22,2.22,0,0,1,4.12,9.81Z" transform="translate(-1.87 -0.69)"/></g><text class="cls-5" transform="translate(5.24 30.01)">Hub</text></svg>
2 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/minus.png


--------------------------------------------------------------------------------
/docs/source/_build/_static/plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/plus.png


--------------------------------------------------------------------------------
/docs/source/_build/_static/sphinx-book-theme.12a9622fbb08dcb3a2a40b2c02b83a57.js:
--------------------------------------------------------------------------------
 1 | var initTriggerNavBar=()=>{if($(window).width()<768){$("#navbar-toggler").trigger("click")}}
 2 | var scrollToActive=()=>{var navbar=document.getElementById('site-navigation')
 3 | var active_pages=navbar.querySelectorAll(".active")
 4 | var active_page=active_pages[active_pages.length-1]
 5 | if(active_page!==undefined&&active_page.offsetTop>($(window).height()*.5)){navbar.scrollTop=active_page.offsetTop-($(window).height()*.2)}}
 6 | var sbRunWhenDOMLoaded=cb=>{if(document.readyState!='loading'){cb()}else if(document.addEventListener){document.addEventListener('DOMContentLoaded',cb)}else{document.attachEvent('onreadystatechange',function(){if(document.readyState=='complete')cb()})}}
 7 | function toggleFullScreen(){var navToggler=$("#navbar-toggler");if(!document.fullscreenElement){document.documentElement.requestFullscreen();if(!navToggler.hasClass("collapsed")){navToggler.click();}}else{if(document.exitFullscreen){document.exitFullscreen();if(navToggler.hasClass("collapsed")){navToggler.click();}}}}
 8 | var initTooltips=()=>{$(document).ready(function(){$('[data-toggle="tooltip"]').tooltip();});}
 9 | var initTocHide=()=>{var scrollTimeout;var throttle=200;var tocHeight=$("#bd-toc-nav").outerHeight(true)+$(".bd-toc").outerHeight(true);var hideTocAfter=tocHeight+200;var checkTocScroll=function(){var margin_content=$(".margin, .tag_margin, .full-width, .full_width, .tag_full-width, .tag_full_width, .sidebar, .tag_sidebar, .popout, .tag_popout");margin_content.each((index,item)=>{var topOffset=$(item).offset().top-$(window).scrollTop();var bottomOffset=topOffset+$(item).outerHeight(true);var topOverlaps=((topOffset>=0)&&(topOffset<hideTocAfter));var bottomOverlaps=((bottomOffset>=0)&&(bottomOffset<hideTocAfter));var removeToc=(topOverlaps||bottomOverlaps);if(removeToc&&window.pageYOffset>20){$("div.bd-toc").removeClass("show")
10 | return false}else{$("div.bd-toc").addClass("show")};})};var manageScrolledClassOnBody=function(){if(window.scrollY>0){document.body.classList.add("scrolled");}else{document.body.classList.remove("scrolled");}}
11 | $(window).on('scroll',function(){if(!scrollTimeout){scrollTimeout=setTimeout(function(){checkTocScroll();manageScrolledClassOnBody();scrollTimeout=null;},throttle);}});}
/**
 * Make sure a `.thebe-launch-button` follows the first `div.section h1`
 * (inserting an empty one when missing), then call `initThebe()`.
 */
var initThebeSBT = () => {
  var firstHeading = $("div.section h1")[0];
  var hasLaunchButton = $(firstHeading).next().hasClass("thebe-launch-button");
  if (!hasLaunchButton) {
    $("<button class='thebe-launch-button'></button>").insertAfter($(firstHeading));
  }
  initThebe();
};
// Run the page initialisers, in this order, once the DOM is ready.
[initTooltips, initTriggerNavBar, scrollToActive, initTocHide].forEach(init => {
  sbRunWhenDOMLoaded(init);
});
19 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/tabs.css:
--------------------------------------------------------------------------------
 1 | .sphinx-tabs {
 2 |   margin-bottom: 1rem;
 3 | }
 4 | 
 5 | [role="tablist"] {
 6 |   border-bottom: 1px solid #a0b3bf;
 7 | }
 8 | 
 9 | .sphinx-tabs-tab {
10 |   position: relative;
11 |   font-family: Lato,'Helvetica Neue',Arial,Helvetica,sans-serif;
12 |   color: #1D5C87;
13 |   line-height: 24px;
14 |   margin: 0;
15 |   font-size: 16px;
16 |   font-weight: 400;
17 |   background-color: rgba(255, 255, 255, 0);
18 |   border-radius: 5px 5px 0 0;
19 |   border: 0;
20 |   padding: 1rem 1.5rem;
21 |   margin-bottom: 0;
22 | }
23 | 
24 | .sphinx-tabs-tab[aria-selected="true"] {
25 |   font-weight: 700;
26 |   border: 1px solid #a0b3bf;
27 |   border-bottom: 1px solid white;
28 |   margin: -1px;
29 |   background-color: white;
30 | }
31 | 
32 | .sphinx-tabs-tab:focus {
33 |   z-index: 1;
34 |   outline-offset: 1px;
35 | }
36 | 
37 | .sphinx-tabs-panel {
38 |   position: relative;
39 |   padding: 1rem;
40 |   border: 1px solid #a0b3bf;
41 |   margin: 0px -1px -1px -1px;
42 |   border-radius: 0 0 5px 5px;
43 |   border-top: 0;
44 |   background: white;
45 | }
46 | 
47 | .sphinx-tabs-panel.code-tab {
48 |   padding: 0.4rem;
49 | }
50 | 
51 | .sphinx-tab img {
52 | 	margin-bottom: 24 px;
53 | }
54 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/tabs.js:
--------------------------------------------------------------------------------
  1 | try {
  2 |   var session = window.sessionStorage || {};
  3 | } catch (e) {
  4 |   var session = {};
  5 | }
  6 | 
  7 | window.addEventListener("DOMContentLoaded", () => {
  8 |   const allTabs = document.querySelectorAll('.sphinx-tabs-tab');
  9 |   const tabLists = document.querySelectorAll('[role="tablist"]');
 10 | 
 11 |   allTabs.forEach(tab => {
 12 |     tab.addEventListener("click", changeTabs);
 13 |   });
 14 | 
 15 |   tabLists.forEach(tabList => {
 16 |     tabList.addEventListener("keydown", keyTabs);
 17 |   });
 18 | 
 19 |   // Restore group tab selection from session
 20 |   const lastSelected = session.getItem('sphinx-tabs-last-selected');
 21 |   if (lastSelected != null) selectGroupedTabs(lastSelected);
 22 | });
 23 | 
 24 | /**
 25 |  * Key focus left and right between sibling elements using arrows
 26 |  * @param  {Node} e the element in focus when key was pressed
 27 |  */
 28 | function keyTabs(e) {
 29 |     const tab = e.target;
 30 |     let nextTab = null;
 31 |     if (e.keyCode === 39 || e.keyCode === 37) {
 32 |       tab.setAttribute("tabindex", -1);
 33 |       // Move right
 34 |       if (e.keyCode === 39) {
 35 |         nextTab = tab.nextElementSibling;
 36 |         if (nextTab === null) {
 37 |           nextTab = tab.parentNode.firstElementChild;
 38 |         }
 39 |       // Move left
 40 |       } else if (e.keyCode === 37) {
 41 |         nextTab = tab.previousElementSibling;
 42 |         if (nextTab === null) {
 43 |           nextTab = tab.parentNode.lastElementChild;
 44 |         }
 45 |       }
 46 |     }
 47 | 
 48 |     if (nextTab !== null) {
 49 |       nextTab.setAttribute("tabindex", 0);
 50 |       nextTab.focus();
 51 |     }
 52 | }
 53 | 
 54 | /**
 55 |  * Select or deselect clicked tab. If a group tab
 56 |  * is selected, also select tab in other tabLists.
 57 |  * @param  {Node} e the element that was clicked
 58 |  */
 59 | function changeTabs(e) {
 60 |   // Use this instead of the element that was clicked, in case it's a child
 61 |   const selected = this.getAttribute("aria-selected") === "true";
 62 |   const positionBefore = this.parentNode.getBoundingClientRect().top;
 63 |   const closable = this.parentNode.classList.contains("closeable");
 64 | 
 65 |   deselectTabList(this);
 66 | 
 67 |   if (!selected || !closable) {
 68 |     selectTab(this);
 69 |     const name = this.getAttribute("name");
 70 |     selectGroupedTabs(name, this.id);
 71 | 
 72 |     if (this.classList.contains("group-tab")) {
 73 |       // Persist during session
 74 |       session.setItem('sphinx-tabs-last-selected', name);
 75 |     }
 76 |   }
 77 | 
 78 |   const positionAfter = this.parentNode.getBoundingClientRect().top;
 79 |   const positionDelta = positionAfter - positionBefore;
 80 |   // Scroll to offset content resizing
 81 |   window.scrollTo(0, window.scrollY + positionDelta);
 82 | }
 83 | 
 84 | function selectTab(target) {
 85 |   target.setAttribute("aria-selected", true);
 86 | 
 87 |   // Show the associated panel
 88 |   document
 89 |     .getElementById(target.getAttribute("aria-controls"))
 90 |     .removeAttribute("hidden");
 91 | }
 92 | 
 93 | /**
 94 |  * Select all other grouped tabs via tab name.
 95 |  * @param  {Node} name name of grouped tab to be selected
 96 |  * @param  {Node} clickedId id of clicked tab
 97 |  */
 98 | function selectGroupedTabs(name, clickedId=null) {
 99 |   const groupedTabs = document.querySelectorAll(`.sphinx-tabs-tab[name="${name}"]`);
100 |   const tabLists = Array.from(groupedTabs).map(tab => tab.parentNode);
101 | 
102 |   tabLists
103 |     .forEach(tabList => {
104 |       // Don't want to change the tabList containing the clicked tab
105 |       const clickedTab = tabList.querySelector(`[id="${clickedId}"]`);
106 |       if (clickedTab === null ) {
107 |         // Select first tab with matching name
108 |         const tab = tabList.querySelector(`.sphinx-tabs-tab[name="${name}"]`);
109 |         deselectTabList(tab);
110 |         selectTab(tab);
111 |       }
112 |     })
113 | }
114 | 
/**
 * Deselect every tab in the tablist containing `tab` and hide the
 * panels associated with that tablist.
 * @param  {Node} tab a tab within the tablist to deselect
 */
function deselectTabList(tab) {
  const tabList = tab.parentNode;
  const container = tabList.parentNode;

  for (const sibling of Array.from(tabList.children)) {
    sibling.setAttribute("aria-selected", false);
  }

  // The first child of the container is the tablist itself; skip it.
  const [, ...panels] = Array.from(container.children);
  for (const panel of panels) {
    panel.setAttribute("hidden", true);
  }
}
131 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Font Awesome Free License
 2 | -------------------------
 3 | 
 4 | Font Awesome Free is free, open source, and GPL friendly. You can use it for
 5 | commercial projects, open source projects, or really almost whatever you want.
 6 | Full Font Awesome Free license: https://fontawesome.com/license/free.
 7 | 
 8 | # Icons: CC BY 4.0 License (https://creativecommons.org/licenses/by/4.0/)
 9 | In the Font Awesome Free download, the CC BY 4.0 license applies to all icons
10 | packaged as SVG and JS file types.
11 | 
12 | # Fonts: SIL OFL 1.1 License (https://scripts.sil.org/OFL)
13 | In the Font Awesome Free download, the SIL OFL license applies to all icons
14 | packaged as web and desktop font files.
15 | 
16 | # Code: MIT License (https://opensource.org/licenses/MIT)
17 | In the Font Awesome Free download, the MIT license applies to all non-font and
18 | non-icon files.
19 | 
20 | # Attribution
21 | Attribution is required by MIT, SIL OFL, and CC BY licenses. Downloaded Font
22 | Awesome Free files already contain embedded comments with sufficient
23 | attribution, so you shouldn't need to do anything additional when using these
24 | files normally.
25 | 
26 | We've kept attribution comments terse, so we ask that you do not actively work
27 | to remove them from files, especially code. They're a great way for folks to
28 | learn about Font Awesome.
29 | 
30 | # Brand Icons
31 | All brand icons are trademarks of their respective owners. The use of these
32 | trademarks does not indicate endorsement of the trademark holder by Font
33 | Awesome, nor vice versa. **Please do not use brand logos for any purpose except
34 | to represent the company, product, or service to which they refer.**
35 | 


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.eot


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.ttf


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.eot


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.ttf


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-regular-400.woff2


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.eot


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.ttf


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff


--------------------------------------------------------------------------------
/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2


--------------------------------------------------------------------------------
/docs/source/_build/_static/webpack-macros.html:
--------------------------------------------------------------------------------
 1 | <!-- these macros are generated by "yarn build:production". do not edit by hand. -->
 2 | {% macro head_pre_icons() %}
 3 |   <link rel="stylesheet"
 4 |     href="{{ pathto('_static/vendor/fontawesome/5.13.0/css/all.min.css', 1) }}">
 5 |   <link rel="preload" as="font" type="font/woff2" crossorigin
 6 |     href="{{ pathto('_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2', 1) }}">
 7 |   <link rel="preload" as="font" type="font/woff2" crossorigin
 8 |     href="{{ pathto('_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2', 1) }}">
 9 | {% endmacro %}
10 | 
11 | {% macro head_pre_fonts() %}
12 | {% endmacro %}
13 | 
14 | {% macro head_pre_bootstrap() %}
15 |   <link href="{{ pathto('_static/css/theme.css', 1) }}" rel="stylesheet" />
16 |   <link href="{{ pathto('_static/css/index.c5995385ac14fb8791e8eb36b4908be2.css', 1) }}" rel="stylesheet" />
17 | {% endmacro %}
18 | 
19 | {% macro head_js_preload() %}
20 |   <link rel="preload" as="script" href="{{ pathto('_static/js/index.1c5a1a01449ed65a7b51.js', 1) }}">
21 | {% endmacro %}
22 | 
23 | {% macro body_post() %}
24 |   <script src="{{ pathto('_static/js/index.1c5a1a01449ed65a7b51.js', 1) }}"></script>
25 | {% endmacro %}
26 | 


--------------------------------------------------------------------------------
/docs/source/_build/objects.inv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_build/objects.inv


--------------------------------------------------------------------------------
/docs/source/_static/deepchain-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/_static/deepchain-small.png


--------------------------------------------------------------------------------
/docs/source/api/biotransformers.rst:
--------------------------------------------------------------------------------
1 | Bio-transformers method
2 | =======================
3 | 
4 | .. automodule:: biotransformers.wrappers.transformers_wrappers
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | # type: ignore
  2 | # flake8: noqa
  3 | # Configuration file for the Sphinx documentation builder.
  4 | #
  5 | # This file only contains a selection of the most common options. For a full
  6 | # list see the documentation:
  7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
  8 | 
  9 | # -- Path setup --------------------------------------------------------------
 10 | 
 11 | # If extensions (or modules to document with autodoc) are in another directory,
 12 | # add these directories to sys.path here. If the directory is relative to the
 13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 14 | #
 15 | import os
 16 | import sys
 17 | 
 18 | sys.path.insert(0, os.path.abspath("../.."))
 19 | 
 20 | 
 21 | # -- Project information -----------------------------------------------------
 22 | 
 23 | project = "bio-transformers"
 24 | copyright = "2021, InstaDeep"
 25 | author = "InstaDeep"
 26 | 
 27 | # The full version, including alpha/beta/rc tags
 28 | release = "0.1.14"
 29 | 
 30 | 
 31 | # -- General configuration ---------------------------------------------------
 32 | 
 33 | # Add any Sphinx extension module names here, as strings. They can be
 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 35 | # ones.
 36 | 
 37 | # Add any paths that contain templates here, relative to this directory.
 38 | templates_path = ["_templates"]
 39 | 
 40 | # List of patterns, relative to source directory, that match files and
 41 | # directories to ignore when looking for source files.
 42 | # This pattern also affects html_static_path and html_extra_path.
 43 | exclude_patterns = []
 44 | 
 45 | 
 46 | # -- Options for HTML output -------------------------------------------------
 47 | 
 48 | # Add any paths that contain custom static files (such as style sheets) here,
 49 | # relative to this directory. They are copied after the builtin static files,
 50 | # so a file named "default.css" will overwrite the builtin "default.css".
 51 | html_static_path = ["_static"]
 52 | html_title = f"bio-transformers v{release}"
 53 | html_logo = "_static/deepchain-small.png"
 54 | 
 55 | extensions = [
 56 |     # "sphinx.ext.autodoc",  # autodoc for API
 57 |     "sphinx_tabs.tabs",  # for tabs in rst
 58 |     # "m2r2",  # for supporting md files
 59 |     "notfound.extension",  # for 404 pages
 60 |     "sphinx.ext.napoleon",  # extensions for google style docstring
 61 |     "myst_parser",
 62 |     "autoapi.extension",
 63 | ]
 64 | autoapi_dirs = ["../../biotransformers"]
 65 | 
 66 | html_theme_options = {
 67 |     "theme_dev_mode": True,
 68 |     "path_to_docs": "docs",
 69 |     "repository_url": "https://github.com/DeepChainBio/bio-transformers",
 70 |     # "repository_branch": "gh-pages",  # For testing
 71 |     "use_edit_page_button": True,
 72 |     # "use_issues_button": True,
 73 |     "use_repository_button": True,
 74 |     "use_download_button": True,
 75 |     # For testing
 76 |     # "use_fullscreen_button": False,
 77 |     # "home_page_in_toc": True,
 78 |     # "single_page": True,
 79 |     # "extra_footer": "<a href='https://google.com'>Test</a>",  # DEPRECATED KEY
 80 |     # "extra_navbar": "<a href='https://google.com'>Test</a>",
 81 |     # "show_navbar_depth": 2,
 82 | }
 83 | 
 84 | html_theme = "sphinx_book_theme"
 85 | source_suffix = [".rst", ".md"]
 86 | 
 87 | 
 88 | # Napoleon settings
 89 | napoleon_google_docstring = True
 90 | napoleon_numpy_docstring = True
 91 | napoleon_include_init_with_doc = False
 92 | napoleon_include_private_with_doc = False
 93 | napoleon_include_special_with_doc = True
 94 | napoleon_use_admonition_for_examples = False
 95 | napoleon_use_admonition_for_notes = False
 96 | napoleon_use_admonition_for_references = False
 97 | napoleon_use_ivar = False
 98 | napoleon_use_param = True
 99 | napoleon_use_rtype = True
100 | napoleon_preprocess_types = False
101 | napoleon_type_aliases = None
102 | napoleon_attr_annotations = True
103 | 
104 | # add docstring of __init__
105 | autoclass_content = "both"
106 | 


--------------------------------------------------------------------------------
/docs/source/contributing/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Change log
  2 | 
  3 | # [0.1.8] - 2021-07-29
  4 | 
  5 | Features:
  6 |   - Add compute_mutation_score method to evaluate a set of mutation on a sequence.
  7 |     Metric based on [paper](<https://www.biorxiv.org/content/10.1101/2021.07.09.450648v1.full.pdf>)
  8 | 
  9 | # [0.1.7] - 2021-07-19
 10 | 
 11 | Features:
 12 |   - Add esm1v_t33_650M_UR90S_1 model.
 13 | 
 14 | # [0.1.6] - 2021-07-09
 15 | 
 16 | Fixed:
 17 |  - Fix filtering of logits which impacts loglikelihood computation
 18 |  - Fix fasta file reading in compute_loglikelihood
 19 | 
 20 | Features:
 21 |   - Add `normalize` mode in compute_loglikelihood.
 22 | 
 23 | # [0.1.3] - 2021-07-01
 24 | 
 25 | Features:
 26 |  - Add msa-transformers for methods:
 27 |     - compute_logits
 28 |     - compute_embeddings
 29 |     - compute_probabilities
 30 |     - compute_accuracy
 31 | 
 32 | Fixed:
 33 |  - Remove torch DataParallel wrapper.
 34 | 
 35 | # [0.1.0] - 2021-07-01
 36 | 
 37 | Features:
 38 |  - Add ray worker for multi-gpus inference
 39 | 
 40 | Removed:
 41 |  - Remove torch DataParallel wrapper.
 42 | 
 43 | # [0.0.10] - 2021-06-14
 44 | Note on the release
 45 | 
 46 | Features:
 47 |  - Add BIO_LOG_LEVEL environment variable to control logging messages (logger)
 48 |  - Check if every unique amino acids in sequences are in tokens_list (compute_probabilities)
 49 | 
 50 | Fixed:
 51 |  - Add shuffling in batch_sampler (lightning_utils)
 52 |  - Fix tokens argument for dataloader (lightning_utils)
 53 |  - Fix rtd CI to separate docs and package environment.
 54 | 
 55 | Changed:
 56 |  - Modified the signature of some functions to improve clarity (transformers_wrappers)
 57 |  - Update `train_masked` method to `finetune` (transformers_wrappers)
 58 |  - `compute_embeddings` with option `full` returns a list of embeddings, no matter the size (transformers_wrappers)
 59 | 
 60 | Removed:
 61 |  - Remove the tokens_list argument when not necessary and tried to make its usage clearer (transformers_wrappers)
 62 |  - Remove functions (transformers_wrappers):
 63 |     - _filter_and_pool_embeddings
 64 |     - _split_logits
 65 |     -  _slabels_remaping
 66 |     - _filter_logits
 67 |     -  _filter_loglikelihood
 68 |     - _compute_accuracy
 69 |     - _compute_calibration
 70 | 
 71 | 
 72 | # [0.0.9] - 2021-06-04
 73 | 
 74 | Fixed:
 75 |  - Batch_sampler issue
 76 | 
 77 | # [0.0.8] - 2021-06-03
 78 | Note on the release
 79 | 
 80 | Features:
 81 |  - Merge ESM/protbert for finetuning model with pytorch-lightning
 82 |  - Possibility to restore a training session.
 83 | 
 84 | Fixed:
 85 |  - Fix conflicts when saving model with DDP
 86 |  - Fix loading checkpoint created by pytorch-lightning
 87 | 
 88 | 
 89 | # [0.0.7] - 2021-05-12
 90 | Note on the release
 91 | 
 92 | Features:
 93 |  - Add fasta files support for each compute function.
 94 |  - Add train_masked function to finetune model on custom dataset. (Only ESM for the moment, protbert is coming.)
 95 | 
 96 | Docs:
 97 |  - Update documentation to add tutorial on training.
 98 | 
 99 | Changed:
100 |  - GPU is used by default if found, even if not specified.
101 | 
102 | # [0.0.6] - 2021-05-24
103 | Note on the release
104 | 
105 | Fixed:
106 |  - Update torch dependencies to be less restrictive. Create conflict with other packages.
107 | 
108 | # [0.0.5] - 2021-05-12
109 | 
110 | Note on the release
111 | 
112 | Added
113 |  - added multi-gpu support for inference
114 |  - added function to finetune a model on a specific dataset on multi-gpu
115 | 
116 | Changed
117 | 
118 | Fixed
119 | 


--------------------------------------------------------------------------------
/docs/source/contributing/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | In order to contribute to this repository you will need developer access to this repo. To know more about the project go to the [README](README.md) first.
 4 | 
 5 | 
 6 | ## Install Dev environment
 7 | 
 8 | From the root of this repo, run
 9 | `conda env create -f environment_dev.yaml`
10 | 
11 | ## Pre-commit hooks
12 | 
13 | Pre-commits hooks have been configured for this project using the [pre-commit](https://pre-commit.com/) library:
14 | 
15 | - [black](https://github.com/psf/black) python formatter
16 | - [flake8](https://flake8.pycqa.org/en/latest/) python linter
17 | - [isort](https://pypi.org/project/isort/) sorts imports
18 | 
19 | To get them going on your side, make sure to have python installed, and run the following
20 | commands from the root directory of this repository:
21 | 
22 | ```bash
23 | pip install pre-commit
24 | pre-commit install
25 | pre-commit run --all-files
26 | ```
27 | 
28 | ## Coding conventions
29 | 
30 | Please respect the following conventions to contribute to the code:
31 | 
32 | - Use hard wrap at 88
33 | - Respect black, isort and flake8 conventions
34 | - Classes' names are Camel case (example: MyClass)
35 | - Functions and variables are in lower case with _ as separator (example: my_function, my_var)
36 | - Names are explicit: avoid mathematical notations, functions' names start with a verb
37 | - Use python typing library: each class and method should be typed (both for inputs and outputs)
38 | - Create custom types if needed
39 | - All classes and functions should have a docstring
40 | - Avoid repeating arguments and returns in docstring (should be explicit with the types) except when it is truly necessary
41 | - A function (or a class) does not take more than 5 arguments, if you need more create a data class
42 | - Avoid dictionaries to pass arguments when possible and prefer dataclasses instead
43 | - Repeat inputs names when calling a function: ex: compute_custom(arg1=arg1, arg2=my_arg2)
44 | - Use list comprehension when it is possible
45 | - Use f strings to add variables in strings: ex: print(f'my var value is {my_var}')
46 | - Use pathlib to handle paths
47 | - Prefer shutil to os to manage files/ folders creations and deletions
48 | 


--------------------------------------------------------------------------------
/docs/source/documentation/course.md:
--------------------------------------------------------------------------------
 1 | # Getting started with transformers
 2 | 
 3 | If you want to know more about the transformers architecture, have a look at:
 4 | - [The illustrated transformers](http://jalammar.github.io/illustrated-transformer/)
 5 | - [The annotated transformers](https://nlp.seas.harvard.edu/2018/04/03/attention.html?s=09)
 6 | 
 7 | ## Bio-transformers
 8 | 
 9 | If you want to understand how to train deep-learning models on proteins, the two best repositories are:
10 | - [ProtTrans repo](https://github.com/agemagician/ProtTrans)
11 | - [ESM repo](https://github.com/facebookresearch/esm)
12 | 


--------------------------------------------------------------------------------
/docs/source/documentation/logging.md:
--------------------------------------------------------------------------------
 1 | # Logging
 2 | 
 3 | When running bio-transformers, printed messages are entirely controlled by the bio-transformers code.
 4 | The log level controls which types of log messages would be printed.
 5 | 
 6 | bio-transformers uses the Python module [`logging`](https://docs.python.org/3/library/logging.html) to log the messages. The log level is controlled by the environment variable BIO_LOG_LEVEL. The levels are given in the table below. The default level is “2”.
 7 | 
 8 | To adjust the logging level, you can export the environment variable:
 9 | 
10 |   ```bash
11 |   export BIO_LOG_LEVEL=1
12 |   ```
13 | 
14 | | BIO_LOG_LEVEL     | Behavior                                                                                   |
15 | | ----------------- | ------------------------------------------------------------------------------------------ |
16 | | "0"               | Log all messages, equivalent to `logging.DEBUG`. Same as log level "1".                    |
17 | | "1"               | Log all messages, equivalent to `logging.DEBUG`.                                           |
18 | | "2"               | Log all messages except DEBUG, equivalent to `logging.INFO`. (default)                     |
19 | | "3"               | Log all messages except DEBUG and INFO, equivalent to `logging.WARNING`.                   |
20 | | "4"               | Log all messages except DEBUG, INFO, and WARNING, equivalent to `logging.ERROR`.           |
21 | | "5"               | Log all messages except DEBUG, INFO, WARNING, and ERROR, equivalent to `logging.CRITICAL`. |
22 | 


--------------------------------------------------------------------------------
/docs/source/documentation/msa.md:
--------------------------------------------------------------------------------
 1 | # MSA
 2 | 
 3 | ## What is an MSA?
 4 | 
 5 | An MSA (multiple sequence alignment) is a file that contains a sequence of amino acids and a number of aligned variant sequences. The aim is to store information about the evolution of the main sequence. Using this for training allows reducing the number of model parameters and using evolutionary information in the model, as explained in the [MSA Transformer paper](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1.full.pdf).
 6 | 
 7 | The MSA are generated using [hh-suite](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3019-7) tools, searching similar sequences in the UniClust30 database.
 8 | 
 9 | ```{note}
10 | The generated MSA must have the .a3m extension to be used in `bio-transformers`
11 | ```
12 | 
13 | Below, you can see an example of an MSA file. The first line is the main sequence. The following sequences are the most probable variant sequences. The `-` character in the MSA sequences corresponds to the alignment character.
14 | 
15 | ```bash
16 | >> head msa_example.a3m
17 | ```
18 | 
19 | ```bash
20 | >sp|O24396|PURA_WHEAT Adenylosuccinate synthetase, chloroplastic (Fragment) OS=Triticum aestivum OX=4565 PE=1 SV=1
21 | AAAAAGRGRSFSPAAPAPSSVRLPGRQAPAPAAASALAVEADPAADRVSSLSQVSGVLGSQWGDEGKGKLVDVLAPRFDIVARCQGGANAGHTIYNSEGKKFALHLVPSGILHEGTLCVVGNGAVIHVPGFFGEIDGLQSNGVSCDGRILVSDRAHLLFDLHQTVDGLREAELANSFIGTTKRGIGPCYSSKVTRNGLRVCDLRHMDTFGDKLDVLFEDAAARFEGFKYSKGMLKEEVERYKRFAERLEPFIADTVHVLNESIRQKKKILVEGGQATMLDIDFGTYPFVTSSSPSAGGICTGLGIAPRVIGDLIGVVKAYTTRVGSGPFPTELLGEEGDVLRKAGMEFGTTTGRPRRCGWLDIVALKYCCDINGFSSLNLTKLDVLSGLPEIKLGVSYNQMDGEKLQSFPGDLDTLEQVQVNYEVLPGWDSDISSVRSYSELPQAARRYVERIEELAGVPVHYIGVGPGRDALIYK
22 | >UniRef100_A0A0N4UWB0 Adenylosuccinate synthetase n=1 Tax=Enterobius vermicularis TaxID=51028 RepID=A0A0N4UWB0_ENTVE
23 | --------------------------------------------MNDQKRKAPVIVILGAQFGDEGKGKIVDFLIEKekIQLTARCQGGNNAGHTVV-VNGRKSDFHLLPTGIINEDCYNIIGNGVVVNLDALFKEIEHNEIDKLNgWEKRLMISELAHLVTSMHMQADGQQEKSLSSEKIGTTSKGIGPTYSTKCFRNGIRVGELlGDFEAFSAKFRSLAAFYLKQFPGIEVN---VEEELDNYKKHAVCLKRLgiVGDTITYLDEMRAQGKAILVEGANGAMLDIDFGsflytffchsgTYPFVTSSNATVGGAVTGLGIPPTAITEIIGVVKAYETRVGSGPFPTEQQGKIGEDLQSIGHEVGVTTGRKRRCGWLDLFLLKRSSVINGFTALALTKLDILDNFDEIKVATGYR-IDGKSLKAPPSCAADWSRIELEYKTFSGWKDDVSKIRSFNELPENCKTYVKFIEGFVGVPIKWIGVGEDREALIVM
24 | >UniRef100_A0A139AVD1 Adenylosuccinate synthetase n=1 Tax=Gonapodya prolifera (strain JEL478) TaxID=1344416 RepID=A0A139AVD1_GONPJ
25 | -----------------------------------------ATG-------NKAVVVLGAQWGDEGKGKLVDILTQQADLVARCQGGNNAGHTIV-VDGVKFDFHMLPSGLLGaPSTVSLVGSGVVLHLPSFFEEVKKTESKGVSCANRLFVSDRCHLVFDLHQIVDGLKEGELAShkQEIGTTKKGIGPAYSSKASRGGVRVHHLiaPDFAEFESRFRQMAANKKRRYGDFPYD---VDAEVERYRQYRDLIRPYVVDSVTYVHKALQEGKRVLVEGANAVMLDIDFGTFPYVTSSNTTIGGVCTGLGLPPKSIGKVIGVVKAYTTRVGAGPFPTEQLNEVGEHLQTVGAEFGVTTGRKRRCGWLDAAVLRWSHMINGYDSINLTKLDILDGLPTLRIGIAYKHrATGQVYETFPADLHLLEECDVIYEELPGWKESIGGCKSWDALPENARKYVERIEQLVGVNVEYIGVGVSRDSMITK
26 | ```
27 | 
28 | ```{caution}
29 | The `finetune` method and `pass_mode=masked` are not available for MSA transformers.
30 | ```
31 | 
32 | ## How to use msa-transformers?
33 | 
34 | The library supports `esm_msa1_t12_100M_UR50S` backend which is based on the [MSA Transformers papers](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1.full.pdf).
35 | 
36 | Instead of passing a list of sequences or the path of a fasta file, you can pass the path to the folder where the .a3m files are stored.
37 | 
38 | ```python
39 | from biotransformers import BioTransformers
40 | 
41 | msa_folder = "msa_folder"
42 | bio_trans = BioTransformers("esm_msa1_t12_100M_UR50S",num_gpus=1)
43 | msa_embeddings = bio_trans.compute_embeddings(sequences=msa_folder, pool_mode=("cls","mean"), n_seqs_msa=128)
44 | ```
45 | 
46 | As an MSA is composed of multiple sequences, the results have an extra-dimension, which corresponds to all the aligned sequences.
47 | 
48 | ```python
49 | msa_embeddings['cls'].shape
50 | ```
51 | 
52 | If 100 MSA files are in the folder, we have the following dimensions for the embeddings. We take the `<CLS>` token of each of the 128 sequences in the files.
53 | 
54 | ```python
55 | >> (100, 128, 768)
56 | ```
57 | 
58 | ```{caution}
59 | All files in the msa folder must have at least `n_seqs_msa` sequences in each MSA. If you want to remove files with fewer than `n_seqs_msa` sequences, you can use the `biotransformers.utils.msa_utils.msa_to_remove` function.
60 | ```
61 | 
62 | ```python
63 | msa_to_remove("data_msa_sample/", n_seq=128)
64 | ```
65 | 
66 | ```python
67 | >>
68 | 
69 | 3/8 have insufficient number of sequences in MSA.
70 | ['data/data_msa_sample/seq130_swissprot.a3m',
71 |  'data/data_msa_sample/seq109_swissprot.a3m',
72 |  'data/data_msa_sample/seq124_swissprot.a3m']
73 | ```
74 | 


--------------------------------------------------------------------------------
/docs/source/documentation/multi_gpus.md:
--------------------------------------------------------------------------------
 1 | # Multi-gpus
 2 | 
 3 | ```{note}
 4 | These changes have been introduced in ``bio-transformers`` v0.0.11.
 5 | ```
 6 | 
 7 | The use of ``torch.nn.DataParallel`` is strongly [discouraged](https://pytorch.org/docs/stable/notes/cuda.html#cuda-nn-ddp-instead); as a consequence, ``bio-transformers`` relies on [ray](https://docs.ray.io/en/master/?badge=master#) to distribute the compute on multiple GPUs. This parallelization scales far better, with performance increasing with the number of GPUs.
 8 | 
 9 | Ray is used only when ``num_gpus>1``. See the example below:
10 | 
11 | ```{important}
12 | Note that ray parallelization is only used for inference functions. The `finetune` method uses pytorch-lightning with its built-in functions to train a model.
13 | ```
14 | 
15 | ```python
16 | from biotransformers import BioTransformers
17 | import ray
18 | 
19 | ray.init()
20 | 
21 | sequences = [...]
22 | bio_trans = BioTransformers("esm1b_t33_650M_UR50S",num_gpus=4)
23 | embeddings = bio_trans.compute_embeddings(sequences, pool_mode=("cls","mean"), batch_size=8)
24 | ```
25 | 
26 | ```{note}
27 | You don't have to use ``ray.init()`` when num_gpus=1
28 | ```
29 | 
30 | ## Configure GPU environment variable
31 | 
32 | Sometimes it can be useful to specify which GPU you want to use. It can be done in the terminal or at the beginning of the script. You just have to export the GPU index you want to use.
33 | 
34 | For example, if you have 8 GPUs but you just want to use 3 of them (0,5,6):
35 | 
36 | ```bash
37 | export CUDA_VISIBLE_DEVICES="0,5,6"
38 | ```
39 | 
40 | or
41 | 
42 | ```python
43 | import os
44 | os.environ["CUDA_VISIBLE_DEVICES"]="0,5,6"
45 | ```
46 | 


--------------------------------------------------------------------------------
/docs/source/getting_started/install.rst:
--------------------------------------------------------------------------------
 1 | ============
 2 | Installation
 3 | ============
 4 | 
 5 | Bio-transformers can be installed in Python 3.7 and external python dependencies are mainly defined in `requirements`_.
 6 | There are multiple different methods to install Bio-transformers:
 7 | 
 8 | 1. Clone `Bio-transformers`_ and create a virtual environment using `Anaconda`_ / `Miniconda`_ (**recommended**).
 9 | 2. Clone `Bio-transformers`_ and build a docker image using the provided docker file. (**not implemented**)
10 | 3. Install directly from PyPI release without cloning `Bio-transformers`_.
11 | 
12 | 
13 | 
14 | Install torch/cuda
15 | ------------------
16 | 
17 | .. WARNING:: ``bio-transformers`` doesn't manage the installation of cuda toolkit and torch gpu version.
18 | 
19 | If you want to find a specific version of torch based on your CUDA setup, please refer to this `page <https://pytorch.org/get-started/previous-versions/>`_
20 | 
21 | The Dockerfile provided in the `github repository <https://github.com/DeepChainBio/bio-transformers>`_ relies on :
22 |     - `pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html`
23 | 
24 | Install in conda environment
25 | ----------------------------
26 | The recommended method is to install Bio-transformers in a dedicated virtual
27 | environment using `Anaconda`_ / `Miniconda`_.
28 | 
29 | 
30 | .. code:: bash
31 | 
32 |     conda create --name bio-transformers python=3.7 -y
33 |     conda activate bio-transformers
34 |     pip install bio-transformers
35 | 
36 | .. _Quick Start: quick_start.html
37 | .. _Anaconda: https://docs.anaconda.com/anaconda/install
38 | .. _Miniconda: https://docs.conda.io/en/latest/miniconda.html
39 | .. _Bio-transformers: https://github.com/DeepChainBio/bio-transformers
40 | .. _requirements: https://github.com/DeepChainBio/bio-transformers/blob/main/requirements.txt
41 | 


--------------------------------------------------------------------------------
/docs/source/getting_started/quick_start.md:
--------------------------------------------------------------------------------
 1 | # Quick Start
 2 | 
 3 | ## Display available backend
 4 | 
 5 | ```python
 6 | from biotransformers import BioTransformers
 7 | BioTransformers.list_backend()
 8 | 
 9 | >>
10 |     *   esm1_t34_670M_UR100
11 |     *   esm1_t6_43M_UR50S
12 |     *   esm1b_t33_650M_UR50S
13 |     *   esm_msa1_t12_100M_UR50S
14 |     *   protbert
15 |     *   protbert_bfd
16 | ```
17 | 
18 | ## Compute embeddings on gpu
19 | 
20 | Please refer to the [multi-gpus section](https://bio-transformers.readthedocs.io/en/develop/documentation/multi_gpus.html) to have a full understanding of the functionality.
21 | 
22 | ```python
23 | import ray
24 | 
25 | sequences = [
26 |         "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
27 |         "RSKEPVSGFDLIRDHISQTGMPPTRAEIARSKEPVSGRKGVIEIVSGASRGIRLLQEE",
28 |         "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
29 |         "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
30 |     ]
31 | 
32 | ray.init()
33 | bio_trans = BioTransformers(backend="protbert", num_gpus=4)
34 | embeddings = bio_trans.compute_embeddings(sequences, pool_mode=('cls','mean'))
35 | 
36 | cls_emb = embeddings['cls']
37 | mean_emb = embeddings['mean']
38 | ```
39 | 
40 | where:
41 | 
42 | - pool_mode: kind of aggregation functions to be used. 'cls' returns the `<CLS>` token embedding used for classification. 'mean' computes the mean over all the tokens of a sequence.
43 | 


--------------------------------------------------------------------------------
/docs/source/images/bio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/images/bio.png


--------------------------------------------------------------------------------
/docs/source/images/score_mutation.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepChainBio/bio-transformers/a4bf57164464f5d763129e6008dbf06263287972/docs/source/images/score_mutation.jpeg


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Bio-transformers : Documentation and Tutorial
 3 | =============================================
 4 | 
 5 | .. Caution:: Bio-transformers introduces breaking changes replacing ``device`` and ``multi_gpu`` arguments by ``num_gpus``. Multi-GPU inference is now managed with ``ray``, which leverages the full computational capacity of each GPU in contrast to ``torch.DataParallel``
 6 | 
 7 | bio-transformers is a python wrapper on top of the ESM/Protbert models,
 8 | which are Transformer protein language models, trained on millions of proteins and used to predict embeddings.
 9 | This package provides other functionalities that you can use to build apps thanks to `deepchain-apps <https://deepchain-apps.readthedocs.io/en/latest/index.html>`_
10 | 
11 | Features
12 | --------
13 | 
14 | .. Note:: Bio-transformers now uses `Ray <https://docs.ray.io/en/master/?badge=master#>`_ to manage multi-GPU inference.
15 | 
16 | Bio-transformers extends and simplifies workflows for manipulating amino acids sequences with Pytorch, and can be
17 | used to test several pre-trained transformers models without taking into account the syntax specificity of different models.
18 | 
19 | The main features are:
20 |    - ``compute_loglikelihood``
21 |    - ``compute_probabilities``
22 |    - ``compute_embeddings``
23 |    - ``compute_accuracy``
24 |    - ``finetune``
25 | 
26 | Our development and all related work involved in the project is public,
27 | and released under the Apache 2.0 license.
28 | 
29 | 
30 | Contributors
31 | ------------
32 | 
33 | Bio-transformers is a package belonging to the DeepChainBio repository, maintained by a team of
34 | developers and researchers at Instadeep.
35 | 
36 | 
37 | .. toctree::
38 |    :hidden:
39 |    :maxdepth: 2
40 |    :caption: Getting Started
41 | 
42 |    getting_started/install
43 |    getting_started/quick_start
44 | 
45 | .. toctree::
46 |    :hidden:
47 |    :maxdepth: 0
48 |    :caption: Documentation
49 | 
50 |    documentation/course
51 |    documentation/logging
52 |    documentation/multi_gpus
53 |    documentation/msa
54 | 
55 | .. toctree::
56 |    :hidden:
57 |    :maxdepth: 0
58 |    :caption: Tutorial
59 | 
60 |    tutorial/loglikelihood
61 |    tutorial/embeddings
62 |    tutorial/finetuning
63 |    tutorial/mutations_score.rst
64 | 
65 | .. toctree::
66 |    :hidden:
67 |    :maxdepth: 2
68 |    :caption: Api reference
69 | 
70 |    api/biotransformers
71 | 
72 | .. toctree::
73 |    :hidden:
74 |    :maxdepth: 0
75 |    :caption: Contributing
76 | 
77 |    contributing/CONTRIBUTING
78 | 
79 | .. _documentation: documentation/course.html
80 | 


--------------------------------------------------------------------------------
/docs/source/tutorial/embeddings.md:
--------------------------------------------------------------------------------
 1 | # Embeddings
 2 | 
 3 | The library allows you to easily compute embeddings with a specific model in the backend.
 4 | 
 5 | ```python
 6 | from biotransformers import BioTransformers
 7 | 
 8 | sequences = [...]
 9 | bio_trans = BioTransformers("esm1b_t33_650M_UR50S",num_gpus=1)
10 | embeddings = bio_trans.compute_embeddings(sequences, pool_mode=("cls","mean"), batch_size=8)
11 | ```
12 | 
13 | The `pool_mode` argument supports 3 modes:
14 | 
15 | - `cls` : return the `<CLS>` token embedding in the sequence.
16 | - `mean` : if a sequence has shape (num_token, embedding_size), the num_token dimension is averaged and the embedding has shape (embedding_size,)
17 | - `full` : no pooling function is applied; all the embeddings for each sequence are returned.
18 | 
19 | ## Multi-gpu inference
20 | 
21 | If you want to run the inference on several GPUs, you have to initialize ray as below to instantiate multiple workers.
22 | 
23 | ```{tip}
24 | batch_size corresponds to the number of sequences that you want to distribute on each GPU.
25 | ```
26 | 
27 | ```python
28 | from biotransformers import BioTransformers
29 | import ray
30 | 
31 | ray.init()
32 | sequences = [...]
33 | bio_trans = BioTransformers("esm1b_t33_650M_UR50S",num_gpus=4)
34 | embeddings = bio_trans.compute_embeddings(sequences, pool_mode=("cls","mean"), batch_size=8)
35 | ```
36 | 


--------------------------------------------------------------------------------
/docs/source/tutorial/finetuning.md:
--------------------------------------------------------------------------------
  1 | # Finetuning
  2 | 
  3 | ## How to finetune a model?
  4 | 
  5 | `bio-transformers` uses pytorch-lightning to easily load a pre-trained model and finetune it on your own datasets. The `finetune` method automatically scales to your visible GPUs to train in parallel thanks to the different accelerators.
  6 | 
  7 | It is strongly recommended to use the `DDP` accelerator for training : [ddp](https://pytorch.org/docs/stable/notes/ddp.html). You should know that `DDP` will launch several python instances, as a consequence, a model should be finetuned in a separate script, and not be mixed with inference function like `compute_loglikelihood` or `compute_embeddings` to avoid GPU conflicts.
  8 | 
  9 | The model is finetuned by randomly masking a proportion of the amino acids in each sequence, as is commonly done in most state-of-the-art papers. By default, 15% of amino acids will be masked.
 10 | 
 11 | ```{caution}
 12 | This method is designed to be run on GPU; please make sure you have a proper CUDA installation. Refer to the installation section for more information.
 13 | ```
 14 | 
 15 | Do not train a model with the `DDP` **accelerator** in a notebook. Do not mix training and inference functions like `compute_accuracy` or `compute_loglikelihood` in the same script, except with the `DP` accelerator.
 16 |  With `DDP`, load the finetuned model in a separate script like below.
 17 | 
 18 | ```python
 19 | from biotransformers import BioTransformers
 20 | 
 21 | bio_trans = BioTransformers("esm1_t6_43M_UR50S", num_gpus=1)
 22 | bio_trans.load_model("logs/finetune_masked/version_X/esm1_t6_43M_UR50S_finetuned.pt")
 23 | acc_after = bio_trans.compute_accuracy(..., batch_size=32)
 24 | ```
 25 | 
 26 | ## Parameters
 27 | 
 28 | The function can handle a fasta file or a list of sequences directly:
 29 | 
 30 | - **train_sequences**: Can be a list of sequences or the path of a fasta file with SeqRecords.
 31 | 
 32 | Five arguments are important for the training:
 33 | 
 34 | - **lr**: the default learning rate (keep it low : <5e-4)
 35 | - **warmup_updates**:  the number of steps (optimizer steps, not epochs) during which the learning rate increases from **warmup_init_lr** to **lr**.
 36 | - **epochs** :  number of epoch for training. Defaults to 10.
 37 | - **batch_size** :  This size is only used internally to compute the **accumulate_grad_batches** for gradient accumulation (TO BE UPDATED). The **toks_per_batch** will dynamically determine the number of sequences in a batch, in order to avoid GPU saturation.
 38 | - **acc_batch_size** : Number of batches to consider before computing the gradient.
 39 | 
 40 | Three arguments allow customizing the masking function used for building the training dataset:
 41 | 
 42 | - **masking_ratio** : ratio of tokens to be masked. Defaults to 0.025.
 43 | - **random_token_prob** : the probability that the chosen token is replaced with a random token.
 44 | - **masking_prob**: the probability that the chosen token is replaced with a mask token.
 45 | 
 46 | All the results will be saved in logs directory:
 47 | 
 48 | - **logs_save_dir**: Defaults directory to logs.
 49 | - **logs_name_exp**: Name of the experience in the logs.
 50 | - **checkpoint**: Path to a checkpoint file to restore training session.
 51 | - **save_last_checkpoint**: Save last checkpoint and 2 best trainings models
 52 | to restore the training session. Take a large amount of time and memory.
 53 | 
 54 | ## Example : training script
 55 | 
 56 | Training on some swissprot sequences. Training only works on GPU.
 57 | 
 58 | ```python
 59 | import biodatasets
 60 | import numpy as np
 61 | from biotransformers import BioTransformers
 62 | import ray
 63 | 
 64 | data = biodatasets.load_dataset("swissProt")
 65 | X, y = data.to_npy_arrays(input_names=["sequence"])
 66 | X = X[0]
 67 | 
 68 | # Train on small sequence
 69 | length = np.array(list(map(len, X))) < 200
 70 | train_seq = X[length][:15000]
 71 | 
 72 | ray.init()
 73 | bio_trans = BioTransformers("esm1_t6_43M_UR50S", num_gpus=4)
 74 | 
 75 | bio_trans.finetune(
 76 |     train_seq,
 77 |     lr=1.0e-5,
 78 |     warmup_init_lr=1e-7,
 79 |     toks_per_batch=2000,
 80 |     epochs=20,
 81 |     batch_size=16,
 82 |     acc_batch_size=256,
 83 |     warmup_updates=1024,
 84 |     accelerator="ddp",
 85 |     checkpoint=None,
 86 |     save_last_checkpoint=False,
 87 | )
 88 | ```
 89 | 
 90 | ## Example : evaluation script
 91 | 
 92 | You can easily assess the quality of your finetuning by using the provided functions such as `compute_accuracy`.
 93 | 
 94 | ```python
 95 | import biodatasets
 96 | import numpy as np
 97 | from biotransformers import BioTransformers
 98 | import ray
 99 | 
100 | 
101 | data = biodatasets.load_dataset("swissProt")
102 | X, y = data.to_npy_arrays(input_names=["sequence"])
103 | X = X[0]
104 | 
105 | # Train sequence with length less than 200 AA
106 | # Test on sequence that was not used for training.
107 | length = np.array(list(map(len, X))) < 200
108 | train_seq = X[length][15000:20000]
109 | 
110 | ray.init()
111 | bio_trans = BioTransformers("esm1_t6_43M_UR50S", num_gpus=4)
112 | acc_before = bio_trans.compute_accuracy(train_seq, batch_size=32)
113 | print(f"Accuracy before finetuning : {acc_before}")
114 | ```
115 | 
116 | ```python
117 | >> Accuracy before finetuning : 0.46
118 | ```
119 | 
120 | ```python
121 | bio_trans.load_model("logs/finetune_masked/version_X/esm1_t6_43M_UR50S_finetuned.pt")
122 | acc_after = bio_trans.compute_accuracy(train_seq, batch_size=32)
123 | print(f"Accuracy after finetuning : {acc_after}")
124 | ```
125 | 
126 | ```python
127 | >> Accuracy after finetuning : 0.76
128 | ```
129 | 


--------------------------------------------------------------------------------
/docs/source/tutorial/loglikelihood.md:
--------------------------------------------------------------------------------
 1 | # Loglikelihood
 2 | 
 3 | The protein loglikelihood is a metric which estimates the joint probability of
 4 | observing a given sequence of amino-acids. The idea behind such an estimator is to approximate the
 5 | probability that a mutated protein will be “natural”, and can effectively be produced by a cell.
 6 | 
 7 | These metrics rely on transformers language model. These models are trained to predict a “masked” amino-acid in a sequence.
 8 | As a consequence, they can provide us an estimate of the probability of observing an amino-acid given the “context” (the surrounding amino-acids).
 9 | By multiplying individual probabilities computed for a given amino-acid given its context, we obtain a pseudo-likelihood, which can be a candidate estimator to approximate a sequence stability.
10 | 
11 | ```python
12 | from biotransformers import BioTransformers
13 | import ray
14 | 
15 | ray.init()
16 | bio_trans = BioTransformers(backend="protbert",num_gpus=2)
17 | 
18 | sequences = [
19 |         "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
20 |         "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE",
21 |         "RQQEVFDLIQQEVFDLIQQEVFIRDAQRLGFRQQEVFDLIRDHISQTGMPPTRAALARKGVIEIVSGASRGIRLLQEE",
22 |         "QEEVFDLIQQEVFDLIRDHISQTGMPPTRAMPPTRAEIAQQARKGVIEIVSGASRGIRLLQEE"
23 |     ]
24 | 
25 | loglikelihood = bio_trans.compute_loglikelihood(sequences, batch_size=2)
26 | ```
27 | 
28 | ## Different pass mode
29 | 
30 | For each provided method, you can do the compute in a ``forward`` mode or in a ``masked`` mode. The last one is
31 | longer as we have to mask and compute the probabilities for each masked amino acid.
32 | 
33 | ```python
34 | embeddings = bio_trans.compute_loglikelihood(sequences, pass_mode="masked", batch_size=2)
35 | ```
36 | 
37 | ## Tokens list
38 | 
39 | The method makes it possible to compute the loglikelihood using only a provided list of amino acids.
40 | 
41 | ```python
42 | UNNATURAL = list("ACDEFGHIKLMNPQRSTVWY") + ["-"]
43 | loglikelihood = bio_trans.compute_loglikelihood(sequences, tokens_list=UNNATURAL)
44 | ```
45 | 
46 | ## Probabilities
47 | 
48 | The ``compute_loglikelihood`` function relies on the ``compute_probabilities`` function.
49 | 
50 | This last function computes, for each amino acid position in the sequence, a dictionary where keys represent the natural amino acids and values the probabilities of being at that position.
51 | 
52 | For example:
53 | 
54 | ```python
55 | from biotransformers import BioTransformers
56 | 
57 | bio_trans = BioTransformers(backend="protbert",num_gpus=1)
58 | 
59 | sequence = ["MKT"]
60 | probabilities = bio_trans.compute_probabilities(sequence, batch_size=1)
61 | 
62 | print(probabilities)
63 | ```
64 | 
65 | ```python
66 | >>
67 | [{0: {'L': 0.06550145316598321, 'A': 0.021559458419220974, 'G': 0.029741129950678777, 'V': 0.0329506745800003, 'E': 0.03389950500319548, 'S': 0.10401323529266542, 'I': 0.04399518228657259, 'K': 0.1534323153578508, 'R': 0.08616676439914424, 'D': 0.010983572050921635, 'T': 0.04474224433539647, 'P': 0.01569993609938641, 'N': 0.027836286891774507, 'Q': 0.037557728840479546, 'F': 0.020606235301203788, 'Y': 0.01243454224917041, 'M': 0.21207524064947852, 'H': 0.015025274369047291, 'C': 0.013031914446968728, 'W': 0.018747306310860856},
68 | 
69 |  1: {'L': 0.03176897920072879, 'A': 0.013685848027567242, 'G': 0.01709074216275199, 'V': 0.018786360542915624, 'E': 0.016411511761942357, 'S': 0.02157161007259761, 'I': 0.019570515195473124, 'K': 0.026416232407458887, 'R': 0.021930249525274396, 'D': 0.008674132240173953, 'T': 0.018818536773492975, 'P': 0.010970933229272459, 'N': 0.01349720693939123, 'Q': 0.014703372924399499, 'F': 0.010715260172378251, 'Y': 0.00931640096204737, 'M': 0.7010288899792522, 'H': 0.009361870192728095, 'C': 0.007965577806480653, 'W': 0.007715769883673336},
70 | 
71 |   2: {'L': 0.07383247230045219, 'A': 0.03555995965068629, 'G': 0.03454727111803637, 'V': 0.043748770514437235, 'E': 0.04069625263096508, 'S': 0.06924489597284503, 'I': 0.046173613390643166, 'K': 0.2299759248798167, 'R': 0.06749564661032614, 'D': 0.0224069594369746, 'T': 0.03940009938504622, 'P': 0.02301058203142933, 'N': 0.03441775848661052, 'Q': 0.04373499771477881, 'F': 0.028093375324345762, 'Y': 0.02461900744880924, 'M': 0.025029056199102815, 'H': 0.0818692944874724, 'C': 0.016498739542946495, 'W': 0.01964532287427556}}]
72 | ```
73 | 
74 | For each position (0, 1, 2, which correspond to amino acids M, K, T), we have a dictionary of probabilities for each natural amino acid.
75 | 


--------------------------------------------------------------------------------
/docs/source/tutorial/mutations_score.rst:
--------------------------------------------------------------------------------
 1 | Mutation score
 2 | ==============
 3 | 
 4 | To compute the ``mutation score``, for a given mutation we can consider the amino acid in the wildtype protein as a reference state, and compare the probability assigned to the mutated amino acid with the probability assigned to the wildtype.
 5 | In practice, at each mutated position, we introduce a mask token and record the model’s predicted probabilities of the tokens at that position. This metric is described in the `paper <https://www.biorxiv.org/content/10.1101/2021.07.09.450648v1.full.pdf>`_
 6 | 
 7 | We call it the ``masked marginal probability``, which is described below:
 8 | 
 9 | .. image:: ../images/score_mutation.jpeg
10 |     :width: 400px
11 |     :align: center
12 | 
13 | 
14 | compute_mutation_score function
15 | -------------------------------
16 | 
17 | To compute this metric, we need to provide two things:
18 | 
19 | - **sequences**: a list of sequences to score
20 | 
21 | - **mutations**: a list of each sequence's mutations. A sequence's mutations can be composed of multiple single mutations.
22 | 
23 | Example
24 | -------
25 | 
26 | 
27 | .. code-block:: python
28 | 
29 |     from biotransformers import BioTransformers
30 | 
31 | 
32 |     sequences = ["MAPSRKFFVGGNWKMNVVCAPPTAYIDFARQKLDPKI",
33 |                  "AVAAQNCYKVTNGAFTGEISPGMIKDCGATWVVLGH",
34 |                  "GRKQSLGELIGTLNAAKVPADTE"]
35 | 
36 |     mutations = [["M1P","K6F","N12G"],["A4V"],["A15L","V18A"]]
37 |     bio_trans.compute_mutation_score(sequences,mutations)
38 | 
39 | .. code-block:: bash
40 | 
41 |     >> [-4.07569046318531, -0.848480224609375, 0.4472615122795105]
42 | 
43 | To create a mutations list, you need to respect the mutations format above in the example:
44 | 
45 | - format : `Native_aaPositionMutated_aa`.
46 | 
47 | .. important:: Your position index for mutation must start at 1.
48 | 


--------------------------------------------------------------------------------
/environment_dev.yaml:
--------------------------------------------------------------------------------
 1 | name: bio-transformers-dev
 2 | channels:
 3 |   #- conda-forge
 4 |   - defaults
 5 | dependencies:
 6 |   - python=3.7
 7 |   - pip=20.0.2
 8 |   - pip:
 9 |     - sphinx==4.0.2
10 |     - sphinx-notfound-page==0.6
11 |     #- sphinx_rtd_theme==0.5.2
12 |     - sphinx-book-theme
13 |     - sphinx_tabs==2.1.0
14 |     - pre-commit==2.2.0
15 |     - biopython==1.78
16 |     - ray==1.4.0
17 |     - fair-esm==0.3.1
18 |     - numpy>=1.16
19 |     - pandas>=1.2.3
20 |     - pytest==6.2.4
21 |     - pytest-cov==2.12.0
22 |     - pytorch_lightning>=1.3.3
23 |     - scikit-learn>=0.22.2
24 |     - torch>=1.7.0,<1.9
25 |     - torchmetrics==0.3.2
26 |     - tqdm>=4.60.0
27 |     - transformers>=4.6.1,<4.7
28 |     - myst-parser==0.15.1
29 |     - myst-nb==0.12.3
30 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | biopython>=1.78,<2
 2 | fair-esm==0.4.0
 3 | numpy>=1.16,<1.19
 4 | pandas>=1.2.3
 5 | pytorch_lightning>=1.3.3
 6 | ray>=1.4.0
 7 | scikit-learn>=0.22.2
 8 | torch>=1.7.0,<1.9
 9 | torchmetrics>=0.3.2,<0.5
10 | tqdm>=4.60.0
11 | transformers>=4.8,<4.9
12 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | """Setup file"""
 2 | 
 3 | import pathlib
 4 | from typing import List
 5 | 
 6 | from setuptools import find_packages, setup
 7 | 
 8 | HERE = pathlib.Path(__file__).parent
 9 | README = (HERE / "README.md").read_text(encoding="utf-8")
10 | 
11 | with open("biotransformers/version.py") as v:
12 |     exec(v.readline())
13 | 
14 | 
def read_requirements() -> List:
    """Return the requirement specifiers listed in ``requirements.txt``.

    The file is resolved relative to the current working directory and is
    opened read-only, so a read-only checkout or source tree is enough.
    Blank lines and ``#`` comment lines are skipped so that they never end
    up in ``install_requires``.

    Returns:
        One whitespace-stripped requirement string per remaining line, in
        file order.

    Raises:
        OSError: if ``requirements.txt`` cannot be opened for reading.
    """
    # Plain read mode: "r+" would demand write permission just to read.
    with open("requirements.txt", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return [
            line
            for line in stripped_lines
            if line and not line.startswith("#")
        ]
20 | 
21 | 
# One-line summary handed to setup(description=...).
DESCRIPTION = (
    "Wrapper on top of ESM/Protbert model "
    "in order to easily work with protein embedding"
)


def make_install():
    """Call ``setup`` with the metadata of the ``bio-transformers`` package.

    Returns:
        Whatever the ``setup`` call returns.
    """
    trove_classifiers = [
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3.7",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "Topic :: Software Development",
    ]

    # Keyword order mirrors the order in which the values are evaluated.
    setup_kwargs = dict(
        name="bio-transformers",
        license="Apache-2.0",
        version=VERSION,  # noqa
        description=DESCRIPTION,
        author="Instadeep",
        long_description=README,
        long_description_content_type="text/markdown",
        author_email="a.delfosse@instadeep.com",
        # NOTE(review): "test" looks like it only matches a top-level package
        # named exactly "test"; if the intent was to leave out
        # biotransformers.tests, the pattern needs changing - confirm.
        packages=find_packages(exclude=["test"]),
        classifiers=trove_classifiers,
        install_requires=read_requirements(),
        include_package_data=True,
        zip_safe=False,
        python_requires=">=3.7",
    )

    return setup(**setup_kwargs)


# Only install when run as a script, not when imported.
if __name__ == "__main__":
    make_install()
57 | 


--------------------------------------------------------------------------------