├── assets
    └── overview.png
├── requirements.txt
├── conda.yml
├── LICENSE
├── .gitignore
├── .github
    └── workflows
    │   └── build.yml
└── README.md


/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaccMann/paccmann_sarscov2/HEAD/assets/overview.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy>=1.14.3
 2 | scipy>=1.3.1
 3 | torch>=1.3.0
 4 | pytoda @ git+https://github.com/PaccMann/paccmann_datasets@0.1.1
 5 | paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
 6 | toxsmi @ git+https://github.com/PaccMann/toxsmi@0.0.2
 7 | paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@sarscov2
 8 | paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@sarscov2
 9 | paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@sarscov2
10 | absl-py>=0.8.1
11 | 


--------------------------------------------------------------------------------
/conda.yml:
--------------------------------------------------------------------------------
 1 | name: paccmann_sarscov2
 2 | channels:
 3 |   - https://conda.anaconda.org/rdkit
 4 | dependencies:
 5 |   - rdkit=2019.03.1
 6 |   - python>=3.7,<3.8
 7 |   - pip>=19.1,<20.3
 8 |   - pip:
 9 |     - numpy>=1.14.3
10 |     - scipy>=1.3.1
11 |     - torch>=1.3.0
12 |     - pytoda @ git+https://github.com/PaccMann/paccmann_datasets@0.1.1
13 |     - paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
14 |     - toxsmi @ git+https://github.com/PaccMann/toxsmi@0.0.2
15 |     - paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@sarscov2
16 |     - paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@sarscov2
17 |     - paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@sarscov2
18 |     - absl-py>=0.8.1
19 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2020 Jannis Born, Matteo Manica, Joris Cadow, Greta Markert, Nil Adell Mill, Modestas Filipavicius, Nikita Janakarajan
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | *.DS_Store
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | .hypothesis/
 50 | .pytest_cache/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | db.sqlite3
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # Jupyter Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # pyenv
 78 | .python-version
 79 | 
 80 | # celery beat schedule file
 81 | celerybeat-schedule
 82 | 
 83 | # SageMath parsed files
 84 | *.sage.py
 85 | 
 86 | # Environments
 87 | .env
 88 | .venv
 89 | env/
 90 | venv/
 91 | ENV/
 92 | env.bak/
 93 | venv.bak/
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | .spyproject
 98 | 
 99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 
108 | /results
109 | 
110 | # Pipeline steps
111 | 
112 | # data folder
113 | data
114 | 
115 | # source code
116 | code
117 | 
118 | # models
119 | models
120 | biased_models
121 | logs


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: build
 3 | on: [push]
 4 | jobs:
 5 |   conda-tests:
 6 |     name: Test with conda (${{ matrix.os }})
 7 |     runs-on: ${{ matrix.os }}
 8 |     continue-on-error: ${{ matrix.experimental }}
 9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         include:
13 |           - os: ubuntu-18.04
14 |             pip_cache_path: ~/.cache/pip
15 |             experimental: false
16 |           - os: macos-latest
17 |             pip_cache_path: ~/Library/Caches/pip
18 |             experimental: false
19 |     defaults:
20 |       run:
21 |         shell: bash -l {0}  # For conda
22 |     env:
23 |       # Increase this value to reset cache if conda.yml and requirements.txt
24 |       # have not changed
25 |       CACHE_NUMBER: 0
26 |     steps:
27 |       - uses: actions/checkout@v2
28 |       - name: Checkout and setup python
29 |         uses: actions/setup-python@v2
30 |         with:
31 |           python-version: 3.6
32 |           architecture: 'x64'
33 | 
34 |       - name: Cache conda
35 |         uses: actions/cache@v2
36 |         with:
37 |           path: ~/conda_pkgs_dir  # from: conda-incubator/setup-miniconda@v2
38 |           key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
39 |             hashFiles('conda.yml') }}
40 | 
41 |       - name: Cache pip
42 |         uses: actions/cache@v2
43 |         with:
44 |           path: ${{ matrix.pip_cache_path }}
45 |           key: ${{ runner.os }}-pip--${{ env.CACHE_NUMBER }}-${{
46 |             hashFiles('requirements.txt') }}
47 | 
48 |       - name: Conda environment setup
49 |         uses: conda-incubator/setup-miniconda@v2
50 |         with:
51 |           activate-environment: paccmann_sarscov2
52 |           environment-file: conda.yml
53 |           auto-activate-base: false
54 |           use-only-tar-bz2: true  # This needs to be set for proper caching
55 |           auto-update-conda: true  # Required for windows for `use-only-tar-bz2`
56 | 
57 |       - name: Install dependencies and test code 
58 |         run: |
59 |           git clone --branch sarscov2 https://github.com/PaccMann/paccmann_predictor 
60 |           git clone --branch 0.0.2 https://github.com/PaccMann/toxsmi
61 |           git clone --branch sarscov2 https://github.com/PaccMann/paccmann_omics 
62 |           git clone --branch sarscov2 https://github.com/PaccMann/paccmann_chemistry 
63 |           git clone --branch sarscov2 https://github.com/PaccMann/paccmann_generator
64 |           pip3 install --no-cache-dir -r requirements.txt
65 |           python3 -c "import pytoda"
66 |           python3 -c "import paccmann_predictor"
67 |           python3 -c "import toxsmi"
68 |           python3 -c "import paccmann_omics"
69 |           python3 -c "import paccmann_chemistry"
70 |           python3 -c "import paccmann_generator"
71 |           python3 paccmann_predictor/examples/affinity/train_affinity.py -h
72 |           python3 toxsmi/scripts/train_tox.py -h
73 |           python3 paccmann_omics/examples/encoded_proteins/train_protein_encoding_vae.py -h
74 |           python3 paccmann_chemistry/examples/train_vae.py -h
75 |           python3 paccmann_generator/examples/affinity/train_conditional_generator.py -h
76 | 
77 |       - name: Send Slack notification
78 |         uses: 8398a7/action-slack@v2
79 |         if: always()
80 |         with:
81 |           status: ${{ job.status }}
82 |           text: "CI Build ${{ matrix.os }}"
83 |           author_name: ${{ github.actor }}
84 |         env:
85 |           SLACK_WEBHOOK_URL: ${{ secrets.SLACK_HOOK_URL }}
86 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Build Status](https://github.com/PaccMann/paccmann_sarscov2/actions/workflows/build.yml/badge.svg)](https://github.com/PaccMann/paccmann_sarscov2/actions/workflows/build.yml)
  2 | 
  3 | 
  4 | # paccmann_sarscov2
  5 | 
  6 | Pipeline to reproduce the results of the paper [Data-driven molecular design for discovery and synthesis of novel ligands: a case study on SARS-CoV-2](https://iopscience.iop.org/article/10.1088/2632-2153/abe808) (_Machine Learning: Science and Technology_, 2021). In that paper, we propose a de-novo molecular generative model for protein driven molecular design and bundle it with molecular retrosynthesis models to automatize all steps before the actual synthesis of a drug candidate.
  7 | 
  8 | ![Graphical abstract](https://github.com/PaccMann/paccmann_sarscov2/blob/master/assets/overview.png "Graphical abstract")
  9 | 
 10 | 
 11 | ## Description
 12 | 
 13 | In the repo we provide a conda environment and instructions to reproduce the pipeline described in the manuscript:
 14 | 
 15 | 1. Train a multimodal protein-compound interaction classifier, also known as the affinity predictor ([source code](https://github.com/PaccMann/paccmann_predictor))
 16 | 2. Train a toxicity predictor ([source code](https://github.com/PaccMann/toxsmi))
 17 | 3. Train a generative model for encoded proteins, also known as the ProteinVAE ([source code](https://github.com/PaccMann/paccmann_omics))
 18 | 4. Train a generative model for molecules, also known as the SELFIESVAE ([source code](https://github.com/PaccMann/paccmann_chemistry))
 19 | 5. Train PaccMann^RL on SARS-CoV-2 using the pretrained models from above ([source code](https://github.com/PaccMann/paccmann_generator))
 20 | 
 21 | 
 22 | **NOTE:** In the linked repositories, there are often multiple examples for training. For the use case of `paccmann_sarscov2`, relevant examples are named `affinity` or `encoded_proteins`.
 23 | 
 24 | ## Requirements
 25 | 
 26 | - `conda>=3.7`
 27 | - The following data from this [Box link](https://ibm.ent.box.com/v/paccmann-sarscov2-data).  
 28 |   See the respective `README.md` files for details on the data sources.  
 29 | - The git repos linked in the [previous section](#description)
 30 | 
 31 | <!-- **NOTE:** please refer to the [README.md](https://ibm.ent.box.com/v/paccmann-pytoda-data/file/548614344106) and to the manuscript for details on the datasets used and the preprocessing applied. -->
 32 | 
 33 | ## Setup
 34 | 
 35 | ### Install the environment
 36 | 
 37 | Create a conda environment:
 38 | 
 39 | ```sh
 40 | conda env create -f conda.yml
 41 | ```
 42 | 
 43 | Activate the environment:
 44 | 
 45 | ```sh
 46 | conda activate paccmann_sarscov2
 47 | ```
 48 | 
 49 | **NOTE:** On Ubuntu, you may now need to run the following to obtain a functional `RDKit` distribution: 
 50 | ```sh
 51 | sudo apt-get install libxrender1
 52 | ```
 53 | 
 54 | ### Download data and pretrained models
 55 | 
 56 | Download the [data](https://ibm.ent.box.com/v/paccmann-sarscov2-data) as reported in the [requirements section](#requirements).
 57 | From now on, we will assume that they are stored in the root of the repository in a folder called `data`, following this structure:
 58 | 
 59 | ```console
 60 | data
 61 | ├── pretraining
 62 | │   ├── ProteinVAE
 63 | │   ├── SELFIESVAE
 64 | │   ├── affinity_predictor
 65 | │   ├── language_models
 66 | │   └── toxicity_predictor
 67 | └── training
 68 | ```
 69 | This is around **6GB** of data, required for pretraining multiple models.
 70 | Also, running the full pipeline is computationally intensive, and it might not be feasible to run all the steps on a desktop or laptop computer.
 71 | 
 72 | For these reasons we also provide [pretrained models](https://ibm.ent.box.com/v/paccmann-sarscov2-models) (ca. 700MB) for download.
 73 | 
 74 | Once the download of the pretrained models is completed, the directory structure looks like this:
 75 | 
 76 | ```console
 77 | models
 78 | ├── ProteinVAE
 79 | ├── SELFIESVAE
 80 | ├── Tox21
 81 | └── affinity
 82 | ```
 83 | 
 84 | **NOTE:** no worries, the `data` and `models` folders are in the [.gitignore](./.gitignore).
 85 | 
 86 | ## PaccMann^RL on SARS-CoV-2
 87 | 
 88 | If you use the pretrained models to train the conditional generator, you only need the data under `data/training/` (8MB).
 89 | 
 90 | ### Clone the repo
 91 | 
 92 | To get the training script simply type this:
 93 | 
 94 | ```sh
 95 | mkdir code && cd code && \
 96 |   git clone --branch sarscov2 https://github.com/PaccMann/paccmann_generator && \
 97 |   cd ..
 98 | ```
 99 | The branch is given to ensure a version working with the provided conda environment.
100 | 
101 | **NOTE:** no worries, the `code` folder is in the [.gitignore](./.gitignore).
102 | 
103 | ### Running training
104 | 
105 | Running the training is as easy as running:
106 | 
107 | ``` console
108 | (paccmann_sarscov2) $ python ./code/paccmann_generator/examples/affinity/train_conditional_generator.py \
109 |     ./models/SELFIESVAE \
110 |     ./models/ProteinVAE \
111 |     ./models/affinity \
112 |     ./data/training/merged_sequence_encoding/uniprot_covid-19.csv \
113 |     ./code/paccmann_generator/examples/affinity/conditional_generator.json \
114 |     paccmann_sarscov2 \
115 |     35 \
116 |     ./data/training/unbiased_predictions \
117 |     --tox21_path ./models/Tox21
118 | ```
119 | 
120 | This will create a `biased_models` folder containing the conditional generators, biased for all provided proteins from [covid-19.uniprot.org](https://covid-19.uniprot.org/) except one; in this example, the held-out protein is ACE2_HUMAN. The biased generator generates compounds with a shifted distribution compared to unbiased predictions. Ideally, the model generalizes to ACE2_HUMAN and the biased compounds have overall higher affinity (to ACE2_HUMAN) **according to the affinity predictor**. See the pdf files in `biased_models/paccmann_sarscov2_35/results` to observe the effect at different stages of training.  
121 | 
122 | **NOTE:** no worries, the `biased_models` folder is in the [.gitignore](./.gitignore).
123 | 
124 | ## Pretraining pipeline
125 | 
126 | We also provide instructions and scripts to reproduce the full pretraining pipeline. Keep in mind that **we discourage you from running this on a desktop or laptop computer**.
127 | 
128 | Calling any of the scripts with the `-h` or `--help` flag will provide you with some information on the arguments.
129 | 
130 | **NOTE:** in the following, we assume a folder `models` has been created in the root of the repository.  
131 | 
132 | ### Clone the repos
133 | 
134 | To get the scripts to run each of the components, create a `code` folder and clone the repos. Simply type this:
135 | 
136 | ```sh
137 | mkdir code && cd code && \
138 |   git clone --branch sarscov2 https://github.com/PaccMann/paccmann_predictor && \
139 |   git clone --branch 0.0.2 https://github.com/PaccMann/toxsmi && \
140 |   git clone --branch sarscov2 https://github.com/PaccMann/paccmann_omics && \
141 |   git clone --branch sarscov2 https://github.com/PaccMann/paccmann_chemistry && \
142 |   git clone --branch sarscov2 https://github.com/PaccMann/paccmann_generator && \
143 |   cd ..
144 | ```
145 | The branch is given to ensure a version working with the provided conda environment.
146 | 
147 | ### affinity predictor
148 | ```console
149 | (paccmann_sarscov2) $ python ./code/paccmann_predictor/examples/affinity/train_affinity.py \
150 |     ./data/pretraining/affinity_predictor/filtered_train_binding_data.csv \
151 |     ./data/pretraining/affinity_predictor/filtered_val_binding_data.csv \
152 |     ./data/pretraining/affinity_predictor/sequences.smi \
153 |     ./data/pretraining/affinity_predictor/filtered_ligands.smi \
154 |     ./data/pretraining/language_models/smiles_language_chembl_gdsc_ccle_tox21_zinc_organdb_bindingdb.pkl \
155 |     ./data/pretraining/language_models/protein_language_bindingdb.pkl \
156 |     ./models/ \
157 |     ./code/paccmann_predictor/examples/affinity/affinity.json \
158 |     affinity
159 | ```
160 | 
161 | ### toxicity predictor
162 | ```console
163 | (paccmann_sarscov2) $ python ./code/toxsmi/scripts/train_tox.py \
164 |     ./data/pretraining/toxicity_predictor/tox21_train.csv \
165 |     ./data/pretraining/toxicity_predictor/tox21_test.csv \
166 |     ./data/pretraining/toxicity_predictor/tox21.smi \
167 |     ./data/pretraining/language_models/smiles_language_tox21.pkl \
168 |     ./models/ \
169 |     ./code/toxsmi/params/mca.json \
170 |     Tox21 \
171 |     --embedding_path ./data/pretraining/toxicity_predictor/smiles_vae_embeddings.pkl
172 | ```
173 | 
174 | ### protein VAE
175 | ``` console
176 | (paccmann_sarscov2) $ python ./code/paccmann_omics/examples/encoded_proteins/train_protein_encoding_vae.py \
177 |     ./data/pretraining/proteinVAE/tape_encoded/train_representation.csv \
178 |     ./data/pretraining/proteinVAE/tape_encoded/val_representation.csv \
179 |     ./models/ \
180 |     ./code/paccmann_omics/examples/encoded_proteins/protein_encoding_vae_params.json \
181 |     ProteinVAE
182 | ```
183 | 
184 | ### SELFIES VAE
185 | ``` console
186 | (paccmann_sarscov2) $ python ./code/paccmann_chemistry/examples/train_vae.py \
187 |     ./data/pretraining/SELFIESVAE/train_chembl_22_clean_1576904_sorted_std_final.smi \
188 |     ./data/pretraining/SELFIESVAE/test_chembl_22_clean_1576904_sorted_std_final.smi \
189 |     ./data/pretraining/language_models/selfies_language.pkl \
190 |     ./models/ \
191 |     ./code/paccmann_chemistry/examples/example_params.json \
192 |     SELFIESVAE
193 | ```
194 | 
195 | ## References
196 | 
197 | If you use `paccmann_sarscov2` in your projects, please cite the following:
198 | 
199 | ```bib
200 | @article{born2021datadriven,
201 |   author = {Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and Mill, Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and Cardinale, Antonio and Laino, Teodoro and {Rodr{\'{i}}guez Mart{\'{i}}nez}, Mar{\'{i}}a},
202 |   doi = {10.1088/2632-2153/abe808},
203 |   issn = {2632-2153},
204 |   journal = {Machine Learning: Science and Technology},
205 |   number = {2},
206 |   pages = {025024},
207 |   title = {{Data-driven molecular design for discovery and synthesis of novel ligands: a case study on SARS-CoV-2}},
208 |   url = {https://iopscience.iop.org/article/10.1088/2632-2153/abe808},
209 |   volume = {2},
210 |   year = {2021}
211 | }
212 | ```
213 | 


--------------------------------------------------------------------------------