├── requirements.txt
├── conda.yml
├── LICENSE
├── .gitignore
├── .github
│   └── workflows
│       └── build.yml
└── README.md


/requirements.txt:
--------------------------------------------------------------------------------
 1 | pytoda @ git+https://git@github.com/PaccMann/paccmann_datasets@0.0.1
 2 | paccmann_predictor @ git+https://git@github.com/PaccMann/paccmann_predictor@0.0.1
 3 | paccmann_omics @ git+https://git@github.com/PaccMann/paccmann_omics@0.0.1
 4 | paccmann_chemistry @ git+https://git@github.com/PaccMann/paccmann_chemistry@0.0.1
 5 | paccmann_generator @ git+https://git@github.com/PaccMann/paccmann_generator@0.0.1
 6 | numpy>=1.14.3
 7 | pandas>=0.24.1
 8 | torch>=1.0.1
 9 | matplotlib>=2.2.2
10 | seaborn>=0.9.0


--------------------------------------------------------------------------------
/conda.yml:
--------------------------------------------------------------------------------
 1 | name: paccmann_rl
 2 | channels:
 3 |   - https://conda.anaconda.org/rdkit
 4 | dependencies:
 5 |   - rdkit=2019.03.1
 6 |   - python>=3.6,<3.8
 7 |   - pip>=19.1
 8 |   - pip:
 9 |     - pytoda @ git+https://git@github.com/PaccMann/paccmann_datasets@0.0.1
10 |     - paccmann_predictor @ git+https://git@github.com/PaccMann/paccmann_predictor@0.0.1
11 |     - paccmann_omics @ git+https://git@github.com/PaccMann/paccmann_omics@0.0.1
12 |     - paccmann_chemistry @ git+https://git@github.com/PaccMann/paccmann_chemistry@0.0.1
13 |     - paccmann_generator @ git+https://git@github.com/PaccMann/paccmann_generator@0.0.1
14 |     - numpy>=1.14.3
15 |     - pandas>=0.24.1
16 |     - torch>=1.0.1
17 |     - matplotlib>=2.2.2
18 |     - seaborn>=0.9.0
19 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019 Jannis Born, Matteo Manica, Ali Oskooei, Joris Cadow
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | *.DS_Store
  7 | 
  8 | /biased_models
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | 
 63 | # Flask stuff:
 64 | instance/
 65 | .webassets-cache
 66 | 
 67 | # Scrapy stuff:
 68 | .scrapy
 69 | 
 70 | # Sphinx documentation
 71 | docs/_build/
 72 | 
 73 | # PyBuilder
 74 | target/
 75 | 
 76 | # Jupyter Notebook
 77 | .ipynb_checkpoints
 78 | 
 79 | # pyenv
 80 | .python-version
 81 | 
 82 | # celery beat schedule file
 83 | celerybeat-schedule
 84 | 
 85 | # SageMath parsed files
 86 | *.sage.py
 87 | 
 88 | # Environments
 89 | .env
 90 | .venv
 91 | env/
 92 | venv/
 93 | ENV/
 94 | env.bak/
 95 | venv.bak/
 96 | 
 97 | # Spyder project settings
 98 | .spyderproject
 99 | .spyproject
100 | 
101 | # Rope project settings
102 | .ropeproject
103 | 
104 | # mkdocs documentation
105 | /site
106 | 
107 | # mypy
108 | .mypy_cache/
109 | 
110 | /results
111 | 
112 | # Pipeline steps
113 | 
114 | # data folder
115 | data
116 | 
117 | # source code
118 | code
119 | 
120 | # models
121 | models
122 | biased_models


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: build
 3 | on: [push]
 4 | jobs:
 5 |   conda-tests:
 6 |     name: Test with conda (${{ matrix.os }})
 7 |     runs-on: ${{ matrix.os }}
 8 |     continue-on-error: ${{ matrix.experimental }}
 9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         include:
13 |           - os: ubuntu-18.04
14 |             pip_cache_path: ~/.cache/pip
15 |             experimental: false
16 |           - os: macos-latest
17 |             pip_cache_path: ~/Library/Caches/pip
18 |             experimental: false
19 |     defaults:
20 |       run:
21 |         shell: bash -l {0}  # For conda
22 |     env:
23 |       # Increase this value to reset cache if conda.yml and requirements.txt
24 |       # have not changed
25 |       CACHE_NUMBER: 0
26 |     steps:
27 |       - uses: actions/checkout@v2
28 |       - name: Checkout and setup python
29 |         uses: actions/setup-python@v2
30 |         with:
31 |           python-version: 3.6
32 |           architecture: 'x64'
33 | 
34 |       - name: Cache conda
35 |         uses: actions/cache@v2
36 |         with:
37 |           path: ~/conda_pkgs_dir  # from: conda-incubator/setup-miniconda@v2
38 |           key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
39 |             hashFiles('conda.yml') }}
40 | 
41 |       - name: Cache pip
42 |         uses: actions/cache@v2
43 |         with:
44 |           path: ${{ matrix.pip_cache_path }}
45 |           key: ${{ runner.os }}-pip--${{ env.CACHE_NUMBER }}-${{
46 |             hashFiles('requirements.txt') }}
47 | 
48 |       - name: Conda environment setup
49 |         uses: conda-incubator/setup-miniconda@v2
50 |         with:
51 |           activate-environment: paccmann_rl
52 |           environment-file: conda.yml
53 |           auto-activate-base: false
54 |           use-only-tar-bz2: true  # This needs to be set for proper caching
55 |           auto-update-conda: true  # Required for windows for `use-only-tar-bz2`
56 | 
57 |       - name: Install dependencies
58 |         run: |
59 |           python3 -m pip install --upgrade pip
60 |           pip3 install --no-cache-dir -r requirements.txt
61 |           python3 -c "import pytoda"
62 |           python3 -c "import paccmann_predictor"
63 |           python3 -c "import paccmann_omics"
64 |           python3 -c "import paccmann_chemistry"
65 |           python3 -c "import paccmann_generator"
66 |       - name: Send Slack notification
67 |         uses: 8398a7/action-slack@v2
68 |         if: always()
69 |         with:
70 |           status: ${{ job.status }}
71 |           text: "CI Build ${{ matrix.os }}"
72 |           author_name: ${{ github.actor }}
73 |         env:
74 |           SLACK_WEBHOOK_URL: ${{ secrets.SLACK_HOOK_URL }}
75 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
76 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Build Status](https://github.com/PaccMann/paccmann_rl/actions/workflows/build.yml/badge.svg)](https://github.com/PaccMann/paccmann_rl/actions/workflows/build.yml)
  2 | # paccmann_rl
  3 | 
  4 | Pipeline to reproduce the results of the [PaccMann<sup>RL</sup> paper](https://www.cell.com/iscience/fulltext/S2589-0042(21)00237-6) published in _iScience_.
  5 | 
  6 | ## Description
  7 | 
  8 | In the repo we provide a conda environment and instructions to reproduce the pipeline described in the manuscript:
  9 | 
 10 | 1. Train a multimodal drug sensitivity predictor ([source code](https://github.com/PaccMann/paccmann_predictor))
 11 | 2. Train a generative model for omic profiles, also known as the PVAE ([source code](https://github.com/PaccMann/paccmann_omics))
 12 | 3. Train a generative model for molecules, also known as the SVAE ([source code](https://github.com/PaccMann/paccmann_chemistry))
 13 | 4. Train PaccMann^RL ([source code](https://github.com/PaccMann/paccmann_generator))
 14 | 
 15 | ## Requirements
 16 | 
 17 | - `conda>=3.7`
 18 | - The following data from [here](https://ibm.ent.box.com/v/paccmann-pytoda-data):
 19 |   - The processed split data from the folder `splitted_data`
 20 |   - The processed gene expression data from [GDSC](https://www.cancerrxgene.org/): `data/gene_expression/gdsc-rnaseq_gene-expression.csv`
 21 |   - The processed SMILES from the drugs from [GDSC](https://www.cancerrxgene.org/): `data/smiles/gdsc.smi`
 22 |   - A pickled [SMILESLanguage](https://github.com/PaccMann/paccmann_datasets/blob/master/pytoda/smiles/smiles_language.py) object (`data/smiles_language_chembl_gdsc_ccle.pkl`)
 23 |   - A pickled list of genes representing the panel considered in the paper (`data/2128_genes.pkl`)
 24 |   - A pickled pandas DataFrame containing expression values and metadata for the cell lines considered in the paper (`data/gdsc_transcriptomics_for_conditional_generation.pkl`)
 25 | - The git repos linked in the [previous section](#description)
 26 | 
 27 | **NOTE:** please refer to the [README.md](https://ibm.ent.box.com/v/paccmann-pytoda-data/file/548614344106) and to the manuscript for details on the datasets used and the preprocessing applied.
 28 | 
 29 | ## Setup
 30 | 
 31 | ### Install the environment
 32 | 
 33 | Create a conda environment:
 34 | 
 35 | ```sh
 36 | conda env create -f conda.yml
 37 | ```
 38 | 
 39 | Activate the environment:
 40 | 
 41 | ```sh
 42 | conda activate paccmann_rl
 43 | ```
 44 | 
 45 | ### Download data
 46 | 
 47 | Download the data reported in the [requirements section](#requirements).
 48 | From now on, we will assume that they are stored in the root of the repository in a folder called `data`, following this structure:
 49 | 
 50 | ```console
 51 | data
 52 | ├── 2128_genes.pkl
 53 | ├── gdsc-rnaseq_gene-expression.csv
 54 | ├── gdsc.smi
 55 | ├── gdsc_transcriptomics_for_conditional_generation.pkl
 56 | ├── smiles_language_chembl_gdsc_ccle.pkl
 57 | └── splitted_data
 58 |     ├── gdsc_cell_line_ic50_test_fraction_0.1_id_997_seed_42.csv
 59 |     ├── gdsc_cell_line_ic50_train_fraction_0.9_id_997_seed_42.csv
 60 |     ├── tcga_rnaseq_test_fraction_0.1_id_242870585127480531622270373503581547167_seed_42.csv
 61 |     ├── tcga_rnaseq_train_fraction_0.9_id_242870585127480531622270373503581547167_seed_42.csv
 62 |     ├── test_chembl_22_clean_1576904_sorted_std_final.smi
 63 |     └── train_chembl_22_clean_1576904_sorted_std_final.smi
 64 | 
 65 | 1 directory, 11 files
 66 | ```
 67 | 
 68 | **NOTE:** no worries, the `data` folder is in the [.gitignore](./.gitignore).
 69 | 
 70 | ### Clone the repos
 71 | 
 72 | To get the scripts to run each of the components, create a `code` folder and clone the repos. Simply type this:
 73 | 
 74 | ```sh
 75 | mkdir code && cd code && \
 76 |   git clone --branch 0.0.1 https://github.com/PaccMann/paccmann_predictor && \
 77 |   git clone --branch 0.0.1 https://github.com/PaccMann/paccmann_omics && \
 78 |   git clone --branch 0.0.1 https://github.com/PaccMann/paccmann_chemistry && \
 79 |   git clone --branch 0.0.1 https://github.com/PaccMann/paccmann_generator && \
 80 |   cd ..
 81 | ```
 82 | 
 83 | **NOTE:** no worries, the `code` folder is in the [.gitignore](./.gitignore).
 84 | 
 85 | ## Pipeline
 86 | 
 87 | Now it's all set to run the full pipeline.
 88 | 
 89 | **NOTE:** the workload required to run the full pipeline is intensive, and it might not be straightforward to run all the steps on a desktop or laptop. For this reason, we also provide [pretrained models](https://ibm.ent.box.com/v/paccmann-pytoda-data/folder/91897885403) that can be downloaded and used to run the different steps.
 90 | 
 91 | **NOTE:** in the following, we assume a folder `models` has been created in the root of the repository. No worries, the `models` folder is in the [.gitignore](./.gitignore).
 92 | 
 93 | ### Multimodal drug sensitivity predictor
 94 | 
 95 | ```console
 96 | (paccmann_rl) $ python ./code/paccmann_predictor/examples/train_paccmann.py \
 97 |     ./data/splitted_data/gdsc_cell_line_ic50_train_fraction_0.9_id_997_seed_42.csv \
 98 |     ./data/splitted_data/gdsc_cell_line_ic50_test_fraction_0.1_id_997_seed_42.csv \
 99 |     ./data/gdsc-rnaseq_gene-expression.csv \
100 |     ./data/gdsc.smi \
101 |     ./data/2128_genes.pkl \
102 |     ./data/smiles_language_chembl_gdsc_ccle.pkl \
103 |     ./models/ \
104 |     ./code/paccmann_predictor/examples/example_params.json paccmann
105 | ```
106 | 
107 | ### PVAE
108 | 
109 | ``` console
110 | (paccmann_rl) $ python ./code/paccmann_omics/examples/train_vae.py \
111 |     ./data/splitted_data/tcga_rnaseq_train_fraction_0.9_id_242870585127480531622270373503581547167_seed_42.csv \
112 |     ./data/splitted_data/tcga_rnaseq_test_fraction_0.1_id_242870585127480531622270373503581547167_seed_42.csv \
113 |     ./data/2128_genes.pkl \
114 |     ./models/ \
115 |     ./code/paccmann_omics/examples/example_params.json pvae
116 | ```
117 | 
118 | ### SVAE
119 | 
120 | ``` console
121 | (paccmann_rl) $ python ./code/paccmann_chemistry/examples/train_vae.py \
122 |     ./data/splitted_data/train_chembl_22_clean_1576904_sorted_std_final.smi \
123 |     ./data/splitted_data/test_chembl_22_clean_1576904_sorted_std_final.smi \
124 |     ./data/smiles_language_chembl_gdsc_ccle.pkl \
125 |     ./models/ \
126 |     ./code/paccmann_chemistry/examples/example_params.json svae
127 | ```
128 | 
129 | ### PaccMann^RL
130 | 
131 | ``` console
132 | (paccmann_rl) $ python ./code/paccmann_generator/examples/train_paccmann_rl.py \
133 |     ./models/svae \
134 |     ./models/pvae \
135 |     ./models/paccmann \
136 |     ./data/smiles_language_chembl_gdsc_ccle.pkl \
137 |     ./data/gdsc_transcriptomics_for_conditional_generation.pkl \
138 |     ./code/paccmann_generator/examples/example_params.json \
139 |     paccmann_rl breast
140 | ```
141 | 
142 | **NOTE:** this will create a `biased_models` folder containing the conditional generator and the baseline SMILES generator used. In this case: `breast_paccmann_rl` and `baseline`. No worries, the `biased_models` folder is in the [.gitignore](./.gitignore).
143 | 
144 | ## References
145 | 
146 | If you use `paccmann_rl` in your projects, please cite the following:
147 | 
148 | ```bib
149 | @article{born2021paccmannrl,
150 |   title = {PaccMann\textsuperscript{RL}: De novo generation of hit-like anticancer molecules from transcriptomic data via reinforcement learning},
151 |   journal = {iScience},
152 |   volume = {24},
153 |   number = {4},
154 |   pages = {102269},
155 |   year = {2021},
156 |   issn = {2589-0042},
157 |   doi = {10.1016/j.isci.2021.102269},
158 |   url = {https://www.cell.com/iscience/fulltext/S2589-0042(21)00237-6},
159 |   author = {Born, Jannis and Manica, Matteo and Oskooei, Ali and Cadow, Joris and Markert, Greta and {Rodr{\'{i}}guez Mart{\'{i}}nez}, Mar{\'{i}}a}
160 | }
161 | ```
162 | 


--------------------------------------------------------------------------------