├── .gitignore ├── LICENSE ├── Makefile ├── Procfile ├── README.md ├── azure-pipelines.yml ├── data ├── hiv-protease-consensus.txt ├── hiv-protease-data-expanded.csv ├── models │ ├── ATV.pkl.gz │ ├── DRV.pkl.gz │ ├── FPV.pkl.gz │ ├── IDV.pkl.gz │ ├── LPV.pkl.gz │ ├── NFV.pkl.gz │ ├── SQV.pkl.gz │ └── TPV.pkl.gz └── scores.pkl.gz ├── environment.yml ├── hiv-resistance.ipynb ├── home.ipynb ├── index.html ├── iris.ipynb ├── model-training.ipynb ├── requirements.txt ├── test_utils.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | dask-worker-space/* 3 | .vscode/* 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Eric Ma 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | jupyter nbconvert --execute minimal-panel.ipynb 3 | rm minimal-panel.html 4 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: panel serve --address="0.0.0.0" --port=$PORT hiv-resistance.ipynb home.ipynb iris.ipynb --allow-websocket-origin=minimal-panel-app.herokuapp.com --index=`pwd`/index.html 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minimal-panel-app 2 | 3 | A pedagogical implementation of panel apps served up on a remote machine. 4 | 5 | See the full app [here](http://minimal-panel-app.herokuapp.com/home). 6 | 7 | ## why this project exists 8 | 9 | I spent a day figuring out how to make this happen at work, 10 | and decided to spend an evening consolidating my knowledge. 11 | 12 | ## "how to use" 13 | 14 | ``` 15 | git clone https://github.com/ericmjl/minimal-panel-app 16 | ``` 17 | 18 | ## anything else interesting? 
19 | 20 | ### iPad development 21 | 22 | The first version of the app was coded up entirely on an iPad, 23 | using a combination of [blink](http://blink.sh), 24 | [Juno](http://juno.sh), 25 | and `nano` on my home remote server 26 | (which is nothing more than a converted gaming tower). 27 | 28 | Web app development in Python is now doable 29 | and we can use modern tablets as a thin client! 30 | 31 | ### memory usage 32 | 33 | Deploying the HIV drug resistance model to Heroku was challenging 34 | because I had to watch out for memory and storage usage. 35 | There are 8 models to make predictions on, 36 | and loading all of them together causes memory overload 37 | on Heroku's free tier. 38 | 39 | I got around this by pickling the models individually, 40 | and only loading them when needed. 41 | I also minimized disk usage by using gzip 42 | when pickling the files. 43 | 44 | ### multi-app hosting 45 | 46 | There are multiple "apps" that are being hosted by a single Panel server here. 47 | Each "app" is basically one Jupyter notebook. 48 | In each notebook, I define a self-contained, hostable unit 49 | that an end-user can interact with. 50 | One of them is the homepage, 51 | written using Panel's tooling just to prove the point, 52 | but the others are actual user-facing interfaces 53 | that provide a way to interact with either data or a machine learning model. 
54 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | pr: 2 | - master 3 | 4 | jobs: 5 | - job: linux 6 | variables: 7 | activate.command: "source activate" 8 | strategy: 9 | matrix: 10 | py37: 11 | python.version: "3.7" 12 | 13 | pool: 14 | vmImage: ubuntu-16.04 15 | 16 | steps: 17 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 18 | displayName: Add conda to PATH 19 | - script: | 20 | conda env create -f environment.yml 21 | displayName: Create environment 22 | - script: | 23 | source activate minimal-panel 24 | python -m ipykernel install --user --name minimal-panel 25 | make test 26 | displayName: Run tests 27 | -------------------------------------------------------------------------------- /data/hiv-protease-consensus.txt: -------------------------------------------------------------------------------- 1 | >protease 2 | PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF 3 | -------------------------------------------------------------------------------- /data/models/ATV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/ATV.pkl.gz -------------------------------------------------------------------------------- /data/models/DRV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/DRV.pkl.gz -------------------------------------------------------------------------------- /data/models/FPV.pkl.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/FPV.pkl.gz -------------------------------------------------------------------------------- /data/models/IDV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/IDV.pkl.gz -------------------------------------------------------------------------------- /data/models/LPV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/LPV.pkl.gz -------------------------------------------------------------------------------- /data/models/NFV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/NFV.pkl.gz -------------------------------------------------------------------------------- /data/models/SQV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/SQV.pkl.gz -------------------------------------------------------------------------------- /data/models/TPV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/TPV.pkl.gz -------------------------------------------------------------------------------- /data/scores.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/scores.pkl.gz 
-------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: minimal-panel 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - biopython=1.74 8 | - bokeh=1.2.0 9 | - dask=2.6.0 10 | - holoviews=1.12.3 11 | - hvplot 12 | - ipython 13 | - jupyter 14 | - jupyterlab 15 | - pandas=0.24.0 16 | - panel=0.6.0 17 | - pyjanitor=0.18.0 18 | - scikit-learn=0.21.2 19 | - pytest 20 | - hypothesis 21 | - black 22 | - pylint 23 | - pycodestyle 24 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /model-training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction\n", 8 | "\n", 9 | "This notebook gives you a short introduction on how to use Dask to parallelize model training, particularly if you have multiple learning tasks on which you want to train individual models for.\n", 10 | "\n", 11 | "For brevity, I will not be elaborating on the exact machine learning task here, but focus on the idioms that we need to use Dask for this task." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%load_ext autoreload\n", 21 | "%autoreload 2\n", 22 | "%matplotlib inline\n", 23 | "%config InlineBackend.figure_format = 'retina'\n", 24 | "\n", 25 | "from dask.distributed import LocalCluster, Client\n", 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import janitor" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Instantiate a Dask Cluster\n", 36 | "\n", 37 | "Here, we instantiate a Dask `cluster` (this is only a `LocalCluster`, but other cluster types can be created too, such as an `SGECluster` or `KubeCluster`. We then connect a `client` to the cluster." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "/home/ericmjl/anaconda/envs/minimal-panel/lib/python3.7/site-packages/distributed/dashboard/core.py:72: UserWarning: \n", 50 | "Port 8787 is already in use. \n", 51 | "Perhaps you already have a cluster running?\n", 52 | "Hosting the diagnostics dashboard on a random port instead.\n", 53 | " warnings.warn(\"\\n\" + msg)\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "client = Client()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Data Preprocessing\n", 66 | "\n", 67 | "We will now preprocess our data and get it into a shape for machine learning." 
68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from utils import molecular_weights, featurize_sequence_" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "drugs = ['ATV', 'DRV', 'FPV', 'IDV', 'LPV', 'NFV', 'SQV', 'TPV']\n", 86 | "\n", 87 | "data = (\n", 88 | " pd.read_csv(\"data/hiv-protease-data-expanded.csv\", index_col=0)\n", 89 | " .query(\"weight == 1.0\")\n", 90 | " .transform_column(\"sequence\", lambda x: len(x), \"seq_length\")\n", 91 | " .query(\"seq_length == 99\")\n", 92 | " .transform_column(\"sequence\", featurize_sequence_, \"features\")\n", 93 | " .transform_columns(drugs, np.log10)\n", 94 | ")\n", 95 | "\n", 96 | "features = pd.DataFrame(np.vstack(data['features'])).set_index(data.index)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/html": [ 107 | "
\n", 108 | "\n", 121 | "\n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | "
ATVDRVFPVIDVLPVNFVSQVSeqIDTPVseqidsequencesequence_objectweightseq_lengthfeatures
61.50515NaN0.4771211.5440681.505151.4623982.2148444426NaN4426-0PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKM...ID: 4426-0\\nName: <unknown name>\\nDescription:...1.099[[115.131, 146.1451, 131.1736, 119.1197, 131.1...
7NaNNaN0.1760910.000000NaN0.3424230.0413934432NaN4432-0PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...ID: 4432-0\\nName: <unknown name>\\nDescription:...1.099[[115.131, 146.1451, 131.1736, 119.1197, 131.1...
14NaNNaN0.4913620.939519NaN1.5051501.2278874664NaN4664-0PQITLWQRPIVTIKVGGQLIEALLDTGADDTVLEEINLPGRWKPKM...ID: 4664-0\\nName: <unknown name>\\nDescription:...1.099[[115.131, 146.1451, 131.1736, 119.1197, 131.1...
\n", 199 | "
" 200 | ], 201 | "text/plain": [ 202 | " ATV DRV FPV IDV LPV NFV SQV SeqID TPV \\\n", 203 | "6 1.50515 NaN 0.477121 1.544068 1.50515 1.462398 2.214844 4426 NaN \n", 204 | "7 NaN NaN 0.176091 0.000000 NaN 0.342423 0.041393 4432 NaN \n", 205 | "14 NaN NaN 0.491362 0.939519 NaN 1.505150 1.227887 4664 NaN \n", 206 | "\n", 207 | " seqid sequence \\\n", 208 | "6 4426-0 PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKM... \n", 209 | "7 4432-0 PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM... \n", 210 | "14 4664-0 PQITLWQRPIVTIKVGGQLIEALLDTGADDTVLEEINLPGRWKPKM... \n", 211 | "\n", 212 | " sequence_object weight seq_length \\\n", 213 | "6 ID: 4426-0\\nName: \\nDescription:... 1.0 99 \n", 214 | "7 ID: 4432-0\\nName: \\nDescription:... 1.0 99 \n", 215 | "14 ID: 4664-0\\nName: \\nDescription:... 1.0 99 \n", 216 | "\n", 217 | " features \n", 218 | "6 [[115.131, 146.1451, 131.1736, 119.1197, 131.1... \n", 219 | "7 [[115.131, 146.1451, 131.1736, 119.1197, 131.1... \n", 220 | "14 [[115.131, 146.1451, 131.1736, 119.1197, 131.1... " 221 | ] 222 | }, 223 | "execution_count": 7, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "data.head(3)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 8, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/html": [ 240 | "
\n", 241 | "\n", 254 | "\n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | "
0123456789...89909192939495969798
6115.131146.1451131.1736119.1197131.1736204.2262146.1451174.2017115.131131.1736...131.1736119.1197146.1451131.173675.0669121.159119.1197131.1736132.1184165.19
7115.131146.1451131.1736119.1197131.1736204.2262146.1451174.2017115.131131.1736...131.1736119.1197146.1451131.173675.0669121.159119.1197131.1736132.1184165.19
14115.131146.1451131.1736119.1197131.1736204.2262146.1451174.2017115.131131.1736...149.2124119.1197146.1451131.173675.0669121.159119.1197131.1736132.1184165.19
\n", 356 | "

3 rows × 99 columns

\n", 357 | "
" 358 | ], 359 | "text/plain": [ 360 | " 0 1 2 3 4 5 6 \\\n", 361 | "6 115.131 146.1451 131.1736 119.1197 131.1736 204.2262 146.1451 \n", 362 | "7 115.131 146.1451 131.1736 119.1197 131.1736 204.2262 146.1451 \n", 363 | "14 115.131 146.1451 131.1736 119.1197 131.1736 204.2262 146.1451 \n", 364 | "\n", 365 | " 7 8 9 ... 89 90 91 92 \\\n", 366 | "6 174.2017 115.131 131.1736 ... 131.1736 119.1197 146.1451 131.1736 \n", 367 | "7 174.2017 115.131 131.1736 ... 131.1736 119.1197 146.1451 131.1736 \n", 368 | "14 174.2017 115.131 131.1736 ... 149.2124 119.1197 146.1451 131.1736 \n", 369 | "\n", 370 | " 93 94 95 96 97 98 \n", 371 | "6 75.0669 121.159 119.1197 131.1736 132.1184 165.19 \n", 372 | "7 75.0669 121.159 119.1197 131.1736 132.1184 165.19 \n", 373 | "14 75.0669 121.159 119.1197 131.1736 132.1184 165.19 \n", 374 | "\n", 375 | "[3 rows x 99 columns]" 376 | ] 377 | }, 378 | "execution_count": 8, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "features.head(3)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "## Define training functions\n", 392 | "\n", 393 | "When writing code to interface with Dask, a functional paradigm is often preferred. Hence, we will write the procedures that are needed inside functions that can be submitted by the `client` to the `cluster`." 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 9, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "from utils import featurize_sequence_, fit_model, cross_validate, predict" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "Now, we'll scatter the data around the workers. `dataf` is named as such because this is the \"data futures\", a \"promise\" to the workers that `data` will exist for them and that they can access it. Likewise for `featuresf`." 
410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 10, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "dataf = client.scatter(data)\n", 419 | "featuresf = client.scatter(features)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "Now, we fit the models, and collect their cross-validated scores." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 11, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "models = dict()\n", 436 | "scores = dict()\n", 437 | "\n", 438 | "\n", 439 | "for drug in drugs:\n", 440 | " models[drug] = client.submit(fit_model, dataf, featuresf, drug)\n", 441 | " scores[drug] = client.submit(cross_validate, dataf, featuresf, drug)\n", 442 | " \n", 443 | "models = client.gather(models)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "Finally, let's save the models. To save space on disk, we will pickle and gzip them." 
451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 12, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "import pickle as pkl\n", 460 | "import gzip\n", 461 | "\n", 462 | "for name, model in models.items():\n", 463 | " with gzip.open(f\"data/models/{name}.pkl.gz\", 'wb') as f:\n", 464 | " pkl.dump(model, f)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 13, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "scores = client.gather(scores)\n", 474 | "with gzip.open(\"data/scores.pkl.gz\", \"wb\") as f:\n", 475 | " pkl.dump(scores, f)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [] 484 | } 485 | ], 486 | "metadata": { 487 | "kernelspec": { 488 | "display_name": "minimal-panel", 489 | "language": "python", 490 | "name": "minimal-panel" 491 | }, 492 | "language_info": { 493 | "codemirror_mode": { 494 | "name": "ipython", 495 | "version": 3 496 | }, 497 | "file_extension": ".py", 498 | "mimetype": "text/x-python", 499 | "name": "python", 500 | "nbconvert_exporter": "python", 501 | "pygments_lexer": "ipython3", 502 | "version": "3.7.3" 503 | } 504 | }, 505 | "nbformat": 4, 506 | "nbformat_minor": 4 507 | } 508 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # For deployment environment. 
2 | bokeh==1.2.0 3 | dask==2.6.0 4 | holoviews==1.12.3 5 | hvplot 6 | ipython 7 | jupyter 8 | pandas==0.24.0 9 | panel==0.6.0 10 | pip 11 | pyjanitor==0.18.0 12 | scikit-learn==0.21.2 -------------------------------------------------------------------------------- /test_utils.py: -------------------------------------------------------------------------------- 1 | from utils import featurize_sequence_, molecular_weights, predict 2 | import pytest 3 | from hypothesis import given, strategies as st 4 | import joblib 5 | import gzip 6 | import pickle as pkl 7 | 8 | @given( 9 | st.text(alphabet=list(molecular_weights.keys()), min_size=0, max_size=200) 10 | ) 11 | def test_featurize_sequence_(sequence): 12 | feats = featurize_sequence_(sequence) 13 | assert feats.shape[-1] == len(sequence) 14 | 15 | 16 | @given(sequence=st.text(alphabet=list(molecular_weights.keys()), min_size=99, max_size=99)) 17 | def test_predict(sequence): 18 | """ 19 | Baseline test that under ideal situations, 20 | i.e. len(sequence) == 99 and 21 | set(sequence).issubset(molecular_weights.keys()), 22 | that the function executes correctly. 23 | """ 24 | with gzip.open("data/models/ATV.pkl.gz", "rb") as f: 25 | model = pkl.load(f) 26 | 27 | preds = predict(model, sequence) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dumping ground for functions that I don't want polluting the notebook. 3 | 4 | Can be refactored at a later date, when it gets a bit unwieldy. 
5 | """ 6 | 7 | import numpy as np 8 | 9 | molecular_weights = { 10 | "A": 89.0935, 11 | "R": 174.2017, 12 | "N": 132.1184, 13 | "D": 133.1032, 14 | "C": 121.1590, 15 | "E": 147.1299, 16 | "Q": 146.1451, 17 | "G": 75.0669, 18 | "H": 155.1552, 19 | "I": 131.1736, 20 | "L": 131.1736, 21 | "K": 146.1882, 22 | "M": 149.2124, 23 | "F": 165.1900, 24 | "P": 115.1310, 25 | "S": 105.0930, 26 | "T": 119.1197, 27 | "W": 204.2262, 28 | "Y": 181.1894, 29 | "V": 117.1469, 30 | "X": 100.00, 31 | } 32 | 33 | 34 | def featurize_sequence_(x, expected_size=99): 35 | """ 36 | :param x: a string in a pandas DataFrame cell 37 | """ 38 | feats = np.zeros(len(x)) 39 | for i, letter in enumerate(x): 40 | feats[i] = molecular_weights[letter] 41 | return feats.reshape(1, -1) 42 | 43 | 44 | from sklearn.ensemble import RandomForestRegressor 45 | from sklearn.model_selection import cross_val_score 46 | 47 | 48 | def fit_model(data, features, target): 49 | import janitor 50 | model = RandomForestRegressor(n_estimators=300) 51 | 52 | resistance_data = features.join(data[target]).dropna() 53 | X, y = resistance_data.get_features_targets(target_column_names=target) 54 | 55 | model.fit(X, y) 56 | return model 57 | 58 | 59 | def cross_validate(data, features, target): 60 | import janitor 61 | model = RandomForestRegressor(n_estimators=500) 62 | 63 | resistance_data = features.join(data[target]).dropna() 64 | X, y = resistance_data.get_features_targets(target_column_names=target) 65 | 66 | return -cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5) 67 | 68 | 69 | class SequenceError(Exception): 70 | pass 71 | 72 | def predict(model, sequence): 73 | """ 74 | :param model: sklearn model 75 | :param sequence: A string, should be 99 characters long. 76 | """ 77 | if len(sequence) != 99: 78 | raise ValueError(f"sequence must be of length 99. 
Your sequence is of length {len(sequence)}") 79 | 80 | if not set(sequence).issubset(set(molecular_weights.keys())): 81 | invalid_chars = set(sequence).difference(molecular_weights.keys()) 82 | raise SequenceError(f"sequence contains invalid characters: {invalid_chars}") 83 | 84 | seqfeat = featurize_sequence_(sequence) 85 | return model.predict(seqfeat) 86 | 87 | 88 | # App Navigation 89 | 90 | import panel as pn 91 | 92 | navbar = """ 93 | [Home](/home) | 94 | [HIV Resistance](/hiv-resistance) | 95 | [Iris](/iris) 96 | """ 97 | 98 | navpane = pn.pane.Markdown(navbar) 99 | --------------------------------------------------------------------------------