├── .gitignore ├── LICENSE ├── Makefile ├── Procfile ├── README.md ├── azure-pipelines.yml ├── data ├── hiv-protease-consensus.txt ├── hiv-protease-data-expanded.csv ├── models │ ├── ATV.pkl.gz │ ├── DRV.pkl.gz │ ├── FPV.pkl.gz │ ├── IDV.pkl.gz │ ├── LPV.pkl.gz │ ├── NFV.pkl.gz │ ├── SQV.pkl.gz │ └── TPV.pkl.gz └── scores.pkl.gz ├── environment.yml ├── hiv-resistance.ipynb ├── home.ipynb ├── index.html ├── iris.ipynb ├── model-training.ipynb ├── requirements.txt ├── test_utils.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | dask-worker-space/* 3 | .vscode/* 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Eric Ma 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | jupyter nbconvert --execute minimal-panel.ipynb 3 | rm minimal-panel.html 4 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: panel serve --address="0.0.0.0" --port=$PORT hiv-resistance.ipynb home.ipynb iris.ipynb --allow-websocket-origin=minimal-panel-app.herokuapp.com --index=`pwd`/index.html 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minimal-panel-app 2 | 3 | A pedagogical implementation of panel apps served up on a remote machine. 4 | 5 | See the full app [here](http://minimal-panel-app.herokuapp.com/home). 6 | 7 | ## why this project exists 8 | 9 | I spent a day figuring out how to make this happen at work, 10 | and decided to spend an evening consolidating my knowledge. 11 | 12 | ## "how to use" 13 | 14 | ``` 15 | git clone https://github.com/ericmjl/minimal-panel-app 16 | ``` 17 | 18 | ## anything else interesting? 
19 | 20 | ### iPad development 21 | 22 | The first version of the app was coded up entirely on an iPad, 23 | using a combination of [blink](http://blink.sh), 24 | [Juno](http://juno.sh), 25 | and `nano` on my home remote server 26 | (which is nothing more than a converted gaming tower). 27 | 28 | Web app development in Python is now doable 29 | and we can use modern tablets as a thin client! 30 | 31 | ### memory usage 32 | 33 | Deploying the HIV drug resistance model to Heroku was challenging 34 | because I had to watch out for memory and storage usage. 35 | There are 8 models to make predictions on, 36 | and loading all of them together causes memory overload 37 | on Heroku's free tier. 38 | 39 | I got around this by pickling the models individually, 40 | and only loading them when needed. 41 | I also minimized disk usage by using gzip 42 | when pickling the files. 43 | 44 | ### multi-app hosting 45 | 46 | There are multiple "apps" that are being hosted by a single Panel server here. 47 | Each "app" is basically one Jupyter notebook. 48 | In each notebook, I define a self-contained, hostable unit 49 | that an end-user can interact with. 50 | One of them is the homepage, 51 | written using Panel's tooling just to prove the point, 52 | but the others are actual user-facing interfaces 53 | that provide a way to interact with either data or a machine learning model. 
54 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | pr: 2 | - master 3 | 4 | jobs: 5 | - job: linux 6 | variables: 7 | activate.command: "source activate" 8 | strategy: 9 | matrix: 10 | py37: 11 | python.version: "3.7" 12 | 13 | pool: 14 | vmImage: ubuntu-16.04 15 | 16 | steps: 17 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 18 | displayName: Add conda to PATH 19 | - script: | 20 | conda env create -f environment.yml 21 | displayName: Create environment 22 | - script: | 23 | source activate minimal-panel 24 | python -m ipykernel install --user --name minimal-panel 25 | make test 26 | displayName: Run tests 27 | -------------------------------------------------------------------------------- /data/hiv-protease-consensus.txt: -------------------------------------------------------------------------------- 1 | >protease 2 | PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF 3 | -------------------------------------------------------------------------------- /data/models/ATV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/ATV.pkl.gz -------------------------------------------------------------------------------- /data/models/DRV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/DRV.pkl.gz -------------------------------------------------------------------------------- /data/models/FPV.pkl.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/FPV.pkl.gz -------------------------------------------------------------------------------- /data/models/IDV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/IDV.pkl.gz -------------------------------------------------------------------------------- /data/models/LPV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/LPV.pkl.gz -------------------------------------------------------------------------------- /data/models/NFV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/NFV.pkl.gz -------------------------------------------------------------------------------- /data/models/SQV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/SQV.pkl.gz -------------------------------------------------------------------------------- /data/models/TPV.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/models/TPV.pkl.gz -------------------------------------------------------------------------------- /data/scores.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericmjl/minimal-panel-app/40b95008fec7c95241b0296d8bd57cdcb1b1eb17/data/scores.pkl.gz 
-------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: minimal-panel 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - biopython=1.74 8 | - bokeh=1.2.0 9 | - dask=2.6.0 10 | - holoviews=1.12.3 11 | - hvplot 12 | - ipython 13 | - jupyter 14 | - jupyterlab 15 | - pandas=0.24.0 16 | - panel=0.6.0 17 | - pyjanitor=0.18.0 18 | - scikit-learn=0.21.2 19 | - pytest 20 | - hypothesis 21 | - black 22 | - pylint 23 | - pycodestyle 24 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /model-training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction\n", 8 | "\n", 9 | "This notebook gives you a short introduction on how to use Dask to parallelize model training, particularly if you have multiple learning tasks on which you want to train individual models for.\n", 10 | "\n", 11 | "For brevity, I will not be elaborating on the exact machine learning task here, but focus on the idioms that we need to use Dask for this task." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%load_ext autoreload\n", 21 | "%autoreload 2\n", 22 | "%matplotlib inline\n", 23 | "%config InlineBackend.figure_format = 'retina'\n", 24 | "\n", 25 | "from dask.distributed import LocalCluster, Client\n", 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import janitor" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Instantiate a Dask Cluster\n", 36 | "\n", 37 | "Here, we instantiate a Dask `cluster` (this is only a `LocalCluster`, but other cluster types can be created too, such as an `SGECluster` or `KubeCluster`. We then connect a `client` to the cluster." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "/home/ericmjl/anaconda/envs/minimal-panel/lib/python3.7/site-packages/distributed/dashboard/core.py:72: UserWarning: \n", 50 | "Port 8787 is already in use. \n", 51 | "Perhaps you already have a cluster running?\n", 52 | "Hosting the diagnostics dashboard on a random port instead.\n", 53 | " warnings.warn(\"\\n\" + msg)\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "client = Client()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Data Preprocessing\n", 66 | "\n", 67 | "We will now preprocess our data and get it into a shape for machine learning." 
68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from utils import molecular_weights, featurize_sequence_" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "drugs = ['ATV', 'DRV', 'FPV', 'IDV', 'LPV', 'NFV', 'SQV', 'TPV']\n", 86 | "\n", 87 | "data = (\n", 88 | " pd.read_csv(\"data/hiv-protease-data-expanded.csv\", index_col=0)\n", 89 | " .query(\"weight == 1.0\")\n", 90 | " .transform_column(\"sequence\", lambda x: len(x), \"seq_length\")\n", 91 | " .query(\"seq_length == 99\")\n", 92 | " .transform_column(\"sequence\", featurize_sequence_, \"features\")\n", 93 | " .transform_columns(drugs, np.log10)\n", 94 | ")\n", 95 | "\n", 96 | "features = pd.DataFrame(np.vstack(data['features'])).set_index(data.index)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/html": [ 107 | "
\n", 108 | "\n", 121 | "\n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | "
ATVDRVFPVIDVLPVNFVSQVSeqIDTPVseqidsequencesequence_objectweightseq_lengthfeatures
61.50515NaN0.4771211.5440681.505151.4623982.2148444426NaN4426-0PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKM...ID: 4426-0\\nName: <unknown name>\\nDescription:...1.099[[115.131, 146.1451, 131.1736, 119.1197, 131.1...
7NaNNaN0.1760910.000000NaN0.3424230.0413934432NaN4432-0PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...ID: 4432-0\\nName: <unknown name>\\nDescription:...1.099[[115.131, 146.1451, 131.1736, 119.1197, 131.1...
14NaNNaN0.4913620.939519NaN1.5051501.2278874664NaN4664-0PQITLWQRPIVTIKVGGQLIEALLDTGADDTVLEEINLPGRWKPKM...ID: 4664-0\\nName: <unknown name>\\nDescription:...1.099[[115.131, 146.1451, 131.1736, 119.1197, 131.1...
\n", 199 | "
" 200 | ], 201 | "text/plain": [ 202 | " ATV DRV FPV IDV LPV NFV SQV SeqID TPV \\\n", 203 | "6 1.50515 NaN 0.477121 1.544068 1.50515 1.462398 2.214844 4426 NaN \n", 204 | "7 NaN NaN 0.176091 0.000000 NaN 0.342423 0.041393 4432 NaN \n", 205 | "14 NaN NaN 0.491362 0.939519 NaN 1.505150 1.227887 4664 NaN \n", 206 | "\n", 207 | " seqid sequence \\\n", 208 | "6 4426-0 PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKM... \n", 209 | "7 4432-0 PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM... \n", 210 | "14 4664-0 PQITLWQRPIVTIKVGGQLIEALLDTGADDTVLEEINLPGRWKPKM... \n", 211 | "\n", 212 | " sequence_object weight seq_length \\\n", 213 | "6 ID: 4426-0\\nName: \\nDescription:... 1.0 99 \n", 214 | "7 ID: 4432-0\\nName: \\nDescription:... 1.0 99 \n", 215 | "14 ID: 4664-0\\nName: \\nDescription:... 1.0 99 \n", 216 | "\n", 217 | " features \n", 218 | "6 [[115.131, 146.1451, 131.1736, 119.1197, 131.1... \n", 219 | "7 [[115.131, 146.1451, 131.1736, 119.1197, 131.1... \n", 220 | "14 [[115.131, 146.1451, 131.1736, 119.1197, 131.1... " 221 | ] 222 | }, 223 | "execution_count": 7, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "data.head(3)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 8, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/html": [ 240 | "
\n", 241 | "\n", 254 | "\n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | "
0123456789...89909192939495969798
6115.131146.1451131.1736119.1197131.1736204.2262146.1451174.2017115.131131.1736...131.1736119.1197146.1451131.173675.0669121.159119.1197131.1736132.1184165.19
7115.131146.1451131.1736119.1197131.1736204.2262146.1451174.2017115.131131.1736...131.1736119.1197146.1451131.173675.0669121.159119.1197131.1736132.1184165.19
14115.131146.1451131.1736119.1197131.1736204.2262146.1451174.2017115.131131.1736...149.2124119.1197146.1451131.173675.0669121.159119.1197131.1736132.1184165.19
\n", 356 | "

3 rows × 99 columns

\n", 357 | "
" 358 | ], 359 | "text/plain": [ 360 | " 0 1 2 3 4 5 6 \\\n", 361 | "6 115.131 146.1451 131.1736 119.1197 131.1736 204.2262 146.1451 \n", 362 | "7 115.131 146.1451 131.1736 119.1197 131.1736 204.2262 146.1451 \n", 363 | "14 115.131 146.1451 131.1736 119.1197 131.1736 204.2262 146.1451 \n", 364 | "\n", 365 | " 7 8 9 ... 89 90 91 92 \\\n", 366 | "6 174.2017 115.131 131.1736 ... 131.1736 119.1197 146.1451 131.1736 \n", 367 | "7 174.2017 115.131 131.1736 ... 131.1736 119.1197 146.1451 131.1736 \n", 368 | "14 174.2017 115.131 131.1736 ... 149.2124 119.1197 146.1451 131.1736 \n", 369 | "\n", 370 | " 93 94 95 96 97 98 \n", 371 | "6 75.0669 121.159 119.1197 131.1736 132.1184 165.19 \n", 372 | "7 75.0669 121.159 119.1197 131.1736 132.1184 165.19 \n", 373 | "14 75.0669 121.159 119.1197 131.1736 132.1184 165.19 \n", 374 | "\n", 375 | "[3 rows x 99 columns]" 376 | ] 377 | }, 378 | "execution_count": 8, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "features.head(3)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "## Define training functions\n", 392 | "\n", 393 | "When writing code to interface with Dask, a functional paradigm is often preferred. Hence, we will write the procedures that are needed inside functions that can be submitted by the `client` to the `cluster`." 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 9, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "from utils import featurize_sequence_, fit_model, cross_validate, predict" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "Now, we'll scatter the data around the workers. `dataf` is named as such because this is the \"data futures\", a \"promise\" to the workers that `data` will exist for them and that they can access it. Likewise for `featuresf`." 
410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 10, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "dataf = client.scatter(data)\n", 419 | "featuresf = client.scatter(features)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "Now, we fit the models, and collect their cross-validated scores." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 11, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "models = dict()\n", 436 | "scores = dict()\n", 437 | "\n", 438 | "\n", 439 | "for drug in drugs:\n", 440 | " models[drug] = client.submit(fit_model, dataf, featuresf, drug)\n", 441 | " scores[drug] = client.submit(cross_validate, dataf, featuresf, drug)\n", 442 | " \n", 443 | "models = client.gather(models)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "Finally, let's save the models. To save space on disk, we will pickle and gzip them." 
451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 12, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "import pickle as pkl\n", 460 | "import gzip\n", 461 | "\n", 462 | "for name, model in models.items():\n", 463 | " with gzip.open(f\"data/models/{name}.pkl.gz\", 'wb') as f:\n", 464 | " pkl.dump(model, f)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 13, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "scores = client.gather(scores)\n", 474 | "with gzip.open(\"data/scores.pkl.gz\", \"wb\") as f:\n", 475 | " pkl.dump(scores, f)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [] 484 | } 485 | ], 486 | "metadata": { 487 | "kernelspec": { 488 | "display_name": "minimal-panel", 489 | "language": "python", 490 | "name": "minimal-panel" 491 | }, 492 | "language_info": { 493 | "codemirror_mode": { 494 | "name": "ipython", 495 | "version": 3 496 | }, 497 | "file_extension": ".py", 498 | "mimetype": "text/x-python", 499 | "name": "python", 500 | "nbconvert_exporter": "python", 501 | "pygments_lexer": "ipython3", 502 | "version": "3.7.3" 503 | } 504 | }, 505 | "nbformat": 4, 506 | "nbformat_minor": 4 507 | } 508 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # For deployment environment. 
2 | bokeh==1.2.0 3 | dask==2.6.0 4 | holoviews==1.12.3 5 | hvplot 6 | ipython 7 | jupyter 8 | pandas==0.24.0 9 | panel==0.6.0 10 | pip 11 | pyjanitor==0.18.0 12 | scikit-learn==0.21.2 -------------------------------------------------------------------------------- /test_utils.py: -------------------------------------------------------------------------------- 1 | from utils import featurize_sequence_, molecular_weights, predict 2 | import pytest 3 | from hypothesis import given, strategies as st 4 | import joblib 5 | import gzip 6 | import pickle as pkl 7 | 8 | @given( 9 | st.text(alphabet=list(molecular_weights.keys()), min_size=0, max_size=200) 10 | ) 11 | def test_featurize_sequence_(sequence): 12 | feats = featurize_sequence_(sequence) 13 | assert feats.shape[-1] == len(sequence) 14 | 15 | 16 | @given(sequence=st.text(alphabet=list(molecular_weights.keys()), min_size=99, max_size=99)) 17 | def test_predict(sequence): 18 | """ 19 | Baseline test that under ideal situations, 20 | i.e. len(sequence) == 99 and 21 | set(sequence).issubset(molecular_weights.keys()), 22 | that the function executes correctly. 23 | """ 24 | with gzip.open("data/models/ATV.pkl.gz", "rb") as f: 25 | model = pkl.load(f) 26 | 27 | preds = predict(model, sequence) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dumping ground for functions that I don't want polluting the notebook. 3 | 4 | Can be refactored at a later date, when it gets a bit unwieldy. 
5 | """ 6 | 7 | import numpy as np 8 | 9 | molecular_weights = { 10 | "A": 89.0935, 11 | "R": 174.2017, 12 | "N": 132.1184, 13 | "D": 133.1032, 14 | "C": 121.1590, 15 | "E": 147.1299, 16 | "Q": 146.1451, 17 | "G": 75.0669, 18 | "H": 155.1552, 19 | "I": 131.1736, 20 | "L": 131.1736, 21 | "K": 146.1882, 22 | "M": 149.2124, 23 | "F": 165.1900, 24 | "P": 115.1310, 25 | "S": 105.0930, 26 | "T": 119.1197, 27 | "W": 204.2262, 28 | "Y": 181.1894, 29 | "V": 117.1469, 30 | "X": 100.00, 31 | } 32 | 33 | 34 | def featurize_sequence_(x, expected_size=99): 35 | """ 36 | :param x: a string in a pandas DataFrame cell 37 | """ 38 | feats = np.zeros(len(x)) 39 | for i, letter in enumerate(x): 40 | feats[i] = molecular_weights[letter] 41 | return feats.reshape(1, -1) 42 | 43 | 44 | from sklearn.ensemble import RandomForestRegressor 45 | from sklearn.model_selection import cross_val_score 46 | 47 | 48 | def fit_model(data, features, target): 49 | import janitor 50 | model = RandomForestRegressor(n_estimators=300) 51 | 52 | resistance_data = features.join(data[target]).dropna() 53 | X, y = resistance_data.get_features_targets(target_column_names=target) 54 | 55 | model.fit(X, y) 56 | return model 57 | 58 | 59 | def cross_validate(data, features, target): 60 | import janitor 61 | model = RandomForestRegressor(n_estimators=500) 62 | 63 | resistance_data = features.join(data[target]).dropna() 64 | X, y = resistance_data.get_features_targets(target_column_names=target) 65 | 66 | return -cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5) 67 | 68 | 69 | class SequenceError(Exception): 70 | pass 71 | 72 | def predict(model, sequence): 73 | """ 74 | :param model: sklearn model 75 | :param sequence: A string, should be 99 characters long. 76 | """ 77 | if len(sequence) != 99: 78 | raise ValueError(f"sequence must be of length 99. 
Your sequence is of length {len(sequence)}") 79 | 80 | if not set(sequence).issubset(set(molecular_weights.keys())): 81 | invalid_chars = set(sequence).difference(molecular_weights.keys()) 82 | raise SequenceError(f"sequence contains invalid characters: {invalid_chars}") 83 | 84 | seqfeat = featurize_sequence_(sequence) 85 | return model.predict(seqfeat) 86 | 87 | 88 | # App Navigation 89 | 90 | import panel as pn 91 | 92 | navbar = """ 93 | [Home](/home) | 94 | [HIV Resistance](/hiv-resistance) | 95 | [Iris](/iris) 96 | """ 97 | 98 | navpane = pn.pane.Markdown(navbar) 99 | --------------------------------------------------------------------------------