├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── benchmarking_examples.ipynb ├── calculate_ged.py ├── custom_rnn.py ├── data.py ├── data ├── datasets.zip ├── doc2vec_features.csv ├── doc2vec_features_lowdim.csv └── recepies_example.json ├── embed_regularize.py ├── locked_dropout.py ├── main_one_model_train.py ├── make_arch_embeddings.ipynb ├── model.py ├── models_weights └── dump_weights_model_2226_2020-04-18_07-35-19_999938929.pt ├── multilinear.py ├── nas_environment.py ├── plotting.py ├── reproduce_model.ipynb ├── requirements.txt ├── search_space.py ├── search_space_analysis.ipynb ├── search_space_examples.ipynb ├── setup.py ├── splitcross.py ├── train.py ├── train_logs_multi_runs └── logs.zip ├── train_logs_single_run └── logs.zip ├── train_logs_wikitext-2 └── logs.zip ├── utils.py └── weight_drop.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | train_logs_single_run/* 4 | !train_logs_single_run/logs.zip 5 | train_logs_multi_runs/* 6 | !train_logs_multi_runs/logs.zip 7 | train_logs_wikitext-2/* 8 | !train_logs_wikitext-2/logs.zip 9 | data/ptb 10 | data/wikitext-2 11 | data/figures 12 | models_weights/* 13 | !models_weights/dump_weights_model_2226_2020-04-18_07-35-19_999938929.pt 14 | tmp/ 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NAS-Bench-NLP 2 | 3 | Preparation: 4 | 1. unzip data/datasets.zip, train\_logs\_single\_run/logs.zip, train\_logs\_multi\_runs/logs.zip, and train\_logs\_wikitext-2/logs.zip; 5 | 2. install requirements.txt (currently contains unused packages; to be cleaned); 6 | 3. 
optionally, copy models from Dropbox (sample: https://www.dropbox.com/sh/qviytkrlbu2cy5u/AABy59Bb9CpiS7D4osbvY_xva?dl=0, all models: https://www.dropbox.com/scl/fo/4r36x7wqb6gvzcmz8zo61/AIzcRCPZhmzORxJdSI2AdtY?rlkey=516wk0knseuuow45wn4mhy0ak&e=1&dl=0) to the folder models\_weights. 7 | 8 | Usage: 9 | * search\_space\_examples.ipynb demonstrates how to generate architectures from the search space; 10 | * to train a model, run the script main\_one\_model\_train.py with --recepie\_id set to the index of an architecture; the list of architectures is by default in data/recepies\_example.json; logs and final weights will be stored in the tmp folder by default (see the script arguments for more info); 11 | * reproduce\_model.ipynb demonstrates how to load and apply the trained model; 12 | * make\_arch\_embeddings.ipynb creates graph2vec features for architectures; 13 | * search\_space\_analysis.ipynb reproduces figures from the analysis section in the paper; 14 | * benchmarking\_examples.ipynb shows how NAS methods can be tested based on precomputed results in the logs. 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/__init__.py -------------------------------------------------------------------------------- /benchmarking_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import json\n", 20 | "import os\n", 21 | "import numpy as np\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import matplotlib \n", 32 | "import matplotlib.pyplot as plt\n", 33 | "%matplotlib inline" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from tqdm import tqdm_notebook" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from nas_environment import Environment" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from sklearn.ensemble import BaggingRegressor\n", 61 | "from xgboost import XGBRegressor\n", 62 | "from sklearn.preprocessing import StandardScaler\n", 63 | "import sklearn.metrics.pairwise\n", 64 | "from scipy.spatial import distance" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from hyperopt import hp\n", 74 | "from hyperopt import fmin, tpe, space_eval" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# Set-up the environment" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "precomputed_logs_path = 'train_logs_single_run/'" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | 
"metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "env = Environment(precomputed_logs_path)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "len(env._logs)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "search_set = env.get_precomputed_recepies()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "alg_resutls = {}" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "rounds = 5\n", 136 | "iters_per_round = 100" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# Random seach" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "for train_epochs in [10, 50]:\n", 153 | " all_test_losses = []\n", 154 | " all_wall_times = []\n", 155 | "\n", 156 | " N = int(iters_per_round*50/train_epochs)\n", 157 | " \n", 158 | " for seed in tqdm_notebook(range(rounds)):\n", 159 | " np.random.seed(seed)\n", 160 | " env.reset()\n", 161 | " selected_inds = []\n", 162 | " test_losses = []\n", 163 | " wall_times = []\n", 164 | " for i in range(N):\n", 165 | " cur_ind = np.random.choice(np.setdiff1d(np.arange(len(search_set)), np.array(selected_inds)), \n", 166 | " 1, replace=False)[0]\n", 167 | " env.simulated_train(search_set[cur_ind], train_epochs)\n", 168 | " selected_inds.append(cur_ind)\n", 169 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 170 | " wall_times.append(env.get_total_time())\n", 171 | " all_test_losses.append(test_losses)\n", 172 | " all_wall_times.append(wall_times)\n", 173 | " alg_resutls[f'random_search_{train_epochs}_epochs'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# Hyperbands" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "all_test_losses = []\n", 190 | "all_wall_times = []\n", 191 | "N = iters_per_round\n", 192 | "for seed in tqdm_notebook(range(rounds)):\n", 193 | " env.reset()\n", 194 | " np.random.seed(seed)\n", 195 | "\n", 196 | " # HYPERBAND\n", 197 | " \n", 198 | " #inputs\n", 199 | " R = 50 # the maximum amount of resource that can be allocated to a single configuration (number of epochs)\n", 200 | " eta = 3 # an input that controls the proportion of configurations discarded in each round of SuccessiveHalving\n", 201 | "\n", 202 | " # initialization\n", 203 | " s_max = int(np.floor(np.log(R)/np.log(eta)))\n", 204 | " # B = (s_max + 1)*R\n", 205 | " B = N*R/3.5 # to approximately match budgets in random search\n", 206 | "\n", 207 | " test_losses = []\n", 208 | " wall_times = []\n", 209 | " \n", 210 | " log_cnt = 0\n", 211 | " for s in range(s_max, -1, -1):\n", 212 | " n = int(np.ceil(float(B)/R * float(eta)**s/(s + 1)))\n", 213 | " r = R*float(eta)**(-s)\n", 214 | " #print(s, n, r)\n", 215 | " # Successive Halving inner loop\n", 216 | " # init sample of n architectures\n", 217 | " T = np.random.choice(len(search_set), n, replace=False)\n", 218 | " 
#print(T)\n", 219 | "        for i in range(s + 1):\n", 220 | "            n_i = int(np.floor(n*float(eta)**(-i)))\n", 221 | "            r_i = int(np.floor(r*eta**i))\n", 222 | "            L = []\n", 223 | "            for t in T:\n", 224 | "                env.simulated_train(search_set[t], r_i)\n", 225 | "                if env.get_model_status(search_set[t]) == 'OK':\n", 226 | "                    L.append(env.get_model_stats(search_set[t], r_i - 1)['val_loss'])\n", 227 | "                else:\n", 228 | "                    L.append(np.inf) # if the model fails accidentally within r_i epochs, it is discarded later\n", 229 | "                log_cnt += 1\n", 230 | "                if log_cnt % 25 == 0:\n", 231 | "                    test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 232 | "                    wall_times.append(env.get_total_time())\n", 233 | "            test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 234 | "            wall_times.append(env.get_total_time())\n", 235 | "\n", 236 | "            L = np.array(L)\n", 237 | "            halved_inds = np.argsort(L)[:int(np.floor(n_i/float(eta)))]\n", 238 | "            halved_inds = halved_inds[L[halved_inds] < np.inf] # discard accidentally failed models\n", 239 | "            T = T[halved_inds]\n", 240 | "            #print(T)\n", 241 | "    all_test_losses.append(test_losses)\n", 242 | "    all_wall_times.append(wall_times)\n", 243 | "alg_resutls['hyperband'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "# BayesOpt" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "df_recepie_vectors = pd.read_csv('data/doc2vec_features.csv').set_index('recepie_id')\n", 260 | "df_recepie_vectors_lowdim = pd.read_csv('data/doc2vec_features_lowdim.csv').set_index('recepie_id')" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "search_set_recepie_ids = np.array(env.get_recepie_ids())" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "X_highdim = df_recepie_vectors.loc[search_set_recepie_ids].values\n", 279 | "X_lowdim = df_recepie_vectors_lowdim.loc[search_set_recepie_ids].values" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "for X, alias in zip([X_highdim, X_lowdim], ['50D', '10D']):\n", 289 | "# if alias == '50D':\n", 290 | "# continue\n", 291 | "    all_test_losses = []\n", 292 | "    all_wall_times = []\n", 293 | "    epochs_train = 50\n", 294 | "    N_init = 20 # check randomly a few architectures at first\n", 295 | "    beta = 2.0\n", 296 | "    N = int(1.3*iters_per_round)\n", 297 | "    train_batch = 10\n", 298 | "    for seed in tqdm_notebook(range(rounds)):\n", 299 | "    #for seed in tqdm_notebook(range(5)):\n", 300 | "        np.random.seed(seed)\n", 301 | "        env.reset()\n", 302 | "        selected_inds = []\n", 303 | "        test_losses = []\n", 304 | "        wall_times = []\n", 305 | "        X_train = []\n", 306 | "        y_train = []\n", 307 | "        # check a few random architectures at first\n", 308 | "        for i in range(N_init):\n", 309 | "            cur_ind = np.random.choice(np.setdiff1d(np.arange(len(search_set)), np.array(selected_inds)), \n", 310 | "                                       1, replace=False)[0]\n", 311 | "            env.simulated_train(search_set[cur_ind], epochs_train)\n", 312 | "            selected_inds.append(cur_ind)\n", 313 | "            if env.get_model_status(search_set[cur_ind]) == 'OK':\n", 314 | "                X_train.append(X[cur_ind])\n", 315 | "                
y_train.append(env.get_model_stats(search_set[cur_ind], epochs_train - 1)['val_loss'])\n", 316 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 317 | " wall_times.append(env.get_total_time())\n", 318 | "\n", 319 | "\n", 320 | " regr = BaggingRegressor(XGBRegressor(n_estimators=100, max_depth=15), \n", 321 | " n_estimators=14, max_samples=0.5, n_jobs=14)\n", 322 | "\n", 323 | " # train estimator and score new candidates according to the lower-confidence-bound acquisition function\n", 324 | " for i in range(N_init, N):\n", 325 | " if i % train_batch == 0:\n", 326 | " regr.fit(np.array(X_train), np.array(y_train))\n", 327 | " y_pred_mean = regr.predict(X)\n", 328 | " y_pred_std = np.std([e.predict(X) for e in regr.estimators_], axis=0)\n", 329 | " scores = y_pred_mean - beta * y_pred_std\n", 330 | "\n", 331 | " scores[np.array(selected_inds)] = np.inf\n", 332 | "\n", 333 | " cur_ind = np.argmin(scores)\n", 334 | "\n", 335 | " env.simulated_train(search_set[cur_ind], epochs_train)\n", 336 | " if env.get_model_status(search_set[cur_ind]) == 'OK':\n", 337 | " X_train.append(X[cur_ind])\n", 338 | " y_train.append(env.get_model_stats(search_set[cur_ind], epochs_train - 1)['val_loss'])\n", 339 | " selected_inds.append(cur_ind)\n", 340 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 341 | " wall_times.append(env.get_total_time())\n", 342 | " all_test_losses.append(test_losses)\n", 343 | " all_wall_times.append(wall_times)\n", 344 | "\n", 345 | "\n", 346 | " alg_resutls[f'bayes_opt_{alias}'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "# Regularized evolution" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "def mutate_embedded(e, std=1, axes_bounds=None):\n", 363 | " e_new = e + np.random.randn(len(e)) * std\n", 364 | " if axes_bounds is not None:\n", 365 | " e_new = np.clip(e_new, axes_bounds[0], axes_bounds[1])\n", 366 | " return e_new" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "def find_closest(E, e):\n", 376 | " #dists = np.linalg.norm(E - e.reshape(1, -1), axis=1)\n", 377 | " dists = distance.cdist([e], E, \"cosine\")[0]\n", 378 | " return np.argmin(dists)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "all_test_losses = []\n", 388 | "all_wall_times = []\n", 389 | "\n", 390 | "train_epochs = 50\n", 391 | "P = 20\n", 392 | "C = int(1.3*iters_per_round)\n", 393 | "S = 10\n", 394 | "\n", 395 | "axes_bounds = (np.min(X, axis=0), np.max(X, axis=0))\n", 396 | "\n", 397 | "for seed in tqdm_notebook(range(rounds)):\n", 398 | " np.random.seed(seed)\n", 399 | " env.reset()\n", 400 | " test_losses = []\n", 401 | " wall_times = []\n", 402 | " \n", 403 | " # init first P architectures\n", 404 | " population = []\n", 405 | " history = []\n", 406 | " for i in np.random.choice(np.arange(len(search_set)), P, replace=False):\n", 407 | " env.simulated_train(search_set[i], train_epochs)\n", 408 | " population.append(i)\n", 409 | " history.append(i)\n", 410 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 411 | " wall_times.append(env.get_total_time())\n", 412 | " \n", 413 | " 
attempt = 0\n", 414 | " valid_round = True\n", 415 | " while len(history) < C:\n", 416 | " sample = np.random.choice(population, S, replace=False)\n", 417 | " sample_scores = [env.get_model_stats(search_set[i], epochs_train - 1)['val_loss'] \n", 418 | " for i in sample if env.get_model_status(search_set[i]) == 'OK']\n", 419 | " if len(sample_scores) == 0: \n", 420 | " # this is unlikely to happen, but just to make sure that the code will work anyway\n", 421 | " attempt += 1\n", 422 | " if attempt > 5:\n", 423 | " valid_round = False\n", 424 | " break\n", 425 | " else:\n", 426 | " continue\n", 427 | " else:\n", 428 | " attempt = 0\n", 429 | " parent = sample[np.argmin(sample_scores)]\n", 430 | " \n", 431 | " for std in [0.5, 1.0, 2.0, 4.0, 8.0]:\n", 432 | " e_new = mutate_embedded(X[parent], std, axes_bounds)\n", 433 | " child = find_closest(X, e_new)\n", 434 | " if child != parent:\n", 435 | " # stop when we find a child that differs from the parent\n", 436 | " break\n", 437 | " \n", 438 | " env.simulated_train(search_set[child], train_epochs)\n", 439 | " history.append(child)\n", 440 | " population = population[1:] + [child]\n", 441 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 442 | " wall_times.append(env.get_total_time())\n", 443 | " \n", 444 | " if valid_round:\n", 445 | " all_test_losses.append(test_losses)\n", 446 | " all_wall_times.append(wall_times)\n", 447 | " #break\n", 448 | " \n", 449 | "alg_resutls['regularized_evolution'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "# TPE" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "def objective_vec(vec):\n", 466 | " \n", 467 | " # The most similar vector in X:\n", 468 | " distances = distance.cdist([vec], X, \"cosine\")[0]\n", 469 | " #distances = np.linalg.norm(X - vec.reshape(1, -1), axis=1)\n", 470 | " recepie_id = np.argmin(distances)\n", 471 | " recepie = search_set[recepie_id]\n", 472 | "\n", 473 | "\n", 474 | " env.simulated_train(recepie, epochs_train)\n", 475 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 476 | " wall_times.append(env.get_total_time())\n", 477 | "\n", 478 | "\n", 479 | " if env.get_model_status(recepie) == 'OK':\n", 480 | " result = env.get_model_stats(recepie, epochs_train - 1)['val_loss']\n", 481 | " else:\n", 482 | " result = 10\n", 483 | " return result" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "def objective_dict(vec_as_dict):\n", 493 | " \n", 494 | " vec = np.zeros(50)\n", 495 | " for k, v in vec_as_dict.items():\n", 496 | " vec[int(k)] = v\n", 497 | " \n", 498 | " return objective_vec(vec)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "X = X_highdim" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "components_space = {}\n", 517 | "for i in range(X.shape[-1]):\n", 518 | " min_val = X[:, i].min()\n", 519 | " max_val = X[:, i].max()\n", 520 | " components_space[i] = hp.uniform(f'component_{i}', min_val, max_val)\n", 521 | "search_space = components_space" 522 | ] 523 | }, 524 | { 525 | 
"cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "N = int(1.3*iters_per_round)" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "for epochs_train in [50]:\n", 540 | " all_test_losses = []\n", 541 | " all_wall_times = []\n", 542 | "\n", 543 | " for seed in tqdm_notebook(range(rounds)):\n", 544 | " np.random.seed(seed)\n", 545 | " os.environ['HYPEROPT_FMIN_SEED'] = str(seed)\n", 546 | " env.reset()\n", 547 | " test_losses, wall_times = [], []\n", 548 | " \n", 549 | " # minimize the objective over the space\n", 550 | " best = fmin(objective_dict, search_space, algo=tpe.suggest, max_evals=N,\n", 551 | " verbose=False, show_progressbar=True, max_queue_len=20)\n", 552 | " all_test_losses.append(test_losses)\n", 553 | " all_wall_times.append(wall_times)\n", 554 | "\n", 555 | " alg_resutls[f'TPE_{epochs_train}_epochs'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "metadata": {}, 561 | "source": [ 562 | "# SMAC" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "from smac.facade.func_facade import fmin_smac\n", 572 | "from smac.initial_design.latin_hypercube_design import LHDesign\n", 573 | "import logging\n", 574 | "from ConfigSpace.hyperparameters import UniformFloatHyperparameter\n", 575 | "\n", 576 | "# Import ConfigSpace and different types of parameters\n", 577 | "from smac.configspace import ConfigurationSpace\n", 578 | "from smac.facade.smac_hpo_facade import SMAC4HPO\n", 579 | "from smac.facade.smac_bo_facade import SMAC4BO\n", 580 | "from smac.initial_design.latin_hypercube_design import LHDesign\n", 581 | "from smac.optimizer.acquisition import LCB, EI, PI\n", 582 | "from smac.runhistory.runhistory2epm import RunHistory2EPM4InvScaledCost\n", 583 | "# Import SMAC-utilities\n", 584 | "from smac.scenario.scenario import Scenario" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "from smac.facade.smac_ac_facade import SMAC4AC\n", 594 | "from smac.scenario.scenario import Scenario\n", 595 | "from smac.tae.execute_ta_run import ExecuteTARun\n", 596 | "from smac.tae.execute_func import ExecuteTAFuncDict\n", 597 | "from smac.configspace import ConfigurationSpace\n", 598 | "from smac.stats.stats import Stats\n", 599 | "from smac.initial_design.random_configuration_design import RandomConfigurations\n", 600 | "from smac.initial_design.latin_hypercube_design import LHDesign\n", 601 | "from ConfigSpace.hyperparameters import UniformFloatHyperparameter\n", 602 | "import json\n", 603 | "import pathlib" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "class SMACUtils(object):\n", 613 | " def __init__(self, env, X=X, search_set=search_set,\n", 614 | " epochs_train=epochs_train):\n", 615 | "\n", 616 | " self.env = env\n", 617 | " self.X = X\n", 618 | " self.search_set = search_set\n", 619 | " self.epochs_train = epochs_train\n", 620 | " self.stat = {}\n", 621 | "# self.stat_file = stat_file\n", 622 | " \n", 623 | "# with open(self.stat_file, \"w\") as f:\n", 624 | "# json.dump({}, f)\n", 625 | " \n", 626 | " def objective_function(self, config):\n", 
627 | " vec = self._config_to_vec(config)\n", 628 | " \n", 629 | " distances = distance.cdist([vec], self.X, \"cosine\")[0]\n", 630 | " recepie_id = np.argmin(distances)\n", 631 | " recepie = self.search_set[recepie_id]\n", 632 | " \n", 633 | " self.env.simulated_train(recepie, self.epochs_train)\n", 634 | " \n", 635 | " test_loss = self.env.get_test_loss_of_the_best_validated_architecture()\n", 636 | " wall_time = self.env.get_total_time()\n", 637 | " self._collect_eval_stat(test_loss, wall_time)\n", 638 | " \n", 639 | " if self.env.get_model_status(recepie) == 'OK':\n", 640 | " r = self.env.get_model_stats(recepie, self.epochs_train - 1)['val_loss']\n", 641 | " else:\n", 642 | " r = 1000\n", 643 | " \n", 644 | " return r, {\"test_loss\": test_loss, \"wall_time\": wall_time}\n", 645 | "\n", 646 | " \n", 647 | " def _config_to_vec(self, config):\n", 648 | " vec_as_dict = config.get_dictionary()\n", 649 | " vec = np.zeros(self.X.shape[-1])\n", 650 | " for k, v in vec_as_dict.items():\n", 651 | " vec[int(k)] = v\n", 652 | " return vec\n", 653 | " \n", 654 | " def _collect_eval_stat(self, test_loss, wall_time):\n", 655 | " stat = self.stat\n", 656 | " \n", 657 | " if 'test_losses' not in stat:\n", 658 | " stat['test_losses'] = []\n", 659 | " stat['test_losses'].append(test_loss)\n", 660 | " \n", 661 | " if 'wall_times' not in stat:\n", 662 | " stat['wall_times'] = []\n", 663 | " stat['wall_times'].append(wall_time)\n", 664 | " \n", 665 | " if 'eval_step' not in stat:\n", 666 | " stat['eval_step'] = 0\n", 667 | " stat['eval_step'] += 1\n", 668 | " \n" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "for X in [X_lowdim]:\n", 678 | " axes_bounds = (np.min(X, axis=0), np.max(X, axis=0))\n", 679 | " bounds = list(zip(axes_bounds[0], axes_bounds[1]))\n", 680 | "\n", 681 | " cs = ConfigurationSpace()\n", 682 | " cs.add_hyperparameters([\n", 683 | " UniformFloatHyperparameter(str(i), X[:, i].min(), X[:, i].max(), default_value=0)\n", 684 | " for i in range(X.shape[-1])\n", 685 | " ]);\n", 686 | "\n", 687 | " for initial_design in [LHDesign]: #[RandomConfigurations, LHDesign]:\n", 688 | " all_test_losses = []\n", 689 | " all_wall_times = []\n", 690 | "\n", 691 | " for seed in range(rounds):\n", 692 | " print(f\"START WITH INITIAL DESIGN: {initial_design.__name__} SEED: {seed}\")\n", 693 | " scenario = Scenario({\"run_obj\": \"quality\",\n", 694 | " \"runcount-limit\": int(1.2*iters_per_round),\n", 695 | " \"wallclock-limit\": 3000,\n", 696 | " \"cs\": cs,\n", 697 | " \"deterministic\": \"true\",\n", 698 | " \"initial_incumbent\": \"RANDOM\",\n", 699 | " \"output_dir\": \"./tmp\",\n", 700 | " \"seed\": seed,\n", 701 | " \"limit_resources\": \"false\"})\n", 702 | "\n", 703 | " env.reset()\n", 704 | " b = SMACUtils(env, X=X)\n", 705 | "\n", 706 | " def objective_function(config, **kwargs):\n", 707 | " y, stat = b.objective_function(config)\n", 708 | " return float(y)\n", 709 | "\n", 710 | " stats = Stats(scenario=scenario)\n", 711 | " smac = SMAC4AC(scenario=scenario,\n", 712 | " tae_runner=objective_function,\n", 713 | " initial_design=initial_design)\n", 714 | " smac.optimize()\n", 715 | "\n", 716 | " stat = b.stat\n", 717 | " all_test_losses.append(stat['test_losses'])\n", 718 | " all_wall_times.append(stat['wall_times'])\n", 719 | "\n", 720 | " alg_resutls[f'SMAC[{initial_design.__name__}_{X.shape[-1]}D]'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 721 | ] 722 | }, 723 | 
{ 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "# Plot results" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "legend_algs = {\n", 737 | "    'random_search_50_epochs': 'RS 50E',\n", 738 | "    'random_search_10_epochs': 'RS 10E',\n", 739 | "    'hyperband':'HB',\n", 740 | "    'bayes_opt_50D':'BO 50D',\n", 741 | "    'bayes_opt_10D':'BO 10D',\n", 742 | "    'regularized_evolution':'RE',\n", 743 | "    'TPE_50_epochs':'TPE',\n", 744 | "    'SMAC[LHDesign_10D]':'SMAC'\n", 745 | "}" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "y_opt = env.get_best_possible_test_loss()" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "plt.figure(figsize=(8, 5), dpi=100)\n", 764 | "\n", 765 | "\n", 766 | "for i, alg_name in enumerate(['random_search_50_epochs', 'random_search_10_epochs', 'hyperband', \n", 767 | "                              'bayes_opt_50D', 'bayes_opt_10D', 'regularized_evolution', 'TPE_50_epochs',\n", 768 | "                              'SMAC[LHDesign_10D]']):\n", 769 | "    all_test_losses = alg_resutls[alg_name]['all_test_losses']\n", 770 | "    all_wall_times = alg_resutls[alg_name]['all_wall_times']\n", 771 | "\n", 772 | "    all_xs = np.array(all_test_losses)\n", 773 | "    all_ts = np.array(all_wall_times)/3600.\n", 774 | "\n", 775 | "    s = 1.96/np.sqrt(all_xs.shape[0])\n", 776 | "\n", 777 | "    all_ts_mean = all_ts.mean(axis=0)\n", 778 | "\n", 779 | "    all_ts_max = all_ts_mean + s*all_ts.std(axis=0)\n", 780 | "    all_ts_min = all_ts_mean - s*all_ts.std(axis=0)\n", 781 | "\n", 782 | "    all_xs_mean = np.nanmean(all_xs, axis=0)\n", 783 | "\n", 784 | "    all_xs_max = all_xs_mean + s*np.nanstd(all_xs, axis=0)\n", 785 | "    all_xs_min = all_xs_mean - s*np.nanstd(all_xs, axis=0)\n", 786 | "\n", 787 | "\n", 788 | "    plt.plot(all_ts_mean, all_xs_mean - y_opt, lw=1.5, color=f'C{i}', label=legend_algs[alg_name])\n", 789 | "\n", 790 | "    plt.fill_between(all_ts_mean, all_xs_min - y_opt, all_xs_max - y_opt, alpha=0.1, edgecolor=f'C{i}')\n", 791 | "\n", 792 | "plt.legend()\n", 793 | "plt.xlabel('Total train time [h]', fontsize=14)\n", 794 | "plt.ylabel('Regret', fontsize=14)\n", 795 | "plt.ylim([0.1, 1.0])\n", 796 | "plt.xscale('log')\n", 797 | "plt.yscale('log')\n", 798 | "plt.xlim([5, 1500])\n", 799 | "plt.xticks([10, 20, 50, 100, 200, 500, 1000]);\n", 800 | "plt.gca().get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())\n", 801 | "plt.savefig('data/figures/benchmarks_log_y_scale.png', dpi=300, bbox_inches='tight')" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": null, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "from matplotlib import ticker\n", 811 | "\n", 812 | "plt.figure(figsize=(8, 5), dpi=100)\n", 813 | "\n", 814 | "for i, alg_name in enumerate(['random_search_50_epochs', 'random_search_10_epochs', 'hyperband', \n", 815 | "                              'bayes_opt_50D', 'bayes_opt_10D', 'regularized_evolution', 'TPE_50_epochs',\n", 816 | "                              'SMAC[LHDesign_10D]']):\n", 817 | "    all_test_losses = alg_resutls[alg_name]['all_test_losses']\n", 818 | "    all_wall_times = alg_resutls[alg_name]['all_wall_times']\n", 819 | "\n", 820 | "    all_xs = np.array(all_test_losses)\n", 821 | "    all_ts = np.array(all_wall_times)/3600.\n", 822 | "\n", 823 | "    s = 1.96/np.sqrt(all_xs.shape[0])\n", 824 | "\n", 825 | "    all_ts_mean = 
all_ts.mean(axis=0)\n", 826 | "\n", 827 | " all_ts_max = all_ts_mean + s*all_ts.std(axis=0)\n", 828 | " all_ts_min = all_ts_mean - s*all_ts.std(axis=0)\n", 829 | "\n", 830 | " all_xs_mean = np.nanmean(all_xs, axis=0)\n", 831 | "\n", 832 | " all_xs_max = all_xs_mean + s*np.nanstd(all_xs, axis=0)\n", 833 | " all_xs_min = all_xs_mean - s*np.nanstd(all_xs, axis=0)\n", 834 | "\n", 835 | " plt.plot(np.sort(all_xs[:, -1]) - y_opt, np.linspace(0, 1, len(all_xs)), color=f'C{i}', label=legend_algs[alg_name])\n", 836 | "plt.legend()\n", 837 | "\n", 838 | "\n", 839 | "plt.xlabel('Final test regret', fontsize=14)\n", 840 | "plt.ylabel('CDF', fontsize=14)\n", 841 | "plt.savefig('data/figures/benchmarks_CDF_regret.png', dpi=300, bbox_inches='tight')" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": null, 847 | "metadata": {}, 848 | "outputs": [], 849 | "source": [] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python 3", 862 | "language": "python", 863 | "name": "python3" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 3 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython3", 875 | "version": "3.6.10" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 2 880 | } 881 | -------------------------------------------------------------------------------- /calculate_ged.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import contextlib 3 | import itertools 4 | import json 5 | import random 6 | import time 7 | from pathlib import Path 8 | 9 | import joblib 10 | import networkx as nx 11 | from interruptingcow import Quota, timeout 12 | from joblib import Parallel, delayed 13 | from tqdm.auto import tqdm 14 | 15 | from utils import make_graph 16 | 17 | 18 | @contextlib.contextmanager 19 | def tqdm_joblib(tqdm_object): 20 | """Context manager to patch joblib to report into tqdm progress bar given as argument""" 21 | class TqdmBatchCompletionCallback: 22 | def __init__(self, time, index, parallel): 23 | self.index = index 24 | self.parallel = parallel 25 | 26 | def __call__(self, index): 27 | tqdm_object.update() 28 | if self.parallel._original_iterator is not None: 29 | self.parallel.dispatch_next() 30 | 31 | old_batch_callback = joblib.parallel.BatchCompletionCallBack 32 | joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback 33 | try: 34 | yield tqdm_object 35 | finally: 36 | joblib.parallel.BatchCompletionCallBack = old_batch_callback 37 | tqdm_object.close() 38 | 39 | def calc_ged(recepie1, recepie2, timeout_val=600): 40 | start_time = time.time() 41 | G1 = make_graph(recepie1) 42 | G2 = make_graph(recepie2) 43 | ged = None 44 | 45 | try: 46 | status = "OK" 47 | with timeout(Quota(timeout_val), exception=RuntimeError): 48 | for ged in nx.optimize_graph_edit_distance(G1, G2, lambda n1, n2: n1['op'] == n2['op']): 49 | pass 50 | 51 | except RuntimeError as e: 52 | status = "Timeout" 53 | 54 | except Exception as e: 55 | status = "Exception: " + str(e) 56 | 57 | return { 58 | "recepie_i": recepie1, 59 | "recepie_j": recepie2, 60 | "ged": ged, 61 | "time": time.time() - start_time, 62 | "status": status 63 | } 64 | 65 | if __name__ == "__main__": 66 | parser = 
argparse.ArgumentParser(description='Calculate GED') 67 | parser.add_argument('--recepies', type=str, default="./new_recepies_fix.json", 68 |                     help='path to JSON file with recepies') 69 | parser.add_argument('--num', type=int, default=10, 70 |                     help='number of random recepies for which GED to all the others is calculated') 71 | parser.add_argument('--timeout', type=int, default=600, help="timeout for calculating one GED value in seconds") 72 | parser.add_argument('--n_jobs', type=int, default=-2, 73 |                     help="n_jobs in scikit-learn style") 74 | parser.add_argument('--num_parts', type=int, default=10, 75 |                     help="number of result parts for saving") 76 | 77 | args = parser.parse_args() 78 | 79 | with open(args.recepies, "r") as f: 80 |     recepies = json.load(f) 81 | 82 | key_recepies = random.sample(recepies, args.num) 83 | part_size = len(recepies)//args.num_parts 84 | for part in range(1, args.num_parts+1): 85 |     _recepies = recepies[(part-1)*part_size:part*part_size] 86 |     combs = list(itertools.product(key_recepies, _recepies)) 87 | 88 |     with tqdm_joblib(tqdm(desc="GED part {} of {}".format(part, args.num_parts), total=len(combs))) as progress_bar: 89 |         results = Parallel(n_jobs=args.n_jobs, backend='multiprocessing')(delayed(calc_ged)(r1, r2, args.timeout) for r1, r2 in combs) 90 | 91 |     with open("GED_CALC_RESULTS_part_{}.json".format(part), 'w') as f: 92 |         json.dump(results, f) 93 | -------------------------------------------------------------------------------- /custom_rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | import networkx as nx 4 | 5 | from multilinear import MultiLinear 6 | import math 7 | 8 | class CustomRNNCell(torch.nn.Module): 9 | 10 |     elementwise_ops_dict = { 11 |         'prod': torch.mul, 12 |         'sum': torch.add 13 |     } 14 | 15 |     def __init__(self, input_size, hidden_size, recepie): 16 |         super(CustomRNNCell, self).__init__() 17 | 18 |         self.activations_dict = { 19 |             'tanh': torch.nn.Tanh(), 20 |             'sigm': torch.nn.Sigmoid(), 21 |             'leaky_relu': torch.nn.LeakyReLU() 22 |         } 23 | 24 |         self.input_size = input_size 25 |         self.hidden_size = hidden_size 26 |         self.recepie = recepie 27 |         self.hidden_tuple_size = 0 28 | 29 |         components_dict = {} 30 | 31 |         self.G = nx.DiGraph() 32 |         for k in recepie.keys(): 33 |             if k not in components_dict: 34 | 35 |                 component = self._make_component(recepie[k]) 36 |                 if component is not None: 37 |                     components_dict[k] = component 38 |                 if k.startswith('h_new'): 39 |                     suffix = k.replace('h_new_', '') 40 |                     if suffix.isdigit(): 41 |                         self.hidden_tuple_size = max([self.hidden_tuple_size, int(suffix) + 1]) 42 | 43 |             if k not in self.G.nodes(): 44 |                 self.G.add_node(k) 45 |             for i, n in enumerate(recepie[k]['input']): 46 |                 if n not in self.G.nodes(): 47 |                     self.G.add_node(n) 48 |                 self.G.add_edge(n, k) 49 | 50 |         self.components = torch.nn.ModuleDict(components_dict) 51 |         self.nodes_order = list(nx.algorithms.dag.topological_sort(self.G)) 52 | 53 |     def forward(self, x, hidden_tuple): 54 |         calculated_nodes = {} 55 |         for n in self.nodes_order: 56 |             if n == 'x': 57 |                 calculated_nodes['x'] = x.unsqueeze(0) 58 |             elif n.startswith('h_prev') and n.replace('h_prev_', '').isdigit(): 59 |                 calculated_nodes[n] = hidden_tuple[int(n.replace('h_prev_', ''))].unsqueeze(0) 60 |             elif n in self.components: 61 |                 inputs = [calculated_nodes[k] for k in self.recepie[n]['input']] 62 |                 calculated_nodes[n] = self.components[n](*inputs) 63 |             else: 64 |                 # simple operations 65 |                 op = self.recepie[n]['op'] 66 |                 inputs = [calculated_nodes[k] for k in 
self.recepie[n]['input']] 67 |                 if op in ['elementwise_prod', 'elementwise_sum']: 68 |                     op_func = CustomRNNCell.elementwise_ops_dict[op.replace('elementwise_', '')] 69 |                     calculated_nodes[n] = op_func(inputs[0], inputs[1]) 70 |                     for inp in range(2, len(inputs)): 71 |                         calculated_nodes[n] = op_func(calculated_nodes[n], inputs[inp]) 72 |                 elif op == 'blend': 73 |                     calculated_nodes[n] = inputs[0]*inputs[1] + (1 - inputs[0])*inputs[2] 74 |                 elif op.startswith('activation'): 75 |                     op_func = self.activations_dict[op.replace('activation_', '')] 76 |                     calculated_nodes[n] = op_func(inputs[0]) 77 |         return tuple([calculated_nodes[f'h_new_{i}'][0] for i in range(self.hidden_tuple_size)]) 78 | 79 |     def _make_component(self, spec): 80 |         if spec['op'] == 'linear': 81 |             input_sizes = [self.input_size if inp=='x' else self.hidden_size for inp in spec['input']] 82 |             return MultiLinear(input_sizes, self.hidden_size) 83 | 84 | 85 | class CustomRNN(torch.nn.Module): 86 | 87 |     def __init__(self, input_size, hidden_size, recepie): 88 |         super(CustomRNN, self).__init__() 89 |         self.hidden_size = hidden_size 90 |         self.cell = CustomRNNCell(input_size, hidden_size, recepie) 91 |         self.reset_parameters() 92 | 93 |     def forward(self, inputs, hidden_tuple=None): 94 |         batch_size = inputs.size(1) 95 |         if hidden_tuple is None: 96 |             hidden_tuple = tuple([self.init_hidden(batch_size) for _ in range(self.cell.hidden_tuple_size)]) 97 | 98 |         self.check_hidden_size(hidden_tuple, batch_size) 99 | 100 |         hidden_tuple = tuple([x[0] for x in hidden_tuple]) 101 |         outputs = [] 102 |         for x in torch.unbind(inputs, dim=0): 103 |             hidden_tuple = self.cell(x, hidden_tuple) 104 |             outputs.append(hidden_tuple[0].clone()) 105 | 106 |         return torch.stack(outputs, dim=0), tuple([x.unsqueeze(0) for x in hidden_tuple]) 107 | 108 |     def init_hidden(self, batch_size): 109 |         # num_layers == const (1) 110 |         return torch.zeros(1, batch_size, self.hidden_size).to(next(self.parameters()).device) 111 | 112 |     def reset_parameters(self): 113 |         stdv = 1.0 / math.sqrt(self.hidden_size) 114 |         for param in self.parameters(): 115 |             torch.nn.init.uniform_(param, -stdv, stdv) 116 | 117 |     def check_hidden_size(self, hidden_tuple, batch_size): 118 |         expected_hidden_size = (1, batch_size, self.hidden_size) 119 |         msg = 'Expected hidden size {}, got {}' 120 |         for hx in hidden_tuple: 121 |             if hx.size() != expected_hidden_size: 122 |                 raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size()))) 123 | 124 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from collections import Counter 5 | 6 | 7 | class Dictionary(object): 8 |     def __init__(self): 9 |         self.word2idx = {} 10 |         self.idx2word = [] 11 |         self.counter = Counter() 12 |         self.total = 0 13 | 14 |     def add_word(self, word): 15 |         if word not in self.word2idx: 16 |             self.idx2word.append(word) 17 |             self.word2idx[word] = len(self.idx2word) - 1 18 |         token_id = self.word2idx[word] 19 |         self.counter[token_id] += 1 20 |         self.total += 1 21 |         return self.word2idx[word] 22 | 23 |     def __len__(self): 24 |         return len(self.idx2word) 25 | 26 | 27 | class Corpus(object): 28 |     def __init__(self, path): 29 |         self.dictionary = Dictionary() 30 |         self.train = self.tokenize(os.path.join(path, 'train.txt')) 31 |         self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 32 |         self.test = self.tokenize(os.path.join(path, 'test.txt')) 33 | 34 |     def tokenize(self, path): 35 |         """Tokenizes a text 
file.""" 36 | assert os.path.exists(path) 37 | # Add words to the dictionary 38 | with open(path, 'r') as f: 39 | tokens = 0 40 | for line in f: 41 | words = line.split() + [''] 42 | tokens += len(words) 43 | for word in words: 44 | self.dictionary.add_word(word) 45 | 46 | # Tokenize file content 47 | with open(path, 'r') as f: 48 | ids = torch.LongTensor(tokens) 49 | token = 0 50 | for line in f: 51 | words = line.split() + [''] 52 | for word in words: 53 | ids[token] = self.dictionary.word2idx[word] 54 | token += 1 55 | 56 | return ids 57 | -------------------------------------------------------------------------------- /data/datasets.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/data/datasets.zip -------------------------------------------------------------------------------- /data/recepies_example.json: -------------------------------------------------------------------------------- 1 | [{"f": {"op": "linear", "input": ["x", "h_prev_0"]}, "h_new_0": {"op": "activation_tanh", "input": ["f"]}}, {"i": {"op": "linear", "input": ["x", "h_prev_0"]}, "i_act": {"op": "activation_tanh", "input": ["i"]}, "j": {"op": "linear", "input": ["x", "h_prev_0"]}, "j_act": {"op": "activation_sigm", "input": ["j"]}, "f": {"op": "linear", "input": ["x", "h_prev_0"]}, "f_act": {"op": "activation_sigm", "input": ["f"]}, "o": {"op": "linear", "input": ["x", "h_prev_0"]}, "o_act": {"op": "activation_tanh", "input": ["o"]}, "h_new_1_part1": {"op": "elementwise_prod", "input": ["f_act", "h_prev_1"]}, "h_new_1_part2": {"op": "elementwise_prod", "input": ["i_act", "j_act"]}, "h_new_1": {"op": "elementwise_sum", "input": ["h_new_1_part1", "h_new_1_part2"]}, "h_new_1_act": {"op": "activation_tanh", "input": ["h_new_1"]}, "h_new_0": {"op": "elementwise_prod", "input": ["h_new_1_act", "o_act"]}}, {"r": {"op": "linear", "input": ["x", "h_prev_0"]}, "r_act": {"op": "activation_sigm", "input": ["r"]}, "z": {"op": "linear", "input": ["x", "h_prev_0"]}, "z_act": {"op": "activation_sigm", "input": ["z"]}, "rh": {"op": "elementwise_prod", "input": ["r_act", "h_prev_0"]}, "h_tilde": {"op": "linear", "input": ["x", "rh"]}, "h_tilde_act": {"op": "activation_tanh", "input": ["h_tilde"]}, "h_new_0": {"op": "blend", "input": ["z_act", "h_prev_0", "h_tilde_act"]}}, {"node_0": {"op": "linear", "input": ["h_prev_0", "x"]}, "h_new_0": {"op": "activation_tanh", "input": ["node_0"]}}, {"node_0": {"op": "linear", "input": ["x", "h_prev_0"]}, "node_1": {"op": "activation_tanh", "input": ["node_0"]}, "node_3": {"op": "linear", "input": ["h_prev_0", "node_1"]}, "node_4": {"op": "activation_leaky_relu", "input": ["node_3"]}, "node_6": {"op": "linear", "input": ["node_4", "node_1"]}, "node_7": {"op": "activation_tanh", "input": ["node_6"]}, "node_8": {"op": "linear", "input": ["node_1", "x", "node_7"]}, "h_new_0": {"op": "activation_sigm", "input": ["node_8"]}}, {"node_0": {"op": "linear", "input": ["h_prev_0", "x"]}, "h_new_0": {"op": "activation_leaky_relu", "input": ["node_0"]}}, {"node_0": {"op": "linear", "input": ["x", "h_prev_0"]}, "node_1": {"op": "activation_sigm", "input": ["node_0"]}, "node_2": {"op": "linear", "input": ["x", "node_1"]}, "node_3": {"op": "activation_tanh", "input": ["node_2"]}, "h_new_0": {"op": "linear", "input": ["node_3", "node_1"]}}, {"node_0": {"op": "linear", "input": ["h_prev_0", "x"]}, "node_1": {"op": "activation_leaky_relu", "input": ["node_0"]}, "node_2": {"op": 
"elementwise_sum", "input": ["node_0", "node_1"]}, "node_3": {"op": "linear", "input": ["node_1", "h_prev_0"]}, "node_4": {"op": "activation_sigm", "input": ["node_3"]}, "node_5": {"op": "linear", "input": ["node_4", "x", "node_1"]}, "node_7": {"op": "elementwise_sum", "input": ["node_5", "node_3"]}, "h_new_0": {"op": "elementwise_sum", "input": ["node_2", "node_7"]}}, {"node_0": {"op": "linear", "input": ["h_prev_0", "x"]}, "node_1": {"op": "activation_sigm", "input": ["node_0"]}, "node_3": {"op": "linear", "input": ["node_1", "h_prev_0"]}, "node_4": {"op": "activation_leaky_relu", "input": ["node_3"]}, "node_11": {"op": "blend", "input": ["h_prev_0", "node_1", "node_4"]}, "h_new_0": {"op": "elementwise_prod", "input": ["node_3", "node_11"]}}, {"node_0": {"op": "linear", "input": ["x", "h_prev_0"]}, "node_1": {"op": "activation_leaky_relu", "input": ["node_0"]}, "node_2": {"op": "linear", "input": ["x", "node_1", "h_prev_0"]}, "h_new_0": {"op": "activation_tanh", "input": ["node_2"]}}] -------------------------------------------------------------------------------- /embed_regularize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | 5 | def embedded_dropout(embed, words, dropout=0.1, scale=None): 6 | if dropout: 7 | mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout) 8 | masked_embed_weight = mask * embed.weight 9 | else: 10 | masked_embed_weight = embed.weight 11 | if scale: 12 | masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight 13 | 14 | padding_idx = embed.padding_idx 15 | if padding_idx is None: 16 | padding_idx = -1 17 | 18 | X = torch.nn.functional.embedding(words, masked_embed_weight, 19 | padding_idx, embed.max_norm, embed.norm_type, 20 | embed.scale_grad_by_freq, embed.sparse 21 | ) 22 | return X 23 | 24 | if __name__ == '__main__': 25 | V = 50 26 | h = 4 27 | bptt = 10 28 | batch_size = 2 29 | 30 | embed = torch.nn.Embedding(V, h) 31 | 32 | words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt)) 33 | words = torch.LongTensor(words) 34 | 35 | origX = embed(words) 36 | X = embedded_dropout(embed, words) 37 | 38 | print(origX) 39 | print(X) 40 | -------------------------------------------------------------------------------- /locked_dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | class LockedDropout(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def forward(self, x, dropout=0.5): 10 | if not self.training or not dropout: 11 | return x 12 | m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) 13 | mask = Variable(m, requires_grad=False) / (1 - dropout) 14 | mask = mask.expand_as(x) 15 | return mask * x 16 | -------------------------------------------------------------------------------- /main_one_model_train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | import torch.optim 4 | import torch.utils.data 5 | import torch.nn.functional as F 6 | from splitcross import SplitCrossEntropyLoss 7 | 8 | import numpy as np 9 | import networkx as nx 10 | import math 11 | import json 12 | import time 13 | 14 | import data 15 | import os 16 | from utils import batchify 17 | from argparse import Namespace 18 | from model import AWDRNNModel 19 | from 
train import train, evaluate 20 | import datetime 21 | 22 | import argparse 23 | 24 | parser = argparse.ArgumentParser(description='PyTorch Custom RNN Language Model') 25 | 26 | parser.add_argument('--dataset_path', type=str, default='data/ptb', 27 | help='location of the data corpus') 28 | parser.add_argument('--logs_path', type=str, default='tmp', 29 | help='path to logs folder') 30 | parser.add_argument('--recepies_list_path', type=str, default='data/recepies_example.json', 31 | help='list of models recepies') 32 | parser.add_argument('--recepie_id', type=int, required=True, 33 | help='id of a model recepie from the models list') 34 | parser.add_argument('--epochs', type=int, default=50, 35 | help='number of epochs to train') 36 | parser.add_argument('--emsize', type=int, default=400, 37 | help='emsize') 38 | parser.add_argument('--nhid', type=int, default=600, 39 | help='nhid') 40 | parser.add_argument('--nlayers', type=int, default=3, 41 | help='nlayers') 42 | parser.add_argument('--dropout', type=float, default=0.4, 43 | help='dropout') 44 | parser.add_argument('--dropouth', type=float, default=0.25, 45 | help='dropouth') 46 | parser.add_argument('--dropouti', type=float, default=0.4, 47 | help='dropouti') 48 | parser.add_argument('--dropoute', type=float, default=0.1, 49 | help='dropoute') 50 | parser.add_argument('--wdrop', type=float, default=0.5, 51 | help='wdrop') 52 | parser.add_argument('--experiment_id', type=int, 53 | help='some specific id of the experiment') 54 | 55 | if __name__ == '__main__': 56 | 57 | init_time = str(datetime.datetime.now()).replace(':', '-').split('.')[0].replace(' ', '_') 58 | 59 | main_args = parser.parse_args() 60 | 61 | if main_args.experiment_id is None: 62 | main_args.experiment_id = 999999999 - np.random.randint(100000) 63 | 64 | all_recepies = json.load(open(main_args.recepies_list_path, 'r')) 65 | 66 | args = Namespace(data=main_args.dataset_path, 67 | recepie_id=main_args.recepie_id, 68 | recepies_list_path=main_args.recepies_list_path, 69 | cuda=True, 70 | batch_size=20, 71 | model='CustomRNN', 72 | emsize=main_args.emsize, 73 | nhid=main_args.nhid, 74 | nlayers=main_args.nlayers, 75 | dropout=main_args.dropout, 76 | dropouth=main_args.dropouth, 77 | dropouti=main_args.dropouti, 78 | dropoute=main_args.dropoute, 79 | wdrop=main_args.wdrop, 80 | tied=True, 81 | bptt=70, 82 | lr=1e-3, 83 | wdecay=1.2e-6, 84 | epochs=main_args.epochs, 85 | alpha=2, 86 | beta=1, 87 | log_interval=200, 88 | clip=0.25, 89 | eval_batch_size = 50, 90 | recepie=json.dumps(all_recepies[main_args.recepie_id])) 91 | 92 | corpus = data.Corpus(args.data) 93 | cuda = 'cuda' 94 | 95 | train_data = batchify(corpus.train, args.batch_size, args, cuda) 96 | train_eval_data = batchify(corpus.train, args.eval_batch_size, args, cuda) 97 | val_data = batchify(corpus.valid, args.eval_batch_size, args, cuda) 98 | test_data = batchify(corpus.test, args.eval_batch_size, args, cuda) 99 | 100 | ntokens = len(corpus.dictionary) 101 | 102 | custom_model = AWDRNNModel(args.model, 103 | ntokens, 104 | args.emsize, 105 | args.nhid, 106 | args.nlayers, 107 | args.dropout, 108 | args.dropouth, 109 | args.dropouti, 110 | args.dropoute, 111 | args.wdrop, 112 | args.tied, 113 | args.recepie, 114 | verbose=False) 115 | 116 | 117 | log_stats = vars(args) 118 | log_stats['experiment_id'] = main_args.experiment_id 119 | log_stats['init_time'] = init_time 120 | log_stats['num_params'] = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] 121 | for x in custom_model.parameters() if 
x.size()) 122 | 123 | 124 | criterion = SplitCrossEntropyLoss(args.emsize, splits=[], verbose=False) 125 | 126 | if args.cuda: 127 | custom_model = custom_model.to(cuda) 128 | criterion = criterion.to(cuda) 129 | 130 | params = list(custom_model.parameters()) + list(criterion.parameters()) 131 | 132 | optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay) 133 | 134 | lr = args.lr 135 | train_losses = [] 136 | val_losses = [] 137 | test_losses = [] 138 | wall_times = [] 139 | 140 | # At any point you can hit Ctrl + C to break out of training early. 141 | status = 'OK' 142 | try: 143 | for epoch in range(1, args.epochs+1): 144 | epoch_start_time = time.time() 145 | train(custom_model, optimizer, params, criterion, train_data, args, epoch) 146 | epoch_end_time = time.time() 147 | train_loss = evaluate(custom_model, criterion, train_eval_data, args.eval_batch_size, args) 148 | val_loss = evaluate(custom_model, criterion, val_data, args.eval_batch_size, args) 149 | test_loss = evaluate(custom_model, criterion, test_data, args.eval_batch_size, args) 150 | print('-' * 89) 151 | print('| end of epoch {:3d} | time: {:5.2f}s |\n| train loss {:5.2f} | ' 152 | 'train ppl {:8.2f} | train bpw {:8.3f} |\n| valid loss {:5.2f} | ' 153 | 'valid ppl {:8.2f} | valid bpw {:8.3f} |\n| test loss {:5.2f} | ' 154 | 'test ppl {:8.2f} | test bpw {:8.3f} |'.format( 155 | epoch, (epoch_end_time - epoch_start_time), 156 | train_loss, math.exp(train_loss), train_loss / math.log(2), 157 | val_loss, math.exp(val_loss), val_loss / math.log(2), 158 | test_loss, math.exp(test_loss), test_loss / math.log(2))) 159 | print('-' * 89) 160 | 161 | wall_times.append(epoch_end_time - epoch_start_time) 162 | train_losses.append(train_loss) 163 | val_losses.append(val_loss) 164 | test_losses.append(test_loss) 165 | 166 | if np.isnan(np.array([train_loss, val_loss, test_loss])).any(): 167 | status = 'loss is nan!' 
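                    # If any split produces a NaN loss the run is aborted right here; the status
                    # string set above is written to the log JSON at the end of the script, and the
                    # analysis notebooks use it to separate healthy runs ('OK') from faulty ones.
                    # For reference, the losses printed above are mean cross-entropy in nats, so
                    # ppl = exp(loss) and bits-per-word = loss / ln(2); e.g. a loss of 4.6
                    # corresponds to a perplexity of roughly 99.5 and about 6.6 bpw.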
168 | break 169 | 170 | except KeyboardInterrupt: 171 | print('-' * 89) 172 | status = 'KeyboardInterrupt' 173 | print('Exiting from training early') 174 | except Exception as e: 175 | status = 'Exception: ' + str(e) 176 | print('Exception', e) 177 | 178 | log_stats['wall_times'] = wall_times 179 | log_stats['train_losses'] = train_losses 180 | log_stats['val_losses'] = val_losses 181 | log_stats['test_losses'] = test_losses 182 | log_stats['status'] = status 183 | 184 | json.dump(log_stats, open(os.path.join(main_args.logs_path, f'log_stats_model_{args.recepie_id}_{init_time}_{main_args.experiment_id}.json'), 'w')) 185 | torch.save(custom_model.state_dict(), os.path.join(main_args.logs_path, f'dump_weights_model_{args.recepie_id}_{init_time}_{main_args.experiment_id}.pt')) 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /make_arch_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "\n", 12 | "import json\n", 13 | "from tqdm.auto import tqdm\n", 14 | "from pathlib import Path\n", 15 | "import plotting\n", 16 | "import networkx as nx \n", 17 | "from joblib import Parallel, delayed\n", 18 | "import contextlib\n", 19 | "import random\n", 20 | "import numpy as np\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "from sklearn.manifold import TSNE\n", 23 | "from utils import make_graph\n", 24 | "import pandas as pd\n", 25 | "from copy import deepcopy\n", 26 | "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", 27 | "import hashlib" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution\n", 37 | "@contextlib.contextmanager\n", 38 | "def tqdm_joblib(tqdm_object):\n", 39 | " \"\"\"Context manager to patch joblib to report into tqdm progress bar given as argument\"\"\"\n", 40 | " class TqdmBatchCompletionCallback:\n", 41 | " def __init__(self, time, index, parallel):\n", 42 | " self.index = index\n", 43 | " self.parallel = parallel\n", 44 | "\n", 45 | " def __call__(self, index):\n", 46 | " tqdm_object.update()\n", 47 | " if self.parallel._original_iterator is not None:\n", 48 | " self.parallel.dispatch_next()\n", 49 | "\n", 50 | " old_batch_callback = joblib.parallel.BatchCompletionCallBack\n", 51 | " joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback\n", 52 | " try:\n", 53 | " yield tqdm_object\n", 54 | " finally:\n", 55 | " joblib.parallel.BatchCompletionCallBack = old_batch_callback\n", 56 | " tqdm_object.close() " 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "logs_path = Path('train_logs_single_run')\n", 66 | "logs_paths = list(logs_path.glob(\"*.json\"))\n", 67 | "\n", 68 | "logs_ok = []\n", 69 | "logs_not_ok = []\n", 70 | "for idx, log_path in tqdm(enumerate(logs_paths), total=len(logs_paths)):\n", 71 | " with open(log_path, \"r\") as f:\n", 72 | " log = json.load(f)\n", 73 | " recepie = json.loads(log['recepie'])\n", 74 | " log['recepie'] = recepie\n", 75 | " log['idx'] = idx\n", 76 | " \n", 77 | " if log['status'] == 'OK':\n", 78 | " logs_ok.append(log)\n", 
79 | " else:\n", 80 | " logs_not_ok.append(log)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "print(\"number ok: \", len(logs_ok))\n", 90 | "print(\"number not ok: \", len(logs_not_ok))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "len(logs_not_ok)/len(logs_ok)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def map_node(x):\n", 109 | " for v in ['node', 'h_prev', 'h_new']:\n", 110 | " if x.find(v) != -1:\n", 111 | " x = v\n", 112 | " if x not in ['x', 'node', 'h_prev', 'h_new']: # to make lstm and gru recepies standard\n", 113 | " x = 'node'\n", 114 | " return x" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def make_graph_2(recepie):\n", 124 | " \n", 125 | " G = nx.DiGraph()\n", 126 | " for n in recepie.keys():\n", 127 | " if n not in G.nodes():\n", 128 | " G.add_node(n)\n", 129 | " for k in recepie[n]['input']:\n", 130 | " if k not in G.nodes():\n", 131 | " G.add_node(k)\n", 132 | " G.add_edge(n, k, label=recepie[n]['op'])\n", 133 | " G.add_edge(k, n, label='rev_' + recepie[n]['op'])\n", 134 | " return G" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "def random_walk_features(G, steps=10):\n", 144 | " walk = []\n", 145 | " node = np.random.choice(G.nodes(), 1)[0]\n", 146 | " for _ in range(steps):\n", 147 | " k = np.random.choice(list(G.adj[node]), 1)[0]\n", 148 | " walk.extend([map_node(node), G.adj[node][k]['label']])\n", 149 | " node = k\n", 150 | " walk.append(map_node(node))\n", 151 | " return walk" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "def make_graph_sentences(G, sentences_num=20):\n", 161 | " sentences = []\n", 162 | " for _ in range(sentences_num):\n", 163 | " sentences.extend(random_walk_features(G) + ['.'])\n", 164 | " return sentences" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "def feature_extractor(log):\n", 174 | " name = f\"log_{log['idx']}\"\n", 175 | " recepie = log['recepie']\n", 176 | " G = make_graph_2(recepie)\n", 177 | " doc = TaggedDocument(words=make_graph_sentences(G), tags=[name])\n", 178 | " return doc" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "document_collections = Parallel(n_jobs=-2)(delayed(feature_extractor)(log) for log in tqdm(logs_not_ok+logs_ok))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# takes a while!!\n", 197 | "size = 10\n", 198 | "# size = 50\n", 199 | "doc2vec_model = Doc2Vec(document_collections, \n", 200 | " size=size, window=3, dm=1, min_count=0, workers=8, epochs=100, hs=1,\n", 201 | " dbow_words=0)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "all_words = set()\n", 211 | "for d in document_collections:\n", 
212 | " all_words |= set(d.words)\n", 213 | "len(all_words)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "not_dublicates_ok_logs = []\n", 223 | "not_dublicates_not_ok_logs = []\n", 224 | "not_dublicates_recepies = []\n", 225 | "for log in logs_not_ok+logs_ok:\n", 226 | " recepie = log['recepie']\n", 227 | " if recepie not in not_dublicates_recepies:\n", 228 | " not_dublicates_recepies.append(recepie)\n", 229 | " if log['status'] == 'OK':\n", 230 | " not_dublicates_ok_logs.append(log)\n", 231 | " else:\n", 232 | " not_dublicates_not_ok_logs.append(log)\n", 233 | "print(\"total: \", len(logs_not_ok+logs_ok))\n", 234 | "print(\"without dublicates: \", len(not_dublicates_ok_logs+not_dublicates_not_ok_logs))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "#dump vectors\n", 244 | "recepie_id_vectors = {log['recepie_id']:doc2vec_model.docvecs[f\"log_{log['idx']}\"]\n", 245 | " for log in not_dublicates_ok_logs+not_dublicates_not_ok_logs}\n" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "recepie_id_vectors_list = []\n", 255 | "for k in recepie_id_vectors:\n", 256 | " k_dict = {'recepie_id':k}\n", 257 | " for i in range(doc2vec_model.vector_size):\n", 258 | " k_dict[f'v{i:02d}'] = recepie_id_vectors[k][i]\n", 259 | " recepie_id_vectors_list.append(k_dict)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "df_recepie_vectors = pd.DataFrame(recepie_id_vectors_list)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "df_recepie_vectors.head()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "# if size == 50:\n", 294 | "# df_recepie_vectors.to_csv('data/doc2vec_features.csv', index=False)\n", 295 | "# elif size == 10:\n", 296 | "# df_recepie_vectors.to_csv('data/doc2vec_features_lowdim.csv', index=False)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "df_recepie_vectors = pd.read_csv('data/doc2vec_features.csv').set_index('recepie_id')" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "trainset_list = [\n", 329 | " (df_recepie_vectors.loc[log['recepie_id']], int(log['status'] == 'OK'))\n", 330 | " for log in not_dublicates_ok_logs+not_dublicates_not_ok_logs\n", 331 | "]\n", 332 | "\n", 333 | "random.shuffle(trainset_list)\n", 334 | "\n", 335 | "trainset_np, testset_np = np.array(trainset_list[:7000]), np.array(trainset_list[7000:])\n", 336 | "train_X, train_y = np.array(list(trainset_np[:, 
0])), np.array(list(trainset_np[:, 1]))\n", 337 | "test_X, test_y = np.array(list(testset_np[:, 0])), np.array(list(testset_np[:, 1]))\n", 338 | "\n", 339 | "print(\"Train:\", len(trainset_np))\n", 340 | "print(\"Test: \", len(testset_np))\n", 341 | "\n", 342 | "num_train_not_ok = len(train_y) - train_y.sum()\n", 343 | "print(\"\\nTrain OK: \", train_y.sum())\n", 344 | "print(\"Train not OK: \", num_train_not_ok)\n", 345 | "\n", 346 | "num_test_not_ok = len(test_y) - test_y.sum()\n", 347 | "print(\"\\nTest OK: \", test_y.sum())\n", 348 | "print(\"Test not OK: \", num_test_not_ok)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "tsne = TSNE(n_components=2, n_iter=300, verbose=True)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "E = tsne.fit_transform(train_X)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "plt.figure(figsize=(8, 8))\n", 376 | "plt.scatter(E[:, 0], E[:, 1], s=3, c=train_y)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "from xgboost import XGBClassifier\n", 386 | "from catboost import CatBoostClassifier\n", 387 | "from sklearn.metrics import f1_score, precision_recall_curve, average_precision_score, roc_curve, auc " 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "model = XGBClassifier(n_jobs=-1)\n", 397 | "model.fit(train_X, train_y)\n", 398 | "\n", 399 | "pred_y = model.predict_proba(test_X)\n", 400 | "precision_xgboost, recall_xgboost, _ = precision_recall_curve(test_y, pred_y[:, 1])\n", 401 | "fpr_xgboost, tpr_xgboost, _ = roc_curve(test_y, pred_y[:, 1])\n", 402 | "roc_auc_axboost = auc(fpr_xgboost, tpr_xgboost)\n", 403 | "\n", 404 | "f1_test_score = f1_score(test_y, np.argmax(pred_y, 1))\n", 405 | "ap_test_score = average_precision_score(test_y, pred_y[:, 1])\n", 406 | "print(\"XGBoost F1 score: \", f1_test_score)\n", 407 | "print(\"XGBoost Average Precision score: \", ap_test_score)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "from sklearn.linear_model import LogisticRegression\n", 417 | "\n", 418 | "clf = LogisticRegression().fit(train_X, train_y)\n", 419 | "pred_y = clf.predict_proba(test_X)\n", 420 | "precision_lr, recall_lr, _ = precision_recall_curve(test_y, pred_y[:,1])\n", 421 | "fpr_lr, tpr_lr, _ = roc_curve(test_y, pred_y[:, 1])\n", 422 | "roc_auc_lr = auc(fpr_lr, tpr_lr)\n", 423 | "\n", 424 | "f1_test_score = f1_score(test_y, np.argmax(pred_y, 1))\n", 425 | "ap_test_score = average_precision_score(test_y, pred_y[:, 1])\n", 426 | "print(\"Logistic Regression F1 score: \", f1_test_score)\n", 427 | "print(\"CatBoost Average Precision score: \", ap_test_score)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "#plt.title('Receiver Operating Characteristic')\n", 437 | "plt.figure(figsize=(6, 6))\n", 438 | "plt.plot(fpr_xgboost, tpr_xgboost, label='XGBoost AUC = %0.2f' % roc_auc_axboost)\n", 439 | "plt.plot(fpr_lr, tpr_lr, label='Logistic Regression AUC 
= %0.2f' % roc_auc_lr)\n", 440 | "plt.legend(loc = 'lower right')\n", 441 | "plt.plot([0, 1], [0, 1],'k--')\n", 442 | "plt.xlim([0, 1])\n", 443 | "plt.ylim([0, 1])\n", 444 | "plt.ylabel('True Positive Rate', fontsize=16)\n", 445 | "plt.xlabel('False Positive Rate', fontsize=16)\n", 446 | "\n", 447 | "plt.savefig('data/figures/prediction_faulty.png', dpi=300, bbox_inches='tight')" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "trainset_list = [\n", 457 | " (doc2vec_model.docvecs[f\"log_{log['idx']}\"], np.array(log['val_losses']).min())\n", 458 | " for log in not_dublicates_ok_logs\n", 459 | "]\n", 460 | "\n", 461 | "\n", 462 | "np.random.seed(0)\n", 463 | "random.shuffle(trainset_list)\n", 464 | "\n", 465 | "trainset_np, testset_np = np.array(trainset_list[:7000]), np.array(trainset_list[7000:])\n", 466 | "train_X, train_y = np.array(list(trainset_np[:, 0])), np.array(list(trainset_np[:, 1]))\n", 467 | "test_X, test_y = np.array(list(testset_np[:, 0])), np.array(list(testset_np[:, 1]))\n", 468 | "\n", 469 | "print(\"Train:\", len(trainset_np))\n", 470 | "print(\"Test: \", len(testset_np))\n", 471 | "\n", 472 | "num_train_not_ok = len(train_y) - train_y.sum()\n", 473 | "print(\"\\nTrain OK: \", train_y.sum())\n", 474 | "print(\"Train not OK: \", num_train_not_ok)\n", 475 | "\n", 476 | "num_test_not_ok = len(test_y) - test_y.sum()\n", 477 | "print(\"\\nTest OK: \", test_y.sum())\n", 478 | "print(\"Test not OK: \", num_test_not_ok)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "plt.figure(figsize=(8, 8))\n", 488 | "plt.scatter(E[:, 0], E[:, 1], s=3, c=train_y, cmap=plt.cm.plasma_r)\n", 489 | "plt.colorbar()\n", 490 | "plt.clim([4.5, 6])" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "train_y" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "(train_y > 6).mean()" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "plt.figure(figsize=(8, 8))\n", 518 | "plt.scatter(E[:, 0], E[:, 1], s=3, color='C0')\n", 519 | "sub_inds = np.where(train_y > 6)[0]\n", 520 | "plt.scatter(E[sub_inds, 0], E[sub_inds, 1], s=5, color='red')" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "model = XGBClassifier(n_jobs=-1)\n", 530 | "model.fit(train_X, train_y > 6)\n", 531 | "\n", 532 | "pred_y = model.predict_proba(test_X)\n", 533 | "precision_xgboost, recall_xgboost, _ = precision_recall_curve(test_y > 6, pred_y[:, 1])\n", 534 | "fpr_xgboost, tpr_xgboost, _ = roc_curve(test_y > 6, pred_y[:, 1])\n", 535 | "roc_auc_axboost = auc(fpr_xgboost, tpr_xgboost)\n", 536 | "\n", 537 | "f1_test_score = f1_score(test_y > 6, np.argmax(pred_y, 1))\n", 538 | "ap_test_score = average_precision_score(test_y > 6, pred_y[:, 1])\n", 539 | "print(\"XGBoost F1 score: \", f1_test_score)\n", 540 | "print(\"XGBoost Average Precision score: \", ap_test_score)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "#plt.title('Receiver Operating 
Characteristic')\n", 550 | "plt.figure(figsize=(6, 6))\n", 551 | "plt.plot(fpr_xgboost, tpr_xgboost, label='XGBoost AUC = %0.2f' % roc_auc_axboost)\n", 552 | "plt.legend(loc = 'lower right')\n", 553 | "plt.plot([0, 1], [0, 1],'k--')\n", 554 | "plt.xlim([0, 1])\n", 555 | "plt.ylim([0, 1])\n", 556 | "plt.ylabel('True Positive Rate', fontsize=16)\n", 557 | "plt.xlabel('False Positive Rate', fontsize=16)\n" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "from xgboost import XGBRegressor\n", 574 | "from sklearn.ensemble import BaggingRegressor\n", 575 | "from sklearn.metrics import r2_score\n", 576 | "\n", 577 | "regr = BaggingRegressor(XGBRegressor(n_estimators=100, max_depth=15), n_jobs=10, n_estimators=20, max_samples=0.5).fit(train_X, train_y)\n", 578 | "regr_6 = BaggingRegressor(XGBRegressor(n_estimators=100, max_depth=15), n_jobs=10, n_estimators=20, max_samples=0.5).fit(train_X[train_y < 6], train_y[train_y < 6])\n", 579 | "pred_y = regr.predict(test_X)\n", 580 | "pred_y_6 = regr_6.predict(test_X)\n" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "plt.figure(figsize=(6, 6))\n", 590 | "plt.scatter(test_y, pred_y, s=1)\n", 591 | "plt.xlabel('Testing log perplexity', fontsize=16)\n", 592 | "plt.ylabel('Predicted testing log perplexity', fontsize=16)\n", 593 | "plt.xlim([4.5, 7])\n", 594 | "plt.ylim([4.5, 7])\n", 595 | "plt.savefig('data/figures/prediction_loss.png', dpi=300, bbox_inches='tight')" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "r2_score(test_y, pred_y)" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "r2_score(test_y, pred_y_6)" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "r2_score(test_y[test_y < 6], pred_y[test_y < 6])" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "r2_score(test_y[test_y < 6], pred_y_6[test_y < 6])" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [] 661 | } 662 | ], 663 | "metadata": { 664 | "kernelspec": { 665 | "display_name": "Python 3", 666 | "language": "python", 667 | "name": "python3" 668 | }, 669 | "language_info": { 670 | "codemirror_mode": { 671 | "name": "ipython", 672 | "version": 3 673 | }, 674 | "file_extension": ".py", 675 | "mimetype": "text/x-python", 676 | "name": "python", 677 | "nbconvert_exporter": "python", 678 | "pygments_lexer": "ipython3", 679 | "version": "3.6.10" 680 | } 681 | }, 682 | 
"nbformat": 4, 683 | "nbformat_minor": 2 684 | } 685 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | 4 | from embed_regularize import embedded_dropout 5 | from locked_dropout import LockedDropout 6 | from weight_drop import WeightDrop, ParameterListWeightDrop 7 | 8 | from custom_rnn import CustomRNN 9 | 10 | import json 11 | import numpy as np 12 | 13 | class AWDRNNModel(torch.nn.Module): 14 | """Container module with an encoder, a recurrent module, and a decoder.""" 15 | 16 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, 17 | dropout=0.5, dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0, tie_weights=False, 18 | recepie=None, verbose=True): 19 | super(AWDRNNModel, self).__init__() 20 | self.lockdrop = LockedDropout() 21 | self.idrop = torch.nn.Dropout(dropouti) 22 | self.hdrop = torch.nn.Dropout(dropouth) 23 | self.drop = torch.nn.Dropout(dropout) 24 | self.encoder = torch.nn.Embedding(ntoken, ninp) 25 | self.wdrop = wdrop 26 | self.verbose = verbose 27 | 28 | if recepie is not None: 29 | recepie = json.loads(recepie) 30 | 31 | self.rnns = [] 32 | for i in range(nlayers): 33 | input_size = ninp if i == 0 else nhid 34 | hidden_size = nhid if i != nlayers - 1 else (ninp if tie_weights else nhid) 35 | if rnn_type == 'LSTM': 36 | self.rnns.append(torch.nn.LSTM(input_size, hidden_size)) 37 | elif rnn_type == 'CustomRNN': 38 | self.rnns.append(CustomRNN(input_size, hidden_size, recepie)) 39 | 40 | if wdrop: 41 | if rnn_type == 'LSTM': 42 | self.rnns = [WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop) for rnn in self.rnns] 43 | elif rnn_type == 'CustomRNN': 44 | wd_rnns = [] 45 | for rnn in self.rnns: 46 | multilinear_components = [] 47 | for k, v in rnn.cell.components.items(): 48 | if rnn.cell.recepie[k]['op'] == 'linear': 49 | for i in np.where(np.array(rnn.cell.recepie[k]['input']) != 'x')[0]: 50 | multilinear_components.append(f'cell.components.{k}.weights.{i}') 51 | wd_rnns.append(ParameterListWeightDrop(rnn, multilinear_components, dropout=wdrop)) 52 | self.rnns = wd_rnns 53 | 54 | if self.verbose: 55 | print(self.rnns) 56 | self.rnns = torch.nn.ModuleList(self.rnns) 57 | self.decoder = torch.nn.Linear(nhid, ntoken) 58 | 59 | if tie_weights: 60 | self.decoder.weight = self.encoder.weight 61 | 62 | self.init_weights() 63 | 64 | self.rnn_type = rnn_type 65 | self.ninp = ninp 66 | self.nhid = nhid 67 | self.nlayers = nlayers 68 | self.dropout = dropout 69 | self.dropouti = dropouti 70 | self.dropouth = dropouth 71 | self.dropoute = dropoute 72 | self.tie_weights = tie_weights 73 | self.recepie = recepie 74 | 75 | def reset(self): 76 | pass 77 | 78 | def init_weights(self): 79 | initrange = 0.1 80 | self.encoder.weight.data.uniform_(-initrange, initrange) 81 | self.decoder.bias.data.fill_(0) 82 | self.decoder.weight.data.uniform_(-initrange, initrange) 83 | 84 | def forward(self, input, hidden, return_h=False): 85 | emb = embedded_dropout(self.encoder, input, dropout=self.dropoute if self.training else 0) 86 | #emb = self.idrop(emb) 87 | 88 | emb = self.lockdrop(emb, self.dropouti) 89 | 90 | raw_output = emb 91 | new_hidden = [] 92 | raw_outputs = [] 93 | outputs = [] 94 | for i, rnn in enumerate(self.rnns): 95 | raw_output, new_h = rnn(raw_output, hidden[i]) 96 | new_hidden.append(new_h) 97 | raw_outputs.append(raw_output) 98 | if i != self.nlayers - 1: 99 | #self.hdrop(raw_output) add??? 
100 | raw_output = self.lockdrop(raw_output, self.dropouth) 101 | outputs.append(raw_output) 102 | hidden = new_hidden 103 | 104 | output = self.lockdrop(raw_output, self.dropout) 105 | outputs.append(output) 106 | result = output.view(output.size(0)*output.size(1), output.size(2)) 107 | if return_h: 108 | return result, hidden, raw_outputs, outputs 109 | return result, hidden 110 | 111 | def init_hidden(self, bsz): 112 | weight = next(self.parameters()).data 113 | hidden = [] 114 | for i in range(self.nlayers): 115 | if self.rnn_type == 'LSTM': 116 | hidden_tuple_size = 2 117 | elif self.rnn_type == 'CustomRNN': 118 | if self.wdrop: 119 | # wrapped with ParameterListWeightDrop 120 | hidden_tuple_size = self.rnns[0].module.cell.hidden_tuple_size 121 | else: 122 | hidden_tuple_size = self.rnns[0].cell.hidden_tuple_size 123 | hidden_size = self.nhid if i != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid) 124 | hidden.append(tuple([weight.new(1, bsz, hidden_size).zero_() for _ in range(hidden_tuple_size)])) 125 | 126 | return hidden -------------------------------------------------------------------------------- /models_weights/dump_weights_model_2226_2020-04-18_07-35-19_999938929.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/models_weights/dump_weights_model_2226_2020-04-18_07-35-19_999938929.pt -------------------------------------------------------------------------------- /multilinear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | import torch.nn.functional as F 4 | 5 | import math 6 | 7 | class MultiLinear(torch.nn.Module): 8 | 9 | def __init__(self, input_sizes, output_size): 10 | super(MultiLinear, self).__init__() 11 | self.input_sizes = input_sizes 12 | self.output_size = output_size 13 | 14 | weights = [] 15 | for input_size in input_sizes: 16 | weights.append(torch.nn.Parameter(torch.Tensor(output_size, input_size))) 17 | self.weights = torch.nn.ParameterList(weights) 18 | 19 | self.bias = torch.nn.Parameter(torch.Tensor(output_size)) 20 | 21 | self.reset_parameters() 22 | 23 | def reset_parameters(self): 24 | for i in range(len(self.weights)): 25 | torch.nn.init.kaiming_uniform_(self.weights[i], a=math.sqrt(5)) 26 | 27 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weights[0]) 28 | bound = 1 / math.sqrt(fan_in) 29 | torch.nn.init.uniform_(self.bias, -bound, bound) 30 | 31 | def forward(self, *inputs): 32 | result = F.linear(inputs[0], self.weights[0], self.bias) 33 | for i in range(1, len(self.weights)): 34 | result = result + F.linear(inputs[i], self.weights[i]) 35 | return result 36 | 37 | def extra_repr(self): 38 | return 'input_sizes={}, output_size={}'.format( 39 | self.input_sizes, self.output_size 40 | ) 41 | -------------------------------------------------------------------------------- /nas_environment.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | 5 | 6 | class Environment: 7 | ''' 8 | Simulates NAS environment. Architecutres can be trained for a specified amount of epochs. 9 | Tarining results are cached, that is, training the same model for larer epochs 10 | will be timed as a continuation from the model's checkpoint. 
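    A minimal usage sketch (the directory name and epoch count below are illustrative;
    the directory is expected to contain unpacked *.json training logs):

        env = Environment('train_logs_single_run')
        env.reset()
        arch = env.get_precomputed_recepies()[0]
        env.simulated_train(arch, max_epoch=10)
        stats = env.get_model_stats(arch, 10)   # train/val/test loss, wall_time, cur_epoch, status
        spent = env.get_total_time()            # total simulated wall-clock time so far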
11 | ''' 12 | def __init__(self, logs_dir): 13 | self._logs = [] 14 | self._arch_to_id = {} 15 | 16 | arch_id = 0 17 | for i, filename in enumerate(os.listdir(logs_dir)): 18 | if filename.endswith('.json'): 19 | log_path = os.path.join(logs_dir, filename) 20 | x = json.load(open(log_path, 'r')) 21 | self._logs.append(x) 22 | assert x['recepie'] not in self._arch_to_id 23 | self._arch_to_id[x['recepie']] = arch_id 24 | arch_id += 1 25 | 26 | self._training_states = {} 27 | 28 | def get_total_time(self): 29 | return sum([x['wall_time'] for x in self._training_states.values()]) 30 | 31 | 32 | def get_best_possible_test_loss(self): 33 | min_loss = np.inf 34 | for log in self._logs: 35 | if len(log['test_losses']) > 0: 36 | cur_loss = np.nanmin(log['test_losses']) 37 | if cur_loss < min_loss: 38 | min_loss = cur_loss 39 | return min_loss 40 | 41 | def get_test_loss_of_the_best_validated_architecture(self): 42 | return self._logs[self.best_arch_id]['test_losses'][self.best_arch_epoch] 43 | 44 | def get_precomputed_recepies(self): 45 | return [json.loads(x['recepie']) for x in self._logs] 46 | 47 | def get_recepie_ids(self): 48 | return [x['recepie_id'] for x in self._logs] 49 | 50 | def reset(self): 51 | self.best_arch_id = -1 52 | self.best_arch_epoch = -1 53 | self._training_states = {} 54 | 55 | def _make_state_dict(self, arch_id, epoch): 56 | state_dict = {f'{phase}_loss':self._logs[arch_id][f'{phase}_losses'][epoch] if epoch >= 0 else np.nan 57 | for phase in ['train', 'val', 'test']} 58 | state_dict['wall_time'] = np.sum(self._logs[arch_id]['wall_times'][:epoch]) 59 | state_dict['cur_epoch'] = epoch 60 | state_dict['status'] = 'OK' if epoch < len(self._logs[arch_id]['train_losses']) - 1 else self._logs[arch_id]['status'] 61 | return state_dict 62 | 63 | def simulated_train(self, arch, max_epoch): 64 | arch_id = self._arch_to_id[json.dumps(arch)] 65 | if (arch_id not in self._training_states) or (max_epoch > self._training_states[arch_id]['cur_epoch']): 66 | max_epoch = min([max_epoch, len(self._logs[arch_id]['train_losses']) - 1]) 67 | self._training_states[arch_id] = self._make_state_dict(arch_id, max_epoch) 68 | 69 | # update best result 70 | val_losses = self._logs[arch_id]['val_losses'][:self._training_states[arch_id]['cur_epoch'] + 1] 71 | if np.sum(~np.isnan(val_losses)) > 0: 72 | cur_best_epoch = np.nanargmin(val_losses) 73 | if (self.best_arch_id == -1) or\ 74 | (self._logs[self.best_arch_id]['val_losses'][self.best_arch_epoch] > val_losses[cur_best_epoch]): 75 | self.best_arch_id = arch_id 76 | self.best_arch_epoch = cur_best_epoch 77 | 78 | def get_model_status(self, arch): 79 | arch_id = self._arch_to_id[json.dumps(arch)] 80 | return self._training_states[arch_id]['status'] 81 | 82 | def get_model_stats(self, arch, epoch): 83 | arch_id = self._arch_to_id[json.dumps(arch)] 84 | if self._training_states[arch_id]['cur_epoch'] < epoch: 85 | raise Exception('Required epoch exceeds current training epochs.') 86 | 87 | return self._make_state_dict(arch_id, epoch) -------------------------------------------------------------------------------- /plotting.py: -------------------------------------------------------------------------------- 1 | import pygraphviz as pgv 2 | from IPython.display import Image 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def plot_recepie(recepie, dpi=100): 7 | graph = pgv.AGraph(directed=True, strict=True, 8 | fontname='Helvetica', arrowtype='open') 9 | 10 | node_color = {'x':'forestgreen', 11 | 'h_prev_0':'orange', 12 | 'h_new_0':'orange', 13 | 
'h_prev_1':'cyan', 14 | 'h_new_1':'cyan', 15 | 'h_prev_2':'purple', 16 | 'h_new_2':'purple'} 17 | 18 | blend_i_to_color = {1:'blue3', 19 | 2:'brown3'} 20 | 21 | nodes_dict = {} 22 | for k in recepie.keys(): 23 | if k not in nodes_dict: 24 | graph.add_node(len(nodes_dict), label=recepie[k]['op'] + ':\n' + k, 25 | fillcolor=node_color.get(k, 'white'), style='filled') 26 | nodes_dict[k] = len(nodes_dict) 27 | for k in recepie.keys(): 28 | for i, n in enumerate(recepie[k]['input']): 29 | if n not in nodes_dict: 30 | graph.add_node(len(nodes_dict), label=n, 31 | fillcolor=node_color.get(n, 'white'), style='filled') 32 | nodes_dict[n] = len(nodes_dict) 33 | #print(nodes_dict[k], nodes_dict[n]) 34 | if recepie[k]['op'] != 'blend': 35 | graph.add_edge(nodes_dict[n], nodes_dict[k]) 36 | else: 37 | if i == 0: 38 | graph.add_edge(nodes_dict[n], nodes_dict[k], style='dashed') 39 | else: 40 | graph.add_edge(nodes_dict[n], nodes_dict[k], color=blend_i_to_color[i]) 41 | 42 | return Image(graph.draw(format='png', prog='dot', args=f'-Gdpi={dpi} -Nfontsize=8')) -------------------------------------------------------------------------------- /reproduce_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torch.nn\n", 11 | "import torch.optim\n", 12 | "import torch.utils.data\n", 13 | "import torch.nn.functional as F\n", 14 | "from splitcross import SplitCrossEntropyLoss\n", 15 | "\n", 16 | "import numpy as np\n", 17 | "import networkx as nx\n", 18 | "import math\n", 19 | "import json\n", 20 | "import time\n", 21 | "\n", 22 | "import data\n", 23 | "import os\n", 24 | "from utils import batchify\n", 25 | "from argparse import Namespace\n", 26 | "from model import AWDRNNModel\n", 27 | "from train import train, evaluate\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "suffix = '2226_2020-04-18_07-35-19_999938929'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "log = json.load(open('train_logs_multi_runs/log_stats_model_100' + suffix + '.json', 'r'))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "args = Namespace(**log)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "corpus = data.Corpus(args.data)\n", 65 | "cuda = 'cuda'\n", 66 | "\n", 67 | "train_data = batchify(corpus.train, args.batch_size, args, cuda)\n", 68 | "train_eval_data = batchify(corpus.train, args.eval_batch_size, args, cuda)\n", 69 | "val_data = batchify(corpus.valid, args.eval_batch_size, args, cuda)\n", 70 | "test_data = batchify(corpus.test, args.eval_batch_size, args, cuda)\n", 71 | "\n", 72 | "ntokens = len(corpus.dictionary)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "custom_model = AWDRNNModel(args.model, \n", 82 | " ntokens, \n", 83 | " args.emsize, \n", 84 | " args.nhid, \n", 85 | " args.nlayers, \n", 86 | " args.dropout, \n", 87 | " args.dropouth, \n", 88 | " args.dropouti, \n", 89 | " args.dropoute, \n", 90 | " args.wdrop, \n", 91 | " 
args.tied,\n", 92 | " args.recepie,\n", 93 | " verbose=False)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "custom_model.load_state_dict(torch.load('models_weights/dump_weights_model_' + suffix + '.pt'))" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "custom_model.to(cuda);" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "criterion = SplitCrossEntropyLoss(args.emsize, splits=[], verbose=False)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "train_loss = evaluate(custom_model, criterion, train_eval_data, args.eval_batch_size, args)\n", 130 | "val_loss = evaluate(custom_model, criterion, val_data, args.eval_batch_size, args)\n", 131 | "test_loss = evaluate(custom_model, criterion, test_data, args.eval_batch_size, args)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "print('-' * 89)\n", 141 | "print('train loss {:5.4f} | '\n", 142 | " 'train ppl {:8.2f} | train bpw {:8.3f} |\\n| valid loss {:5.4f} | '\n", 143 | " 'valid ppl {:8.2f} | valid bpw {:8.3f} |\\n| test loss {:5.4f} | '\n", 144 | " 'test ppl {:8.2f} | test bpw {:8.3f} |'.format(\n", 145 | " train_loss, math.exp(train_loss), train_loss / math.log(2),\n", 146 | " val_loss, math.exp(val_loss), val_loss / math.log(2),\n", 147 | " test_loss, math.exp(test_loss), test_loss / math.log(2)))\n", 148 | "print('-' * 89)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "print('logged train loss', log['train_losses'][-1])\n", 158 | "print('logged valid loss', log['val_losses'][-1])\n", 159 | "print('logged test loss', log['test_losses'][-1])" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.6.10" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.8.1 2 | alignment==1.0.10 3 | asn1crypto==0.24.0 4 | astor==0.8.0 5 | atomicwrites==1.1.5 6 | attrs==18.1.0 7 | backcall==0.1.0 8 | bleach==2.1.3 9 | blis==0.2.4 10 | bokeh==0.12.15 11 | boto==2.48.0 12 | boto3==1.5.32 13 | botocore==1.8.46 14 | boxsdk==1.5.5 15 | bpemb==0.3.0 16 | bz2file==0.98 17 | catboost==0.8.1.1 18 | cchardet==2.1.4 19 | certifi==2020.4.5.1 20 | cffi==1.11.5 21 | chainer==6.0.0 22 | chardet==3.0.4 23 | click==6.7 24 | climin==0.1a1 25 | cloudpickle==0.5.3 26 | colorama==0.4.1 27 | colorlog==4.0.2 28 | colormap==1.0.2 29 | 
conda==4.8.3 30 | conda-package-handling==1.6.0 31 | config-pkg==1.1.4 32 | ConfigSpace==0.4.13 33 | cplxmodule==0.8.0 34 | cryptography==2.6.1 35 | cssselect==1.0.3 36 | cupy==7.0.0a1 37 | cupy-cuda90==7.0.0a1 38 | cycler==0.10.0 39 | cymem==2.0.2 40 | Cython==0.29.13 41 | cytoolz==0.9.0.1 42 | dask==0.15.2 43 | decorator==4.4.0 44 | Deprecated==1.2.5 45 | dill==0.2.9 46 | distributed==1.18.3 47 | dlispy==0.0.2 48 | docutils==0.14 49 | -e git+https://github.com/fmsnew/DPDall.git@92be3038315d7c8470ec1375da24fb478cfc22cf#egg=dpd_project 50 | easydev==0.9.37 51 | editdistance==0.4 52 | eli5==0.8.1 53 | email-reply-parser==0.5.9 54 | emcee==3.0.2 55 | en-core-web-sm==2.0.0 56 | entrypoints==0.2.3 57 | et-xmlfile==1.0.1 58 | fastdtw==0.3.2 59 | fastprogress==0.1.20 60 | fastrlock==0.4 61 | fasttext==0.9.1 62 | filelock==3.0.12 63 | flair==0.4.2 64 | Flask==1.0.3 65 | ftfy==5.6 66 | future==0.16.0 67 | gast==0.3.2 68 | gdown==3.8.3 69 | gensim==3.4.0 70 | Glances==3.1.0 71 | GPy==1.9.6 72 | graphviz==0.10.1 73 | grpcio==1.24.1 74 | h5py==2.7.1 75 | HeapDict==1.0.0 76 | HetMOGP==0.1 77 | html2text==2018.1.9 78 | html5lib==1.0.1 79 | hyperopt==0.1.2 80 | idna==2.8 81 | ijson==2.3 82 | imageio==2.5.0 83 | implicit==0.3.6 84 | importlib-metadata==0.18 85 | inspyred==1.0.1 86 | interruptingcow==0.8 87 | ipaddress==1.0.22 88 | ipdb==0.12 89 | ipykernel==4.8.2 90 | ipython==6.3.1 91 | ipython-genutils==0.2.0 92 | ipywidgets==7.2.1 93 | itsdangerous==1.1.0 94 | jdcal==1.4 95 | jedi==0.12.0 96 | jellyfish==0.7.2 97 | Jinja2==2.10 98 | jmespath==0.9.3 99 | joblib==0.12.5 100 | jsonschema==3.0.1 101 | jupyter==1.0.0 102 | jupyter-client==5.2.3 103 | jupyter-console==5.2.0 104 | jupyter-core==4.4.0 105 | jupyterthemes==0.20.0 106 | Keras-Applications==1.0.8 107 | Keras-Preprocessing==1.1.0 108 | kiwisolver==1.0.1 109 | lasio==0.21 110 | lazy-import==0.2.2 111 | lesscpy==0.13.0 112 | lightgbm==2.1.0 113 | line-profiler==2.1.2 114 | lockfile==0.12.2 115 | luigi==2.7.8 116 | lxml==4.3.3 117 | Mako==1.0.7 118 | Markdown==3.1.1 119 | MarkupSafe==1.0 120 | matplotlib==3.0.3 121 | mistune==0.8.3 122 | mkl-fft==1.0.10 123 | mkl-random==1.0.2 124 | mkl-service==2.0.2 125 | mock==3.0.5 126 | more-itertools==4.3.0 127 | mpld3==0.3 128 | msgpack==0.5.6 129 | msgpack-numpy==0.4.3.2 130 | msgpack-python==0.5.4 131 | murmurhash==1.0.2 132 | nasbench==1.0 133 | nasbench-encoder==0.0.1 134 | nasbench-pytorch==0.0.0 135 | nbconvert==5.3.1 136 | nbformat==4.4.0 137 | networkx==2.4rc1.dev20190610154137 138 | nltk==3.4.1 139 | notebook==5.7.2 140 | numpy==1.17.3 141 | olefile==0.45.1 142 | openmdao==2.8.0 143 | openpyxl==2.5.4 144 | ordereddict==1.1 145 | packaging==17.1 146 | pandas==0.24.2 147 | pandocfilters==1.4.2 148 | paramz==0.9.4 149 | parso==0.2.0 150 | patsy==0.5.0 151 | pexpect==4.5.0 152 | pickleshare==0.7.4 153 | Pillow==7.1.2 154 | pke==1.8 155 | plac==0.9.6 156 | plotly==2.5.1 157 | pluggy==0.7.1 158 | ply==3.11 159 | pmlb==0.3 160 | preshed==2.0.1 161 | progressbar==2.5 162 | prometheus-client==0.5.0 163 | prompt-toolkit==1.0.15 164 | protobuf==3.7.1 165 | psutil==5.4.6 166 | psycopg2==2.8.4 167 | ptyprocess==0.5.2 168 | py==1.5.4 169 | pyaml==20.4.0 170 | pybind11==2.4.3 171 | pycosat==0.6.3 172 | pycparser==2.18 173 | pyDOE==0.3.8 174 | pyDOE2==1.2.1 175 | Pygments==2.2.0 176 | pygpu==0.7.5 177 | pygraphviz==1.5 178 | PyJWT==1.6.4 179 | pyKriging==0.2.0 180 | pymc3==3.3 181 | pymongo==3.8.0 182 | pymystem3==0.1.9 183 | pynisher==0.5.0 184 | pyOpenSSL==17.5.0 185 | pyparsing==2.2.0 186 | pyrfr==0.8.0 187 | 
pyrsistent==0.15.2 188 | PySocks==1.6.8 189 | pytest==3.7.1 190 | python-daemon==2.2.0 191 | python-dateutil==2.8.0 192 | python-dotenv==0.9.1 193 | pytorch-pretrained-bert==0.6.2 194 | pytz==2019.1 195 | PyWavelets==1.1.1 196 | pywt==1.0.6 197 | PyYAML==3.12 198 | pyzmq==17.0.0 199 | qtconsole==4.3.1 200 | quotequail==0.2.3 201 | rake-nltk==1.0.4 202 | regex==2018.1.10 203 | requests==2.22.0 204 | requests-toolbelt==0.8.0 205 | ruamel-yaml==0.15.35 206 | s3transfer==0.1.13 207 | sacremoses==0.0.35 208 | schedule==0.5.0 209 | scikit-image==0.17.2 210 | scikit-learn==0.23.1 211 | scikit-optimize==0.7.4 212 | scipy==1.2.1 213 | seaborn==0.8.1 214 | seafileapi==0.1.2 215 | segtok==1.5.7 216 | Send2Trash==1.5.0 217 | sentencepiece==0.1.82 218 | simplegeneric==0.8.1 219 | singledispatch==3.4.0.3 220 | six==1.11.0 221 | sklearn==0.0 222 | smac==0.12.2 223 | smart-open==1.5.7 224 | sobol-seq==0.2.0 225 | sortedcontainers==2.0.4 226 | spacy==2.0.18 227 | SQLAlchemy==1.3.16 228 | sqlitedict==1.6.0 229 | srsly==0.0.5 230 | statsmodels==0.8.0 231 | tabulate==0.8.3 232 | talon==1.4.4 233 | tblib==1.3.2 234 | tensorboard==1.13.1 235 | tensorboardX==1.6 236 | tensorflow==1.13.1 237 | tensorflow-estimator==1.13.0 238 | termcolor==1.1.0 239 | terminado==0.8.1 240 | testpath==0.3.1 241 | Theano==1.0.1+2.gcd195ed28 242 | thinc==6.12.1 243 | thinc-gpu-ops==0.0.4 244 | threadpoolctl==2.1.0 245 | tifffile==2020.5.25 246 | toolz==0.9.0 247 | torch==1.1.0 248 | torchgan==0.0.4 249 | torchvision==0.3.0 250 | tornado==4.5.3 251 | tqdm==4.32.1 252 | traitlets==4.3.2 253 | transformers==2.2.0 254 | transliterate==1.10.2 255 | tsfresh==0.11.0 256 | typing==3.6.4 257 | typing-extensions==3.7.2 258 | ujson==1.35 259 | Unidecode==1.1.1 260 | urllib3==1.24.3 261 | virtualenv==16.7.9 262 | wasabi==0.2.2 263 | wcwidth==0.1.7 264 | webencodings==0.5.1 265 | Werkzeug==0.15.4 266 | widgetsnbextension==3.2.1 267 | wrapt==1.10.11 268 | xgboost==0.71 269 | xlrd==1.1.0 270 | xmltodict==0.12.0 271 | yake==0.4.1 272 | zict==0.1.3 273 | zipp==0.5.1 274 | -------------------------------------------------------------------------------- /search_space.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RecepieGenerator: 5 | 6 | def __init__( 7 | self, 8 | hidden_tuple_size=2, 9 | intermediate_vertices=7, 10 | main_operations = ['linear', 'blend', 'elementwise_prod', 'elementwise_sum'], 11 | main_weights = [3., 1., 1., 1.], 12 | activations = ['activation_tanh', 'activation_sigm', 'activation_leaky_relu'], 13 | activation_weights = [1., 1., 1.], 14 | linear_connections = [2, 3], 15 | linear_connections_weights = [4, 1] 16 | ): 17 | self.hidden_tuple_size = hidden_tuple_size 18 | self.intermediate_vertices = intermediate_vertices 19 | self.main_operations = main_operations 20 | self.main_probabilities = np.array(main_weights)/np.sum(main_weights) 21 | self.activations = activations 22 | self.activation_probabilities = np.array(activation_weights)/np.sum(activation_weights) 23 | self.linear_connections = linear_connections 24 | self.linear_connections_probabilities = np.array(linear_connections_weights)/np.sum(linear_connections_weights) 25 | 26 | def _generate_redundant_graph(self, recepie, base_nodes): 27 | i = 0 28 | activation_nodes = [] 29 | while i < self.hidden_tuple_size + self.intermediate_vertices: 30 | op = np.random.choice(self.main_operations, 1, p=self.main_probabilities)[0] 31 | if op == 'linear': 32 | num_connections = 
np.random.choice(self.linear_connections, 1, 33 | p=self.linear_connections_probabilities)[0] 34 | connection_candidates = base_nodes + activation_nodes 35 | if num_connections > len(connection_candidates): 36 | num_connections = len(connection_candidates) 37 | 38 | connections = np.random.choice(connection_candidates, num_connections, replace=False) 39 | recepie[f'node_{i}'] = {'op':op, 'input':connections} 40 | i += 1 41 | 42 | # after linear force add activation node tied to the new node, if possible (nodes budget) 43 | op = np.random.choice(self.activations, 1, p=self.activation_probabilities)[0] 44 | recepie[f'node_{i}'] = {'op':op, 'input':[f'node_{i - 1}']} 45 | activation_nodes.append(f'node_{i}') 46 | i += 1 47 | 48 | elif op in ['blend', 'elementwise_prod', 'elementwise_sum']: 49 | # inputs must exclude x 50 | if op == 'blend': 51 | num_connections = 3 52 | else: 53 | num_connections = 2 54 | connection_candidates = list(set(base_nodes) - set('x')) + list(recepie.keys()) 55 | if num_connections <= len(connection_candidates): 56 | connections = np.random.choice(connection_candidates, num_connections, replace=False) 57 | recepie[f'node_{i}'] = {'op':op, 'input':connections} 58 | i += 1 59 | 60 | def _create_hidden_nodes(self, recepie): 61 | new_hiddens_map = {} 62 | for k in np.random.choice(list(recepie.keys()), self.hidden_tuple_size, replace=False): 63 | new_hiddens_map[k] = f'h_new_{len(new_hiddens_map)}' 64 | 65 | for k in new_hiddens_map: 66 | recepie[new_hiddens_map[k]] = recepie[k] 67 | del recepie[k] 68 | 69 | for k in recepie: 70 | recepie[k]['input'] = [new_hiddens_map.get(x, x) for x in recepie[k]['input']] 71 | 72 | def _remove_redundant_nodes(self, recepie): 73 | q = [f'h_new_{i}' for i in range(self.hidden_tuple_size)] 74 | visited = set(q) 75 | while len(q) > 0: 76 | if q[0] in recepie: 77 | for node in recepie[q[0]]['input']: 78 | if node not in visited: 79 | q.append(node) 80 | visited.add(node) 81 | q = q[1:] 82 | 83 | for k in list(recepie.keys()): 84 | if k not in visited: 85 | del recepie[k] 86 | 87 | return visited 88 | 89 | def generate_random_recepie(self, seed=None): 90 | if seed is not None: 91 | np.random.seed(seed) 92 | prev_hidden_nodes = [f'h_prev_{i}' for i in range(self.hidden_tuple_size)] 93 | base_nodes = ['x'] + prev_hidden_nodes 94 | 95 | recepie = {} 96 | self._generate_redundant_graph(recepie, base_nodes) 97 | self._create_hidden_nodes(recepie) 98 | visited = self._remove_redundant_nodes(recepie) 99 | 100 | is_sanity_check_ok = True 101 | 102 | # check that all input nodes are in the graph 103 | for node in base_nodes: 104 | if node not in visited: 105 | is_sanity_check_ok = False 106 | break 107 | 108 | # constraint: prev hidden nodes are not connected directly to new hidden nodes 109 | for i in range(self.hidden_tuple_size): 110 | if len(set(recepie[f'h_new_{i}']['input']) & set(prev_hidden_nodes)) > 0: 111 | is_sanity_check_ok = False 112 | break 113 | 114 | return recepie, is_sanity_check_ok 115 | 116 | def get_example_recepie(self, name): 117 | if name == 'rnn': 118 | recepie = { 119 | 'f':{'op':'linear', 'input':['x', 'h_prev_0']}, 120 | 'h_new_0':{'op':'activation_tanh', 'input':['f']} 121 | } 122 | elif name == 'lstm': 123 | recepie = { 124 | 'i':{'op':'linear', 'input':['x', 'h_prev_0']}, 125 | 'i_act':{'op':'activation_tanh', 'input':['i']}, 126 | 127 | 'j':{'op':'linear', 'input':['x', 'h_prev_0']}, 128 | 'j_act':{'op':'activation_sigm', 'input':['j']}, 129 | 130 | 'f':{'op':'linear', 'input':['x', 'h_prev_0']}, 131 | 
'f_act':{'op':'activation_sigm', 'input':['f']}, 132 | 133 | 'o':{'op':'linear', 'input':['x', 'h_prev_0']}, 134 | 'o_act':{'op':'activation_tanh', 'input':['o']}, 135 | 136 | 'h_new_1_part1':{'op':'elementwise_prod', 'input':['f_act', 'h_prev_1']}, 137 | 'h_new_1_part2':{'op':'elementwise_prod', 'input':['i_act', 'j_act']}, 138 | 139 | 'h_new_1':{'op':'elementwise_sum', 'input':['h_new_1_part1', 'h_new_1_part2']}, 140 | 141 | 'h_new_1_act':{'op':'activation_tanh', 'input':['h_new_1']}, 142 | 'h_new_0':{'op':'elementwise_prod', 'input':['h_new_1_act', 'o_act']} 143 | } 144 | elif name == 'gru': 145 | recepie = { 146 | 'r':{'op':'linear', 'input':['x', 'h_prev_0']}, 147 | 'r_act':{'op':'activation_sigm', 'input':['r']}, 148 | 149 | 'z':{'op':'linear', 'input':['x', 'h_prev_0']}, 150 | 'z_act':{'op':'activation_sigm', 'input':['z']}, 151 | 152 | 'rh':{'op':'elementwise_prod', 'input':['r_act', 'h_prev_0']}, 153 | 'h_tilde':{'op':'linear', 'input':['x', 'rh']}, 154 | 'h_tilde_act':{'op':'activation_tanh', 'input':['h_tilde']}, 155 | 156 | 'h_new_0':{'op':'blend', 'input':['z_act', 'h_prev_0', 'h_tilde_act']} 157 | } 158 | else: 159 | raise Exception(f'Unknown recepie name: {name}') 160 | return recepie 161 | 162 | -------------------------------------------------------------------------------- /search_space_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import json" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import matplotlib.pyplot as plt\n", 22 | "%matplotlib inline" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "all_stats = []\n", 32 | "for fn in os.listdir('train_logs_single_run'):\n", 33 | " if fn.endswith('.json'):\n", 34 | " all_stats.append(json.load(open(os.path.join('train_logs_single_run', fn), 'r')))" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "len(all_stats)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "def get_nodes_cnt(x):\n", 53 | " all_nodes = set(x.keys())\n", 54 | " for k in x.keys():\n", 55 | " all_nodes |= set(x[k]['input'])\n", 56 | " return len(all_nodes)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "plt.hist([get_nodes_cnt(json.loads(x['recepie'])) for x in all_stats], bins=16, range=(4, 20))\n", 66 | "plt.xlabel('Number of nodes', fontsize=16)\n", 67 | "plt.ylabel('Number of architectures', fontsize=16)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "ok_stats = [x for x in all_stats if x['status'] == 'OK']" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "recepie_ids = [x['recepie_id'] for x in ok_stats]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 
| "plt.hist([np.exp(np.min(x['test_losses'])) for x in all_stats if x['status'] == 'OK'], \n", 95 | " bins=50, range=(65, 250));\n", 96 | "plt.ylabel('Num. architectures', fontsize=14)\n", 97 | "plt.xlabel('Perplexity', fontsize=14)\n", 98 | "labels = ['RNN', 'LSTM', 'GRU']\n", 99 | "\n", 100 | "for i in range(3):\n", 101 | " seek_id = recepie_ids.index(1000000 + i)\n", 102 | " x = ok_stats[seek_id]\n", 103 | " plt.vlines(np.exp(np.min(x['test_losses'])), 0, 1000, color=f'C{i+1}', label=labels[i], linestyle='--')\n", 104 | "plt.legend(fontsize=14)\n", 105 | "plt.yscale('log')\n", 106 | "plt.savefig('data/figures/ppl_distrib.png', dpi=300, bbox_inches='tight')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "plt.figure(figsize=(9, 8))\n", 116 | "plt.scatter([np.sum(x['wall_times']) for x in ok_stats], \n", 117 | " [x['num_params'] for x in ok_stats], s=15,\n", 118 | " c=[(np.min(np.exp(x['test_losses']))) for x in ok_stats],\n", 119 | " cmap=plt.cm.viridis_r, alpha=0.99)\n", 120 | "cbar = plt.colorbar()\n", 121 | "cbar.set_label('Test perplexity', fontsize=16)\n", 122 | "plt.clim([65, 400])\n", 123 | "\n", 124 | "labels = ['RNN', 'LSTM', 'GRU']\n", 125 | "markers = ['X', '^', 'o']\n", 126 | "for i in range(3):\n", 127 | " seek_id = recepie_ids.index(1000000 + i)\n", 128 | " x = ok_stats[seek_id]\n", 129 | " plt.scatter([np.sum(x['wall_times'])],\n", 130 | " [x['num_params']],\n", 131 | " c='r', marker=markers[i], zorder=10, edgecolor='k', lw=0.5,\n", 132 | " s=200, label=labels[i])\n", 133 | "\n", 134 | "plt.legend(fontsize=14)\n", 135 | "plt.xlabel('Wall time [s]', fontsize=16)\n", 136 | "plt.ylabel('Num params', fontsize=16)\n", 137 | "plt.savefig('data/figures/main_metrics.png', dpi=300, bbox_inches='tight')" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "def get_rank(x):\n", 147 | " r = np.zeros_like(x)\n", 148 | " r[np.argsort(x)] = np.arange(len(x))\n", 149 | " return r" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "Y = np.array([x['test_losses'] for x in all_stats if x['status'] == 'OK'])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "X = np.array([x['val_losses'] for x in all_stats if x['status'] == 'OK'])" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "plt.figure(figsize=(21, 5))\n", 177 | "for i, e in enumerate([5, 10, 25, 50]):\n", 178 | " plt.subplot(1, 4, i + 1)\n", 179 | " plt.scatter(get_rank(X[:, e - 1]), get_rank(Y[:, -1]), s=1)\n", 180 | " plt.xlabel(f'Validation rank {e} epoch', fontsize=14)\n", 181 | " plt.ylabel('Test rank 50 epoch', fontsize=14)\n", 182 | "plt.tight_layout()\n", 183 | "plt.savefig('data/figures/dynamic_ranking.png', dpi=300, bbox_inches='tight')" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Correlation with performance on wikitext" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "for fn in os.listdir('train_logs_multi_runs'):\n", 200 | " if fn.endswith('.json'):\n", 201 | " 
all_stats.append(json.load(open(os.path.join('train_logs_multi_runs', fn), 'r')))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "all_stats_wiki = []\n", 211 | "for fn in os.listdir('train_logs_wikitext-2'):\n", 212 | " if fn.endswith('.json'):\n", 213 | " all_stats_wiki.append(json.load(open(os.path.join('train_logs_wikitext-2', fn), 'r')))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "len(all_stats_wiki)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "ok_stats_wiki = [x for x in all_stats_wiki if x['status'] == 'OK']" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "id_to_ppl = {x['recepie_id']:np.exp(np.min(x['test_losses'])) for x in all_stats if x['status'] == 'OK'}\n", 241 | "id_to_ppl_wiki = {x['recepie_id']:np.exp(np.min(x['test_losses'])) for x in all_stats_wiki if x['status'] == 'OK'}" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "ppl = []\n", 251 | "ppl_wiki = []\n", 252 | "for k in id_to_ppl_wiki:\n", 253 | " if k in id_to_ppl:\n", 254 | " ppl.append(id_to_ppl[k])\n", 255 | " ppl_wiki.append(id_to_ppl_wiki[k])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "plt.figure(figsize=(7, 7))\n", 265 | "plt.scatter(np.log(ppl), np.log(ppl_wiki))\n", 266 | "plt.xlabel('PTB testing log perplexity', fontsize=16)\n", 267 | "plt.ylabel('WikiText-2 testing log perplexity', fontsize=16)\n", 268 | "plt.savefig('data/figures/transfer_corr.png', dpi=300, bbox_inches='tight')" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 3", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.6.10" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 2 321 | } 322 | -------------------------------------------------------------------------------- /search_space_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 
null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import search_space\n", 20 | "import plotting\n", 21 | "import json\n", 22 | "\n", 23 | "%matplotlib inline\n", 24 | "import matplotlib.pyplot as plt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "recepie_generator = search_space.RecepieGenerator()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "recepie = recepie_generator.get_example_recepie('rnn')\n", 43 | "print(recepie)\n", 44 | "fig = plotting.plot_recepie(recepie, dpi=100)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "with open(\"data/figures/rnn_cell.png\", \"wb\") as png:\n", 54 | " png.write(fig.data)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "fig" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "recepie = recepie_generator.get_example_recepie('lstm')\n", 73 | "print(recepie)\n", 74 | "fig = plotting.plot_recepie(recepie, dpi=100)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "with open(\"data/figures/lstm_cell.png\", \"wb\") as png:\n", 84 | " png.write(fig.data)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "fig" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "recepie = recepie_generator.get_example_recepie('gru')\n", 103 | "print(recepie)\n", 104 | "fig = plotting.plot_recepie(recepie, dpi=100)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "with open(\"data/figures/gru_cell.png\", \"wb\") as png:\n", 114 | " png.write(fig.data)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "fig" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "recepie, sanity_check = recepie_generator.generate_random_recepie(4)\n", 133 | "print('valid recepie: ', sanity_check)\n", 134 | "plotting.plot_recepie(recepie)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "recepie, sanity_check = recepie_generator.generate_random_recepie(10)\n", 144 | "print('valid recepie: ', sanity_check)\n", 145 | "plotting.plot_recepie(recepie)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "# Make search space elements example" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "from tqdm import tqdm_notebook" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "max_valid_confs = 100\n", 171 | 
"all_recepies = []\n", 172 | "rnd_offset = 0\n", 173 | "for hidden_tuple_size in [1, 2, 3]:\n", 174 | " for intermediate_elements in [7, 14, 21]:\n", 175 | " recepie_generator = search_space.RecepieGenerator(hidden_tuple_size, intermediate_elements)\n", 176 | " N = 200\n", 177 | " valid_seeds = []\n", 178 | " for i in tqdm_notebook(range(N)):\n", 179 | " recepie, sanity_check = recepie_generator.generate_random_recepie(i + rnd_offset)\n", 180 | " if sanity_check:\n", 181 | " valid_seeds.append(i)\n", 182 | " for i in valid_seeds[:max_valid_confs]:\n", 183 | " recepie, sanity_check = recepie_generator.generate_random_recepie(i + rnd_offset)\n", 184 | " all_recepies.append(recepie)\n", 185 | " rnd_offset += N" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "json_recepies = [json.dumps(x) for x in all_recepies]" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "mind about duplicates that can appear during generation" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "len(json_recepies)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "len(set(json_recepies))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [] 235 | } 236 | ], 237 | "metadata": { 238 | "kernelspec": { 239 | "display_name": "Python 3", 240 | "language": "python", 241 | "name": "python3" 242 | }, 243 | "language_info": { 244 | "codemirror_mode": { 245 | "name": "ipython", 246 | "version": 3 247 | }, 248 | "file_extension": ".py", 249 | "mimetype": "text/x-python", 250 | "name": "python", 251 | "nbconvert_exporter": "python", 252 | "pygments_lexer": "ipython3", 253 | "version": "3.6.10" 254 | } 255 | }, 256 | "nbformat": 4, 257 | "nbformat_minor": 2 258 | } 259 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | setup(name='CustomAwdRnn', 5 | version='0.0.0', 6 | description='Custom awd rnn', 7 | author='Nikita Klyuchnikov', 8 | author_email='nikita.klyuchnikov@skolkovotech.ru', 9 | packages=['CustomAwdRnn', ], 10 | zip_safe=False) -------------------------------------------------------------------------------- /splitcross.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | import numpy as np 7 | 8 | 9 | class SplitCrossEntropyLoss(nn.Module): 10 | r'''SplitCrossEntropyLoss calculates an approximate softmax''' 11 | def __init__(self, hidden_size, splits, verbose=False): 12 | # We assume splits is [0, split1, split2, N] where N >= |V| 13 | # For example, a vocab of 1000 words may have splits [0] + [100, 500] + [inf] 14 | super(SplitCrossEntropyLoss, self).__init__() 15 | self.hidden_size = hidden_size 16 | self.splits = [0] + splits + [100 * 1000000] 17 | self.nsplits = len(self.splits) - 1 18 | self.stats = defaultdict(list) 19 | self.verbose = verbose 20 | # 
Each of the splits that aren't in the head require a pretend token, we'll call them tombstones 21 | # The probability given to this tombstone is the probability of selecting an item from the represented split 22 | if self.nsplits > 1: 23 | self.tail_vectors = nn.Parameter(torch.zeros(self.nsplits - 1, hidden_size)) 24 | self.tail_bias = nn.Parameter(torch.zeros(self.nsplits - 1)) 25 | 26 | def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, verbose=False): 27 | # First we perform the first softmax on the head vocabulary and the tombstones 28 | if softmaxed_head_res is None: 29 | start, end = self.splits[0], self.splits[1] 30 | head_weight = None if end - start == 0 else weight[start:end] 31 | head_bias = None if end - start == 0 else bias[start:end] 32 | # We only add the tombstones if we have more than one split 33 | if self.nsplits > 1: 34 | head_weight = self.tail_vectors if head_weight is None else torch.cat([head_weight, self.tail_vectors]) 35 | head_bias = self.tail_bias if head_bias is None else torch.cat([head_bias, self.tail_bias]) 36 | 37 | # Perform the softmax calculation for the word vectors in the head for all splits 38 | # We need to guard against empty splits as torch.cat does not like random lists 39 | head_res = torch.nn.functional.linear(hiddens, head_weight, bias=head_bias) 40 | softmaxed_head_res = torch.nn.functional.log_softmax(head_res, dim=-1) 41 | 42 | if splits is None: 43 | splits = list(range(self.nsplits)) 44 | 45 | results = [] 46 | running_offset = 0 47 | for idx in splits: 48 | 49 | # For those targets in the head (idx == 0) we only need to return their loss 50 | if idx == 0: 51 | results.append(softmaxed_head_res[:, :-(self.nsplits - 1)]) 52 | 53 | # If the target is in one of the splits, the probability is the p(tombstone) * p(word within tombstone) 54 | else: 55 | start, end = self.splits[idx], self.splits[idx + 1] 56 | tail_weight = weight[start:end] 57 | tail_bias = bias[start:end] 58 | 59 | # Calculate the softmax for the words in the tombstone 60 | tail_res = torch.nn.functional.linear(hiddens, tail_weight, bias=tail_bias) 61 | 62 | # Then we calculate p(tombstone) * p(word in tombstone) 63 | # Adding is equivalent to multiplication in log space 64 | head_entropy = (softmaxed_head_res[:, -idx]).contiguous() 65 | tail_entropy = torch.nn.functional.log_softmax(tail_res, dim=-1) 66 | results.append(head_entropy.view(-1, 1) + tail_entropy) 67 | 68 | if len(results) > 1: 69 | return torch.cat(results, dim=1) 70 | return results[0] 71 | 72 | def split_on_targets(self, hiddens, targets): 73 | # Split the targets into those in the head and in the tail 74 | split_targets = [] 75 | split_hiddens = [] 76 | 77 | # Determine to which split each element belongs (for each start split value, add 1 if equal or greater) 78 | # This method appears slower at least for WT-103 values for approx softmax 79 | #masks = [(targets >= self.splits[idx]).view(1, -1) for idx in range(1, self.nsplits)] 80 | #mask = torch.sum(torch.cat(masks, dim=0), dim=0) 81 | ### 82 | # This is equally fast for smaller splits as method below but scales linearly 83 | mask = None 84 | for idx in range(1, self.nsplits): 85 | partial_mask = targets >= self.splits[idx] 86 | mask = mask + partial_mask if mask is not None else partial_mask 87 | ### 88 | #masks = torch.stack([targets] * (self.nsplits - 1)) 89 | #mask = torch.sum(masks >= self.split_starts, dim=0) 90 | for idx in range(self.nsplits): 91 | # If there are no splits, avoid costly masked select 92 | if self.nsplits 
== 1: 93 | split_targets, split_hiddens = [targets], [hiddens] 94 | continue 95 | # If all the words are covered by earlier targets, we have empties so later stages don't freak out 96 | if sum(len(t) for t in split_targets) == len(targets): 97 | split_targets.append([]) 98 | split_hiddens.append([]) 99 | continue 100 | # Are you in our split? 101 | tmp_mask = mask == idx 102 | split_targets.append(torch.masked_select(targets, tmp_mask)) 103 | split_hiddens.append(hiddens.masked_select(tmp_mask.unsqueeze(1).expand_as(hiddens)).view(-1, hiddens.size(1))) 104 | return split_targets, split_hiddens 105 | 106 | def forward(self, weight, bias, hiddens, targets, verbose=False): 107 | if self.verbose or verbose: 108 | for idx in sorted(self.stats): 109 | print('{}: {}'.format(idx, int(np.mean(self.stats[idx]))), end=', ') 110 | print() 111 | 112 | total_loss = None 113 | if len(hiddens.size()) > 2: hiddens = hiddens.view(-1, hiddens.size(2)) 114 | 115 | split_targets, split_hiddens = self.split_on_targets(hiddens, targets) 116 | 117 | # First we perform the first softmax on the head vocabulary and the tombstones 118 | start, end = self.splits[0], self.splits[1] 119 | head_weight = None if end - start == 0 else weight[start:end] 120 | head_bias = None if end - start == 0 else bias[start:end] 121 | 122 | # We only add the tombstones if we have more than one split 123 | if self.nsplits > 1: 124 | head_weight = self.tail_vectors if head_weight is None else torch.cat([head_weight, self.tail_vectors]) 125 | head_bias = self.tail_bias if head_bias is None else torch.cat([head_bias, self.tail_bias]) 126 | 127 | # Perform the softmax calculation for the word vectors in the head for all splits 128 | # We need to guard against empty splits as torch.cat does not like random lists 129 | combo = torch.cat([split_hiddens[i] for i in range(self.nsplits) if len(split_hiddens[i])]) 130 | ### 131 | all_head_res = torch.nn.functional.linear(combo, head_weight, bias=head_bias) 132 | softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res, dim=-1) 133 | if self.verbose or verbose: 134 | self.stats[0].append(combo.size()[0] * head_weight.size()[0]) 135 | 136 | running_offset = 0 137 | for idx in range(self.nsplits): 138 | # If there are no targets for this split, continue 139 | if len(split_targets[idx]) == 0: continue 140 | 141 | # For those targets in the head (idx == 0) we only need to return their loss 142 | if idx == 0: 143 | softmaxed_head_res = softmaxed_all_head_res[running_offset:running_offset + len(split_hiddens[idx])] 144 | entropy = -torch.gather(softmaxed_head_res, dim=1, index=split_targets[idx].view(-1, 1)) 145 | # If the target is in one of the splits, the probability is the p(tombstone) * p(word within tombstone) 146 | else: 147 | softmaxed_head_res = softmaxed_all_head_res[running_offset:running_offset + len(split_hiddens[idx])] 148 | 149 | if self.verbose or verbose: 150 | start, end = self.splits[idx], self.splits[idx + 1] 151 | tail_weight = weight[start:end] 152 | self.stats[idx].append(split_hiddens[idx].size()[0] * tail_weight.size()[0]) 153 | 154 | # Calculate the softmax for the words in the tombstone 155 | tail_res = self.logprob(weight, bias, split_hiddens[idx], splits=[idx], softmaxed_head_res=softmaxed_head_res) 156 | 157 | # Then we calculate p(tombstone) * p(word in tombstone) 158 | # Adding is equivalent to multiplication in log space 159 | head_entropy = softmaxed_head_res[:, -idx] 160 | # All indices are shifted - if the first split handles [0,...,499] then the 500th in 
the second split will be 0 indexed 161 | indices = (split_targets[idx] - self.splits[idx]).view(-1, 1) 162 | # Warning: if you don't squeeze, you get an N x 1 return, which acts oddly with broadcasting 163 | tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res, dim=-1), dim=1, index=indices).squeeze() 164 | entropy = -(head_entropy + tail_entropy) 165 | ### 166 | running_offset += len(split_hiddens[idx]) 167 | total_loss = entropy.float().sum() if total_loss is None else total_loss + entropy.float().sum() 168 | 169 | return (total_loss / len(targets)).type_as(weight) 170 | 171 | 172 | if __name__ == '__main__': 173 | np.random.seed(42) 174 | torch.manual_seed(42) 175 | if torch.cuda.is_available(): 176 | torch.cuda.manual_seed(42) 177 | 178 | V = 8 179 | H = 10 180 | N = 100 181 | E = 10 182 | 183 | embed = torch.nn.Embedding(V, H) 184 | crit = SplitCrossEntropyLoss(hidden_size=H, splits=[V // 2]) 185 | bias = torch.nn.Parameter(torch.ones(V)) 186 | optimizer = torch.optim.SGD(list(embed.parameters()) + list(crit.parameters()), lr=1) 187 | 188 | for _ in range(E): 189 | prev = torch.autograd.Variable((torch.rand(N, 1) * 0.999 * V).int().long()) 190 | x = torch.autograd.Variable((torch.rand(N, 1) * 0.999 * V).int().long()) 191 | y = embed(prev).squeeze() 192 | c = crit(embed.weight, bias, y, x.view(N)) 193 | print('Crit', c.exp().data[0]) 194 | 195 | logprobs = crit.logprob(embed.weight, bias, y[:2]).exp() 196 | print(logprobs) 197 | print(logprobs.sum(dim=1)) 198 | 199 | optimizer.zero_grad() 200 | c.backward() 201 | optimizer.step() 202 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | import time 4 | import numpy as np 5 | import math 6 | 7 | from utils import get_batch, repackage_hidden 8 | 9 | 10 | def evaluate(model, criterion, data_source, batch_size, args): 11 | # Turn on evaluation mode which disables dropout. 12 | model.eval() 13 | total_loss = 0 14 | hidden = model.init_hidden(batch_size) 15 | for i in range(0, data_source.size(0) - 1, args.bptt): 16 | data, targets = get_batch(data_source, i, args, evaluation=True) 17 | output, hidden = model(data, hidden) 18 | total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data 19 | hidden = repackage_hidden(hidden) 20 | return total_loss.item() / len(data_source) 21 | 22 | 23 | def train(model, optimizer, params, criterion, train_data, args, epoch): 24 | # Turn on training mode which enables dropout. 25 | total_loss = 0 26 | start_time = time.time() 27 | hidden = model.init_hidden(args.batch_size) 28 | batch, i = 0, 0 29 | while i < train_data.size(0) - 1 - 1: 30 | bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2. 31 | # Prevent excessively small or negative sequence lengths 32 | seq_len = max(5, int(np.random.normal(bptt, 5))) 33 | # There's a very small chance that it could select a very long sequence length resulting in OOM 34 | # seq_len = min(seq_len, args.bptt + 10) 35 | 36 | lr2 = optimizer.param_groups[0]['lr'] 37 | optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt 38 | model.train() 39 | data, targets = get_batch(train_data, i, args, seq_len=seq_len) 40 | 41 | # Starting each batch, we detach the hidden state from how it was previously produced. 42 | # If we didn't, the model would try backpropagating all the way to start of the dataset. 
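# repackage_hidden (defined in utils.py) detaches recursively, so it handles nested
# tuples of hidden-state tensors as well as a single tensor.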
43 | hidden = repackage_hidden(hidden) 44 | optimizer.zero_grad() 45 | 46 | output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True) 47 | raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets) 48 | 49 | loss = raw_loss 50 | # Activation Regularization 51 | if args.alpha: loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:]) 52 | # Temporal Activation Regularization (slowness) 53 | if args.beta: loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:]) 54 | loss.backward() 55 | 56 | # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs. 57 | if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip) 58 | optimizer.step() 59 | 60 | total_loss += raw_loss.data 61 | optimizer.param_groups[0]['lr'] = lr2 62 | if batch % args.log_interval == 0 and batch > 0: 63 | cur_loss = total_loss.item() / args.log_interval 64 | elapsed = time.time() - start_time 65 | print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | ' 66 | 'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format( 67 | epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'], 68 | elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2))) 69 | total_loss = 0 70 | start_time = time.time() 71 | ### 72 | batch += 1 73 | i += seq_len -------------------------------------------------------------------------------- /train_logs_multi_runs/logs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/train_logs_multi_runs/logs.zip -------------------------------------------------------------------------------- /train_logs_single_run/logs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/train_logs_single_run/logs.zip -------------------------------------------------------------------------------- /train_logs_wikitext-2/logs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/train_logs_wikitext-2/logs.zip -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import networkx as nx 4 | from itertools import permutations 5 | 6 | 7 | def repackage_hidden(h): 8 | """Wraps hidden states in new Tensors, 9 | to detach them from their history.""" 10 | if isinstance(h, torch.Tensor): 11 | return h.detach() 12 | else: 13 | return tuple(repackage_hidden(v) for v in h) 14 | 15 | 16 | def batchify(data, bsz, args, cuda='cuda'): 17 | # Work out how cleanly we can divide the dataset into bsz parts. 18 | nbatch = data.size(0) // bsz 19 | # Trim off any extra elements that wouldn't cleanly fit (remainders). 20 | data = data.narrow(0, 0, nbatch * bsz) 21 | # Evenly divide the data across the bsz batches.
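# For example, with bsz = 4 a stream of 1000 tokens becomes a 250 x 4 matrix: each
# column holds one contiguous chunk of the corpus, and get_batch below slices
# bptt-sized windows of rows out of it.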
22 | data = data.view(bsz, -1).t().contiguous() 23 | if args.cuda: 24 | data = data.to(cuda) 25 | return data 26 | 27 | 28 | def get_batch(source, i, args, seq_len=None, evaluation=False): 29 | seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i) 30 | data = source[i:i+seq_len] 31 | target = source[i+1:i+1+seq_len].view(-1) 32 | return data, target 33 | 34 | 35 | def make_graph(recepie): 36 | G = nx.DiGraph() 37 | 38 | for key in recepie.keys(): 39 | op = recepie[key]['op'] 40 | if key.startswith("h_new_"): 41 | op = key+":"+op 42 | G.add_node(key, name=key, op=op) 43 | for inp in recepie[key]['input']: 44 | if "h_prev" in inp or inp == "x": 45 | G.add_node(inp, name=inp, op=inp) 46 | else: 47 | G.add_node(inp, name=inp) 48 | G.add_edge(inp, key) 49 | return G 50 | 51 | 52 | def recepie2matrixops(recepie): 53 | G = make_graph(recepie) 54 | labels = nx.get_node_attributes(G, "op") 55 | nodelist_with_ops = np.array(list(labels.items())) 56 | 57 | matrix = nx.to_numpy_array(G, nodelist=nodelist_with_ops[:, 0]) 58 | ops = nodelist_with_ops[:, 1] 59 | 60 | return matrix, ops 61 | 62 | 63 | 64 | def graph_edit_distance(matrixops1, matrixops2): 65 | m1, l1 = matrixops1 66 | m2, l2 = matrixops2 67 | 68 | # Pad 69 | n1, n2 = m1.shape[0], m2.shape[0] 70 | max_n = max(n1, n2) 71 | m1 = np.pad(m1, ((0, max_n - m1.shape[0]), (0, max_n - m1.shape[0]))) 72 | m2 = np.pad(m2, ((0, max_n - m2.shape[0]), (0, max_n - m2.shape[0]))) 73 | l1 = np.pad(l1, (0, max_n - l1.shape[0]), constant_values=None) 74 | l2 = np.pad(l2, (0, max_n - l2.shape[0]), constant_values=None) 75 | 76 | 77 | d = 100000000 78 | for p in permutations(range(len(m1))): 79 | p = list(p) 80 | d_p = (m1 != m2[p][:, p]).sum() + (l1 != l2[p]).sum() 81 | d = min(d, d_p) 82 | return d 83 | -------------------------------------------------------------------------------- /weight_drop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Parameter 3 | from functools import wraps 4 | import functools 5 | 6 | class WeightDrop(torch.nn.Module): 7 | def __init__(self, module, weights, dropout=0, variational=False): 8 | super(WeightDrop, self).__init__() 9 | self.module = module 10 | self.weights = weights 11 | self.dropout = dropout 12 | self.variational = variational 13 | self._setup() 14 | 15 | def widget_demagnetizer_y2k_edition(*args, **kwargs): 16 | # We need to replace flatten_parameters with a nothing function 17 | # It must be a function rather than a lambda as otherwise pickling explodes 18 | # We can't write boring code though, so ... WIDGET DEMAGNETIZER Y2K EDITION! 
19 | # (╯°□°)╯︵ ┻━┻ 20 | return 21 | 22 | def _setup(self): 23 | # Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN 24 | if issubclass(type(self.module), torch.nn.RNNBase): 25 | self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition 26 | 27 | for name_w in self.weights: 28 | #print('Applying weight drop of {} to {}'.format(self.dropout, name_w)) 29 | w = getattr(self.module, name_w) 30 | del self.module._parameters[name_w] 31 | self.module.register_parameter(name_w + '_raw', Parameter(w.data)) 32 | 33 | def _setweights(self): 34 | for name_w in self.weights: 35 | raw_w = getattr(self.module, name_w + '_raw') 36 | w = None 37 | if self.variational: 38 | mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1)) 39 | if raw_w.is_cuda: mask = mask.cuda() 40 | mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True) 41 | w = torch.nn.Parameter(mask.expand_as(raw_w) * raw_w) 42 | else: 43 | w = torch.nn.Parameter(torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)) 44 | setattr(self.module, name_w, w) 45 | 46 | def forward(self, *args): 47 | self._setweights() 48 | return self.module.forward(*args) 49 | 50 | def rsetattr(obj, attr, val): 51 | pre, _, post = attr.rpartition('.') 52 | return setattr(rgetattr(obj, pre) if pre else obj, post, val) 53 | 54 | def rgetattr(obj, attr, *args): 55 | def _getattr(obj, attr): 56 | return getattr(obj, attr, *args) 57 | return functools.reduce(_getattr, [obj] + attr.split('.')) 58 | 59 | class ParameterListWeightDrop(torch.nn.Module): 60 | def __init__(self, module, weights, dropout=0, variational=False): 61 | super(ParameterListWeightDrop, self).__init__() 62 | self.module = module 63 | self.weights = weights 64 | self.parents = {} 65 | for w in self.weights: 66 | p = '.'.join(w.split('.')[:-1]) 67 | i = int(w.split('.')[-1]) 68 | if p not in self.parents: 69 | self.parents[p] = [] 70 | self.parents[p].append(i) 71 | self.dropout = dropout 72 | self.variational = variational 73 | self._setup() 74 | 75 | 76 | def _setup(self): 77 | for name_w in self.parents: 78 | #print('Applying weight drop of {} to {}'.format(self.dropout, name_w)) 79 | ws = rgetattr(self.module, name_w) 80 | rsetattr(self.module, name_w, None) 81 | rsetattr(self.module, name_w + '_raw', torch.nn.ParameterList(ws)) 82 | 83 | def _setweights(self): 84 | for name_w in self.parents: 85 | raw_ws = rgetattr(self.module, name_w + '_raw') 86 | ws = [] 87 | for i, raw_w in enumerate(raw_ws): 88 | if i in self.parents[name_w]: 89 | if self.variational: 90 | mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1)) 91 | if raw_w.is_cuda: mask = mask.cuda() 92 | mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True) 93 | w = torch.nn.Parameter(mask.expand_as(raw_w) * raw_w) 94 | else: 95 | w = torch.nn.Parameter(torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)) 96 | else: 97 | w = raw_w 98 | ws.append(w) 99 | rsetattr(self.module, name_w, torch.nn.ParameterList(ws)) 100 | 101 | def forward(self, *args): 102 | self._setweights() 103 | return self.module.forward(*args) --------------------------------------------------------------------------------
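A minimal usage sketch for WeightDrop (not part of the original sources; the LSTM sizes and the 'weight_hh_l0' weight name below are illustrative assumptions, and exact behaviour depends on the PyTorch version pinned in requirements.txt). It wraps a recurrent module and re-applies DropConnect to the named weight matrices on every forward call:

import torch
from weight_drop import WeightDrop

# Hypothetical example: wrap a single-layer LSTM and drop its hidden-to-hidden weights.
lstm = torch.nn.LSTM(input_size=10, hidden_size=20)
wd_lstm = WeightDrop(lstm, ['weight_hh_l0'], dropout=0.5)

x = torch.randn(7, 3, 10)                      # (seq_len, batch, input_size)
h0, c0 = torch.zeros(1, 3, 20), torch.zeros(1, 3, 20)
out, (hn, cn) = wd_lstm(x, (h0, c0))           # the raw weights are re-dropped on this call
print(out.shape)                               # torch.Size([7, 3, 20])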