├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── benchmarking_examples.ipynb ├── calculate_ged.py ├── custom_rnn.py ├── data.py ├── data ├── datasets.zip ├── doc2vec_features.csv ├── doc2vec_features_lowdim.csv └── recepies_example.json ├── embed_regularize.py ├── locked_dropout.py ├── main_one_model_train.py ├── make_arch_embeddings.ipynb ├── model.py ├── models_weights └── dump_weights_model_2226_2020-04-18_07-35-19_999938929.pt ├── multilinear.py ├── nas_environment.py ├── plotting.py ├── reproduce_model.ipynb ├── requirements.txt ├── search_space.py ├── search_space_analysis.ipynb ├── search_space_examples.ipynb ├── setup.py ├── splitcross.py ├── train.py ├── train_logs_multi_runs └── logs.zip ├── train_logs_single_run └── logs.zip ├── train_logs_wikitext-2 └── logs.zip ├── utils.py └── weight_drop.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | train_logs_single_run/* 4 | !train_logs_single_run/logs.zip 5 | train_logs_multi_runs/* 6 | !train_logs_multi_runs/logs.zip 7 | train_logs_wikitext-2/* 8 | !train_logs_wikitext-2/logs.zip 9 | data/ptb 10 | data/wikitext-2 11 | data/figures 12 | models_weights/* 13 | !models_weights/dump_weights_model_2226_2020-04-18_07-35-19_999938929.pt 14 | tmp/ 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NAS-Bench-NLP 2 | 3 | Preparation: 4 | 1. unzip data/datasets.zip, train\_logs\_single\_run/logs.zip, train\_logs\_multi\_runs/logs.zip, and train\_logs\_wikitext-2/logs.zip; 5 | 2. install requirements.txt (currently contains unused packages; to be cleaned); 6 | 3. 
optionally, copy models from Dropbox (sample: https://www.dropbox.com/sh/qviytkrlbu2cy5u/AABy59Bb9CpiS7D4osbvY_xva?dl=0, all models: https://www.dropbox.com/scl/fo/4r36x7wqb6gvzcmz8zo61/AIzcRCPZhmzORxJdSI2AdtY?rlkey=516wk0knseuuow45wn4mhy0ak&e=1&dl=0) to the folder models\_weights. 7 | 8 | Usage: 9 | * search\_space\_examples.ipynb demonstrates how to generate architectures from the search space; 10 | * to train a model, run the script main\_one\_model\_train.py with --recepie\_id set to the index of an architecture; the list of architectures is by default in data/recepies\_example.json; logs and final weights will be stored in the tmp folder by default (see the script arguments for more info); 11 | * reproduce\_model.ipynb demonstrates how to load and apply the trained model; 12 | * make\_arch\_embeddings.ipynb creates graph2vec features for architectures; 13 | * search\_space\_analysis.ipynb reproduces figures from the analysis section in the paper; 14 | * benchmarking\_examples.ipynb shows how NAS methods can be tested based on precomputed results in the logs. 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/__init__.py -------------------------------------------------------------------------------- /benchmarking_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import json\n", 20 | "import os\n", 21 | "import numpy as np\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import matplotlib \n", 32 | "import matplotlib.pyplot as plt\n", 33 | "%matplotlib inline" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from tqdm import tqdm_notebook" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from nas_environment import Environment" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from sklearn.ensemble import BaggingRegressor\n", 61 | "from xgboost import XGBRegressor\n", 62 | "from sklearn.preprocessing import StandardScaler\n", 63 | "import sklearn.metrics.pairwise\n", 64 | "from scipy.spatial import distance" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from hyperopt import hp\n", 74 | "from hyperopt import fmin, tpe, space_eval" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# Set-up the environment" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "precomputed_logs_path = 'train_logs_single_run/'" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | 
"metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "env = Environment(precomputed_logs_path)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "len(env._logs)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "search_set = env.get_precomputed_recepies()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "alg_resutls = {}" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "rounds = 5\n", 136 | "iters_per_round = 100" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# Random seach" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "for train_epochs in [10, 50]:\n", 153 | " all_test_losses = []\n", 154 | " all_wall_times = []\n", 155 | "\n", 156 | " N = int(iters_per_round*50/train_epochs)\n", 157 | " \n", 158 | " for seed in tqdm_notebook(range(rounds)):\n", 159 | " np.random.seed(seed)\n", 160 | " env.reset()\n", 161 | " selected_inds = []\n", 162 | " test_losses = []\n", 163 | " wall_times = []\n", 164 | " for i in range(N):\n", 165 | " cur_ind = np.random.choice(np.setdiff1d(np.arange(len(search_set)), np.array(selected_inds)), \n", 166 | " 1, replace=False)[0]\n", 167 | " env.simulated_train(search_set[cur_ind], train_epochs)\n", 168 | " selected_inds.append(cur_ind)\n", 169 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 170 | " wall_times.append(env.get_total_time())\n", 171 | " all_test_losses.append(test_losses)\n", 172 | " all_wall_times.append(wall_times)\n", 173 | " alg_resutls[f'random_search_{train_epochs}_epochs'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# Hyperbands" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "all_test_losses = []\n", 190 | "all_wall_times = []\n", 191 | "N = iters_per_round\n", 192 | "for seed in tqdm_notebook(range(rounds)):\n", 193 | " env.reset()\n", 194 | " np.random.seed(seed)\n", 195 | "\n", 196 | " # HYPERBAND\n", 197 | " \n", 198 | " #inputs\n", 199 | " R = 50 # the maximum amount of resource that can be allocated to a single configuration (number of epochs)\n", 200 | " eta = 3 # an input that controls the proportion of configurations discarded in each round of SuccessiveHalving\n", 201 | "\n", 202 | " # initialization\n", 203 | " s_max = int(np.floor(np.log(R)/np.log(eta)))\n", 204 | " # B = (s_max + 1)*R\n", 205 | " B = N*R/3.5 # to approximately match budgets in random search\n", 206 | "\n", 207 | " test_losses = []\n", 208 | " wall_times = []\n", 209 | " \n", 210 | " log_cnt = 0\n", 211 | " for s in range(s_max, -1, -1):\n", 212 | " n = int(np.ceil(float(B)/R * float(eta)**s/(s + 1)))\n", 213 | " r = R*float(eta)**(-s)\n", 214 | " #print(s, n, r)\n", 215 | " # Successive Halving inner loop\n", 216 | " # init sample of n architectures\n", 217 | " T = np.random.choice(len(search_set), n, replace=False)\n", 218 | " 
#print(T)\n", 219 | "        for i in range(s + 1):\n", 220 | "            n_i = int(np.floor(n*float(eta)**(-i)))\n", 221 | "            r_i = int(np.floor(r*eta**i))\n", 222 | "            L = []\n", 223 | "            for t in T:\n", 224 | "                env.simulated_train(search_set[t], r_i)\n", 225 | "                if env.get_model_status(search_set[t]) == 'OK':\n", 226 | "                    L.append(env.get_model_stats(search_set[t], r_i - 1)['val_loss'])\n", 227 | "                else:\n", 228 | "                    L.append(np.inf) # if the model fails accidentally within r_i epochs, it is discarded later\n", 229 | "                log_cnt += 1\n", 230 | "                if log_cnt % 25 == 0:\n", 231 | "                    test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 232 | "                    wall_times.append(env.get_total_time())\n", 233 | "            test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 234 | "            wall_times.append(env.get_total_time())\n", 235 | "\n", 236 | "            L = np.array(L)\n", 237 | "            halved_inds = np.argsort(L)[:int(np.floor(n_i/float(eta)))]\n", 238 | "            halved_inds = halved_inds[L[halved_inds] < np.inf] # discard accidentally failed models\n", 239 | "            T = T[halved_inds]\n", 240 | "            #print(T)\n", 241 | "    all_test_losses.append(test_losses)\n", 242 | "    all_wall_times.append(wall_times)\n", 243 | "alg_resutls['hyperband'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "# BayesOpt" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "df_recepie_vectors = pd.read_csv('data/doc2vec_features.csv').set_index('recepie_id')\n", 260 | "df_recepie_vectors_lowdim = pd.read_csv('data/doc2vec_features_lowdim.csv').set_index('recepie_id')" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "search_set_recepie_ids = np.array(env.get_recepie_ids())" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "X_highdim = df_recepie_vectors.loc[search_set_recepie_ids].values\n", 279 | "X_lowdim = df_recepie_vectors_lowdim.loc[search_set_recepie_ids].values" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "for X, alias in zip([X_highdim, X_lowdim], ['50D', '10D']):\n", 289 | "# if alias == '50D':\n", 290 | "# continue\n", 291 | "    all_test_losses = []\n", 292 | "    all_wall_times = []\n", 293 | "    epochs_train = 50\n", 294 | "    N_init = 20 # check randomly a few architectures at first\n", 295 | "    beta = 2.0\n", 296 | "    N = int(1.3*iters_per_round)\n", 297 | "    train_batch = 10\n", 298 | "    for seed in tqdm_notebook(range(rounds)):\n", 299 | "    #for seed in tqdm_notebook(range(5)):\n", 300 | "        np.random.seed(seed)\n", 301 | "        env.reset()\n", 302 | "        selected_inds = []\n", 303 | "        test_losses = []\n", 304 | "        wall_times = []\n", 305 | "        X_train = []\n", 306 | "        y_train = []\n", 307 | "        # check a few random architectures at first\n", 308 | "        for i in range(N_init):\n", 309 | "            cur_ind = np.random.choice(np.setdiff1d(np.arange(len(search_set)), np.array(selected_inds)), \n", 310 | "                                       1, replace=False)[0]\n", 311 | "            env.simulated_train(search_set[cur_ind], epochs_train)\n", 312 | "            selected_inds.append(cur_ind)\n", 313 | "            if env.get_model_status(search_set[cur_ind]) == 'OK':\n", 314 | "                X_train.append(X[cur_ind])\n", 315 | "                
y_train.append(env.get_model_stats(search_set[cur_ind], epochs_train - 1)['val_loss'])\n", 316 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 317 | " wall_times.append(env.get_total_time())\n", 318 | "\n", 319 | "\n", 320 | " regr = BaggingRegressor(XGBRegressor(n_estimators=100, max_depth=15), \n", 321 | " n_estimators=14, max_samples=0.5, n_jobs=14)\n", 322 | "\n", 323 | " # train estimator and score new candidates according to the lower-confidence-bound acquisition function\n", 324 | " for i in range(N_init, N):\n", 325 | " if i % train_batch == 0:\n", 326 | " regr.fit(np.array(X_train), np.array(y_train))\n", 327 | " y_pred_mean = regr.predict(X)\n", 328 | " y_pred_std = np.std([e.predict(X) for e in regr.estimators_], axis=0)\n", 329 | " scores = y_pred_mean - beta * y_pred_std\n", 330 | "\n", 331 | " scores[np.array(selected_inds)] = np.inf\n", 332 | "\n", 333 | " cur_ind = np.argmin(scores)\n", 334 | "\n", 335 | " env.simulated_train(search_set[cur_ind], epochs_train)\n", 336 | " if env.get_model_status(search_set[cur_ind]) == 'OK':\n", 337 | " X_train.append(X[cur_ind])\n", 338 | " y_train.append(env.get_model_stats(search_set[cur_ind], epochs_train - 1)['val_loss'])\n", 339 | " selected_inds.append(cur_ind)\n", 340 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 341 | " wall_times.append(env.get_total_time())\n", 342 | " all_test_losses.append(test_losses)\n", 343 | " all_wall_times.append(wall_times)\n", 344 | "\n", 345 | "\n", 346 | " alg_resutls[f'bayes_opt_{alias}'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "# Regularized evolution" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "def mutate_embedded(e, std=1, axes_bounds=None):\n", 363 | " e_new = e + np.random.randn(len(e)) * std\n", 364 | " if axes_bounds is not None:\n", 365 | " e_new = np.clip(e_new, axes_bounds[0], axes_bounds[1])\n", 366 | " return e_new" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "def find_closest(E, e):\n", 376 | " #dists = np.linalg.norm(E - e.reshape(1, -1), axis=1)\n", 377 | " dists = distance.cdist([e], E, \"cosine\")[0]\n", 378 | " return np.argmin(dists)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "all_test_losses = []\n", 388 | "all_wall_times = []\n", 389 | "\n", 390 | "train_epochs = 50\n", 391 | "P = 20\n", 392 | "C = int(1.3*iters_per_round)\n", 393 | "S = 10\n", 394 | "\n", 395 | "axes_bounds = (np.min(X, axis=0), np.max(X, axis=0))\n", 396 | "\n", 397 | "for seed in tqdm_notebook(range(rounds)):\n", 398 | " np.random.seed(seed)\n", 399 | " env.reset()\n", 400 | " test_losses = []\n", 401 | " wall_times = []\n", 402 | " \n", 403 | " # init first P architectures\n", 404 | " population = []\n", 405 | " history = []\n", 406 | " for i in np.random.choice(np.arange(len(search_set)), P, replace=False):\n", 407 | " env.simulated_train(search_set[i], train_epochs)\n", 408 | " population.append(i)\n", 409 | " history.append(i)\n", 410 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 411 | " wall_times.append(env.get_total_time())\n", 412 | " \n", 413 | " 
attempt = 0\n", 414 | " valid_round = True\n", 415 | " while len(history) < C:\n", 416 | " sample = np.random.choice(population, S, replace=False)\n", 417 | " sample_scores = [env.get_model_stats(search_set[i], epochs_train - 1)['val_loss'] \n", 418 | " for i in sample if env.get_model_status(search_set[i]) == 'OK']\n", 419 | " if len(sample_scores) == 0: \n", 420 | " # this is unlikely to happen, but just to make sure that the code will work anyway\n", 421 | " attempt += 1\n", 422 | " if attempt > 5:\n", 423 | " valid_round = False\n", 424 | " break\n", 425 | " else:\n", 426 | " continue\n", 427 | " else:\n", 428 | " attempt = 0\n", 429 | " parent = sample[np.argmin(sample_scores)]\n", 430 | " \n", 431 | " for std in [0.5, 1.0, 2.0, 4.0, 8.0]:\n", 432 | " e_new = mutate_embedded(X[parent], std, axes_bounds)\n", 433 | " child = find_closest(X, e_new)\n", 434 | " if child != parent:\n", 435 | " # stop when we find a child that differs from the parent\n", 436 | " break\n", 437 | " \n", 438 | " env.simulated_train(search_set[child], train_epochs)\n", 439 | " history.append(child)\n", 440 | " population = population[1:] + [child]\n", 441 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 442 | " wall_times.append(env.get_total_time())\n", 443 | " \n", 444 | " if valid_round:\n", 445 | " all_test_losses.append(test_losses)\n", 446 | " all_wall_times.append(wall_times)\n", 447 | " #break\n", 448 | " \n", 449 | "alg_resutls['regularized_evolution'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "# TPE" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "def objective_vec(vec):\n", 466 | " \n", 467 | " # The most similar vector in X:\n", 468 | " distances = distance.cdist([vec], X, \"cosine\")[0]\n", 469 | " #distances = np.linalg.norm(X - vec.reshape(1, -1), axis=1)\n", 470 | " recepie_id = np.argmin(distances)\n", 471 | " recepie = search_set[recepie_id]\n", 472 | "\n", 473 | "\n", 474 | " env.simulated_train(recepie, epochs_train)\n", 475 | " test_losses.append(env.get_test_loss_of_the_best_validated_architecture())\n", 476 | " wall_times.append(env.get_total_time())\n", 477 | "\n", 478 | "\n", 479 | " if env.get_model_status(recepie) == 'OK':\n", 480 | " result = env.get_model_stats(recepie, epochs_train - 1)['val_loss']\n", 481 | " else:\n", 482 | " result = 10\n", 483 | " return result" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "def objective_dict(vec_as_dict):\n", 493 | " \n", 494 | " vec = np.zeros(50)\n", 495 | " for k, v in vec_as_dict.items():\n", 496 | " vec[int(k)] = v\n", 497 | " \n", 498 | " return objective_vec(vec)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "X = X_highdim" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "components_space = {}\n", 517 | "for i in range(X.shape[-1]):\n", 518 | " min_val = X[:, i].min()\n", 519 | " max_val = X[:, i].max()\n", 520 | " components_space[i] = hp.uniform(f'component_{i}', min_val, max_val)\n", 521 | "search_space = components_space" 522 | ] 523 | }, 524 | { 525 | 
"cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "N = int(1.3*iters_per_round)" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "for epochs_train in [50]:\n", 540 | " all_test_losses = []\n", 541 | " all_wall_times = []\n", 542 | "\n", 543 | " for seed in tqdm_notebook(range(rounds)):\n", 544 | " np.random.seed(seed)\n", 545 | " os.environ['HYPEROPT_FMIN_SEED'] = str(seed)\n", 546 | " env.reset()\n", 547 | " test_losses, wall_times = [], []\n", 548 | " \n", 549 | " # minimize the objective over the space\n", 550 | " best = fmin(objective_dict, search_space, algo=tpe.suggest, max_evals=N,\n", 551 | " verbose=False, show_progressbar=True, max_queue_len=20)\n", 552 | " all_test_losses.append(test_losses)\n", 553 | " all_wall_times.append(wall_times)\n", 554 | "\n", 555 | " alg_resutls[f'TPE_{epochs_train}_epochs'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "metadata": {}, 561 | "source": [ 562 | "# SMAC" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "from smac.facade.func_facade import fmin_smac\n", 572 | "from smac.initial_design.latin_hypercube_design import LHDesign\n", 573 | "import logging\n", 574 | "from ConfigSpace.hyperparameters import UniformFloatHyperparameter\n", 575 | "\n", 576 | "# Import ConfigSpace and different types of parameters\n", 577 | "from smac.configspace import ConfigurationSpace\n", 578 | "from smac.facade.smac_hpo_facade import SMAC4HPO\n", 579 | "from smac.facade.smac_bo_facade import SMAC4BO\n", 580 | "from smac.initial_design.latin_hypercube_design import LHDesign\n", 581 | "from smac.optimizer.acquisition import LCB, EI, PI\n", 582 | "from smac.runhistory.runhistory2epm import RunHistory2EPM4InvScaledCost\n", 583 | "# Import SMAC-utilities\n", 584 | "from smac.scenario.scenario import Scenario" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "from smac.facade.smac_ac_facade import SMAC4AC\n", 594 | "from smac.scenario.scenario import Scenario\n", 595 | "from smac.tae.execute_ta_run import ExecuteTARun\n", 596 | "from smac.tae.execute_func import ExecuteTAFuncDict\n", 597 | "from smac.configspace import ConfigurationSpace\n", 598 | "from smac.stats.stats import Stats\n", 599 | "from smac.initial_design.random_configuration_design import RandomConfigurations\n", 600 | "from smac.initial_design.latin_hypercube_design import LHDesign\n", 601 | "from ConfigSpace.hyperparameters import UniformFloatHyperparameter\n", 602 | "import json\n", 603 | "import pathlib" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "class SMACUtils(object):\n", 613 | " def __init__(self, env, X=X, search_set=search_set,\n", 614 | " epochs_train=epochs_train):\n", 615 | "\n", 616 | " self.env = env\n", 617 | " self.X = X\n", 618 | " self.search_set = search_set\n", 619 | " self.epochs_train = epochs_train\n", 620 | " self.stat = {}\n", 621 | "# self.stat_file = stat_file\n", 622 | " \n", 623 | "# with open(self.stat_file, \"w\") as f:\n", 624 | "# json.dump({}, f)\n", 625 | " \n", 626 | " def objective_function(self, config):\n", 
627 | " vec = self._config_to_vec(config)\n", 628 | " \n", 629 | " distances = distance.cdist([vec], self.X, \"cosine\")[0]\n", 630 | " recepie_id = np.argmin(distances)\n", 631 | " recepie = self.search_set[recepie_id]\n", 632 | " \n", 633 | " self.env.simulated_train(recepie, self.epochs_train)\n", 634 | " \n", 635 | " test_loss = self.env.get_test_loss_of_the_best_validated_architecture()\n", 636 | " wall_time = self.env.get_total_time()\n", 637 | " self._collect_eval_stat(test_loss, wall_time)\n", 638 | " \n", 639 | " if self.env.get_model_status(recepie) == 'OK':\n", 640 | " r = self.env.get_model_stats(recepie, self.epochs_train - 1)['val_loss']\n", 641 | " else:\n", 642 | " r = 1000\n", 643 | " \n", 644 | " return r, {\"test_loss\": test_loss, \"wall_time\": wall_time}\n", 645 | "\n", 646 | " \n", 647 | " def _config_to_vec(self, config):\n", 648 | " vec_as_dict = config.get_dictionary()\n", 649 | " vec = np.zeros(self.X.shape[-1])\n", 650 | " for k, v in vec_as_dict.items():\n", 651 | " vec[int(k)] = v\n", 652 | " return vec\n", 653 | " \n", 654 | " def _collect_eval_stat(self, test_loss, wall_time):\n", 655 | " stat = self.stat\n", 656 | " \n", 657 | " if 'test_losses' not in stat:\n", 658 | " stat['test_losses'] = []\n", 659 | " stat['test_losses'].append(test_loss)\n", 660 | " \n", 661 | " if 'wall_times' not in stat:\n", 662 | " stat['wall_times'] = []\n", 663 | " stat['wall_times'].append(wall_time)\n", 664 | " \n", 665 | " if 'eval_step' not in stat:\n", 666 | " stat['eval_step'] = 0\n", 667 | " stat['eval_step'] += 1\n", 668 | " \n" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "for X in [X_lowdim]:\n", 678 | " axes_bounds = (np.min(X, axis=0), np.max(X, axis=0))\n", 679 | " bounds = list(zip(axes_bounds[0], axes_bounds[1]))\n", 680 | "\n", 681 | " cs = ConfigurationSpace()\n", 682 | " cs.add_hyperparameters([\n", 683 | " UniformFloatHyperparameter(str(i), X[:, i].min(), X[:, i].max(), default_value=0)\n", 684 | " for i in range(X.shape[-1])\n", 685 | " ]);\n", 686 | "\n", 687 | " for initial_design in [LHDesign]: #[RandomConfigurations, LHDesign]:\n", 688 | " all_test_losses = []\n", 689 | " all_wall_times = []\n", 690 | "\n", 691 | " for seed in range(rounds):\n", 692 | " print(f\"START WITH INITIAL DESIGN: {initial_design.__name__} SEED: {seed}\")\n", 693 | " scenario = Scenario({\"run_obj\": \"quality\",\n", 694 | " \"runcount-limit\": int(1.2*iters_per_round),\n", 695 | " \"wallclock-limit\": 3000,\n", 696 | " \"cs\": cs,\n", 697 | " \"deterministic\": \"true\",\n", 698 | " \"initial_incumbent\": \"RANDOM\",\n", 699 | " \"output_dir\": \"./tmp\",\n", 700 | " \"seed\": seed,\n", 701 | " \"limit_resources\": \"false\"})\n", 702 | "\n", 703 | " env.reset()\n", 704 | " b = SMACUtils(env, X=X)\n", 705 | "\n", 706 | " def objective_function(config, **kwargs):\n", 707 | " y, stat = b.objective_function(config)\n", 708 | " return float(y)\n", 709 | "\n", 710 | " stats = Stats(scenario=scenario)\n", 711 | " smac = SMAC4AC(scenario=scenario,\n", 712 | " tae_runner=objective_function,\n", 713 | " initial_design=initial_design)\n", 714 | " smac.optimize()\n", 715 | "\n", 716 | " stat = b.stat\n", 717 | " all_test_losses.append(stat['test_losses'])\n", 718 | " all_wall_times.append(stat['wall_times'])\n", 719 | "\n", 720 | " alg_resutls[f'SMAC[{initial_design.__name__}_{X.shape[-1]}D]'] = {'all_test_losses':all_test_losses, 'all_wall_times':all_wall_times}" 721 | ] 722 | }, 723 | 
{ 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "# Plot results" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "legend_algs = {\n", 737 | "    'random_search_50_epochs': 'RS 50E',\n", 738 | "    'random_search_10_epochs': 'RS 10E',\n", 739 | "    'hyperband':'HB',\n", 740 | "    'bayes_opt_50D':'BO 50D',\n", 741 | "    'bayes_opt_10D':'BO 10D',\n", 742 | "    'regularized_evolution':'RE',\n", 743 | "    'TPE_50_epochs':'TPE',\n", 744 | "    'SMAC[LHDesign_10D]':'SMAC'\n", 745 | "}" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "y_opt = env.get_best_possible_test_loss()" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "plt.figure(figsize=(8, 5), dpi=100)\n", 764 | "\n", 765 | "\n", 766 | "for i, alg_name in enumerate(['random_search_50_epochs', 'random_search_10_epochs', 'hyperband', \n", 767 | "                              'bayes_opt_50D', 'bayes_opt_10D', 'regularized_evolution', 'TPE_50_epochs',\n", 768 | "                              'SMAC[LHDesign_10D]']):\n", 769 | "    all_test_losses = alg_resutls[alg_name]['all_test_losses']\n", 770 | "    all_wall_times = alg_resutls[alg_name]['all_wall_times']\n", 771 | "\n", 772 | "    all_xs = np.array(all_test_losses)\n", 773 | "    all_ts = np.array(all_wall_times)/3600.\n", 774 | "\n", 775 | "    s = 1.96/np.sqrt(all_xs.shape[0])\n", 776 | "\n", 777 | "    all_ts_mean = all_ts.mean(axis=0)\n", 778 | "\n", 779 | "    all_ts_max = all_ts_mean + s*all_ts.std(axis=0)\n", 780 | "    all_ts_min = all_ts_mean - s*all_ts.std(axis=0)\n", 781 | "\n", 782 | "    all_xs_mean = np.nanmean(all_xs, axis=0)\n", 783 | "\n", 784 | "    all_xs_max = all_xs_mean + s*np.nanstd(all_xs, axis=0)\n", 785 | "    all_xs_min = all_xs_mean - s*np.nanstd(all_xs, axis=0)\n", 786 | "\n", 787 | "\n", 788 | "    plt.plot(all_ts_mean, all_xs_mean - y_opt, lw=1.5, color=f'C{i}', label=legend_algs[alg_name])\n", 789 | "\n", 790 | "    plt.fill_between(all_ts_mean, all_xs_min - y_opt, all_xs_max - y_opt, alpha=0.1, edgecolor=f'C{i}')\n", 791 | "\n", 792 | "plt.legend()\n", 793 | "plt.xlabel('Total train time [h]', fontsize=14)\n", 794 | "plt.ylabel('Regret', fontsize=14)\n", 795 | "plt.ylim([0.1, 1.0])\n", 796 | "plt.xscale('log')\n", 797 | "plt.yscale('log')\n", 798 | "plt.xlim([5, 1500])\n", 799 | "plt.xticks([10, 20, 50, 100, 200, 500, 1000]);\n", 800 | "plt.gca().get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())\n", 801 | "plt.savefig('data/figures/benchmarks_log_y_scale.png', dpi=300, bbox_inches='tight')" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": null, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "from matplotlib import ticker\n", 811 | "\n", 812 | "plt.figure(figsize=(8, 5), dpi=100)\n", 813 | "\n", 814 | "for i, alg_name in enumerate(['random_search_50_epochs', 'random_search_10_epochs', 'hyperband', \n", 815 | "                              'bayes_opt_50D', 'bayes_opt_10D', 'regularized_evolution', 'TPE_50_epochs',\n", 816 | "                              'SMAC[LHDesign_10D]']):\n", 817 | "    all_test_losses = alg_resutls[alg_name]['all_test_losses']\n", 818 | "    all_wall_times = alg_resutls[alg_name]['all_wall_times']\n", 819 | "\n", 820 | "    all_xs = np.array(all_test_losses)\n", 821 | "    all_ts = np.array(all_wall_times)/3600.\n", 822 | "\n", 823 | "    s = 1.96/np.sqrt(all_xs.shape[0])\n", 824 | "\n", 825 | "    all_ts_mean = 
all_ts.mean(axis=0)\n", 826 | "\n", 827 | " all_ts_max = all_ts_mean + s*all_ts.std(axis=0)\n", 828 | " all_ts_min = all_ts_mean - s*all_ts.std(axis=0)\n", 829 | "\n", 830 | " all_xs_mean = np.nanmean(all_xs, axis=0)\n", 831 | "\n", 832 | " all_xs_max = all_xs_mean + s*np.nanstd(all_xs, axis=0)\n", 833 | " all_xs_min = all_xs_mean - s*np.nanstd(all_xs, axis=0)\n", 834 | "\n", 835 | " plt.plot(np.sort(all_xs[:, -1]) - y_opt, np.linspace(0, 1, len(all_xs)), color=f'C{i}', label=legend_algs[alg_name])\n", 836 | "plt.legend()\n", 837 | "\n", 838 | "\n", 839 | "plt.xlabel('Final test regret', fontsize=14)\n", 840 | "plt.ylabel('CDF', fontsize=14)\n", 841 | "plt.savefig('data/figures/benchmarks_CDF_regret.png', dpi=300, bbox_inches='tight')" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": null, 847 | "metadata": {}, 848 | "outputs": [], 849 | "source": [] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python 3", 862 | "language": "python", 863 | "name": "python3" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 3 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython3", 875 | "version": "3.6.10" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 2 880 | } 881 | -------------------------------------------------------------------------------- /calculate_ged.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import contextlib 3 | import itertools 4 | import json 5 | import random 6 | import time 7 | from pathlib import Path 8 | 9 | import joblib 10 | import networkx as nx 11 | from interruptingcow import Quota, timeout 12 | from joblib import Parallel, delayed 13 | from tqdm.auto import tqdm 14 | 15 | from utils import make_graph 16 | 17 | 18 | @contextlib.contextmanager 19 | def tqdm_joblib(tqdm_object): 20 | """Context manager to patch joblib to report into tqdm progress bar given as argument""" 21 | class TqdmBatchCompletionCallback: 22 | def __init__(self, time, index, parallel): 23 | self.index = index 24 | self.parallel = parallel 25 | 26 | def __call__(self, index): 27 | tqdm_object.update() 28 | if self.parallel._original_iterator is not None: 29 | self.parallel.dispatch_next() 30 | 31 | old_batch_callback = joblib.parallel.BatchCompletionCallBack 32 | joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback 33 | try: 34 | yield tqdm_object 35 | finally: 36 | joblib.parallel.BatchCompletionCallBack = old_batch_callback 37 | tqdm_object.close() 38 | 39 | def calc_ged(recepie1, recepie2, timeout_val=600): 40 | start_time = time.time() 41 | G1 = make_graph(recepie1) 42 | G2 = make_graph(recepie2) 43 | ged = None 44 | 45 | try: 46 | status = "OK" 47 | with timeout(Quota(timeout_val), exception=RuntimeError): 48 | for ged in nx.optimize_graph_edit_distance(G1, G2, lambda n1, n2: n1['op'] == n2['op']): 49 | pass 50 | 51 | except RuntimeError as e: 52 | status = "Timeout" 53 | 54 | except Exception as e: 55 | status = "Exception: " + str(e) 56 | 57 | return { 58 | "recepie_i": recepie1, 59 | "recepie_j": recepie2, 60 | "ged": ged, 61 | "time": time.time() - start_time, 62 | "status": status 63 | } 64 | 65 | if __name__ == "__main__": 66 | parser = 
argparse.ArgumentParser(description='Calculate GED') 67 | parser.add_argument('--recepies', type=str, default="./new_recepies_fix.json", 68 |                     help='path to JSON file with recepies') 69 | parser.add_argument('--num', type=int, default=10, 70 |                     help='number of random recepies for which GED to all the others is calculated') 71 | parser.add_argument('--timeout', type=int, default=600, help="timeout for calculating one GED value in seconds") 72 | parser.add_argument('--n_jobs', type=int, default=-2, 73 |                     help="n_jobs in scikit-learn style") 74 | parser.add_argument('--num_parts', type=int, default=10, 75 |                     help="number of result parts for saving") 76 | 77 | args = parser.parse_args() 78 | 79 | with open(args.recepies, "r") as f: 80 |     recepies = json.load(f) 81 | 82 | key_recepies = random.sample(recepies, args.num) 83 | part_size = len(recepies)//args.num_parts 84 | for part in range(1, args.num_parts+1): 85 |     _recepies = recepies[(part-1)*part_size:part*part_size] 86 |     combs = list(itertools.product(key_recepies, _recepies)) 87 | 88 |     with tqdm_joblib(tqdm(desc="GED part {} of {}".format(part, args.num_parts), total=len(combs))) as progress_bar: 89 |         results = Parallel(n_jobs=args.n_jobs, backend='multiprocessing')(delayed(calc_ged)(r1, r2, args.timeout) for r1, r2 in combs) 90 | 91 |     with open("GED_CALC_RESULTS_part_{}.json".format(part), 'w') as f: 92 |         json.dump(results, f) 93 | -------------------------------------------------------------------------------- /custom_rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | import networkx as nx 4 | 5 | from multilinear import MultiLinear 6 | import math 7 | 8 | class CustomRNNCell(torch.nn.Module): 9 | 10 |     elementwise_ops_dict = { 11 |         'prod': torch.mul, 12 |         'sum': torch.add 13 |     } 14 | 15 |     def __init__(self, input_size, hidden_size, recepie): 16 |         super(CustomRNNCell, self).__init__() 17 | 18 |         self.activations_dict = { 19 |             'tanh': torch.nn.Tanh(), 20 |             'sigm': torch.nn.Sigmoid(), 21 |             'leaky_relu': torch.nn.LeakyReLU() 22 |         } 23 | 24 |         self.input_size = input_size 25 |         self.hidden_size = hidden_size 26 |         self.recepie = recepie 27 |         self.hidden_tuple_size = 0 28 | 29 |         components_dict = {} 30 | 31 |         self.G = nx.DiGraph() 32 |         for k in recepie.keys(): 33 |             if k not in components_dict: 34 | 35 |                 component = self._make_component(recepie[k]) 36 |                 if component is not None: 37 |                     components_dict[k] = component 38 |                 if k.startswith('h_new'): 39 |                     suffix = k.replace('h_new_', '') 40 |                     if suffix.isdigit(): 41 |                         self.hidden_tuple_size = max([self.hidden_tuple_size, int(suffix) + 1]) 42 | 43 |             if k not in self.G.nodes(): 44 |                 self.G.add_node(k) 45 |             for i, n in enumerate(recepie[k]['input']): 46 |                 if n not in self.G.nodes(): 47 |                     self.G.add_node(n) 48 |                 self.G.add_edge(n, k) 49 | 50 |         self.components = torch.nn.ModuleDict(components_dict) 51 |         self.nodes_order = list(nx.algorithms.dag.topological_sort(self.G)) 52 | 53 |     def forward(self, x, hidden_tuple): 54 |         calculated_nodes = {} 55 |         for n in self.nodes_order: 56 |             if n == 'x': 57 |                 calculated_nodes['x'] = x.unsqueeze(0) 58 |             elif n.startswith('h_prev') and n.replace('h_prev_', '').isdigit(): 59 |                 calculated_nodes[n] = hidden_tuple[int(n.replace('h_prev_', ''))].unsqueeze(0) 60 |             elif n in self.components: 61 |                 inputs = [calculated_nodes[k] for k in self.recepie[n]['input']] 62 |                 calculated_nodes[n] = self.components[n](*inputs) 63 |             else: 64 |                 # simple operations 65 |                 op = self.recepie[n]['op'] 66 |                 inputs = [calculated_nodes[k] for k in 
self.recepie[n]['input']] 67 |                 if op in ['elementwise_prod', 'elementwise_sum']: 68 |                     op_func = CustomRNNCell.elementwise_ops_dict[op.replace('elementwise_', '')] 69 |                     calculated_nodes[n] = op_func(inputs[0], inputs[1]) 70 |                     for inp in range(2, len(inputs)): 71 |                         calculated_nodes[n] = op_func(calculated_nodes[n], inputs[inp]) 72 |                 elif op == 'blend': 73 |                     calculated_nodes[n] = inputs[0]*inputs[1] + (1 - inputs[0])*inputs[2] 74 |                 elif op.startswith('activation'): 75 |                     op_func = self.activations_dict[op.replace('activation_', '')] 76 |                     calculated_nodes[n] = op_func(inputs[0]) 77 |         return tuple([calculated_nodes[f'h_new_{i}'][0] for i in range(self.hidden_tuple_size)]) 78 | 79 |     def _make_component(self, spec): 80 |         if spec['op'] == 'linear': 81 |             input_sizes = [self.input_size if inp=='x' else self.hidden_size for inp in spec['input']] 82 |             return MultiLinear(input_sizes, self.hidden_size) 83 | 84 | 85 | class CustomRNN(torch.nn.Module): 86 | 87 |     def __init__(self, input_size, hidden_size, recepie): 88 |         super(CustomRNN, self).__init__() 89 |         self.hidden_size = hidden_size 90 |         self.cell = CustomRNNCell(input_size, hidden_size, recepie) 91 |         self.reset_parameters() 92 | 93 |     def forward(self, inputs, hidden_tuple=None): 94 |         batch_size = inputs.size(1) 95 |         if hidden_tuple is None: 96 |             hidden_tuple = tuple([self.init_hidden(batch_size) for _ in range(self.cell.hidden_tuple_size)]) 97 | 98 |         self.check_hidden_size(hidden_tuple, batch_size) 99 | 100 |         hidden_tuple = tuple([x[0] for x in hidden_tuple]) 101 |         outputs = [] 102 |         for x in torch.unbind(inputs, dim=0): 103 |             hidden_tuple = self.cell(x, hidden_tuple) 104 |             outputs.append(hidden_tuple[0].clone()) 105 | 106 |         return torch.stack(outputs, dim=0), tuple([x.unsqueeze(0) for x in hidden_tuple]) 107 | 108 |     def init_hidden(self, batch_size): 109 |         # num_layers == const (1) 110 |         return torch.zeros(1, batch_size, self.hidden_size).to(next(self.parameters()).device) 111 | 112 |     def reset_parameters(self): 113 |         stdv = 1.0 / math.sqrt(self.hidden_size) 114 |         for param in self.parameters(): 115 |             torch.nn.init.uniform_(param, -stdv, stdv) 116 | 117 |     def check_hidden_size(self, hidden_tuple, batch_size): 118 |         expected_hidden_size = (1, batch_size, self.hidden_size) 119 |         msg = 'Expected hidden size {}, got {}' 120 |         for hx in hidden_tuple: 121 |             if hx.size() != expected_hidden_size: 122 |                 raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size()))) 123 | 124 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from collections import Counter 5 | 6 | 7 | class Dictionary(object): 8 |     def __init__(self): 9 |         self.word2idx = {} 10 |         self.idx2word = [] 11 |         self.counter = Counter() 12 |         self.total = 0 13 | 14 |     def add_word(self, word): 15 |         if word not in self.word2idx: 16 |             self.idx2word.append(word) 17 |             self.word2idx[word] = len(self.idx2word) - 1 18 |         token_id = self.word2idx[word] 19 |         self.counter[token_id] += 1 20 |         self.total += 1 21 |         return self.word2idx[word] 22 | 23 |     def __len__(self): 24 |         return len(self.idx2word) 25 | 26 | 27 | class Corpus(object): 28 |     def __init__(self, path): 29 |         self.dictionary = Dictionary() 30 |         self.train = self.tokenize(os.path.join(path, 'train.txt')) 31 |         self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 32 |         self.test = self.tokenize(os.path.join(path, 'test.txt')) 33 | 34 |     def tokenize(self, path): 35 |         """Tokenizes a text 
file.""" 36 | assert os.path.exists(path) 37 | # Add words to the dictionary 38 | with open(path, 'r') as f: 39 | tokens = 0 40 | for line in f: 41 | words = line.split() + [''] 42 | tokens += len(words) 43 | for word in words: 44 | self.dictionary.add_word(word) 45 | 46 | # Tokenize file content 47 | with open(path, 'r') as f: 48 | ids = torch.LongTensor(tokens) 49 | token = 0 50 | for line in f: 51 | words = line.split() + [''] 52 | for word in words: 53 | ids[token] = self.dictionary.word2idx[word] 54 | token += 1 55 | 56 | return ids 57 | -------------------------------------------------------------------------------- /data/datasets.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/data/datasets.zip -------------------------------------------------------------------------------- /data/recepies_example.json: -------------------------------------------------------------------------------- 1 | [{"f": {"op": "linear", "input": ["x", "h_prev_0"]}, "h_new_0": {"op": "activation_tanh", "input": ["f"]}}, {"i": {"op": "linear", "input": ["x", "h_prev_0"]}, "i_act": {"op": "activation_tanh", "input": ["i"]}, "j": {"op": "linear", "input": ["x", "h_prev_0"]}, "j_act": {"op": "activation_sigm", "input": ["j"]}, "f": {"op": "linear", "input": ["x", "h_prev_0"]}, "f_act": {"op": "activation_sigm", "input": ["f"]}, "o": {"op": "linear", "input": ["x", "h_prev_0"]}, "o_act": {"op": "activation_tanh", "input": ["o"]}, "h_new_1_part1": {"op": "elementwise_prod", "input": ["f_act", "h_prev_1"]}, "h_new_1_part2": {"op": "elementwise_prod", "input": ["i_act", "j_act"]}, "h_new_1": {"op": "elementwise_sum", "input": ["h_new_1_part1", "h_new_1_part2"]}, "h_new_1_act": {"op": "activation_tanh", "input": ["h_new_1"]}, "h_new_0": {"op": "elementwise_prod", "input": ["h_new_1_act", "o_act"]}}, {"r": {"op": "linear", "input": ["x", "h_prev_0"]}, "r_act": {"op": "activation_sigm", "input": ["r"]}, "z": {"op": "linear", "input": ["x", "h_prev_0"]}, "z_act": {"op": "activation_sigm", "input": ["z"]}, "rh": {"op": "elementwise_prod", "input": ["r_act", "h_prev_0"]}, "h_tilde": {"op": "linear", "input": ["x", "rh"]}, "h_tilde_act": {"op": "activation_tanh", "input": ["h_tilde"]}, "h_new_0": {"op": "blend", "input": ["z_act", "h_prev_0", "h_tilde_act"]}}, {"node_0": {"op": "linear", "input": ["h_prev_0", "x"]}, "h_new_0": {"op": "activation_tanh", "input": ["node_0"]}}, {"node_0": {"op": "linear", "input": ["x", "h_prev_0"]}, "node_1": {"op": "activation_tanh", "input": ["node_0"]}, "node_3": {"op": "linear", "input": ["h_prev_0", "node_1"]}, "node_4": {"op": "activation_leaky_relu", "input": ["node_3"]}, "node_6": {"op": "linear", "input": ["node_4", "node_1"]}, "node_7": {"op": "activation_tanh", "input": ["node_6"]}, "node_8": {"op": "linear", "input": ["node_1", "x", "node_7"]}, "h_new_0": {"op": "activation_sigm", "input": ["node_8"]}}, {"node_0": {"op": "linear", "input": ["h_prev_0", "x"]}, "h_new_0": {"op": "activation_leaky_relu", "input": ["node_0"]}}, {"node_0": {"op": "linear", "input": ["x", "h_prev_0"]}, "node_1": {"op": "activation_sigm", "input": ["node_0"]}, "node_2": {"op": "linear", "input": ["x", "node_1"]}, "node_3": {"op": "activation_tanh", "input": ["node_2"]}, "h_new_0": {"op": "linear", "input": ["node_3", "node_1"]}}, {"node_0": {"op": "linear", "input": ["h_prev_0", "x"]}, "node_1": {"op": "activation_leaky_relu", "input": ["node_0"]}, "node_2": {"op": 
"elementwise_sum", "input": ["node_0", "node_1"]}, "node_3": {"op": "linear", "input": ["node_1", "h_prev_0"]}, "node_4": {"op": "activation_sigm", "input": ["node_3"]}, "node_5": {"op": "linear", "input": ["node_4", "x", "node_1"]}, "node_7": {"op": "elementwise_sum", "input": ["node_5", "node_3"]}, "h_new_0": {"op": "elementwise_sum", "input": ["node_2", "node_7"]}}, {"node_0": {"op": "linear", "input": ["h_prev_0", "x"]}, "node_1": {"op": "activation_sigm", "input": ["node_0"]}, "node_3": {"op": "linear", "input": ["node_1", "h_prev_0"]}, "node_4": {"op": "activation_leaky_relu", "input": ["node_3"]}, "node_11": {"op": "blend", "input": ["h_prev_0", "node_1", "node_4"]}, "h_new_0": {"op": "elementwise_prod", "input": ["node_3", "node_11"]}}, {"node_0": {"op": "linear", "input": ["x", "h_prev_0"]}, "node_1": {"op": "activation_leaky_relu", "input": ["node_0"]}, "node_2": {"op": "linear", "input": ["x", "node_1", "h_prev_0"]}, "h_new_0": {"op": "activation_tanh", "input": ["node_2"]}}] -------------------------------------------------------------------------------- /embed_regularize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | 5 | def embedded_dropout(embed, words, dropout=0.1, scale=None): 6 | if dropout: 7 | mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout) 8 | masked_embed_weight = mask * embed.weight 9 | else: 10 | masked_embed_weight = embed.weight 11 | if scale: 12 | masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight 13 | 14 | padding_idx = embed.padding_idx 15 | if padding_idx is None: 16 | padding_idx = -1 17 | 18 | X = torch.nn.functional.embedding(words, masked_embed_weight, 19 | padding_idx, embed.max_norm, embed.norm_type, 20 | embed.scale_grad_by_freq, embed.sparse 21 | ) 22 | return X 23 | 24 | if __name__ == '__main__': 25 | V = 50 26 | h = 4 27 | bptt = 10 28 | batch_size = 2 29 | 30 | embed = torch.nn.Embedding(V, h) 31 | 32 | words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt)) 33 | words = torch.LongTensor(words) 34 | 35 | origX = embed(words) 36 | X = embedded_dropout(embed, words) 37 | 38 | print(origX) 39 | print(X) 40 | -------------------------------------------------------------------------------- /locked_dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | class LockedDropout(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def forward(self, x, dropout=0.5): 10 | if not self.training or not dropout: 11 | return x 12 | m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) 13 | mask = Variable(m, requires_grad=False) / (1 - dropout) 14 | mask = mask.expand_as(x) 15 | return mask * x 16 | -------------------------------------------------------------------------------- /main_one_model_train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | import torch.optim 4 | import torch.utils.data 5 | import torch.nn.functional as F 6 | from splitcross import SplitCrossEntropyLoss 7 | 8 | import numpy as np 9 | import networkx as nx 10 | import math 11 | import json 12 | import time 13 | 14 | import data 15 | import os 16 | from utils import batchify 17 | from argparse import Namespace 18 | from model import AWDRNNModel 19 | from 
train import train, evaluate 20 | import datetime 21 | 22 | import argparse 23 | 24 | parser = argparse.ArgumentParser(description='PyTorch Custom RNN Language Model') 25 | 26 | parser.add_argument('--dataset_path', type=str, default='data/ptb', 27 | help='location of the data corpus') 28 | parser.add_argument('--logs_path', type=str, default='tmp', 29 | help='path to logs folder') 30 | parser.add_argument('--recepies_list_path', type=str, default='data/recepies_example.json', 31 | help='list of models recepies') 32 | parser.add_argument('--recepie_id', type=int, required=True, 33 | help='id of a model recepie from the models list') 34 | parser.add_argument('--epochs', type=int, default=50, 35 | help='number of epochs to train') 36 | parser.add_argument('--emsize', type=int, default=400, 37 | help='emsize') 38 | parser.add_argument('--nhid', type=int, default=600, 39 | help='nhid') 40 | parser.add_argument('--nlayers', type=int, default=3, 41 | help='nlayers') 42 | parser.add_argument('--dropout', type=float, default=0.4, 43 | help='dropout') 44 | parser.add_argument('--dropouth', type=float, default=0.25, 45 | help='dropouth') 46 | parser.add_argument('--dropouti', type=float, default=0.4, 47 | help='dropouti') 48 | parser.add_argument('--dropoute', type=float, default=0.1, 49 | help='dropoute') 50 | parser.add_argument('--wdrop', type=float, default=0.5, 51 | help='wdrop') 52 | parser.add_argument('--experiment_id', type=int, 53 | help='some specific id of the experiment') 54 | 55 | if __name__ == '__main__': 56 | 57 | init_time = str(datetime.datetime.now()).replace(':', '-').split('.')[0].replace(' ', '_') 58 | 59 | main_args = parser.parse_args() 60 | 61 | if main_args.experiment_id is None: 62 | main_args.experiment_id = 999999999 - np.random.randint(100000) 63 | 64 | all_recepies = json.load(open(main_args.recepies_list_path, 'r')) 65 | 66 | args = Namespace(data=main_args.dataset_path, 67 | recepie_id=main_args.recepie_id, 68 | recepies_list_path=main_args.recepies_list_path, 69 | cuda=True, 70 | batch_size=20, 71 | model='CustomRNN', 72 | emsize=main_args.emsize, 73 | nhid=main_args.nhid, 74 | nlayers=main_args.nlayers, 75 | dropout=main_args.dropout, 76 | dropouth=main_args.dropouth, 77 | dropouti=main_args.dropouti, 78 | dropoute=main_args.dropoute, 79 | wdrop=main_args.wdrop, 80 | tied=True, 81 | bptt=70, 82 | lr=1e-3, 83 | wdecay=1.2e-6, 84 | epochs=main_args.epochs, 85 | alpha=2, 86 | beta=1, 87 | log_interval=200, 88 | clip=0.25, 89 | eval_batch_size = 50, 90 | recepie=json.dumps(all_recepies[main_args.recepie_id])) 91 | 92 | corpus = data.Corpus(args.data) 93 | cuda = 'cuda' 94 | 95 | train_data = batchify(corpus.train, args.batch_size, args, cuda) 96 | train_eval_data = batchify(corpus.train, args.eval_batch_size, args, cuda) 97 | val_data = batchify(corpus.valid, args.eval_batch_size, args, cuda) 98 | test_data = batchify(corpus.test, args.eval_batch_size, args, cuda) 99 | 100 | ntokens = len(corpus.dictionary) 101 | 102 | custom_model = AWDRNNModel(args.model, 103 | ntokens, 104 | args.emsize, 105 | args.nhid, 106 | args.nlayers, 107 | args.dropout, 108 | args.dropouth, 109 | args.dropouti, 110 | args.dropoute, 111 | args.wdrop, 112 | args.tied, 113 | args.recepie, 114 | verbose=False) 115 | 116 | 117 | log_stats = vars(args) 118 | log_stats['experiment_id'] = main_args.experiment_id 119 | log_stats['init_time'] = init_time 120 | log_stats['num_params'] = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] 121 | for x in custom_model.parameters() if 
x.size()) 122 | 123 | 124 | criterion = SplitCrossEntropyLoss(args.emsize, splits=[], verbose=False) 125 | 126 | if args.cuda: 127 | custom_model = custom_model.to(cuda) 128 | criterion = criterion.to(cuda) 129 | 130 | params = list(custom_model.parameters()) + list(criterion.parameters()) 131 | 132 | optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay) 133 | 134 | lr = args.lr 135 | train_losses = [] 136 | val_losses = [] 137 | test_losses = [] 138 | wall_times = [] 139 | 140 | # At any point you can hit Ctrl + C to break out of training early. 141 | status = 'OK' 142 | try: 143 | for epoch in range(1, args.epochs+1): 144 | epoch_start_time = time.time() 145 | train(custom_model, optimizer, params, criterion, train_data, args, epoch) 146 | epoch_end_time = time.time() 147 | train_loss = evaluate(custom_model, criterion, train_eval_data, args.eval_batch_size, args) 148 | val_loss = evaluate(custom_model, criterion, val_data, args.eval_batch_size, args) 149 | test_loss = evaluate(custom_model, criterion, test_data, args.eval_batch_size, args) 150 | print('-' * 89) 151 | print('| end of epoch {:3d} | time: {:5.2f}s |\n| train loss {:5.2f} | ' 152 | 'train ppl {:8.2f} | train bpw {:8.3f} |\n| valid loss {:5.2f} | ' 153 | 'valid ppl {:8.2f} | valid bpw {:8.3f} |\n| test loss {:5.2f} | ' 154 | 'test ppl {:8.2f} | test bpw {:8.3f} |'.format( 155 | epoch, (epoch_end_time - epoch_start_time), 156 | train_loss, math.exp(train_loss), train_loss / math.log(2), 157 | val_loss, math.exp(val_loss), val_loss / math.log(2), 158 | test_loss, math.exp(test_loss), test_loss / math.log(2))) 159 | print('-' * 89) 160 | 161 | wall_times.append(epoch_end_time - epoch_start_time) 162 | train_losses.append(train_loss) 163 | val_losses.append(val_loss) 164 | test_losses.append(test_loss) 165 | 166 | if np.isnan(np.array([train_loss, val_loss, test_loss])).any(): 167 | status = 'loss is nan!' 
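                    # If any split produces a NaN loss the run is aborted right here; the status
                    # string set above is written to the log JSON at the end of the script, and the
                    # analysis notebooks use it to separate healthy runs ('OK') from faulty ones.
                    # For reference, the losses printed above are mean cross-entropy in nats, so
                    # ppl = exp(loss) and bits-per-word = loss / ln(2); e.g. a loss of 4.6
                    # corresponds to a perplexity of roughly 99.5 and about 6.6 bpw.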
168 | break 169 | 170 | except KeyboardInterrupt: 171 | print('-' * 89) 172 | status = 'KeyboardInterrupt' 173 | print('Exiting from training early') 174 | except Exception as e: 175 | status = 'Exception: ' + str(e) 176 | print('Exception', e) 177 | 178 | log_stats['wall_times'] = wall_times 179 | log_stats['train_losses'] = train_losses 180 | log_stats['val_losses'] = val_losses 181 | log_stats['test_losses'] = test_losses 182 | log_stats['status'] = status 183 | 184 | json.dump(log_stats, open(os.path.join(main_args.logs_path, f'log_stats_model_{args.recepie_id}_{init_time}_{main_args.experiment_id}.json'), 'w')) 185 | torch.save(custom_model.state_dict(), os.path.join(main_args.logs_path, f'dump_weights_model_{args.recepie_id}_{init_time}_{main_args.experiment_id}.pt')) 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /make_arch_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "\n", 12 | "import json\n", 13 | "from tqdm.auto import tqdm\n", 14 | "from pathlib import Path\n", 15 | "import plotting\n", 16 | "import networkx as nx \n", 17 | "from joblib import Parallel, delayed\n", 18 | "import contextlib\n", 19 | "import random\n", 20 | "import numpy as np\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "from sklearn.manifold import TSNE\n", 23 | "from utils import make_graph\n", 24 | "import pandas as pd\n", 25 | "from copy import deepcopy\n", 26 | "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", 27 | "import hashlib" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution\n", 37 | "@contextlib.contextmanager\n", 38 | "def tqdm_joblib(tqdm_object):\n", 39 | " \"\"\"Context manager to patch joblib to report into tqdm progress bar given as argument\"\"\"\n", 40 | " class TqdmBatchCompletionCallback:\n", 41 | " def __init__(self, time, index, parallel):\n", 42 | " self.index = index\n", 43 | " self.parallel = parallel\n", 44 | "\n", 45 | " def __call__(self, index):\n", 46 | " tqdm_object.update()\n", 47 | " if self.parallel._original_iterator is not None:\n", 48 | " self.parallel.dispatch_next()\n", 49 | "\n", 50 | " old_batch_callback = joblib.parallel.BatchCompletionCallBack\n", 51 | " joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback\n", 52 | " try:\n", 53 | " yield tqdm_object\n", 54 | " finally:\n", 55 | " joblib.parallel.BatchCompletionCallBack = old_batch_callback\n", 56 | " tqdm_object.close() " 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "logs_path = Path('train_logs_single_run')\n", 66 | "logs_paths = list(logs_path.glob(\"*.json\"))\n", 67 | "\n", 68 | "logs_ok = []\n", 69 | "logs_not_ok = []\n", 70 | "for idx, log_path in tqdm(enumerate(logs_paths), total=len(logs_paths)):\n", 71 | " with open(log_path, \"r\") as f:\n", 72 | " log = json.load(f)\n", 73 | " recepie = json.loads(log['recepie'])\n", 74 | " log['recepie'] = recepie\n", 75 | " log['idx'] = idx\n", 76 | " \n", 77 | " if log['status'] == 'OK':\n", 78 | " logs_ok.append(log)\n", 
79 | " else:\n", 80 | " logs_not_ok.append(log)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "print(\"number ok: \", len(logs_ok))\n", 90 | "print(\"number not ok: \", len(logs_not_ok))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "len(logs_not_ok)/len(logs_ok)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def map_node(x):\n", 109 | " for v in ['node', 'h_prev', 'h_new']:\n", 110 | " if x.find(v) != -1:\n", 111 | " x = v\n", 112 | " if x not in ['x', 'node', 'h_prev', 'h_new']: # to make lstm and gru recepies standard\n", 113 | " x = 'node'\n", 114 | " return x" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def make_graph_2(recepie):\n", 124 | " \n", 125 | " G = nx.DiGraph()\n", 126 | " for n in recepie.keys():\n", 127 | " if n not in G.nodes():\n", 128 | " G.add_node(n)\n", 129 | " for k in recepie[n]['input']:\n", 130 | " if k not in G.nodes():\n", 131 | " G.add_node(k)\n", 132 | " G.add_edge(n, k, label=recepie[n]['op'])\n", 133 | " G.add_edge(k, n, label='rev_' + recepie[n]['op'])\n", 134 | " return G" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "def random_walk_features(G, steps=10):\n", 144 | " walk = []\n", 145 | " node = np.random.choice(G.nodes(), 1)[0]\n", 146 | " for _ in range(steps):\n", 147 | " k = np.random.choice(list(G.adj[node]), 1)[0]\n", 148 | " walk.extend([map_node(node), G.adj[node][k]['label']])\n", 149 | " node = k\n", 150 | " walk.append(map_node(node))\n", 151 | " return walk" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "def make_graph_sentences(G, sentences_num=20):\n", 161 | " sentences = []\n", 162 | " for _ in range(sentences_num):\n", 163 | " sentences.extend(random_walk_features(G) + ['.'])\n", 164 | " return sentences" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "def feature_extractor(log):\n", 174 | " name = f\"log_{log['idx']}\"\n", 175 | " recepie = log['recepie']\n", 176 | " G = make_graph_2(recepie)\n", 177 | " doc = TaggedDocument(words=make_graph_sentences(G), tags=[name])\n", 178 | " return doc" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "document_collections = Parallel(n_jobs=-2)(delayed(feature_extractor)(log) for log in tqdm(logs_not_ok+logs_ok))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# takes a while!!\n", 197 | "size = 10\n", 198 | "# size = 50\n", 199 | "doc2vec_model = Doc2Vec(document_collections, \n", 200 | " size=size, window=3, dm=1, min_count=0, workers=8, epochs=100, hs=1,\n", 201 | " dbow_words=0)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "all_words = set()\n", 211 | "for d in document_collections:\n", 
212 | " all_words |= set(d.words)\n", 213 | "len(all_words)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "not_dublicates_ok_logs = []\n", 223 | "not_dublicates_not_ok_logs = []\n", 224 | "not_dublicates_recepies = []\n", 225 | "for log in logs_not_ok+logs_ok:\n", 226 | " recepie = log['recepie']\n", 227 | " if recepie not in not_dublicates_recepies:\n", 228 | " not_dublicates_recepies.append(recepie)\n", 229 | " if log['status'] == 'OK':\n", 230 | " not_dublicates_ok_logs.append(log)\n", 231 | " else:\n", 232 | " not_dublicates_not_ok_logs.append(log)\n", 233 | "print(\"total: \", len(logs_not_ok+logs_ok))\n", 234 | "print(\"without dublicates: \", len(not_dublicates_ok_logs+not_dublicates_not_ok_logs))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "#dump vectors\n", 244 | "recepie_id_vectors = {log['recepie_id']:doc2vec_model.docvecs[f\"log_{log['idx']}\"]\n", 245 | " for log in not_dublicates_ok_logs+not_dublicates_not_ok_logs}\n" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "recepie_id_vectors_list = []\n", 255 | "for k in recepie_id_vectors:\n", 256 | " k_dict = {'recepie_id':k}\n", 257 | " for i in range(doc2vec_model.vector_size):\n", 258 | " k_dict[f'v{i:02d}'] = recepie_id_vectors[k][i]\n", 259 | " recepie_id_vectors_list.append(k_dict)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "df_recepie_vectors = pd.DataFrame(recepie_id_vectors_list)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "df_recepie_vectors.head()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "# if size == 50:\n", 294 | "# df_recepie_vectors.to_csv('data/doc2vec_features.csv', index=False)\n", 295 | "# elif size == 10:\n", 296 | "# df_recepie_vectors.to_csv('data/doc2vec_features_lowdim.csv', index=False)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "df_recepie_vectors = pd.read_csv('data/doc2vec_features.csv').set_index('recepie_id')" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "trainset_list = [\n", 329 | " (df_recepie_vectors.loc[log['recepie_id']], int(log['status'] == 'OK'))\n", 330 | " for log in not_dublicates_ok_logs+not_dublicates_not_ok_logs\n", 331 | "]\n", 332 | "\n", 333 | "random.shuffle(trainset_list)\n", 334 | "\n", 335 | "trainset_np, testset_np = np.array(trainset_list[:7000]), np.array(trainset_list[7000:])\n", 336 | "train_X, train_y = np.array(list(trainset_np[:, 
0])), np.array(list(trainset_np[:, 1]))\n", 337 | "test_X, test_y = np.array(list(testset_np[:, 0])), np.array(list(testset_np[:, 1]))\n", 338 | "\n", 339 | "print(\"Train:\", len(trainset_np))\n", 340 | "print(\"Test: \", len(testset_np))\n", 341 | "\n", 342 | "num_train_not_ok = len(train_y) - train_y.sum()\n", 343 | "print(\"\\nTrain OK: \", train_y.sum())\n", 344 | "print(\"Train not OK: \", num_train_not_ok)\n", 345 | "\n", 346 | "num_test_not_ok = len(test_y) - test_y.sum()\n", 347 | "print(\"\\nTest OK: \", test_y.sum())\n", 348 | "print(\"Test not OK: \", num_test_not_ok)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "tsne = TSNE(n_components=2, n_iter=300, verbose=True)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "E = tsne.fit_transform(train_X)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "plt.figure(figsize=(8, 8))\n", 376 | "plt.scatter(E[:, 0], E[:, 1], s=3, c=train_y)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "from xgboost import XGBClassifier\n", 386 | "from catboost import CatBoostClassifier\n", 387 | "from sklearn.metrics import f1_score, precision_recall_curve, average_precision_score, roc_curve, auc " 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "model = XGBClassifier(n_jobs=-1)\n", 397 | "model.fit(train_X, train_y)\n", 398 | "\n", 399 | "pred_y = model.predict_proba(test_X)\n", 400 | "precision_xgboost, recall_xgboost, _ = precision_recall_curve(test_y, pred_y[:, 1])\n", 401 | "fpr_xgboost, tpr_xgboost, _ = roc_curve(test_y, pred_y[:, 1])\n", 402 | "roc_auc_axboost = auc(fpr_xgboost, tpr_xgboost)\n", 403 | "\n", 404 | "f1_test_score = f1_score(test_y, np.argmax(pred_y, 1))\n", 405 | "ap_test_score = average_precision_score(test_y, pred_y[:, 1])\n", 406 | "print(\"XGBoost F1 score: \", f1_test_score)\n", 407 | "print(\"XGBoost Average Precision score: \", ap_test_score)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "from sklearn.linear_model import LogisticRegression\n", 417 | "\n", 418 | "clf = LogisticRegression().fit(train_X, train_y)\n", 419 | "pred_y = clf.predict_proba(test_X)\n", 420 | "precision_lr, recall_lr, _ = precision_recall_curve(test_y, pred_y[:,1])\n", 421 | "fpr_lr, tpr_lr, _ = roc_curve(test_y, pred_y[:, 1])\n", 422 | "roc_auc_lr = auc(fpr_lr, tpr_lr)\n", 423 | "\n", 424 | "f1_test_score = f1_score(test_y, np.argmax(pred_y, 1))\n", 425 | "ap_test_score = average_precision_score(test_y, pred_y[:, 1])\n", 426 | "print(\"Logistic Regression F1 score: \", f1_test_score)\n", 427 | "print(\"CatBoost Average Precision score: \", ap_test_score)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "#plt.title('Receiver Operating Characteristic')\n", 437 | "plt.figure(figsize=(6, 6))\n", 438 | "plt.plot(fpr_xgboost, tpr_xgboost, label='XGBoost AUC = %0.2f' % roc_auc_axboost)\n", 439 | "plt.plot(fpr_lr, tpr_lr, label='Logistic Regression AUC 
= %0.2f' % roc_auc_lr)\n", 440 | "plt.legend(loc = 'lower right')\n", 441 | "plt.plot([0, 1], [0, 1],'k--')\n", 442 | "plt.xlim([0, 1])\n", 443 | "plt.ylim([0, 1])\n", 444 | "plt.ylabel('True Positive Rate', fontsize=16)\n", 445 | "plt.xlabel('False Positive Rate', fontsize=16)\n", 446 | "\n", 447 | "plt.savefig('data/figures/prediction_faulty.png', dpi=300, bbox_inches='tight')" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "trainset_list = [\n", 457 | " (doc2vec_model.docvecs[f\"log_{log['idx']}\"], np.array(log['val_losses']).min())\n", 458 | " for log in not_dublicates_ok_logs\n", 459 | "]\n", 460 | "\n", 461 | "\n", 462 | "np.random.seed(0)\n", 463 | "random.shuffle(trainset_list)\n", 464 | "\n", 465 | "trainset_np, testset_np = np.array(trainset_list[:7000]), np.array(trainset_list[7000:])\n", 466 | "train_X, train_y = np.array(list(trainset_np[:, 0])), np.array(list(trainset_np[:, 1]))\n", 467 | "test_X, test_y = np.array(list(testset_np[:, 0])), np.array(list(testset_np[:, 1]))\n", 468 | "\n", 469 | "print(\"Train:\", len(trainset_np))\n", 470 | "print(\"Test: \", len(testset_np))\n", 471 | "\n", 472 | "num_train_not_ok = len(train_y) - train_y.sum()\n", 473 | "print(\"\\nTrain OK: \", train_y.sum())\n", 474 | "print(\"Train not OK: \", num_train_not_ok)\n", 475 | "\n", 476 | "num_test_not_ok = len(test_y) - test_y.sum()\n", 477 | "print(\"\\nTest OK: \", test_y.sum())\n", 478 | "print(\"Test not OK: \", num_test_not_ok)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "plt.figure(figsize=(8, 8))\n", 488 | "plt.scatter(E[:, 0], E[:, 1], s=3, c=train_y, cmap=plt.cm.plasma_r)\n", 489 | "plt.colorbar()\n", 490 | "plt.clim([4.5, 6])" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "train_y" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "(train_y > 6).mean()" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "plt.figure(figsize=(8, 8))\n", 518 | "plt.scatter(E[:, 0], E[:, 1], s=3, color='C0')\n", 519 | "sub_inds = np.where(train_y > 6)[0]\n", 520 | "plt.scatter(E[sub_inds, 0], E[sub_inds, 1], s=5, color='red')" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "model = XGBClassifier(n_jobs=-1)\n", 530 | "model.fit(train_X, train_y > 6)\n", 531 | "\n", 532 | "pred_y = model.predict_proba(test_X)\n", 533 | "precision_xgboost, recall_xgboost, _ = precision_recall_curve(test_y > 6, pred_y[:, 1])\n", 534 | "fpr_xgboost, tpr_xgboost, _ = roc_curve(test_y > 6, pred_y[:, 1])\n", 535 | "roc_auc_axboost = auc(fpr_xgboost, tpr_xgboost)\n", 536 | "\n", 537 | "f1_test_score = f1_score(test_y > 6, np.argmax(pred_y, 1))\n", 538 | "ap_test_score = average_precision_score(test_y > 6, pred_y[:, 1])\n", 539 | "print(\"XGBoost F1 score: \", f1_test_score)\n", 540 | "print(\"XGBoost Average Precision score: \", ap_test_score)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "#plt.title('Receiver Operating 
Characteristic')\n", 550 | "plt.figure(figsize=(6, 6))\n", 551 | "plt.plot(fpr_xgboost, tpr_xgboost, label='XGBoost AUC = %0.2f' % roc_auc_axboost)\n", 552 | "plt.legend(loc = 'lower right')\n", 553 | "plt.plot([0, 1], [0, 1],'k--')\n", 554 | "plt.xlim([0, 1])\n", 555 | "plt.ylim([0, 1])\n", 556 | "plt.ylabel('True Positive Rate', fontsize=16)\n", 557 | "plt.xlabel('False Positive Rate', fontsize=16)\n" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "from xgboost import XGBRegressor\n", 574 | "from sklearn.ensemble import BaggingRegressor\n", 575 | "from sklearn.metrics import r2_score\n", 576 | "\n", 577 | "regr = BaggingRegressor(XGBRegressor(n_estimators=100, max_depth=15), n_jobs=10, n_estimators=20, max_samples=0.5).fit(train_X, train_y)\n", 578 | "regr_6 = BaggingRegressor(XGBRegressor(n_estimators=100, max_depth=15), n_jobs=10, n_estimators=20, max_samples=0.5).fit(train_X[train_y < 6], train_y[train_y < 6])\n", 579 | "pred_y = regr.predict(test_X)\n", 580 | "pred_y_6 = regr_6.predict(test_X)\n" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "plt.figure(figsize=(6, 6))\n", 590 | "plt.scatter(test_y, pred_y, s=1)\n", 591 | "plt.xlabel('Testing log perplexity', fontsize=16)\n", 592 | "plt.ylabel('Predicted testing log perplexity', fontsize=16)\n", 593 | "plt.xlim([4.5, 7])\n", 594 | "plt.ylim([4.5, 7])\n", 595 | "plt.savefig('data/figures/prediction_loss.png', dpi=300, bbox_inches='tight')" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "r2_score(test_y, pred_y)" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "r2_score(test_y, pred_y_6)" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "r2_score(test_y[test_y < 6], pred_y[test_y < 6])" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "r2_score(test_y[test_y < 6], pred_y_6[test_y < 6])" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [] 661 | } 662 | ], 663 | "metadata": { 664 | "kernelspec": { 665 | "display_name": "Python 3", 666 | "language": "python", 667 | "name": "python3" 668 | }, 669 | "language_info": { 670 | "codemirror_mode": { 671 | "name": "ipython", 672 | "version": 3 673 | }, 674 | "file_extension": ".py", 675 | "mimetype": "text/x-python", 676 | "name": "python", 677 | "nbconvert_exporter": "python", 678 | "pygments_lexer": "ipython3", 679 | "version": "3.6.10" 680 | } 681 | }, 682 | 
"nbformat": 4, 683 | "nbformat_minor": 2 684 | } 685 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | 4 | from embed_regularize import embedded_dropout 5 | from locked_dropout import LockedDropout 6 | from weight_drop import WeightDrop, ParameterListWeightDrop 7 | 8 | from custom_rnn import CustomRNN 9 | 10 | import json 11 | import numpy as np 12 | 13 | class AWDRNNModel(torch.nn.Module): 14 | """Container module with an encoder, a recurrent module, and a decoder.""" 15 | 16 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, 17 | dropout=0.5, dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0, tie_weights=False, 18 | recepie=None, verbose=True): 19 | super(AWDRNNModel, self).__init__() 20 | self.lockdrop = LockedDropout() 21 | self.idrop = torch.nn.Dropout(dropouti) 22 | self.hdrop = torch.nn.Dropout(dropouth) 23 | self.drop = torch.nn.Dropout(dropout) 24 | self.encoder = torch.nn.Embedding(ntoken, ninp) 25 | self.wdrop = wdrop 26 | self.verbose = verbose 27 | 28 | if recepie is not None: 29 | recepie = json.loads(recepie) 30 | 31 | self.rnns = [] 32 | for i in range(nlayers): 33 | input_size = ninp if i == 0 else nhid 34 | hidden_size = nhid if i != nlayers - 1 else (ninp if tie_weights else nhid) 35 | if rnn_type == 'LSTM': 36 | self.rnns.append(torch.nn.LSTM(input_size, hidden_size)) 37 | elif rnn_type == 'CustomRNN': 38 | self.rnns.append(CustomRNN(input_size, hidden_size, recepie)) 39 | 40 | if wdrop: 41 | if rnn_type == 'LSTM': 42 | self.rnns = [WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop) for rnn in self.rnns] 43 | elif rnn_type == 'CustomRNN': 44 | wd_rnns = [] 45 | for rnn in self.rnns: 46 | multilinear_components = [] 47 | for k, v in rnn.cell.components.items(): 48 | if rnn.cell.recepie[k]['op'] == 'linear': 49 | for i in np.where(np.array(rnn.cell.recepie[k]['input']) != 'x')[0]: 50 | multilinear_components.append(f'cell.components.{k}.weights.{i}') 51 | wd_rnns.append(ParameterListWeightDrop(rnn, multilinear_components, dropout=wdrop)) 52 | self.rnns = wd_rnns 53 | 54 | if self.verbose: 55 | print(self.rnns) 56 | self.rnns = torch.nn.ModuleList(self.rnns) 57 | self.decoder = torch.nn.Linear(nhid, ntoken) 58 | 59 | if tie_weights: 60 | self.decoder.weight = self.encoder.weight 61 | 62 | self.init_weights() 63 | 64 | self.rnn_type = rnn_type 65 | self.ninp = ninp 66 | self.nhid = nhid 67 | self.nlayers = nlayers 68 | self.dropout = dropout 69 | self.dropouti = dropouti 70 | self.dropouth = dropouth 71 | self.dropoute = dropoute 72 | self.tie_weights = tie_weights 73 | self.recepie = recepie 74 | 75 | def reset(self): 76 | pass 77 | 78 | def init_weights(self): 79 | initrange = 0.1 80 | self.encoder.weight.data.uniform_(-initrange, initrange) 81 | self.decoder.bias.data.fill_(0) 82 | self.decoder.weight.data.uniform_(-initrange, initrange) 83 | 84 | def forward(self, input, hidden, return_h=False): 85 | emb = embedded_dropout(self.encoder, input, dropout=self.dropoute if self.training else 0) 86 | #emb = self.idrop(emb) 87 | 88 | emb = self.lockdrop(emb, self.dropouti) 89 | 90 | raw_output = emb 91 | new_hidden = [] 92 | raw_outputs = [] 93 | outputs = [] 94 | for i, rnn in enumerate(self.rnns): 95 | raw_output, new_h = rnn(raw_output, hidden[i]) 96 | new_hidden.append(new_h) 97 | raw_outputs.append(raw_output) 98 | if i != self.nlayers - 1: 99 | #self.hdrop(raw_output) add??? 
100 | raw_output = self.lockdrop(raw_output, self.dropouth) 101 | outputs.append(raw_output) 102 | hidden = new_hidden 103 | 104 | output = self.lockdrop(raw_output, self.dropout) 105 | outputs.append(output) 106 | result = output.view(output.size(0)*output.size(1), output.size(2)) 107 | if return_h: 108 | return result, hidden, raw_outputs, outputs 109 | return result, hidden 110 | 111 | def init_hidden(self, bsz): 112 | weight = next(self.parameters()).data 113 | hidden = [] 114 | for i in range(self.nlayers): 115 | if self.rnn_type == 'LSTM': 116 | hidden_tuple_size = 2 117 | elif self.rnn_type == 'CustomRNN': 118 | if self.wdrop: 119 | # wrapped with ParameterListWeightDrop 120 | hidden_tuple_size = self.rnns[0].module.cell.hidden_tuple_size 121 | else: 122 | hidden_tuple_size = self.rnns[0].cell.hidden_tuple_size 123 | hidden_size = self.nhid if i != self.nlayers - 1 else (self.ninp if self.tie_weights else self.nhid) 124 | hidden.append(tuple([weight.new(1, bsz, hidden_size).zero_() for _ in range(hidden_tuple_size)])) 125 | 126 | return hidden -------------------------------------------------------------------------------- /models_weights/dump_weights_model_2226_2020-04-18_07-35-19_999938929.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/models_weights/dump_weights_model_2226_2020-04-18_07-35-19_999938929.pt -------------------------------------------------------------------------------- /multilinear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | import torch.nn.functional as F 4 | 5 | import math 6 | 7 | class MultiLinear(torch.nn.Module): 8 | 9 | def __init__(self, input_sizes, output_size): 10 | super(MultiLinear, self).__init__() 11 | self.input_sizes = input_sizes 12 | self.output_size = output_size 13 | 14 | weights = [] 15 | for input_size in input_sizes: 16 | weights.append(torch.nn.Parameter(torch.Tensor(output_size, input_size))) 17 | self.weights = torch.nn.ParameterList(weights) 18 | 19 | self.bias = torch.nn.Parameter(torch.Tensor(output_size)) 20 | 21 | self.reset_parameters() 22 | 23 | def reset_parameters(self): 24 | for i in range(len(self.weights)): 25 | torch.nn.init.kaiming_uniform_(self.weights[i], a=math.sqrt(5)) 26 | 27 | fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weights[0]) 28 | bound = 1 / math.sqrt(fan_in) 29 | torch.nn.init.uniform_(self.bias, -bound, bound) 30 | 31 | def forward(self, *inputs): 32 | result = F.linear(inputs[0], self.weights[0], self.bias) 33 | for i in range(1, len(self.weights)): 34 | result = result + F.linear(inputs[i], self.weights[i]) 35 | return result 36 | 37 | def extra_repr(self): 38 | return 'input_sizes={}, output_size={}'.format( 39 | self.input_sizes, self.output_size 40 | ) 41 | -------------------------------------------------------------------------------- /nas_environment.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | 5 | 6 | class Environment: 7 | ''' 8 | Simulates NAS environment. Architecutres can be trained for a specified amount of epochs. 9 | Tarining results are cached, that is, training the same model for larer epochs 10 | will be timed as a continuation from the model's checkpoint. 
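    A minimal usage sketch (the directory name and epoch count below are illustrative;
    the directory is expected to contain unpacked *.json training logs):

        env = Environment('train_logs_single_run')
        env.reset()
        arch = env.get_precomputed_recepies()[0]
        env.simulated_train(arch, max_epoch=10)
        stats = env.get_model_stats(arch, 10)   # train/val/test loss, wall_time, cur_epoch, status
        spent = env.get_total_time()            # total simulated wall-clock time so far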
11 | ''' 12 | def __init__(self, logs_dir): 13 | self._logs = [] 14 | self._arch_to_id = {} 15 | 16 | arch_id = 0 17 | for i, filename in enumerate(os.listdir(logs_dir)): 18 | if filename.endswith('.json'): 19 | log_path = os.path.join(logs_dir, filename) 20 | x = json.load(open(log_path, 'r')) 21 | self._logs.append(x) 22 | assert x['recepie'] not in self._arch_to_id 23 | self._arch_to_id[x['recepie']] = arch_id 24 | arch_id += 1 25 | 26 | self._training_states = {} 27 | 28 | def get_total_time(self): 29 | return sum([x['wall_time'] for x in self._training_states.values()]) 30 | 31 | 32 | def get_best_possible_test_loss(self): 33 | min_loss = np.inf 34 | for log in self._logs: 35 | if len(log['test_losses']) > 0: 36 | cur_loss = np.nanmin(log['test_losses']) 37 | if cur_loss < min_loss: 38 | min_loss = cur_loss 39 | return min_loss 40 | 41 | def get_test_loss_of_the_best_validated_architecture(self): 42 | return self._logs[self.best_arch_id]['test_losses'][self.best_arch_epoch] 43 | 44 | def get_precomputed_recepies(self): 45 | return [json.loads(x['recepie']) for x in self._logs] 46 | 47 | def get_recepie_ids(self): 48 | return [x['recepie_id'] for x in self._logs] 49 | 50 | def reset(self): 51 | self.best_arch_id = -1 52 | self.best_arch_epoch = -1 53 | self._training_states = {} 54 | 55 | def _make_state_dict(self, arch_id, epoch): 56 | state_dict = {f'{phase}_loss':self._logs[arch_id][f'{phase}_losses'][epoch] if epoch >= 0 else np.nan 57 | for phase in ['train', 'val', 'test']} 58 | state_dict['wall_time'] = np.sum(self._logs[arch_id]['wall_times'][:epoch]) 59 | state_dict['cur_epoch'] = epoch 60 | state_dict['status'] = 'OK' if epoch < len(self._logs[arch_id]['train_losses']) - 1 else self._logs[arch_id]['status'] 61 | return state_dict 62 | 63 | def simulated_train(self, arch, max_epoch): 64 | arch_id = self._arch_to_id[json.dumps(arch)] 65 | if (arch_id not in self._training_states) or (max_epoch > self._training_states[arch_id]['cur_epoch']): 66 | max_epoch = min([max_epoch, len(self._logs[arch_id]['train_losses']) - 1]) 67 | self._training_states[arch_id] = self._make_state_dict(arch_id, max_epoch) 68 | 69 | # update best result 70 | val_losses = self._logs[arch_id]['val_losses'][:self._training_states[arch_id]['cur_epoch'] + 1] 71 | if np.sum(~np.isnan(val_losses)) > 0: 72 | cur_best_epoch = np.nanargmin(val_losses) 73 | if (self.best_arch_id == -1) or\ 74 | (self._logs[self.best_arch_id]['val_losses'][self.best_arch_epoch] > val_losses[cur_best_epoch]): 75 | self.best_arch_id = arch_id 76 | self.best_arch_epoch = cur_best_epoch 77 | 78 | def get_model_status(self, arch): 79 | arch_id = self._arch_to_id[json.dumps(arch)] 80 | return self._training_states[arch_id]['status'] 81 | 82 | def get_model_stats(self, arch, epoch): 83 | arch_id = self._arch_to_id[json.dumps(arch)] 84 | if self._training_states[arch_id]['cur_epoch'] < epoch: 85 | raise Exception('Required epoch exceeds current training epochs.') 86 | 87 | return self._make_state_dict(arch_id, epoch) -------------------------------------------------------------------------------- /plotting.py: -------------------------------------------------------------------------------- 1 | import pygraphviz as pgv 2 | from IPython.display import Image 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def plot_recepie(recepie, dpi=100): 7 | graph = pgv.AGraph(directed=True, strict=True, 8 | fontname='Helvetica', arrowtype='open') 9 | 10 | node_color = {'x':'forestgreen', 11 | 'h_prev_0':'orange', 12 | 'h_new_0':'orange', 13 | 
'h_prev_1':'cyan', 14 | 'h_new_1':'cyan', 15 | 'h_prev_2':'purple', 16 | 'h_new_2':'purple'} 17 | 18 | blend_i_to_color = {1:'blue3', 19 | 2:'brown3'} 20 | 21 | nodes_dict = {} 22 | for k in recepie.keys(): 23 | if k not in nodes_dict: 24 | graph.add_node(len(nodes_dict), label=recepie[k]['op'] + ':\n' + k, 25 | fillcolor=node_color.get(k, 'white'), style='filled') 26 | nodes_dict[k] = len(nodes_dict) 27 | for k in recepie.keys(): 28 | for i, n in enumerate(recepie[k]['input']): 29 | if n not in nodes_dict: 30 | graph.add_node(len(nodes_dict), label=n, 31 | fillcolor=node_color.get(n, 'white'), style='filled') 32 | nodes_dict[n] = len(nodes_dict) 33 | #print(nodes_dict[k], nodes_dict[n]) 34 | if recepie[k]['op'] != 'blend': 35 | graph.add_edge(nodes_dict[n], nodes_dict[k]) 36 | else: 37 | if i == 0: 38 | graph.add_edge(nodes_dict[n], nodes_dict[k], style='dashed') 39 | else: 40 | graph.add_edge(nodes_dict[n], nodes_dict[k], color=blend_i_to_color[i]) 41 | 42 | return Image(graph.draw(format='png', prog='dot', args=f'-Gdpi={dpi} -Nfontsize=8')) -------------------------------------------------------------------------------- /reproduce_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torch.nn\n", 11 | "import torch.optim\n", 12 | "import torch.utils.data\n", 13 | "import torch.nn.functional as F\n", 14 | "from splitcross import SplitCrossEntropyLoss\n", 15 | "\n", 16 | "import numpy as np\n", 17 | "import networkx as nx\n", 18 | "import math\n", 19 | "import json\n", 20 | "import time\n", 21 | "\n", 22 | "import data\n", 23 | "import os\n", 24 | "from utils import batchify\n", 25 | "from argparse import Namespace\n", 26 | "from model import AWDRNNModel\n", 27 | "from train import train, evaluate\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "suffix = '2226_2020-04-18_07-35-19_999938929'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "log = json.load(open('train_logs_multi_runs/log_stats_model_100' + suffix + '.json', 'r'))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "args = Namespace(**log)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "corpus = data.Corpus(args.data)\n", 65 | "cuda = 'cuda'\n", 66 | "\n", 67 | "train_data = batchify(corpus.train, args.batch_size, args, cuda)\n", 68 | "train_eval_data = batchify(corpus.train, args.eval_batch_size, args, cuda)\n", 69 | "val_data = batchify(corpus.valid, args.eval_batch_size, args, cuda)\n", 70 | "test_data = batchify(corpus.test, args.eval_batch_size, args, cuda)\n", 71 | "\n", 72 | "ntokens = len(corpus.dictionary)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "custom_model = AWDRNNModel(args.model, \n", 82 | " ntokens, \n", 83 | " args.emsize, \n", 84 | " args.nhid, \n", 85 | " args.nlayers, \n", 86 | " args.dropout, \n", 87 | " args.dropouth, \n", 88 | " args.dropouti, \n", 89 | " args.dropoute, \n", 90 | " args.wdrop, \n", 91 | " 
args.tied,\n", 92 | " args.recepie,\n", 93 | " verbose=False)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "custom_model.load_state_dict(torch.load('models_weights/dump_weights_model_' + suffix + '.pt'))" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "custom_model.to(cuda);" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "criterion = SplitCrossEntropyLoss(args.emsize, splits=[], verbose=False)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "train_loss = evaluate(custom_model, criterion, train_eval_data, args.eval_batch_size, args)\n", 130 | "val_loss = evaluate(custom_model, criterion, val_data, args.eval_batch_size, args)\n", 131 | "test_loss = evaluate(custom_model, criterion, test_data, args.eval_batch_size, args)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "print('-' * 89)\n", 141 | "print('train loss {:5.4f} | '\n", 142 | " 'train ppl {:8.2f} | train bpw {:8.3f} |\\n| valid loss {:5.4f} | '\n", 143 | " 'valid ppl {:8.2f} | valid bpw {:8.3f} |\\n| test loss {:5.4f} | '\n", 144 | " 'test ppl {:8.2f} | test bpw {:8.3f} |'.format(\n", 145 | " train_loss, math.exp(train_loss), train_loss / math.log(2),\n", 146 | " val_loss, math.exp(val_loss), val_loss / math.log(2),\n", 147 | " test_loss, math.exp(test_loss), test_loss / math.log(2)))\n", 148 | "print('-' * 89)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "print('logged train loss', log['train_losses'][-1])\n", 158 | "print('logged valid loss', log['val_losses'][-1])\n", 159 | "print('logged test loss', log['test_losses'][-1])" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.6.10" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.8.1 2 | alignment==1.0.10 3 | asn1crypto==0.24.0 4 | astor==0.8.0 5 | atomicwrites==1.1.5 6 | attrs==18.1.0 7 | backcall==0.1.0 8 | bleach==2.1.3 9 | blis==0.2.4 10 | bokeh==0.12.15 11 | boto==2.48.0 12 | boto3==1.5.32 13 | botocore==1.8.46 14 | boxsdk==1.5.5 15 | bpemb==0.3.0 16 | bz2file==0.98 17 | catboost==0.8.1.1 18 | cchardet==2.1.4 19 | certifi==2020.4.5.1 20 | cffi==1.11.5 21 | chainer==6.0.0 22 | chardet==3.0.4 23 | click==6.7 24 | climin==0.1a1 25 | cloudpickle==0.5.3 26 | colorama==0.4.1 27 | colorlog==4.0.2 28 | colormap==1.0.2 29 | 
conda==4.8.3 30 | conda-package-handling==1.6.0 31 | config-pkg==1.1.4 32 | ConfigSpace==0.4.13 33 | cplxmodule==0.8.0 34 | cryptography==2.6.1 35 | cssselect==1.0.3 36 | cupy==7.0.0a1 37 | cupy-cuda90==7.0.0a1 38 | cycler==0.10.0 39 | cymem==2.0.2 40 | Cython==0.29.13 41 | cytoolz==0.9.0.1 42 | dask==0.15.2 43 | decorator==4.4.0 44 | Deprecated==1.2.5 45 | dill==0.2.9 46 | distributed==1.18.3 47 | dlispy==0.0.2 48 | docutils==0.14 49 | -e git+https://github.com/fmsnew/DPDall.git@92be3038315d7c8470ec1375da24fb478cfc22cf#egg=dpd_project 50 | easydev==0.9.37 51 | editdistance==0.4 52 | eli5==0.8.1 53 | email-reply-parser==0.5.9 54 | emcee==3.0.2 55 | en-core-web-sm==2.0.0 56 | entrypoints==0.2.3 57 | et-xmlfile==1.0.1 58 | fastdtw==0.3.2 59 | fastprogress==0.1.20 60 | fastrlock==0.4 61 | fasttext==0.9.1 62 | filelock==3.0.12 63 | flair==0.4.2 64 | Flask==1.0.3 65 | ftfy==5.6 66 | future==0.16.0 67 | gast==0.3.2 68 | gdown==3.8.3 69 | gensim==3.4.0 70 | Glances==3.1.0 71 | GPy==1.9.6 72 | graphviz==0.10.1 73 | grpcio==1.24.1 74 | h5py==2.7.1 75 | HeapDict==1.0.0 76 | HetMOGP==0.1 77 | html2text==2018.1.9 78 | html5lib==1.0.1 79 | hyperopt==0.1.2 80 | idna==2.8 81 | ijson==2.3 82 | imageio==2.5.0 83 | implicit==0.3.6 84 | importlib-metadata==0.18 85 | inspyred==1.0.1 86 | interruptingcow==0.8 87 | ipaddress==1.0.22 88 | ipdb==0.12 89 | ipykernel==4.8.2 90 | ipython==6.3.1 91 | ipython-genutils==0.2.0 92 | ipywidgets==7.2.1 93 | itsdangerous==1.1.0 94 | jdcal==1.4 95 | jedi==0.12.0 96 | jellyfish==0.7.2 97 | Jinja2==2.10 98 | jmespath==0.9.3 99 | joblib==0.12.5 100 | jsonschema==3.0.1 101 | jupyter==1.0.0 102 | jupyter-client==5.2.3 103 | jupyter-console==5.2.0 104 | jupyter-core==4.4.0 105 | jupyterthemes==0.20.0 106 | Keras-Applications==1.0.8 107 | Keras-Preprocessing==1.1.0 108 | kiwisolver==1.0.1 109 | lasio==0.21 110 | lazy-import==0.2.2 111 | lesscpy==0.13.0 112 | lightgbm==2.1.0 113 | line-profiler==2.1.2 114 | lockfile==0.12.2 115 | luigi==2.7.8 116 | lxml==4.3.3 117 | Mako==1.0.7 118 | Markdown==3.1.1 119 | MarkupSafe==1.0 120 | matplotlib==3.0.3 121 | mistune==0.8.3 122 | mkl-fft==1.0.10 123 | mkl-random==1.0.2 124 | mkl-service==2.0.2 125 | mock==3.0.5 126 | more-itertools==4.3.0 127 | mpld3==0.3 128 | msgpack==0.5.6 129 | msgpack-numpy==0.4.3.2 130 | msgpack-python==0.5.4 131 | murmurhash==1.0.2 132 | nasbench==1.0 133 | nasbench-encoder==0.0.1 134 | nasbench-pytorch==0.0.0 135 | nbconvert==5.3.1 136 | nbformat==4.4.0 137 | networkx==2.4rc1.dev20190610154137 138 | nltk==3.4.1 139 | notebook==5.7.2 140 | numpy==1.17.3 141 | olefile==0.45.1 142 | openmdao==2.8.0 143 | openpyxl==2.5.4 144 | ordereddict==1.1 145 | packaging==17.1 146 | pandas==0.24.2 147 | pandocfilters==1.4.2 148 | paramz==0.9.4 149 | parso==0.2.0 150 | patsy==0.5.0 151 | pexpect==4.5.0 152 | pickleshare==0.7.4 153 | Pillow==7.1.2 154 | pke==1.8 155 | plac==0.9.6 156 | plotly==2.5.1 157 | pluggy==0.7.1 158 | ply==3.11 159 | pmlb==0.3 160 | preshed==2.0.1 161 | progressbar==2.5 162 | prometheus-client==0.5.0 163 | prompt-toolkit==1.0.15 164 | protobuf==3.7.1 165 | psutil==5.4.6 166 | psycopg2==2.8.4 167 | ptyprocess==0.5.2 168 | py==1.5.4 169 | pyaml==20.4.0 170 | pybind11==2.4.3 171 | pycosat==0.6.3 172 | pycparser==2.18 173 | pyDOE==0.3.8 174 | pyDOE2==1.2.1 175 | Pygments==2.2.0 176 | pygpu==0.7.5 177 | pygraphviz==1.5 178 | PyJWT==1.6.4 179 | pyKriging==0.2.0 180 | pymc3==3.3 181 | pymongo==3.8.0 182 | pymystem3==0.1.9 183 | pynisher==0.5.0 184 | pyOpenSSL==17.5.0 185 | pyparsing==2.2.0 186 | pyrfr==0.8.0 187 | 
pyrsistent==0.15.2 188 | PySocks==1.6.8 189 | pytest==3.7.1 190 | python-daemon==2.2.0 191 | python-dateutil==2.8.0 192 | python-dotenv==0.9.1 193 | pytorch-pretrained-bert==0.6.2 194 | pytz==2019.1 195 | PyWavelets==1.1.1 196 | pywt==1.0.6 197 | PyYAML==3.12 198 | pyzmq==17.0.0 199 | qtconsole==4.3.1 200 | quotequail==0.2.3 201 | rake-nltk==1.0.4 202 | regex==2018.1.10 203 | requests==2.22.0 204 | requests-toolbelt==0.8.0 205 | ruamel-yaml==0.15.35 206 | s3transfer==0.1.13 207 | sacremoses==0.0.35 208 | schedule==0.5.0 209 | scikit-image==0.17.2 210 | scikit-learn==0.23.1 211 | scikit-optimize==0.7.4 212 | scipy==1.2.1 213 | seaborn==0.8.1 214 | seafileapi==0.1.2 215 | segtok==1.5.7 216 | Send2Trash==1.5.0 217 | sentencepiece==0.1.82 218 | simplegeneric==0.8.1 219 | singledispatch==3.4.0.3 220 | six==1.11.0 221 | sklearn==0.0 222 | smac==0.12.2 223 | smart-open==1.5.7 224 | sobol-seq==0.2.0 225 | sortedcontainers==2.0.4 226 | spacy==2.0.18 227 | SQLAlchemy==1.3.16 228 | sqlitedict==1.6.0 229 | srsly==0.0.5 230 | statsmodels==0.8.0 231 | tabulate==0.8.3 232 | talon==1.4.4 233 | tblib==1.3.2 234 | tensorboard==1.13.1 235 | tensorboardX==1.6 236 | tensorflow==1.13.1 237 | tensorflow-estimator==1.13.0 238 | termcolor==1.1.0 239 | terminado==0.8.1 240 | testpath==0.3.1 241 | Theano==1.0.1+2.gcd195ed28 242 | thinc==6.12.1 243 | thinc-gpu-ops==0.0.4 244 | threadpoolctl==2.1.0 245 | tifffile==2020.5.25 246 | toolz==0.9.0 247 | torch==1.1.0 248 | torchgan==0.0.4 249 | torchvision==0.3.0 250 | tornado==4.5.3 251 | tqdm==4.32.1 252 | traitlets==4.3.2 253 | transformers==2.2.0 254 | transliterate==1.10.2 255 | tsfresh==0.11.0 256 | typing==3.6.4 257 | typing-extensions==3.7.2 258 | ujson==1.35 259 | Unidecode==1.1.1 260 | urllib3==1.24.3 261 | virtualenv==16.7.9 262 | wasabi==0.2.2 263 | wcwidth==0.1.7 264 | webencodings==0.5.1 265 | Werkzeug==0.15.4 266 | widgetsnbextension==3.2.1 267 | wrapt==1.10.11 268 | xgboost==0.71 269 | xlrd==1.1.0 270 | xmltodict==0.12.0 271 | yake==0.4.1 272 | zict==0.1.3 273 | zipp==0.5.1 274 | -------------------------------------------------------------------------------- /search_space.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RecepieGenerator: 5 | 6 | def __init__( 7 | self, 8 | hidden_tuple_size=2, 9 | intermediate_vertices=7, 10 | main_operations = ['linear', 'blend', 'elementwise_prod', 'elementwise_sum'], 11 | main_weights = [3., 1., 1., 1.], 12 | activations = ['activation_tanh', 'activation_sigm', 'activation_leaky_relu'], 13 | activation_weights = [1., 1., 1.], 14 | linear_connections = [2, 3], 15 | linear_connections_weights = [4, 1] 16 | ): 17 | self.hidden_tuple_size = hidden_tuple_size 18 | self.intermediate_vertices = intermediate_vertices 19 | self.main_operations = main_operations 20 | self.main_probabilities = np.array(main_weights)/np.sum(main_weights) 21 | self.activations = activations 22 | self.activation_probabilities = np.array(activation_weights)/np.sum(activation_weights) 23 | self.linear_connections = linear_connections 24 | self.linear_connections_probabilities = np.array(linear_connections_weights)/np.sum(linear_connections_weights) 25 | 26 | def _generate_redundant_graph(self, recepie, base_nodes): 27 | i = 0 28 | activation_nodes = [] 29 | while i < self.hidden_tuple_size + self.intermediate_vertices: 30 | op = np.random.choice(self.main_operations, 1, p=self.main_probabilities)[0] 31 | if op == 'linear': 32 | num_connections = 
np.random.choice(self.linear_connections, 1, 33 | p=self.linear_connections_probabilities)[0] 34 | connection_candidates = base_nodes + activation_nodes 35 | if num_connections > len(connection_candidates): 36 | num_connections = len(connection_candidates) 37 | 38 | connections = np.random.choice(connection_candidates, num_connections, replace=False) 39 | recepie[f'node_{i}'] = {'op':op, 'input':connections} 40 | i += 1 41 | 42 | # after linear force add activation node tied to the new node, if possible (nodes budget) 43 | op = np.random.choice(self.activations, 1, p=self.activation_probabilities)[0] 44 | recepie[f'node_{i}'] = {'op':op, 'input':[f'node_{i - 1}']} 45 | activation_nodes.append(f'node_{i}') 46 | i += 1 47 | 48 | elif op in ['blend', 'elementwise_prod', 'elementwise_sum']: 49 | # inputs must exclude x 50 | if op == 'blend': 51 | num_connections = 3 52 | else: 53 | num_connections = 2 54 | connection_candidates = list(set(base_nodes) - set('x')) + list(recepie.keys()) 55 | if num_connections <= len(connection_candidates): 56 | connections = np.random.choice(connection_candidates, num_connections, replace=False) 57 | recepie[f'node_{i}'] = {'op':op, 'input':connections} 58 | i += 1 59 | 60 | def _create_hidden_nodes(self, recepie): 61 | new_hiddens_map = {} 62 | for k in np.random.choice(list(recepie.keys()), self.hidden_tuple_size, replace=False): 63 | new_hiddens_map[k] = f'h_new_{len(new_hiddens_map)}' 64 | 65 | for k in new_hiddens_map: 66 | recepie[new_hiddens_map[k]] = recepie[k] 67 | del recepie[k] 68 | 69 | for k in recepie: 70 | recepie[k]['input'] = [new_hiddens_map.get(x, x) for x in recepie[k]['input']] 71 | 72 | def _remove_redundant_nodes(self, recepie): 73 | q = [f'h_new_{i}' for i in range(self.hidden_tuple_size)] 74 | visited = set(q) 75 | while len(q) > 0: 76 | if q[0] in recepie: 77 | for node in recepie[q[0]]['input']: 78 | if node not in visited: 79 | q.append(node) 80 | visited.add(node) 81 | q = q[1:] 82 | 83 | for k in list(recepie.keys()): 84 | if k not in visited: 85 | del recepie[k] 86 | 87 | return visited 88 | 89 | def generate_random_recepie(self, seed=None): 90 | if seed is not None: 91 | np.random.seed(seed) 92 | prev_hidden_nodes = [f'h_prev_{i}' for i in range(self.hidden_tuple_size)] 93 | base_nodes = ['x'] + prev_hidden_nodes 94 | 95 | recepie = {} 96 | self._generate_redundant_graph(recepie, base_nodes) 97 | self._create_hidden_nodes(recepie) 98 | visited = self._remove_redundant_nodes(recepie) 99 | 100 | is_sanity_check_ok = True 101 | 102 | # check that all input nodes are in the graph 103 | for node in base_nodes: 104 | if node not in visited: 105 | is_sanity_check_ok = False 106 | break 107 | 108 | # constraint: prev hidden nodes are not connected directly to new hidden nodes 109 | for i in range(self.hidden_tuple_size): 110 | if len(set(recepie[f'h_new_{i}']['input']) & set(prev_hidden_nodes)) > 0: 111 | is_sanity_check_ok = False 112 | break 113 | 114 | return recepie, is_sanity_check_ok 115 | 116 | def get_example_recepie(self, name): 117 | if name == 'rnn': 118 | recepie = { 119 | 'f':{'op':'linear', 'input':['x', 'h_prev_0']}, 120 | 'h_new_0':{'op':'activation_tanh', 'input':['f']} 121 | } 122 | elif name == 'lstm': 123 | recepie = { 124 | 'i':{'op':'linear', 'input':['x', 'h_prev_0']}, 125 | 'i_act':{'op':'activation_tanh', 'input':['i']}, 126 | 127 | 'j':{'op':'linear', 'input':['x', 'h_prev_0']}, 128 | 'j_act':{'op':'activation_sigm', 'input':['j']}, 129 | 130 | 'f':{'op':'linear', 'input':['x', 'h_prev_0']}, 131 | 
'f_act':{'op':'activation_sigm', 'input':['f']}, 132 | 133 | 'o':{'op':'linear', 'input':['x', 'h_prev_0']}, 134 | 'o_act':{'op':'activation_tanh', 'input':['o']}, 135 | 136 | 'h_new_1_part1':{'op':'elementwise_prod', 'input':['f_act', 'h_prev_1']}, 137 | 'h_new_1_part2':{'op':'elementwise_prod', 'input':['i_act', 'j_act']}, 138 | 139 | 'h_new_1':{'op':'elementwise_sum', 'input':['h_new_1_part1', 'h_new_1_part2']}, 140 | 141 | 'h_new_1_act':{'op':'activation_tanh', 'input':['h_new_1']}, 142 | 'h_new_0':{'op':'elementwise_prod', 'input':['h_new_1_act', 'o_act']} 143 | } 144 | elif name == 'gru': 145 | recepie = { 146 | 'r':{'op':'linear', 'input':['x', 'h_prev_0']}, 147 | 'r_act':{'op':'activation_sigm', 'input':['r']}, 148 | 149 | 'z':{'op':'linear', 'input':['x', 'h_prev_0']}, 150 | 'z_act':{'op':'activation_sigm', 'input':['z']}, 151 | 152 | 'rh':{'op':'elementwise_prod', 'input':['r_act', 'h_prev_0']}, 153 | 'h_tilde':{'op':'linear', 'input':['x', 'rh']}, 154 | 'h_tilde_act':{'op':'activation_tanh', 'input':['h_tilde']}, 155 | 156 | 'h_new_0':{'op':'blend', 'input':['z_act', 'h_prev_0', 'h_tilde_act']} 157 | } 158 | else: 159 | raise Exception(f'Unknown recepie name: {name}') 160 | return recepie 161 | 162 | -------------------------------------------------------------------------------- /search_space_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import json" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import matplotlib.pyplot as plt\n", 22 | "%matplotlib inline" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "all_stats = []\n", 32 | "for fn in os.listdir('train_logs_single_run'):\n", 33 | " if fn.endswith('.json'):\n", 34 | " all_stats.append(json.load(open(os.path.join('train_logs_single_run', fn), 'r')))" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "len(all_stats)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "def get_nodes_cnt(x):\n", 53 | " all_nodes = set(x.keys())\n", 54 | " for k in x.keys():\n", 55 | " all_nodes |= set(x[k]['input'])\n", 56 | " return len(all_nodes)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "plt.hist([get_nodes_cnt(json.loads(x['recepie'])) for x in all_stats], bins=16, range=(4, 20))\n", 66 | "plt.xlabel('Number of nodes', fontsize=16)\n", 67 | "plt.ylabel('Number of architectures', fontsize=16)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "ok_stats = [x for x in all_stats if x['status'] == 'OK']" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "recepie_ids = [x['recepie_id'] for x in ok_stats]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 
| "plt.hist([np.exp(np.min(x['test_losses'])) for x in all_stats if x['status'] == 'OK'], \n", 95 | " bins=50, range=(65, 250));\n", 96 | "plt.ylabel('Num. architectures', fontsize=14)\n", 97 | "plt.xlabel('Perplexity', fontsize=14)\n", 98 | "labels = ['RNN', 'LSTM', 'GRU']\n", 99 | "\n", 100 | "for i in range(3):\n", 101 | " seek_id = recepie_ids.index(1000000 + i)\n", 102 | " x = ok_stats[seek_id]\n", 103 | " plt.vlines(np.exp(np.min(x['test_losses'])), 0, 1000, color=f'C{i+1}', label=labels[i], linestyle='--')\n", 104 | "plt.legend(fontsize=14)\n", 105 | "plt.yscale('log')\n", 106 | "plt.savefig('data/figures/ppl_distrib.png', dpi=300, bbox_inches='tight')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "plt.figure(figsize=(9, 8))\n", 116 | "plt.scatter([np.sum(x['wall_times']) for x in ok_stats], \n", 117 | " [x['num_params'] for x in ok_stats], s=15,\n", 118 | " c=[(np.min(np.exp(x['test_losses']))) for x in ok_stats],\n", 119 | " cmap=plt.cm.viridis_r, alpha=0.99)\n", 120 | "cbar = plt.colorbar()\n", 121 | "cbar.set_label('Test perplexity', fontsize=16)\n", 122 | "plt.clim([65, 400])\n", 123 | "\n", 124 | "labels = ['RNN', 'LSTM', 'GRU']\n", 125 | "markers = ['X', '^', 'o']\n", 126 | "for i in range(3):\n", 127 | " seek_id = recepie_ids.index(1000000 + i)\n", 128 | " x = ok_stats[seek_id]\n", 129 | " plt.scatter([np.sum(x['wall_times'])],\n", 130 | " [x['num_params']],\n", 131 | " c='r', marker=markers[i], zorder=10, edgecolor='k', lw=0.5,\n", 132 | " s=200, label=labels[i])\n", 133 | "\n", 134 | "plt.legend(fontsize=14)\n", 135 | "plt.xlabel('Wall time [s]', fontsize=16)\n", 136 | "plt.ylabel('Num params', fontsize=16)\n", 137 | "plt.savefig('data/figures/main_metrics.png', dpi=300, bbox_inches='tight')" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "def get_rank(x):\n", 147 | " r = np.zeros_like(x)\n", 148 | " r[np.argsort(x)] = np.arange(len(x))\n", 149 | " return r" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "Y = np.array([x['test_losses'] for x in all_stats if x['status'] == 'OK'])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "X = np.array([x['val_losses'] for x in all_stats if x['status'] == 'OK'])" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "plt.figure(figsize=(21, 5))\n", 177 | "for i, e in enumerate([5, 10, 25, 50]):\n", 178 | " plt.subplot(1, 4, i + 1)\n", 179 | " plt.scatter(get_rank(X[:, e - 1]), get_rank(Y[:, -1]), s=1)\n", 180 | " plt.xlabel(f'Validation rank {e} epoch', fontsize=14)\n", 181 | " plt.ylabel('Test rank 50 epoch', fontsize=14)\n", 182 | "plt.tight_layout()\n", 183 | "plt.savefig('data/figures/dynamic_ranking.png', dpi=300, bbox_inches='tight')" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Correlation with performance on wikitext" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "for fn in os.listdir('train_logs_multi_runs'):\n", 200 | " if fn.endswith('.json'):\n", 201 | " 
all_stats.append(json.load(open(os.path.join('train_logs_multi_runs', fn), 'r')))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "all_stats_wiki = []\n", 211 | "for fn in os.listdir('train_logs_wikitext-2'):\n", 212 | " if fn.endswith('.json'):\n", 213 | " all_stats_wiki.append(json.load(open(os.path.join('train_logs_wikitext-2', fn), 'r')))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "len(all_stats_wiki)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "ok_stats_wiki = [x for x in all_stats_wiki if x['status'] == 'OK']" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "id_to_ppl = {x['recepie_id']:np.exp(np.min(x['test_losses'])) for x in all_stats if x['status'] == 'OK'}\n", 241 | "id_to_ppl_wiki = {x['recepie_id']:np.exp(np.min(x['test_losses'])) for x in all_stats_wiki if x['status'] == 'OK'}" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "ppl = []\n", 251 | "ppl_wiki = []\n", 252 | "for k in id_to_ppl_wiki:\n", 253 | " if k in id_to_ppl:\n", 254 | " ppl.append(id_to_ppl[k])\n", 255 | " ppl_wiki.append(id_to_ppl_wiki[k])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "plt.figure(figsize=(7, 7))\n", 265 | "plt.scatter(np.log(ppl), np.log(ppl_wiki))\n", 266 | "plt.xlabel('PTB testing log perplexity', fontsize=16)\n", 267 | "plt.ylabel('WikiText-2 testing log perplexity', fontsize=16)\n", 268 | "plt.savefig('data/figures/transfer_corr.png', dpi=300, bbox_inches='tight')" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 3", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.6.10" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 2 321 | } 322 | -------------------------------------------------------------------------------- /search_space_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 
null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import search_space\n", 20 | "import plotting\n", 21 | "import json\n", 22 | "\n", 23 | "%matplotlib inline\n", 24 | "import matplotlib.pyplot as plt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "recepie_generator = search_space.RecepieGenerator()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "recepie = recepie_generator.get_example_recepie('rnn')\n", 43 | "print(recepie)\n", 44 | "fig = plotting.plot_recepie(recepie, dpi=100)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "with open(\"data/figures/rnn_cell.png\", \"wb\") as png:\n", 54 | " png.write(fig.data)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "fig" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "recepie = recepie_generator.get_example_recepie('lstm')\n", 73 | "print(recepie)\n", 74 | "fig = plotting.plot_recepie(recepie, dpi=100)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "with open(\"data/figures/lstm_cell.png\", \"wb\") as png:\n", 84 | " png.write(fig.data)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "fig" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "recepie = recepie_generator.get_example_recepie('gru')\n", 103 | "print(recepie)\n", 104 | "fig = plotting.plot_recepie(recepie, dpi=100)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "with open(\"data/figures/gru_cell.png\", \"wb\") as png:\n", 114 | " png.write(fig.data)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "fig" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "recepie, sanity_check = recepie_generator.generate_random_recepie(4)\n", 133 | "print('valid recepie: ', sanity_check)\n", 134 | "plotting.plot_recepie(recepie)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "recepie, sanity_check = recepie_generator.generate_random_recepie(10)\n", 144 | "print('valid recepie: ', sanity_check)\n", 145 | "plotting.plot_recepie(recepie)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "# Make search space elements example" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "from tqdm import tqdm_notebook" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "max_valid_confs = 100\n", 171 | 
"all_recepies = []\n", 172 | "rnd_offset = 0\n", 173 | "for hidden_tuple_size in [1, 2, 3]:\n", 174 | " for intermediate_elements in [7, 14, 21]:\n", 175 | " recepie_generator = search_space.RecepieGenerator(hidden_tuple_size, intermediate_elements)\n", 176 | " N = 200\n", 177 | " valid_seeds = []\n", 178 | " for i in tqdm_notebook(range(N)):\n", 179 | " recepie, sanity_check = recepie_generator.generate_random_recepie(i + rnd_offset)\n", 180 | " if sanity_check:\n", 181 | " valid_seeds.append(i)\n", 182 | " for i in valid_seeds[:max_valid_confs]:\n", 183 | " recepie, sanity_check = recepie_generator.generate_random_recepie(i + rnd_offset)\n", 184 | " all_recepies.append(recepie)\n", 185 | " rnd_offset += N" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "json_recepies = [json.dumps(x) for x in all_recepies]" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "mind about duplicates that can appear during generation" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "len(json_recepies)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "len(set(json_recepies))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [] 235 | } 236 | ], 237 | "metadata": { 238 | "kernelspec": { 239 | "display_name": "Python 3", 240 | "language": "python", 241 | "name": "python3" 242 | }, 243 | "language_info": { 244 | "codemirror_mode": { 245 | "name": "ipython", 246 | "version": 3 247 | }, 248 | "file_extension": ".py", 249 | "mimetype": "text/x-python", 250 | "name": "python", 251 | "nbconvert_exporter": "python", 252 | "pygments_lexer": "ipython3", 253 | "version": "3.6.10" 254 | } 255 | }, 256 | "nbformat": 4, 257 | "nbformat_minor": 2 258 | } 259 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | setup(name='CustomAwdRnn', 5 | version='0.0.0', 6 | description='Custom awd rnn', 7 | author='Nikita Klyuchnikov', 8 | author_email='nikita.klyuchnikov@skolkovotech.ru', 9 | packages=['CustomAwdRnn', ], 10 | zip_safe=False) -------------------------------------------------------------------------------- /splitcross.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | import numpy as np 7 | 8 | 9 | class SplitCrossEntropyLoss(nn.Module): 10 | r'''SplitCrossEntropyLoss calculates an approximate softmax''' 11 | def __init__(self, hidden_size, splits, verbose=False): 12 | # We assume splits is [0, split1, split2, N] where N >= |V| 13 | # For example, a vocab of 1000 words may have splits [0] + [100, 500] + [inf] 14 | super(SplitCrossEntropyLoss, self).__init__() 15 | self.hidden_size = hidden_size 16 | self.splits = [0] + splits + [100 * 1000000] 17 | self.nsplits = len(self.splits) - 1 18 | self.stats = defaultdict(list) 19 | self.verbose = verbose 20 | # 
Each of the splits that aren't in the head require a pretend token, we'll call them tombstones 21 | # The probability given to this tombstone is the probability of selecting an item from the represented split 22 | if self.nsplits > 1: 23 | self.tail_vectors = nn.Parameter(torch.zeros(self.nsplits - 1, hidden_size)) 24 | self.tail_bias = nn.Parameter(torch.zeros(self.nsplits - 1)) 25 | 26 | def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, verbose=False): 27 | # First we perform the first softmax on the head vocabulary and the tombstones 28 | if softmaxed_head_res is None: 29 | start, end = self.splits[0], self.splits[1] 30 | head_weight = None if end - start == 0 else weight[start:end] 31 | head_bias = None if end - start == 0 else bias[start:end] 32 | # We only add the tombstones if we have more than one split 33 | if self.nsplits > 1: 34 | head_weight = self.tail_vectors if head_weight is None else torch.cat([head_weight, self.tail_vectors]) 35 | head_bias = self.tail_bias if head_bias is None else torch.cat([head_bias, self.tail_bias]) 36 | 37 | # Perform the softmax calculation for the word vectors in the head for all splits 38 | # We need to guard against empty splits as torch.cat does not like random lists 39 | head_res = torch.nn.functional.linear(hiddens, head_weight, bias=head_bias) 40 | softmaxed_head_res = torch.nn.functional.log_softmax(head_res, dim=-1) 41 | 42 | if splits is None: 43 | splits = list(range(self.nsplits)) 44 | 45 | results = [] 46 | running_offset = 0 47 | for idx in splits: 48 | 49 | # For those targets in the head (idx == 0) we only need to return their loss 50 | if idx == 0: 51 | results.append(softmaxed_head_res[:, :-(self.nsplits - 1)]) 52 | 53 | # If the target is in one of the splits, the probability is the p(tombstone) * p(word within tombstone) 54 | else: 55 | start, end = self.splits[idx], self.splits[idx + 1] 56 | tail_weight = weight[start:end] 57 | tail_bias = bias[start:end] 58 | 59 | # Calculate the softmax for the words in the tombstone 60 | tail_res = torch.nn.functional.linear(hiddens, tail_weight, bias=tail_bias) 61 | 62 | # Then we calculate p(tombstone) * p(word in tombstone) 63 | # Adding is equivalent to multiplication in log space 64 | head_entropy = (softmaxed_head_res[:, -idx]).contiguous() 65 | tail_entropy = torch.nn.functional.log_softmax(tail_res, dim=-1) 66 | results.append(head_entropy.view(-1, 1) + tail_entropy) 67 | 68 | if len(results) > 1: 69 | return torch.cat(results, dim=1) 70 | return results[0] 71 | 72 | def split_on_targets(self, hiddens, targets): 73 | # Split the targets into those in the head and in the tail 74 | split_targets = [] 75 | split_hiddens = [] 76 | 77 | # Determine to which split each element belongs (for each start split value, add 1 if equal or greater) 78 | # This method appears slower at least for WT-103 values for approx softmax 79 | #masks = [(targets >= self.splits[idx]).view(1, -1) for idx in range(1, self.nsplits)] 80 | #mask = torch.sum(torch.cat(masks, dim=0), dim=0) 81 | ### 82 | # This is equally fast for smaller splits as method below but scales linearly 83 | mask = None 84 | for idx in range(1, self.nsplits): 85 | partial_mask = targets >= self.splits[idx] 86 | mask = mask + partial_mask if mask is not None else partial_mask 87 | ### 88 | #masks = torch.stack([targets] * (self.nsplits - 1)) 89 | #mask = torch.sum(masks >= self.split_starts, dim=0) 90 | for idx in range(self.nsplits): 91 | # If there are no splits, avoid costly masked select 92 | if self.nsplits 
== 1: 93 | split_targets, split_hiddens = [targets], [hiddens] 94 | continue 95 | # If all the words are covered by earlier targets, we have empties so later stages don't freak out 96 | if sum(len(t) for t in split_targets) == len(targets): 97 | split_targets.append([]) 98 | split_hiddens.append([]) 99 | continue 100 | # Are you in our split? 101 | tmp_mask = mask == idx 102 | split_targets.append(torch.masked_select(targets, tmp_mask)) 103 | split_hiddens.append(hiddens.masked_select(tmp_mask.unsqueeze(1).expand_as(hiddens)).view(-1, hiddens.size(1))) 104 | return split_targets, split_hiddens 105 | 106 | def forward(self, weight, bias, hiddens, targets, verbose=False): 107 | if self.verbose or verbose: 108 | for idx in sorted(self.stats): 109 | print('{}: {}'.format(idx, int(np.mean(self.stats[idx]))), end=', ') 110 | print() 111 | 112 | total_loss = None 113 | if len(hiddens.size()) > 2: hiddens = hiddens.view(-1, hiddens.size(2)) 114 | 115 | split_targets, split_hiddens = self.split_on_targets(hiddens, targets) 116 | 117 | # First we perform the first softmax on the head vocabulary and the tombstones 118 | start, end = self.splits[0], self.splits[1] 119 | head_weight = None if end - start == 0 else weight[start:end] 120 | head_bias = None if end - start == 0 else bias[start:end] 121 | 122 | # We only add the tombstones if we have more than one split 123 | if self.nsplits > 1: 124 | head_weight = self.tail_vectors if head_weight is None else torch.cat([head_weight, self.tail_vectors]) 125 | head_bias = self.tail_bias if head_bias is None else torch.cat([head_bias, self.tail_bias]) 126 | 127 | # Perform the softmax calculation for the word vectors in the head for all splits 128 | # We need to guard against empty splits as torch.cat does not like random lists 129 | combo = torch.cat([split_hiddens[i] for i in range(self.nsplits) if len(split_hiddens[i])]) 130 | ### 131 | all_head_res = torch.nn.functional.linear(combo, head_weight, bias=head_bias) 132 | softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res, dim=-1) 133 | if self.verbose or verbose: 134 | self.stats[0].append(combo.size()[0] * head_weight.size()[0]) 135 | 136 | running_offset = 0 137 | for idx in range(self.nsplits): 138 | # If there are no targets for this split, continue 139 | if len(split_targets[idx]) == 0: continue 140 | 141 | # For those targets in the head (idx == 0) we only need to return their loss 142 | if idx == 0: 143 | softmaxed_head_res = softmaxed_all_head_res[running_offset:running_offset + len(split_hiddens[idx])] 144 | entropy = -torch.gather(softmaxed_head_res, dim=1, index=split_targets[idx].view(-1, 1)) 145 | # If the target is in one of the splits, the probability is the p(tombstone) * p(word within tombstone) 146 | else: 147 | softmaxed_head_res = softmaxed_all_head_res[running_offset:running_offset + len(split_hiddens[idx])] 148 | 149 | if self.verbose or verbose: 150 | start, end = self.splits[idx], self.splits[idx + 1] 151 | tail_weight = weight[start:end] 152 | self.stats[idx].append(split_hiddens[idx].size()[0] * tail_weight.size()[0]) 153 | 154 | # Calculate the softmax for the words in the tombstone 155 | tail_res = self.logprob(weight, bias, split_hiddens[idx], splits=[idx], softmaxed_head_res=softmaxed_head_res) 156 | 157 | # Then we calculate p(tombstone) * p(word in tombstone) 158 | # Adding is equivalent to multiplication in log space 159 | head_entropy = softmaxed_head_res[:, -idx] 160 | # All indices are shifted - if the first split handles [0,...,499] then the 500th in 
the second split will be 0 indexed 161 | indices = (split_targets[idx] - self.splits[idx]).view(-1, 1) 162 | # Warning: if you don't squeeze, you get an N x 1 return, which acts oddly with broadcasting 163 | tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res, dim=-1), dim=1, index=indices).squeeze() 164 | entropy = -(head_entropy + tail_entropy) 165 | ### 166 | running_offset += len(split_hiddens[idx]) 167 | total_loss = entropy.float().sum() if total_loss is None else total_loss + entropy.float().sum() 168 | 169 | return (total_loss / len(targets)).type_as(weight) 170 | 171 | 172 | if __name__ == '__main__': 173 | np.random.seed(42) 174 | torch.manual_seed(42) 175 | if torch.cuda.is_available(): 176 | torch.cuda.manual_seed(42) 177 | 178 | V = 8 179 | H = 10 180 | N = 100 181 | E = 10 182 | 183 | embed = torch.nn.Embedding(V, H) 184 | crit = SplitCrossEntropyLoss(hidden_size=H, splits=[V // 2]) 185 | bias = torch.nn.Parameter(torch.ones(V)) 186 | optimizer = torch.optim.SGD(list(embed.parameters()) + list(crit.parameters()), lr=1) 187 | 188 | for _ in range(E): 189 | prev = torch.autograd.Variable((torch.rand(N, 1) * 0.999 * V).int().long()) 190 | x = torch.autograd.Variable((torch.rand(N, 1) * 0.999 * V).int().long()) 191 | y = embed(prev).squeeze() 192 | c = crit(embed.weight, bias, y, x.view(N)) 193 | print('Crit', c.exp().data[0]) 194 | 195 | logprobs = crit.logprob(embed.weight, bias, y[:2]).exp() 196 | print(logprobs) 197 | print(logprobs.sum(dim=1)) 198 | 199 | optimizer.zero_grad() 200 | c.backward() 201 | optimizer.step() 202 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 3 | import time 4 | import numpy as np 5 | import math 6 | 7 | from utils import get_batch, repackage_hidden 8 | 9 | 10 | def evaluate(model, criterion, data_source, batch_size, args): 11 | # Turn on evaluation mode which disables dropout. 12 | model.eval() 13 | total_loss = 0 14 | hidden = model.init_hidden(batch_size) 15 | for i in range(0, data_source.size(0) - 1, args.bptt): 16 | data, targets = get_batch(data_source, i, args, evaluation=True) 17 | output, hidden = model(data, hidden) 18 | total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data 19 | hidden = repackage_hidden(hidden) 20 | return total_loss.item() / len(data_source) 21 | 22 | 23 | def train(model, optimizer, params, criterion, train_data, args, epoch): 24 | # Turn on training mode which enables dropout. 25 | total_loss = 0 26 | start_time = time.time() 27 | hidden = model.init_hidden(args.batch_size) 28 | batch, i = 0, 0 29 | while i < train_data.size(0) - 1 - 1: 30 | bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2. 31 | # Prevent excessively small or negative sequence lengths 32 | seq_len = max(5, int(np.random.normal(bptt, 5))) 33 | # There's a very small chance that it could select a very long sequence length resulting in OOM 34 | # seq_len = min(seq_len, args.bptt + 10) 35 | 36 | lr2 = optimizer.param_groups[0]['lr'] 37 | optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt 38 | model.train() 39 | data, targets = get_batch(train_data, i, args, seq_len=seq_len) 40 | 41 | # Starting each batch, we detach the hidden state from how it was previously produced. 42 | # If we didn't, the model would try backpropagating all the way to start of the dataset. 
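# repackage_hidden (defined in utils.py) detaches recursively, so it handles nested
# tuples of hidden-state tensors as well as a single tensor.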
43 | hidden = repackage_hidden(hidden) 44 | optimizer.zero_grad() 45 | 46 | output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True) 47 | raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets) 48 | 49 | loss = raw_loss 50 | # Activation Regularization 51 | if args.alpha: loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:]) 52 | # Temporal Activation Regularization (slowness) 53 | if args.beta: loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:]) 54 | loss.backward() 55 | 56 | # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs. 57 | if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip) 58 | optimizer.step() 59 | 60 | total_loss += raw_loss.data 61 | optimizer.param_groups[0]['lr'] = lr2 62 | if batch % args.log_interval == 0 and batch > 0: 63 | cur_loss = total_loss.item() / args.log_interval 64 | elapsed = time.time() - start_time 65 | print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | ' 66 | 'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format( 67 | epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'], 68 | elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2))) 69 | total_loss = 0 70 | start_time = time.time() 71 | ### 72 | batch += 1 73 | i += seq_len -------------------------------------------------------------------------------- /train_logs_multi_runs/logs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/train_logs_multi_runs/logs.zip -------------------------------------------------------------------------------- /train_logs_single_run/logs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/train_logs_single_run/logs.zip -------------------------------------------------------------------------------- /train_logs_wikitext-2/logs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsnew/nas-bench-nlp-release/a6d90a3b19e3966b1d009c0970b3761aa46707d1/train_logs_wikitext-2/logs.zip -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import networkx as nx 4 | from itertools import permutations 5 | 6 | 7 | def repackage_hidden(h): 8 | """Wraps hidden states in new Tensors, 9 | to detach them from their history.""" 10 | if isinstance(h, torch.Tensor): 11 | return h.detach() 12 | else: 13 | return tuple(repackage_hidden(v) for v in h) 14 | 15 | 16 | def batchify(data, bsz, args, cuda='cuda'): 17 | # Work out how cleanly we can divide the dataset into bsz parts. 18 | nbatch = data.size(0) // bsz 19 | # Trim off any extra elements that wouldn't cleanly fit (remainders). 20 | data = data.narrow(0, 0, nbatch * bsz) 21 | # Evenly divide the data across the bsz batches.
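# For example, with bsz = 4 a stream of 1000 tokens becomes a 250 x 4 matrix: each
# column holds one contiguous chunk of the corpus, and get_batch below slices
# bptt-sized windows of rows out of it.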
22 | data = data.view(bsz, -1).t().contiguous() 23 | if args.cuda: 24 | data = data.to(cuda) 25 | return data 26 | 27 | 28 | def get_batch(source, i, args, seq_len=None, evaluation=False): 29 | seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i) 30 | data = source[i:i+seq_len] 31 | target = source[i+1:i+1+seq_len].view(-1) 32 | return data, target 33 | 34 | 35 | def make_graph(recepie): 36 | G = nx.DiGraph() 37 | 38 | for key in recepie.keys(): 39 | op = recepie[key]['op'] 40 | if key.startswith("h_new_"): 41 | op = key+":"+op 42 | G.add_node(key, name=key, op=op) 43 | for inp in recepie[key]['input']: 44 | if "h_prev" in inp or inp == "x": 45 | G.add_node(inp, name=inp, op=inp) 46 | else: 47 | G.add_node(inp, name=inp) 48 | G.add_edge(inp, key) 49 | return G 50 | 51 | 52 | def recepie2matrixops(recepie): 53 | G = make_graph(recepie) 54 | labels = nx.get_node_attributes(G, "op") 55 | nodelist_with_ops = np.array(list(labels.items())) 56 | 57 | matrix = nx.to_numpy_array(G, nodelist=nodelist_with_ops[:, 0]) 58 | ops = nodelist_with_ops[:, 1] 59 | 60 | return matrix, ops 61 | 62 | 63 | 64 | def graph_edit_distance(matrixops1, matrixops2): 65 | m1, l1 = matrixops1 66 | m2, l2 = matrixops2 67 | 68 | # Pad 69 | n1, n2 = m1.shape[0], m2.shape[0] 70 | max_n = max(n1, n2) 71 | m1 = np.pad(m1, ((0, max_n - m1.shape[0]), (0, max_n - m1.shape[0]))) 72 | m2 = np.pad(m2, ((0, max_n - m2.shape[0]), (0, max_n - m2.shape[0]))) 73 | l1 = np.pad(l1, (0, max_n - l1.shape[0]), constant_values=None) 74 | l2 = np.pad(l2, (0, max_n - l2.shape[0]), constant_values=None) 75 | 76 | 77 | d = 100000000 78 | for p in permutations(range(len(m1))): 79 | p = list(p) 80 | d_p = (m1 != m2[p][:, p]).sum() + (l1 != l2[p]).sum() 81 | d = min(d, d_p) 82 | return d 83 | -------------------------------------------------------------------------------- /weight_drop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Parameter 3 | from functools import wraps 4 | import functools 5 | 6 | class WeightDrop(torch.nn.Module): 7 | def __init__(self, module, weights, dropout=0, variational=False): 8 | super(WeightDrop, self).__init__() 9 | self.module = module 10 | self.weights = weights 11 | self.dropout = dropout 12 | self.variational = variational 13 | self._setup() 14 | 15 | def widget_demagnetizer_y2k_edition(*args, **kwargs): 16 | # We need to replace flatten_parameters with a nothing function 17 | # It must be a function rather than a lambda as otherwise pickling explodes 18 | # We can't write boring code though, so ... WIDGET DEMAGNETIZER Y2K EDITION! 
19 | # (╯°□°)╯︵ ┻━┻ 20 | return 21 | 22 | def _setup(self): 23 | # Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN 24 | if issubclass(type(self.module), torch.nn.RNNBase): 25 | self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition 26 | 27 | for name_w in self.weights: 28 | #print('Applying weight drop of {} to {}'.format(self.dropout, name_w)) 29 | w = getattr(self.module, name_w) 30 | del self.module._parameters[name_w] 31 | self.module.register_parameter(name_w + '_raw', Parameter(w.data)) 32 | 33 | def _setweights(self): 34 | for name_w in self.weights: 35 | raw_w = getattr(self.module, name_w + '_raw') 36 | w = None 37 | if self.variational: 38 | mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1)) 39 | if raw_w.is_cuda: mask = mask.cuda() 40 | mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True) 41 | w = torch.nn.Parameter(mask.expand_as(raw_w) * raw_w) 42 | else: 43 | w = torch.nn.Parameter(torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)) 44 | setattr(self.module, name_w, w) 45 | 46 | def forward(self, *args): 47 | self._setweights() 48 | return self.module.forward(*args) 49 | 50 | def rsetattr(obj, attr, val): 51 | pre, _, post = attr.rpartition('.') 52 | return setattr(rgetattr(obj, pre) if pre else obj, post, val) 53 | 54 | def rgetattr(obj, attr, *args): 55 | def _getattr(obj, attr): 56 | return getattr(obj, attr, *args) 57 | return functools.reduce(_getattr, [obj] + attr.split('.')) 58 | 59 | class ParameterListWeightDrop(torch.nn.Module): 60 | def __init__(self, module, weights, dropout=0, variational=False): 61 | super(ParameterListWeightDrop, self).__init__() 62 | self.module = module 63 | self.weights = weights 64 | self.parents = {} 65 | for w in self.weights: 66 | p = '.'.join(w.split('.')[:-1]) 67 | i = int(w.split('.')[-1]) 68 | if p not in self.parents: 69 | self.parents[p] = [] 70 | self.parents[p].append(i) 71 | self.dropout = dropout 72 | self.variational = variational 73 | self._setup() 74 | 75 | 76 | def _setup(self): 77 | for name_w in self.parents: 78 | #print('Applying weight drop of {} to {}'.format(self.dropout, name_w)) 79 | ws = rgetattr(self.module, name_w) 80 | rsetattr(self.module, name_w, None) 81 | rsetattr(self.module, name_w + '_raw', torch.nn.ParameterList(ws)) 82 | 83 | def _setweights(self): 84 | for name_w in self.parents: 85 | raw_ws = rgetattr(self.module, name_w + '_raw') 86 | ws = [] 87 | for i, raw_w in enumerate(raw_ws): 88 | if i in self.parents[name_w]: 89 | if self.variational: 90 | mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1)) 91 | if raw_w.is_cuda: mask = mask.cuda() 92 | mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True) 93 | w = torch.nn.Parameter(mask.expand_as(raw_w) * raw_w) 94 | else: 95 | w = torch.nn.Parameter(torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)) 96 | else: 97 | w = raw_w 98 | ws.append(w) 99 | rsetattr(self.module, name_w, torch.nn.ParameterList(ws)) 100 | 101 | def forward(self, *args): 102 | self._setweights() 103 | return self.module.forward(*args) --------------------------------------------------------------------------------
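A minimal usage sketch for WeightDrop (not part of the original sources; the LSTM sizes and the 'weight_hh_l0' weight name below are illustrative assumptions, and exact behaviour depends on the PyTorch version pinned in requirements.txt). It wraps a recurrent module and re-applies DropConnect to the named weight matrices on every forward call:

import torch
from weight_drop import WeightDrop

# Hypothetical example: wrap a single-layer LSTM and drop its hidden-to-hidden weights.
lstm = torch.nn.LSTM(input_size=10, hidden_size=20)
wd_lstm = WeightDrop(lstm, ['weight_hh_l0'], dropout=0.5)

x = torch.randn(7, 3, 10)                      # (seq_len, batch, input_size)
h0, c0 = torch.zeros(1, 3, 20), torch.zeros(1, 3, 20)
out, (hn, cn) = wd_lstm(x, (h0, c0))           # the raw weights are re-dropped on this call
print(out.shape)                               # torch.Size([7, 3, 20])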