├── DRAFT
│   ├── assets
│   │   └── stackoverflow.jpg
│   ├── minimum_viable_custom_model.ipynb
│   ├── custom_estimator_mnist.ipynb
│   └── keras-bow-model-multi-label-hypertune.ipynb
└── housing_prices
    ├── assets
    │   └── TFHierarchy.png
    ├── cloud-ml-housing-prices-hp-tuning.ipynb
    └── cloud-ml-housing-prices.ipynb
/DRAFT/assets/stackoverflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vijaykyr/tensorflow_teaching_examples/HEAD/DRAFT/assets/stackoverflow.jpg -------------------------------------------------------------------------------- /housing_prices/assets/TFHierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vijaykyr/tensorflow_teaching_examples/HEAD/housing_prices/assets/TFHierarchy.png -------------------------------------------------------------------------------- /housing_prices/cloud-ml-housing-prices-hp-tuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Automatic Hyperparameter Tuning\n", 8 | "\n", 9 | "This notebook will show you how to extend the code in the cloud-ml-housing-prices notebook to take advantage of Cloud ML Engine's [automatic hyperparameter tuning](https://cloud.google.com/ml-engine/docs/using-hyperparameter-tuning).\n", 10 | "\n", 11 | "We will use it to determine the ideal number of hidden units to use in our neural network.\n", 12 | "\n", 13 | "Cloud ML Engine uses Bayesian optimization to find the hyperparameter settings for you. You can read the details of how it works [here.](https://cloud.google.com/blog/big-data/2017/08/hyperparameter-tuning-in-cloud-machine-learning-engine-using-bayesian-optimization)\n", 14 | "\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### 1) Modify Tensorflow Code\n", 22 | "\n", 23 | "We need to make code changes to:\n", 24 | "1. Expose any hyperparameter we wish to tune as a command line argument (this is how CMLE passes new values)\n", 25 | "2. Modify the output_dir so each hyperparameter 'trial' gets written to a unique directory\n", 26 | "\n", 27 | "These changes are illustrated below. 
Any change from the original code has a **#NEW** comment next to it for easy reference" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 11, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "name": "stderr", 39 | "output_type": "stream", 40 | "text": [ 41 | "mkdir: cannot create directory ‘trainer’: File exists\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "%%bash\n", 47 | "mkdir trainer\n", 48 | "touch trainer/__init__.py" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 43, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Overwriting trainer/task.py\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "%%writefile trainer/task.py\n", 68 | "\n", 69 | "import argparse\n", 70 | "import pandas as pd\n", 71 | "import tensorflow as tf\n", 72 | "import os #NEW\n", 73 | "import json #NEW\n", 74 | "from tensorflow.contrib.learn.python.learn import learn_runner\n", 75 | "from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils\n", 76 | "\n", 77 | "print(tf.__version__)\n", 78 | "tf.logging.set_verbosity(tf.logging.ERROR)\n", 79 | "\n", 80 | "data_train = pd.read_csv(\n", 81 | " filepath_or_buffer='https://storage.googleapis.com/vijay-public/boston_housing/housing_train.csv',\n", 82 | " names=[\"CRIM\",\"ZN\",\"INDUS\",\"CHAS\",\"NOX\",\"RM\",\"AGE\",\"DIS\",\"RAD\",\"TAX\",\"PTRATIO\",\"MEDV\"])\n", 83 | "\n", 84 | "data_test = pd.read_csv(\n", 85 | " filepath_or_buffer='https://storage.googleapis.com/vijay-public/boston_housing/housing_test.csv',\n", 86 | " names=[\"CRIM\",\"ZN\",\"INDUS\",\"CHAS\",\"NOX\",\"RM\",\"AGE\",\"DIS\",\"RAD\",\"TAX\",\"PTRATIO\",\"MEDV\"])\n", 87 | "\n", 88 | "FEATURES = [\"CRIM\", \"ZN\", \"INDUS\", \"NOX\", \"RM\",\n", 89 | " \"AGE\", \"DIS\", \"TAX\", \"PTRATIO\"]\n", 90 | "LABEL = \"MEDV\"\n", 91 | "\n", 92 | "feature_cols = [tf.feature_column.numeric_column(k)\n", 93 | " for k in FEATURES] #list of Feature Columns\n", 94 | "\n", 95 | "def generate_estimator(output_dir):\n", 96 | " return tf.estimator.DNNRegressor(feature_columns=feature_cols, \n", 97 | " hidden_units=[args.hidden_units_1, args.hidden_units_2], #NEW (use command line parameters for hidden units)\n", 98 | " model_dir=output_dir)\n", 99 | "\n", 100 | "def generate_input_fn(data_set):\n", 101 | " def input_fn():\n", 102 | " features = {k: tf.constant(data_set[k].values) for k in FEATURES}\n", 103 | " labels = tf.constant(data_set[LABEL].values)\n", 104 | " return features, labels\n", 105 | " return input_fn\n", 106 | "\n", 107 | "def serving_input_fn():\n", 108 | " #feature_placeholders are what the caller of the predict() method will have to provide\n", 109 | " feature_placeholders = {\n", 110 | " column.name: tf.placeholder(column.dtype, [None])\n", 111 | " for column in feature_cols\n", 112 | " }\n", 113 | " \n", 114 | " #features are what we actually pass to the estimator\n", 115 | " features = {\n", 116 | " # Inputs are rank 1 so that we can provide scalars to the server\n", 117 | " # but Estimator expects rank 2, so we expand dimension\n", 118 | " key: tf.expand_dims(tensor, -1)\n", 119 | " for key, tensor in feature_placeholders.items()\n", 120 | " }\n", 121 | " return tf.estimator.export.ServingInputReceiver(\n", 122 | " features, feature_placeholders\n", 123 | " )\n", 124 | "\n", 125 | "train_spec = tf.estimator.TrainSpec(\n", 126 | " input_fn=generate_input_fn(data_train),\n", 127 | " 
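#the input_fn below returns the entire in-memory training set on each step, so max_steps caps the number of passes over the data\n",
    "    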
max_steps=3000)\n", 128 | "\n", 129 | "exporter = tf.estimator.LatestExporter('Servo', serving_input_fn)\n", 130 | "\n", 131 | "eval_spec=tf.estimator.EvalSpec(\n", 132 | " input_fn=generate_input_fn(data_test),\n", 133 | " steps=1,\n", 134 | " exporters=exporter)\n", 135 | "\n", 136 | "######START CLOUD ML ENGINE BOILERPLATE######\n", 137 | "if __name__ == '__main__':\n", 138 | " parser = argparse.ArgumentParser()\n", 139 | " # Input Arguments\n", 140 | " parser.add_argument(\n", 141 | " '--output_dir',\n", 142 | " help='GCS location to write checkpoints and export models',\n", 143 | " required=True\n", 144 | " )\n", 145 | " parser.add_argument(\n", 146 | " '--job-dir',\n", 147 | " help='this model ignores this field, but it is required by gcloud',\n", 148 | " default='junk'\n", 149 | " )\n", 150 | " parser.add_argument(\n", 151 | " '--hidden_units_1', #NEW (expose hyperparameter to command line)\n", 152 | " help='number of neurons in first hidden layer',\n", 153 | " type = int,\n", 154 | " default=10\n", 155 | " )\n", 156 | " parser.add_argument(\n", 157 | " '--hidden_units_2', #NEW (expose hyperparameter to command line)\n", 158 | " help='number of neurons in second hidden layer',\n", 159 | " type = int,\n", 160 | " default=10\n", 161 | " )\n", 162 | " args = parser.parse_args()\n", 163 | " arguments = args.__dict__\n", 164 | " output_dir = arguments.pop('output_dir')\n", 165 | " output_dir = os.path.join(#NEW (give each trial its own output_dir)\n", 166 | " output_dir,\n", 167 | " json.loads(\n", 168 | " os.environ.get('TF_CONFIG', '{}')\n", 169 | " ).get('task', {}).get('trial', '')\n", 170 | " )\n", 171 | "######END CLOUD ML ENGINE BOILERPLATE######\n", 172 | "\n", 173 | " #initiate training job\n", 174 | " tf.estimator.train_and_evaluate(generate_estimator(output_dir), train_spec, eval_spec)\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "### 2) Define Hyperparameter Configuration File\n", 182 | "\n", 183 | "Here you specify:\n", 184 | "\n", 185 | "1. Which hyperparameters to tune\n", 186 | "2. The min and max range to search between\n", 187 | "3. The metric to optimize\n", 188 | "4. 
The number of trials to run" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 40, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "Overwriting config.yaml\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "%%writefile config.yaml\n", 208 | "trainingInput:\n", 209 | " hyperparameters:\n", 210 | " goal: MINIMIZE\n", 211 | " hyperparameterMetricTag: average_loss\n", 212 | " maxTrials: 5\n", 213 | " maxParallelTrials: 1\n", 214 | " params:\n", 215 | " - parameterName: hidden_units_1\n", 216 | " type: INTEGER\n", 217 | " minValue: 1\n", 218 | " maxValue: 100\n", 219 | " scaleType: UNIT_LOG_SCALE\n", 220 | " - parameterName: hidden_units_2\n", 221 | " type: INTEGER\n", 222 | " minValue: 1\n", 223 | " maxValue: 100\n", 224 | " scaleType: UNIT_LOG_SCALE" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "source": [ 233 | "### 3) Train" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 33, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "GCS_BUCKET = 'gs://vijays-sandbox-ml' #CHANGE THIS TO YOUR BUCKET\n", 245 | "PROJECT = 'vijays-sandbox' #CHANGE THIS TO YOUR PROJECT ID\n", 246 | "REGION = 'us-central1' #OPTIONALLY CHANGE THIS" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 34, 252 | "metadata": { 253 | "collapsed": true 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "import os\n", 258 | "os.environ['GCS_BUCKET'] = GCS_BUCKET\n", 259 | "os.environ['PROJECT'] = PROJECT\n", 260 | "os.environ['REGION'] = REGION" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "#### Run local\n", 268 | "It's a best practice to first run locally to check for errors. Note you can ignore the warnings in this case, as long as there are no errors." 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 44, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "1.5.0\n" 283 | ] 284 | }, 285 | { 286 | "name": "stderr", 287 | "output_type": "stream", 288 | "text": [ 289 | "/usr/local/lib/python2.7/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 290 | " from ._conv import register_converters as _register_converters\n", 291 | "2018-03-13 23:10:57.249216: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "%%bash\n", 297 | "gcloud ml-engine local train \\\n", 298 | " --module-name=trainer.task \\\n", 299 | " --package-path=trainer \\\n", 300 | " -- \\\n", 301 | " --output_dir='./output'" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "#### Run on cloud (1 cloud ML unit)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 41, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stderr", 320 | "output_type": "stream", 321 | "text": [ 322 | "Updated property [core/project].\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "%%bash\n", 328 | "gcloud config set project $PROJECT" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 47, 334 | "metadata": { 335 | "collapsed": false 336 | }, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "jobId: housing_180313_232321\n", 343 | "state: QUEUED\n" 344 | ] 345 | }, 346 | { 347 | "name": "stderr", 348 | "output_type": "stream", 349 | "text": [ 350 | "Job [housing_180313_232321] submitted successfully.\n", 351 | "Your job is still active. You may view the status of your job with the command\n", 352 | "\n", 353 | " $ gcloud ml-engine jobs describe housing_180313_232321\n", 354 | "\n", 355 | "or continue streaming the logs with the command\n", 356 | "\n", 357 | " $ gcloud ml-engine jobs stream-logs housing_180313_232321\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "%%bash\n", 363 | "JOBNAME=housing_$(date -u +%y%m%d_%H%M%S)\n", 364 | "\n", 365 | "gcloud ml-engine jobs submit training $JOBNAME \\\n", 366 | " --region=$REGION \\\n", 367 | " --module-name=trainer.task \\\n", 368 | " --package-path=./trainer \\\n", 369 | " --job-dir=$GCS_BUCKET/$JOBNAME/ \\\n", 370 | " --runtime-version 1.4 \\\n", 371 | " --config config.yaml \\\n", 372 | " -- \\\n", 373 | " --output_dir=$GCS_BUCKET/$JOBNAME/output\n" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "### 4) Inspect Results\n", 381 | "\n", 382 | "In the cloud console (https://console.cloud.google.com/mlengine/jobs) you will see the output of each trial, which hyperparameters were chosen, and what the resulting loss was. Trials will be shown in order of performance, with the best trial on top."
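, "\n", "\n", "You can also retrieve the same results from the command line, using the job name printed when the job was submitted above:\n", "\n", "    gcloud ml-engine jobs describe housing_180313_232321\n", "\n", "For hyperparameter tuning jobs, the returned job description includes a `trainingOutput` section listing each trial's hyperparameter values and final objective value."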
383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [] 393 | } 394 | ], 395 | "metadata": { 396 | "anaconda-cloud": {}, 397 | "kernelspec": { 398 | "display_name": "Python 2", 399 | "language": "python", 400 | "name": "python2" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 2 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython2", 412 | "version": "2.7.12" 413 | } 414 | }, 415 | "nbformat": 4, 416 | "nbformat_minor": 2 417 | } 418 | -------------------------------------------------------------------------------- /DRAFT/minimum_viable_custom_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "deletable": true, 8 | "editable": true, 9 | "id": "4f3CKqFUqL2-", 10 | "slideshow": { 11 | "slide_type": "slide" 12 | } 13 | }, 14 | "source": [ 15 | "# Custom Estimator" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 27, 21 | "metadata": { 22 | "collapsed": false, 23 | "deletable": true, 24 | "editable": true 25 | }, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "1.8.0\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import shutil\n", 37 | "import tensorflow as tf\n", 38 | "print(tf.__version__)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "deletable": true, 45 | "editable": true 46 | }, 47 | "source": [ 48 | "#### Generate Toy Dataset\n", 49 | "X1+X2 = Y" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 28, 55 | "metadata": { 56 | "collapsed": false, 57 | "deletable": true, 58 | "editable": true 59 | }, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Overwriting data_train.csv\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "%%writefile data_train.csv\n", 71 | "X1,X2,Y\n", 72 | "2,3,5\n", 73 | "1,3,4\n", 74 | "3,-1,2\n", 75 | "4,0,4\n", 76 | "-2,2,0\n", 77 | "2,2,4" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 29, 83 | "metadata": { 84 | "collapsed": false, 85 | "deletable": true, 86 | "editable": true 87 | }, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "Overwriting data_eval.csv\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "%%writefile data_eval.csv\n", 99 | "X1,X2,Y\n", 100 | "3,2,5\n", 101 | "3,1,4\n", 102 | "-2,-1,-2" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "deletable": true, 109 | "editable": true 110 | }, 111 | "source": [ 112 | "#### Input Fn" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 30, 118 | "metadata": { 119 | "collapsed": true, 120 | "deletable": true, 121 | "editable": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "def csv_input_fn(csv_path, batch_size,mode):\n", 126 | " def parse_csv(line):\n", 127 | " CSV_TYPES = [[0.0], [0.0],[0.0]]\n", 128 | " \n", 129 | " fields = tf.decode_csv(line, record_defaults=CSV_TYPES,field_delim=',')\n", 130 | " \n", 131 | " label = fields.pop(-1) #last value is label\n", 132 | " label = tf.expand_dims(label,-1) #to be consistent shape with predictions\n", 133 | " \n", 134 | " 
#combine features into single tensor\n", 135 | " features = tf.stack(fields,0)\n", 136 | " \n", 137 | " return features, label\n", 138 | " \n", 139 | " # Create a dataset containing the text lines.\n", 140 | " dataset = tf.data.TextLineDataset(csv_path).skip(1) #skip header\n", 141 | "\n", 142 | " # Parse each line.\n", 143 | " dataset = dataset.map(parse_csv)\n", 144 | "\n", 145 | " # Shuffle, repeat, and batch the examples.\n", 146 | " if(mode == tf.estimator.ModeKeys.TRAIN):\n", 147 | " dataset = dataset.shuffle(batch_size*10)\n", 148 | " dataset = dataset.repeat()\n", 149 | " \n", 150 | " dataset = dataset.batch(batch_size)\n", 151 | "\n", 152 | " return dataset" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "#### Custom Estimator" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 31, 165 | "metadata": { 166 | "collapsed": false, 167 | "deletable": true, 168 | "editable": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "# Create the custom estimator\n", 173 | "def custom_estimator(features, labels, mode, params):\n", 174 | " print('custom_estimator: features: {}'.format(features))\n", 175 | " print('custom_estimator: labels:{}'.format(labels))\n", 176 | " \n", 177 | " predictions = tf.layers.dense(features,1,activation=None)\n", 178 | " print('custom_estimator: predictions: {}'.format(predictions))\n", 179 | " \n", 180 | " # 2. Loss function, training/eval ops\n", 181 | " if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:\n", 182 | " loss = tf.losses.mean_squared_error(labels, predictions)\n", 183 | " optimizer = tf.train.FtrlOptimizer(learning_rate=0.1)\n", 184 | " train_op = tf.contrib.layers.optimize_loss(\n", 185 | " loss = loss,\n", 186 | " global_step = tf.train.get_global_step(),\n", 187 | " learning_rate = 0.01,\n", 188 | " optimizer = optimizer)\n", 189 | " \n", 190 | " eval_metric_ops = {\n", 191 | " \"rmse\": tf.metrics.root_mean_squared_error(labels, predictions)\n", 192 | " }\n", 193 | " else:\n", 194 | " loss = None\n", 195 | " train_op = None\n", 196 | " eval_metric_ops = None\n", 197 | " \n", 198 | " predictions_dict = {'predictions':predictions,'features':features}\n", 199 | " \n", 200 | " return tf.estimator.EstimatorSpec(\n", 201 | " mode = mode,\n", 202 | " predictions = predictions_dict,\n", 203 | " loss = loss,\n", 204 | " train_op = train_op,\n", 205 | " eval_metric_ops = eval_metric_ops,\n", 206 | " )" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 32, 212 | "metadata": { 213 | "collapsed": false, 214 | "deletable": true, 215 | "editable": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "# Create custom estimator's train and evaluate function\n", 220 | "def train_and_evaluate(output_dir,args):\n", 221 | " estimator = tf.estimator.Estimator(model_fn=custom_estimator, \n", 222 | " model_dir=output_dir)\n", 223 | " train_spec = tf.estimator.TrainSpec(input_fn= lambda:csv_input_fn(\n", 224 | " args['train_path'],\n", 225 | " args['batch_size'],\n", 226 | " tf.estimator.ModeKeys.TRAIN),\n", 227 | " max_steps = args['train_steps'])\n", 228 | " eval_spec = tf.estimator.EvalSpec(input_fn = lambda:csv_input_fn(\n", 229 | " args['eval_path'], \n", 230 | " args['batch_size'],\n", 231 | " tf.estimator.ModeKeys.EVAL),\n", 232 | " steps = None)\n", 233 | " tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 33, 239 | 
"metadata": { 240 | "collapsed": false, 241 | "deletable": true, 242 | "editable": true 243 | }, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "INFO:tensorflow:Using default config.\n", 250 | "INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': , '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'trained', '_global_id_in_cluster': 0, '_save_summary_steps': 100}\n", 251 | "WARNING:tensorflow:Estimator's model_fn () includes params argument, but params are not passed to Estimator.\n", 252 | "INFO:tensorflow:Running training and evaluation locally (non-distributed).\n", 253 | "INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 600 secs (eval_spec.throttle_secs) or training is finished.\n", 254 | "INFO:tensorflow:Calling model_fn.\n", 255 | "custom_estimator: features: Tensor(\"IteratorGetNext:0\", shape=(?, 2), dtype=float32)\n", 256 | "custom_estimator: labels:Tensor(\"IteratorGetNext:1\", shape=(?, 1), dtype=float32)\n", 257 | "custom_estimator: predictions: Tensor(\"dense/BiasAdd:0\", shape=(?, 1), dtype=float32)\n", 258 | "INFO:tensorflow:Done calling model_fn.\n", 259 | "INFO:tensorflow:Create CheckpointSaverHook.\n", 260 | "INFO:tensorflow:Graph was finalized.\n", 261 | "INFO:tensorflow:Running local_init_op.\n", 262 | "INFO:tensorflow:Done running local_init_op.\n", 263 | "INFO:tensorflow:Saving checkpoints for 1 into trained/model.ckpt.\n", 264 | "INFO:tensorflow:loss = 14.381937, step = 1\n", 265 | "INFO:tensorflow:Saving checkpoints for 100 into trained/model.ckpt.\n", 266 | "INFO:tensorflow:Loss for final step: 0.16213264.\n", 267 | "INFO:tensorflow:Calling model_fn.\n", 268 | "custom_estimator: features: Tensor(\"IteratorGetNext:0\", shape=(?, 2), dtype=float32)\n", 269 | "custom_estimator: labels:Tensor(\"IteratorGetNext:1\", shape=(?, 1), dtype=float32)\n", 270 | "custom_estimator: predictions: Tensor(\"dense/BiasAdd:0\", shape=(?, 1), dtype=float32)\n", 271 | "INFO:tensorflow:Done calling model_fn.\n", 272 | "INFO:tensorflow:Starting evaluation at 2018-06-29-04:35:59\n", 273 | "INFO:tensorflow:Graph was finalized.\n", 274 | "INFO:tensorflow:Restoring parameters from trained/model.ckpt-100\n", 275 | "INFO:tensorflow:Running local_init_op.\n", 276 | "INFO:tensorflow:Done running local_init_op.\n", 277 | "INFO:tensorflow:Finished evaluation at 2018-06-29-04:36:00\n", 278 | "INFO:tensorflow:Saving dict for global step 100: global_step = 100, loss = 0.04452223, rmse = 0.21100292\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "args = {\n", 284 | " 'train_path': 'data_train.csv',\n", 285 | " 'eval_path': 'data_eval.csv',\n", 286 | " 'batch_size': 4,\n", 287 | " 'train_steps': 100,\n", 288 | "}\n", 289 | "OUTDIR = 'trained'\n", 290 | "shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time\n", 291 | "train_and_evaluate(OUTDIR,args)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "deletable": true, 298 | "editable": true 299 | }, 300 | "source": [ 301 | "#### Inspect Weights\n", 302 | "\n", 303 | "The tensors named dense/kernel and dense/bias are the weights and bias for the model" 304 | ] 305 
| }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 34, 309 | "metadata": { 310 | "collapsed": false, 311 | "deletable": true, 312 | "editable": true 313 | }, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "tensor_name: dense/kernel\n", 320 | "[[0.37332565]\n", 321 | " [1.1088971 ]]\n", 322 | "tensor_name: dense/bias\n", 323 | "[0.94920313]\n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "from tensorflow.python.tools import inspect_checkpoint\n", 329 | "inspect_checkpoint.print_tensors_in_checkpoint_file(\"trained/model.ckpt-100\", tensor_name='dense/kernel', all_tensors=False)\n", 330 | "inspect_checkpoint.print_tensors_in_checkpoint_file(\"trained/model.ckpt-100\", tensor_name='dense/bias', all_tensors=False)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "deletable": true, 337 | "editable": true 338 | }, 339 | "source": [ 340 | "#### Get predictions" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 35, 346 | "metadata": { 347 | "collapsed": false, 348 | "deletable": true, 349 | "editable": true 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "INFO:tensorflow:Using default config.\n", 357 | "INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': , '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'trained', '_global_id_in_cluster': 0, '_save_summary_steps': 100}\n", 358 | "WARNING:tensorflow:Estimator's model_fn () includes params argument, but params are not passed to Estimator.\n", 359 | "INFO:tensorflow:Calling model_fn.\n", 360 | "custom_estimator: features: Tensor(\"IteratorGetNext:0\", shape=(?, 2), dtype=float32)\n", 361 | "custom_estimator: labels:None\n", 362 | "custom_estimator: predictions: Tensor(\"dense/BiasAdd:0\", shape=(?, 1), dtype=float32)\n", 363 | "INFO:tensorflow:Done calling model_fn.\n", 364 | "INFO:tensorflow:Graph was finalized.\n", 365 | "INFO:tensorflow:Restoring parameters from trained/model.ckpt-100\n", 366 | "INFO:tensorflow:Running local_init_op.\n", 367 | "INFO:tensorflow:Done running local_init_op.\n", 368 | "{'predictions': array([[ 4.748355 ],\n", 369 | " [ 3.998592 ],\n", 370 | " [-1.7349727]], dtype=float32), 'features': array([[ 3., 2.],\n", 371 | " [ 3., 1.],\n", 372 | " [-2., -1.]], dtype=float32)}\n" 373 | ] 374 | } 375 | ], 376 | "source": [ 377 | "#load checkpoint\n", 378 | "estimator = tf.estimator.Estimator(model_fn=custom_estimator, \n", 379 | " model_dir=OUTDIR) \n", 380 | "\n", 381 | "predictions = estimator.predict(\n", 382 | " input_fn = lambda:csv_input_fn(\n", 383 | " args['eval_path'], \n", 384 | " args['batch_size'],\n", 385 | " tf.estimator.ModeKeys.EVAL),\n", 386 | " yield_single_examples=False\n", 387 | " )\n", 388 | "print(predictions.next())" 389 | ] 390 | } 391 | ], 392 | "metadata": { 393 | "colab": { 394 | "default_view": {}, 395 | "name": "first_steps_with_tensor_flow.ipynb", 396 | "provenance": [], 397 | "version": "0.3.2", 398 | "views": {} 399 | }, 400 | "kernelspec": { 401 | "display_name": "Python 2", 402 | "language": 
"python", 403 | "name": "python2" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 2 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython2", 415 | "version": "2.7.14" 416 | } 417 | }, 418 | "nbformat": 4, 419 | "nbformat_minor": 0 420 | } 421 | -------------------------------------------------------------------------------- /DRAFT/custom_estimator_mnist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 133, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "1.6.0\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "from matplotlib import pyplot as plt\n", 18 | "import numpy as np\n", 19 | "import shutil\n", 20 | "import tensorflow as tf\n", 21 | "print(tf.__version__)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Download and explore MNIST data" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz\n", 41 | "11493376/11490434 [==============================]11493376/11490434 [==============================] - 3s 0us/step\n", 42 | "\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "data = tf.keras.datasets.mnist.load_data(path='mnist.npz') \n", 48 | "# Tuple of Numpy arrays: ((x_train, y_train), (x_test, y_test))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 60, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "x = data[0][0] #60K 28x28 images\n", 60 | "y = data[0][1]\n", 61 | "x_test = data[1][0] #10K 28x28 images\n", 62 | "y_test = data[1][1]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Show amount of data" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 61, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "x:(60000, 28, 28)\n", 82 | "y:(60000,)\n", 83 | "x_test:(10000, 28, 28)\n", 84 | "y_test:(10000,)\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "print('x:{}'.format(x.shape))\n", 90 | "print('y:{}'.format(y.shape))\n", 91 | "print('x_test:{}'.format(x_test.shape))\n", 92 | "print('y_test:{}'.format(y_test.shape))" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "Split into x and y into 80% train and 20% eval.\n", 100 | "\n", 101 | "We'll save x_test and y_test as our hold out data" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 157, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "x_train:(48009, 28, 28)\n", 114 | "y_train:(48009,)\n", 115 | "x_eval:(11991, 28, 28)\n", 116 | "y_eval:(11991,)\n", 117 | "x_test:(10000, 28, 28)\n", 118 | "y_test:(10000,)\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# Split into train and eval\n", 124 | "msk = np.random.rand(len(x)) < 0.8 #numpy vector of booleans\n", 125 | "x_train = x[msk] #can use an numpy vector to filter a matrix\n", 126 | "y_train = y[msk]\n", 127 | "x_eval = 
x[~msk]\n", 128 | "y_eval = y[~msk]\n", 129 | "print('x_train:{}'.format(x_train.shape))\n", 130 | "print('y_train:{}'.format(y_train.shape))\n", 131 | "print('x_eval:{}'.format(x_eval.shape))\n", 132 | "print('y_eval:{}'.format(y_eval.shape))\n", 133 | "print('x_test:{}'.format(x_test.shape))\n", 134 | "print('y_test:{}'.format(y_test.shape))" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "Display one training example" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 159, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Label:4\n" 154 | ] 155 | }, 156 | { 157 | "data": { 158 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAD8CAYAAAC4nHJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADWBJREFUeJzt3W+MHPV9x/HPx8fZjp2gcCa+XsDBkEAkhNRDupg2/CmV\nCSKIyqBEVpBKHAnhPMg/pDyAuq1KlQclUROKmgjpAm5MlUBaJQg/IGnwKQpCRcYHcTBgUggxwY7x\nOTGRTTD+++2DG6IDbmfXu7M7e/6+X5J1u/Ob2flo5M/N7s7e/hwRApDPvLoDAKgH5QeSovxAUpQf\nSIryA0lRfiApyg8kRfmBpCg/kNQpvdzZfC+IhVrcy10CqbyhP+pwHHIr63ZUfttXSbpT0oCkuyPi\n9rL1F2qxLvLKTnYJoMTmmGh53baf9tsekPQtSR+XdL6k622f3+7jAeitTl7zr5D0QkS8GBGHJd0v\naVU1sQB0WyflP0PSyzPu7yyWvYXttbYnbU8e0aEOdgegSl1/tz8ixiNiLCLGBrWg27sD0KJOyr9L\n0rIZ988slgGYAzop/xZJ59o+2/Z8SZ+StLGaWAC6re1LfRFx1PbnJf2Ppi/1rY+IZypLBqCrOrrO\nHxEPSXqooiwAeoiP9wJJUX4gKcoPJEX5gaQoP5AU5QeSovxAUpQfSIryA0lRfiApyg8kRfmBpCg/\nkBTlB5Ki/EBSlB9IivIDSVF+ICnKDyRF+YGkKD+QFOUHkqL8QFKUH0iK8gNJUX4gKcoPJEX5gaQo\nP5BUR7P02t4h6YCkY5KORsRYFaGAKvzxkxc1HPvq1+4q3fYrqz9dOh6TT7eVqZ90VP7CX0fE7yp4\nHAA9xNN+IKlOyx+SNtl+wvbaKgIB6I1On/ZfEhG7bC+V9LDt5yLikZkrFL8U1krSQi3qcHcAqtLR\nmT8idhU/pyQ9IGnFLOuMR8RYRIwNakEnuwNQobbLb3ux7fe8eVvSlZLm/lugQBKdPO0flvSA7Tcf\n53sR8eNKUgHourbLHxEvSvrzCrN01cFV73hF8tbxJQOl40PrH6syDnpgaqzxE9uv7PibHibpT1zq\nA5Ki/EBSlB9IivIDSVF+ICnKDyRVxV/1zQm/vaz899yiD/6h/AHWVxgG1ZhXfnk2PnCw4djKpc+V\nbjvhj7YVaS7hzA8kRfmBpCg/kBTlB5Ki/EBSlB9IivIDSaW5zv/P1/x36fhXt1/ZoySoysAHzyod\nf+6vGn84Y/Txvy3d9v1btrWVaS7hzA8kRfmBpCg/kBTlB5Ki/EBSlB9IivIDSaW5zj/oo3VHQMVO\nufv1trc9+KtTK0wyN3HmB5Ki/EBSlB9IivIDSVF+ICnKDyRF+YGkml7nt71e0jWSpiLigmLZkKTv\nS1ouaYek1RHxavdiNnf8ktHS8UsXPtqjJOiV5Yt/3/a2yzYdqzDJ3NTKmf87kq5627JbJU1ExLmS\nJor7AOaQpuWPiEck7Xvb4lWSNhS3N0i6tuJcALqs3df8wxGxu7j9iqThivIA6JGO3/CLiJAUjcZt\nr7U9aXvyiA51ujsAFWm3/Htsj0hS8XOq0YoRMR4RYxExNqgFbe4OQNXaLf9GSWuK22skPVhNHAC9\n0rT8tu+T9JikD9veaftGSbdL+pjt5yVdUdwHMIc0vc4fEdc3GFpZcZaOvHTNu0rHlw4s6lESVOWU\n5R8oHf/k0Ma2H/tdvy7/WEqGTwHwCT8gKcoPJEX5gaQoP5AU5QeSovxAUifNV3ef8qEDHW3/xnPv\nrSgJqvLyvy0uHb94wfHS8Xv2n9l48A/724l0UuHMDyRF+YGkKD+QFOUHkqL8QFKUH0iK8gNJnTTX\n+Tu1dLL8mjFmN3D6ktLxPZ84r+HY0Oqdpdv+7Lx7mux9YenoXd9q/L2yS/f8b5PHPvlx5geSovxA\nUpQfSIryA0lRfiApyg8kRfmBpLjOXzg4VP57sPwvyztz/NILS8djwKXjL1/ReCakw+8/UrrtvPnl\nX1L9k0v/vXR8sDyaXjnWONs/vnhd6bb7jpd/9mLRvPLsw5sbf8dDw/nlEuHMDyRF+YGkKD+QFOUH\nkqL8QFKUH0iK8gNJNb3Ob3u9pGskTUXEBcWy2yTdJGlvsdq6iHioWyFbceiNwdLx402u7P7HujtK\nxzd+fvSEM7XqliV3l47PU/nF9INxuOHYb4+VXwv/5t7LS8ev2HRz6fh7fz6/dHzkJ3sajvml8r/n\n37u9fNr14YHyzzDElm2l49m1cub/jqSrZll+R0SMFv9qLT6AE9e0/BHxiKR9PcgCoIc6ec3/BdtP\n2V5v+7TKEgHoiXbLf5ekcySNStot6euNVrS91vak7ckjOtTm7gBUra3yR8SeiDgWEcclfVvSipJ1\nxyNiLCLGBtX4jzwA9FZb5bc9MuPudZKeriYOgF5p5VLffZIul3S67Z2S/knS5bZHNf2XkTskfbaL\nGQF0gSN695fNp3ooLvLKnu1vpl//y1+Wji/7yK4eJTlxe39UMs+8pCXPNL7ePf/HW6qOU5ldt3y0\ndPwXX/xm6fj9r72vdPzeDy874Uxz3eaY0P7Y1+RbFqbxCT8gKcoPJEX5gaQoP5AU5QeSovxAUmm+\nuvvsv3us7ghtG9Fv6o7QFYsu29t8pRL/8NNPlI6fp8c7evyTHWd+ICnKDyRF+YGkKD+QFOUHkqL8\nQFKUH0gqzXV+nHzOepCJtjvBmR9IivIDSVF+ICnKDyRF+YGkKD+QFOUHkqL8QFKU
H0iK8gNJUX4g\nKcoPJEX5gaQoP5AU5QeSavr3/LaXSbpX0rCkkDQeEXfaHpL0fUnLJe2QtDoiXu1eVGQz4PJz06vn\nDZaO/9mPqkxz8mnlzH9U0pcj4nxJfyHpc7bPl3SrpImIOFfSRHEfwBzRtPwRsTsinixuH5C0XdIZ\nklZJ2lCstkHStd0KCaB6J/Sa3/ZySRdK2ixpOCJ2F0OvaPplAYA5ouXy2363pB9Iujki9s8ci4jQ\n9PsBs2231vak7ckjOtRRWADVaan8tgc1XfzvRsQPi8V7bI8U4yOSpmbbNiLGI2IsIsYGtaCKzAAq\n0LT8ti3pHknbI+IbM4Y2SlpT3F4j6cHq4wHolla+uvtiSTdI2mZ7a7FsnaTbJf2X7RslvSRpdXci\nIqtjcbx8BT6l0pGm5Y+IRyW5wfDKauMA6BV+dwJJUX4gKcoPJEX5gaQoP5AU5QeSYopuzFmvf+T1\nuiPMaZz5gaQoP5AU5QeSovxAUpQfSIryA0lRfiAprvOjbzX76m50hqMLJEX5gaQoP5AU5QeSovxA\nUpQfSIryA0lxnR+1ObTpfaXjx0abfG8/OsKZH0iK8gNJUX4gKcoPJEX5gaQoP5AU5QeSckSUr2Av\nk3SvpGFJIWk8Iu60fZukmyTtLVZdFxEPlT3WqR6Ki8ys3kC3bI4J7Y99bmXdVj7kc1TSlyPiSdvv\nkfSE7YeLsTsi4l/bDQqgPk3LHxG7Je0ubh+wvV3SGd0OBqC7Tug1v+3lki6UtLlY9AXbT9leb/u0\nBtustT1pe/KIDnUUFkB1Wi6/7XdL+oGkmyNiv6S7JJ0jaVTTzwy+Ptt2ETEeEWMRMTaoBRVEBlCF\nlspve1DTxf9uRPxQkiJiT0Qci4jjkr4taUX3YgKoWtPy27akeyRtj4hvzFg+MmO16yQ9XX08AN3S\nyrv9F0u6QdI221uLZeskXW97VNOX/3ZI+mxXEgLoilbe7X9U0mzXDUuv6QPob3zCD0iK8gNJUX4g\nKcoPJEX5gaQoP5AU5QeSovxAUpQfSIryA0lRfiApyg8kRfmBpCg/kFTTr+6udGf2XkkvzVh0uqTf\n9SzAienXbP2aSyJbu6rMdlZElM99Xuhp+d+xc3syIsZqC1CiX7P1ay6JbO2qKxtP+4GkKD+QVN3l\nH695/2X6NVu/5pLI1q5astX6mh9Afeo+8wOoSS3lt32V7V/afsH2rXVkaMT2DtvbbG+1PVlzlvW2\np2w/PWPZkO2HbT9f/Jx1mrSast1me1dx7LbavrqmbMts/9T2s7afsf2lYnmtx64kVy3HredP+20P\nSPo/SR+TtFPSFknXR8SzPQ3SgO0dksYiovZrwrYvk/SapHsj4oJi2dck7YuI24tfnKdFxC19ku02\nSa/VPXNzMaHMyMyZpSVdK+kzqvHYleRarRqOWx1n/hWSXoiIFyPisKT7Ja2qIUffi4hHJO172+JV\nkjYUtzdo+j9PzzXI1hciYndEPFncPiDpzZmlaz12JblqUUf5z5D08oz7O9VfU36HpE22n7C9tu4w\nsxgupk2XpFckDdcZZhZNZ27upbfNLN03x66dGa+rxht+73RJRIxK+rikzxVPb/tSTL9m66fLNS3N\n3Nwrs8ws/Sd1Hrt2Z7yuWh3l3yVp2Yz7ZxbL+kJE7Cp+Tkl6QP03+/CeNydJLX5O1ZznT/pp5ubZ\nZpZWHxy7fprxuo7yb5F0ru2zbc+X9ClJG2vI8Q62FxdvxMj2YklXqv9mH94oaU1xe42kB2vM8hb9\nMnNzo5mlVfOx67sZryOi5/8kXa3pd/x/Jenv68jQINc5kn5R/Hum7myS7tP008Ajmn5v5EZJSyRN\nSHpe0iZJQ32U7T8lbZP0lKaLNlJTtks0/ZT+KUlbi39X133sSnLVctz4hB+QFG/4AUlRfiApyg8k\nRfmBpCg/kBTlB5Ki/EBSlB9I6v8BZOIGXzoUqLUAAAAASUVORK5CYII=\n", 159 | "text/plain": [ 160 | "" 161 | ] 162 | }, 163 | "metadata": {}, 164 | "output_type": "display_data" 165 | } 166 | ], 167 | "source": [ 168 | "example = 1 #try changing this to see new data\n", 169 | "print('Label:{}'.format(y_train[example]))\n", 170 | "plt.imshow(x_train[example])\n", 171 | "plt.show()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Define input functions" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 180, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "train_input_fn = tf.estimator.inputs.numpy_input_fn(\n", 188 | " x={\"x\": x_train.astype('float32')}, #TODO: figure out why this needs to be float32. 
float16 spins forever, int gives error\n", 189 | " y=y_train.astype('int32'),\n", 190 | " batch_size=100,\n", 191 | " num_epochs=None, #can i change this to 1 and specify epochs at train time?\n", 192 | " shuffle=True)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 181, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "eval_input_fn = tf.estimator.inputs.numpy_input_fn(\n", 202 | " x={\"x\": x_eval.astype('float32')},\n", 203 | " y=y_eval.astype('int32'),\n", 204 | " num_epochs=1,\n", 205 | " shuffle=False)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "### Define model function" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 183, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "#Implementation from: https://www.tensorflow.org/tutorials/layers#building_the_cnn_mnist_classifier\n", 224 | "def cnn_model_fn(features, labels, mode):\n", 225 | " \"\"\"Model function for CNN.\"\"\"\n", 226 | " # Input Layer\n", 227 | " input_layer = tf.expand_dims(features[\"x\"], -1)\n", 228 | "\n", 229 | " # Convolutional Layer #1\n", 230 | " conv1 = tf.layers.conv2d(\n", 231 | " inputs=input_layer,\n", 232 | " filters=32,\n", 233 | " kernel_size=[5, 5],\n", 234 | " padding=\"same\",\n", 235 | " activation=tf.nn.relu)\n", 236 | "\n", 237 | " # Pooling Layer #1\n", 238 | " pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)\n", 239 | "\n", 240 | " # Convolutional Layer #2 and Pooling Layer #2\n", 241 | " conv2 = tf.layers.conv2d(\n", 242 | " inputs=pool1,\n", 243 | " filters=64,\n", 244 | " kernel_size=[5, 5],\n", 245 | " padding=\"same\",\n", 246 | " activation=tf.nn.relu)\n", 247 | " pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)\n", 248 | "\n", 249 | " # Dense Layer\n", 250 | " pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])\n", 251 | " dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)\n", 252 | " dropout = tf.layers.dropout(\n", 253 | " inputs=dense, rate=0.4, training= (mode == tf.estimator.ModeKeys.TRAIN))\n", 254 | "\n", 255 | " # Logits Layer\n", 256 | " logits = tf.layers.dense(inputs=dropout, units=10)\n", 257 | "\n", 258 | " predictions = {\n", 259 | " # Generate predictions (for PREDICT and EVAL mode)\n", 260 | " \"classes\": tf.argmax(input=logits, axis=1),\n", 261 | " # Add `softmax_tensor` to the graph. 
It is used for PREDICT and by the\n", 262 | " # `logging_hook`.\n", 263 | " \"probabilities\": tf.nn.softmax(logits, name=\"softmax_tensor\")\n", 264 | " }\n", 265 | "\n", 266 | " if mode == tf.estimator.ModeKeys.PREDICT:\n", 267 | " return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n", 268 | "\n", 269 | " # Calculate Loss (for both TRAIN and EVAL modes)\n", 270 | " loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)\n", 271 | "\n", 272 | " # Configure the Training Op (for TRAIN mode)\n", 273 | " if mode == tf.estimator.ModeKeys.TRAIN:\n", 274 | " optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)\n", 275 | " train_op = optimizer.minimize(\n", 276 | " loss=loss,\n", 277 | " global_step=tf.train.get_global_step())\n", 278 | " return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)\n", 279 | "\n", 280 | " # Add evaluation metrics (for EVAL mode)\n", 281 | " if mode == tf.estimator.ModeKeys.EVAL:\n", 282 | " eval_metric_ops = {\n", 283 | " \"accuracy\": tf.metrics.accuracy(\n", 284 | " labels=labels, predictions=predictions[\"classes\"])}\n", 285 | " return tf.estimator.EstimatorSpec(\n", 286 | " mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "### Instantiate Estimator" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 184, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "INFO:tensorflow:Using default config.\n", 306 | "INFO:tensorflow:Using config: {'_model_dir': 'trained_custom', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "OUTDIR = \"trained_custom\"\n", 312 | "mnist_classifier = tf.estimator.Estimator(\n", 313 | " model_fn=cnn_model_fn, model_dir=OUTDIR)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### Train and Eval" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 185, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "INFO:tensorflow:Calling model_fn.\n", 333 | "INFO:tensorflow:Done calling model_fn.\n", 334 | "INFO:tensorflow:Create CheckpointSaverHook.\n", 335 | "INFO:tensorflow:Graph was finalized.\n", 336 | "INFO:tensorflow:Running local_init_op.\n", 337 | "INFO:tensorflow:Done running local_init_op.\n", 338 | "INFO:tensorflow:Saving checkpoints for 1 into trained_custom/model.ckpt.\n", 339 | "INFO:tensorflow:loss = 43.540283, step = 1\n", 340 | "INFO:tensorflow:Saving checkpoints for 50 into trained_custom/model.ckpt.\n", 341 | "INFO:tensorflow:Loss for final step: 1.2389966.\n", 342 | "INFO:tensorflow:Calling model_fn.\n", 343 | "INFO:tensorflow:Done calling model_fn.\n", 344 | "INFO:tensorflow:Starting evaluation at 2018-03-16-02:57:42\n", 345 | "INFO:tensorflow:Graph was finalized.\n", 346 | "INFO:tensorflow:Restoring parameters from trained_custom/model.ckpt-50\n", 347 | 
"INFO:tensorflow:Running local_init_op.\n", 348 | "INFO:tensorflow:Done running local_init_op.\n", 349 | "INFO:tensorflow:Finished evaluation at 2018-03-16-02:57:49\n", 350 | "INFO:tensorflow:Saving dict for global step 50: accuracy = 0.8248687, global_step = 50, loss = 0.5431505\n", 351 | "{'accuracy': 0.8248687, 'loss': 0.5431505, 'global_step': 50}\n" 352 | ] 353 | } 354 | ], 355 | "source": [ 356 | "shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time\n", 357 | "mnist_classifier.train(\n", 358 | " input_fn=train_input_fn,\n", 359 | " steps=50) #example uses 20000\n", 360 | "eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)\n", 361 | "print(eval_results)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": true 369 | }, 370 | "outputs": [], 371 | "source": [] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.6.2" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 2 395 | } 396 | -------------------------------------------------------------------------------- /housing_prices/cloud-ml-housing-prices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Housing Prices using Tensorflow + Cloud ML Engine\n", 8 | "\n", 9 | "This notebook will show you how to create a tensorflow model, train it on the cloud in a distributed fashion across multiple CPUs or GPUs, explore the results using Tensorboard, and finally deploy the model for online prediction. We will demonstrate this by building a model to predict housing prices.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import tensorflow as tf" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false, 29 | "slideshow": { 30 | "slide_type": "-" 31 | } 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "print(tf.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Tensorflow APIs\n", 43 | "\n", 44 | "(image: https://www.tensorflow.org/images/tensorflow_programming_environment.png)\n", 45 | "\n", 46 | "Tensorflow is a heirarchical framework. The further down the heirarchy you go, the more flexibility you have, but that more code you have to write. A best practice is start at the highest level of abstraction. Then if you need additional flexibility for some reason drop down one layer. \n", 47 | "\n", 48 | "For this tutorial we will be operating at the highest level of Tensorflow abstraction, using the Estimator API." 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Steps\n", 56 | "\n", 57 | "1. Load raw data\n", 58 | "\n", 59 | "2. Write Tensorflow Code\n", 60 | "\n", 61 | " 1. Define Feature Columns\n", 62 | " \n", 63 | " 2. Define Estimator\n", 64 | "\n", 65 | " 3. 
Define Input Function\n", 66 | " \n", 67 | " 4. Define Serving Function\n", 68 | "\n", 69 | " 5. Define Train and Eval Function\n", 70 | "\n", 71 | "3. Package Code\n", 72 | "\n", 73 | "4. Train\n", 74 | "\n", 75 | "5. Inspect Results\n", 76 | "\n", 77 | "6. Deploy Model\n", 78 | "\n", 79 | "7. Get Predictions" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "### 1) Load Raw Data\n", 87 | "\n", 88 | "This is a publicly available dataset on housing prices in Boston area suburbs circa 1978. It is hosted in a Google Cloud Storage bucket.\n", 89 | "\n", 90 | "For datasets too large to fit in memory you would read the data in batches. Tensorflow provides a queueing mechanism for this which is documented [here](https://www.tensorflow.org/programmers_guide/reading_data).\n", 91 | "\n", 92 | "In our case the dataset is small enough to fit in memory so we will simply read it into a pandas dataframe." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "#download data from GCS and store as pandas dataframe \n", 104 | "data_train = pd.read_csv(\n", 105 | " filepath_or_buffer='https://storage.googleapis.com/vijay-public/boston_housing/housing_train.csv',\n", 106 | " names=[\"CRIM\",\"ZN\",\"INDUS\",\"CHAS\",\"NOX\",\"RM\",\"AGE\",\"DIS\",\"RAD\",\"TAX\",\"PTRATIO\",\"MEDV\"])\n", 107 | "\n", 108 | "data_test = pd.read_csv(\n", 109 | " filepath_or_buffer='https://storage.googleapis.com/vijay-public/boston_housing/housing_test.csv',\n", 110 | " names=[\"CRIM\",\"ZN\",\"INDUS\",\"CHAS\",\"NOX\",\"RM\",\"AGE\",\"DIS\",\"RAD\",\"TAX\",\"PTRATIO\",\"MEDV\"])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "data_train.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "#### Column Descriptions:\n", 129 | "\n", 130 | "1. CRIM: per capita crime rate by town \n", 131 | "2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft. \n", 132 | "3. INDUS: proportion of non-retail business acres per town \n", 133 | "4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) \n", 134 | "5. NOX: nitric oxides concentration (parts per 10 million) \n", 135 | "6. RM: average number of rooms per dwelling \n", 136 | "7. AGE: proportion of owner-occupied units built prior to 1940 \n", 137 | "8. DIS: weighted distances to five Boston employment centres \n", 138 | "9. RAD: index of accessibility to radial highways \n", 139 | "10. TAX: full-value property-tax rate per $10,000 \n", 140 | "11. PTRATIO: pupil-teacher ratio by town \n", 141 | "12. MEDV: Median value of owner-occupied homes" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### 2) Write Tensorflow Code" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "#### 2.A Define Feature Columns\n", 156 | "\n", 157 | "Feature columns are your Estimator's data \"interface.\" They tell the estimator in what format they should expect data and how to interpret it (is it one-hot? sparse? dense? continuous?). See 
https://www.tensorflow.org/api_docs/python/tf/feature_column\n", 158 | "\n", 159 | "\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "FEATURES = [\"CRIM\", \"ZN\", \"INDUS\", \"NOX\", \"RM\",\n", 171 | " \"AGE\", \"DIS\", \"TAX\", \"PTRATIO\"]\n", 172 | "LABEL = \"MEDV\"\n", 173 | "\n", 174 | "feature_cols = [tf.feature_column.numeric_column(k)\n", 175 | " for k in FEATURES] #list of Feature Columns" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "#### 2.B Define Estimator\n", 183 | "\n", 184 | "An Estimator is what actually implements your training, eval and prediction loops. Every estimator has the following methods:\n", 185 | "\n", 186 | "- train() for training\n", 187 | "- evaluate() for evaluation\n", 188 | "- predict() for prediction\n", 189 | "- export_savedmodel() for writing model state to disk\n", 190 | "\n", 191 | "Tensorflow has several canned estimators that already implement these methods (DNNClassifier, LinearClassifier, etc.) or you can implement a custom estimator. Instructions on how to implement a custom estimator are [here](https://www.tensorflow.org/extend/estimators) and an example is [here](https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/blogs/timeseries/rnn_cloudmle.ipynb).\n", 192 | "\n", 193 | "For simplicity we will use a canned estimator. To instantiate an estimator simply pass it what Feature Columns to expect and specify a directory for it to output to.\n", 194 | "\n", 195 | "Notice we wrap the estimator with a function. This is to allow us to specify the 'output_dir' at runtime, instead of having to hardcode it here" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "def generate_estimator(output_dir):\n", 207 | " return tf.estimator.DNNRegressor(feature_columns=feature_cols,\n", 208 | " hidden_units=[10, 10],\n", 209 | " model_dir=output_dir)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "#### 2.C Define Input Function\n", 217 | "\n", 218 | "Now that you have an estimator and it knows what type of data to expect and how to interpret it, you need to actually pass the data to it! This is the job of the input function. \n", 219 | "\n", 220 | "The input function returns a (features, label) tuple\n", 221 | "- features: A python dictionary. Each key is a feature column name and its value is the tensor containing the data for that Feature\n", 222 | "- label: A Tensor containing the label column" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "def generate_input_fn(data_set):\n", 234 | " def input_fn():\n", 235 | " features = {k: tf.constant(data_set[k].values) for k in FEATURES}\n", 236 | " labels = tf.constant(data_set[LABEL].values)\n", 237 | " return features, labels\n", 238 | " return input_fn" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "#### 2.D Define Serving Input Function\n", 246 | "\n", 247 | "To predict with the model, we need to define a serving input function which will be used to read inputs from a user at prediction time. 
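For example, a client may send each feature as an individual scalar in a JSON request, while the trained graph expects batched, rank-2 tensors.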
\n", 248 | "\n", 249 | "Why do we need a separate serving function? Don't we input the same features during training as in serving?\n", 250 | "\n", 251 | "Yes, but we may be *receiving* data in a different format during serving. The serving input function preforms transormations neccessary to get the data provided at prediction time into the format compatible with the Estimator API.\n", 252 | "\n", 253 | "returns a (features, inputs) tuple\n", 254 | "- features: A dict of features to be passed to the Estimator\n", 255 | "- inputs: A dictionary of inputs the predictions server should expect from the user" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "collapsed": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "def serving_input_fn():\n", 267 | " #feature_placeholders are what the caller of the predict() method will have to provide\n", 268 | " feature_placeholders = {\n", 269 | " column.name: tf.placeholder(column.dtype, [None])\n", 270 | " for column in feature_cols\n", 271 | " }\n", 272 | " \n", 273 | " #features are what we actually pass to the estimator\n", 274 | " features = {\n", 275 | " # Inputs are rank 1 so that we can provide scalars to the server\n", 276 | " # but Estimator expects rank 2, so we expand dimension\n", 277 | " key: tf.expand_dims(tensor, -1)\n", 278 | " for key, tensor in feature_placeholders.items()\n", 279 | " }\n", 280 | " return tf.estimator.export.ServingInputReceiver(\n", 281 | " features, feature_placeholders\n", 282 | " )" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "#### 2.E Define Train and Eval Function\n", 290 | "\n", 291 | "Finally to train and evaluate we use tf.estimator.train_and_evaluate()\n", 292 | "\n", 293 | "This function is special because it provides consistent behavior across local and distributed environments.\n", 294 | "\n", 295 | "Meaning if you run on multiple CPUs or GPUs, it takes care of parrallelizing the computation graph across these devices for you! 
\n", 296 | "\n", 297 | "The tran_and_evaluate() function requires three arguments:\n", 298 | "- estimator: we already defined this earlier\n", 299 | "- train_spec: specifies the training input function\n", 300 | "- eval_spec: specifies the eval input function, and also an 'exporter' which uses our serving_input_fn for serving the model\n", 301 | "\n", 302 | "**Note running this cell will give an error because we haven't specified an output_dir, we will do that later**" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "train_spec = tf.estimator.TrainSpec(\n", 314 | " input_fn=generate_input_fn(data_train),\n", 315 | " max_steps=3000)\n", 316 | "\n", 317 | "exporter = tf.estimator.LatestExporter('Servo', serving_input_fn)\n", 318 | "\n", 319 | "eval_spec=tf.estimator.EvalSpec(\n", 320 | " input_fn=generate_input_fn(data_test),\n", 321 | " steps=1,\n", 322 | " exporters=exporter)\n", 323 | "\n", 324 | "tf.estimator.train_and_evaluate(generate_estimator(output_dir), train_spec, eval_spec)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### 3) Package Code\n", 332 | "\n", 333 | "You've now written all the tensoflow code you need!\n", 334 | "\n", 335 | "To make it compatible with Cloud ML Engine we'll combine the above tensorflow code into a single python file with two simple changes\n", 336 | "\n", 337 | "1. Add some boilerplate code to parse the command line arguments required for gcloud.\n", 338 | "2. Use the learn_runner.run() function to run the experiment\n", 339 | "\n", 340 | "We also add an empty \\__init__\\.py file to the folder. This is just the python convention for identifying modules." 
341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "%%bash\n", 352 | "mkdir trainer\n", 353 | "touch trainer/__init__.py" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 84, 359 | "metadata": { 360 | "collapsed": false 361 | }, 362 | "outputs": [ 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "Overwriting trainer/task.py\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "%%writefile trainer/task.py\n", 373 | "\n", 374 | "import argparse\n", 375 | "import pandas as pd\n", 376 | "import tensorflow as tf\n", 377 | "from tensorflow.contrib.learn.python.learn import learn_runner\n", 378 | "from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils\n", 379 | "\n", 380 | "print(tf.__version__)\n", 381 | "tf.logging.set_verbosity(tf.logging.ERROR)\n", 382 | "\n", 383 | "data_train = pd.read_csv(\n", 384 | " filepath_or_buffer='https://storage.googleapis.com/vijay-public/boston_housing/housing_train.csv',\n", 385 | " names=[\"CRIM\",\"ZN\",\"INDUS\",\"CHAS\",\"NOX\",\"RM\",\"AGE\",\"DIS\",\"RAD\",\"TAX\",\"PTRATIO\",\"MEDV\"])\n", 386 | "\n", 387 | "data_test = pd.read_csv(\n", 388 | " filepath_or_buffer='https://storage.googleapis.com/vijay-public/boston_housing/housing_test.csv',\n", 389 | " names=[\"CRIM\",\"ZN\",\"INDUS\",\"CHAS\",\"NOX\",\"RM\",\"AGE\",\"DIS\",\"RAD\",\"TAX\",\"PTRATIO\",\"MEDV\"])\n", 390 | "\n", 391 | "FEATURES = [\"CRIM\", \"ZN\", \"INDUS\", \"NOX\", \"RM\",\n", 392 | " \"AGE\", \"DIS\", \"TAX\", \"PTRATIO\"]\n", 393 | "LABEL = \"MEDV\"\n", 394 | "\n", 395 | "feature_cols = [tf.feature_column.numeric_column(k)\n", 396 | " for k in FEATURES] #list of Feature Columns\n", 397 | "\n", 398 | "def generate_estimator(output_dir):\n", 399 | " return tf.estimator.DNNRegressor(feature_columns=feature_cols,\n", 400 | " hidden_units=[10, 10],\n", 401 | " model_dir=output_dir)\n", 402 | "\n", 403 | "def generate_input_fn(data_set):\n", 404 | " def input_fn():\n", 405 | " features = {k: tf.constant(data_set[k].values) for k in FEATURES}\n", 406 | " labels = tf.constant(data_set[LABEL].values)\n", 407 | " return features, labels\n", 408 | " return input_fn\n", 409 | "\n", 410 | "def serving_input_fn():\n", 411 | " #feature_placeholders are what the caller of the predict() method will have to provide\n", 412 | " feature_placeholders = {\n", 413 | " column.name: tf.placeholder(column.dtype, [None])\n", 414 | " for column in feature_cols\n", 415 | " }\n", 416 | " \n", 417 | " #features are what we actually pass to the estimator\n", 418 | " features = {\n", 419 | " # Inputs are rank 1 so that we can provide scalars to the server\n", 420 | " # but Estimator expects rank 2, so we expand dimension\n", 421 | " key: tf.expand_dims(tensor, -1)\n", 422 | " for key, tensor in feature_placeholders.items()\n", 423 | " }\n", 424 | " return tf.estimator.export.ServingInputReceiver(\n", 425 | " features, feature_placeholders\n", 426 | " )\n", 427 | "\n", 428 | "train_spec = tf.estimator.TrainSpec(\n", 429 | " input_fn=generate_input_fn(data_train),\n", 430 | " max_steps=3000)\n", 431 | "\n", 432 | "exporter = tf.estimator.LatestExporter('Servo', serving_input_fn)\n", 433 | "\n", 434 | "eval_spec=tf.estimator.EvalSpec(\n", 435 | " input_fn=generate_input_fn(data_test),\n", 436 | " steps=1,\n", 437 | " exporters=exporter)\n", 438 | "\n", 439 | "######START CLOUD ML ENGINE 
BOILERPLATE######\n", 440 | "if __name__ == '__main__':\n", 441 | " parser = argparse.ArgumentParser()\n", 442 | " # Input Arguments\n", 443 | " parser.add_argument(\n", 444 | " '--output_dir',\n", 445 | " help='GCS location to write checkpoints and export models',\n", 446 | " required=True\n", 447 | " )\n", 448 | " parser.add_argument(\n", 449 | " '--job-dir',\n", 450 | " help='this model ignores this field, but it is required by gcloud',\n", 451 | " default='junk'\n", 452 | " )\n", 453 | " args = parser.parse_args()\n", 454 | " arguments = args.__dict__\n", 455 | " output_dir = arguments.pop('output_dir')\n", 456 | "######END CLOUD ML ENGINE BOILERPLATE######\n", 457 | "\n", 458 | " #initiate training job\n", 459 | " tf.estimator.train_and_evaluate(generate_estimator(output_dir), train_spec, eval_spec)\n" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": { 465 | "collapsed": true 466 | }, 467 | "source": [ 468 | "### 4) Train\n", 469 | "Now that our code is packaged we can invoke it using the gcloud command line tool to run the training. \n", 470 | "\n", 471 | "Note: Since our dataset is so small and our model is simple the overhead of provisioning the cluster is longer than the actual training time. Accordingly you'll notice the single VM cloud training takes longer than the local training, and the distributed cloud training takes longer than single VM cloud. For larger datasets and more complex models this will reverse" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "#### Set Environment Vars\n", 479 | "We'll create environment variables for our project name GCS Bucket and reference this in future commands.\n", 480 | "\n", 481 | "If you do not have a GCS bucket, you can create one using [these](https://cloud.google.com/storage/docs/creating-buckets) instructions." 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 85, 487 | "metadata": { 488 | "collapsed": true 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "GCS_BUCKET = 'gs://BUCKET_NAME' #CHANGE THIS TO YOUR BUCKET\n", 493 | "PROJECT = 'PROJECT_ID' #CHANGE THIS TO YOUR PROJECT ID\n", 494 | "REGION = 'us-central1' #OPTIONALLY CHANGE THIS" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 86, 500 | "metadata": { 501 | "collapsed": true 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "import os\n", 506 | "os.environ['GCS_BUCKET'] = GCS_BUCKET\n", 507 | "os.environ['PROJECT'] = PROJECT\n", 508 | "os.environ['REGION'] = REGION" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "#### Run local\n", 516 | "It's a best practice to first run locally on a small dataset to check for errors. Note you can ignore the warnings in this case, as long as there are no errors." 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 87, 522 | "metadata": { 523 | "collapsed": false 524 | }, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "1.5.0\n" 531 | ] 532 | }, 533 | { 534 | "name": "stderr", 535 | "output_type": "stream", 536 | "text": [ 537 | "/usr/local/lib/python2.7/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 538 | " from ._conv import register_converters as _register_converters\n", 539 | "2018-03-05 18:56:25.561527: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA\n" 540 | ] 541 | } 542 | ], 543 | "source": [ 544 | "%%bash\n", 545 | "gcloud ai-platform local train \\\n", 546 | " --module-name=trainer.task \\\n", 547 | " --package-path=trainer \\\n", 548 | " -- \\\n", 549 | " --output_dir='./output'" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "#### Run on cloud (1 cloud ML unit)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "First we specify which GCP project to use." 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 88, 569 | "metadata": { 570 | "collapsed": false 571 | }, 572 | "outputs": [ 573 | { 574 | "name": "stderr", 575 | "output_type": "stream", 576 | "text": [ 577 | "Updated property [core/project].\n" 578 | ] 579 | } 580 | ], 581 | "source": [ 582 | "%%bash\n", 583 | "gcloud config set project $PROJECT" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "Then we specify which GCS bucket to write to and a job name.\n", 591 | "Job names submitted to ML Engine must be unique within a project, so we append the system date/time. Make sure the GCS_BUCKET variable you set above points to a bucket you own." 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 89, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [ 601 | { 602 | "name": "stdout", 603 | "output_type": "stream", 604 | "text": [ 605 | "jobId: housing_180305_185634\n", 606 | "state: QUEUED\n" 607 | ] 608 | }, 609 | { 610 | "name": "stderr", 611 | "output_type": "stream", 612 | "text": [ 613 | "Job [housing_180305_185634] submitted successfully.\n", 614 | "Your job is still active. You may view the status of your job with the command\n", 615 | "\n", 616 | " $ gcloud ai-platform jobs describe housing_180305_185634\n", 617 | "\n", 618 | "or continue streaming the logs with the command\n", 619 | "\n", 620 | " $ gcloud ai-platform jobs stream-logs housing_180305_185634\n" 621 | ] 622 | } 623 | ], 624 | "source": [ 625 | "%%bash\n", 626 | "JOBNAME=housing_$(date -u +%y%m%d_%H%M%S)\n", 627 | "\n", 628 | "gcloud ai-platform jobs submit training $JOBNAME \\\n", 629 | " --region=$REGION \\\n", 630 | " --module-name=trainer.task \\\n", 631 | " --package-path=./trainer \\\n", 632 | " --job-dir=$GCS_BUCKET/$JOBNAME/ \\\n", 633 | " --runtime-version 1.4 \\\n", 634 | " -- \\\n", 635 | " --output_dir=$GCS_BUCKET/$JOBNAME/output\n" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "#### Run on cloud (10 cloud ML units)\n", 643 | "Because we are using the TF Estimators interface, distributed computing just works! The only change we need to make to run in a distributed fashion is to add the [--scale-tier](https://cloud.google.com/ml/pricing#ml_training_units_by_scale_tier) argument. 
Cloud ML Engine then takes care of distributing the training across devices for you!\n" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 90, 649 | "metadata": { 650 | "collapsed": false 651 | }, 652 | "outputs": [ 653 | { 654 | "name": "stdout", 655 | "output_type": "stream", 656 | "text": [ 657 | "jobId: housing_180305_185638\n", 658 | "state: QUEUED\n" 659 | ] 660 | }, 661 | { 662 | "name": "stderr", 663 | "output_type": "stream", 664 | "text": [ 665 | "Job [housing_180305_185638] submitted successfully.\n", 666 | "Your job is still active. You may view the status of your job with the command\n", 667 | "\n", 668 | " $ gcloud ai-platform jobs describe housing_180305_185638\n", 669 | "\n", 670 | "or continue streaming the logs with the command\n", 671 | "\n", 672 | " $ gcloud ai-platform jobs stream-logs housing_180305_185638\n" 673 | ] 674 | } 675 | ], 676 | "source": [ 677 | "%%bash\n", 678 | "JOBNAME=housing_$(date -u +%y%m%d_%H%M%S)\n", 679 | "\n", 680 | "gcloud ai-platform jobs submit training $JOBNAME \\\n", 681 | " --region=$REGION \\\n", 682 | " --module-name=trainer.task \\\n", 683 | " --package-path=./trainer \\\n", 684 | " --job-dir=$GCS_BUCKET/$JOBNAME \\\n", 685 | " --runtime-version 1.4 \\\n", 686 | " --scale-tier=STANDARD_1 \\\n", 687 | " -- \\\n", 688 | " --output_dir=$GCS_BUCKET/$JOBNAME/output" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "#### Run on cloud GPU (3 cloud ML units)" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": {}, 701 | "source": [ 702 | "It also works with GPUs!\n", 703 | "\n", 704 | "\"BASIC_GPU\" corresponds to one Tesla K80 at the time of this writing; hardware is subject to change. 1 GPU is charged as 3 cloud ML units." 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 78, 710 | "metadata": { 711 | "collapsed": false 712 | }, 713 | "outputs": [ 714 | { 715 | "name": "stdout", 716 | "output_type": "stream", 717 | "text": [ 718 | "jobId: housing_180305_183840\n", 719 | "state: QUEUED\n" 720 | ] 721 | }, 722 | { 723 | "name": "stderr", 724 | "output_type": "stream", 725 | "text": [ 726 | "Job [housing_180305_183840] submitted successfully.\n", 727 | "Your job is still active. 
You may view the status of your job with the command\n", 728 | "\n", 729 | " $ gcloud ai-platform jobs describe housing_180305_183840\n", 730 | "\n", 731 | "or continue streaming the logs with the command\n", 732 | "\n", 733 | " $ gcloud ai-platform jobs stream-logs housing_180305_183840\n" 734 | ] 735 | } 736 | ], 737 | "source": [ 738 | "%%bash\n", 739 | "JOBNAME=housing_$(date -u +%y%m%d_%H%M%S)\n", 740 | "\n", 741 | "gcloud ai-platform jobs submit training $JOBNAME \\\n", 742 | " --region=$REGION \\\n", 743 | " --module-name=trainer.task \\\n", 744 | " --package-path=./trainer \\\n", 745 | " --job-dir=$GCS_BUCKET/$JOBNAME \\\n", 746 | " --runtime-version 1.4 \\\n", 747 | " --scale-tier=BASIC_GPU \\\n", 748 | " -- \\\n", 749 | " --output_dir=$GCS_BUCKET/$JOBNAME/output" 750 | ] 751 | }, 752 | { 753 | "cell_type": "markdown", 754 | "metadata": {}, 755 | "source": [ 756 | "#### Run on 8 cloud GPUs (24 cloud ML units)\n", 757 | "To train across multiple GPUs you use a [custom scale tier](https://cloud.google.com/ml/docs/concepts/training-overview#job_configuration_parameters).\n", 758 | "\n", 759 | "You specify the number and types of machines you want to run on in a config.yaml, then reference it via the --config command line argument.\n", 760 | "\n", 761 | "Here I am specifying a master node with machine type complex_model_m_gpu and one worker node of the same type. Each complex_model_m_gpu has 4 GPUs, so this job will run on 2x4=8 GPUs total. \n", 762 | "\n", 763 | "WARNING: The default project quota is 10 cloud ML units, so unless you have requested a quota increase you will get a quota exceeded error. This command is just for illustrative purposes." 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 79, 769 | "metadata": { 770 | "collapsed": false 771 | }, 772 | "outputs": [ 773 | { 774 | "name": "stdout", 775 | "output_type": "stream", 776 | "text": [ 777 | "Overwriting config.yaml\n" 778 | ] 779 | } 780 | ], 781 | "source": [ 782 | "%%writefile config.yaml\n", 783 | "trainingInput:\n", 784 | " scaleTier: CUSTOM\n", 785 | " masterType: complex_model_m_gpu\n", 786 | " workerType: complex_model_m_gpu\n", 787 | " workerCount: 1" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 80, 793 | "metadata": { 794 | "collapsed": false 795 | }, 796 | "outputs": [ 797 | { 798 | "name": "stdout", 799 | "output_type": "stream", 800 | "text": [ 801 | "jobId: housing_180305_183843\n", 802 | "state: QUEUED\n" 803 | ] 804 | }, 805 | { 806 | "name": "stderr", 807 | "output_type": "stream", 808 | "text": [ 809 | "Job [housing_180305_183843] submitted successfully.\n", 810 | "Your job is still active. 
You may view the status of your job with the command\n", 811 | "\n", 812 | " $ gcloud ai-platform jobs describe housing_180305_183843\n", 813 | "\n", 814 | "or continue streaming the logs with the command\n", 815 | "\n", 816 | " $ gcloud ai-platform jobs stream-logs housing_180305_183843\n" 817 | ] 818 | } 819 | ], 820 | "source": [ 821 | "%%bash\n", 822 | "JOBNAME=housing_$(date -u +%y%m%d_%H%M%S)\n", 823 | "\n", 824 | "gcloud ai-platform jobs submit training $JOBNAME \\\n", 825 | " --region=$REGION \\\n", 826 | " --module-name=trainer.task \\\n", 827 | " --package-path=./trainer \\\n", 828 | " --job-dir=$GCS_BUCKET/$JOBNAME \\\n", 829 | " --runtime-version 1.4 \\\n", 830 | " --config config.yaml \\\n", 831 | " -- \\\n", 832 | " --output_dir=$GCS_BUCKET/$JOBNAME/output" 833 | ] 834 | }, 835 | { 836 | "cell_type": "markdown", 837 | "metadata": {}, 838 | "source": [ 839 | "### 5) Inspect Results Using TensorBoard\n", 840 | "\n", 841 | "TensorBoard is a utility that allows you to visualize your results.\n", 842 | "\n", 843 | "Expand the 'loss' graph. What is your evaluation loss? This is squared error, so take the square root of it to get the average error in dollars. Does this seem like a reasonable margin of error for predicting a housing price?\n", 844 | "\n", 845 | "To activate TensorBoard within the JupyterLab UI navigate to **File** - **New Launcher**. Then double-click the 'Tensorboard' icon on the bottom row.\n", 846 | "\n", 847 | "TensorBoard will appear in the new tab. Navigate through its tabs to see the active TensorBoard. The 'Graphs' and 'Projector' tabs offer additional detail, such as a visualization of the model's computation graph.\n", 848 | "\n", 849 | "You may close the TensorBoard tab when you are finished exploring." 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "### 6) Deploy Model For Predictions\n", 857 | "\n", 858 | "Cloud ML Engine has a prediction service that will wrap our TensorFlow model with a REST API and allow remote clients to get predictions.\n", 859 | "\n", 860 | "You can deploy the model from the Google Cloud Console GUI, or you can use the gcloud command line tool. We will use the latter method. Note this will take up to 5 minutes."
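, "\n", "For context on the MODEL_LOCATION line in the next cell: the LatestExporter writes each export under a timestamped subdirectory of output/export/Servo, so the newest model is simply the largest timestamp. A minimal Python sketch of the same lookup the bash below performs with ls | tail (assuming you ran the 'Run local' step above, so ./output exists):\n", "\n", "    import os\n", "\n", "    export_base = 'output/export/Servo'\n", "    # Export dirs are named by Unix timestamp, so the lexicographic max is the newest.\n", "    latest = max(os.listdir(export_base))\n", "    print(os.path.join(export_base, latest))"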
861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": 96, 866 | "metadata": { 867 | "collapsed": false 868 | }, 869 | "outputs": [ 870 | { 871 | "name": "stderr", 872 | "output_type": "stream", 873 | "text": [ 874 | "Creating version (this might take a few minutes)......\n", 875 | "............................................................................................done.\n" 876 | ] 877 | } 878 | ], 879 | "source": [ 880 | "%%bash\n", 881 | "MODEL_NAME=\"housing_prices\"\n", 882 | "MODEL_VERSION=\"v1\"\n", 883 | "MODEL_LOCATION=output/export/Servo/$(ls output/export/Servo | tail -1) \n", 884 | "\n", 885 | "#gcloud ai-platform versions delete ${MODEL_VERSION} --model ${MODEL_NAME} #Uncomment to overwrite existing version\n", 886 | "#gcloud ai-platform models delete ${MODEL_NAME} #Uncomment to overwrite existing model\n", 887 | "gcloud ai-platform models create ${MODEL_NAME} --regions $REGION\n", 888 | "gcloud ai-platform versions create ${MODEL_VERSION} --model ${MODEL_NAME} --origin ${MODEL_LOCATION} --staging-bucket=$GCS_BUCKET" 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "### 7) Get Predictions\n", 896 | "\n", 897 | "There are two flavors of the ML Engine Prediction Service: batch and online.\n", 898 | "\n", 899 | "Online prediction is more appropriate for latency-sensitive requests, as results are returned quickly and synchronously. \n", 900 | "\n", 901 | "Batch prediction is more appropriate for large prediction requests that you only need to run a few times a day.\n", 902 | "\n", 903 | "The prediction service expects requests in standard JSON format, so first we will create a JSON file with a couple of housing records.\n" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": 68, 909 | "metadata": { 910 | "collapsed": false 911 | }, 912 | "outputs": [ 913 | { 914 | "name": "stdout", 915 | "output_type": "stream", 916 | "text": [ 917 | "Writing records.json\n" 918 | ] 919 | } 920 | ], 921 | "source": [ 922 | "%%writefile records.json\n", 923 | "{\"CRIM\": 0.00632,\"ZN\": 18.0,\"INDUS\": 2.31,\"NOX\": 0.538, \"RM\": 6.575, \"AGE\": 65.2, \"DIS\": 4.0900, \"TAX\": 296.0, \"PTRATIO\": 15.3}\n", 924 | "{\"CRIM\": 0.00332,\"ZN\": 0.0,\"INDUS\": 2.31,\"NOX\": 0.437, \"RM\": 7.7, \"AGE\": 40.0, \"DIS\": 5.0900, \"TAX\": 250.0, \"PTRATIO\": 17.3}" 925 | ] 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "metadata": {}, 930 | "source": [ 931 | "Now we will pass this file to the prediction service using the gcloud command line tool. Results are returned immediately!" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 69, 937 | "metadata": { 938 | "collapsed": false 939 | }, 940 | "outputs": [ 941 | { 942 | "name": "stdout", 943 | "output_type": "stream", 944 | "text": [ 945 | "PREDICTIONS\r\n", 946 | "[26098.3671875]\r\n", 947 | "[20871.384765625]\r\n", 948 | "\r\n", 949 | "\r\n", 950 | "Updates are available for some Cloud SDK components. To install them,\r\n", 951 | "please run:\r\n", 952 | " $ gcloud components update\r\n", 953 | "\r\n" 954 | ] 955 | } 956 | ], 957 | "source": [ 958 | "!gcloud ai-platform predict --model housing_prices --json-instances records.json" 959 | ] 960 | }, 961 | { 962 | "cell_type": "markdown", 963 | "metadata": {}, 964 | "source": [ 965 | "### Conclusion\n", 966 | "\n", 967 | "#### What we covered\n", 968 | "1. How to use TensorFlow's high-level Estimator API\n", 969 | "2. 
How to deploy TensorFlow code for distributed training in the cloud\n", 970 | "3. How to evaluate results using TensorBoard\n", 971 | "4. How to deploy the resulting model to the cloud for online prediction\n", 972 | "\n", 973 | "#### What we didn't cover\n", 974 | "1. How to leverage larger-than-memory datasets using TensorFlow's queueing system\n", 975 | "2. How to create synthetic features from our raw data to aid learning (Feature Engineering)\n", 976 | "3. How to improve model performance by finding the ideal hyperparameters using Cloud ML Engine's [HyperTune](https://cloud.google.com/ml-engine/docs/how-tos/using-hyperparameter-tuning) feature\n", 977 | "\n", 978 | "This lab is a great start, but adding in the above concepts is critical in getting your models to production-ready quality. These concepts are covered in Google's 1-week on-demand TensorFlow + Cloud ML course: https://www.coursera.org/learn/serverless-machine-learning-gcp" 979 | ] 980 | } 981 | ], 982 | "metadata": { 983 | "anaconda-cloud": {}, 984 | "kernelspec": { 985 | "display_name": "Python 2", 986 | "language": "python", 987 | "name": "python2" 988 | }, 989 | "language_info": { 990 | "codemirror_mode": { 991 | "name": "ipython", 992 | "version": 2 993 | }, 994 | "file_extension": ".py", 995 | "mimetype": "text/x-python", 996 | "name": "python", 997 | "nbconvert_exporter": "python", 998 | "pygments_lexer": "ipython2", 999 | "version": "2.7.12" 1000 | } 1001 | }, 1002 | "nbformat": 4, 1003 | "nbformat_minor": 2 1004 | } 1005 | -------------------------------------------------------------------------------- /DRAFT/keras-bow-model-multi-label-hypertune.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab": { 7 | "autoexec": { 8 | "startup": false, 9 | "wait_interval": 0 10 | } 11 | }, 12 | "colab_type": "code", 13 | "collapsed": true, 14 | "id": "frTMl3sShA3P" 15 | }, 16 | "source": [ 17 | "# Multi-Label Text Classification Using Keras + Cloud ML Engine \n", 18 | "\n", 19 | "\n", 20 | "\n", 21 | "#### This notebook will demonstrate the following steps:\n", 22 | "\n", 23 | "1. Load Raw Data\n", 24 | "2. Explore Data\n", 25 | "3. Preprocess Data\n", 26 | "4. Construct a model that learns to tag Stack Overflow posts\n", 27 | "5. Use Cloud ML Engine's automatic hyperparameter tuning feature to refine the model\n", 28 | "6. Inspect the results using TensorBoard\n", 29 | "7. Deploy the final model to production using Cloud ML Engine's online prediction service\n", 30 | "\n", 31 | "This notebook is intended to be run on Google Cloud Datalab: https://cloud.google.com/datalab/docs/quickstarts\n", 32 | "Datalab will have the required libraries installed by default for this code to work. If you choose to run this code outside of Datalab you may run into version and dependency issues which you will need to resolve."
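, "\n", "If you are outside Datalab, a quick way to check whether your environment matches what this notebook was tested with (a minimal sketch; the pinned install follows in the next cell):\n", "\n", "    import sys\n", "    import tensorflow as tf\n", "\n", "    print(sys.version_info)  # this notebook was authored against a Python 2.7 kernel\n", "    print(tf.__version__)    # tested with TensorFlow 1.4 (see the pip install below)"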
33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 111, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "\n", 46 | " \n", 47 | " \n", 54 | " " 55 | ], 56 | "text/plain": [ 57 | "" 58 | ] 59 | }, 60 | "metadata": {}, 61 | "output_type": "display_data" 62 | }, 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "Requirement already up-to-date: tensorflow==1.4 in /usr/local/lib/python2.7/dist-packages\n", 68 | "Requirement already up-to-date: mock>=2.0.0 in /usr/local/lib/python2.7/dist-packages (from tensorflow==1.4)\n", 69 | "Requirement already up-to-date: tensorflow-tensorboard<0.5.0,>=0.4.0rc1 in /usr/local/lib/python2.7/dist-packages (from tensorflow==1.4)\n", 70 | "Requirement already up-to-date: numpy>=1.12.1 in /usr/local/lib/python2.7/dist-packages (from tensorflow==1.4)\n", 71 | "Requirement already up-to-date: backports.weakref>=1.0rc1 in /usr/local/lib/python2.7/dist-packages (from tensorflow==1.4)\n", 72 | "Requirement already up-to-date: wheel in /usr/local/lib/python2.7/dist-packages (from tensorflow==1.4)\n", 73 | "Requirement already up-to-date: six>=1.10.0 in /usr/local/lib/python2.7/dist-packages (from tensorflow==1.4)\n", 74 | "Requirement already up-to-date: protobuf>=3.3.0 in /usr/local/lib/python2.7/dist-packages (from tensorflow==1.4)\n", 75 | "Requirement already up-to-date: enum34>=1.1.6 in /usr/local/lib/python2.7/dist-packages (from tensorflow==1.4)\n", 76 | "Requirement already up-to-date: funcsigs>=1; python_version < \"3.3\" in /usr/local/lib/python2.7/dist-packages (from mock>=2.0.0->tensorflow==1.4)\n", 77 | "Requirement already up-to-date: pbr>=0.11 in /usr/local/lib/python2.7/dist-packages (from mock>=2.0.0->tensorflow==1.4)\n", 78 | "Requirement already up-to-date: bleach==1.5.0 in /usr/local/lib/python2.7/dist-packages (from tensorflow-tensorboard<0.5.0,>=0.4.0rc1->tensorflow==1.4)\n", 79 | "Requirement already up-to-date: markdown>=2.6.8 in /usr/local/lib/python2.7/dist-packages (from tensorflow-tensorboard<0.5.0,>=0.4.0rc1->tensorflow==1.4)\n", 80 | "Requirement already up-to-date: futures>=3.1.1; python_version < \"3.2\" in /usr/local/lib/python2.7/dist-packages (from tensorflow-tensorboard<0.5.0,>=0.4.0rc1->tensorflow==1.4)\n", 81 | "Requirement already up-to-date: html5lib==0.9999999 in /usr/local/lib/python2.7/dist-packages (from tensorflow-tensorboard<0.5.0,>=0.4.0rc1->tensorflow==1.4)\n", 82 | "Requirement already up-to-date: werkzeug>=0.11.10 in /usr/local/lib/python2.7/dist-packages (from tensorflow-tensorboard<0.5.0,>=0.4.0rc1->tensorflow==1.4)\n", 83 | "Collecting setuptools (from protobuf>=3.3.0->tensorflow==1.4)\n", 84 | " Downloading setuptools-36.7.2-py2.py3-none-any.whl (482kB)\n", 85 | "\u001b[K 100% |████████████████████████████████| 491kB 2.0MB/s \n", 86 | "\u001b[?25hInstalling collected packages: setuptools\n", 87 | " Found existing installation: setuptools 36.7.1\n", 88 | " Uninstalling setuptools-36.7.1:\n", 89 | " Successfully uninstalled setuptools-36.7.1\n", 90 | "Successfully installed setuptools-36.7.2\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "# This code was tested with TensorFlow v1.4\n", 96 | "# The import statements will not work with earlier versions, because Keras is in tf.contrib in those versions\n", 97 | "!pip install --upgrade tensorflow==1.4" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 112, 103 | "metadata": { 104 | "colab": { 105 | "autoexec": { 106 | "startup": 
false, 107 | "wait_interval": 0 108 | }, 109 | "height": 321, 110 | "output_extras": [ 111 | { 112 | "item_id": 1 113 | } 114 | ] 115 | }, 116 | "colab_type": "code", 117 | "collapsed": false, 118 | "executionInfo": { 119 | "elapsed": 2880, 120 | "status": "error", 121 | "timestamp": 1505781339378, 122 | "user": { 123 | "displayName": "Sara Robinson", 124 | "photoUrl": "//lh4.googleusercontent.com/-RR9n0dvbwgI/AAAAAAAAAAI/AAAAAAAAMYM/SOr5ZExpvXE/s50-c-k-no/photo.jpg", 125 | "userId": "112510032804989247452" 126 | }, 127 | "user_tz": 240 128 | }, 129 | "id": "783h64rGhA3T", 130 | "outputId": "d447b2ab-e321-4ee5-abd4-de2c0116302f" 131 | }, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/html": [ 136 | "\n", 137 | " \n", 138 | " \n", 145 | " " 146 | ], 147 | "text/plain": [ 148 | "" 149 | ] 150 | }, 151 | "metadata": {}, 152 | "output_type": "display_data" 153 | }, 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "You have TensorFlow version 1.4.0\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "from __future__ import absolute_import\n", 164 | "from __future__ import division\n", 165 | "from __future__ import print_function\n", 166 | "\n", 167 | "import itertools\n", 168 | "import os\n", 169 | "\n", 170 | "%matplotlib inline\n", 171 | "import matplotlib.pyplot as plt\n", 172 | "import numpy as np\n", 173 | "import pandas as pd\n", 174 | "import tensorflow as tf\n", 175 | "import googleapiclient.discovery\n", 176 | "\n", 177 | "from sklearn.preprocessing import LabelBinarizer, LabelEncoder\n", 178 | "from sklearn.metrics import confusion_matrix\n", 179 | "\n", 180 | "from tensorflow.python.keras.models import Sequential\n", 181 | "from tensorflow.python.keras.layers import Dense, Activation, Dropout\n", 182 | "from tensorflow.python.keras.preprocessing import text, sequence\n", 183 | "from tensorflow.python.keras import utils\n", 184 | "from tensorflow.contrib.saved_model.python.saved_model.utils import simple_save\n", 185 | "\n", 186 | "from collections import Counter\n", 187 | "\n", 188 | "print(\"You have TensorFlow version\", tf.__version__)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "### 1) Load Raw Data\n", 196 | "\n", 197 | "We will use a publicly available dataset of Stack Overflow posts. It is hosted for free on Google's BigQuery platform [here](https://bigquery.cloud.google.com/table/bigquery-public-data:stackoverflow.posts_questions?pli=1&tab=details).\n", 198 | "\n", 199 | "We will extract a subset of this data using [this](https://bigquery.cloud.google.com/savedquery/38969729279:919b6f9f680b4cc6ace82632eeb357fd) query.\n", 200 | "\n", 201 | "Datalab has a built-in library to easily load data from BigQuery, which we will use below." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 114, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/html": [ 214 | "\n", 215 | " \n", 216 | " \n", 223 | " " 224 | ], 225 | "text/plain": [ 226 | "" 227 | ] 228 | }, 229 | "metadata": {}, 230 | "output_type": "display_data" 231 | }, 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "Loaded 1000 rows\n" 237 | ] 238 | }, 239 | { 240 | "data": { 241 | "text/html": [ 242 | "
\n", 243 | "\n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | "
tagspost
0c#how to speed up port scanner c# i created ve...
1c#program does not contain a static ‘main’ metho...
2c#c# default value before generic i have one ol...
3c#unable to cast object of type htmlagilitypack...
4phphow to match non-keyboard characters using php...
5phpphp script for inserting images i ve got thes...
6phpphp list structure let s say i have an array ...
7phpreverse a string in php without using any stri...
8phpwhen would empty() return false and $var == ...
9phpproper way of using output of one class in ano...
\n", 304 | "
" 305 | ], 306 | "text/plain": [ 307 | " tags post\n", 308 | "0 c# how to speed up port scanner c# i created ve...\n", 309 | "1 c# program does not contain a static ‘main’ metho...\n", 310 | "2 c# c# default value before generic i have one ol...\n", 311 | "3 c# unable to cast object of type htmlagilitypack...\n", 312 | "4 php how to match non-keyboard characters using php...\n", 313 | "5 php php script for inserting images i ve got thes...\n", 314 | "6 php php list structure let s say i have an array ...\n", 315 | "7 php reverse a string in php without using any stri...\n", 316 | "8 php when would empty() return false and $var == ...\n", 317 | "9 php proper way of using output of one class in ano..." 318 | ] 319 | }, 320 | "execution_count": 114, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "import google.datalab.bigquery as bq\n", 327 | "\n", 328 | "query = \"\"\"\n", 329 | "SELECT tags, TRIM(LOWER(REGEXP_REPLACE(CONCAT(title, \\' \\', body), r\\'[\\\"\\\\n\\\\\\'?,]|

|

\\',\\\" \\\"))) as post \n", 330 | "FROM `bigquery-public-data.stackoverflow.posts_questions`\n", 331 | "WHERE REGEXP_CONTAINS(tags, r\\\"javascript|java|c#|php|android|jquery|python\\\") \n", 332 | "LIMIT 1000\n", 333 | "\"\"\"\n", 334 | "\n", 335 | "data = bq.Query(query).execute(output_options=bq.QueryOutput.dataframe()).result()\n", 336 | "NUM_ROWS = data.shape[0]\n", 337 | "print(\"Loaded {} rows\".format(NUM_ROWS))\n", 338 | "data.head(10)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "If you're running from a jupyter notebook on your laptop/workstation (as opposed to datalab) I've hosted the data as a static .CSV file in a publically accessible URL for convencience." 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 115, 351 | "metadata": { 352 | "colab": { 353 | "autoexec": { 354 | "startup": false, 355 | "wait_interval": 0 356 | } 357 | }, 358 | "colab_type": "code", 359 | "collapsed": false, 360 | "id": "c7te21f7hA3V" 361 | }, 362 | "outputs": [ 363 | { 364 | "data": { 365 | "text/html": [ 366 | "\n", 367 | " \n", 368 | " \n", 375 | " " 376 | ], 377 | "text/plain": [ 378 | "" 379 | ] 380 | }, 381 | "metadata": {}, 382 | "output_type": "display_data" 383 | } 384 | ], 385 | "source": [ 386 | "# alternative way to download data for non-datalab users\n", 387 | "# uncomment below lines to run\n", 388 | "\n", 389 | "#data = pd.read_csv(\"https://storage.googleapis.com/vijay-public/text_classification/results-1000.csv\")\n", 390 | "#NUM_ROWS = data.shape[0]\n", 391 | "#print(\"Loaded {} rows\".format(NUM_ROWS))\n", 392 | "#data.head()" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "### 2) Data Exploration \n", 400 | "\n", 401 | "For multi-label exploration it's useful to get an idea of the distribution of our labels. Here we will count the number of occurences of each of most common labels. " 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 116, 407 | "metadata": { 408 | "collapsed": false 409 | }, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/html": [ 414 | "\n", 415 | " \n", 416 | " \n", 423 | " " 424 | ], 425 | "text/plain": [ 426 | "" 427 | ] 428 | }, 429 | "metadata": {}, 430 | "output_type": "display_data" 431 | }, 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "5 most common classes:\n" 437 | ] 438 | }, 439 | { 440 | "data": { 441 | "text/plain": [ 442 | "[(u'javascript', 214),\n", 443 | " (u'php', 176),\n", 444 | " (u'java', 166),\n", 445 | " (u'c#', 163),\n", 446 | " (u'android', 148)]" 447 | ] 448 | }, 449 | "execution_count": 116, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "#Generate list of N most common labels\n", 456 | "NUM_CLASSES = 5\n", 457 | "labels_list = []\n", 458 | "\n", 459 | "counts = Counter('|'.join(data['tags'].tolist()).split('|'))\n", 460 | "classes = counts.most_common(NUM_CLASSES)\n", 461 | "\n", 462 | "for i in range(0,NUM_CLASSES):\n", 463 | " labels_list.append(classes[i][0])\n", 464 | " \n", 465 | "print (\"{} most common classes:\".format(NUM_CLASSES))\n", 466 | "classes" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "Since our ML algorithm will expect numbers as labels, not words, we define utility function to switch back and forth between the human-friendly text and machine-friendly vector representation of the labels. 
" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 117, 479 | "metadata": { 480 | "collapsed": false 481 | }, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/html": [ 486 | "\n", 487 | " \n", 488 | " \n", 495 | " " 496 | ], 497 | "text/plain": [ 498 | "" 499 | ] 500 | }, 501 | "metadata": {}, 502 | "output_type": "display_data" 503 | }, 504 | { 505 | "name": "stdout", 506 | "output_type": "stream", 507 | "text": [ 508 | "php\n", 509 | "[0 1 0 0 0]\n" 510 | ] 511 | }, 512 | { 513 | "data": { 514 | "text/plain": [ 515 | "[u'php']" 516 | ] 517 | }, 518 | "execution_count": 117, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "#labels_list: A list of the valid classes\n", 525 | "#tags: A list of tags for a post\n", 526 | "#returns an ndarray with ones for the active classes\n", 527 | "def labels_to_array(tags,labels_list=labels_list):\n", 528 | " array = np.zeros(len(labels_list),dtype=np.int8)\n", 529 | " tags = tags.split('|') #split tags from pipe separated string into list\n", 530 | " for tag in tags:\n", 531 | " try:\n", 532 | " array[labels_list.index(tag)] = 1\n", 533 | " except ValueError: \n", 534 | " None\n", 535 | " return array\n", 536 | "\n", 537 | "#translate machine readable array back to human labels\n", 538 | "def array_to_labels(array, labels_list=labels_list, threshold = 1):\n", 539 | " labels = []\n", 540 | " i=0\n", 541 | " for flag in array:\n", 542 | " if flag >= threshold: \n", 543 | " labels.append(labels_list[i])\n", 544 | " i=i+1\n", 545 | " return labels\n", 546 | "\n", 547 | "#test utility functions\n", 548 | "print(data['tags'][4])\n", 549 | "array = labels_to_array(data['tags'][4])\n", 550 | "print(array)\n", 551 | "array_to_labels(array)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 119, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [ 561 | { 562 | "data": { 563 | "text/html": [ 564 | "\n", 565 | " \n", 566 | " \n", 573 | " " 574 | ], 575 | "text/plain": [ 576 | "" 577 | ] 578 | }, 579 | "metadata": {}, 580 | "output_type": "display_data" 581 | }, 582 | { 583 | "data": { 584 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY4AAAEcCAYAAADQqlM0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmcXFWd9/FPhx0TIpFO0ARBMuGr6CiILOMyA4JK0DE+\njmwuEOAZGQXFwQ2cQWXGUXAXnRlQEAMOgrgRRxwZxEdcUVYXnG/YQhLykAQ7QQKyJT1/3NOh0vZS\nN91VXd35vl+vvLruqXPr/u6tSv3qnHPvuV29vb1EREQ0a9JYBxAREeNLEkdERNSSxBEREbUkcURE\nRC1JHBERUUsSR0RE1JLEEZs1Sb+R9Jdt2taFknok/bwd24tola5cxxFjQdKLgbOBZwOPA78D3mH7\nhjGM6QPAbNvHtOC1XwxcAuxh++EBnj8WuAB4qBStAn4IfMT2bU1u40Jgqe33j07UY7ud6FxpcUTb\nSZoCfBv4DLAjMBM4E3ikBdvaYrRfcxPtBiweKGk0+KntHYCpwCHAH4EbJO3ZhvgimpYWR7SdpH2A\n/7Y9bZDnjwX+FrgROAZYDpxs+5ry/HzgPcAsYCXwUdufL8/9FfBl4LPA3wNXAacCXwJeDKwHfmP7\nr0r9u4ATgK2AhSWER4A7gH8BTrP9gobY3gm8yPZrB4j7qcC5ZTu/L3GdL+l44F+BLamSwSdsnznA\nPp9g+y/7lX8b+KPtI8ryV4GXANsCtwBvsf07SX9btrEeeBT4ge15kt5bjuV0YAnwj7a/VV5rNlUr\nZ6+yzvdtH12eeyZwDrBPOcbvt335YNvpfyxiYkuLI8bCImCdpC9JOlTSkweosz9wO/AU4IPANxrq\nrQAOK7/OjwM+JWmvhnV3Bp4MPB14M/BOYGl5renA+/pvzPb3gA8Dl9newfbeVIlkN0lqqPoG4KJB\n9utSqi/nnYHDgQ9LOsj2F4G/A35WXvvMQdYfyDeoEkWfK4HZZT9upOr+wvYXgP+gSlY7NHyZ306V\n6HagatV9WdKM8tw/A9+z/WSqJPxZAEnbUyXcLwM7AUcD/ybpWUNsJzYjSRzRdrYf4Ilf/58HVkq6\nQlJ3Q7UVts+xvc72VwEDryzrf9f24vL4R1Rfco1fruuAD9h+zPYjwGPAU4FnlNf7SZNxPgpcBrwR\nQNKzgV2B7/SvK2kW8ELgvWW7twDnA29q6qAMbjmwoWVm+0u2H7L9GPBPwPNK199g+/B12yvK48uB\n24D9ytOPAbtKmmn7Uds/LeWvAu6yfZHtXts3A18HXjfCfYkJYsuxDiA2T7YNHA8gaQ+qX7GfpvpF\nD3BPv1XuBp5W6s8F3g/sQfXjZzvgVw11V5Uv1j4fpfq1fZWkXuALts9uMtSLqH7Vn0GVQL7a77X7\nPA3osf1QQ9ndVF09IzET6AGQNImqVfQ6qpZAb/m3E/DAQCtLOoaqy263UvSkUh/g3cCHgF9I6gE+\naftCquR4QCkD6AK2YPCWVmxm0uKIMWd7EdUYxHMaimf2q/Z0YLmkrYGvUSWDbts7At+l+nLrs9HA\nne0Hbb/L9mzgr4FTJR00QCh/MuBn+zrgUUkvAV4PXDzIbiwHpkl6Ur+Y+yfAul4LXFsev4Eq/peW\n7qXdqPa7b983il/S06ladG+1vWM5Vr/tq297pe03255J1ZX2b5J2p+rW+3+2p5V/O5ZuqZMH2k5s\nftLiiLYrYwavpBpPuEfSLlT96D9rqDZd0tuAfwf+D/BMqi6ircu/+2yvL62PlwO/HmJ7rwT+x/Yd\nwFqq038fH6DqCuAQSV22G78cLwY+BzzW0J2zEdvLJP0U+IikdwOiGnR//TCHo1FXiXcSVdJ5J/CX\nwAHl+clUA/erS4L6CBt/ia8Adm9YfhJVd+B95TWPpSE5S3od1bjLPcCaUncd8J9lP95INW7TBTwP\neKC0FPtvJzYzaXHEWHiAavD7OkkPAD+l6mp6V0Od64A5wH1Ug7h/Y3uN7bXA24HLS1fKUcAVw2xv\nDnB12dZPgH8tYyOw8Rfv5VRfkr+XdH1D+cVUX7jDddUcDTyDqvXxdeCMvjPBmnSApD8A9wM/oEoU\n+9q+tTx/EdXg+z3Ab6iOW6MLgGeXiwy/Yft3wCeBnwP3Ul0z8+OG+vtSvQd/AL4FvN323eUYv5zq\n2C4v/84CthloOzX2LyaIlp+OK+nvqX55raf6VXgcVX/wpVTn8N8IvMn246Ub4iKqfuH7gCNtL2lp\ngNFxBjs1daxI2pbqV/bzS6slYrPW0haHpKcBb6P6D/dcqq6xo6muGP6EbVE1kU8oq5xANcA4h2qg\n9KOtjC+iSW8FfpmkEVFpR1fVFsCTJG1JdfbLcuAgqqY8wALgNeXxvLIM1QDowW2IL2JQ5QLBt1GN\nN0QELU4ctpcDn+CJftn7qbqm1theX6ot44kzaGZSndGB7XXAGkkDXl0cE5ftBZ3STWX7GeXfLWMd\nS0SnaHVX1ZOpWhG7Uo1rPAmYO0DVvoGWrn7lXeTUv4iIjtLq03EPAe603XcB0zeprq59sqRJpdUx\ni6r7CqrWxy5U5+tvAexge/VQG+jt7e3t6uqfbyIiYhib/MXZ6sSxhOoUw22pzj8/GPgl1ZxBh1NN\n53AsT5xOubAsX1eeH/ZUxq6uLlatGvCi2dgE3d1TcjxHSY7l6MrxHF3d3YPOVDOsVo9x/IJqkPsm\nqpk8u6iuZD2N6urdRVTz8FxQVrkA2EnSbcA7Sr2IiOggE2Fa9d78Chk9+VU3enIsR1eO5+jq7p6y\nyV1VuXI8IiJqSeKIiIhakjgiIqKWJI6IiKgliSMiImpJ4oiIiFpyI6eIFli3bh2LFi2ip2ftWIfC\nbrvtzhZbbDHWYcQEksQR0QKLF9/JKR9byPZTp49pHA/dv5LPvPvVzJ49Z0zjiIkliSOiRbafOp3J\nO/a/dXrE+JcxjoiIqCWJIyIiakniiIiIWpI4IiKiliSOiIioJYkjIiJqSeKIiIhakjgiIqKWJI6I\niKgliSMiImpp6ZQjkvYALgN6gS5gd+AM4OJSviuwGDjC9v1lnXOAucCDwHzbN7cyxoiIqKelLQ7b\ni2zvbfv5wD5UyeCbwGnA1bYFXAOcDiBpLjDb9hzgRODcVsYXERH1tbOr6hDgDttLgXnAglK+oCxT\n/l4EYPs6YKqkGW2MMSIihtHOxHEkcEl5PMP2CgDb9wJ9c0/PBJY2rHNPKYuIiA7RlsQhaSvg1cDl\npah3kKpdA5QNVjciIsZAu+7HMRe4wfZ9ZXmFpBm2V0jaGVhZypcBuzSsNwtYPtyLd3dPGdVgN3c5\nniO3evXksQ5hg2nTJk+Y93Si7Md4167EcTTwlYblhcB84Ozy94qG8pOAyyQdAKzp69IayqpVD4xm\nrJu17u4pOZ6joBNuGdunp2fthHhP89kcXSNJwi3vqpK0HdXA+Dcais8GXibJwMHAWQC2rwTuknQ7\ncB7w1lbHFxER9bS8xWH7j0B3v7IeqmQyUP2TWx1TRERs
ulw5HhERtSRxRERELUkcERFRSxJHRETU\nksQRERG1JHFEREQtSRwREVFLEkdERNSSxBEREbUkcURERC1JHBERUUsSR0RE1JLEERERtSRxRERE\nLUkcERFRSxJHRETUksQRERG1JHFEREQtSRwREVFLy+85LmkqcD7wHGA9cDywCLgM2BVYDBxh+/5S\n/xxgLvAgMN/2za2OMSIimteOFsdngCttPwt4HvA/wGnA1bYFXAOcDiBpLjDb9hzgRODcNsQXERE1\ntDRxSJoCvMT2hQC2Hy8ti3nAglJtQVmm/L2o1L0OmCppRitjjIiIelrdVbU7cJ+kC6laG9cD7wBm\n2F4BYPteSdNL/ZnA0ob17yllK1ocZ0RENKnViWNL4PnASbavl/Qpqm6q3kHqdw1QNljdDbq7p2x6\nhPEncjxHbvXqyWMdwgbTpk2eMO/pRNmP8a7ViWMZsNT29WX561SJY4WkGbZXSNoZWNlQf5eG9WcB\ny4fbyKpVD4xiyJu37u4pOZ6joKdn7ViHsEFPz9oJ8Z7mszm6RpKEWzrGUbqjlkraoxQdDPwWWAjM\nL2XzgSvK44XAMQCSDgDW9HVpRUREZ2j56bjA24H/kLQVcCdwHLAF8FVJxwNLgMMBbF8p6TBJt1Od\njntcG+KLiIgaWp44bN8C7DvAU4cMUv/k1kYUEREjkSvHIyKiliSOiIioJYkjIiJqSeKIiIhakjgi\nIqKWJI6IiKgliSMiImoZNnFIOlLSDuXxP0n6L0n7tD60iIjoRM20OP7R9h8k7Qe8gmra88+2NqyI\niOhUzSSOx8rflwHn274E2LZ1IUVERCdrJnH0SjoSOAq4upRt3bqQIiKikzWTOE4GjqZqbdwlaQ7w\ng9aGFRERnaqZSQ63s/2avgXbt0n6ZgtjioiIDtZMi+PjA5R9bLQDiYiI8WHQFoekPwP2AHaQdFjD\nU1OB7VsdWEREdKahuqpeRHV3vhnAuxvK/wC8q4UxRUREBxs0cdheACyQNN/2l9oXUkREdLJmxjju\nlDQZQNIJks6V9IwWxxURER2qmcTxOeBBSc8G3kl1j/ALWhpVRER0rGZOx33cdq+kucC/2/6spMOb\n3YCkxcD9wHrgMdv7SdoRuAzYFVgMHGH7/lL/HGAu8CAw3/bNNfYnIiJarJkWx5aS9gdeC1xTyrao\nsY31wIG297a9Xyk7Dbjatsprng5QktNs23OAE4Fza2wnIiLaoJnEcQZwHvBz27+VtAdwe41tdA2w\nnXnAgvJ4QVnuK78IwPZ1wFRJM2psKyIiWmzYrirbVwBXNCwvomp9NKsX+J6kXuA82+cDM2yvKK93\nr6Tppe5MYGnDuveUshU1thcRES00bOKQtD1Vq+MQqiRwNfAh2w81uY0XluTQDVwlyeV1BtI1QNlg\ndTfo7p7SZCjRjBzPkVu9evJYh7DBtGmTJ8x7OlH2Y7xrZnD8s6XeO8ry/6U60+r4ZjZg+97yd5Wk\nbwH7ASskzbC9QtLOwMpSfRmwS8Pqs4Dlw21j1aoHmgklmtDdPSXHcxT09Kwd6xA26OlZOyHe03w2\nR9dIknAziWNf28/tW5D0U+CWZl68tFYm2V4r6UnAy4EzgYVUV6WfXf72dYUtBE4CLpN0ALCmr0sr\nIiI6QzOD413lS7/P9gzcpTSQGcCPJd0E/Bz4tu2rqBLGy0q31cHAWQC2rwTuknQ71YD8W5vcTkRE\ntEkzLY4vAz+TdCnVeMNRlDOfhmP7LmCvAcp7qMZMBlrn5GZeOyIixsawLQ7bZwPvAaYBOwHvtZ1p\n1SMiNlNDtjgk/TnV1Oq/sv1f7QkpIiI62aAtDklvB35E1dr4paQj2hZVRER0rKG6qv4OeI7t/YEX\n8sTpuBERsRkbKnE8YnsZgO1bgW3bE1JERHSyocY4+t8ydqPlcupsRERsZoZKHEvY+JaxSxuWe4Ek\njoiIzdBQt449qJ2BRETE+NDMleMREREbJHFEREQtSRwREVHLUBcAfqX8PaV94URERKcbqsXxnPL3\n2HYEEhER48NQp+NeL+l+YDtJKxvKu4Be29MHWS8iIiawQVscto+jmuDwNmDfhn8vKH8jImIzNOTs\nuOXWrvvb7pz7YEZExJhq5kZO20o6n+rGS+uBq4FTbK9qaWQREdGRmjkd9zxgEfA8YG+qrqvzWhlU\nRER0rmZaHLNt/03D8gck3dyqgCIiorM1kzgmSZpueyWApOnUvHBQ0iTgemCZ7VdL2g24FNgRuBF4\nk+3HJW1NdT/zfYD7gCNtL6mzrYiIaK1mEsDHgZskfV7SecANwEdrbucU4NaG5bOBT9gWsAY4oZSf\nAPTYngN8ehO2ExERLTZs4rB9EfBy4FfAb4BX2P5ysxuQNAs4DDi/ofilwNfL4wXAa8rjeWUZ4GvA\nwc1uJyIi2qOZrips/xb47SZu41NU9/GYCiDpKcBq2+vL88uAmeXxTKr7fmB7naQ1kqbZ7tnEbUdE\nxChrKnFsKkmvBFbYvlnSgaW4q/xr1NvwXKOuhucG1d09ZSRhRj85niO3evXksQ5hg2nTJk+Y93Si\n7Md419LEAbwIeHW55ex2wBSqsYupkiaVVscsYHmpvwzYBVguaQtgB9urh9vIqlUPtCT4zVF395Qc\nz1HQ09M518z29KydEO9pPpujayRJeMgxDkmT+t13vBbb77P9dNu7A0cB19h+I/AD4PBS7VjgivJ4\nIU9Mqng4cM2mbjsiIlpjyMRRWgQfasF2TwNOlbQImAZcUMovAHaSdBvwjlIvIiI6SDNdVTdL2s/2\nL0ayIds/BH5YHt8F7D9AnUeAI0aynYiIaK1mEsc+wE9KK2BDx63t/VoWVUREdKxmEsfbWx5FRESM\nG8MmjtLFhKTuzIgbERHDXjkuaX9Jd1PNKYWkF0j6fMsji4iIjtTMXFWfBOZSTTqI7euprs+IiIjN\nUDOJY2vbt/Yre7QVwUREROdrJnE8ImkyZeoPSXsCD7c0qoiI6FjNnFX1L8BVwNMkfQk4FHhjK4OK\niIjO1cxZVd+VZOAVVJMOfsj27S2PLCIiOlKzkxwuBX5E1V21uGXRREREx2vmdNwXA3dS3XjpW8Cd\nkl7Y6sAiIqIzNTM4/q/AG2yr3NL1DcC/tzasiIjoVM0kDmxf2/D4R60LJyIiOl0zieO/Jb2hb0HS\n64HvtS6kiIjoZIMOjktaRTUY3kV174zzy1PbUF1F/p7WhxcREZ1mqLOqXtC2KCIiYtwYNHHYvrud\ngURExPgw7HUckl4EnAXMLvW7gF7b01scW0REdKBmLgC8EPgH4AZgXWvDiYiITtdM4lht+/JNeXFJ\n2wDXAluXbX3N9pmSdgMuBXakus/Hm2w/Lmlr4CKq29XeBxxpe8mmbDsiIlqjmdNxL5H0d5KmSdq+\n718zL277EeAg23sDewFzJe0PnA18wraANcAJZZUTgJ5yoeGngY/W3aGIiGitZhLHSuDjwCrgAWBt\n+dsU2w+Vh9t
QtTp6gYOopjABWAC8pjyeV5YBvgYc3Ox2IiKiPZrpqvoIcCBwo+31dTcgaRLV+Mhs\nqulL7gDWNLzWMmBmeTyTakJFbK+TtEbSNNs9dbcbERGt0UziWF5uF7tJSoLYW9IOwDeBZw1Qrbf8\n7epX3tXw3KC6u6dsangxgBzPkVu9evJYh7DBtGmTJ8x7OlH2Y7xrJnF8X9LZwGU03PlvgNvJDsn2\nHyT9EDgAeLKkSSWpzAKWl2rLgF2A5ZK2AHawvXq41161qumesxhGd/eUHM9R0NOzdqxD2KCnZ+2E\neE/z2RxdI0nCzSSOvrv9HdFQ1gvsPtyKknYCHrN9v6TtgEOorgn5AXA4VTI6FriirLKwLF9Xnr+m\nifgiIqKNmrkD4DNG8PpPBRaUcY5JwGW2r5T0O+BSSf8M3ARcUOpfAFws6Tbg98BRI9h2RES0QDNX\nju85UHkzXVW2fw08f4Dyu4D9Byh/hI1bNhER0WGa6ar6TsPjbYEZwN3ASFoiERExTtXuqpJ0MDC3\nZRFFRERHa+oOgI1sfx94aQtiiYiIcaDuGMckYF+qq8AjImIzVHeM43HgdqpTZiMiYjPU6tNxIyJi\nghnqnuMDnobbp+6V4xERMTEM1eL4zgBlvcAUYBqwRUsiioiIjjbUPcf7n4b7JOBU4CTgky2OKyIi\nOlQzZ1VtCbwFeC9wJbCP7XtaHVhERHSmIROHpGOADwK/BF5qe1E7goqIiM411OD4r4DJVInjemDL\nxgHzDI5HRGyehmpx7EA1GH5m+dt4k6WmplWPiIiJZ6jB8d3aGEdERIwTteeqioiIzVsSR0RE1JLE\nERERtSRxRERELUkcERFRSzPTqm8ySbOAi4CdgXXAF2yfI2lH4DJgV2AxcITt+8s651DdYfBBYL7t\nm1sZY0RE1NPqFsfjwKm29wT+AjhJ0jOB04CrbQu4BjgdQNJcYLbtOcCJwLktji8iImpqaeKwfW9f\ni8H2WuB3wCxgHrCgVFtQlil/Lyr1rwOmSprRyhgjIqKeto1xSNoN2Av4OTDD9gqokgswvVSbCSxt\nWO2eUhYRER2ipWMcfSRNBr4GnGJ7raTeQap2DVA2WN0NurunjCS86CfHc+RWr5481iFsMG3a5Anz\nnk6U/RjvWp44yrTsXwMutn1FKV4haYbtFZJ2BlaW8mXALg2rzwKWD7eNVaseGM2QN2vd3VNyPEdB\nT8/asQ5hg56etRPiPc1nc3SNJAm3o6vqi8Cttj/TULYQmF8ezweuaCg/BkDSAcCavi6tiIjoDK0+\nHfdFwBuAX0u6iarb6X3A2cBXJR0PLAEOB7B9paTDJN1OdTruccNt4/0f/hzr1relx21QO3dP5ajX\nvmpMY4iIaJeWfuPa/gmD35v8kEHWObnONn626GG2fcqcuqGNqvv+cNeYbj8iop1y5XhERNSSxBER\nEbUkcURERC1JHBERUUsSR0RE1JLEERERtSRxRERELUkcERFRSxJHRETUksQRERG1JHFEREQtSRwR\nEVFLEkdERNSSxBEREbUkcURERC1JHBERUUsSR0RE1JLEERERtbT6nuMXAK8CVth+binbEbgM2BVY\nDBxh+/7y3DnAXKr7jc+3fXMr44uIiPpa3eK4EHhFv7LTgKttC7gGOB1A0lxgtu05wInAuS2OLSIi\nNkFLE4ftHwOr+xXPAxaUxwvKcl/5RWW964Cpkma0Mr6IiKhvLMY4ptteAWD7XmB6KZ8JLG2od08p\ni4iIDtJJg+NdA5T1tj2KiIgYUksHxwexQtIM2ysk7QysLOXLgF0a6s0Clrc9uk2w9dZb0d09ZazD\nGDUTaV/GyurVk8c6hA2mTZs8Yd7TibIf4107EkcXG7cmFgLzgbPL3ysayk8CLpN0ALCmr0ur0z36\n6GOsWvXAWIcxKrq7p0yYfRlLPT1rxzqEDXp61k6I9zSfzdE1kiTc6tNxLwEOBJ4iaQnwAeAs4HJJ\nxwNLgMMBbF8p6TBJt1OdjntcK2OLiIhN09LEYfv1gzx1yCD1T25hOBERMQo6aXA8IiLGgSSOiIio\nJYkjIiJqSeKIiIhakjgiIqKWJI6IiKgliSMiImpJ4oiIiFrGYq6qiIjN0rp161i8+M6xDgOA7u7n\nb/K6SRwREW2yePGdnPKxhWw/dfrwlVvooftXct3XkzgiIsaF7adOZ/KO4/tWQxnjiIiIWpI4IiKi\nliSOiIioJYkjIiJqSeKIiIhakjgiIqKWJI6IiKgliSMiImrpuAsAJR0KfJoqqV1g++wxDikiIhp0\nVItD0iTgc8ArgGcDR0t65thGFRERjToqcQD7AbfZvtv2Y8ClwLwxjikiIhp0WuKYCSxtWF5WyiIi\nokN02hhH1wBlvUOt0Lv2btbzcIvCac66Keu4447bxjSG0bJ69WR6etaOdRjj3pIld/PQ/SvHOgwe\nun8lS5bcPdZhjIqJ8NnspM/FSHT19g75vdxWkg4APmj70LJ8GtCbAfKIiM7RaS2OXwJ/JmlX4P8D\nRwFHj21IERHRqKPGOGyvA04GrgJ+C1xq+3djG1VERDTqqK6qiIjofB3V4oiIiM6XxBEREbUkcURE\nRC2ddlbVoIabw0rS1sBFwD7AfcCRtpe0PdBxoIljeSzwMaoLMAE+Z/uL7Y1y/JB0AfAqYIXt5w5S\n5xxgLvAgMN/2zW0McdwY7lhK+ivgCuDOUvQN2x9qY4jjiqRZVN+LOwPrgC/YPmeAerU+n+OixdHk\nHFYnAD2251B9KX60vVGODzXmA7vU9vPLvySNoV1IdTwHJGkuMLt8Nk8Ezm1XYOPQkMeyuLbhs5mk\nMbTHgVNt7wn8BXBS///vm/L5HBeJg+bmsJoHLCiPvwYc3Mb4xpNm5wMb6Cr+GIDtHwOrh6gyj+pX\nH7avA6ZKmtGO2MabJo4l5LPZNNv39rUebK8FfsefTuNU+/M5XhJHM3NYbahTrgdZI2lae8IbV5qd\nD+y1km6W9NXS3I1N1/+Y30PmYBuJAyTdJOk7kvYc62DGC0m7AXsB1/V7qvbnc7wkjmbmsOpfp2uA\nOtHcsVwI7GZ7L+D7PNGSi01Tew62GNQNwK6296bqcv3WGMczLkiaTNUTc0ppeTSq/fkcL4ljGfD0\nhuVZwPJ+dZYCuwBI2gLYwfZwTd7N0bDH0vbq0o0F8AWqEw5i0y2jfDaLgT6/0QTba20/VB5/F9gq\nPQtDk7QlVdK42PYVA1Sp/fkcL4ljwxxW5eypo6h+FTf6NnBseXw4cE0b4xtPhj2WknZuWJwH3NrG\n+MarLgbve18IHAMbJvJcY3tFuwIbhwY9lo1975L2A7ps97QrsHHqi8Cttj8zyPO1P5/jZsqRcgrp\nZ3jiFNKzJJ0J/NL2f0raBrgY2Bv4PXCU7cVjFnAHa+JYfhh4NfAY0AO8xfaisYu4s0m6BDgQeAqw\nAvgAsDXVzM6fL3U+BxxKdbrjcbZvHJtoO9twx1LSScBbqD6bfwT+vgzo
xgAkvQi4Fvg1VfdTL/A+\nYFdG8PkcN4kjIiI6w3jpqoqIiA6RxBEREbUkcURERC1JHBERUUsSR0RE1JLEERERtSRxREeTtFjS\nr/qV3TWacxSViyFXjdbr1djuhZJ+LekrAzz3A0mH1Xy99ZK2r7nOmOx7jG9JHNHpeoHJko5pw3ZG\npExZ32zdGcBrbf+57aNHuu1iU/chF3NFLePmRk6xWfsg8EFJl9h+vPEJSXcBr7R9a//l8vjLVFPs\nPw04HZgOvB7YkeoK2Z+Ul+qS9HHgZWX5pDLFd9/9Cv4B2AZ4lOr+BteVmwqdQzXx3l7APwJX9ovv\nGOBdwHrgDqr7HTxMNSXOdpJuBBYMMR3ERiSdChxJ9X/3YeCttm/p2wfgPZLmAdsC/2D7G2W9/YCz\ngCml7gds9491O6oJLfekujLbto9qJq7YvKTFEZ2uF7ieao6tt2zC+lvbfiHwOqoJGx+xvT9VIjir\nod5TgJtsPw94G/AVSVtJ2h04AzjU9r7A3wJfbVhvT+DcclOh/l/EzwY+AhxSZhr+LdXdFNcCh1HN\nCfT8ZpNGscD2/rb3Ad7Pn95057Eyc+w84POSdpI0tdQ7uuzDXwPnSdqh37qvAKbYfk55jRNrxBWb\nkbQ4otMh+KA3AAACJ0lEQVT1TXZ3BnCNpLp3I7ys/L0R2I4nvvRvAGY31HvE9n8A2L5W0kOAgJcA\nuwPXSuqLZZKk7vL4Ntu/GGTbBwHfsb2yLJ8H3DJI3WbtK+l0YBpVK2ZOv+cvKPuwSNINwAFUtwx9\nBvDdhn1YB/wZ1bxufW4BniXps8APge+MMNaYoJI4YlwoX4RXAqeycZ/842zcct6236oPl/XXS9qw\nTPXFOdTnf1LZThfwX7bn969QXq//vQ0aDXRPmPVD1B+SpK2Ay4EX275F0lN54r7wjdvs07gPt9g+\ncIDX3LXvse27SivpYKoW0YclPcf2o5sac0xM6aqK8eRM4CSe6KcHuB3YF0DSwcBQt7wc6GZffbaR\n9PryOi+hGs8wcBVwaONZXJJe0GS83wcOkzS9LL8ZuHqIeIazLbAFTySLkwaoc1yJcQ7wPKq7vf0U\nmCPpwL5K/fahq5TNBNbbXkiVoHeiatlEbCQtjuh0G36x275H0sVUX2p9zgAWSHob1YDz3QOt28Ty\nfcBekt5blo8qA/G3S3ojcIGkbamm+P4J1bjLkMoA/enA1ZLWA3ey8bjBcGczfUnSwzzRcjmMalzj\nekn3Ud2cp//+bFkG3LcD3mz7PgBJrwY+LulTVEnxDqqxjsY4/hw4q7SkJgEftn3vcPsZm59Mqx4R\nEbWkqyoiImpJ4oiIiFqSOCIiopYkjoiIqCWJIyIiakniiIiIWpI4IiKiliSOiIio5X8BHDQ2n1Hc\nZm4AAAAASUVORK5CYII=\n", 585 | "text/plain": [ 586 | "" 587 | ] 588 | }, 589 | "metadata": {}, 590 | "output_type": "display_data" 591 | } 592 | ], 593 | "source": [ 594 | "#Find label density\n", 595 | "label_counts = np.zeros(NUM_ROWS)\n", 596 | "for i in range(len(label_counts)):\n", 597 | " label_counts[i] = labels_to_array(data['tags'][i]).sum()\n", 598 | " \n", 599 | "plt.xlabel('Number of Labels')\n", 600 | "plt.ylabel('Number of Posts')\n", 601 | "plt.title('Sparsity of Dataset')\n", 602 | "plt.hist(label_counts)\n", 603 | "plt.show()" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "### 3) Data Preprocessing\n", 611 | "\n", 612 | "We will pre-process the data in the following ways\n", 613 | "\n", 614 | "1. Split it into train and test sets\n", 615 | "2. Generate a bag of words embedding from the 1000 most common words in the corpus\n", 616 | "3. 
Store the features and labels in their machine-friendly format" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 120, 622 | "metadata": { 623 | "colab": { 624 | "autoexec": { 625 | "startup": false, 626 | "wait_interval": 0 627 | }, 628 | "output_extras": [ 629 | {} 630 | ] 631 | }, 632 | "colab_type": "code", 633 | "collapsed": false, 634 | "id": "h_SDal0khA3n", 635 | "outputId": "e6c311e5-c674-4cf2-f2dc-d6ceabfa6f83" 636 | }, 637 | "outputs": [ 638 | { 639 | "data": { 640 | "text/html": [ 641 | "\n", 642 | " \n", 643 | " \n", 650 | " " 651 | ], 652 | "text/plain": [ 653 | "" 654 | ] 655 | }, 656 | "metadata": {}, 657 | "output_type": "display_data" 658 | }, 659 | { 660 | "name": "stdout", 661 | "output_type": "stream", 662 | "text": [ 663 | "Train size: 800\n", 664 | "Test size: 200\n" 665 | ] 666 | } 667 | ], 668 | "source": [ 669 | "# Split data into train and test\n", 670 | "train_size = int(len(data) * .8)\n", 671 | "test_size = len(data)-train_size\n", 672 | "print (\"Train size: %d\" % train_size)\n", 673 | "print (\"Test size: %d\" % test_size)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 121, 679 | "metadata": { 680 | "colab": { 681 | "autoexec": { 682 | "startup": false, 683 | "wait_interval": 0 684 | } 685 | }, 686 | "colab_type": "code", 687 | "collapsed": false, 688 | "id": "anD38iilhA3r" 689 | }, 690 | "outputs": [ 691 | { 692 | "data": { 693 | "text/html": [ 694 | "\n", 695 | " \n", 696 | " \n", 703 | " " 704 | ], 705 | "text/plain": [ 706 | "" 707 | ] 708 | }, 709 | "metadata": {}, 710 | "output_type": "display_data" 711 | } 712 | ], 713 | "source": [ 714 | "train_posts = data['post'][:train_size]\n", 715 | "train_tags = data['tags'][:train_size]\n", 716 | "\n", 717 | "test_posts = data['post'][train_size:]\n", 718 | "test_tags = data['tags'][train_size:]" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 122, 724 | "metadata": { 725 | "colab": { 726 | "autoexec": { 727 | "startup": false, 728 | "wait_interval": 0 729 | } 730 | }, 731 | "colab_type": "code", 732 | "collapsed": false, 733 | "id": "z4GblctFhA3u" 734 | }, 735 | "outputs": [ 736 | { 737 | "data": { 738 | "text/html": [ 739 | "\n", 740 | " \n", 741 | " \n", 748 | " " 749 | ], 750 | "text/plain": [ 751 | "" 752 | ] 753 | }, 754 | "metadata": {}, 755 | "output_type": "display_data" 756 | } 757 | ], 758 | "source": [ 759 | "max_words = 1000\n", 760 | "tokenize = text.Tokenizer(num_words=max_words, char_level=False)" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 123, 766 | "metadata": { 767 | "colab": { 768 | "autoexec": { 769 | "startup": false, 770 | "wait_interval": 0 771 | } 772 | }, 773 | "colab_type": "code", 774 | "collapsed": false, 775 | "id": "YatMLCKXhA3x" 776 | }, 777 | "outputs": [ 778 | { 779 | "data": { 780 | "text/html": [ 781 | "\n", 782 | " \n", 783 | " \n", 790 | " " 791 | ], 792 | "text/plain": [ 793 | "" 794 | ] 795 | }, 796 | "metadata": {}, 797 | "output_type": "display_data" 798 | } 799 | ], 800 | "source": [ 801 | "tokenize.fit_on_texts(train_posts) # only fit on train\n", 802 | "x_train = tokenize.texts_to_matrix(train_posts)\n", 803 | "x_test = tokenize.texts_to_matrix(test_posts)" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 124, 809 | "metadata": { 810 | "colab": { 811 | "autoexec": { 812 | "startup": false, 813 | "wait_interval": 0 814 | } 815 | }, 816 | "colab_type": "code", 817 | "collapsed": false, 818 | "id": "8quTsErLhA3z" 819 | }, 
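{ "cell_type": "markdown", "metadata": {}, "source": [ "To make the bag-of-words encoding concrete, here is a tiny self-contained illustration of what texts_to_matrix produces (a toy corpus, not the Stack Overflow data):\n", "\n", "    from tensorflow.python.keras.preprocessing import text\n", "\n", "    toy = ['php array of arrays', 'java string']\n", "    tok = text.Tokenizer(num_words=10)\n", "    tok.fit_on_texts(toy)\n", "    # Each row is a fixed-length vector with a 1 in the column of every word\n", "    # (indexed by frequency rank) that appears in the post; word order is discarded.\n", "    print(tok.texts_to_matrix(toy))" ] },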
820 | "outputs": [ 821 | { 822 | "data": { 823 | "text/html": [ 824 | "\n", 825 | " \n", 826 | " \n", 833 | " " 834 | ], 835 | "text/plain": [ 836 | "" 837 | ] 838 | }, 839 | "metadata": {}, 840 | "output_type": "display_data" 841 | }, 842 | { 843 | "data": { 844 | "text/plain": [ 845 | "array([ 0., 0., 0., 0., 1.])" 846 | ] 847 | }, 848 | "execution_count": 124, 849 | "metadata": {}, 850 | "output_type": "execute_result" 851 | } 852 | ], 853 | "source": [ 854 | "# generate multi-label arrays\n", 855 | "y_train = np.zeros([train_size,NUM_CLASSES])\n", 856 | "for i in range(0,train_size):\n", 857 | " y_train[i] = labels_to_array(data['tags'][i])\n", 858 | "\n", 859 | "y_test = np.zeros([test_size,NUM_CLASSES])\n", 860 | "for i in range(0,test_size):\n", 861 | " y_test[i] = labels_to_array(data['tags'][i+train_size-1])\n", 862 | "y_test[0]" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 125, 868 | "metadata": { 869 | "colab": { 870 | "autoexec": { 871 | "startup": false, 872 | "wait_interval": 0 873 | }, 874 | "output_extras": [ 875 | {} 876 | ] 877 | }, 878 | "colab_type": "code", 879 | "collapsed": false, 880 | "id": "XZFsdLYVhA33", 881 | "outputId": "882923f3-6705-46b5-be88-3d4fec2965f2" 882 | }, 883 | "outputs": [ 884 | { 885 | "data": { 886 | "text/html": [ 887 | "\n", 888 | " \n", 889 | " \n", 896 | " " 897 | ], 898 | "text/plain": [ 899 | "" 900 | ] 901 | }, 902 | "metadata": {}, 903 | "output_type": "display_data" 904 | }, 905 | { 906 | "name": "stdout", 907 | "output_type": "stream", 908 | "text": [ 909 | "x_train shape: (800, 1000)\n", 910 | "x_test shape: (200, 1000)\n", 911 | "y_train shape: (800, 5)\n", 912 | "y_test shape: (200, 5)\n" 913 | ] 914 | } 915 | ], 916 | "source": [ 917 | "# Inspect the dimenstions of our training and test data (this is helpful to debug)\n", 918 | "print('x_train shape:', x_train.shape)\n", 919 | "print('x_test shape:', x_test.shape)\n", 920 | "print('y_train shape:', y_train.shape)\n", 921 | "print('y_test shape:', y_test.shape)" 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": {}, 927 | "source": [ 928 | "### 4) Model Code" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 126, 934 | "metadata": { 935 | "colab": { 936 | "autoexec": { 937 | "startup": false, 938 | "wait_interval": 0 939 | } 940 | }, 941 | "colab_type": "code", 942 | "collapsed": false, 943 | "id": "cBIkzTOZhA36" 944 | }, 945 | "outputs": [ 946 | { 947 | "data": { 948 | "text/html": [ 949 | "\n", 950 | " \n", 951 | " \n", 958 | " " 959 | ], 960 | "text/plain": [ 961 | "" 962 | ] 963 | }, 964 | "metadata": {}, 965 | "output_type": "display_data" 966 | } 967 | ], 968 | "source": [ 969 | "# This model trains very quickly and 2 epochs are already more than enough\n", 970 | "# Training for more epochs will likely lead to overfitting on this dataset\n", 971 | "# You can try tweaking these hyperparamaters when using this model with your own data\n", 972 | "batch_size = 32\n", 973 | "epochs = 2" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": 127, 979 | "metadata": { 980 | "colab": { 981 | "autoexec": { 982 | "startup": false, 983 | "wait_interval": 0 984 | }, 985 | "output_extras": [ 986 | {} 987 | ] 988 | }, 989 | "colab_type": "code", 990 | "collapsed": false, 991 | "id": "XdrFuwx4hA39", 992 | "outputId": "4b002559-2f06-4681-8f02-2e76e62d7a57" 993 | }, 994 | "outputs": [ 995 | { 996 | "data": { 997 | "text/html": [ 998 | "\n", 999 | " \n", 1000 | " \n", 1007 | " " 1008 | ], 1009 
| "text/plain": [ 1010 | "" 1011 | ] 1012 | }, 1013 | "metadata": {}, 1014 | "output_type": "display_data" 1015 | } 1016 | ], 1017 | "source": [ 1018 | "# Build the model\n", 1019 | "model = Sequential()\n", 1020 | "model.add(Dense(512, input_shape=(max_words,)))\n", 1021 | "model.add(Activation('relu'))\n", 1022 | "model.add(Dropout(0.5))\n", 1023 | "model.add(Dense(NUM_CLASSES))\n", 1024 | "model.add(Activation('sigmoid')) #changed from softmax\n", 1025 | "\n", 1026 | "model.compile(loss='binary_crossentropy',\n", 1027 | " optimizer='adam',\n", 1028 | " metrics=['accuracy']) #changed from categorical_crossentropy" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "execution_count": 128, 1034 | "metadata": { 1035 | "colab": { 1036 | "autoexec": { 1037 | "startup": false, 1038 | "wait_interval": 0 1039 | }, 1040 | "output_extras": [ 1041 | {} 1042 | ] 1043 | }, 1044 | "colab_type": "code", 1045 | "collapsed": false, 1046 | "id": "rzi-9GaBhA4A", 1047 | "outputId": "9a56a130-8804-4ce0-ad47-38c4f40c81fa" 1048 | }, 1049 | "outputs": [ 1050 | { 1051 | "data": { 1052 | "text/html": [ 1053 | "\n", 1054 | " \n", 1055 | " \n", 1062 | " " 1063 | ], 1064 | "text/plain": [ 1065 | "" 1066 | ] 1067 | }, 1068 | "metadata": {}, 1069 | "output_type": "display_data" 1070 | }, 1071 | { 1072 | "name": "stdout", 1073 | "output_type": "stream", 1074 | "text": [ 1075 | "Train on 720 samples, validate on 80 samples\n", 1076 | "Epoch 1/2\n", 1077 | "720/720 [==============================] - 1s - loss: 0.4711 - acc: 0.8122 - val_loss: 0.4311 - val_acc: 0.8125\n", 1078 | "Epoch 2/2\n", 1079 | "720/720 [==============================] - 0s - loss: 0.3171 - acc: 0.8653 - val_loss: 0.3765 - val_acc: 0.8300\n" 1080 | ] 1081 | } 1082 | ], 1083 | "source": [ 1084 | "# model.fit trains the model\n", 1085 | "# The validation_split param tells Keras what % of our training data should be used in the validation set\n", 1086 | "# You can see the validation loss decreasing slowly when you run this\n", 1087 | "# Because val_loss is no longer decreasing we stop training to prevent overfitting\n", 1088 | "history = model.fit(x_train, y_train,\n", 1089 | " batch_size=batch_size,\n", 1090 | " epochs=epochs,\n", 1091 | " verbose=1,\n", 1092 | " validation_split=0.1)" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": 129, 1098 | "metadata": { 1099 | "colab": { 1100 | "autoexec": { 1101 | "startup": false, 1102 | "wait_interval": 0 1103 | }, 1104 | "output_extras": [ 1105 | {} 1106 | ] 1107 | }, 1108 | "colab_type": "code", 1109 | "collapsed": false, 1110 | "id": "zjwBD8qFhA4D", 1111 | "outputId": "0dda5da5-44c4-4fbc-f2ad-01d642ca1914" 1112 | }, 1113 | "outputs": [ 1114 | { 1115 | "data": { 1116 | "text/html": [ 1117 | "\n", 1118 | " \n", 1119 | " \n", 1126 | " " 1127 | ], 1128 | "text/plain": [ 1129 | "" 1130 | ] 1131 | }, 1132 | "metadata": {}, 1133 | "output_type": "display_data" 1134 | }, 1135 | { 1136 | "name": "stdout", 1137 | "output_type": "stream", 1138 | "text": [ 1139 | " 32/200 [===>..........................] 
- ETA: 0sTest score: 0.530333137512\n", 1140 | "Test accuracy: 0.807000041008\n" 1141 | ] 1142 | } 1143 | ], 1144 | "source": [ 1145 | "# Evaluate the accuracy of our trained model\n", 1146 | "score = model.evaluate(x_test, y_test,\n", 1147 | " batch_size=batch_size, verbose=1)\n", 1148 | "print('Test score:', score[0])\n", 1149 | "print('Test accuracy:', score[1])" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": 130, 1155 | "metadata": { 1156 | "colab": { 1157 | "autoexec": { 1158 | "startup": false, 1159 | "wait_interval": 0 1160 | }, 1161 | "output_extras": [ 1162 | {} 1163 | ] 1164 | }, 1165 | "colab_type": "code", 1166 | "collapsed": false, 1167 | "id": "f000lYoxhA4F", 1168 | "outputId": "21cd198f-1979-4b40-a2fd-891a1c0248db" 1169 | }, 1170 | "outputs": [ 1171 | { 1172 | "data": { 1173 | "text/html": [ 1174 | "\n", 1175 | " \n", 1176 | " \n", 1183 | " " 1184 | ], 1185 | "text/plain": [ 1186 | "" 1187 | ] 1188 | }, 1189 | "metadata": {}, 1190 | "output_type": "display_data" 1191 | }, 1192 | { 1193 | "name": "stdout", 1194 | "output_type": "stream", 1195 | "text": [ 1196 | "cannot cast from fragment to supportmapfragment
public class outdoorfragment extends fragment  { private googlemap googlemap; double latitude = 1.31039; double longitude = 103.7784;  public ...\n",
1197 |       "Actual label:android\n",
1198 |       "Predicted label: android\n",
1199 |       "\n",
1200 |       "bullet list (or similar to) in a linear view between textviews  i try to build a bullet list with listview and use it in my scrollable linearlayout but it doesn t expand the list (i have to scroll the ...\n",
1201 |       "Actual label:android\n",
1202 |       "Predicted label: android\n",
1203 |       "\n",
1204 |       "android slide up view panel like sms  i m new to android and i want to create a chat application in android. so far i have designed bottom of the edit text with submit button.  like blow link    \n",
1257 |        "          \n",
1264 |        "          "
1265 |       ],
1266 |       "text/plain": [
1267 |        ""
1268 |       ]
1269 |      },
1270 |      "metadata": {},
1271 |      "output_type": "display_data"
1272 |     },
1273 |     {
1274 |      "name": "stderr",
1275 |      "output_type": "stream",
1276 |      "text": [
1277 |       "mkdir: cannot create directory ‘trainer’: File exists\n"
1278 |      ]
1279 |     }
1280 |    ],
1281 |    "source": [
1282 |     "%%bash\n",
1283 |     "mkdir trainer\n",
1284 |     "touch trainer/__init__.py"
1285 |    ]
1286 |   },
1287 |   {
1288 |    "cell_type": "code",
1289 |    "execution_count": 144,
1290 |    "metadata": {
1291 |     "collapsed": false
1292 |    },
1293 |    "outputs": [
1315 |     {
1316 |      "name": "stdout",
1317 |      "output_type": "stream",
1318 |      "text": [
1319 |       "Overwriting trainer/task.py\n"
1320 |      ]
1321 |     }
1322 |    ],
1323 |    "source": [
1324 |     "%%writefile trainer/task.py\n",
1325 |     "\n",
1326 |     "from __future__ import absolute_import\n",
1327 |     "from __future__ import division\n",
1328 |     "from __future__ import print_function\n",
1329 |     "\n",
1330 |     "import itertools\n",
1331 |     "import argparse\n",
1332 |     "import json\n",
1333 |     "import time\n",
1334 |     "import os\n",
1335 |     "\n",
1336 |     "import numpy as np\n",
1337 |     "import pandas as pd\n",
1338 |     "import tensorflow as tf\n",
1339 |     "import google.datalab.bigquery as bq\n",
1340 |     "\n",
1341 |     "from sklearn.preprocessing import LabelBinarizer, LabelEncoder\n",
1342 |     "from sklearn.metrics import confusion_matrix\n",
1343 |     "\n",
1344 |     "from tensorflow.python.keras.models import Sequential\n",
1345 |     "from tensorflow.python.keras.layers import Dense, Activation, Dropout\n",
1346 |     "from tensorflow.python.keras.preprocessing import text, sequence\n",
1347 |     "from tensorflow.python.keras import utils\n",
1348 |     "\n",
1349 |     "from collections import Counter\n",
1350 |     "\n",
1351 |     "print(\"You have TensorFlow version\", tf.__version__)\n",
1352 |     "\n",
1353 |     "if __name__ == '__main__':\n",
1354 |     "### COMMAND LINE ARGUMENTS ###\n",
1355 |     "  parser = argparse.ArgumentParser()\n",
1356 |     "  \n",
1357 |     "  parser.add_argument(\n",
1358 |     "    '--train_batch_size', #hyperparameter\n",
1359 |     "    help='Batch size for training steps',\n",
1360 |     "    type=int,\n",
1361 |     "    default=32\n",
1362 |     "  )\n",
1363 |     "  parser.add_argument(\n",
1364 |     "    '--epochs', #hyperparamter\n",
1365 |     "    help='Number of epochs to train for',\n",
1366 |     "    type=int,\n",
1367 |     "    default=2\n",
1368 |     "  )\n",
1369 |     "  parser.add_argument(\n",
1370 |     "    '--neurons', #hyperparamter\n",
1371 |     "    help='Number of neurons in hidden layer',\n",
1372 |     "    type=int,\n",
1373 |     "    default=512\n",
1374 |     "  )\n",
1375 |     "  parser.add_argument(\n",
1376 |     "        '--output_dir',\n",
1377 |     "        help='GCS location to write checkpoints and export models',\n",
1378 |     "        required=True\n",
1379 |     "  )\n",
1380 |     "  parser.add_argument(\n",
1381 |     "          '--job-dir',\n",
1382 |     "          help='this model ignores this field, but it is required by gcloud',\n",
1383 |     "          default='junk'\n",
1384 |     "  )\n",
1385 |     "  args = parser.parse_args()\n",
1386 |     "  \n",
1387 |     "### DOWNLOAD DATA ###\n",
1388 |     "  query = \"\"\"\n",
1389 |     "  SELECT tags, TRIM(LOWER(REGEXP_REPLACE(CONCAT(title, \\' \\', body), r\\'[\\\"\\\\n\\\\\\'?,]|

|

\\',\\\" \\\"))) as post \n", 1390 | " FROM `bigquery-public-data.stackoverflow.posts_questions`\n", 1391 | " WHERE REGEXP_CONTAINS(tags, r\\\"javascript|java|c#|php|android|jquery|python\\\") \n", 1392 | " LIMIT 1000\n", 1393 | " \"\"\"\n", 1394 | "\n", 1395 | " #data = bq.Query(query).execute(output_options=bq.QueryOutput.dataframe()).result() #issues with ML Engine service account authentication\n", 1396 | " data = pd.read_csv(\"https://storage.googleapis.com/vijay-public/text_classification/results-1000.csv\")\n", 1397 | " NUM_ROWS = data.shape[0]\n", 1398 | " print(\"Loaded {} rows\".format(NUM_ROWS))\n", 1399 | " \n", 1400 | "### DATA PREPROCESSING ###\n", 1401 | " #Generate list of N most common labels\n", 1402 | " NUM_CLASSES = 5\n", 1403 | " labels_list = []\n", 1404 | "\n", 1405 | " counts = Counter('|'.join(data['tags'].tolist()).split('|'))\n", 1406 | " classes = counts.most_common(NUM_CLASSES)\n", 1407 | "\n", 1408 | " for i in range(0,NUM_CLASSES):\n", 1409 | " labels_list.append(classes[i][0])\n", 1410 | "\n", 1411 | " print (\"{} most common classes:\".format(NUM_CLASSES))\n", 1412 | " print(classes)\n", 1413 | "\n", 1414 | " #utility functions to extract classes and translate between \n", 1415 | " #human friendly (string) labels and machine friendly (array) labels\n", 1416 | "\n", 1417 | " #labels_list: A list of the valid classes\n", 1418 | " #tags: A list of tags for a post\n", 1419 | " #returns an ndarray with ones for the active classes\n", 1420 | " def labels_to_array(tags,labels_list=labels_list):\n", 1421 | " array = np.zeros(len(labels_list),dtype=np.int8)\n", 1422 | " tags = tags.split('|') #split tags from pipe separated string into list\n", 1423 | " for tag in tags:\n", 1424 | " try:\n", 1425 | " array[labels_list.index(tag)] = 1\n", 1426 | " except ValueError: \n", 1427 | " None\n", 1428 | " return array\n", 1429 | "\n", 1430 | " #translate machine readable array back to human labels\n", 1431 | " def array_to_labels(array, labels_list=labels_list, threshold = 1):\n", 1432 | " labels = []\n", 1433 | " i=0\n", 1434 | " for flag in array:\n", 1435 | " if flag >= threshold: \n", 1436 | " labels.append(labels_list[i])\n", 1437 | " i=i+1\n", 1438 | " return labels\n", 1439 | "\n", 1440 | " #split into training/test set \n", 1441 | " train_size = int(len(data) * .8)\n", 1442 | " test_size = len(data)-train_size\n", 1443 | " print (\"Train size: %d\" % train_size)\n", 1444 | " print (\"Test size: %d\" % test_size)\n", 1445 | "\n", 1446 | " train_posts = data['post'][:train_size]\n", 1447 | " train_tags = data['tags'][:train_size]\n", 1448 | "\n", 1449 | " test_posts = data['post'][train_size:]\n", 1450 | " test_tags = data['tags'][train_size:]\n", 1451 | "\n", 1452 | " #generate bag of words embedding\n", 1453 | " max_words = 1000\n", 1454 | " tokenize = text.Tokenizer(num_words=max_words, char_level=False)\n", 1455 | "\n", 1456 | " tokenize.fit_on_texts(train_posts) # only fit on train\n", 1457 | " x_train = tokenize.texts_to_matrix(train_posts)\n", 1458 | " x_test = tokenize.texts_to_matrix(test_posts)\n", 1459 | "\n", 1460 | " # generate multi-label arrays\n", 1461 | " y_train = np.zeros([train_size,NUM_CLASSES])\n", 1462 | " for i in range(0,train_size):\n", 1463 | " y_train[i] = labels_to_array(data['tags'][i])\n", 1464 | "\n", 1465 | " y_test = np.zeros([test_size,NUM_CLASSES])\n", 1466 | " for i in range(0,test_size):\n", 1467 | " y_test[i] = labels_to_array(data['tags'][i+train_size-1])\n", 1468 | " y_test[0]\n", 1469 | "\n", 1470 | " # Inspect the dimenstions 
of our training and test data (this is helpful to debug)\n", 1471 | " print('x_train shape:', x_train.shape)\n", 1472 | " print('x_test shape:', x_test.shape)\n", 1473 | " print('y_train shape:', y_train.shape)\n", 1474 | " print('y_test shape:', y_test.shape)\n", 1475 | "\n", 1476 | "### BUILD MODEL ###\n", 1477 | " #Set hyperparameters\n", 1478 | " batch_size = args.train_batch_size\n", 1479 | " epochs = args.epochs\n", 1480 | "\n", 1481 | "\n", 1482 | " model = Sequential()\n", 1483 | " model.add(Dense(args.neurons, input_shape=(max_words,)))\n", 1484 | " model.add(Activation('relu'))\n", 1485 | " #model.add(Dropout(0.5)) #this breaks SavedModel prediction\n", 1486 | " model.add(Dense(NUM_CLASSES))\n", 1487 | " model.add(Activation('sigmoid')) #changed from softmax\n", 1488 | "\n", 1489 | " model.compile(loss='binary_crossentropy',\n", 1490 | " optimizer='adam',\n", 1491 | " metrics=['accuracy']) #changed from categorical_crossentropy\n", 1492 | "\n", 1493 | " # model.fit trains the model\n", 1494 | " # The validation_split param tells Keras what % of our training data should be used in the validation set\n", 1495 | " # You can see the validation loss decreasing slowly when you run this\n", 1496 | " # Because val_loss is no longer decreasing we stop training to prevent overfitting\n", 1497 | "\n", 1498 | "\n", 1499 | " #Enable Tensorboard logging\n", 1500 | " Tensorboard = tf.keras.callbacks.TensorBoard(log_dir=args.output_dir + \"/tensorboard\")\n", 1501 | "\n", 1502 | " history = model.fit(x_train, y_train,\n", 1503 | " batch_size=batch_size,\n", 1504 | " epochs=epochs,\n", 1505 | " verbose=1,\n", 1506 | " validation_split=0.1,\n", 1507 | " callbacks=[Tensorboard]) #callback for Tensorboard\n", 1508 | "\n", 1509 | " # Evaluate the accuracy of our trained model\n", 1510 | " score = model.evaluate(x_test, y_test,\n", 1511 | " batch_size=batch_size, verbose=1)\n", 1512 | " print('Test score:', score[0])\n", 1513 | "\n", 1514 | "\n", 1515 | " # Here's how to generate a prediction on individual examples\n", 1516 | " for i in range(5):\n", 1517 | " prediction = model.predict(np.array([x_test[i]]))\n", 1518 | " #print(prediction[0])\n", 1519 | " #print(y_test[i])\n", 1520 | " predicted_label = array_to_labels(prediction[0],threshold=.5)\n", 1521 | " print(test_posts.iloc[i][:200], \"...\")\n", 1522 | " print('Actual label:' + '|'.join(array_to_labels(y_test[i])))\n", 1523 | " print(\"Predicted label: \" + '|'.join(predicted_label) + '\\n') \n", 1524 | "\n", 1525 | "### EXPORT MODEL ### \n", 1526 | " model_builder = tf.saved_model.builder.SavedModelBuilder(args.output_dir+\"/export/\"+time.strftime(\"%Y%m%d-%H%M%S\"))\n", 1527 | "\n", 1528 | " inputs = {'input': tf.saved_model.utils.build_tensor_info(model.input)}\n", 1529 | " outputs = {'output': tf.saved_model.utils.build_tensor_info(model.output)}\n", 1530 | "\n", 1531 | " signature_def = tf.saved_model.signature_def_utils.build_signature_def(\n", 1532 | " inputs=inputs,\n", 1533 | " outputs=outputs,\n", 1534 | " method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME\n", 1535 | " )\n", 1536 | "\n", 1537 | " model_builder.add_meta_graph_and_variables(\n", 1538 | " tf.keras.backend.get_session(),\n", 1539 | " tags=[tf.saved_model.tag_constants.SERVING],\n", 1540 | " signature_def_map={\n", 1541 | " tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def\n", 1542 | " }\n", 1543 | " )\n", 1544 | "\n", 1545 | " model_builder.save()\n" 1546 | ] 1547 | }, 1548 | { 1549 | "cell_type": "code", 1550 | 
"execution_count": 145, 1551 | "metadata": { 1552 | "collapsed": false, 1553 | "scrolled": false 1554 | }, 1555 | "outputs": [ 1556 | { 1557 | "data": { 1558 | "text/html": [ 1559 | "\n", 1560 | " \n", 1561 | " \n", 1568 | " " 1569 | ], 1570 | "text/plain": [ 1571 | "" 1572 | ] 1573 | }, 1574 | "metadata": {}, 1575 | "output_type": "display_data" 1576 | }, 1577 | { 1578 | "name": "stdout", 1579 | "output_type": "stream", 1580 | "text": [ 1581 | "You have TensorFlow version 1.4.0\n", 1582 | "Loaded 1000 rows\n", 1583 | "5 most common classes:\n", 1584 | "[('javascript', 376), ('java', 334), ('jquery', 133), ('c#', 100), ('html', 82)]\n", 1585 | "Train size: 800\n", 1586 | "Test size: 200\n", 1587 | "x_train shape: (800, 1000)\n", 1588 | "x_test shape: (200, 1000)\n", 1589 | "y_train shape: (800, 5)\n", 1590 | "y_test shape: (200, 5)\n", 1591 | "Train on 720 samples, validate on 80 samples\n", 1592 | "Epoch 1/2\n", 1593 | "\r", 1594 | " 32/720 [>.............................] - ETA: 0s - loss: 0.6860 - acc: 0.5625\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1595 | "192/720 [=======>......................] - ETA: 0s - loss: 0.5539 - acc: 0.7594\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1596 | "352/720 [=============>................] - ETA: 0s - loss: 0.4947 - acc: 0.7915\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1597 | "512/720 [====================>.........] - ETA: 0s - loss: 0.4687 - acc: 0.8105\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1598 | "672/720 [===========================>..] - ETA: 0s - loss: 0.4466 - acc: 0.8280\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1599 | "720/720 [==============================] - 0s - loss: 0.4436 - acc: 0.8303 - val_loss: 0.3341 - val_acc: 0.8700\n", 1600 | "Epoch 2/2\n", 1601 | "\r", 1602 | " 32/720 [>.............................] - ETA: 0s - loss: 0.2374 - acc: 0.9313\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1603 | "192/720 [=======>......................] - ETA: 0s - loss: 0.2978 - acc: 0.8906\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1604 | "352/720 [=============>................] - ETA: 0s - loss: 0.2764 - acc: 0.9034\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1605 | "512/720 [====================>.........] - ETA: 0s - loss: 0.2657 - acc: 0.9047\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1606 | "672/720 [===========================>..] 
- ETA: 0s - loss: 0.2544 - acc: 0.9086\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r", 1607 | "720/720 [==============================] - 0s - loss: 0.2514 - acc: 0.9097 - val_loss: 0.2677 - val_acc: 0.9075\n", 1608 | "\r", 1609 | " 32/200 [===>..........................] - ETA: 0sTest score: 0.693889396191\n", 1610 | "update single record with an array of results a query leads to a list of results (@invoice_results - - also the name of the controller action) with two specific columns in the view (formatting remove ...\n", 1611 | "Actual label:javascript|c#\n", 1612 | "Predicted label: javascript\n", 1613 | "\n", 1614 | "jquery and mouseevents i have a question regarding mouse events in the jquery library. i have a simple javascript function as following:
$(function() {     var xpos;     var ypos;     $( ...\n",
1615 |       "Actual label:\n",
1616 |       "Predicted label: javascript\n",
1617 |       "\n",
1618 |       "drag and drop using raphael.js has laggy performance with more than 10 draggable elements  i m making a simple html5 app  that will be wrapped to be used on android  ios and web browsers. in my app i  ...\n",
1619 |       "Actual label:javascript|jquery|html\n",
1620 |       "Predicted label: javascript|jquery\n",
1621 |       "\n",
1622 |       "event logging in asp.net  i am using vs2005 c# .net 2.0 and sql server 2005.    are there any websites that provide step by step instructions to implement event logging for my web application     scrolltop from previous page. eg: i am in list page and scroll down to end of the page and clicked on a record to v ...\n",
1627 |       "Actual label:c#\n",
1628 |       "Predicted label: javascript\n",
1629 |       "\n"
1630 |      ]
1631 |     },
1632 |     {
1633 |      "name": "stderr",
1634 |      "output_type": "stream",
1635 |      "text": [
1636 |       "2017-11-14 19:55:03.220977: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA\n"
1637 |      ]
1638 |     }
1639 |    ],
1640 |    "source": [
1641 |     "%%bash\n",
1642 |     "gcloud ml-engine local train \\\n",
1643 |     "   --module-name=trainer.task \\\n",
1644 |     "   --package-path=trainer \\\n",
1645 |     "   -- \\\n",
1646 |     "   --output_dir='./output'"
1647 |    ]
1648 |   },
1649 |   {
1650 |    "cell_type": "code",
1651 |    "execution_count": 133,
1652 |    "metadata": {
1653 |     "collapsed": false
1654 |    },
1655 |    "outputs": [
1656 |     {
1657 |      "data": {
1658 |       "text/html": [
1659 |        "\n",
1660 |        "          \n",
1661 |        "          \n",
1668 |        "          "
1669 |       ],
1670 |       "text/plain": [
1671 |        ""
1672 |       ]
1673 |      },
1674 |      "metadata": {},
1675 |      "output_type": "display_data"
1676 |     }
1677 |    ],
1678 |    "source": [
1679 |     "GCS_BUCKET = 'gs://vijays-sandbox-ml' #CHANGE THIS TO YOUR BUCKET\n",
1680 |     "PROJECT = 'vijays-sandbox' #CHANGE THIS TO YOUR PROJECT ID\n",
1681 |     "REGION = 'us-central1' #OPTIONALLY CHANGE THIS"
1682 |    ]
1683 |   },
1684 |   {
1685 |    "cell_type": "code",
1686 |    "execution_count": 134,
1687 |    "metadata": {
1688 |     "collapsed": false
1689 |    },
1690 |    "outputs": [
1691 |     {
1692 |      "data": {
1693 |       "text/html": [
1694 |        "\n",
1695 |        "          \n",
1696 |        "          \n",
1703 |        "          "
1704 |       ],
1705 |       "text/plain": [
1706 |        ""
1707 |       ]
1708 |      },
1709 |      "metadata": {},
1710 |      "output_type": "display_data"
1711 |     }
1712 |    ],
1713 |    "source": [
1714 |     "import os\n",
1715 |     "os.environ['GCS_BUCKET'] = GCS_BUCKET\n",
1716 |     "os.environ['PROJECT'] = PROJECT\n",
1717 |     "os.environ['REGION'] = REGION"
1718 |    ]
1719 |   },
1720 |   {
1721 |    "cell_type": "markdown",
1722 |    "metadata": {},
1723 |    "source": [
1724 |     "#### Configuration file for hyperparameter tuning\n",
1725 |     "\n",
1726 |     "Here I specify\n",
1727 |     "\n",
1728 |     "1. Which hyperparamters i wish to tune\n",
1729 |     "2. What min and max range I want to tune between\n",
1730 |     "3. What success metric i want to evaluate against\n",
1731 |     "\n",
1732 |     "Note that the hyperparameter tuner passes values to tensorflow via the command line, so any hyperparameter I wish to tune must be exposed as a command line argument in my code"
1733 |    ]
1734 |   },
1735 |   {
1736 |    "cell_type": "code",
1737 |    "execution_count": 148,
1738 |    "metadata": {
1739 |     "collapsed": false
1740 |    },
1741 |    "outputs": [
1763 |     {
1764 |      "name": "stdout",
1765 |      "output_type": "stream",
1766 |      "text": [
1767 |       "Overwriting config.yaml\n"
1768 |      ]
1769 |     }
1770 |    ],
1771 |    "source": [
1772 |     "%%writefile config.yaml\n",
1773 |     "trainingInput:\n",
1774 |     "  hyperparameters:\n",
1775 |     "    goal: MAXIMIZE\n",
1776 |     "    hyperparameterMetricTag: val_acc\n",
1777 |     "    maxTrials: 5\n",
1778 |     "    maxParallelTrials: 1\n",
1779 |     "    params:\n",
1780 |     "    - parameterName: neurons\n",
1781 |     "      type: INTEGER\n",
1782 |     "      minValue: 10\n",
1783 |     "      maxValue: 1000\n",
1784 |     "      scaleType: UNIT_LINEAR_SCALE"
1785 |    ]
1786 |   },
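  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before submitting, it can help to dry-run a single trial locally. The sketch below is not part of the original run: it simulates one tuning trial by passing a hand-picked --neurons value (and a hypothetical local output directory) on the command line, exactly as the tuning service would."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Sketch: simulate one tuning trial locally with a hand-picked --neurons value\n",
    "gcloud ml-engine local train \\\n",
    "   --module-name=trainer.task \\\n",
    "   --package-path=trainer \\\n",
    "   -- \\\n",
    "   --output_dir='./output_trial_test' \\\n",
    "   --neurons=128"
   ]
  },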
1787 |   {
1788 |    "cell_type": "markdown",
1789 |    "metadata": {},
1790 |    "source": [
1791 |     "#### Upgrade ML Engine Tensorflow Version\n",
1792 |     "\n",
1793 |     "Our code requires TF 1.4, however TF 1.4 is not yet a pre-packaged runtime for ML Engine. However we can force the installation of TF 1.4 by specifying it as a PyPi dependency as documented [here](https://cloud.google.com/ml-engine/docs/versioning#specifying_custom_versions_of_tensorflow_for_training)"
1794 |    ]
1795 |   },
1796 |   {
1797 |    "cell_type": "code",
1798 |    "execution_count": 136,
1799 |    "metadata": {
1800 |     "collapsed": false
1801 |    },
1802 |    "outputs": [
1824 |     {
1825 |      "name": "stdout",
1826 |      "output_type": "stream",
1827 |      "text": [
1828 |       "Overwriting setup.py\n"
1829 |      ]
1830 |     }
1831 |    ],
1832 |    "source": [
1833 |     "%%writefile setup.py\n",
1834 |     "from setuptools import find_packages\n",
1835 |     "from setuptools import setup\n",
1836 |     "\n",
1837 |     "REQUIRED_PACKAGES = ['tensorflow==1.4','datalab']\n",
1838 |     "\n",
1839 |     "setup(\n",
1840 |     "    name='trainer',\n",
1841 |     "    version='0.1',\n",
1842 |     "    install_requires=REQUIRED_PACKAGES,\n",
1843 |     "    packages=find_packages(),\n",
1844 |     "    include_package_data=True,\n",
1845 |     "    description='Text Classification Trainer Application')"
1846 |    ]
1847 |   },
1848 |   {
1849 |    "cell_type": "markdown",
1850 |    "metadata": {},
1851 |    "source": [
1852 |     "#### Queue ML Engine Training Job\n",
1853 |     "\n",
1854 |     "We use the gcloud command line tool to do so"
1855 |    ]
1856 |   },
1857 |   {
1858 |    "cell_type": "code",
1859 |    "execution_count": 147,
1860 |    "metadata": {
1861 |     "collapsed": false
1862 |    },
1863 |    "outputs": [
1885 |     {
1886 |      "name": "stdout",
1887 |      "output_type": "stream",
1888 |      "text": [
1889 |       "jobId: text_classification_171114_195631\n",
1890 |       "state: QUEUED\n"
1891 |      ]
1892 |     },
1893 |     {
1894 |      "name": "stderr",
1895 |      "output_type": "stream",
1896 |      "text": [
1897 |       "Job [text_classification_171114_195631] submitted successfully.\n",
1898 |       "Your job is still active. You may view the status of your job with the command\n",
1899 |       "\n",
1900 |       "  $ gcloud ml-engine jobs describe text_classification_171114_195631\n",
1901 |       "\n",
1902 |       "or continue streaming the logs with the command\n",
1903 |       "\n",
1904 |       "  $ gcloud ml-engine jobs stream-logs text_classification_171114_195631\n"
1905 |      ]
1906 |     }
1907 |    ],
1908 |    "source": [
1909 |     "%%bash\n",
1910 |     "JOBNAME=text_classification_$(date -u +%y%m%d_%H%M%S)\n",
1911 |     "\n",
1912 |     "gcloud ml-engine jobs submit training $JOBNAME \\\n",
1913 |     "   --region=$REGION \\\n",
1914 |     "   --runtime-version=1.2 \\\n",
1915 |     "   --module-name=trainer.task \\\n",
1916 |     "   --package-path=trainer \\\n",
1917 |     "   --job-dir=$GCS_BUCKET/$JOBNAME/ \\\n",
1918 |     "   --config config.yaml \\\n",
1919 |     "   -- \\\n",
1920 |     "   --output_dir=$GCS_BUCKET/$JOBNAME/output"
1921 |    ]
1922 |   },
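  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can monitor the job with the gcloud commands printed above. As a sketch (substitute the job id from your own submission), 'describe' reports the job state and, once tuning trials complete, the per-trial hyperparameter values and final metric under trainingOutput:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Sketch: replace the job id with the one printed when you submitted\n",
    "gcloud ml-engine jobs describe text_classification_171114_195631"
   ]
  },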
1923 |   {
1924 |    "cell_type": "markdown",
1925 |    "metadata": {
1926 |     "collapsed": true
1927 |    },
1928 |    "source": [
1929 |     "### 7) Inspect Results Using TensorBoard"
1930 |    ]
1931 |   },
1932 |   {
1933 |    "cell_type": "code",
1934 |    "execution_count": 89,
1935 |    "metadata": {
1936 |     "collapsed": false
1937 |    },
1938 |    "outputs": [
1960 |     {
1961 |      "data": {
1962 |       "text/html": [
1963 |        "

TensorBoard was started successfully with pid 7530. Click here to access it.

" 1964 | ] 1965 | }, 1966 | "metadata": {}, 1967 | "output_type": "display_data" 1968 | }, 1969 | { 1970 | "data": { 1971 | "text/plain": [ 1972 | "7530" 1973 | ] 1974 | }, 1975 | "execution_count": 89, 1976 | "metadata": {}, 1977 | "output_type": "execute_result" 1978 | } 1979 | ], 1980 | "source": [ 1981 | "from google.datalab.ml import TensorBoard\n", 1982 | "TensorBoard().start('output')" 1983 | ] 1984 | }, 1985 | { 1986 | "cell_type": "markdown", 1987 | "metadata": {}, 1988 | "source": [ 1989 | "If you're running from a jupyter notebook on your laptop/workstation (as opposed to datalab) you can use the below command instead to launch tensorboard\n", 1990 | "\n", 1991 | "When you're done with tensorboard interrupt the kernel (using the Jupyter menu bar) to quit" 1992 | ] 1993 | }, 1994 | { 1995 | "cell_type": "code", 1996 | "execution_count": null, 1997 | "metadata": { 1998 | "collapsed": false 1999 | }, 2000 | "outputs": [], 2001 | "source": [ 2002 | "#alternative tensorboard command for outside datalab\n", 2003 | "\n", 2004 | "#!tensorboard --logdir=output/" 2005 | ] 2006 | }, 2007 | { 2008 | "cell_type": "markdown", 2009 | "metadata": { 2010 | "collapsed": true 2011 | }, 2012 | "source": [ 2013 | "### 8) Deploy For Prediction\n", 2014 | "\n", 2015 | "Cloud ML Engine has a prediction service that will wrap our tensorflow model with a REST API and allow remote clients to get predictions.\n", 2016 | "\n", 2017 | "You can deploy the model from the Google Cloud Console GUI, or you can use the gcloud command line tool. We will use the latter method." 2018 | ] 2019 | }, 2020 | { 2021 | "cell_type": "code", 2022 | "execution_count": null, 2023 | "metadata": { 2024 | "collapsed": false 2025 | }, 2026 | "outputs": [], 2027 | "source": [ 2028 | "%%bash\n", 2029 | "MODEL_NAME=\"text_classification\"\n", 2030 | "MODEL_VERSION=\"v1_1000_word_embedding\"\n", 2031 | "MODEL_LOCATION=\"output/export/20171110-135219\" #REPLACE this with the location of your model\n", 2032 | "\n", 2033 | "#gcloud ml-engine versions delete ${MODEL_VERSION} --model ${MODEL_NAME} #Uncomment to overwrite existing version\n", 2034 | "#gcloud ml-engine models delete ${MODEL_NAME} #Uncomment to overwrite existing model\n", 2035 | "gcloud ml-engine models create ${MODEL_NAME} --regions $REGION\n", 2036 | "gcloud ml-engine versions create ${MODEL_VERSION} --model ${MODEL_NAME} --origin ${MODEL_LOCATION} --staging-bucket=$GCS_BUCKET" 2037 | ] 2038 | }, 2039 | { 2040 | "cell_type": "markdown", 2041 | "metadata": {}, 2042 | "source": [ 2043 | "### 9) Get Predictions\n", 2044 | "There are two flavors of the ML Engine Prediction Service: Batch and online.\n", 2045 | "\n", 2046 | "Online prediction is more appropriate for latency sensitive requests as results are returned quickly and synchronously.\n", 2047 | "\n", 2048 | "Batch prediction is more appropriate for large prediction requests that you only need to run a few times a day.\n", 2049 | "\n", 2050 | "Below we define a function that takes care of\n", 2051 | "1. Authenticating to the Google Cloud API\n", 2052 | "2. Converting our post text to the vector embedding the model was trained on\n", 2053 | "3. 
Passes this embedding in JSON format, which is what the API expects" 2054 | ] 2055 | }, 2056 | { 2057 | "cell_type": "code", 2058 | "execution_count": null, 2059 | "metadata": { 2060 | "collapsed": true 2061 | }, 2062 | "outputs": [], 2063 | "source": [ 2064 | "def predict_json(project, model, post, version=None):\n", 2065 | " \"\"\"Send json data to a deployed model for prediction.\n", 2066 | "\n", 2067 | " Args:\n", 2068 | " project (str): project where the Cloud ML Engine Model is deployed.\n", 2069 | " model (str): model name.\n", 2070 | " post: str, the text you want to classify.\n", 2071 | " version: str, version of the model to target.\n", 2072 | " Returns:\n", 2073 | " Mapping[str: any]: dictionary of prediction results defined by the\n", 2074 | " model.\n", 2075 | " \"\"\"\n", 2076 | "\n", 2077 | " # Convert post to vector embedding\n", 2078 | " instances = tokenize.texts_to_matrix([post]).tolist()\n", 2079 | " # Authenticate\n", 2080 | " # GOOGLE_APPLICATION_CREDENTIALS=\n", 2081 | " # OR: gcloud auth application-default login\n", 2082 | " service = googleapiclient.discovery.build('ml', 'v1')\n", 2083 | " \n", 2084 | " name = 'projects/{}/models/{}'.format(project, model)\n", 2085 | "\n", 2086 | " if version is not None:\n", 2087 | " name += '/versions/{}'.format(version)\n", 2088 | "\n", 2089 | " response = service.projects().predict(\n", 2090 | " name=name,\n", 2091 | " body={'instances': instances}\n", 2092 | " ).execute()\n", 2093 | "\n", 2094 | " if 'error' in response:\n", 2095 | " raise RuntimeError(response['error'])\n", 2096 | "\n", 2097 | " return response['predictions']" 2098 | ] 2099 | }, 2100 | { 2101 | "cell_type": "markdown", 2102 | "metadata": {}, 2103 | "source": [ 2104 | "Now we'll call the prediction function and get results! Try modifying the post text to see how it affects the label scores. Does it behave how you would expect?" 2105 | ] 2106 | }, 2107 | { 2108 | "cell_type": "code", 2109 | "execution_count": null, 2110 | "metadata": { 2111 | "collapsed": false 2112 | }, 2113 | "outputs": [], 2114 | "source": [ 2115 | "POST = \"java is my world\"\n", 2116 | "MODEL = \"text_classification\"\n", 2117 | "VERSION = \"v1_1000_word_embedding\"\n", 2118 | "\n", 2119 | "response = predict_json(PROJECT,MODEL,POST,VERSION)[0].get('output')\n", 2120 | "\n", 2121 | "print(\"Post: {} \\nLabel Scores: {} \\nLabels:{} \\nLabels above threshold:{}\".format(\n", 2122 | " POST,response,labels_list,array_to_labels(response,threshold=0.5)))" 2123 | ] 2124 | }, 2125 | { 2126 | "cell_type": "code", 2127 | "execution_count": null, 2128 | "metadata": { 2129 | "collapsed": true 2130 | }, 2131 | "outputs": [], 2132 | "source": [] 2133 | } 2134 | ], 2135 | "metadata": { 2136 | "colab": { 2137 | "default_view": {}, 2138 | "name": "josh3.ipynb", 2139 | "provenance": [], 2140 | "version": "0.3.2", 2141 | "views": {} 2142 | }, 2143 | "kernelspec": { 2144 | "display_name": "Python 2", 2145 | "language": "python", 2146 | "name": "python2" 2147 | }, 2148 | "language_info": { 2149 | "codemirror_mode": { 2150 | "name": "ipython", 2151 | "version": 2 2152 | }, 2153 | "file_extension": ".py", 2154 | "mimetype": "text/x-python", 2155 | "name": "python", 2156 | "nbconvert_exporter": "python", 2157 | "pygments_lexer": "ipython2", 2158 | "version": "2.7.12" 2159 | } 2160 | }, 2161 | "nbformat": 4, 2162 | "nbformat_minor": 1 2163 | } 2164 | --------------------------------------------------------------------------------