├── .vscode
    └── settings.json
├── LICENSE
├── README.md
├── .gitignore
└── src
    ├── bert-movie-reviews-sentiment-classifier-local.ipynb
    ├── bert_sentiment_classifier.py
    └── bert_sentiment_classifier.ipynb


/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "python.pythonPath": "/usr/local/bin/python"
3 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Loreto Parisi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # bert-movie-reviews-sentiment-classifier
 2 | Build a Movie Reviews Sentiment Classifier with Google's [BERT](https://github.com/google-research/bert) Language Model
 3 | 
 4 | # What's this?
 5 | This is an example of building a Movie Reviews Sentiment classifier with Google's BERT (Bidirectional Encoder Representations from Transformers) NLP Language Model.
 6 | 
 7 | # Requirements
 8 | This code requires `scikit-learn`, `tensorflow-gpu`, `tensorflow-hub`, `bert-tensorflow`. The code is compatible with TF <= 1.15.0 and the latest available BERT model on Tensorflow Hub. To use the CPU version, please install `tensorflow==1.15.0`.
 9 | 
10 | ```bash
11 | pip install scikit-learn
12 | pip install tensorflow-gpu==1.15.0
13 | pip install tensorflow-hub
14 | pip install bert-tensorflow
15 | ```
16 | 
17 | # How to Run
18 | To run this project you can
19 | - Open the IPython Notebook `src/bert-movie-reviews-sentiment-classifier-local.ipynb` in your Jupyter Notebook, or
20 | 
21 | - Import `src/bert_sentiment_classifier.ipynb`  into [Google's Colab](https://colab.research.google.com) with GPU backend.
22 | 
23 | - Open the Python Interactive script `src/bert_sentiment_classifier.py` in Visual Studio Code. See [here](https://code.visualstudio.com/docs/python/jupyter-support) how it works with Jupyter Notebooks and Code.
24 | 
25 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/src/bert-movie-reviews-sentiment-classifier-local.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "!pip uninstall tensorflow-gpu\n",
 10 |     "!pip install scikit-learn\n",
 11 |     "!pip install tensorflow-gpu==1.15.0\n",
 12 |     "!pip install tensorflow-hub\n",
 13 |     "!pip install bert-tensorflow"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": null,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "from sklearn.model_selection import train_test_split\n",
 23 |     "import pandas as pd\n",
 24 |     "import tensorflow as tf\n",
 25 |     "import tensorflow_hub as hub\n",
 26 |     "from datetime import datetime"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "import bert\n",
 36 |     "from bert import run_classifier\n",
 37 |     "from bert import optimization\n",
 38 |     "from bert import tokenization"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "from tensorflow import keras\n",
 48 |     "import os\n",
 49 |     "import re\n",
 50 |     "\n",
 51 |     "# Load all files from a directory in a DataFrame.\n",
 52 |     "def load_directory_data(directory):\n",
 53 |     "  data = {}\n",
 54 |     "  data[\"sentence\"] = []\n",
 55 |     "  data[\"sentiment\"] = []\n",
 56 |     "  for file_path in os.listdir(directory):\n",
 57 |     "    with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n",
 58 |     "      data[\"sentence\"].append(f.read())\n",
 59 |     "      data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n",
 60 |     "  return pd.DataFrame.from_dict(data)\n",
 61 |     "\n",
 62 |     "# Merge positive and negative examples, add a polarity column and shuffle.\n",
 63 |     "def load_dataset(directory):\n",
 64 |     "  pos_df = load_directory_data(os.path.join(directory, \"pos\"))\n",
 65 |     "  neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n",
 66 |     "  pos_df[\"polarity\"] = 1\n",
 67 |     "  neg_df[\"polarity\"] = 0\n",
 68 |     "  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n",
 69 |     "\n",
 70 |     "# Download and process the dataset files.\n",
 71 |     "def download_and_load_datasets(force_download=False):\n",
 72 |     "  dataset = tf.keras.utils.get_file(\n",
 73 |     "      fname=\"aclImdb.tar.gz\", \n",
 74 |     "      origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n",
 75 |     "      extract=True)\n",
 76 |     "  \n",
 77 |     "  train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n",
 78 |     "                                       \"aclImdb\", \"train\"))\n",
 79 |     "  test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n",
 80 |     "                                      \"aclImdb\", \"test\"))\n",
 81 |     "  \n",
 82 |     "  return train_df, test_df"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "train, test = download_and_load_datasets()"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "train = train.sample(5000)\n",
101 |     "test = test.sample(5000)"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "train.columns"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "DATA_COLUMN = 'sentence'\n",
120 |     "LABEL_COLUMN = 'polarity'\n",
121 |     "# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'\n",
122 |     "label_list = [0, 1]"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
132 |     "train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n",
133 |     "                                                                   text_a = x[DATA_COLUMN], \n",
134 |     "                                                                   text_b = None, \n",
135 |     "                                                                   label = x[LABEL_COLUMN]), axis = 1)\n",
136 |     "\n",
137 |     "test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n",
138 |     "                                                                   text_a = x[DATA_COLUMN], \n",
139 |     "                                                                   text_b = None, \n",
140 |     "                                                                   label = x[LABEL_COLUMN]), axis = 1)"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "\n",
150 |     "# This is a path to an uncased (all lowercase) version of BERT\n",
151 |     "BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n",
152 |     "\n",
153 |     "def create_tokenizer_from_hub_module():\n",
154 |     "  \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n",
155 |     "  with tf.Graph().as_default():\n",
156 |     "    bert_module = hub.Module(BERT_MODEL_HUB)\n",
157 |     "    tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n",
158 |     "    with tf.Session() as sess:\n",
159 |     "      vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n",
160 |     "                                            tokenization_info[\"do_lower_case\"]])\n",
161 |     "      \n",
162 |     "  return bert.tokenization.FullTokenizer(\n",
163 |     "      vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
164 |     "\n",
165 |     "tokenizer = create_tokenizer_from_hub_module()"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "tokenizer.tokenize(\"This here's an example of using the BERT tokenizer\")"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": null,
180 |    "metadata": {},
181 |    "outputs": [],
182 |    "source": [
183 |     "# We'll set sequences to be at most 128 tokens long.\n",
184 |     "MAX_SEQ_LENGTH = 128\n",
185 |     "# Convert our train and test features to InputFeatures that BERT understands.\n",
186 |     "train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
187 |     "test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": null,
193 |    "metadata": {},
194 |    "outputs": [],
195 |    "source": [
196 |     "def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,\n",
197 |     "                 num_labels):\n",
198 |     "  \"\"\"Creates a classification model.\"\"\"\n",
199 |     "\n",
200 |     "  bert_module = hub.Module(\n",
201 |     "      BERT_MODEL_HUB,\n",
202 |     "      trainable=True)\n",
203 |     "  bert_inputs = dict(\n",
204 |     "      input_ids=input_ids,\n",
205 |     "      input_mask=input_mask,\n",
206 |     "      segment_ids=segment_ids)\n",
207 |     "  bert_outputs = bert_module(\n",
208 |     "      inputs=bert_inputs,\n",
209 |     "      signature=\"tokens\",\n",
210 |     "      as_dict=True)\n",
211 |     "\n",
212 |     "  # Use \"pooled_output\" for classification tasks on an entire sentence.\n",
213 |     "  # Use \"sequence_outputs\" for token-level output.\n",
214 |     "  output_layer = bert_outputs[\"pooled_output\"]\n",
215 |     "\n",
216 |     "  hidden_size = output_layer.shape[-1].value\n",
217 |     "\n",
218 |     "  # Create our own layer to tune for the sentiment data.\n",
219 |     "  output_weights = tf.get_variable(\n",
220 |     "      \"output_weights\", [num_labels, hidden_size],\n",
221 |     "      initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
222 |     "\n",
223 |     "  output_bias = tf.get_variable(\n",
224 |     "      \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n",
225 |     "\n",
226 |     "  with tf.variable_scope(\"loss\"):\n",
227 |     "\n",
228 |     "    # Dropout helps prevent overfitting\n",
229 |     "    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n",
230 |     "\n",
231 |     "    logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n",
232 |     "    logits = tf.nn.bias_add(logits, output_bias)\n",
233 |     "    log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
234 |     "\n",
235 |     "    # Convert labels into one-hot encoding\n",
236 |     "    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n",
237 |     "\n",
238 |     "    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n",
239 |     "    # If we're predicting, we want predicted labels and the probabilities.\n",
240 |     "    if is_predicting:\n",
241 |     "      return (predicted_labels, log_probs)\n",
242 |     "\n",
243 |     "    # If we're train/eval, compute loss between predicted and actual label\n",
244 |     "    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n",
245 |     "    loss = tf.reduce_mean(per_example_loss)\n",
246 |     "    return (loss, predicted_labels, log_probs)"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     " # model_fn_builder actually creates our model function\n",
256 |     "# using the passed parameters for num_labels, learning_rate, etc.\n",
257 |     "def model_fn_builder(num_labels, learning_rate, num_train_steps,\n",
258 |     "                     num_warmup_steps):\n",
259 |     "  \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n",
260 |     "  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
261 |     "    \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
262 |     "\n",
263 |     "    input_ids = features[\"input_ids\"]\n",
264 |     "    input_mask = features[\"input_mask\"]\n",
265 |     "    segment_ids = features[\"segment_ids\"]\n",
266 |     "    label_ids = features[\"label_ids\"]\n",
267 |     "\n",
268 |     "    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n",
269 |     "    \n",
270 |     "    # TRAIN and EVAL\n",
271 |     "    if not is_predicting:\n",
272 |     "\n",
273 |     "      (loss, predicted_labels, log_probs) = create_model(\n",
274 |     "        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n",
275 |     "\n",
276 |     "      train_op = bert.optimization.create_optimizer(\n",
277 |     "          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n",
278 |     "\n",
279 |     "      # Calculate evaluation metrics. \n",
280 |     "      def metric_fn(label_ids, predicted_labels):\n",
281 |     "        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n",
282 |     "        f1_score = tf.contrib.metrics.f1_score(\n",
283 |     "            label_ids,\n",
284 |     "            predicted_labels)\n",
285 |     "        auc = tf.metrics.auc(\n",
286 |     "            label_ids,\n",
287 |     "            predicted_labels)\n",
288 |     "        recall = tf.metrics.recall(\n",
289 |     "            label_ids,\n",
290 |     "            predicted_labels)\n",
291 |     "        precision = tf.metrics.precision(\n",
292 |     "            label_ids,\n",
293 |     "            predicted_labels) \n",
294 |     "        true_pos = tf.metrics.true_positives(\n",
295 |     "            label_ids,\n",
296 |     "            predicted_labels)\n",
297 |     "        true_neg = tf.metrics.true_negatives(\n",
298 |     "            label_ids,\n",
299 |     "            predicted_labels)   \n",
300 |     "        false_pos = tf.metrics.false_positives(\n",
301 |     "            label_ids,\n",
302 |     "            predicted_labels)  \n",
303 |     "        false_neg = tf.metrics.false_negatives(\n",
304 |     "            label_ids,\n",
305 |     "            predicted_labels)\n",
306 |     "        return {\n",
307 |     "            \"eval_accuracy\": accuracy,\n",
308 |     "            \"f1_score\": f1_score,\n",
309 |     "            \"auc\": auc,\n",
310 |     "            \"precision\": precision,\n",
311 |     "            \"recall\": recall,\n",
312 |     "            \"true_positives\": true_pos,\n",
313 |     "            \"true_negatives\": true_neg,\n",
314 |     "            \"false_positives\": false_pos,\n",
315 |     "            \"false_negatives\": false_neg\n",
316 |     "        }\n",
317 |     "\n",
318 |     "      eval_metrics = metric_fn(label_ids, predicted_labels)\n",
319 |     "\n",
320 |     "      if mode == tf.estimator.ModeKeys.TRAIN:\n",
321 |     "        return tf.estimator.EstimatorSpec(mode=mode,\n",
322 |     "          loss=loss,\n",
323 |     "          train_op=train_op)\n",
324 |     "      else:\n",
325 |     "          return tf.estimator.EstimatorSpec(mode=mode,\n",
326 |     "            loss=loss,\n",
327 |     "            eval_metric_ops=eval_metrics)\n",
328 |     "    else:\n",
329 |     "      (predicted_labels, log_probs) = create_model(\n",
330 |     "        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n",
331 |     "\n",
332 |     "      predictions = {\n",
333 |     "          'probabilities': log_probs,\n",
334 |     "          'labels': predicted_labels\n",
335 |     "      }\n",
336 |     "      return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n",
337 |     "\n",
338 |     "  # Return the actual model function in the closure\n",
339 |     "  return model_fn"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": null,
345 |    "metadata": {},
346 |    "outputs": [],
347 |    "source": [
348 |     "# Compute train and warmup steps from batch size\n",
349 |     "# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)\n",
350 |     "BATCH_SIZE = 32\n",
351 |     "LEARNING_RATE = 2e-5\n",
352 |     "NUM_TRAIN_EPOCHS = 3.0\n",
353 |     "# Warmup is a period of time where the learning rate \n",
354 |     "# is small and gradually increases--usually helps training.\n",
355 |     "WARMUP_PROPORTION = 0.1\n",
356 |     "# Model configs\n",
357 |     "SAVE_CHECKPOINTS_STEPS = 500\n",
358 |     "SAVE_SUMMARY_STEPS = 100"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": null,
364 |    "metadata": {},
365 |    "outputs": [],
366 |    "source": [
367 |     "# Compute # train and warmup steps from batch size\n",
368 |     "num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n",
369 |     "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "metadata": {},
376 |    "outputs": [],
377 |    "source": [
378 |     "OUTPUT_DIR=\"./\""
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": null,
384 |    "metadata": {},
385 |    "outputs": [],
386 |    "source": [
387 |     "# Specify output directory and number of checkpoint steps to save\n",
388 |     "run_config = tf.estimator.RunConfig(\n",
389 |     "    model_dir=OUTPUT_DIR,\n",
390 |     "    save_summary_steps=SAVE_SUMMARY_STEPS,\n",
391 |     "    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)"
392 |    ]
393 |   },
394 |   {
395 |    "cell_type": "code",
396 |    "execution_count": null,
397 |    "metadata": {},
398 |    "outputs": [],
399 |    "source": [
400 |     "model_fn = model_fn_builder(\n",
401 |     "  num_labels=len(label_list),\n",
402 |     "  learning_rate=LEARNING_RATE,\n",
403 |     "  num_train_steps=num_train_steps,\n",
404 |     "  num_warmup_steps=num_warmup_steps)\n",
405 |     "\n",
406 |     "estimator = tf.estimator.Estimator(\n",
407 |     "  model_fn=model_fn,\n",
408 |     "  config=run_config,\n",
409 |     "  params={\"batch_size\": BATCH_SIZE})"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": null,
415 |    "metadata": {},
416 |    "outputs": [],
417 |    "source": [
418 |     "# Create an input function for training. drop_remainder = True for using TPUs.\n",
419 |     "train_input_fn = bert.run_classifier.input_fn_builder(\n",
420 |     "    features=train_features,\n",
421 |     "    seq_length=MAX_SEQ_LENGTH,\n",
422 |     "    is_training=True,\n",
423 |     "    drop_remainder=False)"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": null,
429 |    "metadata": {},
430 |    "outputs": [],
431 |    "source": [
432 |     "print(f'Beginning Training!')\n",
433 |     "current_time = datetime.now()\n",
434 |     "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n",
435 |     "print(\"Training took time \", datetime.now() - current_time)"
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": null,
441 |    "metadata": {},
442 |    "outputs": [],
443 |    "source": [
444 |     "test_input_fn = run_classifier.input_fn_builder(\n",
445 |     "    features=test_features,\n",
446 |     "    seq_length=MAX_SEQ_LENGTH,\n",
447 |     "    is_training=False,\n",
448 |     "    drop_remainder=False)"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "code",
453 |    "execution_count": null,
454 |    "metadata": {},
455 |    "outputs": [],
456 |    "source": [
457 |     "estimator.evaluate(input_fn=test_input_fn, steps=None)"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "code",
462 |    "execution_count": null,
463 |    "metadata": {},
464 |    "outputs": [],
465 |    "source": [
466 |     "def getPrediction(in_sentences):\n",
467 |     "  labels = [\"Negative\", \"Positive\"]\n",
468 |     "  input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, \"\" is just a dummy label\n",
469 |     "  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
470 |     "  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n",
471 |     "  predictions = estimator.predict(predict_input_fn)\n",
472 |     "  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": null,
478 |    "metadata": {},
479 |    "outputs": [],
480 |    "source": [
481 |     "def getSinglePrediction(in_sentences):\n",
482 |     "  labels = [\"Negative\", \"Positive\"]\n",
483 |     "  input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, \"\" is just a dummy label\n",
484 |     "  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
485 |     "  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n",
486 |     "  predictions = estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)\n",
487 |     "  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]"
488 |    ]
489 |   },
490 |   {
491 |    "cell_type": "code",
492 |    "execution_count": null,
493 |    "metadata": {},
494 |    "outputs": [],
495 |    "source": [
496 |     "pred_sentences = [\n",
497 |     "  \"That movie was absolutely awful\",\n",
498 |     "  \"The acting was a bit lacking\",\n",
499 |     "  \"The film was creative and surprising\",\n",
500 |     "  \"Absolutely fantastic!\"\n",
501 |     "]"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "code",
506 |    "execution_count": null,
507 |    "metadata": {},
508 |    "outputs": [],
509 |    "source": [
510 |     "predictions = getPrediction(pred_sentences)"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": null,
516 |    "metadata": {},
517 |    "outputs": [],
518 |    "source": [
519 |     "predictions"
520 |    ]
521 |   },
522 |   {
523 |    "cell_type": "code",
524 |    "execution_count": null,
525 |    "metadata": {},
526 |    "outputs": [],
527 |    "source": [
528 |     "pred_sentences = [\n",
529 |     "  \"I love to eat sea food\"\n",
530 |     "]"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": null,
536 |    "metadata": {},
537 |    "outputs": [],
538 |    "source": [
539 |     "prediction = getSinglePrediction(pred_sentences)"
540 |    ]
541 |   },
542 |   {
543 |    "cell_type": "code",
544 |    "execution_count": null,
545 |    "metadata": {},
546 |    "outputs": [],
547 |    "source": [
548 |     "prediction"
549 |    ]
550 |   },
551 |   {
552 |    "cell_type": "code",
553 |    "execution_count": null,
554 |    "metadata": {},
555 |    "outputs": [],
556 |    "source": []
557 |   }
558 |  ],
559 |  "metadata": {
560 |   "kernelspec": {
561 |    "display_name": "Python 3",
562 |    "language": "python",
563 |    "name": "python3"
564 |   },
565 |   "language_info": {
566 |    "codemirror_mode": {
567 |     "name": "ipython",
568 |     "version": 3
569 |    },
570 |    "file_extension": ".py",
571 |    "mimetype": "text/x-python",
572 |    "name": "python",
573 |    "nbconvert_exporter": "python",
574 |    "pygments_lexer": "ipython3",
575 |    "version": "3.7.4"
576 |   }
577 |  },
578 |  "nbformat": 4,
579 |  "nbformat_minor": 2
580 | }
581 | 


--------------------------------------------------------------------------------
/src/bert_sentiment_classifier.py:
--------------------------------------------------------------------------------
  1 | #%% Change working directory from the workspace root to the ipynb file location. Turn this addition off with the DataScience.changeDirOnImportExport setting
  2 | import os
  3 | try:
  4 | 	os.chdir(os.path.join(os.getcwd(), 'src'))
  5 | 	print(os.getcwd())
  6 | except:
  7 | 	pass
  8 | 
  9 | #%%
 10 | # Copyright 2019 Google Inc.
 11 | 
 12 | # Licensed under the Apache License, Version 2.0 (the "License");
 13 | # you may not use this file except in compliance with the License.
 14 | # You may obtain a copy of the License at
 15 | 
 16 | #     http://www.apache.org/licenses/LICENSE-2.0
 17 | 
 18 | # Unless required by applicable law or agreed to in writing, software
 19 | # distributed under the License is distributed on an "AS IS" BASIS,
 20 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 21 | # See the License for the specific language governing permissions and
 22 | # limitations under the License.
 23 | 
 24 | #%% [markdown]
 25 | # #Predicting Movie Review Sentiment with BERT on TF Hub
 26 | #%% [markdown]
 27 | # If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.
 28 | # 
 29 | # Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMO and GloVE. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.
 30 | # 
 31 | # Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!
 32 | 
 33 | #%%
 34 | from sklearn.model_selection import train_test_split
 35 | import pandas as pd
 36 | import tensorflow as tf
 37 | import tensorflow_hub as hub
 38 | from datetime import datetime
 39 | 
 40 | #%% [markdown]
 41 | # In addition to the standard libraries we imported above, we'll need to install BERT's python package.
 42 | 
 43 | #%%
 44 | get_ipython().system(u'pip install bert-tensorflow')  # shells out to pip; get_ipython is not imported in this file — presumably supplied by the IPython kernel, TODO confirm
 45 | 
 46 | 
 47 | #%%
 48 | import bert
 49 | from bert import run_classifier
 50 | from bert import optimization
 51 | from bert import tokenization
 52 | 
 53 | #%% [markdown]
 54 | # Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.
 55 | # 
 56 | # Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.
 57 | # 
 58 | # Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).
 59 | 
 60 | #%%
 61 | # Set the output directory for saving model file
 62 | # Optionally, set a GCP bucket location
 63 | 
 64 | OUTPUT_DIR = 'bert'#@param {type:"string"}
 65 | #@markdown Whether or not to clear/delete the directory and create a new one
 66 | DO_DELETE = False #@param {type:"boolean"}
 67 | #@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
 68 | USE_BUCKET = True #@param {type:"boolean"}
 69 | BUCKET = 'my-cloud-bucket-ml' #@param {type:"string"}
 70 | 
 71 | if USE_BUCKET:
 72 |   OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
 73 |   from google.colab import auth
 74 |   auth.authenticate_user()
 75 | 
 76 | if DO_DELETE:
 77 |   try:
 78 |     tf.gfile.DeleteRecursively(OUTPUT_DIR)
 79 |   except:
 80 |     # Doesn't matter if the directory didn't exist
 81 |     pass
 82 | tf.gfile.MakeDirs(OUTPUT_DIR)
 83 | print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
 84 | 
 85 | #%% [markdown]
 86 | # #Data
 87 | #%% [markdown]
 88 | # First, let's download the dataset, hosted by Stanford. The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub).
 89 | 
 90 | #%%
 91 | from tensorflow import keras
 92 | import os
 93 | import re
 94 | 
 95 | # Load all files from a directory in a DataFrame.
 96 | def load_directory_data(directory):
 97 |   data = {}
 98 |   data["sentence"] = []
 99 |   data["sentiment"] = []
100 |   for file_path in os.listdir(directory):
101 |     with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
102 |       data["sentence"].append(f.read())
103 |       data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
104 |   return pd.DataFrame.from_dict(data)
105 | 
106 | # Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  """Loads the "pos" and "neg" sub-directories into one shuffled DataFrame.

  Each sub-directory is read with `load_directory_data`; rows from "pos" get
  polarity 1 and rows from "neg" get polarity 0. The combined frame is shuffled
  (no fixed seed) and re-indexed from 0.

  Args:
    directory: Directory containing the "pos" and "neg" sub-directories.

  Returns:
    The shuffled `pd.DataFrame` with an added "polarity" column.
  """
  frames = []
  for subdir, polarity in (("pos", 1), ("neg", 0)):
    frame = load_directory_data(os.path.join(directory, subdir))
    frame["polarity"] = polarity
    frames.append(frame)

  combined = pd.concat(frames)
  shuffled = combined.sample(frac=1)
  return shuffled.reset_index(drop=True)
113 | 
114 | # Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  """Fetches the IMDB review archive and loads its train and test splits.

  The archive is downloaded and extracted with `tf.keras.utils.get_file`; the
  "aclImdb/train" and "aclImdb/test" directories next to the downloaded file
  are then read with `load_dataset`.

  Args:
    force_download: Accepted for interface compatibility; this function does
      not read it.

  Returns:
    A `(train_df, test_df)` tuple of DataFrames.
  """
  archive_path = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz",
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True)

  # The extracted "aclImdb" folder sits beside the downloaded archive.
  dataset_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
  train_df = load_dataset(os.path.join(dataset_root, "train"))
  test_df = load_dataset(os.path.join(dataset_root, "test"))

  return train_df, test_df
127 | 
128 | 
129 | #%%
# Download (on first use) and load both dataset splits as DataFrames.
train, test = download_and_load_datasets()

#%% [markdown]
# To keep training fast, we'll take a sample of 5000 train and test examples, respectively.

#%%
# No random_state is passed, so a different subset is drawn on every run.
train = train.sample(5000)
test = test.sample(5000)


#%%
train.columns

#%% [markdown]
# For us, our input data is the 'sentence' column and our label is the 'polarity' column (0 and 1 for negative and positive, respectively).

#%%
# Column holding the review text and column holding the class id.
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# label_list is the list of all possible labels, e.g. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]
151 | 
152 | #%% [markdown]
153 | # #Data Preprocessing
154 | # We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.
155 | # 
 156 | # - `text_a` is the text we want to classify, which in this case is the `sentence` column in our DataFrame.
157 | # - `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
158 | # - `label` is the label for our example, i.e. True, False
159 | 
160 | #%%
# Use the InputExample class from BERT's run_classifier code to create examples from the data
def _row_to_input_example(row):
  """Wraps one DataFrame row in a single-sentence `InputExample`."""
  return bert.run_classifier.InputExample(
      guid=None,  # Globally unique ID for bookkeeping, unused in this example
      text_a=row[DATA_COLUMN],
      text_b=None,  # No second sentence: this is a single-text task
      label=row[LABEL_COLUMN])

# axis=1 applies the helper row by row, giving one InputExample per review.
train_InputExamples = train.apply(_row_to_input_example, axis=1)

test_InputExamples = test.apply(_row_to_input_example, axis=1)
171 | 
172 | #%% [markdown]
173 | # Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):
174 | # 
175 | # 
176 | # 1. Lowercase our text (if we're using a BERT lowercase model)
177 | # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
178 | # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
179 | # 4. Map our words to indexes using a vocab file that BERT provides
180 | # 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
181 | # 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
182 | # 
183 | # Happily, we don't have to worry about most of these details.
184 | # 
185 | # 
186 | # 
187 | #%% [markdown]
188 | # To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:
189 | 
190 | #%%
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Builds a `FullTokenizer` from the Hub module's tokenization info.

  The module's "tokenization_info" signature is evaluated in a throwaway graph
  and session to obtain the vocab file and the lowercasing flag.

  Returns:
    A `bert.tokenization.FullTokenizer` configured with those two values.
  """
  scratch_graph = tf.Graph()
  with scratch_graph.as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lower_case = session.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)

tokenizer = create_tokenizer_from_hub_module()
207 | 
208 | #%% [markdown]
 209 | # Great--we just learned that the BERT model we're using expects lowercase data (that's what is stored in tokenization_info["do_lower_case"]), and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:
210 | 
211 | #%%
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

#%% [markdown]
# Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

#%%
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128

def _to_features(examples):
  """Converts InputExamples into the InputFeatures that BERT understands."""
  return bert.run_classifier.convert_examples_to_features(
      examples, label_list, MAX_SEQ_LENGTH, tokenizer)

# Convert our train and test examples (train first, then test).
train_features = _to_features(train_InputExamples)
test_features = _to_features(test_InputExamples)
223 | 
224 | #%% [markdown]
225 | # #Creating a model
226 | # 
227 | # Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning).
228 | 
229 | #%%
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels, is_training=None):
  """Creates a classification model on top of the BERT TF Hub module.

  A single dense layer (weights + bias) is added on the module's
  "pooled_output" and trained together with BERT (`trainable=True`).

  Args:
    is_predicting: If true, only predictions are returned and no loss is built.
    input_ids: Fed to the module's "tokens" signature as `input_ids`.
    input_mask: Fed to the module's "tokens" signature as `input_mask`.
    segment_ids: Fed to the module's "tokens" signature as `segment_ids`.
    labels: Integer class ids; one-hot encoded with depth `num_labels`.
    num_labels: Number of output classes.
    is_training: Whether to apply dropout to the pooled output. Defaults to
      `not is_predicting`, so prediction is deterministic. Callers that can
      tell evaluation apart from training may pass False for evaluation.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is true, otherwise
    `(loss, predicted_labels, log_probs)`. `log_probs` are log-softmax
    outputs, i.e. log-probabilities rather than probabilities.
  """
  if is_training is None:
    is_training = not is_predicting

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Create our own layer to tune for the sentiment classification task.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting, but randomly zeroing activations at
    # inference time makes predictions noisy, so only apply it when training.
    if is_training:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # NOTE(review): tf.squeeze drops every size-1 dimension, so a batch of one
    # presumably yields a scalar here -- confirm against the predict helpers.
    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
    # If we're predicting, we want predicted labels and the log-probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # If we're train/eval, compute loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)
281 | 
282 | #%% [markdown]
283 | # Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction.
284 | 
285 | #%%
286 | # model_fn_builder actually creates our model function
287 | # using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    num_labels: Number of output classes, forwarded to `create_model`.
    learning_rate: Forwarded to `bert.optimization.create_optimizer`.
    num_train_steps: Forwarded to `bert.optimization.create_optimizer`.
    num_warmup_steps: Forwarded to `bert.optimization.create_optimizer`.

  Returns:
    A function `model_fn(features, labels, mode, params)` that returns the
    `tf.estimator.EstimatorSpec` for TRAIN, EVAL or PREDICT mode.
  """

  def metric_fn(label_ids, predicted_labels):
    """Builds the evaluation metric ops from true and predicted class ids."""
    accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
    f1_score = tf.contrib.metrics.f1_score(
        label_ids,
        predicted_labels)
    # NOTE(review): AUC is fed the hard predicted labels, not scores.
    auc = tf.metrics.auc(
        label_ids,
        predicted_labels)
    recall = tf.metrics.recall(
        label_ids,
        predicted_labels)
    precision = tf.metrics.precision(
        label_ids,
        predicted_labels)
    true_pos = tf.metrics.true_positives(
        label_ids,
        predicted_labels)
    true_neg = tf.metrics.true_negatives(
        label_ids,
        predicted_labels)
    false_pos = tf.metrics.false_positives(
        label_ids,
        predicted_labels)
    false_neg = tf.metrics.false_negatives(
        label_ids,
        predicted_labels)
    return {
        "eval_accuracy": accuracy,
        "f1_score": f1_score,
        "auc": auc,
        "precision": precision,
        "recall": recall,
        "true_positives": true_pos,
        "true_negatives": true_neg,
        "false_positives": false_pos,
        "false_negatives": false_neg
    }

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` handed to the Estimator."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    # Labels are read from the features dict, not from the `labels` argument.
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

    # PREDICT: no loss, optimizer or metrics are needed.
    if is_predicting:
      (predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          # The key says 'probabilities' but the values are log-probabilities.
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # TRAIN and EVAL share the loss computation.
    (loss, predicted_labels, log_probs) = create_model(
      is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # Build the optimizer only when it is used, keeping it out of the
      # evaluation graph.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
        loss=loss,
        train_op=train_op)

    # EVAL: build the metric ops only here, keeping them out of the train graph.
    eval_metrics = metric_fn(label_ids, predicted_labels)
    return tf.estimator.EstimatorSpec(mode=mode,
      loss=loss,
      eval_metric_ops=eval_metrics)

  # Return the actual model function in the closure
  return model_fn
371 | 
372 | 
373 | #%%
# Training hyperparameters.
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100


#%%
# Compute the number of train and warmup steps from the batch size.
# int() truncates, so a fractional final step is dropped.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)


#%%
# Specify the output directory and how often to save summaries and checkpoints
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)


#%%
# Bind the hyperparameters into a model_fn for the Estimator.
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

# The batch size is passed through `params`.
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})
412 | 
413 | #%% [markdown]
414 | # Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators).
415 | 
416 | #%%
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features, seq_length=MAX_SEQ_LENGTH,
    is_training=True, drop_remainder=False)

#%% [markdown]
# Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes.

#%%
print('Beginning Training!')
# Time the whole training call.
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
elapsed = datetime.now() - current_time
print("Training took time ", elapsed)

#%% [markdown]
# Now let's use our test data to see how well our model did:

#%%
# Same builder as for training, with is_training switched off.
test_input_fn = run_classifier.input_fn_builder(
    features=test_features, seq_length=MAX_SEQ_LENGTH,
    is_training=False, drop_remainder=False)


#%%
# steps=None places no step limit on the evaluation.
estimator.evaluate(input_fn=test_input_fn, steps=None)
446 | 
447 | #%% [markdown]
448 | # Now let's write code to make predictions on new sentences:
449 | 
450 | #%%
def getPrediction(in_sentences):
  """Classifies each sentence with the trained estimator.

  Args:
    in_sentences: Iterable of raw sentences to classify.

  Returns:
    A list of `(sentence, prediction['probabilities'], label_name)` tuples,
    one per sentence, where `label_name` is "Negative" or "Positive".
  """
  labels = ["Negative", "Positive"]

  # guid is left empty and label=0 is only a placeholder.
  input_examples = []
  for sentence in in_sentences:
    input_examples.append(
        run_classifier.InputExample(guid="", text_a=sentence, text_b=None, label=0))

  input_features = run_classifier.convert_examples_to_features(
      input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(
      features=input_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)

  results = []
  for sentence, prediction in zip(in_sentences, predictions):
    label_name = labels[prediction['labels']]
    results.append((sentence, prediction['probabilities'], label_name))
  return results
458 | 
459 | 
460 | #%%
def getSinglePrediction(in_sentences):
  """Classifies sentences using batched (not per-example) predict output.

  `estimator.predict` is called with `yield_single_examples=False`, so it
  yields one dict per batch. Every batch is unpacked row by row, so each input
  sentence is paired with its own prediction regardless of how many sentences
  are passed.

  Args:
    in_sentences: Iterable of raw sentences to classify.

  Returns:
    A list of `(sentence, probabilities, label_name)` tuples, one per sentence.
    `probabilities` is that sentence's one-row slice of the batch's
    'probabilities' entry, which for a single-sentence call is the same value
    the batch itself holds. `label_name` is "Negative" or "Positive".
  """
  labels = ["Negative", "Positive"]
  # guid is left empty and label=0 is only a placeholder.
  input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences]
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  batches = estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)

  results = []
  remaining_sentences = iter(in_sentences)
  for batch in batches:
    batch_label_ids = batch['labels']
    # A batch holding one example may carry its label as a scalar rather than
    # a 1-D array; wrap it so both cases iterate the same way.
    if getattr(batch_label_ids, 'ndim', 0) == 0:
      batch_label_ids = [batch_label_ids]
    # Labels come first in the zip so that no sentence is consumed from the
    # shared iterator once a batch has run out of labels.
    for row, (label_id, sentence) in enumerate(zip(batch_label_ids, remaining_sentences)):
      results.append(
          (sentence, batch['probabilities'][row:row + 1], labels[label_id]))
  return results
468 | 
469 | 
470 | #%%
# Example reviews to classify together.
pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]


#%%
# Each result is a (sentence, probabilities, label name) tuple.
predictions = getPrediction(pred_sentences)

#%% [markdown]
# Voila! We have a sentiment classifier!

#%%
predictions


#%%
# A one-sentence input, classified with getSinglePrediction below.
pred_sentences = [
  "I love to eat sea food"
]


#%%
prediction = getSinglePrediction(pred_sentences)


#%%
prediction
501 | 
502 | 
503 | 


--------------------------------------------------------------------------------
/src/bert_sentiment_classifier.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "Predicting Movie Reviews with BERT on TF Hub.ipynb",
  7 |       "version": "0.3.2",
  8 |       "provenance": [],
  9 |       "collapsed_sections": []
 10 |     },
 11 |     "kernelspec": {
 12 |       "name": "python3",
 13 |       "display_name": "Python 3"
 14 |     },
 15 |     "accelerator": "GPU"
 16 |   },
 17 |   "cells": [
 18 |     {
 19 |       "metadata": {
 20 |         "id": "j0a4mTk9o1Qg",
 21 |         "colab_type": "code",
 22 |         "colab": {}
 23 |       },
 24 |       "cell_type": "code",
 25 |       "source": [
 26 |         "# Copyright 2019 Google Inc.\n",
 27 |         "\n",
 28 |         "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
 29 |         "# you may not use this file except in compliance with the License.\n",
 30 |         "# You may obtain a copy of the License at\n",
 31 |         "\n",
 32 |         "#     http://www.apache.org/licenses/LICENSE-2.0\n",
 33 |         "\n",
 34 |         "# Unless required by applicable law or agreed to in writing, software\n",
 35 |         "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
 36 |         "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
 37 |         "# See the License for the specific language governing permissions and\n",
 38 |         "# limitations under the License."
 39 |       ],
 40 |       "execution_count": 0,
 41 |       "outputs": []
 42 |     },
 43 |     {
 44 |       "metadata": {
 45 |         "id": "dCpvgG0vwXAZ",
 46 |         "colab_type": "text"
 47 |       },
 48 |       "cell_type": "markdown",
 49 |       "source": [
 50 |         "#Predicting Movie Review Sentiment with BERT on TF Hub"
 51 |       ]
 52 |     },
 53 |     {
 54 |       "metadata": {
 55 |         "id": "xiYrZKaHwV81",
 56 |         "colab_type": "text"
 57 |       },
 58 |       "cell_type": "markdown",
 59 |       "source": [
 60 |         "If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.\n",
 61 |         "\n",
 62 |         "Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMO and GloVE. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.\n",
 63 |         "\n",
 64 |         "Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!"
 65 |       ]
 66 |     },
 67 |     {
 68 |       "metadata": {
 69 |         "id": "hsZvic2YxnTz",
 70 |         "colab_type": "code",
 71 |         "colab": {}
 72 |       },
 73 |       "cell_type": "code",
 74 |       "source": [
 75 |         "from sklearn.model_selection import train_test_split\n",
 76 |         "import pandas as pd\n",
 77 |         "import tensorflow as tf\n",
 78 |         "import tensorflow_hub as hub\n",
 79 |         "from datetime import datetime"
 80 |       ],
 81 |       "execution_count": 0,
 82 |       "outputs": []
 83 |     },
 84 |     {
 85 |       "metadata": {
 86 |         "id": "cp5wfXDx5SPH",
 87 |         "colab_type": "text"
 88 |       },
 89 |       "cell_type": "markdown",
 90 |       "source": [
 91 |         "In addition to the standard libraries we imported above, we'll need to install BERT's python package."
 92 |       ]
 93 |     },
 94 |     {
 95 |       "metadata": {
 96 |         "id": "jviywGyWyKsA",
 97 |         "colab_type": "code",
 98 |         "colab": {}
 99 |       },
100 |       "cell_type": "code",
101 |       "source": [
102 |         "!pip install bert-tensorflow"
103 |       ],
104 |       "execution_count": 0,
105 |       "outputs": []
106 |     },
107 |     {
108 |       "metadata": {
109 |         "id": "hhbGEfwgdEtw",
110 |         "colab_type": "code",
111 |         "colab": {}
112 |       },
113 |       "cell_type": "code",
114 |       "source": [
115 |         "import bert\n",
116 |         "from bert import run_classifier\n",
117 |         "from bert import optimization\n",
118 |         "from bert import tokenization"
119 |       ],
120 |       "execution_count": 0,
121 |       "outputs": []
122 |     },
123 |     {
124 |       "metadata": {
125 |         "id": "KVB3eOcjxxm1",
126 |         "colab_type": "text"
127 |       },
128 |       "cell_type": "markdown",
129 |       "source": [
130 |         "Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.\n",
131 |         "\n",
132 |         "Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.\n",
133 |         "\n",
134 |         "Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist)."
135 |       ]
136 |     },
137 |     {
138 |       "metadata": {
139 |         "id": "US_EAnICvP7f",
140 |         "colab_type": "code",
141 |         "cellView": "form",
142 |         "colab": {}
143 |       },
144 |       "cell_type": "code",
145 |       "source": [
146 |         "# Set the output directory for saving model file\n",
147 |         "# Optionally, set a GCP bucket location\n",
148 |         "\n",
149 |         "OUTPUT_DIR = 'bert'#@param {type:\"string\"}\n",
150 |         "#@markdown Whether or not to clear/delete the directory and create a new one\n",
151 |         "DO_DELETE = False #@param {type:\"boolean\"}\n",
152 |         "#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.\n",
153 |         "USE_BUCKET = True #@param {type:\"boolean\"}\n",
154 |         "BUCKET = 'my-cloud-bucket-ml' #@param {type:\"string\"}\n",
155 |         "\n",
156 |         "if USE_BUCKET:\n",
157 |         "  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)\n",
158 |         "  from google.colab import auth\n",
159 |         "  auth.authenticate_user()\n",
160 |         "\n",
161 |         "if DO_DELETE:\n",
162 |         "  try:\n",
163 |         "    tf.gfile.DeleteRecursively(OUTPUT_DIR)\n",
164 |         "  except:\n",
165 |         "    # Doesn't matter if the directory didn't exist\n",
166 |         "    pass\n",
167 |         "tf.gfile.MakeDirs(OUTPUT_DIR)\n",
168 |         "print('***** Model output directory: {} *****'.format(OUTPUT_DIR))\n"
169 |       ],
170 |       "execution_count": 0,
171 |       "outputs": []
172 |     },
173 |     {
174 |       "metadata": {
175 |         "id": "pmFYvkylMwXn",
176 |         "colab_type": "text"
177 |       },
178 |       "cell_type": "markdown",
179 |       "source": [
180 |         "#Data"
181 |       ]
182 |     },
183 |     {
184 |       "metadata": {
185 |         "id": "MC_w8SRqN0fr",
186 |         "colab_type": "text"
187 |       },
188 |       "cell_type": "markdown",
189 |       "source": [
190 |         "First, let's download the dataset, hosted by Stanford. The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub)."
191 |       ]
192 |     },
193 |     {
194 |       "metadata": {
195 |         "id": "fom_ff20gyy6",
196 |         "colab_type": "code",
197 |         "colab": {}
198 |       },
199 |       "cell_type": "code",
200 |       "source": [
201 |         "from tensorflow import keras\n",
202 |         "import os\n",
203 |         "import re\n",
204 |         "\n",
205 |         "# Load all files from a directory in a DataFrame.\n",
206 |         "def load_directory_data(directory):\n",
207 |         "  data = {}\n",
208 |         "  data[\"sentence\"] = []\n",
209 |         "  data[\"sentiment\"] = []\n",
210 |         "  for file_path in os.listdir(directory):\n",
211 |         "    with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n",
212 |         "      data[\"sentence\"].append(f.read())\n",
213 |         "      data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n",
214 |         "  return pd.DataFrame.from_dict(data)\n",
215 |         "\n",
216 |         "# Merge positive and negative examples, add a polarity column and shuffle.\n",
217 |         "def load_dataset(directory):\n",
218 |         "  pos_df = load_directory_data(os.path.join(directory, \"pos\"))\n",
219 |         "  neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n",
220 |         "  pos_df[\"polarity\"] = 1\n",
221 |         "  neg_df[\"polarity\"] = 0\n",
222 |         "  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n",
223 |         "\n",
224 |         "# Download and process the dataset files.\n",
225 |         "def download_and_load_datasets(force_download=False):\n",
226 |         "  dataset = tf.keras.utils.get_file(\n",
227 |         "      fname=\"aclImdb.tar.gz\", \n",
228 |         "      origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n",
229 |         "      extract=True)\n",
230 |         "  \n",
231 |         "  train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n",
232 |         "                                       \"aclImdb\", \"train\"))\n",
233 |         "  test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n",
234 |         "                                      \"aclImdb\", \"test\"))\n",
235 |         "  \n",
236 |         "  return train_df, test_df\n"
237 |       ],
238 |       "execution_count": 0,
239 |       "outputs": []
240 |     },
241 |     {
242 |       "metadata": {
243 |         "id": "2abfwdn-g135",
244 |         "colab_type": "code",
245 |         "colab": {}
246 |       },
247 |       "cell_type": "code",
248 |       "source": [
249 |         "train, test = download_and_load_datasets()"
250 |       ],
251 |       "execution_count": 0,
252 |       "outputs": []
253 |     },
254 |     {
255 |       "metadata": {
256 |         "id": "XA8WHJgzhIZf",
257 |         "colab_type": "text"
258 |       },
259 |       "cell_type": "markdown",
260 |       "source": [
261 |         "To keep training fast, we'll take a sample of 5000 train and test examples, respectively."
262 |       ]
263 |     },
264 |     {
265 |       "metadata": {
266 |         "id": "lw_F488eixTV",
267 |         "colab_type": "code",
268 |         "colab": {}
269 |       },
270 |       "cell_type": "code",
271 |       "source": [
272 |         "train = train.sample(5000)\n",
273 |         "test = test.sample(5000)"
274 |       ],
275 |       "execution_count": 0,
276 |       "outputs": []
277 |     },
278 |     {
279 |       "metadata": {
280 |         "id": "prRQM8pDi8xI",
281 |         "colab_type": "code",
282 |         "colab": {}
283 |       },
284 |       "cell_type": "code",
285 |       "source": [
286 |         "train.columns"
287 |       ],
288 |       "execution_count": 0,
289 |       "outputs": []
290 |     },
291 |     {
292 |       "metadata": {
293 |         "id": "sfRnHSz3iSXz",
294 |         "colab_type": "text"
295 |       },
296 |       "cell_type": "markdown",
297 |       "source": [
298 |         "For us, our input data is the 'sentence' column and our label is the 'polarity' column (0 and 1 for negative and positive, respectively)."
299 |       ]
300 |     },
301 |     {
302 |       "metadata": {
303 |         "id": "IuMOGwFui4it",
304 |         "colab_type": "code",
305 |         "colab": {}
306 |       },
307 |       "cell_type": "code",
308 |       "source": [
309 |         "DATA_COLUMN = 'sentence'\n",
310 |         "LABEL_COLUMN = 'polarity'\n",
311 |         "# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'\n",
312 |         "label_list = [0, 1]"
313 |       ],
314 |       "execution_count": 0,
315 |       "outputs": []
316 |     },
317 |     {
318 |       "metadata": {
319 |         "id": "V399W0rqNJ-Z",
320 |         "colab_type": "text"
321 |       },
322 |       "cell_type": "markdown",
323 |       "source": [
324 |         "# Data Preprocessing\n",
325 |         "We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.\n",
326 |         "\n",
327 |         "- `text_a` is the text we want to classify, which in this case is the `sentence` column in our DataFrame.\n",
328 |         "- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.\n",
329 |         "- `label` is the label for our example, e.g. True or False (here: 0 or 1, the `polarity` value)"
330 |       ]
331 |     },
332 |     {
333 |       "metadata": {
334 |         "id": "p9gEt5SmM6i6",
335 |         "colab_type": "code",
336 |         "colab": {}
337 |       },
338 |       "cell_type": "code",
339 |       "source": [
340 |         "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
341 |         "train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n",
342 |         "                                                                   text_a = x[DATA_COLUMN], \n",
343 |         "                                                                   text_b = None, \n",
344 |         "                                                                   label = x[LABEL_COLUMN]), axis = 1)\n",
345 |         "\n",
346 |         "test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n",
347 |         "                                                                   text_a = x[DATA_COLUMN], \n",
348 |         "                                                                   text_b = None, \n",
349 |         "                                                                   label = x[LABEL_COLUMN]), axis = 1)"
350 |       ],
351 |       "execution_count": 0,
352 |       "outputs": []
353 |     },
354 |     {
355 |       "metadata": {
356 |         "id": "SCZWZtKxObjh",
357 |         "colab_type": "text"
358 |       },
359 |       "cell_type": "markdown",
360 |       "source": [
361 |         "Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):\n",
362 |         "\n",
363 |         "\n",
364 |         "1. Lowercase our text (if we're using a BERT lowercase model)\n",
365 |         "2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n",
366 |         "3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n",
367 |         "4. Map our words to indexes using a vocab file that BERT provides\n",
368 |         "5. Add special \"CLS\" and \"SEP\" tokens (see the [readme](https://github.com/google-research/bert))\n",
369 |         "6. Append \"index\" and \"segment\" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))\n",
370 |         "\n",
371 |         "Happily, we don't have to worry about most of these details.\n",
372 |         "\n",
373 |         "\n"
374 |       ]
375 |     },
376 |     {
377 |       "metadata": {
378 |         "id": "qMWiDtpyQSoU",
379 |         "colab_type": "text"
380 |       },
381 |       "cell_type": "markdown",
382 |       "source": [
383 |         "To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:"
384 |       ]
385 |     },
386 |     {
387 |       "metadata": {
388 |         "id": "IhJSe0QHNG7U",
389 |         "colab_type": "code",
390 |         "colab": {}
391 |       },
392 |       "cell_type": "code",
393 |       "source": [
394 |         "# This is a path to an uncased (all lowercase) version of BERT\n",
395 |         "BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n",
396 |         "\n",
397 |         "def create_tokenizer_from_hub_module():\n",
398 |         "  \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n",
399 |         "  with tf.Graph().as_default():\n",
400 |         "    bert_module = hub.Module(BERT_MODEL_HUB)\n",
401 |         "    tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n",
402 |         "    with tf.Session() as sess:\n",
403 |         "      vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n",
404 |         "                                            tokenization_info[\"do_lower_case\"]])\n",
405 |         "      \n",
406 |         "  return bert.tokenization.FullTokenizer(\n",
407 |         "      vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
408 |         "\n",
409 |         "tokenizer = create_tokenizer_from_hub_module()"
410 |       ],
411 |       "execution_count": 0,
412 |       "outputs": []
413 |     },
414 |     {
415 |       "metadata": {
416 |         "id": "z4oFkhpZBDKm",
417 |         "colab_type": "text"
418 |       },
419 |       "cell_type": "markdown",
420 |       "source": [
421 |         "Great--we just learned that the BERT model we're using expects lowercase data (that's what is stored in tokenization_info[\"do_lower_case\"]) and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:"
422 |       ]
423 |     },
424 |     {
425 |       "metadata": {
426 |         "id": "dsBo6RCtQmwx",
427 |         "colab_type": "code",
428 |         "colab": {}
429 |       },
430 |       "cell_type": "code",
431 |       "source": [
432 |         "tokenizer.tokenize(\"This here's an example of using the BERT tokenizer\")"
433 |       ],
434 |       "execution_count": 0,
435 |       "outputs": []
436 |     },
437 |     {
438 |       "metadata": {
439 |         "id": "0OEzfFIt6GIc",
440 |         "colab_type": "text"
441 |       },
442 |       "cell_type": "markdown",
443 |       "source": [
444 |         "Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands."
445 |       ]
446 |     },
447 |     {
448 |       "metadata": {
449 |         "id": "LL5W8gEGRTAf",
450 |         "colab_type": "code",
451 |         "colab": {}
452 |       },
453 |       "cell_type": "code",
454 |       "source": [
455 |         "# We'll set sequences to be at most 128 tokens long.\n",
456 |         "MAX_SEQ_LENGTH = 128\n",
457 |         "# Convert our train and test features to InputFeatures that BERT understands.\n",
458 |         "train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
459 |         "test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)"
460 |       ],
461 |       "execution_count": 0,
462 |       "outputs": []
463 |     },
464 |     {
465 |       "metadata": {
466 |         "id": "ccp5trMwRtmr",
467 |         "colab_type": "text"
468 |       },
469 |       "cell_type": "markdown",
470 |       "source": [
471 |         "# Creating a model\n",
472 |         "\n",
473 |         "Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning)."
474 |       ]
475 |     },
476 |     {
477 |       "metadata": {
478 |         "id": "6o2a5ZIvRcJq",
479 |         "colab_type": "code",
480 |         "colab": {}
481 |       },
482 |       "cell_type": "code",
483 |       "source": [
484 |         "def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,\n",
485 |         "                 num_labels):\n",
486 |         "  \"\"\"Creates a classification model.\"\"\"\n",
487 |         "\n",
488 |         "  bert_module = hub.Module(\n",
489 |         "      BERT_MODEL_HUB,\n",
490 |         "      trainable=True)\n",
491 |         "  bert_inputs = dict(\n",
492 |         "      input_ids=input_ids,\n",
493 |         "      input_mask=input_mask,\n",
494 |         "      segment_ids=segment_ids)\n",
495 |         "  bert_outputs = bert_module(\n",
496 |         "      inputs=bert_inputs,\n",
497 |         "      signature=\"tokens\",\n",
498 |         "      as_dict=True)\n",
499 |         "\n",
500 |         "  # Use \"pooled_output\" for classification tasks on an entire sentence.\n",
501 |         "  # Use \"sequence_outputs\" for token-level output.\n",
502 |         "  output_layer = bert_outputs[\"pooled_output\"]\n",
503 |         "\n",
504 |         "  hidden_size = output_layer.shape[-1].value\n",
505 |         "\n",
506 |         "  # Create our own layer to tune for sentiment data.\n",
507 |         "  output_weights = tf.get_variable(\n",
508 |         "      \"output_weights\", [num_labels, hidden_size],\n",
509 |         "      initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
510 |         "\n",
511 |         "  output_bias = tf.get_variable(\n",
512 |         "      \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n",
513 |         "\n",
514 |         "  with tf.variable_scope(\"loss\"):\n",
515 |         "\n",
516 |         "    # Dropout helps prevent overfitting\n",
517 |         "    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n",
518 |         "\n",
519 |         "    logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n",
520 |         "    logits = tf.nn.bias_add(logits, output_bias)\n",
521 |         "    log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
522 |         "\n",
523 |         "    # Convert labels into one-hot encoding\n",
524 |         "    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n",
525 |         "\n",
526 |         "    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n",
527 |         "    # If we're predicting, we want predicted labels and the log probabilities.\n",
528 |         "    if is_predicting:\n",
529 |         "      return (predicted_labels, log_probs)\n",
530 |         "\n",
531 |         "    # If we're train/eval, compute loss between predicted and actual label\n",
532 |         "    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n",
533 |         "    loss = tf.reduce_mean(per_example_loss)\n",
534 |         "    return (loss, predicted_labels, log_probs)\n"
535 |       ],
536 |       "execution_count": 0,
537 |       "outputs": []
538 |     },
539 |     {
540 |       "metadata": {
541 |         "id": "qpE0ZIDOCQzE",
542 |         "colab_type": "text"
543 |       },
544 |       "cell_type": "markdown",
545 |       "source": [
546 |         "Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction."
547 |       ]
548 |     },
549 |     {
550 |       "metadata": {
551 |         "id": "FnH-AnOQ9KKW",
552 |         "colab_type": "code",
553 |         "colab": {}
554 |       },
555 |       "cell_type": "code",
556 |       "source": [
557 |         "# model_fn_builder actually creates our model function\n",
558 |         "# using the passed parameters for num_labels, learning_rate, etc.\n",
559 |         "def model_fn_builder(num_labels, learning_rate, num_train_steps,\n",
560 |         "                     num_warmup_steps):\n",
561 |         "  \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n",
562 |         "  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
563 |         "    \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
564 |         "\n",
565 |         "    input_ids = features[\"input_ids\"]\n",
566 |         "    input_mask = features[\"input_mask\"]\n",
567 |         "    segment_ids = features[\"segment_ids\"]\n",
568 |         "    label_ids = features[\"label_ids\"]\n",
569 |         "\n",
570 |         "    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n",
571 |         "    \n",
572 |         "    # TRAIN and EVAL\n",
573 |         "    if not is_predicting:\n",
574 |         "\n",
575 |         "      (loss, predicted_labels, log_probs) = create_model(\n",
576 |         "        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n",
577 |         "\n",
578 |         "      train_op = bert.optimization.create_optimizer(\n",
579 |         "          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n",
580 |         "\n",
581 |         "      # Calculate evaluation metrics. \n",
582 |         "      def metric_fn(label_ids, predicted_labels):\n",
583 |         "        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n",
584 |         "        f1_score = tf.contrib.metrics.f1_score(\n",
585 |         "            label_ids,\n",
586 |         "            predicted_labels)\n",
587 |         "        auc = tf.metrics.auc(\n",
588 |         "            label_ids,\n",
589 |         "            predicted_labels)\n",
590 |         "        recall = tf.metrics.recall(\n",
591 |         "            label_ids,\n",
592 |         "            predicted_labels)\n",
593 |         "        precision = tf.metrics.precision(\n",
594 |         "            label_ids,\n",
595 |         "            predicted_labels) \n",
596 |         "        true_pos = tf.metrics.true_positives(\n",
597 |         "            label_ids,\n",
598 |         "            predicted_labels)\n",
599 |         "        true_neg = tf.metrics.true_negatives(\n",
600 |         "            label_ids,\n",
601 |         "            predicted_labels)   \n",
602 |         "        false_pos = tf.metrics.false_positives(\n",
603 |         "            label_ids,\n",
604 |         "            predicted_labels)  \n",
605 |         "        false_neg = tf.metrics.false_negatives(\n",
606 |         "            label_ids,\n",
607 |         "            predicted_labels)\n",
608 |         "        return {\n",
609 |         "            \"eval_accuracy\": accuracy,\n",
610 |         "            \"f1_score\": f1_score,\n",
611 |         "            \"auc\": auc,\n",
612 |         "            \"precision\": precision,\n",
613 |         "            \"recall\": recall,\n",
614 |         "            \"true_positives\": true_pos,\n",
615 |         "            \"true_negatives\": true_neg,\n",
616 |         "            \"false_positives\": false_pos,\n",
617 |         "            \"false_negatives\": false_neg\n",
618 |         "        }\n",
619 |         "\n",
620 |         "      eval_metrics = metric_fn(label_ids, predicted_labels)\n",
621 |         "\n",
622 |         "      if mode == tf.estimator.ModeKeys.TRAIN:\n",
623 |         "        return tf.estimator.EstimatorSpec(mode=mode,\n",
624 |         "          loss=loss,\n",
625 |         "          train_op=train_op)\n",
626 |         "      else:\n",
627 |         "          return tf.estimator.EstimatorSpec(mode=mode,\n",
628 |         "            loss=loss,\n",
629 |         "            eval_metric_ops=eval_metrics)\n",
630 |         "    else:\n",
631 |         "      (predicted_labels, log_probs) = create_model(\n",
632 |         "        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n",
633 |         "\n",
634 |         "      predictions = {\n",
635 |         "          'probabilities': log_probs,\n",
636 |         "          'labels': predicted_labels\n",
637 |         "      }\n",
638 |         "      return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n",
639 |         "\n",
640 |         "  # Return the actual model function in the closure\n",
641 |         "  return model_fn\n"
642 |       ],
643 |       "execution_count": 0,
644 |       "outputs": []
645 |     },
646 |     {
647 |       "metadata": {
648 |         "id": "OjwJ4bTeWXD8",
649 |         "colab_type": "code",
650 |         "colab": {}
651 |       },
652 |       "cell_type": "code",
653 |       "source": [
654 |         "# Compute train and warmup steps from batch size\n",
655 |         "# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)\n",
656 |         "BATCH_SIZE = 32\n",
657 |         "LEARNING_RATE = 2e-5\n",
658 |         "NUM_TRAIN_EPOCHS = 3.0\n",
659 |         "# Warmup is a period of time where the learning rate\n",
660 |         "# is small and gradually increases--usually helps training.\n",
661 |         "WARMUP_PROPORTION = 0.1\n",
662 |         "# Model configs\n",
663 |         "SAVE_CHECKPOINTS_STEPS = 500\n",
664 |         "SAVE_SUMMARY_STEPS = 100"
665 |       ],
666 |       "execution_count": 0,
667 |       "outputs": []
668 |     },
669 |     {
670 |       "metadata": {
671 |         "id": "emHf9GhfWBZ_",
672 |         "colab_type": "code",
673 |         "colab": {}
674 |       },
675 |       "cell_type": "code",
676 |       "source": [
677 |         "# Compute # train and warmup steps from batch size\n",
678 |         "num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n",
679 |         "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)"
680 |       ],
681 |       "execution_count": 0,
682 |       "outputs": []
683 |     },
684 |     {
685 |       "metadata": {
686 |         "id": "oEJldMr3WYZa",
687 |         "colab_type": "code",
688 |         "colab": {}
689 |       },
690 |       "cell_type": "code",
691 |       "source": [
692 |         "# Specify output directory and number of checkpoint steps to save\n",
693 |         "run_config = tf.estimator.RunConfig(\n",
694 |         "    model_dir=OUTPUT_DIR,\n",
695 |         "    save_summary_steps=SAVE_SUMMARY_STEPS,\n",
696 |         "    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)"
697 |       ],
698 |       "execution_count": 0,
699 |       "outputs": []
700 |     },
701 |     {
702 |       "metadata": {
703 |         "id": "q_WebpS1X97v",
704 |         "colab_type": "code",
705 |         "colab": {}
706 |       },
707 |       "cell_type": "code",
708 |       "source": [
709 |         "model_fn = model_fn_builder(\n",
710 |         "  num_labels=len(label_list),\n",
711 |         "  learning_rate=LEARNING_RATE,\n",
712 |         "  num_train_steps=num_train_steps,\n",
713 |         "  num_warmup_steps=num_warmup_steps)\n",
714 |         "\n",
715 |         "estimator = tf.estimator.Estimator(\n",
716 |         "  model_fn=model_fn,\n",
717 |         "  config=run_config,\n",
718 |         "  params={\"batch_size\": BATCH_SIZE})\n"
719 |       ],
720 |       "execution_count": 0,
721 |       "outputs": []
722 |     },
723 |     {
724 |       "metadata": {
725 |         "id": "NOO3RfG1DYLo",
726 |         "colab_type": "text"
727 |       },
728 |       "cell_type": "markdown",
729 |       "source": [
730 |         "Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators)."
731 |       ]
732 |     },
733 |     {
734 |       "metadata": {
735 |         "id": "1Pv2bAlOX_-K",
736 |         "colab_type": "code",
737 |         "colab": {}
738 |       },
739 |       "cell_type": "code",
740 |       "source": [
741 |         "# Create an input function for training. drop_remainder = True for using TPUs.\n",
742 |         "train_input_fn = bert.run_classifier.input_fn_builder(\n",
743 |         "    features=train_features,\n",
744 |         "    seq_length=MAX_SEQ_LENGTH,\n",
745 |         "    is_training=True,\n",
746 |         "    drop_remainder=False)"
747 |       ],
748 |       "execution_count": 0,
749 |       "outputs": []
750 |     },
751 |     {
752 |       "metadata": {
753 |         "id": "t6Nukby2EB6-",
754 |         "colab_type": "text"
755 |       },
756 |       "cell_type": "markdown",
757 |       "source": [
758 |         "Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes."
759 |       ]
760 |     },
761 |     {
762 |       "metadata": {
763 |         "id": "nucD4gluYJmK",
764 |         "colab_type": "code",
765 |         "colab": {}
766 |       },
767 |       "cell_type": "code",
768 |       "source": [
769 |         "print(f'Beginning Training!')\n",
770 |         "current_time = datetime.now()\n",
771 |         "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n",
772 |         "print(\"Training took time \", datetime.now() - current_time)"
773 |       ],
774 |       "execution_count": 0,
775 |       "outputs": []
776 |     },
777 |     {
778 |       "metadata": {
779 |         "id": "CmbLTVniARy3",
780 |         "colab_type": "text"
781 |       },
782 |       "cell_type": "markdown",
783 |       "source": [
784 |         "Now let's use our test data to see how well our model did:"
785 |       ]
786 |     },
787 |     {
788 |       "metadata": {
789 |         "id": "JIhejfpyJ8Bx",
790 |         "colab_type": "code",
791 |         "colab": {}
792 |       },
793 |       "cell_type": "code",
794 |       "source": [
795 |         "test_input_fn = run_classifier.input_fn_builder(\n",
796 |         "    features=test_features,\n",
797 |         "    seq_length=MAX_SEQ_LENGTH,\n",
798 |         "    is_training=False,\n",
799 |         "    drop_remainder=False)"
800 |       ],
801 |       "execution_count": 0,
802 |       "outputs": []
803 |     },
804 |     {
805 |       "metadata": {
806 |         "id": "PPVEXhNjYXC-",
807 |         "colab_type": "code",
808 |         "colab": {}
809 |       },
810 |       "cell_type": "code",
811 |       "source": [
812 |         "estimator.evaluate(input_fn=test_input_fn, steps=None)"
813 |       ],
814 |       "execution_count": 0,
815 |       "outputs": []
816 |     },
817 |     {
818 |       "metadata": {
819 |         "id": "ueKsULteiz1B",
820 |         "colab_type": "text"
821 |       },
822 |       "cell_type": "markdown",
823 |       "source": [
824 |         "Now let's write code to make predictions on new sentences:"
825 |       ]
826 |     },
827 |     {
828 |       "metadata": {
829 |         "id": "OsrbTD2EJTVl",
830 |         "colab_type": "code",
831 |         "colab": {}
832 |       },
833 |       "cell_type": "code",
834 |       "source": [
835 |         "def getPrediction(in_sentences):\n",
836 |         "  labels = [\"Negative\", \"Positive\"]\n",
837 |         "  input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, guid=\"\" and label=0 are just dummy values\n",
838 |         "  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
839 |         "  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n",
840 |         "  predictions = estimator.predict(predict_input_fn)\n",
841 |         "  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]"
842 |       ],
843 |       "execution_count": 0,
844 |       "outputs": []
845 |     },
846 |     {
847 |       "metadata": {
848 |         "id": "RieQtf0HxqFy",
849 |         "colab_type": "code",
850 |         "colab": {}
851 |       },
852 |       "cell_type": "code",
853 |       "source": [
854 |         "def getSinglePrediction(in_sentences):\n",
855 |         "  labels = [\"Negative\", \"Positive\"]\n",
856 |         "  input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, guid=\"\" and label=0 are just dummy values\n",
857 |         "  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
858 |         "  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n",
859 |         "  predictions = estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)\n",
860 |         "  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]"
861 |       ],
862 |       "execution_count": 0,
863 |       "outputs": []
864 |     },
865 |     {
866 |       "metadata": {
867 |         "id": "-thbodgih_VJ",
868 |         "colab_type": "code",
869 |         "colab": {}
870 |       },
871 |       "cell_type": "code",
872 |       "source": [
873 |         "pred_sentences = [\n",
874 |         "  \"That movie was absolutely awful\",\n",
875 |         "  \"The acting was a bit lacking\",\n",
876 |         "  \"The film was creative and surprising\",\n",
877 |         "  \"Absolutely fantastic!\"\n",
878 |         "]"
879 |       ],
880 |       "execution_count": 0,
881 |       "outputs": []
882 |     },
883 |     {
884 |       "metadata": {
885 |         "id": "QrZmvZySKQTm",
886 |         "colab_type": "code",
887 |         "colab": {}
888 |       },
889 |       "cell_type": "code",
890 |       "source": [
891 |         "predictions = getPrediction(pred_sentences)"
892 |       ],
893 |       "execution_count": 0,
894 |       "outputs": []
895 |     },
896 |     {
897 |       "metadata": {
898 |         "id": "MXkRiEBUqN3n",
899 |         "colab_type": "text"
900 |       },
901 |       "cell_type": "markdown",
902 |       "source": [
903 |         "Voila! We have a sentiment classifier!"
904 |       ]
905 |     },
906 |     {
907 |       "metadata": {
908 |         "id": "ERkTE8-7oQLZ",
909 |         "colab_type": "code",
910 |         "colab": {}
911 |       },
912 |       "cell_type": "code",
913 |       "source": [
914 |         "predictions"
915 |       ],
916 |       "execution_count": 0,
917 |       "outputs": []
918 |     },
919 |     {
920 |       "metadata": {
921 |         "id": "ChWCclU4x_rU",
922 |         "colab_type": "code",
923 |         "colab": {}
924 |       },
925 |       "cell_type": "code",
926 |       "source": [
927 |         "pred_sentences = [\n",
928 |         "  \"I love to eat sea food\"\n",
929 |         "]"
930 |       ],
931 |       "execution_count": 0,
932 |       "outputs": []
933 |     },
934 |     {
935 |       "metadata": {
936 |         "id": "jAAkHDb2x7hp",
937 |         "colab_type": "code",
938 |         "colab": {}
939 |       },
940 |       "cell_type": "code",
941 |       "source": [
942 |         "prediction = getSinglePrediction(pred_sentences)"
943 |       ],
944 |       "execution_count": 0,
945 |       "outputs": []
946 |     },
947 |     {
948 |       "metadata": {
949 |         "id": "y2l7Yc_GyPJR",
950 |         "colab_type": "code",
951 |         "colab": {}
952 |       },
953 |       "cell_type": "code",
954 |       "source": [
955 |         "prediction"
956 |       ],
957 |       "execution_count": 0,
958 |       "outputs": []
959 |     }
960 |   ]
961 | }


--------------------------------------------------------------------------------