├── .vscode └── settings.json ├── LICENSE ├── README.md ├── .gitignore └── src ├── bert-movie-reviews-sentiment-classifier-local.ipynb ├── bert_sentiment_classifier.py └── bert_sentiment_classifier.ipynb /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/local/bin/python" 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Loreto Parisi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bert-movie-reviews-sentiment-classifier 2 | Build a Movie Reviews Sentiment Classifier with Google's [BERT](https://github.com/google-research/bert) Language Model 3 | 4 | # What's this? 5 | This is an example of building a Movie Reviews Sentiment classifier with Google's BERT (Bidirectional Encoder Representations from Transformers) NLP Language Model. 6 | 7 | # Requirements 8 | This code requires `scikit-learn`, `tensorflow-gpu`, `tensorflow-hub`, `bert-tensorflow`. The code is compatible with TF <= 1.15.0 and the latest available BERT model on Tensorflow Hub. To use the cpu version please install `tensorflow==1.15.0`. 9 | 10 | ```bash 11 | pip install scikit-learn 12 | pip install tensorflow-gpu==1.15.0 13 | pip install tensorflow-hub 14 | pip install bert-tensorflow 15 | ``` 16 | 17 | # How to Run 18 | To run this project you can 19 | - Open the IPython Notebook `src/bert-movie-reviews-sentiment-classifier-local.ipynb` in your Jupyter Notebook or 20 | 21 | - Import `src/bert_sentiment_classifier.ipynb` into [Google's Colab](https://colab.research.google.com) with GPU backend. 22 | 23 | - Open the Python Interactive script `src/bert_sentiment_classifier.py` in Visual Studio Code. See [here](https://code.visualstudio.com/docs/python/jupyter-support) how it works with Jupyter Notebooks and Code. 
24 | 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- 
/src/bert-movie-reviews-sentiment-classifier-local.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip uninstall tensorflow-gpu\n", 10 | "!pip install scikit-learn\n", 11 | "!pip install tensorflow-gpu==1.15.0\n", 12 | "!pip install tensorflow-hub\n", 13 | "!pip install bert-tensorflow" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from sklearn.model_selection import train_test_split\n", 23 | "import pandas as pd\n", 24 | "import tensorflow as tf\n", 25 | "import tensorflow_hub as hub\n", 26 | "from datetime import datetime" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import bert\n", 36 | "from bert import run_classifier\n", 37 | "from bert import optimization\n", 38 | "from bert import tokenization" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from tensorflow import keras\n", 48 | "import os\n", 49 | "import re\n", 50 | "\n", 51 | "# Load all files from a directory in a DataFrame.\n", 52 | "def load_directory_data(directory):\n", 53 | " data = {}\n", 54 | " data[\"sentence\"] = []\n", 55 | " data[\"sentiment\"] = []\n", 56 | " for file_path in os.listdir(directory):\n", 57 | " with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n", 58 | " data[\"sentence\"].append(f.read())\n", 59 | " data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n", 60 | " return pd.DataFrame.from_dict(data)\n", 61 | "\n", 62 | "# Merge positive and negative examples, add a polarity column and shuffle.\n", 63 | "def load_dataset(directory):\n", 64 | " pos_df = 
load_directory_data(os.path.join(directory, \"pos\"))\n", 65 | " neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n", 66 | " pos_df[\"polarity\"] = 1\n", 67 | " neg_df[\"polarity\"] = 0\n", 68 | " return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n", 69 | "\n", 70 | "# Download and process the dataset files.\n", 71 | "def download_and_load_datasets(force_download=False):\n", 72 | " dataset = tf.keras.utils.get_file(\n", 73 | " fname=\"aclImdb.tar.gz\", \n", 74 | " origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n", 75 | " extract=True)\n", 76 | " \n", 77 | " train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", 78 | " \"aclImdb\", \"train\"))\n", 79 | " test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", 80 | " \"aclImdb\", \"test\"))\n", 81 | " \n", 82 | " return train_df, test_df" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "train, test = download_and_load_datasets()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "train = train.sample(5000)\n", 101 | "test = test.sample(5000)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "train.columns" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "DATA_COLUMN = 'sentence'\n", 120 | "LABEL_COLUMN = 'polarity'\n", 121 | "# label_list is the list of labels, i.e. 
True, False or 0, 1 or 'dog', 'cat'\n", 122 | "label_list = [0, 1]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n", 132 | "train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n", 133 | " text_a = x[DATA_COLUMN], \n", 134 | " text_b = None, \n", 135 | " label = x[LABEL_COLUMN]), axis = 1)\n", 136 | "\n", 137 | "test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n", 138 | " text_a = x[DATA_COLUMN], \n", 139 | " text_b = None, \n", 140 | " label = x[LABEL_COLUMN]), axis = 1)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "\n", 150 | "# This is a path to an uncased (all lowercase) version of BERT\n", 151 | "BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n", 152 | "\n", 153 | "def create_tokenizer_from_hub_module():\n", 154 | " \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n", 155 | " with tf.Graph().as_default():\n", 156 | " bert_module = hub.Module(BERT_MODEL_HUB)\n", 157 | " tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n", 158 | " with tf.Session() as sess:\n", 159 | " vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n", 160 | " tokenization_info[\"do_lower_case\"]])\n", 161 | " \n", 162 | " return bert.tokenization.FullTokenizer(\n", 163 | " vocab_file=vocab_file, do_lower_case=do_lower_case)\n", 164 | "\n", 165 | "tokenizer = create_tokenizer_from_hub_module()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | 
"tokenizer.tokenize(\"This here's an example of using the BERT tokenizer\")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# We'll set sequences to be at most 128 tokens long.\n", 184 | "MAX_SEQ_LENGTH = 128\n", 185 | "# Convert our train and test features to InputFeatures that BERT understands.\n", 186 | "train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", 187 | "test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,\n", 197 | " num_labels):\n", 198 | " \"\"\"Creates a classification model.\"\"\"\n", 199 | "\n", 200 | " bert_module = hub.Module(\n", 201 | " BERT_MODEL_HUB,\n", 202 | " trainable=True)\n", 203 | " bert_inputs = dict(\n", 204 | " input_ids=input_ids,\n", 205 | " input_mask=input_mask,\n", 206 | " segment_ids=segment_ids)\n", 207 | " bert_outputs = bert_module(\n", 208 | " inputs=bert_inputs,\n", 209 | " signature=\"tokens\",\n", 210 | " as_dict=True)\n", 211 | "\n", 212 | " # Use \"pooled_output\" for classification tasks on an entire sentence.\n", 213 | " # Use \"sequence_outputs\" for token-level output.\n", 214 | " output_layer = bert_outputs[\"pooled_output\"]\n", 215 | "\n", 216 | " hidden_size = output_layer.shape[-1].value\n", 217 | "\n", 218 | " # Create our own layer to tune for politeness data.\n", 219 | " output_weights = tf.get_variable(\n", 220 | " \"output_weights\", [num_labels, hidden_size],\n", 221 | " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", 222 | "\n", 223 | " output_bias = tf.get_variable(\n", 224 | " \"output_bias\", [num_labels], 
initializer=tf.zeros_initializer())\n", 225 | "\n", 226 | " with tf.variable_scope(\"loss\"):\n", 227 | "\n", 228 | " # Dropout helps prevent overfitting\n", 229 | " output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n", 230 | "\n", 231 | " logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n", 232 | " logits = tf.nn.bias_add(logits, output_bias)\n", 233 | " log_probs = tf.nn.log_softmax(logits, axis=-1)\n", 234 | "\n", 235 | " # Convert labels into one-hot encoding\n", 236 | " one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n", 237 | "\n", 238 | " predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n", 239 | " # If we're predicting, we want predicted labels and the probabiltiies.\n", 240 | " if is_predicting:\n", 241 | " return (predicted_labels, log_probs)\n", 242 | "\n", 243 | " # If we're train/eval, compute loss between predicted and actual label\n", 244 | " per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n", 245 | " loss = tf.reduce_mean(per_example_loss)\n", 246 | " return (loss, predicted_labels, log_probs)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | " # model_fn_builder actually creates our model function\n", 256 | "# using the passed parameters for num_labels, learning_rate, etc.\n", 257 | "def model_fn_builder(num_labels, learning_rate, num_train_steps,\n", 258 | " num_warmup_steps):\n", 259 | " \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n", 260 | " def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n", 261 | " \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n", 262 | "\n", 263 | " input_ids = features[\"input_ids\"]\n", 264 | " input_mask = features[\"input_mask\"]\n", 265 | " segment_ids = features[\"segment_ids\"]\n", 266 | " label_ids = features[\"label_ids\"]\n", 267 | "\n", 268 | " is_predicting = (mode 
== tf.estimator.ModeKeys.PREDICT)\n", 269 | " \n", 270 | " # TRAIN and EVAL\n", 271 | " if not is_predicting:\n", 272 | "\n", 273 | " (loss, predicted_labels, log_probs) = create_model(\n", 274 | " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", 275 | "\n", 276 | " train_op = bert.optimization.create_optimizer(\n", 277 | " loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n", 278 | "\n", 279 | " # Calculate evaluation metrics. \n", 280 | " def metric_fn(label_ids, predicted_labels):\n", 281 | " accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n", 282 | " f1_score = tf.contrib.metrics.f1_score(\n", 283 | " label_ids,\n", 284 | " predicted_labels)\n", 285 | " auc = tf.metrics.auc(\n", 286 | " label_ids,\n", 287 | " predicted_labels)\n", 288 | " recall = tf.metrics.recall(\n", 289 | " label_ids,\n", 290 | " predicted_labels)\n", 291 | " precision = tf.metrics.precision(\n", 292 | " label_ids,\n", 293 | " predicted_labels) \n", 294 | " true_pos = tf.metrics.true_positives(\n", 295 | " label_ids,\n", 296 | " predicted_labels)\n", 297 | " true_neg = tf.metrics.true_negatives(\n", 298 | " label_ids,\n", 299 | " predicted_labels) \n", 300 | " false_pos = tf.metrics.false_positives(\n", 301 | " label_ids,\n", 302 | " predicted_labels) \n", 303 | " false_neg = tf.metrics.false_negatives(\n", 304 | " label_ids,\n", 305 | " predicted_labels)\n", 306 | " return {\n", 307 | " \"eval_accuracy\": accuracy,\n", 308 | " \"f1_score\": f1_score,\n", 309 | " \"auc\": auc,\n", 310 | " \"precision\": precision,\n", 311 | " \"recall\": recall,\n", 312 | " \"true_positives\": true_pos,\n", 313 | " \"true_negatives\": true_neg,\n", 314 | " \"false_positives\": false_pos,\n", 315 | " \"false_negatives\": false_neg\n", 316 | " }\n", 317 | "\n", 318 | " eval_metrics = metric_fn(label_ids, predicted_labels)\n", 319 | "\n", 320 | " if mode == tf.estimator.ModeKeys.TRAIN:\n", 321 | " return tf.estimator.EstimatorSpec(mode=mode,\n", 322 
| " loss=loss,\n", 323 | " train_op=train_op)\n", 324 | " else:\n", 325 | " return tf.estimator.EstimatorSpec(mode=mode,\n", 326 | " loss=loss,\n", 327 | " eval_metric_ops=eval_metrics)\n", 328 | " else:\n", 329 | " (predicted_labels, log_probs) = create_model(\n", 330 | " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", 331 | "\n", 332 | " predictions = {\n", 333 | " 'probabilities': log_probs,\n", 334 | " 'labels': predicted_labels\n", 335 | " }\n", 336 | " return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n", 337 | "\n", 338 | " # Return the actual model function in the closure\n", 339 | " return model_fn" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# Compute train and warmup steps from batch size\n", 349 | "# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)\n", 350 | "BATCH_SIZE = 32\n", 351 | "LEARNING_RATE = 2e-5\n", 352 | "NUM_TRAIN_EPOCHS = 3.0\n", 353 | "# Warmup is a period of time where hte learning rate \n", 354 | "# is small and gradually increases--usually helps training.\n", 355 | "WARMUP_PROPORTION = 0.1\n", 356 | "# Model configs\n", 357 | "SAVE_CHECKPOINTS_STEPS = 500\n", 358 | "SAVE_SUMMARY_STEPS = 100" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# Compute # train and warmup steps from batch size\n", 368 | "num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n", 369 | "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "OUTPUT_DIR=\"./\"" 379 | ] 380 | }, 381 | { 382 | "cell_type": 
"code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "# Specify outpit directory and number of checkpoint steps to save\n", 388 | "run_config = tf.estimator.RunConfig(\n", 389 | " model_dir=OUTPUT_DIR,\n", 390 | " save_summary_steps=SAVE_SUMMARY_STEPS,\n", 391 | " save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "model_fn = model_fn_builder(\n", 401 | " num_labels=len(label_list),\n", 402 | " learning_rate=LEARNING_RATE,\n", 403 | " num_train_steps=num_train_steps,\n", 404 | " num_warmup_steps=num_warmup_steps)\n", 405 | "\n", 406 | "estimator = tf.estimator.Estimator(\n", 407 | " model_fn=model_fn,\n", 408 | " config=run_config,\n", 409 | " params={\"batch_size\": BATCH_SIZE})" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# Create an input function for training. 
drop_remainder = True for using TPUs.\n", 419 | "train_input_fn = bert.run_classifier.input_fn_builder(\n", 420 | " features=train_features,\n", 421 | " seq_length=MAX_SEQ_LENGTH,\n", 422 | " is_training=True,\n", 423 | " drop_remainder=False)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "print(f'Beginning Training!')\n", 433 | "current_time = datetime.now()\n", 434 | "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n", 435 | "print(\"Training took time \", datetime.now() - current_time)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "test_input_fn = run_classifier.input_fn_builder(\n", 445 | " features=test_features,\n", 446 | " seq_length=MAX_SEQ_LENGTH,\n", 447 | " is_training=False,\n", 448 | " drop_remainder=False)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "estimator.evaluate(input_fn=test_input_fn, steps=None)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "def getPrediction(in_sentences):\n", 467 | " labels = [\"Negative\", \"Positive\"]\n", 468 | " input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, \"\" is just a dummy label\n", 469 | " input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", 470 | " predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n", 471 | " predictions = estimator.predict(predict_input_fn)\n", 472 | " return [(sentence, prediction['probabilities'], 
labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "def getSinglePrediction(in_sentences):\n", 482 | " labels = [\"Negative\", \"Positive\"]\n", 483 | " input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, \"\" is just a dummy label\n", 484 | " input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", 485 | " predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n", 486 | " predictions = estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)\n", 487 | " return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "pred_sentences = [\n", 497 | " \"That movie was absolutely awful\",\n", 498 | " \"The acting was a bit lacking\",\n", 499 | " \"The film was creative and surprising\",\n", 500 | " \"Absolutely fantastic!\"\n", 501 | "]" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "predictions = getPrediction(pred_sentences)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "predictions" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "pred_sentences = [\n", 529 | " \"I love to eat sea food\"\n", 530 | "]" 531 | ] 532 | 
}, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "prediction = getSinglePrediction(pred_sentences)" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "prediction" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [] 557 | } 558 | ], 559 | "metadata": { 560 | "kernelspec": { 561 | "display_name": "Python 3", 562 | "language": "python", 563 | "name": "python3" 564 | }, 565 | "language_info": { 566 | "codemirror_mode": { 567 | "name": "ipython", 568 | "version": 3 569 | }, 570 | "file_extension": ".py", 571 | "mimetype": "text/x-python", 572 | "name": "python", 573 | "nbconvert_exporter": "python", 574 | "pygments_lexer": "ipython3", 575 | "version": "3.7.4" 576 | } 577 | }, 578 | "nbformat": 4, 579 | "nbformat_minor": 2 580 | } 581 | -------------------------------------------------------------------------------- /src/bert_sentiment_classifier.py: -------------------------------------------------------------------------------- 1 | #%% Change working directory from the workspace root to the ipynb file location. Turn this addition off with the DataScience.changeDirOnImportExport setting 2 | import os 3 | try: 4 | os.chdir(os.path.join(os.getcwd(), 'src')) 5 | print(os.getcwd()) 6 | except: 7 | pass 8 | 9 | #%% 10 | # Copyright 2019 Google Inc. 11 | 12 | # Licensed under the Apache License, Version 2.0 (the "License"); 13 | # you may not use this file except in compliance with the License. 
14 | # You may obtain a copy of the License at 15 | 16 | # http://www.apache.org/licenses/LICENSE-2.0 17 | 18 | # Unless required by applicable law or agreed to in writing, software 19 | # distributed under the License is distributed on an "AS IS" BASIS, 20 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 21 | # See the License for the specific language governing permissions and 22 | # limitations under the License. 23 | 24 | #%% [markdown] 25 | # #Predicting Movie Review Sentiment with BERT on TF Hub 26 | #%% [markdown] 27 | # If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering. 28 | # 29 | # Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMO and GloVE. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases. 30 | # 31 | # Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started! 32 | 33 | #%% 34 | from sklearn.model_selection import train_test_split 35 | import pandas as pd 36 | import tensorflow as tf 37 | import tensorflow_hub as hub 38 | from datetime import datetime 39 | 40 | #%% [markdown] 41 | # In addition to the standard libraries we imported above, we'll need to install BERT's python package. 
#%%
get_ipython().system(u'pip install bert-tensorflow')


#%%
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

#%% [markdown]
# Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.
#
# Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.
#
# Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

#%%
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

OUTPUT_DIR = 'bert'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = False #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'my-cloud-bucket-ml' #@param {type:"string"}

if USE_BUCKET:
    # Redirect the output to gs://<BUCKET>/<OUTPUT_DIR> and authenticate the user.
    # NOTE(review): google.colab is presumably only importable inside Colab;
    # set USE_BUCKET to False when running this script elsewhere -- confirm.
    OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
    from google.colab import auth
    auth.authenticate_user()

if DO_DELETE:
    try:
        tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except tf.errors.NotFoundError:
        # Doesn't matter if the directory didn't exist. Any other failure
        # (and Ctrl-C) is deliberately no longer swallowed here.
        pass
# Runs whether or not DO_DELETE is set.
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

#%% [markdown]
# #Data
#%% [markdown]
# First, let's download the dataset, hosted by Stanford.
# The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub).

#%%
from tensorflow import keras
import os
import re


# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Read the review files found in `directory` into a DataFrame.

    Only files named `<digits>_<digits>.txt` are loaded. Any other entry
    (for example a stray hidden file) is skipped; previously it crashed with
    an AttributeError on the failed regex match.

    Args:
        directory: Path of the directory to scan (listed with `os.listdir`).

    Returns:
        A `pandas.DataFrame` with two columns: `sentence`, the full text of
        each file, and `sentiment`, the second number of the file name kept
        as a string (presumably the review's rating -- confirm against the
        dataset README).
    """
    data = {"sentence": [], "sentiment": []}
    for file_name in os.listdir(directory):
        # Raw string: "\d" in a normal string literal is an invalid escape.
        match = re.match(r"\d+_(\d+)\.txt", file_name)
        if match is None:
            continue
        with tf.gfile.GFile(os.path.join(directory, file_name), "r") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(match.group(1))
    return pd.DataFrame.from_dict(data)


# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    """Load the `pos` and `neg` subdirectories of `directory` as one DataFrame.

    Rows coming from `pos` get `polarity` 1 and rows from `neg` get 0. The
    combined frame is shuffled (`sample(frac=1)`) and re-indexed from 0.
    """
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)


# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the IMDB archive with Keras and load its train and test splits.

    Args:
        force_download: Accepted for interface compatibility only; it is not
            used, so the copy returned by `tf.keras.utils.get_file` is
            always the one that gets loaded.

    Returns:
        A `(train_df, test_df)` tuple of DataFrames built by `load_dataset`.
    """
    dataset = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True)

    # The archive's `aclImdb` folder is looked up next to the downloaded file.
    dataset_root = os.path.join(os.path.dirname(dataset), "aclImdb")
    train_df = load_dataset(os.path.join(dataset_root, "train"))
    test_df = load_dataset(os.path.join(dataset_root, "test"))

    return train_df, test_df


#%%
train, test = download_and_load_datasets()

#%% [markdown]
# To keep training fast, we'll take a sample of 5000 train and test examples, respectively.
134 | 135 | #%% 136 | train = train.sample(5000) 137 | test = test.sample(5000) 138 | 139 | 140 | #%% 141 | train.columns 142 | 143 | #%% [markdown] 144 | # For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respectively) 145 | 146 | #%% 147 | DATA_COLUMN = 'sentence' 148 | LABEL_COLUMN = 'polarity' 149 | # label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat' 150 | label_list = [0, 1] 151 | 152 | #%% [markdown] 153 | # #Data Preprocessing 154 | # We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample`'s using the constructor provided in the BERT library. 155 | # 156 | # - `text_a` is the text we want to classify, which in this case, is the `sentence` column in our DataFrame. 157 | # - `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank. 158 | # - `label` is the label for our example, i.e. 0 (negative) or 1 (positive) 159 | 160 | #%% 161 | # Use the InputExample class from BERT's run_classifier code to create examples from the data 162 | train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example 163 | text_a = x[DATA_COLUMN], 164 | text_b = None, 165 | label = x[LABEL_COLUMN]), axis = 1) 166 | 167 | test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 168 | text_a = x[DATA_COLUMN], 169 | text_b = None, 170 | label = x[LABEL_COLUMN]), axis = 1) 171 | 172 | #%% [markdown] 173 | # Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library): 174 | # 175 | # 176 | # 1. 
# Lowercase our text (if we're using a BERT lowercase model)
# 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
# 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
# 4. Map our words to indexes using a vocab file that BERT provides
# 5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
# 6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))
#
# Happily, we don't have to worry about most of these details.
#
#
#
#%% [markdown]
# To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:

#%%
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"


def create_tokenizer_from_hub_module():
    """Build a `FullTokenizer` configured from the Hub module.

    The module's "tokenization_info" signature is evaluated in a throwaway
    graph and session to obtain the vocab file and the lower-casing flag,
    which are then handed to `bert.tokenization.FullTokenizer`.
    """
    with tf.Graph().as_default():
        hub_module = hub.Module(BERT_MODEL_HUB)
        info = hub_module(signature="tokenization_info", as_dict=True)
        fetches = [info["vocab_file"], info["do_lower_case"]]
        with tf.Session() as session:
            vocab_path, lower_case = session.run(fetches)

    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=lower_case)


tokenizer = create_tokenizer_from_hub_module()

#%% [markdown]
# Great--we just learned that the BERT model we're using expects lowercase data (that's what stored in tokenization_info["do_lower_case"]) and we also loaded BERT's vocab file. 
# We also created a tokenizer, which breaks words into word pieces:

#%%
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

#%% [markdown]
# Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

#%%
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and test features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

#%% [markdown]
# #Creating a model
#
# Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning).

#%%
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Creates a classification model on top of the BERT Hub module.

    Args:
        is_predicting: When true, no loss is built, dropout is skipped and
            only `(predicted_labels, log_probs)` is returned.
        input_ids, input_mask, segment_ids: Tensors passed to the module's
            "tokens" signature under those same names.
        labels: Label ids, one-hot encoded to depth `num_labels` for the loss.
        num_labels: Number of output classes.

    Returns:
        `(predicted_labels, log_probs)` when `is_predicting`, otherwise
        `(loss, predicted_labels, log_probs)`. `log_probs` is the
        log-softmax of the logits; `loss` is the mean over examples of the
        negative log-probability of the true label.
    """

    bert_module = hub.Module(
        BERT_MODEL_HUB,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    # Create our own classification layer to fine-tune on the sentiment data.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):

        # Dropout helps prevent overfitting, but it must not run at
        # prediction time: it would randomly perturb the returned scores.
        if not is_predicting:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # NOTE(review): tf.squeeze drops size-1 dimensions, so a batch of
        # exactly one example presumably yields a scalar here; left as-is
        # because callers in this file index the result directly.
        predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
        # If we're predicting, we want predicted labels and the log-probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)

        # If we're train/eval, compute loss between predicted and actual label
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)

#%% [markdown]
# Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction.

#%%
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Returns a `model_fn` closure for an Estimator.

    Args:
        num_labels: Number of classes, forwarded to `create_model`.
        learning_rate: Forwarded to `bert.optimization.create_optimizer`.
        num_train_steps: Total training steps, forwarded to the optimizer.
        num_warmup_steps: Warmup steps, forwarded to the optimizer.

    Returns:
        A function `model_fn(features, labels, mode, params)` that returns a
        `tf.estimator.EstimatorSpec` for TRAIN, EVAL or PREDICT mode.
    """
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Builds the EstimatorSpec for `mode` from the `features` dict.

        `features` must provide "input_ids", "input_mask", "segment_ids"
        and "label_ids"; the `labels` and `params` arguments are not read.
        """

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

        # PREDICT: no loss, just the model outputs.
        if is_predicting:
            (predicted_labels, log_probs) = create_model(
                is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

            predictions = {
                # The key says 'probabilities' but the values are the
                # log-softmax outputs returned by create_model.
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # TRAIN and EVAL share the loss graph.
        (loss, predicted_labels, log_probs) = create_model(
            is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # Build the optimizer only where it is used; it was previously
            # also constructed (and discarded) in EVAL mode.
            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        # Calculate evaluation metrics (EVAL only).
        def metric_fn(label_ids, predicted_labels):
            accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
            f1_score = tf.contrib.metrics.f1_score(
                label_ids,
                predicted_labels)
            # NOTE(review): auc receives hard predicted labels rather than
            # scores -- confirm this is the intended metric.
            auc = tf.metrics.auc(
                label_ids,
                predicted_labels)
            recall = tf.metrics.recall(
                label_ids,
                predicted_labels)
            precision = tf.metrics.precision(
                label_ids,
                predicted_labels)
            true_pos = tf.metrics.true_positives(
                label_ids,
                predicted_labels)
            true_neg = tf.metrics.true_negatives(
                label_ids,
                predicted_labels)
            false_pos = tf.metrics.false_positives(
                label_ids,
                predicted_labels)
            false_neg = tf.metrics.false_negatives(
                label_ids,
                predicted_labels)
            return {
                "eval_accuracy": accuracy,
                "f1_score": f1_score,
                "auc": auc,
                "precision": precision,
                "recall": recall,
                "true_positives": true_pos,
                "true_negatives": true_neg,
                "false_positives": false_pos,
                "false_negatives": false_neg
            }

        eval_metrics = metric_fn(label_ids, predicted_labels)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics)

    # Return the actual model function in the closure
    return model_fn


#%%
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100


#%%
# Derive the number of training and warmup steps from the batch size
_batches_per_epoch = len(train_features) / BATCH_SIZE
num_train_steps = int(_batches_per_epoch * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)


#%%
# Specify output directory and how often summaries and checkpoints are saved
_run_config_kwargs = dict(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
run_config = tf.estimator.RunConfig(**_run_config_kwargs)


#%%
model_fn = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={"batch_size": BATCH_SIZE})

#%% [markdown]
# Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators).

#%%
# Input function for training; drop_remainder would be True when using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

#%% [markdown]
# Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes.
426 | 427 | #%% 428 | print(f'Beginning Training!') 429 | current_time = datetime.now() 430 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 431 | print("Training took time ", datetime.now() - current_time) 432 | 433 | #%% [markdown] 434 | # Now let's use our test data to see how well our model did: 435 | 436 | #%% 437 | test_input_fn = run_classifier.input_fn_builder( 438 | features=test_features, 439 | seq_length=MAX_SEQ_LENGTH, 440 | is_training=False, 441 | drop_remainder=False) 442 | 443 | 444 | #%% 445 | estimator.evaluate(input_fn=test_input_fn, steps=None) 446 | 447 | #%% [markdown] 448 | # Now let's write code to make predictions on new sentences: 449 | 450 | #%% 451 | def getPrediction(in_sentences): 452 | labels = ["Negative", "Positive"] 453 | input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label 454 | input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer) 455 | predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False) 456 | predictions = estimator.predict(predict_input_fn) 457 | return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)] 458 | 459 | 460 | #%% 461 | def getSinglePrediction(in_sentences): 462 | labels = ["Negative", "Positive"] 463 | input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label 464 | input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer) 465 | predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False) 466 | predictions = 
estimator.predict(input_fn=predict_input_fn, yield_single_examples=False) 467 | return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)] 468 | 469 | 470 | #%% 471 | pred_sentences = [ 472 | "That movie was absolutely awful", 473 | "The acting was a bit lacking", 474 | "The film was creative and surprising", 475 | "Absolutely fantastic!" 476 | ] 477 | 478 | 479 | #%% 480 | predictions = getPrediction(pred_sentences) 481 | 482 | #%% [markdown] 483 | # Voila! We have a sentiment classifier! 484 | 485 | #%% 486 | predictions 487 | 488 | 489 | #%% 490 | pred_sentences = [ 491 | "I love to eat sea food" 492 | ] 493 | 494 | 495 | #%% 496 | prediction = getSinglePrediction(pred_sentences) 497 | 498 | 499 | #%% 500 | prediction 501 | 502 | 503 | -------------------------------------------------------------------------------- /src/bert_sentiment_classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Predicting Movie Reviews with BERT on TF Hub.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "metadata": { 20 | "id": "j0a4mTk9o1Qg", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "cell_type": "code", 25 | "source": [ 26 | "# Copyright 2019 Google Inc.\n", 27 | "\n", 28 | "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", 29 | "# you may not use this file except in compliance with the License.\n", 30 | "# You may obtain a copy of the License at\n", 31 | "\n", 32 | "# http://www.apache.org/licenses/LICENSE-2.0\n", 33 | "\n", 34 | "# Unless required by applicable law or agreed to in writing, software\n", 35 | "# distributed under the License is 
distributed on an \"AS IS\" BASIS,\n", 36 | "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 37 | "# See the License for the specific language governing permissions and\n", 38 | "# limitations under the License." 39 | ], 40 | "execution_count": 0, 41 | "outputs": [] 42 | }, 43 | { 44 | "metadata": { 45 | "id": "dCpvgG0vwXAZ", 46 | "colab_type": "text" 47 | }, 48 | "cell_type": "markdown", 49 | "source": [ 50 | "#Predicting Movie Review Sentiment with BERT on TF Hub" 51 | ] 52 | }, 53 | { 54 | "metadata": { 55 | "id": "xiYrZKaHwV81", 56 | "colab_type": "text" 57 | }, 58 | "cell_type": "markdown", 59 | "source": [ 60 | "If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.\n", 61 | "\n", 62 | "Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMO and GloVE. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.\n", 63 | "\n", 64 | "Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!" 
65 | ] 66 | }, 67 | { 68 | "metadata": { 69 | "id": "hsZvic2YxnTz", 70 | "colab_type": "code", 71 | "colab": {} 72 | }, 73 | "cell_type": "code", 74 | "source": [ 75 | "from sklearn.model_selection import train_test_split\n", 76 | "import pandas as pd\n", 77 | "import tensorflow as tf\n", 78 | "import tensorflow_hub as hub\n", 79 | "from datetime import datetime" 80 | ], 81 | "execution_count": 0, 82 | "outputs": [] 83 | }, 84 | { 85 | "metadata": { 86 | "id": "cp5wfXDx5SPH", 87 | "colab_type": "text" 88 | }, 89 | "cell_type": "markdown", 90 | "source": [ 91 | "In addition to the standard libraries we imported above, we'll need to install BERT's python package." 92 | ] 93 | }, 94 | { 95 | "metadata": { 96 | "id": "jviywGyWyKsA", 97 | "colab_type": "code", 98 | "colab": {} 99 | }, 100 | "cell_type": "code", 101 | "source": [ 102 | "!pip install bert-tensorflow" 103 | ], 104 | "execution_count": 0, 105 | "outputs": [] 106 | }, 107 | { 108 | "metadata": { 109 | "id": "hhbGEfwgdEtw", 110 | "colab_type": "code", 111 | "colab": {} 112 | }, 113 | "cell_type": "code", 114 | "source": [ 115 | "import bert\n", 116 | "from bert import run_classifier\n", 117 | "from bert import optimization\n", 118 | "from bert import tokenization" 119 | ], 120 | "execution_count": 0, 121 | "outputs": [] 122 | }, 123 | { 124 | "metadata": { 125 | "id": "KVB3eOcjxxm1", 126 | "colab_type": "text" 127 | }, 128 | "cell_type": "markdown", 129 | "source": [ 130 | "Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.\n", 131 | "\n", 132 | "Alternatively, if you're a GCP user, you can store output in a GCP bucket. 
To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.\n", 133 | "\n", 134 | "Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist)." 135 | ] 136 | }, 137 | { 138 | "metadata": { 139 | "id": "US_EAnICvP7f", 140 | "colab_type": "code", 141 | "cellView": "form", 142 | "colab": {} 143 | }, 144 | "cell_type": "code", 145 | "source": [ 146 | "# Set the output directory for saving model file\n", 147 | "# Optionally, set a GCP bucket location\n", 148 | "\n", 149 | "OUTPUT_DIR = 'bert'#@param {type:\"string\"}\n", 150 | "#@markdown Whether or not to clear/delete the directory and create a new one\n", 151 | "DO_DELETE = False #@param {type:\"boolean\"}\n", 152 | "#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.\n", 153 | "USE_BUCKET = True #@param {type:\"boolean\"}\n", 154 | "BUCKET = 'my-cloud-bucket-ml' #@param {type:\"string\"}\n", 155 | "\n", 156 | "if USE_BUCKET:\n", 157 | " OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)\n", 158 | " from google.colab import auth\n", 159 | " auth.authenticate_user()\n", 160 | "\n", 161 | "if DO_DELETE:\n", 162 | " try:\n", 163 | " tf.gfile.DeleteRecursively(OUTPUT_DIR)\n", 164 | " except:\n", 165 | " # Doesn't matter if the directory didn't exist\n", 166 | " pass\n", 167 | "tf.gfile.MakeDirs(OUTPUT_DIR)\n", 168 | "print('***** Model output directory: {} *****'.format(OUTPUT_DIR))\n" 169 | ], 170 | "execution_count": 0, 171 | "outputs": [] 172 | }, 173 | { 174 | "metadata": { 175 | "id": "pmFYvkylMwXn", 176 | "colab_type": "text" 177 | }, 178 | "cell_type": "markdown", 179 | "source": [ 180 | "#Data" 181 | ] 182 | }, 183 | { 184 | "metadata": { 185 | "id": "MC_w8SRqN0fr", 186 | "colab_type": "text" 187 | }, 188 | "cell_type": "markdown", 189 | "source": [ 190 | "First, let's download the dataset, hosted by Stanford. 
The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub)." 191 | ] 192 | }, 193 | { 194 | "metadata": { 195 | "id": "fom_ff20gyy6", 196 | "colab_type": "code", 197 | "colab": {} 198 | }, 199 | "cell_type": "code", 200 | "source": [ 201 | "from tensorflow import keras\n", 202 | "import os\n", 203 | "import re\n", 204 | "\n", 205 | "# Load all files from a directory in a DataFrame.\n", 206 | "def load_directory_data(directory):\n", 207 | " data = {}\n", 208 | " data[\"sentence\"] = []\n", 209 | " data[\"sentiment\"] = []\n", 210 | " for file_path in os.listdir(directory):\n", 211 | " with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n", 212 | " data[\"sentence\"].append(f.read())\n", 213 | " data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n", 214 | " return pd.DataFrame.from_dict(data)\n", 215 | "\n", 216 | "# Merge positive and negative examples, add a polarity column and shuffle.\n", 217 | "def load_dataset(directory):\n", 218 | " pos_df = load_directory_data(os.path.join(directory, \"pos\"))\n", 219 | " neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n", 220 | " pos_df[\"polarity\"] = 1\n", 221 | " neg_df[\"polarity\"] = 0\n", 222 | " return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n", 223 | "\n", 224 | "# Download and process the dataset files.\n", 225 | "def download_and_load_datasets(force_download=False):\n", 226 | " dataset = tf.keras.utils.get_file(\n", 227 | " fname=\"aclImdb.tar.gz\", \n", 228 | " origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n", 229 | " extract=True)\n", 230 | " \n", 231 | " train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", 232 | " \"aclImdb\", \"train\"))\n", 233 | " test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", 234 | " 
\"aclImdb\", \"test\"))\n", 235 | " \n", 236 | " return train_df, test_df\n" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "metadata": { 243 | "id": "2abfwdn-g135", 244 | "colab_type": "code", 245 | "colab": {} 246 | }, 247 | "cell_type": "code", 248 | "source": [ 249 | "train, test = download_and_load_datasets()" 250 | ], 251 | "execution_count": 0, 252 | "outputs": [] 253 | }, 254 | { 255 | "metadata": { 256 | "id": "XA8WHJgzhIZf", 257 | "colab_type": "text" 258 | }, 259 | "cell_type": "markdown", 260 | "source": [ 261 | "To keep training fast, we'll take a sample of 5000 train and test examples, respectively." 262 | ] 263 | }, 264 | { 265 | "metadata": { 266 | "id": "lw_F488eixTV", 267 | "colab_type": "code", 268 | "colab": {} 269 | }, 270 | "cell_type": "code", 271 | "source": [ 272 | "train = train.sample(5000)\n", 273 | "test = test.sample(5000)" 274 | ], 275 | "execution_count": 0, 276 | "outputs": [] 277 | }, 278 | { 279 | "metadata": { 280 | "id": "prRQM8pDi8xI", 281 | "colab_type": "code", 282 | "colab": {} 283 | }, 284 | "cell_type": "code", 285 | "source": [ 286 | "train.columns" 287 | ], 288 | "execution_count": 0, 289 | "outputs": [] 290 | }, 291 | { 292 | "metadata": { 293 | "id": "sfRnHSz3iSXz", 294 | "colab_type": "text" 295 | }, 296 | "cell_type": "markdown", 297 | "source": [ 298 | "For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respecitvely)" 299 | ] 300 | }, 301 | { 302 | "metadata": { 303 | "id": "IuMOGwFui4it", 304 | "colab_type": "code", 305 | "colab": {} 306 | }, 307 | "cell_type": "code", 308 | "source": [ 309 | "DATA_COLUMN = 'sentence'\n", 310 | "LABEL_COLUMN = 'polarity'\n", 311 | "# label_list is the list of labels, i.e. 
True, False or 0, 1 or 'dog', 'cat'\n", 312 | "label_list = [0, 1]" 313 | ], 314 | "execution_count": 0, 315 | "outputs": [] 316 | }, 317 | { 318 | "metadata": { 319 | "id": "V399W0rqNJ-Z", 320 | "colab_type": "text" 321 | }, 322 | "cell_type": "markdown", 323 | "source": [ 324 | "#Data Preprocessing\n", 325 | "We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample`'s using the constructor provided in the BERT library.\n", 326 | "\n", 327 | "- `text_a` is the text we want to classify, which in this case is the `sentence` column of our DataFrame. \n", 328 | "- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.\n", 329 | "- `label` is the label for our example, here the `polarity` value: 0 (negative) or 1 (positive)" 330 | ] 331 | }, 332 | { 333 | "metadata": { 334 | "id": "p9gEt5SmM6i6", 335 | "colab_type": "code", 336 | "colab": {} 337 | }, 338 | "cell_type": "code", 339 | "source": [ 340 | "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n", 341 | "train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n", 342 | " text_a = x[DATA_COLUMN], \n", 343 | " text_b = None, \n", 344 | " label = x[LABEL_COLUMN]), axis = 1)\n", 345 | "\n", 346 | "test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n", 347 | " text_a = x[DATA_COLUMN], \n", 348 | " text_b = None, \n", 349 | " label = x[LABEL_COLUMN]), axis = 1)" 350 | ], 351 | "execution_count": 0, 352 | "outputs": [] 353 | }, 354 | { 355 | "metadata": { 356 | "id": "SCZWZtKxObjh", 357 | "colab_type": "text" 358 | }, 359 | "cell_type": "markdown", 360 | "source": [ 361 | "Next, we need to preprocess our data so that it 
matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):\n", 362 | "\n", 363 | "\n", 364 | "1. Lowercase our text (if we're using a BERT lowercase model)\n", 365 | "2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n", 366 | "3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n", 367 | "4. Map our words to indexes using a vocab file that BERT provides\n", 368 | "5. Add special \"CLS\" and \"SEP\" tokens (see the [readme](https://github.com/google-research/bert))\n", 369 | "6. Append \"index\" and \"segment\" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))\n", 370 | "\n", 371 | "Happily, we don't have to worry about most of these details.\n", 372 | "\n", 373 | "\n" 374 | ] 375 | }, 376 | { 377 | "metadata": { 378 | "id": "qMWiDtpyQSoU", 379 | "colab_type": "text" 380 | }, 381 | "cell_type": "markdown", 382 | "source": [ 383 | "To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:" 384 | ] 385 | }, 386 | { 387 | "metadata": { 388 | "id": "IhJSe0QHNG7U", 389 | "colab_type": "code", 390 | "colab": {} 391 | }, 392 | "cell_type": "code", 393 | "source": [ 394 | "# This is a path to an uncased (all lowercase) version of BERT\n", 395 | "BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n", 396 | "\n", 397 | "def create_tokenizer_from_hub_module():\n", 398 | " \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n", 399 | " with tf.Graph().as_default():\n", 400 | " bert_module = hub.Module(BERT_MODEL_HUB)\n", 401 | " tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n", 402 | " with tf.Session() as sess:\n", 403 | " vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n", 404 | " tokenization_info[\"do_lower_case\"]])\n", 405 | " \n", 406 | " return 
bert.tokenization.FullTokenizer(\n", 407 | " vocab_file=vocab_file, do_lower_case=do_lower_case)\n", 408 | "\n", 409 | "tokenizer = create_tokenizer_from_hub_module()" 410 | ], 411 | "execution_count": 0, 412 | "outputs": [] 413 | }, 414 | { 415 | "metadata": { 416 | "id": "z4oFkhpZBDKm", 417 | "colab_type": "text" 418 | }, 419 | "cell_type": "markdown", 420 | "source": [ 421 | "Great--we just learned that the BERT model we're using expects lowercase data (that's what stored in tokenization_info[\"do_lower_case\"]) and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:" 422 | ] 423 | }, 424 | { 425 | "metadata": { 426 | "id": "dsBo6RCtQmwx", 427 | "colab_type": "code", 428 | "colab": {} 429 | }, 430 | "cell_type": "code", 431 | "source": [ 432 | "tokenizer.tokenize(\"This here's an example of using the BERT tokenizer\")" 433 | ], 434 | "execution_count": 0, 435 | "outputs": [] 436 | }, 437 | { 438 | "metadata": { 439 | "id": "0OEzfFIt6GIc", 440 | "colab_type": "text" 441 | }, 442 | "cell_type": "markdown", 443 | "source": [ 444 | "Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands." 
445 | ] 446 | }, 447 | { 448 | "metadata": { 449 | "id": "LL5W8gEGRTAf", 450 | "colab_type": "code", 451 | "colab": {} 452 | }, 453 | "cell_type": "code", 454 | "source": [ 455 | "# We'll set sequences to be at most 128 tokens long.\n", 456 | "MAX_SEQ_LENGTH = 128\n", 457 | "# Convert our train and test features to InputFeatures that BERT understands.\n", 458 | "train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", 459 | "test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)" 460 | ], 461 | "execution_count": 0, 462 | "outputs": [] 463 | }, 464 | { 465 | "metadata": { 466 | "id": "ccp5trMwRtmr", 467 | "colab_type": "text" 468 | }, 469 | "cell_type": "markdown", 470 | "source": [ 471 | "#Creating a model\n", 472 | "\n", 473 | "Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning)." 
474 | ] 475 | }, 476 | { 477 | "metadata": { 478 | "id": "6o2a5ZIvRcJq", 479 | "colab_type": "code", 480 | "colab": {} 481 | }, 482 | "cell_type": "code", 483 | "source": [ 484 | "def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,\n", 485 | " num_labels):\n", 486 | " \"\"\"Creates a classification model.\"\"\"\n", 487 | "\n", 488 | " bert_module = hub.Module(\n", 489 | " BERT_MODEL_HUB,\n", 490 | " trainable=True)\n", 491 | " bert_inputs = dict(\n", 492 | " input_ids=input_ids,\n", 493 | " input_mask=input_mask,\n", 494 | " segment_ids=segment_ids)\n", 495 | " bert_outputs = bert_module(\n", 496 | " inputs=bert_inputs,\n", 497 | " signature=\"tokens\",\n", 498 | " as_dict=True)\n", 499 | "\n", 500 | " # Use \"pooled_output\" for classification tasks on an entire sentence.\n", 501 | " # Use \"sequence_outputs\" for token-level output.\n", 502 | " output_layer = bert_outputs[\"pooled_output\"]\n", 503 | "\n", 504 | " hidden_size = output_layer.shape[-1].value\n", 505 | "\n", 506 | " # Create our own layer to tune for politeness data.\n", 507 | " output_weights = tf.get_variable(\n", 508 | " \"output_weights\", [num_labels, hidden_size],\n", 509 | " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", 510 | "\n", 511 | " output_bias = tf.get_variable(\n", 512 | " \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n", 513 | "\n", 514 | " with tf.variable_scope(\"loss\"):\n", 515 | "\n", 516 | " # Dropout helps prevent overfitting\n", 517 | " output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n", 518 | "\n", 519 | " logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n", 520 | " logits = tf.nn.bias_add(logits, output_bias)\n", 521 | " log_probs = tf.nn.log_softmax(logits, axis=-1)\n", 522 | "\n", 523 | " # Convert labels into one-hot encoding\n", 524 | " one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n", 525 | "\n", 526 | " predicted_labels = tf.squeeze(tf.argmax(log_probs, 
axis=-1, output_type=tf.int32))\n", 527 | " # If we're predicting, we want predicted labels and the log probabilities.\n", 528 | " if is_predicting:\n", 529 | " return (predicted_labels, log_probs)\n", 530 | "\n", 531 | " # If we're training or evaluating, compute loss between predicted and actual label\n", 532 | " per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n", 533 | " loss = tf.reduce_mean(per_example_loss)\n", 534 | " return (loss, predicted_labels, log_probs)\n" 535 | ], 536 | "execution_count": 0, 537 | "outputs": [] 538 | }, 539 | { 540 | "metadata": { 541 | "id": "qpE0ZIDOCQzE", 542 | "colab_type": "text" 543 | }, 544 | "cell_type": "markdown", 545 | "source": [ 546 | "Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction." 547 | ] 548 | }, 549 | { 550 | "metadata": { 551 | "id": "FnH-AnOQ9KKW", 552 | "colab_type": "code", 553 | "colab": {} 554 | }, 555 | "cell_type": "code", 556 | "source": [ 557 | "# model_fn_builder actually creates our model function\n", 558 | "# using the passed parameters for num_labels, learning_rate, etc.\n", 559 | "def model_fn_builder(num_labels, learning_rate, num_train_steps,\n", 560 | " num_warmup_steps):\n", 561 | " \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n", 562 | " def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n", 563 | " \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n", 564 | "\n", 565 | " input_ids = features[\"input_ids\"]\n", 566 | " input_mask = features[\"input_mask\"]\n", 567 | " segment_ids = features[\"segment_ids\"]\n", 568 | " label_ids = features[\"label_ids\"]\n", 569 | "\n", 570 | " is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n", 571 | " \n", 572 | " # TRAIN and EVAL\n", 573 | " if not is_predicting:\n", 574 | "\n", 575 | " (loss, predicted_labels, log_probs) = create_model(\n", 576 | " is_predicting, input_ids, input_mask, segment_ids, label_ids,
num_labels)\n", 577 | "\n", 578 | " train_op = bert.optimization.create_optimizer(\n", 579 | " loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n", 580 | "\n", 581 | " # Calculate evaluation metrics. \n", 582 | " def metric_fn(label_ids, predicted_labels):\n", 583 | " accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n", 584 | " f1_score = tf.contrib.metrics.f1_score(\n", 585 | " label_ids,\n", 586 | " predicted_labels)\n", 587 | " auc = tf.metrics.auc(\n", 588 | " label_ids,\n", 589 | " predicted_labels)\n", 590 | " recall = tf.metrics.recall(\n", 591 | " label_ids,\n", 592 | " predicted_labels)\n", 593 | " precision = tf.metrics.precision(\n", 594 | " label_ids,\n", 595 | " predicted_labels) \n", 596 | " true_pos = tf.metrics.true_positives(\n", 597 | " label_ids,\n", 598 | " predicted_labels)\n", 599 | " true_neg = tf.metrics.true_negatives(\n", 600 | " label_ids,\n", 601 | " predicted_labels) \n", 602 | " false_pos = tf.metrics.false_positives(\n", 603 | " label_ids,\n", 604 | " predicted_labels) \n", 605 | " false_neg = tf.metrics.false_negatives(\n", 606 | " label_ids,\n", 607 | " predicted_labels)\n", 608 | " return {\n", 609 | " \"eval_accuracy\": accuracy,\n", 610 | " \"f1_score\": f1_score,\n", 611 | " \"auc\": auc,\n", 612 | " \"precision\": precision,\n", 613 | " \"recall\": recall,\n", 614 | " \"true_positives\": true_pos,\n", 615 | " \"true_negatives\": true_neg,\n", 616 | " \"false_positives\": false_pos,\n", 617 | " \"false_negatives\": false_neg\n", 618 | " }\n", 619 | "\n", 620 | " eval_metrics = metric_fn(label_ids, predicted_labels)\n", 621 | "\n", 622 | " if mode == tf.estimator.ModeKeys.TRAIN:\n", 623 | " return tf.estimator.EstimatorSpec(mode=mode,\n", 624 | " loss=loss,\n", 625 | " train_op=train_op)\n", 626 | " else:\n", 627 | " return tf.estimator.EstimatorSpec(mode=mode,\n", 628 | " loss=loss,\n", 629 | " eval_metric_ops=eval_metrics)\n", 630 | " else:\n", 631 | " (predicted_labels, log_probs) = 
create_model(\n", 632 | " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", 633 | "\n", 634 | " predictions = {\n", 635 | " 'probabilities': log_probs,\n", 636 | " 'labels': predicted_labels\n", 637 | " }\n", 638 | " return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n", 639 | "\n", 640 | " # Return the actual model function in the closure\n", 641 | " return model_fn\n" 642 | ], 643 | "execution_count": 0, 644 | "outputs": [] 645 | }, 646 | { 647 | "metadata": { 648 | "id": "OjwJ4bTeWXD8", 649 | "colab_type": "code", 650 | "colab": {} 651 | }, 652 | "cell_type": "code", 653 | "source": [ 654 | "# Compute train and warmup steps from batch size\n", 655 | "# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)\n", 656 | "BATCH_SIZE = 32\n", 657 | "LEARNING_RATE = 2e-5\n", 658 | "NUM_TRAIN_EPOCHS = 3.0\n", 659 | "# Warmup is a period of time where the learning rate \n", 660 | "# is small and gradually increases--usually helps training.\n", 661 | "WARMUP_PROPORTION = 0.1\n", 662 | "# Model configs\n", 663 | "SAVE_CHECKPOINTS_STEPS = 500\n", 664 | "SAVE_SUMMARY_STEPS = 100" 665 | ], 666 | "execution_count": 0, 667 | "outputs": [] 668 | }, 669 | { 670 | "metadata": { 671 | "id": "emHf9GhfWBZ_", 672 | "colab_type": "code", 673 | "colab": {} 674 | }, 675 | "cell_type": "code", 676 | "source": [ 677 | "# Compute the number of train and warmup steps from batch size\n", 678 | "num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n", 679 | "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)" 680 | ], 681 | "execution_count": 0, 682 | "outputs": [] 683 | }, 684 | { 685 | "metadata": { 686 | "id": "oEJldMr3WYZa", 687 | "colab_type": "code", 688 | "colab": {} 689 | }, 690 | "cell_type": "code", 691 | "source": [ 692 | "# Specify output directory and number of checkpoint steps to save\n", 693 |
"run_config = tf.estimator.RunConfig(\n", 694 | " model_dir=OUTPUT_DIR,\n", 695 | " save_summary_steps=SAVE_SUMMARY_STEPS,\n", 696 | " save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)" 697 | ], 698 | "execution_count": 0, 699 | "outputs": [] 700 | }, 701 | { 702 | "metadata": { 703 | "id": "q_WebpS1X97v", 704 | "colab_type": "code", 705 | "colab": {} 706 | }, 707 | "cell_type": "code", 708 | "source": [ 709 | "model_fn = model_fn_builder(\n", 710 | " num_labels=len(label_list),\n", 711 | " learning_rate=LEARNING_RATE,\n", 712 | " num_train_steps=num_train_steps,\n", 713 | " num_warmup_steps=num_warmup_steps)\n", 714 | "\n", 715 | "estimator = tf.estimator.Estimator(\n", 716 | " model_fn=model_fn,\n", 717 | " config=run_config,\n", 718 | " params={\"batch_size\": BATCH_SIZE})\n" 719 | ], 720 | "execution_count": 0, 721 | "outputs": [] 722 | }, 723 | { 724 | "metadata": { 725 | "id": "NOO3RfG1DYLo", 726 | "colab_type": "text" 727 | }, 728 | "cell_type": "markdown", 729 | "source": [ 730 | "Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators)." 731 | ] 732 | }, 733 | { 734 | "metadata": { 735 | "id": "1Pv2bAlOX_-K", 736 | "colab_type": "code", 737 | "colab": {} 738 | }, 739 | "cell_type": "code", 740 | "source": [ 741 | "# Create an input function for training. drop_remainder = True for using TPUs.\n", 742 | "train_input_fn = bert.run_classifier.input_fn_builder(\n", 743 | " features=train_features,\n", 744 | " seq_length=MAX_SEQ_LENGTH,\n", 745 | " is_training=True,\n", 746 | " drop_remainder=False)" 747 | ], 748 | "execution_count": 0, 749 | "outputs": [] 750 | }, 751 | { 752 | "metadata": { 753 | "id": "t6Nukby2EB6-", 754 | "colab_type": "text" 755 | }, 756 | "cell_type": "markdown", 757 | "source": [ 758 | "Now we train our model! 
For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes." 759 | ] 760 | }, 761 | { 762 | "metadata": { 763 | "id": "nucD4gluYJmK", 764 | "colab_type": "code", 765 | "colab": {} 766 | }, 767 | "cell_type": "code", 768 | "source": [ 769 | "print(f'Beginning Training!')\n", 770 | "current_time = datetime.now()\n", 771 | "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n", 772 | "print(\"Training took time \", datetime.now() - current_time)" 773 | ], 774 | "execution_count": 0, 775 | "outputs": [] 776 | }, 777 | { 778 | "metadata": { 779 | "id": "CmbLTVniARy3", 780 | "colab_type": "text" 781 | }, 782 | "cell_type": "markdown", 783 | "source": [ 784 | "Now let's use our test data to see how well our model did:" 785 | ] 786 | }, 787 | { 788 | "metadata": { 789 | "id": "JIhejfpyJ8Bx", 790 | "colab_type": "code", 791 | "colab": {} 792 | }, 793 | "cell_type": "code", 794 | "source": [ 795 | "test_input_fn = run_classifier.input_fn_builder(\n", 796 | " features=test_features,\n", 797 | " seq_length=MAX_SEQ_LENGTH,\n", 798 | " is_training=False,\n", 799 | " drop_remainder=False)" 800 | ], 801 | "execution_count": 0, 802 | "outputs": [] 803 | }, 804 | { 805 | "metadata": { 806 | "id": "PPVEXhNjYXC-", 807 | "colab_type": "code", 808 | "colab": {} 809 | }, 810 | "cell_type": "code", 811 | "source": [ 812 | "estimator.evaluate(input_fn=test_input_fn, steps=None)" 813 | ], 814 | "execution_count": 0, 815 | "outputs": [] 816 | }, 817 | { 818 | "metadata": { 819 | "id": "ueKsULteiz1B", 820 | "colab_type": "text" 821 | }, 822 | "cell_type": "markdown", 823 | "source": [ 824 | "Now let's write code to make predictions on new sentences:" 825 | ] 826 | }, 827 | { 828 | "metadata": { 829 | "id": "OsrbTD2EJTVl", 830 | "colab_type": "code", 831 | "colab": {} 832 | }, 833 | "cell_type": "code", 834 | "source": [ 835 | "def getPrediction(in_sentences):\n", 836 | " labels = [\"Negative\", \"Positive\"]\n", 837 | " input_examples = 
[run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, guid=\"\" and label=0 are just dummy values\n", 838 | " input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", 839 | " predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n", 840 | " predictions = estimator.predict(predict_input_fn)\n", 841 | " return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]" 842 | ], 843 | "execution_count": 0, 844 | "outputs": [] 845 | }, 846 | { 847 | "metadata": { 848 | "id": "RieQtf0HxqFy", 849 | "colab_type": "code", 850 | "colab": {} 851 | }, 852 | "cell_type": "code", 853 | "source": [ 854 | "def getSinglePrediction(in_sentences):\n", 855 | " labels = [\"Negative\", \"Positive\"]\n", 856 | " input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, guid=\"\" and label=0 are just dummy values\n", 857 | " input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", 858 | " predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n", 859 | " predictions = estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)\n", 860 | " return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]" 861 | ], 862 | "execution_count": 0, 863 | "outputs": [] 864 | }, 865 | { 866 | "metadata": { 867 | "id": "-thbodgih_VJ", 868 | "colab_type": "code", 869 | "colab": {} 870 | }, 871 | "cell_type": "code", 872 | "source": [ 873 | "pred_sentences = [\n", 874 | " \"That movie was absolutely awful\",\n", 875 | " \"The acting was a bit lacking\",\n",
876 | " \"The film was creative and surprising\",\n", 877 | " \"Absolutely fantastic!\"\n", 878 | "]" 879 | ], 880 | "execution_count": 0, 881 | "outputs": [] 882 | }, 883 | { 884 | "metadata": { 885 | "id": "QrZmvZySKQTm", 886 | "colab_type": "code", 887 | "colab": {} 888 | }, 889 | "cell_type": "code", 890 | "source": [ 891 | "predictions = getPrediction(pred_sentences)" 892 | ], 893 | "execution_count": 0, 894 | "outputs": [] 895 | }, 896 | { 897 | "metadata": { 898 | "id": "MXkRiEBUqN3n", 899 | "colab_type": "text" 900 | }, 901 | "cell_type": "markdown", 902 | "source": [ 903 | "Voila! We have a sentiment classifier!" 904 | ] 905 | }, 906 | { 907 | "metadata": { 908 | "id": "ERkTE8-7oQLZ", 909 | "colab_type": "code", 910 | "colab": {} 911 | }, 912 | "cell_type": "code", 913 | "source": [ 914 | "predictions" 915 | ], 916 | "execution_count": 0, 917 | "outputs": [] 918 | }, 919 | { 920 | "metadata": { 921 | "id": "ChWCclU4x_rU", 922 | "colab_type": "code", 923 | "colab": {} 924 | }, 925 | "cell_type": "code", 926 | "source": [ 927 | "pred_sentences = [\n", 928 | " \"I love to eat sea food\"\n", 929 | "]" 930 | ], 931 | "execution_count": 0, 932 | "outputs": [] 933 | }, 934 | { 935 | "metadata": { 936 | "id": "jAAkHDb2x7hp", 937 | "colab_type": "code", 938 | "colab": {} 939 | }, 940 | "cell_type": "code", 941 | "source": [ 942 | "prediction = getSinglePrediction(pred_sentences)" 943 | ], 944 | "execution_count": 0, 945 | "outputs": [] 946 | }, 947 | { 948 | "metadata": { 949 | "id": "y2l7Yc_GyPJR", 950 | "colab_type": "code", 951 | "colab": {} 952 | }, 953 | "cell_type": "code", 954 | "source": [ 955 | "prediction" 956 | ], 957 | "execution_count": 0, 958 | "outputs": [] 959 | } 960 | ] 961 | } --------------------------------------------------------------------------------