├── README.md ├── bert_text_classification.ipynb └── imdb-sample.pickle /README.md: -------------------------------------------------------------------------------- 1 | # demo-text-binary-classification-with-bert -------------------------------------------------------------------------------- /bert_text_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "bert_text_classification.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "<a href=\"https://colab.research.google.com/github/wshuyi/demo-text-binary-classification-with-bert/blob/master/bert_text_classification.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" 27 | ] 28 | }, 29 | { 30 | "metadata": { 31 | "id": "zwYoyLfLQI-t", 32 | "colab_type": "text" 33 | }, 34 | "cell_type": "markdown", 35 | "source": [ 36 | "Base code borrowed from [this Google Colab Notebook](https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb).\n", 37 | "\n", 38 | "Refactored by [Shuyi Wang](https://www.linkedin.com/in/shuyi-wang-b3955026/) and [Yan Sun](https://github.com/SunYanCN).\n", 39 | "\n", 40 | "Please refer to [this Medium Article](https://medium.com/@wshuyi/how-to-do-text-binary-classification-with-bert-f1348a25d905) for detailed information.\n", 41 | "\n" 42 | ] 43 | }, 44 | { 45 | "metadata": { 46 | "id": "EQHcJlAgHZ2o", 47 | "colab_type": "code", 48 | "colab": {} 49 | }, 50 | "cell_type": "code", 51 | "source": [ 52 | "!pip install bert-text" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [] 56 | }, 57 | { 58 | "metadata": { 59 | "id": "oZFDseXWHiSz", 60 | "colab_type": "code", 61 | "colab": {} 62 | }, 63 | "cell_type": "code", 64 | 
"source": [ 65 | "from bert_text import run_on_dfs" 66 | ], 67 | "execution_count": 0, 68 | "outputs": [] 69 | }, 70 | { 71 | "metadata": { 72 | "id": "R4pBwSidH4CT", 73 | "colab_type": "code", 74 | "colab": {} 75 | }, 76 | "cell_type": "code", 77 | "source": [ 78 | "import pickle" 79 | ], 80 | "execution_count": 0, 81 | "outputs": [] 82 | }, 83 | { 84 | "metadata": { 85 | "id": "wiOyfxH6H5ry", 86 | "colab_type": "code", 87 | "colab": {} 88 | }, 89 | "cell_type": "code", 90 | "source": [ 91 | "!wget https://github.com/wshuyi/info-5731-public/raw/master/imdb-sample.pickle" 92 | ], 93 | "execution_count": 0, 94 | "outputs": [] 95 | }, 96 | { 97 | "metadata": { 98 | "id": "vEGtiE42IDyj", 99 | "colab_type": "code", 100 | "colab": {} 101 | }, 102 | "cell_type": "code", 103 | "source": [ 104 | "with open(\"imdb-sample.pickle\", 'rb') as f:\n", 105 | " train, test = pickle.load(f)" 106 | ], 107 | "execution_count": 0, 108 | "outputs": [] 109 | }, 110 | { 111 | "metadata": { 112 | "id": "P6wlSR8NIKHS", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "cell_type": "code", 117 | "source": [ 118 | "train = train.sample(len(train))" 119 | ], 120 | "execution_count": 0, 121 | "outputs": [] 122 | }, 123 | { 124 | "metadata": { 125 | "id": "qCoK42AGIXsS", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "cell_type": "code", 130 | "source": [ 131 | "myparam = {\n", 132 | " \"DATA_COLUMN\": \"text\",\n", 133 | " \"LABEL_COLUMN\": \"sentiment\",\n", 134 | " \"LEARNING_RATE\": 2e-5,\n", 135 | " \"NUM_TRAIN_EPOCHS\": 3\n", 136 | "}" 137 | ], 138 | "execution_count": 0, 139 | "outputs": [] 140 | }, 141 | { 142 | "metadata": { 143 | "id": "SUwqGJuIfgap", 144 | "colab_type": "code", 145 | "colab": {} 146 | }, 147 | "cell_type": "code", 148 | "source": [ 149 | "import tensorflow as tf\n", 150 | "tf.logging.set_verbosity(tf.logging.INFO)" 151 | ], 152 | "execution_count": 0, 153 | "outputs": [] 154 | }, 155 | { 156 | "metadata": { 157 | "id": "YoOVNBr7IsjS", 158 | 
"colab_type": "code", 159 | "colab": {} 160 | }, 161 | "cell_type": "code", 162 | "source": [ 163 | "result, estimator = run_on_dfs(train, test, **myparam)" 164 | ], 165 | "execution_count": 0, 166 | "outputs": [] 167 | }, 168 | { 169 | "metadata": { 170 | "id": "kaZMjQ0cIw9y", 171 | "colab_type": "code", 172 | "colab": {} 173 | }, 174 | "cell_type": "code", 175 | "source": [ 176 | "result" 177 | ], 178 | "execution_count": 0, 179 | "outputs": [] 180 | } 181 | ] 182 | } -------------------------------------------------------------------------------- /imdb-sample.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wshuyi/demo-text-binary-classification-with-bert/ba349e41923636261c0b36385887758dc9ae0833/imdb-sample.pickle --------------------------------------------------------------------------------