├── .github └── FUNDING.yml ├── .gitignore ├── README.md ├── index.html ├── part1 └── score_reviews_via_service.ipynb ├── part2 └── train_sentiment_analysis.ipynb ├── part3 └── predict_sentiment_analysis.ipynb └── part5 └── sentiment.html /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: peckjon 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # IDE tempfiles 107 | .idea 108 | 109 | # pickled models 110 | *.pickle 111 | 112 | # notebook checkpoints 113 | */.ipynb_checkpoints -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hosting-ml-as-microservice 2 | Hosting your own Machine Learning Model as a Microservice 3 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 |

Hosting your own Machine Learning Model as a Microservice

3 |
  • Part 1: Existing Machine Learning Services Open In Colab
  • 4 |
  • Part 2: Training your own ML Model Open In Colab
  • 5 |
  • Part 3: Deploying as a FaaS Open In Colab
  • 6 |
  • Part 4: Deploying as a Container Service
  • 7 |
  • Part 5: Integrating your Microservice
  • -------------------------------------------------------------------------------- /part1/score_reviews_via_service.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "anaconda-cloud": {}, 6 | "kernelspec": { 7 | "display_name": "Python 3", 8 | "language": "python", 9 | "name": "python3" 10 | }, 11 | "language_info": { 12 | "codemirror_mode": { 13 | "name": "ipython", 14 | "version": 2 15 | }, 16 | "file_extension": ".py", 17 | "mimetype": "text/x-python", 18 | "name": "python", 19 | "nbconvert_exporter": "python", 20 | "pygments_lexer": "ipython2", 21 | "version": "2.7.3" 22 | }, 23 | "pycharm": { 24 | "stem_cell": { 25 | "cell_type": "raw", 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "source": [] 30 | } 31 | }, 32 | "colab": { 33 | "name": "score_reviews_via_service.ipynb", 34 | "provenance": [] 35 | } 36 | }, 37 | "cells": [ 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "24yCLLjstPjb", 42 | "colab_type": "text" 43 | }, 44 | "source": [ 45 | "## Part 1: Existing Machine Learning Services\n", 46 | "\n", 47 | "\"Open" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "id": "jwR4_2_otPjg", 54 | "colab_type": "text" 55 | }, 56 | "source": [ 57 | "### Obtain labelled reviews\n", 58 | "\n", 59 | "In order to test any of the sentiment analysis APIs, we need a labelled dataset of reviews and their sentiment polarity. We'll use NLTK to download the movie_reviews corpus." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "pycharm": { 66 | "name": "#%%\n" 67 | }, 68 | "id": "zPOhVzNrtPjj", 69 | "colab_type": "code", 70 | "colab": {} 71 | }, 72 | "source": [ 73 | "from nltk import download\n", 74 | "\n", 75 | "download('movie_reviews')" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "id": "zPd-EWKptPjw", 84 | "colab_type": "text" 85 | }, 86 | "source": [ 87 | "### Load the data\n", 88 | "\n", 89 | "The files in movie_reviews have already been divided into two sets: positive ('pos') and negative ('neg'), so we can load the raw text of the reviews into two lists, one for each polarity." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "metadata": { 95 | "id": "DuuqNmcmtPjy", 96 | "colab_type": "code", 97 | "colab": {} 98 | }, 99 | "source": [ 100 | "from nltk.corpus import movie_reviews\n", 101 | "\n", 102 | "# extract words from reviews, pair with label\n", 103 | "\n", 104 | "reviews_pos = []\n", 105 | "for fileid in movie_reviews.fileids('pos'):\n", 106 | " review = movie_reviews.raw(fileid)\n", 107 | " reviews_pos.append(review)\n", 108 | "\n", 109 | "reviews_neg = []\n", 110 | "for fileid in movie_reviews.fileids('neg'):\n", 111 | " review = movie_reviews.raw(fileid)\n", 112 | " reviews_neg.append(review)" 113 | ], 114 | "execution_count": null, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "id": "TMy_1Mg4tPj-", 121 | "colab_type": "text" 122 | }, 123 | "source": [ 124 | "### Connect to the scoring API\n", 125 | "\n", 126 | "Fill in this function with code that connects to one of these APIs, and uses it to score a single review:\n", 127 | "\n", 128 | "* [Amazon Comprehend: Detect Sentiment](https://docs.aws.amazon.com/comprehend/latest/dg/API_DetectSentiment.html)\n", 129 | "* [Google Natural Language: Analyzing 
Sentiment](https://cloud.google.com/natural-language/docs/analyzing-sentiment)\n", 130 | "* [Azure Cognitive Services: Sentiment Analysis](https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/how-tos/text-analytics-how-to-sentiment-analysis)\n", 131 | "* [Algorithmia: Sentiment Analysis](https://algorithmia.com/algorithms/nlp/SentimentAnalysis)\n", 132 | "\n", 133 | "Your function must return either 'pos' or 'neg', so you'll need to make some decisions about how to map the results of the API call to one of these values. For example, Amazon Comprehend can return \"NEUTRAL\" or \"MIXED\" for the Sentiment -- if this happens, you may wish to inspect the numeric values under the SentimentScore to see whether it leans toward positive or negative.\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "metadata": { 139 | "id": "HSWexF18tPkA", 140 | "colab_type": "code", 141 | "colab": {} 142 | }, 143 | "source": [ 144 | "def score_review(review):\n", 145 | " # TBD: call the service and return 'pos' or 'neg'\n", 146 | " return 'pos'" 147 | ], 148 | "execution_count": null, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": { 154 | "id": "egteKGkJtPkL", 155 | "colab_type": "text" 156 | }, 157 | "source": [ 158 | "### Score each review\n", 159 | "\n", 160 | "Now, we can use the function you defined to score each of the reviews.\n", 161 | "\n", 162 | "#### *Note on Testing*\n", 163 | "\n", 164 | "While most of the services listed have free tiers they may be limited to a few thousand requests per week or month, depending on the service. On some platforms you may be billed after reaching that limit. For this reason it is recommended to first test on a smaller set of the reviews, `subset_pos` and `subset_neg`. Once you're happy with your code swap those subsets for the full review sets `reviews_pos` and `reviews_neg`."
165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "metadata": { 170 | "id": "mWkdNH_ktPkN", 171 | "colab_type": "code", 172 | "colab": {} 173 | }, 174 | "source": [ 175 | "# Create 2 smaller subsets for testing\n", 176 | "subset_pos = reviews_pos[:10]\n", 177 | "subset_neg = reviews_neg[:10]\n", 178 | "\n", 179 | "results_pos = []\n", 180 | "# When comfortable with results switch `subset_pos` to `reviews_pos`\n", 181 | "for review in subset_pos:\n", 182 | " result = score_review(review)\n", 183 | " results_pos.append(result)\n", 184 | "\n", 185 | "results_neg = []\n", 186 | "# When comfortable with results switch `subset_neg` to `reviews_neg`\n", 187 | "for review in subset_neg:\n", 188 | " result = score_review(review)\n", 189 | " results_neg.append(result)" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "id": "X7W95r2BtPkf", 198 | "colab_type": "text" 199 | }, 200 | "source": [ 201 | "### Calculate accuracy\n", 202 | "\n", 203 | "For each of our known positive reviews, we can count the number which our function scored as 'pos', and use this to calculate the % accuracy. We repeat this for negative reviews, and also for overall accuracy."
204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "metadata": { 209 | "id": "Ft71Rv6-tPkh", 210 | "colab_type": "code", 211 | "colab": {} 212 | }, 213 | "source": [ 214 | "correct_pos = results_pos.count('pos')\n", 215 | "accuracy_pos = float(correct_pos) / len(results_pos)\n", 216 | "correct_neg = results_neg.count('neg')\n", 217 | "accuracy_neg = float(correct_neg) / len(results_neg)\n", 218 | "correct_all = correct_pos + correct_neg\n", 219 | "accuracy_all = float(correct_all) / (len(results_pos)+len(results_neg))\n", 220 | "\n", 221 | "print('Positive reviews: {}% correct'.format(accuracy_pos*100))\n", 222 | "print('Negative reviews: {}% correct'.format(accuracy_neg*100))\n", 223 | "print('Overall accuracy: {}% correct'.format(accuracy_all*100))" 224 | ], 225 | "execution_count": null, 226 | "outputs": [] 227 | } 228 | ] 229 | } -------------------------------------------------------------------------------- /part2/train_sentiment_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": false, 7 | "pycharm": { 8 | "name": "#%% md\n" 9 | } 10 | }, 11 | "source": [ 12 | "## Part 2: Training your own ML Model\n", 13 | "\n", 14 | "\"Open" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Download corpuses\n", 22 | "\n", 23 | "We'll continue using the `movie_reviews` corpus to train our model. The `stopwords` corpus contains a [set of standard stopwords](https://gist.github.com/sebleier/554280) we'll want to remove from the input, and `punkt` is used for tokenization in the [.words()](https://www.nltk.org/api/nltk.corpus.html#corpus-reader-functions) method of the corpus reader."
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": false, 31 | "pycharm": { 32 | "name": "#%%\n" 33 | } 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from nltk import download\n", 38 | "\n", 39 | "download('movie_reviews')\n", 40 | "download('punkt')\n", 41 | "download('stopwords')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Define feature extractor and bag-of-words converter\n", 49 | "\n", 50 | "Given a list of (already tokenized) words, we need a function to extract just the ones we care about: those not found in the list of English stopwords or standard punctuation.\n", 51 | "\n", 52 | "We also need a way to easily turn a list of words into a [bag-of-words](https://en.wikipedia.org/wiki/Bag-of-words_model), pairing each word with the count of its occurrences." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "from nltk.corpus import stopwords\n", 64 | "from string import punctuation\n", 65 | "\n", 66 | "stopwords_eng = stopwords.words('english')\n", 67 | "\n", 68 | "def extract_features(words):\n", 69 | " return [w for w in words if w not in stopwords_eng and w not in punctuation]\n", 70 | "\n", 71 | "def bag_of_words(words):\n", 72 | " bag = {}\n", 73 | " for w in words:\n", 74 | " bag[w] = bag.get(w,0)+1\n", 75 | " return bag" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Ingest, clean, and convert the positive and negative reviews\n", 83 | "\n", 84 | "For both the positive (\"pos\") and negative (\"neg\") sets of reviews, extract the features and convert to bag of words. 
From these, we construct a list of tuples known as a \"featureset\": the first part of each tuple is the bag of words for that review, and the second is its label (\"pos\"/\"neg\").\n", 85 | "\n", 86 | "Note that `movie_reviews.words(fileid)` provides a tokenized list of words. If we wanted the un-tokenized text, we would use `movie_reviews.raw(fileid)` instead, then tokenize it using our preferred tokenizer (e.g. [nltk.tokenize.word_tokenize](https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.punkt.PunktLanguageVars.word_tokenize))." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "from nltk.corpus import movie_reviews\n", 98 | "\n", 99 | "reviews_pos = []\n", 100 | "reviews_neg = []\n", 101 | "for fileid in movie_reviews.fileids('pos'):\n", 102 | " words = extract_features(movie_reviews.words(fileid))\n", 103 | " reviews_pos.append((bag_of_words(words), 'pos'))\n", 104 | "for fileid in movie_reviews.fileids('neg'):\n", 105 | " words = extract_features(movie_reviews.words(fileid))\n", 106 | " reviews_neg.append((bag_of_words(words), 'neg'))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### Split reviews into training and test sets\n", 114 | "We need to break up each group of reviews into a training set (about 80%) and a test set (the remaining 20%). In case there's some meaningful order to the reviews (e.g. the first 800 are from one group of reviewers, the next 200 are from another), we shuffle the sets first to ensure we aren't introducing additional bias. Note that this means our accuracy will not be exactly the same on every run; if you wish to see consistent results on each run, you can stabilize the shuffle by calling [random.seed(n)](https://www.geeksforgeeks.org/random-seed-in-python/) first."
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from random import shuffle\n", 126 | "\n", 127 | "split_pct = .80\n", 128 | "\n", 129 | "def split_set(review_set):\n", 130 | " split = int(len(review_set)*split_pct)\n", 131 | " return (review_set[:split], review_set[split:])\n", 132 | "\n", 133 | "shuffle(reviews_pos)\n", 134 | "shuffle(reviews_neg)\n", 135 | "\n", 136 | "pos_train, pos_test = split_set(reviews_pos)\n", 137 | "neg_train, neg_test = split_set(reviews_neg)\n", 138 | "\n", 139 | "train_set = pos_train+neg_train\n", 140 | "test_set = pos_test+neg_test" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Train the model\n", 148 | "\n", 149 | "Now that our data is ready, the training step itself is quite simple if we use the [NaiveBayesClassifier](https://www.nltk.org/api/nltk.classify.html#module-nltk.classify.naivebayes) provided by NLTK.\n", 150 | "\n", 151 | "If you are used to methods such as `model.fit(x,y)` which take two parameters -- the data and the labels -- it may be confusing that `NaiveBayesClassifier.train` takes just one argument. This is because the labels are already embedded in `train_set`: each element in the set is a Bag of Words paired with a 'pos' or 'neg' value."
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "from nltk.classify import NaiveBayesClassifier\n", 163 | "\n", 164 | "model = NaiveBayesClassifier.train(train_set)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Check model accuracy\n", 172 | "\n", 173 | "NLTK's built-in [accuracy](https://www.nltk.org/api/nltk.classify.html#module-nltk.classify.util) utility can run our test_set through the model and compare the labels returned by the model to the labels in the test set, producing an overall % accuracy. Not too impressive, right? We need to improve." 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "from nltk.classify.util import accuracy\n", 185 | "\n", 186 | "print(100 * accuracy(model, test_set))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### Save the model\n", 194 | "Our trained model will be cleared from memory when this notebook is closed. So that we can use it again later, save the model as a file using the [pickle](https://docs.python.org/3/library/pickle.html) serializer." 
195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": false, 202 | "pycharm": { 203 | "name": "#%%\n" 204 | } 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "import pickle\n", 209 | "\n", 210 | "model_file = open('sa_classifier.pickle','wb')\n", 211 | "pickle.dump(model, model_file)\n", 212 | "model_file.close()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "### Save the model (Colab version)\n", 220 | "\n", 221 | "Google Colab doesn't provide direct access to files saved during a notebook session, so we need to save it in [Google Drive](https://drive.google.com) instead. The first time you run this, it will ask for permission to access your Google Drive. Follow the instructions, then wait a few minutes and look for a new folder called \"Colab Output\" in [Drive](https://drive.google.com). Note that Colab does not always sync to Drive immediately, so check the file update times and re-run this cell if it doesn't look like you have the most recent version of your file."
222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "import sys\n", 233 | "if 'google.colab' in sys.modules:\n", 234 | " from google.colab import drive\n", 235 | " drive.mount('/content/gdrive')\n", 236 | " !mkdir -p '/content/gdrive/My Drive/Colab Output'\n", 237 | " model_file = open('/content/gdrive/My Drive/Colab Output/sa_classifier.pickle','wb')\n", 238 | " pickle.dump(model, model_file)\n", 239 | " model_file.flush()\n", 240 | " print('Model saved in /content/gdrive/My Drive/Colab Output')\n", 241 | " !ls '/content/gdrive/My Drive/Colab Output'\n", 242 | " drive.flush_and_unmount()\n", 243 | " print('Re-run this cell if you cannot find it in https://drive.google.com')" 244 | ] 245 | } 246 | ], 247 | "metadata": { 248 | "anaconda-cloud": {}, 249 | "kernelspec": { 250 | "display_name": "Python 3", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 2 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython2", 264 | "version": "2.7.3" 265 | }, 266 | "pycharm": { 267 | "stem_cell": { 268 | "cell_type": "raw", 269 | "source": [], 270 | "metadata": { 271 | "collapsed": false 272 | } 273 | } 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 1 278 | } -------------------------------------------------------------------------------- /part3/predict_sentiment_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Part 3: Deploying as a FaaS\n", 8 | "\n", 9 | "\"Open" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### Download 
corpuses\n", 17 | "\n", 18 | "Since we won't be doing any model-training in this step, we don't need the 'movie_reviews' corpus. However, we will still need to extract features from our input before each prediction, so we make sure 'punkt' and 'stopwords' are available for tokenization and stopword-removal. If you added any other corpuses in Part 2, consider whether they'll be needed in the prediction step." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false, 26 | "pycharm": { 27 | "name": "#%%\n" 28 | } 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from nltk import download\n", 33 | "\n", 34 | "download('punkt')\n", 35 | "download('stopwords')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Define feature extractor and bag-of-words converter\n", 43 | "\n", 44 | "IMPORTANT: your predictions will only work properly if you use the same feature extractor that you trained your model with, so copy your updated `extract_features` method over from Part 2, replacing the method below. " 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from nltk.corpus import stopwords\n", 56 | "from string import punctuation\n", 57 | "\n", 58 | "stopwords_eng = stopwords.words('english')\n", 59 | "\n", 60 | "def extract_features(words):\n", 61 | " return [w for w in words if w not in stopwords_eng and w not in punctuation]\n", 62 | "\n", 63 | "def bag_of_words(words):\n", 64 | " bag = {}\n", 65 | " for w in words:\n", 66 | " bag[w] = bag.get(w,0)+1\n", 67 | " return bag" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "source": [ 76 | "### Import your pickled model file (non-Colab version)\n", 77 | "\n", 78 | "In Part 2, we saved the trained model as \"sa_classifier.pickle\". 
Now we'll unpickle that file to get it back into memory. Either copy that file into the same folder as this notebook (\"part3\"), or adjust the path below to \"../part2/sa_classifier.pickle\" so it reads the file from the folder where it was saved." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "import pickle\n", 90 | "import sys\n", 91 | "\n", 92 | "if not 'google.colab' in sys.modules:\n", 93 | " model_file = open('sa_classifier.pickle', 'rb')\n", 94 | " model = pickle.load(model_file)\n", 95 | " model_file.close()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Import your pickled model file (Colab version)\n", 103 | "\n", 104 | "If you're running this notebook on Colab, we need to retrieve the pickled model from [Google Drive](https://drive.google.com) before we can unpickle it. This code looks for \"sa_classifier.pickle\" in a folder called \"Colab Output\"; if you have moved the file elsewhere, change the path below." 
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "import pickle\n", 116 | "import sys\n", 117 | "\n", 118 | "if 'google.colab' in sys.modules:\n", 119 | " from google.colab import drive\n", 120 | " drive.mount('/content/gdrive')\n", 121 | " !ls '/content/gdrive/My Drive/Colab Output'\n", 122 | " model_file = open('/content/gdrive/My Drive/Colab Output/sa_classifier.pickle','rb')\n", 123 | " model = pickle.load(model_file)\n", 124 | " model_file.close()\n", 125 | " print('Model loaded from /content/gdrive/My Drive/Colab Output')" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Define a method for prediction\n", 133 | "\n", 134 | "In the prediction step, we'll be taking a single piece of text input and asking the model to classify it. Models need the input for the prediction step to have the same format as the data provided during training -- so we must tokenize the input, run the same `extract_features` method that we used during training, and convert it to a bag of words before sending it to the model's `classify` method.\n", 135 | "\n", 136 | "Note: if you have (from Part 2) changed your `extract_features` method to accept the full text instead of a tokenized list, then you can omit the tokenization step here." 
137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "from nltk.tokenize import word_tokenize\n", 148 | "\n", 149 | "def get_sentiment(review):\n", 150 | " words = word_tokenize(review)\n", 151 | " words = extract_features(words)\n", 152 | " words = bag_of_words(words)\n", 153 | " return model.classify(words)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "collapsed": false, 160 | "pycharm": { 161 | "name": "#%%\n" 162 | } 163 | }, 164 | "source": [ 165 | "### Run a prediction\n", 166 | "\n", 167 | "Test out your `get_sentiment` method on some sample inputs of your own devising: try altering the two reviews below and see how your model performs. It won't be 100% correct, and we're mostly just looking to see that it is able to run at all, but if it seems to *always* be wrong, that may indicate you've missed a critical step above (e.g. you haven't copied over all the changes to your feature extractor from Part 2, or you've loaded the wrong model file, or provided un-tokenized text when a list of words was expected)." 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "positive_review = 'This movie is amazing, with witty dialog and beautiful shots.'\n", 179 | "print('positive_review: '+get_sentiment(positive_review))\n", 180 | "\n", 181 | "negative_review = 'I hated everything about this unimaginitive mess.
Two thumbs down!'\n", 182 | "print('negative_review: '+get_sentiment(negative_review))" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "anaconda-cloud": {}, 197 | "kernelspec": { 198 | "display_name": "Python 3", 199 | "language": "python", 200 | "name": "python3" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 2 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython2", 212 | "version": "2.7.3" 213 | }, 214 | "pycharm": { 215 | "stem_cell": { 216 | "cell_type": "raw", 217 | "source": [], 218 | "metadata": { 219 | "collapsed": false 220 | } 221 | } 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 1 226 | } -------------------------------------------------------------------------------- /part5/sentiment.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 21 | 22 | 23 |
    Enter your review:
    24 |
    25 | 26 | 27 |
    28 |
    29 | 30 | 31 | --------------------------------------------------------------------------------