├── .github
└── FUNDING.yml
├── .gitignore
├── README.md
├── index.html
├── part1
└── score_reviews_via_service.ipynb
├── part2
└── train_sentiment_analysis.ipynb
├── part3
└── predict_sentiment_analysis.ipynb
└── part5
└── sentiment.html
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: peckjon
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # IDE tempfiles
107 | .idea
108 |
109 | # pickled models
110 | *.pickle
111 |
112 | # notebook checkpoints
113 | */.ipynb_checkpoints
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hosting-ml-as-microservice
2 | Hosting your own Machine Learning Model as a Microservice
3 |
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
Hosting your own Machine Learning Model as a Microservice
3 | Part 1: Existing Machine Learning Services
4 | Part 2: Training your own ML Model
5 | Part 3: Deploying as a FaaS
6 | Part 4: Deploying as a Container Service
7 | Part 5: Integrating your Microservice
--------------------------------------------------------------------------------
/part1/score_reviews_via_service.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "anaconda-cloud": {},
6 | "kernelspec": {
7 | "display_name": "Python 3",
8 | "language": "python",
9 | "name": "python3"
10 | },
11 | "language_info": {
12 | "codemirror_mode": {
13 | "name": "ipython",
14 | "version": 2
15 | },
16 | "file_extension": ".py",
17 | "mimetype": "text/x-python",
18 | "name": "python",
19 | "nbconvert_exporter": "python",
20 | "pygments_lexer": "ipython2",
21 | "version": "2.7.3"
22 | },
23 | "pycharm": {
24 | "stem_cell": {
25 | "cell_type": "raw",
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "source": []
30 | }
31 | },
32 | "colab": {
33 | "name": "score_reviews_via_service.ipynb",
34 | "provenance": []
35 | }
36 | },
37 | "cells": [
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {
41 | "id": "24yCLLjstPjb",
42 | "colab_type": "text"
43 | },
44 | "source": [
45 | "## Part 1: Existing Machine Learning Services\n",
46 | "\n",
47 | "
"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {
53 | "id": "jwR4_2_otPjg",
54 | "colab_type": "text"
55 | },
56 | "source": [
57 | "### Obtain labelled reviews\n",
58 | "\n",
59 | "In order to test any of the sentiment analysis APIs, we need a labelled dataset of reviews and their sentiment polarity. We'll use NLTK to download the movie_reviews corpus."
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "metadata": {
65 | "pycharm": {
66 | "name": "#%%\n"
67 | },
68 | "id": "zPOhVzNrtPjj",
69 | "colab_type": "code",
70 | "colab": {}
71 | },
72 | "source": [
73 | "from nltk import download\n",
74 | "\n",
75 | "download('movie_reviews')"
76 | ],
77 | "execution_count": null,
78 | "outputs": []
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {
83 | "id": "zPd-EWKptPjw",
84 | "colab_type": "text"
85 | },
86 | "source": [
87 | "### Load the data\n",
88 | "\n",
89 | "The files in movie_reviews have already been divided into two sets: positive ('pos') and negative ('neg'), so we can load the raw text of the reviews into two lists, one for each polarity."
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "metadata": {
95 | "id": "DuuqNmcmtPjy",
96 | "colab_type": "code",
97 | "colab": {}
98 | },
99 | "source": [
100 | "from nltk.corpus import movie_reviews\n",
101 | "\n",
102 | "# load the raw text of each review into one list per polarity\n",
103 | "\n",
104 | "reviews_pos = []\n",
105 | "for fileid in movie_reviews.fileids('pos'):\n",
106 | " review = movie_reviews.raw(fileid)\n",
107 | " reviews_pos.append(review)\n",
108 | "\n",
109 | "reviews_neg = []\n",
110 | "for fileid in movie_reviews.fileids('neg'):\n",
111 | " review = movie_reviews.raw(fileid)\n",
112 | " reviews_neg.append(review)"
113 | ],
114 | "execution_count": null,
115 | "outputs": []
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "id": "TMy_1Mg4tPj-",
121 | "colab_type": "text"
122 | },
123 | "source": [
124 | "### Connect to the scoring API\n",
125 | "\n",
126 | "Fill in this function with code that connects to one of these APIs, and uses it to score a single review:\n",
127 | "\n",
128 | "* [Amazon Comprehend: Detect Sentiment](https://docs.aws.amazon.com/comprehend/latest/dg/API_DetectSentiment.html)\n",
129 | "* [Google Natural Language: Analyzing Sentiment](https://cloud.google.com/natural-language/docs/analyzing-sentiment)\n",
130 | "* [Azure Cognitive Services: Sentiment Analysis](https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/how-tos/text-analytics-how-to-sentiment-analysis)\n",
131 | "* [Algorithmia: Sentiment Analysis](https://algorithmia.com/algorithms/nlp/SentimentAnalysis)\n",
132 | "\n",
133 | "Your function must return either 'pos' or 'neg', so you'll need to make some decisions about how to map the results of the API call to one of these values. For example, Amazon Comprehend can return \"NEUTRAL\" or \"MIXED\" for the Sentiment -- if this happens, you may wish to inspect the numeric values under the SentimentScore to see whether it leans toward positive or negative.\n"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "metadata": {
139 | "id": "HSWexF18tPkA",
140 | "colab_type": "code",
141 | "colab": {}
142 | },
143 | "source": [
144 | "def score_review(review):\n",
145 | " # TBD: call the service and return 'pos' or 'neg'\n",
146 | " return 'pos'"
147 | ],
148 | "execution_count": null,
149 | "outputs": []
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {
154 | "id": "egteKGkJtPkL",
155 | "colab_type": "text"
156 | },
157 | "source": [
158 | "### Score each review\n",
159 | "\n",
160 | "Now, we can use the function you defined to score each of the reviews.\n",
161 | "\n",
162 | "#### *Note on Testing*\n",
163 | "\n",
164 | "While most of the services listed have free tiers, they may be limited to a few thousand requests per week or month, depending on the service. On some platforms you may be billed after reaching that limit. For this reason, it is recommended to first test on a smaller set of the reviews, `subset_pos` and `subset_neg`. Once you're happy with your code, swap those subsets for the full review sets `reviews_pos` and `reviews_neg`."
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "metadata": {
170 | "id": "mWkdNH_ktPkN",
171 | "colab_type": "code",
172 | "colab": {}
173 | },
174 | "source": [
175 | "# Create 2 smaller subsets for testing\n",
176 | "subset_pos = reviews_pos[:10]\n",
177 | "subset_neg = reviews_neg[:10]\n",
178 | "\n",
179 | "results_pos = []\n",
180 | "# When comfortable with the results, switch `subset_pos` to `reviews_pos`\n",
181 | "for review in subset_pos:\n",
182 | " result = score_review(review)\n",
183 | " results_pos.append(result)\n",
184 | "\n",
185 | "results_neg = []\n",
186 | "# When comfortable with the results, switch `subset_neg` to `reviews_neg`\n",
187 | "for review in subset_neg:\n",
188 | " result = score_review(review)\n",
189 | " results_neg.append(result)"
190 | ],
191 | "execution_count": null,
192 | "outputs": []
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {
197 | "id": "X7W95r2BtPkf",
198 | "colab_type": "text"
199 | },
200 | "source": [
201 | "### Calculate accuracy\n",
202 | "\n",
203 | "For each of our known positive reviews, we can count the number which our function scored as 'pos', and use this to calculate the % accuracy. We repeat this for negative reviews, and also for overall accuracy."
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "metadata": {
209 | "id": "Ft71Rv6-tPkh",
210 | "colab_type": "code",
211 | "colab": {}
212 | },
213 | "source": [
214 | "correct_pos = results_pos.count('pos')\n",
215 | "accuracy_pos = float(correct_pos) / len(results_pos)\n",
216 | "correct_neg = results_neg.count('neg')\n",
217 | "accuracy_neg = float(correct_neg) / len(results_neg)\n",
218 | "correct_all = correct_pos + correct_neg\n",
219 | "accuracy_all = float(correct_all) / (len(results_pos)+len(results_neg))\n",
220 | "\n",
221 | "print('Positive reviews: {}% correct'.format(accuracy_pos*100))\n",
222 | "print('Negative reviews: {}% correct'.format(accuracy_neg*100))\n",
223 | "print('Overall accuracy: {}% correct'.format(accuracy_all*100))"
224 | ],
225 | "execution_count": null,
226 | "outputs": []
227 | }
228 | ]
229 | }
--------------------------------------------------------------------------------
/part2/train_sentiment_analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": false,
7 | "pycharm": {
8 | "name": "#%% md\n"
9 | }
10 | },
11 | "source": [
12 | "## Part 2: Training your own ML Model\n",
13 | "\n",
14 | "
"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "### Download corpuses\n",
22 | "\n",
23 | "We'll continue using the `movie_reviews` corpus to train our model. The `stopwords` corpus contains a [set of standard stopwords](https://gist.github.com/sebleier/554280) we'll want to remove from the input, and `punkt` is used for tokenization in the [.words()](https://www.nltk.org/api/nltk.corpus.html#corpus-reader-functions) method of the corpus reader."
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "collapsed": false,
31 | "pycharm": {
32 | "name": "#%%\n"
33 | }
34 | },
35 | "outputs": [],
36 | "source": [
37 | "from nltk import download\n",
38 | "\n",
39 | "download('movie_reviews')\n",
40 | "download('punkt')\n",
41 | "download('stopwords')"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "### Define feature extractor and bag-of-words converter\n",
49 | "\n",
50 | "Given a list of (already tokenized) words, we need a function to extract just the ones we care about: those not found in the list of English stopwords or standard punctuation.\n",
51 | "\n",
52 | "We also need a way to easily turn a list of words into a [bag-of-words](https://en.wikipedia.org/wiki/Bag-of-words_model), pairing each word with the count of its occurrences."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "collapsed": false
60 | },
61 | "outputs": [],
62 | "source": [
63 | "from nltk.corpus import stopwords\n",
64 | "from string import punctuation\n",
65 | "\n",
66 | "stopwords_eng = stopwords.words('english')\n",
67 | "\n",
68 | "def extract_features(words):\n",
69 | " return [w for w in words if w not in stopwords_eng and w not in punctuation]\n",
70 | "\n",
71 | "def bag_of_words(words):\n",
72 | " bag = {}\n",
73 | " for w in words:\n",
74 | " bag[w] = bag.get(w,0)+1\n",
75 | " return bag"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "### Ingest, clean, and convert the positive and negative reviews\n",
83 | "\n",
84 | "For both the positive (\"pos\") and negative (\"neg\") sets of reviews, extract the features and convert to bag of words. From these, we construct a list of tuples known as a \"featureset\": the first part of each tuple is the bag of words for that review, and the second is its label (\"pos\"/\"neg\").\n",
85 | "\n",
86 | "Note that `movie_reviews.words(fileid)` provides a tokenized list of words. If we wanted the un-tokenized text, we would use `movie_reviews.raw(fileid)` instead, then tokenize it using our preferred tokenizer (e.g. [nltk.tokenize.word_tokenize](https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.punkt.PunktLanguageVars.word_tokenize))."
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "from nltk.corpus import movie_reviews\n",
98 | "\n",
99 | "reviews_pos = []\n",
100 | "reviews_neg = []\n",
101 | "for fileid in movie_reviews.fileids('pos'):\n",
102 | " words = extract_features(movie_reviews.words(fileid))\n",
103 | " reviews_pos.append((bag_of_words(words), 'pos'))\n",
104 | "for fileid in movie_reviews.fileids('neg'):\n",
105 | " words = extract_features(movie_reviews.words(fileid))\n",
106 | " reviews_neg.append((bag_of_words(words), 'neg'))"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### Split reviews into training and test sets\n",
114 | "We need to break up each group of reviews into a training set (about 80%) and a test set (the remaining 20%). In case there's some meaningful order to the reviews (e.g. the first 800 are from one group of reviewers, the next 200 are from another), we shuffle the sets first to ensure we aren't introducing additional bias. Note that this means our accuracy will not be exactly the same on every run; if you wish to see consistent results on each run, you can stabilize the shuffle by calling [random.seed(n)](https://www.geeksforgeeks.org/random-seed-in-python/) first."
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "collapsed": false
122 | },
123 | "outputs": [],
124 | "source": [
125 | "from random import shuffle\n",
126 | "\n",
127 | "split_pct = .80\n",
128 | "\n",
129 | "def split_set(review_set):\n",
130 | " split = int(len(review_set)*split_pct)\n",
131 | " return (review_set[:split], review_set[split:])\n",
132 | "\n",
133 | "shuffle(reviews_pos)\n",
134 | "shuffle(reviews_neg)\n",
135 | "\n",
136 | "pos_train, pos_test = split_set(reviews_pos)\n",
137 | "neg_train, neg_test = split_set(reviews_neg)\n",
138 | "\n",
139 | "train_set = pos_train+neg_train\n",
140 | "test_set = pos_test+neg_test"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "### Train the model\n",
148 | "\n",
149 | "Now that our data is ready, the training step itself is quite simple if we use the [NaiveBayesClassifier](https://www.nltk.org/api/nltk.classify.html#module-nltk.classify.naivebayes) provided by NLTK.\n",
150 | "\n",
151 | "If you are used to methods such as `model.fit(x,y)` which take two parameters -- the data and the labels -- it may be confusing that `NaiveBayesClassifier.train` takes just one argument. This is because the labels are already embedded in `train_set`: each element in the set is a Bag of Words paired with a 'pos' or 'neg' value."
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "collapsed": false
159 | },
160 | "outputs": [],
161 | "source": [
162 | "from nltk.classify import NaiveBayesClassifier\n",
163 | "\n",
164 | "model = NaiveBayesClassifier.train(train_set)"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "### Check model accuracy\n",
172 | "\n",
173 | "NLTK's built-in [accuracy](https://www.nltk.org/api/nltk.classify.html#module-nltk.classify.util) utility can run our test_set through the model and compare the labels returned by the model to the labels in the test set, producing an overall % accuracy. Not too impressive, right? We need to improve."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {
180 | "collapsed": false
181 | },
182 | "outputs": [],
183 | "source": [
184 | "from nltk.classify.util import accuracy\n",
185 | "\n",
186 | "print(100 * accuracy(model, test_set))"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "### Save the model\n",
194 | "Our trained model will be cleared from memory when this notebook is closed. So that we can use it again later, save the model as a file using the [pickle](https://docs.python.org/3/library/pickle.html) serializer."
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "collapsed": false,
202 | "pycharm": {
203 | "name": "#%%\n"
204 | }
205 | },
206 | "outputs": [],
207 | "source": [
208 | "import pickle\n",
209 | "\n",
210 | "model_file = open('sa_classifier.pickle','wb')\n",
211 | "pickle.dump(model, model_file)\n",
212 | "model_file.close()"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "### Save the model (Colab version)\n",
220 | "\n",
221 | "Google Colab doesn't provide direct access to files saved during a notebook session, so we need to save it in [Google Drive](https://drive.google.com) instead. The first time you run this, it will ask for permission to access your Google Drive. Follow the instructions, then wait a few minutes and look for a new folder called \"Colab Output\" in [Drive](https://drive.google.com). Note that Colab does not always sync to Drive immediately, so check the file update times and re-run this cell if it doesn't look like you have the most recent version of your file."
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {
228 | "collapsed": true
229 | },
230 | "outputs": [],
231 | "source": [
232 | "import sys\n",
233 | "if 'google.colab' in sys.modules:\n",
234 | " from google.colab import drive\n",
235 | " drive.mount('/content/gdrive')\n",
236 | " !mkdir -p '/content/gdrive/My Drive/Colab Output'\n",
237 | " model_file = open('/content/gdrive/My Drive/Colab Output/sa_classifier.pickle','wb')\n",
238 | " pickle.dump(model, model_file)\n",
239 | " model_file.flush()\n",
240 | " print('Model saved in /content/gdrive/My Drive/Colab Output')\n",
241 | " !ls '/content/gdrive/My Drive/Colab Output'\n",
242 | " drive.flush_and_unmount()\n",
243 | " print('Re-run this cell if you cannot find it in https://drive.google.com')"
244 | ]
245 | }
246 | ],
247 | "metadata": {
248 | "anaconda-cloud": {},
249 | "kernelspec": {
250 | "display_name": "Python 3",
251 | "language": "python",
252 | "name": "python3"
253 | },
254 | "language_info": {
255 | "codemirror_mode": {
256 | "name": "ipython",
257 | "version": 2
258 | },
259 | "file_extension": ".py",
260 | "mimetype": "text/x-python",
261 | "name": "python",
262 | "nbconvert_exporter": "python",
263 | "pygments_lexer": "ipython2",
264 | "version": "2.7.3"
265 | },
266 | "pycharm": {
267 | "stem_cell": {
268 | "cell_type": "raw",
269 | "source": [],
270 | "metadata": {
271 | "collapsed": false
272 | }
273 | }
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 1
278 | }
--------------------------------------------------------------------------------
/part3/predict_sentiment_analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Part 3: Deploying as a FaaS\n",
8 | "\n",
9 | "
"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### Download corpuses\n",
17 | "\n",
18 | "Since we won't be doing any model-training in this step, we don't need the 'movie_reviews' corpus. However, we will still need to extract features from our input before each prediction, so we make sure 'punkt' and 'stopwords' are available for tokenization and stopword-removal. If you added any other corpuses in Part 2, consider whether they'll be needed in the prediction step."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false,
26 | "pycharm": {
27 | "name": "#%%\n"
28 | }
29 | },
30 | "outputs": [],
31 | "source": [
32 | "from nltk import download\n",
33 | "\n",
34 | "download('punkt')\n",
35 | "download('stopwords')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "### Define feature extractor and bag-of-words converter\n",
43 | "\n",
44 | "IMPORTANT: your predictions will only work properly if you use the same feature extractor that you trained your model with, so copy your updated `extract_features` method over from Part 2, replacing the method below. "
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "collapsed": false
52 | },
53 | "outputs": [],
54 | "source": [
55 | "from nltk.corpus import stopwords\n",
56 | "from string import punctuation\n",
57 | "\n",
58 | "stopwords_eng = stopwords.words('english')\n",
59 | "\n",
60 | "def extract_features(words):\n",
61 | " return [w for w in words if w not in stopwords_eng and w not in punctuation]\n",
62 | "\n",
63 | "def bag_of_words(words):\n",
64 | " bag = {}\n",
65 | " for w in words:\n",
66 | " bag[w] = bag.get(w,0)+1\n",
67 | " return bag"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {
73 | "collapsed": true
74 | },
75 | "source": [
76 | "### Import your pickled model file (non-Colab version)\n",
77 | "\n",
78 | "In Part 2, we saved the trained model as \"sa_classifier.pickle\". Now we'll unpickle that file to get it back into memory. Either copy that file into the same folder as this notebook (\"part3\"), or adjust the path below to \"../part2/sa_classifier.pickle\" so it reads the file from the folder where it was saved."
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {
85 | "collapsed": false
86 | },
87 | "outputs": [],
88 | "source": [
89 | "import pickle\n",
90 | "import sys\n",
91 | "\n",
92 | "if not 'google.colab' in sys.modules:\n",
93 | " model_file = open('sa_classifier.pickle', 'rb')\n",
94 | " model = pickle.load(model_file)\n",
95 | " model_file.close()"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "### Import your pickled model file (Colab version)\n",
103 | "\n",
104 | "If you're running this notebook on Colab, we need to retrieve the pickled model from [Google Drive](https://drive.google.com) before we can unpickle it. This code looks for \"sa_classifier.pickle\" in a folder called \"Colab Output\"; if you have moved the file elsewhere, change the path below."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": true
112 | },
113 | "outputs": [],
114 | "source": [
115 | "import pickle\n",
116 | "import sys\n",
117 | "\n",
118 | "if 'google.colab' in sys.modules:\n",
119 | " from google.colab import drive\n",
120 | " drive.mount('/content/gdrive')\n",
121 | " !ls '/content/gdrive/My Drive/Colab Output'\n",
122 | " model_file = open('/content/gdrive/My Drive/Colab Output/sa_classifier.pickle','rb')\n",
123 | " model = pickle.load(model_file)\n",
124 | " model_file.close()\n",
125 | " print('Model loaded from /content/gdrive/My Drive/Colab Output')"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "### Define a method for prediction\n",
133 | "\n",
134 | "In the prediction step, we'll be taking a single piece of text input and asking the model to classify it. Models need the input for the prediction step to have the same format as the data provided during training -- so we must tokenize the input, run the same `extract_features` method that we used during training, and convert it to a bag of words before sending it to the model's `classify` method.\n",
135 | "\n",
136 | "Note: if you have (from Part 2) changed your `extract_features` method to accept the full text instead of a tokenized list, then you can omit the tokenization step here."
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "collapsed": false
144 | },
145 | "outputs": [],
146 | "source": [
147 | "from nltk.tokenize import word_tokenize\n",
148 | "\n",
149 | "def get_sentiment(review):\n",
150 | " words = word_tokenize(review)\n",
151 | " words = extract_features(words)\n",
152 | " words = bag_of_words(words)\n",
153 | " return model.classify(words)"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {
159 | "collapsed": false,
160 | "pycharm": {
161 | "name": "#%%\n"
162 | }
163 | },
164 | "source": [
165 | "### Run a prediction\n",
166 | "\n",
167 | "Test out your `get_sentiment` method on some sample inputs of your own devising: try altering the two reviews below and see how your model performs. It won't be 100% correct, and we're mostly just looking to see that it is able to run at all, but if it seems to *always* be wrong, that may indicate you've missed a critical step above (e.g. you haven't copied over all the changes to your feature extractor from Part 2, or you've loaded the wrong model file, or provided un-tokenized text when a list of words was expected)."
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "collapsed": false
175 | },
176 | "outputs": [],
177 | "source": [
178 | "positive_review = 'This movie is amazing, with witty dialog and beautiful shots.'\n",
179 | "print('positive_review: '+get_sentiment(positive_review))\n",
180 | "\n",
181 | "negative_review = 'I hated everything about this unimaginitive mess. Two thumbs down!'\n",
182 | "print('negative_review: '+get_sentiment(negative_review))"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {
189 | "collapsed": true
190 | },
191 | "outputs": [],
192 | "source": []
193 | }
194 | ],
195 | "metadata": {
196 | "anaconda-cloud": {},
197 | "kernelspec": {
198 | "display_name": "Python 3",
199 | "language": "python",
200 | "name": "python3"
201 | },
202 | "language_info": {
203 | "codemirror_mode": {
204 | "name": "ipython",
205 | "version": 2
206 | },
207 | "file_extension": ".py",
208 | "mimetype": "text/x-python",
209 | "name": "python",
210 | "nbconvert_exporter": "python",
211 | "pygments_lexer": "ipython2",
212 | "version": "2.7.3"
213 | },
214 | "pycharm": {
215 | "stem_cell": {
216 | "cell_type": "raw",
217 | "source": [],
218 | "metadata": {
219 | "collapsed": false
220 | }
221 | }
222 | }
223 | },
224 | "nbformat": 4,
225 | "nbformat_minor": 1
226 | }
--------------------------------------------------------------------------------
/part5/sentiment.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
8 |
21 |
22 |
23 | Enter your review:
24 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------