├── .gitattributes
├── .gitignore
├── Chapter01
│   ├── affinity_dataset.txt
│   ├── ch1_affinity.ipynb
│   ├── ch1_affinity_create.ipynb
│   └── ch1_oner_application.ipynb
├── Chapter02
│   └── Ionosphere%20Nearest%20Neighbour.ipynb
├── Chapter03
│   └── chapter3_nba_prediction.ipynb
├── Chapter04
│   └── ch4_movie_recommendation.ipynb
├── Chapter05
│   ├── .ipynb_checkpoints
│   │   ├── Chapter 5 Adult-checkpoint.ipynb
│   │   └── Chapter 5 Advertisements-checkpoint.ipynb
│   ├── Chapter 5 Adult.ipynb
│   ├── Chapter 5 Advertisements.ipynb
│   └── adult_tests.py
├── Chapter06
│   ├── ch6_classify_twitter.ipynb
│   ├── ch6_create_replicable_dataset.ipynb
│   ├── ch6_get_twitter.ipynb
│   ├── ch6_label_twitter.ipynb
│   └── ch6_recreate_dataset.ipynb
├── Chapter07
│   └── ch7_graph.ipynb
├── Chapter08
│   ├── Old
│   │   └── ch8_CAPTCHA.ipynb
│   └── ch8_CAPTCHA.ipynb
├── Chapter09
│   ├── Chapter 9 Authorship Analysis.ipynb
│   └── Old
│       └── Chapter 9 Authorship Analysis.ipynb
├── Chapter10
│   └── chapter 10.ipynb
├── Chapter11
│   ├── Chapter 11 Keras Introduction.ipynb
│   ├── Chapter 11 Keras Object Recognition.ipynb
│   ├── Chapter 11 TensorFlow Introduction.ipynb
│   └── Chapter 11 update of CAPTCHA.ipynb
├── Chapter12
│   ├── .ipynb_checkpoints
│   │   └── Chapter 12 (NB Predict)-checkpoint.ipynb
│   ├── CH12 MapReduce Basics.ipynb
│   ├── Chapter 12 (NB Predict).ipynb
│   ├── Chapter 12 (Test load).ipynb
│   ├── extract_posts.py
│   ├── nb_predict.py
│   └── nb_train.py
├── LICENSE
└── README.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 |
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 |
--------------------------------------------------------------------------------
/Chapter01/affinity_dataset.txt:
--------------------------------------------------------------------------------
1 | 0 1 0 0 0
2 | 1 1 0 0 0
3 | 0 0 1 0 1
4 | 1 1 0 0 0
5 | 0 0 1 1 1
6 | 0 1 0 0 0
7 | 0 0 1 1 1
8 | 0 0 1 1 0
9 | 0 1 0 1 0
10 | 0 1 0 0 1
11 | 0 0 0 1 0
12 | 1 0 1 0 0
13 | 1 0 0 0 1
14 | 0 1 1 0 0
15 | 0 0 1 0 1
16 | 0 1 0 1 0
17 | 1 1 0 1 1
18 | 0 0 0 1 1
19 | 0 1 0 0 1
20 | 1 1 0 1 0
21 | 0 1 1 0 0
22 | 0 1 0 0 1
23 | 0 0 1 0 0
24 | 1 0 0 0 1
25 | 0 1 0 1 0
26 | 1 0 0 1 1
27 | 0 1 1 0 0
28 | 0 1 0 0 1
29 | 0 0 0 0 1
30 | 1 0 0 0 1
31 | 0 1 0 1 1
32 | 1 0 0 0 0
33 | 0 1 0 0 0
34 | 1 0 0 0 0
35 | 0 0 1 1 1
36 | 0 0 1 1 1
37 | 0 0 1 1 1
38 | 1 0 0 1 0
39 | 0 1 0 0 1
40 | 1 1 0 0 0
41 | 0 0 0 0 1
42 | 0 1 0 1 1
43 | 0 1 0 1 0
44 | 0 1 0 0 1
45 | 1 1 1 1 0
46 | 1 0 0 0 1
47 | 0 0 0 1 1
48 | 1 1 0 0 1
49 | 0 1 0 0 0
50 | 0 1 1 0 0
51 | 0 1 0 1 1
52 | 0 1 0 0 1
53 | 0 0 1 1 1
54 | 0 0 0 1 1
55 | 0 0 1 0 0
56 | 0 0 1 1 1
57 | 1 0 0 0 0
58 | 1 1 1 0 1
59 | 0 0 1 1 1
60 | 0 1 0 0 0
61 | 0 0 1 1 0
62 | 0 1 0 0 1
63 | 0 0 1 0 0
64 | 0 1 0 0 0
65 | 1 0 0 0 1
66 | 0 1 0 0 0
67 | 0 1 1 0 1
68 | 0 0 1 0 0
69 | 0 0 1 0 0
70 | 0 0 0 1 1
71 | 0 0 1 0 0
72 | 0 0 1 1 0
73 | 0 1 0 0 0
74 | 0 1 1 1 1
75 | 1 1 0 0 1
76 | 0 0 1 1 0
77 | 0 0 1 1 0
78 | 0 0 1 1 1
79 | 0 0 1 1 1
80 | 0 1 0 0 0
81 | 0 1 0 1 0
82 | 1 1 0 0 1
83 | 0 1 0 0 1
84 | 0 0 1 1 1
85 | 0 1 0 0 1
86 | 0 1 0 1 1
87 | 0 1 0 0 1
88 | 1 0 0 0 0
89 | 1 0 0 1 1
90 | 0 1 1 1 1
91 | 1 0 0 0 1
92 | 0 0 1 0 1
93 | 0 1 1 1 0
94 | 1 1 0 1 1
95 | 1 0 1 0 1
96 | 0 0 1 1 1
97 | 1 1 1 1 0
98 | 0 1 0 0 1
99 | 0 1 0 0 1
100 | 1 1 0 1 1
101 |
--------------------------------------------------------------------------------
/Chapter01/ch1_affinity.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 15,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [
10 | {
11 | "name": "stdout",
12 | "output_type": "stream",
13 | "text": [
14 | "This dataset has 100 samples and 5 features\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import numpy as np\n",
20 | "dataset_filename = \"affinity_dataset.txt\"\n",
21 | "X = np.loadtxt(dataset_filename)\n",
22 | "n_samples, n_features = X.shape\n",
23 | "print(\"This dataset has {0} samples and {1} features\".format(n_samples, n_features))"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 16,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "[[ 1. 0. 1. 0. 0.]\n",
38 | " [ 0. 0. 1. 1. 1.]\n",
39 | " [ 1. 1. 0. 0. 1.]\n",
40 | " [ 0. 1. 0. 1. 0.]\n",
41 | " [ 0. 1. 0. 0. 1.]]\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "print(X[:5])"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 17,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "# The names of the features, for your reference.\n",
58 | "features = [\"bread\", \"milk\", \"cheese\", \"apples\", \"bananas\"]"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "In our first example, we will compute the Support and Confidence of the rule \"If a person buys Apples, they also buy Bananas\"."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 18,
71 | "metadata": {
72 | "collapsed": false
73 | },
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "37 people bought Apples\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "# First, how many rows contain our premise: that a person is buying apples\n",
85 | "num_apple_purchases = 0\n",
86 | "for sample in X:\n",
87 | " if sample[3] == 1: # This person bought Apples\n",
88 | " num_apple_purchases += 1\n",
89 | "print(\"{0} people bought Apples\".format(num_apple_purchases))"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 19,
95 | "metadata": {
96 | "collapsed": false
97 | },
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "27 cases of the rule being valid were discovered\n",
104 | "10 cases of the rule being invalid were discovered\n"
105 | ]
106 | }
107 | ],
108 | "source": [
109 | "# In how many of the cases where a person bought Apples did they also buy Bananas?\n",
110 | "# Record both the cases where the rule is valid and those where it is invalid.\n",
111 | "rule_valid = 0\n",
112 | "rule_invalid = 0\n",
113 | "for sample in X:\n",
114 | " if sample[3] == 1: # This person bought Apples\n",
115 | " if sample[4] == 1:\n",
116 | " # This person bought both Apples and Bananas\n",
117 | " rule_valid += 1\n",
118 | " else:\n",
119 | " # This person bought Apples, but not Bananas\n",
120 | " rule_invalid += 1\n",
121 | "print(\"{0} cases of the rule being valid were discovered\".format(rule_valid))\n",
122 | "print(\"{0} cases of the rule being invalid were discovered\".format(rule_invalid))"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 20,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "The support is 27 and the confidence is 0.730.\n",
137 | "As a percentage, that is 73.0%.\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "# Now we have all the information needed to compute Support and Confidence\n",
143 | "support = rule_valid # The Support is the number of times the rule is discovered.\n",
144 | "confidence = rule_valid / num_apple_purchases\n",
145 | "print(\"The support is {0} and the confidence is {1:.3f}.\".format(support, confidence))\n",
146 | "# Confidence can be thought of as a percentage using the following:\n",
147 | "print(\"As a percentage, that is {0:.1f}%.\".format(100 * confidence))"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 21,
153 | "metadata": {
154 | "collapsed": false
155 | },
156 | "outputs": [],
157 | "source": [
158 | "from collections import defaultdict\n",
159 | "# Now compute for all possible rules\n",
160 | "valid_rules = defaultdict(int)\n",
161 | "invalid_rules = defaultdict(int)\n",
162 | "num_occurences = defaultdict(int)\n",
163 | "\n",
164 | "for sample in X:\n",
165 | " for premise in range(n_features):\n",
166 | " if sample[premise] == 0: continue\n",
167 | "        # Record that the premise was bought in this transaction\n",
168 | " num_occurences[premise] += 1\n",
169 | " for conclusion in range(n_features):\n",
170 | " if premise == conclusion: # It makes little sense to measure if X -> X.\n",
171 | " continue\n",
172 | " if sample[conclusion] == 1:\n",
173 | " # This person also bought the conclusion item\n",
174 | " valid_rules[(premise, conclusion)] += 1\n",
175 | " else:\n",
176 | " # This person bought the premise, but not the conclusion\n",
177 | " invalid_rules[(premise, conclusion)] += 1\n",
178 | "support = valid_rules\n",
179 | "confidence = defaultdict(float)\n",
180 | "for premise, conclusion in valid_rules.keys():\n",
181 | " confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 22,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "Rule: If a person buys bread they will also buy milk\n",
196 | " - Confidence: 0.444\n",
197 | " - Support: 12\n",
198 | "\n",
199 | "Rule: If a person buys milk they will also buy cheese\n",
200 | " - Confidence: 0.264\n",
201 | " - Support: 14\n",
202 | "\n",
203 | "Rule: If a person buys apples they will also buy cheese\n",
204 | " - Confidence: 0.703\n",
205 | " - Support: 26\n",
206 | "\n",
207 | "Rule: If a person buys milk they will also buy apples\n",
208 | " - Confidence: 0.226\n",
209 | " - Support: 12\n",
210 | "\n",
211 | "Rule: If a person buys apples they will also buy bread\n",
212 | " - Confidence: 0.108\n",
213 | " - Support: 4\n",
214 | "\n",
215 | "Rule: If a person buys apples they will also buy bananas\n",
216 | " - Confidence: 0.730\n",
217 | " - Support: 27\n",
218 | "\n",
219 | "Rule: If a person buys apples they will also buy milk\n",
220 | " - Confidence: 0.324\n",
221 | " - Support: 12\n",
222 | "\n",
223 | "Rule: If a person buys milk they will also buy bananas\n",
224 | " - Confidence: 0.566\n",
225 | " - Support: 30\n",
226 | "\n",
227 | "Rule: If a person buys bread they will also buy cheese\n",
228 | " - Confidence: 0.185\n",
229 | " - Support: 5\n",
230 | "\n",
231 | "Rule: If a person buys cheese they will also buy bread\n",
232 | " - Confidence: 0.109\n",
233 | " - Support: 5\n",
234 | "\n",
235 | "Rule: If a person buys cheese they will also buy apples\n",
236 | " - Confidence: 0.565\n",
237 | " - Support: 26\n",
238 | "\n",
239 | "Rule: If a person buys cheese they will also buy milk\n",
240 | " - Confidence: 0.304\n",
241 | " - Support: 14\n",
242 | "\n",
243 | "Rule: If a person buys bananas they will also buy apples\n",
244 | " - Confidence: 0.435\n",
245 | " - Support: 27\n",
246 | "\n",
247 | "Rule: If a person buys bread they will also buy bananas\n",
248 | " - Confidence: 0.556\n",
249 | " - Support: 15\n",
250 | "\n",
251 | "Rule: If a person buys milk they will also buy bread\n",
252 | " - Confidence: 0.226\n",
253 | " - Support: 12\n",
254 | "\n",
255 | "Rule: If a person buys bananas they will also buy cheese\n",
256 | " - Confidence: 0.484\n",
257 | " - Support: 30\n",
258 | "\n",
259 | "Rule: If a person buys bread they will also buy apples\n",
260 | " - Confidence: 0.148\n",
261 | " - Support: 4\n",
262 | "\n",
263 | "Rule: If a person buys bananas they will also buy milk\n",
264 | " - Confidence: 0.484\n",
265 | " - Support: 30\n",
266 | "\n",
267 | "Rule: If a person buys cheese they will also buy bananas\n",
268 | " - Confidence: 0.652\n",
269 | " - Support: 30\n",
270 | "\n",
271 | "Rule: If a person buys bananas they will also buy bread\n",
272 | " - Confidence: 0.242\n",
273 | " - Support: 15\n",
274 | "\n"
275 | ]
276 | }
277 | ],
278 | "source": [
279 | "for premise, conclusion in confidence:\n",
280 | " premise_name = features[premise]\n",
281 | " conclusion_name = features[conclusion]\n",
282 | " print(\"Rule: If a person buys {0} they will also buy {1}\".format(premise_name, conclusion_name))\n",
283 | " print(\" - Confidence: {0:.3f}\".format(confidence[(premise, conclusion)]))\n",
284 | " print(\" - Support: {0}\".format(support[(premise, conclusion)]))\n",
285 | " print(\"\")"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 23,
291 | "metadata": {
292 | "collapsed": false
293 | },
294 | "outputs": [],
295 | "source": [
296 | "def print_rule(premise, conclusion, support, confidence, features):\n",
297 | " premise_name = features[premise]\n",
298 | " conclusion_name = features[conclusion]\n",
299 | " print(\"Rule: If a person buys {0} they will also buy {1}\".format(premise_name, conclusion_name))\n",
300 | " print(\" - Confidence: {0:.3f}\".format(confidence[(premise, conclusion)]))\n",
301 | " print(\" - Support: {0}\".format(support[(premise, conclusion)]))\n",
302 | " print(\"\")"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 24,
308 | "metadata": {
309 | "collapsed": false
310 | },
311 | "outputs": [
312 | {
313 | "name": "stdout",
314 | "output_type": "stream",
315 | "text": [
316 | "Rule: If a person buys milk they will also buy apples\n",
317 | " - Confidence: 0.226\n",
318 | " - Support: 12\n",
319 | "\n"
320 | ]
321 | }
322 | ],
323 | "source": [
324 | "premise = 1\n",
325 | "conclusion = 3\n",
326 | "print_rule(premise, conclusion, support, confidence, features)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 25,
332 | "metadata": {
333 | "collapsed": false
334 | },
335 | "outputs": [
336 | {
337 | "name": "stdout",
338 | "output_type": "stream",
339 | "text": [
340 | "[((0, 1), 12),\n",
341 | " ((1, 2), 14),\n",
342 | " ((3, 2), 26),\n",
343 | " ((1, 3), 12),\n",
344 | " ((0, 3), 4),\n",
345 | " ((3, 0), 4),\n",
346 | " ((4, 1), 30),\n",
347 | " ((3, 1), 12),\n",
348 | " ((1, 4), 30),\n",
349 | " ((2, 4), 30),\n",
350 | " ((2, 0), 5),\n",
351 | " ((2, 3), 26),\n",
352 | " ((2, 1), 14),\n",
353 | " ((4, 3), 27),\n",
354 | " ((0, 4), 15),\n",
355 | " ((4, 2), 30),\n",
356 | " ((1, 0), 12),\n",
357 | " ((3, 4), 27),\n",
358 | " ((0, 2), 5),\n",
359 | " ((4, 0), 15)]\n"
360 | ]
361 | }
362 | ],
363 | "source": [
364 | "# Show the support for each rule; we will sort by it next\n",
365 | "from pprint import pprint\n",
366 | "pprint(list(support.items()))"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 26,
372 | "metadata": {
373 | "collapsed": false
374 | },
375 | "outputs": [],
376 | "source": [
377 | "from operator import itemgetter\n",
378 | "sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 27,
384 | "metadata": {
385 | "collapsed": false
386 | },
387 | "outputs": [
388 | {
389 | "name": "stdout",
390 | "output_type": "stream",
391 | "text": [
392 | "Rule #1\n",
393 | "Rule: If a person buys bananas they will also buy milk\n",
394 | " - Confidence: 0.484\n",
395 | " - Support: 30\n",
396 | "\n",
397 | "Rule #2\n",
398 | "Rule: If a person buys milk they will also buy bananas\n",
399 | " - Confidence: 0.566\n",
400 | " - Support: 30\n",
401 | "\n",
402 | "Rule #3\n",
403 | "Rule: If a person buys cheese they will also buy bananas\n",
404 | " - Confidence: 0.652\n",
405 | " - Support: 30\n",
406 | "\n",
407 | "Rule #4\n",
408 | "Rule: If a person buys bananas they will also buy cheese\n",
409 | " - Confidence: 0.484\n",
410 | " - Support: 30\n",
411 | "\n",
412 | "Rule #5\n",
413 | "Rule: If a person buys bananas they will also buy apples\n",
414 | " - Confidence: 0.435\n",
415 | " - Support: 27\n",
416 | "\n"
417 | ]
418 | }
419 | ],
420 | "source": [
421 | "for index in range(5):\n",
422 | " print(\"Rule #{0}\".format(index + 1))\n",
423 | " (premise, conclusion) = sorted_support[index][0]\n",
424 | " print_rule(premise, conclusion, support, confidence, features)"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 28,
430 | "metadata": {
431 | "collapsed": false
432 | },
433 | "outputs": [],
434 | "source": [
435 | "sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": 29,
441 | "metadata": {
442 | "collapsed": false
443 | },
444 | "outputs": [
445 | {
446 | "name": "stdout",
447 | "output_type": "stream",
448 | "text": [
449 | "Rule #1\n",
450 | "Rule: If a person buys apples they will also buy bananas\n",
451 | " - Confidence: 0.730\n",
452 | " - Support: 27\n",
453 | "\n",
454 | "Rule #2\n",
455 | "Rule: If a person buys apples they will also buy cheese\n",
456 | " - Confidence: 0.703\n",
457 | " - Support: 26\n",
458 | "\n",
459 | "Rule #3\n",
460 | "Rule: If a person buys cheese they will also buy bananas\n",
461 | " - Confidence: 0.652\n",
462 | " - Support: 30\n",
463 | "\n",
464 | "Rule #4\n",
465 | "Rule: If a person buys milk they will also buy bananas\n",
466 | " - Confidence: 0.566\n",
467 | " - Support: 30\n",
468 | "\n",
469 | "Rule #5\n",
470 | "Rule: If a person buys cheese they will also buy apples\n",
471 | " - Confidence: 0.565\n",
472 | " - Support: 26\n",
473 | "\n"
474 | ]
475 | }
476 | ],
477 | "source": [
478 | "for index in range(5):\n",
479 | " print(\"Rule #{0}\".format(index + 1))\n",
480 | " (premise, conclusion) = sorted_confidence[index][0]\n",
481 | " print_rule(premise, conclusion, support, confidence, features)"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": null,
487 | "metadata": {
488 | "collapsed": false
489 | },
490 | "outputs": [],
491 | "source": []
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": null,
496 | "metadata": {
497 | "collapsed": true
498 | },
499 | "outputs": [],
500 | "source": []
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "metadata": {
506 | "collapsed": true
507 | },
508 | "outputs": [],
509 | "source": []
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": null,
514 | "metadata": {
515 | "collapsed": true
516 | },
517 | "outputs": [],
518 | "source": []
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": null,
523 | "metadata": {
524 | "collapsed": true
525 | },
526 | "outputs": [],
527 | "source": []
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": null,
532 | "metadata": {
533 | "collapsed": true
534 | },
535 | "outputs": [],
536 | "source": []
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": null,
541 | "metadata": {
542 | "collapsed": true
543 | },
544 | "outputs": [],
545 | "source": []
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "metadata": {
551 | "collapsed": true
552 | },
553 | "outputs": [],
554 | "source": []
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": null,
559 | "metadata": {
560 | "collapsed": true
561 | },
562 | "outputs": [],
563 | "source": []
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": null,
568 | "metadata": {
569 | "collapsed": true
570 | },
571 | "outputs": [],
572 | "source": []
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": null,
577 | "metadata": {
578 | "collapsed": true
579 | },
580 | "outputs": [],
581 | "source": []
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": null,
586 | "metadata": {
587 | "collapsed": true
588 | },
589 | "outputs": [],
590 | "source": []
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": null,
595 | "metadata": {
596 | "collapsed": true
597 | },
598 | "outputs": [],
599 | "source": []
600 | },
601 | {
602 | "cell_type": "code",
603 | "execution_count": null,
604 | "metadata": {
605 | "collapsed": true
606 | },
607 | "outputs": [],
608 | "source": []
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": null,
613 | "metadata": {
614 | "collapsed": true
615 | },
616 | "outputs": [],
617 | "source": []
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": null,
622 | "metadata": {
623 | "collapsed": true
624 | },
625 | "outputs": [],
626 | "source": []
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": null,
631 | "metadata": {
632 | "collapsed": true
633 | },
634 | "outputs": [],
635 | "source": []
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": null,
640 | "metadata": {
641 | "collapsed": true
642 | },
643 | "outputs": [],
644 | "source": []
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": null,
649 | "metadata": {
650 | "collapsed": true
651 | },
652 | "outputs": [],
653 | "source": []
654 | }
655 | ],
656 | "metadata": {
657 | "kernelspec": {
658 | "display_name": "Python 3",
659 | "language": "python",
660 | "name": "python3"
661 | },
662 | "language_info": {
663 | "codemirror_mode": {
664 | "name": "ipython",
665 | "version": 3
666 | },
667 | "file_extension": ".py",
668 | "mimetype": "text/x-python",
669 | "name": "python",
670 | "nbconvert_exporter": "python",
671 | "pygments_lexer": "ipython3",
672 | "version": "3.5.1"
673 | }
674 | },
675 | "nbformat": 4,
676 | "nbformat_minor": 0
677 | }
678 |
--------------------------------------------------------------------------------
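
The notebook above derives rule support and confidence by looping over every transaction. As a rough cross-check (a sketch added here, not a file from the repository), the same counts can be obtained with a single matrix product over the saved affinity_dataset.txt; the feature names are the ones the notebook defines.

```python
# Sketch only: a vectorized equivalent of the notebook's loop-based
# support/confidence counts, assuming affinity_dataset.txt as saved above.
import numpy as np

X = np.loadtxt("affinity_dataset.txt")
features = ["bread", "milk", "cheese", "apples", "bananas"]

co_counts = X.T @ X                # co_counts[i, j]: transactions containing both items i and j
item_counts = np.diag(co_counts)   # transactions containing item i at all

for premise in range(len(features)):
    for conclusion in range(len(features)):
        if premise == conclusion:
            continue
        support = int(co_counts[premise, conclusion])
        confidence = support / item_counts[premise]
        print("{0} -> {1}: support={2}, confidence={3:.3f}".format(
            features[premise], features[conclusion], support, confidence))
```
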
/Chapter01/ch1_affinity_create.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 7,
17 | "metadata": {
18 | "collapsed": false
19 | },
20 | "outputs": [],
21 | "source": [
22 | "X = np.zeros((100, 5), dtype='bool')\n",
23 | "features = [\"bread\", \"milk\", \"cheese\", \"apples\", \"bananas\"]"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 8,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [],
33 | "source": [
34 | "for i in range(X.shape[0]):\n",
35 | " if np.random.random() < 0.3:\n",
36 | " # A bread winner\n",
37 | " X[i][0] = 1\n",
38 | " if np.random.random() < 0.5:\n",
39 | " # Who likes milk\n",
40 | " X[i][1] = 1\n",
41 | " if np.random.random() < 0.2:\n",
42 | " # Who likes cheese\n",
43 | " X[i][2] = 1\n",
44 | " if np.random.random() < 0.25:\n",
45 | " # Who likes apples\n",
46 | " X[i][3] = 1\n",
47 | " if np.random.random() < 0.5:\n",
48 | " # Who likes bananas\n",
49 | " X[i][4] = 1\n",
50 | " else:\n",
51 | " # Not a bread winner\n",
52 | " if np.random.random() < 0.5:\n",
53 | " # Who likes milk\n",
54 | " X[i][1] = 1\n",
55 | " if np.random.random() < 0.2:\n",
56 | " # Who likes cheese\n",
57 | " X[i][2] = 1\n",
58 | " if np.random.random() < 0.25:\n",
59 | " # Who likes apples\n",
60 | " X[i][3] = 1\n",
61 | " if np.random.random() < 0.5:\n",
62 | " # Who likes bananas\n",
63 | " X[i][4] = 1\n",
64 | " else:\n",
65 | " if np.random.random() < 0.8:\n",
66 | " # Who likes cheese\n",
67 | " X[i][2] = 1\n",
68 | " if np.random.random() < 0.6:\n",
69 | " # Who likes apples\n",
70 | " X[i][3] = 1\n",
71 | " if np.random.random() < 0.7:\n",
72 | " # Who likes bananas\n",
73 | " X[i][4] = 1\n",
74 | " if X[i].sum() == 0:\n",
75 | " X[i][4] = 1 # Must buy something, so gets bananas\n"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 9,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "[[False True False False False]\n",
90 | " [ True True False False False]\n",
91 | " [False False True False True]\n",
92 | " [ True True False False False]\n",
93 | " [False False True True True]]\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "print(X[:5])"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 10,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [],
108 | "source": [
109 | "np.savetxt(\"affinity_dataset.txt\", X, fmt='%d')"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [],
119 | "source": []
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {
125 | "collapsed": true
126 | },
127 | "outputs": [],
128 | "source": []
129 | }
130 | ],
131 | "metadata": {
132 | "kernelspec": {
133 | "display_name": "Python 3",
134 | "language": "python",
135 | "name": "python3"
136 | },
137 | "language_info": {
138 | "codemirror_mode": {
139 | "name": "ipython",
140 | "version": 3
141 | },
142 | "file_extension": ".py",
143 | "mimetype": "text/x-python",
144 | "name": "python",
145 | "nbconvert_exporter": "python",
146 | "pygments_lexer": "ipython3",
147 | "version": "3.5.1"
148 | }
149 | },
150 | "nbformat": 4,
151 | "nbformat_minor": 0
152 | }
153 |
--------------------------------------------------------------------------------
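
One caveat worth noting: the generator above calls np.random.random() without fixing a seed, so every run of ch1_affinity_create.ipynb writes a different affinity_dataset.txt and the exact counts shown in ch1_affinity.ipynb will not reproduce. A minimal sketch of one way to pin this down (the seed value is an arbitrary choice, not something used in the repository):

```python
# Sketch: fix NumPy's global random state before running the generation cell,
# so the saved affinity_dataset.txt is identical on every run.
import numpy as np

np.random.seed(14)  # arbitrary fixed seed; any constant gives a repeatable dataset

# ... run the notebook's generation cell here, which draws from np.random.random() ...

# Quick sanity check of the per-product purchase counts after the file is saved:
X = np.loadtxt("affinity_dataset.txt")
print("Purchases per product:", X.sum(axis=0).astype(int))
```
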
/Chapter01/ch1_oner_application.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 23,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "The OneR algorithm is quite simple but can be quite effective, showing the power of using even basic statistics in many applications.\n",
19 | "The algorithm is:\n",
20 | "\n",
21 | "* For each variable\n",
22 | " * For each value of the variable\n",
23 | "    * The prediction based on this variable is the most frequent class\n",
24 | " * Compute the error of this prediction\n",
25 | " * Sum the prediction errors for all values of the variable\n",
26 | "* Use the variable with the lowest error"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 24,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "Iris Plants Database\n",
41 | "\n",
42 | "Notes\n",
43 | "-----\n",
44 | "Data Set Characteristics:\n",
45 | " :Number of Instances: 150 (50 in each of three classes)\n",
46 | " :Number of Attributes: 4 numeric, predictive attributes and the class\n",
47 | " :Attribute Information:\n",
48 | " - sepal length in cm\n",
49 | " - sepal width in cm\n",
50 | " - petal length in cm\n",
51 | " - petal width in cm\n",
52 | " - class:\n",
53 | " - Iris-Setosa\n",
54 | " - Iris-Versicolour\n",
55 | " - Iris-Virginica\n",
56 | " :Summary Statistics:\n",
57 | "\n",
58 | " ============== ==== ==== ======= ===== ====================\n",
59 | " Min Max Mean SD Class Correlation\n",
60 | " ============== ==== ==== ======= ===== ====================\n",
61 | " sepal length: 4.3 7.9 5.84 0.83 0.7826\n",
62 | " sepal width: 2.0 4.4 3.05 0.43 -0.4194\n",
63 | " petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n",
64 | " petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n",
65 | " ============== ==== ==== ======= ===== ====================\n",
66 | "\n",
67 | " :Missing Attribute Values: None\n",
68 | " :Class Distribution: 33.3% for each of 3 classes.\n",
69 | " :Creator: R.A. Fisher\n",
70 | " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n",
71 | " :Date: July, 1988\n",
72 | "\n",
73 | "This is a copy of UCI ML iris datasets.\n",
74 | "http://archive.ics.uci.edu/ml/datasets/Iris\n",
75 | "\n",
76 | "The famous Iris database, first used by Sir R.A Fisher\n",
77 | "\n",
78 | "This is perhaps the best known database to be found in the\n",
79 | "pattern recognition literature. Fisher's paper is a classic in the field and\n",
80 | "is referenced frequently to this day. (See Duda & Hart, for example.) The\n",
81 | "data set contains 3 classes of 50 instances each, where each class refers to a\n",
82 | "type of iris plant. One class is linearly separable from the other 2; the\n",
83 | "latter are NOT linearly separable from each other.\n",
84 | "\n",
85 | "References\n",
86 | "----------\n",
87 | " - Fisher,R.A. \"The use of multiple measurements in taxonomic problems\"\n",
88 | " Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n",
89 | " Mathematical Statistics\" (John Wiley, NY, 1950).\n",
90 | " - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n",
91 | " (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n",
92 | " - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n",
93 | " Structure and Classification Rule for Recognition in Partially Exposed\n",
94 | " Environments\". IEEE Transactions on Pattern Analysis and Machine\n",
95 | " Intelligence, Vol. PAMI-2, No. 1, 67-71.\n",
96 | " - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\n",
97 | " on Information Theory, May 1972, 431-433.\n",
98 | " - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\n",
99 | " conceptual clustering system finds 3 classes in the data.\n",
100 | " - Many, many more ...\n",
101 | "\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "# Load our dataset\n",
107 | "from sklearn.datasets import load_iris\n",
108 | "#X, y = np.loadtxt(\"X_classification.txt\"), np.loadtxt(\"y_classification.txt\")\n",
109 | "dataset = load_iris()\n",
110 | "X = dataset.data\n",
111 | "y = dataset.target\n",
112 | "print(dataset.DESCR)\n",
113 | "n_samples, n_features = X.shape"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "Our attributes are continuous, but OneR requires categorical features, so we first apply a *preprocessing* step called discretisation. Here we use a simple procedure: compute the mean of each attribute and record whether each value is above or below that mean."
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 25,
126 | "metadata": {
127 | "collapsed": false
128 | },
129 | "outputs": [],
130 | "source": [
131 | "# Compute the mean for each attribute\n",
132 | "attribute_means = X.mean(axis=0)\n",
133 | "assert attribute_means.shape == (n_features,)\n",
134 | "X_d = np.array(X >= attribute_means, dtype='int')"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 26,
140 | "metadata": {
141 | "collapsed": false
142 | },
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "There are 112 training samples\n",
149 | "There are 38 testing samples\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "# Now, we split into a training and test set\n",
155 | "from sklearn.cross_validation import train_test_split\n",
156 | "\n",
157 | "# Set the random state to the same number to get the same results as in the book\n",
158 | "random_state = 14\n",
159 | "\n",
160 | "X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)\n",
161 | "print(\"There are {} training samples\".format(y_train.shape[0]))\n",
162 | "print(\"There are {} testing samples\".format(y_test.shape[0]))"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 27,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [],
172 | "source": [
173 | "from collections import defaultdict\n",
174 | "from operator import itemgetter\n",
175 | "\n",
176 | "\n",
177 | "def train(X, y_true, feature):\n",
178 | " \"\"\"Computes the predictors and error for a given feature using the OneR algorithm\n",
179 | " \n",
180 | " Parameters\n",
181 | " ----------\n",
182 | " X: array [n_samples, n_features]\n",
183 | " The two dimensional array that holds the dataset. Each row is a sample, each column\n",
184 | " is a feature.\n",
185 | " \n",
186 | " y_true: array [n_samples,]\n",
187 | " The one dimensional array that holds the class values. Corresponds to X, such that\n",
188 | " y_true[i] is the class value for sample X[i].\n",
189 | " \n",
190 | " feature: int\n",
191 | "        An integer corresponding to the index of the feature we wish to test.\n",
192 | "        0 <= feature < n_features\n",
193 | " \n",
194 | " Returns\n",
195 | " -------\n",
196 | "    predictors: dictionary of {value: prediction}\n",
197 | " For each item in the array, if the variable has a given value, make the given prediction.\n",
198 | " \n",
199 | " error: float\n",
200 | " The ratio of training data that this rule incorrectly predicts.\n",
201 | " \"\"\"\n",
202 | " # Check that variable is a valid number\n",
203 | " n_samples, n_features = X.shape\n",
204 | " assert 0 <= feature < n_features\n",
205 | " # Get all of the unique values that this variable has\n",
206 | " values = set(X[:,feature])\n",
207 | " # Stores the predictors array that is returned\n",
208 | " predictors = dict()\n",
209 | " errors = []\n",
210 | " for current_value in values:\n",
211 | " most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)\n",
212 | " predictors[current_value] = most_frequent_class\n",
213 | " errors.append(error)\n",
214 | " # Compute the total error of using this feature to classify on\n",
215 | " total_error = sum(errors)\n",
216 | " return predictors, total_error\n",
217 | "\n",
218 | "# Compute what our predictors say each sample is based on its value\n",
219 | "#y_predicted = np.array([predictors[sample[feature]] for sample in X])\n",
220 | " \n",
221 | "\n",
222 | "def train_feature_value(X, y_true, feature, value):\n",
223 | "    # Create a simple dictionary to count how frequently each class appears for this feature value\n",
224 | " class_counts = defaultdict(int)\n",
225 | " # Iterate through each sample and count the frequency of each class/value pair\n",
226 | " for sample, y in zip(X, y_true):\n",
227 | " if sample[feature] == value:\n",
228 | " class_counts[y] += 1\n",
229 | " # Now get the best one by sorting (highest first) and choosing the first item\n",
230 | " sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)\n",
231 | " most_frequent_class = sorted_class_counts[0][0]\n",
232 | " # The error is the number of samples that do not classify as the most frequent class\n",
233 | " # *and* have the feature value.\n",
234 | "    n_samples = X.shape[0]\n",
235 | " error = sum([class_count for class_value, class_count in class_counts.items()\n",
236 | " if class_value != most_frequent_class])\n",
237 | " return most_frequent_class, error"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 28,
243 | "metadata": {
244 | "collapsed": false
245 | },
246 | "outputs": [
247 | {
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "The best model is based on variable 2 and has error 37.00\n",
252 | "{'variable': 2, 'predictor': {0: 0, 1: 2}}\n"
253 | ]
254 | }
255 | ],
256 | "source": [
257 | "# Compute all of the predictors\n",
258 | "all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}\n",
259 | "errors = {variable: error for variable, (mapping, error) in all_predictors.items()}\n",
260 | "# Now choose the best and save that as \"model\"\n",
261 | "# Sort by error\n",
262 | "best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]\n",
263 | "print(\"The best model is based on variable {0} and has error {1:.2f}\".format(best_variable, best_error))\n",
264 | "\n",
265 | "# Choose the best model\n",
266 | "model = {'variable': best_variable,\n",
267 | " 'predictor': all_predictors[best_variable][0]}\n",
268 | "print(model)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 29,
274 | "metadata": {
275 | "collapsed": false
276 | },
277 | "outputs": [],
278 | "source": [
279 | "def predict(X_test, model):\n",
280 | " variable = model['variable']\n",
281 | " predictor = model['predictor']\n",
282 | " y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])\n",
283 | " return y_predicted"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 30,
289 | "metadata": {
290 | "collapsed": false
291 | },
292 | "outputs": [
293 | {
294 | "name": "stdout",
295 | "output_type": "stream",
296 | "text": [
297 | "[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2\n",
298 | " 2]\n"
299 | ]
300 | }
301 | ],
302 | "source": [
303 | "y_predicted = predict(X_test, model)\n",
304 | "print(y_predicted)"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 31,
310 | "metadata": {
311 | "collapsed": false
312 | },
313 | "outputs": [
314 | {
315 | "name": "stdout",
316 | "output_type": "stream",
317 | "text": [
318 | "The test accuracy is 65.8%\n"
319 | ]
320 | }
321 | ],
322 | "source": [
323 | "# Compute the accuracy as the fraction of predictions that match y_test\n",
324 | "accuracy = np.mean(y_predicted == y_test) * 100\n",
325 | "print(\"The test accuracy is {:.1f}%\".format(accuracy))"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 32,
331 | "metadata": {
332 | "collapsed": false
333 | },
334 | "outputs": [],
335 | "source": [
336 | "from sklearn.metrics import classification_report"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 33,
342 | "metadata": {
343 | "collapsed": false
344 | },
345 | "outputs": [
346 | {
347 | "name": "stdout",
348 | "output_type": "stream",
349 | "text": [
350 | " precision recall f1-score support\n",
351 | "\n",
352 | " 0 0.94 1.00 0.97 17\n",
353 | " 1 0.00 0.00 0.00 13\n",
354 | " 2 0.40 1.00 0.57 8\n",
355 | "\n",
356 | "avg / total 0.51 0.66 0.55 38\n",
357 | "\n"
358 | ]
359 | },
360 | {
361 | "name": "stderr",
362 | "output_type": "stream",
363 | "text": [
364 | "/home/matt/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
365 | " 'precision', 'predicted', average, warn_for)\n"
366 | ]
367 | }
368 | ],
369 | "source": [
370 | "print(classification_report(y_test, y_predicted))"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {
377 | "collapsed": false
378 | },
379 | "outputs": [],
380 | "source": []
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {
386 | "collapsed": false
387 | },
388 | "outputs": [],
389 | "source": []
390 | }
391 | ],
392 | "metadata": {
393 | "kernelspec": {
394 | "display_name": "Python 3",
395 | "language": "python",
396 | "name": "python3"
397 | },
398 | "language_info": {
399 | "codemirror_mode": {
400 | "name": "ipython",
401 | "version": 3
402 | },
403 | "file_extension": ".py",
404 | "mimetype": "text/x-python",
405 | "name": "python",
406 | "nbconvert_exporter": "python",
407 | "pygments_lexer": "ipython3",
408 | "version": "3.5.1"
409 | }
410 | },
411 | "nbformat": 4,
412 | "nbformat_minor": 0
413 | }
414 |
--------------------------------------------------------------------------------
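
The notebook above imports train_test_split from sklearn.cross_validation, a module that newer scikit-learn releases no longer ship (it moved to sklearn.model_selection). The following is a minimal, self-contained sketch of the same OneR procedure described in the notebook's markdown (discretise each attribute on its mean, keep the single feature with the lowest training error) using the current import path; it is a reconstruction for reference, not code from the repository, and exact accuracy can vary with library versions.

```python
# Minimal OneR sketch with the current scikit-learn import path
# (sklearn.model_selection instead of the removed sklearn.cross_validation).
from collections import Counter

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

dataset = load_iris()
X = (dataset.data >= dataset.data.mean(axis=0)).astype(int)  # discretise on the mean
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)

def train_feature(X, y, feature):
    """Return ({feature value: predicted class}, training error) for one feature."""
    predictors, total_error = {}, 0
    for value in np.unique(X[:, feature]):
        counts = Counter(y[X[:, feature] == value])
        most_frequent_class, frequency = counts.most_common(1)[0]
        predictors[value] = most_frequent_class
        total_error += sum(counts.values()) - frequency
    return predictors, total_error

# Train one rule per feature, keep the feature with the lowest training error.
models = {f: train_feature(X_train, y_train, f) for f in range(X_train.shape[1])}
best_feature = min(models, key=lambda f: models[f][1])
predictor = models[best_feature][0]

y_pred = np.array([predictor[value] for value in X_test[:, best_feature]])
print("Best feature: {}, test accuracy: {:.1f}%".format(
    best_feature, 100 * np.mean(y_pred == y_test)))
```
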
/Chapter04/ch4_movie_recommendation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import os\n",
12 | "import pandas as pd\n",
13 | "data_folder = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"ml-100k\")\n",
14 | "ratings_filename = os.path.join(data_folder, \"u.data\")"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "all_ratings = pd.read_csv(ratings_filename, delimiter=\"\\t\", header=None, names = [\"UserID\", \"MovieID\", \"Rating\", \"Datetime\"])"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/html": [
[HTML table output stripped in this dump; the same dataframe rows appear in the text/plain block below]
88 | ],
89 | "text/plain": [
90 | " UserID MovieID Rating Datetime\n",
91 | "0 196 242 3 881250949\n",
92 | "1 186 302 3 891717742\n",
93 | "2 22 377 1 878887116\n",
94 | "3 244 51 2 880606923\n",
95 | "4 166 346 1 886397596"
96 | ]
97 | },
98 | "execution_count": 3,
99 | "metadata": {},
100 | "output_type": "execute_result"
101 | }
102 | ],
103 | "source": [
104 | "all_ratings.head()"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 4,
110 | "metadata": {
111 | "collapsed": true
112 | },
113 | "outputs": [],
114 | "source": [
115 | "all_ratings[\"Datetime\"] = pd.to_datetime(all_ratings['Datetime'], unit='s')"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 5,
121 | "metadata": {
122 | "collapsed": false
123 | },
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/html": [
[HTML table output stripped in this dump; the same dataframe rows appear in the text/plain block below]
178 | ],
179 | "text/plain": [
180 | " UserID MovieID Rating Datetime\n",
181 | "0 196 242 3 1997-12-04 15:55:49\n",
182 | "1 186 302 3 1998-04-04 19:22:22\n",
183 | "2 22 377 1 1997-11-07 07:18:36\n",
184 | "3 244 51 2 1997-11-27 05:02:03\n",
185 | "4 166 346 1 1998-02-02 05:33:16"
186 | ]
187 | },
188 | "execution_count": 5,
189 | "metadata": {},
190 | "output_type": "execute_result"
191 | }
192 | ],
193 | "source": [
194 | "all_ratings.head()"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 6,
200 | "metadata": {
201 | "collapsed": true
202 | },
203 | "outputs": [],
204 | "source": [
205 | "all_ratings[\"Favorable\"] = all_ratings[\"Rating\"] > 3"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 7,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "ratings = all_ratings[all_ratings['UserID'].isin(range(200))]"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 8,
222 | "metadata": {
223 | "collapsed": true
224 | },
225 | "outputs": [],
226 | "source": [
227 | "favorable_ratings = ratings[ratings[\"Favorable\"]]"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 9,
233 | "metadata": {
234 | "collapsed": true
235 | },
236 | "outputs": [],
237 | "source": [
238 | "favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby(\"UserID\")[\"MovieID\"])"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 10,
244 | "metadata": {
245 | "collapsed": true
246 | },
247 | "outputs": [],
248 | "source": [
249 | "num_favorable_by_movie = ratings[[\"MovieID\", \"Favorable\"]].groupby(\"MovieID\").sum()"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 11,
255 | "metadata": {
256 | "collapsed": false
257 | },
258 | "outputs": [
259 | {
260 | "data": {
261 | "text/html": [
[HTML table output stripped in this dump; the same favourable-review counts appear in the text/plain block below]
298 | ],
299 | "text/plain": [
300 | " Favorable\n",
301 | "MovieID \n",
302 | "50 100.0\n",
303 | "100 89.0\n",
304 | "258 83.0\n",
305 | "181 79.0\n",
306 | "174 74.0"
307 | ]
308 | },
309 | "execution_count": 11,
310 | "metadata": {},
311 | "output_type": "execute_result"
312 | }
313 | ],
314 | "source": [
315 | "num_favorable_by_movie.sort_values(by=\"Favorable\", ascending=False).head()"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 12,
321 | "metadata": {
322 | "collapsed": true
323 | },
324 | "outputs": [],
325 | "source": [
326 | "frequent_itemsets = {}"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 13,
332 | "metadata": {
333 | "collapsed": true
334 | },
335 | "outputs": [],
336 | "source": [
337 | "min_support = 50"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 14,
343 | "metadata": {
344 | "collapsed": true
345 | },
346 | "outputs": [],
347 | "source": [
348 | "frequent_itemsets[1] = dict((frozenset((movie_id,)), row[\"Favorable\"])\n",
349 | " for movie_id, row in num_favorable_by_movie.iterrows()\n",
350 | " if row[\"Favorable\"] > min_support)"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 15,
356 | "metadata": {
357 | "collapsed": true
358 | },
359 | "outputs": [],
360 | "source": [
361 | "from collections import defaultdict\n",
362 | "def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):\n",
363 | " counts = defaultdict(int)\n",
364 | " for user, reviews in favorable_reviews_by_users.items():\n",
365 | " for itemset in k_1_itemsets:\n",
366 | " if itemset.issubset(reviews):\n",
367 | " for other_reviewed_movie in reviews - itemset:\n",
368 | " current_superset = itemset | frozenset((other_reviewed_movie,))\n",
369 | " counts[current_superset] += 1\n",
370 | " return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 16,
376 | "metadata": {
377 | "collapsed": false
378 | },
379 | "outputs": [
380 | {
381 | "name": "stdout",
382 | "output_type": "stream",
383 | "text": [
384 | "There are 16 movies with more than 50 favorable reviews\n",
385 | "I found 93 frequent itemsets of length 2\n",
386 | "I found 295 frequent itemsets of length 3\n",
387 | "I found 593 frequent itemsets of length 4\n",
388 | "I found 785 frequent itemsets of length 5\n",
389 | "I found 677 frequent itemsets of length 6\n",
390 | "I found 373 frequent itemsets of length 7\n",
391 | "I found 126 frequent itemsets of length 8\n",
392 | "I found 24 frequent itemsets of length 9\n",
393 | "I found 2 frequent itemsets of length 10\n",
394 | "Did not find any frequent itemsets of length 11\n"
395 | ]
396 | }
397 | ],
398 | "source": [
399 | "import sys\n",
400 | "frequent_itemsets = {} # itemsets are sorted by length\n",
401 | "min_support = 50\n",
402 | "\n",
403 | "# k=1 candidates are the movies with more than min_support favourable reviews\n",
404 | "frequent_itemsets[1] = dict((frozenset((movie_id,)), row[\"Favorable\"])\n",
405 | " for movie_id, row in num_favorable_by_movie.iterrows()\n",
406 | " if row[\"Favorable\"] > min_support)\n",
407 | "\n",
408 | "print(\"There are {} movies with more than {} favorable reviews\".format(len(frequent_itemsets[1]), min_support))\n",
409 | "sys.stdout.flush()\n",
410 | "for k in range(2, 20):\n",
411 | " # Generate candidates of length k, using the frequent itemsets of length k-1\n",
412 | " # Only store the frequent itemsets\n",
413 | " cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],\n",
414 | " min_support)\n",
415 | " if len(cur_frequent_itemsets) == 0:\n",
416 | " print(\"Did not find any frequent itemsets of length {}\".format(k))\n",
417 | " sys.stdout.flush()\n",
418 | " break\n",
419 | " else:\n",
420 | " print(\"I found {} frequent itemsets of length {}\".format(len(cur_frequent_itemsets), k))\n",
421 | " #print(cur_frequent_itemsets)\n",
422 | " sys.stdout.flush()\n",
423 | " frequent_itemsets[k] = cur_frequent_itemsets\n",
424 | "# We aren't interested in the itemsets of length 1, so remove those\n",
425 | "del frequent_itemsets[1]"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 17,
431 | "metadata": {
432 | "collapsed": false
433 | },
434 | "outputs": [
435 | {
436 | "name": "stdout",
437 | "output_type": "stream",
438 | "text": [
439 | "There are 15285 candidate rules\n"
440 | ]
441 | }
442 | ],
443 | "source": [
444 | "# Now we create the association rules. Each remains a candidate until its confidence has been tested\n",
445 | "candidate_rules = []\n",
446 | "for itemset_length, itemset_counts in frequent_itemsets.items():\n",
447 | " for itemset in itemset_counts.keys():\n",
448 | " for conclusion in itemset:\n",
449 | " premise = itemset - set((conclusion,))\n",
450 | " candidate_rules.append((premise, conclusion))\n",
451 | "print(\"There are {} candidate rules\".format(len(candidate_rules)))"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 18,
457 | "metadata": {
458 | "collapsed": false
459 | },
460 | "outputs": [
461 | {
462 | "name": "stdout",
463 | "output_type": "stream",
464 | "text": [
465 | "[(frozenset({79}), 258), (frozenset({258}), 79), (frozenset({50}), 64), (frozenset({64}), 50), (frozenset({127}), 181)]\n"
466 | ]
467 | }
468 | ],
469 | "source": [
470 | "print(candidate_rules[:5])"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 19,
476 | "metadata": {
477 | "collapsed": true
478 | },
479 | "outputs": [],
480 | "source": [
481 | "# Now, we compute the confidence of each of these rules. This is very similar to what we did in chapter 1\n",
482 | "correct_counts = defaultdict(int)\n",
483 | "incorrect_counts = defaultdict(int)\n",
484 | "for user, reviews in favorable_reviews_by_users.items():\n",
485 | " for candidate_rule in candidate_rules:\n",
486 | " premise, conclusion = candidate_rule\n",
487 | " if premise.issubset(reviews):\n",
488 | " if conclusion in reviews:\n",
489 | " correct_counts[candidate_rule] += 1\n",
490 | " else:\n",
491 | " incorrect_counts[candidate_rule] += 1\n",
492 | "rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])\n",
493 | " for candidate_rule in candidate_rules}"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 20,
499 | "metadata": {
500 | "collapsed": true
501 | },
502 | "outputs": [],
503 | "source": [
504 | "# Choose only rules above a minimum confidence level\n",
505 | "min_confidence = 0.9"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 21,
511 | "metadata": {
512 | "collapsed": false
513 | },
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "5152\n"
520 | ]
521 | }
522 | ],
523 | "source": [
524 | "# Filter out the rules with poor confidence\n",
525 | "rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}\n",
526 | "print(len(rule_confidence))"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 22,
532 | "metadata": {
533 | "collapsed": true
534 | },
535 | "outputs": [],
536 | "source": [
537 | "from operator import itemgetter\n",
538 | "sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": 23,
544 | "metadata": {
545 | "collapsed": false
546 | },
547 | "outputs": [
548 | {
549 | "name": "stdout",
550 | "output_type": "stream",
551 | "text": [
552 | "Rule #1\n",
553 | "Rule: If a person recommends frozenset({64, 98, 56, 50, 7}) they will also recommend 174\n",
554 | " - Confidence: 1.000\n",
555 | "\n",
556 | "Rule #2\n",
557 | "Rule: If a person recommends frozenset({98, 100, 172, 79, 50, 56}) they will also recommend 7\n",
558 | " - Confidence: 1.000\n",
559 | "\n",
560 | "Rule #3\n",
561 | "Rule: If a person recommends frozenset({98, 172, 181, 174, 7}) they will also recommend 50\n",
562 | " - Confidence: 1.000\n",
563 | "\n",
564 | "Rule #4\n",
565 | "Rule: If a person recommends frozenset({64, 98, 100, 7, 172, 50}) they will also recommend 174\n",
566 | " - Confidence: 1.000\n",
567 | "\n",
568 | "Rule #5\n",
569 | "Rule: If a person recommends frozenset({64, 1, 7, 172, 79, 50}) they will also recommend 181\n",
570 | " - Confidence: 1.000\n",
571 | "\n"
572 | ]
573 | }
574 | ],
575 | "source": [
576 | "for index in range(5):\n",
577 | " print(\"Rule #{0}\".format(index + 1))\n",
578 | " (premise, conclusion) = sorted_confidence[index][0]\n",
579 | " print(\"Rule: If a person recommends {0} they will also recommend {1}\".format(premise, conclusion))\n",
580 | " print(\" - Confidence: {0:.3f}\".format(rule_confidence[(premise, conclusion)]))\n",
581 | " print(\"\")"
582 | ]
583 | },
584 | {
585 | "cell_type": "code",
586 | "execution_count": 24,
587 | "metadata": {
588 | "collapsed": true
589 | },
590 | "outputs": [],
591 | "source": [
592 | "# Even better, we can get the movie titles themselves from the dataset\n",
593 | "movie_name_filename = os.path.join(data_folder, \"u.item\")\n",
594 | "movie_name_data = pd.read_csv(movie_name_filename, delimiter=\"|\", header=None, encoding = \"mac-roman\")\n",
595 | "movie_name_data.columns = [\"MovieID\", \"Title\", \"Release Date\", \"Video Release\", \"IMDB\", \"\", \"Action\", \"Adventure\",\n",
596 | " \"Animation\", \"Children's\", \"Comedy\", \"Crime\", \"Documentary\", \"Drama\", \"Fantasy\", \"Film-Noir\",\n",
597 | " \"Horror\", \"Musical\", \"Mystery\", \"Romance\", \"Sci-Fi\", \"Thriller\", \"War\", \"Western\"]"
598 | ]
599 | },
600 | {
601 | "cell_type": "code",
602 | "execution_count": 31,
603 | "metadata": {
604 | "collapsed": false
605 | },
606 | "outputs": [
607 | {
608 | "data": {
609 | "text/html": [
[HTML table output stripped in this dump; the same 5 rows x 24 columns appear in the text/plain block below]
763 | ],
764 | "text/plain": [
765 | " MovieID Title Release Date Video Release \\\n",
766 | "0 1 Toy Story (1995) 01-Jan-1995 NaN \n",
767 | "1 2 GoldenEye (1995) 01-Jan-1995 NaN \n",
768 | "2 3 Four Rooms (1995) 01-Jan-1995 NaN \n",
769 | "3 4 Get Shorty (1995) 01-Jan-1995 NaN \n",
770 | "4 5 Copycat (1995) 01-Jan-1995 NaN \n",
771 | "\n",
772 | " IMDB Action \\\n",
773 | "0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 \n",
774 | "1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 \n",
775 | "2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 \n",
776 | "3 http://us.imdb.com/M/title-exact?Get%20Shorty%... 0 1 \n",
777 | "4 http://us.imdb.com/M/title-exact?Copycat%20(1995) 0 0 \n",
778 | "\n",
779 | " Adventure Animation Children's ... Fantasy Film-Noir Horror \\\n",
780 | "0 0 1 1 ... 0 0 0 \n",
781 | "1 1 0 0 ... 0 0 0 \n",
782 | "2 0 0 0 ... 0 0 0 \n",
783 | "3 0 0 0 ... 0 0 0 \n",
784 | "4 0 0 0 ... 0 0 0 \n",
785 | "\n",
786 | " Musical Mystery Romance Sci-Fi Thriller War Western \n",
787 | "0 0 0 0 0 0 0 0 \n",
788 | "1 0 0 0 0 1 0 0 \n",
789 | "2 0 0 0 0 1 0 0 \n",
790 | "3 0 0 0 0 0 0 0 \n",
791 | "4 0 0 0 0 1 0 0 \n",
792 | "\n",
793 | "[5 rows x 24 columns]"
794 | ]
795 | },
796 | "execution_count": 31,
797 | "metadata": {},
798 | "output_type": "execute_result"
799 | }
800 | ],
801 | "source": [
802 | "movie_name_data.head()"
803 | ]
804 | },
805 | {
806 | "cell_type": "code",
807 | "execution_count": 25,
808 | "metadata": {
809 | "collapsed": true
810 | },
811 | "outputs": [],
812 | "source": [
813 | "def get_movie_name(movie_id):\n",
814 | " title_object = movie_name_data[movie_name_data[\"MovieID\"] == movie_id][\"Title\"]\n",
815 | " title = title_object.values[0]\n",
816 | " return title"
817 | ]
818 | },
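Note: get_movie_name above filters the whole DataFrame with a boolean mask on every call. A minimal alternative, assuming movie_name_data is loaded as in the earlier cell, is to build a MovieID-to-Title dict once (the name get_movie_name_fast is illustrative, not from the notebook):

movie_names = dict(zip(movie_name_data["MovieID"], movie_name_data["Title"]))

def get_movie_name_fast(movie_id):
    # Plain dict lookup instead of scanning the DataFrame each time.
    return movie_names[movie_id]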
819 | {
820 | "cell_type": "code",
821 | "execution_count": 26,
822 | "metadata": {
823 | "collapsed": false
824 | },
825 | "outputs": [
826 | {
827 | "data": {
828 | "text/plain": [
829 | "'Get Shorty (1995)'"
830 | ]
831 | },
832 | "execution_count": 26,
833 | "metadata": {},
834 | "output_type": "execute_result"
835 | }
836 | ],
837 | "source": [
838 | "get_movie_name(4)"
839 | ]
840 | },
841 | {
842 | "cell_type": "code",
843 | "execution_count": 27,
844 | "metadata": {
845 | "collapsed": false
846 | },
847 | "outputs": [
848 | {
849 | "name": "stdout",
850 | "output_type": "stream",
851 | "text": [
852 | "Rule #1\n",
853 | "Rule: If a person recommends Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Star Wars (1977), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)\n",
854 | " - Confidence: 1.000\n",
855 | "\n",
856 | "Rule #2\n",
857 | "Rule: If a person recommends Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) they will also recommend Twelve Monkeys (1995)\n",
858 | " - Confidence: 1.000\n",
859 | "\n",
860 | "Rule #3\n",
861 | "Rule: If a person recommends Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) they will also recommend Star Wars (1977)\n",
862 | " - Confidence: 1.000\n",
863 | "\n",
864 | "Rule #4\n",
865 | "Rule: If a person recommends Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Fargo (1996), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Star Wars (1977) they will also recommend Raiders of the Lost Ark (1981)\n",
866 | " - Confidence: 1.000\n",
867 | "\n",
868 | "Rule #5\n",
869 | "Rule: If a person recommends Shawshank Redemption, The (1994), Toy Story (1995), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977) they will also recommend Return of the Jedi (1983)\n",
870 | " - Confidence: 1.000\n",
871 | "\n"
872 | ]
873 | }
874 | ],
875 | "source": [
876 | "for index in range(5):\n",
877 | " print(\"Rule #{0}\".format(index + 1))\n",
878 | " (premise, conclusion) = sorted_confidence[index][0]\n",
879 | " premise_names = \", \".join(get_movie_name(idx) for idx in premise)\n",
880 | " conclusion_name = get_movie_name(conclusion)\n",
881 | " print(\"Rule: If a person recommends {0} they will also recommend {1}\".format(premise_names, conclusion_name))\n",
882 | " print(\" - Confidence: {0:.3f}\".format(rule_confidence[(premise, conclusion)]))\n",
883 | " print(\"\")"
884 | ]
885 | },
886 | {
887 | "cell_type": "code",
888 | "execution_count": 33,
889 | "metadata": {
890 | "collapsed": true
891 | },
892 | "outputs": [],
893 | "source": [
894 | "# Evaluation using test data\n",
895 | "test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]\n",
896 | "test_favorable = test_dataset[test_dataset[\"Favorable\"]]\n",
897 | "test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby(\"UserID\")[\"MovieID\"])"
898 | ]
899 | },
900 | {
901 | "cell_type": "code",
902 | "execution_count": 34,
903 | "metadata": {
904 | "collapsed": true
905 | },
906 | "outputs": [],
907 | "source": [
908 | "correct_counts = defaultdict(int)\n",
909 | "incorrect_counts = defaultdict(int)\n",
910 | "for user, reviews in test_favorable_by_users.items():\n",
911 | " for candidate_rule in candidate_rules:\n",
912 | " premise, conclusion = candidate_rule\n",
913 | " if premise.issubset(reviews):\n",
914 | " if conclusion in reviews:\n",
915 | " correct_counts[candidate_rule] += 1\n",
916 | " else:\n",
917 | " incorrect_counts[candidate_rule] += 1"
918 | ]
919 | },
920 | {
921 | "cell_type": "code",
922 | "execution_count": 35,
923 | "metadata": {
924 | "collapsed": false
925 | },
926 | "outputs": [
927 | {
928 | "name": "stdout",
929 | "output_type": "stream",
930 | "text": [
931 | "5152\n"
932 | ]
933 | }
934 | ],
935 | "source": [
936 | "test_confidence = {candidate_rule:\n",
937 | " (correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]))\n",
938 | " for candidate_rule in rule_confidence}\n",
939 | "print(len(test_confidence))"
940 | ]
941 | },
942 | {
943 | "cell_type": "code",
944 | "execution_count": 37,
945 | "metadata": {
946 | "collapsed": false
947 | },
948 | "outputs": [
949 | {
950 | "name": "stdout",
951 | "output_type": "stream",
952 | "text": [
953 | "Rule #1\n",
954 | "Rule: If a person recommends Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Star Wars (1977), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)\n",
955 | " - Train Confidence: 1.000\n",
956 | " - Test Confidence: 0.909\n",
957 | "\n",
958 | "Rule #2\n",
959 | "Rule: If a person recommends Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) they will also recommend Twelve Monkeys (1995)\n",
960 | " - Train Confidence: 1.000\n",
961 | " - Test Confidence: 0.609\n",
962 | "\n",
963 | "Rule #3\n",
964 | "Rule: If a person recommends Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) they will also recommend Star Wars (1977)\n",
965 | " - Train Confidence: 1.000\n",
966 | " - Test Confidence: 0.946\n",
967 | "\n",
968 | "Rule #4\n",
969 | "Rule: If a person recommends Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Fargo (1996), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Star Wars (1977) they will also recommend Raiders of the Lost Ark (1981)\n",
970 | " - Train Confidence: 1.000\n",
971 | " - Test Confidence: 0.971\n",
972 | "\n",
973 | "Rule #5\n",
974 | "Rule: If a person recommends Shawshank Redemption, The (1994), Toy Story (1995), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977) they will also recommend Return of the Jedi (1983)\n",
975 | " - Train Confidence: 1.000\n",
976 | " - Test Confidence: 0.900\n",
977 | "\n",
978 | "Rule #6\n",
979 | "Rule: If a person recommends Toy Story (1995), Silence of the Lambs, The (1991), Fargo (1996), Raiders of the Lost Ark (1981), Godfather, The (1972) they will also recommend Pulp Fiction (1994)\n",
980 | " - Train Confidence: 1.000\n",
981 | " - Test Confidence: 0.750\n",
982 | "\n",
983 | "Rule #7\n",
984 | "Rule: If a person recommends Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Godfather, The (1972), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)\n",
985 | " - Train Confidence: 1.000\n",
986 | " - Test Confidence: 0.854\n",
987 | "\n",
988 | "Rule #8\n",
989 | "Rule: If a person recommends Pulp Fiction (1994), Toy Story (1995), Shawshank Redemption, The (1994), Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)\n",
990 | " - Train Confidence: 1.000\n",
991 | " - Test Confidence: 0.870\n",
992 | "\n",
993 | "Rule #9\n",
994 | "Rule: If a person recommends Shawshank Redemption, The (1994), Fargo (1996), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Fugitive, The (1993) they will also recommend Pulp Fiction (1994)\n",
995 | " - Train Confidence: 1.000\n",
996 | " - Test Confidence: 0.756\n",
997 | "\n",
998 | "Rule #10\n",
999 | "Rule: If a person recommends Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Raiders of the Lost Ark (1981), Fugitive, The (1993), Star Wars (1977), Return of the Jedi (1983) they will also recommend Pulp Fiction (1994)\n",
1000 | " - Train Confidence: 1.000\n",
1001 | " - Test Confidence: 0.756\n",
1002 | "\n"
1003 | ]
1004 | }
1005 | ],
1006 | "source": [
1007 | "for index in range(10):\n",
1008 | " print(\"Rule #{0}\".format(index + 1))\n",
1009 | " (premise, conclusion) = sorted_confidence[index][0]\n",
1010 | " premise_names = \", \".join(get_movie_name(idx) for idx in premise)\n",
1011 | " conclusion_name = get_movie_name(conclusion)\n",
1012 | " print(\"Rule: If a person recommends {0} they will also recommend {1}\".format(premise_names, conclusion_name))\n",
1013 | " print(\" - Train Confidence: {0:.3f}\".format(rule_confidence.get((premise, conclusion), -1)))\n",
1014 | " print(\" - Test Confidence: {0:.3f}\".format(test_confidence.get((premise, conclusion), -1)))\n",
1015 | " print(\"\")"
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "code",
1020 | "execution_count": null,
1021 | "metadata": {
1022 | "collapsed": true
1023 | },
1024 | "outputs": [],
1025 | "source": []
1026 | }
1027 | ],
1028 | "metadata": {
1029 | "kernelspec": {
1030 | "display_name": "Python 3",
1031 | "language": "python",
1032 | "name": "python3"
1033 | },
1034 | "language_info": {
1035 | "codemirror_mode": {
1036 | "name": "ipython",
1037 | "version": 3
1038 | },
1039 | "file_extension": ".py",
1040 | "mimetype": "text/x-python",
1041 | "name": "python",
1042 | "nbconvert_exporter": "python",
1043 | "pygments_lexer": "ipython3",
1044 | "version": "3.5.1"
1045 | }
1046 | },
1047 | "nbformat": 4,
1048 | "nbformat_minor": 0
1049 | }
1050 |
--------------------------------------------------------------------------------
/Chapter05/adult_tests.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numpy.testing import assert_array_equal
3 | # MeanDiscrete is the transformer built in the Chapter 5 Adult notebook; define or import it before running this test.
4 | def test_meandiscrete():
5 | X_test = np.array([[ 0, 2],
6 | [ 3, 5],
7 | [ 6, 8],
8 | [ 9, 11],
9 | [12, 14],
10 | [15, 17],
11 | [18, 20],
12 | [21, 23],
13 | [24, 26],
14 | [27, 29]])
15 | mean_discrete = MeanDiscrete()
16 | mean_discrete.fit(X_test)
17 | assert_array_equal(mean_discrete.mean, np.array([13.5, 15.5]))
18 | X_transformed = mean_discrete.transform(X_test)
19 | X_expected = np.array([[ 0, 0],
20 | [ 0, 0],
21 | [ 0, 0],
22 | [ 0, 0],
23 | [ 0, 0],
24 | [ 1, 1],
25 | [ 1, 1],
26 | [ 1, 1],
27 | [ 1, 1],
28 | [ 1, 1]])
29 | assert_array_equal(X_transformed, X_expected)
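The test above pins down how MeanDiscrete, implemented in the Chapter 5 Adult notebook rather than in this file, is expected to behave: fit stores each column's mean and transform binarises every value against that mean. A minimal sketch consistent with the test (not the book's exact code) is:

import numpy as np
from sklearn.base import TransformerMixin

class MeanDiscrete(TransformerMixin):
    """Binarise each feature by whether it is above its column mean (sketch)."""
    def fit(self, X, y=None):
        X = np.asarray(X, dtype=float)
        self.mean = X.mean(axis=0)          # [13.5, 15.5] for the test data above
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float)
        return (X > self.mean).astype(int)  # 1 where the value exceeds the mean, else 0

With such a class defined or imported, the test runs under pytest (pytest adult_tests.py).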
--------------------------------------------------------------------------------
/Chapter06/ch6_classify_twitter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 15,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import spacy\n",
12 | "from sklearn.base import TransformerMixin\n",
13 | "\n",
14 | "# Create a spaCy parser\n",
15 | "nlp = spacy.load('en')\n",
16 | "\n",
17 | "\n",
18 | "class BagOfWords(TransformerMixin):\n",
19 | " def fit(self, X, y=None):\n",
20 | " return self\n",
21 | " \n",
22 | " def transform(self, X):\n",
23 | " results = []\n",
24 | " for document in X:\n",
25 | " row = {}\n",
26 | " for word in list(nlp(document, tag=False, parse=False, entity=False)):\n",
27 | " if len(word.text.strip()):\n",
28 | " row[word.text] = True\n",
29 | " results.append(row)\n",
30 | " return results"
31 | ]
32 | },
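spacy.load('en') and the tag/parse/entity keyword arguments above are the spaCy 1.x API that the book targets. On spaCy 2 or 3, a rough equivalent of the tokenisation step is the following sketch (the model name and the disabled components are assumptions, check your installed pipeline):

import spacy

# Load a small English pipeline and skip the components BagOfWords does not need.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
row = {token.text: True for token in nlp("a python tweet") if token.text.strip()}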
33 | {
34 | "cell_type": "code",
35 | "execution_count": 22,
36 | "metadata": {
37 | "collapsed": true
38 | },
39 | "outputs": [],
40 | "source": [
41 | "from sklearn.feature_extraction import DictVectorizer"
42 | ]
43 | },
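DictVectorizer turns the word-to-True dictionaries produced by BagOfWords into a numeric matrix with one column per distinct word, which is what BernoulliNB expects. A tiny illustrative run (the two documents are made up):

from sklearn.feature_extraction import DictVectorizer

docs = [{"python": True, "rocks": True}, {"python": True, "jobs": True}]
vec = DictVectorizer(sparse=False)
X = vec.fit_transform(docs)
print(vec.feature_names_)  # ['jobs', 'python', 'rocks']
print(X)                   # rows: [0. 1. 1.] and [1. 1. 0.]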
44 | {
45 | "cell_type": "code",
46 | "execution_count": 23,
47 | "metadata": {
48 | "collapsed": true
49 | },
50 | "outputs": [],
51 | "source": [
52 | "from sklearn.naive_bayes import BernoulliNB"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 24,
58 | "metadata": {
59 | "collapsed": true
60 | },
61 | "outputs": [],
62 | "source": [
63 | "import os\n",
64 | "input_filename = os.path.join(os.path.expanduser(\"~\"), \"data/datasets\", \"twitter\", \"python_tweets.json\")\n",
65 | "labels_filename = os.path.join(os.path.expanduser(\"~\"), \"data/datasets\", \"twitter\", \"python_classes.json\")"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 31,
71 | "metadata": {
72 | "collapsed": false
73 | },
74 | "outputs": [],
75 | "source": [
76 | "import json\n",
77 | "\n",
78 | "tweets = []\n",
79 | "with open(input_filename) as inf:\n",
80 | " for line in inf:\n",
81 | " if len(line.strip()) == 0: continue\n",
82 | " tweets.append(json.loads(line)['text'])\n",
83 | "\n",
84 | "with open(labels_filename) as inf:\n",
85 | " labels = json.load(inf)\n",
86 | "\n",
87 | "# Ensure only classified tweets are loaded\n",
88 | "tweets = tweets[:len(labels)]\n",
89 | "assert len(tweets) == len(labels)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 32,
95 | "metadata": {
96 | "collapsed": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "from sklearn.pipeline import Pipeline\n",
101 | "\n",
102 | "pipeline = Pipeline([('bag-of-words', BagOfWords()), ('vectorizer', DictVectorizer()), ('naive-bayes', BernoulliNB()) ])"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 33,
108 | "metadata": {
109 | "collapsed": false
110 | },
111 | "outputs": [
112 | {
113 | "name": "stdout",
114 | "output_type": "stream",
115 | "text": [
116 | "Score: 0.684\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "from sklearn.cross_validation import cross_val_score\n",
122 | "scores = cross_val_score(pipeline, tweets, labels, scoring='f1')\n",
123 | "#We then print out the average of the scores:\n",
124 | "import numpy as np\n",
125 | "print(\"Score: {:.3f}\".format(np.mean(scores)))"
126 | ]
127 | },
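sklearn.cross_validation, imported in the cell above, was deprecated in scikit-learn 0.18 and removed in 0.20; on current releases the same function is imported from model_selection:

# Drop-in replacement for the deprecated module used above (scikit-learn >= 0.20).
from sklearn.model_selection import cross_val_score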
128 | {
129 | "cell_type": "code",
130 | "execution_count": 35,
131 | "metadata": {
132 | "collapsed": true
133 | },
134 | "outputs": [],
135 | "source": [
136 | "model = pipeline.fit(tweets, labels)"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 38,
142 | "metadata": {
143 | "collapsed": true
144 | },
145 | "outputs": [],
146 | "source": [
147 | "nb = model.named_steps['naive-bayes']\n",
148 | "feature_probabilities = nb.feature_log_prob_"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 39,
154 | "metadata": {
155 | "collapsed": false
156 | },
157 | "outputs": [],
158 | "source": [
159 | "top_features = np.argsort(-nb.feature_log_prob_[1])[:50]"
160 | ]
161 | },
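np.argsort(-nb.feature_log_prob_[1]) in the cell above gives feature indices ordered from highest to lowest log probability. A quick check of that idiom:

import numpy as np

probs = np.array([0.2, 0.9, 0.5])
print(np.argsort(-probs))  # [1 2 0], the index of the largest value comes first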
162 | {
163 | "cell_type": "code",
164 | "execution_count": 40,
165 | "metadata": {
166 | "collapsed": true
167 | },
168 | "outputs": [],
169 | "source": [
170 | "dv = model.named_steps['vectorizer']"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 41,
176 | "metadata": {
177 | "collapsed": false
178 | },
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "0 : 0.53125\n",
185 | "1 # 0.51875\n",
186 | "2 Python 0.4875\n",
187 | "3 python 0.40625\n",
188 | "4 RT 0.26875\n",
189 | "5 in 0.21875\n",
190 | "6 - 0.2\n",
191 | "7 to 0.19375\n",
192 | "8 , 0.1875\n",
193 | "9 for 0.175\n",
194 | "10 and 0.1375\n",
195 | "11 . 0.125\n",
196 | "12 ? 0.11875\n",
197 | "13 the 0.10625\n",
198 | "14 ) 0.10625\n",
199 | "15 ( 0.10625\n",
200 | "16 of 0.1\n",
201 | "17 with 0.1\n",
202 | "18 I 0.08125\n",
203 | "19 a 0.08125\n",
204 | "20 A 0.06875\n",
205 | "21 via 0.06875\n",
206 | "22 jobs 0.0625\n",
207 | "23 ! 0.05625\n",
208 | "24 an 0.05625\n",
209 | "25 from 0.05\n",
210 | "26 How 0.05\n",
211 | "27 Data 0.05\n",
212 | "28 this 0.05\n",
213 | "29 Developer 0.05\n",
214 | "30 data 0.05\n",
215 | "31 current 0.04375\n",
216 | "32 installing 0.04375\n",
217 | "33 Top 0.04375\n",
218 | "34 by 0.04375\n",
219 | "35 library 0.04375\n",
220 | "36 status 0.04375\n",
221 | "37 30 0.0375\n",
222 | "38 And 0.0375\n",
223 | "39 C++ 0.0375\n",
224 | "40 Tech 0.0375\n",
225 | "41 Job 0.0375\n",
226 | "42 or 0.0375\n",
227 | "43 looking 0.0375\n",
228 | "44 3 0.0375\n",
229 | "45 [ 0.0375\n",
230 | "46 ] 0.0375\n",
231 | "47 @shiftkey 0.0375\n",
232 | "48 Django 0.0375\n",
233 | "49 Engineer 0.0375\n"
234 | ]
235 | }
236 | ],
237 | "source": [
238 | "for i, feature_index in enumerate(top_features):\n",
239 | " print(i, dv.feature_names_[feature_index], np.exp(feature_probabilities[1][feature_index]))"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {
246 | "collapsed": true
247 | },
248 | "outputs": [],
249 | "source": []
250 | }
251 | ],
252 | "metadata": {
253 | "kernelspec": {
254 | "display_name": "Python 3",
255 | "language": "python",
256 | "name": "python3"
257 | },
258 | "language_info": {
259 | "codemirror_mode": {
260 | "name": "ipython",
261 | "version": 3
262 | },
263 | "file_extension": ".py",
264 | "mimetype": "text/x-python",
265 | "name": "python",
266 | "nbconvert_exporter": "python",
267 | "pygments_lexer": "ipython3",
268 | "version": "3.5.1"
269 | }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 0
273 | }
274 |
--------------------------------------------------------------------------------
/Chapter06/ch6_create_replicable_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import os\n",
12 | "input_filename = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"python_tweets.json\")\n",
13 | "labels_filename = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"python_classes.json\")\n",
14 | "replicable_dataset = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"replicable_dataset.json\")"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 3,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import json\n",
26 | "tweets = []\n",
27 | "with open(input_filename) as inf:\n",
28 | " for line in inf:\n",
29 | " if len(line.strip()) == 0:\n",
30 | " continue\n",
31 | " tweets.append(json.loads(line))\n",
32 | "if os.path.exists(labels_filename):\n",
33 | " with open(labels_filename) as inf:\n",
34 | " labels = json.load(inf)"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 5,
40 | "metadata": {
41 | "collapsed": false
42 | },
43 | "outputs": [],
44 | "source": [
45 | "dataset = [(tweet['id'], label) for label, tweet in zip(labels, tweets)]"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 6,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [
55 | {
56 | "data": {
57 | "text/plain": [
58 | "(315, 400, 315)"
59 | ]
60 | },
61 | "execution_count": 6,
62 | "metadata": {},
63 | "output_type": "execute_result"
64 | }
65 | ],
66 | "source": [
67 | "len(dataset), len(tweets), len(labels)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 7,
73 | "metadata": {
74 | "collapsed": true
75 | },
76 | "outputs": [],
77 | "source": [
78 | "with open(replicable_dataset, 'w') as outf:\n",
79 | " json.dump(dataset, outf)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {
86 | "collapsed": true
87 | },
88 | "outputs": [],
89 | "source": []
90 | }
91 | ],
92 | "metadata": {
93 | "kernelspec": {
94 | "display_name": "Python 3",
95 | "language": "python",
96 | "name": "python3"
97 | },
98 | "language_info": {
99 | "codemirror_mode": {
100 | "name": "ipython",
101 | "version": 3
102 | },
103 | "file_extension": ".py",
104 | "mimetype": "text/x-python",
105 | "name": "python",
106 | "nbconvert_exporter": "python",
107 | "pygments_lexer": "ipython3",
108 | "version": "3.5.1"
109 | }
110 | },
111 | "nbformat": 4,
112 | "nbformat_minor": 0
113 | }
114 |
--------------------------------------------------------------------------------
/Chapter06/ch6_get_twitter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import twitter\n",
12 | "consumer_key = \"59tyrGqNHCdGB92eFYEsqcjdg\"\n",
13 | "consumer_secret = \"DhglQERO5u936QibJP8YLbu6w60zmrxzl7jM0KmTQZZ0AXhr10\"\n",
14 | "access_token = \"16065520-USf3DBbQAh6ZA8CnSAi6NAUlkorXdppRXpC4cQCKk\"\n",
15 | "access_token_secret = \"DowMQeXqh5ZsGvZGrmUmkI0iCmI34ShFzKF3iOdiilpX5\"\n",
16 | "authorization = twitter.OAuth(access_token, access_token_secret, consumer_key, consumer_secret)"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 5,
22 | "metadata": {
23 | "collapsed": false
24 | },
25 | "outputs": [
26 | {
27 | "name": "stdout",
28 | "output_type": "stream",
29 | "text": [
30 | "/home/robert/data/datasets/twitter/python_tweets.json\n"
31 | ]
32 | }
33 | ],
34 | "source": [
35 | "import os\n",
36 | "output_filename = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"python_tweets.json\")\n",
37 | "print(output_filename)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 6,
43 | "metadata": {
44 | "collapsed": true
45 | },
46 | "outputs": [],
47 | "source": [
48 | "t = twitter.Twitter(auth=authorization)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 9,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "Saved 100 entries\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "import json\n",
68 | "\n",
69 | "n_output = 0\n",
70 | "\n",
71 | "with open(output_filename, 'a') as output_file:\n",
72 | " search_results = t.search.tweets(q=\"python\", count=100)['statuses']\n",
73 | " for tweet in search_results:\n",
74 | " if 'text' in tweet:\n",
75 | " output_file.write(json.dumps(tweet))\n",
76 | " output_file.write(\"\\n\\n\")\n",
77 | " n_output += 1\n",
78 | "\n",
79 | "print(\"Saved {} entries\".format(n_output))"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {
86 | "collapsed": true
87 | },
88 | "outputs": [],
89 | "source": []
90 | }
91 | ],
92 | "metadata": {
93 | "kernelspec": {
94 | "display_name": "Python 3",
95 | "language": "python",
96 | "name": "python3"
97 | },
98 | "language_info": {
99 | "codemirror_mode": {
100 | "name": "ipython",
101 | "version": 3
102 | },
103 | "file_extension": ".py",
104 | "mimetype": "text/x-python",
105 | "name": "python",
106 | "nbconvert_exporter": "python",
107 | "pygments_lexer": "ipython3",
108 | "version": "3.5.1"
109 | }
110 | },
111 | "nbformat": 4,
112 | "nbformat_minor": 0
113 | }
114 |
--------------------------------------------------------------------------------
/Chapter06/ch6_label_twitter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import json\n",
12 | "import os\n",
13 | "\n",
14 | "# Input filename\n",
15 | "input_filename = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"python_tweets.json\")\n",
16 | "# Output filename\n",
17 | "labels_filename = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"python_classes.json\")\n",
18 | "\n",
19 | "tweets = []\n",
20 | "with open(input_filename) as inf:\n",
21 | " for line in inf:\n",
22 | " if len(line.strip()) == 0:\n",
23 | " continue\n",
24 | " tweets.append(json.loads(line))"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {
31 | "collapsed": true
32 | },
33 | "outputs": [],
34 | "source": [
35 | "labels = []\n",
36 | "if os.path.exists(labels_filename):\n",
37 | " with open(labels_filename) as inf:\n",
38 | " labels = json.load(inf)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 12,
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "def get_next_tweet():\n",
50 | " return tweets[len(labels)]['text']"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 13,
56 | "metadata": {
57 | "collapsed": false
58 | },
59 | "outputs": [
60 | {
61 | "data": {
62 | "text/html": [
63 |        "Instructions: Click in text box. Enter a 1 if the tweet is relevant, enter 0 otherwise.\n",
64 |        "Tweet: (the rendered tweet box, input field and labelling JavaScript followed here; the widget's HTML markup was not preserved in this dump)"
104 | ],
105 | "text/plain": [
106 |       "<IPython.core.display.HTML object>"
107 | ]
108 | },
109 | "metadata": {},
110 | "output_type": "display_data"
111 | }
112 | ],
113 | "source": [
114 | "%%html\n",
115 |     "Instructions: Click in text box. Enter a 1 if the tweet is relevant, enter 0 otherwise. \n",
116 |     "Tweet: \n",
117 |     "(the remainder of this %%html cell, namely the tweet display box, the input field and the JavaScript that records each entered label and fetches the next tweet, presumably via the get_next_tweet() helper defined above, was not preserved in this dump)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 14,
161 | "metadata": {
162 | "collapsed": true
163 | },
164 | "outputs": [],
165 | "source": [
166 | "with open(labels_filename, 'w') as outf:\n",
167 | " json.dump(labels, outf)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "collapsed": true
175 | },
176 | "outputs": [],
177 | "source": []
178 | }
179 | ],
180 | "metadata": {
181 | "kernelspec": {
182 | "display_name": "Python 3",
183 | "language": "python",
184 | "name": "python3"
185 | },
186 | "language_info": {
187 | "codemirror_mode": {
188 | "name": "ipython",
189 | "version": 3
190 | },
191 | "file_extension": ".py",
192 | "mimetype": "text/x-python",
193 | "name": "python",
194 | "nbconvert_exporter": "python",
195 | "pygments_lexer": "ipython3",
196 | "version": "3.5.1"
197 | }
198 | },
199 | "nbformat": 4,
200 | "nbformat_minor": 0
201 | }
202 |
--------------------------------------------------------------------------------
/Chapter06/ch6_recreate_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import os\n",
12 | "tweet_filename = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"replicable_python_tweets.json\")\n",
13 | "labels_filename = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"replicable_python_classes.json\")\n",
14 | "replicable_dataset = os.path.join(os.path.expanduser(\"~\"), \"data\", \"datasets\", \"twitter\", \"replicable_dataset.json\")"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 4,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import json\n",
26 | "with open(replicable_dataset) as inf:\n",
27 | " tweet_ids = json.load(inf)"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 5,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "actual_labels = []\n",
39 | "label_mapping = dict(tweet_ids)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 6,
45 | "metadata": {
46 | "collapsed": true
47 | },
48 | "outputs": [],
49 | "source": [
50 | "import twitter\n",
51 | "consumer_key = \"59tyrGqNHCdGB92eFYEsqcjdg\"\n",
52 | "consumer_secret = \"DhglQERO5u936QibJP8YLbu6w60zmrxzl7jM0KmTQZZ0AXhr10\"\n",
53 | "access_token = \"16065520-USf3DBbQAh6ZA8CnSAi6NAUlkorXdppRXpC4cQCKk\"\n",
54 | "access_token_secret = \"DowMQeXqh5ZsGvZGrmUmkI0iCmI34ShFzKF3iOdiilpX5\"\n",
55 | "authorization = twitter.OAuth(access_token, access_token_secret, consumer_key, consumer_secret)\n",
56 | "t = twitter.Twitter(auth=authorization)"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 9,
62 | "metadata": {
63 | "collapsed": false
64 | },
65 | "outputs": [],
66 | "source": [
67 | "all_ids = [tweet_id for tweet_id, label in tweet_ids]\n",
68 | "\n",
69 | "with open(tweet_filename, 'a') as output_file:\n",
70 | " # We can lookup 100 tweets at a time, which saves time in asking twitter for them\n",
71 | " for start_index in range(0, len(all_ids), 100):\n",
72 | " id_string = \",\".join(str(i) for i in all_ids[start_index:start_index+100])\n",
73 | " search_results = t.statuses.lookup(_id=id_string)\n",
74 | " for tweet in search_results:\n",
75 | " if 'text' in tweet:\n",
76 | " # Valid tweet - save to file\n",
77 | " output_file.write(json.dumps(tweet))\n",
78 | " output_file.write(\"\\n\\n\")\n",
79 | " actual_labels.append(label_mapping[tweet['id']])"
80 | ]
81 | },
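The loop above fetches the saved tweet IDs in batches of 100, matching the comment that 100 tweets can be looked up per call. The slicing pattern on its own, with made-up IDs:

all_ids = list(range(250))  # illustrative IDs only
for start_index in range(0, len(all_ids), 100):
    batch = all_ids[start_index:start_index + 100]
    print(len(batch))       # 100, 100, 50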
82 | {
83 | "cell_type": "code",
84 | "execution_count": 10,
85 | "metadata": {
86 | "collapsed": true
87 | },
88 | "outputs": [],
89 | "source": [
90 | "with open(labels_filename, 'w') as outf:\n",
91 | " json.dump(actual_labels, outf)"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 11,
97 | "metadata": {
98 | "collapsed": false
99 | },
100 | "outputs": [
101 | {
102 | "data": {
103 | "text/plain": [
104 | "(260, 315)"
105 | ]
106 | },
107 | "execution_count": 11,
108 | "metadata": {},
109 | "output_type": "execute_result"
110 | }
111 | ],
112 | "source": [
113 | "len(actual_labels), len(all_ids)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {
120 | "collapsed": true
121 | },
122 | "outputs": [],
123 | "source": []
124 | }
125 | ],
126 | "metadata": {
127 | "kernelspec": {
128 | "display_name": "Python 3",
129 | "language": "python",
130 | "name": "python3"
131 | },
132 | "language_info": {
133 | "codemirror_mode": {
134 | "name": "ipython",
135 | "version": 3
136 | },
137 | "file_extension": ".py",
138 | "mimetype": "text/x-python",
139 | "name": "python",
140 | "nbconvert_exporter": "python",
141 | "pygments_lexer": "ipython3",
142 | "version": "3.5.1"
143 | }
144 | },
145 | "nbformat": 4,
146 | "nbformat_minor": 0
147 | }
148 |
--------------------------------------------------------------------------------
/Chapter07/ch7_graph.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import twitter\n",
12 | "consumer_key = \"\"\n",
13 | "consumer_secret = \"\"\n",
14 | "access_token = \"\"\n",
15 | "access_token_secret = \"\"\n",
16 | "authorization = twitter.OAuth(access_token, \n",
17 | "access_token_secret, consumer_key, consumer_secret)\n",
18 | "t = twitter.Twitter(auth=authorization, retry=True)"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": true
26 | },
27 | "outputs": [],
28 | "source": [
29 | "import os \n",
30 | "data_folder = os.path.join(\"twitter\")\n",
31 | "output_filename = os.path.join(data_folder, \"python_tweets.json\")"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {
38 | "collapsed": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "original_users = [] \n",
43 | "tweets = []\n",
44 | "user_ids = {}"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "collapsed": false
52 | },
53 | "outputs": [],
54 | "source": [
55 | "search_results = t.search.tweets(q=\"python\", count=100)['statuses']\n",
56 | "for tweet in search_results:\n",
57 | " if 'text' in tweet:\n",
58 | " original_users.append(tweet['user']['screen_name']) \n",
59 | " user_ids[tweet['user']['screen_name']] = tweet['user']['id']\n",
60 | " tweets.append(tweet['text'])"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {
67 | "collapsed": true
68 | },
69 | "outputs": [],
70 | "source": [
71 | "model_filename = os.path.join(\"models\", \"python_context.pkl\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": true
79 | },
80 | "outputs": [],
81 | "source": [
82 | "import spacy\n",
83 | "from sklearn.base import TransformerMixin\n",
84 | "\n",
85 | "# Create a spaCy parser\n",
86 | "nlp = spacy.load('en')\n",
87 | "\n",
88 | "\n",
89 | "class BagOfWords(TransformerMixin):\n",
90 | " def fit(self, X, y=None):\n",
91 | " return self\n",
92 | " \n",
93 | " def transform(self, X):\n",
94 | " results = []\n",
95 | " for document in X:\n",
96 | " row = {}\n",
97 | " for word in list(nlp(document, tag=False, parse=False, entity=False)):\n",
98 | " if len(word.text.strip()): # Ignore words that are just whitespace\n",
99 | " row[word.text] = True\n",
100 | " results.append(row)\n",
101 | " return results"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {
108 | "collapsed": false
109 | },
110 | "outputs": [],
111 | "source": [
112 | "from sklearn.externals import joblib\n",
113 | "context_classifier = joblib.load(model_filename)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {
120 | "collapsed": true
121 | },
122 | "outputs": [],
123 | "source": [
124 | "y_pred = context_classifier.predict(tweets)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {
131 | "collapsed": true
132 | },
133 | "outputs": [],
134 | "source": [
135 | "relevant_tweets = [tweets[i] for i in range(len(tweets)) if y_pred[i] == 1]\n",
136 | "relevant_users = [original_users[i] for i in range(len(tweets)) if y_pred[i] == 1]"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "collapsed": true
144 | },
145 | "outputs": [],
146 | "source": [
147 | "import time\n",
148 | "\n",
149 | "def get_friends(t, user_id):\n",
150 | " friends = []\n",
151 | " cursor = -1\n",
152 | " while cursor != 0: \n",
153 | " try:\n",
154 | " results = t.friends.ids(user_id= user_id, cursor=cursor, count=5000)\n",
155 | " friends.extend([friend for friend in results['ids']])\n",
156 | " cursor = results['next_cursor'] \n",
157 |     "            if True or len(friends) >= 10000:  # 'True or' stops after the first page of results; remove it to collect up to 10,000 friends per user\n",
158 | " break\n",
159 | " except TypeError as e:\n",
160 | " if results is None:\n",
161 | " print(\"You probably reached your API limit, waiting for 5 minutes\")\n",
162 | " sys.stdout.flush() \n",
163 | " time.sleep(5*60) # 5 minute wait \n",
164 | " else: \n",
165 | " # Some other error happened, so raise the error as normal\n",
166 | " raise e\n",
167 | " except twitter.TwitterHTTPError as e:\n",
168 | " print(e)\n",
169 | " break\n",
170 | " finally:\n",
171 | " # Break regardless -- this stops us going over our API limit\n",
172 | " time.sleep(60)\n",
173 | " return friends"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {
180 | "collapsed": false
181 | },
182 | "outputs": [],
183 | "source": [
184 | "friends = {} \n",
185 | "for screen_name in relevant_users:\n",
186 | " user_id = user_ids[screen_name]\n",
187 | " friends[user_id] = get_friends(t, user_id)"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {
194 | "collapsed": false
195 | },
196 | "outputs": [],
197 | "source": []
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {
203 | "collapsed": false
204 | },
205 | "outputs": [],
206 | "source": [
207 | "friends = {user_id:friends[user_id] \n",
208 | " for user_id in friends\n",
209 | " if len(friends[user_id]) > 0}"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "collapsed": true
217 | },
218 | "outputs": [],
219 | "source": [
220 | "from collections import defaultdict\n",
221 | "def count_friends(friends): \n",
222 | " friend_count = defaultdict(int)\n",
223 | " for friend_list in friends.values(): \n",
224 | " for friend in friend_list:\n",
225 | " friend_count[friend] += 1 \n",
226 | " return friend_count"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": true
234 | },
235 | "outputs": [],
236 | "source": [
237 | "friend_count = count_friends(friends)\n",
238 | "from operator import itemgetter\n",
239 | "best_friends = sorted(friend_count, key=friend_count.get, reverse=True)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {
246 | "collapsed": false
247 | },
248 | "outputs": [],
249 | "source": [
250 | "best_friends"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {
257 | "collapsed": true
258 | },
259 | "outputs": [],
260 | "source": [
261 | "import sys"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {
268 | "collapsed": false
269 | },
270 | "outputs": [],
271 | "source": [
272 | "while len(friends) < 150:\n",
273 | " for user_id in best_friends:\n",
274 | " if user_id in friends:\n",
275 | " continue\n",
276 | " print(user_id)\n",
277 | " sys.stdout.flush()\n",
278 | " friends[user_id] = get_friends(t, user_id) \n",
279 | " for friend in friends[user_id]: \n",
280 | " friend_count[friend] += 1\n",
281 | " best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)\n",
282 | " break"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {
289 | "collapsed": false
290 | },
291 | "outputs": [],
292 | "source": [
293 | "len(friends)"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {
300 | "collapsed": true
301 | },
302 | "outputs": [],
303 | "source": [
304 | "import json\n",
305 | "friends_filename = os.path.join(data_folder, \"python_friends.json\")"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {
312 | "collapsed": true
313 | },
314 | "outputs": [],
315 | "source": [
316 | "with open(friends_filename, 'w') as outf: \n",
317 | " json.dump(friends, outf)"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {
324 | "collapsed": false
325 | },
326 | "outputs": [],
327 | "source": [
328 | "with open(friends_filename) as inf:\n",
329 | " friends = json.load(inf)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {
336 | "collapsed": false
337 | },
338 | "outputs": [],
339 | "source": [
340 | "len(friends), type(friends)"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {
347 | "collapsed": false
348 | },
349 | "outputs": [],
350 | "source": [
351 | "import networkx as nx \n",
352 | "G = nx.DiGraph()"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {
359 | "collapsed": true
360 | },
361 | "outputs": [],
362 | "source": [
363 | "main_users = friends.keys() \n",
364 | "G.add_nodes_from(main_users)"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": null,
370 | "metadata": {
371 | "collapsed": true
372 | },
373 | "outputs": [],
374 | "source": [
375 | "for user_id in friends:\n",
376 | " for friend in friends[user_id]:\n",
377 | " if str(friend) in main_users: \n",
378 | " G.add_edge(user_id, friend) "
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {
385 | "collapsed": true
386 | },
387 | "outputs": [],
388 | "source": [
389 | "nx.draw?"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "metadata": {
396 | "collapsed": false
397 | },
398 | "outputs": [],
399 | "source": [
400 | "%matplotlib inline \n",
401 | "nx.draw(G)"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {
408 | "collapsed": false
409 | },
410 | "outputs": [],
411 | "source": [
412 | "from matplotlib import pyplot as plt\n",
413 | "plt.figure(3,figsize=(20,20))\n",
414 | "nx.draw(G, alpha=0.1, edge_color='b')"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {
421 | "collapsed": true
422 | },
423 | "outputs": [],
424 | "source": [
425 | "friends = {user: set(friends[user]) for user in friends}\n"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": null,
431 | "metadata": {
432 | "collapsed": true
433 | },
434 | "outputs": [],
435 | "source": [
436 | "def compute_similarity(friends1, friends2):\n",
437 | " return len(friends1 & friends2) / (len(friends1 | friends2) + 1e-6)"
438 | ]
439 | },
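compute_similarity above is a Jaccard similarity: the size of the overlap divided by the size of the union, with a small constant added so that two empty friend lists do not cause a division by zero. A worked example:

friends1 = {1, 2, 3, 4}
friends2 = {3, 4, 5}
# intersection {3, 4} has 2 members, union {1, 2, 3, 4, 5} has 5
print(len(friends1 & friends2) / (len(friends1 | friends2) + 1e-6))  # roughly 0.4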
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "metadata": {
444 | "collapsed": true
445 | },
446 | "outputs": [],
447 | "source": [
448 | "def create_graph(followers, threshold=0): \n",
449 | " G = nx.Graph()\n",
450 | " for user1 in friends.keys(): \n",
451 | " for user2 in friends.keys(): \n",
452 | " if user1 == user2:\n",
453 | " continue\n",
454 | " weight = compute_similarity(friends[user1], friends[user2])\n",
455 | " if weight >= threshold:\n",
456 | " G.add_node(user1) \n",
457 | " G.add_node(user2)\n",
458 | " G.add_edge(user1, user2, weight=weight)\n",
459 | " return G"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "metadata": {
466 | "collapsed": false
467 | },
468 | "outputs": [],
469 | "source": [
470 | "G = create_graph(friends)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {
477 | "collapsed": false
478 | },
479 | "outputs": [],
480 | "source": [
481 | "plt.figure(figsize=(10,10))\n",
482 | "pos = nx.spring_layout(G)\n",
483 | "nx.draw_networkx_nodes(G, pos)\n",
484 | "edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
485 | "nx.draw_networkx_edges(G, pos, width=edgewidth)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": null,
491 | "metadata": {
492 | "collapsed": false
493 | },
494 | "outputs": [],
495 | "source": [
496 | "G = create_graph(friends, 0.1)"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": null,
502 | "metadata": {
503 | "collapsed": true
504 | },
505 | "outputs": [],
506 | "source": [
507 | "sub_graphs = nx.connected_component_subgraphs(G)"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {
514 | "collapsed": false
515 | },
516 | "outputs": [],
517 | "source": [
518 | "for i, sub_graph in enumerate(sub_graphs):\n",
519 | " n_nodes = len(sub_graph.nodes()) \n",
520 | " print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": null,
526 | "metadata": {
527 | "collapsed": false
528 | },
529 | "outputs": [],
530 | "source": [
531 | "G = create_graph(friends, 0.25) \n",
532 | "sub_graphs = nx.connected_component_subgraphs(G) \n",
533 | "for i, sub_graph in enumerate(sub_graphs): \n",
534 | " n_nodes = len(sub_graph.nodes()) \n",
535 | " print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": null,
541 | "metadata": {
542 | "collapsed": true
543 | },
544 | "outputs": [],
545 | "source": [
546 | "sub_graphs = nx.connected_component_subgraphs(G) \n",
547 | "n_subgraphs = nx.number_connected_components(G)"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {
554 | "collapsed": false
555 | },
556 | "outputs": [],
557 | "source": [
558 | "n_subgraphs"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": null,
564 | "metadata": {
565 | "collapsed": false
566 | },
567 | "outputs": [],
568 | "source": [
569 | "sub_graphs = nx.connected_component_subgraphs(G) \n",
570 | "n_subgraphs = nx.number_connected_components(G)\n",
571 | "\n",
572 | "fig = plt.figure(figsize=(20, (n_subgraphs * 3)))\n",
573 | "for i, sub_graph in enumerate(sub_graphs):\n",
574 | " \n",
575 | " ax = fig.add_subplot(int(n_subgraphs / 3)+1, 3, i+1)\n",
576 | " ax.get_xaxis().set_visible(False) \n",
577 | " ax.get_yaxis().set_visible(False)\n",
578 | " nx.draw(sub_graph, ax=ax)"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": null,
584 | "metadata": {
585 | "collapsed": false
586 | },
587 | "outputs": [],
588 | "source": [
589 | "print(\"Done\")"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": null,
595 | "metadata": {
596 | "collapsed": true
597 | },
598 | "outputs": [],
599 | "source": [
600 | "import numpy as np\n",
601 | "from sklearn.metrics import silhouette_score\n",
602 | "\n",
603 | "def compute_silhouette(threshold, friends):\n",
604 | " G = create_graph(friends, threshold=threshold) \n",
605 | " if len(G.nodes()) < 2:\n",
606 | " return -99\n",
607 | " sub_graphs = nx.connected_component_subgraphs(G)\n",
608 | "\n",
609 | " if not (2 <= nx.number_connected_components(G) < len(G.nodes()) - 1): \n",
610 | " return -99\n",
611 | "\n",
612 | " label_dict = {}\n",
613 | " for i, sub_graph in enumerate(sub_graphs): \n",
614 | " for node in sub_graph.nodes(): \n",
615 | " label_dict[node] = i\n",
616 | "\n",
617 | " labels = np.array([label_dict[node] for node in G.nodes()])\n",
618 | " X = nx.to_scipy_sparse_matrix(G).todense()\n",
619 | " X = 1 - X\n",
620 | " return silhouette_score(X, labels, metric='precomputed')"
621 | ]
622 | },
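silhouette_score with metric='precomputed' expects a distance matrix, which is why the weighted adjacency matrix (a similarity in the 0 to 1 range) is flipped with X = 1 - X above. An illustrative call with made-up values:

import numpy as np
from sklearn.metrics import silhouette_score

similarity = np.array([[1.0, 0.8, 0.1],
                       [0.8, 1.0, 0.2],
                       [0.1, 0.2, 1.0]])
distance = 1 - similarity              # higher similarity becomes smaller distance
labels = np.array([0, 0, 1])           # two clusters: nodes {0, 1} and node {2}
print(silhouette_score(distance, labels, metric='precomputed'))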
623 | {
624 | "cell_type": "code",
625 | "execution_count": null,
626 | "metadata": {
627 | "collapsed": true
628 | },
629 | "outputs": [],
630 | "source": [
631 | "def inverted_silhouette(threshold, friends):\n",
632 | " return -compute_silhouette(threshold, friends)"
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": null,
638 | "metadata": {
639 | "collapsed": false
640 | },
641 | "outputs": [],
642 | "source": [
643 | "from scipy.optimize import minimize\n",
644 | "result = minimize(inverted_silhouette, 0.1, args=(friends,))"
645 | ]
646 | },
647 | {
648 | "cell_type": "code",
649 | "execution_count": null,
650 | "metadata": {
651 | "collapsed": false
652 | },
653 | "outputs": [],
654 | "source": [
655 | "result"
656 | ]
657 | },
658 | {
659 | "cell_type": "code",
660 | "execution_count": null,
661 | "metadata": {
662 | "collapsed": true
663 | },
664 | "outputs": [],
665 | "source": []
666 | }
667 | ],
668 | "metadata": {
669 | "anaconda-cloud": {},
670 | "kernelspec": {
671 | "display_name": "Python 3",
672 | "language": "python",
673 | "name": "python3"
674 | },
675 | "language_info": {
676 | "codemirror_mode": {
677 | "name": "ipython",
678 | "version": 3
679 | },
680 | "file_extension": ".py",
681 | "mimetype": "text/x-python",
682 | "name": "python",
683 | "nbconvert_exporter": "python",
684 | "pygments_lexer": "ipython3",
685 | "version": "3.5.1"
686 | }
687 | },
688 | "nbformat": 4,
689 | "nbformat_minor": 1
690 | }
691 |
--------------------------------------------------------------------------------
/Chapter10/chapter 10.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 27,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "CLIENT_ID = \"hZ-F6QLaGJ6VBg\" \n",
12 | "CLIENT_SECRET = \"MFC8FXTlXjm4jtG60bm9Qs-PPls\""
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 28,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "USER_AGENT = \"python: (by /u/Dealsy23)\""
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 29,
29 | "metadata": {
30 | "collapsed": true
31 | },
32 | "outputs": [],
33 | "source": [
34 | "USERNAME = \"Dealsy23\" \n",
35 | "PASSWORD = \"Outbreak23\""
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 30,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": [
46 | "import requests\n",
47 | "def login(username, password):\n",
48 | " if password is None:\n",
49 | " password = getpass.getpass(\"Enter reddit password for user {}: \".format(username)) \n",
50 | " headers = {\"User-Agent\": USER_AGENT}\n",
51 | " # Setup an auth object with our credentials\n",
52 | " client_auth = requests.auth.HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET)\n",
53 | " # Make a post request to the access_token endpoint\n",
54 | " post_data = {\"grant_type\": \"password\", \"username\": username, \"password\": password}\n",
55 | " response = requests.post(\"https://www.reddit.com/api/v1/access_token\", auth=client_auth, \n",
56 | " data=post_data, headers=headers) \n",
57 | " return response.json()"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 31,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [],
67 | "source": [
68 | "token = login(USERNAME, PASSWORD)\n"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 32,
74 | "metadata": {
75 | "collapsed": false
76 | },
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/plain": [
81 | "{'access_token': 'K9NuIv1-0S3zDIVj6oyOD1dw83Y',\n",
82 | " 'expires_in': 3600,\n",
83 | " 'scope': '*',\n",
84 | " 'token_type': 'bearer'}"
85 | ]
86 | },
87 | "execution_count": 32,
88 | "metadata": {},
89 | "output_type": "execute_result"
90 | }
91 | ],
92 | "source": [
93 | "token"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 33,
99 | "metadata": {
100 | "collapsed": true
101 | },
102 | "outputs": [],
103 | "source": [
104 | "subreddit = \"worldnews\""
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 34,
110 | "metadata": {
111 | "collapsed": true
112 | },
113 | "outputs": [],
114 | "source": [
115 | "url = \"https://oauth.reddit.com/r/{}\".format(subreddit)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 35,
121 | "metadata": {
122 | "collapsed": true
123 | },
124 | "outputs": [],
125 | "source": [
126 | "headers = {\"Authorization\": \"bearer {}\".format(token['access_token']), \n",
127 | "\"User-Agent\": USER_AGENT}"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 46,
133 | "metadata": {
134 | "collapsed": false
135 | },
136 | "outputs": [],
137 | "source": [
138 | "response = requests.get(url, headers=headers)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 47,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [],
148 | "source": [
149 | "result = response.json()"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 48,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | "Russia considers banning sale of cigarettes to anyone born after 2015\n",
164 | "Swiss Muslim girls must swim with boys\n",
165 | "Report: Russia spread fake news and disinformation in Sweden - Russia has coordinated a campaign over the past 2years to influence Sweden’s decision making by using disinformation, propaganda and false documents, according to a report by researchers at The Swedish Institute of International Affairs.\n",
166 | "100% of Dutch Trains Now Run on Wind Energy. The Netherlands met its renewable energy goals a year ahead of time.\n",
167 | "Legal challenge against UK’s sweeping surveillance laws quickly crowdfunded\n",
168 | "A 1,000-foot-thick ice block about the size of Delaware is snapping off of Antarctica\n",
169 | "The U.S. dropped an average of 72 bombs every day — the equivalent of three an hour — in 2016, according to an analysis of American strikes around the world. U.S. Bombed Iraq, Syria, Pakistan, Afghanistan, Libya, Yemen, Somalia in 2016\n",
170 | "The German government is investigating a recent surge in fake news following claims that Russia is attempting to meddle in the country’s parliamentary elections later this year.\n",
171 | "Pesticides kill over 10 million bees in a matter of days in Brazil countryside\n",
172 | "The families of American victims of Islamic State terrorist attacks in Europe have sued Twitter, charging that the social media giant allowed the terror group to proliferate online\n",
173 | "Gas taxes drop globally despite climate change; oil & gas industry gets $500 billion in subsidies; last new US gas tax was in 1993\n",
174 | "Czech government tells citizens to arm themselves and shoot Muslim terrorists in case of 'Super Holocaust'\n",
175 | "PLO threatens to revoke recognition of Israel if US embassy moves to Jerusalem\n",
176 | "Two-thirds of all new HIV cases in Europe are being recorded in just one country – Russia: More than a million Russians now live with the virus and that number is expected to nearly double in the next decade\n",
177 | "Czech government tells its citizens how to fight terrorists: Shoot them yourselves | The interior ministry is pushing a constitutional change that would let citizens use guns against terrorists\n",
178 | "Morocco Prohibits Sale of Burqa\n",
179 | "Mass killer Breivik makes Nazi salute at rights appeal case\n",
180 | "Soros Groups Risk Purge After Trump’s Win Emboldens Hungary\n",
181 | "Nigeria purges 50,000 ‘ghost workers’ from State payroll in corruption sweep\n",
182 | "Alcohol advertising is aggressive and linked to youth drinking, research finds | Society\n",
183 | "UK Government quietly launched ‘assault on freedom’ while distracting people, say campaigners behind legal challenge - The Investigatory Powers Act became law at the end of last year, and gives spies the power to read through everyone’s entire internet history\n",
184 | "Russia’s Reserve Fund down 70 percent in 2016\n",
185 | "Russian diplomat found dead in Athens\n",
186 | "At least 21 people have been killed (most were civilians) and 45 wounded in twin bombings near the Afghan parliament in Kabul\n",
187 | "Pound’s Decline Deepens as Currency Reclaims Dubious Honor\n"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "for story in result['data']['children']: \n",
193 | " print(story['data']['title'])"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 49,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "from time import sleep"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 55,
210 | "metadata": {
211 | "collapsed": false
212 | },
213 | "outputs": [],
214 | "source": [
215 | "def get_links(subreddit, token, n_pages=5):\n",
216 | " stories = []\n",
217 | " after = None\n",
218 | " for page_number in range(n_pages):\n",
219 | " # Sleep before making calls to avoid going over the API limit\n",
220 | " sleep(2)\n",
221 | " # Setup headers and make call, just like in the login function\n",
222 | " headers = {\"Authorization\": \"bearer {}\".format(token['access_token']), \"User-Agent\": USER_AGENT} \n",
223 |         "        url = \"https://oauth.reddit.com/r/{}?limit=100\".format(subreddit)\n",
224 | " if after:\n",
225 | " # Append cursor for next page, if we have one\n",
226 | " url += \"&after={}\".format(after)\n",
227 | " response = requests.get(url, headers=headers)\n",
228 | " result = response.json()\n",
229 | " # Get the new cursor for the next loop\n",
230 | " after = result['data']['after']\n",
231 | " # Add all of the news items to our stories list\n",
232 | " for story in result['data']['children']:\n",
233 | " stories.append((story['data']['title'], story['data']['url'], story['data']['score']))\n",
234 | " return stories"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 56,
240 | "metadata": {
241 | "collapsed": true
242 | },
243 | "outputs": [],
244 | "source": [
245 | "stories = get_links(\"worldnews\", token)"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 5,
251 | "metadata": {
252 | "collapsed": true
253 | },
254 | "outputs": [],
255 | "source": [
256 | "import os \n",
257 | "data_folder = os.path.join(\"data\", \"websites\", \"raw\")"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 59,
263 | "metadata": {
264 | "collapsed": true
265 | },
266 | "outputs": [],
267 | "source": [
268 | "import hashlib"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 60,
274 | "metadata": {
275 | "collapsed": true
276 | },
277 | "outputs": [],
278 | "source": [
279 | "number_errors = 0\n"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 66,
285 | "metadata": {
286 | "collapsed": false
287 | },
288 | "outputs": [],
289 | "source": [
290 | "for title, url, score in stories:\n",
291 | " output_filename = hashlib.md5(url.encode()).hexdigest() \n",
292 | " fullpath = os.path.join(data_folder, output_filename + \".txt\")\n",
293 | " try: \n",
294 | " response = requests.get(url) \n",
295 | " data = response.text \n",
296 | " with open(fullpath, 'w') as outf: \n",
297 | " outf.write(data)\n",
298 | " except Exception as e:\n",
299 | " number_errors += 1\n",
300 | " # You can use this to view the errors, if you are getting too many:\n",
301 | " #raise"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 67,
307 | "metadata": {
308 | "collapsed": false
309 | },
310 | "outputs": [
311 | {
312 | "data": {
313 | "text/plain": [
314 | "503"
315 | ]
316 | },
317 | "execution_count": 67,
318 | "metadata": {},
319 | "output_type": "execute_result"
320 | }
321 | ],
322 | "source": [
323 | "number_errors"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 7,
329 | "metadata": {
330 | "collapsed": false
331 | },
332 | "outputs": [],
333 | "source": [
334 | "filenames = [os.path.join(data_folder, filename) for filename in os.listdir(data_folder)]"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 16,
340 | "metadata": {
341 | "collapsed": true
342 | },
343 | "outputs": [],
344 | "source": [
345 | "text_output_folder = os.path.join(\"data\", \"websites\", \"textonly\")"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 17,
351 | "metadata": {
352 | "collapsed": false
353 | },
354 | "outputs": [],
355 | "source": [
356 | "import lxml\n",
357 | "from lxml import etree"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 18,
363 | "metadata": {
364 | "collapsed": true
365 | },
366 | "outputs": [],
367 | "source": [
368 | "skip_node_types = [\"script\", \"head\", \"style\", etree.Comment]\n"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 19,
374 | "metadata": {
375 | "collapsed": true
376 | },
377 | "outputs": [],
378 | "source": [
379 | "parser = etree.HTMLParser()\n",
380 | "\n",
381 | "def get_text_from_file(filename):\n",
382 | " with open(filename) as inf:\n",
383 | " html_tree = etree.parse(inf, parser) \n",
384 | " return get_text_from_node(html_tree.getroot())"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 20,
390 | "metadata": {
391 | "collapsed": true
392 | },
393 | "outputs": [],
394 | "source": [
395 | "def get_text_from_node(node):\n",
396 | " if len(node) == 0: \n",
397 | " # No children, just return text from this item\n",
398 | " if node.text: \n",
399 | " return node.text \n",
400 | " else:\n",
401 | " return \"\"\n",
402 | " else:\n",
403 |         "        # This node has children, so recurse and join the text from each of them\n",
404 | " results = (get_text_from_node(child)\n",
405 | " for child in node\n",
406 | " if child.tag not in skip_node_types)\n",
407 | " result = str.join(\"\\n\", (r for r in results if len(r) > 1))\n",
408 | " if len(result) >= 100:\n",
409 | " return result\n",
410 | " else:\n",
411 | " return \"\""
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 21,
417 | "metadata": {
418 | "collapsed": false
419 | },
420 | "outputs": [],
421 | "source": [
422 | "for filename in os.listdir(data_folder):\n",
423 | " text = get_text_from_file(os.path.join(data_folder, filename)) \n",
424 | " with open(os.path.join(text_output_folder, filename), 'w') as outf: \n",
425 | " outf.write(text)"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 22,
431 | "metadata": {
432 | "collapsed": true
433 | },
434 | "outputs": [],
435 | "source": [
436 | "from sklearn.cluster import KMeans"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 23,
442 | "metadata": {
443 | "collapsed": true
444 | },
445 | "outputs": [],
446 | "source": [
447 | "from sklearn.feature_extraction.text import TfidfVectorizer"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": 24,
453 | "metadata": {
454 | "collapsed": false
455 | },
456 | "outputs": [],
457 | "source": [
458 | "from sklearn.pipeline import Pipeline\n",
459 | "n_clusters = 10 \n",
460 | "pipeline = Pipeline([('feature_extraction', TfidfVectorizer(max_df=0.4)),\n",
461 | " ('clusterer', KMeans(n_clusters=n_clusters)) ])"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": 25,
467 | "metadata": {
468 | "collapsed": true
469 | },
470 | "outputs": [],
471 | "source": [
472 | "\n",
473 | "documents = [open(os.path.join(text_output_folder, filename)).read() for filename in os.listdir(text_output_folder)]"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 26,
479 | "metadata": {
480 | "collapsed": false
481 | },
482 | "outputs": [],
483 | "source": [
484 | "pipeline.fit(documents)\n",
485 | "labels = pipeline.predict(documents)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": 27,
491 | "metadata": {
492 | "collapsed": false
493 | },
494 | "outputs": [
495 | {
496 | "name": "stdout",
497 | "output_type": "stream",
498 | "text": [
499 | "Cluster 0 contains 42 samples\n",
500 | "Cluster 1 contains 37 samples\n",
501 | "Cluster 2 contains 31 samples\n",
502 | "Cluster 3 contains 26 samples\n",
503 | "Cluster 4 contains 28 samples\n",
504 | "Cluster 5 contains 13 samples\n",
505 | "Cluster 6 contains 43 samples\n",
506 | "Cluster 7 contains 25 samples\n",
507 | "Cluster 8 contains 209 samples\n",
508 | "Cluster 9 contains 37 samples\n"
509 | ]
510 | }
511 | ],
512 | "source": [
513 | "from collections import Counter\n",
514 | "c = Counter(labels) \n",
515 | "for cluster_number in range(n_clusters): \n",
516 | " print(\"Cluster {} contains {} samples\".format(cluster_number, c[cluster_number]))"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": 28,
522 | "metadata": {
523 | "collapsed": false
524 | },
525 | "outputs": [
526 | {
527 | "data": {
528 | "text/plain": [
529 | "414.5540395641533"
530 | ]
531 | },
532 | "execution_count": 28,
533 | "metadata": {},
534 | "output_type": "execute_result"
535 | }
536 | ],
537 | "source": [
538 | "pipeline.named_steps['clusterer'].inertia_"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": 29,
544 | "metadata": {
545 | "collapsed": false
546 | },
547 | "outputs": [],
548 | "source": [
549 | "inertia_scores = [] \n",
550 | "n_cluster_values = list(range(2, 20)) \n",
551 | "for n_clusters in n_cluster_values: \n",
552 | " cur_inertia_scores = [] \n",
553 | " X = TfidfVectorizer(max_df=0.4).fit_transform(documents) \n",
554 | " for i in range(10): \n",
555 | " km = KMeans(n_clusters=n_clusters).fit(X) \n",
556 | " cur_inertia_scores.append(km.inertia_) \n",
557 | " inertia_scores.append(cur_inertia_scores)"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": 30,
563 | "metadata": {
564 | "collapsed": false
565 | },
566 | "outputs": [
567 | {
568 | "data": {
569 | "text/plain": [
570 | "19"
571 | ]
572 | },
573 | "execution_count": 30,
574 | "metadata": {},
575 | "output_type": "execute_result"
576 | }
577 | ],
578 | "source": [
579 | "n_clusters"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 31,
585 | "metadata": {
586 | "collapsed": true
587 | },
588 | "outputs": [],
589 | "source": [
590 | "n_clusters = 6 \n",
591 | "pipeline = Pipeline([('feature_extraction', TfidfVectorizer(max_df=0.4)),\n",
592 | " ('clusterer', KMeans(n_clusters=n_clusters)) ])\n",
593 | "pipeline.fit(documents) \n",
594 | "labels = pipeline.predict(documents)"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 32,
600 | "metadata": {
601 | "collapsed": false
602 | },
603 | "outputs": [
604 | {
605 | "data": {
606 | "text/plain": [
607 | "6"
608 | ]
609 | },
610 | "execution_count": 32,
611 | "metadata": {},
612 | "output_type": "execute_result"
613 | }
614 | ],
615 | "source": [
616 | "n_clusters"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": 33,
622 | "metadata": {
623 | "collapsed": true
624 | },
625 | "outputs": [],
626 | "source": [
627 | "terms = pipeline.named_steps['feature_extraction'].get_feature_names()"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 34,
633 | "metadata": {
634 | "collapsed": true
635 | },
636 | "outputs": [],
637 | "source": [
638 | "c = Counter(labels)"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": 35,
644 | "metadata": {
645 | "collapsed": false
646 | },
647 | "outputs": [
648 | {
649 | "name": "stdout",
650 | "output_type": "stream",
651 | "text": [
652 | "Cluster 0 contains 22 samples\n",
653 | " Most important terms\n",
654 | " 1) iran (score: 0.1932)\n",
655 | " 2) iranian (score: 0.1310)\n",
656 | " 3) shots (score: 0.1198)\n",
657 | " 4) mahan (score: 0.1109)\n",
658 | " 5) navy (score: 0.1108)\n",
659 | "Cluster 1 contains 36 samples\n",
660 | " Most important terms\n",
661 | " 1) click (score: 0.1833)\n",
662 | " 2) adblock (score: 0.1786)\n",
663 | " 3) icon (score: 0.1166)\n",
664 | " 4) site (score: 0.1012)\n",
665 | " 5) getty (score: 0.0796)\n",
666 | "Cluster 2 contains 308 samples\n",
667 | " Most important terms\n",
668 | " 1) she (score: 0.0185)\n",
669 | " 2) her (score: 0.0178)\n",
670 | " 3) china (score: 0.0174)\n",
671 | " 4) us (score: 0.0171)\n",
672 | " 5) government (score: 0.0141)\n",
673 | "Cluster 3 contains 43 samples\n",
674 | " Most important terms\n",
675 | " 1) kabul (score: 0.0654)\n",
676 | " 2) afghanistan (score: 0.0566)\n",
677 | " 3) taliban (score: 0.0558)\n",
678 | " 4) killed (score: 0.0509)\n",
679 | " 5) pakistan (score: 0.0490)\n",
680 | "Cluster 4 contains 45 samples\n",
681 | " Most important terms\n",
682 | " 1) washington (score: 0.1076)\n",
683 | " 2) industry (score: 0.0755)\n",
684 | " 3) tuesday (score: 0.0742)\n",
685 | " 4) your (score: 0.0719)\n",
686 | " 5) tax (score: 0.0699)\n",
687 | "Cluster 5 contains 37 samples\n",
688 | " Most important terms\n",
689 | " 1) israeli (score: 0.1896)\n",
690 | " 2) israel (score: 0.1494)\n",
691 | " 3) jerusalem (score: 0.1114)\n",
692 | " 4) soldiers (score: 0.0775)\n",
693 | " 5) embassy (score: 0.0769)\n"
694 | ]
695 | }
696 | ],
697 | "source": [
698 | "for cluster_number in range(n_clusters): \n",
699 | " print(\"Cluster {} contains {} samples\".format(cluster_number, c[cluster_number]))\n",
700 | " print(\" Most important terms\")\n",
701 | " centroid = pipeline.named_steps['clusterer'].cluster_centers_[cluster_number]\n",
702 | " most_important = centroid.argsort()\n",
703 | " for i in range(5):\n",
704 | " term_index = most_important[-(i+1)]\n",
705 | " print(\" {0}) {1} (score: {2:.4f})\".format(i+1, terms[term_index], centroid[term_index]))"
706 | ]
707 | },
708 | {
709 | "cell_type": "code",
710 | "execution_count": 36,
711 | "metadata": {
712 | "collapsed": true
713 | },
714 | "outputs": [],
715 | "source": [
716 | "X = pipeline.transform(documents)"
717 | ]
718 | },
719 | {
720 | "cell_type": "code",
721 | "execution_count": 37,
722 | "metadata": {
723 | "collapsed": true
724 | },
725 | "outputs": [],
726 | "source": [
727 | "from scipy.sparse import csr_matrix"
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "execution_count": 38,
733 | "metadata": {
734 | "collapsed": true
735 | },
736 | "outputs": [],
737 | "source": [
738 | "import numpy as np\n",
739 | "def create_coassociation_matrix(labels):\n",
740 | " rows = [] \n",
741 | " cols = []\n",
742 | " unique_labels = set(labels) \n",
743 | " for label in unique_labels:\n",
744 | " indices = np.where(labels == label)[0]\n",
745 | " for index1 in indices:\n",
746 | " for index2 in indices:\n",
747 | " rows.append(index1)\n",
748 | " cols.append(index2)\n",
749 | " data = np.ones((len(rows),)) \n",
750 | " return csr_matrix((data, (rows, cols)), dtype='float')"
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": 39,
756 | "metadata": {
757 | "collapsed": false
758 | },
759 | "outputs": [],
760 | "source": [
761 | "C = create_coassociation_matrix(labels)"
762 | ]
763 | },
764 | {
765 | "cell_type": "code",
766 | "execution_count": 40,
767 | "metadata": {
768 | "collapsed": false
769 | },
770 | "outputs": [
771 | {
772 | "data": {
773 | "text/plain": [
774 |        "<491x491 sparse matrix of type '<class 'numpy.float64'>'\n",
775 | "\twith 101887 stored elements in Compressed Sparse Row format>"
776 | ]
777 | },
778 | "execution_count": 40,
779 | "metadata": {},
780 | "output_type": "execute_result"
781 | }
782 | ],
783 | "source": [
784 | "C"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": 41,
790 | "metadata": {
791 | "collapsed": true
792 | },
793 | "outputs": [],
794 | "source": [
795 | "from scipy.sparse.csgraph import minimum_spanning_tree"
796 | ]
797 | },
798 | {
799 | "cell_type": "code",
800 | "execution_count": 42,
801 | "metadata": {
802 | "collapsed": true
803 | },
804 | "outputs": [],
805 | "source": [
806 | "mst = minimum_spanning_tree(C)"
807 | ]
808 | },
809 | {
810 | "cell_type": "code",
811 | "execution_count": 43,
812 | "metadata": {
813 | "collapsed": true
814 | },
815 | "outputs": [],
816 | "source": [
817 | "mst = minimum_spanning_tree(-C)"
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": 44,
823 | "metadata": {
824 | "collapsed": true
825 | },
826 | "outputs": [],
827 | "source": [
828 | "pipeline.fit(documents) \n",
829 | "labels2 = pipeline.predict(documents) \n",
830 | "C2 = create_coassociation_matrix(labels2) \n",
831 | "C_sum = (C + C2) / 2"
832 | ]
833 | },
834 | {
835 | "cell_type": "code",
836 | "execution_count": 45,
837 | "metadata": {
838 | "collapsed": true
839 | },
840 | "outputs": [],
841 | "source": [
842 | "mst = minimum_spanning_tree(-C_sum) \n",
843 | "mst.data[mst.data > -1] = 0"
844 | ]
845 | },
846 | {
847 | "cell_type": "code",
848 | "execution_count": 46,
849 | "metadata": {
850 | "collapsed": true
851 | },
852 | "outputs": [],
853 | "source": [
854 | "from scipy.sparse.csgraph import connected_components \n",
855 | "number_of_clusters, labels = connected_components(mst)"
856 | ]
857 | },
858 | {
859 | "cell_type": "code",
860 | "execution_count": 47,
861 | "metadata": {
862 | "collapsed": true
863 | },
864 | "outputs": [],
865 | "source": [
866 | "from sklearn.base import BaseEstimator, ClusterMixin \n",
867 | "class EAC(BaseEstimator, ClusterMixin):\n",
868 | " def __init__(self, n_clusterings=10, cut_threshold=0.5, n_clusters_range=(3, 10)): \n",
869 | " self.n_clusterings = n_clusterings\n",
870 | " self.cut_threshold = cut_threshold\n",
871 | " self.n_clusters_range = n_clusters_range\n",
872 | "\n",
873 | " def fit(self, X, y=None):\n",
874 | " C = sum((create_coassociation_matrix(self._single_clustering(X))\n",
875 | " for i in range(self.n_clusterings)))\n",
876 | " mst = minimum_spanning_tree(-C)\n",
877 | " mst.data[mst.data > -self.cut_threshold] = 0\n",
878 | " mst.eliminate_zeros()\n",
879 | " self.n_components, self.labels_ = connected_components(mst)\n",
880 | " return self\n",
881 | " \n",
882 | " def _single_clustering(self, X):\n",
883 | " n_clusters = np.random.randint(*self.n_clusters_range)\n",
884 | " km = KMeans(n_clusters=n_clusters)\n",
885 | " return km.fit_predict(X)\n",
886 | " \n",
887 | " def fit_predict(self, X):\n",
888 | " self.fit(X)\n",
889 | " return self.labels_"
890 | ]
891 | },
892 | {
893 | "cell_type": "code",
894 | "execution_count": 48,
895 | "metadata": {
896 | "collapsed": true
897 | },
898 | "outputs": [],
899 | "source": [
900 | "pipeline = Pipeline([('feature_extraction', TfidfVectorizer(max_df=0.4)),\n",
901 | " ('clusterer', EAC()) ])"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": 49,
907 | "metadata": {
908 | "collapsed": true
909 | },
910 | "outputs": [],
911 | "source": [
912 | "vec = TfidfVectorizer(max_df=0.4) \n",
913 | "X = vec.fit_transform(documents)"
914 | ]
915 | },
916 | {
917 | "cell_type": "code",
918 | "execution_count": 50,
919 | "metadata": {
920 | "collapsed": true
921 | },
922 | "outputs": [],
923 | "source": [
924 | "from sklearn.cluster import MiniBatchKMeans \n",
925 | "mbkm = MiniBatchKMeans(random_state=14, n_clusters=3)"
926 | ]
927 | },
928 | {
929 | "cell_type": "code",
930 | "execution_count": 51,
931 | "metadata": {
932 | "collapsed": false
933 | },
934 | "outputs": [],
935 | "source": [
936 | "batch_size = 10 \n",
937 | "for iteration in range(int(X.shape[0] / batch_size)): \n",
938 | " start = batch_size * iteration \n",
939 | " end = batch_size * (iteration + 1) \n",
940 | " mbkm.partial_fit(X[start:end])"
941 | ]
942 | },
943 | {
944 | "cell_type": "code",
945 | "execution_count": 52,
946 | "metadata": {
947 | "collapsed": true
948 | },
949 | "outputs": [],
950 | "source": [
951 | "labels = mbkm.predict(X)"
952 | ]
953 | },
954 | {
955 | "cell_type": "code",
956 | "execution_count": 53,
957 | "metadata": {
958 | "collapsed": false
959 | },
960 | "outputs": [
961 | {
962 | "data": {
963 | "text/plain": [
964 | "array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
965 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
966 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
967 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
968 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
969 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
970 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
971 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
972 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,\n",
973 | " 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
974 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
975 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
976 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
977 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
978 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
979 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
980 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
981 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
982 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
983 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
984 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
985 | " 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)"
986 | ]
987 | },
988 | "execution_count": 53,
989 | "metadata": {},
990 | "output_type": "execute_result"
991 | }
992 | ],
993 | "source": [
994 | "labels"
995 | ]
996 | },
997 | {
998 | "cell_type": "code",
999 | "execution_count": 54,
1000 | "metadata": {
1001 | "collapsed": true
1002 | },
1003 | "outputs": [],
1004 | "source": [
1005 | "class PartialFitPipeline(Pipeline):\n",
1006 | " def partial_fit(self, X, y=None):\n",
1007 | " Xt = X\n",
1008 | " for name, transform in self.steps[:-1]:\n",
1009 | " Xt = transform.transform(Xt)\n",
1010 | " return self.steps[-1][1].partial_fit(Xt, y=y)"
1011 | ]
1012 | },
1013 | {
1014 | "cell_type": "code",
1015 | "execution_count": 55,
1016 | "metadata": {
1017 | "collapsed": false
1018 | },
1019 | "outputs": [],
1020 | "source": [
1021 | "from sklearn.feature_extraction.text import HashingVectorizer\n",
1022 | "\n",
1023 | "pipeline = PartialFitPipeline([('feature_extraction', HashingVectorizer()),\n",
1024 | " ('clusterer', MiniBatchKMeans(random_state=14, n_clusters=3)) ])\n",
1025 | "batch_size = 10 \n",
1026 | "for iteration in range(int(len(documents) / batch_size)): \n",
1027 | " start = batch_size * iteration \n",
1028 | " end = batch_size * (iteration + 1)\n",
1029 | " pipeline.partial_fit(documents[start:end]) \n",
1030 | "labels = pipeline.predict(documents)"
1031 | ]
1032 | },
1033 | {
1034 | "cell_type": "code",
1035 | "execution_count": 56,
1036 | "metadata": {
1037 | "collapsed": false
1038 | },
1039 | "outputs": [
1040 | {
1041 | "data": {
1042 | "text/plain": [
1043 | "array([0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,\n",
1044 | " 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2,\n",
1045 | " 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
1046 | " 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,\n",
1047 | " 1, 2, 0, 1, 0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
1048 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 1, 1, 0, 1,\n",
1049 | " 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 1, 1, 2, 0, 1, 1, 1, 1, 2, 1,\n",
1050 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 0, 1, 1, 1, 1, 1,\n",
1051 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,\n",
1052 | " 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1,\n",
1053 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
1054 | " 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 1,\n",
1055 | " 2, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 0,\n",
1056 | " 2, 1, 1, 1, 0, 0, 1, 2, 1, 1, 1, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
1057 | " 1, 1, 1, 1, 0, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 1, 1, 2,\n",
1058 | " 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
1059 | " 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
1060 | " 2, 2, 1, 2, 1, 0, 1, 1, 2, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1,\n",
1061 | " 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1,\n",
1062 | " 0, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,\n",
1063 | " 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1,\n",
1064 | " 1, 1, 1, 1, 1, 1, 0, 1], dtype=int32)"
1065 | ]
1066 | },
1067 | "execution_count": 56,
1068 | "metadata": {},
1069 | "output_type": "execute_result"
1070 | }
1071 | ],
1072 | "source": [
1073 | "labels"
1074 | ]
1075 | },
1076 | {
1077 | "cell_type": "code",
1078 | "execution_count": null,
1079 | "metadata": {
1080 | "collapsed": true
1081 | },
1082 | "outputs": [],
1083 | "source": []
1084 | },
1085 | {
1086 | "cell_type": "code",
1087 | "execution_count": null,
1088 | "metadata": {
1089 | "collapsed": true
1090 | },
1091 | "outputs": [],
1092 | "source": []
1093 | },
1094 | {
1095 | "cell_type": "code",
1096 | "execution_count": null,
1097 | "metadata": {
1098 | "collapsed": true
1099 | },
1100 | "outputs": [],
1101 | "source": []
1102 | }
1103 | ],
1104 | "metadata": {
1105 | "anaconda-cloud": {},
1106 | "kernelspec": {
1107 | "display_name": "Python 3",
1108 | "language": "python",
1109 | "name": "python3"
1110 | },
1111 | "language_info": {
1112 | "codemirror_mode": {
1113 | "name": "ipython",
1114 | "version": 3
1115 | },
1116 | "file_extension": ".py",
1117 | "mimetype": "text/x-python",
1118 | "name": "python",
1119 | "nbconvert_exporter": "python",
1120 | "pygments_lexer": "ipython3",
1121 | "version": "3.5.1"
1122 | }
1123 | },
1124 | "nbformat": 4,
1125 | "nbformat_minor": 1
1126 | }
1127 |
--------------------------------------------------------------------------------
/Chapter11/Chapter 11 TensorFlow Introduction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {
18 | "collapsed": true
19 | },
20 | "outputs": [],
21 | "source": [
22 | "a = tf.constant(5.0)\n",
23 | "b = tf.constant(4.5)\n",
24 | "c = tf.constant(3.0)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {
31 | "collapsed": false
32 | },
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/plain": [
37 | "(tf.float32, tf.float32, tf.float32)"
38 | ]
39 | },
40 | "execution_count": 3,
41 | "metadata": {},
42 | "output_type": "execute_result"
43 | }
44 | ],
45 | "source": [
46 | "a.dtype, b.dtype, c.dtype"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 4,
52 | "metadata": {
53 | "collapsed": true
54 | },
55 | "outputs": [],
56 | "source": [
57 | "x = tf.Variable(0., name='x')"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 5,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "tf.float32_ref"
71 | ]
72 | },
73 | "execution_count": 5,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "x.dtype"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 6,
85 | "metadata": {
86 | "collapsed": false
87 | },
88 | "outputs": [],
89 | "source": [
90 | "y = (a * x ** 2) + (b * x) + c"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 7,
96 | "metadata": {
97 | "collapsed": false
98 | },
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | ""
104 | ]
105 | },
106 | "execution_count": 7,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "y"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 40,
118 | "metadata": {
119 | "collapsed": true
120 | },
121 | "outputs": [],
122 | "source": [
123 | "# See here: http://stackoverflow.com/a/38192374/307363\n",
124 | "\n",
125 | "import numpy as np\n",
126 | "from IPython.display import clear_output, Image, display, HTML\n",
127 | "\n",
128 | "def strip_consts(graph_def, max_const_size=32):\n",
129 | " \"\"\"Strip large constant values from graph_def.\"\"\"\n",
130 | " strip_def = tf.GraphDef()\n",
131 | " for n0 in graph_def.node:\n",
132 | " n = strip_def.node.add() \n",
133 | " n.MergeFrom(n0)\n",
134 | " if n.op == 'Const':\n",
135 | " tensor = n.attr['value'].tensor\n",
136 | " size = len(tensor.tensor_content)\n",
137 | " if size > max_const_size:\n",
138 |     "                tensor.tensor_content = \"<stripped %d bytes>\"%size\n",
139 | " return strip_def\n",
140 | "\n",
141 | "def show_graph(graph_def, max_const_size=32):\n",
142 | " \"\"\"Visualize TensorFlow graph.\"\"\"\n",
143 | " if hasattr(graph_def, 'as_graph_def'):\n",
144 | " graph_def = graph_def.as_graph_def()\n",
145 | " strip_def = strip_consts(graph_def, max_const_size=max_const_size)\n",
146 |     "    code = \"\"\"\n",
147 |     "        <script>\n",
148 |     "          function load() {{\n",
149 |     "            document.getElementById(\"{id}\").pbtxt = {data};\n",
150 |     "          }}\n",
151 |     "        </script>\n",
152 |     "        <link rel=\"import\" href=\"https://tensorboard.appspot.com/tf-graph-basic.build.html\" onload=load()>\n",
153 |     "        <div style=\"height:600px\">\n",
154 |     "          <tf-graph-basic id=\"{id}\"></tf-graph-basic>\n",
155 |     "        </div>\n",
156 |     "    \"\"\".format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))\n",
157 |     "\n",
158 |     "    iframe = \"\"\"\n",
159 |     "        <iframe seamless style=\"width:1200px;height:620px;border:0\" srcdoc=\"{}\"></iframe>\n",
160 |     "    \"\"\".format(code.replace('\"', '&quot;'))\n",
161 |     "    display(HTML(iframe))"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 27,
167 | "metadata": {
168 | "collapsed": false,
169 | "scrolled": false
170 | },
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/html": [
175 | "\n",
176 | " \n",
187 | " "
188 | ],
189 | "text/plain": [
190 |        "<IPython.core.display.HTML object>"
191 | ]
192 | },
193 | "metadata": {},
194 | "output_type": "display_data"
195 | }
196 | ],
197 | "source": [
198 | "show_graph(tf.get_default_graph().as_graph_def())"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 10,
204 | "metadata": {
205 | "collapsed": true
206 | },
207 | "outputs": [],
208 | "source": [
209 | "tf.global_variables_initializer?"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 12,
215 | "metadata": {
216 | "collapsed": false
217 | },
218 | "outputs": [
219 | {
220 | "name": "stdout",
221 | "output_type": "stream",
222 | "text": [
223 | "3.0\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "model = tf.global_variables_initializer()\n",
229 | "with tf.Session() as session:\n",
230 | " session.run(model)\n",
231 | " result = session.run(y)\n",
232 | "print(result)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 33,
238 | "metadata": {
239 | "collapsed": true
240 | },
241 | "outputs": [],
242 | "source": []
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 14,
247 | "metadata": {
248 | "collapsed": false
249 | },
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "150.5\n"
256 | ]
257 | }
258 | ],
259 | "source": [
260 | "model = tf.global_variables_initializer()\n",
261 | "with tf.Session() as session:\n",
262 | " session.run(model)\n",
263 | " result = session.run(y, {x: 5})\n",
264 | "print(result)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 16,
270 | "metadata": {
271 | "collapsed": false
272 | },
273 | "outputs": [
274 | {
275 | "name": "stdout",
276 | "output_type": "stream",
277 | "text": [
278 | "548.0\n"
279 | ]
280 | }
281 | ],
282 | "source": [
283 | "model = tf.global_variables_initializer()\n",
284 | "with tf.Session() as session:\n",
285 | " session.run(model)\n",
286 | " session.run(x.assign(10))\n",
287 | " result = session.run(y)\n",
288 | "print(result)"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {
295 | "collapsed": true
296 | },
297 | "outputs": [],
298 | "source": []
299 | }
300 | ],
301 | "metadata": {
302 | "kernelspec": {
303 | "display_name": "Python 3",
304 | "language": "python",
305 | "name": "python3"
306 | },
307 | "language_info": {
308 | "codemirror_mode": {
309 | "name": "ipython",
310 | "version": 3
311 | },
312 | "file_extension": ".py",
313 | "mimetype": "text/x-python",
314 | "name": "python",
315 | "nbconvert_exporter": "python",
316 | "pygments_lexer": "ipython3",
317 | "version": "3.5.2"
318 | }
319 | },
320 | "nbformat": 4,
321 | "nbformat_minor": 0
322 | }
323 |
--------------------------------------------------------------------------------
/Chapter12/.ipynb_checkpoints/Chapter 12 (NB Predict)-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import os\n",
12 | "import re\n",
13 | "import numpy as np\n",
14 | "from collections import defaultdict\n",
15 | "from operator import itemgetter"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {
22 | "collapsed": false
23 | },
24 | "outputs": [],
25 | "source": [
26 | "word_search_re = re.compile(r\"[\\w']+\")"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [],
36 | "source": [
37 | "def load_model(model_filename):\n",
38 | " model = defaultdict(lambda: defaultdict(float))\n",
39 | " with open(model_filename) as inf:\n",
40 | " for line in inf:\n",
41 | " word, values = line.split(maxsplit=1)\n",
42 | " word = eval(word)\n",
43 | " values = eval(values)\n",
44 | " model[word] = values\n",
45 | " return model"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [],
55 | "source": [
56 | "model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"part-00000\")\n",
57 | "model = load_model(model_filename)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 5,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "(409.7987003114851, 513.3231594734408)"
71 | ]
72 | },
73 | "execution_count": 5,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "model[\"i\"][\"male\"], model[\"i\"][\"female\"]"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 6,
85 | "metadata": {
86 | "collapsed": false
87 | },
88 | "outputs": [],
89 | "source": [
90 | "def nb_predict(model, document):\n",
91 | " words = word_search_re.findall(document)\n",
92 | " probabilities = defaultdict(lambda : 0)\n",
93 | " for word in set(words):\n",
94 | " probabilities[\"male\"] += np.log(model[word].get(\"male\", 1e-5))\n",
95 | " probabilities[\"female\"] += np.log(model[word].get(\"female\", 1e-5))\n",
96 | " # Now find the most likely gender\n",
97 | " most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)\n",
98 | " return most_likely_genders[0][0]"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 7,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [],
108 | "source": [
109 | "new_post = \"\"\" Every day should be a half day. Took the afternoon off to hit the dentist, and while I was out I managed to get my oil changed, too. Remember that business with my car dealership this winter? Well, consider this the epilogue. The friendly fellas at the Valvoline Instant Oil Change on Snelling were nice enough to notice that my dipstick was broken, and the metal piece was too far down in its little dipstick tube to pull out. Looks like I'm going to need a magnet. Damn you, Kline Nissan, daaaaaaammmnnn yooouuuu.... Today I let my boss know that I've submitted my Corps application. The news has been greeted by everyone in the company with a level of enthusiasm that really floors me. The back deck has finally been cleared off by the construction company working on the place. This company, for anyone who's interested, consists mainly of one guy who spends his days cursing at his crew of Spanish-speaking laborers. Construction of my deck began around the time Nixon was getting out of office.\n",
110 | "\"\"\""
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 8,
116 | "metadata": {
117 | "collapsed": false
118 | },
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "'male'"
124 | ]
125 | },
126 | "execution_count": 8,
127 | "metadata": {},
128 | "output_type": "execute_result"
129 | }
130 | ],
131 | "source": [
132 | "nb_predict(model, new_post)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 9,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [],
142 | "source": [
143 | "testing_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"blogposts_testing\")\n",
144 | "testing_filenames = []\n",
145 | "for filename in os.listdir(testing_folder):\n",
146 | " testing_filenames.append(os.path.join(testing_folder, filename))"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 10,
152 | "metadata": {
153 | "collapsed": false
154 | },
155 | "outputs": [],
156 | "source": [
157 | "def nb_predict_many(model, input_filename):\n",
158 | " with open(input_filename) as inf:\n",
159 |     "        # each line holds a gender label followed by the repr of one blog post\n",
160 | " for line in inf:\n",
161 | " tokens = line.split()\n",
162 | " actual_gender = eval(tokens[0])\n",
163 | " blog_post = eval(\" \".join(tokens[1:]))\n",
164 | " yield actual_gender, nb_predict(model, blog_post)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 11,
170 | "metadata": {
171 | "collapsed": false
172 | },
173 | "outputs": [],
174 | "source": [
175 | "def nb_predict(model, document):\n",
176 | " words = word_search_re.findall(document)\n",
177 | " probabilities = defaultdict(lambda : 1)\n",
178 | " for word in set(words):\n",
179 | " probabilities[\"male\"] += np.log(model[word].get(\"male\", 1e-15))\n",
180 | " probabilities[\"female\"] += np.log(model[word].get(\"female\", 1e-15))\n",
181 | " # Now find the most likely gender\n",
182 | " most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)\n",
183 | " return most_likely_genders"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 13,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "y_true = []\n",
195 | "y_pred = []\n",
196 | "for testing_filename in testing_filenames:\n",
197 | " for actual_gender, ratios in nb_predict_many(model, testing_filename):\n",
198 | " predicted_gender = ratios[0][0]\n",
199 | " y_true.append(actual_gender == \"female\")\n",
200 | " y_pred.append(predicted_gender == \"female\")\n",
201 | "y_true = np.array(y_true, dtype='int')\n",
202 | "y_pred = np.array(y_pred, dtype='int')"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 14,
208 | "metadata": {
209 | "collapsed": false
210 | },
211 | "outputs": [
212 | {
213 | "name": "stdout",
214 | "output_type": "stream",
215 | "text": [
216 | "f1=0.5540\n",
217 | "acc=0.5765\n"
218 | ]
219 | }
220 | ],
221 | "source": [
222 | "from sklearn.metrics import f1_score\n",
223 | "print(\"f1={:.4f}\".format(f1_score(y_true, y_pred, pos_label=None)))\n",
224 | "print(\"acc={:.4f}\".format(np.mean(y_true == y_pred)))\n",
225 | " \n"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 15,
231 | "metadata": {
232 | "collapsed": false
233 | },
234 | "outputs": [],
235 | "source": [
236 | "aws_model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"model_aws\")\n",
237 | "aws_model = load_model(aws_model_filename)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 16,
243 | "metadata": {
244 | "collapsed": false
245 | },
246 | "outputs": [],
247 | "source": [
248 | "y_true = []\n",
249 | "y_pred = []\n",
250 | "for testing_filename in testing_filenames:\n",
251 | " for actual_gender, predicted_gender in nb_predict_many(aws_model, testing_filename):\n",
252 | " predicted_gender = ratios[0][0]\n",
253 | " y_true.append(actual_gender == \"female\")\n",
254 | " y_pred.append(predicted_gender == \"female\")\n",
255 | " #print(\"Actual: {0}\\tPredicted: {1}\".format(actual_gender, predicted_gender))\n",
256 | " if len(y_true) > 500:\n",
257 | " break\n",
258 | "y_true = np.array(y_true, dtype='int')\n",
259 | "y_pred = np.array(y_pred, dtype='int')"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 17,
265 | "metadata": {
266 | "collapsed": false
267 | },
268 | "outputs": [
269 | {
270 | "name": "stdout",
271 | "output_type": "stream",
272 | "text": [
273 | "f1=0.8144\n",
274 | "acc=0.8734\n"
275 | ]
276 | },
277 | {
278 | "name": "stderr",
279 | "output_type": "stream",
280 | "text": [
281 | "/usr/local/lib/python3.4/dist-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
282 | " 'precision', 'predicted', average, warn_for)\n"
283 | ]
284 | }
285 | ],
286 | "source": [
287 | "print(\"f1={:.4f}\".format(f1_score(y_true, y_pred, pos_label=None)))\n",
288 | "print(\"acc={:.4f}\".format(np.mean(y_true == y_pred)))"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 18,
294 | "metadata": {
295 | "collapsed": false
296 | },
297 | "outputs": [
298 | {
299 | "name": "stdout",
300 | "output_type": "stream",
301 | "text": [
302 | "[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]\n"
303 | ]
304 | }
305 | ],
306 | "source": [
307 | "print(list(zip(y_true, y_pred))[:10])"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 19,
313 | "metadata": {
314 | "collapsed": false
315 | },
316 | "outputs": [],
317 | "source": [
318 | "from sklearn.metrics import confusion_matrix"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 20,
324 | "metadata": {
325 | "collapsed": false
326 | },
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "array([[614, 0],\n",
332 | " [ 89, 0]])"
333 | ]
334 | },
335 | "execution_count": 20,
336 | "metadata": {},
337 | "output_type": "execute_result"
338 | }
339 | ],
340 | "source": [
341 | "confusion_matrix(y_true, y_pred)"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "metadata": {
348 | "collapsed": false
349 | },
350 | "outputs": [],
351 | "source": []
352 | }
353 | ],
354 | "metadata": {
355 | "kernelspec": {
356 | "display_name": "Python 3",
357 | "language": "python",
358 | "name": "python3"
359 | },
360 | "language_info": {
361 | "codemirror_mode": {
362 | "name": "ipython",
363 | "version": 3
364 | },
365 | "file_extension": ".py",
366 | "mimetype": "text/x-python",
367 | "name": "python",
368 | "nbconvert_exporter": "python",
369 | "pygments_lexer": "ipython3",
370 | "version": "3.5.2"
371 | }
372 | },
373 | "nbformat": 4,
374 | "nbformat_minor": 0
375 | }
376 |
--------------------------------------------------------------------------------
/Chapter12/Chapter 12 (NB Predict).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import os\n",
12 | "import re\n",
13 | "import numpy as np\n",
14 | "from collections import defaultdict\n",
15 | "from operator import itemgetter"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {
22 | "collapsed": false
23 | },
24 | "outputs": [],
25 | "source": [
26 | "word_search_re = re.compile(r\"[\\w']+\")"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [],
36 | "source": [
37 | "def load_model(model_filename):\n",
38 | " model = defaultdict(lambda: defaultdict(float))\n",
39 | " with open(model_filename) as inf:\n",
40 | " for line in inf:\n",
41 | " word, values = line.split(maxsplit=1)\n",
42 | " word = eval(word)\n",
43 | " values = eval(values)\n",
44 | " model[word] = values\n",
45 | " return model"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [],
55 | "source": [
56 | "model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"part-00000\")\n",
57 | "model = load_model(model_filename)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 5,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "(409.7987003114851, 513.3231594734408)"
71 | ]
72 | },
73 | "execution_count": 5,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "model[\"i\"][\"male\"], model[\"i\"][\"female\"]"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 6,
85 | "metadata": {
86 | "collapsed": false
87 | },
88 | "outputs": [],
89 | "source": [
90 | "def nb_predict(model, document):\n",
91 | " words = word_search_re.findall(document)\n",
92 | " probabilities = defaultdict(lambda : 0)\n",
93 | " for word in set(words):\n",
94 | " probabilities[\"male\"] += np.log(model[word].get(\"male\", 1e-5))\n",
95 | " probabilities[\"female\"] += np.log(model[word].get(\"female\", 1e-5))\n",
96 | " # Now find the most likely gender\n",
97 | " most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)\n",
98 | " return most_likely_genders[0][0]"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 7,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [],
108 | "source": [
109 | "new_post = \"\"\" Every day should be a half day. Took the afternoon off to hit the dentist, and while I was out I managed to get my oil changed, too. Remember that business with my car dealership this winter? Well, consider this the epilogue. The friendly fellas at the Valvoline Instant Oil Change on Snelling were nice enough to notice that my dipstick was broken, and the metal piece was too far down in its little dipstick tube to pull out. Looks like I'm going to need a magnet. Damn you, Kline Nissan, daaaaaaammmnnn yooouuuu.... Today I let my boss know that I've submitted my Corps application. The news has been greeted by everyone in the company with a level of enthusiasm that really floors me. The back deck has finally been cleared off by the construction company working on the place. This company, for anyone who's interested, consists mainly of one guy who spends his days cursing at his crew of Spanish-speaking laborers. Construction of my deck began around the time Nixon was getting out of office.\n",
110 | "\"\"\""
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 8,
116 | "metadata": {
117 | "collapsed": false
118 | },
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "'male'"
124 | ]
125 | },
126 | "execution_count": 8,
127 | "metadata": {},
128 | "output_type": "execute_result"
129 | }
130 | ],
131 | "source": [
132 | "nb_predict(model, new_post)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 9,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [],
142 | "source": [
143 | "testing_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"blogposts_testing\")\n",
144 | "testing_filenames = []\n",
145 | "for filename in os.listdir(testing_folder):\n",
146 | " testing_filenames.append(os.path.join(testing_folder, filename))"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 10,
152 | "metadata": {
153 | "collapsed": false
154 | },
155 | "outputs": [],
156 | "source": [
157 | "def nb_predict_many(model, input_filename):\n",
158 | " with open(input_filename) as inf:\n",
159 |     "        # each line holds a gender label followed by the repr of one blog post\n",
160 | " for line in inf:\n",
161 | " tokens = line.split()\n",
162 | " actual_gender = eval(tokens[0])\n",
163 | " blog_post = eval(\" \".join(tokens[1:]))\n",
164 | " yield actual_gender, nb_predict(model, blog_post)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 11,
170 | "metadata": {
171 | "collapsed": false
172 | },
173 | "outputs": [],
174 | "source": [
175 | "def nb_predict(model, document):\n",
176 | " words = word_search_re.findall(document)\n",
177 | " probabilities = defaultdict(lambda : 1)\n",
178 | " for word in set(words):\n",
179 | " probabilities[\"male\"] += np.log(model[word].get(\"male\", 1e-15))\n",
180 | " probabilities[\"female\"] += np.log(model[word].get(\"female\", 1e-15))\n",
181 | " # Now find the most likely gender\n",
182 | " most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)\n",
183 | " return most_likely_genders"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 13,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "y_true = []\n",
195 | "y_pred = []\n",
196 | "for testing_filename in testing_filenames:\n",
197 | " for actual_gender, ratios in nb_predict_many(model, testing_filename):\n",
198 | " predicted_gender = ratios[0][0]\n",
199 | " y_true.append(actual_gender == \"female\")\n",
200 | " y_pred.append(predicted_gender == \"female\")\n",
201 | "y_true = np.array(y_true, dtype='int')\n",
202 | "y_pred = np.array(y_pred, dtype='int')"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 14,
208 | "metadata": {
209 | "collapsed": false
210 | },
211 | "outputs": [
212 | {
213 | "name": "stdout",
214 | "output_type": "stream",
215 | "text": [
216 | "f1=0.5540\n",
217 | "acc=0.5765\n"
218 | ]
219 | }
220 | ],
221 | "source": [
222 | "from sklearn.metrics import f1_score\n",
223 | "print(\"f1={:.4f}\".format(f1_score(y_true, y_pred, pos_label=None)))\n",
224 | "print(\"acc={:.4f}\".format(np.mean(y_true == y_pred)))\n",
225 | " \n"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 15,
231 | "metadata": {
232 | "collapsed": false
233 | },
234 | "outputs": [],
235 | "source": [
236 | "aws_model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"model_aws\")\n",
237 | "aws_model = load_model(aws_model_filename)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 16,
243 | "metadata": {
244 | "collapsed": false
245 | },
246 | "outputs": [],
247 | "source": [
248 | "y_true = []\n",
249 | "y_pred = []\n",
250 | "for testing_filename in testing_filenames:\n",
251 |     "    for actual_gender, ratios in nb_predict_many(aws_model, testing_filename):\n",
252 | " predicted_gender = ratios[0][0]\n",
253 | " y_true.append(actual_gender == \"female\")\n",
254 | " y_pred.append(predicted_gender == \"female\")\n",
255 | " #print(\"Actual: {0}\\tPredicted: {1}\".format(actual_gender, predicted_gender))\n",
256 | " if len(y_true) > 500:\n",
257 | " break\n",
258 | "y_true = np.array(y_true, dtype='int')\n",
259 | "y_pred = np.array(y_pred, dtype='int')"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 17,
265 | "metadata": {
266 | "collapsed": false
267 | },
268 | "outputs": [
269 | {
270 | "name": "stdout",
271 | "output_type": "stream",
272 | "text": [
273 | "f1=0.8144\n",
274 | "acc=0.8734\n"
275 | ]
276 | },
277 | {
278 | "name": "stderr",
279 | "output_type": "stream",
280 | "text": [
281 | "/usr/local/lib/python3.4/dist-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
282 | " 'precision', 'predicted', average, warn_for)\n"
283 | ]
284 | }
285 | ],
286 | "source": [
287 | "print(\"f1={:.4f}\".format(f1_score(y_true, y_pred, pos_label=None)))\n",
288 | "print(\"acc={:.4f}\".format(np.mean(y_true == y_pred)))"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 18,
294 | "metadata": {
295 | "collapsed": false
296 | },
297 | "outputs": [
298 | {
299 | "name": "stdout",
300 | "output_type": "stream",
301 | "text": [
302 | "[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]\n"
303 | ]
304 | }
305 | ],
306 | "source": [
307 | "print(list(zip(y_true, y_pred))[:10])"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 19,
313 | "metadata": {
314 | "collapsed": false
315 | },
316 | "outputs": [],
317 | "source": [
318 | "from sklearn.metrics import confusion_matrix"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 20,
324 | "metadata": {
325 | "collapsed": false
326 | },
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "array([[614, 0],\n",
332 | " [ 89, 0]])"
333 | ]
334 | },
335 | "execution_count": 20,
336 | "metadata": {},
337 | "output_type": "execute_result"
338 | }
339 | ],
340 | "source": [
341 | "confusion_matrix(y_true, y_pred)"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "metadata": {
348 | "collapsed": false
349 | },
350 | "outputs": [],
351 | "source": []
352 | }
353 | ],
354 | "metadata": {
355 | "kernelspec": {
356 | "display_name": "Python 3",
357 | "language": "python",
358 | "name": "python3"
359 | },
360 | "language_info": {
361 | "codemirror_mode": {
362 | "name": "ipython",
363 | "version": 3
364 | },
365 | "file_extension": ".py",
366 | "mimetype": "text/x-python",
367 | "name": "python",
368 | "nbconvert_exporter": "python",
369 | "pygments_lexer": "ipython3",
370 | "version": "3.5.2"
371 | }
372 | },
373 | "nbformat": 4,
374 | "nbformat_minor": 0
375 | }
376 |
--------------------------------------------------------------------------------
/Chapter12/Chapter 12 (Test load).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": ""
4 | },
5 | "nbformat": 3,
6 | "nbformat_minor": 0,
7 | "worksheets": [
8 | {
9 | "cells": [
10 | {
11 | "cell_type": "code",
12 | "collapsed": false,
13 | "input": [
14 | "import os\n",
15 | "filename = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"blogs\", \"1005545.male.25.Engineering.Sagittarius.xml\")"
16 | ],
17 | "language": "python",
18 | "metadata": {},
19 | "outputs": [],
20 | "prompt_number": 1
21 | },
22 | {
23 | "cell_type": "code",
24 | "collapsed": false,
25 | "input": [
26 | "all_posts = []\n",
27 | "with open(filename) as inf:\n",
28 | " # remove leading and trailing whitespace\n",
29 | " post_start = False\n",
30 | " post = []\n",
31 | " for line in inf:\n",
32 | " line = line.strip()\n",
33 |     "        if line == \"<post>\":\n",
34 | " post_start = True\n",
35 |     "        elif line == \"</post>\":\n",
36 | " post_start = False\n",
37 | " all_posts.append(\"\\n\".join(post))\n",
38 | " post = []\n",
39 | " elif post_start:\n",
40 | " post.append(line)"
41 | ],
42 | "language": "python",
43 | "metadata": {},
44 | "outputs": [],
45 | "prompt_number": 3
46 | },
47 | {
48 | "cell_type": "code",
49 | "collapsed": false,
50 | "input": [
51 | "len(all_posts)"
52 | ],
53 | "language": "python",
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "metadata": {},
58 | "output_type": "pyout",
59 | "prompt_number": 4,
60 | "text": [
61 | "80"
62 | ]
63 | }
64 | ],
65 | "prompt_number": 4
66 | },
67 | {
68 | "cell_type": "code",
69 | "collapsed": false,
70 | "input": [],
71 | "language": "python",
72 | "metadata": {},
73 | "outputs": []
74 | }
75 | ],
76 | "metadata": {}
77 | }
78 | ]
79 | }
--------------------------------------------------------------------------------
/Chapter12/extract_posts.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from mrjob.job import MRJob
4 | from mrjob.step import MRStep
5 |
6 | word_search_re = re.compile(r"[\w']+")
7 |
8 |
9 | class ExtractPosts(MRJob):
10 |
11 | post_start = False
12 | post = []
13 |
14 | def mapper(self, key, line):
15 | filename = os.environ["map_input_file"]
16 | gender = filename.split(".")[1]
17 |         try:
18 |             docnum = int(filename[0])
19 |         except ValueError:
20 |             docnum = 8
21 | # remove leading and trailing whitespace
22 | line = line.strip()
23 |         if line == "<post>":
24 |             self.post_start = True
25 |         elif line == "</post>":
26 |             self.post_start = False
27 | yield gender, repr("\n".join(self.post))
28 | self.post = []
29 | elif self.post_start:
30 | self.post.append(line)
31 |
32 |
33 |
34 | if __name__ == '__main__':
35 | ExtractPosts.run()
36 |
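For reference, a minimal sketch of how an mrjob script such as this one is typically launched. The paths are illustrative (a local copy of the blog corpus under ~/Data/blogs, and a hypothetical S3 bucket named my-bucket), and depending on the runner and mrjob version, the map_input_file variable read by the mapper may be exposed under a slightly different name:

```
# Run locally, writing the (gender, post) pairs to a directory instead of stdout
python extract_posts.py ~/Data/blogs/* --output-dir=blog_posts --no-output

# Run the same job on Amazon EMR (requires AWS credentials in mrjob's configuration)
python extract_posts.py -r emr s3://my-bucket/blogs/ --output-dir=s3://my-bucket/blog_posts --no-output
```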
--------------------------------------------------------------------------------
/Chapter12/nb_predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import numpy as np
4 | from mrjob.job import MRJob
5 | from mrjob.step import MRStep
6 | from operator import itemgetter
7 |
8 | word_search_re = re.compile(r"[\w']+")
9 |
10 |
11 | class NaiveBayesTrainer(MRJob):
12 |
13 | def steps(self):
14 | return [
15 | MRStep(mapper=self.extract_words_mapping,
16 | reducer=self.reducer_count_words),
17 |             MRStep(reducer=self.predict_mapper),
18 | ]
19 |
20 |     def extract_words_mapping(self, key, value):
21 |         # Each input line comes from extract_posts.py as
22 |         # "<repr'd gender> <repr'd blog post>"; eval() undoes the repr().
23 |         tokens = value.split()
24 |         gender = eval(tokens[0])
25 |         blog_post = eval(" ".join(tokens[1:]))
26 |         all_words = word_search_re.findall(blog_post)
27 |         all_words = [word.lower() for word in all_words]
28 |         for word in all_words:
29 |             # Emit one count per occurrence of the word for this gender;
30 |             # reducer_count_words sums these into per-gender totals.
31 |             yield (gender, word), 1
32 |
33 | def reducer_count_words(self, key, counts):
34 | s = sum(counts)
35 |         gender, word = key
36 | yield word, (gender, s)
37 |
38 |     def predict_mapper(self, word, values):
39 |         # Collect the per-gender counts for each word into a single dict.
40 |         per_gender = {}
41 |         for gender, s in values:
42 |             per_gender[gender] = s
43 |         yield word, per_gender
44 |
45 | def ratio_mapper(self, word, value):
46 | counts = dict(value)
47 |         sum_of_counts = float(np.mean(list(counts.values())))
48 | maximum_score = max(counts.items(), key=itemgetter(1))
49 | current_ratio = maximum_score[1] / sum_of_counts
50 | yield None, (word, sum_of_counts, value)
51 |
52 | def sorter_reducer(self, key, values):
53 | ranked_list = sorted(values, key=itemgetter(1), reverse=True)
54 | n_printed = 0
55 | for word, sum_of_counts, scores in ranked_list:
56 | if n_printed < 20:
57 | print((n_printed + 1), word, scores)
58 | n_printed += 1
59 | yield word, dict(scores)
60 |
61 | if __name__ == '__main__':
62 | NaiveBayesTrainer.run()
63 |
--------------------------------------------------------------------------------
/Chapter12/nb_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import numpy as np
4 | from mrjob.job import MRJob
5 | from mrjob.step import MRStep
6 | from operator import itemgetter
7 |
8 | word_search_re = re.compile(r"[\w']+")
9 |
10 |
11 | class NaiveBayesTrainer(MRJob):
12 |
13 | def steps(self):
14 | return [
15 | MRStep(mapper=self.extract_words_mapping,
16 | reducer=self.reducer_count_words),
17 | MRStep(reducer=self.compare_words_reducer),
18 | ]
19 |
20 |     def extract_words_mapping(self, key, value):
21 |         # Each input line comes from extract_posts.py as
22 |         # "<repr'd gender> <repr'd blog post>"; eval() undoes the repr().
23 |         tokens = value.split()
24 |         gender = eval(tokens[0])
25 |         blog_post = eval(" ".join(tokens[1:]))
26 |         all_words = word_search_re.findall(blog_post)
27 |         all_words = [word.lower() for word in all_words]
28 |         for word in all_words:
29 |             # Occurrence probability: each occurrence contributes 1 / (words in
30 |             # this post), so the reducer's sum is a per-gender word frequency.
31 |             yield (gender, word), 1. / len(all_words)
32 |
33 | def reducer_count_words(self, key, counts):
34 | s = sum(counts)
35 |         gender, word = key
36 | yield word, (gender, s)
37 |
38 | def compare_words_reducer(self, word, values):
39 | per_gender = {}
40 | for value in values:
41 | gender, s = value
42 | per_gender[gender] = s
43 | yield word, per_gender
44 |
45 | def ratio_mapper(self, word, value):
46 | counts = dict(value)
47 |         sum_of_counts = float(np.mean(list(counts.values())))
48 | maximum_score = max(counts.items(), key=itemgetter(1))
49 | current_ratio = maximum_score[1] / sum_of_counts
50 | yield None, (word, sum_of_counts, value)
51 |
52 | def sorter_reducer(self, key, values):
53 | ranked_list = sorted(values, key=itemgetter(1), reverse=True)
54 | n_printed = 0
55 | for word, sum_of_counts, scores in ranked_list:
56 | if n_printed < 20:
57 | print((n_printed + 1), word, scores)
58 | n_printed += 1
59 | yield word, dict(scores)
60 |
61 | if __name__ == '__main__':
62 | NaiveBayesTrainer.run()
63 |
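The model that nb_train.py emits maps each word to a dictionary of per-gender scores. As a rough sketch (not the book's exact prediction script), that output could be loaded and used for a naive-Bayes-style prediction along the following lines; it assumes mrjob's default JSON output protocol (a JSON-encoded word, a tab, then a JSON object of per-gender scores), and the function names and smoothing constant are purely illustrative:

```python
import json
import math
import re
from collections import defaultdict

word_search_re = re.compile(r"[\w']+")


def load_model(model_filename):
    # One line per word, e.g.:  "hello"\t{"male": 0.0123, "female": 0.0098}
    model = defaultdict(dict)
    with open(model_filename) as inf:
        for line in inf:
            word, scores = line.rstrip("\n").split("\t")
            model[json.loads(word)] = json.loads(scores)
    return model


def predict_gender(model, post, genders=("male", "female")):
    # Naive Bayes in log space: sum log P(word | gender) over the post's words
    # and return the gender with the highest total score.
    words = [word.lower() for word in word_search_re.findall(post)]
    scores = {}
    for gender in genders:
        score = 0.0
        for word in words:
            probability = model.get(word, {}).get(gender, 1e-8)  # crude smoothing
            score += math.log(probability)
        scores[gender] = score
    return max(scores, key=scores.get)
```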
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Learning Data Mining with Python - Second Edition
2 | This is the code repository for [Learning Data Mining with Python - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/learning-data-mining-python-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781787126787), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish.
3 | ## About the Book
4 | This book teaches you to design and develop data mining applications using a variety of datasets, starting with basic classification and affinity analysis. This book covers a large number of libraries available in Python, including the Jupyter Notebook, pandas, scikit-learn, and NLTK.
5 | ## Instructions and Navigation
6 | All of the code is organized into folders, one per chapter of the book. For example, Chapter02.
7 |
8 |
9 |
10 | The code will look like the following:
11 | ```
12 | import numpy as np
13 | dataset_filename = "affinity_dataset.txt"
14 | X = np.loadtxt(dataset_filename)
15 | ```
16 |
17 | It should come as no surprise that you’ll need a computer, or access to one, to complete the
18 | book. The computer should be reasonably modern, but it doesn’t need to be overpowered.
19 | Any modern processor (from about 2010 onwards) and 4 gigabytes of RAM will suffice, and
20 | you can probably run almost all of the code on a slower system too.
21 | The exception here is the final two chapters. In these chapters, I step through using
22 | Amazon Web Services (AWS) to run the code. This will probably cost you some
23 | money, but the advantage is less system setup than running the code locally. If you don’t
24 | want to pay for those services, the tools used can all be set up on a local computer, but you
25 | will definitely need a modern system to run them. A processor built in 2012 or later and more
26 | than 4 GB of RAM are necessary.
27 | I recommend the Ubuntu operating system, but the code should also work well on Windows,
28 | macOS, and other Linux distributions. You may need to consult the documentation for your
29 | system to get some things installed, though.
30 | In this book, I use pip, a command-line tool for installing Python libraries, to install the
31 | packages that the code depends on.
32 | Another option is to use Anaconda, which can be found online here:
33 | http://continuum.io/downloads
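As a rough guide (the exact set of packages varies by chapter, and the versions available when the book was written may differ from current releases), the main libraries used in the book can be installed with a command along these lines:

```
pip install jupyter numpy pandas scikit-learn nltk mrjob
```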
34 | I have tested all of the code using Python 3. Most of the code examples also work on Python 2
35 | with no changes. If you run into any problems and can’t get around them, send an email and
36 | we can offer a solution.
37 |
38 | ## Related Products
39 | * [Mastering Data Mining with Python - Find patterns hidden in your data](https://www.packtpub.com/big-data-and-business-intelligence/mastering-data-mining-python-–-find-patterns-hidden-your-data?utm_source=github&utm_medium=repository&utm_campaign=9781785889950)
40 |
41 | * [Learning Data Mining with R](https://www.packtpub.com/big-data-and-business-intelligence/learning-data-mining-r?utm_source=github&utm_medium=repository&utm_campaign=9781783982103)
42 |
43 | * [Python: Data Analytics and Visualization](https://www.packtpub.com/big-data-and-business-intelligence/python-data-analytics-and-visualization?utm_source=github&utm_medium=repository&utm_campaign=9781788290098)
44 |
45 | ### Suggestions and Feedback
46 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions.
47 |
--------------------------------------------------------------------------------