├── Chapter2
│   ├── .ipynb_checkpoints
│   │   └── Chapter2-checkpoint.ipynb
│   └── Chapter2.ipynb
├── Chapter3
│   ├── .ipynb_checkpoints
│   │   ├── Knowledge Recommender-checkpoint.ipynb
│   │   └── Simple Recommender-checkpoint.ipynb
│   ├── Knowledge Recommender.ipynb
│   └── Simple Recommender.ipynb
├── Chapter4
│   ├── .ipynb_checkpoints
│   │   └── Content Based Recommenders-checkpoint.ipynb
│   └── Content Based Recommenders.ipynb
├── Chapter5
│   ├── .ipynb_checkpoints
│   │   └── Data Mining-checkpoint.ipynb
│   └── Data Mining.ipynb
├── Chapter6
│   ├── .ipynb_checkpoints
│   │   └── Collaborative Filtering-checkpoint.ipynb
│   └── Collaborative Filtering.ipynb
├── Chapter7
│   ├── .ipynb_checkpoints
│   │   └── Hybrid Recommender-checkpoint.ipynb
│   └── Hybrid Recommender.ipynb
├── LICENSE
└── README.md
/Chapter2/.ipynb_checkpoints/Chapter2-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/Chapter3/.ipynb_checkpoints/Knowledge Recommender-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
13 | " interactivity=interactivity, compiler=compiler, result=result)\n"
14 | ]
15 | },
16 | {
17 | "data": {
18 | "text/plain": [
19 | "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n",
20 | " 'imdb_id', 'original_language', 'original_title', 'overview',\n",
21 | " 'popularity', 'poster_path', 'production_companies',\n",
22 | " 'production_countries', 'release_date', 'revenue', 'runtime',\n",
23 | " 'spoken_languages', 'status', 'tagline', 'title', 'video',\n",
24 | " 'vote_average', 'vote_count'],\n",
25 | " dtype='object')"
26 | ]
27 | },
28 | "execution_count": 1,
29 | "metadata": {},
30 | "output_type": "execute_result"
31 | }
32 | ],
33 | "source": [
34 | "import pandas as pd\n",
35 | "import numpy as np\n",
36 | "\n",
37 | "df = pd.read_csv('../data/movies_metadata.csv')\n",
38 | "\n",
39 | "#Print all the features (or columns) of the DataFrame\n",
40 | "df.columns"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "
\n",
52 | "\n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " | \n",
69 | " title | \n",
70 | " genres | \n",
71 | " release_date | \n",
72 | " runtime | \n",
73 | " vote_average | \n",
74 | " vote_count | \n",
75 | "
\n",
76 | " \n",
77 | " \n",
78 | " \n",
79 | " 0 | \n",
80 | " Toy Story | \n",
81 | " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | \n",
82 | " 1995-10-30 | \n",
83 | " 81.0 | \n",
84 | " 7.7 | \n",
85 | " 5415.0 | \n",
86 | "
\n",
87 | " \n",
88 | " 1 | \n",
89 | " Jumanji | \n",
90 | " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | \n",
91 | " 1995-12-15 | \n",
92 | " 104.0 | \n",
93 | " 6.9 | \n",
94 | " 2413.0 | \n",
95 | "
\n",
96 | " \n",
97 | " 2 | \n",
98 | " Grumpier Old Men | \n",
99 | " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | \n",
100 | " 1995-12-22 | \n",
101 | " 101.0 | \n",
102 | " 6.5 | \n",
103 | " 92.0 | \n",
104 | "
\n",
105 | " \n",
106 | " 3 | \n",
107 | " Waiting to Exhale | \n",
108 | " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | \n",
109 | " 1995-12-22 | \n",
110 | " 127.0 | \n",
111 | " 6.1 | \n",
112 | " 34.0 | \n",
113 | "
\n",
114 | " \n",
115 | " 4 | \n",
116 | " Father of the Bride Part II | \n",
117 | " [{'id': 35, 'name': 'Comedy'}] | \n",
118 | " 1995-02-10 | \n",
119 | " 106.0 | \n",
120 | " 5.7 | \n",
121 | " 173.0 | \n",
122 | "
\n",
123 | " \n",
124 | "
\n",
125 | "
"
126 | ],
127 | "text/plain": [
128 | " title \\\n",
129 | "0 Toy Story \n",
130 | "1 Jumanji \n",
131 | "2 Grumpier Old Men \n",
132 | "3 Waiting to Exhale \n",
133 | "4 Father of the Bride Part II \n",
134 | "\n",
135 | " genres release_date runtime \\\n",
136 | "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 1995-10-30 81.0 \n",
137 | "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 1995-12-15 104.0 \n",
138 | "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 1995-12-22 101.0 \n",
139 | "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 1995-12-22 127.0 \n",
140 | "4 [{'id': 35, 'name': 'Comedy'}] 1995-02-10 106.0 \n",
141 | "\n",
142 | " vote_average vote_count \n",
143 | "0 7.7 5415.0 \n",
144 | "1 6.9 2413.0 \n",
145 | "2 6.5 92.0 \n",
146 | "3 6.1 34.0 \n",
147 | "4 5.7 173.0 "
148 | ]
149 | },
150 | "execution_count": 2,
151 | "metadata": {},
152 | "output_type": "execute_result"
153 | }
154 | ],
155 | "source": [
156 | "#Only keep those features that we require \n",
157 | "df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]\n",
158 | "\n",
159 | "df.head()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 3,
165 | "metadata": {
166 | "collapsed": true
167 | },
168 | "outputs": [],
169 | "source": [
170 | "#Convert release_date into pandas datetime format\n",
171 | "df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
172 | "\n",
173 | "#Extract year from the datetime\n",
174 | "df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 4,
180 | "metadata": {
181 | "collapsed": true
182 | },
183 | "outputs": [],
184 | "source": [
185 | "#Helper function to convert NaT to 0 and all other years to integers.\n",
186 | "def convert_int(x):\n",
187 | " try:\n",
188 | " return int(x)\n",
189 | " except:\n",
190 | " return 0"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 5,
196 | "metadata": {
197 | "collapsed": true
198 | },
199 | "outputs": [],
200 | "source": [
201 | "#Apply convert_int to the year feature\n",
202 | "df['year'] = df['year'].apply(convert_int)"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 6,
208 | "metadata": {},
209 | "outputs": [
210 | {
211 | "data": {
212 | "text/html": [
213 | "\n",
214 | "\n",
227 | "
\n",
228 | " \n",
229 | " \n",
230 | " | \n",
231 | " title | \n",
232 | " genres | \n",
233 | " runtime | \n",
234 | " vote_average | \n",
235 | " vote_count | \n",
236 | " year | \n",
237 | "
\n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " 0 | \n",
242 | " Toy Story | \n",
243 | " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | \n",
244 | " 81.0 | \n",
245 | " 7.7 | \n",
246 | " 5415.0 | \n",
247 | " 1995 | \n",
248 | "
\n",
249 | " \n",
250 | " 1 | \n",
251 | " Jumanji | \n",
252 | " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | \n",
253 | " 104.0 | \n",
254 | " 6.9 | \n",
255 | " 2413.0 | \n",
256 | " 1995 | \n",
257 | "
\n",
258 | " \n",
259 | " 2 | \n",
260 | " Grumpier Old Men | \n",
261 | " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | \n",
262 | " 101.0 | \n",
263 | " 6.5 | \n",
264 | " 92.0 | \n",
265 | " 1995 | \n",
266 | "
\n",
267 | " \n",
268 | " 3 | \n",
269 | " Waiting to Exhale | \n",
270 | " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | \n",
271 | " 127.0 | \n",
272 | " 6.1 | \n",
273 | " 34.0 | \n",
274 | " 1995 | \n",
275 | "
\n",
276 | " \n",
277 | " 4 | \n",
278 | " Father of the Bride Part II | \n",
279 | " [{'id': 35, 'name': 'Comedy'}] | \n",
280 | " 106.0 | \n",
281 | " 5.7 | \n",
282 | " 173.0 | \n",
283 | " 1995 | \n",
284 | "
\n",
285 | " \n",
286 | "
\n",
287 | "
"
288 | ],
289 | "text/plain": [
290 | " title \\\n",
291 | "0 Toy Story \n",
292 | "1 Jumanji \n",
293 | "2 Grumpier Old Men \n",
294 | "3 Waiting to Exhale \n",
295 | "4 Father of the Bride Part II \n",
296 | "\n",
297 | " genres runtime vote_average \\\n",
298 | "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 81.0 7.7 \n",
299 | "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 104.0 6.9 \n",
300 | "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 101.0 6.5 \n",
301 | "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 127.0 6.1 \n",
302 | "4 [{'id': 35, 'name': 'Comedy'}] 106.0 5.7 \n",
303 | "\n",
304 | " vote_count year \n",
305 | "0 5415.0 1995 \n",
306 | "1 2413.0 1995 \n",
307 | "2 92.0 1995 \n",
308 | "3 34.0 1995 \n",
309 | "4 173.0 1995 "
310 | ]
311 | },
312 | "execution_count": 6,
313 | "metadata": {},
314 | "output_type": "execute_result"
315 | }
316 | ],
317 | "source": [
318 | "#Drop the release_date column\n",
319 | "df = df.drop('release_date', axis=1)\n",
320 | "\n",
321 | "#Display the dataframe\n",
322 | "df.head()"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 7,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "data": {
332 | "text/plain": [
333 | "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]\""
334 | ]
335 | },
336 | "execution_count": 7,
337 | "metadata": {},
338 | "output_type": "execute_result"
339 | }
340 | ],
341 | "source": [
342 | "#Print genres of the first movie\n",
343 | "df.iloc[0]['genres']"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 8,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "name": "stdout",
353 | "output_type": "stream",
354 | "text": [
355 | "\n",
356 | "\n"
357 | ]
358 | }
359 | ],
360 | "source": [
361 | "#Import the literal_eval function from ast\n",
362 | "from ast import literal_eval\n",
363 | "\n",
364 | "#Define a stringified list and output its type\n",
365 | "a = \"[1,2,3]\"\n",
366 | "print(type(a))\n",
367 | "\n",
368 | "#Apply literal_eval and output type\n",
369 | "b = literal_eval(a)\n",
370 | "print(type(b))"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 9,
376 | "metadata": {
377 | "collapsed": true
378 | },
379 | "outputs": [],
380 | "source": [
381 | "#Convert all NaN into stringified empty lists\n",
382 | "df['genres'] = df['genres'].fillna('[]')\n",
383 | "\n",
384 | "#Apply literal_eval to convert stringified empty lists to the list object\n",
385 | "df['genres'] = df['genres'].apply(literal_eval)\n",
386 | "\n",
387 | "#Convert list of dictionaries to a list of strings\n",
388 | "df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 10,
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/html": [
399 | "\n",
400 | "\n",
413 | "
\n",
414 | " \n",
415 | " \n",
416 | " | \n",
417 | " title | \n",
418 | " genres | \n",
419 | " runtime | \n",
420 | " vote_average | \n",
421 | " vote_count | \n",
422 | " year | \n",
423 | "
\n",
424 | " \n",
425 | " \n",
426 | " \n",
427 | " 0 | \n",
428 | " Toy Story | \n",
429 | " [animation, comedy, family] | \n",
430 | " 81.0 | \n",
431 | " 7.7 | \n",
432 | " 5415.0 | \n",
433 | " 1995 | \n",
434 | "
\n",
435 | " \n",
436 | " 1 | \n",
437 | " Jumanji | \n",
438 | " [adventure, fantasy, family] | \n",
439 | " 104.0 | \n",
440 | " 6.9 | \n",
441 | " 2413.0 | \n",
442 | " 1995 | \n",
443 | "
\n",
444 | " \n",
445 | " 2 | \n",
446 | " Grumpier Old Men | \n",
447 | " [romance, comedy] | \n",
448 | " 101.0 | \n",
449 | " 6.5 | \n",
450 | " 92.0 | \n",
451 | " 1995 | \n",
452 | "
\n",
453 | " \n",
454 | " 3 | \n",
455 | " Waiting to Exhale | \n",
456 | " [comedy, drama, romance] | \n",
457 | " 127.0 | \n",
458 | " 6.1 | \n",
459 | " 34.0 | \n",
460 | " 1995 | \n",
461 | "
\n",
462 | " \n",
463 | " 4 | \n",
464 | " Father of the Bride Part II | \n",
465 | " [comedy] | \n",
466 | " 106.0 | \n",
467 | " 5.7 | \n",
468 | " 173.0 | \n",
469 | " 1995 | \n",
470 | "
\n",
471 | " \n",
472 | "
\n",
473 | "
"
474 | ],
475 | "text/plain": [
476 | " title genres runtime \\\n",
477 | "0 Toy Story [animation, comedy, family] 81.0 \n",
478 | "1 Jumanji [adventure, fantasy, family] 104.0 \n",
479 | "2 Grumpier Old Men [romance, comedy] 101.0 \n",
480 | "3 Waiting to Exhale [comedy, drama, romance] 127.0 \n",
481 | "4 Father of the Bride Part II [comedy] 106.0 \n",
482 | "\n",
483 | " vote_average vote_count year \n",
484 | "0 7.7 5415.0 1995 \n",
485 | "1 6.9 2413.0 1995 \n",
486 | "2 6.5 92.0 1995 \n",
487 | "3 6.1 34.0 1995 \n",
488 | "4 5.7 173.0 1995 "
489 | ]
490 | },
491 | "execution_count": 10,
492 | "metadata": {},
493 | "output_type": "execute_result"
494 | }
495 | ],
496 | "source": [
497 | "df.head()"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 11,
503 | "metadata": {},
504 | "outputs": [
505 | {
506 | "data": {
507 | "text/html": [
508 | "\n",
509 | "\n",
522 | "
\n",
523 | " \n",
524 | " \n",
525 | " | \n",
526 | " title | \n",
527 | " runtime | \n",
528 | " vote_average | \n",
529 | " vote_count | \n",
530 | " year | \n",
531 | " genre | \n",
532 | "
\n",
533 | " \n",
534 | " \n",
535 | " \n",
536 | " 0 | \n",
537 | " Toy Story | \n",
538 | " 81.0 | \n",
539 | " 7.7 | \n",
540 | " 5415.0 | \n",
541 | " 1995 | \n",
542 | " animation | \n",
543 | "
\n",
544 | " \n",
545 | " 0 | \n",
546 | " Toy Story | \n",
547 | " 81.0 | \n",
548 | " 7.7 | \n",
549 | " 5415.0 | \n",
550 | " 1995 | \n",
551 | " comedy | \n",
552 | "
\n",
553 | " \n",
554 | " 0 | \n",
555 | " Toy Story | \n",
556 | " 81.0 | \n",
557 | " 7.7 | \n",
558 | " 5415.0 | \n",
559 | " 1995 | \n",
560 | " family | \n",
561 | "
\n",
562 | " \n",
563 | " 1 | \n",
564 | " Jumanji | \n",
565 | " 104.0 | \n",
566 | " 6.9 | \n",
567 | " 2413.0 | \n",
568 | " 1995 | \n",
569 | " adventure | \n",
570 | "
\n",
571 | " \n",
572 | " 1 | \n",
573 | " Jumanji | \n",
574 | " 104.0 | \n",
575 | " 6.9 | \n",
576 | " 2413.0 | \n",
577 | " 1995 | \n",
578 | " fantasy | \n",
579 | "
\n",
580 | " \n",
581 | "
\n",
582 | "
"
583 | ],
584 | "text/plain": [
585 | " title runtime vote_average vote_count year genre\n",
586 | "0 Toy Story 81.0 7.7 5415.0 1995 animation\n",
587 | "0 Toy Story 81.0 7.7 5415.0 1995 comedy\n",
588 | "0 Toy Story 81.0 7.7 5415.0 1995 family\n",
589 | "1 Jumanji 104.0 6.9 2413.0 1995 adventure\n",
590 | "1 Jumanji 104.0 6.9 2413.0 1995 fantasy"
591 | ]
592 | },
593 | "execution_count": 11,
594 | "metadata": {},
595 | "output_type": "execute_result"
596 | }
597 | ],
598 | "source": [
599 | "#Create a new feature by exploding genres\n",
600 | "s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n",
601 | "\n",
602 | "#Name the new feature as 'genre'\n",
603 | "s.name = 'genre'\n",
604 | "\n",
605 | "#Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.\n",
606 | "gen_df = df.drop('genres', axis=1).join(s)\n",
607 | "\n",
608 | "#Print the head of the new gen_df\n",
609 | "gen_df.head()"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 12,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": [
620 | "def build_chart(gen_df, percentile=0.8):\n",
621 | " #Ask for preferred genres\n",
622 | " print(\"Input preferred genre\")\n",
623 | " genre = input()\n",
624 | " \n",
625 | " #Ask for lower limit of duration\n",
626 | " print(\"Input shortest duration\")\n",
627 | " low_time = int(input())\n",
628 | " \n",
629 | " #Ask for upper limit of duration\n",
630 | " print(\"Input longest duration\")\n",
631 | " high_time = int(input())\n",
632 | " \n",
633 | " #Ask for lower limit of timeline\n",
634 | " print(\"Input earliest year\")\n",
635 | " low_year = int(input())\n",
636 | " \n",
637 | " #Ask for upper limit of timeline\n",
638 | " print(\"Input latest year\")\n",
639 | " high_year = int(input())\n",
640 | " \n",
641 | " #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies\n",
642 | " movies = gen_df.copy()\n",
643 | " \n",
644 | " #Filter based on the condition\n",
645 | " movies = movies[(movies['genre'] == genre) & \n",
646 | " (movies['runtime'] >= low_time) & \n",
647 | " (movies['runtime'] <= high_time) & \n",
648 | " (movies['year'] >= low_year) & \n",
649 | " (movies['year'] <= high_year)]\n",
650 | " \n",
651 | " #Compute the values of C and m for the filtered movies\n",
652 | " C = movies['vote_average'].mean()\n",
653 | " m = movies['vote_count'].quantile(percentile)\n",
654 | " \n",
655 | " #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies\n",
656 | " q_movies = movies.copy().loc[movies['vote_count'] >= m]\n",
657 | " \n",
658 | " #Calculate score using the IMDB formula\n",
659 | " q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) \n",
660 | " + (m/(m+x['vote_count']) * C)\n",
661 | " ,axis=1)\n",
662 | "\n",
663 | " #Sort movies in descending order of their scores\n",
664 | " q_movies = q_movies.sort_values('score', ascending=False)\n",
665 | " \n",
666 | " return q_movies"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 13,
672 | "metadata": {},
673 | "outputs": [
674 | {
675 | "name": "stdout",
676 | "output_type": "stream",
677 | "text": [
678 | "Input preferred genre\n",
679 | "action\n",
680 | "Input shortest duration\n",
681 | "80\n",
682 | "Input longest duration\n",
683 | "120\n",
684 | "Input earliest year\n",
685 | "1990\n",
686 | "Input latest year\n",
687 | "2000\n"
688 | ]
689 | },
690 | {
691 | "data": {
692 | "text/html": [
693 | "\n",
694 | "\n",
707 | "
\n",
708 | " \n",
709 | " \n",
710 | " | \n",
711 | " title | \n",
712 | " runtime | \n",
713 | " vote_average | \n",
714 | " vote_count | \n",
715 | " year | \n",
716 | " genre | \n",
717 | " score | \n",
718 | "
\n",
719 | " \n",
720 | " \n",
721 | " \n",
722 | " 723 | \n",
723 | " Ghost in the Shell | \n",
724 | " 83.0 | \n",
725 | " 7.8 | \n",
726 | " 854.0 | \n",
727 | " 1995 | \n",
728 | " action | \n",
729 | " 7.521643 | \n",
730 | "
\n",
731 | " \n",
732 | " 550 | \n",
733 | " True Romance | \n",
734 | " 120.0 | \n",
735 | " 7.5 | \n",
736 | " 762.0 | \n",
737 | " 1993 | \n",
738 | " action | \n",
739 | " 7.231980 | \n",
740 | "
\n",
741 | " \n",
742 | " 3902 | \n",
743 | " O Brother, Where Art Thou? | \n",
744 | " 106.0 | \n",
745 | " 7.3 | \n",
746 | " 1144.0 | \n",
747 | " 2000 | \n",
748 | " action | \n",
749 | " 7.131617 | \n",
750 | "
\n",
751 | " \n",
752 | " 348 | \n",
753 | " The Crow | \n",
754 | " 102.0 | \n",
755 | " 7.3 | \n",
756 | " 980.0 | \n",
757 | " 1994 | \n",
758 | " action | \n",
759 | " 7.106412 | \n",
760 | "
\n",
761 | " \n",
762 | " 3871 | \n",
763 | " Crouching Tiger, Hidden Dragon | \n",
764 | " 120.0 | \n",
765 | " 7.2 | \n",
766 | " 949.0 | \n",
767 | " 2000 | \n",
768 | " action | \n",
769 | " 7.011634 | \n",
770 | "
\n",
771 | " \n",
772 | "
\n",
773 | "
"
774 | ],
775 | "text/plain": [
776 | " title runtime vote_average vote_count year \\\n",
777 | "723 Ghost in the Shell 83.0 7.8 854.0 1995 \n",
778 | "550 True Romance 120.0 7.5 762.0 1993 \n",
779 | "3902 O Brother, Where Art Thou? 106.0 7.3 1144.0 2000 \n",
780 | "348 The Crow 102.0 7.3 980.0 1994 \n",
781 | "3871 Crouching Tiger, Hidden Dragon 120.0 7.2 949.0 2000 \n",
782 | "\n",
783 | " genre score \n",
784 | "723 action 7.521643 \n",
785 | "550 action 7.231980 \n",
786 | "3902 action 7.131617 \n",
787 | "348 action 7.106412 \n",
788 | "3871 action 7.011634 "
789 | ]
790 | },
791 | "execution_count": 13,
792 | "metadata": {},
793 | "output_type": "execute_result"
794 | }
795 | ],
796 | "source": [
797 | "#Generate the chart for top animation movies and display top 5.\n",
798 | "build_chart(gen_df).head()"
799 | ]
800 | },
801 | {
802 | "cell_type": "code",
803 | "execution_count": null,
804 | "metadata": {},
805 | "outputs": [],
806 | "source": []
807 | }
808 | ],
809 | "metadata": {
810 | "kernelspec": {
811 | "display_name": "Python 3",
812 | "language": "python",
813 | "name": "python3"
814 | },
815 | "language_info": {
816 | "codemirror_mode": {
817 | "name": "ipython",
818 | "version": 3
819 | },
820 | "file_extension": ".py",
821 | "mimetype": "text/x-python",
822 | "name": "python",
823 | "nbconvert_exporter": "python",
824 | "pygments_lexer": "ipython3",
825 | "version": "3.6.0"
826 | }
827 | },
828 | "nbformat": 4,
829 | "nbformat_minor": 2
830 | }
831 |
--------------------------------------------------------------------------------
/Chapter3/.ipynb_checkpoints/Simple Recommender-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/Chapter3/Knowledge Recommender.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
13 | " interactivity=interactivity, compiler=compiler, result=result)\n"
14 | ]
15 | },
16 | {
17 | "data": {
18 | "text/plain": [
19 | "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n",
20 | " 'imdb_id', 'original_language', 'original_title', 'overview',\n",
21 | " 'popularity', 'poster_path', 'production_companies',\n",
22 | " 'production_countries', 'release_date', 'revenue', 'runtime',\n",
23 | " 'spoken_languages', 'status', 'tagline', 'title', 'video',\n",
24 | " 'vote_average', 'vote_count'],\n",
25 | " dtype='object')"
26 | ]
27 | },
28 | "execution_count": 1,
29 | "metadata": {},
30 | "output_type": "execute_result"
31 | }
32 | ],
33 | "source": [
34 | "import pandas as pd\n",
35 | "import numpy as np\n",
36 | "\n",
37 | "df = pd.read_csv('../data/movies_metadata.csv')\n",
38 | "\n",
39 | "#Print all the features (or columns) of the DataFrame\n",
40 | "df.columns"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "\n",
52 | "\n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " | \n",
69 | " title | \n",
70 | " genres | \n",
71 | " release_date | \n",
72 | " runtime | \n",
73 | " vote_average | \n",
74 | " vote_count | \n",
75 | "
\n",
76 | " \n",
77 | " \n",
78 | " \n",
79 | " 0 | \n",
80 | " Toy Story | \n",
81 | " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | \n",
82 | " 1995-10-30 | \n",
83 | " 81.0 | \n",
84 | " 7.7 | \n",
85 | " 5415.0 | \n",
86 | "
\n",
87 | " \n",
88 | " 1 | \n",
89 | " Jumanji | \n",
90 | " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | \n",
91 | " 1995-12-15 | \n",
92 | " 104.0 | \n",
93 | " 6.9 | \n",
94 | " 2413.0 | \n",
95 | "
\n",
96 | " \n",
97 | " 2 | \n",
98 | " Grumpier Old Men | \n",
99 | " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | \n",
100 | " 1995-12-22 | \n",
101 | " 101.0 | \n",
102 | " 6.5 | \n",
103 | " 92.0 | \n",
104 | "
\n",
105 | " \n",
106 | " 3 | \n",
107 | " Waiting to Exhale | \n",
108 | " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | \n",
109 | " 1995-12-22 | \n",
110 | " 127.0 | \n",
111 | " 6.1 | \n",
112 | " 34.0 | \n",
113 | "
\n",
114 | " \n",
115 | " 4 | \n",
116 | " Father of the Bride Part II | \n",
117 | " [{'id': 35, 'name': 'Comedy'}] | \n",
118 | " 1995-02-10 | \n",
119 | " 106.0 | \n",
120 | " 5.7 | \n",
121 | " 173.0 | \n",
122 | "
\n",
123 | " \n",
124 | "
\n",
125 | "
"
126 | ],
127 | "text/plain": [
128 | " title \\\n",
129 | "0 Toy Story \n",
130 | "1 Jumanji \n",
131 | "2 Grumpier Old Men \n",
132 | "3 Waiting to Exhale \n",
133 | "4 Father of the Bride Part II \n",
134 | "\n",
135 | " genres release_date runtime \\\n",
136 | "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 1995-10-30 81.0 \n",
137 | "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 1995-12-15 104.0 \n",
138 | "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 1995-12-22 101.0 \n",
139 | "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 1995-12-22 127.0 \n",
140 | "4 [{'id': 35, 'name': 'Comedy'}] 1995-02-10 106.0 \n",
141 | "\n",
142 | " vote_average vote_count \n",
143 | "0 7.7 5415.0 \n",
144 | "1 6.9 2413.0 \n",
145 | "2 6.5 92.0 \n",
146 | "3 6.1 34.0 \n",
147 | "4 5.7 173.0 "
148 | ]
149 | },
150 | "execution_count": 2,
151 | "metadata": {},
152 | "output_type": "execute_result"
153 | }
154 | ],
155 | "source": [
156 | "#Only keep those features that we require \n",
157 | "df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]\n",
158 | "\n",
159 | "df.head()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 3,
165 | "metadata": {
166 | "collapsed": true
167 | },
168 | "outputs": [],
169 | "source": [
170 | "#Convert release_date into pandas datetime format\n",
171 | "df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
172 | "\n",
173 | "#Extract year from the datetime\n",
174 | "df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 4,
180 | "metadata": {
181 | "collapsed": true
182 | },
183 | "outputs": [],
184 | "source": [
185 | "#Helper function to convert NaT to 0 and all other years to integers.\n",
186 | "def convert_int(x):\n",
187 | " try:\n",
188 | " return int(x)\n",
189 | " except:\n",
190 | " return 0"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 5,
196 | "metadata": {
197 | "collapsed": true
198 | },
199 | "outputs": [],
200 | "source": [
201 | "#Apply convert_int to the year feature\n",
202 | "df['year'] = df['year'].apply(convert_int)"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 6,
208 | "metadata": {},
209 | "outputs": [
210 | {
211 | "data": {
212 | "text/html": [
213 | "\n",
214 | "\n",
227 | "
\n",
228 | " \n",
229 | " \n",
230 | " | \n",
231 | " title | \n",
232 | " genres | \n",
233 | " runtime | \n",
234 | " vote_average | \n",
235 | " vote_count | \n",
236 | " year | \n",
237 | "
\n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " 0 | \n",
242 | " Toy Story | \n",
243 | " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | \n",
244 | " 81.0 | \n",
245 | " 7.7 | \n",
246 | " 5415.0 | \n",
247 | " 1995 | \n",
248 | "
\n",
249 | " \n",
250 | " 1 | \n",
251 | " Jumanji | \n",
252 | " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | \n",
253 | " 104.0 | \n",
254 | " 6.9 | \n",
255 | " 2413.0 | \n",
256 | " 1995 | \n",
257 | "
\n",
258 | " \n",
259 | " 2 | \n",
260 | " Grumpier Old Men | \n",
261 | " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | \n",
262 | " 101.0 | \n",
263 | " 6.5 | \n",
264 | " 92.0 | \n",
265 | " 1995 | \n",
266 | "
\n",
267 | " \n",
268 | " 3 | \n",
269 | " Waiting to Exhale | \n",
270 | " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | \n",
271 | " 127.0 | \n",
272 | " 6.1 | \n",
273 | " 34.0 | \n",
274 | " 1995 | \n",
275 | "
\n",
276 | " \n",
277 | " 4 | \n",
278 | " Father of the Bride Part II | \n",
279 | " [{'id': 35, 'name': 'Comedy'}] | \n",
280 | " 106.0 | \n",
281 | " 5.7 | \n",
282 | " 173.0 | \n",
283 | " 1995 | \n",
284 | "
\n",
285 | " \n",
286 | "
\n",
287 | "
"
288 | ],
289 | "text/plain": [
290 | " title \\\n",
291 | "0 Toy Story \n",
292 | "1 Jumanji \n",
293 | "2 Grumpier Old Men \n",
294 | "3 Waiting to Exhale \n",
295 | "4 Father of the Bride Part II \n",
296 | "\n",
297 | " genres runtime vote_average \\\n",
298 | "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 81.0 7.7 \n",
299 | "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 104.0 6.9 \n",
300 | "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 101.0 6.5 \n",
301 | "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 127.0 6.1 \n",
302 | "4 [{'id': 35, 'name': 'Comedy'}] 106.0 5.7 \n",
303 | "\n",
304 | " vote_count year \n",
305 | "0 5415.0 1995 \n",
306 | "1 2413.0 1995 \n",
307 | "2 92.0 1995 \n",
308 | "3 34.0 1995 \n",
309 | "4 173.0 1995 "
310 | ]
311 | },
312 | "execution_count": 6,
313 | "metadata": {},
314 | "output_type": "execute_result"
315 | }
316 | ],
317 | "source": [
318 | "#Drop the release_date column\n",
319 | "df = df.drop('release_date', axis=1)\n",
320 | "\n",
321 | "#Display the dataframe\n",
322 | "df.head()"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 7,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "data": {
332 | "text/plain": [
333 | "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]\""
334 | ]
335 | },
336 | "execution_count": 7,
337 | "metadata": {},
338 | "output_type": "execute_result"
339 | }
340 | ],
341 | "source": [
342 | "#Print genres of the first movie\n",
343 | "df.iloc[0]['genres']"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 8,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "name": "stdout",
353 | "output_type": "stream",
354 | "text": [
355 | "\n",
356 | "\n"
357 | ]
358 | }
359 | ],
360 | "source": [
361 | "#Import the literal_eval function from ast\n",
362 | "from ast import literal_eval\n",
363 | "\n",
364 | "#Define a stringified list and output its type\n",
365 | "a = \"[1,2,3]\"\n",
366 | "print(type(a))\n",
367 | "\n",
368 | "#Apply literal_eval and output type\n",
369 | "b = literal_eval(a)\n",
370 | "print(type(b))"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 9,
376 | "metadata": {
377 | "collapsed": true
378 | },
379 | "outputs": [],
380 | "source": [
381 | "#Convert all NaN into stringified empty lists\n",
382 | "df['genres'] = df['genres'].fillna('[]')\n",
383 | "\n",
384 | "#Apply literal_eval to convert stringified empty lists to the list object\n",
385 | "df['genres'] = df['genres'].apply(literal_eval)\n",
386 | "\n",
387 | "#Convert list of dictionaries to a list of strings\n",
388 | "df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 10,
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/html": [
399 | "\n",
400 | "\n",
413 | "
\n",
414 | " \n",
415 | " \n",
416 | " | \n",
417 | " title | \n",
418 | " genres | \n",
419 | " runtime | \n",
420 | " vote_average | \n",
421 | " vote_count | \n",
422 | " year | \n",
423 | "
\n",
424 | " \n",
425 | " \n",
426 | " \n",
427 | " 0 | \n",
428 | " Toy Story | \n",
429 | " [animation, comedy, family] | \n",
430 | " 81.0 | \n",
431 | " 7.7 | \n",
432 | " 5415.0 | \n",
433 | " 1995 | \n",
434 | "
\n",
435 | " \n",
436 | " 1 | \n",
437 | " Jumanji | \n",
438 | " [adventure, fantasy, family] | \n",
439 | " 104.0 | \n",
440 | " 6.9 | \n",
441 | " 2413.0 | \n",
442 | " 1995 | \n",
443 | "
\n",
444 | " \n",
445 | " 2 | \n",
446 | " Grumpier Old Men | \n",
447 | " [romance, comedy] | \n",
448 | " 101.0 | \n",
449 | " 6.5 | \n",
450 | " 92.0 | \n",
451 | " 1995 | \n",
452 | "
\n",
453 | " \n",
454 | " 3 | \n",
455 | " Waiting to Exhale | \n",
456 | " [comedy, drama, romance] | \n",
457 | " 127.0 | \n",
458 | " 6.1 | \n",
459 | " 34.0 | \n",
460 | " 1995 | \n",
461 | "
\n",
462 | " \n",
463 | " 4 | \n",
464 | " Father of the Bride Part II | \n",
465 | " [comedy] | \n",
466 | " 106.0 | \n",
467 | " 5.7 | \n",
468 | " 173.0 | \n",
469 | " 1995 | \n",
470 | "
\n",
471 | " \n",
472 | "
\n",
473 | "
"
474 | ],
475 | "text/plain": [
476 | " title genres runtime \\\n",
477 | "0 Toy Story [animation, comedy, family] 81.0 \n",
478 | "1 Jumanji [adventure, fantasy, family] 104.0 \n",
479 | "2 Grumpier Old Men [romance, comedy] 101.0 \n",
480 | "3 Waiting to Exhale [comedy, drama, romance] 127.0 \n",
481 | "4 Father of the Bride Part II [comedy] 106.0 \n",
482 | "\n",
483 | " vote_average vote_count year \n",
484 | "0 7.7 5415.0 1995 \n",
485 | "1 6.9 2413.0 1995 \n",
486 | "2 6.5 92.0 1995 \n",
487 | "3 6.1 34.0 1995 \n",
488 | "4 5.7 173.0 1995 "
489 | ]
490 | },
491 | "execution_count": 10,
492 | "metadata": {},
493 | "output_type": "execute_result"
494 | }
495 | ],
496 | "source": [
497 | "df.head()"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 11,
503 | "metadata": {},
504 | "outputs": [
505 | {
506 | "data": {
507 | "text/html": [
508 | "\n",
509 | "\n",
522 | "
\n",
523 | " \n",
524 | " \n",
525 | " | \n",
526 | " title | \n",
527 | " runtime | \n",
528 | " vote_average | \n",
529 | " vote_count | \n",
530 | " year | \n",
531 | " genre | \n",
532 | "
\n",
533 | " \n",
534 | " \n",
535 | " \n",
536 | " 0 | \n",
537 | " Toy Story | \n",
538 | " 81.0 | \n",
539 | " 7.7 | \n",
540 | " 5415.0 | \n",
541 | " 1995 | \n",
542 | " animation | \n",
543 | "
\n",
544 | " \n",
545 | " 0 | \n",
546 | " Toy Story | \n",
547 | " 81.0 | \n",
548 | " 7.7 | \n",
549 | " 5415.0 | \n",
550 | " 1995 | \n",
551 | " comedy | \n",
552 | "
\n",
553 | " \n",
554 | " 0 | \n",
555 | " Toy Story | \n",
556 | " 81.0 | \n",
557 | " 7.7 | \n",
558 | " 5415.0 | \n",
559 | " 1995 | \n",
560 | " family | \n",
561 | "
\n",
562 | " \n",
563 | " 1 | \n",
564 | " Jumanji | \n",
565 | " 104.0 | \n",
566 | " 6.9 | \n",
567 | " 2413.0 | \n",
568 | " 1995 | \n",
569 | " adventure | \n",
570 | "
\n",
571 | " \n",
572 | " 1 | \n",
573 | " Jumanji | \n",
574 | " 104.0 | \n",
575 | " 6.9 | \n",
576 | " 2413.0 | \n",
577 | " 1995 | \n",
578 | " fantasy | \n",
579 | "
\n",
580 | " \n",
581 | "
\n",
582 | "
"
583 | ],
584 | "text/plain": [
585 | " title runtime vote_average vote_count year genre\n",
586 | "0 Toy Story 81.0 7.7 5415.0 1995 animation\n",
587 | "0 Toy Story 81.0 7.7 5415.0 1995 comedy\n",
588 | "0 Toy Story 81.0 7.7 5415.0 1995 family\n",
589 | "1 Jumanji 104.0 6.9 2413.0 1995 adventure\n",
590 | "1 Jumanji 104.0 6.9 2413.0 1995 fantasy"
591 | ]
592 | },
593 | "execution_count": 11,
594 | "metadata": {},
595 | "output_type": "execute_result"
596 | }
597 | ],
598 | "source": [
599 | "#Create a new feature by exploding genres\n",
600 | "s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n",
601 | "\n",
602 | "#Name the new feature as 'genre'\n",
603 | "s.name = 'genre'\n",
604 | "\n",
605 | "#Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.\n",
606 | "gen_df = df.drop('genres', axis=1).join(s)\n",
607 | "\n",
608 | "#Print the head of the new gen_df\n",
609 | "gen_df.head()"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 12,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": [
620 | "def build_chart(gen_df, percentile=0.8):\n",
621 | " #Ask for preferred genres\n",
622 | " print(\"Input preferred genre\")\n",
623 | " genre = input()\n",
624 | " \n",
625 | " #Ask for lower limit of duration\n",
626 | " print(\"Input shortest duration\")\n",
627 | " low_time = int(input())\n",
628 | " \n",
629 | " #Ask for upper limit of duration\n",
630 | " print(\"Input longest duration\")\n",
631 | " high_time = int(input())\n",
632 | " \n",
633 | " #Ask for lower limit of timeline\n",
634 | " print(\"Input earliest year\")\n",
635 | " low_year = int(input())\n",
636 | " \n",
637 | " #Ask for upper limit of timeline\n",
638 | " print(\"Input latest year\")\n",
639 | " high_year = int(input())\n",
640 | " \n",
641 | " #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies\n",
642 | " movies = gen_df.copy()\n",
643 | " \n",
644 | " #Filter based on the condition\n",
645 | " movies = movies[(movies['genre'] == genre) & \n",
646 | " (movies['runtime'] >= low_time) & \n",
647 | " (movies['runtime'] <= high_time) & \n",
648 | " (movies['year'] >= low_year) & \n",
649 | " (movies['year'] <= high_year)]\n",
650 | " \n",
651 | " #Compute the values of C and m for the filtered movies\n",
652 | " C = movies['vote_average'].mean()\n",
653 | " m = movies['vote_count'].quantile(percentile)\n",
654 | " \n",
655 | " #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies\n",
656 | " q_movies = movies.copy().loc[movies['vote_count'] >= m]\n",
657 | " \n",
658 | " #Calculate score using the IMDB formula\n",
659 | " q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) \n",
660 | " + (m/(m+x['vote_count']) * C)\n",
661 | " ,axis=1)\n",
662 | "\n",
663 | " #Sort movies in descending order of their scores\n",
664 | " q_movies = q_movies.sort_values('score', ascending=False)\n",
665 | " \n",
666 | " return q_movies"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 13,
672 | "metadata": {},
673 | "outputs": [
674 | {
675 | "name": "stdout",
676 | "output_type": "stream",
677 | "text": [
678 | "Input preferred genre\n",
679 | "action\n",
680 | "Input shortest duration\n",
681 | "80\n",
682 | "Input longest duration\n",
683 | "120\n",
684 | "Input earliest year\n",
685 | "1990\n",
686 | "Input latest year\n",
687 | "2000\n"
688 | ]
689 | },
690 | {
691 | "data": {
692 | "text/html": [
693 | "\n",
694 | "\n",
707 | "
\n",
708 | " \n",
709 | " \n",
710 | " | \n",
711 | " title | \n",
712 | " runtime | \n",
713 | " vote_average | \n",
714 | " vote_count | \n",
715 | " year | \n",
716 | " genre | \n",
717 | " score | \n",
718 | "
\n",
719 | " \n",
720 | " \n",
721 | " \n",
722 | " 723 | \n",
723 | " Ghost in the Shell | \n",
724 | " 83.0 | \n",
725 | " 7.8 | \n",
726 | " 854.0 | \n",
727 | " 1995 | \n",
728 | " action | \n",
729 | " 7.521643 | \n",
730 | "
\n",
731 | " \n",
732 | " 550 | \n",
733 | " True Romance | \n",
734 | " 120.0 | \n",
735 | " 7.5 | \n",
736 | " 762.0 | \n",
737 | " 1993 | \n",
738 | " action | \n",
739 | " 7.231980 | \n",
740 | "
\n",
741 | " \n",
742 | " 3902 | \n",
743 | " O Brother, Where Art Thou? | \n",
744 | " 106.0 | \n",
745 | " 7.3 | \n",
746 | " 1144.0 | \n",
747 | " 2000 | \n",
748 | " action | \n",
749 | " 7.131617 | \n",
750 | "
\n",
751 | " \n",
752 | " 348 | \n",
753 | " The Crow | \n",
754 | " 102.0 | \n",
755 | " 7.3 | \n",
756 | " 980.0 | \n",
757 | " 1994 | \n",
758 | " action | \n",
759 | " 7.106412 | \n",
760 | "
\n",
761 | " \n",
762 | " 3871 | \n",
763 | " Crouching Tiger, Hidden Dragon | \n",
764 | " 120.0 | \n",
765 | " 7.2 | \n",
766 | " 949.0 | \n",
767 | " 2000 | \n",
768 | " action | \n",
769 | " 7.011634 | \n",
770 | "
\n",
771 | " \n",
772 | "
\n",
773 | "
"
774 | ],
775 | "text/plain": [
776 | " title runtime vote_average vote_count year \\\n",
777 | "723 Ghost in the Shell 83.0 7.8 854.0 1995 \n",
778 | "550 True Romance 120.0 7.5 762.0 1993 \n",
779 | "3902 O Brother, Where Art Thou? 106.0 7.3 1144.0 2000 \n",
780 | "348 The Crow 102.0 7.3 980.0 1994 \n",
781 | "3871 Crouching Tiger, Hidden Dragon 120.0 7.2 949.0 2000 \n",
782 | "\n",
783 | " genre score \n",
784 | "723 action 7.521643 \n",
785 | "550 action 7.231980 \n",
786 | "3902 action 7.131617 \n",
787 | "348 action 7.106412 \n",
788 | "3871 action 7.011634 "
789 | ]
790 | },
791 | "execution_count": 13,
792 | "metadata": {},
793 | "output_type": "execute_result"
794 | }
795 | ],
796 | "source": [
797 | "#Generate the chart for top animation movies and display top 5.\n",
798 | "build_chart(gen_df).head()"
799 | ]
800 | },
801 | {
802 | "cell_type": "code",
803 | "execution_count": 19,
804 | "metadata": {},
805 | "outputs": [],
806 | "source": [
807 | "#Convert the cleaned (non-exploded) dataframe df into a CSV file and save it in the data folder\n",
808 | "#Set parameter index to False as the index of the DataFrame has no inherent meaning.\n",
809 | "df.to_csv('../data/metadata_clean.csv', index=False)"
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": null,
815 | "metadata": {
816 | "collapsed": true
817 | },
818 | "outputs": [],
819 | "source": []
820 | }
821 | ],
822 | "metadata": {
823 | "kernelspec": {
824 | "display_name": "Python 3",
825 | "language": "python",
826 | "name": "python3"
827 | },
828 | "language_info": {
829 | "codemirror_mode": {
830 | "name": "ipython",
831 | "version": 3
832 | },
833 | "file_extension": ".py",
834 | "mimetype": "text/x-python",
835 | "name": "python",
836 | "nbconvert_exporter": "python",
837 | "pygments_lexer": "ipython3",
838 | "version": "3.6.0"
839 | }
840 | },
841 | "nbformat": 4,
842 | "nbformat_minor": 2
843 | }
844 |
--------------------------------------------------------------------------------
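Note: build_chart in the notebook above reads its genre, runtime and year filters from input(), which makes it hard to call outside an interactive session. Below is a minimal non-interactive sketch of the same logic; the name build_chart_params and its argument list are hypothetical and not part of this repository.

import pandas as pd

def build_chart_params(gen_df: pd.DataFrame, genre, low_time, high_time,
                       low_year, high_year, percentile=0.8):
    #Filter on genre, runtime and year, exactly as the notebook's build_chart does
    movies = gen_df[(gen_df['genre'] == genre) &
                    (gen_df['runtime'].between(low_time, high_time)) &
                    (gen_df['year'].between(low_year, high_year))].copy()

    #C is the mean rating and m the vote-count cutoff for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)

    #Keep movies with at least m votes and score them with the IMDB weighted rating
    q_movies = movies[movies['vote_count'] >= m].copy()
    v, R = q_movies['vote_count'], q_movies['vote_average']
    q_movies['score'] = v/(v + m)*R + m/(m + v)*C

    return q_movies.sort_values('score', ascending=False)

#Example call, assuming gen_df has been built as in the notebook:
#build_chart_params(gen_df, 'action', 80, 120, 1990, 2000).head()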
/Chapter3/Simple Recommender.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
13 | " interactivity=interactivity, compiler=compiler, result=result)\n"
14 | ]
15 | },
16 | {
17 | "data": {
18 | "text/html": [
19 | "\n",
20 | "\n",
33 | "
\n",
34 | " \n",
35 | " \n",
36 | " | \n",
37 | " adult | \n",
38 | " belongs_to_collection | \n",
39 | " budget | \n",
40 | " genres | \n",
41 | " homepage | \n",
42 | " id | \n",
43 | " imdb_id | \n",
44 | " original_language | \n",
45 | " original_title | \n",
46 | " overview | \n",
47 | " ... | \n",
48 | " release_date | \n",
49 | " revenue | \n",
50 | " runtime | \n",
51 | " spoken_languages | \n",
52 | " status | \n",
53 | " tagline | \n",
54 | " title | \n",
55 | " video | \n",
56 | " vote_average | \n",
57 | " vote_count | \n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " \n",
62 | " 0 | \n",
63 | " False | \n",
64 | " {'id': 10194, 'name': 'Toy Story Collection', ... | \n",
65 | " 30000000 | \n",
66 | " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | \n",
67 | " http://toystory.disney.com/toy-story | \n",
68 | " 862 | \n",
69 | " tt0114709 | \n",
70 | " en | \n",
71 | " Toy Story | \n",
72 | " Led by Woody, Andy's toys live happily in his ... | \n",
73 | " ... | \n",
74 | " 1995-10-30 | \n",
75 | " 373554033.0 | \n",
76 | " 81.0 | \n",
77 | " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
78 | " Released | \n",
79 | " NaN | \n",
80 | " Toy Story | \n",
81 | " False | \n",
82 | " 7.7 | \n",
83 | " 5415.0 | \n",
84 | "
\n",
85 | " \n",
86 | " 1 | \n",
87 | " False | \n",
88 | " NaN | \n",
89 | " 65000000 | \n",
90 | " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | \n",
91 | " NaN | \n",
92 | " 8844 | \n",
93 | " tt0113497 | \n",
94 | " en | \n",
95 | " Jumanji | \n",
96 | " When siblings Judy and Peter discover an encha... | \n",
97 | " ... | \n",
98 | " 1995-12-15 | \n",
99 | " 262797249.0 | \n",
100 | " 104.0 | \n",
101 | " [{'iso_639_1': 'en', 'name': 'English'}, {'iso... | \n",
102 | " Released | \n",
103 | " Roll the dice and unleash the excitement! | \n",
104 | " Jumanji | \n",
105 | " False | \n",
106 | " 6.9 | \n",
107 | " 2413.0 | \n",
108 | "
\n",
109 | " \n",
110 | " 2 | \n",
111 | " False | \n",
112 | " {'id': 119050, 'name': 'Grumpy Old Men Collect... | \n",
113 | " 0 | \n",
114 | " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | \n",
115 | " NaN | \n",
116 | " 15602 | \n",
117 | " tt0113228 | \n",
118 | " en | \n",
119 | " Grumpier Old Men | \n",
120 | " A family wedding reignites the ancient feud be... | \n",
121 | " ... | \n",
122 | " 1995-12-22 | \n",
123 | " 0.0 | \n",
124 | " 101.0 | \n",
125 | " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
126 | " Released | \n",
127 | " Still Yelling. Still Fighting. Still Ready for... | \n",
128 | " Grumpier Old Men | \n",
129 | " False | \n",
130 | " 6.5 | \n",
131 | " 92.0 | \n",
132 | "
\n",
133 | " \n",
134 | " 3 | \n",
135 | " False | \n",
136 | " NaN | \n",
137 | " 16000000 | \n",
138 | " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | \n",
139 | " NaN | \n",
140 | " 31357 | \n",
141 | " tt0114885 | \n",
142 | " en | \n",
143 | " Waiting to Exhale | \n",
144 | " Cheated on, mistreated and stepped on, the wom... | \n",
145 | " ... | \n",
146 | " 1995-12-22 | \n",
147 | " 81452156.0 | \n",
148 | " 127.0 | \n",
149 | " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
150 | " Released | \n",
151 | " Friends are the people who let you be yourself... | \n",
152 | " Waiting to Exhale | \n",
153 | " False | \n",
154 | " 6.1 | \n",
155 | " 34.0 | \n",
156 | "
\n",
157 | " \n",
158 | " 4 | \n",
159 | " False | \n",
160 | " {'id': 96871, 'name': 'Father of the Bride Col... | \n",
161 | " 0 | \n",
162 | " [{'id': 35, 'name': 'Comedy'}] | \n",
163 | " NaN | \n",
164 | " 11862 | \n",
165 | " tt0113041 | \n",
166 | " en | \n",
167 | " Father of the Bride Part II | \n",
168 | " Just when George Banks has recovered from his ... | \n",
169 | " ... | \n",
170 | " 1995-02-10 | \n",
171 | " 76578911.0 | \n",
172 | " 106.0 | \n",
173 | " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
174 | " Released | \n",
175 | " Just When His World Is Back To Normal... He's ... | \n",
176 | " Father of the Bride Part II | \n",
177 | " False | \n",
178 | " 5.7 | \n",
179 | " 173.0 | \n",
180 | "
\n",
181 | " \n",
182 | "
\n",
183 | "
5 rows × 24 columns
\n",
184 | "
"
185 | ],
186 | "text/plain": [
187 | " adult belongs_to_collection budget \\\n",
188 | "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
189 | "1 False NaN 65000000 \n",
190 | "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
191 | "3 False NaN 16000000 \n",
192 | "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
193 | "\n",
194 | " genres \\\n",
195 | "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
196 | "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
197 | "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
198 | "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
199 | "4 [{'id': 35, 'name': 'Comedy'}] \n",
200 | "\n",
201 | " homepage id imdb_id original_language \\\n",
202 | "0 http://toystory.disney.com/toy-story 862 tt0114709 en \n",
203 | "1 NaN 8844 tt0113497 en \n",
204 | "2 NaN 15602 tt0113228 en \n",
205 | "3 NaN 31357 tt0114885 en \n",
206 | "4 NaN 11862 tt0113041 en \n",
207 | "\n",
208 | " original_title \\\n",
209 | "0 Toy Story \n",
210 | "1 Jumanji \n",
211 | "2 Grumpier Old Men \n",
212 | "3 Waiting to Exhale \n",
213 | "4 Father of the Bride Part II \n",
214 | "\n",
215 | " overview ... release_date \\\n",
216 | "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
217 | "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
218 | "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
219 | "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
220 | "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
221 | "\n",
222 | " revenue runtime spoken_languages \\\n",
223 | "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
224 | "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
225 | "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
226 | "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
227 | "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
228 | "\n",
229 | " status tagline \\\n",
230 | "0 Released NaN \n",
231 | "1 Released Roll the dice and unleash the excitement! \n",
232 | "2 Released Still Yelling. Still Fighting. Still Ready for... \n",
233 | "3 Released Friends are the people who let you be yourself... \n",
234 | "4 Released Just When His World Is Back To Normal... He's ... \n",
235 | "\n",
236 | " title video vote_average vote_count \n",
237 | "0 Toy Story False 7.7 5415.0 \n",
238 | "1 Jumanji False 6.9 2413.0 \n",
239 | "2 Grumpier Old Men False 6.5 92.0 \n",
240 | "3 Waiting to Exhale False 6.1 34.0 \n",
241 | "4 Father of the Bride Part II False 5.7 173.0 \n",
242 | "\n",
243 | "[5 rows x 24 columns]"
244 | ]
245 | },
246 | "execution_count": 4,
247 | "metadata": {},
248 | "output_type": "execute_result"
249 | }
250 | ],
251 | "source": [
252 | "import pandas as pd\n",
253 | "import numpy as np\n",
254 | "\n",
255 | "df = pd.read_csv('../data/movies_metadata.csv')\n",
256 | "df.head()"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 7,
262 | "metadata": {},
263 | "outputs": [
264 | {
265 | "data": {
266 | "text/plain": [
267 | "50.0"
268 | ]
269 | },
270 | "execution_count": 7,
271 | "metadata": {},
272 | "output_type": "execute_result"
273 | }
274 | ],
275 | "source": [
276 | "#Calculate the number of votes garnered by the 80th percentile movie\n",
277 | "m = df['vote_count'].quantile(0.80)\n",
278 | "m"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 13,
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "data": {
288 | "text/plain": [
289 | "(8963, 24)"
290 | ]
291 | },
292 | "execution_count": 13,
293 | "metadata": {},
294 | "output_type": "execute_result"
295 | }
296 | ],
297 | "source": [
298 | "#Only consider movies longer than 45 minutes and shorter than 300 minutes\n",
299 | "q_movies = df[(df['runtime'] >= 45) & (df['runtime'] <= 300)]\n",
300 | "\n",
301 | "#Only consider movies that have garnered more than m votes\n",
302 | "q_movies = q_movies[q_movies['vote_count'] >= m]\n",
303 | "\n",
304 | "#Inspect the number of movies that made the cut\n",
305 | "q_movies.shape"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 15,
311 | "metadata": {},
312 | "outputs": [
313 | {
314 | "data": {
315 | "text/plain": [
316 | "5.6182072151341851"
317 | ]
318 | },
319 | "execution_count": 15,
320 | "metadata": {},
321 | "output_type": "execute_result"
322 | }
323 | ],
324 | "source": [
325 | "# Calculate C\n",
326 | "C = df['vote_average'].mean()\n",
327 | "C"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 16,
333 | "metadata": {
334 | "collapsed": true
335 | },
336 | "outputs": [],
337 | "source": [
338 | "# Function to compute the IMDB weighted rating for each movie\n",
339 | "def weighted_rating(x, m=m, C=C):\n",
340 | " v = x['vote_count']\n",
341 | " R = x['vote_average']\n",
342 | " # Compute the weighted score\n",
343 | " return (v/(v+m) * R) + (m/(m+v) * C)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 17,
349 | "metadata": {
350 | "collapsed": true
351 | },
352 | "outputs": [],
353 | "source": [
354 | "# Compute the score using the weighted_rating function defined above\n",
355 | "q_movies['score'] = q_movies.apply(weighted_rating, axis=1)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 20,
361 | "metadata": {},
362 | "outputs": [
363 | {
364 | "data": {
365 | "text/html": [
366 | "\n",
367 | "\n",
380 | "
\n",
381 | " \n",
382 | " \n",
383 | " | \n",
384 | " title | \n",
385 | " vote_count | \n",
386 | " vote_average | \n",
387 | " score | \n",
388 | " runtime | \n",
389 | "
\n",
390 | " \n",
391 | " \n",
392 | " \n",
393 | " 10309 | \n",
394 | " Dilwale Dulhania Le Jayenge | \n",
395 | " 661.0 | \n",
396 | " 9.1 | \n",
397 | " 8.855148 | \n",
398 | " 190.0 | \n",
399 | "
\n",
400 | " \n",
401 | " 314 | \n",
402 | " The Shawshank Redemption | \n",
403 | " 8358.0 | \n",
404 | " 8.5 | \n",
405 | " 8.482863 | \n",
406 | " 142.0 | \n",
407 | "
\n",
408 | " \n",
409 | " 834 | \n",
410 | " The Godfather | \n",
411 | " 6024.0 | \n",
412 | " 8.5 | \n",
413 | " 8.476278 | \n",
414 | " 175.0 | \n",
415 | "
\n",
416 | " \n",
417 | " 40251 | \n",
418 | " Your Name. | \n",
419 | " 1030.0 | \n",
420 | " 8.5 | \n",
421 | " 8.366584 | \n",
422 | " 106.0 | \n",
423 | "
\n",
424 | " \n",
425 | " 12481 | \n",
426 | " The Dark Knight | \n",
427 | " 12269.0 | \n",
428 | " 8.3 | \n",
429 | " 8.289115 | \n",
430 | " 152.0 | \n",
431 | "
\n",
432 | " \n",
433 | " 2843 | \n",
434 | " Fight Club | \n",
435 | " 9678.0 | \n",
436 | " 8.3 | \n",
437 | " 8.286216 | \n",
438 | " 139.0 | \n",
439 | "
\n",
440 | " \n",
441 | " 292 | \n",
442 | " Pulp Fiction | \n",
443 | " 8670.0 | \n",
444 | " 8.3 | \n",
445 | " 8.284623 | \n",
446 | " 154.0 | \n",
447 | "
\n",
448 | " \n",
449 | " 522 | \n",
450 | " Schindler's List | \n",
451 | " 4436.0 | \n",
452 | " 8.3 | \n",
453 | " 8.270109 | \n",
454 | " 195.0 | \n",
455 | "
\n",
456 | " \n",
457 | " 23673 | \n",
458 | " Whiplash | \n",
459 | " 4376.0 | \n",
460 | " 8.3 | \n",
461 | " 8.269704 | \n",
462 | " 105.0 | \n",
463 | "
\n",
464 | " \n",
465 | " 5481 | \n",
466 | " Spirited Away | \n",
467 | " 3968.0 | \n",
468 | " 8.3 | \n",
469 | " 8.266628 | \n",
470 | " 125.0 | \n",
471 | "
\n",
472 | " \n",
473 | " 2211 | \n",
474 | " Life Is Beautiful | \n",
475 | " 3643.0 | \n",
476 | " 8.3 | \n",
477 | " 8.263691 | \n",
478 | " 116.0 | \n",
479 | "
\n",
480 | " \n",
481 | " 1178 | \n",
482 | " The Godfather: Part II | \n",
483 | " 3418.0 | \n",
484 | " 8.3 | \n",
485 | " 8.261335 | \n",
486 | " 200.0 | \n",
487 | "
\n",
488 | " \n",
489 | " 1152 | \n",
490 | " One Flew Over the Cuckoo's Nest | \n",
491 | " 3001.0 | \n",
492 | " 8.3 | \n",
493 | " 8.256051 | \n",
494 | " 133.0 | \n",
495 | "
\n",
496 | " \n",
497 | " 1176 | \n",
498 | " Psycho | \n",
499 | " 2405.0 | \n",
500 | " 8.3 | \n",
501 | " 8.245381 | \n",
502 | " 109.0 | \n",
503 | "
\n",
504 | " \n",
505 | " 351 | \n",
506 | " Forrest Gump | \n",
507 | " 8147.0 | \n",
508 | " 8.2 | \n",
509 | " 8.184252 | \n",
510 | " 142.0 | \n",
511 | "
\n",
512 | " \n",
513 | " 1184 | \n",
514 | " Once Upon a Time in America | \n",
515 | " 1104.0 | \n",
516 | " 8.3 | \n",
517 | " 8.183804 | \n",
518 | " 229.0 | \n",
519 | "
\n",
520 | " \n",
521 | " 1154 | \n",
522 | " The Empire Strikes Back | \n",
523 | " 5998.0 | \n",
524 | " 8.2 | \n",
525 | " 8.178656 | \n",
526 | " 124.0 | \n",
527 | "
\n",
528 | " \n",
529 | " 18465 | \n",
530 | " The Intouchables | \n",
531 | " 5410.0 | \n",
532 | " 8.2 | \n",
533 | " 8.176357 | \n",
534 | " 112.0 | \n",
535 | "
\n",
536 | " \n",
537 | " 289 | \n",
538 | " Leon: The Professional | \n",
539 | " 4293.0 | \n",
540 | " 8.2 | \n",
541 | " 8.170276 | \n",
542 | " 110.0 | \n",
543 | "
\n",
544 | " \n",
545 | " 3030 | \n",
546 | " The Green Mile | \n",
547 | " 4166.0 | \n",
548 | " 8.2 | \n",
549 | " 8.169381 | \n",
550 | " 189.0 | \n",
551 | "
\n",
552 | " \n",
553 | " 1170 | \n",
554 | " GoodFellas | \n",
555 | " 3211.0 | \n",
556 | " 8.2 | \n",
557 | " 8.160414 | \n",
558 | " 145.0 | \n",
559 | "
\n",
560 | " \n",
561 | " 2216 | \n",
562 | " American History X | \n",
563 | " 3120.0 | \n",
564 | " 8.2 | \n",
565 | " 8.159278 | \n",
566 | " 119.0 | \n",
567 | "
\n",
568 | " \n",
569 | " 1161 | \n",
570 | " 12 Angry Men | \n",
571 | " 2130.0 | \n",
572 | " 8.2 | \n",
573 | " 8.140785 | \n",
574 | " 96.0 | \n",
575 | "
\n",
576 | " \n",
577 | " 9698 | \n",
578 | " Howl's Moving Castle | \n",
579 | " 2049.0 | \n",
580 | " 8.2 | \n",
581 | " 8.138499 | \n",
582 | " 119.0 | \n",
583 | "
\n",
584 | " \n",
585 | " 2884 | \n",
586 | " Princess Mononoke | \n",
587 | " 2041.0 | \n",
588 | " 8.2 | \n",
589 | " 8.138264 | \n",
590 | " 134.0 | \n",
591 | "
\n",
592 | " \n",
593 | "
\n",
594 | "
"
595 | ],
596 | "text/plain": [
597 | " title vote_count vote_average score \\\n",
598 | "10309 Dilwale Dulhania Le Jayenge 661.0 9.1 8.855148 \n",
599 | "314 The Shawshank Redemption 8358.0 8.5 8.482863 \n",
600 | "834 The Godfather 6024.0 8.5 8.476278 \n",
601 | "40251 Your Name. 1030.0 8.5 8.366584 \n",
602 | "12481 The Dark Knight 12269.0 8.3 8.289115 \n",
603 | "2843 Fight Club 9678.0 8.3 8.286216 \n",
604 | "292 Pulp Fiction 8670.0 8.3 8.284623 \n",
605 | "522 Schindler's List 4436.0 8.3 8.270109 \n",
606 | "23673 Whiplash 4376.0 8.3 8.269704 \n",
607 | "5481 Spirited Away 3968.0 8.3 8.266628 \n",
608 | "2211 Life Is Beautiful 3643.0 8.3 8.263691 \n",
609 | "1178 The Godfather: Part II 3418.0 8.3 8.261335 \n",
610 | "1152 One Flew Over the Cuckoo's Nest 3001.0 8.3 8.256051 \n",
611 | "1176 Psycho 2405.0 8.3 8.245381 \n",
612 | "351 Forrest Gump 8147.0 8.2 8.184252 \n",
613 | "1184 Once Upon a Time in America 1104.0 8.3 8.183804 \n",
614 | "1154 The Empire Strikes Back 5998.0 8.2 8.178656 \n",
615 | "18465 The Intouchables 5410.0 8.2 8.176357 \n",
616 | "289 Leon: The Professional 4293.0 8.2 8.170276 \n",
617 | "3030 The Green Mile 4166.0 8.2 8.169381 \n",
618 | "1170 GoodFellas 3211.0 8.2 8.160414 \n",
619 | "2216 American History X 3120.0 8.2 8.159278 \n",
620 | "1161 12 Angry Men 2130.0 8.2 8.140785 \n",
621 | "9698 Howl's Moving Castle 2049.0 8.2 8.138499 \n",
622 | "2884 Princess Mononoke 2041.0 8.2 8.138264 \n",
623 | "\n",
624 | " runtime \n",
625 | "10309 190.0 \n",
626 | "314 142.0 \n",
627 | "834 175.0 \n",
628 | "40251 106.0 \n",
629 | "12481 152.0 \n",
630 | "2843 139.0 \n",
631 | "292 154.0 \n",
632 | "522 195.0 \n",
633 | "23673 105.0 \n",
634 | "5481 125.0 \n",
635 | "2211 116.0 \n",
636 | "1178 200.0 \n",
637 | "1152 133.0 \n",
638 | "1176 109.0 \n",
639 | "351 142.0 \n",
640 | "1184 229.0 \n",
641 | "1154 124.0 \n",
642 | "18465 112.0 \n",
643 | "289 110.0 \n",
644 | "3030 189.0 \n",
645 | "1170 145.0 \n",
646 | "2216 119.0 \n",
647 | "1161 96.0 \n",
648 | "9698 119.0 \n",
649 | "2884 134.0 "
650 | ]
651 | },
652 | "execution_count": 20,
653 | "metadata": {},
654 | "output_type": "execute_result"
655 | }
656 | ],
657 | "source": [
658 | "#Sort movies in descending order of their scores\n",
659 | "q_movies = q_movies.sort_values('score', ascending=False)\n",
660 | "\n",
661 | "#Print the top 25 movies\n",
662 | "q_movies[['title', 'vote_count', 'vote_average', 'score', 'runtime']].head(25)"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": null,
668 | "metadata": {
669 | "collapsed": true
670 | },
671 | "outputs": [],
672 | "source": []
673 | }
674 | ],
675 | "metadata": {
676 | "kernelspec": {
677 | "display_name": "Python 3",
678 | "language": "python",
679 | "name": "python3"
680 | },
681 | "language_info": {
682 | "codemirror_mode": {
683 | "name": "ipython",
684 | "version": 3
685 | },
686 | "file_extension": ".py",
687 | "mimetype": "text/x-python",
688 | "name": "python",
689 | "nbconvert_exporter": "python",
690 | "pygments_lexer": "ipython3",
691 | "version": "3.6.0"
692 | }
693 | },
694 | "nbformat": 4,
695 | "nbformat_minor": 2
696 | }
697 |
--------------------------------------------------------------------------------
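A note on the score column in the Simple Recommender output above: it is an IMDB-style weighted rating that shrinks a film's vote_average towards the overall mean as its vote_count falls, which is why heavily voted 8.2 titles can outrank sparsely voted 8.3 ones. A minimal sketch of that computation (the 0.80 quantile cutoff and the names C, m and weighted_rating are illustrative, not copied from the notebook):

    #Mean rating across the whole dataset and a vote-count cutoff (illustrative quantile)
    C = df['vote_average'].mean()
    m = df['vote_count'].quantile(0.80)

    #IMDB-style weighted rating: blend the film's own rating R with the global mean C,
    #weighted by how many votes v it has relative to the cutoff m
    def weighted_rating(movie, m=m, C=C):
        v = movie['vote_count']
        R = movie['vote_average']
        return (v / (v + m)) * R + (m / (v + m)) * C

    #Keep only films above the cutoff and attach the score column shown above
    q_movies = df[df['vote_count'] >= m].copy()
    q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
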
/Chapter4/.ipynb_checkpoints/Content Based Recommenders-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Plot Description Based Recommender"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "data": {
17 | "text/html": [
18 | "\n",
19 | "\n",
32 | "
\n",
33 | " \n",
34 | " \n",
35 | " | \n",
36 | " title | \n",
37 | " genres | \n",
38 | " runtime | \n",
39 | " vote_average | \n",
40 | " vote_count | \n",
41 | " year | \n",
42 | "
\n",
43 | " \n",
44 | " \n",
45 | " \n",
46 | " 0 | \n",
47 | " Toy Story | \n",
48 | " ['animation', 'comedy', 'family'] | \n",
49 | " 81.0 | \n",
50 | " 7.7 | \n",
51 | " 5415.0 | \n",
52 | " 1995 | \n",
53 | "
\n",
54 | " \n",
55 | " 1 | \n",
56 | " Jumanji | \n",
57 | " ['adventure', 'fantasy', 'family'] | \n",
58 | " 104.0 | \n",
59 | " 6.9 | \n",
60 | " 2413.0 | \n",
61 | " 1995 | \n",
62 | "
\n",
63 | " \n",
64 | " 2 | \n",
65 | " Grumpier Old Men | \n",
66 | " ['romance', 'comedy'] | \n",
67 | " 101.0 | \n",
68 | " 6.5 | \n",
69 | " 92.0 | \n",
70 | " 1995 | \n",
71 | "
\n",
72 | " \n",
73 | " 3 | \n",
74 | " Waiting to Exhale | \n",
75 | " ['comedy', 'drama', 'romance'] | \n",
76 | " 127.0 | \n",
77 | " 6.1 | \n",
78 | " 34.0 | \n",
79 | " 1995 | \n",
80 | "
\n",
81 | " \n",
82 | " 4 | \n",
83 | " Father of the Bride Part II | \n",
84 | " ['comedy'] | \n",
85 | " 106.0 | \n",
86 | " 5.7 | \n",
87 | " 173.0 | \n",
88 | " 1995 | \n",
89 | "
\n",
90 | " \n",
91 | "
\n",
92 | "
"
93 | ],
94 | "text/plain": [
95 | " title genres runtime \\\n",
96 | "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
97 | "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
98 | "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
99 | "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
100 | "4 Father of the Bride Part II ['comedy'] 106.0 \n",
101 | "\n",
102 | " vote_average vote_count year \n",
103 | "0 7.7 5415.0 1995 \n",
104 | "1 6.9 2413.0 1995 \n",
105 | "2 6.5 92.0 1995 \n",
106 | "3 6.1 34.0 1995 \n",
107 | "4 5.7 173.0 1995 "
108 | ]
109 | },
110 | "execution_count": 1,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "import pandas as pd\n",
117 | "import numpy as np\n",
118 | "\n",
119 | "#Import data from the clean file \n",
120 | "df = pd.read_csv('../data/metadata_clean.csv')\n",
121 | "\n",
122 | "#Print the head of the cleaned DataFrame\n",
123 | "df.head()"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 2,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/html": [
134 | "\n",
135 | "\n",
148 | "
\n",
149 | " \n",
150 | " \n",
151 | " | \n",
152 | " title | \n",
153 | " genres | \n",
154 | " runtime | \n",
155 | " vote_average | \n",
156 | " vote_count | \n",
157 | " year | \n",
158 | " overview | \n",
159 | " id | \n",
160 | "
\n",
161 | " \n",
162 | " \n",
163 | " \n",
164 | " 0 | \n",
165 | " Toy Story | \n",
166 | " ['animation', 'comedy', 'family'] | \n",
167 | " 81.0 | \n",
168 | " 7.7 | \n",
169 | " 5415.0 | \n",
170 | " 1995 | \n",
171 | " Led by Woody, Andy's toys live happily in his ... | \n",
172 | " 862 | \n",
173 | "
\n",
174 | " \n",
175 | " 1 | \n",
176 | " Jumanji | \n",
177 | " ['adventure', 'fantasy', 'family'] | \n",
178 | " 104.0 | \n",
179 | " 6.9 | \n",
180 | " 2413.0 | \n",
181 | " 1995 | \n",
182 | " When siblings Judy and Peter discover an encha... | \n",
183 | " 8844 | \n",
184 | "
\n",
185 | " \n",
186 | " 2 | \n",
187 | " Grumpier Old Men | \n",
188 | " ['romance', 'comedy'] | \n",
189 | " 101.0 | \n",
190 | " 6.5 | \n",
191 | " 92.0 | \n",
192 | " 1995 | \n",
193 | " A family wedding reignites the ancient feud be... | \n",
194 | " 15602 | \n",
195 | "
\n",
196 | " \n",
197 | " 3 | \n",
198 | " Waiting to Exhale | \n",
199 | " ['comedy', 'drama', 'romance'] | \n",
200 | " 127.0 | \n",
201 | " 6.1 | \n",
202 | " 34.0 | \n",
203 | " 1995 | \n",
204 | " Cheated on, mistreated and stepped on, the wom... | \n",
205 | " 31357 | \n",
206 | "
\n",
207 | " \n",
208 | " 4 | \n",
209 | " Father of the Bride Part II | \n",
210 | " ['comedy'] | \n",
211 | " 106.0 | \n",
212 | " 5.7 | \n",
213 | " 173.0 | \n",
214 | " 1995 | \n",
215 | " Just when George Banks has recovered from his ... | \n",
216 | " 11862 | \n",
217 | "
\n",
218 | " \n",
219 | "
\n",
220 | "
"
221 | ],
222 | "text/plain": [
223 | " title genres runtime \\\n",
224 | "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
225 | "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
226 | "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
227 | "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
228 | "4 Father of the Bride Part II ['comedy'] 106.0 \n",
229 | "\n",
230 | " vote_average vote_count year \\\n",
231 | "0 7.7 5415.0 1995 \n",
232 | "1 6.9 2413.0 1995 \n",
233 | "2 6.5 92.0 1995 \n",
234 | "3 6.1 34.0 1995 \n",
235 | "4 5.7 173.0 1995 \n",
236 | "\n",
237 | " overview id \n",
238 | "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
239 | "1 When siblings Judy and Peter discover an encha... 8844 \n",
240 | "2 A family wedding reignites the ancient feud be... 15602 \n",
241 | "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
242 | "4 Just when George Banks has recovered from his ... 11862 "
243 | ]
244 | },
245 | "execution_count": 2,
246 | "metadata": {},
247 | "output_type": "execute_result"
248 | }
249 | ],
250 | "source": [
251 | "#Import the original file\n",
252 | "orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)\n",
253 | "\n",
254 | "#Add the useful features into the cleaned dataframe\n",
255 | "df['overview'], df['id'] = orig_df['overview'], orig_df['id']\n",
256 | "\n",
257 | "df.head()"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 3,
263 | "metadata": {},
264 | "outputs": [
265 | {
266 | "data": {
267 | "text/plain": [
268 | "(45466, 75827)"
269 | ]
270 | },
271 | "execution_count": 3,
272 | "metadata": {},
273 | "output_type": "execute_result"
274 | }
275 | ],
276 | "source": [
277 | "#Import TfIdfVectorizer from the scikit-learn library\n",
278 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
279 | "\n",
280 | "#Define a TF-IDF Vectorizer Object. Remove all english stopwords\n",
281 | "tfidf = TfidfVectorizer(stop_words='english')\n",
282 | "\n",
283 | "#Replace NaN with an empty string\n",
284 | "df['overview'] = df['overview'].fillna('')\n",
285 | "\n",
286 | "#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature\n",
287 | "tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
288 | "\n",
289 | "#Output the shape of tfidf_matrix\n",
290 | "tfidf_matrix.shape"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 4,
296 | "metadata": {
297 | "collapsed": true
298 | },
299 | "outputs": [],
300 | "source": [
301 | "# Import linear_kernel to compute the dot product\n",
302 | "from sklearn.metrics.pairwise import linear_kernel\n",
303 | "\n",
304 | "# Compute the cosine similarity matrix\n",
305 | "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 5,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any\n",
315 | "indices = pd.Series(df.index, index=df['title']).drop_duplicates()"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 6,
321 | "metadata": {
322 | "collapsed": true
323 | },
324 | "outputs": [],
325 | "source": [
326 | "# Function that takes in movie title as input and gives recommendations \n",
327 | "def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):\n",
328 | " # Obtain the index of the movie that matches the title\n",
329 | " idx = indices[title]\n",
330 | "\n",
331 |     "    # Get the pairwise similarity scores of all movies with that movie\n",
332 | " # And convert it into a list of tuples as described above\n",
333 | " sim_scores = list(enumerate(cosine_sim[idx]))\n",
334 | "\n",
335 | " # Sort the movies based on the cosine similarity scores\n",
336 | " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
337 | "\n",
338 | " # Get the scores of the 10 most similar movies. Ignore the first movie.\n",
339 | " sim_scores = sim_scores[1:11]\n",
340 | "\n",
341 | " # Get the movie indices\n",
342 | " movie_indices = [i[0] for i in sim_scores]\n",
343 | "\n",
344 | " # Return the top 10 most similar movies\n",
345 | " return df['title'].iloc[movie_indices]"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 7,
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/plain": [
356 | "34682 How the Lion Cub and the Turtle Sang a Song\n",
357 | "9353 The Lion King 1½\n",
358 | "9115 The Lion King 2: Simba's Pride\n",
359 | "42829 Prey\n",
360 | "25654 Fearless Fagan\n",
361 | "17041 African Cats\n",
362 | "27933 Massaï, les guerriers de la pluie\n",
363 | "6094 Born Free\n",
364 | "37409 Sour Grape\n",
365 | "3203 The Waiting Game\n",
366 | "Name: title, dtype: object"
367 | ]
368 | },
369 | "execution_count": 7,
370 | "metadata": {},
371 | "output_type": "execute_result"
372 | }
373 | ],
374 | "source": [
375 | "#Get recommendations for The Lion King\n",
376 | "content_recommender('The Lion King')"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "# Metadata Based Recommender"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 8,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "# Load the keywords and credits files\n",
393 | "cred_df = pd.read_csv('../data/credits.csv')\n",
394 | "key_df = pd.read_csv('../data/keywords.csv')"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 9,
400 | "metadata": {},
401 | "outputs": [
402 | {
403 | "data": {
404 | "text/html": [
405 | "\n",
406 | "\n",
419 | "
\n",
420 | " \n",
421 | " \n",
422 | " | \n",
423 | " cast | \n",
424 | " crew | \n",
425 | " id | \n",
426 | "
\n",
427 | " \n",
428 | " \n",
429 | " \n",
430 | " 0 | \n",
431 | " [{'cast_id': 14, 'character': 'Woody (voice)',... | \n",
432 | " [{'credit_id': '52fe4284c3a36847f8024f49', 'de... | \n",
433 | " 862 | \n",
434 | "
\n",
435 | " \n",
436 | " 1 | \n",
437 | " [{'cast_id': 1, 'character': 'Alan Parrish', '... | \n",
438 | " [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... | \n",
439 | " 8844 | \n",
440 | "
\n",
441 | " \n",
442 | " 2 | \n",
443 | " [{'cast_id': 2, 'character': 'Max Goldman', 'c... | \n",
444 | " [{'credit_id': '52fe466a9251416c75077a89', 'de... | \n",
445 | " 15602 | \n",
446 | "
\n",
447 | " \n",
448 | " 3 | \n",
449 | " [{'cast_id': 1, 'character': \"Savannah 'Vannah... | \n",
450 | " [{'credit_id': '52fe44779251416c91011acb', 'de... | \n",
451 | " 31357 | \n",
452 | "
\n",
453 | " \n",
454 | " 4 | \n",
455 | " [{'cast_id': 1, 'character': 'George Banks', '... | \n",
456 | " [{'credit_id': '52fe44959251416c75039ed7', 'de... | \n",
457 | " 11862 | \n",
458 | "
\n",
459 | " \n",
460 | "
\n",
461 | "
"
462 | ],
463 | "text/plain": [
464 | " cast \\\n",
465 | "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
466 | "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
467 | "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
468 | "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
469 | "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
470 | "\n",
471 | " crew id \n",
472 | "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n",
473 | "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n",
474 | "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n",
475 | "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n",
476 | "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 "
477 | ]
478 | },
479 | "execution_count": 9,
480 | "metadata": {},
481 | "output_type": "execute_result"
482 | }
483 | ],
484 | "source": [
485 | "#Print the head of the credit dataframe\n",
486 | "cred_df.head()"
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": 10,
492 | "metadata": {},
493 | "outputs": [
494 | {
495 | "data": {
496 | "text/html": [
497 | "\n",
498 | "\n",
511 | "
\n",
512 | " \n",
513 | " \n",
514 | " | \n",
515 | " id | \n",
516 | " keywords | \n",
517 | "
\n",
518 | " \n",
519 | " \n",
520 | " \n",
521 | " 0 | \n",
522 | " 862 | \n",
523 | " [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... | \n",
524 | "
\n",
525 | " \n",
526 | " 1 | \n",
527 | " 8844 | \n",
528 | " [{'id': 10090, 'name': 'board game'}, {'id': 1... | \n",
529 | "
\n",
530 | " \n",
531 | " 2 | \n",
532 | " 15602 | \n",
533 | " [{'id': 1495, 'name': 'fishing'}, {'id': 12392... | \n",
534 | "
\n",
535 | " \n",
536 | " 3 | \n",
537 | " 31357 | \n",
538 | " [{'id': 818, 'name': 'based on novel'}, {'id':... | \n",
539 | "
\n",
540 | " \n",
541 | " 4 | \n",
542 | " 11862 | \n",
543 | " [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... | \n",
544 | "
\n",
545 | " \n",
546 | "
\n",
547 | "
"
548 | ],
549 | "text/plain": [
550 | " id keywords\n",
551 | "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n",
552 | "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n",
553 | "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n",
554 | "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n",
555 | "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
556 | ]
557 | },
558 | "execution_count": 10,
559 | "metadata": {},
560 | "output_type": "execute_result"
561 | }
562 | ],
563 | "source": [
564 | "#Print the head of the keywords dataframe\n",
565 | "key_df.head()"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 11,
571 | "metadata": {},
572 | "outputs": [
573 | {
574 | "ename": "ValueError",
575 | "evalue": "invalid literal for int() with base 10: '1997-08-20'",
576 | "output_type": "error",
577 | "traceback": [
578 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
579 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
580 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#Convert the IDs of df into int\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'int'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
581 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 91\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 92\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_deprecate_kwarg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
582 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors, **kwargs)\u001b[0m\n\u001b[1;32m 3408\u001b[0m \u001b[0;31m# else, only a single dtype is given\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3409\u001b[0m new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,\n\u001b[0;32m-> 3410\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 3411\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3412\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
583 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, **kwargs)\u001b[0m\n\u001b[1;32m 3222\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3223\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3224\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'astype'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3225\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3226\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mconvert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
584 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)\u001b[0m\n\u001b[1;32m 3089\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3090\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'mgr'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3091\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3092\u001b[0m \u001b[0mresult_blocks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_extend_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mapplied\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult_blocks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3093\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
585 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors, values, **kwargs)\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'raise'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 470\u001b[0m return self._astype(dtype, copy=copy, errors=errors, values=values,\n\u001b[0;32m--> 471\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 472\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m def _astype(self, dtype, copy=False, errors='raise', values=None,\n",
586 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m_astype\u001b[0;34m(self, dtype, copy, errors, values, klass, mgr, raise_on_error, **kwargs)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;31m# _astype_nansafe works fine with 1-d only\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mastype_nansafe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
587 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/dtypes/cast.py\u001b[0m in \u001b[0;36mastype_nansafe\u001b[0;34m(arr, dtype, copy)\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobject_\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0missubdtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minteger\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0;31m# work around NumPy brokenness, #1987\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 625\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype_intsafe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 626\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"datetime64\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"timedelta64\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
588 | "\u001b[0;32mpandas/_libs/lib.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.astype_intsafe (pandas/_libs/lib.c:16264)\u001b[0;34m()\u001b[0m\n",
589 | "\u001b[0;32mpandas/_libs/src/util.pxd\u001b[0m in \u001b[0;36mutil.set_value_at_unsafe (pandas/_libs/lib.c:73298)\u001b[0;34m()\u001b[0m\n",
590 | "\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: '1997-08-20'"
591 | ]
592 | }
593 | ],
594 | "source": [
595 | "#Convert the IDs of df into int\n",
596 | "df['id'] = df['id'].astype('int')"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 12,
602 | "metadata": {
603 | "collapsed": true
604 | },
605 | "outputs": [],
606 | "source": [
607 | "# Function to convert all non-integer IDs to NaN\n",
608 | "def clean_ids(x):\n",
609 | " try:\n",
610 | " return int(x)\n",
611 | " except:\n",
612 | " return np.nan"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": 13,
618 | "metadata": {
619 | "collapsed": true
620 | },
621 | "outputs": [],
622 | "source": [
623 | "#Clean the ids of df\n",
624 | "df['id'] = df['id'].apply(clean_ids)\n",
625 | "\n",
626 | "#Filter all rows that have a null ID\n",
627 | "df = df[df['id'].notnull()]"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 14,
633 | "metadata": {},
634 | "outputs": [
635 | {
636 | "name": "stderr",
637 | "output_type": "stream",
638 | "text": [
639 | "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
640 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
641 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
642 | "\n",
643 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
644 | " \n"
645 | ]
646 | },
647 | {
648 | "data": {
649 | "text/html": [
650 | "\n",
651 | "\n",
664 | "
\n",
665 | " \n",
666 | " \n",
667 | " | \n",
668 | " title | \n",
669 | " genres | \n",
670 | " runtime | \n",
671 | " vote_average | \n",
672 | " vote_count | \n",
673 | " year | \n",
674 | " overview | \n",
675 | " id | \n",
676 | " cast | \n",
677 | " crew | \n",
678 | " keywords | \n",
679 | "
\n",
680 | " \n",
681 | " \n",
682 | " \n",
683 | " 0 | \n",
684 | " Toy Story | \n",
685 | " ['animation', 'comedy', 'family'] | \n",
686 | " 81.0 | \n",
687 | " 7.7 | \n",
688 | " 5415.0 | \n",
689 | " 1995 | \n",
690 | " Led by Woody, Andy's toys live happily in his ... | \n",
691 | " 862 | \n",
692 | " [{'cast_id': 14, 'character': 'Woody (voice)',... | \n",
693 | " [{'credit_id': '52fe4284c3a36847f8024f49', 'de... | \n",
694 | " [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... | \n",
695 | "
\n",
696 | " \n",
697 | " 1 | \n",
698 | " Jumanji | \n",
699 | " ['adventure', 'fantasy', 'family'] | \n",
700 | " 104.0 | \n",
701 | " 6.9 | \n",
702 | " 2413.0 | \n",
703 | " 1995 | \n",
704 | " When siblings Judy and Peter discover an encha... | \n",
705 | " 8844 | \n",
706 | " [{'cast_id': 1, 'character': 'Alan Parrish', '... | \n",
707 | " [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... | \n",
708 | " [{'id': 10090, 'name': 'board game'}, {'id': 1... | \n",
709 | "
\n",
710 | " \n",
711 | " 2 | \n",
712 | " Grumpier Old Men | \n",
713 | " ['romance', 'comedy'] | \n",
714 | " 101.0 | \n",
715 | " 6.5 | \n",
716 | " 92.0 | \n",
717 | " 1995 | \n",
718 | " A family wedding reignites the ancient feud be... | \n",
719 | " 15602 | \n",
720 | " [{'cast_id': 2, 'character': 'Max Goldman', 'c... | \n",
721 | " [{'credit_id': '52fe466a9251416c75077a89', 'de... | \n",
722 | " [{'id': 1495, 'name': 'fishing'}, {'id': 12392... | \n",
723 | "
\n",
724 | " \n",
725 | " 3 | \n",
726 | " Waiting to Exhale | \n",
727 | " ['comedy', 'drama', 'romance'] | \n",
728 | " 127.0 | \n",
729 | " 6.1 | \n",
730 | " 34.0 | \n",
731 | " 1995 | \n",
732 | " Cheated on, mistreated and stepped on, the wom... | \n",
733 | " 31357 | \n",
734 | " [{'cast_id': 1, 'character': \"Savannah 'Vannah... | \n",
735 | " [{'credit_id': '52fe44779251416c91011acb', 'de... | \n",
736 | " [{'id': 818, 'name': 'based on novel'}, {'id':... | \n",
737 | "
\n",
738 | " \n",
739 | " 4 | \n",
740 | " Father of the Bride Part II | \n",
741 | " ['comedy'] | \n",
742 | " 106.0 | \n",
743 | " 5.7 | \n",
744 | " 173.0 | \n",
745 | " 1995 | \n",
746 | " Just when George Banks has recovered from his ... | \n",
747 | " 11862 | \n",
748 | " [{'cast_id': 1, 'character': 'George Banks', '... | \n",
749 | " [{'credit_id': '52fe44959251416c75039ed7', 'de... | \n",
750 | " [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... | \n",
751 | "
\n",
752 | " \n",
753 | "
\n",
754 | "
"
755 | ],
756 | "text/plain": [
757 | " title genres runtime \\\n",
758 | "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
759 | "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
760 | "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
761 | "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
762 | "4 Father of the Bride Part II ['comedy'] 106.0 \n",
763 | "\n",
764 | " vote_average vote_count year \\\n",
765 | "0 7.7 5415.0 1995 \n",
766 | "1 6.9 2413.0 1995 \n",
767 | "2 6.5 92.0 1995 \n",
768 | "3 6.1 34.0 1995 \n",
769 | "4 5.7 173.0 1995 \n",
770 | "\n",
771 | " overview id \\\n",
772 | "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
773 | "1 When siblings Judy and Peter discover an encha... 8844 \n",
774 | "2 A family wedding reignites the ancient feud be... 15602 \n",
775 | "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
776 | "4 Just when George Banks has recovered from his ... 11862 \n",
777 | "\n",
778 | " cast \\\n",
779 | "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
780 | "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
781 | "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
782 | "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
783 | "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
784 | "\n",
785 | " crew \\\n",
786 | "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... \n",
787 | "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... \n",
788 | "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... \n",
789 | "3 [{'credit_id': '52fe44779251416c91011acb', 'de... \n",
790 | "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... \n",
791 | "\n",
792 | " keywords \n",
793 | "0 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... \n",
794 | "1 [{'id': 10090, 'name': 'board game'}, {'id': 1... \n",
795 | "2 [{'id': 1495, 'name': 'fishing'}, {'id': 12392... \n",
796 | "3 [{'id': 818, 'name': 'based on novel'}, {'id':... \n",
797 | "4 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... "
798 | ]
799 | },
800 | "execution_count": 14,
801 | "metadata": {},
802 | "output_type": "execute_result"
803 | }
804 | ],
805 | "source": [
806 | "# Convert IDs into integer\n",
807 | "df['id'] = df['id'].astype('int')\n",
808 | "key_df['id'] = key_df['id'].astype('int')\n",
809 | "cred_df['id'] = cred_df['id'].astype('int')\n",
810 | "\n",
811 | "# Merge keywords and credits into your main metadata dataframe\n",
812 | "df = df.merge(cred_df, on='id')\n",
813 | "df = df.merge(key_df, on='id')\n",
814 | "\n",
815 | "#Display the head of df\n",
816 | "df.head()"
817 | ]
818 | },
819 | {
820 | "cell_type": "code",
821 | "execution_count": 15,
822 | "metadata": {
823 | "collapsed": true
824 | },
825 | "outputs": [],
826 | "source": [
827 | "# Convert the stringified objects into the native python objects\n",
828 | "from ast import literal_eval\n",
829 | "\n",
830 | "features = ['cast', 'crew', 'keywords', 'genres']\n",
831 | "for feature in features:\n",
832 | " df[feature] = df[feature].apply(literal_eval)"
833 | ]
834 | },
835 | {
836 | "cell_type": "code",
837 | "execution_count": 16,
838 | "metadata": {},
839 | "outputs": [
840 | {
841 | "data": {
842 | "text/plain": [
843 | "{'credit_id': '52fe4284c3a36847f8024f49',\n",
844 | " 'department': 'Directing',\n",
845 | " 'gender': 2,\n",
846 | " 'id': 7879,\n",
847 | " 'job': 'Director',\n",
848 | " 'name': 'John Lasseter',\n",
849 | " 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}"
850 | ]
851 | },
852 | "execution_count": 16,
853 | "metadata": {},
854 | "output_type": "execute_result"
855 | }
856 | ],
857 | "source": [
858 |     "#Print the first crew member of the first movie in df\n",
859 | "df.iloc[0]['crew'][0]"
860 | ]
861 | },
862 | {
863 | "cell_type": "code",
864 | "execution_count": 17,
865 | "metadata": {
866 | "collapsed": true
867 | },
868 | "outputs": [],
869 | "source": [
870 | "# Extract the director's name. If director is not listed, return NaN\n",
871 | "def get_director(x):\n",
872 | " for crew_member in x:\n",
873 | " if crew_member['job'] == 'Director':\n",
874 | " return crew_member['name']\n",
875 | " return np.nan"
876 | ]
877 | },
878 | {
879 | "cell_type": "code",
880 | "execution_count": 18,
881 | "metadata": {},
882 | "outputs": [
883 | {
884 | "data": {
885 | "text/plain": [
886 | "0 John Lasseter\n",
887 | "1 Joe Johnston\n",
888 | "2 Howard Deutch\n",
889 | "3 Forest Whitaker\n",
890 | "4 Charles Shyer\n",
891 | "Name: director, dtype: object"
892 | ]
893 | },
894 | "execution_count": 18,
895 | "metadata": {},
896 | "output_type": "execute_result"
897 | }
898 | ],
899 | "source": [
900 | "#Define the new director feature\n",
901 | "df['director'] = df['crew'].apply(get_director)\n",
902 | "\n",
903 | "#Print the directors of the first five movies\n",
904 | "df['director'].head()"
905 | ]
906 | },
907 | {
908 | "cell_type": "code",
909 | "execution_count": 19,
910 | "metadata": {
911 | "collapsed": true
912 | },
913 | "outputs": [],
914 | "source": [
915 |     "# Return the top 3 elements of the list, or the entire list if it has 3 or fewer elements.\n",
916 | "def generate_list(x):\n",
917 | " if isinstance(x, list):\n",
918 | " names = [i['name'] for i in x]\n",
919 | " #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
920 | " if len(names) > 3:\n",
921 | " names = names[:3]\n",
922 | " return names\n",
923 | "\n",
924 | " #Return empty list in case of missing/malformed data\n",
925 | " return []"
926 | ]
927 | },
928 | {
929 | "cell_type": "code",
930 | "execution_count": 20,
931 | "metadata": {},
932 | "outputs": [],
933 | "source": [
934 | "#Apply the generate_list function to cast and keywords\n",
935 | "df['cast'] = df['cast'].apply(generate_list)\n",
936 | "df['keywords'] = df['keywords'].apply(generate_list)"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": 21,
942 | "metadata": {
943 | "collapsed": true
944 | },
945 | "outputs": [],
946 | "source": [
947 | "#Only consider a maximum of 3 genres\n",
948 | "df['genres'] = df['genres'].apply(lambda x: x[:3])"
949 | ]
950 | },
951 | {
952 | "cell_type": "code",
953 | "execution_count": 22,
954 | "metadata": {},
955 | "outputs": [
956 | {
957 | "data": {
958 | "text/html": [
959 | "\n",
960 | "\n",
973 | "
\n",
974 | " \n",
975 | " \n",
976 | " | \n",
977 | " title | \n",
978 | " cast | \n",
979 | " director | \n",
980 | " keywords | \n",
981 | " genres | \n",
982 | "
\n",
983 | " \n",
984 | " \n",
985 | " \n",
986 | " 0 | \n",
987 | " Toy Story | \n",
988 | " [Tom Hanks, Tim Allen, Don Rickles] | \n",
989 | " John Lasseter | \n",
990 | " [jealousy, toy, boy] | \n",
991 | " [animation, comedy, family] | \n",
992 | "
\n",
993 | " \n",
994 | " 1 | \n",
995 | " Jumanji | \n",
996 | " [Robin Williams, Jonathan Hyde, Kirsten Dunst] | \n",
997 | " Joe Johnston | \n",
998 | " [board game, disappearance, based on children'... | \n",
999 | " [adventure, fantasy, family] | \n",
1000 | "
\n",
1001 | " \n",
1002 | " 2 | \n",
1003 | " Grumpier Old Men | \n",
1004 | " [Walter Matthau, Jack Lemmon, Ann-Margret] | \n",
1005 | " Howard Deutch | \n",
1006 | " [fishing, best friend, duringcreditsstinger] | \n",
1007 | " [romance, comedy] | \n",
1008 | "
\n",
1009 | " \n",
1010 | " 3 | \n",
1011 | " Waiting to Exhale | \n",
1012 | " [Whitney Houston, Angela Bassett, Loretta Devine] | \n",
1013 | " Forest Whitaker | \n",
1014 | " [based on novel, interracial relationship, sin... | \n",
1015 | " [comedy, drama, romance] | \n",
1016 | "
\n",
1017 | " \n",
1018 | " 4 | \n",
1019 | " Father of the Bride Part II | \n",
1020 | " [Steve Martin, Diane Keaton, Martin Short] | \n",
1021 | " Charles Shyer | \n",
1022 | " [baby, midlife crisis, confidence] | \n",
1023 | " [comedy] | \n",
1024 | "
\n",
1025 | " \n",
1026 | "
\n",
1027 | "
"
1028 | ],
1029 | "text/plain": [
1030 | " title \\\n",
1031 | "0 Toy Story \n",
1032 | "1 Jumanji \n",
1033 | "2 Grumpier Old Men \n",
1034 | "3 Waiting to Exhale \n",
1035 | "4 Father of the Bride Part II \n",
1036 | "\n",
1037 | " cast director \\\n",
1038 | "0 [Tom Hanks, Tim Allen, Don Rickles] John Lasseter \n",
1039 | "1 [Robin Williams, Jonathan Hyde, Kirsten Dunst] Joe Johnston \n",
1040 | "2 [Walter Matthau, Jack Lemmon, Ann-Margret] Howard Deutch \n",
1041 | "3 [Whitney Houston, Angela Bassett, Loretta Devine] Forest Whitaker \n",
1042 | "4 [Steve Martin, Diane Keaton, Martin Short] Charles Shyer \n",
1043 | "\n",
1044 | " keywords \\\n",
1045 | "0 [jealousy, toy, boy] \n",
1046 | "1 [board game, disappearance, based on children'... \n",
1047 | "2 [fishing, best friend, duringcreditsstinger] \n",
1048 | "3 [based on novel, interracial relationship, sin... \n",
1049 | "4 [baby, midlife crisis, confidence] \n",
1050 | "\n",
1051 | " genres \n",
1052 | "0 [animation, comedy, family] \n",
1053 | "1 [adventure, fantasy, family] \n",
1054 | "2 [romance, comedy] \n",
1055 | "3 [comedy, drama, romance] \n",
1056 | "4 [comedy] "
1057 | ]
1058 | },
1059 | "execution_count": 22,
1060 | "metadata": {},
1061 | "output_type": "execute_result"
1062 | }
1063 | ],
1064 | "source": [
1065 | "# Print the new features of the first 5 movies along with title\n",
1066 | "df[['title', 'cast', 'director', 'keywords', 'genres']].head()"
1067 | ]
1068 | },
1069 | {
1070 | "cell_type": "code",
1071 | "execution_count": 23,
1072 | "metadata": {
1073 | "collapsed": true
1074 | },
1075 | "outputs": [],
1076 | "source": [
1077 | "# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase\n",
1078 | "def sanitize(x):\n",
1079 | " if isinstance(x, list):\n",
1080 | " #Strip spaces and convert to lowercase\n",
1081 | " return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
1082 | " else:\n",
1083 | " #Check if director exists. If not, return empty string\n",
1084 | " if isinstance(x, str):\n",
1085 | " return str.lower(x.replace(\" \", \"\"))\n",
1086 | " else:\n",
1087 | " return ''"
1088 | ]
1089 | },
1090 | {
1091 | "cell_type": "code",
1092 | "execution_count": 24,
1093 | "metadata": {},
1094 | "outputs": [],
1095 | "source": [
1096 |     "#Apply the sanitize function to cast, keywords, director and genres\n",
1097 | "for feature in ['cast', 'director', 'genres', 'keywords']:\n",
1098 | " df[feature] = df[feature].apply(sanitize)"
1099 | ]
1100 | },
1101 | {
1102 | "cell_type": "code",
1103 | "execution_count": 25,
1104 | "metadata": {
1105 | "scrolled": true
1106 | },
1107 | "outputs": [],
1108 | "source": [
1109 | "#Function that creates a soup out of the desired metadata\n",
1110 | "def create_soup(x):\n",
1111 | " return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])"
1112 | ]
1113 | },
1114 | {
1115 | "cell_type": "code",
1116 | "execution_count": 26,
1117 | "metadata": {
1118 | "collapsed": true
1119 | },
1120 | "outputs": [],
1121 | "source": [
1122 | "# Create the new soup feature\n",
1123 | "df['soup'] = df.apply(create_soup, axis=1)"
1124 | ]
1125 | },
1126 | {
1127 | "cell_type": "code",
1128 | "execution_count": 27,
1129 | "metadata": {},
1130 | "outputs": [
1131 | {
1132 | "data": {
1133 | "text/plain": [
1134 | "'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'"
1135 | ]
1136 | },
1137 | "execution_count": 27,
1138 | "metadata": {},
1139 | "output_type": "execute_result"
1140 | }
1141 | ],
1142 | "source": [
1143 | "#Display the soup of the first movie\n",
1144 | "df.iloc[0]['soup']"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "code",
1149 | "execution_count": 28,
1150 | "metadata": {},
1151 | "outputs": [],
1152 | "source": [
1153 | "# Import CountVectorizer\n",
1154 | "from sklearn.feature_extraction.text import CountVectorizer\n",
1155 | "\n",
1156 | "#Define a new CountVectorizer object and create vectors for the soup\n",
1157 | "count = CountVectorizer(stop_words='english')\n",
1158 | "count_matrix = count.fit_transform(df['soup'])"
1159 | ]
1160 | },
1161 | {
1162 | "cell_type": "code",
1163 | "execution_count": 29,
1164 | "metadata": {},
1165 | "outputs": [],
1166 | "source": [
1167 | "#Import cosine_similarity function\n",
1168 | "from sklearn.metrics.pairwise import cosine_similarity\n",
1169 | "\n",
1170 |     "#Compute the cosine similarity matrix (count vectors are not L2-normalized, so a plain dot product would not equal cosine similarity here)\n",
1171 | "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
1172 | ]
1173 | },
1174 | {
1175 | "cell_type": "code",
1176 | "execution_count": 34,
1177 | "metadata": {
1178 | "collapsed": true
1179 | },
1180 | "outputs": [],
1181 | "source": [
1182 | "# Reset index of your df and construct reverse mapping again\n",
1183 | "df = df.reset_index()\n",
1184 | "indices2 = pd.Series(df.index, index=df['title'])"
1185 | ]
1186 | },
1187 | {
1188 | "cell_type": "code",
1189 | "execution_count": 37,
1190 | "metadata": {},
1191 | "outputs": [
1192 | {
1193 | "data": {
1194 | "text/plain": [
1195 | "29607 Cheburashka\n",
1196 | "40904 VeggieTales: Josh and the Big Wall\n",
1197 | "40913 VeggieTales: Minnesota Cuke and the Search for...\n",
1198 | "27768 The Little Matchgirl\n",
1199 | "15209 Spiderman: The Ultimate Villain Showdown\n",
1200 | "16613 Cirque du Soleil: Varekai\n",
1201 | "24654 The Seventh Brother\n",
1202 | "29198 Superstar Goofy\n",
1203 | "30244 My Love\n",
1204 | "31179 Pokémon: Arceus and the Jewel of Life\n",
1205 | "Name: title, dtype: object"
1206 | ]
1207 | },
1208 | "execution_count": 37,
1209 | "metadata": {},
1210 | "output_type": "execute_result"
1211 | }
1212 | ],
1213 | "source": [
1214 | "content_recommender('The Lion King', cosine_sim2, df, indices2)"
1215 | ]
1216 | },
1217 | {
1218 | "cell_type": "code",
1219 | "execution_count": null,
1220 | "metadata": {},
1221 | "outputs": [],
1222 | "source": []
1223 | }
1224 | ],
1225 | "metadata": {
1226 | "kernelspec": {
1227 | "display_name": "Python 3",
1228 | "language": "python",
1229 | "name": "python3"
1230 | },
1231 | "language_info": {
1232 | "codemirror_mode": {
1233 | "name": "ipython",
1234 | "version": 3
1235 | },
1236 | "file_extension": ".py",
1237 | "mimetype": "text/x-python",
1238 | "name": "python",
1239 | "nbconvert_exporter": "python",
1240 | "pygments_lexer": "ipython3",
1241 | "version": "3.6.0"
1242 | }
1243 | },
1244 | "nbformat": 4,
1245 | "nbformat_minor": 2
1246 | }
1247 |
--------------------------------------------------------------------------------
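One design note on the similarity computations in the notebook above: linear_kernel can stand in for cosine similarity on the TF-IDF matrix because TfidfVectorizer L2-normalizes each row by default, so the dot product of two rows already is their cosine; the count vectors built from the soup are not normalized, which is why the metadata recommender switches to cosine_similarity. A small self-contained check of that equivalence (toy corpus, illustrative only):

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

    #Toy corpus; TfidfVectorizer L2-normalizes rows, so dot products are already cosines
    docs = ["a lion cub grows up", "a toy comes to life", "a lion becomes king"]
    tfidf = TfidfVectorizer(stop_words='english').fit_transform(docs)

    assert np.allclose(linear_kernel(tfidf, tfidf), cosine_similarity(tfidf, tfidf))
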
/Chapter4/Content Based Recommenders.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Plot Description Based Recommender"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "data": {
17 | "text/html": [
18 | "\n",
19 | "\n",
32 | "
\n",
33 | " \n",
34 | " \n",
35 | " | \n",
36 | " title | \n",
37 | " genres | \n",
38 | " runtime | \n",
39 | " vote_average | \n",
40 | " vote_count | \n",
41 | " year | \n",
42 | "
\n",
43 | " \n",
44 | " \n",
45 | " \n",
46 | " 0 | \n",
47 | " Toy Story | \n",
48 | " ['animation', 'comedy', 'family'] | \n",
49 | " 81.0 | \n",
50 | " 7.7 | \n",
51 | " 5415.0 | \n",
52 | " 1995 | \n",
53 | "
\n",
54 | " \n",
55 | " 1 | \n",
56 | " Jumanji | \n",
57 | " ['adventure', 'fantasy', 'family'] | \n",
58 | " 104.0 | \n",
59 | " 6.9 | \n",
60 | " 2413.0 | \n",
61 | " 1995 | \n",
62 | "
\n",
63 | " \n",
64 | " 2 | \n",
65 | " Grumpier Old Men | \n",
66 | " ['romance', 'comedy'] | \n",
67 | " 101.0 | \n",
68 | " 6.5 | \n",
69 | " 92.0 | \n",
70 | " 1995 | \n",
71 | "
\n",
72 | " \n",
73 | " 3 | \n",
74 | " Waiting to Exhale | \n",
75 | " ['comedy', 'drama', 'romance'] | \n",
76 | " 127.0 | \n",
77 | " 6.1 | \n",
78 | " 34.0 | \n",
79 | " 1995 | \n",
80 | "
\n",
81 | " \n",
82 | " 4 | \n",
83 | " Father of the Bride Part II | \n",
84 | " ['comedy'] | \n",
85 | " 106.0 | \n",
86 | " 5.7 | \n",
87 | " 173.0 | \n",
88 | " 1995 | \n",
89 | "
\n",
90 | " \n",
91 | "
\n",
92 | "
"
93 | ],
94 | "text/plain": [
95 | " title genres runtime \\\n",
96 | "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
97 | "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
98 | "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
99 | "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
100 | "4 Father of the Bride Part II ['comedy'] 106.0 \n",
101 | "\n",
102 | " vote_average vote_count year \n",
103 | "0 7.7 5415.0 1995 \n",
104 | "1 6.9 2413.0 1995 \n",
105 | "2 6.5 92.0 1995 \n",
106 | "3 6.1 34.0 1995 \n",
107 | "4 5.7 173.0 1995 "
108 | ]
109 | },
110 | "execution_count": 1,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "import pandas as pd\n",
117 | "import numpy as np\n",
118 | "\n",
119 | "#Import data from the clean file \n",
120 | "df = pd.read_csv('../data/metadata_clean.csv')\n",
121 | "\n",
122 | "#Print the head of the cleaned DataFrame\n",
123 | "df.head()"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 2,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/html": [
134 | "\n",
135 | "\n",
148 | "
\n",
149 | " \n",
150 | " \n",
151 | " | \n",
152 | " title | \n",
153 | " genres | \n",
154 | " runtime | \n",
155 | " vote_average | \n",
156 | " vote_count | \n",
157 | " year | \n",
158 | " overview | \n",
159 | " id | \n",
160 | "
\n",
161 | " \n",
162 | " \n",
163 | " \n",
164 | " 0 | \n",
165 | " Toy Story | \n",
166 | " ['animation', 'comedy', 'family'] | \n",
167 | " 81.0 | \n",
168 | " 7.7 | \n",
169 | " 5415.0 | \n",
170 | " 1995 | \n",
171 | " Led by Woody, Andy's toys live happily in his ... | \n",
172 | " 862 | \n",
173 | "
\n",
174 | " \n",
175 | " 1 | \n",
176 | " Jumanji | \n",
177 | " ['adventure', 'fantasy', 'family'] | \n",
178 | " 104.0 | \n",
179 | " 6.9 | \n",
180 | " 2413.0 | \n",
181 | " 1995 | \n",
182 | " When siblings Judy and Peter discover an encha... | \n",
183 | " 8844 | \n",
184 | "
\n",
185 | " \n",
186 | " 2 | \n",
187 | " Grumpier Old Men | \n",
188 | " ['romance', 'comedy'] | \n",
189 | " 101.0 | \n",
190 | " 6.5 | \n",
191 | " 92.0 | \n",
192 | " 1995 | \n",
193 | " A family wedding reignites the ancient feud be... | \n",
194 | " 15602 | \n",
195 | "
\n",
196 | " \n",
197 | " 3 | \n",
198 | " Waiting to Exhale | \n",
199 | " ['comedy', 'drama', 'romance'] | \n",
200 | " 127.0 | \n",
201 | " 6.1 | \n",
202 | " 34.0 | \n",
203 | " 1995 | \n",
204 | " Cheated on, mistreated and stepped on, the wom... | \n",
205 | " 31357 | \n",
206 | "
\n",
207 | " \n",
208 | " 4 | \n",
209 | " Father of the Bride Part II | \n",
210 | " ['comedy'] | \n",
211 | " 106.0 | \n",
212 | " 5.7 | \n",
213 | " 173.0 | \n",
214 | " 1995 | \n",
215 | " Just when George Banks has recovered from his ... | \n",
216 | " 11862 | \n",
217 | "
\n",
218 | " \n",
219 | "
\n",
220 | "
"
221 | ],
222 | "text/plain": [
223 | " title genres runtime \\\n",
224 | "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
225 | "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
226 | "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
227 | "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
228 | "4 Father of the Bride Part II ['comedy'] 106.0 \n",
229 | "\n",
230 | " vote_average vote_count year \\\n",
231 | "0 7.7 5415.0 1995 \n",
232 | "1 6.9 2413.0 1995 \n",
233 | "2 6.5 92.0 1995 \n",
234 | "3 6.1 34.0 1995 \n",
235 | "4 5.7 173.0 1995 \n",
236 | "\n",
237 | " overview id \n",
238 | "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
239 | "1 When siblings Judy and Peter discover an encha... 8844 \n",
240 | "2 A family wedding reignites the ancient feud be... 15602 \n",
241 | "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
242 | "4 Just when George Banks has recovered from his ... 11862 "
243 | ]
244 | },
245 | "execution_count": 2,
246 | "metadata": {},
247 | "output_type": "execute_result"
248 | }
249 | ],
250 | "source": [
251 | "#Import the original file\n",
252 | "orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)\n",
253 | "\n",
254 | "#Add the useful features into the cleaned dataframe\n",
255 | "df['overview'], df['id'] = orig_df['overview'], orig_df['id']\n",
256 | "\n",
257 | "df.head()"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 3,
263 | "metadata": {},
264 | "outputs": [
265 | {
266 | "data": {
267 | "text/plain": [
268 | "(45466, 75827)"
269 | ]
270 | },
271 | "execution_count": 3,
272 | "metadata": {},
273 | "output_type": "execute_result"
274 | }
275 | ],
276 | "source": [
277 | "#Import TfIdfVectorizer from the scikit-learn library\n",
278 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
279 | "\n",
280 | "#Define a TF-IDF Vectorizer Object. Remove all english stopwords\n",
281 | "tfidf = TfidfVectorizer(stop_words='english')\n",
282 | "\n",
283 | "#Replace NaN with an empty string\n",
284 | "df['overview'] = df['overview'].fillna('')\n",
285 | "\n",
286 | "#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature\n",
287 | "tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
288 | "\n",
289 | "#Output the shape of tfidf_matrix\n",
290 | "tfidf_matrix.shape"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 4,
296 | "metadata": {
297 | "collapsed": true
298 | },
299 | "outputs": [],
300 | "source": [
301 | "# Import linear_kernel to compute the dot product\n",
302 | "from sklearn.metrics.pairwise import linear_kernel\n",
303 | "\n",
304 | "# Compute the cosine similarity matrix\n",
305 | "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 5,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any\n",
315 | "indices = pd.Series(df.index, index=df['title']).drop_duplicates()"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 6,
321 | "metadata": {
322 | "collapsed": true
323 | },
324 | "outputs": [],
325 | "source": [
326 | "# Function that takes in movie title as input and gives recommendations \n",
327 | "def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):\n",
328 | " # Obtain the index of the movie that matches the title\n",
329 | " idx = indices[title]\n",
330 | "\n",
331 |     "    # Get the pairwise similarity scores of all movies with that movie\n",
332 | " # And convert it into a list of tuples as described above\n",
333 | " sim_scores = list(enumerate(cosine_sim[idx]))\n",
334 | "\n",
335 | " # Sort the movies based on the cosine similarity scores\n",
336 | " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
337 | "\n",
338 | " # Get the scores of the 10 most similar movies. Ignore the first movie.\n",
339 | " sim_scores = sim_scores[1:11]\n",
340 | "\n",
341 | " # Get the movie indices\n",
342 | " movie_indices = [i[0] for i in sim_scores]\n",
343 | "\n",
344 | " # Return the top 10 most similar movies\n",
345 | " return df['title'].iloc[movie_indices]"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 7,
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/plain": [
356 | "34682 How the Lion Cub and the Turtle Sang a Song\n",
357 | "9353 The Lion King 1½\n",
358 | "9115 The Lion King 2: Simba's Pride\n",
359 | "42829 Prey\n",
360 | "25654 Fearless Fagan\n",
361 | "17041 African Cats\n",
362 | "27933 Massaï, les guerriers de la pluie\n",
363 | "6094 Born Free\n",
364 | "37409 Sour Grape\n",
365 | "3203 The Waiting Game\n",
366 | "Name: title, dtype: object"
367 | ]
368 | },
369 | "execution_count": 7,
370 | "metadata": {},
371 | "output_type": "execute_result"
372 | }
373 | ],
374 | "source": [
375 | "#Get recommendations for The Lion King\n",
376 | "content_recommender('The Lion King')"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "# Metadata Based Recommender"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 8,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "# Load the keywords and credits files\n",
393 | "cred_df = pd.read_csv('../data/credits.csv')\n",
394 | "key_df = pd.read_csv('../data/keywords.csv')"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 9,
400 | "metadata": {},
401 | "outputs": [
402 | {
403 | "data": {
404 | "text/html": [
405 | "\n",
406 | "\n",
419 | "
\n",
420 | " \n",
421 | " \n",
422 | " | \n",
423 | " cast | \n",
424 | " crew | \n",
425 | " id | \n",
426 | "
\n",
427 | " \n",
428 | " \n",
429 | " \n",
430 | " 0 | \n",
431 | " [{'cast_id': 14, 'character': 'Woody (voice)',... | \n",
432 | " [{'credit_id': '52fe4284c3a36847f8024f49', 'de... | \n",
433 | " 862 | \n",
434 | "
\n",
435 | " \n",
436 | " 1 | \n",
437 | " [{'cast_id': 1, 'character': 'Alan Parrish', '... | \n",
438 | " [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... | \n",
439 | " 8844 | \n",
440 | "
\n",
441 | " \n",
442 | " 2 | \n",
443 | " [{'cast_id': 2, 'character': 'Max Goldman', 'c... | \n",
444 | " [{'credit_id': '52fe466a9251416c75077a89', 'de... | \n",
445 | " 15602 | \n",
446 | "
\n",
447 | " \n",
448 | " 3 | \n",
449 | " [{'cast_id': 1, 'character': \"Savannah 'Vannah... | \n",
450 | " [{'credit_id': '52fe44779251416c91011acb', 'de... | \n",
451 | " 31357 | \n",
452 | "
\n",
453 | " \n",
454 | " 4 | \n",
455 | " [{'cast_id': 1, 'character': 'George Banks', '... | \n",
456 | " [{'credit_id': '52fe44959251416c75039ed7', 'de... | \n",
457 | " 11862 | \n",
458 | "
\n",
459 | " \n",
460 | "
\n",
461 | "
"
462 | ],
463 | "text/plain": [
464 | " cast \\\n",
465 | "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
466 | "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
467 | "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
468 | "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
469 | "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
470 | "\n",
471 | " crew id \n",
472 | "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n",
473 | "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n",
474 | "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n",
475 | "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n",
476 | "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 "
477 | ]
478 | },
479 | "execution_count": 9,
480 | "metadata": {},
481 | "output_type": "execute_result"
482 | }
483 | ],
484 | "source": [
485 | "#Print the head of the credit dataframe\n",
486 | "cred_df.head()"
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": 10,
492 | "metadata": {},
493 | "outputs": [
494 | {
495 | "data": {
496 | "text/html": [
497 | "\n",
498 | "\n",
511 | "
\n",
512 | " \n",
513 | " \n",
514 | " | \n",
515 | " id | \n",
516 | " keywords | \n",
517 | "
\n",
518 | " \n",
519 | " \n",
520 | " \n",
521 | " 0 | \n",
522 | " 862 | \n",
523 | " [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... | \n",
524 | "
\n",
525 | " \n",
526 | " 1 | \n",
527 | " 8844 | \n",
528 | " [{'id': 10090, 'name': 'board game'}, {'id': 1... | \n",
529 | "
\n",
530 | " \n",
531 | " 2 | \n",
532 | " 15602 | \n",
533 | " [{'id': 1495, 'name': 'fishing'}, {'id': 12392... | \n",
534 | "
\n",
535 | " \n",
536 | " 3 | \n",
537 | " 31357 | \n",
538 | " [{'id': 818, 'name': 'based on novel'}, {'id':... | \n",
539 | "
\n",
540 | " \n",
541 | " 4 | \n",
542 | " 11862 | \n",
543 | " [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... | \n",
544 | "
\n",
545 | " \n",
546 | "
\n",
547 | "
"
548 | ],
549 | "text/plain": [
550 | " id keywords\n",
551 | "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n",
552 | "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n",
553 | "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n",
554 | "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n",
555 | "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
556 | ]
557 | },
558 | "execution_count": 10,
559 | "metadata": {},
560 | "output_type": "execute_result"
561 | }
562 | ],
563 | "source": [
564 | "#Print the head of the keywords dataframe\n",
565 | "key_df.head()"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 11,
571 | "metadata": {},
572 | "outputs": [
573 | {
574 | "ename": "ValueError",
575 | "evalue": "invalid literal for int() with base 10: '1997-08-20'",
576 | "output_type": "error",
577 | "traceback": [
578 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
579 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
580 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#Convert the IDs of df into int\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'int'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
581 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 91\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 92\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_deprecate_kwarg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
582 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors, **kwargs)\u001b[0m\n\u001b[1;32m 3408\u001b[0m \u001b[0;31m# else, only a single dtype is given\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3409\u001b[0m new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,\n\u001b[0;32m-> 3410\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 3411\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3412\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
583 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, **kwargs)\u001b[0m\n\u001b[1;32m 3222\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3223\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3224\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'astype'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3225\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3226\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mconvert\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
584 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)\u001b[0m\n\u001b[1;32m 3089\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3090\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'mgr'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3091\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3092\u001b[0m \u001b[0mresult_blocks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_extend_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mapplied\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult_blocks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3093\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
585 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors, values, **kwargs)\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'raise'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 470\u001b[0m return self._astype(dtype, copy=copy, errors=errors, values=values,\n\u001b[0;32m--> 471\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 472\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m def _astype(self, dtype, copy=False, errors='raise', values=None,\n",
586 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m_astype\u001b[0;34m(self, dtype, copy, errors, values, klass, mgr, raise_on_error, **kwargs)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;31m# _astype_nansafe works fine with 1-d only\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mastype_nansafe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
587 | "\u001b[0;32m/usr/local/lib/python3.6/site-packages/pandas/core/dtypes/cast.py\u001b[0m in \u001b[0;36mastype_nansafe\u001b[0;34m(arr, dtype, copy)\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobject_\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0missubdtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minteger\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0;31m# work around NumPy brokenness, #1987\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 625\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype_intsafe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 626\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"datetime64\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"timedelta64\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
588 | "\u001b[0;32mpandas/_libs/lib.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.astype_intsafe (pandas/_libs/lib.c:16264)\u001b[0;34m()\u001b[0m\n",
589 | "\u001b[0;32mpandas/_libs/src/util.pxd\u001b[0m in \u001b[0;36mutil.set_value_at_unsafe (pandas/_libs/lib.c:73298)\u001b[0;34m()\u001b[0m\n",
590 | "\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: '1997-08-20'"
591 | ]
592 | }
593 | ],
594 | "source": [
595 | "#Convert the IDs of df into int\n",
596 | "df['id'] = df['id'].astype('int')"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 12,
602 | "metadata": {
603 | "collapsed": true
604 | },
605 | "outputs": [],
606 | "source": [
607 | "# Function to convert all non-integer IDs to NaN\n",
608 | "def clean_ids(x):\n",
609 | " try:\n",
610 | " return int(x)\n",
611 |     "    except (ValueError, TypeError):\n",
612 | " return np.nan"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": 13,
618 | "metadata": {
619 | "collapsed": true
620 | },
621 | "outputs": [],
622 | "source": [
623 | "#Clean the ids of df\n",
624 | "df['id'] = df['id'].apply(clean_ids)\n",
625 | "\n",
626 | "#Filter all rows that have a null ID\n",
627 | "df = df[df['id'].notnull()]"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 14,
633 | "metadata": {},
634 | "outputs": [
635 | {
636 | "name": "stderr",
637 | "output_type": "stream",
638 | "text": [
639 | "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
640 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
641 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
642 | "\n",
643 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
644 | " \n"
645 | ]
646 | },
647 | {
648 | "data": {
649 | "text/html": [
650 | "\n",
651 | "\n",
664 | "
\n",
665 | " \n",
666 | " \n",
667 | " | \n",
668 | " title | \n",
669 | " genres | \n",
670 | " runtime | \n",
671 | " vote_average | \n",
672 | " vote_count | \n",
673 | " year | \n",
674 | " overview | \n",
675 | " id | \n",
676 | " cast | \n",
677 | " crew | \n",
678 | " keywords | \n",
679 | "
\n",
680 | " \n",
681 | " \n",
682 | " \n",
683 | " 0 | \n",
684 | " Toy Story | \n",
685 | " ['animation', 'comedy', 'family'] | \n",
686 | " 81.0 | \n",
687 | " 7.7 | \n",
688 | " 5415.0 | \n",
689 | " 1995 | \n",
690 | " Led by Woody, Andy's toys live happily in his ... | \n",
691 | " 862 | \n",
692 | " [{'cast_id': 14, 'character': 'Woody (voice)',... | \n",
693 | " [{'credit_id': '52fe4284c3a36847f8024f49', 'de... | \n",
694 | " [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... | \n",
695 | "
\n",
696 | " \n",
697 | " 1 | \n",
698 | " Jumanji | \n",
699 | " ['adventure', 'fantasy', 'family'] | \n",
700 | " 104.0 | \n",
701 | " 6.9 | \n",
702 | " 2413.0 | \n",
703 | " 1995 | \n",
704 | " When siblings Judy and Peter discover an encha... | \n",
705 | " 8844 | \n",
706 | " [{'cast_id': 1, 'character': 'Alan Parrish', '... | \n",
707 | " [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... | \n",
708 | " [{'id': 10090, 'name': 'board game'}, {'id': 1... | \n",
709 | "
\n",
710 | " \n",
711 | " 2 | \n",
712 | " Grumpier Old Men | \n",
713 | " ['romance', 'comedy'] | \n",
714 | " 101.0 | \n",
715 | " 6.5 | \n",
716 | " 92.0 | \n",
717 | " 1995 | \n",
718 | " A family wedding reignites the ancient feud be... | \n",
719 | " 15602 | \n",
720 | " [{'cast_id': 2, 'character': 'Max Goldman', 'c... | \n",
721 | " [{'credit_id': '52fe466a9251416c75077a89', 'de... | \n",
722 | " [{'id': 1495, 'name': 'fishing'}, {'id': 12392... | \n",
723 | "
\n",
724 | " \n",
725 | " 3 | \n",
726 | " Waiting to Exhale | \n",
727 | " ['comedy', 'drama', 'romance'] | \n",
728 | " 127.0 | \n",
729 | " 6.1 | \n",
730 | " 34.0 | \n",
731 | " 1995 | \n",
732 | " Cheated on, mistreated and stepped on, the wom... | \n",
733 | " 31357 | \n",
734 | " [{'cast_id': 1, 'character': \"Savannah 'Vannah... | \n",
735 | " [{'credit_id': '52fe44779251416c91011acb', 'de... | \n",
736 | " [{'id': 818, 'name': 'based on novel'}, {'id':... | \n",
737 | "
\n",
738 | " \n",
739 | " 4 | \n",
740 | " Father of the Bride Part II | \n",
741 | " ['comedy'] | \n",
742 | " 106.0 | \n",
743 | " 5.7 | \n",
744 | " 173.0 | \n",
745 | " 1995 | \n",
746 | " Just when George Banks has recovered from his ... | \n",
747 | " 11862 | \n",
748 | " [{'cast_id': 1, 'character': 'George Banks', '... | \n",
749 | " [{'credit_id': '52fe44959251416c75039ed7', 'de... | \n",
750 | " [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... | \n",
751 | "
\n",
752 | " \n",
753 | "
\n",
754 | "
"
755 | ],
756 | "text/plain": [
757 | " title genres runtime \\\n",
758 | "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
759 | "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
760 | "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
761 | "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
762 | "4 Father of the Bride Part II ['comedy'] 106.0 \n",
763 | "\n",
764 | " vote_average vote_count year \\\n",
765 | "0 7.7 5415.0 1995 \n",
766 | "1 6.9 2413.0 1995 \n",
767 | "2 6.5 92.0 1995 \n",
768 | "3 6.1 34.0 1995 \n",
769 | "4 5.7 173.0 1995 \n",
770 | "\n",
771 | " overview id \\\n",
772 | "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
773 | "1 When siblings Judy and Peter discover an encha... 8844 \n",
774 | "2 A family wedding reignites the ancient feud be... 15602 \n",
775 | "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
776 | "4 Just when George Banks has recovered from his ... 11862 \n",
777 | "\n",
778 | " cast \\\n",
779 | "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
780 | "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
781 | "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
782 | "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
783 | "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
784 | "\n",
785 | " crew \\\n",
786 | "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... \n",
787 | "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... \n",
788 | "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... \n",
789 | "3 [{'credit_id': '52fe44779251416c91011acb', 'de... \n",
790 | "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... \n",
791 | "\n",
792 | " keywords \n",
793 | "0 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... \n",
794 | "1 [{'id': 10090, 'name': 'board game'}, {'id': 1... \n",
795 | "2 [{'id': 1495, 'name': 'fishing'}, {'id': 12392... \n",
796 | "3 [{'id': 818, 'name': 'based on novel'}, {'id':... \n",
797 | "4 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... "
798 | ]
799 | },
800 | "execution_count": 14,
801 | "metadata": {},
802 | "output_type": "execute_result"
803 | }
804 | ],
805 | "source": [
806 | "# Convert IDs into integer\n",
807 | "df['id'] = df['id'].astype('int')\n",
808 | "key_df['id'] = key_df['id'].astype('int')\n",
809 | "cred_df['id'] = cred_df['id'].astype('int')\n",
810 | "\n",
811 | "# Merge keywords and credits into your main metadata dataframe\n",
812 | "df = df.merge(cred_df, on='id')\n",
813 | "df = df.merge(key_df, on='id')\n",
814 | "\n",
815 | "#Display the head of df\n",
816 | "df.head()"
817 | ]
818 | },
819 | {
820 | "cell_type": "code",
821 | "execution_count": 15,
822 | "metadata": {
823 | "collapsed": true
824 | },
825 | "outputs": [],
826 | "source": [
827 |     "# Convert the stringified objects into native Python objects\n",
828 | "from ast import literal_eval\n",
829 | "\n",
830 | "features = ['cast', 'crew', 'keywords', 'genres']\n",
831 | "for feature in features:\n",
832 | " df[feature] = df[feature].apply(literal_eval)"
833 | ]
834 | },
835 | {
836 | "cell_type": "code",
837 | "execution_count": 16,
838 | "metadata": {},
839 | "outputs": [
840 | {
841 | "data": {
842 | "text/plain": [
843 | "{'credit_id': '52fe4284c3a36847f8024f49',\n",
844 | " 'department': 'Directing',\n",
845 | " 'gender': 2,\n",
846 | " 'id': 7879,\n",
847 | " 'job': 'Director',\n",
848 | " 'name': 'John Lasseter',\n",
849 | " 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}"
850 | ]
851 | },
852 | "execution_count": 16,
853 | "metadata": {},
854 | "output_type": "execute_result"
855 | }
856 | ],
857 | "source": [
858 |     "#Print the first crew member of the first movie in df\n",
859 | "df.iloc[0]['crew'][0]"
860 | ]
861 | },
862 | {
863 | "cell_type": "code",
864 | "execution_count": 17,
865 | "metadata": {
866 | "collapsed": true
867 | },
868 | "outputs": [],
869 | "source": [
870 | "# Extract the director's name. If director is not listed, return NaN\n",
871 | "def get_director(x):\n",
872 | " for crew_member in x:\n",
873 | " if crew_member['job'] == 'Director':\n",
874 | " return crew_member['name']\n",
875 | " return np.nan"
876 | ]
877 | },
878 | {
879 | "cell_type": "code",
880 | "execution_count": 18,
881 | "metadata": {},
882 | "outputs": [
883 | {
884 | "data": {
885 | "text/plain": [
886 | "0 John Lasseter\n",
887 | "1 Joe Johnston\n",
888 | "2 Howard Deutch\n",
889 | "3 Forest Whitaker\n",
890 | "4 Charles Shyer\n",
891 | "Name: director, dtype: object"
892 | ]
893 | },
894 | "execution_count": 18,
895 | "metadata": {},
896 | "output_type": "execute_result"
897 | }
898 | ],
899 | "source": [
900 | "#Define the new director feature\n",
901 | "df['director'] = df['crew'].apply(get_director)\n",
902 | "\n",
903 | "#Print the directors of the first five movies\n",
904 | "df['director'].head()"
905 | ]
906 | },
907 | {
908 | "cell_type": "code",
909 | "execution_count": 19,
910 | "metadata": {
911 | "collapsed": true
912 | },
913 | "outputs": [],
914 | "source": [
915 |     "# Return the top 3 elements of the list, or the entire list if it has fewer than 3 elements.\n",
916 | "def generate_list(x):\n",
917 | " if isinstance(x, list):\n",
918 | " names = [i['name'] for i in x]\n",
919 | " #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
920 | " if len(names) > 3:\n",
921 | " names = names[:3]\n",
922 | " return names\n",
923 | "\n",
924 | " #Return empty list in case of missing/malformed data\n",
925 | " return []"
926 | ]
927 | },
928 | {
929 | "cell_type": "code",
930 | "execution_count": 20,
931 | "metadata": {},
932 | "outputs": [],
933 | "source": [
934 | "#Apply the generate_list function to cast and keywords\n",
935 | "df['cast'] = df['cast'].apply(generate_list)\n",
936 | "df['keywords'] = df['keywords'].apply(generate_list)"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": 21,
942 | "metadata": {
943 | "collapsed": true
944 | },
945 | "outputs": [],
946 | "source": [
947 | "#Only consider a maximum of 3 genres\n",
948 | "df['genres'] = df['genres'].apply(lambda x: x[:3])"
949 | ]
950 | },
951 | {
952 | "cell_type": "code",
953 | "execution_count": 22,
954 | "metadata": {},
955 | "outputs": [
956 | {
957 | "data": {
958 | "text/html": [
959 | "\n",
960 | "\n",
973 | "
\n",
974 | " \n",
975 | " \n",
976 | " | \n",
977 | " title | \n",
978 | " cast | \n",
979 | " director | \n",
980 | " keywords | \n",
981 | " genres | \n",
982 | "
\n",
983 | " \n",
984 | " \n",
985 | " \n",
986 | " 0 | \n",
987 | " Toy Story | \n",
988 | " [Tom Hanks, Tim Allen, Don Rickles] | \n",
989 | " John Lasseter | \n",
990 | " [jealousy, toy, boy] | \n",
991 | " [animation, comedy, family] | \n",
992 | "
\n",
993 | " \n",
994 | " 1 | \n",
995 | " Jumanji | \n",
996 | " [Robin Williams, Jonathan Hyde, Kirsten Dunst] | \n",
997 | " Joe Johnston | \n",
998 | " [board game, disappearance, based on children'... | \n",
999 | " [adventure, fantasy, family] | \n",
1000 | "
\n",
1001 | " \n",
1002 | " 2 | \n",
1003 | " Grumpier Old Men | \n",
1004 | " [Walter Matthau, Jack Lemmon, Ann-Margret] | \n",
1005 | " Howard Deutch | \n",
1006 | " [fishing, best friend, duringcreditsstinger] | \n",
1007 | " [romance, comedy] | \n",
1008 | "
\n",
1009 | " \n",
1010 | " 3 | \n",
1011 | " Waiting to Exhale | \n",
1012 | " [Whitney Houston, Angela Bassett, Loretta Devine] | \n",
1013 | " Forest Whitaker | \n",
1014 | " [based on novel, interracial relationship, sin... | \n",
1015 | " [comedy, drama, romance] | \n",
1016 | "
\n",
1017 | " \n",
1018 | " 4 | \n",
1019 | " Father of the Bride Part II | \n",
1020 | " [Steve Martin, Diane Keaton, Martin Short] | \n",
1021 | " Charles Shyer | \n",
1022 | " [baby, midlife crisis, confidence] | \n",
1023 | " [comedy] | \n",
1024 | "
\n",
1025 | " \n",
1026 | "
\n",
1027 | "
"
1028 | ],
1029 | "text/plain": [
1030 | " title \\\n",
1031 | "0 Toy Story \n",
1032 | "1 Jumanji \n",
1033 | "2 Grumpier Old Men \n",
1034 | "3 Waiting to Exhale \n",
1035 | "4 Father of the Bride Part II \n",
1036 | "\n",
1037 | " cast director \\\n",
1038 | "0 [Tom Hanks, Tim Allen, Don Rickles] John Lasseter \n",
1039 | "1 [Robin Williams, Jonathan Hyde, Kirsten Dunst] Joe Johnston \n",
1040 | "2 [Walter Matthau, Jack Lemmon, Ann-Margret] Howard Deutch \n",
1041 | "3 [Whitney Houston, Angela Bassett, Loretta Devine] Forest Whitaker \n",
1042 | "4 [Steve Martin, Diane Keaton, Martin Short] Charles Shyer \n",
1043 | "\n",
1044 | " keywords \\\n",
1045 | "0 [jealousy, toy, boy] \n",
1046 | "1 [board game, disappearance, based on children'... \n",
1047 | "2 [fishing, best friend, duringcreditsstinger] \n",
1048 | "3 [based on novel, interracial relationship, sin... \n",
1049 | "4 [baby, midlife crisis, confidence] \n",
1050 | "\n",
1051 | " genres \n",
1052 | "0 [animation, comedy, family] \n",
1053 | "1 [adventure, fantasy, family] \n",
1054 | "2 [romance, comedy] \n",
1055 | "3 [comedy, drama, romance] \n",
1056 | "4 [comedy] "
1057 | ]
1058 | },
1059 | "execution_count": 22,
1060 | "metadata": {},
1061 | "output_type": "execute_result"
1062 | }
1063 | ],
1064 | "source": [
1065 | "# Print the new features of the first 5 movies along with title\n",
1066 | "df[['title', 'cast', 'director', 'keywords', 'genres']].head()"
1067 | ]
1068 | },
1069 | {
1070 | "cell_type": "code",
1071 | "execution_count": 23,
1072 | "metadata": {
1073 | "collapsed": true
1074 | },
1075 | "outputs": [],
1076 | "source": [
1077 | "# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase\n",
1078 | "def sanitize(x):\n",
1079 | " if isinstance(x, list):\n",
1080 | " #Strip spaces and convert to lowercase\n",
1081 | " return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
1082 | " else:\n",
1083 | " #Check if director exists. If not, return empty string\n",
1084 | " if isinstance(x, str):\n",
1085 | " return str.lower(x.replace(\" \", \"\"))\n",
1086 | " else:\n",
1087 | " return ''"
1088 | ]
1089 | },
1090 | {
1091 | "cell_type": "code",
1092 | "execution_count": 24,
1093 | "metadata": {},
1094 | "outputs": [],
1095 | "source": [
1096 |     "#Apply the sanitize function to cast, keywords, director and genres\n",
1097 | "for feature in ['cast', 'director', 'genres', 'keywords']:\n",
1098 | " df[feature] = df[feature].apply(sanitize)"
1099 | ]
1100 | },
1101 | {
1102 | "cell_type": "code",
1103 | "execution_count": 25,
1104 | "metadata": {
1105 | "scrolled": true
1106 | },
1107 | "outputs": [],
1108 | "source": [
1109 | "#Function that creates a soup out of the desired metadata\n",
1110 | "def create_soup(x):\n",
1111 | " return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])"
1112 | ]
1113 | },
1114 | {
1115 | "cell_type": "code",
1116 | "execution_count": 26,
1117 | "metadata": {
1118 | "collapsed": true
1119 | },
1120 | "outputs": [],
1121 | "source": [
1122 | "# Create the new soup feature\n",
1123 | "df['soup'] = df.apply(create_soup, axis=1)"
1124 | ]
1125 | },
1126 | {
1127 | "cell_type": "code",
1128 | "execution_count": 27,
1129 | "metadata": {},
1130 | "outputs": [
1131 | {
1132 | "data": {
1133 | "text/plain": [
1134 | "'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'"
1135 | ]
1136 | },
1137 | "execution_count": 27,
1138 | "metadata": {},
1139 | "output_type": "execute_result"
1140 | }
1141 | ],
1142 | "source": [
1143 | "#Display the soup of the first movie\n",
1144 | "df.iloc[0]['soup']"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "code",
1149 | "execution_count": 28,
1150 | "metadata": {},
1151 | "outputs": [],
1152 | "source": [
1153 | "# Import CountVectorizer\n",
1154 | "from sklearn.feature_extraction.text import CountVectorizer\n",
1155 | "\n",
1156 | "#Define a new CountVectorizer object and create vectors for the soup\n",
1157 | "count = CountVectorizer(stop_words='english')\n",
1158 | "count_matrix = count.fit_transform(df['soup'])"
1159 | ]
1160 | },
1161 | {
1162 | "cell_type": "code",
1163 | "execution_count": 29,
1164 | "metadata": {},
1165 | "outputs": [],
1166 | "source": [
1167 | "#Import cosine_similarity function\n",
1168 | "from sklearn.metrics.pairwise import cosine_similarity\n",
1169 | "\n",
1170 |     "#Compute the cosine similarity matrix based on the count_matrix\n",
1171 | "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
1172 | ]
1173 | },
1174 | {
1175 | "cell_type": "code",
1176 | "execution_count": 34,
1177 | "metadata": {
1178 | "collapsed": true
1179 | },
1180 | "outputs": [],
1181 | "source": [
1182 | "# Reset index of your df and construct reverse mapping again\n",
1183 | "df = df.reset_index()\n",
1184 | "indices2 = pd.Series(df.index, index=df['title'])"
1185 | ]
1186 | },
1187 | {
1188 | "cell_type": "code",
1189 | "execution_count": 37,
1190 | "metadata": {},
1191 | "outputs": [
1192 | {
1193 | "data": {
1194 | "text/plain": [
1195 | "29607 Cheburashka\n",
1196 | "40904 VeggieTales: Josh and the Big Wall\n",
1197 | "40913 VeggieTales: Minnesota Cuke and the Search for...\n",
1198 | "27768 The Little Matchgirl\n",
1199 | "15209 Spiderman: The Ultimate Villain Showdown\n",
1200 | "16613 Cirque du Soleil: Varekai\n",
1201 | "24654 The Seventh Brother\n",
1202 | "29198 Superstar Goofy\n",
1203 | "30244 My Love\n",
1204 | "31179 Pokémon: Arceus and the Jewel of Life\n",
1205 | "Name: title, dtype: object"
1206 | ]
1207 | },
1208 | "execution_count": 37,
1209 | "metadata": {},
1210 | "output_type": "execute_result"
1211 | }
1212 | ],
1213 | "source": [
1214 | "content_recommender('The Lion King', cosine_sim2, df, indices2)"
1215 | ]
1216 | },
1217 | {
1218 | "cell_type": "code",
1219 | "execution_count": null,
1220 | "metadata": {},
1221 | "outputs": [],
1222 | "source": []
1223 | }
1224 | ],
1225 | "metadata": {
1226 | "kernelspec": {
1227 | "display_name": "Python 3",
1228 | "language": "python",
1229 | "name": "python3"
1230 | },
1231 | "language_info": {
1232 | "codemirror_mode": {
1233 | "name": "ipython",
1234 | "version": 3
1235 | },
1236 | "file_extension": ".py",
1237 | "mimetype": "text/x-python",
1238 | "name": "python",
1239 | "nbconvert_exporter": "python",
1240 | "pygments_lexer": "ipython3",
1241 | "version": "3.6.0"
1242 | }
1243 | },
1244 | "nbformat": 4,
1245 | "nbformat_minor": 2
1246 | }
1247 |
--------------------------------------------------------------------------------
/Chapter7/.ipynb_checkpoints/Hybrid Recommender-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/Chapter7/Hybrid Recommender.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Hybrid Recommenders"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import pandas as pd"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {
26 | "collapsed": true
27 | },
28 | "outputs": [],
29 | "source": [
30 | "#Import or compute the cosine_sim matrix\n",
31 | "cosine_sim = pd.read_csv('../data/cosine_sim.csv')"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "#Import or compute the cosine sim mapping matrix\n",
41 | "cosine_sim_map = pd.read_csv('../data/cosine_sim_map.csv', header=None)\n",
42 | "\n",
43 | "#Convert cosine_sim_map into a Pandas Series\n",
44 | "cosine_sim_map = cosine_sim_map.set_index(0)\n",
45 | "cosine_sim_map = cosine_sim_map[1]"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "#Build the SVD based Collaborative filter\n",
55 | "from surprise import SVD, Reader, Dataset\n",
56 | "\n",
57 | "reader = Reader()\n",
58 | "ratings = pd.read_csv('../data/ratings_small.csv')\n",
59 | "data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)\n",
60 | "data.split(n_folds=5)\n",
61 | "svd = SVD()\n",
62 | "trainset = data.build_full_trainset()\n",
63 | "svd.train(trainset)"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 5,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "#Build title to ID and ID to title mappings\n",
73 | "id_map = pd.read_csv('../data/movie_ids.csv')\n",
74 | "id_to_title = id_map.set_index('id')\n",
75 | "title_to_id = id_map.set_index('title')"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 6,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "#Import or compute relevant metadata of the movies\n",
85 | "smd = pd.read_csv('../data/metadata_small.csv')"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 7,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "def hybrid(userId, title):\n",
95 | " #Extract the cosine_sim index of the movie\n",
96 | " idx = cosine_sim_map[title]\n",
97 | " \n",
98 | " #Extract the TMDB ID of the movie\n",
99 | " tmdbId = title_to_id.loc[title]['id']\n",
100 | " \n",
101 | " #Extract the movie ID internally assigned by the dataset\n",
102 | " movie_id = title_to_id.loc[title]['movieId']\n",
103 | " \n",
104 | " #Extract the similarity scores and their corresponding index for every movie from the cosine_sim matrix\n",
105 | " sim_scores = list(enumerate(cosine_sim[str(int(idx))]))\n",
106 | " \n",
107 | " #Sort the (index, score) tuples in decreasing order of similarity scores\n",
108 | " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
109 | " \n",
110 | " #Select the top 25 tuples, excluding the first \n",
111 | " #(as it is the similarity score of the movie with itself)\n",
112 | " sim_scores = sim_scores[1:26]\n",
113 | " \n",
114 | " #Store the cosine_sim indices of the top 25 movies in a list\n",
115 | " movie_indices = [i[0] for i in sim_scores]\n",
116 | "\n",
117 | " #Extract the metadata of the aforementioned movies\n",
118 | " movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]\n",
119 | " \n",
120 | " #Compute the predicted ratings using the SVD filter\n",
121 | " movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, id_to_title.loc[x]['movieId']).est)\n",
122 | " \n",
123 | " #Sort the movies in decreasing order of predicted rating\n",
124 | " movies = movies.sort_values('est', ascending=False)\n",
125 | " \n",
126 | " #Return the top 10 movies as recommendations\n",
127 | " return movies.head(10)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 8,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "data": {
137 | "text/html": [
138 | "\n",
139 | "\n",
152 | "
\n",
153 | " \n",
154 | " \n",
155 | " | \n",
156 | " title | \n",
157 | " vote_count | \n",
158 | " vote_average | \n",
159 | " year | \n",
160 | " id | \n",
161 | " est | \n",
162 | "
\n",
163 | " \n",
164 | " \n",
165 | " \n",
166 | " 1011 | \n",
167 | " The Terminator | \n",
168 | " 4208.0 | \n",
169 | " 7.4 | \n",
170 | " 1984 | \n",
171 | " 218 | \n",
172 | " 3.140748 | \n",
173 | "
\n",
174 | " \n",
175 | " 974 | \n",
176 | " Aliens | \n",
177 | " 3282.0 | \n",
178 | " 7.7 | \n",
179 | " 1986 | \n",
180 | " 679 | \n",
181 | " 3.126947 | \n",
182 | "
\n",
183 | " \n",
184 | " 8401 | \n",
185 | " Star Trek Into Darkness | \n",
186 | " 4479.0 | \n",
187 | " 7.4 | \n",
188 | " 2013 | \n",
189 | " 54138 | \n",
190 | " 3.079551 | \n",
191 | "
\n",
192 | " \n",
193 | " 7705 | \n",
194 | " Alice in Wonderland | \n",
195 | " 8.0 | \n",
196 | " 5.4 | \n",
197 | " 1933 | \n",
198 | " 25694 | \n",
199 | " 3.054995 | \n",
200 | "
\n",
201 | " \n",
202 | " 3060 | \n",
203 | " Sinbad and the Eye of the Tiger | \n",
204 | " 39.0 | \n",
205 | " 6.3 | \n",
206 | " 1977 | \n",
207 | " 11940 | \n",
208 | " 3.028386 | \n",
209 | "
\n",
210 | " \n",
211 | " 8658 | \n",
212 | " X-Men: Days of Future Past | \n",
213 | " 6155.0 | \n",
214 | " 7.5 | \n",
215 | " 2014 | \n",
216 | " 127585 | \n",
217 | " 2.997411 | \n",
218 | "
\n",
219 | " \n",
220 | " 2014 | \n",
221 | " Fantastic Planet | \n",
222 | " 140.0 | \n",
223 | " 7.6 | \n",
224 | " 1973 | \n",
225 | " 16306 | \n",
226 | " 2.957614 | \n",
227 | "
\n",
228 | " \n",
229 | " 522 | \n",
230 | " Terminator 2: Judgment Day | \n",
231 | " 4274.0 | \n",
232 | " 7.7 | \n",
233 | " 1991 | \n",
234 | " 280 | \n",
235 | " 2.914548 | \n",
236 | "
\n",
237 | " \n",
238 | " 1621 | \n",
239 | " Darby O'Gill and the Little People | \n",
240 | " 35.0 | \n",
241 | " 6.7 | \n",
242 | " 1959 | \n",
243 | " 18887 | \n",
244 | " 2.844940 | \n",
245 | "
\n",
246 | " \n",
247 | " 1668 | \n",
248 | " Return from Witch Mountain | \n",
249 | " 38.0 | \n",
250 | " 5.6 | \n",
251 | " 1978 | \n",
252 | " 14822 | \n",
253 | " 2.804012 | \n",
254 | "
\n",
255 | " \n",
256 | "
\n",
257 | "
"
258 | ],
259 | "text/plain": [
260 | " title vote_count vote_average year \\\n",
261 | "1011 The Terminator 4208.0 7.4 1984 \n",
262 | "974 Aliens 3282.0 7.7 1986 \n",
263 | "8401 Star Trek Into Darkness 4479.0 7.4 2013 \n",
264 | "7705 Alice in Wonderland 8.0 5.4 1933 \n",
265 | "3060 Sinbad and the Eye of the Tiger 39.0 6.3 1977 \n",
266 | "8658 X-Men: Days of Future Past 6155.0 7.5 2014 \n",
267 | "2014 Fantastic Planet 140.0 7.6 1973 \n",
268 | "522 Terminator 2: Judgment Day 4274.0 7.7 1991 \n",
269 | "1621 Darby O'Gill and the Little People 35.0 6.7 1959 \n",
270 | "1668 Return from Witch Mountain 38.0 5.6 1978 \n",
271 | "\n",
272 | " id est \n",
273 | "1011 218 3.140748 \n",
274 | "974 679 3.126947 \n",
275 | "8401 54138 3.079551 \n",
276 | "7705 25694 3.054995 \n",
277 | "3060 11940 3.028386 \n",
278 | "8658 127585 2.997411 \n",
279 | "2014 16306 2.957614 \n",
280 | "522 280 2.914548 \n",
281 | "1621 18887 2.844940 \n",
282 | "1668 14822 2.804012 "
283 | ]
284 | },
285 | "execution_count": 8,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "hybrid(1, 'Avatar')"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 9,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "data": {
301 | "text/html": [
302 | "\n",
303 | "\n",
316 | "
\n",
317 | " \n",
318 | " \n",
319 | " | \n",
320 | " title | \n",
321 | " vote_count | \n",
322 | " vote_average | \n",
323 | " year | \n",
324 | " id | \n",
325 | " est | \n",
326 | "
\n",
327 | " \n",
328 | " \n",
329 | " \n",
330 | " 522 | \n",
331 | " Terminator 2: Judgment Day | \n",
332 | " 4274.0 | \n",
333 | " 7.7 | \n",
334 | " 1991 | \n",
335 | " 280 | \n",
336 | " 3.943639 | \n",
337 | "
\n",
338 | " \n",
339 | " 2834 | \n",
340 | " Predator | \n",
341 | " 2129.0 | \n",
342 | " 7.3 | \n",
343 | " 1987 | \n",
344 | " 106 | \n",
345 | " 3.866272 | \n",
346 | "
\n",
347 | " \n",
348 | " 8401 | \n",
349 | " Star Trek Into Darkness | \n",
350 | " 4479.0 | \n",
351 | " 7.4 | \n",
352 | " 2013 | \n",
353 | " 54138 | \n",
354 | " 3.858491 | \n",
355 | "
\n",
356 | " \n",
357 | " 1011 | \n",
358 | " The Terminator | \n",
359 | " 4208.0 | \n",
360 | " 7.4 | \n",
361 | " 1984 | \n",
362 | " 218 | \n",
363 | " 3.856029 | \n",
364 | "
\n",
365 | " \n",
366 | " 7705 | \n",
367 | " Alice in Wonderland | \n",
368 | " 8.0 | \n",
369 | " 5.4 | \n",
370 | " 1933 | \n",
371 | " 25694 | \n",
372 | " 3.701565 | \n",
373 | "
\n",
374 | " \n",
375 | " 922 | \n",
376 | " The Abyss | \n",
377 | " 822.0 | \n",
378 | " 7.1 | \n",
379 | " 1989 | \n",
380 | " 2756 | \n",
381 | " 3.676465 | \n",
382 | "
\n",
383 | " \n",
384 | " 974 | \n",
385 | " Aliens | \n",
386 | " 3282.0 | \n",
387 | " 7.7 | \n",
388 | " 1986 | \n",
389 | " 679 | \n",
390 | " 3.672303 | \n",
391 | "
\n",
392 | " \n",
393 | " 1621 | \n",
394 | " Darby O'Gill and the Little People | \n",
395 | " 35.0 | \n",
396 | " 6.7 | \n",
397 | " 1959 | \n",
398 | " 18887 | \n",
399 | " 3.628234 | \n",
400 | "
\n",
401 | " \n",
402 | " 1668 | \n",
403 | " Return from Witch Mountain | \n",
404 | " 38.0 | \n",
405 | " 5.6 | \n",
406 | " 1978 | \n",
407 | " 14822 | \n",
408 | " 3.614118 | \n",
409 | "
\n",
410 | " \n",
411 | " 2014 | \n",
412 | " Fantastic Planet | \n",
413 | " 140.0 | \n",
414 | " 7.6 | \n",
415 | " 1973 | \n",
416 | " 16306 | \n",
417 | " 3.602051 | \n",
418 | "
\n",
419 | " \n",
420 | "
\n",
421 | "
"
422 | ],
423 | "text/plain": [
424 | " title vote_count vote_average year \\\n",
425 | "522 Terminator 2: Judgment Day 4274.0 7.7 1991 \n",
426 | "2834 Predator 2129.0 7.3 1987 \n",
427 | "8401 Star Trek Into Darkness 4479.0 7.4 2013 \n",
428 | "1011 The Terminator 4208.0 7.4 1984 \n",
429 | "7705 Alice in Wonderland 8.0 5.4 1933 \n",
430 | "922 The Abyss 822.0 7.1 1989 \n",
431 | "974 Aliens 3282.0 7.7 1986 \n",
432 | "1621 Darby O'Gill and the Little People 35.0 6.7 1959 \n",
433 | "1668 Return from Witch Mountain 38.0 5.6 1978 \n",
434 | "2014 Fantastic Planet 140.0 7.6 1973 \n",
435 | "\n",
436 | " id est \n",
437 | "522 280 3.943639 \n",
438 | "2834 106 3.866272 \n",
439 | "8401 54138 3.858491 \n",
440 | "1011 218 3.856029 \n",
441 | "7705 25694 3.701565 \n",
442 | "922 2756 3.676465 \n",
443 | "974 679 3.672303 \n",
444 | "1621 18887 3.628234 \n",
445 | "1668 14822 3.614118 \n",
446 | "2014 16306 3.602051 "
447 | ]
448 | },
449 | "execution_count": 9,
450 | "metadata": {},
451 | "output_type": "execute_result"
452 | }
453 | ],
454 | "source": [
455 | "hybrid(2, 'Avatar')"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": null,
461 | "metadata": {},
462 | "outputs": [],
463 | "source": []
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": null,
468 | "metadata": {
469 | "collapsed": true
470 | },
471 | "outputs": [],
472 | "source": []
473 | }
474 | ],
475 | "metadata": {
476 | "kernelspec": {
477 | "display_name": "Python 3",
478 | "language": "python",
479 | "name": "python3"
480 | },
481 | "language_info": {
482 | "codemirror_mode": {
483 | "name": "ipython",
484 | "version": 3
485 | },
486 | "file_extension": ".py",
487 | "mimetype": "text/x-python",
488 | "name": "python",
489 | "nbconvert_exporter": "python",
490 | "pygments_lexer": "ipython3",
491 | "version": "3.6.0"
492 | }
493 | },
494 | "nbformat": 4,
495 | "nbformat_minor": 2
496 | }
497 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Hands-On Recommendation Systems with Python
5 |
6 |
7 |
8 | This is the code repository for [Hands-On Recommendation Systems with Python](https://www.packtpub.com/big-data-and-business-intelligence/hands-recommendation-systems-python?utm_source=github&utm_medium=repository&utm_campaign=9781788993753), published by Packt.
9 |
10 | **Start building powerful, personalized recommendation engines with Python**
11 |
12 | ## What is this book about?
13 | This book shows you how to build a variety of recommendation engines in Python, from a simple IMDB-style Top 250 chart to knowledge-based, content-based, collaborative filtering, and hybrid recommenders, using pandas, scikit-learn, and the surprise library.
14 |
15 | This book covers the following exciting features:
16 | * The different kinds of recommender systems
17 | * Data wrangling techniques using the pandas library
18 | * Building an IMDB Top 250 Clone
19 | * Building a content based engine to recommend movies based on movie metadata
20 | * Data mining techniques used in building recommenders
21 |
22 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1788993756) today!
23 |
24 |
26 |
27 |
28 | ## Instructions and Navigations
29 | All of the code is organized into folders. For example, Chapter2.
30 |
31 | The code will look like the following:
32 | ```
33 | #Import SVD
34 | from surprise import SVD
35 |
36 | #Define the SVD algorithm object
37 | svd = SVD()
38 |
39 | #Evaluate the performance in terms of RMSE
40 | evaluate(svd, data, measures=['RMSE'])
41 | ```
42 |
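The snippet above uses the Surprise API from the version of the library available when the book was written (`evaluate`, `data.split`, `svd.train`, as also used in the Chapter 6 and 7 notebooks). If you are running a newer release of scikit-surprise (1.1 or later), those calls were replaced by `surprise.model_selection.cross_validate` and `svd.fit(trainset)`. Here is a minimal sketch of the equivalent evaluation, assuming the same `data` object built from the ratings file:

```
#Import SVD and the cross-validation helper (Surprise >= 1.1)
from surprise import SVD
from surprise.model_selection import cross_validate

#Define the SVD algorithm object
svd = SVD()

#Evaluate the performance in terms of RMSE using 5-fold cross-validation
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)
```
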
43 | **Following is what you need for this book:**
44 | If you are a Python developer and want to develop applications for social networking, news personalization or smart advertising, this is the book for you. Basic knowledge of machine learning techniques will be helpful, but not mandatory.
45 |
46 | With the following software and hardware list you can run all code files present in the book (Chapters 1-7).
47 |
48 | ### Software and Hardware List
49 |
50 | | Chapter | Software required | OS required |
51 | | -------- | ------------------------------------| -----------------------------------|
52 | | 1-7 | Python 3.6+, Jupyter Notebook, pandas, scikit-learn, scikit-surprise | Windows, macOS, or Linux |
53 |
54 |
55 |
56 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://www.packtpub.com/sites/default/files/downloads/HandsOnRecommendationSystemswithPython_ColorImages.pdf).
57 |
58 | ## Code in Action
59 |
60 | Click on the following link to see the Code in Action:
61 |
62 | [http://bit.ly/2JV4oeu](http://bit.ly/2JV4oeu)
63 |
64 | ### Related products
65 | * Statistics for Machine Learning [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/statistics-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781788295758) [[Amazon]](https://www.amazon.com/dp/1788295757)
66 |
67 | * Feature Engineering Made Easy [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/feature-engineering-made-easy?utm_source=github&utm_medium=repository&utm_campaign=9781787287600) [[Amazon]](https://www.amazon.com/dp/1787287602)
68 |
69 | ## Get to Know the Author
70 | **Rounak Banik**
71 | Rounak Banik is a Young India Fellow and an ECE graduate from IIT Roorkee. He has worked as a software engineer at Parceed, a New York start-up, and Springboard, an EdTech start-up based in San Francisco and Bangalore. He has also served as a backend development instructor at Acadview, teaching Python and Django to around 35 college students from Delhi and Dehradun.
72 |
73 | He is an alumnus of Springboard's data science career track. He has given talks at the SciPy India Conference and published popular tutorials on Kaggle and DataCamp.
74 |
75 | ### Suggestions and Feedback
76 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.
77 | ### Download a free PDF
78 |
79 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost. Simply click on the link to claim your free PDF.
80 | https://packt.link/free-ebook/9781788993753
--------------------------------------------------------------------------------