├── .ipynb_checkpoints ├── preprocessing 2.ipynb ├── preprocessing 3.ipynb ├── preprocessing 4.ipynb └── sentiment.ipynb ├── Procfile ├── README.md ├── datasets ├── final_data.csv ├── main_data.csv ├── movie.csv ├── movie_metadata.csv └── reviews.txt ├── main.py ├── main_data.csv ├── nlp_model.pkl ├── requirements.txt ├── static ├── autocomplete.js ├── default.jpg ├── image.jpg ├── loader.gif ├── recommend.js └── style.css ├── templates ├── home.html └── recommend.html └── tranform.pkl /.ipynb_checkpoints/preprocessing 2.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import numpy as np\nimport pandas as pd","execution_count":86,"outputs":[]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"credits = pd.read_csv('credits.csv')","execution_count":87,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"credits","execution_count":88,"outputs":[{"output_type":"execute_result","execution_count":88,"data":{"text/plain":" cast \\\n0 [{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3... \n1 [{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a3... \n2 [{'cast_id': 2, 'character': 'Max Goldman', 'credit_id': '52fe466a92514... \n3 [{'cast_id': 1, 'character': \"Savannah 'Vannah' Jackson\", 'credit_id': ... \n4 [{'cast_id': 1, 'character': 'George Banks', 'credit_id': '52fe44959251... \n... ... \n45471 [{'cast_id': 0, 'character': '', 'credit_id': '5894a909925141427e0079a5... \n45472 [{'cast_id': 1002, 'character': 'Sister Angela', 'credit_id': '52fe4af1... \n45473 [{'cast_id': 6, 'character': 'Emily Shaw', 'credit_id': '52fe4776c3a368... 
\n45474 [{'cast_id': 2, 'character': '', 'credit_id': '52fe4ea59251416c7515d7d5... \n45475 [] \n\n crew \\\n0 [{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', '... \n1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', ... \n2 [{'credit_id': '52fe466a9251416c75077a89', 'department': 'Directing', '... \n3 [{'credit_id': '52fe44779251416c91011acb', 'department': 'Directing', '... \n4 [{'credit_id': '52fe44959251416c75039ed7', 'department': 'Sound', 'gend... \n... ... \n45471 [{'credit_id': '5894a97d925141426c00818c', 'department': 'Directing', '... \n45472 [{'credit_id': '52fe4af1c3a36847f81e9b15', 'department': 'Directing', '... \n45473 [{'credit_id': '52fe4776c3a368484e0c8387', 'department': 'Directing', '... \n45474 [{'credit_id': '533bccebc3a36844cf0011a7', 'department': 'Directing', '... \n45475 [{'credit_id': '593e676c92514105b702e68e', 'department': 'Directing', '... \n\n id \n0 862 \n1 8844 \n2 15602 \n3 31357 \n4 11862 \n... ... \n45471 439050 \n45472 111109 \n45473 67758 \n45474 227506 \n45475 461257 \n\n[45476 rows x 3 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
castcrewid
0[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3...[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', '...862
1[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a3...[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', ...8844
2[{'cast_id': 2, 'character': 'Max Goldman', 'credit_id': '52fe466a92514...[{'credit_id': '52fe466a9251416c75077a89', 'department': 'Directing', '...15602
3[{'cast_id': 1, 'character': \"Savannah 'Vannah' Jackson\", 'credit_id': ...[{'credit_id': '52fe44779251416c91011acb', 'department': 'Directing', '...31357
4[{'cast_id': 1, 'character': 'George Banks', 'credit_id': '52fe44959251...[{'credit_id': '52fe44959251416c75039ed7', 'department': 'Sound', 'gend...11862
............
45471[{'cast_id': 0, 'character': '', 'credit_id': '5894a909925141427e0079a5...[{'credit_id': '5894a97d925141426c00818c', 'department': 'Directing', '...439050
45472[{'cast_id': 1002, 'character': 'Sister Angela', 'credit_id': '52fe4af1...[{'credit_id': '52fe4af1c3a36847f81e9b15', 'department': 'Directing', '...111109
45473[{'cast_id': 6, 'character': 'Emily Shaw', 'credit_id': '52fe4776c3a368...[{'credit_id': '52fe4776c3a368484e0c8387', 'department': 'Directing', '...67758
45474[{'cast_id': 2, 'character': '', 'credit_id': '52fe4ea59251416c7515d7d5...[{'credit_id': '533bccebc3a36844cf0011a7', 'department': 'Directing', '...227506
45475[][{'credit_id': '593e676c92514105b702e68e', 'department': 'Directing', '...461257
\n

45476 rows × 3 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"meta = pd.read_csv('movies_metadata.csv')","execution_count":89,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.\n interactivity=interactivity, compiler=compiler, result=result)\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"meta['release_date'] = pd.to_datetime(meta['release_date'], errors='coerce')","execution_count":90,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"\nmeta['year'] = meta['release_date'].dt.year","execution_count":91,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"meta['year'].value_counts().sort_index()","execution_count":92,"outputs":[{"output_type":"execute_result","execution_count":92,"data":{"text/plain":"1874.0 1\n1878.0 1\n1883.0 1\n1887.0 1\n1888.0 2\n ... \n2015.0 1905\n2016.0 1604\n2017.0 532\n2018.0 5\n2020.0 1\nName: year, Length: 135, dtype: int64"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Getting only movies upto 2017 movies as we don't have enough data for the movies from 2018, 2019 and 2020. \n# We'll deal with it in the upcoming preprocessing files\nnew_meta = meta.loc[meta.year <= 2017,['genres','id','title','year']]","execution_count":93,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"\nnew_meta","execution_count":94,"outputs":[{"output_type":"execute_result","execution_count":94,"data":{"text/plain":" genres \\\n0 [{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': ... \n1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id':... \n2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}] \n3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 1074... 
\n4 [{'id': 35, 'name': 'Comedy'}] \n... ... \n45460 [{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 1074... \n45462 [{'id': 18, 'name': 'Drama'}] \n45463 [{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 53, ... \n45464 [] \n45465 [] \n\n id title year \n0 862 Toy Story 1995.0 \n1 8844 Jumanji 1995.0 \n2 15602 Grumpier Old Men 1995.0 \n3 31357 Waiting to Exhale 1995.0 \n4 11862 Father of the Bride Part II 1995.0 \n... ... ... ... \n45460 30840 Robin Hood 1991.0 \n45462 111109 Century of Birthing 2011.0 \n45463 67758 Betrayal 2003.0 \n45464 227506 Satan Triumphant 1917.0 \n45465 461257 Queerama 2017.0 \n\n[45370 rows x 4 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
genresidtitleyear
0[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': ...862Toy Story1995.0
1[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id':...8844Jumanji1995.0
2[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]15602Grumpier Old Men1995.0
3[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 1074...31357Waiting to Exhale1995.0
4[{'id': 35, 'name': 'Comedy'}]11862Father of the Bride Part II1995.0
...............
45460[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 1074...30840Robin Hood1991.0
45462[{'id': 18, 'name': 'Drama'}]111109Century of Birthing2011.0
45463[{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 53, ...67758Betrayal2003.0
45464[]227506Satan Triumphant1917.0
45465[]461257Queerama2017.0
\n

45370 rows × 4 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_meta['id'] = new_meta['id'].astype(int)","execution_count":95,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data = pd.merge(new_meta, credits, on='id')","execution_count":96,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pd.set_option('display.max_colwidth', 75)\ndata","execution_count":97,"outputs":[{"output_type":"execute_result","execution_count":97,"data":{"text/plain":" genres \\\n0 [{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': ... \n1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id':... \n2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}] \n3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 1074... \n4 [{'id': 35, 'name': 'Comedy'}] \n... ... \n45440 [{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 1074... \n45441 [{'id': 18, 'name': 'Drama'}] \n45442 [{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 53, ... \n45443 [] \n45444 [] \n\n id title year \\\n0 862 Toy Story 1995.0 \n1 8844 Jumanji 1995.0 \n2 15602 Grumpier Old Men 1995.0 \n3 31357 Waiting to Exhale 1995.0 \n4 11862 Father of the Bride Part II 1995.0 \n... ... ... ... \n45440 30840 Robin Hood 1991.0 \n45441 111109 Century of Birthing 2011.0 \n45442 67758 Betrayal 2003.0 \n45443 227506 Satan Triumphant 1917.0 \n45444 461257 Queerama 2017.0 \n\n cast \\\n0 [{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3... \n1 [{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a3... \n2 [{'cast_id': 2, 'character': 'Max Goldman', 'credit_id': '52fe466a92514... \n3 [{'cast_id': 1, 'character': \"Savannah 'Vannah' Jackson\", 'credit_id': ... \n4 [{'cast_id': 1, 'character': 'George Banks', 'credit_id': '52fe44959251... \n... ... \n45440 [{'cast_id': 1, 'character': 'Sir Robert Hode', 'credit_id': '52fe44439... 
\n45441 [{'cast_id': 1002, 'character': 'Sister Angela', 'credit_id': '52fe4af1... \n45442 [{'cast_id': 6, 'character': 'Emily Shaw', 'credit_id': '52fe4776c3a368... \n45443 [{'cast_id': 2, 'character': '', 'credit_id': '52fe4ea59251416c7515d7d5... \n45444 [] \n\n crew \n0 [{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', '... \n1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', ... \n2 [{'credit_id': '52fe466a9251416c75077a89', 'department': 'Directing', '... \n3 [{'credit_id': '52fe44779251416c91011acb', 'department': 'Directing', '... \n4 [{'credit_id': '52fe44959251416c75039ed7', 'department': 'Sound', 'gend... \n... ... \n45440 [{'credit_id': '52fe44439251416c9100a899', 'department': 'Directing', '... \n45441 [{'credit_id': '52fe4af1c3a36847f81e9b15', 'department': 'Directing', '... \n45442 [{'credit_id': '52fe4776c3a368484e0c8387', 'department': 'Directing', '... \n45443 [{'credit_id': '533bccebc3a36844cf0011a7', 'department': 'Directing', '... \n45444 [{'credit_id': '593e676c92514105b702e68e', 'department': 'Directing', '... \n\n[45445 rows x 6 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
genresidtitleyearcastcrew
0[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': ...862Toy Story1995.0[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3...[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', '...
1[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id':...8844Jumanji1995.0[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a3...[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', ...
2[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]15602Grumpier Old Men1995.0[{'cast_id': 2, 'character': 'Max Goldman', 'credit_id': '52fe466a92514...[{'credit_id': '52fe466a9251416c75077a89', 'department': 'Directing', '...
3[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 1074...31357Waiting to Exhale1995.0[{'cast_id': 1, 'character': \"Savannah 'Vannah' Jackson\", 'credit_id': ...[{'credit_id': '52fe44779251416c91011acb', 'department': 'Directing', '...
4[{'id': 35, 'name': 'Comedy'}]11862Father of the Bride Part II1995.0[{'cast_id': 1, 'character': 'George Banks', 'credit_id': '52fe44959251...[{'credit_id': '52fe44959251416c75039ed7', 'department': 'Sound', 'gend...
.....................
45440[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 1074...30840Robin Hood1991.0[{'cast_id': 1, 'character': 'Sir Robert Hode', 'credit_id': '52fe44439...[{'credit_id': '52fe44439251416c9100a899', 'department': 'Directing', '...
45441[{'id': 18, 'name': 'Drama'}]111109Century of Birthing2011.0[{'cast_id': 1002, 'character': 'Sister Angela', 'credit_id': '52fe4af1...[{'credit_id': '52fe4af1c3a36847f81e9b15', 'department': 'Directing', '...
45442[{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 53, ...67758Betrayal2003.0[{'cast_id': 6, 'character': 'Emily Shaw', 'credit_id': '52fe4776c3a368...[{'credit_id': '52fe4776c3a368484e0c8387', 'department': 'Directing', '...
45443[]227506Satan Triumphant1917.0[{'cast_id': 2, 'character': '', 'credit_id': '52fe4ea59251416c7515d7d5...[{'credit_id': '533bccebc3a36844cf0011a7', 'department': 'Directing', '...
45444[]461257Queerama2017.0[][{'credit_id': '593e676c92514105b702e68e', 'department': 'Directing', '...
\n

45445 rows × 6 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"# evaluates an expression node or a string containing a Python literal or container display\nimport ast\ndata['genres'] = data['genres'].map(lambda x: ast.literal_eval(x))\ndata['cast'] = data['cast'].map(lambda x: ast.literal_eval(x))\ndata['crew'] = data['crew'].map(lambda x: ast.literal_eval(x))","execution_count":98,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def make_genresList(x):\n gen = []\n st = \" \"\n for i in x:\n if i.get('name') == 'Science Fiction':\n scifi = 'Sci-Fi'\n gen.append(scifi)\n else:\n gen.append(i.get('name'))\n if gen == []:\n return np.NaN\n else:\n return (st.join(gen))","execution_count":99,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['genres_list'] = data['genres'].map(lambda x: make_genresList(x))","execution_count":100,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['genres_list']","execution_count":101,"outputs":[{"output_type":"execute_result","execution_count":101,"data":{"text/plain":"0 Animation Comedy Family\n1 Adventure Fantasy Family\n2 Romance Comedy\n3 Comedy Drama Romance\n4 Comedy\n ... 
\n45440 Drama Action Romance\n45441 Drama\n45442 Action Drama Thriller\n45443 NaN\n45444 NaN\nName: genres_list, Length: 45445, dtype: object"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor1(x):\n casts = []\n for i in x:\n casts.append(i.get('name'))\n if casts == []:\n return np.NaN\n else:\n return (casts[0])","execution_count":102,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['actor_1_name'] = data['cast'].map(lambda x: get_actor1(x))","execution_count":103,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor2(x):\n casts = []\n for i in x:\n casts.append(i.get('name'))\n if casts == [] or len(casts)<=1:\n return np.NaN\n else:\n return (casts[1])","execution_count":104,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['actor_2_name'] = data['cast'].map(lambda x: get_actor2(x))","execution_count":105,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['actor_2_name']","execution_count":106,"outputs":[{"output_type":"execute_result","execution_count":106,"data":{"text/plain":"0 Tim Allen\n1 Jonathan Hyde\n2 Jack Lemmon\n3 Angela Bassett\n4 Diane Keaton\n ... 
\n45440 Uma Thurman\n45441 Perry Dizon\n45442 Adam Baldwin\n45443 Nathalie Lissenko\n45444 NaN\nName: actor_2_name, Length: 45445, dtype: object"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor3(x):\n casts = []\n for i in x:\n casts.append(i.get('name'))\n if casts == [] or len(casts)<=2:\n return np.NaN\n else:\n return (casts[2])","execution_count":107,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['actor_3_name'] = data['cast'].map(lambda x: get_actor3(x))","execution_count":108,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['actor_3_name']","execution_count":109,"outputs":[{"output_type":"execute_result","execution_count":109,"data":{"text/plain":"0 Don Rickles\n1 Kirsten Dunst\n2 Ann-Margret\n3 Loretta Devine\n4 Martin Short\n ... \n45440 David Morrissey\n45441 Hazel Orencio\n45442 Julie du Page\n45443 Pavel Pavlov\n45444 NaN\nName: actor_3_name, Length: 45445, dtype: object"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_directors(x):\n dt = []\n st = \" \"\n for i in x:\n if i.get('job') == 'Director':\n dt.append(i.get('name'))\n if dt == []:\n return np.NaN\n else:\n return (st.join(dt))","execution_count":110,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['director_name'] = data['crew'].map(lambda x: get_directors(x))","execution_count":111,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"data['director_name']","execution_count":112,"outputs":[{"output_type":"execute_result","execution_count":112,"data":{"text/plain":"0 John Lasseter\n1 Joe Johnston\n2 Howard Deutch\n3 Forest Whitaker\n4 Charles Shyer\n ... \n45440 John Irvin\n45441 Lav Diaz\n45442 Mark L. 
Lester\n45443 Yakov Protazanov\n45444 Daisy Asquith\nName: director_name, Length: 45445, dtype: object"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie = data.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres_list','title']]","execution_count":113,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie","execution_count":114,"outputs":[{"output_type":"execute_result","execution_count":114,"data":{"text/plain":" director_name actor_1_name actor_2_name actor_3_name \\\n0 John Lasseter Tom Hanks Tim Allen Don Rickles \n1 Joe Johnston Robin Williams Jonathan Hyde Kirsten Dunst \n2 Howard Deutch Walter Matthau Jack Lemmon Ann-Margret \n3 Forest Whitaker Whitney Houston Angela Bassett Loretta Devine \n4 Charles Shyer Steve Martin Diane Keaton Martin Short \n... ... ... ... ... \n45440 John Irvin Patrick Bergin Uma Thurman David Morrissey \n45441 Lav Diaz Angel Aquino Perry Dizon Hazel Orencio \n45442 Mark L. Lester Erika Eleniak Adam Baldwin Julie du Page \n45443 Yakov Protazanov Iwan Mosschuchin Nathalie Lissenko Pavel Pavlov \n45444 Daisy Asquith NaN NaN NaN \n\n genres_list title \n0 Animation Comedy Family Toy Story \n1 Adventure Fantasy Family Jumanji \n2 Romance Comedy Grumpier Old Men \n3 Comedy Drama Romance Waiting to Exhale \n4 Comedy Father of the Bride Part II \n... ... ... \n45440 Drama Action Romance Robin Hood \n45441 Drama Century of Birthing \n45442 Action Drama Thriller Betrayal \n45443 NaN Satan Triumphant \n45444 NaN Queerama \n\n[45445 rows x 6 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenres_listtitle
0John LasseterTom HanksTim AllenDon RicklesAnimation Comedy FamilyToy Story
1Joe JohnstonRobin WilliamsJonathan HydeKirsten DunstAdventure Fantasy FamilyJumanji
2Howard DeutchWalter MatthauJack LemmonAnn-MargretRomance ComedyGrumpier Old Men
3Forest WhitakerWhitney HoustonAngela BassettLoretta DevineComedy Drama RomanceWaiting to Exhale
4Charles ShyerSteve MartinDiane KeatonMartin ShortComedyFather of the Bride Part II
.....................
45440John IrvinPatrick BerginUma ThurmanDavid MorrisseyDrama Action RomanceRobin Hood
45441Lav DiazAngel AquinoPerry DizonHazel OrencioDramaCentury of Birthing
45442Mark L. LesterErika EleniakAdam BaldwinJulie du PageAction Drama ThrillerBetrayal
45443Yakov ProtazanovIwan MosschuchinNathalie LissenkoPavel PavlovNaNSatan Triumphant
45444Daisy AsquithNaNNaNNaNNaNQueerama
\n

45445 rows × 6 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie.isna().sum()","execution_count":115,"outputs":[{"output_type":"execute_result","execution_count":115,"data":{"text/plain":"director_name 835\nactor_1_name 2354\nactor_2_name 3683\nactor_3_name 4593\ngenres_list 2384\ntitle 0\ndtype: int64"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie = movie.dropna(how='any')","execution_count":116,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie.isna().sum()","execution_count":117,"outputs":[{"output_type":"execute_result","execution_count":117,"data":{"text/plain":"director_name 0\nactor_1_name 0\nactor_2_name 0\nactor_3_name 0\ngenres_list 0\ntitle 0\ndtype: int64"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie = movie.rename(columns={'genres_list':'genres'})\nmovie = movie.rename(columns={'title':'movie_title'})","execution_count":118,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie['movie_title'] = movie['movie_title'].str.lower()","execution_count":119,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie['comb'] = movie['actor_1_name'] + ' ' + movie['actor_2_name'] + ' '+ movie['actor_3_name'] + ' '+ movie['director_name'] +' ' + movie['genres']","execution_count":120,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie","execution_count":121,"outputs":[{"output_type":"execute_result","execution_count":121,"data":{"text/plain":" director_name actor_1_name actor_2_name actor_3_name \\\n0 John Lasseter Tom Hanks Tim Allen Don Rickles \n1 Joe Johnston Robin Williams Jonathan Hyde Kirsten Dunst \n2 Howard Deutch Walter Matthau Jack Lemmon Ann-Margret \n3 Forest Whitaker Whitney Houston Angela Bassett Loretta Devine \n4 Charles Shyer Steve Martin Diane Keaton Martin Short \n... ... ... ... ... 
\n45438 Ben Rock Monty Bane Lucy Butler David Grammer \n45439 Aaron Osborne Lisa Boyle Kena Land Zaneta Polard \n45440 John Irvin Patrick Bergin Uma Thurman David Morrissey \n45441 Lav Diaz Angel Aquino Perry Dizon Hazel Orencio \n45442 Mark L. Lester Erika Eleniak Adam Baldwin Julie du Page \n\n genres movie_title \\\n0 Animation Comedy Family toy story \n1 Adventure Fantasy Family jumanji \n2 Romance Comedy grumpier old men \n3 Comedy Drama Romance waiting to exhale \n4 Comedy father of the bride part ii \n... ... ... \n45438 Horror the burkittsville 7 \n45439 Sci-Fi caged heat 3000 \n45440 Drama Action Romance robin hood \n45441 Drama century of birthing \n45442 Action Drama Thriller betrayal \n\n comb \n0 Tom Hanks Tim Allen Don Rickles John Lasseter Animation Comedy Family \n1 Robin Williams Jonathan Hyde Kirsten Dunst Joe Johnston Adventure Fanta... \n2 Walter Matthau Jack Lemmon Ann-Margret Howard Deutch Romance Comedy \n3 Whitney Houston Angela Bassett Loretta Devine Forest Whitaker Comedy Dr... \n4 Steve Martin Diane Keaton Martin Short Charles Shyer Comedy \n... ... \n45438 Monty Bane Lucy Butler David Grammer Ben Rock Horror \n45439 Lisa Boyle Kena Land Zaneta Polard Aaron Osborne Sci-Fi \n45440 Patrick Bergin Uma Thurman David Morrissey John Irvin Drama Action Romance \n45441 Angel Aquino Perry Dizon Hazel Orencio Lav Diaz Drama \n45442 Erika Eleniak Adam Baldwin Julie du Page Mark L. Lester Action Drama Th... \n\n[39201 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0John LasseterTom HanksTim AllenDon RicklesAnimation Comedy Familytoy storyTom Hanks Tim Allen Don Rickles John Lasseter Animation Comedy Family
1Joe JohnstonRobin WilliamsJonathan HydeKirsten DunstAdventure Fantasy FamilyjumanjiRobin Williams Jonathan Hyde Kirsten Dunst Joe Johnston Adventure Fanta...
2Howard DeutchWalter MatthauJack LemmonAnn-MargretRomance Comedygrumpier old menWalter Matthau Jack Lemmon Ann-Margret Howard Deutch Romance Comedy
3Forest WhitakerWhitney HoustonAngela BassettLoretta DevineComedy Drama Romancewaiting to exhaleWhitney Houston Angela Bassett Loretta Devine Forest Whitaker Comedy Dr...
4Charles ShyerSteve MartinDiane KeatonMartin ShortComedyfather of the bride part iiSteve Martin Diane Keaton Martin Short Charles Shyer Comedy
........................
45438Ben RockMonty BaneLucy ButlerDavid GrammerHorrorthe burkittsville 7Monty Bane Lucy Butler David Grammer Ben Rock Horror
45439Aaron OsborneLisa BoyleKena LandZaneta PolardSci-Ficaged heat 3000Lisa Boyle Kena Land Zaneta Polard Aaron Osborne Sci-Fi
45440John IrvinPatrick BerginUma ThurmanDavid MorrisseyDrama Action Romancerobin hoodPatrick Bergin Uma Thurman David Morrissey John Irvin Drama Action Romance
45441Lav DiazAngel AquinoPerry DizonHazel OrencioDramacentury of birthingAngel Aquino Perry Dizon Hazel Orencio Lav Diaz Drama
45442Mark L. LesterErika EleniakAdam BaldwinJulie du PageAction Drama ThrillerbetrayalErika Eleniak Adam Baldwin Julie du Page Mark L. Lester Action Drama Th...
\n

39201 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie.drop_duplicates(subset =\"movie_title\", keep = 'last', inplace = True)","execution_count":122,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie","execution_count":123,"outputs":[{"output_type":"execute_result","execution_count":123,"data":{"text/plain":" director_name actor_1_name actor_2_name actor_3_name \\\n0 John Lasseter Tom Hanks Tim Allen Don Rickles \n1 Joe Johnston Robin Williams Jonathan Hyde Kirsten Dunst \n2 Howard Deutch Walter Matthau Jack Lemmon Ann-Margret \n3 Forest Whitaker Whitney Houston Angela Bassett Loretta Devine \n4 Charles Shyer Steve Martin Diane Keaton Martin Short \n... ... ... ... ... \n45438 Ben Rock Monty Bane Lucy Butler David Grammer \n45439 Aaron Osborne Lisa Boyle Kena Land Zaneta Polard \n45440 John Irvin Patrick Bergin Uma Thurman David Morrissey \n45441 Lav Diaz Angel Aquino Perry Dizon Hazel Orencio \n45442 Mark L. Lester Erika Eleniak Adam Baldwin Julie du Page \n\n genres movie_title \\\n0 Animation Comedy Family toy story \n1 Adventure Fantasy Family jumanji \n2 Romance Comedy grumpier old men \n3 Comedy Drama Romance waiting to exhale \n4 Comedy father of the bride part ii \n... ... ... \n45438 Horror the burkittsville 7 \n45439 Sci-Fi caged heat 3000 \n45440 Drama Action Romance robin hood \n45441 Drama century of birthing \n45442 Action Drama Thriller betrayal \n\n comb \n0 Tom Hanks Tim Allen Don Rickles John Lasseter Animation Comedy Family \n1 Robin Williams Jonathan Hyde Kirsten Dunst Joe Johnston Adventure Fanta... \n2 Walter Matthau Jack Lemmon Ann-Margret Howard Deutch Romance Comedy \n3 Whitney Houston Angela Bassett Loretta Devine Forest Whitaker Comedy Dr... \n4 Steve Martin Diane Keaton Martin Short Charles Shyer Comedy \n... ... 
\n45438 Monty Bane Lucy Butler David Grammer Ben Rock Horror \n45439 Lisa Boyle Kena Land Zaneta Polard Aaron Osborne Sci-Fi \n45440 Patrick Bergin Uma Thurman David Morrissey John Irvin Drama Action Romance \n45441 Angel Aquino Perry Dizon Hazel Orencio Lav Diaz Drama \n45442 Erika Eleniak Adam Baldwin Julie du Page Mark L. Lester Action Drama Th... \n\n[36341 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0John LasseterTom HanksTim AllenDon RicklesAnimation Comedy Familytoy storyTom Hanks Tim Allen Don Rickles John Lasseter Animation Comedy Family
1Joe JohnstonRobin WilliamsJonathan HydeKirsten DunstAdventure Fantasy FamilyjumanjiRobin Williams Jonathan Hyde Kirsten Dunst Joe Johnston Adventure Fanta...
2Howard DeutchWalter MatthauJack LemmonAnn-MargretRomance Comedygrumpier old menWalter Matthau Jack Lemmon Ann-Margret Howard Deutch Romance Comedy
3Forest WhitakerWhitney HoustonAngela BassettLoretta DevineComedy Drama Romancewaiting to exhaleWhitney Houston Angela Bassett Loretta Devine Forest Whitaker Comedy Dr...
4Charles ShyerSteve MartinDiane KeatonMartin ShortComedyfather of the bride part iiSteve Martin Diane Keaton Martin Short Charles Shyer Comedy
........................
45438Ben RockMonty BaneLucy ButlerDavid GrammerHorrorthe burkittsville 7Monty Bane Lucy Butler David Grammer Ben Rock Horror
45439Aaron OsborneLisa BoyleKena LandZaneta PolardSci-Ficaged heat 3000Lisa Boyle Kena Land Zaneta Polard Aaron Osborne Sci-Fi
45440John IrvinPatrick BerginUma ThurmanDavid MorrisseyDrama Action Romancerobin hoodPatrick Bergin Uma Thurman David Morrissey John Irvin Drama Action Romance
45441Lav DiazAngel AquinoPerry DizonHazel OrencioDramacentury of birthingAngel Aquino Perry Dizon Hazel Orencio Lav Diaz Drama
45442Mark L. LesterErika EleniakAdam BaldwinJulie du PageAction Drama ThrillerbetrayalErika Eleniak Adam Baldwin Julie du Page Mark L. Lester Action Drama Th...
\n

36341 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"movie.to_csv('movie.csv',index=False)","execution_count":124,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /.ipynb_checkpoints/preprocessing 3.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import pandas as pd\nimport numpy as np","execution_count":2,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Extracting features of 2018 movies from Wikipedia"},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"link = \"https://en.wikipedia.org/wiki/List_of_American_films_of_2018\"\ndf1 = pd.read_html(link, header=0)[2]\ndf2 = pd.read_html(link, header=0)[3]\ndf3 = pd.read_html(link, header=0)[4]\ndf4 = pd.read_html(link, header=0)[5]","execution_count":3,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)","execution_count":4,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df","execution_count":5,"outputs":[{"output_type":"execute_result","execution_count":5,"data":{"text/plain":" Opening Opening.1 Title \\\n0 JANUARY 5 Insidious: The Last Key \n1 JANUARY 5 The Strange Ones \n2 JANUARY 5 Stratton \n3 JANUARY 10 Sweet Country \n4 JANUARY 12 The 
Commuter \n.. ... ... ... \n263 DECEMBER 25 Holmes & Watson \n264 DECEMBER 25 Vice \n265 DECEMBER 25 On the Basis of Sex \n266 DECEMBER 25 Destroyer \n267 DECEMBER 28 Black Mirror: Bandersnatch \n\n Production company \\\n0 Universal Pictures / Blumhouse Productions / S... \n1 Vertical Entertainment \n2 Momentum Pictures \n3 Samuel Goldwyn Films \n4 Lionsgate / StudioCanal / The Picture Company \n.. ... \n263 Columbia Pictures / Gary Sanchez Productions \n264 Annapurna Pictures / Plan B Entertainment \n265 Focus Features \n266 Annapurna Pictures \n267 Netflix \n\n Cast and crew Ref. \n0 Adam Robitel (director); Leigh Whannell (scree... [2] \n1 Lauren Wolkstein (director); Christopher Radcl... [3] \n2 Simon West (director); Duncan Falconer, Warren... [4] \n3 Warwick Thornton (director); David Tranter, St... [5] \n4 Jaume Collet-Serra (director); Byron Willinger... [6] \n.. ... ... \n263 Etan Cohen (director/screenplay); Will Ferrell... [162] \n264 Adam McKay (director/screenplay); Christian Ba... [136] \n265 Mimi Leder (director); Daniel Stiepleman (scre... [223] \n266 Karyn Kusama (director); Phil Hay, Matt Manfre... [256] \n267 David Slade (director); Charlie Brooker (scree... [257] \n\n[268 rows x 6 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
OpeningOpening.1TitleProduction companyCast and crewRef.
0JANUARY5Insidious: The Last KeyUniversal Pictures / Blumhouse Productions / S...Adam Robitel (director); Leigh Whannell (scree...[2]
1JANUARY5The Strange OnesVertical EntertainmentLauren Wolkstein (director); Christopher Radcl...[3]
2JANUARY5StrattonMomentum PicturesSimon West (director); Duncan Falconer, Warren...[4]
3JANUARY10Sweet CountrySamuel Goldwyn FilmsWarwick Thornton (director); David Tranter, St...[5]
4JANUARY12The CommuterLionsgate / StudioCanal / The Picture CompanyJaume Collet-Serra (director); Byron Willinger...[6]
.....................
263DECEMBER25Holmes & WatsonColumbia Pictures / Gary Sanchez ProductionsEtan Cohen (director/screenplay); Will Ferrell...[162]
264DECEMBER25ViceAnnapurna Pictures / Plan B EntertainmentAdam McKay (director/screenplay); Christian Ba...[136]
265DECEMBER25On the Basis of SexFocus FeaturesMimi Leder (director); Daniel Stiepleman (scre...[223]
266DECEMBER25DestroyerAnnapurna PicturesKaryn Kusama (director); Phil Hay, Matt Manfre...[256]
267DECEMBER28Black Mirror: BandersnatchNetflixDavid Slade (director); Charlie Brooker (scree...[257]
\n

268 rows × 6 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"!pip install tmdbv3api","execution_count":6,"outputs":[{"output_type":"stream","text":"Collecting tmdbv3api\n Downloading tmdbv3api-1.6.1-py2.py3-none-any.whl (13 kB)\nRequirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from tmdbv3api) (2.23.0)\nRequirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->tmdbv3api) (2.9)\nRequirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests->tmdbv3api) (3.0.4)\nRequirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->tmdbv3api) (1.24.3)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->tmdbv3api) (2020.6.20)\nInstalling collected packages: tmdbv3api\nSuccessfully installed tmdbv3api-1.6.1\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from tmdbv3api import TMDb\nimport json\nimport requests\ntmdb = TMDb()\ntmdb.api_key = 'YOUR_API_KEY'","execution_count":7,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from tmdbv3api import Movie\ntmdb_movie = Movie()\ndef get_genre(x):\n genres = []\n result = tmdb_movie.search(x)\n movie_id = result[0].id\n response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))\n data_json = response.json()\n if data_json['genres']:\n genre_str = \" \" \n for i in range(0,len(data_json['genres'])):\n genres.append(data_json['genres'][i]['name'])\n return genre_str.join(genres)\n else:\n np.NaN","execution_count":8,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df['genres'] = df['Title'].map(lambda x: 
get_genre(str(x)))","execution_count":9,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df","execution_count":10,"outputs":[{"output_type":"execute_result","execution_count":10,"data":{"text/plain":" Opening Opening.1 Title \\\n0 JANUARY 5 Insidious: The Last Key \n1 JANUARY 5 The Strange Ones \n2 JANUARY 5 Stratton \n3 JANUARY 10 Sweet Country \n4 JANUARY 12 The Commuter \n.. ... ... ... \n263 DECEMBER 25 Holmes & Watson \n264 DECEMBER 25 Vice \n265 DECEMBER 25 On the Basis of Sex \n266 DECEMBER 25 Destroyer \n267 DECEMBER 28 Black Mirror: Bandersnatch \n\n Production company \\\n0 Universal Pictures / Blumhouse Productions / S... \n1 Vertical Entertainment \n2 Momentum Pictures \n3 Samuel Goldwyn Films \n4 Lionsgate / StudioCanal / The Picture Company \n.. ... \n263 Columbia Pictures / Gary Sanchez Productions \n264 Annapurna Pictures / Plan B Entertainment \n265 Focus Features \n266 Annapurna Pictures \n267 Netflix \n\n Cast and crew Ref. \\\n0 Adam Robitel (director); Leigh Whannell (scree... [2] \n1 Lauren Wolkstein (director); Christopher Radcl... [3] \n2 Simon West (director); Duncan Falconer, Warren... [4] \n3 Warwick Thornton (director); David Tranter, St... [5] \n4 Jaume Collet-Serra (director); Byron Willinger... [6] \n.. ... ... \n263 Etan Cohen (director/screenplay); Will Ferrell... [162] \n264 Adam McKay (director/screenplay); Christian Ba... [136] \n265 Mimi Leder (director); Daniel Stiepleman (scre... [223] \n266 Karyn Kusama (director); Phil Hay, Matt Manfre... [256] \n267 David Slade (director); Charlie Brooker (scree... [257] \n\n genres \n0 Mystery Horror Thriller \n1 Thriller Drama \n2 Action Thriller \n3 Drama History Western \n4 Action Thriller \n.. ... \n263 Mystery Adventure Comedy Crime \n264 Thriller Science Fiction Action Adventure \n265 Drama History \n266 Thriller Crime Drama Action \n267 Science Fiction Mystery Drama Thriller TV Movie \n\n[268 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
OpeningOpening.1TitleProduction companyCast and crewRef.genres
0JANUARY5Insidious: The Last KeyUniversal Pictures / Blumhouse Productions / S...Adam Robitel (director); Leigh Whannell (scree...[2]Mystery Horror Thriller
1JANUARY5The Strange OnesVertical EntertainmentLauren Wolkstein (director); Christopher Radcl...[3]Thriller Drama
2JANUARY5StrattonMomentum PicturesSimon West (director); Duncan Falconer, Warren...[4]Action Thriller
3JANUARY10Sweet CountrySamuel Goldwyn FilmsWarwick Thornton (director); David Tranter, St...[5]Drama History Western
4JANUARY12The CommuterLionsgate / StudioCanal / The Picture CompanyJaume Collet-Serra (director); Byron Willinger...[6]Action Thriller
........................
263DECEMBER25Holmes & WatsonColumbia Pictures / Gary Sanchez ProductionsEtan Cohen (director/screenplay); Will Ferrell...[162]Mystery Adventure Comedy Crime
264DECEMBER25ViceAnnapurna Pictures / Plan B EntertainmentAdam McKay (director/screenplay); Christian Ba...[136]Thriller Science Fiction Action Adventure
265DECEMBER25On the Basis of SexFocus FeaturesMimi Leder (director); Daniel Stiepleman (scre...[223]Drama History
266DECEMBER25DestroyerAnnapurna PicturesKaryn Kusama (director); Phil Hay, Matt Manfre...[256]Thriller Crime Drama Action
267DECEMBER28Black Mirror: BandersnatchNetflixDavid Slade (director); Charlie Brooker (scree...[257]Science Fiction Mystery Drama Thriller TV Movie
\n

268 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2018 = df[['Title','Cast and crew','genres']]","execution_count":11,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2018","execution_count":12,"outputs":[{"output_type":"execute_result","execution_count":12,"data":{"text/plain":" Title \\\n0 Insidious: The Last Key \n1 The Strange Ones \n2 Stratton \n3 Sweet Country \n4 The Commuter \n.. ... \n263 Holmes & Watson \n264 Vice \n265 On the Basis of Sex \n266 Destroyer \n267 Black Mirror: Bandersnatch \n\n Cast and crew \\\n0 Adam Robitel (director); Leigh Whannell (scree... \n1 Lauren Wolkstein (director); Christopher Radcl... \n2 Simon West (director); Duncan Falconer, Warren... \n3 Warwick Thornton (director); David Tranter, St... \n4 Jaume Collet-Serra (director); Byron Willinger... \n.. ... \n263 Etan Cohen (director/screenplay); Will Ferrell... \n264 Adam McKay (director/screenplay); Christian Ba... \n265 Mimi Leder (director); Daniel Stiepleman (scre... \n266 Karyn Kusama (director); Phil Hay, Matt Manfre... \n267 David Slade (director); Charlie Brooker (scree... \n\n genres \n0 Mystery Horror Thriller \n1 Thriller Drama \n2 Action Thriller \n3 Drama History Western \n4 Action Thriller \n.. ... \n263 Mystery Adventure Comedy Crime \n264 Thriller Science Fiction Action Adventure \n265 Drama History \n266 Thriller Crime Drama Action \n267 Science Fiction Mystery Drama Thriller TV Movie \n\n[268 rows x 3 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TitleCast and crewgenres
0Insidious: The Last KeyAdam Robitel (director); Leigh Whannell (scree...Mystery Horror Thriller
1The Strange OnesLauren Wolkstein (director); Christopher Radcl...Thriller Drama
2StrattonSimon West (director); Duncan Falconer, Warren...Action Thriller
3Sweet CountryWarwick Thornton (director); David Tranter, St...Drama History Western
4The CommuterJaume Collet-Serra (director); Byron Willinger...Action Thriller
............
263Holmes & WatsonEtan Cohen (director/screenplay); Will Ferrell...Mystery Adventure Comedy Crime
264ViceAdam McKay (director/screenplay); Christian Ba...Thriller Science Fiction Action Adventure
265On the Basis of SexMimi Leder (director); Daniel Stiepleman (scre...Drama History
266DestroyerKaryn Kusama (director); Phil Hay, Matt Manfre...Thriller Crime Drama Action
267Black Mirror: BandersnatchDavid Slade (director); Charlie Brooker (scree...Science Fiction Mystery Drama Thriller TV Movie
\n

268 rows × 3 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_director(x):\n if \" (director)\" in x:\n return x.split(\" (director)\")[0]\n elif \" (directors)\" in x:\n return x.split(\" (directors)\")[0]\n else:\n return x.split(\" (director/screenplay)\")[0]","execution_count":13,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2018['director_name'] = df_2018['Cast and crew'].map(lambda x: get_director(x))","execution_count":14,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor1(x):\n return ((x.split(\"screenplay); \")[-1]).split(\", \")[0])","execution_count":15,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2018['actor_1_name'] = df_2018['Cast and crew'].map(lambda x: get_actor1(x))","execution_count":16,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor2(x):\n if len((x.split(\"screenplay); \")[-1]).split(\", \")) < 2:\n return np.NaN\n else:\n return ((x.split(\"screenplay); \")[-1]).split(\", 
\")[1])","execution_count":17,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2018['actor_2_name'] = df_2018['Cast and crew'].map(lambda x: get_actor2(x))","execution_count":18,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor3(x):\n if len((x.split(\"screenplay); \")[-1]).split(\", \")) < 3:\n return np.NaN\n else:\n return ((x.split(\"screenplay); \")[-1]).split(\", \")[2])","execution_count":19,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2018['actor_3_name'] = df_2018['Cast and crew'].map(lambda x: get_actor3(x))","execution_count":20,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2018","execution_count":21,"outputs":[{"output_type":"execute_result","execution_count":21,"data":{"text/plain":" Title \\\n0 Insidious: The Last Key \n1 The Strange Ones \n2 Stratton \n3 Sweet Country \n4 The Commuter \n.. ... \n263 Holmes & Watson \n264 Vice \n265 On the Basis of Sex \n266 Destroyer \n267 Black Mirror: Bandersnatch \n\n Cast and crew \\\n0 Adam Robitel (director); Leigh Whannell (scree... \n1 Lauren Wolkstein (director); Christopher Radcl... \n2 Simon West (director); Duncan Falconer, Warren... \n3 Warwick Thornton (director); David Tranter, St... \n4 Jaume Collet-Serra (director); Byron Willinger... \n.. ... \n263 Etan Cohen (director/screenplay); Will Ferrell... \n264 Adam McKay (director/screenplay); Christian Ba... \n265 Mimi Leder (director); Daniel Stiepleman (scre... 
\n266 Karyn Kusama (director); Phil Hay, Matt Manfre... \n267 David Slade (director); Charlie Brooker (scree... \n\n genres director_name \\\n0 Mystery Horror Thriller Adam Robitel \n1 Thriller Drama Lauren Wolkstein \n2 Action Thriller Simon West \n3 Drama History Western Warwick Thornton \n4 Action Thriller Jaume Collet-Serra \n.. ... ... \n263 Mystery Adventure Comedy Crime Etan Cohen \n264 Thriller Science Fiction Action Adventure Adam McKay \n265 Drama History Mimi Leder \n266 Thriller Crime Drama Action Karyn Kusama \n267 Science Fiction Mystery Drama Thriller TV Movie David Slade \n\n actor_1_name actor_2_name actor_3_name \n0 Lin Shaye Angus Sampson Leigh Whannell \n1 Alex Pettyfer James Freedson-Jackson Emily Althaus \n2 Dominic Cooper Austin Stowell Gemma Chan \n3 Bryan Brown Sam Neill NaN \n4 Liam Neeson Vera Farmiga Patrick Wilson \n.. ... ... ... \n263 Will Ferrell John C. Reilly Rebecca Hall \n264 Christian Bale Amy Adams Steve Carell \n265 Felicity Jones Armie Hammer Justin Theroux \n266 Nicole Kidman Sebastian Stan Toby Kebbell \n267 Fionn Whitehead Will Poulter Asim Chaudhry \n\n[268 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TitleCast and crewgenresdirector_nameactor_1_nameactor_2_nameactor_3_name
0Insidious: The Last KeyAdam Robitel (director); Leigh Whannell (scree...Mystery Horror ThrillerAdam RobitelLin ShayeAngus SampsonLeigh Whannell
1The Strange OnesLauren Wolkstein (director); Christopher Radcl...Thriller DramaLauren WolksteinAlex PettyferJames Freedson-JacksonEmily Althaus
2StrattonSimon West (director); Duncan Falconer, Warren...Action ThrillerSimon WestDominic CooperAustin StowellGemma Chan
3Sweet CountryWarwick Thornton (director); David Tranter, St...Drama History WesternWarwick ThorntonBryan BrownSam NeillNaN
4The CommuterJaume Collet-Serra (director); Byron Willinger...Action ThrillerJaume Collet-SerraLiam NeesonVera FarmigaPatrick Wilson
........................
263Holmes & WatsonEtan Cohen (director/screenplay); Will Ferrell...Mystery Adventure Comedy CrimeEtan CohenWill FerrellJohn C. ReillyRebecca Hall
264ViceAdam McKay (director/screenplay); Christian Ba...Thriller Science Fiction Action AdventureAdam McKayChristian BaleAmy AdamsSteve Carell
265On the Basis of SexMimi Leder (director); Daniel Stiepleman (scre...Drama HistoryMimi LederFelicity JonesArmie HammerJustin Theroux
266DestroyerKaryn Kusama (director); Phil Hay, Matt Manfre...Thriller Crime Drama ActionKaryn KusamaNicole KidmanSebastian StanToby Kebbell
267Black Mirror: BandersnatchDavid Slade (director); Charlie Brooker (scree...Science Fiction Mystery Drama Thriller TV MovieDavid SladeFionn WhiteheadWill PoulterAsim Chaudhry
\n

268 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"\ndf_2018 = df_2018.rename(columns={'Title':'movie_title'})","execution_count":22,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df18 = df_2018.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]","execution_count":23,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df18","execution_count":24,"outputs":[{"output_type":"execute_result","execution_count":24,"data":{"text/plain":" director_name actor_1_name actor_2_name \\\n0 Adam Robitel Lin Shaye Angus Sampson \n1 Lauren Wolkstein Alex Pettyfer James Freedson-Jackson \n2 Simon West Dominic Cooper Austin Stowell \n3 Warwick Thornton Bryan Brown Sam Neill \n4 Jaume Collet-Serra Liam Neeson Vera Farmiga \n.. ... ... ... \n263 Etan Cohen Will Ferrell John C. Reilly \n264 Adam McKay Christian Bale Amy Adams \n265 Mimi Leder Felicity Jones Armie Hammer \n266 Karyn Kusama Nicole Kidman Sebastian Stan \n267 David Slade Fionn Whitehead Will Poulter \n\n actor_3_name genres \\\n0 Leigh Whannell Mystery Horror Thriller \n1 Emily Althaus Thriller Drama \n2 Gemma Chan Action Thriller \n3 NaN Drama History Western \n4 Patrick Wilson Action Thriller \n.. ... ... \n263 Rebecca Hall Mystery Adventure Comedy Crime \n264 Steve Carell Thriller Science Fiction Action Adventure \n265 Justin Theroux Drama History \n266 Toby Kebbell Thriller Crime Drama Action \n267 Asim Chaudhry Science Fiction Mystery Drama Thriller TV Movie \n\n movie_title \n0 Insidious: The Last Key \n1 The Strange Ones \n2 Stratton \n3 Sweet Country \n4 The Commuter \n.. ... \n263 Holmes & Watson \n264 Vice \n265 On the Basis of Sex \n266 Destroyer \n267 Black Mirror: Bandersnatch \n\n[268 rows x 6 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_title
0Adam RobitelLin ShayeAngus SampsonLeigh WhannellMystery Horror ThrillerInsidious: The Last Key
1Lauren WolksteinAlex PettyferJames Freedson-JacksonEmily AlthausThriller DramaThe Strange Ones
2Simon WestDominic CooperAustin StowellGemma ChanAction ThrillerStratton
3Warwick ThorntonBryan BrownSam NeillNaNDrama History WesternSweet Country
4Jaume Collet-SerraLiam NeesonVera FarmigaPatrick WilsonAction ThrillerThe Commuter
.....................
263Etan CohenWill FerrellJohn C. ReillyRebecca HallMystery Adventure Comedy CrimeHolmes & Watson
264Adam McKayChristian BaleAmy AdamsSteve CarellThriller Science Fiction Action AdventureVice
265Mimi LederFelicity JonesArmie HammerJustin TherouxDrama HistoryOn the Basis of Sex
266Karyn KusamaNicole KidmanSebastian StanToby KebbellThriller Crime Drama ActionDestroyer
267David SladeFionn WhiteheadWill PoulterAsim ChaudhryScience Fiction Mystery Drama Thriller TV MovieBlack Mirror: Bandersnatch
\n

268 rows × 6 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df18['actor_2_name'] = new_df18['actor_2_name'].replace(np.nan, 'unknown')\nnew_df18['actor_3_name'] = new_df18['actor_3_name'].replace(np.nan, 'unknown')","execution_count":25,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df18['movie_title'] = new_df18['movie_title'].str.lower()","execution_count":26,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df18['comb'] = new_df18['actor_1_name'] + ' ' + new_df18['actor_2_name'] + ' '+ new_df18['actor_3_name'] + ' '+ new_df18['director_name'] +' ' + new_df18['genres']","execution_count":27,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df18","execution_count":28,"outputs":[{"output_type":"execute_result","execution_count":28,"data":{"text/plain":" director_name actor_1_name actor_2_name \\\n0 Adam Robitel Lin Shaye Angus Sampson \n1 Lauren Wolkstein Alex Pettyfer James Freedson-Jackson \n2 Simon West Dominic Cooper Austin Stowell \n3 Warwick Thornton Bryan Brown Sam Neill \n4 Jaume Collet-Serra Liam Neeson Vera Farmiga \n.. ... ... ... \n263 Etan Cohen Will Ferrell John C. Reilly \n264 Adam McKay Christian Bale Amy Adams \n265 Mimi Leder Felicity Jones Armie Hammer \n266 Karyn Kusama Nicole Kidman Sebastian Stan \n267 David Slade Fionn Whitehead Will Poulter \n\n actor_3_name genres \\\n0 Leigh Whannell Mystery Horror Thriller \n1 Emily Althaus Thriller Drama \n2 Gemma Chan Action Thriller \n3 unknown Drama History Western \n4 Patrick Wilson Action Thriller \n.. ... ... \n263 Rebecca Hall Mystery Adventure Comedy Crime \n264 Steve Carell Thriller Science Fiction Action Adventure \n265 Justin Theroux Drama History \n266 Toby Kebbell Thriller Crime Drama Action \n267 Asim Chaudhry Science Fiction Mystery Drama Thriller TV Movie \n\n movie_title \\\n0 insidious: the last key \n1 the strange ones \n2 stratton \n3 sweet country \n4 the commuter \n.. ... 
\n263 holmes & watson \n264 vice \n265 on the basis of sex \n266 destroyer \n267 black mirror: bandersnatch \n\n comb \n0 Lin Shaye Angus Sampson Leigh Whannell Adam Ro... \n1 Alex Pettyfer James Freedson-Jackson Emily Alt... \n2 Dominic Cooper Austin Stowell Gemma Chan Simon... \n3 Bryan Brown Sam Neill unknown Warwick Thornton... \n4 Liam Neeson Vera Farmiga Patrick Wilson Jaume ... \n.. ... \n263 Will Ferrell John C. Reilly Rebecca Hall Etan ... \n264 Christian Bale Amy Adams Steve Carell Adam McK... \n265 Felicity Jones Armie Hammer Justin Theroux Mim... \n266 Nicole Kidman Sebastian Stan Toby Kebbell Kary... \n267 Fionn Whitehead Will Poulter Asim Chaudhry Dav... \n\n[268 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0Adam RobitelLin ShayeAngus SampsonLeigh WhannellMystery Horror Thrillerinsidious: the last keyLin Shaye Angus Sampson Leigh Whannell Adam Ro...
1Lauren WolksteinAlex PettyferJames Freedson-JacksonEmily AlthausThriller Dramathe strange onesAlex Pettyfer James Freedson-Jackson Emily Alt...
2Simon WestDominic CooperAustin StowellGemma ChanAction ThrillerstrattonDominic Cooper Austin Stowell Gemma Chan Simon...
3Warwick ThorntonBryan BrownSam NeillunknownDrama History Westernsweet countryBryan Brown Sam Neill unknown Warwick Thornton...
4Jaume Collet-SerraLiam NeesonVera FarmigaPatrick WilsonAction Thrillerthe commuterLiam Neeson Vera Farmiga Patrick Wilson Jaume ...
........................
263Etan CohenWill FerrellJohn C. ReillyRebecca HallMystery Adventure Comedy Crimeholmes & watsonWill Ferrell John C. Reilly Rebecca Hall Etan ...
264Adam McKayChristian BaleAmy AdamsSteve CarellThriller Science Fiction Action AdventureviceChristian Bale Amy Adams Steve Carell Adam McK...
265Mimi LederFelicity JonesArmie HammerJustin TherouxDrama Historyon the basis of sexFelicity Jones Armie Hammer Justin Theroux Mim...
266Karyn KusamaNicole KidmanSebastian StanToby KebbellThriller Crime Drama ActiondestroyerNicole Kidman Sebastian Stan Toby Kebbell Kary...
267David SladeFionn WhiteheadWill PoulterAsim ChaudhryScience Fiction Mystery Drama Thriller TV Movieblack mirror: bandersnatchFionn Whitehead Will Poulter Asim Chaudhry Dav...
\n

268 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{},"cell_type":"markdown","source":"## Extracting features of 2019 movies from Wikipedia"},{"metadata":{"trusted":true},"cell_type":"code","source":"link = \"https://en.wikipedia.org/wiki/List_of_American_films_of_2019\"\ndf1 = pd.read_html(link, header=0)[3]\ndf2 = pd.read_html(link, header=0)[4]\ndf3 = pd.read_html(link, header=0)[5]\ndf4 = pd.read_html(link, header=0)[6]","execution_count":29,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)","execution_count":30,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df","execution_count":31,"outputs":[{"output_type":"execute_result","execution_count":31,"data":{"text/plain":" Opening Opening.1 Title \\\n0 JANUARY 4 Escape Room \n1 JANUARY 4 Rust Creek \n2 JANUARY 4 American Hangman \n3 JANUARY 11 A Dog's Way Home \n4 JANUARY 11 The Upside \n.. ... ... ... \n236 DECEMBER 25 Little Women \n237 DECEMBER 25 1917 \n238 DECEMBER 25 Just Mercy \n239 DECEMBER 27 Clemency \n240 DECEMBER 27 Apparition \n\n Production company \\\n0 Columbia Pictures \n1 IFC Films \n2 Hangman Justice Productions \n3 Columbia Pictures \n4 STX Entertainment \n.. ... \n236 Columbia Pictures / Regency Enterprises \n237 Universal Pictures / DreamWorks Pictures / Ent... \n238 Warner Bros. Pictures / Participant Media \n239 Neon \n240 Vertical Pictures \n\n Cast and crew Ref. \n0 Adam Robitel (director); Bragi F. Schut, Maria... [2] \n1 Jen McGowan (director); Julie Lipson (screenpl... [3] \n2 Wilson Coneybeare (director/screenplay); Donal... [4] \n3 Charles Martin Smith (director); W. Bruce Came... [5] \n4 Neil Burger (director); Jon Hartmere (screenpl... [6] \n.. ... ... \n236 Greta Gerwig (director/screenplay); Saoirse Ro... [222] \n237 Sam Mendes (director/screenplay); Krysty Wilso... [223] \n238 Destin Daniel Cretton (director/screenplay), A... 
[224] \n239 Chinonye Chukwu (director/screenplay); Alfre W... [225] \n240 Waymon Boone (director/screenplay); Mena Suvar... [226] \n\n[241 rows x 6 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
OpeningOpening.1TitleProduction companyCast and crewRef.
0JANUARY4Escape RoomColumbia PicturesAdam Robitel (director); Bragi F. Schut, Maria...[2]
1JANUARY4Rust CreekIFC FilmsJen McGowan (director); Julie Lipson (screenpl...[3]
2JANUARY4American HangmanHangman Justice ProductionsWilson Coneybeare (director/screenplay); Donal...[4]
3JANUARY11A Dog's Way HomeColumbia PicturesCharles Martin Smith (director); W. Bruce Came...[5]
4JANUARY11The UpsideSTX EntertainmentNeil Burger (director); Jon Hartmere (screenpl...[6]
.....................
236DECEMBER25Little WomenColumbia Pictures / Regency EnterprisesGreta Gerwig (director/screenplay); Saoirse Ro...[222]
237DECEMBER251917Universal Pictures / DreamWorks Pictures / Ent...Sam Mendes (director/screenplay); Krysty Wilso...[223]
238DECEMBER25Just MercyWarner Bros. Pictures / Participant MediaDestin Daniel Cretton (director/screenplay), A...[224]
239DECEMBER27ClemencyNeonChinonye Chukwu (director/screenplay); Alfre W...[225]
240DECEMBER27ApparitionVertical PicturesWaymon Boone (director/screenplay); Mena Suvar...[226]
\n

241 rows × 6 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))","execution_count":32,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2019 = df[['Title','Cast and crew','genres']]","execution_count":33,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2019","execution_count":34,"outputs":[{"output_type":"execute_result","execution_count":34,"data":{"text/plain":" Title Cast and crew \\\n0 Escape Room Adam Robitel (director); Bragi F. Schut, Maria... \n1 Rust Creek Jen McGowan (director); Julie Lipson (screenpl... \n2 American Hangman Wilson Coneybeare (director/screenplay); Donal... \n3 A Dog's Way Home Charles Martin Smith (director); W. Bruce Came... \n4 The Upside Neil Burger (director); Jon Hartmere (screenpl... \n.. ... ... \n236 Little Women Greta Gerwig (director/screenplay); Saoirse Ro... \n237 1917 Sam Mendes (director/screenplay); Krysty Wilso... \n238 Just Mercy Destin Daniel Cretton (director/screenplay), A... \n239 Clemency Chinonye Chukwu (director/screenplay); Alfre W... \n240 Apparition Waymon Boone (director/screenplay); Mena Suvar... \n\n genres \n0 Thriller Action Mystery Horror \n1 Thriller Drama \n2 Thriller \n3 Drama Adventure Family \n4 Comedy Drama \n.. ... \n236 Drama Romance \n237 War Drama Action History \n238 Drama Crime \n239 Drama \n240 Horror Thriller \n\n[241 rows x 3 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TitleCast and crewgenres
0Escape RoomAdam Robitel (director); Bragi F. Schut, Maria...Thriller Action Mystery Horror
1Rust CreekJen McGowan (director); Julie Lipson (screenpl...Thriller Drama
2American HangmanWilson Coneybeare (director/screenplay); Donal...Thriller
3A Dog's Way HomeCharles Martin Smith (director); W. Bruce Came...Drama Adventure Family
4The UpsideNeil Burger (director); Jon Hartmere (screenpl...Comedy Drama
............
236Little WomenGreta Gerwig (director/screenplay); Saoirse Ro...Drama Romance
2371917Sam Mendes (director/screenplay); Krysty Wilso...War Drama Action History
238Just MercyDestin Daniel Cretton (director/screenplay), A...Drama Crime
239ClemencyChinonye Chukwu (director/screenplay); Alfre W...Drama
240ApparitionWaymon Boone (director/screenplay); Mena Suvar...Horror Thriller
\n

241 rows × 3 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2019['director_name'] = df_2019['Cast and crew'].map(lambda x: get_director(str(x)))","execution_count":35,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2019['actor_1_name'] = df_2019['Cast and crew'].map(lambda x: get_actor1(x))","execution_count":36,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2019['actor_2_name'] = df_2019['Cast and crew'].map(lambda x: get_actor2(x))","execution_count":37,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython 
kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2019['actor_3_name'] = df_2019['Cast and crew'].map(lambda x: get_actor3(x))","execution_count":38,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2019 = df_2019.rename(columns={'Title':'movie_title'})","execution_count":39,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df19 = df_2019.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]","execution_count":40,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"\nnew_df19['actor_2_name'] = new_df19['actor_2_name'].replace(np.nan, 'unknown')\nnew_df19['actor_3_name'] = new_df19['actor_3_name'].replace(np.nan, 'unknown')","execution_count":41,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df19['movie_title'] = new_df19['movie_title'].str.lower()","execution_count":42,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df19['comb'] = new_df19['actor_1_name'] + ' ' + new_df19['actor_2_name'] + ' '+ new_df19['actor_3_name'] + ' '+ new_df19['director_name'] +' ' + new_df19['genres']","execution_count":43,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df19","execution_count":44,"outputs":[{"output_type":"execute_result","execution_count":44,"data":{"text/plain":" director_name actor_1_name actor_2_name \\\n0 Adam Robitel Taylor Russell Logan Miller \n1 Jen McGowan Hermione Corfield Jay Paulson \n2 Wilson Coneybeare Donald Sutherland Vincent Kartheiser \n3 Charles Martin Smith Bryce Dallas Howard Edward James Olmos \n4 Neil Burger Bryan Cranston Kevin Hart \n.. ... ... ... \n236 Greta Gerwig Saoirse Ronan Emma Watson \n237 Sam Mendes George MacKay Dean-Charles Chapman \n238 Destin Daniel Cretton Michael B. 
Jordan Jamie Foxx \n239 Chinonye Chukwu Alfre Woodard Wendell Pierce \n240 Waymon Boone Mena Suvari Kevin Pollak \n\n actor_3_name genres movie_title \\\n0 Deborah Ann Woll Thriller Action Mystery Horror escape room \n1 Sean O'Bryan Thriller Drama rust creek \n2 Oliver Dennis Thriller american hangman \n3 Alexandra Shipp Drama Adventure Family a dog's way home \n4 Nicole Kidman Comedy Drama the upside \n.. ... ... ... \n236 Florence Pugh Drama Romance little women \n237 Mark Strong War Drama Action History 1917 \n238 Brie Larson Drama Crime just mercy \n239 Aldis Hodge Drama clemency \n240 unknown Horror Thriller apparition \n\n comb \n0 Taylor Russell Logan Miller Deborah Ann Woll A... \n1 Hermione Corfield Jay Paulson Sean O'Bryan Jen... \n2 Donald Sutherland Vincent Kartheiser Oliver De... \n3 Bryce Dallas Howard Edward James Olmos Alexand... \n4 Bryan Cranston Kevin Hart Nicole Kidman Neil B... \n.. ... \n236 Saoirse Ronan Emma Watson Florence Pugh Greta ... \n237 George MacKay Dean-Charles Chapman Mark Strong... \n238 Michael B. Jordan Jamie Foxx Brie Larson Desti... \n239 Alfre Woodard Wendell Pierce Aldis Hodge Chino... \n240 Mena Suvari Kevin Pollak unknown Waymon Boone ... \n\n[241 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0Adam RobitelTaylor RussellLogan MillerDeborah Ann WollThriller Action Mystery Horrorescape roomTaylor Russell Logan Miller Deborah Ann Woll A...
1Jen McGowanHermione CorfieldJay PaulsonSean O'BryanThriller Dramarust creekHermione Corfield Jay Paulson Sean O'Bryan Jen...
2Wilson ConeybeareDonald SutherlandVincent KartheiserOliver DennisThrilleramerican hangmanDonald Sutherland Vincent Kartheiser Oliver De...
3Charles Martin SmithBryce Dallas HowardEdward James OlmosAlexandra ShippDrama Adventure Familya dog's way homeBryce Dallas Howard Edward James Olmos Alexand...
4Neil BurgerBryan CranstonKevin HartNicole KidmanComedy Dramathe upsideBryan Cranston Kevin Hart Nicole Kidman Neil B...
........................
236Greta GerwigSaoirse RonanEmma WatsonFlorence PughDrama Romancelittle womenSaoirse Ronan Emma Watson Florence Pugh Greta ...
237Sam MendesGeorge MacKayDean-Charles ChapmanMark StrongWar Drama Action History1917George MacKay Dean-Charles Chapman Mark Strong...
238Destin Daniel CrettonMichael B. JordanJamie FoxxBrie LarsonDrama Crimejust mercyMichael B. Jordan Jamie Foxx Brie Larson Desti...
239Chinonye ChukwuAlfre WoodardWendell PierceAldis HodgeDramaclemencyAlfre Woodard Wendell Pierce Aldis Hodge Chino...
240Waymon BooneMena SuvariKevin PollakunknownHorror ThrillerapparitionMena Suvari Kevin Pollak unknown Waymon Boone ...
\n

241 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"my_df = new_df18.append(new_df19,ignore_index=True)","execution_count":45,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"my_df","execution_count":46,"outputs":[{"output_type":"execute_result","execution_count":46,"data":{"text/plain":" director_name actor_1_name actor_2_name \\\n0 Adam Robitel Lin Shaye Angus Sampson \n1 Lauren Wolkstein Alex Pettyfer James Freedson-Jackson \n2 Simon West Dominic Cooper Austin Stowell \n3 Warwick Thornton Bryan Brown Sam Neill \n4 Jaume Collet-Serra Liam Neeson Vera Farmiga \n.. ... ... ... \n504 Greta Gerwig Saoirse Ronan Emma Watson \n505 Sam Mendes George MacKay Dean-Charles Chapman \n506 Destin Daniel Cretton Michael B. Jordan Jamie Foxx \n507 Chinonye Chukwu Alfre Woodard Wendell Pierce \n508 Waymon Boone Mena Suvari Kevin Pollak \n\n actor_3_name genres movie_title \\\n0 Leigh Whannell Mystery Horror Thriller insidious: the last key \n1 Emily Althaus Thriller Drama the strange ones \n2 Gemma Chan Action Thriller stratton \n3 unknown Drama History Western sweet country \n4 Patrick Wilson Action Thriller the commuter \n.. ... ... ... \n504 Florence Pugh Drama Romance little women \n505 Mark Strong War Drama Action History 1917 \n506 Brie Larson Drama Crime just mercy \n507 Aldis Hodge Drama clemency \n508 unknown Horror Thriller apparition \n\n comb \n0 Lin Shaye Angus Sampson Leigh Whannell Adam Ro... \n1 Alex Pettyfer James Freedson-Jackson Emily Alt... \n2 Dominic Cooper Austin Stowell Gemma Chan Simon... \n3 Bryan Brown Sam Neill unknown Warwick Thornton... \n4 Liam Neeson Vera Farmiga Patrick Wilson Jaume ... \n.. ... \n504 Saoirse Ronan Emma Watson Florence Pugh Greta ... \n505 George MacKay Dean-Charles Chapman Mark Strong... \n506 Michael B. Jordan Jamie Foxx Brie Larson Desti... \n507 Alfre Woodard Wendell Pierce Aldis Hodge Chino... \n508 Mena Suvari Kevin Pollak unknown Waymon Boone ... 
\n\n[509 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0Adam RobitelLin ShayeAngus SampsonLeigh WhannellMystery Horror Thrillerinsidious: the last keyLin Shaye Angus Sampson Leigh Whannell Adam Ro...
1Lauren WolksteinAlex PettyferJames Freedson-JacksonEmily AlthausThriller Dramathe strange onesAlex Pettyfer James Freedson-Jackson Emily Alt...
2Simon WestDominic CooperAustin StowellGemma ChanAction ThrillerstrattonDominic Cooper Austin Stowell Gemma Chan Simon...
3Warwick ThorntonBryan BrownSam NeillunknownDrama History Westernsweet countryBryan Brown Sam Neill unknown Warwick Thornton...
4Jaume Collet-SerraLiam NeesonVera FarmigaPatrick WilsonAction Thrillerthe commuterLiam Neeson Vera Farmiga Patrick Wilson Jaume ...
........................
504Greta GerwigSaoirse RonanEmma WatsonFlorence PughDrama Romancelittle womenSaoirse Ronan Emma Watson Florence Pugh Greta ...
505Sam MendesGeorge MacKayDean-Charles ChapmanMark StrongWar Drama Action History1917George MacKay Dean-Charles Chapman Mark Strong...
506Destin Daniel CrettonMichael B. JordanJamie FoxxBrie LarsonDrama Crimejust mercyMichael B. Jordan Jamie Foxx Brie Larson Desti...
507Chinonye ChukwuAlfre WoodardWendell PierceAldis HodgeDramaclemencyAlfre Woodard Wendell Pierce Aldis Hodge Chino...
508Waymon BooneMena SuvariKevin PollakunknownHorror ThrillerapparitionMena Suvari Kevin Pollak unknown Waymon Boone ...
\n

509 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"old_df = pd.read_csv('../input/movie.csv')","execution_count":51,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"old_df","execution_count":52,"outputs":[{"output_type":"execute_result","execution_count":52,"data":{"text/plain":" director_name actor_1_name actor_2_name actor_3_name \\\n0 John Lasseter Tom Hanks Tim Allen Don Rickles \n1 Joe Johnston Robin Williams Jonathan Hyde Kirsten Dunst \n2 Howard Deutch Walter Matthau Jack Lemmon Ann-Margret \n3 Forest Whitaker Whitney Houston Angela Bassett Loretta Devine \n4 Charles Shyer Steve Martin Diane Keaton Martin Short \n... ... ... ... ... \n36336 Ben Rock Monty Bane Lucy Butler David Grammer \n36337 Aaron Osborne Lisa Boyle Kena Land Zaneta Polard \n36338 John Irvin Patrick Bergin Uma Thurman David Morrissey \n36339 Lav Diaz Angel Aquino Perry Dizon Hazel Orencio \n36340 Mark L. Lester Erika Eleniak Adam Baldwin Julie du Page \n\n genres movie_title \\\n0 Animation Comedy Family toy story \n1 Adventure Fantasy Family jumanji \n2 Romance Comedy grumpier old men \n3 Comedy Drama Romance waiting to exhale \n4 Comedy father of the bride part ii \n... ... ... \n36336 Horror the burkittsville 7 \n36337 Sci-Fi caged heat 3000 \n36338 Drama Action Romance robin hood \n36339 Drama century of birthing \n36340 Action Drama Thriller betrayal \n\n comb \n0 Tom Hanks Tim Allen Don Rickles John Lasseter ... \n1 Robin Williams Jonathan Hyde Kirsten Dunst Joe... \n2 Walter Matthau Jack Lemmon Ann-Margret Howard ... \n3 Whitney Houston Angela Bassett Loretta Devine ... \n4 Steve Martin Diane Keaton Martin Short Charles... \n... ... \n36336 Monty Bane Lucy Butler David Grammer Ben Rock ... \n36337 Lisa Boyle Kena Land Zaneta Polard Aaron Osbor... \n36338 Patrick Bergin Uma Thurman David Morrissey Joh... \n36339 Angel Aquino Perry Dizon Hazel Orencio Lav Dia... \n36340 Erika Eleniak Adam Baldwin Julie du Page Mark ... 
\n\n[36341 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0John LasseterTom HanksTim AllenDon RicklesAnimation Comedy Familytoy storyTom Hanks Tim Allen Don Rickles John Lasseter ...
1Joe JohnstonRobin WilliamsJonathan HydeKirsten DunstAdventure Fantasy FamilyjumanjiRobin Williams Jonathan Hyde Kirsten Dunst Joe...
2Howard DeutchWalter MatthauJack LemmonAnn-MargretRomance Comedygrumpier old menWalter Matthau Jack Lemmon Ann-Margret Howard ...
3Forest WhitakerWhitney HoustonAngela BassettLoretta DevineComedy Drama Romancewaiting to exhaleWhitney Houston Angela Bassett Loretta Devine ...
4Charles ShyerSteve MartinDiane KeatonMartin ShortComedyfather of the bride part iiSteve Martin Diane Keaton Martin Short Charles...
........................
36336Ben RockMonty BaneLucy ButlerDavid GrammerHorrorthe burkittsville 7Monty Bane Lucy Butler David Grammer Ben Rock ...
36337Aaron OsborneLisa BoyleKena LandZaneta PolardSci-Ficaged heat 3000Lisa Boyle Kena Land Zaneta Polard Aaron Osbor...
36338John IrvinPatrick BerginUma ThurmanDavid MorrisseyDrama Action Romancerobin hoodPatrick Bergin Uma Thurman David Morrissey Joh...
36339Lav DiazAngel AquinoPerry DizonHazel OrencioDramacentury of birthingAngel Aquino Perry Dizon Hazel Orencio Lav Dia...
36340Mark L. LesterErika EleniakAdam BaldwinJulie du PageAction Drama ThrillerbetrayalErika Eleniak Adam Baldwin Julie du Page Mark ...
\n

36341 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"final_df = old_df.append(my_df,ignore_index=True)","execution_count":53,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"final_df","execution_count":54,"outputs":[{"output_type":"execute_result","execution_count":54,"data":{"text/plain":" director_name actor_1_name actor_2_name \\\n0 John Lasseter Tom Hanks Tim Allen \n1 Joe Johnston Robin Williams Jonathan Hyde \n2 Howard Deutch Walter Matthau Jack Lemmon \n3 Forest Whitaker Whitney Houston Angela Bassett \n4 Charles Shyer Steve Martin Diane Keaton \n... ... ... ... \n36845 Greta Gerwig Saoirse Ronan Emma Watson \n36846 Sam Mendes George MacKay Dean-Charles Chapman \n36847 Destin Daniel Cretton Michael B. Jordan Jamie Foxx \n36848 Chinonye Chukwu Alfre Woodard Wendell Pierce \n36849 Waymon Boone Mena Suvari Kevin Pollak \n\n actor_3_name genres movie_title \\\n0 Don Rickles Animation Comedy Family toy story \n1 Kirsten Dunst Adventure Fantasy Family jumanji \n2 Ann-Margret Romance Comedy grumpier old men \n3 Loretta Devine Comedy Drama Romance waiting to exhale \n4 Martin Short Comedy father of the bride part ii \n... ... ... ... \n36845 Florence Pugh Drama Romance little women \n36846 Mark Strong War Drama Action History 1917 \n36847 Brie Larson Drama Crime just mercy \n36848 Aldis Hodge Drama clemency \n36849 unknown Horror Thriller apparition \n\n comb \n0 Tom Hanks Tim Allen Don Rickles John Lasseter ... \n1 Robin Williams Jonathan Hyde Kirsten Dunst Joe... \n2 Walter Matthau Jack Lemmon Ann-Margret Howard ... \n3 Whitney Houston Angela Bassett Loretta Devine ... \n4 Steve Martin Diane Keaton Martin Short Charles... \n... ... \n36845 Saoirse Ronan Emma Watson Florence Pugh Greta ... \n36846 George MacKay Dean-Charles Chapman Mark Strong... \n36847 Michael B. Jordan Jamie Foxx Brie Larson Desti... \n36848 Alfre Woodard Wendell Pierce Aldis Hodge Chino... \n36849 Mena Suvari Kevin Pollak unknown Waymon Boone ... 
\n\n[36850 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0John LasseterTom HanksTim AllenDon RicklesAnimation Comedy Familytoy storyTom Hanks Tim Allen Don Rickles John Lasseter ...
1Joe JohnstonRobin WilliamsJonathan HydeKirsten DunstAdventure Fantasy FamilyjumanjiRobin Williams Jonathan Hyde Kirsten Dunst Joe...
2Howard DeutchWalter MatthauJack LemmonAnn-MargretRomance Comedygrumpier old menWalter Matthau Jack Lemmon Ann-Margret Howard ...
3Forest WhitakerWhitney HoustonAngela BassettLoretta DevineComedy Drama Romancewaiting to exhaleWhitney Houston Angela Bassett Loretta Devine ...
4Charles ShyerSteve MartinDiane KeatonMartin ShortComedyfather of the bride part iiSteve Martin Diane Keaton Martin Short Charles...
........................
36845Greta GerwigSaoirse RonanEmma WatsonFlorence PughDrama Romancelittle womenSaoirse Ronan Emma Watson Florence Pugh Greta ...
36846Sam MendesGeorge MacKayDean-Charles ChapmanMark StrongWar Drama Action History1917George MacKay Dean-Charles Chapman Mark Strong...
36847Destin Daniel CrettonMichael B. JordanJamie FoxxBrie LarsonDrama Crimejust mercyMichael B. Jordan Jamie Foxx Brie Larson Desti...
36848Chinonye ChukwuAlfre WoodardWendell PierceAldis HodgeDramaclemencyAlfre Woodard Wendell Pierce Aldis Hodge Chino...
36849Waymon BooneMena SuvariKevin PollakunknownHorror ThrillerapparitionMena Suvari Kevin Pollak unknown Waymon Boone ...
\n

36850 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"final_df.isna().sum()","execution_count":55,"outputs":[{"output_type":"execute_result","execution_count":55,"data":{"text/plain":"director_name 0\nactor_1_name 0\nactor_2_name 0\nactor_3_name 0\ngenres 4\nmovie_title 0\ncomb 4\ndtype: int64"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"final_df = final_df.dropna(how='any')","execution_count":56,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"final_df.to_csv('final_data.csv',index=False)","execution_count":57,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /.ipynb_checkpoints/preprocessing 4.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nimport requests\nimport bs4 as bs\nimport urllib.request","execution_count":1,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Extracting features of 2020 movies from Wikipedia"},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"link = \"https://en.wikipedia.org/wiki/List_of_American_films_of_2020\"","execution_count":2,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"source = urllib.request.urlopen(link).read()\nsoup = 
bs.BeautifulSoup(source,'lxml')","execution_count":3,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"tables = soup.find_all('table',class_='wikitable sortable')","execution_count":4,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df1 = pd.read_html(str(tables[0]))[0]\ndf2 = pd.read_html(str(tables[1]))[0]\ndf3 = pd.read_html(str(tables[2]))[0]\ndf4 = pd.read_html(str(tables[3]).replace(\"'1\\\"\\'\",'\"1\"'))[0] # avoided \"ValueError: invalid literal for int() with base 10: '1\"'","execution_count":5,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)","execution_count":6,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df","execution_count":7,"outputs":[{"output_type":"execute_result","execution_count":7,"data":{"text/plain":" Opening Opening.1 Title \\\n0 JANUARY 3.0 The Grudge \n1 JANUARY 10.0 Underwater \n2 JANUARY 10.0 Like a Boss \n3 JANUARY 10.0 Inherit the Viper \n4 JANUARY 10.0 The Sonata \n.. ... ... ... \n150 DECEMBER 23.0 Top Gun: Maverick \n151 DECEMBER 23.0 The Croods 2 \n152 DECEMBER 25.0 Respect \n153 DECEMBER 25.0 The Last Duel \n154 DECEMBER 25.0 News of the World \n\n Production company \\\n0 Screen Gems / Stage 6 Films / Ghost House Pict... \n1 20th Century Fox / TSG Entertainment / Chernin... \n2 Paramount Pictures \n3 Barry Films / Tycor International Film Company \n4 Screen Media Films \n.. ... \n150 Paramount Pictures / Skydance Media / Don Simp... \n151 Universal Pictures / DreamWorks Animation \n152 Metro-Goldwyn-Mayer / Universal Pictures / Bro... \n153 20th Century Studios / Scott Free Productions ... \n154 Universal Pictures / Playtone \n\n Cast and crew Ref. \n0 Nicolas Pesce (director/screenplay); Andrea Ri... [2] \n1 William Eubank (director); Brian Duffield, Ada... [3] \n2 Miguel Arteta (director); Sam Pitman, Adam Col... 
[4] \n3 Anthony Jerjen (director); Andrew Crabtree (sc... [5] \n4 Andrew Desmond (director/screenplay); Arthur M... [6] \n.. ... ... \n150 Joseph Kosinski (director); Ehren Kruger, Eric... [149] \n151 Joel Crawford (director); Kevin Hageman, Dan H... [150] \n152 Liesl Tommy (director); Tracey Scott Wilson (s... [151] \n153 Ridley Scott (director); Ben Affleck, Matt Dam... [152] \n154 Paul Greengrass (director/screenplay); Luke Da... [153] \n\n[155 rows x 6 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
OpeningOpening.1TitleProduction companyCast and crewRef.
0JANUARY3.0The GrudgeScreen Gems / Stage 6 Films / Ghost House Pict...Nicolas Pesce (director/screenplay); Andrea Ri...[2]
1JANUARY10.0Underwater20th Century Fox / TSG Entertainment / Chernin...William Eubank (director); Brian Duffield, Ada...[3]
2JANUARY10.0Like a BossParamount PicturesMiguel Arteta (director); Sam Pitman, Adam Col...[4]
3JANUARY10.0Inherit the ViperBarry Films / Tycor International Film CompanyAnthony Jerjen (director); Andrew Crabtree (sc...[5]
4JANUARY10.0The SonataScreen Media FilmsAndrew Desmond (director/screenplay); Arthur M...[6]
.....................
150DECEMBER23.0Top Gun: MaverickParamount Pictures / Skydance Media / Don Simp...Joseph Kosinski (director); Ehren Kruger, Eric...[149]
151DECEMBER23.0The Croods 2Universal Pictures / DreamWorks AnimationJoel Crawford (director); Kevin Hageman, Dan H...[150]
152DECEMBER25.0RespectMetro-Goldwyn-Mayer / Universal Pictures / Bro...Liesl Tommy (director); Tracey Scott Wilson (s...[151]
153DECEMBER25.0The Last Duel20th Century Studios / Scott Free Productions ...Ridley Scott (director); Ben Affleck, Matt Dam...[152]
154DECEMBER25.0News of the WorldUniversal Pictures / PlaytonePaul Greengrass (director/screenplay); Luke Da...[153]
\n

155 rows × 6 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020 = df[['Title','Cast and crew']]","execution_count":8,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020","execution_count":9,"outputs":[{"output_type":"execute_result","execution_count":9,"data":{"text/plain":" Title Cast and crew\n0 The Grudge Nicolas Pesce (director/screenplay); Andrea Ri...\n1 Underwater William Eubank (director); Brian Duffield, Ada...\n2 Like a Boss Miguel Arteta (director); Sam Pitman, Adam Col...\n3 Inherit the Viper Anthony Jerjen (director); Andrew Crabtree (sc...\n4 The Sonata Andrew Desmond (director/screenplay); Arthur M...\n.. ... ...\n150 Top Gun: Maverick Joseph Kosinski (director); Ehren Kruger, Eric...\n151 The Croods 2 Joel Crawford (director); Kevin Hageman, Dan H...\n152 Respect Liesl Tommy (director); Tracey Scott Wilson (s...\n153 The Last Duel Ridley Scott (director); Ben Affleck, Matt Dam...\n154 News of the World Paul Greengrass (director/screenplay); Luke Da...\n\n[155 rows x 2 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TitleCast and crew
0The GrudgeNicolas Pesce (director/screenplay); Andrea Ri...
1UnderwaterWilliam Eubank (director); Brian Duffield, Ada...
2Like a BossMiguel Arteta (director); Sam Pitman, Adam Col...
3Inherit the ViperAnthony Jerjen (director); Andrew Crabtree (sc...
4The SonataAndrew Desmond (director/screenplay); Arthur M...
.........
150Top Gun: MaverickJoseph Kosinski (director); Ehren Kruger, Eric...
151The Croods 2Joel Crawford (director); Kevin Hageman, Dan H...
152RespectLiesl Tommy (director); Tracey Scott Wilson (s...
153The Last DuelRidley Scott (director); Ben Affleck, Matt Dam...
154News of the WorldPaul Greengrass (director/screenplay); Luke Da...
\n

155 rows × 2 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"!pip install tmdbv3api","execution_count":12,"outputs":[{"output_type":"stream","text":"Collecting tmdbv3api\n Downloading tmdbv3api-1.6.1-py2.py3-none-any.whl (13 kB)\nRequirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from tmdbv3api) (2.23.0)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->tmdbv3api) (2020.6.20)\nRequirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->tmdbv3api) (2.9)\nRequirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests->tmdbv3api) (3.0.4)\nRequirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->tmdbv3api) (1.24.3)\nInstalling collected packages: tmdbv3api\nSuccessfully installed tmdbv3api-1.6.1\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"from tmdbv3api import TMDb\nimport json\nimport requests\ntmdb = TMDb()\ntmdb.api_key = 'YOUR_API_KEY'","execution_count":13,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from tmdbv3api import Movie\ntmdb_movie = Movie()\ndef get_genre(x):\n genres = []\n result = tmdb_movie.search(x)\n movie_id = result[0].id\n response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))\n data_json = response.json()\n if data_json['genres']:\n genre_str = \" \" \n for i in range(0,len(data_json['genres'])):\n genres.append(data_json['genres'][i]['name'])\n return genre_str.join(genres)\n else:\n np.NaN","execution_count":14,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020['genres'] = df_2020['Title'].map(lambda x: 
get_genre(str(x)))","execution_count":15,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020","execution_count":16,"outputs":[{"output_type":"execute_result","execution_count":16,"data":{"text/plain":" Title Cast and crew \\\n0 The Grudge Nicolas Pesce (director/screenplay); Andrea Ri... \n1 Underwater William Eubank (director); Brian Duffield, Ada... \n2 Like a Boss Miguel Arteta (director); Sam Pitman, Adam Col... \n3 Inherit the Viper Anthony Jerjen (director); Andrew Crabtree (sc... \n4 The Sonata Andrew Desmond (director/screenplay); Arthur M... \n.. ... ... \n150 Top Gun: Maverick Joseph Kosinski (director); Ehren Kruger, Eric... \n151 The Croods 2 Joel Crawford (director); Kevin Hageman, Dan H... \n152 Respect Liesl Tommy (director); Tracey Scott Wilson (s... \n153 The Last Duel Ridley Scott (director); Ben Affleck, Matt Dam... \n154 News of the World Paul Greengrass (director/screenplay); Luke Da... \n\n genres \n0 Horror Mystery \n1 Action Horror Science Fiction Thriller \n2 Comedy \n3 Drama Thriller Crime \n4 Horror Thriller Mystery \n.. ... \n150 Action Drama \n151 Animation Adventure Family \n152 Music Drama \n153 Drama \n154 Drama Western \n\n[155 rows x 3 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TitleCast and crewgenres
0The GrudgeNicolas Pesce (director/screenplay); Andrea Ri...Horror Mystery
1UnderwaterWilliam Eubank (director); Brian Duffield, Ada...Action Horror Science Fiction Thriller
2Like a BossMiguel Arteta (director); Sam Pitman, Adam Col...Comedy
3Inherit the ViperAnthony Jerjen (director); Andrew Crabtree (sc...Drama Thriller Crime
4The SonataAndrew Desmond (director/screenplay); Arthur M...Horror Thriller Mystery
............
150Top Gun: MaverickJoseph Kosinski (director); Ehren Kruger, Eric...Action Drama
151The Croods 2Joel Crawford (director); Kevin Hageman, Dan H...Animation Adventure Family
152RespectLiesl Tommy (director); Tracey Scott Wilson (s...Music Drama
153The Last DuelRidley Scott (director); Ben Affleck, Matt Dam...Drama
154News of the WorldPaul Greengrass (director/screenplay); Luke Da...Drama Western
\n

155 rows × 3 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_director(x):\n if \" (director)\" in x:\n return x.split(\" (director)\")[0]\n elif \" (directors)\" in x:\n return x.split(\" (directors)\")[0]\n else:\n return x.split(\" (director/screenplay)\")[0]","execution_count":17,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020['director_name'] = df_2020['Cast and crew'].map(lambda x: get_director(str(x)))","execution_count":18,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor1(x):\n return ((x.split(\"screenplay); \")[-1]).split(\", \")[0])","execution_count":19,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020['actor_1_name'] = df_2020['Cast and crew'].map(lambda x: get_actor1(str(x)))","execution_count":20,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor2(x):\n if len((x.split(\"screenplay); \")[-1]).split(\", \")) < 2:\n return np.NaN\n else:\n return ((x.split(\"screenplay); \")[-1]).split(\", 
\")[1])","execution_count":21,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020['actor_2_name'] = df_2020['Cast and crew'].map(lambda x: get_actor2(str(x)))","execution_count":22,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def get_actor3(x):\n if len((x.split(\"screenplay); \")[-1]).split(\", \")) < 3:\n return np.NaN\n else:\n return ((x.split(\"screenplay); \")[-1]).split(\", \")[2])","execution_count":23,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"\ndf_2020['actor_3_name'] = df_2020['Cast and crew'].map(lambda x: get_actor3(str(x)))","execution_count":24,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020","execution_count":25,"outputs":[{"output_type":"execute_result","execution_count":25,"data":{"text/plain":" Title Cast and crew \\\n0 The Grudge Nicolas Pesce (director/screenplay); Andrea Ri... \n1 Underwater William Eubank (director); Brian Duffield, Ada... \n2 Like a Boss Miguel Arteta (director); Sam Pitman, Adam Col... \n3 Inherit the Viper Anthony Jerjen (director); Andrew Crabtree (sc... \n4 The Sonata Andrew Desmond (director/screenplay); Arthur M... \n.. ... ... \n150 Top Gun: Maverick Joseph Kosinski (director); Ehren Kruger, Eric... \n151 The Croods 2 Joel Crawford (director); Kevin Hageman, Dan H... \n152 Respect Liesl Tommy (director); Tracey Scott Wilson (s... \n153 The Last Duel Ridley Scott (director); Ben Affleck, Matt Dam... \n154 News of the World Paul Greengrass (director/screenplay); Luke Da... \n\n genres director_name \\\n0 Horror Mystery Nicolas Pesce \n1 Action Horror Science Fiction Thriller William Eubank \n2 Comedy Miguel Arteta \n3 Drama Thriller Crime Anthony Jerjen \n4 Horror Thriller Mystery Andrew Desmond \n.. ... ... 
\n150 Action Drama Joseph Kosinski \n151 Animation Adventure Family Joel Crawford \n152 Music Drama Liesl Tommy \n153 Drama Ridley Scott \n154 Drama Western Paul Greengrass \n\n actor_1_name actor_2_name actor_3_name \n0 Andrea Riseborough Demián Bichir John Cho \n1 Kristen Stewart Vincent Cassel Jessica Henwick \n2 Tiffany Haddish Rose Byrne Salma Hayek \n3 Josh Hartnett Margarita Levieva Chandler Riggs \n4 Freya Tingley Simon Abkarian Rutger Hauer \n.. ... ... ... \n150 Tom Cruise Miles Teller Jennifer Connelly \n151 Nicolas Cage Emma Stone Ryan Reynolds \n152 Jennifer Hudson Forest Whitaker Marlon Wayans \n153 Matt Damon Adam Driver Jodie Comer \n154 Tom Hanks Helena Zengel Neil Sandilands \n\n[155 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TitleCast and crewgenresdirector_nameactor_1_nameactor_2_nameactor_3_name
0The GrudgeNicolas Pesce (director/screenplay); Andrea Ri...Horror MysteryNicolas PesceAndrea RiseboroughDemián BichirJohn Cho
1UnderwaterWilliam Eubank (director); Brian Duffield, Ada...Action Horror Science Fiction ThrillerWilliam EubankKristen StewartVincent CasselJessica Henwick
2Like a BossMiguel Arteta (director); Sam Pitman, Adam Col...ComedyMiguel ArtetaTiffany HaddishRose ByrneSalma Hayek
3Inherit the ViperAnthony Jerjen (director); Andrew Crabtree (sc...Drama Thriller CrimeAnthony JerjenJosh HartnettMargarita LevievaChandler Riggs
4The SonataAndrew Desmond (director/screenplay); Arthur M...Horror Thriller MysteryAndrew DesmondFreya TingleySimon AbkarianRutger Hauer
........................
150Top Gun: MaverickJoseph Kosinski (director); Ehren Kruger, Eric...Action DramaJoseph KosinskiTom CruiseMiles TellerJennifer Connelly
151The Croods 2Joel Crawford (director); Kevin Hageman, Dan H...Animation Adventure FamilyJoel CrawfordNicolas CageEmma StoneRyan Reynolds
152RespectLiesl Tommy (director); Tracey Scott Wilson (s...Music DramaLiesl TommyJennifer HudsonForest WhitakerMarlon Wayans
153The Last DuelRidley Scott (director); Ben Affleck, Matt Dam...DramaRidley ScottMatt DamonAdam DriverJodie Comer
154News of the WorldPaul Greengrass (director/screenplay); Luke Da...Drama WesternPaul GreengrassTom HanksHelena ZengelNeil Sandilands
\n

155 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_2020 = df_2020.rename(columns={'Title':'movie_title'})","execution_count":26,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df20 = df_2020.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]","execution_count":27,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df20","execution_count":28,"outputs":[{"output_type":"execute_result","execution_count":28,"data":{"text/plain":" director_name actor_1_name actor_2_name \\\n0 Nicolas Pesce Andrea Riseborough Demián Bichir \n1 William Eubank Kristen Stewart Vincent Cassel \n2 Miguel Arteta Tiffany Haddish Rose Byrne \n3 Anthony Jerjen Josh Hartnett Margarita Levieva \n4 Andrew Desmond Freya Tingley Simon Abkarian \n.. ... ... ... \n150 Joseph Kosinski Tom Cruise Miles Teller \n151 Joel Crawford Nicolas Cage Emma Stone \n152 Liesl Tommy Jennifer Hudson Forest Whitaker \n153 Ridley Scott Matt Damon Adam Driver \n154 Paul Greengrass Tom Hanks Helena Zengel \n\n actor_3_name genres \\\n0 John Cho Horror Mystery \n1 Jessica Henwick Action Horror Science Fiction Thriller \n2 Salma Hayek Comedy \n3 Chandler Riggs Drama Thriller Crime \n4 Rutger Hauer Horror Thriller Mystery \n.. ... ... \n150 Jennifer Connelly Action Drama \n151 Ryan Reynolds Animation Adventure Family \n152 Marlon Wayans Music Drama \n153 Jodie Comer Drama \n154 Neil Sandilands Drama Western \n\n movie_title \n0 The Grudge \n1 Underwater \n2 Like a Boss \n3 Inherit the Viper \n4 The Sonata \n.. ... \n150 Top Gun: Maverick \n151 The Croods 2 \n152 Respect \n153 The Last Duel \n154 News of the World \n\n[155 rows x 6 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_title
0Nicolas PesceAndrea RiseboroughDemián BichirJohn ChoHorror MysteryThe Grudge
1William EubankKristen StewartVincent CasselJessica HenwickAction Horror Science Fiction ThrillerUnderwater
2Miguel ArtetaTiffany HaddishRose ByrneSalma HayekComedyLike a Boss
3Anthony JerjenJosh HartnettMargarita LevievaChandler RiggsDrama Thriller CrimeInherit the Viper
4Andrew DesmondFreya TingleySimon AbkarianRutger HauerHorror Thriller MysteryThe Sonata
.....................
150Joseph KosinskiTom CruiseMiles TellerJennifer ConnellyAction DramaTop Gun: Maverick
151Joel CrawfordNicolas CageEmma StoneRyan ReynoldsAnimation Adventure FamilyThe Croods 2
152Liesl TommyJennifer HudsonForest WhitakerMarlon WayansMusic DramaRespect
153Ridley ScottMatt DamonAdam DriverJodie ComerDramaThe Last Duel
154Paul GreengrassTom HanksHelena ZengelNeil SandilandsDrama WesternNews of the World
\n

155 rows × 6 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df20['comb'] = new_df20['actor_1_name'] + ' ' + new_df20['actor_2_name'] + ' '+ new_df20['actor_3_name'] + ' '+ new_df20['director_name'] +' ' + new_df20['genres']","execution_count":29,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df20 = new_df20.dropna(how='any')","execution_count":30,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df20['movie_title'] = new_df20['movie_title'].str.lower()","execution_count":31,"outputs":[{"output_type":"stream","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n \"\"\"Entry point for launching an IPython kernel.\n","name":"stderr"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"new_df20","execution_count":32,"outputs":[{"output_type":"execute_result","execution_count":32,"data":{"text/plain":" director_name actor_1_name actor_2_name \\\n0 Nicolas Pesce Andrea Riseborough Demián Bichir \n1 William Eubank Kristen Stewart Vincent Cassel \n2 Miguel Arteta Tiffany Haddish Rose Byrne \n3 Anthony Jerjen Josh Hartnett Margarita Levieva \n4 Andrew Desmond Freya Tingley Simon Abkarian \n.. ... ... ... \n150 Joseph Kosinski Tom Cruise Miles Teller \n151 Joel Crawford Nicolas Cage Emma Stone \n152 Liesl Tommy Jennifer Hudson Forest Whitaker \n153 Ridley Scott Matt Damon Adam Driver \n154 Paul Greengrass Tom Hanks Helena Zengel \n\n actor_3_name genres \\\n0 John Cho Horror Mystery \n1 Jessica Henwick Action Horror Science Fiction Thriller \n2 Salma Hayek Comedy \n3 Chandler Riggs Drama Thriller Crime \n4 Rutger Hauer Horror Thriller Mystery \n.. ... ... 
\n150 Jennifer Connelly Action Drama \n151 Ryan Reynolds Animation Adventure Family \n152 Marlon Wayans Music Drama \n153 Jodie Comer Drama \n154 Neil Sandilands Drama Western \n\n movie_title comb \n0 the grudge Andrea Riseborough Demián Bichir John Cho Nico... \n1 underwater Kristen Stewart Vincent Cassel Jessica Henwick... \n2 like a boss Tiffany Haddish Rose Byrne Salma Hayek Miguel ... \n3 inherit the viper Josh Hartnett Margarita Levieva Chandler Riggs... \n4 the sonata Freya Tingley Simon Abkarian Rutger Hauer Andr... \n.. ... ... \n150 top gun: maverick Tom Cruise Miles Teller Jennifer Connelly Jose... \n151 the croods 2 Nicolas Cage Emma Stone Ryan Reynolds Joel Cra... \n152 respect Jennifer Hudson Forest Whitaker Marlon Wayans ... \n153 the last duel Matt Damon Adam Driver Jodie Comer Ridley Scot... \n154 news of the world Tom Hanks Helena Zengel Neil Sandilands Paul G... \n\n[141 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0Nicolas PesceAndrea RiseboroughDemián BichirJohn ChoHorror Mysterythe grudgeAndrea Riseborough Demián Bichir John Cho Nico...
1William EubankKristen StewartVincent CasselJessica HenwickAction Horror Science Fiction ThrillerunderwaterKristen Stewart Vincent Cassel Jessica Henwick...
2Miguel ArtetaTiffany HaddishRose ByrneSalma HayekComedylike a bossTiffany Haddish Rose Byrne Salma Hayek Miguel ...
3Anthony JerjenJosh HartnettMargarita LevievaChandler RiggsDrama Thriller Crimeinherit the viperJosh Hartnett Margarita Levieva Chandler Riggs...
4Andrew DesmondFreya TingleySimon AbkarianRutger HauerHorror Thriller Mysterythe sonataFreya Tingley Simon Abkarian Rutger Hauer Andr...
........................
150Joseph KosinskiTom CruiseMiles TellerJennifer ConnellyAction Dramatop gun: maverickTom Cruise Miles Teller Jennifer Connelly Jose...
151Joel CrawfordNicolas CageEmma StoneRyan ReynoldsAnimation Adventure Familythe croods 2Nicolas Cage Emma Stone Ryan Reynolds Joel Cra...
152Liesl TommyJennifer HudsonForest WhitakerMarlon WayansMusic DramarespectJennifer Hudson Forest Whitaker Marlon Wayans ...
153Ridley ScottMatt DamonAdam DriverJodie ComerDramathe last duelMatt Damon Adam Driver Jodie Comer Ridley Scot...
154Paul GreengrassTom HanksHelena ZengelNeil SandilandsDrama Westernnews of the worldTom Hanks Helena Zengel Neil Sandilands Paul G...
\n

141 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"old_df = pd.read_csv('../input/final_data.csv')","execution_count":34,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"old_df","execution_count":35,"outputs":[{"output_type":"execute_result","execution_count":35,"data":{"text/plain":" director_name actor_1_name actor_2_name \\\n0 John Lasseter Tom Hanks Tim Allen \n1 Joe Johnston Robin Williams Jonathan Hyde \n2 Howard Deutch Walter Matthau Jack Lemmon \n3 Forest Whitaker Whitney Houston Angela Bassett \n4 Charles Shyer Steve Martin Diane Keaton \n... ... ... ... \n36841 Greta Gerwig Saoirse Ronan Emma Watson \n36842 Sam Mendes George MacKay Dean-Charles Chapman \n36843 Destin Daniel Cretton Michael B. Jordan Jamie Foxx \n36844 Chinonye Chukwu Alfre Woodard Wendell Pierce \n36845 Waymon Boone Mena Suvari Kevin Pollak \n\n actor_3_name genres movie_title \\\n0 Don Rickles Animation Comedy Family toy story \n1 Kirsten Dunst Adventure Fantasy Family jumanji \n2 Ann-Margret Romance Comedy grumpier old men \n3 Loretta Devine Comedy Drama Romance waiting to exhale \n4 Martin Short Comedy father of the bride part ii \n... ... ... ... \n36841 Florence Pugh Drama Romance little women \n36842 Mark Strong War Drama Action History 1917 \n36843 Brie Larson Drama Crime just mercy \n36844 Aldis Hodge Drama clemency \n36845 unknown Horror Thriller apparition \n\n comb \n0 Tom Hanks Tim Allen Don Rickles John Lasseter ... \n1 Robin Williams Jonathan Hyde Kirsten Dunst Joe... \n2 Walter Matthau Jack Lemmon Ann-Margret Howard ... \n3 Whitney Houston Angela Bassett Loretta Devine ... \n4 Steve Martin Diane Keaton Martin Short Charles... \n... ... \n36841 Saoirse Ronan Emma Watson Florence Pugh Greta ... \n36842 George MacKay Dean-Charles Chapman Mark Strong... \n36843 Michael B. Jordan Jamie Foxx Brie Larson Desti... \n36844 Alfre Woodard Wendell Pierce Aldis Hodge Chino... \n36845 Mena Suvari Kevin Pollak unknown Waymon Boone ... 
\n\n[36846 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0John LasseterTom HanksTim AllenDon RicklesAnimation Comedy Familytoy storyTom Hanks Tim Allen Don Rickles John Lasseter ...
1Joe JohnstonRobin WilliamsJonathan HydeKirsten DunstAdventure Fantasy FamilyjumanjiRobin Williams Jonathan Hyde Kirsten Dunst Joe...
2Howard DeutchWalter MatthauJack LemmonAnn-MargretRomance Comedygrumpier old menWalter Matthau Jack Lemmon Ann-Margret Howard ...
3Forest WhitakerWhitney HoustonAngela BassettLoretta DevineComedy Drama Romancewaiting to exhaleWhitney Houston Angela Bassett Loretta Devine ...
4Charles ShyerSteve MartinDiane KeatonMartin ShortComedyfather of the bride part iiSteve Martin Diane Keaton Martin Short Charles...
........................
36841Greta GerwigSaoirse RonanEmma WatsonFlorence PughDrama Romancelittle womenSaoirse Ronan Emma Watson Florence Pugh Greta ...
36842Sam MendesGeorge MacKayDean-Charles ChapmanMark StrongWar Drama Action History1917George MacKay Dean-Charles Chapman Mark Strong...
36843Destin Daniel CrettonMichael B. JordanJamie FoxxBrie LarsonDrama Crimejust mercyMichael B. Jordan Jamie Foxx Brie Larson Desti...
36844Chinonye ChukwuAlfre WoodardWendell PierceAldis HodgeDramaclemencyAlfre Woodard Wendell Pierce Aldis Hodge Chino...
36845Waymon BooneMena SuvariKevin PollakunknownHorror ThrillerapparitionMena Suvari Kevin Pollak unknown Waymon Boone ...
\n

36846 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"final_df = old_df.append(new_df20,ignore_index=True)","execution_count":36,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"final_df","execution_count":37,"outputs":[{"output_type":"execute_result","execution_count":37,"data":{"text/plain":" director_name actor_1_name actor_2_name actor_3_name \\\n0 John Lasseter Tom Hanks Tim Allen Don Rickles \n1 Joe Johnston Robin Williams Jonathan Hyde Kirsten Dunst \n2 Howard Deutch Walter Matthau Jack Lemmon Ann-Margret \n3 Forest Whitaker Whitney Houston Angela Bassett Loretta Devine \n4 Charles Shyer Steve Martin Diane Keaton Martin Short \n... ... ... ... ... \n36982 Joseph Kosinski Tom Cruise Miles Teller Jennifer Connelly \n36983 Joel Crawford Nicolas Cage Emma Stone Ryan Reynolds \n36984 Liesl Tommy Jennifer Hudson Forest Whitaker Marlon Wayans \n36985 Ridley Scott Matt Damon Adam Driver Jodie Comer \n36986 Paul Greengrass Tom Hanks Helena Zengel Neil Sandilands \n\n genres movie_title \\\n0 Animation Comedy Family toy story \n1 Adventure Fantasy Family jumanji \n2 Romance Comedy grumpier old men \n3 Comedy Drama Romance waiting to exhale \n4 Comedy father of the bride part ii \n... ... ... \n36982 Action Drama top gun: maverick \n36983 Animation Adventure Family the croods 2 \n36984 Music Drama respect \n36985 Drama the last duel \n36986 Drama Western news of the world \n\n comb \n0 Tom Hanks Tim Allen Don Rickles John Lasseter ... \n1 Robin Williams Jonathan Hyde Kirsten Dunst Joe... \n2 Walter Matthau Jack Lemmon Ann-Margret Howard ... \n3 Whitney Houston Angela Bassett Loretta Devine ... \n4 Steve Martin Diane Keaton Martin Short Charles... \n... ... \n36982 Tom Cruise Miles Teller Jennifer Connelly Jose... \n36983 Nicolas Cage Emma Stone Ryan Reynolds Joel Cra... \n36984 Jennifer Hudson Forest Whitaker Marlon Wayans ... \n36985 Matt Damon Adam Driver Jodie Comer Ridley Scot... 
\n36986 Tom Hanks Helena Zengel Neil Sandilands Paul G... \n\n[36987 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
director_nameactor_1_nameactor_2_nameactor_3_namegenresmovie_titlecomb
0John LasseterTom HanksTim AllenDon RicklesAnimation Comedy Familytoy storyTom Hanks Tim Allen Don Rickles John Lasseter ...
1Joe JohnstonRobin WilliamsJonathan HydeKirsten DunstAdventure Fantasy FamilyjumanjiRobin Williams Jonathan Hyde Kirsten Dunst Joe...
2Howard DeutchWalter MatthauJack LemmonAnn-MargretRomance Comedygrumpier old menWalter Matthau Jack Lemmon Ann-Margret Howard ...
3Forest WhitakerWhitney HoustonAngela BassettLoretta DevineComedy Drama Romancewaiting to exhaleWhitney Houston Angela Bassett Loretta Devine ...
4Charles ShyerSteve MartinDiane KeatonMartin ShortComedyfather of the bride part iiSteve Martin Diane Keaton Martin Short Charles...
........................
36982Joseph KosinskiTom CruiseMiles TellerJennifer ConnellyAction Dramatop gun: maverickTom Cruise Miles Teller Jennifer Connelly Jose...
36983Joel CrawfordNicolas CageEmma StoneRyan ReynoldsAnimation Adventure Familythe croods 2Nicolas Cage Emma Stone Ryan Reynolds Joel Cra...
36984Liesl TommyJennifer HudsonForest WhitakerMarlon WayansMusic DramarespectJennifer Hudson Forest Whitaker Marlon Wayans ...
36985Ridley ScottMatt DamonAdam DriverJodie ComerDramathe last duelMatt Damon Adam Driver Jodie Comer Ridley Scot...
36986Paul GreengrassTom HanksHelena ZengelNeil SandilandsDrama Westernnews of the worldTom Hanks Helena Zengel Neil Sandilands Paul G...
\n

36987 rows × 7 columns

\n
"},"metadata":{}}]},{"metadata":{"trusted":true},"cell_type":"code","source":"final_df.to_csv('main_data.csv',index=False)","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /.ipynb_checkpoints/sentiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import nltk\n", 12 | "from nltk.corpus import stopwords\n", 13 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn import naive_bayes\n", 16 | "from sklearn.metrics import roc_auc_score,accuracy_score\n", 17 | "import pickle" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stderr", 27 | "output_type": "stream", 28 | "text": [ 29 | "[nltk_data] Downloading package stopwords to\n", 30 | "[nltk_data] C:\\Users\\tharun\\AppData\\Roaming\\nltk_data...\n", 31 | "[nltk_data] Unzipping corpora\\stopwords.zip.\n" 32 | ] 33 | }, 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "True" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "nltk.download(\"stopwords\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "dataset = 
pd.read_csv('reviews.txt',sep = '\\t', names =['Reviews','Comments'])" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "
\n", 67 | "\n", 80 | "\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
ReviewsComments
01The Da Vinci Code book is just awesome.
11this was the first clive cussler i've ever rea...
21i liked the Da Vinci Code a lot.
31i liked the Da Vinci Code a lot.
41I liked the Da Vinci Code but it ultimatly did...
.........
69130Brokeback Mountain was boring.
69140So Brokeback Mountain was really depressing.
69150As I sit here, watching the MTV Movie Awards, ...
69160Ok brokeback mountain is such a horrible movie.
69170Oh, and Brokeback Mountain was a terrible movie.
\n", 146 | "

6918 rows × 2 columns

\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " Reviews Comments\n", 151 | "0 1 The Da Vinci Code book is just awesome.\n", 152 | "1 1 this was the first clive cussler i've ever rea...\n", 153 | "2 1 i liked the Da Vinci Code a lot.\n", 154 | "3 1 i liked the Da Vinci Code a lot.\n", 155 | "4 1 I liked the Da Vinci Code but it ultimatly did...\n", 156 | "... ... ...\n", 157 | "6913 0 Brokeback Mountain was boring.\n", 158 | "6914 0 So Brokeback Mountain was really depressing.\n", 159 | "6915 0 As I sit here, watching the MTV Movie Awards, ...\n", 160 | "6916 0 Ok brokeback mountain is such a horrible movie.\n", 161 | "6917 0 Oh, and Brokeback Mountain was a terrible movie.\n", 162 | "\n", 163 | "[6918 rows x 2 columns]" 164 | ] 165 | }, 166 | "execution_count": 4, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "dataset" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 5, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "stopset = set(stopwords.words('english'))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopset)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 16, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "X = vectorizer.fit_transform(dataset.Comments)\n", 200 | "y = dataset.Reviews\n", 201 | "pickle.dump(vectorizer, open('tranform.pkl', 'wb'))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 17, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 18, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": 
{ 220 | "text/plain": [ 221 | "MultinomialNB()" 222 | ] 223 | }, 224 | "execution_count": 18, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "clf = naive_bayes.MultinomialNB()\n", 231 | "clf.fit(X_train,y_train)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 19, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "97.47109826589595" 243 | ] 244 | }, 245 | "execution_count": 19, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "accuracy_score(y_test,clf.predict(X_test))*100" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 20, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "MultinomialNB()" 263 | ] 264 | }, 265 | "execution_count": 20, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "clf = naive_bayes.MultinomialNB()\n", 272 | "clf.fit(X,y)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 21, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "98.77167630057804" 284 | ] 285 | }, 286 | "execution_count": 21, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "accuracy_score(y_test,clf.predict(X_test))*100" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 22, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "filename = 'nlp_model.pkl'\n", 302 | "pickle.dump(clf, open(filename, 'wb'))" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 3", 309 | "language": "python", 310 | "name": "python3" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 3 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": 
"text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython3", 322 | "version": "3.8.3" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 4 327 | } 328 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn main:app -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Movie-Recommendation-System 2 | 3 | This application provides all the details of the requested movie such as overview, genre, release date, rating, runtime, top cast, reviews, recommended movies, etc. 4 | 5 | The details of the movies(title, genre, runtime, ratings, posters, etc) are fetched using an API by TMDB, https://www.themoviedb.org, and using the IMDB id of the movie in the API, I did web scraping to get the reviews given by the user in the IMDB site using `beautifulsoup4` and performed sentiment analysis on those reviews. 6 | 7 | ## Link to the application 8 | 9 | Check out the live demo: https://lookupforcinema.herokuapp.com 10 | 11 | # Medium Article About My Project 12 | 13 | https://medium.com/analytics-vidhya/build-a-movie-recommendation-flask-based-deployment-8e2970f1f5f1 14 | 15 | ## Finding similar movies 16 | ### Without taking content into account (Just based on ratings) 17 | 18 | Here just based on the ratings of the users for different movies, we use K nearest neighbours algorithm to find the movies which are similar. 19 | 20 | ### With taking Content into account 21 | 22 | Here we just information about the movies, in this case the information of genres to predict the most similar movies. 
23 | 24 | ## Matrix Factorisation (Collaborative Filtering) 25 | 26 | Two approaches to matrix factorisation were tried; the low-rank method was very slow, so SciPy's SVD for sparse matrices is used instead. 27 | 28 | ## Architecture 29 | 30 | ![110212434-597bb700-7ec1-11eb-9ffa-7ac319e33123](https://user-images.githubusercontent.com/41158838/140876791-13716f4e-7e62-4f1e-8f06-155ce8360f16.jpg) 31 | 32 | ## Deep Learning Methods 33 | 34 | One popular recommender-system approach is called Matrix Factorisation. It works on the principle that we can learn a low-dimensional representation (embedding) of each user and each movie. For example, for each movie, we can encode how much action it has, how long it is, and so on. For each user, we can encode how much they like action, or how much they like long movies, etc. Thus, we can combine the user and the movie embeddings to estimate the ratings of unseen movies. This approach can also be viewed as follows: given a matrix (A [M X N]) of users and movies, we want to estimate low-dimensional matrices (W [M X k] and H [N X k]) such that $A \approx W H^{T}$. 35 | ### 1. Matrix Factorisation based on Deep learning 36 | ### 2. Matrix Factorisation based on Deep learning with non negative embeddings. 37 | ### 3. Advanced neural network with different number of embeddings for both users and movies. 38 | 39 | ## Required Tools 40 | 41 | 1. Keras 42 | 2. SciPy 43 | 3. NumPy 44 | 4. Pandas 45 | 5. Python 3 46 | 47 | ### Sources of the datasets 48 | 49 | 1. [IMDB 5000 Movie Dataset](https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset) 50 | 2. [The Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset) 51 | 3. [List of movies in 2018](https://en.wikipedia.org/wiki/List_of_American_films_of_2018) 52 | 4. [List of movies in 2019](https://en.wikipedia.org/wiki/List_of_American_films_of_2019) 53 | 5. 
[List of movies in 2020](https://en.wikipedia.org/wiki/List_of_American_films_of_2020) 54 | 55 | ### Please do ⭐ the repository, if it helped you in anyways 56 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from flask import Flask, render_template, request 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.metrics.pairwise import cosine_similarity 6 | import json 7 | import bs4 as bs 8 | import urllib.request 9 | import pickle 10 | import requests 11 | from datetime import date, datetime 12 | 13 | # load the nlp model and tfidf vectorizer from disk 14 | filename = 'nlp_model.pkl' 15 | clf = pickle.load(open(filename, 'rb')) 16 | vectorizer = pickle.load(open('tranform.pkl','rb')) 17 | 18 | # converting list of string to list (eg. "["abc","def"]" to ["abc","def"]) 19 | def convert_to_list(my_list): 20 | my_list = my_list.split('","') 21 | my_list[0] = my_list[0].replace('["','') 22 | my_list[-1] = my_list[-1].replace('"]','') 23 | return my_list 24 | 25 | # convert list of numbers to list (eg. 
# convert a serialised list of numbers to a list of number strings
# (eg. "[1,2,3]" to ["1","2","3"])
def convert_to_list_num(my_list):
    """Split a serialised numeric array such as ``"[1,2,3]"`` into its items.

    The items are returned as strings (``["1", "2", "3"]``), exactly as they
    appear in the input; no numeric conversion is performed.

    Args:
        my_list: the serialised array.

    Returns:
        A list with one string per item.  An empty array (``"[]"``) or blank
        input gives ``[]`` rather than a list holding one empty string.
    """
    inner = my_list.strip()
    if inner.startswith('['):
        inner = inner[1:]
    if inner.endswith(']'):
        inner = inner[:-1]
    if not inner.strip():
        return []
    return [item.strip() for item in inner.split(',')]


def get_suggestions():
    """Return the movie titles of ``main_data.csv`` for the autocomplete box.

    Every title goes through ``str.capitalize``.  The CSV file is read again
    on each call.
    """
    titles = pd.read_csv('main_data.csv')['movie_title']
    return list(titles.str.capitalize())


app = Flask(__name__)


@app.route("/")
@app.route("/home")
def home():
    """Render the landing page together with the autocomplete suggestions."""
    return render_template('home.html', suggestions=get_suggestions())


# seconds to wait for IMDB before giving up on the reviews
_IMDB_TIMEOUT_SECONDS = 10


def _unescape(items):
    # Turn the two-character sequences backslash-n and backslash-quote that
    # the ad-hoc list parsing can leave behind into a real newline / quote.
    return [item.replace(r'\n', '\n').replace(r'\"', '"') for item in items]


def _fetch_reviews(imdb_id):
    """Scrape the user reviews of *imdb_id* from IMDB.

    Returns the review texts as a list of ``str``; the list is empty when the
    id is not usable or the page cannot be fetched.
    """
    # imdb_id is supplied by the client: only a well-formed id ("tt" followed
    # by digits) is ever placed into the URL.
    if not (imdb_id.startswith('tt') and imdb_id[2:].isdigit()):
        return []

    url = 'https://www.imdb.com/title/{}/reviews?ref_=tt_ov_rt'.format(imdb_id)
    try:
        with urllib.request.urlopen(url, timeout=_IMDB_TIMEOUT_SECONDS) as response:
            sauce = response.read()
    except (OSError, ValueError) as err:
        # URLError / HTTPError / timeouts are all OSError subclasses.  Missing
        # reviews must not turn the whole page into an error.
        app.logger.warning("Could not fetch IMDB reviews for %s: %s", imdb_id, err)
        return []

    soup = bs.BeautifulSoup(sauce, 'lxml')
    review_nodes = soup.find_all("div", {"class": "text show-more__control"})
    # .string is None when a node has more than one child; those are skipped
    return [str(node.string) for node in review_nodes if node.string]


def _classify_reviews(reviews_list):
    """Label every review 'Positive' or 'Negative' with the loaded model."""
    if not reviews_list:
        return []
    # one vectorise + predict call for the whole batch
    predictions = clf.predict(vectorizer.transform(reviews_list))
    return ['Positive' if pred else 'Negative' for pred in predictions]


@app.route("/recommend", methods=["POST"])
def recommend():
    """Build the details page for one movie from the posted form data.

    The AJAX request carries the movie details, its cast and the recommended
    movies; list-valued fields arrive as serialised arrays.  The view decodes
    them, scrapes the IMDB user reviews for the posted ``imdb_id``, runs the
    sentiment model on each review and renders ``recommend.html``.

    Every form field read below is looked up with ``request.form[...]`` and
    therefore has to be present in the request.
    """
    form = request.form

    # getting data from AJAX request: plain values
    title = form['title']
    imdb_id = form['imdb_id']
    poster = form['poster']
    genres = form['genres']
    overview = form['overview']
    vote_average = form['rating']
    vote_count = form['vote_count']
    rel_date = form['rel_date']
    release_date = form['release_date']
    runtime = form['runtime']
    status = form['status']

    # strings that need to be converted to lists of strings
    rec_movies_org = convert_to_list(form['rec_movies_org'])
    rec_movies = convert_to_list(form['rec_movies'])
    rec_posters = convert_to_list(form['rec_posters'])
    cast_names = convert_to_list(form['cast_names'])
    cast_profiles = convert_to_list(form['cast_profiles'])
    cast_bdays = convert_to_list(form['cast_bdays'])
    cast_places = convert_to_list(form['cast_places'])
    # rendering the escaped text as a python string
    cast_chars = _unescape(convert_to_list(form['cast_chars']))
    cast_bios = _unescape(convert_to_list(form['cast_bios']))

    # numeric arrays (eg. "[1,2,3]"); the items stay strings
    cast_ids = convert_to_list_num(form['cast_ids'])
    rec_vote = convert_to_list_num(form['rec_vote'])
    rec_year = convert_to_list_num(form['rec_year'])

    # Combining the parallel lists into dictionaries that the html file can
    # iterate in order.  zip() stops at the shortest list, so lists of unequal
    # length can no longer raise IndexError.
    # NOTE(review): entries sharing a key (e.g. several recommendations that
    # fall back to the same default poster) overwrite each other.
    movie_cards = {
        poster_url: [name, original_name, vote, year]
        for poster_url, name, original_name, vote, year
        in zip(rec_posters, rec_movies, rec_movies_org, rec_vote, rec_year)
    }

    casts = {
        name: [cast_id, character, profile]
        for name, cast_id, character, profile
        in zip(cast_names, cast_ids, cast_chars, cast_profiles)
    }

    cast_details = {
        name: [cast_id, profile, bday, place, bio]
        for name, cast_id, profile, bday, place, bio
        in zip(cast_names, cast_ids, cast_profiles, cast_bdays, cast_places, cast_bios)
    }

    # web scraping to get user reviews from IMDB site, then passing the
    # reviews to our model
    reviews_list = _fetch_reviews(imdb_id)
    reviews_status = _classify_reviews(reviews_list)

    # getting current date (midnight today) and the release date; both stay
    # empty strings when no usable release date was posted
    movie_rel_date = ""
    curr_date = ""
    if rel_date:
        try:
            movie_rel_date = datetime.strptime(rel_date, '%Y-%m-%d')
        except ValueError:
            movie_rel_date = ""
        else:
            curr_date = datetime.combine(date.today(), datetime.min.time())

    # combining reviews and comments into a dictionary
    movie_reviews = dict(zip(reviews_list, reviews_status))

    # passing all the data to the html file
    return render_template(
        'recommend.html',
        title=title,
        poster=poster,
        overview=overview,
        vote_average=vote_average,
        vote_count=vote_count,
        release_date=release_date,
        movie_rel_date=movie_rel_date,
        curr_date=curr_date,
        runtime=runtime,
        status=status,
        genres=genres,
        movie_cards=movie_cards,
        reviews=movie_reviews,
        casts=casts,
        cast_details=cast_details,
    )


if __name__ == '__main__':
    app.run(debug=True)
// Autocomplete configuration for the movie search box (#autoComplete).
// `films` is expected to be defined by the page before this script runs.
new autoComplete({
    data: {                                   // Data src [Array, Function, Async] | (REQUIRED)
        src: films,
    },
    selector: "#autoComplete",                // Input field selector | (Optional)
    threshold: 2,                             // Min. Chars length to start Engine | (Optional)
    debounce: 100,                            // Post duration for engine to start | (Optional)
    searchEngine: "strict",                   // Search Engine type/mode | (Optional)
    resultsList: {                            // Rendered results list object | (Optional)
        render: true,
        container: source => {
            source.setAttribute("id", "food_list");
        },
        destination: document.querySelector("#autoComplete"),
        position: "afterend",
        element: "ul"
    },
    maxResults: 5,                            // Max. number of rendered results | (Optional)
    highlight: true,                          // Highlight matching results | (Optional)
    resultItem: {                             // Rendered result item | (Optional)
        content: (data, source) => {
            source.innerHTML = data.match;
        },
        element: "li"
    },
    noResults: () => {                        // Action script on noResults | (Optional)
        const result = document.createElement("li");
        result.setAttribute("class", "no_result");
        result.setAttribute("tabindex", "1");
        result.innerHTML = "No Results";
        // The container callback above renames the list to "food_list", so
        // "#autoComplete_list" may not exist; fall back to the renamed id and
        // skip the append instead of throwing when neither is present.
        // NOTE(review): presumably the list carries the id set in
        // resultsList.container - confirm against the autoComplete.js version in use.
        const list = document.querySelector("#autoComplete_list") ||
                     document.querySelector("#food_list");
        if (list) {
            list.appendChild(result);
        }
    },
    onSelection: feedback => {                // Action script onSelection event | (Optional)
        document.getElementById('autoComplete').value = feedback.selection.value;
    }
});
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tharun-tharun/Movie-Recommendation-System-with-Sentiment-Analysis/fc73e55a6dd98283c45aadcdd3fd43c8fc4f24d3/static/image.jpg -------------------------------------------------------------------------------- /static/loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tharun-tharun/Movie-Recommendation-System-with-Sentiment-Analysis/fc73e55a6dd98283c45aadcdd3fd43c8fc4f24d3/static/loader.gif -------------------------------------------------------------------------------- /static/recommend.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | // Button will be disabled until we type something inside the input field 3 | const source = document.getElementById('autoComplete'); 4 | const inputHandler = function(e) { 5 | if(e.target.value==""){ 6 | $('.movie-button').attr('disabled', true); 7 | } 8 | else{ 9 | $('.movie-button').attr('disabled', false); 10 | } 11 | } 12 | source.addEventListener('input', inputHandler); 13 | 14 | $('.fa-arrow-up').click(function(){ 15 | $('html, body').animate({scrollTop:0}, 'slow'); 16 | }); 17 | 18 | $('.app-title').click(function(){ 19 | window.location.href = '/'; 20 | }) 21 | 22 | $('.movie-button').on('click',function(){ 23 | var my_api_key = '55a6a5ca3cdfe2fea08ca6f270dde6a2'; 24 | var title = $('.movie').val(); 25 | if (title=="") { 26 | $('.results').css('display','none'); 27 | $('.fail').css('display','block'); 28 | } 29 | 30 | if (($('.fail').text() && ($('.footer').css('position') == 'absolute'))) { 31 | $('.footer').css('position', 'fixed'); 32 | } 33 | 34 | else{ 35 | load_details(my_api_key,title); 36 | } 37 | }); 38 | }); 39 | 40 | // will be invoked when clicking on the recommended movie cards 41 | function recommendcard(e){ 42 | $("#loader").fadeIn(); 43 | var my_api_key = 
'55a6a5ca3cdfe2fea08ca6f270dde6a2'; 44 | var title = e.getAttribute('title'); 45 | load_details(my_api_key,title); 46 | } 47 | 48 | 49 | // get the details of the movie from the API (based on the name of the movie) 50 | function load_details(my_api_key,title){ 51 | $.ajax({ 52 | type: 'GET', 53 | url:'https://api.themoviedb.org/3/search/movie?api_key='+my_api_key+'&query='+title, 54 | async: false, 55 | success: function(movie){ 56 | if(movie.results.length<1){ 57 | $('.fail').css('display','block'); 58 | $('.results').css('display','none'); 59 | $("#loader").delay(500).fadeOut(); 60 | } 61 | else if(movie.results.length==1) { 62 | $("#loader").fadeIn(); 63 | $('.fail').css('display','none'); 64 | $('.results').delay(1000).css('display','block'); 65 | var movie_id = movie.results[0].id; 66 | var movie_title = movie.results[0].title; 67 | var movie_title_org = movie.results[0].original_title; 68 | get_movie_details(movie_id,my_api_key,movie_title,movie_title_org); 69 | } 70 | else{ 71 | var close_match = {}; 72 | var flag=0; 73 | var movie_id=""; 74 | var movie_title=""; 75 | var movie_title_org=""; 76 | $("#loader").fadeIn(); 77 | $('.fail').css('display','none'); 78 | $('.results').delay(1000).css('display','block'); 79 | for(var count in movie.results){ 80 | if(title==movie.results[count].original_title){ 81 | flag = 1; 82 | movie_id = movie.results[count].id; 83 | movie_title = movie.results[count].title; 84 | movie_title_org = movie.results[count].original_title; 85 | break; 86 | } 87 | else{ 88 | close_match[movie.results[count].title] = similarity(title, movie.results[count].title); 89 | } 90 | } 91 | if(flag==0){ 92 | movie_title = Object.keys(close_match).reduce(function(a, b){ return close_match[a] > close_match[b] ? 
a : b }); 93 | var index = Object.keys(close_match).indexOf(movie_title) 94 | movie_id = movie.results[index].id; 95 | movie_title_org = movie.results[index].original_title; 96 | } 97 | get_movie_details(movie_id,my_api_key,movie_title,movie_title_org); 98 | } 99 | }, 100 | error: function(error){ 101 | alert('Invalid Request - '+error); 102 | $("#loader").delay(500).fadeOut(); 103 | }, 104 | }); 105 | } 106 | 107 | // getting closest match to the requested movie name using Levenshtein distance 108 | function similarity(s1, s2) { 109 | var longer = s1; 110 | var shorter = s2; 111 | if (s1.length < s2.length) { 112 | longer = s2; 113 | shorter = s1; 114 | } 115 | var longerLength = longer.length; 116 | if (longerLength == 0) { 117 | return 1.0; 118 | } 119 | return (longerLength - editDistance(longer, shorter)) / parseFloat(longerLength); 120 | } 121 | 122 | function editDistance(s1, s2) { 123 | s1 = s1.toLowerCase(); 124 | s2 = s2.toLowerCase(); 125 | 126 | var costs = new Array(); 127 | for (var i = 0; i <= s1.length; i++) { 128 | var lastValue = i; 129 | for (var j = 0; j <= s2.length; j++) { 130 | if (i == 0) 131 | costs[j] = j; 132 | else { 133 | if (j > 0) { 134 | var newValue = costs[j - 1]; 135 | if (s1.charAt(i - 1) != s2.charAt(j - 1)) 136 | newValue = Math.min(Math.min(newValue, lastValue), 137 | costs[j]) + 1; 138 | costs[j - 1] = lastValue; 139 | lastValue = newValue; 140 | } 141 | } 142 | } 143 | if (i > 0) 144 | costs[s2.length] = lastValue; 145 | } 146 | return costs[s2.length]; 147 | } 148 | 149 | // get all the details of the movie using the movie id. 150 | function get_movie_details(movie_id,my_api_key,movie_title,movie_title_org) { 151 | $.ajax({ 152 | type:'GET', 153 | url:'https://api.themoviedb.org/3/movie/'+movie_id+'?api_key='+my_api_key, 154 | success: function(movie_details){ 155 | show_details(movie_details,movie_title,my_api_key,movie_id,movie_title_org); 156 | }, 157 | error: function(error){ 158 | alert("API Error! 
- "+error); 159 | $("#loader").delay(500).fadeOut(); 160 | }, 161 | }); 162 | } 163 | 164 | // passing all the details to python's flask for displaying and scraping the movie reviews using imdb id 165 | function show_details(movie_details,movie_title,my_api_key,movie_id,movie_title_org){ 166 | var imdb_id = movie_details.imdb_id; 167 | var poster; 168 | if(movie_details.poster_path){ 169 | poster = 'https://image.tmdb.org/t/p/original'+movie_details.poster_path; 170 | } 171 | else { 172 | poster = 'static/default.jpg'; 173 | } 174 | var overview = movie_details.overview; 175 | var genres = movie_details.genres; 176 | var rating = movie_details.vote_average; 177 | var vote_count = movie_details.vote_count; 178 | var release_date = movie_details.release_date; 179 | var runtime = parseInt(movie_details.runtime); 180 | var status = movie_details.status; 181 | var genre_list = [] 182 | for (var genre in genres){ 183 | genre_list.push(genres[genre].name); 184 | } 185 | var my_genre = genre_list.join(", "); 186 | if(runtime%60==0){ 187 | runtime = Math.floor(runtime/60)+" hour(s)" 188 | } 189 | else { 190 | runtime = Math.floor(runtime/60)+" hour(s) "+(runtime%60)+" min(s)" 191 | } 192 | 193 | // calling `get_movie_cast` to get the top cast for the queried movie 194 | movie_cast = get_movie_cast(movie_id,my_api_key); 195 | 196 | // calling `get_individual_cast` to get the individual cast details 197 | ind_cast = get_individual_cast(movie_cast,my_api_key); 198 | 199 | // calling `get_recommendations` to get the recommended movies for the given movie id from the TMDB API 200 | recommendations = get_recommendations(movie_id, my_api_key); 201 | 202 | details = { 203 | 'title':movie_title, 204 | 'cast_ids':JSON.stringify(movie_cast.cast_ids), 205 | 'cast_names':JSON.stringify(movie_cast.cast_names), 206 | 'cast_chars':JSON.stringify(movie_cast.cast_chars), 207 | 'cast_profiles':JSON.stringify(movie_cast.cast_profiles), 208 | 'cast_bdays':JSON.stringify(ind_cast.cast_bdays), 
209 | 'cast_bios':JSON.stringify(ind_cast.cast_bios), 210 | 'cast_places':JSON.stringify(ind_cast.cast_places), 211 | 'imdb_id':imdb_id, 212 | 'poster':poster, 213 | 'genres':my_genre, 214 | 'overview':overview, 215 | 'rating':rating, 216 | 'vote_count':vote_count.toLocaleString(), 217 | 'rel_date':release_date, 218 | 'release_date':new Date(release_date).toDateString().split(' ').slice(1).join(' '), 219 | 'runtime':runtime, 220 | 'status':status, 221 | 'rec_movies':JSON.stringify(recommendations.rec_movies), 222 | 'rec_posters':JSON.stringify(recommendations.rec_posters), 223 | 'rec_movies_org':JSON.stringify(recommendations.rec_movies_org), 224 | 'rec_year':JSON.stringify(recommendations.rec_year), 225 | 'rec_vote':JSON.stringify(recommendations.rec_vote) 226 | } 227 | 228 | $.ajax({ 229 | type:'POST', 230 | data:details, 231 | url:"/recommend", 232 | dataType: 'html', 233 | complete: function(){ 234 | $("#loader").delay(500).fadeOut(); 235 | }, 236 | success: function(response) { 237 | $('.results').html(response); 238 | $('#autoComplete').val(''); 239 | $('.footer').css('position','absolute'); 240 | if ($('.movie-content')) { 241 | $('.movie-content').after('
'); 242 | } 243 | $(window).scrollTop(0); 244 | } 245 | }); 246 | } 247 | 248 | // getting the details of individual cast 249 | function get_individual_cast(movie_cast,my_api_key) { 250 | cast_bdays = []; 251 | cast_bios = []; 252 | cast_places = []; 253 | for(var cast_id in movie_cast.cast_ids){ 254 | $.ajax({ 255 | type:'GET', 256 | url:'https://api.themoviedb.org/3/person/'+movie_cast.cast_ids[cast_id]+'?api_key='+my_api_key, 257 | async:false, 258 | success: function(cast_details){ 259 | cast_bdays.push((new Date(cast_details.birthday)).toDateString().split(' ').slice(1).join(' ')); 260 | if(cast_details.biography){ 261 | cast_bios.push(cast_details.biography); 262 | } 263 | else { 264 | cast_bios.push("Not Available"); 265 | } 266 | if(cast_details.place_of_birth){ 267 | cast_places.push(cast_details.place_of_birth); 268 | } 269 | else { 270 | cast_places.push("Not Available"); 271 | } 272 | } 273 | }); 274 | } 275 | return {cast_bdays:cast_bdays,cast_bios:cast_bios,cast_places:cast_places}; 276 | } 277 | 278 | // getting the details of the cast for the requested movie 279 | function get_movie_cast(movie_id,my_api_key){ 280 | cast_ids= []; 281 | cast_names = []; 282 | cast_chars = []; 283 | cast_profiles = []; 284 | top_10 = [0,1,2,3,4,5,6,7,8,9]; 285 | $.ajax({ 286 | type:'GET', 287 | url:"https://api.themoviedb.org/3/movie/"+movie_id+"/credits?api_key="+my_api_key, 288 | async:false, 289 | success: function(my_movie){ 290 | if(my_movie.cast.length>0){ 291 | if(my_movie.cast.length>=10){ 292 | top_cast = [0,1,2,3,4,5,6,7,8,9]; 293 | } 294 | else { 295 | top_cast = [0,1,2,3,4]; 296 | } 297 | for(var my_cast in top_cast){ 298 | cast_ids.push(my_movie.cast[my_cast].id) 299 | cast_names.push(my_movie.cast[my_cast].name); 300 | cast_chars.push(my_movie.cast[my_cast].character); 301 | if(my_movie.cast[my_cast].profile_path){ 302 | cast_profiles.push("https://image.tmdb.org/t/p/original"+my_movie.cast[my_cast].profile_path); 303 | } 304 | else { 305 | 
cast_profiles.push("static/default.jpg"); 306 | } 307 | } 308 | } 309 | }, 310 | error: function(error){ 311 | alert("Invalid Request! - "+error); 312 | $("#loader").delay(500).fadeOut(); 313 | } 314 | }); 315 | 316 | return {cast_ids:cast_ids,cast_names:cast_names,cast_chars:cast_chars,cast_profiles:cast_profiles}; 317 | } 318 | 319 | // getting recommendations 320 | function get_recommendations(movie_id, my_api_key) { 321 | rec_movies = []; 322 | rec_posters = []; 323 | rec_movies_org = []; 324 | rec_year = []; 325 | rec_vote = []; 326 | 327 | $.ajax({ 328 | type: 'GET', 329 | url: "https://api.themoviedb.org/3/movie/"+movie_id+"/recommendations?api_key="+my_api_key, 330 | async: false, 331 | success: function(recommend) { 332 | for(var recs in recommend.results) { 333 | rec_movies.push(recommend.results[recs].title); 334 | rec_movies_org.push(recommend.results[recs].original_title); 335 | rec_year.push(new Date(recommend.results[recs].release_date).getFullYear()); 336 | rec_vote.push(recommend.results[recs].vote_average); 337 | if(recommend.results[recs].poster_path){ 338 | rec_posters.push("https://image.tmdb.org/t/p/original"+recommend.results[recs].poster_path); 339 | } 340 | else { 341 | rec_posters.push("static/default.jpg"); 342 | } 343 | } 344 | }, 345 | error: function(error) { 346 | alert("Invalid Request! 
- "+error); 347 | $("#loader").delay(500).fadeOut(); 348 | } 349 | }); 350 | return {rec_movies:rec_movies,rec_movies_org:rec_movies_org,rec_posters:rec_posters,rec_year:rec_year,rec_vote:rec_vote}; 351 | } 352 | -------------------------------------------------------------------------------- /static/style.css: -------------------------------------------------------------------------------- 1 | /* width */ 2 | ::-webkit-scrollbar { 3 | width: 15px; 4 | } 5 | 6 | /* Track */ 7 | ::-webkit-scrollbar-track { 8 | border-radius: 10px; 9 | } 10 | 11 | /* Handle */ 12 | ::-webkit-scrollbar-thumb { 13 | background: #e50914; 14 | border-radius: 10px; 15 | } 16 | 17 | /* Handle on hover */ 18 | ::-webkit-scrollbar-thumb:hover { 19 | background: #b30000; 20 | } 21 | 22 | .movie { 23 | color: #fff; 24 | margin-left: auto; 25 | margin-right: auto; 26 | resize: none; 27 | } 28 | 29 | .movie-content { 30 | display: flex; 31 | flex-wrap: wrap; 32 | justify-content:center; 33 | } 34 | 35 | .movie-content > div { 36 | margin:20px; 37 | } 38 | 39 | .btn-block{ 40 | width: 15%; 41 | text-align: center; 42 | margin-left: auto; 43 | margin-right: auto; 44 | color: #e4e0e0; 45 | } 46 | 47 | #content { 48 | background-image: url("../static/image.jpg"); 49 | background-color: #181818; 50 | font-family: 'Noto Sans JP', sans-serif; 51 | } 52 | 53 | #details { 54 | margin-left: 50px; 55 | } 56 | 57 | .body-content { 58 | position: relative; 59 | min-height: 100%; 60 | } 61 | 62 | .footer { 63 | color: #e4e0e0; 64 | background-color: #e50914d1; 65 | text-align:center; 66 | position: fixed; 67 | bottom: 0; 68 | width: 100%; 69 | } 70 | 71 | .social-icons { 72 | margin-left: 15px; 73 | } 74 | 75 | h1 { 76 | font-family: 'Netflix Sans', 'Helvetica Neue', Helvetica, Arial, sans-serif; 77 | color: #e50914; 78 | font-weight: bold; 79 | margin-top: 30px; 80 | text-shadow: #000000 0px 0px 13px; 81 | } 82 | 83 | .github-corner:hover .octo-arm { 84 | animation: octocat-wave 560ms ease-in-out; 85 | } 86 
| 87 | .fa-arrow-up { 88 | font-size: 1.5em; 89 | color: #ffffff; 90 | background: #e50914; 91 | padding: 10px; 92 | border-radius: 50%; 93 | float: right; 94 | bottom: 25px; 95 | position: relative; 96 | } 97 | 98 | .fa-arrow-up:hover { 99 | cursor: pointer; 100 | } 101 | 102 | .app-title:hover { 103 | cursor: pointer; 104 | } 105 | 106 | @keyframes octocat-wave { 107 | 0%, 108 | 100% { 109 | transform: rotate(0) 110 | } 111 | 112 | 20%, 113 | 60% { 114 | transform: rotate(-25deg) 115 | } 116 | 117 | 40%, 118 | 80% { 119 | transform: rotate(10deg) 120 | } 121 | } 122 | 123 | #autoComplete { 124 | background-position: 98% ; 125 | } 126 | 127 | #name { 128 | color: white; 129 | padding: 1px; 130 | } 131 | 132 | h6 { 133 | margin-bottom: 20px; 134 | } 135 | 136 | @media only screen and (max-width: 650px) { 137 | #mcontent { 138 | display: block; 139 | } 140 | .poster-lg { 141 | display: none; 142 | } 143 | #details { 144 | margin-left: 30px; 145 | } 146 | #loader { 147 | display: none; 148 | position: fixed; 149 | z-index: 100; 150 | left: 0; 151 | top:0; 152 | width: 100%; 153 | height: 100%; 154 | background-image: url("../static/loader.gif"); 155 | background-size: 40%; 156 | background-position: 50% 50%; 157 | background-color: rgba(255, 255, 255, 1); 158 | background-repeat: no-repeat; 159 | -webkit-transition: background-image 0.2s ease-in-out; 160 | transition: background-image 0.2s ease-in-out; 161 | } 162 | 163 | #loader-text { 164 | vertical-align: middle; 165 | color:white; 166 | } 167 | 168 | #autoComplete { 169 | background-position: 97% ; 170 | } 171 | 172 | svg[data-toggle=tooltip] { 173 | width: 50px; 174 | height: 50px; 175 | } 176 | 177 | .fa-arrow-up{ 178 | font-size: 1em; 179 | color: #ffffff; 180 | background: #e50914; 181 | padding: 10px; 182 | border-radius: 50%; 183 | float: right; 184 | bottom: 10px; 185 | position: relative; 186 | } 187 | } 188 | 189 | @media only screen and (max-width: 991px) { 190 | .modal-body{ 191 | display: block; 192 | 
} 193 | .profile-pic { 194 | margin-left: auto; 195 | margin-right: auto; 196 | display: block; 197 | margin-bottom: 20px; 198 | } 199 | } 200 | @media only screen and (min-width: 992px) { 201 | .modal-body { 202 | display: flex; 203 | } 204 | } 205 | 206 | @media only screen and (min-width: 651px) { 207 | .poster-sm { 208 | display: none; 209 | } 210 | 211 | #mcontent { 212 | display: flex; 213 | flex-wrap: nowrap; 214 | } 215 | 216 | #loader { 217 | display: none; 218 | position: fixed; 219 | z-index: 100; 220 | left: 0; 221 | top:0; 222 | width: 100%; 223 | height: 100%; 224 | background-image: url("../static/loader.gif"); 225 | background-size: 20%; 226 | background-position: 50% 50%; 227 | background-color: rgba(255, 255, 255, 1); 228 | background-repeat: no-repeat; 229 | -webkit-transition: background-image 0.2s ease-in-out; 230 | transition: background-image 0.2s ease-in-out; 231 | } 232 | 233 | #loader-text { 234 | vertical-align: middle; 235 | color:white; 236 | } 237 | 238 | } 239 | 240 | .poster{ 241 | -webkit-box-shadow: 0px 1px 15px 4px rgba(250,250,250,1); 242 | -moz-box-shadow: 0px 1px 15px 4px rgba(250,250,250,1); 243 | box-shadow: 0px 1px 15px 4px rgba(250,250,250,1); 244 | } 245 | 246 | .card:hover { 247 | cursor: pointer; 248 | } 249 | 250 | .castcard:hover { 251 | cursor: pointer; 252 | } 253 | 254 | .cast-img { 255 | filter: brightness(100%); 256 | -moz-transition: all 0.75s ease; 257 | -webkit-transition: all 0.75s ease; 258 | transition: all 0.75s ease; 259 | } 260 | 261 | 262 | .cast-img:hover { 263 | filter: brightness(50%); 264 | -moz-transition: all 0.75s ease; 265 | -webkit-transition: all 0.75s ease; 266 | transition: all 0.75s ease; 267 | } 268 | 269 | .fig { 270 | display: flex; 271 | align-items: center; 272 | justify-content: center; 273 | backdrop-filter: brightness(50%); 274 | position: absolute; 275 | bottom: 0px; 276 | top: 0px; 277 | right: 0px; 278 | left: 0px; 279 | opacity: 0; 280 | -moz-transition: all 0.75s ease; 281 | 
-webkit-transition: all 0.75s ease; 282 | transition: all 0.75s ease; 283 | } 284 | 285 | .fig:hover { 286 | opacity: 1; 287 | backdrop-filter:br; 288 | -moz-transition: all 0.75s ease; 289 | -webkit-transition: all 0.75s ease; 290 | transition: all 0.75s ease; 291 | } 292 | 293 | .card-btn { 294 | border-radius: 20px; 295 | } 296 | 297 | .imghvr { 298 | position: relative; 299 | } 300 | 301 | .table td { 302 | border-color: white; 303 | border-style:solid; 304 | border-width:1px; 305 | } 306 | 307 | .fail { 308 | display: none; 309 | color: white; 310 | } 311 | -------------------------------------------------------------------------------- /templates/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | The Movie Cinema 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 33 | 34 | 35 | 36 | 37 |
38 |
39 | 40 | 51 | 52 |

The Movie Cinema

53 |
54 | 55 |
56 |
57 | 58 |
59 |

60 |
61 |
62 | 63 |
64 |
65 | 66 |
67 |

Sorry! The movie you requested is not in our database. 68 | Please check the spelling or try with other movies!

69 |
70 | 71 |
72 |
73 |

74 |
75 |
76 | 77 | 95 | 96 |
97 |
98 | 103 |
104 |
105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /templates/recommend.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | NEW 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |
27 |
28 |

{{title}}

29 |
30 |
31 |
32 | 33 |
34 |
35 |
36 | 37 |
38 |
39 | 40 |
41 |
42 |
43 |
TITLE:  {{title}}
44 |
OVERVIEW:

       {{overview}}
45 |
RATING:  {{vote_average}}/10 ({{vote_count}} votes)
46 |
GENRE:  {{genres}}
47 |
RELEASE DATE:  {{release_date}}
48 |
RUNTIME:  {{runtime}}
49 |
STATUS:  {{status}}
50 |
51 |
52 |
53 |
54 | 55 | {% for name, details in cast_details.items() if not cast_details.hidden %} 56 | 83 | {% endfor %} 84 | 85 |
86 | 87 | {% if casts|length > 1 %} 88 |
89 |
90 |

TOP CAST

91 |
(Click on the cast to know more)
92 |
93 |
94 | 95 | 96 |
97 | {% for name, details in casts.items() if not casts.hidden %} 98 |
99 |
100 | {{name}} - profile 101 |
102 | 103 |
104 |
105 |
106 |
{{name|upper}}
107 |
AS {{details[1]|upper}}
108 |
109 |
110 | {% endfor %} 111 |
112 | {% endif %} 113 |
114 | 115 |
116 | {% if reviews %} 117 |

USER REVIEWS

118 |
119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | {% for review, status in reviews.items() if not reviews.hidden %} 129 | 130 | 131 | 141 | 142 | {% endfor %} 143 | 144 |
CommentsSentiments
{{review}} 132 |
133 | {{status}} : 134 | {% if status =='Positive' %} 135 | 😃 136 | {% else %} 137 | 😖 138 | {% endif %} 139 |
140 |
145 |
146 | 147 | {% if (curr_date) and (movie_rel_date) %} 148 | {% elif curr_date < movie_rel_date %} 149 |
150 |

This movie is not released yet. Stay tuned!

151 |
152 | {% else %} 153 |
154 |

Sorry, the reviews for this movie are not available! :(

155 |
156 | {% endif %} 157 | {% else %} 158 |
159 |

Sorry, the reviews for this movie are not available! :(

160 |
161 | {% endif %} 162 |
163 |
164 | 165 | 166 | {% if movie_cards|length > 1 %} 167 | 168 |
169 |

RECOMMENDED MOVIES FOR YOU

(Click any of the movies to get recommendation)
170 |
171 | 172 |
173 | {% for poster, details in movie_cards.items() if not movie_cards.hidden %} 174 |
175 |
176 | {{details[0]}} - poster 177 |
178 |   {{details[2]}}/10 179 |
180 |
181 | {{details[3]}} 182 |
183 |
184 | 185 |
186 |
187 |
188 |
{{details[0]|upper}}
189 |
190 |
191 | {% endfor %} 192 |
193 | {% endif %} 194 |



195 |
196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /tranform.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tharun-tharun/Movie-Recommendation-System-with-Sentiment-Analysis/fc73e55a6dd98283c45aadcdd3fd43c8fc4f24d3/tranform.pkl --------------------------------------------------------------------------------