├── MovieLens
├── .gitignore
├── extra-imdb-info.ipynb
└── data-processing.ipynb
├── README.md
├── WorldCup
└── download.ipynb
└── .gitignore
/MovieLens/.gitignore:
--------------------------------------------------------------------------------
1 | # files
2 | *.zip
3 | *.csv
4 | *.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # neotwork-datasets
2 | Take network datasets and feed them to Neo4j.
3 |
--------------------------------------------------------------------------------
/WorldCup/download.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": []
9 | }
10 | ],
11 | "metadata": {
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "codemirror_mode": {
19 | "name": "ipython",
20 | "version": 3
21 | },
22 | "file_extension": ".py",
23 | "mimetype": "text/x-python",
24 | "name": "python",
25 | "nbconvert_exporter": "python",
26 | "pygments_lexer": "ipython3",
27 | "version": "3.6.4"
28 | }
29 | },
30 | "nbformat": 4,
31 | "nbformat_minor": 2
32 | }
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.gitignore.io/api/macos,windows,jupyternotebook
3 |
4 | ### JupyterNotebook ###
5 | .ipynb_checkpoints
6 | */.ipynb_checkpoints/*
7 |
8 | # Remove previous ipynb_checkpoints
9 | # git rm -r .ipynb_checkpoints/
10 | #
11 | ### macOS ###
12 | *.DS_Store
13 | .AppleDouble
14 | .LSOverride
15 |
16 | # Icon must end with two \r
17 | Icon
18 |
19 | # Thumbnails
20 | ._*
21 |
22 | # Files that might appear in the root of a volume
23 | .DocumentRevisions-V100
24 | .fseventsd
25 | .Spotlight-V100
26 | .TemporaryItems
27 | .Trashes
28 | .VolumeIcon.icns
29 | .com.apple.timemachine.donotpresent
30 |
31 | # Directories potentially created on remote AFP share
32 | .AppleDB
33 | .AppleDesktop
34 | Network Trash Folder
35 | Temporary Items
36 | .apdisk
37 |
38 | ### Windows ###
39 | # Windows thumbnail cache files
40 | Thumbs.db
41 | ehthumbs.db
42 | ehthumbs_vista.db
43 |
44 | # Folder config file
45 | Desktop.ini
46 |
47 | # Recycle Bin used on file shares
48 | $RECYCLE.BIN/
49 |
50 | # Windows Installer files
51 | *.cab
52 | *.msi
53 | *.msm
54 | *.msp
55 |
56 | # Windows shortcuts
57 | *.lnk
58 |
59 |
60 | # End of https://www.gitignore.io/api/macos,windows,jupyternotebook
61 |
62 | # files
63 | *.zip
64 | *.csv
65 |
--------------------------------------------------------------------------------
/MovieLens/extra-imdb-info.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "from imdb import IMDb"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "data": {
20 | "text/html": [
21 | "
\n",
22 | "\n",
35 | "
\n",
36 | " \n",
37 | " \n",
38 | " | \n",
39 | " title | \n",
40 | " imdbId | \n",
41 | " tmdbId | \n",
42 | " year | \n",
43 | "
\n",
44 | " \n",
45 | " | movieId | \n",
46 | " | \n",
47 | " | \n",
48 | " | \n",
49 | " | \n",
50 | "
\n",
51 | " \n",
52 | " \n",
53 | " \n",
54 | " | 1 | \n",
55 | " Toy Story | \n",
56 | " 114709 | \n",
57 | " 862 | \n",
58 | " 1995 | \n",
59 | "
\n",
60 | " \n",
61 | " | 2 | \n",
62 | " Jumanji | \n",
63 | " 113497 | \n",
64 | " 8844 | \n",
65 | " 1995 | \n",
66 | "
\n",
67 | " \n",
68 | " | 3 | \n",
69 | " Grumpier Old Men | \n",
70 | " 113228 | \n",
71 | " 15602 | \n",
72 | " 1995 | \n",
73 | "
\n",
74 | " \n",
75 | " | 4 | \n",
76 | " Waiting to Exhale | \n",
77 | " 114885 | \n",
78 | " 31357 | \n",
79 | " 1995 | \n",
80 | "
\n",
81 | " \n",
82 | " | 5 | \n",
83 | " Father of the Bride Part II | \n",
84 | " 113041 | \n",
85 | " 11862 | \n",
86 | " 1995 | \n",
87 | "
\n",
88 | " \n",
89 | "
\n",
90 | "
"
91 | ],
92 | "text/plain": [
93 | " title imdbId tmdbId year\n",
94 | "movieId \n",
95 | "1 Toy Story 114709 862 1995\n",
96 | "2 Jumanji 113497 8844 1995\n",
97 | "3 Grumpier Old Men 113228 15602 1995\n",
98 | "4 Waiting to Exhale 114885 31357 1995\n",
99 | "5 Father of the Bride Part II 113041 11862 1995"
100 | ]
101 | },
102 | "execution_count": 2,
103 | "metadata": {},
104 | "output_type": "execute_result"
105 | }
106 | ],
107 | "source": [
108 | "movies = pd.read_csv(\"movies.clean.csv\", index_col=0)\n",
109 | "movies.head()"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 5,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "ia = IMDb()"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 20,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/plain": [
129 | "['cast',\n",
130 | " 'genres',\n",
131 | " 'runtimes',\n",
132 | " 'countries',\n",
133 | " 'country codes',\n",
134 | " 'language codes',\n",
135 | " 'color info',\n",
136 | " 'aspect ratio',\n",
137 | " 'sound mix',\n",
138 | " 'certificates',\n",
139 | " 'original air date',\n",
140 | " 'rating',\n",
141 | " 'votes',\n",
142 | " 'cover url',\n",
143 | " 'plot outline',\n",
144 | " 'languages',\n",
145 | " 'title',\n",
146 | " 'year',\n",
147 | " 'kind',\n",
148 | " 'directors',\n",
149 | " 'writers',\n",
150 | " 'producers',\n",
151 | " 'composers',\n",
152 | " 'editors',\n",
153 | " 'editorial department',\n",
154 | " 'casting directors',\n",
155 | " 'art directors',\n",
156 | " 'production managers ',\n",
157 | " 'art department',\n",
158 | " 'sound department',\n",
159 | " 'visual effects',\n",
160 | " 'camera department',\n",
161 | " 'animation department',\n",
162 | " 'casting department',\n",
163 | " 'music department',\n",
164 | " 'miscellaneous',\n",
165 | " 'akas',\n",
166 | " 'writer',\n",
167 | " 'director',\n",
168 | " 'top 250 rank',\n",
169 | " 'plot',\n",
170 | " 'synopsis',\n",
171 | " 'canonical title',\n",
172 | " 'long imdb title',\n",
173 | " 'long imdb canonical title',\n",
174 | " 'smart canonical title',\n",
175 | " 'smart long imdb canonical title',\n",
176 | " 'full-size cover url']"
177 | ]
178 | },
179 | "execution_count": 20,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "movie = ia.get_movie(114709)\n",
186 | "movie.keys()"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 19,
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "name": "stdout",
196 | "output_type": "stream",
197 | "text": [
198 | "Lasseter, John\n"
199 | ]
200 | },
201 | {
202 | "data": {
203 | "text/plain": [
204 | "['name', 'canonical name', 'long imdb name', 'long imdb canonical name']"
205 | ]
206 | },
207 | "execution_count": 19,
208 | "metadata": {},
209 | "output_type": "execute_result"
210 | }
211 | ],
212 | "source": [
213 | "one_director = movie[\"director\"][0]\n",
214 | "print(one_director['canonical name'])\n",
215 | "one_director.keys()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": []
224 | }
225 | ],
226 | "metadata": {
227 | "kernelspec": {
228 | "display_name": "Python 3",
229 | "language": "python",
230 | "name": "python3"
231 | },
232 | "language_info": {
233 | "codemirror_mode": {
234 | "name": "ipython",
235 | "version": 3
236 | },
237 | "file_extension": ".py",
238 | "mimetype": "text/x-python",
239 | "name": "python",
240 | "nbconvert_exporter": "python",
241 | "pygments_lexer": "ipython3",
242 | "version": "3.6.5"
243 | }
244 | },
245 | "nbformat": 4,
246 | "nbformat_minor": 2
247 | }
248 |
--------------------------------------------------------------------------------
/MovieLens/data-processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 25,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import os\n",
12 | "import re"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | " % Total % Received % Xferd Average Speed Time Time Time Current\n",
25 | " Dload Upload Total Spent Left Speed\n",
26 | "100 896k 100 896k 0 0 593k 0 0:00:01 0:00:01 --:--:-- 179k 593k\n"
27 | ]
28 | }
29 | ],
30 | "source": [
31 | "ml_latest_small = \"ml-latest-small.zip\"\n",
32 | "\n",
33 | "!curl -o $ml_latest_small http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 4,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import zipfile\n",
43 | "with zipfile.ZipFile(ml_latest_small, 'r') as zip_ref:\n",
44 | " zip_ref.extractall(\".\")"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 5,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " title | \n",
74 | " genres | \n",
75 | " imdbId | \n",
76 | " tmdbId | \n",
77 | "
\n",
78 | " \n",
79 | " | movieId | \n",
80 | " | \n",
81 | " | \n",
82 | " | \n",
83 | " | \n",
84 | "
\n",
85 | " \n",
86 | " \n",
87 | " \n",
88 | " | 1 | \n",
89 | " Toy Story (1995) | \n",
90 | " Adventure|Animation|Children|Comedy|Fantasy | \n",
91 | " 114709 | \n",
92 | " 862 | \n",
93 | "
\n",
94 | " \n",
95 | " | 2 | \n",
96 | " Jumanji (1995) | \n",
97 | " Adventure|Children|Fantasy | \n",
98 | " 113497 | \n",
99 | " 8844 | \n",
100 | "
\n",
101 | " \n",
102 | " | 3 | \n",
103 | " Grumpier Old Men (1995) | \n",
104 | " Comedy|Romance | \n",
105 | " 113228 | \n",
106 | " 15602 | \n",
107 | "
\n",
108 | " \n",
109 | " | 4 | \n",
110 | " Waiting to Exhale (1995) | \n",
111 | " Comedy|Drama|Romance | \n",
112 | " 114885 | \n",
113 | " 31357 | \n",
114 | "
\n",
115 | " \n",
116 | " | 5 | \n",
117 | " Father of the Bride Part II (1995) | \n",
118 | " Comedy | \n",
119 | " 113041 | \n",
120 | " 11862 | \n",
121 | "
\n",
122 | " \n",
123 | "
\n",
124 | "
"
125 | ],
126 | "text/plain": [
127 | " title \\\n",
128 | "movieId \n",
129 | "1 Toy Story (1995) \n",
130 | "2 Jumanji (1995) \n",
131 | "3 Grumpier Old Men (1995) \n",
132 | "4 Waiting to Exhale (1995) \n",
133 | "5 Father of the Bride Part II (1995) \n",
134 | "\n",
135 | " genres imdbId tmdbId \n",
136 | "movieId \n",
137 | "1 Adventure|Animation|Children|Comedy|Fantasy 114709 862 \n",
138 | "2 Adventure|Children|Fantasy 113497 8844 \n",
139 | "3 Comedy|Romance 113228 15602 \n",
140 | "4 Comedy|Drama|Romance 114885 31357 \n",
141 | "5 Comedy 113041 11862 "
142 | ]
143 | },
144 | "execution_count": 5,
145 | "metadata": {},
146 | "output_type": "execute_result"
147 | }
148 | ],
149 | "source": [
150 | "movies = pd.read_csv(\"ml-latest-small/movies.csv\", index_col=0)\n",
151 | "links = pd.read_csv(\"ml-latest-small/links.csv\", index_col=0)\n",
152 | "movies = pd.merge(movies, links, left_index=True, right_index=True)\n",
153 | "movies.tmdbId = movies.tmdbId.apply(lambda v: int(v) if pd.notna(v) else -1)\n",
154 | "movies.head()"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 6,
160 | "metadata": {},
161 | "outputs": [
162 | {
163 | "name": "stdout",
164 | "output_type": "stream",
165 | "text": [
166 | "Witchfinder General (Conquerer Worm, The)\n"
167 | ]
168 | },
169 | {
170 | "data": {
171 | "text/html": [
172 | "\n",
173 | "\n",
186 | "
\n",
187 | " \n",
188 | " \n",
189 | " | \n",
190 | " title | \n",
191 | " genres | \n",
192 | " imdbId | \n",
193 | " tmdbId | \n",
194 | " year | \n",
195 | "
\n",
196 | " \n",
197 | " | movieId | \n",
198 | " | \n",
199 | " | \n",
200 | " | \n",
201 | " | \n",
202 | " | \n",
203 | "
\n",
204 | " \n",
205 | " \n",
206 | " \n",
207 | " | 1 | \n",
208 | " Toy Story | \n",
209 | " Adventure|Animation|Children|Comedy|Fantasy | \n",
210 | " 114709 | \n",
211 | " 862 | \n",
212 | " 1995 | \n",
213 | "
\n",
214 | " \n",
215 | " | 2 | \n",
216 | " Jumanji | \n",
217 | " Adventure|Children|Fantasy | \n",
218 | " 113497 | \n",
219 | " 8844 | \n",
220 | " 1995 | \n",
221 | "
\n",
222 | " \n",
223 | " | 3 | \n",
224 | " Grumpier Old Men | \n",
225 | " Comedy|Romance | \n",
226 | " 113228 | \n",
227 | " 15602 | \n",
228 | " 1995 | \n",
229 | "
\n",
230 | " \n",
231 | " | 4 | \n",
232 | " Waiting to Exhale | \n",
233 | " Comedy|Drama|Romance | \n",
234 | " 114885 | \n",
235 | " 31357 | \n",
236 | " 1995 | \n",
237 | "
\n",
238 | " \n",
239 | " | 5 | \n",
240 | " Father of the Bride Part II | \n",
241 | " Comedy | \n",
242 | " 113041 | \n",
243 | " 11862 | \n",
244 | " 1995 | \n",
245 | "
\n",
246 | " \n",
247 | "
\n",
248 | "
"
249 | ],
250 | "text/plain": [
251 | " title \\\n",
252 | "movieId \n",
253 | "1 Toy Story \n",
254 | "2 Jumanji \n",
255 | "3 Grumpier Old Men \n",
256 | "4 Waiting to Exhale \n",
257 | "5 Father of the Bride Part II \n",
258 | "\n",
259 | " genres imdbId tmdbId year \n",
260 | "movieId \n",
261 | "1 Adventure|Animation|Children|Comedy|Fantasy 114709 862 1995 \n",
262 | "2 Adventure|Children|Fantasy 113497 8844 1995 \n",
263 | "3 Comedy|Romance 113228 15602 1995 \n",
264 | "4 Comedy|Drama|Romance 114885 31357 1995 \n",
265 | "5 Comedy 113041 11862 1995 "
266 | ]
267 | },
268 | "execution_count": 6,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "year = re.compile(r\"\\(([0-9]{4})\\)$\")\n",
275 | "\n",
276 | "def get_year(date):\n",
277 | " match = year.search(date.strip())\n",
278 | " if match:\n",
279 | " return int(match.group(1))\n",
280 | " return -1\n",
281 | "\n",
282 | "shave_year = lambda title: year.sub(\"\", title.strip()).rstrip()\n",
283 | "\n",
284 | "print(shave_year(\"Witchfinder General (Conquerer Worm, The) (1968)\"))\n",
285 | "\n",
286 | "movies[\"year\"] = movies.title.apply(get_year)\n",
287 | "movies[\"title\"] = movies.title.apply(shave_year)\n",
288 | "movies.head()"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 7,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "data": {
298 | "text/html": [
299 | "\n",
300 | "\n",
313 | "
\n",
314 | " \n",
315 | " \n",
316 | " | \n",
317 | " title | \n",
318 | " genres | \n",
319 | " imdbId | \n",
320 | " tmdbId | \n",
321 | " year | \n",
322 | "
\n",
323 | " \n",
324 | " | movieId | \n",
325 | " | \n",
326 | " | \n",
327 | " | \n",
328 | " | \n",
329 | " | \n",
330 | "
\n",
331 | " \n",
332 | " \n",
333 | " \n",
334 | " | 1 | \n",
335 | " Toy Story | \n",
336 | " Adventure|Animation|Children|Comedy|Fantasy | \n",
337 | " 114709 | \n",
338 | " 862 | \n",
339 | " 1995 | \n",
340 | "
\n",
341 | " \n",
342 | " | 2 | \n",
343 | " Jumanji | \n",
344 | " Adventure|Children|Fantasy | \n",
345 | " 113497 | \n",
346 | " 8844 | \n",
347 | " 1995 | \n",
348 | "
\n",
349 | " \n",
350 | " | 3 | \n",
351 | " Grumpier Old Men | \n",
352 | " Comedy|Romance | \n",
353 | " 113228 | \n",
354 | " 15602 | \n",
355 | " 1995 | \n",
356 | "
\n",
357 | " \n",
358 | " | 4 | \n",
359 | " Waiting to Exhale | \n",
360 | " Comedy|Drama|Romance | \n",
361 | " 114885 | \n",
362 | " 31357 | \n",
363 | " 1995 | \n",
364 | "
\n",
365 | " \n",
366 | " | 5 | \n",
367 | " Father of the Bride Part II | \n",
368 | " Comedy | \n",
369 | " 113041 | \n",
370 | " 11862 | \n",
371 | " 1995 | \n",
372 | "
\n",
373 | " \n",
374 | "
\n",
375 | "
"
376 | ],
377 | "text/plain": [
378 | " title \\\n",
379 | "movieId \n",
380 | "1 Toy Story \n",
381 | "2 Jumanji \n",
382 | "3 Grumpier Old Men \n",
383 | "4 Waiting to Exhale \n",
384 | "5 Father of the Bride Part II \n",
385 | "\n",
386 | " genres imdbId tmdbId year \n",
387 | "movieId \n",
388 | "1 Adventure|Animation|Children|Comedy|Fantasy 114709 862 1995 \n",
389 | "2 Adventure|Children|Fantasy 113497 8844 1995 \n",
390 | "3 Comedy|Romance 113228 15602 1995 \n",
391 | "4 Comedy|Drama|Romance 114885 31357 1995 \n",
392 | "5 Comedy 113041 11862 1995 "
393 | ]
394 | },
395 | "execution_count": 7,
396 | "metadata": {},
397 | "output_type": "execute_result"
398 | }
399 | ],
400 | "source": [
401 | "movies = movies[movies.year!=-1]\n",
402 | "movies.head()"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 8,
408 | "metadata": {},
409 | "outputs": [],
410 | "source": [
411 | "get_genre_set = lambda g: g.split('|')\n",
412 | "genres = movies.genres.apply(get_genre_set)\n",
413 | "movies.drop(\"genres\", axis=1, inplace=True)"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 9,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "data": {
423 | "text/html": [
424 | "\n",
425 | "\n",
438 | "
\n",
439 | " \n",
440 | " \n",
441 | " | \n",
442 | " title | \n",
443 | " imdbId | \n",
444 | " tmdbId | \n",
445 | " year | \n",
446 | "
\n",
447 | " \n",
448 | " | movieId | \n",
449 | " | \n",
450 | " | \n",
451 | " | \n",
452 | " | \n",
453 | "
\n",
454 | " \n",
455 | " \n",
456 | " \n",
457 | " | 1 | \n",
458 | " Toy Story | \n",
459 | " 114709 | \n",
460 | " 862 | \n",
461 | " 1995 | \n",
462 | "
\n",
463 | " \n",
464 | " | 2 | \n",
465 | " Jumanji | \n",
466 | " 113497 | \n",
467 | " 8844 | \n",
468 | " 1995 | \n",
469 | "
\n",
470 | " \n",
471 | " | 3 | \n",
472 | " Grumpier Old Men | \n",
473 | " 113228 | \n",
474 | " 15602 | \n",
475 | " 1995 | \n",
476 | "
\n",
477 | " \n",
478 | " | 4 | \n",
479 | " Waiting to Exhale | \n",
480 | " 114885 | \n",
481 | " 31357 | \n",
482 | " 1995 | \n",
483 | "
\n",
484 | " \n",
485 | " | 5 | \n",
486 | " Father of the Bride Part II | \n",
487 | " 113041 | \n",
488 | " 11862 | \n",
489 | " 1995 | \n",
490 | "
\n",
491 | " \n",
492 | "
\n",
493 | "
"
494 | ],
495 | "text/plain": [
496 | " title imdbId tmdbId year\n",
497 | "movieId \n",
498 | "1 Toy Story 114709 862 1995\n",
499 | "2 Jumanji 113497 8844 1995\n",
500 | "3 Grumpier Old Men 113228 15602 1995\n",
501 | "4 Waiting to Exhale 114885 31357 1995\n",
502 | "5 Father of the Bride Part II 113041 11862 1995"
503 | ]
504 | },
505 | "execution_count": 9,
506 | "metadata": {},
507 | "output_type": "execute_result"
508 | }
509 | ],
510 | "source": [
511 | "movies.to_csv(\"movies.clean.csv\")\n",
512 | "movies.head()"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 10,
518 | "metadata": {},
519 | "outputs": [
520 | {
521 | "data": {
522 | "text/html": [
523 | "\n",
524 | "\n",
537 | "
\n",
538 | " \n",
539 | " \n",
540 | " | \n",
541 | " genre | \n",
542 | "
\n",
543 | " \n",
544 | " | movieId | \n",
545 | " | \n",
546 | "
\n",
547 | " \n",
548 | " \n",
549 | " \n",
550 | " | 1 | \n",
551 | " Adventure | \n",
552 | "
\n",
553 | " \n",
554 | " | 1 | \n",
555 | " Animation | \n",
556 | "
\n",
557 | " \n",
558 | " | 1 | \n",
559 | " Children | \n",
560 | "
\n",
561 | " \n",
562 | " | 1 | \n",
563 | " Comedy | \n",
564 | "
\n",
565 | " \n",
566 | " | 1 | \n",
567 | " Fantasy | \n",
568 | "
\n",
569 | " \n",
570 | "
\n",
571 | "
"
572 | ],
573 | "text/plain": [
574 | " genre\n",
575 | "movieId \n",
576 | "1 Adventure\n",
577 | "1 Animation\n",
578 | "1 Children\n",
579 | "1 Comedy\n",
580 | "1 Fantasy"
581 | ]
582 | },
583 | "execution_count": 10,
584 | "metadata": {},
585 | "output_type": "execute_result"
586 | }
587 | ],
588 | "source": [
589 | "genres_assignation = []\n",
590 | "for i, gen in genres.items():\n",
591 | " for gnre in gen:\n",
592 | " genres_assignation.append([i, gnre])\n",
593 | "genres_df = pd.DataFrame(genres_assignation, columns=[\"movieId\", \"genre\"]).set_index(\"movieId\")\n",
594 | "genres_df.to_csv(\"genres.csv\")\n",
595 | "genres_df.head()"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 11,
601 | "metadata": {},
602 | "outputs": [
603 | {
604 | "name": "stdout",
605 | "output_type": "stream",
606 | "text": [
607 | "2007-01-15T16:29:38\n"
608 | ]
609 | }
610 | ],
611 | "source": [
612 | "import datetime\n",
613 | "# Example ISO-8601 timestamp (with milliseconds and UTC offset): '2015-06-24T12:50:35.556+0100'\n",
614 | "def date_ms(timestamp):\n",
615 | " return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%S')"
616 | ]
617 | },
618 | {
619 | "cell_type": "code",
620 | "execution_count": 12,
621 | "metadata": {},
622 | "outputs": [
623 | {
624 | "data": {
625 | "text/html": [
626 | "\n",
627 | "\n",
640 | "
\n",
641 | " \n",
642 | " \n",
643 | " | \n",
644 | " | \n",
645 | " rating | \n",
646 | " timestamp | \n",
647 | " time | \n",
648 | "
\n",
649 | " \n",
650 | " | userId | \n",
651 | " movieId | \n",
652 | " | \n",
653 | " | \n",
654 | " | \n",
655 | "
\n",
656 | " \n",
657 | " \n",
658 | " \n",
659 | " | 1 | \n",
660 | " 31 | \n",
661 | " 2.5 | \n",
662 | " 1260759144 | \n",
663 | " 2009-12-14T02:52:24 | \n",
664 | "
\n",
665 | " \n",
666 | " | 1029 | \n",
667 | " 3.0 | \n",
668 | " 1260759179 | \n",
669 | " 2009-12-14T02:52:59 | \n",
670 | "
\n",
671 | " \n",
672 | " | 1061 | \n",
673 | " 3.0 | \n",
674 | " 1260759182 | \n",
675 | " 2009-12-14T02:53:02 | \n",
676 | "
\n",
677 | " \n",
678 | " | 1129 | \n",
679 | " 2.0 | \n",
680 | " 1260759185 | \n",
681 | " 2009-12-14T02:53:05 | \n",
682 | "
\n",
683 | " \n",
684 | " | 1172 | \n",
685 | " 4.0 | \n",
686 | " 1260759205 | \n",
687 | " 2009-12-14T02:53:25 | \n",
688 | "
\n",
689 | " \n",
690 | " | 1263 | \n",
691 | " 2.0 | \n",
692 | " 1260759151 | \n",
693 | " 2009-12-14T02:52:31 | \n",
694 | "
\n",
695 | " \n",
696 | " | 1287 | \n",
697 | " 2.0 | \n",
698 | " 1260759187 | \n",
699 | " 2009-12-14T02:53:07 | \n",
700 | "
\n",
701 | " \n",
702 | " | 1293 | \n",
703 | " 2.0 | \n",
704 | " 1260759148 | \n",
705 | " 2009-12-14T02:52:28 | \n",
706 | "
\n",
707 | " \n",
708 | " | 1339 | \n",
709 | " 3.5 | \n",
710 | " 1260759125 | \n",
711 | " 2009-12-14T02:52:05 | \n",
712 | "
\n",
713 | " \n",
714 | " | 1343 | \n",
715 | " 2.0 | \n",
716 | " 1260759131 | \n",
717 | " 2009-12-14T02:52:11 | \n",
718 | "
\n",
719 | " \n",
720 | "
\n",
721 | "
"
722 | ],
723 | "text/plain": [
724 | " rating timestamp time\n",
725 | "userId movieId \n",
726 | "1 31 2.5 1260759144 2009-12-14T02:52:24\n",
727 | " 1029 3.0 1260759179 2009-12-14T02:52:59\n",
728 | " 1061 3.0 1260759182 2009-12-14T02:53:02\n",
729 | " 1129 2.0 1260759185 2009-12-14T02:53:05\n",
730 | " 1172 4.0 1260759205 2009-12-14T02:53:25\n",
731 | " 1263 2.0 1260759151 2009-12-14T02:52:31\n",
732 | " 1287 2.0 1260759187 2009-12-14T02:53:07\n",
733 | " 1293 2.0 1260759148 2009-12-14T02:52:28\n",
734 | " 1339 3.5 1260759125 2009-12-14T02:52:05\n",
735 | " 1343 2.0 1260759131 2009-12-14T02:52:11"
736 | ]
737 | },
738 | "execution_count": 12,
739 | "metadata": {},
740 | "output_type": "execute_result"
741 | }
742 | ],
743 | "source": [
744 | "ratings = pd.read_csv(\"ml-latest-small/ratings.csv\", index_col=[0,1])\n",
745 | "ratings[\"time\"] = ratings.timestamp.apply(date_ms)\n",
746 | "ratings.to_csv(\"ratings.csv\")\n",
747 | "ratings.head(10) "
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 39,
753 | "metadata": {},
754 | "outputs": [
755 | {
756 | "data": {
757 | "text/html": [
758 | "\n",
759 | "\n",
772 | "
\n",
773 | " \n",
774 | " \n",
775 | " | \n",
776 | " | \n",
777 | " tag | \n",
778 | " timestamp | \n",
779 | " time | \n",
780 | "
\n",
781 | " \n",
782 | " | userId | \n",
783 | " movieId | \n",
784 | " | \n",
785 | " | \n",
786 | " | \n",
787 | "
\n",
788 | " \n",
789 | " \n",
790 | " \n",
791 | " | 15 | \n",
792 | " 339 | \n",
793 | " sandra 'boring' bullock | \n",
794 | " 1138537770 | \n",
795 | " 2006-01-29T12:29:30 | \n",
796 | "
\n",
797 | " \n",
798 | " | 1955 | \n",
799 | " dentist | \n",
800 | " 1193435061 | \n",
801 | " 2007-10-26T22:44:21 | \n",
802 | "
\n",
803 | " \n",
804 | " | 7478 | \n",
805 | " Cambodia | \n",
806 | " 1170560997 | \n",
807 | " 2007-02-04T03:49:57 | \n",
808 | "
\n",
809 | " \n",
810 | " | 32892 | \n",
811 | " Russian | \n",
812 | " 1170626366 | \n",
813 | " 2007-02-04T21:59:26 | \n",
814 | "
\n",
815 | " \n",
816 | " | 34162 | \n",
817 | " forgettable | \n",
818 | " 1141391765 | \n",
819 | " 2006-03-03T13:16:05 | \n",
820 | "
\n",
821 | " \n",
822 | "
\n",
823 | "
"
824 | ],
825 | "text/plain": [
826 | " tag timestamp time\n",
827 | "userId movieId \n",
828 | "15 339 sandra 'boring' bullock 1138537770 2006-01-29T12:29:30\n",
829 | " 1955 dentist 1193435061 2007-10-26T22:44:21\n",
830 | " 7478 Cambodia 1170560997 2007-02-04T03:49:57\n",
831 | " 32892 Russian 1170626366 2007-02-04T21:59:26\n",
832 | " 34162 forgettable 1141391765 2006-03-03T13:16:05"
833 | ]
834 | },
835 | "execution_count": 39,
836 | "metadata": {},
837 | "output_type": "execute_result"
838 | }
839 | ],
840 | "source": [
841 | "tags = pd.read_csv(\"ml-latest-small/tags.csv\", index_col=[0,1])\n",
842 | "tags[\"time\"] = tags.timestamp.apply(date_ms)\n",
843 | "tags.to_csv(\"tags.csv\")\n",
844 | "tags.head()"
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": 40,
850 | "metadata": {},
851 | "outputs": [
852 | {
853 | "data": {
854 | "text/html": [
855 | "\n",
856 | "\n",
869 | "
\n",
870 | " \n",
871 | " \n",
872 | " | \n",
873 | " userId | \n",
874 | "
\n",
875 | " \n",
876 | " \n",
877 | " \n",
878 | " | 666 | \n",
879 | " 667 | \n",
880 | "
\n",
881 | " \n",
882 | " | 667 | \n",
883 | " 668 | \n",
884 | "
\n",
885 | " \n",
886 | " | 668 | \n",
887 | " 669 | \n",
888 | "
\n",
889 | " \n",
890 | " | 669 | \n",
891 | " 670 | \n",
892 | "
\n",
893 | " \n",
894 | " | 670 | \n",
895 | " 671 | \n",
896 | "
\n",
897 | " \n",
898 | "
\n",
899 | "
"
900 | ],
901 | "text/plain": [
902 | " userId\n",
903 | "666 667\n",
904 | "667 668\n",
905 | "668 669\n",
906 | "669 670\n",
907 | "670 671"
908 | ]
909 | },
910 | "execution_count": 40,
911 | "metadata": {},
912 | "output_type": "execute_result"
913 | }
914 | ],
915 | "source": [
916 | "users = np.unique(np.concatenate(\n",
917 | " (ratings.index.levels[0].values , tags.index.levels[0].values)))\n",
918 | "\n",
919 | "users_df = pd.DataFrame({'userId':users})\n",
920 | "users_df.to_csv(\"users.csv\")\n",
921 | "users_df.tail()"
922 | ]
923 | },
924 | {
925 | "cell_type": "code",
926 | "execution_count": null,
927 | "metadata": {},
928 | "outputs": [],
929 | "source": []
930 | }
931 | ],
932 | "metadata": {
933 | "kernelspec": {
934 | "display_name": "Python 3",
935 | "language": "python",
936 | "name": "python3"
937 | },
938 | "language_info": {
939 | "codemirror_mode": {
940 | "name": "ipython",
941 | "version": 3
942 | },
943 | "file_extension": ".py",
944 | "mimetype": "text/x-python",
945 | "name": "python",
946 | "nbconvert_exporter": "python",
947 | "pygments_lexer": "ipython3",
948 | "version": "3.6.5"
949 | }
950 | },
951 | "nbformat": 4,
952 | "nbformat_minor": 2
953 | }
954 |
--------------------------------------------------------------------------------