├── MovieLens ├── .gitignore ├── extra-imdb-info.ipynb └── data-processing.ipynb ├── README.md ├── WorldCup └── download.ipynb └── .gitignore /MovieLens/.gitignore: -------------------------------------------------------------------------------- 1 | # files 2 | *.zip 3 | *.csv 4 | *.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # neotwork-datasets 2 | Take network datasets and feed them to Neo4j. 3 | -------------------------------------------------------------------------------- /WorldCup/download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.4" 28 | } 29 | }, 30 | "nbformat": 4, 31 | "nbformat_minor": 2 32 | } 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/macos,windows,jupyternotebook 3 | 4 | ### JupyterNotebook ### 5 | .ipynb_checkpoints 6 | */.ipynb_checkpoints/* 7 | 8 | # Remove previous ipynb_checkpoints 9 | # git rm -r .ipynb_checkpoints/ 10 | # 11 | ### macOS ### 12 | *.DS_Store 13 | .AppleDouble 14 | .LSOverride 15 | 16 | # Icon must end with two \r 17 | Icon 18 | 19 | # Thumbnails 20 | ._* 21 | 22 | # Files that might appear in the root of a volume 
23 | .DocumentRevisions-V100 24 | .fseventsd 25 | .Spotlight-V100 26 | .TemporaryItems 27 | .Trashes 28 | .VolumeIcon.icns 29 | .com.apple.timemachine.donotpresent 30 | 31 | # Directories potentially created on remote AFP share 32 | .AppleDB 33 | .AppleDesktop 34 | Network Trash Folder 35 | Temporary Items 36 | .apdisk 37 | 38 | ### Windows ### 39 | # Windows thumbnail cache files 40 | Thumbs.db 41 | ehthumbs.db 42 | ehthumbs_vista.db 43 | 44 | # Folder config file 45 | Desktop.ini 46 | 47 | # Recycle Bin used on file shares 48 | $RECYCLE.BIN/ 49 | 50 | # Windows Installer files 51 | *.cab 52 | *.msi 53 | *.msm 54 | *.msp 55 | 56 | # Windows shortcuts 57 | *.lnk 58 | 59 | 60 | # End of https://www.gitignore.io/api/macos,windows,jupyternotebook 61 | 62 | # files 63 | *.zip 64 | *.csv 65 | -------------------------------------------------------------------------------- /MovieLens/extra-imdb-info.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from imdb import IMDb" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/html": [ 21 | "
\n", 22 | "\n", 35 | "\n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | "
titleimdbIdtmdbIdyear
movieId
1Toy Story1147098621995
2Jumanji11349788441995
3Grumpier Old Men113228156021995
4Waiting to Exhale114885313571995
5Father of the Bride Part II113041118621995
\n", 90 | "
" 91 | ], 92 | "text/plain": [ 93 | " title imdbId tmdbId year\n", 94 | "movieId \n", 95 | "1 Toy Story 114709 862 1995\n", 96 | "2 Jumanji 113497 8844 1995\n", 97 | "3 Grumpier Old Men 113228 15602 1995\n", 98 | "4 Waiting to Exhale 114885 31357 1995\n", 99 | "5 Father of the Bride Part II 113041 11862 1995" 100 | ] 101 | }, 102 | "execution_count": 2, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "movies = pd.read_csv(\"movies.clean.csv\", index_col=0)\n", 109 | "movies.head()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "ia = IMDb()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 20, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "['cast',\n", 130 | " 'genres',\n", 131 | " 'runtimes',\n", 132 | " 'countries',\n", 133 | " 'country codes',\n", 134 | " 'language codes',\n", 135 | " 'color info',\n", 136 | " 'aspect ratio',\n", 137 | " 'sound mix',\n", 138 | " 'certificates',\n", 139 | " 'original air date',\n", 140 | " 'rating',\n", 141 | " 'votes',\n", 142 | " 'cover url',\n", 143 | " 'plot outline',\n", 144 | " 'languages',\n", 145 | " 'title',\n", 146 | " 'year',\n", 147 | " 'kind',\n", 148 | " 'directors',\n", 149 | " 'writers',\n", 150 | " 'producers',\n", 151 | " 'composers',\n", 152 | " 'editors',\n", 153 | " 'editorial department',\n", 154 | " 'casting directors',\n", 155 | " 'art directors',\n", 156 | " 'production managers ',\n", 157 | " 'art department',\n", 158 | " 'sound department',\n", 159 | " 'visual effects',\n", 160 | " 'camera department',\n", 161 | " 'animation department',\n", 162 | " 'casting department',\n", 163 | " 'music department',\n", 164 | " 'miscellaneous',\n", 165 | " 'akas',\n", 166 | " 'writer',\n", 167 | " 'director',\n", 168 | " 'top 250 rank',\n", 169 | " 'plot',\n", 170 | " 'synopsis',\n", 171 | " 
'canonical title',\n", 172 | " 'long imdb title',\n", 173 | " 'long imdb canonical title',\n", 174 | " 'smart canonical title',\n", 175 | " 'smart long imdb canonical title',\n", 176 | " 'full-size cover url']" 177 | ] 178 | }, 179 | "execution_count": 20, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "movie = ia.get_movie(114709)\n", 186 | "movie.keys()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 19, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "Lasseter, John\n" 199 | ] 200 | }, 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "['name', 'canonical name', 'long imdb name', 'long imdb canonical name']" 205 | ] 206 | }, 207 | "execution_count": 19, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "one_director = movie[\"director\"][0]\n", 214 | "print(one_director['canonical name'])\n", 215 | "one_director.keys()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.6.5" 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 2 247 | } 248 | -------------------------------------------------------------------------------- /MovieLens/data-processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 25, 6 | 
"metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import os\n", 12 | "import re" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 25 | " Dload Upload Total Spent Left Speed\n", 26 | "100 896k 100 896k 0 0 593k 0 0:00:01 0:00:01 --:--:-- 179k 593k\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "ml_latest_small = \"ml-latest-small.zip\"\n", 32 | "\n", 33 | "!curl -o $ml_latest_small http://files.grouplens.org/datasets/movielens/ml-latest-small.zip" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import zipfile\n", 43 | "with zipfile.ZipFile(ml_latest_small, 'r') as zip_ref:\n", 44 | " zip_ref.extractall(\".\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/html": [ 55 | "
\n", 56 | "\n", 69 | "\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | "
titlegenresimdbIdtmdbId
movieId
1Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy114709862
2Jumanji (1995)Adventure|Children|Fantasy1134978844
3Grumpier Old Men (1995)Comedy|Romance11322815602
4Waiting to Exhale (1995)Comedy|Drama|Romance11488531357
5Father of the Bride Part II (1995)Comedy11304111862
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | " title \\\n", 128 | "movieId \n", 129 | "1 Toy Story (1995) \n", 130 | "2 Jumanji (1995) \n", 131 | "3 Grumpier Old Men (1995) \n", 132 | "4 Waiting to Exhale (1995) \n", 133 | "5 Father of the Bride Part II (1995) \n", 134 | "\n", 135 | " genres imdbId tmdbId \n", 136 | "movieId \n", 137 | "1 Adventure|Animation|Children|Comedy|Fantasy 114709 862 \n", 138 | "2 Adventure|Children|Fantasy 113497 8844 \n", 139 | "3 Comedy|Romance 113228 15602 \n", 140 | "4 Comedy|Drama|Romance 114885 31357 \n", 141 | "5 Comedy 113041 11862 " 142 | ] 143 | }, 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "movies = pd.read_csv(\"ml-latest-small/movies.csv\", index_col=0)\n", 151 | "links = pd.read_csv(\"ml-latest-small/links.csv\", index_col=0)\n", 152 | "movies = pd.merge(movies, links, left_index=True, right_index=True)\n", 153 | "movies.tmdbId = movies.tmdbId.apply(lambda v: int(v) if pd.notna(v) else -1)\n", 154 | "movies.head()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 6, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "Witchfinder General (Conquerer Worm, The)\n" 167 | ] 168 | }, 169 | { 170 | "data": { 171 | "text/html": [ 172 | "
\n", 173 | "\n", 186 | "\n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | "
titlegenresimdbIdtmdbIdyear
movieId
1Toy StoryAdventure|Animation|Children|Comedy|Fantasy1147098621995
2JumanjiAdventure|Children|Fantasy11349788441995
3Grumpier Old MenComedy|Romance113228156021995
4Waiting to ExhaleComedy|Drama|Romance114885313571995
5Father of the Bride Part IIComedy113041118621995
\n", 248 | "
" 249 | ], 250 | "text/plain": [ 251 | " title \\\n", 252 | "movieId \n", 253 | "1 Toy Story \n", 254 | "2 Jumanji \n", 255 | "3 Grumpier Old Men \n", 256 | "4 Waiting to Exhale \n", 257 | "5 Father of the Bride Part II \n", 258 | "\n", 259 | " genres imdbId tmdbId year \n", 260 | "movieId \n", 261 | "1 Adventure|Animation|Children|Comedy|Fantasy 114709 862 1995 \n", 262 | "2 Adventure|Children|Fantasy 113497 8844 1995 \n", 263 | "3 Comedy|Romance 113228 15602 1995 \n", 264 | "4 Comedy|Drama|Romance 114885 31357 1995 \n", 265 | "5 Comedy 113041 11862 1995 " 266 | ] 267 | }, 268 | "execution_count": 6, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "year = re.compile(\"\\(([0-9]{4})\\)$\")\n", 275 | "\n", 276 | "def get_year(date):\n", 277 | " match = year.search(date.strip())\n", 278 | " if match:\n", 279 | " return int(match.group(1))\n", 280 | " return -1\n", 281 | "\n", 282 | "shave_year = lambda title: title[:-7] if year.search(title) else title\n", 283 | "\n", 284 | "print(shave_year(\"Witchfinder General (Conquerer Worm, The) (1968)\"))\n", 285 | "\n", 286 | "movies[\"year\"] = movies.title.apply(get_year)\n", 287 | "movies[\"title\"] = movies.title.apply(shave_year)\n", 288 | "movies.head()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 7, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/html": [ 299 | "
\n", 300 | "\n", 313 | "\n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | "
titlegenresimdbIdtmdbIdyear
movieId
1Toy StoryAdventure|Animation|Children|Comedy|Fantasy1147098621995
2JumanjiAdventure|Children|Fantasy11349788441995
3Grumpier Old MenComedy|Romance113228156021995
4Waiting to ExhaleComedy|Drama|Romance114885313571995
5Father of the Bride Part IIComedy113041118621995
\n", 375 | "
" 376 | ], 377 | "text/plain": [ 378 | " title \\\n", 379 | "movieId \n", 380 | "1 Toy Story \n", 381 | "2 Jumanji \n", 382 | "3 Grumpier Old Men \n", 383 | "4 Waiting to Exhale \n", 384 | "5 Father of the Bride Part II \n", 385 | "\n", 386 | " genres imdbId tmdbId year \n", 387 | "movieId \n", 388 | "1 Adventure|Animation|Children|Comedy|Fantasy 114709 862 1995 \n", 389 | "2 Adventure|Children|Fantasy 113497 8844 1995 \n", 390 | "3 Comedy|Romance 113228 15602 1995 \n", 391 | "4 Comedy|Drama|Romance 114885 31357 1995 \n", 392 | "5 Comedy 113041 11862 1995 " 393 | ] 394 | }, 395 | "execution_count": 7, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "movies = movies[movies.year!=-1]\n", 402 | "movies.head()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 8, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "get_genre_set = lambda g: g.split('|')\n", 412 | "genres = movies.genres.apply(get_genre_set)\n", 413 | "movies.drop(\"genres\", axis=1, inplace=True)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 9, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/html": [ 424 | "
\n", 425 | "\n", 438 | "\n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | "
titleimdbIdtmdbIdyear
movieId
1Toy Story1147098621995
2Jumanji11349788441995
3Grumpier Old Men113228156021995
4Waiting to Exhale114885313571995
5Father of the Bride Part II113041118621995
\n", 493 | "
" 494 | ], 495 | "text/plain": [ 496 | " title imdbId tmdbId year\n", 497 | "movieId \n", 498 | "1 Toy Story 114709 862 1995\n", 499 | "2 Jumanji 113497 8844 1995\n", 500 | "3 Grumpier Old Men 113228 15602 1995\n", 501 | "4 Waiting to Exhale 114885 31357 1995\n", 502 | "5 Father of the Bride Part II 113041 11862 1995" 503 | ] 504 | }, 505 | "execution_count": 9, 506 | "metadata": {}, 507 | "output_type": "execute_result" 508 | } 509 | ], 510 | "source": [ 511 | "movies.to_csv(\"movies.clean.csv\")\n", 512 | "movies.head()" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 10, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/html": [ 523 | "
\n", 524 | "\n", 537 | "\n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | "
genre
movieId
1Adventure
1Animation
1Children
1Comedy
1Fantasy
\n", 571 | "
" 572 | ], 573 | "text/plain": [ 574 | " genre\n", 575 | "movieId \n", 576 | "1 Adventure\n", 577 | "1 Animation\n", 578 | "1 Children\n", 579 | "1 Comedy\n", 580 | "1 Fantasy" 581 | ] 582 | }, 583 | "execution_count": 10, 584 | "metadata": {}, 585 | "output_type": "execute_result" 586 | } 587 | ], 588 | "source": [ 589 | "genres_assignation = []\n", 590 | "for i, gen in genres.iteritems():\n", 591 | " for gnre in gen:\n", 592 | " genres_assignation.append([i, gnre])\n", 593 | "genres_df = pd.DataFrame(genres_assignation, columns=[\"movieId\", \"genre\"]).set_index(\"movieId\")\n", 594 | "genres_df.to_csv(\"genres.csv\")\n", 595 | "genres_df.head()" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 11, 601 | "metadata": {}, 602 | "outputs": [ 603 | { 604 | "name": "stdout", 605 | "output_type": "stream", 606 | "text": [ 607 | "2007-01-15T16:29:38\n" 608 | ] 609 | } 610 | ], 611 | "source": [ 612 | "import datetime\n", 613 | "'2015-06-24T12:50:35.556+0100'\n", 614 | "def date_ms(timestamp):\n", 615 | " return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%dT%H:%M:%S')" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 12, 621 | "metadata": {}, 622 | "outputs": [ 623 | { 624 | "data": { 625 | "text/html": [ 626 | "
\n", 627 | "\n", 640 | "\n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | "
ratingtimestamptime
userIdmovieId
1312.512607591442009-12-14T02:52:24
10293.012607591792009-12-14T02:52:59
10613.012607591822009-12-14T02:53:02
11292.012607591852009-12-14T02:53:05
11724.012607592052009-12-14T02:53:25
12632.012607591512009-12-14T02:52:31
12872.012607591872009-12-14T02:53:07
12932.012607591482009-12-14T02:52:28
13393.512607591252009-12-14T02:52:05
13432.012607591312009-12-14T02:52:11
\n", 721 | "
" 722 | ], 723 | "text/plain": [ 724 | " rating timestamp time\n", 725 | "userId movieId \n", 726 | "1 31 2.5 1260759144 2009-12-14T02:52:24\n", 727 | " 1029 3.0 1260759179 2009-12-14T02:52:59\n", 728 | " 1061 3.0 1260759182 2009-12-14T02:53:02\n", 729 | " 1129 2.0 1260759185 2009-12-14T02:53:05\n", 730 | " 1172 4.0 1260759205 2009-12-14T02:53:25\n", 731 | " 1263 2.0 1260759151 2009-12-14T02:52:31\n", 732 | " 1287 2.0 1260759187 2009-12-14T02:53:07\n", 733 | " 1293 2.0 1260759148 2009-12-14T02:52:28\n", 734 | " 1339 3.5 1260759125 2009-12-14T02:52:05\n", 735 | " 1343 2.0 1260759131 2009-12-14T02:52:11" 736 | ] 737 | }, 738 | "execution_count": 12, 739 | "metadata": {}, 740 | "output_type": "execute_result" 741 | } 742 | ], 743 | "source": [ 744 | "ratings = pd.read_csv(\"ml-latest-small/ratings.csv\", index_col=[0,1])\n", 745 | "ratings[\"time\"] = ratings.timestamp.apply(date_ms)\n", 746 | "ratings.to_csv(\"ratings.csv\")\n", 747 | "ratings.head(10) " 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 39, 753 | "metadata": {}, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/html": [ 758 | "
\n", 759 | "\n", 772 | "\n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | "
tagtimestamptime
userIdmovieId
15339sandra 'boring' bullock11385377702006-01-29T12:29:30
1955dentist11934350612007-10-26T22:44:21
7478Cambodia11705609972007-02-04T03:49:57
32892Russian11706263662007-02-04T21:59:26
34162forgettable11413917652006-03-03T13:16:05
\n", 823 | "
" 824 | ], 825 | "text/plain": [ 826 | " tag timestamp time\n", 827 | "userId movieId \n", 828 | "15 339 sandra 'boring' bullock 1138537770 2006-01-29T12:29:30\n", 829 | " 1955 dentist 1193435061 2007-10-26T22:44:21\n", 830 | " 7478 Cambodia 1170560997 2007-02-04T03:49:57\n", 831 | " 32892 Russian 1170626366 2007-02-04T21:59:26\n", 832 | " 34162 forgettable 1141391765 2006-03-03T13:16:05" 833 | ] 834 | }, 835 | "execution_count": 39, 836 | "metadata": {}, 837 | "output_type": "execute_result" 838 | } 839 | ], 840 | "source": [ 841 | "tags = pd.read_csv(\"ml-latest-small/tags.csv\", index_col=[0,1])\n", 842 | "tags[\"time\"] = tags.timestamp.apply(date_ms)\n", 843 | "tags.to_csv(\"tags.csv\")\n", 844 | "tags.head()" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 40, 850 | "metadata": {}, 851 | "outputs": [ 852 | { 853 | "data": { 854 | "text/html": [ 855 | "
\n", 856 | "\n", 869 | "\n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | "
userId
666667
667668
668669
669670
670671
\n", 899 | "
" 900 | ], 901 | "text/plain": [ 902 | " userId\n", 903 | "666 667\n", 904 | "667 668\n", 905 | "668 669\n", 906 | "669 670\n", 907 | "670 671" 908 | ] 909 | }, 910 | "execution_count": 40, 911 | "metadata": {}, 912 | "output_type": "execute_result" 913 | } 914 | ], 915 | "source": [ 916 | "users = np.unique(np.concatenate(\n", 917 | " (ratings.index.levels[0].values , tags.index.levels[0].values)))\n", 918 | "\n", 919 | "users_df = pd.DataFrame({'userId':users})\n", 920 | "users_df.to_csv(\"users.csv\")\n", 921 | "users_df.tail()" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": {}, 928 | "outputs": [], 929 | "source": [] 930 | } 931 | ], 932 | "metadata": { 933 | "kernelspec": { 934 | "display_name": "Python 3", 935 | "language": "python", 936 | "name": "python3" 937 | }, 938 | "language_info": { 939 | "codemirror_mode": { 940 | "name": "ipython", 941 | "version": 3 942 | }, 943 | "file_extension": ".py", 944 | "mimetype": "text/x-python", 945 | "name": "python", 946 | "nbconvert_exporter": "python", 947 | "pygments_lexer": "ipython3", 948 | "version": "3.6.5" 949 | } 950 | }, 951 | "nbformat": 4, 952 | "nbformat_minor": 2 953 | } 954 | --------------------------------------------------------------------------------