├── movie_time ├── __init__.py ├── wsgi.py ├── urls.py └── settings.py ├── first_time_setup ├── __init__.py ├── downloads │ └── __init__.py ├── movie_similarity.py └── run.py ├── movie_time_app ├── __init__.py ├── movies │ ├── __init__.py │ ├── loader.py │ ├── search.py │ ├── detail.py │ └── homepage.py ├── migrations │ ├── __init__.py │ └── 0001_initial.py ├── static │ ├── css │ │ └── app.css │ └── logo.png ├── tests.py ├── apps.py ├── admin.py ├── urls.py ├── templates │ ├── search.html │ ├── movie_detail.html │ └── index.html ├── models.py └── views.py ├── requirements.txt ├── manage.py ├── LICENSE ├── .gitignore ├── README.md └── movie_time_investigation.ipynb /movie_time/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /first_time_setup/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /movie_time_app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /movie_time_app/movies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /movie_time_app/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /movie_time_app/static/css/app.css: -------------------------------------------------------------------------------- 1 | ul#menu li { 2 | display:inline; 3 | } -------------------------------------------------------------------------------- /first_time_setup/downloads/__init__.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | path = Path(*__path__) -------------------------------------------------------------------------------- /movie_time_app/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /movie_time_app/static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/osama-haggag/movie-time/HEAD/movie_time_app/static/logo.png -------------------------------------------------------------------------------- /movie_time_app/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class MovieTimeAppConfig(AppConfig): 5 | name = 'movie_time_app' 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Django==1.11.1 2 | pandas==0.20.1 3 | Pillow==4.1.1 4 | scikit-learn==0.18.1 5 | scipy==0.19.0 6 | requests==2.14.1 7 | tqdm==4.11.2 -------------------------------------------------------------------------------- /movie_time_app/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | from movie_time_app.models import Movie, Similarity, Tag 5 | 6 | admin.site.register(Movie) 7 | admin.site.register(Similarity) 8 | admin.site.register(Tag) -------------------------------------------------------------------------------- /movie_time_app/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import url 2 | 3 | from . 
# URL routes of the movie_time_app application.
#
# The detail and update routes capture the numeric id from the path as the
# named group ``movie_id``; ``views.detail`` and ``views.update`` both take a
# ``movie_id`` argument, so the group name has to match.  A nameless
# ``(?P[0-9]+)`` is not a valid regular expression.
urlpatterns = [
    url(r'^$', views.index, name='index'),
    url(r'^(?P<movie_id>[0-9]+)/$', views.detail, name='detail'),
    url(r'^search/$', views.search, name='search'),
    url(r'^update/(?P<movie_id>[0-9]+)/$', views.update, name='update'),
]
10 | 11 |
12 |

Search results:

13 |
14 | 19 |
def load_unrelatable_movies(n):
    """Return at most ``n`` movies flagged ``relatable=False``, in random order."""
    return Movie.objects.filter(relatable=False).order_by('?')[:n]


def _filter_movies(similarities, n):
    """Return ``(title, movie_id)`` tuples for the first ``n`` similarities.

    Each tuple describes the ``second_movie`` side of the similarity row.
    """
    return [
        (similarity.second_movie.title, similarity.second_movie.movie_id)
        for similarity in similarities[:n]
    ]


def load_similar_movies(movie, n):
    """Return the ``n`` movies most similar to ``movie`` that are not yet rated.

    Only similarities whose second movie has ``liked_or_not`` unset are
    considered, and the movie itself is excluded.  Results are ordered by
    descending ``similarity_score``.

    :param movie: object exposing ``movie_id`` (a ``Movie`` instance).
    :param n: maximum number of similar movies to return.
    :returns: list of ``(title, movie_id)`` tuples.
    """
    similarities = (
        Similarity.objects
        .filter(first_movie=movie.movie_id, second_movie__liked_or_not=None)
        .exclude(second_movie__movie_id=movie.movie_id)
        # Fetch the related movie in the same query; otherwise reading
        # ``similarity.second_movie`` issues one extra query per row.
        .select_related('second_movie')
        .order_by('-similarity_score')
    )
    return _filter_movies(similarities, n)
SEARCH_TEMPLATE_NAME = 'search.html'


def _combine_filters(all_movies, query_elems):
    """Keep the movies whose title contains every element of ``query_elems``.

    The match is case-insensitive (``title__icontains``).  With no query
    elements there is nothing to AND together, so an empty queryset is
    returned instead of letting ``reduce`` fail on an empty sequence.
    """
    if not query_elems:
        return all_movies.none()
    return all_movies.filter(
        reduce(operator.and_, (Q(title__icontains=q) for q in query_elems))
    )


def _prepare_as_context(filtered):
    """Wrap the filtered movies in the context dict used by the search template."""
    return {'search_results': filtered}


def _filter_for_query(query):
    """Split ``query`` on whitespace and build the search context for it.

    ``query`` may be ``None`` (parameter missing); that is treated like an
    empty query.
    """
    query_elems = (query or '').split()
    filtered = _combine_filters(Movie.objects.all(), query_elems)
    return _prepare_as_context(filtered)


def search_for_query(request):
    """Run a title search for the ``q`` entry of ``request``.

    :param request: mapping with a ``get`` method holding the query
        parameters (the search view passes ``request.GET``).
    :returns: tuple ``(context, template_name)``.
    """
    query = request.get('q')
    result = _filter_for_query(query)
    return result, SEARCH_TEMPLATE_NAME
INDEX_TEMPLATE_NAME = 'index.html'


def detail(request, movie_id):
    """Render the detail page of the movie identified by ``movie_id``."""
    movie_detail, template_name = load_movie_detail(movie_id)
    return render(request, template_name, context=movie_detail)


def search(request):
    """Render the search results for the query parameters in ``request.GET``."""
    search_results, template_name = search_for_query(request.GET)
    return render(request, template_name, context=search_results)


def update(request, movie_id):
    """Store the like / dislike / reset choice for a movie and redirect back.

    The POST keys ``liked``, ``disliked`` and ``reset`` set ``liked_or_not``
    to ``True``, ``False`` and ``None`` respectively; with none of them
    present the movie is saved unchanged.

    Responds with 404 when no movie has primary key ``movie_id``.  The
    redirect goes to the referring page, or to the ``index`` route when the
    request carries no ``Referer`` header.
    """
    # Local import: this name is not part of the module's existing imports.
    from django.shortcuts import get_object_or_404

    movie = get_object_or_404(Movie, pk=movie_id)
    if 'liked' in request.POST:
        movie.liked_or_not = True
    elif 'disliked' in request.POST:
        movie.liked_or_not = False
    elif 'reset' in request.POST:
        movie.liked_or_not = None

    movie.save()
    # HTTP_REFERER is optional; redirecting to None would send the browser
    # to the relative URL "None".
    redirect_to = request.META.get('HTTP_REFERER') or reverse('index')
    return HttpResponseRedirect(redirect_to)


def index(request):
    """Render the homepage with the three recommendation lists."""
    liked, not_liked, random = load_homepage_recommendations()
    context = {
        'liked': liked,
        'not_liked': not_liked,
        'random': random
    }
    return render(request, INDEX_TEMPLATE_NAME, context=context)
similar_movies 31 | } 32 | return context 33 | 34 | 35 | def load_movie_detail(movie_id): 36 | movie_object = Movie.objects.get(movie_id=movie_id) 37 | similar_movies = load_similar_movies(movie_object, DETAIL_TOP_N_SIMILARITIES) 38 | detail = _prepare_context(movie_object, similar_movies) 39 | return detail, DETAIL_TEMPLATE_NAME -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | .hypothesis/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | local_settings.py 54 | 55 | # Flask stuff: 56 | instance/ 57 | .webassets-cache 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IPython Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # dotenv 78 | .env 79 | 80 | # virtualenv 81 | venv/ 82 | ENV/ 83 | 84 | # Spyder project settings 85 | .spyderproject 86 | 87 | # Rope project settings 88 | .ropeproject 89 | 90 | # PyCharm stuff 91 | .project 92 | .pydevproject 93 | .settings 94 | *~ 95 | *# 96 | .coverage 97 | .idea 98 | .cache 99 | -------------------------------------------------------------------------------- /movie_time_app/templates/movie_detail.html: -------------------------------------------------------------------------------- 1 | {% load staticfiles %} 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
10 | 11 |
12 |

{{ movie.name }}:

13 |
14 | Rating: {{ movie.movielens_rating|floatformat:2 }} by {{ movie.movielens_num_ratings }} users
15 | Top 10 Tags: {{ movie.tags }}
16 | {% if movie.liked_or_not is True %} 17 | You LIKED this movie 18 | {% elif movie.liked_or_not is False %} 19 | You DISLIKED this movie 20 | {% else %} 21 | You haven't rated this movie yet 22 | {% endif %}
23 | IMDB: {{ links.imdb }}

24 |
25 | {% csrf_token %} 26 | 27 | 28 | 29 |
30 |
31 |
32 | {% if movie.relatable %} 33 | Similar movies you have not yet watched:
34 | {% for movie_title, movie_id in similarities %} 35 | {{ movie_title }}
36 | {% endfor %} 37 | {% else %} 38 | Unfortunately this movie has no tags and cannot be related to other movies 39 | {% endif %} 40 |
TOP_N = 10


def _get_similar_movies(movies):
    """Return the ``Movie`` queryset of everything similar to any of ``movies``.

    Each movie contributes the ids of up to ``TOP_N`` similar movies.
    """
    similar_ids = [
        similar_id
        for movie in movies
        for _title, similar_id in load_similar_movies(movie, TOP_N)
    ]
    return Movie.objects.filter(movie_id__in=similar_ids)


def _load_similar_to_not_liked_movies(watched_movies):
    """Movies similar to relatable disliked ones, best mean rating first."""
    disliked = watched_movies.filter(liked_or_not=False, relatable=True)
    return _get_similar_movies(disliked).order_by('-rating_mean')


def _load_similar_to_liked_movies(watched_movies):
    """Movies similar to relatable liked ones, in random order."""
    liked = watched_movies.filter(liked_or_not=True, relatable=True)
    return _get_similar_movies(liked).order_by('?')


def _exclude_movies_in_similar_to_liked(similar_to_non_liked_movies, similar_to_liked_movies):
    """Cut both lists to ``TOP_N`` entries.

    Movies that appear among the similar-to-liked set are removed from the
    similar-to-disliked list before it is cut.

    :returns: tuple ``(from_liked, from_disliked)``.
    """
    liked_ids = similar_to_liked_movies.values_list('movie_id', flat=True)
    without_liked = similar_to_non_liked_movies.exclude(movie_id__in=liked_ids)
    return similar_to_liked_movies[:TOP_N], without_liked[:TOP_N]


def load_homepage_recommendations():
    """Build the three recommendation lists shown on the homepage.

    :returns: tuple ``(from_liked, from_disliked, random_unrelatable)``.
    """
    rated = Movie.objects.filter(liked_or_not__isnull=False)
    near_liked = _load_similar_to_liked_movies(rated)
    near_disliked = _load_similar_to_not_liked_movies(rated)
    random_unrelatable = load_unrelatable_movies(TOP_N)

    from_liked, from_disliked = _exclude_movies_in_similar_to_liked(near_disliked, near_liked)
    return from_liked, from_disliked, random_unrelatable
12 | 15 | 21 |
22 |
23 |

Similar to movies you liked!

24 |
    25 | {% if liked|length == 0 %} 26 | Like more movies to see recommendations here 27 | {% else %} 28 | {% for movie in liked %} 29 | 30 |
  • {{ movie.title }}

  • 31 | 32 | {% endfor %} 33 | {% endif %} 34 |
35 |
36 |
37 |

Let some of these surprise you!

38 |
    39 | {% if not_liked|length == 0 %} 40 | Dislike movies to see well received similar movies here (maybe they got it right making these movies!) 41 | {% else %} 42 | {% for movie in not_liked %} 43 | 44 |
  • {{ movie.title }}

  • 45 | 46 | {% endfor %} 47 | {% endif %} 48 |
49 |
50 |
51 |

What's life without a little randomness?

52 | 59 |
60 | 61 | -------------------------------------------------------------------------------- /movie_time_app/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.11.1 on 2017-05-24 17:28 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | initial = True 12 | 13 | dependencies = [ 14 | ] 15 | 16 | operations = [ 17 | migrations.CreateModel( 18 | name='Movie', 19 | fields=[ 20 | ('movie_id', models.IntegerField(primary_key=True, serialize=False)), 21 | ('title', models.CharField(max_length=200)), 22 | ('poster', models.ImageField(blank=True, null=True, upload_to='')), 23 | ('year', models.IntegerField(null=True)), 24 | ('genres', models.CharField(max_length=200)), 25 | ('num_ratings', models.IntegerField(null=True)), 26 | ('rating_median', models.FloatField(null=True)), 27 | ('rating_mean', models.FloatField(null=True)), 28 | ('relatable', models.BooleanField(default=True)), 29 | ('liked_or_not', models.NullBooleanField()), 30 | ], 31 | ), 32 | migrations.CreateModel( 33 | name='OnlineLink', 34 | fields=[ 35 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 36 | ('imdb_id', models.CharField(max_length=50)), 37 | ('movie', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='movie_time_app.Movie')), 38 | ], 39 | ), 40 | migrations.CreateModel( 41 | name='Similarity', 42 | fields=[ 43 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 44 | ('similarity_score', models.FloatField()), 45 | ('first_movie', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='first_movie', to='movie_time_app.Movie')), 46 | ('second_movie', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, 
related_name='second_movie', to='movie_time_app.Movie')), 47 | ], 48 | ), 49 | migrations.CreateModel( 50 | name='Tag', 51 | fields=[ 52 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 53 | ('tag', models.CharField(max_length=50)), 54 | ('relevance', models.FloatField()), 55 | ('movie', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='movie_time_app.Movie')), 56 | ], 57 | ), 58 | ] 59 | -------------------------------------------------------------------------------- /movie_time/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for movie_time project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.11.1. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.11/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '^txb_x=#ms-5pacoy_)3jkbnx6$tkv4df-i9g)w%m*e%5#o*v%' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = ['127.0.0.1',] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | 'movie_time_app' 41 | ] 42 | 43 | MIDDLEWARE = [ 44 | 'django.middleware.security.SecurityMiddleware', 45 | 'django.contrib.sessions.middleware.SessionMiddleware', 46 | 'django.middleware.common.CommonMiddleware', 47 | 'django.middleware.csrf.CsrfViewMiddleware', 48 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 49 | 'django.contrib.messages.middleware.MessageMiddleware', 50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 51 | ] 52 | 53 | ROOT_URLCONF = 'movie_time.urls' 54 | 55 | TEMPLATES = [ 56 | { 57 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 58 | 'DIRS': [], 59 | 'APP_DIRS': True, 60 | 'OPTIONS': { 61 | 'context_processors': [ 62 | 'django.template.context_processors.debug', 63 | 'django.template.context_processors.request', 64 | 'django.contrib.auth.context_processors.auth', 65 | 'django.contrib.messages.context_processors.messages', 66 | ], 67 | }, 68 | }, 69 | ] 70 | 71 | WSGI_APPLICATION = 'movie_time.wsgi.application' 72 | 73 | 74 | # Database 75 | # https://docs.djangoproject.com/en/1.11/ref/settings/#databases 76 | 77 | DATABASES = { 78 | 'default': { 79 | 'ENGINE': 'django.db.backends.sqlite3', 80 | 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 81 | } 82 | } 83 | 84 | 85 | # Password validation 86 | # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators 87 | 88 | AUTH_PASSWORD_VALIDATORS = [ 89 | { 90 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 91 | }, 92 | { 93 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 94 | }, 95 | { 96 | 'NAME': 
RELEVANCE_CUTOFF = 0.3


def _concatenate_tags_of_movie(tags):
    """Join the distinct tags of one movie into a space-separated string.

    The tags are sorted so the result does not depend on set iteration order.
    """
    return ' '.join(sorted(set(tags)))


def _get_tags_per_movie(genome_scores, genome_tags):
    """Return one row per movie with its relevant tag ids as a single string.

    A tag is relevant for a movie when its ``relevance`` exceeds
    ``RELEVANCE_CUTOFF``.

    :returns: DataFrame with columns ``movieId`` and ``movie_tags``.
    """
    relevant_tags = genome_scores[genome_scores.relevance > RELEVANCE_CUTOFF][['movieId', 'tagId']]
    tag_ids = pd.merge(relevant_tags, genome_tags, on='tagId', how='left')[['movieId', 'tagId']]
    # Tag ids become string tokens so they can be joined into one "document".
    tag_ids = tag_ids.assign(tagId=tag_ids.tagId.astype(str))
    # agg(func) + rename instead of agg({'name': func}): the dict-renaming
    # form is deprecated since pandas 0.20 and raises from pandas 1.0.
    tags_per_movie = (
        tag_ids.groupby('movieId')['tagId']
        .agg(_concatenate_tags_of_movie)
        .rename('movie_tags')
    )
    return tags_per_movie.reset_index()


def _calculate_avg_movie_ratings(movie_ratings):
    """Return mean, median and count of the ratings of every movie.

    :returns: DataFrame with columns ``movieId``, ``rating_mean``,
        ``rating_median`` and ``num_ratings``.
    """
    # A list of aggregations followed by a column rename works on old and
    # new pandas alike, unlike the removed dict-renaming form.
    avg_ratings = (
        movie_ratings.groupby('movieId')['rating']
        .agg(['mean', 'median', 'size'])
        .rename(columns={
            'mean': 'rating_mean',
            'median': 'rating_median',
            'size': 'num_ratings',
        })
    )
    return avg_ratings.reset_index()


def _extract_year_from_movie_title(movie_title):
    """Return the last four-digit run in ``movie_title`` as an int.

    Returns ``np.nan`` when the title contains no four-digit run.
    """
    matches = re.findall(r'\d{4}', movie_title)
    if not matches:
        return np.nan
    return int(matches[-1])


def _gather_dataset(movie_names, avg_movie_ratings, tags_per_movie):
    """Merge names, rating statistics and tags into one dataset.

    :returns: tuple ``(dataset_with_tags, unrelatable_movies)``; the first
        holds the movies that have tags (index reset to 0..n-1), the second
        the movies without tags.
    """
    print("merging files into one dataset dataframe...")
    movies_with_ratings = pd.merge(movie_names, avg_movie_ratings, on='movieId')
    dataset = pd.merge(movies_with_ratings, tags_per_movie, on='movieId', how='left')

    dataset['year'] = dataset.title.apply(_extract_year_from_movie_title)
    dataset.rename(columns={'movieId': 'movie_id'}, inplace=True)

    # A movie is relatable exactly when the left merge found tags for it.
    has_tags = dataset.movie_tags.notnull()
    dataset_with_tags = dataset[has_tags].reset_index(drop=True)
    unrelatable_movies = dataset[~has_tags]
    return dataset_with_tags, unrelatable_movies


def _vectorize_dataset(dataset):
    """Return the TF-IDF matrix of the ``movie_tags`` column."""
    tf_idf = TfidfVectorizer()
    return tf_idf.fit_transform(dataset.movie_tags)


def _match_indices_and_columns_to_ids(dataset, movie_to_movie_matrix):
    """Relabel the similarity matrix with movie ids (modifies it in place).

    Row labels become the movie ids, column labels the movie ids as strings.
    The existing labels are looked up in ``dataset['movie_id']``.
    """
    index_to_movie_id = dataset['movie_id']
    movie_to_movie_matrix.columns = [
        str(index_to_movie_id[int(col)]) for col in movie_to_movie_matrix.columns
    ]
    movie_to_movie_matrix.index = [
        index_to_movie_id[idx] for idx in movie_to_movie_matrix.index
    ]
    return movie_to_movie_matrix


def _stack_matrix_to_db_model(movie_to_movie_matrix):
    """Turn the square matrix into rows of (first id, second id, score)."""
    movie_to_movie_stacked = movie_to_movie_matrix.stack().reset_index()
    movie_to_movie_stacked.columns = ['first_movie_id', 'second_movie_id', 'similarity_score']
    return movie_to_movie_stacked


def _calculate_movie_similarity(dataset, vectorized):
    """Compute pairwise cosine similarity and return it in long format."""
    print("calculating movie to movie similarity...")
    movie_to_movie_matrix = pd.DataFrame(cosine_similarity(vectorized))
    movie_to_movie_matrix = _match_indices_and_columns_to_ids(dataset, movie_to_movie_matrix)
    return _stack_matrix_to_db_model(movie_to_movie_matrix)


def movie_to_movie(genome_scores, genome_tags, movie_names, movie_ratings):
    """Build the movie-to-movie similarity table from the raw dataset frames.

    :returns: tuple ``(movie_to_movie_similarity, dataset_with_tags,
        unrelatable_movies)``.
    """
    tags_per_movie = _get_tags_per_movie(genome_scores, genome_tags)
    avg_movie_ratings = _calculate_avg_movie_ratings(movie_ratings)
    dataset_with_tags, unrelatable_movies = _gather_dataset(movie_names, avg_movie_ratings, tags_per_movie)
    vectorized = _vectorize_dataset(dataset_with_tags)
    movie_to_movie_similarity = _calculate_movie_similarity(dataset_with_tags, vectorized)
    return movie_to_movie_similarity, dataset_with_tags, unrelatable_movies
9 | 10 | [Screenshots!](https://imgur.com/a/cH3cs) 11 | 12 | ## Installation 13 | 14 | ### Virtual Environment 15 | **0- Install python virtual env** 16 | 17 | On Ubuntu: 18 | 19 | $ apt-get install virtualenvwrapper 20 | 21 | or via pip (for Linux, Mac and Linux subsystem for Windows) given you have python installed: 22 | 23 | $ pip install virtualenv virtualenvwrapper 24 | 25 | Reset your terminal and the commands below should be available to use 26 | 27 | **1- Create a *python 3* virtualenv for the project** 28 | 29 | $ mkvirtualenv movie-time -p /usr/bin/python3 30 | 31 | **1.1- Activate the virtualenv if it's not automatically activated** 32 | 33 | $ workon movie-time 34 | 35 | **2- Install the required dependencies** 36 | 37 | (movie-time) $ pip install -r requirements.txt 38 | 39 | ### Getting the movie database ready 40 | 41 | #### Building the DB yourself 42 | 43 | **1- Preferably download the dataset manually from [here](http://files.grouplens.org/datasets/movielens/ml-latest.zip) 44 | then extract it to a folder** 45 | 46 | **2- Run the first time setup script from the project's root directory to populate a 47 | local database with the data needed for the recommendations.** 48 | 49 | This takes anywhere from 50 minutes (on a MacBook Pro) to 50 | 5 hours (on an MSI gaming laptop) and the database amounts to about 7 GB. Just point the script to the directory where 51 | you extracted the dataset, and optionally (but not preferably) the path of the DB, but then you'd have to change it in 52 | the Django settings too. 53 | 54 | (movie-time) $ PYTHONPATH=. python first_time_setup/run.py -i /path/to/extracted/dataset 55 | 56 | #### Downloading the DB from the cloud 57 | 58 | The DB is also backed up on Google Drive, and can be downloaded directly from there but may be slow in case Google sets 59 | bandwidth limits. 60 | 61 | Either download it to the default 62 | path, which is the project root. 
Or, download it to a specific path but be sure to change the django settings to point 63 | to that path. 64 | 65 | Download it from [here](https://drive.google.com/file/d/0B4oaUOQPKT44QzhacnBjSkw1Tjg/view); the download is about 6 GB. 66 | 67 | ### Starting the server 68 | 69 | **1- Start the django server** 70 | 71 | (movie-time) $ python manage.py runserver 72 | 73 | **2- Open the server in a browser (by default it's at [127.0.0.1:8000](http://127.0.0.1:8000/)) and rate away!** 74 | 75 | ## Usage 76 | You can either search for movies you watched and like/dislike them to see more candidates on the homepage, or 77 | search for a movie you know you enjoyed and manually find yourself a promising candidate from the list of similar movies. 78 | 79 | The detail view only shows the 40 movies most similar to the selected movie. 80 | 81 | ## How it works 82 | The "science" behind the system can be found in the 83 | [investigation notebook](/movie_time_investigation.ipynb) 84 | 85 | Recommendations on the homepage are presented in 3 categories, 2 of which are based on the calculated movie-to-movie similarity using 86 | the provided genome tags. 87 | 88 | **1. Movies similar to movies you have liked** 89 | 90 | These are the movies that are the most similar to the ones you've liked. Recommendations on the homepage are presented 91 | as follows: each movie you liked adds N (default 10) of its most similar movies to a pool of recommendations, which will 92 | have `N * number of liked movies` movies in it. Out of this pool N movies (again 10) are randomly selected to ensure 93 | some mixture of your tastes. 94 | 95 | **2. Movies similar to what you have disliked** 96 | 97 | These are the highest rated movies that are the most similar to movies you disliked. The recommendations are presented 98 | simply as a list of movies sorted by their mean_rating in descending order.
The assumption is if a movie you didn't like got 99 | something "wrong", maybe a high-rated and very similar movie to it got it "right". 100 | 101 | **3. A random selection from the 29,000 that have no tags and can't be related due to them not having tags** 102 | -------------------------------------------------------------------------------- /first_time_setup/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import zipfile 4 | 5 | import pandas as pd 6 | import requests 7 | import sqlite3 8 | from tqdm import tqdm 9 | 10 | from first_time_setup.downloads import path 11 | from first_time_setup.movie_similarity import movie_to_movie 12 | from movie_time.settings import BASE_DIR 13 | 14 | LINK_TO_MOVIE_LENS_DATASET = "http://files.grouplens.org/datasets/movielens/ml-latest.zip" 15 | DEFAULT_PATH_TO_DB = os.path.join(BASE_DIR, 'db.sqlite3') 16 | 17 | 18 | def _load_dataset_from_local_path(input_dataset_path): 19 | movie_ratings = pd.read_csv(os.path.join(input_dataset_path, 'ratings.csv'), usecols=['movieId', 'rating']) 20 | genome_scores = pd.read_csv(os.path.join(input_dataset_path, 'genome-scores.csv')) 21 | genome_tags = pd.read_csv(os.path.join(input_dataset_path, 'genome-tags.csv')) 22 | movie_names = pd.read_csv(os.path.join(input_dataset_path, 'movies.csv')) 23 | links = pd.read_csv(os.path.join(input_dataset_path, 'links.csv')) 24 | movie_tags_as_text = pd.merge(genome_scores, genome_tags, on='tagId')[['movieId', 'tag', 'relevance']] 25 | return genome_scores, genome_tags, movie_names, movie_ratings, links, movie_tags_as_text 26 | 27 | 28 | def _download_data(download_path): 29 | file_path = os.path.join(download_path, 'ml-latest.zip') 30 | response = requests.get(LINK_TO_MOVIE_LENS_DATASET, stream=True) 31 | with open(file_path, 'wb') as handle: 32 | for data in tqdm(response.iter_content()): 33 | handle.write(data) 34 | 35 | 36 | def _extract_dataset_from_zip(download_path): 37 | 
file_path = os.path.join(download_path, 'ml-latest.zip') 38 | zip_ref = zipfile.ZipFile(file_path, 'rb') 39 | zip_ref.extractall(download_path) 40 | zip_ref.close() 41 | 42 | 43 | def _download_dataset(): 44 | print("downloading dataset...") 45 | download_path = str(path) 46 | _download_data(download_path) 47 | _extract_dataset_from_zip(download_path) 48 | dataset = _load_dataset_from_local_path(download_path) 49 | return dataset 50 | 51 | 52 | def _load_dataset(input_dataset_path): 53 | print("loading dataset...") 54 | if input_dataset_path is not None: 55 | dataset = _load_dataset_from_local_path(input_dataset_path) 56 | else: 57 | dataset = _download_dataset() 58 | return dataset 59 | 60 | 61 | def _connect_to_database(database_path): 62 | if database_path is None: 63 | database_path = DEFAULT_PATH_TO_DB 64 | db = sqlite3.connect(database_path) 65 | return db 66 | 67 | 68 | def _conform_to_db_model(dataset_with_tags, unrelatable_movies, links_to_imdb, movie_tags_as_text): 69 | links_col_order = ['movie_id', 'imdb_id'] 70 | links_to_imdb.rename(columns={'movieId': 'movie_id', 'imdbId': 'imdb_id'}, inplace=True) 71 | 72 | tags_col_order = ['movie_id', 'tag', 'relevance'] 73 | movie_tags_as_text.rename(columns={'movieId': 'movie_id'}, inplace=True) 74 | 75 | dataset_col_order = ['movie_id', 'title', 'year', 'genres', 'num_ratings', 'rating_median', 'rating_mean', 'relatable'] 76 | dataset_with_tags['relatable'] = True 77 | unrelatable_movies['relatable'] = False 78 | return (dataset_with_tags[dataset_col_order], unrelatable_movies[dataset_col_order], 79 | links_to_imdb[links_col_order], movie_tags_as_text[tags_col_order]) 80 | 81 | 82 | def _write_to_db_with_progress_bar(df, table_name, db_connection): 83 | total_length = len(df) 84 | step = int(total_length / 100) 85 | 86 | with tqdm(total=total_length) as pbar: 87 | for i in range(0, total_length, step): 88 | subset = df[i: i + step] 89 | subset.to_sql(table_name, db_connection, if_exists='append', index=False) 
90 | pbar.update(step) 91 | 92 | 93 | def _populate_database_tables(db_connection, movie_to_movie_similarity, dataset_with_tags, 94 | unrelatable_movies, links_to_imdb, movie_tags_as_text): # Conform the frames to the DB column layout, then append each one to its movie_time_app_* table; tagged and untagged movies share the movie table. 95 | with_tags, without_tags, links, tags = _conform_to_db_model(dataset_with_tags, unrelatable_movies, 96 | links_to_imdb, movie_tags_as_text) 97 | 98 | print("writing movies with tags to DB...") 99 | _write_to_db_with_progress_bar(with_tags, 'movie_time_app_movie', db_connection) 100 | 101 | print("writing movies without tags to DB...") 102 | _write_to_db_with_progress_bar(without_tags, 'movie_time_app_movie', db_connection) 103 | 104 | print("writing online links to DB...") 105 | _write_to_db_with_progress_bar(links, 'movie_time_app_onlinelink', db_connection) 106 | 107 | print("writing movie tags to DB...") 108 | _write_to_db_with_progress_bar(tags, 'movie_time_app_tag', db_connection) 109 | 110 | print("writing movie similarities to DB...") 111 | _write_to_db_with_progress_bar(movie_to_movie_similarity, 'movie_time_app_similarity', db_connection) 112 | 113 | 114 | def main(input_dataset_path, database_path): # Load the dataset, open the DB, compute movie-to-movie similarity, and write all resulting frames to the DB. 115 | genome_scores, genome_tags, movie_names, movie_ratings, links_to_imdb, movie_tags_as_text = _load_dataset(input_dataset_path) 116 | db_connection = _connect_to_database(database_path) 117 | movie_to_movie_similarity, dataset_with_tags, unrelatable_movies = movie_to_movie(genome_scores, genome_tags, 118 | movie_names, movie_ratings) 119 | _populate_database_tables(db_connection, movie_to_movie_similarity, dataset_with_tags, 120 | unrelatable_movies, links_to_imdb, movie_tags_as_text) 121 | 122 | 123 | if __name__ == "__main__": 124 | parser = argparse.ArgumentParser(description="Download MovieLens dataset & fill database with movie similarity matrix") 125 | parser.add_argument('-i', '--input-dataset', type=str, help="Path to dataset folder if already downloaded") 126 | parser.add_argument('-d', '--database', type=str, help="Path to the sqlite DB if not in default
path in the django project") 127 | args = parser.parse_args() 128 | main(args.input_dataset, args.database) 129 | -------------------------------------------------------------------------------- /movie_time_investigation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true, 21 | "deletable": true, 22 | "editable": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": false, 34 | "deletable": true, 35 | "editable": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "ratings = pd.read_csv(\"datasets/ml-latest/ratings.csv\", usecols=['movieId', 'rating'])" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": { 46 | "collapsed": false, 47 | "deletable": true, 48 | "editable": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "genome_scores = pd.read_csv(\"datasets/ml-latest/genome-scores.csv\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": { 59 | "collapsed": true, 60 | "deletable": true, 61 | "editable": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "genome_tags = pd.read_csv(\"datasets/ml-latest/genome-tags.csv\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": { 72 | "collapsed": true, 73 | "deletable": true, 74 | "editable": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "movie_names = pd.read_csv(\"datasets/ml-latest/movies.csv\")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | 
"execution_count": 7, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "links = pd.read_csv(\"datasets/ml-latest/links.csv\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 8, 97 | "metadata": { 98 | "collapsed": false, 99 | "deletable": true, 100 | "editable": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "movie_tags_in_text = pd.merge(genome_scores, genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "deletable": true, 111 | "editable": true 112 | }, 113 | "source": [ 114 | "### Determine a good tag relevancy score cut-off" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 9, 120 | "metadata": { 121 | "collapsed": true, 122 | "deletable": true, 123 | "editable": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "%matplotlib inline" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 10, 133 | "metadata": { 134 | "collapsed": false, 135 | "deletable": true, 136 | "editable": true 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/html": [ 142 | "
\n", 143 | "\n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | "
movieIdtagIdrelevance
0110.02400
1120.02400
2130.05475
3140.09200
4150.14825
\n", 185 | "
" 186 | ], 187 | "text/plain": [ 188 | " movieId tagId relevance\n", 189 | "0 1 1 0.02400\n", 190 | "1 1 2 0.02400\n", 191 | "2 1 3 0.05475\n", 192 | "3 1 4 0.09200\n", 193 | "4 1 5 0.14825" 194 | ] 195 | }, 196 | "execution_count": 10, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "genome_scores.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 11, 208 | "metadata": { 209 | "collapsed": false, 210 | "deletable": true, 211 | "editable": true 212 | }, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/html": [ 217 | "
\n", 218 | "\n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | 
" \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | "
tagId12345678910...1119112011211122112311241125112611271128
movieId
10.024000.024000.054750.092000.148250.215000.066250.270250.260500.03025...0.036500.018000.045250.032750.124500.041750.020000.034750.083500.02525
20.038000.041750.037000.048750.110750.073250.049500.107750.102000.02050...0.039000.019250.017250.024250.134250.022250.016000.014500.096000.02025
30.042000.052500.027250.079750.056250.070250.059750.182750.051750.02725...0.039500.026250.027250.034500.169250.035250.017250.018750.099250.02000
40.036000.038500.035000.031250.071000.045000.024750.083000.051500.02975...0.053750.033000.022750.040250.196000.057000.015500.014750.066250.01400
50.040750.051250.058000.036750.075750.126750.029750.081750.030750.02950...0.040000.028500.021000.026500.154750.020500.017000.015750.112750.01975
\n", 392 | "

5 rows × 1128 columns

\n", 393 | "
" 394 | ], 395 | "text/plain": [ 396 | "tagId 1 2 3 4 5 6 7 \\\n", 397 | "movieId \n", 398 | "1 0.02400 0.02400 0.05475 0.09200 0.14825 0.21500 0.06625 \n", 399 | "2 0.03800 0.04175 0.03700 0.04875 0.11075 0.07325 0.04950 \n", 400 | "3 0.04200 0.05250 0.02725 0.07975 0.05625 0.07025 0.05975 \n", 401 | "4 0.03600 0.03850 0.03500 0.03125 0.07100 0.04500 0.02475 \n", 402 | "5 0.04075 0.05125 0.05800 0.03675 0.07575 0.12675 0.02975 \n", 403 | "\n", 404 | "tagId 8 9 10 ... 1119 1120 1121 \\\n", 405 | "movieId ... \n", 406 | "1 0.27025 0.26050 0.03025 ... 0.03650 0.01800 0.04525 \n", 407 | "2 0.10775 0.10200 0.02050 ... 0.03900 0.01925 0.01725 \n", 408 | "3 0.18275 0.05175 0.02725 ... 0.03950 0.02625 0.02725 \n", 409 | "4 0.08300 0.05150 0.02975 ... 0.05375 0.03300 0.02275 \n", 410 | "5 0.08175 0.03075 0.02950 ... 0.04000 0.02850 0.02100 \n", 411 | "\n", 412 | "tagId 1122 1123 1124 1125 1126 1127 1128 \n", 413 | "movieId \n", 414 | "1 0.03275 0.12450 0.04175 0.02000 0.03475 0.08350 0.02525 \n", 415 | "2 0.02425 0.13425 0.02225 0.01600 0.01450 0.09600 0.02025 \n", 416 | "3 0.03450 0.16925 0.03525 0.01725 0.01875 0.09925 0.02000 \n", 417 | "4 0.04025 0.19600 0.05700 0.01550 0.01475 0.06625 0.01400 \n", 418 | "5 0.02650 0.15475 0.02050 0.01700 0.01575 0.11275 0.01975 \n", 419 | "\n", 420 | "[5 rows x 1128 columns]" 421 | ] 422 | }, 423 | "execution_count": 11, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "genome_scores[:100000].pivot(index='movieId', columns='tagId')['relevance'].head()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 12, 435 | "metadata": { 436 | "collapsed": false, 437 | "deletable": true, 438 | "editable": true 439 | }, 440 | "outputs": [], 441 | "source": [ 442 | "table = genome_scores[:100000].pivot_table('relevance', index='movieId', columns='tagId', aggfunc='mean')" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 13, 448 | "metadata": { 
449 | "collapsed": false, 450 | "deletable": true, 451 | "editable": true 452 | }, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/html": [ 457 | "
\n", 458 | "\n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | 
" \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | "
tagId12345678910...1119112011211122112311241125112611271128
movieId
10.024000.024000.054750.092000.148250.215000.066250.270250.260500.03025...0.036500.018000.045250.032750.124500.041750.020000.034750.083500.02525
20.038000.041750.037000.048750.110750.073250.049500.107750.102000.02050...0.039000.019250.017250.024250.134250.022250.016000.014500.096000.02025
30.042000.052500.027250.079750.056250.070250.059750.182750.051750.02725...0.039500.026250.027250.034500.169250.035250.017250.018750.099250.02000
40.036000.038500.035000.031250.071000.045000.024750.083000.051500.02975...0.053750.033000.022750.040250.196000.057000.015500.014750.066250.01400
50.040750.051250.058000.036750.075750.126750.029750.081750.030750.02950...0.040000.028500.021000.026500.154750.020500.017000.015750.112750.01975
\n", 632 | "

5 rows × 1128 columns

\n", 633 | "
" 634 | ], 635 | "text/plain": [ 636 | "tagId 1 2 3 4 5 6 7 \\\n", 637 | "movieId \n", 638 | "1 0.02400 0.02400 0.05475 0.09200 0.14825 0.21500 0.06625 \n", 639 | "2 0.03800 0.04175 0.03700 0.04875 0.11075 0.07325 0.04950 \n", 640 | "3 0.04200 0.05250 0.02725 0.07975 0.05625 0.07025 0.05975 \n", 641 | "4 0.03600 0.03850 0.03500 0.03125 0.07100 0.04500 0.02475 \n", 642 | "5 0.04075 0.05125 0.05800 0.03675 0.07575 0.12675 0.02975 \n", 643 | "\n", 644 | "tagId 8 9 10 ... 1119 1120 1121 \\\n", 645 | "movieId ... \n", 646 | "1 0.27025 0.26050 0.03025 ... 0.03650 0.01800 0.04525 \n", 647 | "2 0.10775 0.10200 0.02050 ... 0.03900 0.01925 0.01725 \n", 648 | "3 0.18275 0.05175 0.02725 ... 0.03950 0.02625 0.02725 \n", 649 | "4 0.08300 0.05150 0.02975 ... 0.05375 0.03300 0.02275 \n", 650 | "5 0.08175 0.03075 0.02950 ... 0.04000 0.02850 0.02100 \n", 651 | "\n", 652 | "tagId 1122 1123 1124 1125 1126 1127 1128 \n", 653 | "movieId \n", 654 | "1 0.03275 0.12450 0.04175 0.02000 0.03475 0.08350 0.02525 \n", 655 | "2 0.02425 0.13425 0.02225 0.01600 0.01450 0.09600 0.02025 \n", 656 | "3 0.03450 0.16925 0.03525 0.01725 0.01875 0.09925 0.02000 \n", 657 | "4 0.04025 0.19600 0.05700 0.01550 0.01475 0.06625 0.01400 \n", 658 | "5 0.02650 0.15475 0.02050 0.01700 0.01575 0.11275 0.01975 \n", 659 | "\n", 660 | "[5 rows x 1128 columns]" 661 | ] 662 | }, 663 | "execution_count": 13, 664 | "metadata": {}, 665 | "output_type": "execute_result" 666 | } 667 | ], 668 | "source": [ 669 | "table.head()" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 14, 675 | "metadata": { 676 | "collapsed": true, 677 | "deletable": true, 678 | "editable": true 679 | }, 680 | "outputs": [], 681 | "source": [ 682 | "%matplotlib inline" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 15, 688 | "metadata": { 689 | "collapsed": false, 690 | "deletable": true, 691 | "editable": true 692 | }, 693 | "outputs": [ 694 | { 695 | "data": { 696 | "text/plain": [ 697 | 
"array([[]], dtype=object)" 698 | ] 699 | }, 700 | "execution_count": 15, 701 | "metadata": {}, 702 | "output_type": "execute_result" 703 | }, 704 | { 705 | "data": { 706 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEwNJREFUeJzt3X+MndV95/H3JzgkKUNtEtIRwt46VWi7ESgpjAJRV92Z\neLc1pIqRmiIiUgzyrqVuEqWb7ArvbqXuj0rrqKJRYKO0bokwFe2E0qa2CGmFHEYoqzVbu0kxgXYz\noSa1y9qb2Hh3AkmX7nf/uA/Ziddw78zcmes5835Jo/s85znPPec7Hn/uM+f+mFQVkqR2vWbUE5Ak\nLS+DXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJfOIcmHkhxK8t0k9456PtJSrBv1BKTz1N8A\nvwr8DPCGEc9FWhKDXjqHqvpDgCQTwMYRT0daEpduJKlxBr0kNc6gl6TGGfSS1DifjJXOIck6ev8/\nLgAuSPJ64KWqemm0M5MWzit66dx+GXgR2AV8oNv+5ZHOSFqk+IdHJKltXtFLUuMMeklqnEEvSY0z\n6CWpcefFyysvvfTS2rx586LO/fa3v81FF1003Amd56x5bbDmtWEpNR8+fPibVfXmfv3Oi6DfvHkz\nhw4dWtS5MzMzTE5ODndC5zlrXhuseW1YSs1Jnh2kn0s3ktQ4g16SGmfQS1LjDHpJapxBL0mNM+gl\nqXEGvSQ1zqCXpMYZ9JLUuPPinbFLceT4GW7b9fmRjH1093tGMq4kLYRX9JLUOINekhpn0EtS4wx6\nSWqcQS9JjTPoJalxBr0kNW6goE+yIcmDSf4iydNJ3pXkjUkeSfK17vaSrm+S3JVkNskTSa5e3hIk\nSa9m0Cv6TwJ/XFU/DrwdeBrYBRyoqiuAA90+wPXAFd3XTuDTQ52xJGlB+gZ9kvXATwH3AFTV31bV\n88A2YG/XbS9wY7e9Dbiveg4CG5JcNvSZS5IGkqp69Q7JO4A9wFP0ruYPAx8BjlfVhq5PgNNVtSHJ\nQ8DuqvpSd+wAcEdVHTrrfnfSu+JnfHz8munp6UUVcPLUGU68uKhTl+yqy9ePZNy5uTnGxsZGMvao\nWPPaYM0LMzU1dbiqJvr1G+SzbtYBVwMfrqrHk3yS/7dMA0BVVZJXf8Q4S1XtofcAwsTERC32r6Df\nff8+7jwymo/sOXrL5EjGXcpfjV+trHltsOblMcga/THgWFU93u0/SC/4T7y8JNPdnuyOHwc2zTt/\nY9cmSRqBvkFfVf8d+OskP9Y1baG3jLMf2N61bQf2ddv7gVu7V99cB5ypqueGO21J0qAGXfP4MHB/\nkguBZ4Db6T1IPJBkB/AscFPX92HgBmAWeKHrK0kakYGCvqq+ApxrwX/LOfoW8MElzkuSNCS+M1aS\nGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalx\nBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxg0U9EmOJjmS5CtJ\nDnVtb0zySJKvdbeXdO1JcleS2SRPJLl6OQuQJL26hVzRT1XVO6pqotvfBRyoqiuAA90+wPXAFd3X\nTuDTw5qsJGnhlrJ0sw3Y223vBW6c135f9RwENiS5bAnjSJKWIFXVv1PyV8BpoIDfrKo9SZ6vqg3d\n8QCnq2pDkoeA3VX1pe7YAeCOqjp01n3upHfFz/j4+DXT09OLKuDkqTOceHFRpy7ZVZevH8m4c3Nz\njI2NjWTsUbHmtcGaF2Z
qaurwvFWWV7RuwPv7B1V1PMkPAY8k+Yv5B6uqkvR/xPj+c/YAewAmJiZq\ncnJyIad/z9337+POI4OWMVxHb5kcybgzMzMs9vu1Wlnz2mDNy2OgpZuqOt7dngQ+B7wTOPHykkx3\ne7LrfhzYNO/0jV2bJGkE+gZ9kouSXPzyNvDTwJPAfmB71207sK/b3g/c2r365jrgTFU9N/SZS5IG\nMsiaxzjwud4yPOuA362qP07yp8ADSXYAzwI3df0fBm4AZoEXgNuHPmtJ0sD6Bn1VPQO8/Rzt3wK2\nnKO9gA8OZXaSpCXznbGS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16S\nGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalx\nBr0kNc6gl6TGDRz0SS5I8uUkD3X7b0nyeJLZJJ9NcmHX/rpuf7Y7vnl5pi5JGsRCrug/Ajw9b//j\nwCeq6q3AaWBH174DON21f6LrJ0kakYGCPslG4D3Ab3f7Ad4NPNh12Qvc2G1v6/bpjm/p+kuSRiBV\n1b9T8iDwH4GLgX8B3AYc7K7aSbIJ+EJVXZnkSWBrVR3rjn0duLaqvnnWfe4EdgKMj49fMz09vagC\nTp46w4kXF3Xqkl11+fqRjDs3N8fY2NhIxh4Va14brHlhpqamDlfVRL9+6/p1SPKzwMmqOpxkclGz\nOYeq2gPsAZiYmKjJycXd9d337+POI33LWBZHb5kcybgzMzMs9vu1Wlnz2mDNy2OQhPxJ4L1JbgBe\nD/wg8ElgQ5J1VfUSsBE43vU/DmwCjiVZB6wHvjX0mUuSBtJ3jb6q/lVVbayqzcDNwBer6hbgUeB9\nXbftwL5ue3+3T3f8izXI+pAkaVks5XX0dwAfTTILvAm4p2u/B3hT1/5RYNfSpihJWooFLW5X1Qww\n020/A7zzHH2+A/z8EOYmSRoC3xkrSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxB\nL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS\n1DiDXpIaZ9BLUuMMeklqXN+gT/L6JP81yZ8n+WqSf9e1vyXJ40lmk3w2yYVd++u6/dnu+OblLUGS\n9GoGuaL/LvDuqno78A5ga5LrgI8Dn6iqtwKngR1d/x3A6a79E10/SdKI9A366pnrdl/bfRXwbuDB\nrn0vcGO3va3bpzu+JUmGNmNJ0oKkqvp3Si4ADgNvBT4F/BpwsLtqJ8km4AtVdWWSJ4GtVXWsO/Z1\n4Nqq+uZZ97kT2AkwPj5+zfT09KIKOHnqDCdeXNSpS3bV5etHMu7c3BxjY2MjGXtUrHltsOaFmZqa\nOlxVE/36rRvkzqrq74B3JNkAfA748UXN6vvvcw+wB2BiYqImJycXdT9337+PO48MVMbQHb1lciTj\nzszMsNjv12plzWuDNS+PBb3qpqqeBx4F3gVsSPJywm4Ejnfbx4FNAN3x9cC3hjJbSdKCDfKqmzd3\nV/IkeQPwj4Gn6QX++7pu24F93fb+bp/u+BdrkPUhSdKyGGTN4zJgb7dO/xrggap6KMlTwHSSXwW+\nDNzT9b8H+J0ks8Ap4OZlmLckaUB9g76qngB+4hztzwDvPEf7d4CfH8rsJElL5jtjJalxBr0kNc6g\nl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJ\napxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWpc36BPsinJo0meSvLVJB/p2t+Y\n5JEkX+tuL+nak+SuJLNJnkhy9XIXIUl6ZesG6PMS8LGq+rMkFwOHkzwC3AYcqKrdSXYBu
4A7gOuB\nK7qva4FPd7fN2bzr8yMZ996tF41kXEmrU98r+qp6rqr+rNv+X8DTwOXANmBv120vcGO3vQ24r3oO\nAhuSXDb0mUuSBpKqGrxzshl4DLgS+EZVbejaA5yuqg1JHgJ2V9WXumMHgDuq6tBZ97UT2AkwPj5+\nzfT09KIKOHnqDCdeXNSpq9Zb1l/A2NjYqKexoubm5qx5DbDmhZmamjpcVRP9+g2ydANAkjHgD4Bf\nqqr/2cv2nqqqJIM/YvTO2QPsAZiYmKjJycmFnP49d9+/jzuPDFxGE+7dehGL/X6tVjMzM9a8Bljz\n8hjoVTdJXksv5O+vqj/smk+8vCTT3Z7s2o8Dm+advrFrkySNwCCvuglwD/B0Vf36vEP7ge3d9nZg\n37z2W7tX31wHnKmq54Y4Z0nSAgyy5vGTwC8AR5J8pWv718Bu4IEkO4BngZu6Yw8DNwCzwAvA7UOd\nsSRpQfoGffekal7h8JZz9C/gg0uclyRpSHxnrCQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9J\njTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4\ng16SGmfQS1LjDHpJapxBL0mNM+glqXF9gz7JZ5KcTPLkvLY3Jnkkyde620u69iS5K8lskieSXL2c\nk5ck9bdugD73Av8JuG9e2y7gQFXtTrKr278DuB64ovu6Fvh0d6shOnL8DLft+vxIxj66+z0jGVfS\n4vW9oq+qx4BTZzVvA/Z223uBG+e131c9B4ENSS4b1mQlSQuXqurfKdkMPFRVV3b7z1fVhm47wOmq\n2pDkIWB3VX2pO3YAuKOqDp3jPncCOwHGx8evmZ6eXlQBJ0+d4cSLizp11Rp/AyOr+arL149k3Lm5\nOcbGxkYy9qhY89qwlJqnpqYOV9VEv36DLN28qqqqJP0fLf7/8/YAewAmJiZqcnJyUePfff8+7jyy\n5DJWlY9d9dLIaj56y+RIxp2ZmWGxPyOrlTWvDStR82JfdXPi5SWZ7vZk134c2DSv38auTZI0IosN\n+v3A9m57O7BvXvut3atvrgPOVNVzS5yjJGkJ+v7+n+T3gEng0iTHgF8BdgMPJNkBPAvc1HV/GLgB\nmAVeAG5fhjlLkhagb9BX1ftf4dCWc/Qt4INLnZQkaXh8Z6wkNc6gl6TGGfSS1DiDXpIaZ9BLUuMM\neklqnEEvSY0z6CWpcQa9JDVubX3so5Zs84j+4Mm9Wy8aybhSC7yil6TGGfSS1DiDXpIaZ9BLUuMM\neklqnK+60apw5PgZbhvBK36O7n7Pio8pDZtX9JLUOINekhpn0EtS41yjl17FqN4JDL4bWMPjFb0k\nNc4reuk85SuNNCwGvaTzxqge3EZpJZboDHpJ32eUz0t87KqRDd20ZVmjT7I1yV8mmU2yaznGkCQN\nZuhBn+QC4FPA9cDbgPcneduwx5EkDWY5rujfCcxW1TNV9bfANLBtGcaRJA0gVTXcO0zeB2ytqn/S\n7f8CcG1VfeisfjuBnd3ujwF/ucghLwW+uchzVytrXhuseW1YSs0/XFVv7tdpZE/GVtUeYM9S7yfJ\noaqaGMKUVg1rXhuseW1YiZqXY+nmOLBp3v7Grk2SNALLEfR/ClyR5C1JLgRuBvYvwziSpAEMfemm\nql5K8iHgT4ALgM9U1VeHPc48S17+WYWseW2w5rVh2Wse+pOxkqTzix9qJkmNM+glqXGrJuj7faxC\nktcl+Wx3/PEkm1d+lsM1QM0fTfJUkieSHEjyw6OY5zAN+vEZSX4uSSVZ9S/FG6TmJDd1/9ZfTfK7\nKz3HYRvgZ/vvJXk0yZe7n+8bRjHPYUnymSQnkzz5CseT5K7u+/FEkquHOoGqOu+/6D2p+3XgR4AL\ngT8H3nZWn38G/Ea3fTPw2VHPewVqngJ+oNv+xbVQc
9fvYuAx4CAwMep5r8C/8xXAl4FLuv0fGvW8\nV6DmPcAvdttvA46Oet5LrPmngKuBJ1/h+A3AF4AA1wGPD3P81XJFP8jHKmwD9nbbDwJbkmQF5zhs\nfWuuqker6oVu9yC99yysZoN+fMZ/AD4OfGclJ7dMBqn5nwKfqqrTAFV1coXnOGyD1FzAD3bb64G/\nWcH5DV1VPQacepUu24D7qucgsCHJZcMaf7UE/eXAX8/bP9a1nbNPVb0EnAHetCKzWx6D1DzfDnpX\nBKtZ35q7X2k3VVUrH1o+yL/zjwI/muQ/JzmYZOuKzW55DFLzvwU+kOQY8DDw4ZWZ2sgs9P/7gvh5\n9A1I8gFgAviHo57LckryGuDXgdtGPJWVto7e8s0kvd/aHktyVVU9P9JZLa/3A/dW1Z1J3gX8TpIr\nq+r/jHpiq9FquaIf5GMVvtcnyTp6v+59a0VmtzwG+iiJJP8I+DfAe6vquys0t+XSr+aLgSuBmSRH\n6a1l7l/lT8gO8u98DNhfVf+7qv4K+G/0gn+1GqTmHcADAFX1X4DX0/vwr1Yt60fHrJagH+RjFfYD\n27vt9wFfrO5ZjlWqb81JfgL4TXohv9rXbaFPzVV1pqourarNVbWZ3vMS762qQ6OZ7lAM8rP9R/Su\n5klyKb2lnGdWcpJDNkjN3wC2ACT5+/SC/n+s6CxX1n7g1u7VN9cBZ6rquWHd+apYuqlX+FiFJP8e\nOFRV+4F76P16N0vvSY+bRzfjpRuw5l8DxoDf7553/kZVvXdkk16iAWtuyoA1/wnw00meAv4O+JdV\ntWp/Wx2w5o8Bv5Xkn9N7Yva21XzhluT36D1YX9o97/ArwGsBquo36D0PcQMwC7wA3D7U8Vfx906S\nNIDVsnQjSVokg16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ17v8Cmf/ETUsBdJcAAAAASUVORK5C\nYII=\n", 707 | "text/plain": [ 708 | "" 709 | ] 710 | }, 711 | "metadata": {}, 712 | "output_type": "display_data" 713 | } 714 | ], 715 | "source": [ 716 | "table[:1].T.hist()" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": { 722 | "deletable": true, 723 | "editable": true 724 | }, 725 | "source": [ 726 | "conclusion: 0.3 seems to be a good cut-off" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 16, 732 | "metadata": { 733 | "collapsed": false, 734 | "deletable": true, 735 | "editable": true 736 | }, 737 | "outputs": [], 738 | "source": [ 739 | "movie_tags = genome_scores[genome_scores.relevance > 0.3][['movieId', 'tagId']]" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": { 745 | "deletable": true, 746 | "editable": true 747 | }, 748 | "source": [ 749 | "### Merge in tag and movie names" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 17, 755 | "metadata": { 756 | "collapsed": false, 757 | 
"deletable": true, 758 | "editable": true 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "tags_to_movies = pd.merge(movie_tags, genome_tags, on='tagId', how='left')[['movieId', 'tagId']]" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 18, 768 | "metadata": { 769 | "collapsed": true, 770 | "deletable": true, 771 | "editable": true 772 | }, 773 | "outputs": [], 774 | "source": [ 775 | "tags_to_movies['tagId'] = tags_to_movies.tagId.astype(str)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 19, 781 | "metadata": { 782 | "collapsed": true, 783 | "deletable": true, 784 | "editable": true 785 | }, 786 | "outputs": [], 787 | "source": [ 788 | "def _concatenate_tags_of_movie(tags):\n", 789 | " tags_as_str = ' '.join(set(tags))\n", 790 | " return tags_as_str" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 20, 796 | "metadata": { 797 | "collapsed": false, 798 | "deletable": true, 799 | "editable": true 800 | }, 801 | "outputs": [], 802 | "source": [ 803 | "tags_per_movie = tags_to_movies.groupby('movieId')['tagId'].agg({\n", 804 | " 'movie_tags': _concatenate_tags_of_movie\n", 805 | "}).reset_index()" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 21, 811 | "metadata": { 812 | "collapsed": false, 813 | "deletable": true, 814 | "editable": true 815 | }, 816 | "outputs": [], 817 | "source": [ 818 | "avg_ratings = ratings.groupby('movieId')['rating'].agg({\n", 819 | " 'rating_mean': 'mean',\n", 820 | " 'rating_median': 'median',\n", 821 | " 'num_ratings': 'size'\n", 822 | "}).reset_index()" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": 22, 828 | "metadata": { 829 | "collapsed": false, 830 | "deletable": true, 831 | "editable": true 832 | }, 833 | "outputs": [], 834 | "source": [ 835 | "movies_with_ratings = pd.merge(movie_names, avg_ratings, how='left', on='movieId')" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 
840 | "execution_count": 23, 841 | "metadata": { 842 | "collapsed": true, 843 | "deletable": true, 844 | "editable": true 845 | }, 846 | "outputs": [], 847 | "source": [ 848 | "dataset = pd.merge(movies_with_ratings, tags_per_movie, how='left', on='movieId')" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": 24, 854 | "metadata": { 855 | "collapsed": true, 856 | "deletable": true, 857 | "editable": true 858 | }, 859 | "outputs": [], 860 | "source": [ 861 | "dataset.rename(columns={'median': 'rating_median', 'mean': 'rating_mean', 'tagId': 'movie_tags'}, inplace=True)" 862 | ] 863 | }, 864 | { 865 | "cell_type": "markdown", 866 | "metadata": { 867 | "deletable": true, 868 | "editable": true 869 | }, 870 | "source": [ 871 | "### Extracting movie year from title" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 25, 877 | "metadata": { 878 | "collapsed": false, 879 | "deletable": true, 880 | "editable": true 881 | }, 882 | "outputs": [], 883 | "source": [ 884 | "import re\n", 885 | "\n", 886 | "def extract_year_from_movie_title(movie_title):\n", 887 | " matches = re.findall(r'\\d{4}', movie_title)\n", 888 | " if len(matches) > 1:\n", 889 | " return int(matches[-1])\n", 890 | " if len(matches) < 1:\n", 891 | " return np.nan\n", 892 | " return int(matches[0])" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": 26, 898 | "metadata": { 899 | "collapsed": false, 900 | "deletable": true, 901 | "editable": true 902 | }, 903 | "outputs": [], 904 | "source": [ 905 | "dataset['year'] = dataset.title.apply(extract_year_from_movie_title)" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": 27, 911 | "metadata": { 912 | "collapsed": false, 913 | "deletable": true, 914 | "editable": true 915 | }, 916 | "outputs": [ 917 | { 918 | "data": { 919 | "text/html": [ 920 | "
\n", 921 | "\n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | "
movieIdtitlegenresnum_ratingsrating_medianrating_meanmovie_tagsyear
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy63469.04.03.889300113 93 1071 745 881 186 1025 464 588 355 942 1...1995.0
12Jumanji (1995)Adventure|Children|Fantasy25045.03.03.229527113 745 881 186 694 20 464 588 355 314 22 664 ...1995.0
23Grumpier Old Men (1995)Comedy|Romance15381.03.03.1781421071 374 846 902 919 629 469 609 464 807 1057 ...1995.0
34Waiting to Exhale (1995)Comedy|Drama|Romance2961.03.02.879433864 374 846 425 602 900 388 807 464 107 726 97...1995.0
45Father of the Bride Part II (1995)Comedy15023.03.03.0804101040 157 926 1071 204 864 374 334 902 694 919 ...1995.0
\n", 993 | "
" 994 | ], 995 | "text/plain": [ 996 | " movieId title \\\n", 997 | "0 1 Toy Story (1995) \n", 998 | "1 2 Jumanji (1995) \n", 999 | "2 3 Grumpier Old Men (1995) \n", 1000 | "3 4 Waiting to Exhale (1995) \n", 1001 | "4 5 Father of the Bride Part II (1995) \n", 1002 | "\n", 1003 | " genres num_ratings rating_median \\\n", 1004 | "0 Adventure|Animation|Children|Comedy|Fantasy 63469.0 4.0 \n", 1005 | "1 Adventure|Children|Fantasy 25045.0 3.0 \n", 1006 | "2 Comedy|Romance 15381.0 3.0 \n", 1007 | "3 Comedy|Drama|Romance 2961.0 3.0 \n", 1008 | "4 Comedy 15023.0 3.0 \n", 1009 | "\n", 1010 | " rating_mean movie_tags year \n", 1011 | "0 3.889300 113 93 1071 745 881 186 1025 464 588 355 942 1... 1995.0 \n", 1012 | "1 3.229527 113 745 881 186 694 20 464 588 355 314 22 664 ... 1995.0 \n", 1013 | "2 3.178142 1071 374 846 902 919 629 469 609 464 807 1057 ... 1995.0 \n", 1014 | "3 2.879433 864 374 846 425 602 900 388 807 464 107 726 97... 1995.0 \n", 1015 | "4 3.080410 1040 157 926 1071 204 864 374 334 902 694 919 ... 1995.0 " 1016 | ] 1017 | }, 1018 | "execution_count": 27, 1019 | "metadata": {}, 1020 | "output_type": "execute_result" 1021 | } 1022 | ], 1023 | "source": [ 1024 | "dataset.head()" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "markdown", 1029 | "metadata": { 1030 | "deletable": true, 1031 | "editable": true 1032 | }, 1033 | "source": [ 1034 | "There are movies without tags" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "code", 1039 | "execution_count": 28, 1040 | "metadata": { 1041 | "collapsed": false, 1042 | "deletable": true, 1043 | "editable": true 1044 | }, 1045 | "outputs": [ 1046 | { 1047 | "data": { 1048 | "text/html": [ 1049 | "
\n", 1050 | "\n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | "
movieIdtitlegenresnum_ratingsrating_medianrating_meanmovie_tagsyear
5051Guardian Angel (1994)Action|Drama|Thriller34.03.002.588235NaN1994.0
107109Headless Body in Topless Bar (1995)Comedy|Drama|Thriller18.02.252.333333NaN1995.0
113115Happiness Is in the Field (Bonheur est dans le...Comedy50.04.003.380000NaN1995.0
122124Star Maker, The (Uomo delle stelle, L') (1995)Drama335.04.003.489552NaN1995.0
125127Silences of the Palace, The (Saimt el Qusur) (...Drama51.03.003.215686NaN1994.0
\n", 1122 | "
" 1123 | ], 1124 | "text/plain": [ 1125 | " movieId title \\\n", 1126 | "50 51 Guardian Angel (1994) \n", 1127 | "107 109 Headless Body in Topless Bar (1995) \n", 1128 | "113 115 Happiness Is in the Field (Bonheur est dans le... \n", 1129 | "122 124 Star Maker, The (Uomo delle stelle, L') (1995) \n", 1130 | "125 127 Silences of the Palace, The (Saimt el Qusur) (... \n", 1131 | "\n", 1132 | " genres num_ratings rating_median rating_mean \\\n", 1133 | "50 Action|Drama|Thriller 34.0 3.00 2.588235 \n", 1134 | "107 Comedy|Drama|Thriller 18.0 2.25 2.333333 \n", 1135 | "113 Comedy 50.0 4.00 3.380000 \n", 1136 | "122 Drama 335.0 4.00 3.489552 \n", 1137 | "125 Drama 51.0 3.00 3.215686 \n", 1138 | "\n", 1139 | " movie_tags year \n", 1140 | "50 NaN 1994.0 \n", 1141 | "107 NaN 1995.0 \n", 1142 | "113 NaN 1995.0 \n", 1143 | "122 NaN 1995.0 \n", 1144 | "125 NaN 1994.0 " 1145 | ] 1146 | }, 1147 | "execution_count": 28, 1148 | "metadata": {}, 1149 | "output_type": "execute_result" 1150 | } 1151 | ], 1152 | "source": [ 1153 | "dataset[dataset.movie_tags.isnull()].head()" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "markdown", 1158 | "metadata": { 1159 | "deletable": true, 1160 | "editable": true 1161 | }, 1162 | "source": [ 1163 | "There are movies without ratings" 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "code", 1168 | "execution_count": 29, 1169 | "metadata": { 1170 | "collapsed": false, 1171 | "deletable": true, 1172 | "editable": true 1173 | }, 1174 | "outputs": [ 1175 | { 1176 | "data": { 1177 | "text/html": [ 1178 | "
\n", 1179 | "\n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | "
movieIdtitlegenresnum_ratingsrating_medianrating_meanmovie_tagsyear
853125981Man on a Tightrope (1953)DramaNaNNaNNaNNaN1953.0
932227396Gentleman's Game, A (2002)DramaNaNNaNNaNNaN2002.0
977931797White Banners (1938)DramaNaNNaNNaNNaN1938.0
1330765078Jane Austen in Manhattan (1980)Drama|RomanceNaNNaNNaNNaN1980.0
1351066622His Private Secretary (1933)Comedy|RomanceNaNNaNNaNNaN1933.0
\n", 1251 | "
" 1252 | ], 1253 | "text/plain": [ 1254 | " movieId title genres num_ratings \\\n", 1255 | "8531 25981 Man on a Tightrope (1953) Drama NaN \n", 1256 | "9322 27396 Gentleman's Game, A (2002) Drama NaN \n", 1257 | "9779 31797 White Banners (1938) Drama NaN \n", 1258 | "13307 65078 Jane Austen in Manhattan (1980) Drama|Romance NaN \n", 1259 | "13510 66622 His Private Secretary (1933) Comedy|Romance NaN \n", 1260 | "\n", 1261 | " rating_median rating_mean movie_tags year \n", 1262 | "8531 NaN NaN NaN 1953.0 \n", 1263 | "9322 NaN NaN NaN 2002.0 \n", 1264 | "9779 NaN NaN NaN 1938.0 \n", 1265 | "13307 NaN NaN NaN 1980.0 \n", 1266 | "13510 NaN NaN NaN 1933.0 " 1267 | ] 1268 | }, 1269 | "execution_count": 29, 1270 | "metadata": {}, 1271 | "output_type": "execute_result" 1272 | } 1273 | ], 1274 | "source": [ 1275 | "dataset[dataset.rating_mean.isnull()].head()" 1276 | ] 1277 | }, 1278 | { 1279 | "cell_type": "markdown", 1280 | "metadata": { 1281 | "deletable": true, 1282 | "editable": true 1283 | }, 1284 | "source": [ 1285 | "Conclusion: These cannot be related to other movies due to lack of features (tags), they could be presented as a \"random recommendation\" solution" 1286 | ] 1287 | }, 1288 | { 1289 | "cell_type": "markdown", 1290 | "metadata": { 1291 | "deletable": true, 1292 | "editable": true 1293 | }, 1294 | "source": [ 1295 | "### Bag of words vectorization" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "code", 1300 | "execution_count": 30, 1301 | "metadata": { 1302 | "collapsed": false, 1303 | "deletable": true, 1304 | "editable": true 1305 | }, 1306 | "outputs": [], 1307 | "source": [ 1308 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer" 1309 | ] 1310 | }, 1311 | { 1312 | "cell_type": "code", 1313 | "execution_count": 31, 1314 | "metadata": { 1315 | "collapsed": true, 1316 | "deletable": true, 1317 | "editable": true 1318 | }, 1319 | "outputs": [], 1320 | "source": [ 1321 | "from sklearn.metrics.pairwise import cosine_similarity" 
1322 | ] 1323 | }, 1324 | { 1325 | "cell_type": "code", 1326 | "execution_count": 32, 1327 | "metadata": { 1328 | "collapsed": true, 1329 | "deletable": true, 1330 | "editable": true 1331 | }, 1332 | "outputs": [], 1333 | "source": [ 1334 | "dataset_with_tags = dataset[~dataset.movie_tags.isnull()].reset_index(drop=True)" 1335 | ] 1336 | }, 1337 | { 1338 | "cell_type": "code", 1339 | "execution_count": 33, 1340 | "metadata": { 1341 | "collapsed": true, 1342 | "deletable": true, 1343 | "editable": true 1344 | }, 1345 | "outputs": [], 1346 | "source": [ 1347 | "bag_of_words = CountVectorizer()" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": "code", 1352 | "execution_count": 34, 1353 | "metadata": { 1354 | "collapsed": true, 1355 | "deletable": true, 1356 | "editable": true 1357 | }, 1358 | "outputs": [], 1359 | "source": [ 1360 | "tags_as_descriptors = [' '.join(genome_tags.tagId.astype(str))]" 1361 | ] 1362 | }, 1363 | { 1364 | "cell_type": "code", 1365 | "execution_count": 35, 1366 | "metadata": { 1367 | "collapsed": false, 1368 | "deletable": true, 1369 | "editable": true 1370 | }, 1371 | "outputs": [], 1372 | "source": [ 1373 | "movies_described_bag_of_words = bag_of_words.fit_transform(dataset_with_tags.movie_tags)" 1374 | ] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "execution_count": 36, 1379 | "metadata": { 1380 | "collapsed": true, 1381 | "deletable": true, 1382 | "editable": true 1383 | }, 1384 | "outputs": [], 1385 | "source": [ 1386 | "df_bag_m2m = pd.DataFrame(cosine_similarity(movies_described_bag_of_words))" 1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "markdown", 1391 | "metadata": { 1392 | "deletable": true, 1393 | "editable": true 1394 | }, 1395 | "source": [ 1396 | "### Tf-Idf Vectorization" 1397 | ] 1398 | }, 1399 | { 1400 | "cell_type": "code", 1401 | "execution_count": 37, 1402 | "metadata": { 1403 | "collapsed": false, 1404 | "deletable": true, 1405 | "editable": true 1406 | }, 1407 | "outputs": [], 1408 | "source": [ 1409 | 
"tf_idf = TfidfVectorizer()" 1410 | ] 1411 | }, 1412 | { 1413 | "cell_type": "code", 1414 | "execution_count": 38, 1415 | "metadata": { 1416 | "collapsed": false, 1417 | "deletable": true, 1418 | "editable": true 1419 | }, 1420 | "outputs": [], 1421 | "source": [ 1422 | "movies_tf_idf_described = tf_idf.fit_transform(dataset_with_tags.movie_tags)" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": 39, 1428 | "metadata": { 1429 | "collapsed": true, 1430 | "deletable": true, 1431 | "editable": true 1432 | }, 1433 | "outputs": [], 1434 | "source": [ 1435 | "m2m = cosine_similarity(movies_tf_idf_described)" 1436 | ] 1437 | }, 1438 | { 1439 | "cell_type": "code", 1440 | "execution_count": 40, 1441 | "metadata": { 1442 | "collapsed": false, 1443 | "deletable": true, 1444 | "editable": true 1445 | }, 1446 | "outputs": [], 1447 | "source": [ 1448 | "df_tfidf_m2m = pd.DataFrame(cosine_similarity(movies_tf_idf_described))" 1449 | ] 1450 | }, 1451 | { 1452 | "cell_type": "code", 1453 | "execution_count": 41, 1454 | "metadata": { 1455 | "collapsed": false, 1456 | "deletable": true, 1457 | "editable": true 1458 | }, 1459 | "outputs": [ 1460 | { 1461 | "data": { 1462 | "text/html": [ 1463 | "
\n", 1464 | "\n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | 
" \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | "
0123456789...10664106651066610667106681066910670106711067210673
01.0000000.4319930.1597810.1413100.2169840.2564020.2195180.2550390.0846870.251235...0.2539950.2457070.2988940.2189800.2432040.2288430.3389290.3902640.5274020.210074
10.4319931.0000000.1078470.1207630.2370590.1204850.1819580.2829490.1882130.234384...0.3655640.1302270.2211580.1718330.1726120.1604110.2510520.2656340.2997840.181168
20.1597810.1078471.0000000.2600630.4308760.1189010.3697140.2067080.0671160.175896...0.1513410.1354690.1346280.0935380.1023290.1235520.1421920.1241840.1337020.096047
30.1413100.1207630.2600631.0000000.2634650.0889100.3746380.2440200.0779060.075830...0.1441430.1366910.1194440.1083070.1105340.1139560.1597680.1657740.1653680.079979
40.2169840.2370590.4308760.2634651.0000000.0504630.4462510.1822840.1080080.137105...0.1809750.2049000.1145780.1076300.1377420.1056550.1733430.1875190.1821700.098338
\n", 1614 | "

5 rows × 10674 columns

\n", 1615 | "
" 1616 | ], 1617 | "text/plain": [ 1618 | " 0 1 2 3 4 5 6 \\\n", 1619 | "0 1.000000 0.431993 0.159781 0.141310 0.216984 0.256402 0.219518 \n", 1620 | "1 0.431993 1.000000 0.107847 0.120763 0.237059 0.120485 0.181958 \n", 1621 | "2 0.159781 0.107847 1.000000 0.260063 0.430876 0.118901 0.369714 \n", 1622 | "3 0.141310 0.120763 0.260063 1.000000 0.263465 0.088910 0.374638 \n", 1623 | "4 0.216984 0.237059 0.430876 0.263465 1.000000 0.050463 0.446251 \n", 1624 | "\n", 1625 | " 7 8 9 ... 10664 10665 10666 \\\n", 1626 | "0 0.255039 0.084687 0.251235 ... 0.253995 0.245707 0.298894 \n", 1627 | "1 0.282949 0.188213 0.234384 ... 0.365564 0.130227 0.221158 \n", 1628 | "2 0.206708 0.067116 0.175896 ... 0.151341 0.135469 0.134628 \n", 1629 | "3 0.244020 0.077906 0.075830 ... 0.144143 0.136691 0.119444 \n", 1630 | "4 0.182284 0.108008 0.137105 ... 0.180975 0.204900 0.114578 \n", 1631 | "\n", 1632 | " 10667 10668 10669 10670 10671 10672 10673 \n", 1633 | "0 0.218980 0.243204 0.228843 0.338929 0.390264 0.527402 0.210074 \n", 1634 | "1 0.171833 0.172612 0.160411 0.251052 0.265634 0.299784 0.181168 \n", 1635 | "2 0.093538 0.102329 0.123552 0.142192 0.124184 0.133702 0.096047 \n", 1636 | "3 0.108307 0.110534 0.113956 0.159768 0.165774 0.165368 0.079979 \n", 1637 | "4 0.107630 0.137742 0.105655 0.173343 0.187519 0.182170 0.098338 \n", 1638 | "\n", 1639 | "[5 rows x 10674 columns]" 1640 | ] 1641 | }, 1642 | "execution_count": 41, 1643 | "metadata": {}, 1644 | "output_type": "execute_result" 1645 | } 1646 | ], 1647 | "source": [ 1648 | "df_tfidf_m2m.head()" 1649 | ] 1650 | }, 1651 | { 1652 | "cell_type": "markdown", 1653 | "metadata": { 1654 | "deletable": true, 1655 | "editable": true 1656 | }, 1657 | "source": [ 1658 | "### Match indices to movie IDs" 1659 | ] 1660 | }, 1661 | { 1662 | "cell_type": "code", 1663 | "execution_count": 42, 1664 | "metadata": { 1665 | "collapsed": false, 1666 | "deletable": true, 1667 | "editable": true, 1668 | "scrolled": true 1669 | }, 1670 | "outputs": 
[], 1671 | "source": [ 1672 | "index_to_movie_id = dataset_with_tags['movieId']" 1673 | ] 1674 | }, 1675 | { 1676 | "cell_type": "code", 1677 | "execution_count": 43, 1678 | "metadata": { 1679 | "collapsed": false, 1680 | "deletable": true, 1681 | "editable": true 1682 | }, 1683 | "outputs": [ 1684 | { 1685 | "data": { 1686 | "text/plain": [ 1687 | "1" 1688 | ] 1689 | }, 1690 | "execution_count": 43, 1691 | "metadata": {}, 1692 | "output_type": "execute_result" 1693 | } 1694 | ], 1695 | "source": [ 1696 | "index_to_movie_id[0]" 1697 | ] 1698 | }, 1699 | { 1700 | "cell_type": "code", 1701 | "execution_count": 44, 1702 | "metadata": { 1703 | "collapsed": false, 1704 | "deletable": true, 1705 | "editable": true 1706 | }, 1707 | "outputs": [ 1708 | { 1709 | "data": { 1710 | "text/plain": [ 1711 | "160980" 1712 | ] 1713 | }, 1714 | "execution_count": 44, 1715 | "metadata": {}, 1716 | "output_type": "execute_result" 1717 | } 1718 | ], 1719 | "source": [ 1720 | "index_to_movie_id[10665]" 1721 | ] 1722 | }, 1723 | { 1724 | "cell_type": "code", 1725 | "execution_count": 45, 1726 | "metadata": { 1727 | "collapsed": false, 1728 | "deletable": true, 1729 | "editable": true 1730 | }, 1731 | "outputs": [ 1732 | { 1733 | "data": { 1734 | "text/plain": [ 1735 | "movieId 52\n", 1736 | "title Mighty Aphrodite (1995)\n", 1737 | "genres Comedy|Drama|Romance\n", 1738 | "num_ratings 10277\n", 1739 | "rating_median 4\n", 1740 | "rating_mean 3.53741\n", 1741 | "movie_tags 829 335 1071 745 845 297 704 609 464 726 1062 ...\n", 1742 | "year 1995\n", 1743 | "Name: 50, dtype: object" 1744 | ] 1745 | }, 1746 | "execution_count": 45, 1747 | "metadata": {}, 1748 | "output_type": "execute_result" 1749 | } 1750 | ], 1751 | "source": [ 1752 | "dataset_with_tags.reset_index(drop=True).ix[50]" 1753 | ] 1754 | }, 1755 | { 1756 | "cell_type": "code", 1757 | "execution_count": 46, 1758 | "metadata": { 1759 | "collapsed": false, 1760 | "deletable": true, 1761 | "editable": true 1762 | }, 1763 | "outputs": 
[], 1764 | "source": [ 1765 | "df_tfidf_m2m.columns = [str(index_to_movie_id[int(col)]) for col in df_tfidf_m2m.columns]" 1766 | ] 1767 | }, 1768 | { 1769 | "cell_type": "code", 1770 | "execution_count": 47, 1771 | "metadata": { 1772 | "collapsed": false, 1773 | "deletable": true, 1774 | "editable": true, 1775 | "scrolled": true 1776 | }, 1777 | "outputs": [], 1778 | "source": [ 1779 | "df_tfidf_m2m.index = [index_to_movie_id[idx] for idx in df_tfidf_m2m.index]" 1780 | ] 1781 | }, 1782 | { 1783 | "cell_type": "code", 1784 | "execution_count": 48, 1785 | "metadata": { 1786 | "collapsed": false, 1787 | "deletable": true, 1788 | "editable": true 1789 | }, 1790 | "outputs": [ 1791 | { 1792 | "data": { 1793 | "text/html": [ 1794 | "
\n", 1795 | "\n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " \n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | " \n", 1879 | " \n", 1880 | " \n", 1881 | " \n", 1882 | " \n", 1883 | " \n", 1884 | " \n", 1885 | " \n", 1886 | " \n", 1887 | " \n", 1888 | " \n", 1889 | " \n", 1890 | " \n", 1891 | " \n", 1892 | " \n", 1893 | " \n", 1894 | " \n", 1895 | " \n", 1896 | " \n", 1897 | " \n", 1898 | " \n", 1899 | " \n", 1900 | " \n", 1901 | " \n", 1902 | " \n", 1903 | " \n", 1904 | " \n", 1905 | " \n", 1906 | " \n", 1907 | " \n", 1908 | " \n", 1909 | " \n", 1910 | " \n", 1911 | " \n", 1912 | " \n", 1913 | " \n", 1914 | " \n", 1915 | " \n", 1916 | " \n", 1917 | " \n", 1918 | " \n", 1919 | " \n", 1920 | " \n", 1921 | " \n", 1922 | " \n", 1923 | " \n", 1924 | " \n", 1925 | " \n", 1926 | " \n", 1927 | " \n", 1928 | " \n", 1929 | " \n", 1930 | " \n", 1931 | " \n", 1932 | " \n", 1933 | " \n", 1934 | " \n", 1935 | " \n", 1936 | " \n", 1937 | 
" \n", 1938 | " \n", 1939 | " \n", 1940 | " \n", 1941 | " \n", 1942 | " \n", 1943 | " \n", 1944 | "
12345678910...160954160980161131161354161582161634162350162376162578162600
11.0000000.4319930.1597810.1413100.2169840.2564020.2195180.2550390.0846870.251235...0.2539950.2457070.2988940.2189800.2432040.2288430.3389290.3902640.5274020.210074
20.4319931.0000000.1078470.1207630.2370590.1204850.1819580.2829490.1882130.234384...0.3655640.1302270.2211580.1718330.1726120.1604110.2510520.2656340.2997840.181168
30.1597810.1078471.0000000.2600630.4308760.1189010.3697140.2067080.0671160.175896...0.1513410.1354690.1346280.0935380.1023290.1235520.1421920.1241840.1337020.096047
40.1413100.1207630.2600631.0000000.2634650.0889100.3746380.2440200.0779060.075830...0.1441430.1366910.1194440.1083070.1105340.1139560.1597680.1657740.1653680.079979
50.2169840.2370590.4308760.2634651.0000000.0504630.4462510.1822840.1080080.137105...0.1809750.2049000.1145780.1076300.1377420.1056550.1733430.1875190.1821700.098338
\n", 1945 | "

5 rows × 10674 columns

\n", 1946 | "
" 1947 | ], 1948 | "text/plain": [ 1949 | " 1 2 3 4 5 6 7 \\\n", 1950 | "1 1.000000 0.431993 0.159781 0.141310 0.216984 0.256402 0.219518 \n", 1951 | "2 0.431993 1.000000 0.107847 0.120763 0.237059 0.120485 0.181958 \n", 1952 | "3 0.159781 0.107847 1.000000 0.260063 0.430876 0.118901 0.369714 \n", 1953 | "4 0.141310 0.120763 0.260063 1.000000 0.263465 0.088910 0.374638 \n", 1954 | "5 0.216984 0.237059 0.430876 0.263465 1.000000 0.050463 0.446251 \n", 1955 | "\n", 1956 | " 8 9 10 ... 160954 160980 161131 \\\n", 1957 | "1 0.255039 0.084687 0.251235 ... 0.253995 0.245707 0.298894 \n", 1958 | "2 0.282949 0.188213 0.234384 ... 0.365564 0.130227 0.221158 \n", 1959 | "3 0.206708 0.067116 0.175896 ... 0.151341 0.135469 0.134628 \n", 1960 | "4 0.244020 0.077906 0.075830 ... 0.144143 0.136691 0.119444 \n", 1961 | "5 0.182284 0.108008 0.137105 ... 0.180975 0.204900 0.114578 \n", 1962 | "\n", 1963 | " 161354 161582 161634 162350 162376 162578 162600 \n", 1964 | "1 0.218980 0.243204 0.228843 0.338929 0.390264 0.527402 0.210074 \n", 1965 | "2 0.171833 0.172612 0.160411 0.251052 0.265634 0.299784 0.181168 \n", 1966 | "3 0.093538 0.102329 0.123552 0.142192 0.124184 0.133702 0.096047 \n", 1967 | "4 0.108307 0.110534 0.113956 0.159768 0.165774 0.165368 0.079979 \n", 1968 | "5 0.107630 0.137742 0.105655 0.173343 0.187519 0.182170 0.098338 \n", 1969 | "\n", 1970 | "[5 rows x 10674 columns]" 1971 | ] 1972 | }, 1973 | "execution_count": 48, 1974 | "metadata": {}, 1975 | "output_type": "execute_result" 1976 | } 1977 | ], 1978 | "source": [ 1979 | "df_tfidf_m2m.head()" 1980 | ] 1981 | }, 1982 | { 1983 | "cell_type": "code", 1984 | "execution_count": 49, 1985 | "metadata": { 1986 | "collapsed": false, 1987 | "deletable": true, 1988 | "editable": true 1989 | }, 1990 | "outputs": [ 1991 | { 1992 | "data": { 1993 | "text/html": [ 1994 | "
\n", 1995 | "\n", 1996 | " \n", 1997 | " \n", 1998 | " \n", 1999 | " \n", 2000 | " \n", 2001 | " \n", 2002 | " \n", 2003 | " \n", 2004 | " \n", 2005 | " \n", 2006 | " \n", 2007 | " \n", 2008 | " \n", 2009 | " \n", 2010 | " \n", 2011 | " \n", 2012 | " \n", 2013 | " \n", 2014 | " \n", 2015 | " \n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | " \n", 2043 | " \n", 2044 | " \n", 2045 | " \n", 2046 | " \n", 2047 | " \n", 2048 | " \n", 2049 | " \n", 2050 | " \n", 2051 | " \n", 2052 | " \n", 2053 | " \n", 2054 | " \n", 2055 | " \n", 2056 | " \n", 2057 | " \n", 2058 | " \n", 2059 | " \n", 2060 | " \n", 2061 | " \n", 2062 | " \n", 2063 | " \n", 2064 | " \n", 2065 | " \n", 2066 | " \n", 2067 | " \n", 2068 | " \n", 2069 | " \n", 2070 | " \n", 2071 | " \n", 2072 | " \n", 2073 | " \n", 2074 | " \n", 2075 | " \n", 2076 | " \n", 2077 | " \n", 2078 | " \n", 2079 | " \n", 2080 | " \n", 2081 | " \n", 2082 | " \n", 2083 | " \n", 2084 | " \n", 2085 | " \n", 2086 | " \n", 2087 | " \n", 2088 | " \n", 2089 | " \n", 2090 | " \n", 2091 | " \n", 2092 | " \n", 2093 | " \n", 2094 | " \n", 2095 | " \n", 2096 | " \n", 2097 | " \n", 2098 | " \n", 2099 | " \n", 2100 | " \n", 2101 | " \n", 2102 | " \n", 2103 | " \n", 2104 | " \n", 2105 | " \n", 2106 | " \n", 2107 | " \n", 2108 | " \n", 2109 | " \n", 2110 | " \n", 2111 | " \n", 2112 | " \n", 2113 | " \n", 2114 | " \n", 2115 | " \n", 2116 | " \n", 2117 | " \n", 2118 | " \n", 2119 | " \n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | " \n", 2137 | 
" \n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | "
12345678910...160954160980161131161354161582161634162350162376162578162600
1616340.2288430.1604110.1235520.1139560.1056550.3846520.1188600.1148610.0928830.189042...0.1855660.1688220.2565610.1946140.2747961.0000000.2588320.4509560.3192770.097457
1623500.3389290.2510520.1421920.1597680.1733430.3558090.1910530.2263370.1645360.232459...0.2754510.1950420.4042970.3203940.3811010.2588321.0000000.3671290.3169500.253319
1623760.3902640.2656340.1241840.1657740.1875190.4517330.1607880.1610370.0838970.196478...0.2695170.2654970.3984730.2139490.4553810.4509560.3671291.0000000.5558520.204455
1625780.5274020.2997840.1337020.1653680.1821700.3406400.1837400.2293450.0999490.177231...0.2228040.2796600.3148550.2462880.3332840.3192770.3169500.5558521.0000000.209276
1626000.2100740.1811680.0960470.0799790.0983380.0696680.0695610.1174700.1218240.128960...0.2088570.1161810.1666200.2226070.1261800.0974570.2533190.2044550.2092761.000000
\n", 2145 | "

5 rows × 10674 columns

\n", 2146 | "
" 2147 | ], 2148 | "text/plain": [ 2149 | " 1 2 3 4 5 6 7 \\\n", 2150 | "161634 0.228843 0.160411 0.123552 0.113956 0.105655 0.384652 0.118860 \n", 2151 | "162350 0.338929 0.251052 0.142192 0.159768 0.173343 0.355809 0.191053 \n", 2152 | "162376 0.390264 0.265634 0.124184 0.165774 0.187519 0.451733 0.160788 \n", 2153 | "162578 0.527402 0.299784 0.133702 0.165368 0.182170 0.340640 0.183740 \n", 2154 | "162600 0.210074 0.181168 0.096047 0.079979 0.098338 0.069668 0.069561 \n", 2155 | "\n", 2156 | " 8 9 10 ... 160954 160980 161131 \\\n", 2157 | "161634 0.114861 0.092883 0.189042 ... 0.185566 0.168822 0.256561 \n", 2158 | "162350 0.226337 0.164536 0.232459 ... 0.275451 0.195042 0.404297 \n", 2159 | "162376 0.161037 0.083897 0.196478 ... 0.269517 0.265497 0.398473 \n", 2160 | "162578 0.229345 0.099949 0.177231 ... 0.222804 0.279660 0.314855 \n", 2161 | "162600 0.117470 0.121824 0.128960 ... 0.208857 0.116181 0.166620 \n", 2162 | "\n", 2163 | " 161354 161582 161634 162350 162376 162578 162600 \n", 2164 | "161634 0.194614 0.274796 1.000000 0.258832 0.450956 0.319277 0.097457 \n", 2165 | "162350 0.320394 0.381101 0.258832 1.000000 0.367129 0.316950 0.253319 \n", 2166 | "162376 0.213949 0.455381 0.450956 0.367129 1.000000 0.555852 0.204455 \n", 2167 | "162578 0.246288 0.333284 0.319277 0.316950 0.555852 1.000000 0.209276 \n", 2168 | "162600 0.222607 0.126180 0.097457 0.253319 0.204455 0.209276 1.000000 \n", 2169 | "\n", 2170 | "[5 rows x 10674 columns]" 2171 | ] 2172 | }, 2173 | "execution_count": 49, 2174 | "metadata": {}, 2175 | "output_type": "execute_result" 2176 | } 2177 | ], 2178 | "source": [ 2179 | "df_tfidf_m2m.tail()" 2180 | ] 2181 | }, 2182 | { 2183 | "cell_type": "markdown", 2184 | "metadata": { 2185 | "deletable": true, 2186 | "editable": true 2187 | }, 2188 | "source": [ 2189 | "### Check similarities" 2190 | ] 2191 | }, 2192 | { 2193 | "cell_type": "code", 2194 | "execution_count": 50, 2195 | "metadata": { 2196 | "collapsed": false, 2197 | "deletable": true, 
2198 | "editable": true, 2199 | "scrolled": true 2200 | }, 2201 | "outputs": [ 2202 | { 2203 | "data": { 2204 | "text/plain": [ 2205 | "1 1.000000\n", 2206 | "3114 0.736535\n", 2207 | "4886 0.724898\n", 2208 | "78499 0.720759\n", 2209 | "2355 0.714265\n", 2210 | "76093 0.686358\n", 2211 | "5218 0.672731\n", 2212 | "68954 0.648925\n", 2213 | "6377 0.647981\n", 2214 | "4306 0.641639\n", 2215 | "50872 0.635019\n", 2216 | "8961 0.626673\n", 2217 | "2761 0.608781\n", 2218 | "81847 0.604853\n", 2219 | "45517 0.602484\n", 2220 | "152081 0.601611\n", 2221 | "42191 0.599953\n", 2222 | "97913 0.594801\n", 2223 | "98491 0.591688\n", 2224 | "108932 0.590483\n", 2225 | "Name: 1, dtype: float64" 2226 | ] 2227 | }, 2228 | "execution_count": 50, 2229 | "metadata": {}, 2230 | "output_type": "execute_result" 2231 | } 2232 | ], 2233 | "source": [ 2234 | "df_tfidf_m2m.ix[1].sort_values(ascending=False)[:20]" 2235 | ] 2236 | }, 2237 | { 2238 | "cell_type": "code", 2239 | "execution_count": 51, 2240 | "metadata": { 2241 | "collapsed": false, 2242 | "deletable": true, 2243 | "editable": true 2244 | }, 2245 | "outputs": [ 2246 | { 2247 | "data": { 2248 | "text/html": [ 2249 | "
\n", 2250 | "\n", 2251 | " \n", 2252 | " \n", 2253 | " \n", 2254 | " \n", 2255 | " \n", 2256 | " \n", 2257 | " \n", 2258 | " \n", 2259 | " \n", 2260 | " \n", 2261 | " \n", 2262 | " \n", 2263 | " \n", 2264 | " \n", 2265 | " \n", 2266 | " \n", 2267 | " \n", 2268 | " \n", 2269 | " \n", 2270 | " \n", 2271 | " \n", 2272 | " \n", 2273 | " \n", 2274 | " \n", 2275 | " \n", 2276 | " \n", 2277 | "
movieIdtitlegenresnum_ratingsrating_medianrating_meanmovie_tagsyear
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy63469.04.03.8893113 93 1071 745 881 186 1025 464 588 355 942 1...1995.0
\n", 2278 | "
" 2279 | ], 2280 | "text/plain": [ 2281 | " movieId title genres \\\n", 2282 | "0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy \n", 2283 | "\n", 2284 | " num_ratings rating_median rating_mean \\\n", 2285 | "0 63469.0 4.0 3.8893 \n", 2286 | "\n", 2287 | " movie_tags year \n", 2288 | "0 113 93 1071 745 881 186 1025 464 588 355 942 1... 1995.0 " 2289 | ] 2290 | }, 2291 | "execution_count": 51, 2292 | "metadata": {}, 2293 | "output_type": "execute_result" 2294 | } 2295 | ], 2296 | "source": [ 2297 | "dataset_with_tags[dataset_with_tags.movieId == 1]" 2298 | ] 2299 | }, 2300 | { 2301 | "cell_type": "code", 2302 | "execution_count": 52, 2303 | "metadata": { 2304 | "collapsed": false, 2305 | "deletable": true, 2306 | "editable": true 2307 | }, 2308 | "outputs": [ 2309 | { 2310 | "data": { 2311 | "text/html": [ 2312 | "
\n", 2313 | "\n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | " \n", 2323 | " \n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | "
movieIdtitlegenresnum_ratingsrating_medianrating_meanmovie_tagsyear
27693114Toy Story 2 (1999)Adventure|Animation|Children|Comedy|Fantasy26904.04.03.8208441071 745 186 464 588 355 1062 664 244 455 128 ...1999.0
\n", 2341 | "
" 2342 | ], 2343 | "text/plain": [ 2344 | " movieId title \\\n", 2345 | "2769 3114 Toy Story 2 (1999) \n", 2346 | "\n", 2347 | " genres num_ratings rating_median \\\n", 2348 | "2769 Adventure|Animation|Children|Comedy|Fantasy 26904.0 4.0 \n", 2349 | "\n", 2350 | " rating_mean movie_tags year \n", 2351 | "2769 3.820844 1071 745 186 464 588 355 1062 664 244 455 128 ... 1999.0 " 2352 | ] 2353 | }, 2354 | "execution_count": 52, 2355 | "metadata": {}, 2356 | "output_type": "execute_result" 2357 | } 2358 | ], 2359 | "source": [ 2360 | "dataset_with_tags[dataset_with_tags.movieId == 3114]" 2361 | ] 2362 | }, 2363 | { 2364 | "cell_type": "code", 2365 | "execution_count": 53, 2366 | "metadata": { 2367 | "collapsed": false, 2368 | "deletable": true, 2369 | "editable": true 2370 | }, 2371 | "outputs": [ 2372 | { 2373 | "data": { 2374 | "text/html": [ 2375 | "
\n", 2376 | "\n", 2377 | " \n", 2378 | " \n", 2379 | " \n", 2380 | " \n", 2381 | " \n", 2382 | " \n", 2383 | " \n", 2384 | " \n", 2385 | " \n", 2386 | " \n", 2387 | " \n", 2388 | " \n", 2389 | " \n", 2390 | " \n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | "
movieIdtitlegenresnum_ratingsrating_medianrating_meanmovie_tagsyear
43314886Monsters, Inc. (2001)Adventure|Animation|Children|Comedy|Fantasy31089.04.03.863633113 1071 745 186 464 588 355 1062 755 22 372 6...2001.0
\n", 2404 | "
" 2405 | ], 2406 | "text/plain": [ 2407 | " movieId title \\\n", 2408 | "4331 4886 Monsters, Inc. (2001) \n", 2409 | "\n", 2410 | " genres num_ratings rating_median \\\n", 2411 | "4331 Adventure|Animation|Children|Comedy|Fantasy 31089.0 4.0 \n", 2412 | "\n", 2413 | " rating_mean movie_tags year \n", 2414 | "4331 3.863633 113 1071 745 186 464 588 355 1062 755 22 372 6... 2001.0 " 2415 | ] 2416 | }, 2417 | "execution_count": 53, 2418 | "metadata": {}, 2419 | "output_type": "execute_result" 2420 | } 2421 | ], 2422 | "source": [ 2423 | "dataset_with_tags[dataset_with_tags.movieId == 4886]" 2424 | ] 2425 | }, 2426 | { 2427 | "cell_type": "code", 2428 | "execution_count": 54, 2429 | "metadata": { 2430 | "collapsed": false, 2431 | "deletable": true, 2432 | "editable": true 2433 | }, 2434 | "outputs": [ 2435 | { 2436 | "data": { 2437 | "text/html": [ 2438 | "
\n", 2439 | "\n", 2440 | " \n", 2441 | " \n", 2442 | " \n", 2443 | " \n", 2444 | " \n", 2445 | " \n", 2446 | " \n", 2447 | " \n", 2448 | " \n", 2449 | " \n", 2450 | " \n", 2451 | " \n", 2452 | " \n", 2453 | " \n", 2454 | " \n", 2455 | " \n", 2456 | " \n", 2457 | " \n", 2458 | " \n", 2459 | " \n", 2460 | " \n", 2461 | " \n", 2462 | " \n", 2463 | " \n", 2464 | " \n", 2465 | " \n", 2466 | "
movieIdtitlegenresnum_ratingsrating_medianrating_meanmovie_tagsyear
907078499Toy Story 3 (2010)Adventure|Animation|Children|Comedy|Fantasy|IMAX10963.04.03.9102441071 864 745 881 186 845 464 588 355 1062 755 ...2010.0
\n", 2467 | "
" 2468 | ], 2469 | "text/plain": [ 2470 | " movieId title \\\n", 2471 | "9070 78499 Toy Story 3 (2010) \n", 2472 | "\n", 2473 | " genres num_ratings \\\n", 2474 | "9070 Adventure|Animation|Children|Comedy|Fantasy|IMAX 10963.0 \n", 2475 | "\n", 2476 | " rating_median rating_mean \\\n", 2477 | "9070 4.0 3.910244 \n", 2478 | "\n", 2479 | " movie_tags year \n", 2480 | "9070 1071 864 745 881 186 845 464 588 355 1062 755 ... 2010.0 " 2481 | ] 2482 | }, 2483 | "execution_count": 54, 2484 | "metadata": {}, 2485 | "output_type": "execute_result" 2486 | } 2487 | ], 2488 | "source": [ 2489 | "dataset_with_tags[dataset_with_tags.movieId == 78499]" 2490 | ] 2491 | }, 2492 | { 2493 | "cell_type": "markdown", 2494 | "metadata": { 2495 | "deletable": true, 2496 | "editable": true 2497 | }, 2498 | "source": [ 2499 | "The closest movies to Toy Story 1 are the sequels and Monsters Inc! (No sh*t sherlock)" 2500 | ] 2501 | }, 2502 | { 2503 | "cell_type": "markdown", 2504 | "metadata": { 2505 | "deletable": true, 2506 | "editable": true 2507 | }, 2508 | "source": [ 2509 | "## Stacking for writing to DB" 2510 | ] 2511 | }, 2512 | { 2513 | "cell_type": "code", 2514 | "execution_count": 55, 2515 | "metadata": { 2516 | "collapsed": true, 2517 | "deletable": true, 2518 | "editable": true 2519 | }, 2520 | "outputs": [], 2521 | "source": [ 2522 | "m2m_similarity_stacked = df_tfidf_m2m.stack().reset_index()\n", 2523 | "m2m_similarity_stacked.columns = ['first_movie', 'second_movie', 'similarity_score']" 2524 | ] 2525 | }, 2526 | { 2527 | "cell_type": "code", 2528 | "execution_count": 56, 2529 | "metadata": { 2530 | "collapsed": false, 2531 | "deletable": true, 2532 | "editable": true 2533 | }, 2534 | "outputs": [ 2535 | { 2536 | "data": { 2537 | "text/html": [ 2538 | "
\n", 2539 | "\n", 2540 | " \n", 2541 | " \n", 2542 | " \n", 2543 | " \n", 2544 | " \n", 2545 | " \n", 2546 | " \n", 2547 | " \n", 2548 | " \n", 2549 | " \n", 2550 | " \n", 2551 | " \n", 2552 | " \n", 2553 | " \n", 2554 | " \n", 2555 | " \n", 2556 | " \n", 2557 | " \n", 2558 | " \n", 2559 | " \n", 2560 | " \n", 2561 | " \n", 2562 | " \n", 2563 | " \n", 2564 | " \n", 2565 | " \n", 2566 | " \n", 2567 | " \n", 2568 | " \n", 2569 | " \n", 2570 | " \n", 2571 | " \n", 2572 | " \n", 2573 | " \n", 2574 | " \n", 2575 | " \n", 2576 | " \n", 2577 | " \n", 2578 | " \n", 2579 | " \n", 2580 | "
first_moviesecond_moviesimilarity_score
0111.000000
1120.431993
2130.159781
3140.141310
4150.216984
\n", 2581 | "
" 2582 | ], 2583 | "text/plain": [ 2584 | " first_movie second_movie similarity_score\n", 2585 | "0 1 1 1.000000\n", 2586 | "1 1 2 0.431993\n", 2587 | "2 1 3 0.159781\n", 2588 | "3 1 4 0.141310\n", 2589 | "4 1 5 0.216984" 2590 | ] 2591 | }, 2592 | "execution_count": 56, 2593 | "metadata": {}, 2594 | "output_type": "execute_result" 2595 | } 2596 | ], 2597 | "source": [ 2598 | "m2m_similarity_stacked.head()" 2599 | ] 2600 | }, 2601 | { 2602 | "cell_type": "code", 2603 | "execution_count": 57, 2604 | "metadata": { 2605 | "collapsed": false, 2606 | "deletable": true, 2607 | "editable": true 2608 | }, 2609 | "outputs": [ 2610 | { 2611 | "data": { 2612 | "text/html": [ 2613 | "
\n", 2614 | "\n", 2615 | " \n", 2616 | " \n", 2617 | " \n", 2618 | " \n", 2619 | " \n", 2620 | " \n", 2621 | " \n", 2622 | " \n", 2623 | " \n", 2624 | " \n", 2625 | " \n", 2626 | " \n", 2627 | " \n", 2628 | " \n", 2629 | " \n", 2630 | " \n", 2631 | " \n", 2632 | " \n", 2633 | " \n", 2634 | " \n", 2635 | " \n", 2636 | " \n", 2637 | " \n", 2638 | " \n", 2639 | " \n", 2640 | " \n", 2641 | " \n", 2642 | " \n", 2643 | " \n", 2644 | " \n", 2645 | " \n", 2646 | " \n", 2647 | " \n", 2648 | " \n", 2649 | " \n", 2650 | " \n", 2651 | " \n", 2652 | " \n", 2653 | " \n", 2654 | " \n", 2655 | "
first_moviesecond_moviesimilarity_score
1139342711626001616340.097457
1139342721626001623500.253319
1139342731626001623760.204455
1139342741626001625780.209276
1139342751626001626001.000000
\n", 2656 | "
" 2657 | ], 2658 | "text/plain": [ 2659 | " first_movie second_movie similarity_score\n", 2660 | "113934271 162600 161634 0.097457\n", 2661 | "113934272 162600 162350 0.253319\n", 2662 | "113934273 162600 162376 0.204455\n", 2663 | "113934274 162600 162578 0.209276\n", 2664 | "113934275 162600 162600 1.000000" 2665 | ] 2666 | }, 2667 | "execution_count": 57, 2668 | "metadata": {}, 2669 | "output_type": "execute_result" 2670 | } 2671 | ], 2672 | "source": [ 2673 | "m2m_similarity_stacked.tail()" 2674 | ] 2675 | }, 2676 | { 2677 | "cell_type": "markdown", 2678 | "metadata": { 2679 | "deletable": true, 2680 | "editable": true 2681 | }, 2682 | "source": [ 2683 | "## Writing to DB" 2684 | ] 2685 | }, 2686 | { 2687 | "cell_type": "code", 2688 | "execution_count": 58, 2689 | "metadata": { 2690 | "collapsed": false, 2691 | "deletable": true, 2692 | "editable": true 2693 | }, 2694 | "outputs": [], 2695 | "source": [ 2696 | "import sqlite3 as db" 2697 | ] 2698 | }, 2699 | { 2700 | "cell_type": "code", 2701 | "execution_count": 59, 2702 | "metadata": { 2703 | "collapsed": true, 2704 | "deletable": true, 2705 | "editable": true 2706 | }, 2707 | "outputs": [], 2708 | "source": [ 2709 | "connection = db.connect('db.sqlite3')" 2710 | ] 2711 | }, 2712 | { 2713 | "cell_type": "code", 2714 | "execution_count": 60, 2715 | "metadata": { 2716 | "collapsed": false, 2717 | "deletable": true, 2718 | "editable": true 2719 | }, 2720 | "outputs": [], 2721 | "source": [ 2722 | "for_db = dataset.rename(columns={\n", 2723 | " 'movieId': 'movie_id'\n", 2724 | "})[['movie_id', 'title', 'year', 'genres', 'num_ratings', 'rating_median', 'rating_mean']]" 2725 | ] 2726 | }, 2727 | { 2728 | "cell_type": "code", 2729 | "execution_count": 61, 2730 | "metadata": { 2731 | "collapsed": true, 2732 | "deletable": true, 2733 | "editable": true 2734 | }, 2735 | "outputs": [], 2736 | "source": [ 2737 | "for_db['relatable'] = True" 2738 | ] 2739 | }, 2740 | { 2741 | "cell_type": "code", 2742 | "execution_count": 
62, 2743 | "metadata": { 2744 | "collapsed": false, 2745 | "deletable": true, 2746 | "editable": true 2747 | }, 2748 | "outputs": [ 2749 | { 2750 | "data": { 2751 | "text/html": [ 2752 | "
\n", 2753 | "\n", 2754 | " \n", 2755 | " \n", 2756 | " \n", 2757 | " \n", 2758 | " \n", 2759 | " \n", 2760 | " \n", 2761 | " \n", 2762 | " \n", 2763 | " \n", 2764 | " \n", 2765 | " \n", 2766 | " \n", 2767 | " \n", 2768 | " \n", 2769 | " \n", 2770 | " \n", 2771 | " \n", 2772 | " \n", 2773 | " \n", 2774 | " \n", 2775 | " \n", 2776 | " \n", 2777 | " \n", 2778 | " \n", 2779 | " \n", 2780 | " \n", 2781 | " \n", 2782 | " \n", 2783 | " \n", 2784 | " \n", 2785 | " \n", 2786 | " \n", 2787 | " \n", 2788 | " \n", 2789 | " \n", 2790 | " \n", 2791 | " \n", 2792 | " \n", 2793 | " \n", 2794 | " \n", 2795 | " \n", 2796 | " \n", 2797 | " \n", 2798 | " \n", 2799 | " \n", 2800 | " \n", 2801 | " \n", 2802 | " \n", 2803 | " \n", 2804 | " \n", 2805 | " \n", 2806 | " \n", 2807 | " \n", 2808 | " \n", 2809 | " \n", 2810 | " \n", 2811 | " \n", 2812 | " \n", 2813 | " \n", 2814 | " \n", 2815 | " \n", 2816 | " \n", 2817 | " \n", 2818 | " \n", 2819 | " \n", 2820 | " \n", 2821 | " \n", 2822 | " \n", 2823 | " \n", 2824 | "
movie_idtitleyeargenresnum_ratingsrating_medianrating_meanrelatable
01Toy Story (1995)1995.0Adventure|Animation|Children|Comedy|Fantasy63469.04.03.889300True
12Jumanji (1995)1995.0Adventure|Children|Fantasy25045.03.03.229527True
23Grumpier Old Men (1995)1995.0Comedy|Romance15381.03.03.178142True
34Waiting to Exhale (1995)1995.0Comedy|Drama|Romance2961.03.02.879433True
45Father of the Bride Part II (1995)1995.0Comedy15023.03.03.080410True
\n", 2825 | "
" 2826 | ], 2827 | "text/plain": [ 2828 | " movie_id title year \\\n", 2829 | "0 1 Toy Story (1995) 1995.0 \n", 2830 | "1 2 Jumanji (1995) 1995.0 \n", 2831 | "2 3 Grumpier Old Men (1995) 1995.0 \n", 2832 | "3 4 Waiting to Exhale (1995) 1995.0 \n", 2833 | "4 5 Father of the Bride Part II (1995) 1995.0 \n", 2834 | "\n", 2835 | " genres num_ratings rating_median \\\n", 2836 | "0 Adventure|Animation|Children|Comedy|Fantasy 63469.0 4.0 \n", 2837 | "1 Adventure|Children|Fantasy 25045.0 3.0 \n", 2838 | "2 Comedy|Romance 15381.0 3.0 \n", 2839 | "3 Comedy|Drama|Romance 2961.0 3.0 \n", 2840 | "4 Comedy 15023.0 3.0 \n", 2841 | "\n", 2842 | " rating_mean relatable \n", 2843 | "0 3.889300 True \n", 2844 | "1 3.229527 True \n", 2845 | "2 3.178142 True \n", 2846 | "3 2.879433 True \n", 2847 | "4 3.080410 True " 2848 | ] 2849 | }, 2850 | "execution_count": 62, 2851 | "metadata": {}, 2852 | "output_type": "execute_result" 2853 | } 2854 | ], 2855 | "source": [ 2856 | "for_db.head()" 2857 | ] 2858 | }, 2859 | { 2860 | "cell_type": "code", 2861 | "execution_count": 63, 2862 | "metadata": { 2863 | "collapsed": true, 2864 | "deletable": true, 2865 | "editable": true 2866 | }, 2867 | "outputs": [], 2868 | "source": [ 2869 | "from tqdm import tqdm" 2870 | ] 2871 | }, 2872 | { 2873 | "cell_type": "code", 2874 | "execution_count": 64, 2875 | "metadata": { 2876 | "collapsed": false, 2877 | "deletable": true, 2878 | "editable": true 2879 | }, 2880 | "outputs": [ 2881 | { 2882 | "name": "stderr", 2883 | "output_type": "stream", 2884 | "text": [ 2885 | "40501it [00:01, 38515.09it/s] \n" 2886 | ] 2887 | } 2888 | ], 2889 | "source": [ 2890 | "total_length = len(for_db)\n", 2891 | "step = int(total_length / 100)\n", 2892 | "\n", 2893 | "with tqdm(total=total_length) as pbar:\n", 2894 | " for i in range(0, total_length, step):\n", 2895 | " subset = for_db[i: i+step]\n", 2896 | " subset.to_sql('movie_time_app_movie', connection, if_exists='append', index=False)\n", 2897 | " pbar.update(step)" 2898 | 
] 2899 | }, 2900 | { 2901 | "cell_type": "code", 2902 | "execution_count": 65, 2903 | "metadata": { 2904 | "collapsed": false, 2905 | "deletable": true, 2906 | "editable": true 2907 | }, 2908 | "outputs": [ 2909 | { 2910 | "data": { 2911 | "text/html": [ 2912 | "
\n", 2913 | "\n", 2914 | " \n", 2915 | " \n", 2916 | " \n", 2917 | " \n", 2918 | " \n", 2919 | " \n", 2920 | " \n", 2921 | " \n", 2922 | " \n", 2923 | " \n", 2924 | " \n", 2925 | " \n", 2926 | " \n", 2927 | " \n", 2928 | " \n", 2929 | " \n", 2930 | " \n", 2931 | " \n", 2932 | " \n", 2933 | " \n", 2934 | " \n", 2935 | " \n", 2936 | " \n", 2937 | " \n", 2938 | " \n", 2939 | " \n", 2940 | " \n", 2941 | " \n", 2942 | " \n", 2943 | " \n", 2944 | " \n", 2945 | " \n", 2946 | " \n", 2947 | " \n", 2948 | " \n", 2949 | " \n", 2950 | " \n", 2951 | " \n", 2952 | " \n", 2953 | " \n", 2954 | " \n", 2955 | " \n", 2956 | " \n", 2957 | " \n", 2958 | " \n", 2959 | " \n", 2960 | " \n", 2961 | " \n", 2962 | " \n", 2963 | " \n", 2964 | " \n", 2965 | " \n", 2966 | " \n", 2967 | " \n", 2968 | " \n", 2969 | " \n", 2970 | " \n", 2971 | " \n", 2972 | " \n", 2973 | " \n", 2974 | " \n", 2975 | " \n", 2976 | " \n", 2977 | " \n", 2978 | " \n", 2979 | " \n", 2980 | " \n", 2981 | " \n", 2982 | " \n", 2983 | " \n", 2984 | " \n", 2985 | " \n", 2986 | " \n", 2987 | " \n", 2988 | " \n", 2989 | " \n", 2990 | "
movie_idtitleposteryeargenresnum_ratingsrating_medianrating_meanrelatable
01Toy Story (1995)None1995Adventure|Animation|Children|Comedy|Fantasy634694.03.8893001
12Jumanji (1995)None1995Adventure|Children|Fantasy250453.03.2295271
23Grumpier Old Men (1995)None1995Comedy|Romance153813.03.1781421
34Waiting to Exhale (1995)None1995Comedy|Drama|Romance29613.02.8794331
45Father of the Bride Part II (1995)None1995Comedy150233.03.0804101
\n", 2991 | "
" 2992 | ], 2993 | "text/plain": [ 2994 | " movie_id title poster year \\\n", 2995 | "0 1 Toy Story (1995) None 1995 \n", 2996 | "1 2 Jumanji (1995) None 1995 \n", 2997 | "2 3 Grumpier Old Men (1995) None 1995 \n", 2998 | "3 4 Waiting to Exhale (1995) None 1995 \n", 2999 | "4 5 Father of the Bride Part II (1995) None 1995 \n", 3000 | "\n", 3001 | " genres num_ratings rating_median \\\n", 3002 | "0 Adventure|Animation|Children|Comedy|Fantasy 63469 4.0 \n", 3003 | "1 Adventure|Children|Fantasy 25045 3.0 \n", 3004 | "2 Comedy|Romance 15381 3.0 \n", 3005 | "3 Comedy|Drama|Romance 2961 3.0 \n", 3006 | "4 Comedy 15023 3.0 \n", 3007 | "\n", 3008 | " rating_mean relatable \n", 3009 | "0 3.889300 1 \n", 3010 | "1 3.229527 1 \n", 3011 | "2 3.178142 1 \n", 3012 | "3 2.879433 1 \n", 3013 | "4 3.080410 1 " 3014 | ] 3015 | }, 3016 | "execution_count": 65, 3017 | "metadata": {}, 3018 | "output_type": "execute_result" 3019 | } 3020 | ], 3021 | "source": [ 3022 | "pd.read_sql_query('SELECT * FROM movie_time_app_movie LIMIT 5', connection)" 3023 | ] 3024 | } 3025 | ], 3026 | "metadata": { 3027 | "kernelspec": { 3028 | "display_name": "Python 3", 3029 | "language": "python", 3030 | "name": "python3" 3031 | }, 3032 | "language_info": { 3033 | "codemirror_mode": { 3034 | "name": "ipython", 3035 | "version": 3 3036 | }, 3037 | "file_extension": ".py", 3038 | "mimetype": "text/x-python", 3039 | "name": "python", 3040 | "nbconvert_exporter": "python", 3041 | "pygments_lexer": "ipython3", 3042 | "version": "3.5.2" 3043 | } 3044 | }, 3045 | "nbformat": 4, 3046 | "nbformat_minor": 0 3047 | } 3048 | --------------------------------------------------------------------------------