├── .gitignore ├── README.md ├── book-crossing-eda.ipynb ├── book-crossing-preprocessing.ipynb ├── collaborative-filtering-memory-based.ipynb ├── collaborative-filtering-model-based.ipynb ├── functions.py └── img ├── books_header.jpg ├── test_actual.jpg ├── test_pred.jpg └── train_actual.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,pycharm,windows,jupyternotebooks 3 | # Edit at https://www.gitignore.io/?templates=python,pycharm,windows,jupyternotebooks 4 | 5 | ### JupyterNotebooks ### 6 | # gitignore template for Jupyter Notebooks 7 | # website: http://jupyter.org/ 8 | 9 | .ipynb_checkpoints 10 | */.ipynb_checkpoints/* 11 | 12 | # IPython 13 | profile_default/ 14 | ipython_config.py 15 | 16 | # Remove previous ipynb_checkpoints 17 | # git rm -r .ipynb_checkpoints/ 18 | 19 | ### PyCharm ### 20 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 21 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 22 | 23 | # User-specific stuff 24 | .idea/**/workspace.xml 25 | .idea/**/tasks.xml 26 | .idea/**/usage.statistics.xml 27 | .idea/**/dictionaries 28 | .idea/**/shelf 29 | 30 | # Generated files 31 | .idea/**/contentModel.xml 32 | 33 | # Sensitive or high-churn files 34 | .idea/**/dataSources/ 35 | .idea/**/dataSources.ids 36 | .idea/**/dataSources.local.xml 37 | .idea/**/sqlDataSources.xml 38 | .idea/**/dynamic.xml 39 | .idea/**/uiDesigner.xml 40 | .idea/**/dbnavigator.xml 41 | 42 | # Gradle 43 | .idea/**/gradle.xml 44 | .idea/**/libraries 45 | 46 | # Gradle and Maven with auto-import 47 | # When using Gradle or Maven with auto-import, you should exclude module files, 48 | # since they will be recreated, and may cause churn. Uncomment if using 49 | # auto-import. 
50 | # .idea/modules.xml 51 | # .idea/*.iml 52 | # .idea/modules 53 | # *.iml 54 | # *.ipr 55 | 56 | # CMake 57 | cmake-build-*/ 58 | 59 | # Mongo Explorer plugin 60 | .idea/**/mongoSettings.xml 61 | 62 | # File-based project format 63 | *.iws 64 | 65 | # IntelliJ 66 | out/ 67 | 68 | # mpeltonen/sbt-idea plugin 69 | .idea_modules/ 70 | 71 | # JIRA plugin 72 | atlassian-ide-plugin.xml 73 | 74 | # Cursive Clojure plugin 75 | .idea/replstate.xml 76 | 77 | # Crashlytics plugin (for Android Studio and IntelliJ) 78 | com_crashlytics_export_strings.xml 79 | crashlytics.properties 80 | crashlytics-build.properties 81 | fabric.properties 82 | 83 | # Editor-based Rest Client 84 | .idea/httpRequests 85 | 86 | # Android studio 3.1+ serialized cache file 87 | .idea/caches/build_file_checksums.ser 88 | 89 | ### PyCharm Patch ### 90 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 91 | 92 | # *.iml 93 | # modules.xml 94 | # .idea/misc.xml 95 | # *.ipr 96 | 97 | # Sonarlint plugin 98 | .idea/**/sonarlint/ 99 | 100 | # SonarQube Plugin 101 | .idea/**/sonarIssues.xml 102 | 103 | # Markdown Navigator plugin 104 | .idea/**/markdown-navigator.xml 105 | .idea/**/markdown-navigator/ 106 | 107 | ### Python ### 108 | # Byte-compiled / optimized / DLL files 109 | __pycache__/ 110 | *.py[cod] 111 | *$py.class 112 | 113 | # C extensions 114 | *.so 115 | 116 | # Distribution / packaging 117 | .Python 118 | build/ 119 | develop-eggs/ 120 | dist/ 121 | downloads/ 122 | eggs/ 123 | .eggs/ 124 | lib/ 125 | lib64/ 126 | parts/ 127 | sdist/ 128 | var/ 129 | wheels/ 130 | pip-wheel-metadata/ 131 | share/python-wheels/ 132 | *.egg-info/ 133 | .installed.cfg 134 | *.egg 135 | MANIFEST 136 | 137 | # PyInstaller 138 | # Usually these files are written by a python script from a template 139 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
140 | *.manifest 141 | *.spec 142 | 143 | # Installer logs 144 | pip-log.txt 145 | pip-delete-this-directory.txt 146 | 147 | # Unit test / coverage reports 148 | htmlcov/ 149 | .tox/ 150 | .nox/ 151 | .coverage 152 | .coverage.* 153 | .cache 154 | nosetests.xml 155 | coverage.xml 156 | *.cover 157 | .hypothesis/ 158 | .pytest_cache/ 159 | 160 | # Translations 161 | *.mo 162 | *.pot 163 | 164 | # Scrapy stuff: 165 | .scrapy 166 | 167 | # Sphinx documentation 168 | docs/_build/ 169 | 170 | # PyBuilder 171 | target/ 172 | 173 | # pyenv 174 | .python-version 175 | 176 | # pipenv 177 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 178 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 179 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 180 | # install all needed dependencies. 181 | #Pipfile.lock 182 | 183 | # celery beat schedule file 184 | celerybeat-schedule 185 | 186 | # SageMath parsed files 187 | *.sage.py 188 | 189 | # Spyder project settings 190 | .spyderproject 191 | .spyproject 192 | 193 | # Rope project settings 194 | .ropeproject 195 | 196 | # Mr Developer 197 | .mr.developer.cfg 198 | .project 199 | .pydevproject 200 | 201 | # mkdocs documentation 202 | /site 203 | 204 | # mypy 205 | .mypy_cache/ 206 | .dmypy.json 207 | dmypy.json 208 | 209 | # Pyre type checker 210 | .pyre/ 211 | 212 | ### Windows ### 213 | # Windows thumbnail cache files 214 | Thumbs.db 215 | Thumbs.db:encryptable 216 | ehthumbs.db 217 | ehthumbs_vista.db 218 | 219 | # Dump file 220 | *.stackdump 221 | 222 | # Folder config file 223 | [Dd]esktop.ini 224 | 225 | # Recycle Bin used on file shares 226 | $RECYCLE.BIN/ 227 | 228 | # Windows Installer files 229 | *.cab 230 | *.msi 231 | *.msix 232 | *.msm 233 | *.msp 234 | 235 | # Windows shortcuts 236 | *.lnk 237 | 238 | # Custom 239 | /data/ 240 | .idea/ 241 | 242 | # End of 
https://www.gitignore.io/api/python,pycharm,windows,jupyternotebooks -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Collaborative Filtering Recommender System with Python 2 | 3 | ![books recommendations](img/books_header.jpg) 4 | 5 | 6 | 7 | **Collaborative filtering** is a technique commonly used to build personalized recommendations in online products. Among companies using the collaborative filtering technology we can find some popular websites like: Amazon, Netflix, IMDB. In collaborative filtering, algorithms are used to make automatic predictions about a user's interests by compiling preferences from several users. 8 | 9 | The main focus of this repository is to build collaborative filtering recommender systems for a **Book-Crossing dataset**. It contains data about book ratings collected in a 4-week crawl in 2004 as well as detailed information about books and users. Further details on the dataset are given in this publication: 10 | 11 | > [Improving Recommendation Lists Through Topic Diversification](http://www2.informatik.uni-freiburg.de/~dbis/Publications/05/WWW05.html), 12 | > 13 | > Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen; *Proceedings of the 14th International World Wide Web Conference (WWW '05),* May 10-14, 2005, Chiba, Japan. *To appear.* 14 | 15 | 16 | 17 | ------ 18 | 19 | **Contents:** 20 | 21 | 1. [**Preprocessing of Book-Crossing dataset**](book-crossing-preprocessing.ipynb) - the script includes loading data in the correct format, filtering out incorrect rows and reducing dimensionality of the dataset. 22 | 2. [**Exploratory Data Analysis of Book-Crossing dataset**](book-crossing-eda.ipynb) - the analysis provides insights about distribution of ratings, most popular readings and characteristics of users giving the scores. 23 | 3. 
[**Memory-based approach to Collaborative Filtering**](collaborative-filtering-memory-based.ipynb) - memory-based algorithms apply statistical techniques to the entire dataset to calculate the predictions. In this notebook two methods are compared (user-user and user-item) and the model is optimized to provide the best predictions. 24 | 4. [**Model-based approach to Collaborative Filtering**](collaborative-filtering-model-based.ipynb) - the model-based approach involves building machine learning algorithms to predict users' ratings. In this notebook SVD and NMF methods are compared and the model is optimized to provide the best predictions. 25 | 26 | ------ 27 | 28 | **References:** 29 | 30 | 1. https://surprise.readthedocs.io/en/stable/getting_started.html#getting-started 31 | 2. https://realpython.com/build-recommendation-engine-collaborative-filtering/ 32 | 3. https://towardsdatascience.com/various-implementations-of-collaborative-filtering-100385c6dfe0 33 | 4. https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b -------------------------------------------------------------------------------- /book-crossing-preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Preprocessing of Book-Crossing Dataset\n", 8 | "\n", 9 | "The [Book-Crossing dataset](http://www2.informatik.uni-freiburg.de/~cziegler/BX/) contains data about book ratings, books and users collected by Cai-Nicolas Ziegler in a 4-week crawl (August / September 2004)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "\n", 21 | "import functions as f" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "path = 'data/original/'\n", 31 | "\n", 32 | "df_ratings = pd.read_csv(path + 'BX-Book-Ratings.csv', sep=';', encoding='ansi')\n", 33 | "df_books = pd.read_csv(path + 'BX-Books.csv', sep=';', encoding='ansi', escapechar='\\\\')\n", 34 | "df_users = pd.read_csv(path + 'BX-Users.csv', sep=';', encoding='ansi')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "for df in [df_ratings, df_books, df_users]:\n", 44 | " df.columns = [f.colname_fix(col) for col in df.columns]" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Ratings:\n", 57 | "Number of ratings: 1149780\n", 58 | "Number of books: 340556\n", 59 | "Number of users: 105283\n", 60 | "\n", 61 | "Number of books: 271379\n", 62 | "\n", 63 | "Number of users: 278858\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "print('Ratings:\\nNumber of ratings: %d\\nNumber of books: %d\\nNumber of users: %d' % (len(df_ratings),\n", 69 | " len(df_ratings['isbn'].unique()),\n", 70 | " len(df_ratings['user_id'].unique())))\n", 71 | "print('\\nNumber of books: %d' % len(df_books))\n", 72 | "print('\\nNumber of users: %d' % len(df_users))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "items with non-ascii characters in user_id: 0\n", 85 | "items with non-ascii characters in isbn: 55\n", 86 | 
"items with non-ascii characters in book_rating: 0\n", 87 | "\n", 88 | "items with non-ascii characters in isbn: 0\n", 89 | "items with non-ascii characters in book_title: 365\n", 90 | "items with non-ascii characters in book_author: 21\n", 91 | "items with non-ascii characters in year_of_publication: 0\n", 92 | "items with non-ascii characters in publisher: 33\n", 93 | "items with non-ascii characters in image_url_s: 0\n", 94 | "items with non-ascii characters in image_url_m: 0\n", 95 | "items with non-ascii characters in image_url_l: 0\n", 96 | "\n", 97 | "items with non-ascii characters in user_id: 0\n", 98 | "items with non-ascii characters in location: 560\n", 99 | "items with non-ascii characters in age: 0\n", 100 | "\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "f.ascii_check_bulk(df_ratings)\n", 106 | "f.ascii_check_bulk(df_books)\n", 107 | "f.ascii_check_bulk(df_users)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Filtering observations\n", 115 | "* Remove (incorrect) ISBN with non-ascii characters\n", 116 | "* Use only country instead of whole 'location' data\n", 117 | "* Remove images' urls\n", 118 | "* Separate explicit (1-10) and implicit (0) ratings" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "df_ratings['isbn_check'] = df_ratings['isbn'].apply(f.ascii_check)\n", 128 | "df_ratings = df_ratings[df_ratings['isbn_check']==0]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 7, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "df_users['country'] = df_users['location'].apply(lambda x: x.split(', ')[-1].title())\n", 138 | "df_users['country_check'] = df_users['country'].apply(f.ascii_check)\n", 139 | "df_users.loc[df_users['country_check']==1, 'country'] = np.nan" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 8, 
145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "df_ratings.drop(['isbn_check'], axis=1, inplace=True)\n", 149 | "df_books.drop(['image_url_s', 'image_url_m', 'image_url_l'], axis=1, inplace=True)\n", 150 | "df_users.drop(['country_check'], axis=1, inplace=True)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 9, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "Explicit ratings: 433642\n", 163 | "Implicit ratings: 716083\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "df_ratings_explicit = df_ratings[df_ratings['book_rating']!=0]\n", 169 | "df_ratings_implicit = df_ratings[df_ratings['book_rating']==0]\n", 170 | "\n", 171 | "print('Explicit ratings: %d\\nImplicit ratings: %d' % (len(df_ratings_explicit), len(df_ratings_implicit)))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 10, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "df_ratings_explicit.to_csv('data/ratings_explicit.csv', encoding='utf-8', index=False)\n", 181 | "df_ratings_implicit.to_csv('data/ratings_implicit.csv', encoding='utf-8', index=False)\n", 182 | "df_books.to_csv('data/books.csv', encoding='utf-8', index=False)\n", 183 | "df_users.to_csv('data/users.csv', encoding='utf-8', index=False)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Reducing the dimensionality\n", 191 | "To reduce the dimensionality of the dataset and avoid running into memory errors, the analysis focuses on users with at least 3 ratings and the top 10% most frequently rated books. The reduced dataset consists of 176,594 records." 
192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 11, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "Filter: users with at least 3 ratings\n", 204 | "Number of records: 368563\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "user_ratings_threshold = 3\n", 210 | "\n", 211 | "filter_users = df_ratings_explicit['user_id'].value_counts()\n", 212 | "filter_users_list = filter_users[filter_users >= user_ratings_threshold].index.to_list()\n", 213 | "\n", 214 | "df_ratings_top = df_ratings_explicit[df_ratings_explicit['user_id'].isin(filter_users_list)]\n", 215 | "\n", 216 | "print('Filter: users with at least %d ratings\\nNumber of records: %d' % (user_ratings_threshold, len(df_ratings_top)))" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 12, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "Filter: top 10% most frequently rated books\n", 229 | "Number of records: 176594\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "book_ratings_threshold_perc = 0.1\n", 235 | "book_ratings_threshold = len(df_ratings_top['isbn'].unique()) * book_ratings_threshold_perc\n", 236 | "\n", 237 | "filter_books_list = df_ratings_top['isbn'].value_counts().head(int(book_ratings_threshold)).index.to_list()\n", 238 | "df_ratings_top = df_ratings_top[df_ratings_top['isbn'].isin(filter_books_list)]\n", 239 | "\n", 240 | "print('Filter: top %d%% most frequently rated books\\nNumber of records: %d' % (book_ratings_threshold_perc*100, len(df_ratings_top)))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 13, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "df_ratings_top.to_csv('data/ratings_top.csv', encoding='utf-8', index=False)" 250 | ] 251 | } 252 | ], 253 | "metadata": { 254 | "kernelspec": { 255 | "display_name": "master", 256 | 
"language": "python", 257 | "name": "master" 258 | }, 259 | "language_info": { 260 | "codemirror_mode": { 261 | "name": "ipython", 262 | "version": 3 263 | }, 264 | "file_extension": ".py", 265 | "mimetype": "text/x-python", 266 | "name": "python", 267 | "nbconvert_exporter": "python", 268 | "pygments_lexer": "ipython3", 269 | "version": "3.7.6" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 2 274 | } 275 | -------------------------------------------------------------------------------- /collaborative-filtering-model-based.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model Based Collaborative Filtering Recommender\n", 8 | "\n", 9 | "The goal of a **recommender system** is to predict user preferences for a set of items based on past experience. The two most popular approaches are Content-Based and Collaborative Filtering.\n", 10 | "\n", 11 | "**Collaborative filtering** is a technique used by websites like Amazon, YouTube, and Netflix. It selects items that a user might like on the basis of reactions of similar users. There are two categories of collaborative filtering algorithms: memory-based and model-based.\n", 12 | "\n", 13 | "The **model-based approach** involves building machine learning algorithms to predict users' ratings. These algorithms rely on dimensionality reduction methods that replace a high-dimensional matrix containing an abundant number of missing values with a much smaller matrix in a lower-dimensional space.\n", 14 | "\n", 15 | "The goal of this exercise is to compare SVD and NMF algorithms, try different configurations of parameters and explore the obtained results." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "import seaborn as sns\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "\n", 29 | "from surprise import Dataset, Reader\n", 30 | "from surprise import SVD, NMF\n", 31 | "from surprise.model_selection import cross_validate, train_test_split, GridSearchCV\n", 32 | "\n", 33 | "import functions as f" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "This analysis will focus on book recommendations based on the [Book-Crossing dataset](http://www2.informatik.uni-freiburg.de/~cziegler/BX/). To reduce the dimensionality of the dataset and avoid running into memory errors, it will focus on users with at least 3 ratings and the top 10% most frequently rated books. The reduced dataset consists of 176,594 records.\n", 41 | "\n", 42 | "The recommender systems will be built using the [surprise package](https://surprise.readthedocs.io/en/stable/getting_started.html) (matrix-factorization-based models)." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df = pd.read_csv('data/ratings_top.csv')\n", 52 | "\n", 53 | "reader = Reader(rating_scale=(1, 10))\n", 54 | "data = Dataset.load_from_df(df[['user_id', 'isbn', 'book_rating']], reader)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "Number of ratings: 176594\n", 67 | "Number of books: 16766\n", 68 | "Number of users: 20149\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "print('Number of ratings: %d\\nNumber of books: %d\\nNumber of users: %d' % (len(df), len(df['isbn'].unique()), len(df['user_id'].unique())))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## SVD and NMF models comparison\n", 81 | "\n", 82 | "Singular Value Decomposition (SVD) and Non-negative Matrix Factorization (NMF) are matrix factorization techniques used for dimensionality reduction. The Surprise package provides implementations of both algorithms.\n", 83 | "\n", 84 | "It's clear that for the given dataset much better results can be obtained with the SVD approach - both in terms of accuracy and training time (testing time is comparable for the two models)." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 15, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "test_rmse 1.606926\n", 96 | "test_mae 1.242338\n", 97 | "fit_time 18.130412\n", 98 | "test_time 1.120190\n", 99 | "dtype: float64" 100 | ] 101 | }, 102 | "execution_count": 15, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "model_svd = SVD()\n", 109 | "cv_results_svd = cross_validate(model_svd, data, cv=3)\n", 110 | "pd.DataFrame(cv_results_svd).mean()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 16, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "test_rmse 2.640803\n", 122 | "test_mae 2.255504\n", 123 | "fit_time 22.795353\n", 124 | "test_time 1.005285\n", 125 | "dtype: float64" 126 | ] 127 | }, 128 | "execution_count": 16, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "model_nmf = NMF()\n", 135 | "cv_results_nmf = cross_validate(model_nmf, data, cv=3)\n", 136 | "pd.DataFrame(cv_results_nmf).mean()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Optimisation of SVD algorithm\n", 144 | "\n", 145 | "Grid Search Cross Validation computes accuracy metrics for an algorithm on various combinations of parameters, over a cross-validation procedure. It's useful for finding the best configuration of parameters.\n", 146 | "\n", 147 | "Here it is used to find the best setting of the following parameters:\n", 148 | "* n_factors - the number of factors\n", 149 | "* n_epochs - the number of iterations of the SGD procedure\n", 150 | "* lr_all - the learning rate for all parameters\n", 151 | "* reg_all - the regularization term for all parameters\n", 152 | "\n", 153 | "As a result, the default setting remains the best choice for n_epochs and lr_all, while different values are selected for n_factors and reg_all. 
The improvement obtained with Grid Search is very small." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 17, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "1.5981785240945765\n", 166 | "{'n_factors': 80, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "param_grid = {'n_factors': [80,100,120],\n", 172 | " 'n_epochs': [5, 10, 20],\n", 173 | " 'lr_all': [0.002, 0.005],\n", 174 | " 'reg_all': [0.2, 0.4, 0.6]}\n", 175 | "\n", 176 | "gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)\n", 177 | "gs.fit(data)\n", 178 | "\n", 179 | "print(gs.best_score['rmse'])\n", 180 | "print(gs.best_params['rmse'])\n", 181 | "\n", 182 | "#1.5981785240945765\n", 183 | "#{'n_factors': 80, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Analysis of Collaborative Filtering model results\n", 191 | "\n", 192 | "In this part, let's examine in detail the results obtained by the SVD model that provided the best RMSE score." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 4, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "trainset, testset = train_test_split(data, test_size=0.2)\n", 202 | "\n", 203 | "model = SVD(n_factors=80, n_epochs=20, lr_all=0.005, reg_all=0.2)\n", 204 | "model.fit(trainset)\n", 205 | "predictions = model.test(testset)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 5, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
\n", 217 | "\n", 230 | "\n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | "
user_idisbnactual_ratingpred_ratingimpossiblepred_rating_roundabs_err
611824299903453745685.07.246858False7.02.246858
1718567840155166951X10.08.513183False9.01.486817
2131378553045140432710.09.083398False9.00.916602
2342310778403730314675.05.890978False6.00.890978
98999525003757256019.08.035049False8.00.964951
\n", 296 | "
" 297 | ], 298 | "text/plain": [ 299 | " user_id isbn actual_rating pred_rating impossible \\\n", 300 | "6118 242999 0345374568 5.0 7.246858 False \n", 301 | "17185 67840 155166951X 10.0 8.513183 False \n", 302 | "21313 78553 0451404327 10.0 9.083398 False \n", 303 | "23423 107784 0373031467 5.0 5.890978 False \n", 304 | "9899 95250 0375725601 9.0 8.035049 False \n", 305 | "\n", 306 | " pred_rating_round abs_err \n", 307 | "6118 7.0 2.246858 \n", 308 | "17185 9.0 1.486817 \n", 309 | "21313 9.0 0.916602 \n", 310 | "23423 6.0 0.890978 \n", 311 | "9899 8.0 0.964951 " 312 | ] 313 | }, 314 | "execution_count": 5, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "df_pred = pd.DataFrame(predictions, columns=['user_id', 'isbn', 'actual_rating', 'pred_rating', 'details'])\n", 321 | "\n", 322 | "df_pred['impossible'] = df_pred['details'].apply(lambda x: x['was_impossible'])\n", 323 | "df_pred['pred_rating_round'] = df_pred['pred_rating'].round()\n", 324 | "df_pred['abs_err'] = abs(df_pred['pred_rating'] - df_pred['actual_rating'])\n", 325 | "df_pred.drop(['details'], axis=1, inplace=True)\n", 326 | "\n", 327 | "df_pred.sample(5)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "### Distribution of actual and predicted ratings in the test set\n", 335 | "\n", 336 | "According to the distribution of actual ratings of books in the test set, the biggest part of users give positive scores - between 7 and 10. The mode equals 8 but count of ratings 7, 9, 10 is also noticeable. The distribution of predicted ratings in the test set is visibly different. One more time, 8 is a mode but scores 7, 9 and 10 are clearly less frequent.\n", 337 | "\n", 338 | "It shows that the recommender system is not perfect and it cannot reflect the real distribution of book ratings." 
339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 6, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "image/png": "\n", 349 | "text/plain": [ 350 | "
" 351 | ] 352 | }, 353 | "metadata": { 354 | "needs_background": "light" 355 | }, 356 | "output_type": "display_data" 357 | } 358 | ], 359 | "source": [ 360 | "palette = sns.color_palette(\"RdBu\", 10)\n", 361 | "fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))\n", 362 | "\n", 363 | "sns.countplot(x='actual_rating', data=df_pred, palette=palette, ax=ax1)\n", 364 | "ax1.set_title('Distribution of actual ratings of books in the test set')\n", 365 | "\n", 366 | "sns.countplot(x='pred_rating_round', data=df_pred, palette=palette, ax=ax2)\n", 367 | "ax2.set_title('Distribution of predicted ratings of books in the test set')\n", 368 | "\n", 369 | "plt.show()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "### Absolute error of predicted ratings\n", 377 | "\n", 378 | "The distribution of absolute errors is right-skewed, showing that the majority of errors is small: between 0 and 1. There is a long tail that indicates that there are several observations for which the absolute error was close to 10.\n", 379 | "\n", 380 | "How good/bad the model is with predicting certain scores? As expected from the above charts, the model deals very well with predicting score = 8 (the most frequent value). The further the rating from score = 8, the higher the absolute error. The biggest errors happen to observations with scores 1 or 2 which indicates that probably the model is predicting high ratings for those observations." 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 7, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "image/png": "\n", 391 | "text/plain": [ 392 | "
" 393 | ] 394 | }, 395 | "metadata": { 396 | "needs_background": "light" 397 | }, 398 | "output_type": "display_data" 399 | } 400 | ], 401 | "source": [ 402 | "df_pred_err = df_pred.groupby('actual_rating')['abs_err'].mean().reset_index()\n", 403 | "\n", 404 | "fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))\n", 405 | "\n", 406 | "sns.distplot(df_pred['abs_err'], color='#2f6194', ax=ax1)\n", 407 | "ax1.set_title('Distribution of absolute error in test set')\n", 408 | "\n", 409 | "sns.barplot(x='actual_rating', y='abs_err', data=df_pred_err, palette=palette, ax=ax2)\n", 410 | "ax2.set_title('Mean absolute error for rating in test set')\n", 411 | "\n", 412 | "plt.show()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "### Analysis of predicted ratings of a particular user\n", 420 | "\n", 421 | "For this part of the analysis, the user with id 193458 was selected. By analyzing book ratings by this user, it can be noted that he/she likes diverse types of readings: English romantic novels (Pride and Prejudice, Sense and Sensibility), fantasy (Narnia) as well as historical novels (Schindler's List). Among the recommended books there are other works from Narnia's series, two historical novels and one romance which correlates with user's previous preferences." 
422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 17, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "df_books = pd.read_csv('data/books.csv')\n", 431 | "\n", 432 | "df_ext = df.merge(df_books[['isbn', 'book_title']], on='isbn', how='left')\n", 433 | "df_ext['book_title_short'] = df_ext['book_title'].apply(f.short_title)\n", 434 | "df_ext = df_ext.merge(df_pred[['isbn', 'user_id', 'pred_rating']], on=['isbn', 'user_id'], how='left')" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 11, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "data": { 444 | "text/html": [ 445 | "
\n", 446 | "\n", 459 | "\n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | "
user_idisbnbook_ratingbook_titlebook_title_shortpred_rating
124989193458185326000210Pride & Prejudice (Wordsworth Classics)Pride & Prejudice (Wordsworth Classics)NaN
12494219345801406201259Wuthering Heights (Penguin Popular Classics)Wuthering Heights (Penguin Popular Classics)NaN
12495219345803453425699Shoeless JoeShoeless JoeNaN
12494019345801402984799Bridget Jones: The Edge of ReasonBridget Jones: The Edge of ReasonNaN
124991193458185326016910Sense and Sensibility (Wordsworth Classics)Sense and Sensibility (Wordsworth Classics)NaN
12497819345806718803149Schindler's ListSchindler's ListNaN
12495119345803303526959Four Letters of LoveFour Letters of LoveNaN
12493219345800644710479The Lion, the Witch, and the Wardrobe (The Chr...The Lion, the Witch, and the Wardrobe (TheNaN
12493819345800644711019The Magician's Nephew (rack) (Narnia)The Magician's Nephew (rack) (Narnia)NaN
124936193458006447108X9The Last BattleThe Last BattleNaN
\n", 564 | "
" 565 | ], 566 | "text/plain": [ 567 | " user_id isbn book_rating \\\n", 568 | "124989 193458 1853260002 10 \n", 569 | "124942 193458 0140620125 9 \n", 570 | "124952 193458 0345342569 9 \n", 571 | "124940 193458 0140298479 9 \n", 572 | "124991 193458 1853260169 10 \n", 573 | "124978 193458 0671880314 9 \n", 574 | "124951 193458 0330352695 9 \n", 575 | "124932 193458 0064471047 9 \n", 576 | "124938 193458 0064471101 9 \n", 577 | "124936 193458 006447108X 9 \n", 578 | "\n", 579 | " book_title \\\n", 580 | "124989 Pride & Prejudice (Wordsworth Classics) \n", 581 | "124942 Wuthering Heights (Penguin Popular Classics) \n", 582 | "124952 Shoeless Joe \n", 583 | "124940 Bridget Jones: The Edge of Reason \n", 584 | "124991 Sense and Sensibility (Wordsworth Classics) \n", 585 | "124978 Schindler's List \n", 586 | "124951 Four Letters of Love \n", 587 | "124932 The Lion, the Witch, and the Wardrobe (The Chr... \n", 588 | "124938 The Magician's Nephew (rack) (Narnia) \n", 589 | "124936 The Last Battle \n", 590 | "\n", 591 | " book_title_short pred_rating \n", 592 | "124989 Pride & Prejudice (Wordsworth Classics) NaN \n", 593 | "124942 Wuthering Heights (Penguin Popular Classics) NaN \n", 594 | "124952 Shoeless Joe NaN \n", 595 | "124940 Bridget Jones: The Edge of Reason NaN \n", 596 | "124991 Sense and Sensibility (Wordsworth Classics) NaN \n", 597 | "124978 Schindler's List NaN \n", 598 | "124951 Four Letters of Love NaN \n", 599 | "124932 The Lion, the Witch, and the Wardrobe (The NaN \n", 600 | "124938 The Magician's Nephew (rack) (Narnia) NaN \n", 601 | "124936 The Last Battle NaN " 602 | ] 603 | }, 604 | "execution_count": 11, 605 | "metadata": {}, 606 | "output_type": "execute_result" 607 | } 608 | ], 609 | "source": [ 610 | "selected_user_id = 193458\n", 611 | "df_user = df_ext[df_ext['user_id']==selected_user_id]\n", 612 | "\n", 613 | "df_user[(df_user['pred_rating'].isna())&(df_user['book_rating']>=9)].sample(10)" 614 | ] 615 | }, 616 | { 617 | "cell_type": 
"markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "### Train set: Top rated books\n", 621 | "\n", 622 | "![](img/train_actual.jpg)" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 12, 628 | "metadata": {}, 629 | "outputs": [ 630 | { 631 | "data": { 632 | "text/html": [ 633 | "
\n", 634 | "\n", 647 | "\n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | "
user_idisbnbook_ratingbook_titlebook_title_shortpred_rating
12494619345801420017409The Secret Life of BeesThe Secret Life of Bees8.281881
12493519345800644710719The Voyage of the Dawn Treader (rack) (Narnia)The Voyage of the Dawn Treader (rack) (Narnia)8.244509
12493719345800644710989The Silver ChairThe Silver Chair8.184727
12497419345805532580019The Cider House RulesThe Cider House Rules8.057183
12495819345803454310579Slaves in the Family (Ballantine Reader's Circle)Slaves in the Family (Ballantine Reader's8.055557
\n", 707 | "
" 708 | ], 709 | "text/plain": [ 710 | " user_id isbn book_rating \\\n", 711 | "124946 193458 0142001740 9 \n", 712 | "124935 193458 0064471071 9 \n", 713 | "124937 193458 0064471098 9 \n", 714 | "124974 193458 0553258001 9 \n", 715 | "124958 193458 0345431057 9 \n", 716 | "\n", 717 | " book_title \\\n", 718 | "124946 The Secret Life of Bees \n", 719 | "124935 The Voyage of the Dawn Treader (rack) (Narnia) \n", 720 | "124937 The Silver Chair \n", 721 | "124974 The Cider House Rules \n", 722 | "124958 Slaves in the Family (Ballantine Reader's Circle) \n", 723 | "\n", 724 | " book_title_short pred_rating \n", 725 | "124946 The Secret Life of Bees 8.281881 \n", 726 | "124935 The Voyage of the Dawn Treader (rack) (Narnia) 8.244509 \n", 727 | "124937 The Silver Chair 8.184727 \n", 728 | "124974 The Cider House Rules 8.057183 \n", 729 | "124958 Slaves in the Family (Ballantine Reader's 8.055557 " 730 | ] 731 | }, 732 | "execution_count": 12, 733 | "metadata": {}, 734 | "output_type": "execute_result" 735 | } 736 | ], 737 | "source": [ 738 | "df_user[df_user['pred_rating'].notna()].sort_values('pred_rating', ascending=False).head(5)" 739 | ] 740 | }, 741 | { 742 | "cell_type": "markdown", 743 | "metadata": {}, 744 | "source": [ 745 | "### Test set: predicted top rated books\n", 746 | "\n", 747 | "![](img/test_pred.jpg)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 13, 753 | "metadata": {}, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/html": [ 758 | "
\n", 759 | "\n", 772 | "\n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | "
user_idisbnbook_ratingbook_titlebook_title_shortpred_rating
12493419345800644710639The Horse and His BoyThe Horse and His Boy7.814202
12493519345800644710719The Voyage of the Dawn Treader (rack) (Narnia)The Voyage of the Dawn Treader (rack) (Narnia)8.244509
12493719345800644710989The Silver ChairThe Silver Chair8.184727
12494619345801420017409The Secret Life of BeesThe Secret Life of Bees8.281881
12495819345803454310579Slaves in the Family (Ballantine Reader's Circle)Slaves in the Family (Ballantine Reader's8.055557
\n", 832 | "
" 833 | ], 834 | "text/plain": [ 835 | " user_id isbn book_rating \\\n", 836 | "124934 193458 0064471063 9 \n", 837 | "124935 193458 0064471071 9 \n", 838 | "124937 193458 0064471098 9 \n", 839 | "124946 193458 0142001740 9 \n", 840 | "124958 193458 0345431057 9 \n", 841 | "\n", 842 | " book_title \\\n", 843 | "124934 The Horse and His Boy \n", 844 | "124935 The Voyage of the Dawn Treader (rack) (Narnia) \n", 845 | "124937 The Silver Chair \n", 846 | "124946 The Secret Life of Bees \n", 847 | "124958 Slaves in the Family (Ballantine Reader's Circle) \n", 848 | "\n", 849 | " book_title_short pred_rating \n", 850 | "124934 The Horse and His Boy 7.814202 \n", 851 | "124935 The Voyage of the Dawn Treader (rack) (Narnia) 8.244509 \n", 852 | "124937 The Silver Chair 8.184727 \n", 853 | "124946 The Secret Life of Bees 8.281881 \n", 854 | "124958 Slaves in the Family (Ballantine Reader's 8.055557 " 855 | ] 856 | }, 857 | "execution_count": 13, 858 | "metadata": {}, 859 | "output_type": "execute_result" 860 | } 861 | ], 862 | "source": [ 863 | "df_user[df_user['pred_rating'].notna()].sort_values('book_rating', ascending=False).head(5)" 864 | ] 865 | }, 866 | { 867 | "cell_type": "markdown", 868 | "metadata": {}, 869 | "source": [ 870 | "### Test set: actual top rated books\n", 871 | "\n", 872 | "![](img/test_actual.jpg)" 873 | ] 874 | } 875 | ], 876 | "metadata": { 877 | "kernelspec": { 878 | "display_name": "master", 879 | "language": "python", 880 | "name": "master" 881 | }, 882 | "language_info": { 883 | "codemirror_mode": { 884 | "name": "ipython", 885 | "version": 3 886 | }, 887 | "file_extension": ".py", 888 | "mimetype": "text/x-python", 889 | "name": "python", 890 | "nbconvert_exporter": "python", 891 | "pygments_lexer": "ipython3", 892 | "version": "3.7.6" 893 | } 894 | }, 895 | "nbformat": 4, 896 | "nbformat_minor": 2 897 | } 898 | -------------------------------------------------------------------------------- /functions.py: 
"""Helper functions shared by the book-rating notebooks.

The module is split into four groups: small DataFrame/value helpers,
builders of summary DataFrames, plotting helpers and helpers for
evaluating ``surprise`` models.
"""
from string import ascii_letters, digits

import pandas as pd

# NOTE: seaborn, matplotlib and surprise are imported inside the functions
# that need them, so the plain DataFrame helpers below can be imported and
# used without the plotting / modelling stack being installed.

# Characters accepted by ascii_check(): ASCII letters and digits only.
_ALPHANUMERIC = frozenset(ascii_letters + digits)


### DataFrame operations

def k_from_details(details):
    """Return ``details['actual_k']``, or 1000 when that key is missing."""
    return details.get('actual_k', 1000)


def short_title(title, max_len=40):
    """Shorten a title to whole words.

    Words (split on single spaces) are appended for as long as the text
    accumulated so far is shorter than ``max_len``.  The word that crosses
    the limit is kept, so the result may be longer than ``max_len``.  Each
    word is counted together with one separator character, which mirrors
    how the text is accumulated before the final ``strip()``.

    Non-string input is converted with ``str()`` first.
    """
    kept_words = []
    accumulated_len = 0

    for word in str(title).split(' '):
        if accumulated_len >= max_len:
            # The length can only grow, so no later word can be added.
            break
        kept_words.append(word)
        accumulated_len += len(word) + 1  # the word plus its separator

    return ' '.join(kept_words).strip()


def ascii_check(item):
    """Return 1 if ``str(item)`` has a character that is not an ASCII letter or digit.

    Returns 0 otherwise, including for an empty string.  Note that spaces
    and punctuation count as "not allowed" too.

    Every character is inspected; previously only the first one was, and
    an empty string produced ``None`` instead of 0.
    """
    return int(any(char not in _ALPHANUMERIC for char in str(item)))


def ascii_check_bulk(df):
    """Print, for every column of ``df``, how many values fail ``ascii_check``."""
    for col in df.columns:
        flagged = df[col].apply(ascii_check).sum()
        print('items with non-ascii characters in %s: %d' % (col, flagged))
    print('')


def colname_fix(colname):
    """Return ``colname`` lower-cased with ``-`` replaced by ``_``."""
    return colname.lower().replace('-', '_')


### New DataFrames

def df_dist(df, colname, norm=False):
    """Return the value distribution of ``df[colname]``.

    The result has two columns: ``colname`` (the distinct values) and
    ``'count'`` (their frequency, or relative frequency when ``norm`` is
    true), in the order produced by ``Series.value_counts``.
    """
    new_df = df[colname].value_counts(normalize=norm).reset_index()
    # Overwrite the column names; the defaults differ between pandas versions.
    new_df.columns = [colname, 'count']
    return new_df


def books_groupby(df, column, new_colname):
    """Group ``df`` by ``column`` and summarise the ratings of each group.

    Returns a DataFrame with the columns ``new_colname`` (group key),
    ``'count'`` (number of non-null ``isbn`` values) and ``'avg_rating'``
    (mean of ``book_rating``).
    """
    df_groupby = (
        df.groupby(column)
        .agg({'isbn': 'count', 'book_rating': 'mean'})
        .reset_index()
    )
    df_groupby.columns = [new_colname, 'count', 'avg_rating']
    return df_groupby


### Visualizations

def draw_distribution(data, title_part, threshold=20):
    """Plot the distribution of ``data['count']`` in two side-by-side charts.

    Left: distribution of all counts.  Right: count plot restricted to
    rows with ``count <= threshold``.  ``title_part`` is inserted into both
    chart titles.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))

    # NOTE(review): distplot is deprecated in recent seaborn releases; kept
    # because its replacement changes the plot and needs seaborn >= 0.11.
    sns.distplot(data['count'], color='#2f6194', ax=ax1)
    ax1.set_title('Distribution of number of ratings per %s' % title_part)

    # Pass the series as ``x=`` explicitly: newer seaborn versions no longer
    # treat the first positional argument as ``x``.
    low_counts = data[data['count'] <= threshold]['count']
    sns.countplot(x=low_counts, color='#2f6194', ax=ax2)
    ax2.set_title('Distribution of number of ratings per %s (<= %d ratings)' % (title_part, threshold))

    plt.show()


def draw_top_chart(data, x, y_list, title):
    """Draw a bar chart of ``y_list[0]`` with ``y_list[1]`` overlaid as points.

    Both series are plotted against column ``x`` of ``data``; the points
    use a secondary y axis.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    fig, ax1 = plt.subplots(figsize=(14, 6))
    plt.xticks(rotation=90)

    palette = sns.color_palette("RdBu", len(data))

    sns.barplot(x=x, y=y_list[0], data=data, palette=palette, ax=ax1)
    ax1.set_title(title)

    ax2 = ax1.twinx()
    sns.scatterplot(x=x, y=y_list[1], data=data, color='black', ax=ax2)

    plt.show()


### Model-related functions

def get_model_name(model):
    """Return the class name of ``model`` (a class or an instance).

    Uses ``__name__`` instead of parsing ``str(model)``, so it also works
    for classes without a dotted module path and for objects with a custom
    ``repr``.
    """
    model_class = model if isinstance(model, type) else type(model)
    return model_class.__name__


def cv_multiple_models(data, models_dict, cv=3):
    """Cross-validate several models and collect their mean results.

    ``models_dict`` maps a display name to a model instance.  Each model is
    passed to ``surprise.model_selection.cross_validate`` together with
    ``data`` and ``cv``; the per-fold results are averaged.

    Returns a DataFrame with one row per model: the averaged measures plus
    a ``'model'`` column holding the display name.  An empty ``models_dict``
    gives an empty DataFrame.
    """
    from surprise.model_selection import cross_validate

    rows = []

    for model_name, model in models_dict.items():
        print('\n---> CV for %s...' % model_name)

        cv_results = cross_validate(model, data, cv=cv)
        row = pd.DataFrame(cv_results).mean().to_dict()
        row['model'] = model_name
        rows.append(row)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    return pd.DataFrame(rows)


def generate_models_dict(models, sim_names, user_based):
    """Instantiate every model class with every similarity measure.

    Returns a dict keyed by ``'<model class name> <sim_name>'`` whose values
    are ``model(sim_options={'name': sim_name, 'user_based': user_based})``.
    """
    models_dict = {}

    for sim_name in sim_names:
        for model in models:
            model_name = get_model_name(model) + ' ' + sim_name
            # A fresh dict per model, so a model that modifies its
            # sim_options cannot affect the other models.
            sim_options = {'name': sim_name, 'user_based': user_based}
            models_dict[model_name] = model(sim_options=sim_options)

    return models_dict


def draw_model_results(results):
    """Plot test RMSE (bars) and fit time (points) for each evaluated model.

    ``results`` must have the columns ``'model'``, ``'test_rmse'`` and
    ``'fit_time'``.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    fig, ax1 = plt.subplots(figsize=(10, 6))
    plt.xticks(rotation=90)

    palette = sns.color_palette("RdBu", len(results))

    sns.barplot(x='model', y='test_rmse', data=results, palette=palette, ax=ax1)
    ax1.set_title('Test RMSE and fit time of evaluated models')

    ax2 = ax1.twinx()
    sns.scatterplot(x='model', y='fit_time', data=results, color='black', ax=ax2)
    # Start the fit-time axis at zero and leave 10% headroom above the maximum.
    ax2.set(ylim=(0, results['fit_time'].max() * 1.1))

    plt.show()
-------------------------------------------------------------------------------- /img/books_header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klaudia-nazarko/collaborative-filtering-python/8196bce6135bc42a40b36c2f7a1c214dddcf09b1/img/books_header.jpg -------------------------------------------------------------------------------- /img/test_actual.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klaudia-nazarko/collaborative-filtering-python/8196bce6135bc42a40b36c2f7a1c214dddcf09b1/img/test_actual.jpg -------------------------------------------------------------------------------- /img/test_pred.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klaudia-nazarko/collaborative-filtering-python/8196bce6135bc42a40b36c2f7a1c214dddcf09b1/img/test_pred.jpg -------------------------------------------------------------------------------- /img/train_actual.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klaudia-nazarko/collaborative-filtering-python/8196bce6135bc42a40b36c2f7a1c214dddcf09b1/img/train_actual.jpg --------------------------------------------------------------------------------