├── .gitignore
├── README.md
├── book-crossing-eda.ipynb
├── book-crossing-preprocessing.ipynb
├── collaborative-filtering-memory-based.ipynb
├── collaborative-filtering-model-based.ipynb
├── functions.py
└── img
├── books_header.jpg
├── test_actual.jpg
├── test_pred.jpg
└── train_actual.jpg
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.gitignore.io/api/python,pycharm,windows,jupyternotebooks
3 | # Edit at https://www.gitignore.io/?templates=python,pycharm,windows,jupyternotebooks
4 |
5 | ### JupyterNotebooks ###
6 | # gitignore template for Jupyter Notebooks
7 | # website: http://jupyter.org/
8 |
9 | .ipynb_checkpoints
10 | */.ipynb_checkpoints/*
11 |
12 | # IPython
13 | profile_default/
14 | ipython_config.py
15 |
16 | # Remove previous ipynb_checkpoints
17 | # git rm -r .ipynb_checkpoints/
18 |
19 | ### PyCharm ###
20 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
21 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
22 |
23 | # User-specific stuff
24 | .idea/**/workspace.xml
25 | .idea/**/tasks.xml
26 | .idea/**/usage.statistics.xml
27 | .idea/**/dictionaries
28 | .idea/**/shelf
29 |
30 | # Generated files
31 | .idea/**/contentModel.xml
32 |
33 | # Sensitive or high-churn files
34 | .idea/**/dataSources/
35 | .idea/**/dataSources.ids
36 | .idea/**/dataSources.local.xml
37 | .idea/**/sqlDataSources.xml
38 | .idea/**/dynamic.xml
39 | .idea/**/uiDesigner.xml
40 | .idea/**/dbnavigator.xml
41 |
42 | # Gradle
43 | .idea/**/gradle.xml
44 | .idea/**/libraries
45 |
46 | # Gradle and Maven with auto-import
47 | # When using Gradle or Maven with auto-import, you should exclude module files,
48 | # since they will be recreated, and may cause churn. Uncomment if using
49 | # auto-import.
50 | # .idea/modules.xml
51 | # .idea/*.iml
52 | # .idea/modules
53 | # *.iml
54 | # *.ipr
55 |
56 | # CMake
57 | cmake-build-*/
58 |
59 | # Mongo Explorer plugin
60 | .idea/**/mongoSettings.xml
61 |
62 | # File-based project format
63 | *.iws
64 |
65 | # IntelliJ
66 | out/
67 |
68 | # mpeltonen/sbt-idea plugin
69 | .idea_modules/
70 |
71 | # JIRA plugin
72 | atlassian-ide-plugin.xml
73 |
74 | # Cursive Clojure plugin
75 | .idea/replstate.xml
76 |
77 | # Crashlytics plugin (for Android Studio and IntelliJ)
78 | com_crashlytics_export_strings.xml
79 | crashlytics.properties
80 | crashlytics-build.properties
81 | fabric.properties
82 |
83 | # Editor-based Rest Client
84 | .idea/httpRequests
85 |
86 | # Android studio 3.1+ serialized cache file
87 | .idea/caches/build_file_checksums.ser
88 |
89 | ### PyCharm Patch ###
90 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
91 |
92 | # *.iml
93 | # modules.xml
94 | # .idea/misc.xml
95 | # *.ipr
96 |
97 | # Sonarlint plugin
98 | .idea/**/sonarlint/
99 |
100 | # SonarQube Plugin
101 | .idea/**/sonarIssues.xml
102 |
103 | # Markdown Navigator plugin
104 | .idea/**/markdown-navigator.xml
105 | .idea/**/markdown-navigator/
106 |
107 | ### Python ###
108 | # Byte-compiled / optimized / DLL files
109 | __pycache__/
110 | *.py[cod]
111 | *$py.class
112 |
113 | # C extensions
114 | *.so
115 |
116 | # Distribution / packaging
117 | .Python
118 | build/
119 | develop-eggs/
120 | dist/
121 | downloads/
122 | eggs/
123 | .eggs/
124 | lib/
125 | lib64/
126 | parts/
127 | sdist/
128 | var/
129 | wheels/
130 | pip-wheel-metadata/
131 | share/python-wheels/
132 | *.egg-info/
133 | .installed.cfg
134 | *.egg
135 | MANIFEST
136 |
137 | # PyInstaller
138 | # Usually these files are written by a python script from a template
139 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
140 | *.manifest
141 | *.spec
142 |
143 | # Installer logs
144 | pip-log.txt
145 | pip-delete-this-directory.txt
146 |
147 | # Unit test / coverage reports
148 | htmlcov/
149 | .tox/
150 | .nox/
151 | .coverage
152 | .coverage.*
153 | .cache
154 | nosetests.xml
155 | coverage.xml
156 | *.cover
157 | .hypothesis/
158 | .pytest_cache/
159 |
160 | # Translations
161 | *.mo
162 | *.pot
163 |
164 | # Scrapy stuff:
165 | .scrapy
166 |
167 | # Sphinx documentation
168 | docs/_build/
169 |
170 | # PyBuilder
171 | target/
172 |
173 | # pyenv
174 | .python-version
175 |
176 | # pipenv
177 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
178 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
179 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
180 | # install all needed dependencies.
181 | #Pipfile.lock
182 |
183 | # celery beat schedule file
184 | celerybeat-schedule
185 |
186 | # SageMath parsed files
187 | *.sage.py
188 |
189 | # Spyder project settings
190 | .spyderproject
191 | .spyproject
192 |
193 | # Rope project settings
194 | .ropeproject
195 |
196 | # Mr Developer
197 | .mr.developer.cfg
198 | .project
199 | .pydevproject
200 |
201 | # mkdocs documentation
202 | /site
203 |
204 | # mypy
205 | .mypy_cache/
206 | .dmypy.json
207 | dmypy.json
208 |
209 | # Pyre type checker
210 | .pyre/
211 |
212 | ### Windows ###
213 | # Windows thumbnail cache files
214 | Thumbs.db
215 | Thumbs.db:encryptable
216 | ehthumbs.db
217 | ehthumbs_vista.db
218 |
219 | # Dump file
220 | *.stackdump
221 |
222 | # Folder config file
223 | [Dd]esktop.ini
224 |
225 | # Recycle Bin used on file shares
226 | $RECYCLE.BIN/
227 |
228 | # Windows Installer files
229 | *.cab
230 | *.msi
231 | *.msix
232 | *.msm
233 | *.msp
234 |
235 | # Windows shortcuts
236 | *.lnk
237 |
238 | # Custom
239 | /data/
240 | .idea/
241 |
242 | # End of https://www.gitignore.io/api/python,pycharm,windows,jupyternotebooks
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Collaborative Filtering Recommender System with Python
2 |
3 | 
4 |
5 |
6 |
7 | **Collaborative filtering** is a technique commonly used to build personalized recommendations in online products. Popular websites that use collaborative filtering include Amazon, Netflix and IMDb. In collaborative filtering, algorithms make automatic predictions about a user's interests by compiling preferences from several users.
8 |
9 | The main focus of this repository is to build collaborative filtering recommender systems for a **Book-Crossing dataset**. It contains data about book ratings collected in a 4-week crawl in 2004 as well as detailed information about books and users. Further details on the dataset are given in this publication:
10 |
11 | > [Improving Recommendation Lists Through Topic Diversification](http://www2.informatik.uni-freiburg.de/~dbis/Publications/05/WWW05.html),
12 | >
13 | > Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen; *Proceedings of the 14th International World Wide Web Conference (WWW '05),* May 10-14, 2005, Chiba, Japan. *To appear.*
14 |
15 |
16 |
17 | ------
18 |
19 | **Contents:**
20 |
21 | 1. [**Preprocessing of Book-Crossing dataset**](book-crossing-preprocessing.ipynb) - the script includes loading data in the correct format, filtering out incorrect rows and reducing dimensionality of the dataset.
22 | 2. [**Exploratory Data Analysis of Book-Crossing dataset**](book-crossing-eda.ipynb) - the analysis provides insights about distribution of ratings, most popular readings and characteristics of users giving the scores.
23 | 3. [**Memory-based approach to Collaborative Filtering**](collaborative-filtering-memory-based.ipynb) - memory based algorithms apply statistical techniques to the entire dataset to calculate the predictions. In this notebook two methods are compared (user-user and user-item) and the model is optimized to provide the best predictions.
24 | 4. [**Model-based approach to Collaborative Filtering**](collaborative-filtering-model-based.ipynb) - the model-based approach involves building machine learning algorithms to predict users' ratings. In this notebook SVD and NMF methods are compared and the model is optimized to provide the best predictions.
25 |
26 | ------
27 |
28 | **Reference:**
29 |
30 | 1. https://surprise.readthedocs.io/en/stable/getting_started.html#getting-started
31 | 2. https://realpython.com/build-recommendation-engine-collaborative-filtering/
32 | 3. https://towardsdatascience.com/various-implementations-of-collaborative-filtering-100385c6dfe0
33 | 4. https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b
--------------------------------------------------------------------------------
/book-crossing-preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Preprocessing of Book-Crossing Dataset\n",
8 | "\n",
9 | "The [Book-Crossing dataset](http://www2.informatik.uni-freiburg.de/~cziegler/BX/) contains data about book ratings, books and users collected by Cai-Nicolas Ziegler in a 4-week crawl (August / September 2004)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import numpy as np\n",
20 | "\n",
21 | "import functions as f"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "path = 'data/original/'\n",
31 | "\n",
32 | "df_ratings = pd.read_csv(path + 'BX-Book-Ratings.csv', sep=';', encoding='ansi')\n",
33 | "df_books = pd.read_csv(path + 'BX-Books.csv', sep=';', encoding='ansi', escapechar='\\\\')\n",
34 | "df_users = pd.read_csv(path + 'BX-Users.csv', sep=';', encoding='ansi')"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 3,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "for df in [df_ratings, df_books, df_users]:\n",
44 | " df.columns = [f.colname_fix(col) for col in df.columns]"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 4,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "Ratings:\n",
57 | "Number of ratings: 1149780\n",
58 | "Number of books: 340556\n",
59 | "Number of users: 105283\n",
60 | "\n",
61 | "Number of books: 271379\n",
62 | "\n",
63 | "Number of users: 278858\n"
64 | ]
65 | }
66 | ],
67 | "source": [
68 | "print('Ratings:\\nNumber of ratings: %d\\nNumber of books: %d\\nNumber of users: %d' % (len(df_ratings),\n",
69 | " len(df_ratings['isbn'].unique()),\n",
70 | " len(df_ratings['user_id'].unique())))\n",
71 | "print('\\nNumber of books: %d' % len(df_books))\n",
72 | "print('\\nNumber of users: %d' % len(df_users))"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "items with non-ascii characters in user_id: 0\n",
85 | "items with non-ascii characters in isbn: 55\n",
86 | "items with non-ascii characters in book_rating: 0\n",
87 | "\n",
88 | "items with non-ascii characters in isbn: 0\n",
89 | "items with non-ascii characters in book_title: 365\n",
90 | "items with non-ascii characters in book_author: 21\n",
91 | "items with non-ascii characters in year_of_publication: 0\n",
92 | "items with non-ascii characters in publisher: 33\n",
93 | "items with non-ascii characters in image_url_s: 0\n",
94 | "items with non-ascii characters in image_url_m: 0\n",
95 | "items with non-ascii characters in image_url_l: 0\n",
96 | "\n",
97 | "items with non-ascii characters in user_id: 0\n",
98 | "items with non-ascii characters in location: 560\n",
99 | "items with non-ascii characters in age: 0\n",
100 | "\n"
101 | ]
102 | }
103 | ],
104 | "source": [
105 | "f.ascii_check_bulk(df_ratings)\n",
106 | "f.ascii_check_bulk(df_books)\n",
107 | "f.ascii_check_bulk(df_users)"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## Filtering observations\n",
115 | "* Remove (incorrect) ISBN with non-ascii characters\n",
116 | "* Use only country instead of whole 'location' data\n",
117 | "* Remove images' urls\n",
118 | "* Separate explicit (1-10) and implicit (0) ratings"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 6,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "df_ratings['isbn_check'] = df_ratings['isbn'].apply(f.ascii_check)\n",
128 | "df_ratings = df_ratings[df_ratings['isbn_check']==0]"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 7,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "df_users['country'] = df_users['location'].apply(lambda x: x.split(', ')[-1].title())\n",
138 | "df_users['country_check'] = df_users['country'].apply(f.ascii_check)\n",
139 | "df_users.loc[df_users['country_check']==1, 'country'] = np.nan"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 8,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "df_ratings.drop(['isbn_check'], axis=1, inplace=True)\n",
149 | "df_books.drop(['image_url_s', 'image_url_m', 'image_url_l'], axis=1, inplace=True)\n",
150 | "df_users.drop(['country_check'], axis=1, inplace=True)"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 9,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "name": "stdout",
160 | "output_type": "stream",
161 | "text": [
162 | "Explicit ratings: 433642\n",
163 | "Implicit ratings: 716083\n"
164 | ]
165 | }
166 | ],
167 | "source": [
168 | "df_ratings_explicit = df_ratings[df_ratings['book_rating']!=0]\n",
169 | "df_ratings_implicit = df_ratings[df_ratings['book_rating']==0]\n",
170 | "\n",
171 | "print('Explicit ratings: %d\\nImplicit ratings: %d' % (len(df_ratings_explicit), len(df_ratings_implicit)))"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 10,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "df_ratings_explicit.to_csv('data/ratings_explicit.csv', encoding='utf-8', index=False)\n",
181 | "df_ratings_implicit.to_csv('data/ratings_implicit.csv', encoding='utf-8', index=False)\n",
182 | "df_books.to_csv('data/books.csv', encoding='utf-8', index=False)\n",
183 | "df_users.to_csv('data/users.csv', encoding='utf-8', index=False)"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "## Reducing the dimensionality\n",
191 | "To reduce the dimensionality of the dataset and avoid running into memory errors, the analysis focuses on users with at least 3 ratings and the top 10% most frequently rated books. The reduced dataset consists of 176,594 records."
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 11,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "name": "stdout",
201 | "output_type": "stream",
202 | "text": [
203 | "Filter: users with at least 3 ratings\n",
204 | "Number of records: 368563\n"
205 | ]
206 | }
207 | ],
208 | "source": [
209 | "user_ratings_threshold = 3\n",
210 | "\n",
211 | "filter_users = df_ratings_explicit['user_id'].value_counts()\n",
212 | "filter_users_list = filter_users[filter_users >= user_ratings_threshold].index.to_list()\n",
213 | "\n",
214 | "df_ratings_top = df_ratings_explicit[df_ratings_explicit['user_id'].isin(filter_users_list)]\n",
215 | "\n",
216 | "print('Filter: users with at least %d ratings\\nNumber of records: %d' % (user_ratings_threshold, len(df_ratings_top)))"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 12,
222 | "metadata": {},
223 | "outputs": [
224 | {
225 | "name": "stdout",
226 | "output_type": "stream",
227 | "text": [
228 | "Filter: top 10% most frequently rated books\n",
229 | "Number of records: 176594\n"
230 | ]
231 | }
232 | ],
233 | "source": [
234 | "book_ratings_threshold_perc = 0.1\n",
235 | "book_ratings_threshold = len(df_ratings_top['isbn'].unique()) * book_ratings_threshold_perc\n",
236 | "\n",
237 | "filter_books_list = df_ratings_top['isbn'].value_counts().head(int(book_ratings_threshold)).index.to_list()\n",
238 | "df_ratings_top = df_ratings_top[df_ratings_top['isbn'].isin(filter_books_list)]\n",
239 | "\n",
240 | "print('Filter: top %d%% most frequently rated books\\nNumber of records: %d' % (book_ratings_threshold_perc*100, len(df_ratings_top)))"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 13,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "df_ratings_top.to_csv('data/ratings_top.csv', encoding='utf-8', index=False)"
250 | ]
251 | }
252 | ],
253 | "metadata": {
254 | "kernelspec": {
255 | "display_name": "master",
256 | "language": "python",
257 | "name": "master"
258 | },
259 | "language_info": {
260 | "codemirror_mode": {
261 | "name": "ipython",
262 | "version": 3
263 | },
264 | "file_extension": ".py",
265 | "mimetype": "text/x-python",
266 | "name": "python",
267 | "nbconvert_exporter": "python",
268 | "pygments_lexer": "ipython3",
269 | "version": "3.7.6"
270 | }
271 | },
272 | "nbformat": 4,
273 | "nbformat_minor": 2
274 | }
275 |
--------------------------------------------------------------------------------
/collaborative-filtering-model-based.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Model Based Collaborative Filtering Recommender\n",
8 | "\n",
9 | "The goal of a **recommender system** is to predict user preferences for a set of items based on past experience. The two most popular approaches are Content-Based and Collaborative Filtering.\n",
10 | "\n",
11 | "**Collaborative filtering** is a technique used by websites like Amazon, YouTube, and Netflix. It filters out items that a user might like on the basis of reactions of similar users. There are two categories of collaborative filtering algorithms: memory based and model based.\n",
12 | "\n",
13 | "The **model-based approach** involves building machine learning algorithms to predict users' ratings. These algorithms rely on dimensionality reduction methods that replace a high-dimensional matrix containing a large number of missing values with a much smaller matrix in a lower-dimensional space.\n",
14 | "\n",
15 | "The goal of this exercise is to compare SVD and NMF algorithms, try different configurations of parameters and explore obtained results."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "import seaborn as sns\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "\n",
29 | "from surprise import Dataset, Reader\n",
30 | "from surprise import SVD, NMF\n",
31 | "from surprise.model_selection import cross_validate, train_test_split, GridSearchCV\n",
32 | "\n",
33 | "import functions as f"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "This analysis will focus on book recommendations based on the [Book-Crossing dataset](http://www2.informatik.uni-freiburg.de/~cziegler/BX/). To reduce the dimensionality of the dataset and avoid running into memory errors, the analysis focuses on users with at least 3 ratings and the top 10% most frequently rated books. The reduced dataset consists of 176,594 records.\n",
41 | "\n",
42 | "The recommender systems will be built using [surprise package](https://surprise.readthedocs.io/en/stable/getting_started.html) (Matrix Factorization - based models)."
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "df = pd.read_csv('data/ratings_top.csv')\n",
52 | "\n",
53 | "reader = Reader(rating_scale=(1, 10))\n",
54 | "data = Dataset.load_from_df(df[['user_id', 'isbn', 'book_rating']], reader)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "Number of ratings: 176594\n",
67 | "Number of books: 16766\n",
68 | "Number of users: 20149\n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "print('Number of ratings: %d\\nNumber of books: %d\\nNumber of users: %d' % (len(df), len(df['isbn'].unique()), len(df['user_id'].unique())))"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## SVD and NMF models comparison\n",
81 | "\n",
82 | "Singular Value Decomposition (SVD) and Non-negative Matrix Factorization (NMF) are matrix factorization techniques used for dimensionality reduction. Surprise package provides implementation of those algorithms.\n",
83 | "\n",
84 | "It's clear that for the given dataset much better results can be obtained with the SVD approach: it is considerably more accurate (lower RMSE and MAE) and faster to train, while the testing times of both models are comparable."
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 15,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/plain": [
95 | "test_rmse 1.606926\n",
96 | "test_mae 1.242338\n",
97 | "fit_time 18.130412\n",
98 | "test_time 1.120190\n",
99 | "dtype: float64"
100 | ]
101 | },
102 | "execution_count": 15,
103 | "metadata": {},
104 | "output_type": "execute_result"
105 | }
106 | ],
107 | "source": [
108 | "model_svd = SVD()\n",
109 | "cv_results_svd = cross_validate(model_svd, data, cv=3)\n",
110 | "pd.DataFrame(cv_results_svd).mean()"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 16,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "test_rmse 2.640803\n",
122 | "test_mae 2.255504\n",
123 | "fit_time 22.795353\n",
124 | "test_time 1.005285\n",
125 | "dtype: float64"
126 | ]
127 | },
128 | "execution_count": 16,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "model_nmf = NMF()\n",
135 | "cv_results_nmf = cross_validate(model_nmf, data, cv=3)\n",
136 | "pd.DataFrame(cv_results_nmf).mean()"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "## Optimisation of SVD algorithm\n",
144 | "\n",
145 | "Grid Search Cross Validation computes accuracy metrics for an algorithm on various combinations of parameters, over a cross-validation procedure. It's useful for finding the best configuration of parameters.\n",
146 | "\n",
147 | "It is used to find the best setting of parameters:\n",
148 | "* n_factors - the number of factors\n",
149 | "* n_epochs - the number of iterations of the SGD procedure\n",
150 | "* lr_all - the learning rate for all parameters\n",
151 | "* reg_all - the regularization term for all parameters\n",
152 | "\n",
153 | "As a result, for n_epochs and lr_all the default settings turn out to be optimal, while the best n_factors and reg_all values differ from the defaults. The improvement obtained with Grid Search is very small."
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 17,
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "1.5981785240945765\n",
166 | "{'n_factors': 80, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}\n"
167 | ]
168 | }
169 | ],
170 | "source": [
171 | "param_grid = {'n_factors': [80,100,120],\n",
172 | " 'n_epochs': [5, 10, 20],\n",
173 | " 'lr_all': [0.002, 0.005],\n",
174 | " 'reg_all': [0.2, 0.4, 0.6]}\n",
175 | "\n",
176 | "gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)\n",
177 | "gs.fit(data)\n",
178 | "\n",
179 | "print(gs.best_score['rmse'])\n",
180 | "print(gs.best_params['rmse'])\n",
181 | "\n",
182 | "#1.5981785240945765\n",
183 | "#{'n_factors': 80, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "## Analysis of Collaborative Filtering model results\n",
191 | "\n",
192 | "In this part, let's examine in detail the results obtained by the SVD model that provided the best RMSE score."
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 4,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "trainset, testset = train_test_split(data, test_size=0.2)\n",
202 | "\n",
203 | "model = SVD(n_factors=80, n_epochs=20, lr_all=0.005, reg_all=0.2)\n",
204 | "model.fit(trainset)\n",
205 | "predictions = model.test(testset)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 5,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/html": [
216 | "
\n",
217 | "\n",
230 | "
\n",
231 | " \n",
232 | "
\n",
233 | "
\n",
234 | "
user_id
\n",
235 | "
isbn
\n",
236 | "
actual_rating
\n",
237 | "
pred_rating
\n",
238 | "
impossible
\n",
239 | "
pred_rating_round
\n",
240 | "
abs_err
\n",
241 | "
\n",
242 | " \n",
243 | " \n",
244 | "
\n",
245 | "
6118
\n",
246 | "
242999
\n",
247 | "
0345374568
\n",
248 | "
5.0
\n",
249 | "
7.246858
\n",
250 | "
False
\n",
251 | "
7.0
\n",
252 | "
2.246858
\n",
253 | "
\n",
254 | "
\n",
255 | "
17185
\n",
256 | "
67840
\n",
257 | "
155166951X
\n",
258 | "
10.0
\n",
259 | "
8.513183
\n",
260 | "
False
\n",
261 | "
9.0
\n",
262 | "
1.486817
\n",
263 | "
\n",
264 | "
\n",
265 | "
21313
\n",
266 | "
78553
\n",
267 | "
0451404327
\n",
268 | "
10.0
\n",
269 | "
9.083398
\n",
270 | "
False
\n",
271 | "
9.0
\n",
272 | "
0.916602
\n",
273 | "
\n",
274 | "
\n",
275 | "
23423
\n",
276 | "
107784
\n",
277 | "
0373031467
\n",
278 | "
5.0
\n",
279 | "
5.890978
\n",
280 | "
False
\n",
281 | "
6.0
\n",
282 | "
0.890978
\n",
283 | "
\n",
284 | "
\n",
285 | "
9899
\n",
286 | "
95250
\n",
287 | "
0375725601
\n",
288 | "
9.0
\n",
289 | "
8.035049
\n",
290 | "
False
\n",
291 | "
8.0
\n",
292 | "
0.964951
\n",
293 | "
\n",
294 | " \n",
295 | "
\n",
296 | "
"
297 | ],
298 | "text/plain": [
299 | " user_id isbn actual_rating pred_rating impossible \\\n",
300 | "6118 242999 0345374568 5.0 7.246858 False \n",
301 | "17185 67840 155166951X 10.0 8.513183 False \n",
302 | "21313 78553 0451404327 10.0 9.083398 False \n",
303 | "23423 107784 0373031467 5.0 5.890978 False \n",
304 | "9899 95250 0375725601 9.0 8.035049 False \n",
305 | "\n",
306 | " pred_rating_round abs_err \n",
307 | "6118 7.0 2.246858 \n",
308 | "17185 9.0 1.486817 \n",
309 | "21313 9.0 0.916602 \n",
310 | "23423 6.0 0.890978 \n",
311 | "9899 8.0 0.964951 "
312 | ]
313 | },
314 | "execution_count": 5,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "df_pred = pd.DataFrame(predictions, columns=['user_id', 'isbn', 'actual_rating', 'pred_rating', 'details'])\n",
321 | "\n",
322 | "df_pred['impossible'] = df_pred['details'].apply(lambda x: x['was_impossible'])\n",
323 | "df_pred['pred_rating_round'] = df_pred['pred_rating'].round()\n",
324 | "df_pred['abs_err'] = abs(df_pred['pred_rating'] - df_pred['actual_rating'])\n",
325 | "df_pred.drop(['details'], axis=1, inplace=True)\n",
326 | "\n",
327 | "df_pred.sample(5)"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "### Distribution of actual and predicted ratings in the test set\n",
335 | "\n",
336 | "According to the distribution of actual ratings of books in the test set, most users give positive scores - between 7 and 10. The mode equals 8, but the counts of ratings 7, 9 and 10 are also noticeable. The distribution of predicted ratings in the test set is visibly different. Again, 8 is the mode, but scores 7, 9 and 10 are clearly less frequent.\n",
337 | "\n",
338 | "It shows that the recommender system is not perfect and it cannot reflect the real distribution of book ratings."
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 6,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "data": {
348 | "image/png": "\n",
349 | "text/plain": [
350 | "
"
351 | ]
352 | },
353 | "metadata": {
354 | "needs_background": "light"
355 | },
356 | "output_type": "display_data"
357 | }
358 | ],
359 | "source": [
360 | "palette = sns.color_palette(\"RdBu\", 10)\n",
361 | "fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))\n",
362 | "\n",
363 | "sns.countplot(x='actual_rating', data=df_pred, palette=palette, ax=ax1)\n",
364 | "ax1.set_title('Distribution of actual ratings of books in the test set')\n",
365 | "\n",
366 | "sns.countplot(x='pred_rating_round', data=df_pred, palette=palette, ax=ax2)\n",
367 | "ax2.set_title('Distribution of predicted ratings of books in the test set')\n",
368 | "\n",
369 | "plt.show()"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "### Absolute error of predicted ratings\n",
377 | "\n",
378 | "The distribution of absolute errors is right-skewed, showing that the majority of errors are small: between 0 and 1. There is a long tail, which indicates that there are several observations for which the absolute error was close to 10.\n",
379 | "\n",
380 | "How good or bad is the model at predicting particular scores? As expected from the above charts, the model deals very well with predicting score = 8 (the most frequent value). The further the rating is from score = 8, the higher the absolute error. The biggest errors occur for observations with scores 1 or 2, which indicates that the model is probably predicting high ratings for those observations."
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 7,
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "data": {
390 | "image/png": "\n",
391 | "text/plain": [
392 | "
"
393 | ]
394 | },
395 | "metadata": {
396 | "needs_background": "light"
397 | },
398 | "output_type": "display_data"
399 | }
400 | ],
401 | "source": [
402 | "df_pred_err = df_pred.groupby('actual_rating')['abs_err'].mean().reset_index()\n",
403 | "\n",
404 | "fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))\n",
405 | "\n",
406 | "sns.distplot(df_pred['abs_err'], color='#2f6194', ax=ax1)\n",
407 | "ax1.set_title('Distribution of absolute error in test set')\n",
408 | "\n",
409 | "sns.barplot(x='actual_rating', y='abs_err', data=df_pred_err, palette=palette, ax=ax2)\n",
410 | "ax2.set_title('Mean absolute error for rating in test set')\n",
411 | "\n",
412 | "plt.show()"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "### Analysis of predicted ratings of a particular user\n",
420 | "\n",
421 | "For this part of the analysis, the user with id 193458 was selected. By analyzing book ratings by this user, it can be noted that he/she likes diverse types of books: English romantic novels (Pride and Prejudice, Sense and Sensibility), fantasy (Narnia) as well as historical novels (Schindler's List). Among the recommended books there are other works from the Narnia series, two historical novels and one romance, which correlates with the user's previous preferences."
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 17,
427 | "metadata": {},
428 | "outputs": [],
429 | "source": [
430 | "df_books = pd.read_csv('data/books.csv')\n",
431 | "\n",
432 | "df_ext = df.merge(df_books[['isbn', 'book_title']], on='isbn', how='left')\n",
433 | "df_ext['book_title_short'] = df_ext['book_title'].apply(f.short_title)\n",
434 | "df_ext = df_ext.merge(df_pred[['isbn', 'user_id', 'pred_rating']], on=['isbn', 'user_id'], how='left')"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 11,
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "data": {
444 | "text/html": [
445 | "
\n",
446 | "\n",
459 | "
\n",
460 | " \n",
461 | "
\n",
462 | "
\n",
463 | "
user_id
\n",
464 | "
isbn
\n",
465 | "
book_rating
\n",
466 | "
book_title
\n",
467 | "
book_title_short
\n",
468 | "
pred_rating
\n",
469 | "
\n",
470 | " \n",
471 | " \n",
472 | "
\n",
473 | "
124989
\n",
474 | "
193458
\n",
475 | "
1853260002
\n",
476 | "
10
\n",
477 | "
Pride & Prejudice (Wordsworth Classics)
\n",
478 | "
Pride & Prejudice (Wordsworth Classics)
\n",
479 | "
NaN
\n",
480 | "
\n",
481 | "
\n",
482 | "
124942
\n",
483 | "
193458
\n",
484 | "
0140620125
\n",
485 | "
9
\n",
486 | "
Wuthering Heights (Penguin Popular Classics)
\n",
487 | "
Wuthering Heights (Penguin Popular Classics)
\n",
488 | "
NaN
\n",
489 | "
\n",
490 | "
\n",
491 | "
124952
\n",
492 | "
193458
\n",
493 | "
0345342569
\n",
494 | "
9
\n",
495 | "
Shoeless Joe
\n",
496 | "
Shoeless Joe
\n",
497 | "
NaN
\n",
498 | "
\n",
499 | "
\n",
500 | "
124940
\n",
501 | "
193458
\n",
502 | "
0140298479
\n",
503 | "
9
\n",
504 | "
Bridget Jones: The Edge of Reason
\n",
505 | "
Bridget Jones: The Edge of Reason
\n",
506 | "
NaN
\n",
507 | "
\n",
508 | "
\n",
509 | "
124991
\n",
510 | "
193458
\n",
511 | "
1853260169
\n",
512 | "
10
\n",
513 | "
Sense and Sensibility (Wordsworth Classics)
\n",
514 | "
Sense and Sensibility (Wordsworth Classics)
\n",
515 | "
NaN
\n",
516 | "
\n",
517 | "
\n",
518 | "
124978
\n",
519 | "
193458
\n",
520 | "
0671880314
\n",
521 | "
9
\n",
522 | "
Schindler's List
\n",
523 | "
Schindler's List
\n",
524 | "
NaN
\n",
525 | "
\n",
526 | "
\n",
527 | "
124951
\n",
528 | "
193458
\n",
529 | "
0330352695
\n",
530 | "
9
\n",
531 | "
Four Letters of Love
\n",
532 | "
Four Letters of Love
\n",
533 | "
NaN
\n",
534 | "
\n",
535 | "
\n",
536 | "
124932
\n",
537 | "
193458
\n",
538 | "
0064471047
\n",
539 | "
9
\n",
540 | "
The Lion, the Witch, and the Wardrobe (The Chr...
\n",
541 | "
The Lion, the Witch, and the Wardrobe (The
\n",
542 | "
NaN
\n",
543 | "
\n",
544 | "
\n",
545 | "
124938
\n",
546 | "
193458
\n",
547 | "
0064471101
\n",
548 | "
9
\n",
549 | "
The Magician's Nephew (rack) (Narnia)
\n",
550 | "
The Magician's Nephew (rack) (Narnia)
\n",
551 | "
NaN
\n",
552 | "
\n",
553 | "
\n",
554 | "
124936
\n",
555 | "
193458
\n",
556 | "
006447108X
\n",
557 | "
9
\n",
558 | "
The Last Battle
\n",
559 | "
The Last Battle
\n",
560 | "
NaN
\n",
561 | "
\n",
562 | " \n",
563 | "
\n",
564 | "
"
565 | ],
566 | "text/plain": [
567 | " user_id isbn book_rating \\\n",
568 | "124989 193458 1853260002 10 \n",
569 | "124942 193458 0140620125 9 \n",
570 | "124952 193458 0345342569 9 \n",
571 | "124940 193458 0140298479 9 \n",
572 | "124991 193458 1853260169 10 \n",
573 | "124978 193458 0671880314 9 \n",
574 | "124951 193458 0330352695 9 \n",
575 | "124932 193458 0064471047 9 \n",
576 | "124938 193458 0064471101 9 \n",
577 | "124936 193458 006447108X 9 \n",
578 | "\n",
579 | " book_title \\\n",
580 | "124989 Pride & Prejudice (Wordsworth Classics) \n",
581 | "124942 Wuthering Heights (Penguin Popular Classics) \n",
582 | "124952 Shoeless Joe \n",
583 | "124940 Bridget Jones: The Edge of Reason \n",
584 | "124991 Sense and Sensibility (Wordsworth Classics) \n",
585 | "124978 Schindler's List \n",
586 | "124951 Four Letters of Love \n",
587 | "124932 The Lion, the Witch, and the Wardrobe (The Chr... \n",
588 | "124938 The Magician's Nephew (rack) (Narnia) \n",
589 | "124936 The Last Battle \n",
590 | "\n",
591 | " book_title_short pred_rating \n",
592 | "124989 Pride & Prejudice (Wordsworth Classics) NaN \n",
593 | "124942 Wuthering Heights (Penguin Popular Classics) NaN \n",
594 | "124952 Shoeless Joe NaN \n",
595 | "124940 Bridget Jones: The Edge of Reason NaN \n",
596 | "124991 Sense and Sensibility (Wordsworth Classics) NaN \n",
597 | "124978 Schindler's List NaN \n",
598 | "124951 Four Letters of Love NaN \n",
599 | "124932 The Lion, the Witch, and the Wardrobe (The NaN \n",
600 | "124938 The Magician's Nephew (rack) (Narnia) NaN \n",
601 | "124936 The Last Battle NaN "
602 | ]
603 | },
604 | "execution_count": 11,
605 | "metadata": {},
606 | "output_type": "execute_result"
607 | }
608 | ],
609 | "source": [
610 | "selected_user_id = 193458\n",
611 | "df_user = df_ext[df_ext['user_id']==selected_user_id]\n",
612 | "\n",
613 | "df_user[(df_user['pred_rating'].isna())&(df_user['book_rating']>=9)].sample(10)"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {},
619 | "source": [
620 | "### Train set: Top rated books\n",
621 | "\n",
622 | ""
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": 12,
628 | "metadata": {},
629 | "outputs": [
630 | {
631 | "data": {
632 | "text/html": [
633 | "
\n",
634 | "\n",
647 | "
\n",
648 | " \n",
649 | "
\n",
650 | "
\n",
651 | "
user_id
\n",
652 | "
isbn
\n",
653 | "
book_rating
\n",
654 | "
book_title
\n",
655 | "
book_title_short
\n",
656 | "
pred_rating
\n",
657 | "
\n",
658 | " \n",
659 | " \n",
660 | "
\n",
661 | "
124946
\n",
662 | "
193458
\n",
663 | "
0142001740
\n",
664 | "
9
\n",
665 | "
The Secret Life of Bees
\n",
666 | "
The Secret Life of Bees
\n",
667 | "
8.281881
\n",
668 | "
\n",
669 | "
\n",
670 | "
124935
\n",
671 | "
193458
\n",
672 | "
0064471071
\n",
673 | "
9
\n",
674 | "
The Voyage of the Dawn Treader (rack) (Narnia)
\n",
675 | "
The Voyage of the Dawn Treader (rack) (Narnia)
\n",
676 | "
8.244509
\n",
677 | "
\n",
678 | "
\n",
679 | "
124937
\n",
680 | "
193458
\n",
681 | "
0064471098
\n",
682 | "
9
\n",
683 | "
The Silver Chair
\n",
684 | "
The Silver Chair
\n",
685 | "
8.184727
\n",
686 | "
\n",
687 | "
\n",
688 | "
124974
\n",
689 | "
193458
\n",
690 | "
0553258001
\n",
691 | "
9
\n",
692 | "
The Cider House Rules
\n",
693 | "
The Cider House Rules
\n",
694 | "
8.057183
\n",
695 | "
\n",
696 | "
\n",
697 | "
124958
\n",
698 | "
193458
\n",
699 | "
0345431057
\n",
700 | "
9
\n",
701 | "
Slaves in the Family (Ballantine Reader's Circle)
\n",
702 | "
Slaves in the Family (Ballantine Reader's
\n",
703 | "
8.055557
\n",
704 | "
\n",
705 | " \n",
706 | "
\n",
707 | "
"
708 | ],
709 | "text/plain": [
710 | " user_id isbn book_rating \\\n",
711 | "124946 193458 0142001740 9 \n",
712 | "124935 193458 0064471071 9 \n",
713 | "124937 193458 0064471098 9 \n",
714 | "124974 193458 0553258001 9 \n",
715 | "124958 193458 0345431057 9 \n",
716 | "\n",
717 | " book_title \\\n",
718 | "124946 The Secret Life of Bees \n",
719 | "124935 The Voyage of the Dawn Treader (rack) (Narnia) \n",
720 | "124937 The Silver Chair \n",
721 | "124974 The Cider House Rules \n",
722 | "124958 Slaves in the Family (Ballantine Reader's Circle) \n",
723 | "\n",
724 | " book_title_short pred_rating \n",
725 | "124946 The Secret Life of Bees 8.281881 \n",
726 | "124935 The Voyage of the Dawn Treader (rack) (Narnia) 8.244509 \n",
727 | "124937 The Silver Chair 8.184727 \n",
728 | "124974 The Cider House Rules 8.057183 \n",
729 | "124958 Slaves in the Family (Ballantine Reader's 8.055557 "
730 | ]
731 | },
732 | "execution_count": 12,
733 | "metadata": {},
734 | "output_type": "execute_result"
735 | }
736 | ],
737 | "source": [
738 | "df_user[df_user['pred_rating'].notna()].sort_values('pred_rating', ascending=False).head(5)"
739 | ]
740 | },
741 | {
742 | "cell_type": "markdown",
743 | "metadata": {},
744 | "source": [
745 | "### Test set: Predicted top rated books\n",
746 | "\n",
747 | ""
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 13,
753 | "metadata": {},
754 | "outputs": [
755 | {
756 | "data": {
757 | "text/html": [
758 | "