├── .gitignore
├── 00_TopPopular.ipynb
├── 01_FrequentSeqMining.ipynb
├── 02_MarkovChain.ipynb
├── 03_FPMC.ipynb
├── 04_Prod2Vec.ipynb
├── 05_SessionBasedRNN.ipynb
├── 06_PersonalizedRNN.ipynb
├── 07_KNN.ipynb
├── LICENSE
├── README.md
├── datasets
│   └── sessions.zip
├── environment.yml
├── gifs
│   └── sequential_eval.gif
├── images
│   ├── fpmc.png
│   ├── gru4rec.png
│   ├── hgru4rec.png
│   ├── prod2vec.png
│   ├── running_notebooks_1.png
│   ├── running_notebooks_2.png
│   └── running_notebooks_3.png
├── recommenders
│   ├── FPMCRecommender.py
│   ├── FSMRecommender.py
│   ├── ISeqRecommender.py
│   ├── KNNRecommender.py
│   ├── MarkovChainRecommender.py
│   ├── MixedMarkovRecommender.py
│   ├── PopularityRecommender.py
│   ├── Prod2VecRecommender.py
│   ├── RNNRecommender.py
│   ├── SupervisedRecommender.py
│   └── __init__.py
├── slides
│   ├── TheWebConf2019_01_Introduction.pdf
│   ├── TheWebConf2019_02_Algorithms.pdf
│   └── TheWebConf2019_03_Evaluation.pdf
├── spmf
│   └── spmf.jar
└── util
    ├── SPMFinterface.py
    ├── __init__.py
    ├── data_expansion.py
    ├── data_utils.py
    ├── evaluation.py
    ├── fpmc
    │   ├── FPMC.py
    │   ├── FPMC_numba.py
    │   ├── __init__.py
    │   └── utils.py
    ├── knn
    │   ├── __init__.py
    │   ├── iknn.py
    │   ├── sfsknn.py
    │   ├── sknn.py
    │   ├── ssknn.py
    │   └── vmsknn.py
    ├── markov
    │   └── Markov.py
    ├── metrics.py
    ├── rnn
    │   ├── __init__.py
    │   ├── gpu_ops.py
    │   ├── gru4rec.py
    │   └── hgru4rec.py
    ├── split.py
    └── tree
        ├── Tree.py
        └── __init__.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 | .idea/*
91 | .idea/
92 | recpy/.idea/
93 | .DS_Store
94 |
95 | # custom
96 | datasets/
97 |
--------------------------------------------------------------------------------
/00_TopPopular.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Table of contents\n",
8 | "\n",
9 | "1. [Load the dataset](#load_the_dataset)\n",
10 | "2. [Split the dataset](#split_the_dataset)\n",
11 | "3. [Fitting the recommender](#fitting)\n",
12 | "4. [Sequential evaluation](#seq_evaluation) \n",
13 | "  4.1 [Evaluation with sequentially revealed user profiles](#eval_seq_rev)  \n",
14 | " 4.2 [Evaluation with \"static\" user profiles](#eval_static) \n",
15 | "5. [Analysis of next-item recommendation](#next-item) \n",
16 | " 5.1 [Evaluation with different recommendation list lengths](#next-item_list_length) \n",
17 | " 5.2 [Evaluation with different user profile lengths](#next-item_profile_length)"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import numpy as np\n",
27 | "import pandas as pd\n",
28 | "import matplotlib.pyplot as plt\n",
29 | "%matplotlib inline"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "from util.data_utils import create_seq_db_filter_top_k\n",
39 | "from util.split import last_session_out_split\n",
40 | "from util.metrics import precision, recall, mrr\n",
41 | "from util import evaluation\n",
42 | "from recommenders.PopularityRecommender import PopularityRecommender"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "import datetime"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "def get_test_sequences(test_data, given_k):\n",
61 | " # we can run evaluation only over sequences longer than abs(LAST_K)\n",
62 | " test_sequences = test_data.loc[test_data['sequence'].map(len) > abs(given_k), 'sequence'].values\n",
63 | " return test_sequences"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | ""
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "# 1. Load the dataset\n",
78 | "\n",
79 | "For this hands-on session we will use a dataset of user-listening sessions crawled from [last.fm](https://www.last.fm/). In detail, we will use a subset of the following dataset:\n",
80 | "\n",
81 | "* 30Music listening and playlists dataset, Turrin et al., ACM RecSys 2015 ([paper](https://home.deib.polimi.it/pagano/portfolio/papers/30Musiclisteningandplaylistsdataset.pdf))"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "# unzip the dataset, if you haven't already done it\n",
91 | "# ! unzip datasets/sessions.zip -d datasets"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "! ls datasets/"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "dataset_path = 'datasets/sessions.csv'\n",
110 | "# load this sample if you experience a severe slowdown with the previous dataset\n",
111 | "dataset_path = 'datasets/sessions_sample_10.csv'\n",
112 | "\n",
113 | "# for the sake of speed, let's keep only the top-1k most popular items in the last month\n",
114 | "dataset = create_seq_db_filter_top_k(path=dataset_path, topk=1000, last_months=1) "
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "Let's see how the dataset looks"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "dataset.head()"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "Let's show some statistics about the dataset"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "from collections import Counter\n",
147 | "cnt = Counter()\n",
148 | "dataset.sequence.map(cnt.update);"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "sequence_length = dataset.sequence.map(len).values\n",
158 | "n_sessions_per_user = dataset.groupby('user_id').size()\n",
159 | "\n",
160 | "print('Number of items: {}'.format(len(cnt)))\n",
161 | "print('Number of users: {}'.format(dataset.user_id.nunique()))\n",
162 | "print('Number of sessions: {}'.format(len(dataset)) )\n",
163 | "\n",
164 | "print('\\nSession length:\\n\\tAverage: {:.2f}\\n\\tMedian: {}\\n\\tMin: {}\\n\\tMax: {}'.format(\n",
165 | " sequence_length.mean(), \n",
166 | " np.quantile(sequence_length, 0.5), \n",
167 | " sequence_length.min(), \n",
168 | " sequence_length.max()))\n",
169 | "\n",
170 | "print('Sessions per user:\\n\\tAverage: {:.2f}\\n\\tMedian: {}\\n\\tMin: {}\\n\\tMax: {}'.format(\n",
171 | " n_sessions_per_user.mean(), \n",
172 | " np.quantile(n_sessions_per_user, 0.5), \n",
173 | " n_sessions_per_user.min(), \n",
174 | " n_sessions_per_user.max()))"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "print('Most popular items: {}'.format(cnt.most_common(5)))"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | ""
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "# 2. Split the dataset"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "For simplicity, let's split the dataset by assigning the **last session** of every user to the **test set**, and **all the previous** ones to the **training set**."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "train_data, test_data = last_session_out_split(dataset)\n",
214 | "print(\"Train sessions: {} - Test sessions: {}\".format(len(train_data), len(test_data)))"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | ""
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "# 3. Fitting the recommender\n",
229 | "\n",
230 | "Here we fit the recommendation algorithm over the sessions in the training set.\n",
231 | "\n",
232 | "`PopularityRecommender` simply recommends items ordered by their popularity in the training set. \n",
233 | "`PopularityRecommender` doesn't have any hyper-parameter, so we can move on!"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "recommender = PopularityRecommender()\n",
243 | "recommender.fit(train_data)"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "\n"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "# 4. Sequential evaluation\n",
258 | "\n",
259 | "In the evaluation of sequence-aware recommenders, each sequence in the test set is split into:\n",
260 | "- the _user profile_, used to compute recommendations, composed of the first *k* events in the sequence;\n",
261 | "- the _ground truth_, used for performance evaluation, composed of the remainder of the sequence.\n",
262 | "\n",
263 | "In the cells below, you can control the dimension of the _user profile_ by assigning a **positive** value to `GIVEN_K`, which corresponds to the number of events from the beginning of the sequence that will be assigned to the initial user profile. This ensures that each user profile in the test set will have exactly the same initial size, but the size of the ground truth will change for every sequence.\n",
264 | "\n",
265 | "Alternatively, by assigning a **negative** value to `GIVEN_K`, you will set the initial size of the _ground truth_. In this way the _ground truth_ will have the same size for all sequences, but the dimension of the user profile will differ."
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "METRICS = {'precision':precision, \n",
275 | " 'recall':recall,\n",
276 | " 'mrr': mrr}\n",
277 | "TOPN=100 # length of the recommendation list"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {},
283 | "source": [
284 | ""
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "## 4.1 Evaluation with sequentially revealed user-profiles\n",
292 | "\n",
293 | "Here we evaluate the quality of the recommendations in a setting in which user profiles are revealed _sequentially_.\n",
294 | "\n",
295 | "The _user profile_ starts from the first `GIVEN_K` events (or, alternatively, from the last `-GIVEN_K` events if `GIVEN_K<0`). \n",
296 | "The recommendations are evaluated against the next `LOOK_AHEAD` events (the _ground truth_). \n",
297 | "The _user profile_ is then expanded with the next `STEP` events, the ground truth is scrolled forward accordingly, and the evaluation continues until the sequence ends.\n",
298 | "\n",
299 | "In typical **next-item recommendation**, we start with `GIVEN_K=1`, generate a set of **alternatives** that will be evaluated against the next event in the sequence (`LOOK_AHEAD=1`), move forward by one step (`STEP=1`) and repeat until the sequence ends.\n",
300 | "\n",
301 | "You can set `LOOK_AHEAD='all'` to see what happens if you had to recommend a **whole sequence** instead of a set of alternatives to a user.\n",
302 | "\n",
303 | "NOTE: Metrics are averaged over each sequence first, then averaged over all test sequences.\n",
304 | "\n",
305 | "**(TODO) Try out different evaluation settings to see how the recommendation quality changes.**\n",
306 | "\n",
307 | "\n",
308 | ""
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "# GIVEN_K=1, LOOK_AHEAD=1, STEP=1 corresponds to the classical next-item evaluation\n",
318 | "GIVEN_K = 1\n",
319 | "LOOK_AHEAD = 1\n",
320 | "STEP=1"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n",
330 | "print('{} sequences available for evaluation'.format(len(test_sequences)))\n",
331 | "\n",
332 | "results = evaluation.sequential_evaluation(recommender,\n",
333 | " test_sequences=test_sequences,\n",
334 | " given_k=GIVEN_K,\n",
335 | " look_ahead=LOOK_AHEAD,\n",
336 | " evaluation_functions=METRICS.values(),\n",
337 | " top_n=TOPN,\n",
338 | " scroll=True, # scrolling averages metrics over all profile lengths\n",
339 | " step=STEP)"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))\n",
349 | "for mname, mvalue in zip(METRICS.keys(), results):\n",
350 | " print('\\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))"
351 | ]
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "metadata": {},
356 | "source": [
357 | ""
358 | ]
359 | },
360 | {
361 | "cell_type": "markdown",
362 | "metadata": {},
363 | "source": [
364 | "## 4.2 Evaluation with \"static\" user-profiles\n",
365 | "\n",
366 | "Here we evaluate the quality of the recommendations in a setting in which user profiles are instead _static_.\n",
367 | "\n",
368 | "The _user profile_ starts from the first `GIVEN_K` events (or, alternatively, from the last `-GIVEN_K` events if `GIVEN_K<0`). \n",
369 | "The recommendations are evaluated against the next `LOOK_AHEAD` events (the _ground truth_). \n",
370 | "\n",
371 | "The user profile is *not extended* and the ground truth *doesn't move forward*.\n",
372 | "This allows us to obtain \"snapshots\" of the recommendation performance for different user profile and ground truth lengths.\n",
373 | "\n",
374 | "Here too, you can set `LOOK_AHEAD='all'` to see what happens if you had to recommend a **whole sequence** instead of a set of alternatives to a user.\n",
375 | "\n",
376 | "**(TODO) Try out different evaluation settings to see how the recommendation quality changes.**"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "GIVEN_K = 1\n",
386 | "LOOK_AHEAD = 'all'\n",
387 | "STEP=1"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n",
397 | "print('{} sequences available for evaluation'.format(len(test_sequences)))\n",
398 | "\n",
399 | "results = evaluation.sequential_evaluation(recommender,\n",
400 | " test_sequences=test_sequences,\n",
401 | " given_k=GIVEN_K,\n",
402 | " look_ahead=LOOK_AHEAD,\n",
403 | " evaluation_functions=METRICS.values(),\n",
404 | " top_n=TOPN,\n",
405 | " scroll=False # notice that scrolling is disabled!\n",
406 | " ) "
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": null,
412 | "metadata": {},
413 | "outputs": [],
414 | "source": [
415 | "print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))\n",
416 | "for mname, mvalue in zip(METRICS.keys(), results):\n",
417 | " print('\\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))"
418 | ]
419 | },
420 | {
421 | "cell_type": "markdown",
422 | "metadata": {},
423 | "source": [
424 | ""
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | "## 5. Analysis of next-item recommendation\n",
432 | "\n",
433 | "Here we propose to analyse the performance of the recommender system in the scenario of *next-item recommendation* over the following dimensions:\n",
434 | "\n",
435 | "* the *length* of the **recommendation list**, and\n",
436 | "* the *length* of the **user profile**.\n",
437 | "\n",
438 | "NOTE: This evaluation is by no means exhaustive, as the hyper-parameters of the recommendation algorithm should be *carefully tuned* before drawing any conclusions. Unfortunately, given the time constraints of this tutorial, we had to leave hyper-parameter tuning out. A very useful reference on the careful evaluation of (session-based) recommenders is:\n",
439 | "\n",
440 | "* Evaluation of Session-based Recommendation Algorithms, Ludewig and Jannach, 2018 ([paper](https://arxiv.org/abs/1803.09587))"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {},
446 | "source": [
447 | ""
448 | ]
449 | },
450 | {
451 | "cell_type": "markdown",
452 | "metadata": {},
453 | "source": [
454 | "### 5.1 Evaluation for different recommendation list lengths"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": null,
460 | "metadata": {},
461 | "outputs": [],
462 | "source": [
463 | "GIVEN_K = 1\n",
464 | "LOOK_AHEAD = 1\n",
465 | "STEP = 1\n",
466 | "topn_list = [1, 5, 10, 20, 50, 100]"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": null,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "# ensure that all sequences have the same minimum length \n",
476 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n",
477 | "print('{} sequences available for evaluation'.format(len(test_sequences)))"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": null,
483 | "metadata": {},
484 | "outputs": [],
485 | "source": [
486 | "res_list = []\n",
487 | "\n",
488 | "for topn in topn_list:\n",
489 | " print('Evaluating recommendation lists with length: {}'.format(topn))\n",
490 | " res_tmp = evaluation.sequential_evaluation(recommender,\n",
491 | " test_sequences=test_sequences,\n",
492 | " given_k=GIVEN_K,\n",
493 | " look_ahead=LOOK_AHEAD,\n",
494 | " evaluation_functions=METRICS.values(),\n",
495 | " top_n=topn,\n",
496 | " scroll=True, # here we average over all profile lengths\n",
497 | " step=STEP)\n",
498 | " mvalues = list(zip(METRICS.keys(), res_tmp))\n",
499 | " res_list.append((topn, mvalues))"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "metadata": {},
506 | "outputs": [],
507 | "source": [
508 | "# show separate plots per metric\n",
509 | "fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))\n",
510 | "res_list_t = list(zip(*res_list))\n",
511 | "for midx, metric in enumerate(METRICS):\n",
512 | " mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]\n",
513 | " ax = axes[midx]\n",
514 | " ax.plot(topn_list, mvalues)\n",
515 | " ax.set_title(metric)\n",
516 | " ax.set_xticks(topn_list)\n",
517 | " ax.set_xlabel('List length')"
518 | ]
519 | },
520 | {
521 | "cell_type": "markdown",
522 | "metadata": {},
523 | "source": [
524 | ""
525 | ]
526 | },
527 | {
528 | "cell_type": "markdown",
529 | "metadata": {},
530 | "source": [
531 | "### 5.2 Evaluation for different user profile lengths"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": null,
537 | "metadata": {},
538 | "outputs": [],
539 | "source": [
540 | "given_k_list = [1, 2, 3, 4]\n",
541 | "LOOK_AHEAD = 1\n",
542 | "STEP = 1\n",
543 | "TOPN = 20"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": null,
549 | "metadata": {},
550 | "outputs": [],
551 | "source": [
552 | "# ensure that all sequences have the same minimum length \n",
553 | "test_sequences = get_test_sequences(test_data, max(given_k_list))\n",
554 | "print('{} sequences available for evaluation'.format(len(test_sequences)))"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": null,
560 | "metadata": {},
561 | "outputs": [],
562 | "source": [
563 | "res_list = []\n",
564 | "\n",
565 | "for gk in given_k_list:\n",
566 | " print('Evaluating profiles having length: {}'.format(gk))\n",
567 | " res_tmp = evaluation.sequential_evaluation(recommender,\n",
568 | " test_sequences=test_sequences,\n",
569 | " given_k=gk,\n",
570 | " look_ahead=LOOK_AHEAD,\n",
571 | " evaluation_functions=METRICS.values(),\n",
572 | " top_n=TOPN,\n",
573 | " scroll=False, # here we stop at each profile length\n",
574 | " step=STEP)\n",
575 | " mvalues = list(zip(METRICS.keys(), res_tmp))\n",
576 | " res_list.append((gk, mvalues))"
577 | ]
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": null,
582 | "metadata": {},
583 | "outputs": [],
584 | "source": [
585 | "# show separate plots per metric\n",
586 | "fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))\n",
587 | "res_list_t = list(zip(*res_list))\n",
588 | "for midx, metric in enumerate(METRICS):\n",
589 | " mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]\n",
590 | " ax = axes[midx]\n",
591 | " ax.plot(given_k_list, mvalues)\n",
592 | " ax.set_title(metric)\n",
593 | " ax.set_xticks(given_k_list)\n",
594 | " ax.set_xlabel('Profile length')"
595 | ]
596 | }
597 | ],
598 | "metadata": {
599 | "kernelspec": {
600 | "display_name": "srs",
601 | "language": "python",
602 | "name": "srs"
603 | },
604 | "language_info": {
605 | "codemirror_mode": {
606 | "name": "ipython",
607 | "version": 3
608 | },
609 | "file_extension": ".py",
610 | "mimetype": "text/x-python",
611 | "name": "python",
612 | "nbconvert_exporter": "python",
613 | "pygments_lexer": "ipython3",
614 | "version": "3.6.6"
615 | }
616 | },
617 | "nbformat": 4,
618 | "nbformat_minor": 2
619 | }
620 |
--------------------------------------------------------------------------------
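
Note: `recommenders/PopularityRecommender.py` itself is not included in this dump. Based only on the notebook's description ("`PopularityRecommender` simply recommends items ordered by their popularity in the training set"), the following is a minimal sketch of that idea; the class name and internals are illustrative, not the repository's actual implementation, and only the `fit`/`recommend` interface used in the notebooks is assumed.

```python
from collections import Counter


class TopPopularSketch:
    """Illustrative top-popular recommender: ranks items by training-set frequency."""

    def fit(self, train_data):
        # train_data is a DataFrame with a 'sequence' column of item lists,
        # as produced by create_seq_db_filter_top_k in the notebook above
        counts = Counter()
        train_data['sequence'].map(counts.update)
        total = sum(counts.values())
        # (item, relative frequency) pairs sorted by decreasing popularity
        self.popular = [(item, c / total) for item, c in counts.most_common()]

    def recommend(self, user_profile, user_id=None):
        # recommendations do not depend on the user profile at all
        return [([item], score) for item, score in self.popular]
```
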
/02_MarkovChain.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Table of contents\n",
8 | "\n",
9 | "1. [Load the dataset](#load_the_dataset)\n",
10 | "2. [Split the dataset](#split_the_dataset)\n",
11 | "3. [Fitting the recommender](#fitting)\n",
12 | "4. [Sequential evaluation](#seq_evaluation) \n",
13 | "  4.1 [Evaluation with sequentially revealed user profiles](#eval_seq_rev)  \n",
14 | " 4.2 [Evaluation with \"static\" user profiles](#eval_static) \n",
15 | "5. [Analysis of next-item recommendation](#next-item) \n",
16 | " 5.1 [Evaluation with different recommendation list lengths](#next-item_list_length) \n",
17 | " 5.2 [Evaluation with different user profile lengths](#next-item_profile_length)"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import numpy as np\n",
27 | "import pandas as pd\n",
28 | "import matplotlib.pyplot as plt\n",
29 | "%matplotlib inline"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "from util.data_utils import create_seq_db_filter_top_k, sequences_to_spfm_format\n",
39 | "from util.split import last_session_out_split\n",
40 | "from util.metrics import precision, recall, mrr\n",
41 | "from util import evaluation\n",
42 | "from recommenders.MixedMarkovRecommender import MixedMarkovChainRecommender"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "import datetime"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "def get_test_sequences(test_data, given_k):\n",
61 | " # we can run evaluation only over sequences longer than abs(LAST_K)\n",
62 | " test_sequences = test_data.loc[test_data['sequence'].map(len) > abs(given_k), 'sequence'].values\n",
63 | " return test_sequences"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | ""
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "# 1. Load the dataset\n",
78 | "\n",
79 | "For this hands-on session we will use a dataset of user-listening sessions crawled from [last.fm](https://www.last.fm/). In detail, we will use a subset of the following dataset:\n",
80 | "\n",
81 | "* 30Music listening and playlists dataset, Turrin et al., ACM RecSys 2015 ([paper](https://home.deib.polimi.it/pagano/portfolio/papers/30Musiclisteningandplaylistsdataset.pdf))"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "# unzip the dataset, if you haven't already done it\n",
91 | "# ! unzip datasets/sessions.zip -d datasets"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "dataset_path = 'datasets/sessions.csv'\n",
101 | "# load this sample if you experience a severe slowdown with the previous dataset\n",
102 | "#dataset_path = 'datasets/sessions_sample_10.csv'\n",
103 | "\n",
104 | "# for the sake of speed, let's keep only the top-1k most popular items in the last month\n",
105 | "dataset = create_seq_db_filter_top_k(path=dataset_path, topk=1000, last_months=1) "
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Let's see how the dataset looks"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "dataset.head()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "Let's show some statistics about the dataset"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "from collections import Counter\n",
138 | "cnt = Counter()\n",
139 | "dataset.sequence.map(cnt.update);"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "sequence_length = dataset.sequence.map(len).values\n",
149 | "n_sessions_per_user = dataset.groupby('user_id').size()\n",
150 | "\n",
151 | "print('Number of items: {}'.format(len(cnt)))\n",
152 | "print('Number of users: {}'.format(dataset.user_id.nunique()))\n",
153 | "print('Number of sessions: {}'.format(len(dataset)) )\n",
154 | "\n",
155 | "print('\\nSession length:\\n\\tAverage: {:.2f}\\n\\tMedian: {}\\n\\tMin: {}\\n\\tMax: {}'.format(\n",
156 | " sequence_length.mean(), \n",
157 | " np.quantile(sequence_length, 0.5), \n",
158 | " sequence_length.min(), \n",
159 | " sequence_length.max()))\n",
160 | "\n",
161 | "print('Sessions per user:\\n\\tAverage: {:.2f}\\n\\tMedian: {}\\n\\tMin: {}\\n\\tMax: {}'.format(\n",
162 | " n_sessions_per_user.mean(), \n",
163 | " np.quantile(n_sessions_per_user, 0.5), \n",
164 | " n_sessions_per_user.min(), \n",
165 | " n_sessions_per_user.max()))"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "print('Most popular items: {}'.format(cnt.most_common(5)))"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | ""
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "# 2. Split the dataset"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "For simplicity, let's split the dataset by assigning the **last session** of every user to the **test set**, and **all the previous** ones to the **training set**."
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "train_data, test_data = last_session_out_split(dataset)\n",
205 | "print(\"Train sessions: {} - Test sessions: {}\".format(len(train_data), len(test_data)))"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | ""
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "# 3. Fitting the recommender\n",
220 | "\n",
221 | "Here we fit the recommendation algorithm over the sessions in the training set. \n",
222 | "This recommender is based on the `MarkovChainRecommender` implemented from:\n",
223 | "\n",
224 | "_Shani, Guy, David Heckerman, and Ronen I. Brafman. \"An MDP-based recommender system.\" Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. Chapter 3-4_\n",
225 | "\n",
226 | "This recommender computes the item transition matrices for every Markov Chain with order in `[min_order, max_order]`. Each individual Markov Chain model employs some heuristics, like skipping or clustering, to better cope with data sparsity. Recommendations are generated by sorting items by their transition probability of being next, given the user profile. The scores coming from Markov Chains of different orders are weighted _inversely_ with respect to their order.\n",
227 | "\n",
228 | "The class `MixedMarkovChainRecommender` has the following initialization hyper-parameters:\n",
229 | "* `min_order`: the minimum order of the Mixed Markov Chain\n",
230 | "* `max_order`: the maximum order of the Mixed Markov Chain\n"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "# You can also try max_order=2 or higher, though it will take some time to complete due to slow heuristic computations\n",
240 | "recommender = MixedMarkovChainRecommender(min_order=1, \n",
241 | " max_order=1)\n",
242 | "recommender.fit(train_data)"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "\n"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "# 4. Sequential evaluation\n",
257 | "\n",
258 | "In the evaluation of sequence-aware recommenders, each sequence in the test set is split into:\n",
259 | "- the _user profile_, used to compute recommendations, composed of the first *k* events in the sequence;\n",
260 | "- the _ground truth_, used for performance evaluation, composed of the remainder of the sequence.\n",
261 | "\n",
262 | "In the cells below, you can control the dimension of the _user profile_ by assigning a **positive** value to `GIVEN_K`, which corresponds to the number of events from the beginning of the sequence that will be assigned to the initial user profile. This ensures that each user profile in the test set will have exactly the same initial size, but the size of the ground truth will change for every sequence.\n",
263 | "\n",
264 | "Alternatively, by assigning a **negative** value to `GIVEN_K`, you will set the initial size of the _ground truth_. In this way the _ground truth_ will have the same size for all sequences, but the dimension of the user profile will differ."
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "METRICS = {'precision':precision, \n",
274 | " 'recall':recall,\n",
275 | " 'mrr': mrr}\n",
276 | "TOPN = 10 # length of the recommendation list"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | ""
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "## 4.1 Evaluation with sequentially revealed user-profiles\n",
291 | "\n",
292 | "Here we evaluate the quality of the recommendations in a setting in which user profiles are revealed _sequentially_.\n",
293 | "\n",
294 | "The _user profile_ starts from the first `GIVEN_K` events (or, alternatively, from the last `-GIVEN_K` events if `GIVEN_K<0`). \n",
295 | "The recommendations are evaluated against the next `LOOK_AHEAD` events (the _ground truth_). \n",
296 | "The _user profile_ is then expanded with the next `STEP` events, the ground truth is scrolled forward accordingly, and the evaluation continues until the sequence ends.\n",
297 | "\n",
298 | "In typical **next-item recommendation**, we start with `GIVEN_K=1`, generate a set of **alternatives** that will be evaluated against the next event in the sequence (`LOOK_AHEAD=1`), move forward by one step (`STEP=1`) and repeat until the sequence ends.\n",
299 | "\n",
300 | "You can set `LOOK_AHEAD='all'` to see what happens if you had to recommend a **whole sequence** instead of a set of alternatives to a user.\n",
301 | "\n",
302 | "NOTE: Metrics are averaged over each sequence first, then averaged over all test sequences.\n",
303 | "\n",
304 | "**(TODO) Try out different evaluation settings to see how the recommendation quality changes.**\n",
305 | "\n",
306 | "\n",
307 | ""
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "# GIVEN_K=1, LOOK_AHEAD=1, STEP=1 corresponds to the classical next-item evaluation\n",
317 | "GIVEN_K = 1\n",
318 | "LOOK_AHEAD = 1\n",
319 | "STEP=1"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n",
329 | "print('{} sequences available for evaluation'.format(len(test_sequences)))\n",
330 | "\n",
331 | "results = evaluation.sequential_evaluation(recommender,\n",
332 | " test_sequences=test_sequences,\n",
333 | " given_k=GIVEN_K,\n",
334 | " look_ahead=LOOK_AHEAD,\n",
335 | " evaluation_functions=METRICS.values(),\n",
336 | " top_n=TOPN,\n",
337 | " scroll=True, # scrolling averages metrics over all profile lengths\n",
338 | " step=STEP)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))\n",
348 | "for mname, mvalue in zip(METRICS.keys(), results):\n",
349 | " print('\\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))"
350 | ]
351 | },
352 | {
353 | "cell_type": "markdown",
354 | "metadata": {},
355 | "source": [
356 | ""
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "## 4.2 Evaluation with \"static\" user-profiles\n",
364 | "\n",
365 | "Here we evaluate the quality of the recommendations in a setting in which user profiles are instead _static_.\n",
366 | "\n",
367 | "The _user profile_ starts from the first `GIVEN_K` events (or, alternatively, from the last `-GIVEN_K` events if `GIVEN_K<0`). \n",
368 | "The recommendations are evaluated against the next `LOOK_AHEAD` events (the _ground truth_). \n",
369 | "\n",
370 | "The user profile is *not extended* and the ground truth *doesn't move forward*.\n",
371 | "This allows us to obtain \"snapshots\" of the recommendation performance for different user profile and ground truth lengths.\n",
372 | "\n",
373 | "Here too, you can set `LOOK_AHEAD='all'` to see what happens if you had to recommend a **whole sequence** instead of a set of alternatives to a user.\n",
374 | "\n",
375 | "**(TODO) Try out different evaluation settings to see how the recommendation quality changes.**"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "GIVEN_K = 1\n",
385 | "LOOK_AHEAD = 'all'\n",
386 | "STEP=1"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n",
396 | "print('{} sequences available for evaluation'.format(len(test_sequences)))\n",
397 | "\n",
398 | "results = evaluation.sequential_evaluation(recommender,\n",
399 | " test_sequences=test_sequences,\n",
400 | " given_k=GIVEN_K,\n",
401 | " look_ahead=LOOK_AHEAD,\n",
402 | " evaluation_functions=METRICS.values(),\n",
403 | " top_n=TOPN,\n",
404 | " scroll=False # notice that scrolling is disabled!\n",
405 | " ) "
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))\n",
415 | "for mname, mvalue in zip(METRICS.keys(), results):\n",
416 | " print('\\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | ""
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "## 5. Analysis of next-item recommendation\n",
431 | "\n",
432 | "Here we propose to analyse the performance of the recommender system in the scenario of *next-item recommendation* over the following dimensions:\n",
433 | "\n",
434 | "* the *length* of the **recommendation list**, and\n",
435 | "* the *length* of the **user profile**.\n",
436 | "\n",
437 | "NOTE: This evaluation is by no means exhaustive, as the hyper-parameters of the recommendation algorithm should be *carefully tuned* before drawing any conclusions. Unfortunately, given the time constraints of this tutorial, we had to leave hyper-parameter tuning out. A very useful reference on the careful evaluation of (session-based) recommenders is:\n",
438 | "\n",
439 | "* Evaluation of Session-based Recommendation Algorithms, Ludewig and Jannach, 2018 ([paper](https://arxiv.org/abs/1803.09587))"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | ""
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {},
452 | "source": [
453 | "### 5.1 Evaluation for different recommendation list lengths"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "GIVEN_K = 1\n",
463 | "LOOK_AHEAD = 1\n",
464 | "STEP = 1\n",
465 | "topn_list = [1, 5, 10, 20, 50, 100]"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": null,
471 | "metadata": {},
472 | "outputs": [],
473 | "source": [
474 | "# ensure that all sequences have the same minimum length \n",
475 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n",
476 | "print('{} sequences available for evaluation'.format(len(test_sequences)))"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": null,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "res_list = []\n",
486 | "\n",
487 | "for topn in topn_list:\n",
488 | " print('Evaluating recommendation lists with length: {}'.format(topn))\n",
489 | " res_tmp = evaluation.sequential_evaluation(recommender,\n",
490 | " test_sequences=test_sequences,\n",
491 | " given_k=GIVEN_K,\n",
492 | " look_ahead=LOOK_AHEAD,\n",
493 | " evaluation_functions=METRICS.values(),\n",
494 | " top_n=topn,\n",
495 | " scroll=True, # here we average over all profile lengths\n",
496 | " step=STEP)\n",
497 | " mvalues = list(zip(METRICS.keys(), res_tmp))\n",
498 | " res_list.append((topn, mvalues))"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": null,
504 | "metadata": {},
505 | "outputs": [],
506 | "source": [
507 | "# show separate plots per metric\n",
508 | "fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))\n",
509 | "res_list_t = list(zip(*res_list))\n",
510 | "for midx, metric in enumerate(METRICS):\n",
511 | " mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]\n",
512 | " ax = axes[midx]\n",
513 | " ax.plot(topn_list, mvalues)\n",
514 | " ax.set_title(metric)\n",
515 | " ax.set_xticks(topn_list)\n",
516 | " ax.set_xlabel('List length')"
517 | ]
518 | },
519 | {
520 | "cell_type": "markdown",
521 | "metadata": {},
522 | "source": [
523 | ""
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {},
529 | "source": [
530 | "### 5.2 Evaluation for different user profile lengths"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": [
539 | "given_k_list = [1, 2, 3, 4]\n",
540 | "LOOK_AHEAD = 1\n",
541 | "STEP = 1\n",
542 | "TOPN = 20"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": null,
548 | "metadata": {},
549 | "outputs": [],
550 | "source": [
551 | "# ensure that all sequences have the same minimum length \n",
552 | "test_sequences = get_test_sequences(test_data, max(given_k_list))\n",
553 | "print('{} sequences available for evaluation'.format(len(test_sequences)))"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": null,
559 | "metadata": {},
560 | "outputs": [],
561 | "source": [
562 | "res_list = []\n",
563 | "\n",
564 | "for gk in given_k_list:\n",
565 | " print('Evaluating profiles having length: {}'.format(gk))\n",
566 | " res_tmp = evaluation.sequential_evaluation(recommender,\n",
567 | " test_sequences=test_sequences,\n",
568 | " given_k=gk,\n",
569 | " look_ahead=LOOK_AHEAD,\n",
570 | " evaluation_functions=METRICS.values(),\n",
571 | " top_n=TOPN,\n",
572 | " scroll=False, # here we stop at each profile length\n",
573 | " step=STEP)\n",
574 | " mvalues = list(zip(METRICS.keys(), res_tmp))\n",
575 | " res_list.append((gk, mvalues))"
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": null,
581 | "metadata": {},
582 | "outputs": [],
583 | "source": [
584 | "# show separate plots per metric\n",
585 | "fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))\n",
586 | "res_list_t = list(zip(*res_list))\n",
587 | "for midx, metric in enumerate(METRICS):\n",
588 | " mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]\n",
589 | " ax = axes[midx]\n",
590 | " ax.plot(given_k_list, mvalues)\n",
591 | " ax.set_title(metric)\n",
592 | " ax.set_xticks(given_k_list)\n",
593 | " ax.set_xlabel('Profile length')"
594 | ]
595 | }
596 | ],
597 | "metadata": {
598 | "kernelspec": {
599 | "display_name": "srs",
600 | "language": "python",
601 | "name": "srs"
602 | },
603 | "language_info": {
604 | "codemirror_mode": {
605 | "name": "ipython",
606 | "version": 3
607 | },
608 | "file_extension": ".py",
609 | "mimetype": "text/x-python",
610 | "name": "python",
611 | "nbconvert_exporter": "python",
612 | "pygments_lexer": "ipython3",
613 | "version": "3.6.6"
614 | }
615 | },
616 | "nbformat": 4,
617 | "nbformat_minor": 2
618 | }
619 |
--------------------------------------------------------------------------------
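
Note: `util/markov/Markov.py` and `recommenders/MixedMarkovRecommender.py` are not shown in this dump. As a rough illustration of the first-order case described in the notebook above (transition matrices estimated from the training sequences, items ranked by their probability of being next), here is a minimal sketch; the class name and normalization details are illustrative, and the skipping/clustering heuristics as well as the mixing of several orders are deliberately omitted.

```python
from collections import Counter, defaultdict


class FirstOrderMarkovSketch:
    """Illustrative first-order Markov chain recommender built from transition counts."""

    def fit(self, train_data):
        counts = defaultdict(Counter)
        for seq in train_data['sequence']:
            # count transitions between consecutive items in each training sequence
            for prev_item, next_item in zip(seq[:-1], seq[1:]):
                counts[prev_item][next_item] += 1
        # normalize each row of counts into transition probabilities
        self.transitions = {}
        for prev, cnt in counts.items():
            total = sum(cnt.values())
            self.transitions[prev] = {nxt: c / total for nxt, c in cnt.items()}

    def recommend(self, user_profile, user_id=None):
        # a first-order chain conditions only on the last event of the profile
        probs = self.transitions.get(user_profile[-1], {})
        return [([item], p) for item, p in sorted(probs.items(), key=lambda x: -x[1])]
```
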
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Massimo Quadrana
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Tutorial on Sequence-Aware Recommender Systems
3 |
4 | This repository contains the material used in the hands-on session of the tutorials on Sequence-Aware Recommenders we gave
5 | at [TheWebConf 2019](https://www2019.thewebconf.org/tutorials) and [ACM RecSys 2018](https://recsys.acm.org/recsys18/tutorials/#content-tab-1-4-tab).
6 |
7 | ## ACM CSUR Paper and TheWebConf 2019 Slides
8 |
9 | ### ACM Computing Surveys (CSUR) Paper
10 | Sequence-Aware Recommender Systems
11 | Massimo Quadrana, Paolo Cremonesi, Dietmar Jannach
12 | ACM Computing Surveys (CSUR), 2018
13 |
14 | ### TheWebConf 2019 Slides
15 | 1. [Introduction](slides/TheWebConf2019_01_Introduction.pdf)
16 | 2. [Algorithms](slides/TheWebConf2019_02_Algorithms.pdf)
17 | 3. [Evaluation](slides/TheWebConf2019_03_Evaluation.pdf)
18 |
19 | ## Running the code
20 |
21 | You have two options to run the code contained in this repository:
22 | 1. Set up a new environment on your local machine and run the code locally (_highly recommended_).
23 | 2. Launch a new Binder instance by clicking on this badge [](https://mybinder.org/v2/gh/mquad/sars_tutorial/master).
24 |
25 | While we all know that setting up a new local environment is a slightly tedious process, Binder instances have strict resource limits (1-2GB of memory, max 100 concurrent users per repository).
26 | Also beware that Binder sessions automatically expire after 10 minutes of inactivity!
27 | So we *highly recommend* setting up a new local environment in advance by following the [Setup instructions](#setup-instructions).
28 |
29 | ### Setup instructions
30 |
31 | 1. First of all, clone this project to your local machine:
32 | ```bash
33 | git clone https://github.com/mquad/sars_tutorial.git
34 | ```
35 |
36 | 2. Now you need to set up a new python3 environment. We will use Anaconda/Miniconda to do so.
37 | If you don't have Anaconda/Miniconda already installed on your machine, download [Miniconda](https://conda.io/miniconda.html) or [Anaconda](https://www.anaconda.com/download/) (**Python 3 version**).
38 |
39 | 3. After that, install the environment for this hands-on by running:
40 | ```bash
41 | cd sars_tutorial/
42 | conda env create --file environment.yml
43 | ```
44 |
45 | 4. (_Miniconda users only_) If you chose to install Miniconda above, you now have to install Jupyter Notebook on your machine by running `conda install jupyter`.
46 | You can do it in your main python environment (not necessarily in the `srs` env), as long as you set up the kernel as explained below.
47 | Anaconda users should already have Jupyter Notebook installed, so they can skip this step.
48 |
49 | 5. Then activate the environment with `source activate srs` or `conda activate srs`, and install a new `ipython` kernel by running:
50 | ```bash
51 | python -m ipykernel install --name srs
52 | ```
53 | If you get a "Permission denied" error with the above command, try:
54 | ```bash
55 | python -m ipykernel install --name srs --user
56 | ```
57 |
58 | 6. Finally, launch the Jupyter Notebook with
59 | ```bash
60 | jupyter notebook --port=8888
61 | ```
62 | and open it in your browser at the address `localhost:8888`.
63 | (Beware, if port `8888` is already taken by another service, Jupyter Notebook will automatically open on a different one. Check out the startup log!).
64 |
65 |
66 | ### Running the notebooks
67 |
68 | The notebooks used in this hands-on are listed in the main directory of this project, as shown below:
69 |
70 | 
71 |
72 | Click on the name of a notebook to open it in a new window. The name of each running notebook is highlighted in green
73 | (in the screenshot above, the notebook `00_TopPopular` is the only one running).
74 |
75 | Before starting to execute the notebook cells, you have to ensure that the kernel is properly set to `srs`, as in the screenshot below:
76 |
77 | 
78 |
79 | If that's not the case, change the kernel to `srs` by clicking on `Kernel > Change kernel > srs` in the menu bar, as shown below:
80 |
81 | 
82 |
83 | NOTE: this requires the installation of the `srs` kernel, as explained in the [Setup instructions](#setup-instructions).
84 |
85 | You can now start running the cells in the notebook! Yay!
86 |
87 |
88 | # Acknowledgments
89 |
90 | We want to sincerely thank [Umberto Di Fabrizio](https://www.linkedin.com/in/umbertodifabrizio) for his help in the development of this repository back when he was an MSc student at Politecnico di Milano. Great job Umberto!
--------------------------------------------------------------------------------
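
For reference, the workflow the notebooks walk through (load, split, fit, sequential evaluation) can be condensed into a plain script roughly as follows. Everything below only reuses calls that appear verbatim in the notebook cells of this repository (`create_seq_db_filter_top_k`, `last_session_out_split`, `evaluation.sequential_evaluation`), with the parameter values used in `00_TopPopular.ipynb`.

```python
# Condensed, script-style version of the notebook workflow; run it from the
# repository root after unzipping datasets/sessions.zip.
from util.data_utils import create_seq_db_filter_top_k
from util.split import last_session_out_split
from util.metrics import precision, recall, mrr
from util import evaluation
from recommenders.PopularityRecommender import PopularityRecommender

# keep only the top-1k most popular items of the last month, as in the notebooks
dataset = create_seq_db_filter_top_k(path='datasets/sessions_sample_10.csv',
                                     topk=1000, last_months=1)
train_data, test_data = last_session_out_split(dataset)

recommender = PopularityRecommender()
recommender.fit(train_data)

# classical next-item evaluation: GIVEN_K=1, LOOK_AHEAD=1, STEP=1
test_sequences = test_data.loc[test_data['sequence'].map(len) > 1, 'sequence'].values
results = evaluation.sequential_evaluation(recommender,
                                           test_sequences=test_sequences,
                                           given_k=1,
                                           look_ahead=1,
                                           evaluation_functions=[precision, recall, mrr],
                                           top_n=10,
                                           scroll=True,
                                           step=1)
print(dict(zip(['precision@10', 'recall@10', 'mrr@10'], results)))
```
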
/datasets/sessions.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/datasets/sessions.zip
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: srs
2 | channels:
3 | - defaults
4 | - conda-forge
5 | - anaconda
6 | dependencies:
7 | - cython
8 | - gensim=3.4.0
9 | - ipykernel=4.9.0
10 | - matplotlib
11 | - mkl-service
12 | - networkx=1.11
13 | - numba=0.39.0
14 | - numpy=1.15.1
15 | - pandas=0.23.4
16 | - scipy=1.1.0
17 | - theano=1.0.3
18 | - tqdm=4.25.0
19 | - pip:
20 | - treelib
21 | - pymining
22 |
23 |
--------------------------------------------------------------------------------
/gifs/sequential_eval.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/gifs/sequential_eval.gif
--------------------------------------------------------------------------------
/images/fpmc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/fpmc.png
--------------------------------------------------------------------------------
/images/gru4rec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/gru4rec.png
--------------------------------------------------------------------------------
/images/hgru4rec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/hgru4rec.png
--------------------------------------------------------------------------------
/images/prod2vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/prod2vec.png
--------------------------------------------------------------------------------
/images/running_notebooks_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/running_notebooks_1.png
--------------------------------------------------------------------------------
/images/running_notebooks_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/running_notebooks_2.png
--------------------------------------------------------------------------------
/images/running_notebooks_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/running_notebooks_3.png
--------------------------------------------------------------------------------
/recommenders/FPMCRecommender.py:
--------------------------------------------------------------------------------
1 | from recommenders.ISeqRecommender import ISeqRecommender
2 | from util.fpmc.FPMC_numba import FPMC
3 |
4 |
5 | class FPMCRecommender(ISeqRecommender):
6 | """
7 | Implementation of
8 | Rendle, S., Freudenthaler, C., & Schmidt-Thieme, L. (2010). Factorizing personalized Markov chains for next-basket recommendation.
9 | Proceedings of the 19th International Conference on World Wide Web - WWW ’10, 811
10 |
11 | Based on the implementation available at https://github.com/khesui/FPMC
12 | """
13 |
14 | def __init__(self, n_factor=32, learn_rate=0.01, regular=0.001, n_epoch=15, n_neg=10):
15 | """
16 | :param n_factor: (optional) the number of latent factors
17 | :param learn_rate: (optional) the learning rate
18 | :param regular: (optional) the L2 regularization coefficient
19 | :param n_epoch: (optional) the number of training epochs
20 | :param n_neg: (optional) the number of negative samples used in BPR learning
21 | """
22 | super(FPMCRecommender, self).__init__()
23 | self.n_epoch = n_epoch
24 | self.n_neg = n_neg
25 | self.n_factor = n_factor
26 | self.learn_rate = learn_rate
27 | self.regular = regular
28 |
29 | def __str__(self):
30 | return 'FPMCRecommender(n_epoch={n_epoch}, ' \
31 | 'n_neg={n_neg}, ' \
32 | 'n_factor={n_factor}, ' \
33 | 'learn_rate={learn_rate}, ' \
34 | 'regular={regular})'.format(**self.__dict__)
35 |
36 | def fit(self, train_data):
37 | self._declare(train_data)
38 |
39 | train_data_supervised = []
40 |
41 | for i, row in train_data.iterrows():
42 | u = self.user_mapping[row['user_id']]
43 |
44 | seq = []
45 | if len(row['sequence']) > 1: # cannot use sequences with length 1 for supervised learning
46 | for item in row['sequence']:
47 | i = self.item_mapping[item]
48 | seq.append(i)
49 |
50 | train_data_supervised.append((u, seq[len(seq) - 1], seq[:len(seq) - 1]))
51 |
52 | self.fpmc = FPMC(n_user=len(self.user_mapping), n_item=len(self.item_mapping),
53 | n_factor=self.n_factor, learn_rate=self.learn_rate, regular=self.regular)
54 |
55 | self.fpmc.user_set = set(self.user_mapping.values())
56 | self.fpmc.item_set = set(self.item_mapping.values())
57 | self.fpmc.init_model()
58 |
59 | self.fpmc.learnSBPR_FPMC(train_data_supervised, n_epoch=self.n_epoch, neg_batch_size=self.n_neg)
60 |
61 | def recommend(self, user_profile, user_id=None):
62 | context = []
63 | for item in user_profile:
64 | context.append(self.item_mapping[item])
65 |
66 | items, scores = self.fpmc.evaluation_recommender(self.user_mapping[user_id], context)
67 | recommendations = []
68 |
69 | for i, it in enumerate(items):
70 | recommendations.append(([self.reverse_item_mapping[it]], scores[i]))
71 | return recommendations
72 |
73 | def _declare(self, data):
74 | self.user_mapping = {}
75 | self.item_mapping = {}
76 | self.reverse_item_mapping = {}
77 |
78 | user_counter = 0
79 | item_counter = 0
80 | for i, row in data.iterrows():
81 | if row['user_id'] not in self.user_mapping:
82 | self.user_mapping[row['user_id']] = user_counter
83 | user_counter += 1
84 |
85 | for item in row['sequence']:
86 | if item not in self.item_mapping:
87 | self.item_mapping[item] = item_counter
88 | self.reverse_item_mapping[item_counter] = item
89 | item_counter += 1
90 |
--------------------------------------------------------------------------------
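A minimal usage sketch for the recommender above, assuming a training DataFrame with 'user_id' and 'sequence' columns (the format produced by util.data_utils.create_seq_db_filter_top_k); the toy data is made up, and n_neg is kept smaller than the number of distinct items:

import pandas as pd

from recommenders.FPMCRecommender import FPMCRecommender

# hypothetical toy dataset: one row per user sequence of item-id strings
train_data = pd.DataFrame({
    'user_id': [1, 1, 2],
    'sequence': [['10', '11', '12'], ['11', '13'], ['12', '10', '14']],
})

recommender = FPMCRecommender(n_factor=16, n_epoch=5, n_neg=2)
recommender.fit(train_data)

# FPMC is personalized: recommend() needs both the profile and the user id
print(recommender.recommend(user_profile=['10', '11'], user_id=1)[:5])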
/recommenders/FSMRecommender.py:
--------------------------------------------------------------------------------
1 | from pymining import seqmining
2 |
3 | from recommenders.ISeqRecommender import ISeqRecommender
4 | from util.SPMFinterface import callSPMF
5 | from util.tree.Tree import SmartTree
6 |
7 |
8 | class FSMRecommender(ISeqRecommender):
9 | """Frequent Sequence Mining recommender"""
10 |
11 | def __init__(self, minsup, minconf, max_context=1, min_context=1, spmf_path=None, db_path=None):
12 | """
13 |
14 |         :param minsup: the minimum support threshold. It is interpreted as a relative count if in [0, 1],
15 |             otherwise as an absolute count. NOTE: a relative count is required for training with SPMF (faster).
16 | :param minconf: the minimum confidence threshold.
17 | :param max_context: (optional) the maximum number of items in the user profile (starting from the last) that will be used
18 | for lookup in the database of frequent sequences.
19 | :param min_context: (optional) the minimum number of items in the user profile (starting from the last) that will be used
20 | for lookup in the database of frequent sequences.
21 |         :param spmf_path: (optional) path to the SPMF jar file. If provided, the SPMF library will be used for pattern extraction (algorithm: PrefixSpan).
22 | Otherwise, use pymining, which can be significantly slower depending on the sequence database size.
23 | :param db_path: (optional) path to the sequence database file
24 | """
25 |
26 | super(FSMRecommender, self).__init__()
27 | self.minsup = minsup
28 | self.minconf = minconf
29 | self.max_context = max_context
30 | self.min_context = min_context
31 | self.recommendation_length = 1
32 | self.db_path = db_path
33 | self.spmf_path = spmf_path
34 | self.spmf_algorithm = "PrefixSpan"
35 | self.output_path = "tmp/tmp_output.txt"
36 |
37 | def __str__(self):
38 | return 'FreqSeqMiningRecommender: ' \
39 | 'minsup={minsup}, ' \
40 | 'minconf={minconf}, ' \
41 | 'max_context={max_context}, ' \
42 | 'min_context={min_context}, ' \
43 | 'spmf_path={spmf_path}, ' \
44 | 'db_path={db_path}'.format(**self.__dict__)
45 |
46 | def fit(self, train_data=None):
47 | """
48 | Fit the model
49 | :param train_data: (optional) DataFrame with the training sequences, which must be assigned to column "sequence".
50 |             If None, run FSM using SPMF over the sequence database stored in `self.db_path`.
51 | Otherwise, run FSM using `pymining.seqmining` (slower).
52 | """
53 |
54 | if train_data is None:
55 | if self.spmf_path is None or self.db_path is None:
56 |                 raise ValueError("You should set db_path and spmf_path before calling fit() without arguments.")
57 |
58 |             self.logger.info('Using SPMF (Java) for Frequent Sequence Mining')
59 | if 0 <= self.minsup <= 1:
60 | percentage_min_sup = self.minsup * 100
61 | else:
62 |                 raise ValueError("SPMF only accepts 0<=minsup<=1")
63 |
64 | # call spmf
65 | command = ' '.join([self.spmf_algorithm, self.db_path, self.output_path, str(percentage_min_sup) + '%'])
66 | callSPMF(self.spmf_path, command)
67 |
68 | # parse back output from text file
69 | self._parse_spfm_output()
70 | else:
71 | # use pymining
72 | self.logger.info('Using pymining.seqmining (python) for Frequent Sequence Mining')
73 | sequences = train_data['sequence'].values
74 | msup = int(self.minsup * len(sequences)) if 0 <= self.minsup <= 1 else self.minsup
75 | self.logger.info('Mining frequent sequences (minsup={})'.format(msup))
76 | self.freq_seqs = seqmining.freq_seq_enum(sequences, msup)
77 |
78 | self.logger.info('{} frequent sequences found'.format(len(self.freq_seqs)))
79 | self.logger.info('Building the prefix tree')
80 | self.tree = SmartTree()
81 | self.root_node = self.tree.set_root()
82 | for pattern, support in self.freq_seqs:
83 | if len(pattern) == 1:
84 | # add node to root
85 | self.tree.create_node(pattern[0], parent=self.root_node, data={"support": support})
86 | elif len(pattern) > 1:
87 | # add entire path starting from root
88 | self.tree.add_path(self.root_node, pattern, support)
89 | else:
90 | raise ValueError('Frequent sequence of length 0')
91 | self.logger.info('Training completed')
92 |
93 | def recommend(self, user_profile, user_id=None):
94 | n = len(user_profile)
95 | c = min(n, self.max_context)
96 | match = []
97 | # iterate over decreasing context lengths until a match with sufficient confidence is found
98 | while not match and c >= self.min_context:
99 | q = user_profile[n - c:n]
100 | match = self._find_match(q, self.recommendation_length)
101 | c -= 1
102 | return match
103 |
104 | def _find_match(self, context, recommendation_length):
105 | # search context
106 | lastNode = self.tree.find_path(self.root_node, context)
107 |
108 | if lastNode == -1:
109 | return []
110 | else: # context matched
111 | context_support = self.tree[lastNode].data['support']
112 | children = self.tree[lastNode].fpointer
113 |
114 | if not children:
115 | return []
116 |
117 |             # find all paths of length recommendation_length starting from the matched context
118 | paths = self.tree.find_n_length_paths(lastNode, recommendation_length)
119 | return sorted(self._filter_confidence(context_support, paths), key=lambda x: x[1], reverse=True)
120 |
121 | def _filter_confidence(self, context_support, path_list):
122 | goodPaths = []
123 | for p in path_list:
124 | confidence = self.tree[p[len(p) - 1]].data['support'] / float(context_support)
125 | if confidence >= self.minconf:
126 | goodPaths.append((self.tree.get_nodes_tag(p), confidence))
127 | return goodPaths
128 |
129 | def _set_tree_debug_only(self, tree):
130 | self.tree = tree
131 | self.root_node = tree.get_root()
132 |
133 | def get_freq_seqs(self):
134 | return self.freq_seqs
135 |
136 | def get_sequence_tree(self):
137 | return self.tree
138 |
139 | def show_tree(self):
140 | self.tree.show()
141 |
142 | def get_confidence_list(self, recommendation):
143 | return list(map(lambda x: x[1], recommendation))
144 |
145 | def _parse_spfm_output(self):
146 | with open(self.output_path, 'r') as fin:
147 | self.freq_seqs = []
148 | for line in fin:
149 | pieces = line.split('#SUP: ')
150 | support = pieces[1].strip()
151 | items = pieces[0].split(' ')
152 | seq = tuple(x for x in items if x != '' and x != '-1')
153 | seq_and_support = (seq, int(support))
154 | self.freq_seqs.append(seq_and_support)
155 |
--------------------------------------------------------------------------------
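A minimal sketch of the pure-Python path (pymining) for the recommender above; the sequences are made up, and minsup is given as a relative count:

import pandas as pd

from recommenders.FSMRecommender import FSMRecommender

train_data = pd.DataFrame({
    'sequence': [['1', '2', '3'], ['1', '2'], ['2', '3', '4'], ['1', '2', '4']],
})

# minsup=0.5 means a pattern must appear in at least half of the sequences
recommender = FSMRecommender(minsup=0.5, minconf=0.1, max_context=2)
recommender.fit(train_data)  # train_data given, so pymining.seqmining is used

print(recommender.recommend(user_profile=['1', '2']))  # list of ([item], confidence) pairs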
/recommenders/ISeqRecommender.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | class ISeqRecommender(object):
5 | """Abstract Recommender class"""
6 |
7 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
8 | logger = logging.getLogger()
9 |
10 | def __init__(self):
11 | super(ISeqRecommender, self).__init__()
12 |
13 | def fit(self, train_data):
14 | pass
15 |
16 | def recommend(self, user_profile, user_id=None):
17 | """
18 |         Given the user profile, return a list of recommendations
19 | :param user_profile: the user profile as a list of item identifiers
20 | :param user_id: (optional) the user id
21 | :return: list of recommendations e.g. [([2], 0.875), ([6], 1.0)]
22 | """
23 | pass
24 |
25 | @staticmethod
26 | def get_recommendation_list(recommendation):
27 | return list(map(lambda x: x[0], recommendation))
28 |
29 | @staticmethod
30 | def get_recommendation_confidence_list(recommendation):
31 | return list(map(lambda x: x[1], recommendation))
32 |
33 | def activate_debug_print(self):
34 | self.logger.setLevel(logging.DEBUG)
35 |
36 | def deactivate_debug_print(self):
37 | self.logger.setLevel(logging.INFO)
38 |
--------------------------------------------------------------------------------
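A quick illustration of the recommendation format from the docstring above and of the two static helpers that split it back into parallel lists:

from recommenders.ISeqRecommender import ISeqRecommender

recommendation = [([2], 0.875), ([6], 1.0)]
print(ISeqRecommender.get_recommendation_list(recommendation))             # [[2], [6]]
print(ISeqRecommender.get_recommendation_confidence_list(recommendation))  # [0.875, 1.0]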
/recommenders/KNNRecommender.py:
--------------------------------------------------------------------------------
1 | from recommenders.ISeqRecommender import ISeqRecommender
2 | from util.data_utils import dataset_to_gru4rec_format
3 | from util.knn.iknn import ItemKNN
4 | from util.knn.sknn import SessionKNN
5 | from util.knn.vmsknn import VMSessionKNN
6 | from util.knn.ssknn import SeqSessionKNN
7 | from util.knn.sfsknn import SeqFilterSessionKNN
8 |
9 |
10 | class KNNRecommender(ISeqRecommender):
11 | """
12 | Interface to ItemKNN and Session-based KNN methods. Based on:
13 |
14 | Evaluation of Session-based Recommendation Algorithms, Malte Ludewig and Dietmar Jannach
15 | """
16 | knn_models = {
17 | 'iknn': ItemKNN,
18 | 'sknn': SessionKNN,
19 | 'v-sknn': VMSessionKNN,
20 | 's-sknn': SeqSessionKNN,
21 | 'sf-sknn': SeqFilterSessionKNN
22 | }
23 |
24 | def __init__(self,
25 |                  model='sknn',
26 | **init_args):
27 | """
28 | :param model: One among the following KNN models:
29 | - iknn: ItemKNN, item-to-item KNN based on the *last* item in the session to determine the items to be recommended.
30 | - sknn: SessionKNN, compares the *entire* current session with the past sessions in the training data to
31 | determine the items to be recommended.
32 | - v-sknn: VMSessionKNN, use linearly decayed real-valued vectors to encode the current session,
33 | then compares the current session with the past sessions in the training data using the dot-product
34 | to determine the items to be recommended.
35 | - s-sknn: SeqSessionKNN, this variant also puts more weight on elements that appear later in the session by
36 |             using a custom scoring function (see the paper by Ludewig and Jannach).
37 | - sf-sknn: SeqFilterSessionKNN, this variant also puts more weight on elements that appear later in the session
38 |             in a more restrictive way by using a custom scoring function (see the paper by Ludewig and Jannach).
39 |
40 | :param init_args: The model initialization arguments. See the following initializations or
41 | check `util.knn` for more details on each model:
42 | - iknn: ItemKNN(n_sims=100, lmbd=20, alpha=0.5)
43 | - sknn: SessionKNN(k, sample_size=500, sampling='recent', similarity='jaccard', remind=False, pop_boost=0)
44 | - v-sknn: VMSessionKNN(k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div',
45 | dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score',
46 | weighting_time=False, normalize=True)
47 |             - s-sknn: SeqSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div',
48 | remind=False, pop_boost=0, extend=False, normalize=True)
49 | - sf-sknn: SeqFilterSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0,
50 | extend=False, normalize=True)
51 | """
52 |         super(KNNRecommender, self).__init__()
53 | if model not in self.knn_models:
54 | raise ValueError("Unknown KNN model '{}'. The available ones are: {}".format(
55 | model, list(self.knn_models.keys())
56 | ))
57 | self.init_args = init_args
58 | self.init_args.update(dict(session_key='session_id',
59 | item_key='item_id',
60 | time_key='ts'))
61 | self.model = self.knn_models[model](**self.init_args)
62 | self.pseudo_session_id = 0
63 |
64 | def __str__(self):
65 | return str(self.model)
66 |
67 | def fit(self, train_data):
68 | self.logger.info('Converting training data to GRU4Rec format')
69 | # parse training data to GRU4Rec format
70 | train_data = dataset_to_gru4rec_format(dataset=train_data)
71 |
72 | self.logger.info('Training started')
73 | self.model.fit(train_data)
74 | self.logger.info('Training completed')
75 | self.pseudo_session_id = 0
76 |
77 | def recommend(self, user_profile, user_id=None):
78 | for item in user_profile:
79 | pred = self.model.predict_next(session_id=self.pseudo_session_id,
80 | input_item_id=item)
81 |         # sort items by predicted score
82 |         pred.sort_values(0, ascending=False, inplace=True)
83 |         # increase the pseudo-session id so that future calls to recommend() won't be connected
84 |         self.pseudo_session_id += 1
85 |         # convert to the required output format
86 | return [([x.index], x._2) for x in pred.reset_index().itertuples()]
87 |
--------------------------------------------------------------------------------
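A minimal usage sketch for the recommender above, assuming a training DataFrame with 'session_id', 'user_id', 'ts' and 'sequence' columns (the format expected by dataset_to_gru4rec_format); the toy sessions and the value of k are made up:

import pandas as pd

from recommenders.KNNRecommender import KNNRecommender

# hypothetical toy dataset: one row per session
train_data = pd.DataFrame({
    'session_id': [0, 1, 2],
    'user_id': [1, 1, 2],
    'ts': [100, 200, 300],
    'sequence': [['10', '11', '12'], ['11', '13'], ['12', '10', '13']],
})

recommender = KNNRecommender(model='sknn', k=10)  # k is forwarded to SessionKNN
recommender.fit(train_data)

print(recommender.recommend(user_profile=['10', '11'])[:5])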
/recommenders/MarkovChainRecommender.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import logging
3 |
4 | from recommenders.ISeqRecommender import ISeqRecommender
5 | from util.markov.Markov import add_nodes_to_graph, add_edges, apply_skipping, apply_clustering
6 |
7 |
8 | class MarkovChainRecommender(ISeqRecommender):
9 | """
10 | Implementation from Shani, Guy, David Heckerman, and Ronen I. Brafman. "An MDP-based recommender system."
11 | Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. Chapter 3-4
12 | """
13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14 |
15 | def __init__(self, order):
16 | """
17 | :param order: the order of the Markov Chain
18 | """
19 | super(MarkovChainRecommender, self).__init__()
20 | self.order = order
21 |
22 | def fit(self, train_data):
23 | sequences = train_data['sequence'].values
24 |
25 | logging.info('Building Markov Chain model with k = ' + str(self.order))
26 | logging.info('Adding nodes')
27 | self.tree, self.count_dict, self.G = add_nodes_to_graph(sequences, self.order)
28 | logging.info('Adding edges')
29 | self.G = add_edges(self.tree, self.count_dict, self.G, self.order)
30 | logging.info('Applying skipping')
31 | self.G = apply_skipping(self.G, self.order, sequences)
32 | logging.info('Applying clustering')
33 | logging.info('{} states in the graph'.format(len(self.G.nodes())))
34 | self.G, _, _ = apply_clustering(self.G)
35 | # drop not useful resources
36 | self.tree = None
37 | self.count_dict = None
38 | gc.collect()
39 |
40 | def recommend(self, user_profile, user_id=None):
41 |
42 |         # if the user profile is longer than the Markov order, keep only the most recent `order` items
43 | state = tuple(user_profile[-self.order:])
44 | # see if graph has that state
45 | recommendations = []
46 | if self.G.has_node(state):
47 | # search for recommendations in the forward star
48 | rec_dict = {}
49 | for u, v in self.G.out_edges_iter([state]):
50 | lastElement = tuple(v[-1:])
51 | if lastElement in rec_dict:
52 | rec_dict[lastElement] += self.G[u][v]['count']
53 | else:
54 | rec_dict[lastElement] = self.G[u][v]['count']
55 | for k, v in rec_dict.items():
56 | recommendations.append((list(k), v))
57 |
58 | return recommendations
59 |
60 | def _set_graph_debug(self, G):
61 | self.G = G
62 |
--------------------------------------------------------------------------------
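A minimal usage sketch for the recommender above (the sequences are made up; fitting relies on util.markov and the networkx version pinned by the repository environment):

import pandas as pd

from recommenders.MarkovChainRecommender import MarkovChainRecommender

train_data = pd.DataFrame({
    'sequence': [['1', '2', '3'], ['2', '3', '4'], ['1', '2', '4']],
})

recommender = MarkovChainRecommender(order=1)
recommender.fit(train_data)

# the last `order` items of the profile form the Markov state used for lookup
print(recommender.recommend(user_profile=['1', '2']))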
/recommenders/MixedMarkovRecommender.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from recommenders.ISeqRecommender import ISeqRecommender
4 | from recommenders.MarkovChainRecommender import MarkovChainRecommender
5 |
6 |
7 | class MixedMarkovChainRecommender(ISeqRecommender):
8 | """
9 |     Creates Markov Chain models of different orders k and returns recommendations by weighting the
10 |     recommendation lists of each model (each order-k model is weighted by 1/k, normalized by the sum of weights).
11 |
12 | Reference: Shani, Guy, David Heckerman, and Ronen I. Brafman. "An MDP-based recommender system."
13 | Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. Chapter 3-4
14 | """
15 |
16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17 |
18 | recommenders = {}
19 |
20 | def __init__(self, min_order=1, max_order=1):
21 | """
22 | :param min_order: the minimum order of the Mixed Markov Chain
23 | :param max_order: the maximum order of the Mixed Markov Chain
24 | """
25 | super(MixedMarkovChainRecommender, self).__init__()
26 | self.min_order = min_order
27 | self.max_order = max_order
28 | # define the models
29 | for i in range(self.min_order, self.max_order + 1):
30 | self.recommenders[i] = MarkovChainRecommender(i)
31 |
32 |     def fit(self, train_data):
33 |         for order in self.recommenders:
34 |             self.recommenders[order].fit(train_data)
35 |
36 | def recommend(self, user_profile, user_id=None):
37 | rec_dict = {}
38 | recommendations = []
39 | sum_of_weights = 0
40 | for order, r in self.recommenders.items():
41 | rec_list = r.recommend(user_profile)
42 | sum_of_weights += 1 / order
43 | for i in rec_list:
44 | if tuple(i[0]) in rec_dict:
45 | rec_dict[tuple(i[0])] += 1 / order * i[1]
46 | else:
47 | rec_dict[tuple(i[0])] = 1 / order * i[1]
48 | for k, v in rec_dict.items():
49 | recommendations.append((list(k), v / sum_of_weights))
50 |
51 | return recommendations
52 |
53 | def _set_model_debug(self, recommender, order):
54 | self.recommenders[order] = recommender
55 |
--------------------------------------------------------------------------------
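A small numeric illustration of the mixing rule implemented in recommend() above, with made-up per-order scores for a single item:

# scores assigned to the same item by the order-1 and order-2 models (made up)
order_scores = {1: 0.6, 2: 0.2}
weights = {order: 1.0 / order for order in order_scores}
mixed = sum(weights[o] * order_scores[o] for o in order_scores) / sum(weights.values())
print(round(mixed, 3))  # 0.467: the order-1 model counts twice as much as the order-2 model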
/recommenders/PopularityRecommender.py:
--------------------------------------------------------------------------------
1 | import operator
2 |
3 | from recommenders.ISeqRecommender import ISeqRecommender
4 |
5 |
6 | class PopularityRecommender(ISeqRecommender):
7 |
8 | def __init__(self):
9 | super(PopularityRecommender, self).__init__()
10 |
11 | def fit(self, train_data):
12 | sequences = train_data['sequence'].values
13 |
14 | count_dict = {}
15 | for s in sequences:
16 | for item in s:
17 | if item not in count_dict:
18 | count_dict[item] = 1
19 | else:
20 | count_dict[item] += 1
21 |
22 | self.top = sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True)
23 | self.top = [([x[0]], x[1]) for x in self.top]
24 |
25 | def recommend(self, user_profile, user_id=None):
26 | return self.top
27 |
28 | def get_popular_list(self):
29 | return self.top
30 |
--------------------------------------------------------------------------------
/recommenders/Prod2VecRecommender.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import gensim
4 |
5 | from recommenders.ISeqRecommender import ISeqRecommender
6 |
7 |
8 | class Prod2VecRecommender(ISeqRecommender):
9 | """
10 | Implementation of the Prod2Vec skipgram model from
11 | Grbovic Mihajlo, Vladan Radosavljevic, Nemanja Djuric, Narayan Bhamidipati, Jaikit Savla, Varun Bhagwan, and Doug Sharp.
12 | "E-commerce in your inbox: Product recommendations at scale."
13 | In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining,
14 | pp. 1809-1818. ACM, 2015.
15 | """
16 |
17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
18 |
19 | def __init__(self, min_count=2, size=100, window=5, decay_alpha=0.9, workers=4):
20 | """
21 |         :param min_count: (optional) the minimum item frequency. Items less frequent than min_count will be pruned
22 | :param size: (optional) the size of the embeddings
23 | :param window: (optional) the size of the context window
24 |         :param decay_alpha: (optional) the exponential decay factor used to discount the similarity scores for items
25 |             further back in the user profile. Lower values mean stronger discounting of past user interactions. Allowed values are in [0, 1].
26 | :param workers: (optional) the number of threads used for training
27 | """
28 | super(Prod2VecRecommender, self).__init__()
29 | self.min_count = min_count
30 | self.size = size
31 | self.window = window
32 | self.decay_alpha = decay_alpha
33 | self.workers = workers
34 |
35 | def __str__(self):
36 | return 'Prod2VecRecommender(min_count={min_count}, ' \
37 | 'size={size}, ' \
38 | 'window={window}, ' \
39 | 'decay_alpha={decay_alpha}, ' \
40 | 'workers={workers})'.format(**self.__dict__)
41 |
42 | def fit(self, train_data):
43 | sequences = train_data['sequence'].values
44 | self.model = gensim.models.Word2Vec(sequences,
45 | min_count=self.min_count,
46 | window=self.window,
47 | hs=1,
48 | size=self.size,
49 | sg=1,
50 | workers=self.workers)
51 |
52 | def recommend(self, user_profile, user_id=None):
53 | user_profile = list(map(str, user_profile))
54 | rec = []
55 | try:
56 | # iterate the user profile backwards
57 | for i, item in enumerate(user_profile[::-1]):
58 | ms = self.model.most_similar(positive=item)
59 | # apply exponential decay to the similarity scores
60 | decay = self.decay_alpha ** i
61 | ms = [(x[0], decay * x[1]) for x in ms]
62 | rec.extend(ms)
63 | # sort items by similarity score
64 | rec = sorted(rec, key=lambda x: -x[1])
65 | except KeyError:
66 | rec = []
67 | return [([x[0]], x[1]) for x in rec]
68 |
--------------------------------------------------------------------------------
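A minimal usage sketch for the recommender above (toy sequences, with min_count lowered so nothing is pruned); it assumes the gensim version pinned by the repository environment, since the code above calls model.most_similar() directly:

import pandas as pd

from recommenders.Prod2VecRecommender import Prod2VecRecommender

train_data = pd.DataFrame({
    'sequence': [['10', '11', '12'], ['11', '13', '10'], ['12', '10', '13']],
})

recommender = Prod2VecRecommender(min_count=1, size=32, window=2)
recommender.fit(train_data)

# similarities are discounted by decay_alpha ** position, most recent item first
print(recommender.recommend(user_profile=['10', '11'])[:5])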
/recommenders/RNNRecommender.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from recommenders.ISeqRecommender import ISeqRecommender
4 | from util.data_utils import dataset_to_gru4rec_format
5 | from util.rnn.gru4rec import GRU4Rec
6 | from util.rnn.hgru4rec import HGRU4Rec
7 |
8 |
9 | class RNNRecommender(ISeqRecommender):
10 | """
11 | A **simplified** interface to Recurrent Neural Network models for Session-based recommendation.
12 | Based on the following two papers:
13 |
14 | * Recurrent Neural Networks with Top-k Gains for Session-based Recommendations, Hidasi and Karatzoglou, CIKM 2018
15 | * Personalizing Session-based Recommendation with Hierarchical Recurrent Neural Networks, Quadrana et al, Recsys 2017
16 |
17 | """
18 |
19 | def __init__(self,
20 | session_layers,
21 | user_layers=None,
22 | batch_size=32,
23 | learning_rate=0.1,
24 | momentum=0.0,
25 | dropout=None,
26 | epochs=10,
27 | personalized=False):
28 | """
29 | :param session_layers: number of units per layer used at session level.
30 |             It has to be a list of integers for multi-layer networks, or an integer value for single-layer networks.
31 | :param user_layers: number of units per layer used at user level. Required only by personalized models.
32 |             It has to be a list of integers for multi-layer networks, or an integer value for single-layer networks.
33 | :param batch_size: the mini-batch size used in training
34 |         :param learning_rate: the learning rate used in training (Adagrad optimizer)
35 | :param momentum: the momentum coefficient used in training
36 | :param dropout: dropout coefficients.
37 | If personalized=False, it's a float value for the hidden-layer(s) dropout.
38 | If personalized=True, it's a 3-tuple with the values for the dropout of (user hidden, session hidden, user-to-session hidden) layers.
39 | :param epochs: number of training epochs
40 | :param personalized: whether to train a personalized model using the HRNN model.
41 | It will require user ids at prediction time.
42 | """
43 |         super(RNNRecommender, self).__init__()
44 | if isinstance(session_layers, int):
45 | session_layers = [session_layers]
46 | if isinstance(user_layers, int):
47 | user_layers = [user_layers]
48 | self.session_layers = session_layers
49 | self.user_layers = user_layers
50 | self.batch_size = batch_size
51 | self.learning_rate = learning_rate
52 | self.momentum = momentum
53 | if dropout is None:
54 | if not personalized:
55 | dropout = 0.0
56 | else:
57 | dropout = (0.0, 0.0, 0.0)
58 | self.dropout = dropout
59 | self.epochs = epochs
60 | self.personalized = personalized
61 | self.pseudo_session_id = 0
62 |
63 | def __str__(self):
64 | return 'RNNRecommender(' \
65 | 'session_layers={session_layers}, ' \
66 | 'user_layers={user_layers}, ' \
67 | 'batch_size={batch_size}, ' \
68 | 'learning_rate={learning_rate}, ' \
69 | 'momentum={momentum}, ' \
70 | 'dropout={dropout}, ' \
71 | 'epochs={epochs}, ' \
72 | 'personalized={personalized}, ' \
73 | ')'.format(**self.__dict__)
74 |
75 | def fit(self, train_data):
76 | self.logger.info('Converting training data to GRU4Rec format')
77 | # parse training data to GRU4Rec format
78 | train_data = dataset_to_gru4rec_format(dataset=train_data)
79 |
80 | if not self.personalized:
81 | # fit GRU4Rec
82 | self.model = GRU4Rec(layers=self.session_layers,
83 | n_epochs=self.epochs,
84 | batch_size=self.batch_size,
85 | learning_rate=self.learning_rate,
86 | momentum=self.momentum,
87 | dropout_p_hidden=self.dropout,
88 | session_key='session_id',
89 | item_key='item_id',
90 | time_key='ts')
91 | else:
92 | if self.user_layers is None:
93 | raise ValueError('You should set the value of user_layers before training the personalized model.')
94 |
95 | if len(self.dropout) != 3:
96 |                 raise ValueError('dropout should be a 3-tuple with '
97 | '(user hidden, session hidden, user-to-session hidden) dropout values.')
98 |
99 | self.model = HGRU4Rec(session_layers=self.session_layers,
100 | user_layers=self.user_layers,
101 | batch_size=self.batch_size,
102 | n_epochs=self.epochs,
103 | learning_rate=self.learning_rate,
104 | momentum=self.momentum,
105 | dropout_p_hidden_usr=self.dropout[0],
106 | dropout_p_hidden_ses=self.dropout[1],
107 | dropout_p_init=self.dropout[2],
108 | session_key='session_id',
109 | user_key='user_id',
110 | item_key='item_id',
111 | time_key='ts')
112 | self.logger.info('Training started')
113 | self.model.fit(train_data)
114 | self.logger.info('Training completed')
115 |
116 | def recommend(self, user_profile, user_id=None):
117 | if not self.personalized:
118 | for item in user_profile:
119 | pred = self.model.predict_next_batch(np.array([self.pseudo_session_id]),
120 | np.array([item]),
121 | batch=1)
122 | else:
123 | if user_id is None:
124 | raise ValueError('user_id required by personalized models')
125 | for item in user_profile:
126 | pred = self.model.predict_next_batch(np.array([self.pseudo_session_id]),
127 | np.array([item]),
128 | np.array([user_id]),
129 | batch=1)
130 | # sort items by predicted score
131 | pred.sort_values(0, ascending=False, inplace=True)
132 |         # increase the pseudo-session id so that future calls to recommend() won't be connected
133 | self.pseudo_session_id += 1
134 | # convert to the required output format
135 | return [([x.index], x._2) for x in pred.reset_index().itertuples()]
136 |
--------------------------------------------------------------------------------
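A minimal sketch of the non-personalized path (GRU4Rec) above; the toy sessions only illustrate the call signature, real training needs a much larger dataset, and batch_size is lowered so it does not exceed the number of sessions:

import pandas as pd

from recommenders.RNNRecommender import RNNRecommender

# hypothetical toy dataset: one row per session
train_data = pd.DataFrame({
    'session_id': [0, 1, 2],
    'user_id': [1, 1, 2],
    'ts': [100, 200, 300],
    'sequence': [['10', '11', '12'], ['11', '13'], ['12', '10', '13']],
})

recommender = RNNRecommender(session_layers=[50], batch_size=2, epochs=2, personalized=False)
recommender.fit(train_data)

print(recommender.recommend(user_profile=['10', '11'])[:5])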
/recommenders/SupervisedRecommender.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import clone
2 | from sklearn.tree import DecisionTreeClassifier
3 | from tqdm import tqdm
4 |
5 | from recommenders.ISeqRecommender import ISeqRecommender
6 | from util.data_expansion import data_expansion, user_profile_expansion
7 | from util.split import balance_dataset
8 |
9 |
10 | class SupervisedRecommender(ISeqRecommender):
11 | """
12 | Adapted from Zimdars, Andrew, David Maxwell Chickering, and Christopher Meek.
13 | "Using temporal data for making recommendations." In Proceedings of the Seventeenth conference
14 | on Uncertainty in artificial intelligence, pp. 580-588. Morgan Kaufmann Publishers Inc., 2001.
15 | """
16 |
17 | def __init__(self, history_length, classifier=DecisionTreeClassifier(), balance=True):
18 | """
19 | :param history_length: how many recent items to consider
20 | :param classifier: an instance of sklearn classifier (e.g. DecisionTreeClassifier, LogisticRegression)
21 |         :param balance: whether or not to balance the training data for each item
22 | :return:
23 | """
24 |
25 | super(SupervisedRecommender, self).__init__()
26 | self.classifier = classifier
27 | self.history_length = history_length
28 | self.balance = balance
29 |
30 | def fit(self, train_data):
31 | sequences = train_data['sequence'].values
32 |
33 | data, self.mapping = data_expansion(sequences, self.history_length)
34 | self.item_classifier = {}
35 | # for each column i.e. item, build a classifier
36 | with tqdm(total=len(self.mapping)) as pbar:
37 | for key, value in self.mapping.items():
38 | train, test = self._split_train_test(data, value, len(self.mapping))
39 | if self.balance:
40 | train, test = balance_dataset(train, test)
41 | self.item_classifier[key] = self.classifier.fit(train, test.toarray().ravel())
42 | # reset classifier
43 | self.classifier = clone(self.classifier)
44 | pbar.update(1)
45 |
46 | def recommend(self, user_profile, user_id=None):
47 | # print('recommending')
48 | data = user_profile_expansion(user_profile, self.history_length, self.mapping)
49 | recommendations = []
50 | for item, c in self.item_classifier.items():
51 | if c.predict(data) == [1]:
52 | recommendations.append(item)
53 | return [([x], 1 / len(recommendations)) for x in recommendations]
54 |
55 | def _split_train_test(self, data, col_index, n_unique_items):
56 | test = data[:, col_index]
57 | train = data[:, [x for x in range(data.shape[1]) if x >= n_unique_items]]
58 | return train, test
59 |
60 | def set_classifier(self, classifier):
61 | self.classifier = classifier
62 |
--------------------------------------------------------------------------------
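A minimal usage sketch for the recommender above (toy sequences; balance=False so that util.split.balance_dataset is not needed):

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

from recommenders.SupervisedRecommender import SupervisedRecommender

train_data = pd.DataFrame({
    'sequence': [['1', '2', '3'], ['2', '3', '4'], ['1', '2', '4']],
})

recommender = SupervisedRecommender(history_length=2,
                                    classifier=DecisionTreeClassifier(),
                                    balance=False)
recommender.fit(train_data)  # one binary classifier per item

print(recommender.recommend(user_profile=['1', '2']))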
/recommenders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/recommenders/__init__.py
--------------------------------------------------------------------------------
/slides/TheWebConf2019_01_Introduction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/slides/TheWebConf2019_01_Introduction.pdf
--------------------------------------------------------------------------------
/slides/TheWebConf2019_02_Algorithms.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/slides/TheWebConf2019_02_Algorithms.pdf
--------------------------------------------------------------------------------
/slides/TheWebConf2019_03_Evaluation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/slides/TheWebConf2019_03_Evaluation.pdf
--------------------------------------------------------------------------------
/spmf/spmf.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/spmf/spmf.jar
--------------------------------------------------------------------------------
/util/SPMFinterface.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | from datetime import datetime as dt
3 |
4 |
5 | def callSPMF(spmfPath, command):
6 | # java -jar spmf.jar run PrefixSpan contextPrefixSpan.txt output.txt 50%
7 | comm = ' '.join(['java -jar', spmfPath, 'run', command])
8 | print(comm)
9 | p = subprocess.Popen(comm,
10 | stdout=subprocess.PIPE,
11 | stderr=subprocess.STDOUT,
12 | shell=True)
13 | p.communicate() # wait for completion
14 |
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/__init__.py
--------------------------------------------------------------------------------
/util/data_expansion.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.sparse import csc_matrix
3 |
4 |
5 | def data_expansion(sequences, history_length):
6 | # sequences = [[1,2,3,4],[9,7,4],[3,2,1],[0,4,3,2]]
7 | # history_length = 3
8 |
9 | # store unique elements
10 | # mapping items to incremental integers
11 |
12 | count = 0
13 | items_mapping = {}
14 | for s in sequences:
15 | for i in s:
16 | if i in items_mapping: continue
17 | items_mapping[i] = count
18 | count += 1
19 |
20 | number_of_unique_items = len(items_mapping)
21 |
22 | row = 0
23 | row_indeces = []
24 | col_indeces = []
25 | # for each sequence
26 | for s in sequences:
27 | # for each item in the sequence
28 | cached = []
29 | for i, item in enumerate(s):
30 | index = items_mapping[item]
31 |
32 |             # in each row there will be: the target, the cache
33 | row_indeces += [row] * (1 + len(cached))
34 |
35 | # add data target
36 | col_indeces.append(index)
37 |
38 | # add history
39 | for l in range(1, history_length + 1):
40 | if i < l: continue # no history available that far
41 | row_indeces.append(row)
42 | l_th_previous_item = s[i - l]
43 | previous_el_index = items_mapping[l_th_previous_item]
44 | col_indeces.append(previous_el_index + number_of_unique_items * l)
45 |
46 | # add cache
47 | col_indeces += cached
48 | cached.append(index + number_of_unique_items * (history_length + 1))
49 | assert len(row_indeces) == len(col_indeces)
50 |
51 | row += 1
52 |
53 | return csc_matrix((np.ones(len(row_indeces), dtype=np.int8), (row_indeces, col_indeces)),
54 | shape=(row, (history_length + 2) * len(items_mapping))), items_mapping
55 |
56 |
57 | def user_profile_expansion(user_profile, history_length, items_mapping):
58 | number_of_unique_items = len(items_mapping)
59 |
60 | row_indeces = []
61 | col_indeces = []
62 |
63 | # for each item in the sequence
64 | cached = [items_mapping[x] + number_of_unique_items * (history_length) for x in user_profile]
65 | last = user_profile[len(user_profile) - 1]
66 | index = items_mapping[last]
67 |
68 |     # in each row there will be: the cache
69 | row_indeces += [0] * (len(cached))
70 |
71 | # add history
72 | for l in range(1, history_length + 1):
73 | if len(user_profile) < l: continue # no history available that far
74 | row_indeces.append(0)
75 | l_th_previous_item = user_profile[len(user_profile) - l]
76 | previous_el_index = items_mapping[l_th_previous_item]
77 | col_indeces.append(previous_el_index + number_of_unique_items * (l - 1))
78 |
79 | # add cache
80 | col_indeces += cached
81 |
82 | assert len(row_indeces) == len(col_indeces)
83 |
84 | return csc_matrix((np.ones(len(row_indeces)), (row_indeces, col_indeces)),
85 | shape=(1, (history_length + 1) * len(items_mapping)))
86 |
--------------------------------------------------------------------------------
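A small illustration of the expanded layout produced by data_expansion above: each event becomes one row with (history_length + 2) * n_items columns, laid out as [target one-hot | 1-step history | ... | history_length-step history | cache of items seen earlier in the sequence]. The sequences are made up:

from util.data_expansion import data_expansion

sequences = [['a', 'b', 'c'], ['b', 'c']]
data, mapping = data_expansion(sequences, history_length=2)

print(data.shape)  # (5, 12): 5 events, (2 + 2) * 3 unique items columns
print(mapping)     # {'a': 0, 'b': 1, 'c': 2}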
/util/data_utils.py:
--------------------------------------------------------------------------------
1 | import calendar
2 | import datetime
3 | import os
4 | import time
5 | from collections import Counter
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 |
11 | def create_seq_db_filter_top_k(path, topk=0, last_months=0):
12 | file = load_and_adapt(path, last_months=last_months)
13 |
14 | c = Counter(list(file['item_id']))
15 |
16 | if topk > 1:
17 | keeper = set([x[0] for x in c.most_common(topk)])
18 | file = file[file['item_id'].isin(keeper)]
19 |
20 | # group by session id and concat song_id
21 | groups = file.groupby('session_id')
22 |
23 | # convert item ids to string, then aggregate them to lists
24 | aggregated = groups['item_id'].agg({'sequence': lambda x: list(map(str, x))})
25 | init_ts = groups['ts'].min()
26 |     users = groups['user_id'].min()  # each session belongs to a single user, so min() just picks that user id quickly
27 |
28 | result = aggregated.join(init_ts).join(users)
29 | result.reset_index(inplace=True)
30 | return result
31 |
32 |
33 | def dataset_to_gru4rec_format(dataset):
34 | """
35 | Convert a list of sequences to GRU4Rec format.
36 | Based on this StackOverflow answer: https://stackoverflow.com/a/48532692
37 |
38 | :param dataset: the dataset to be transformed
39 | """
40 |
41 | lst_col = 'sequence'
42 | df = dataset.reset_index()
43 | unstacked = pd.DataFrame({
44 | col: np.repeat(df[col].values, df[lst_col].str.len()) for col in df.columns.drop(lst_col)}
45 | ).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]
46 | # ensure that events in the session have increasing timestamps
47 | unstacked['ts'] = unstacked['ts'] + unstacked.groupby('user_id').cumcount()
48 | unstacked.rename(columns={'sequence': 'item_id'}, inplace=True)
49 | return unstacked
50 |
51 |
52 | def sequences_to_spfm_format(sequences, tmp_path='tmp/sequences.txt'):
53 | """
54 | Convert a list of sequences to SPFM format and write them to `tmp_path`
55 | :param sequences: the list of sequences
56 | :param tmp_path: the path where sequences will be written in the SPFM format
57 | """
58 | basedir = os.path.split(tmp_path)[0]
59 | os.makedirs(basedir, exist_ok=True)
60 | with open(tmp_path, 'w') as fout:
61 | for s in sequences:
62 | fout.write(' -1 '.join(map(str, s)))
63 | fout.write(' -2\n')
64 |
65 |
66 | def load_and_adapt(path, last_months=0):
67 | file_ext = os.path.splitext(path)[-1]
68 | if file_ext == '.csv':
69 | data = pd.read_csv(path, header=0)
70 | elif file_ext == '.hdf':
71 | data = pd.read_hdf(path)
72 | else:
73 | raise ValueError('Unsupported file {} having extension {}'.format(path, file_ext))
74 |
75 | col_names = ['session_id', 'user_id', 'item_id', 'ts'] + data.columns.values.tolist()[4:]
76 | data.columns = col_names
77 |
78 | if last_months > 0:
79 | def add_months(sourcedate, months):
80 | month = sourcedate.month - 1 + months
81 | year = int(sourcedate.year + month / 12)
82 | month = month % 12 + 1
83 | day = min(sourcedate.day, calendar.monthrange(year, month)[1])
84 | return datetime.date(year, month, day)
85 |
86 | lastdate = datetime.datetime.fromtimestamp(data.ts.max())
87 | firstdate = add_months(lastdate, -last_months)
88 | initial_unix = time.mktime(firstdate.timetuple())
89 |
90 | # filter out older interactions
91 | data = data[data['ts'] >= initial_unix]
92 |
93 | return data
94 |
--------------------------------------------------------------------------------
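A small illustration of dataset_to_gru4rec_format above, which unrolls each sequence into one row per event and makes timestamps strictly increasing within a user's events; the toy data is made up:

import pandas as pd

from util.data_utils import dataset_to_gru4rec_format

dataset = pd.DataFrame({
    'session_id': [0, 1],
    'user_id': [1, 2],
    'ts': [100, 200],
    'sequence': [['10', '11'], ['12']],
})

unstacked = dataset_to_gru4rec_format(dataset)
print(unstacked[['session_id', 'user_id', 'item_id', 'ts']])
# session 0 gets item '10' at ts=100 and item '11' at ts=101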
/util/evaluation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from tqdm import tqdm
3 |
4 |
5 | def sequential_evaluation(recommender,
6 | test_sequences,
7 | evaluation_functions,
8 | users=None,
9 | given_k=1,
10 | look_ahead=1,
11 | top_n=10,
12 | scroll=True,
13 | step=1):
14 | """
15 | Runs sequential evaluation of a recommender over a set of test sequences
16 | :param recommender: the instance of the recommender to test
17 | :param test_sequences: the set of test sequences
18 | :param evaluation_functions: list of evaluation metric functions
19 | :param users: (optional) the list of user ids associated to each test sequence. Required by personalized models like FPMC.
20 | :param given_k: (optional) the initial size of each user profile, starting from the first interaction in the sequence.
21 | If <0, start counting from the end of the sequence. It must be != 0.
22 | :param look_ahead: (optional) number of subsequent interactions in the sequence to be considered as ground truth.
23 | It can be any positive number or 'all' to extend the ground truth until the end of the sequence.
24 | :param top_n: (optional) size of the recommendation list
25 | :param scroll: (optional) whether to scroll the ground truth until the end of the sequence.
26 |         If True, expand the user profile and move the ground truth forward by `step` interactions. Recompute and evaluate recommendations every time.
27 | If False, evaluate recommendations once per sequence without expanding the user profile.
28 | :param step: (optional) number of interactions that will be added to the user profile at each step of the sequential evaluation.
29 | :return: the list of the average values for each evaluation metric
30 | """
31 | if given_k == 0:
32 | raise ValueError('given_k must be != 0')
33 |
34 | metrics = np.zeros(len(evaluation_functions))
35 | with tqdm(total=len(test_sequences)) as pbar:
36 | for i, test_seq in enumerate(test_sequences):
37 | if users is not None:
38 | user = users[i]
39 | else:
40 | user = None
41 | if scroll:
42 | metrics += sequence_sequential_evaluation(recommender,
43 | test_seq,
44 | evaluation_functions,
45 | user,
46 | given_k,
47 | look_ahead,
48 | top_n,
49 | step)
50 | else:
51 | metrics += evaluate_sequence(recommender,
52 | test_seq,
53 | evaluation_functions,
54 | user,
55 | given_k,
56 | look_ahead,
57 | top_n)
58 | pbar.update(1)
59 | return metrics / len(test_sequences)
60 |
61 |
62 | def evaluate_sequence(recommender, seq, evaluation_functions, user, given_k, look_ahead, top_n):
63 | """
64 |     :param recommender: which recommender to use
65 |     :param seq: the user profile / context
66 |     :param given_k: the number of initial elements of the sequence used as the user profile (the rest is ground truth). NB: if <0 it counts from the end of the sequence
67 |     :param evaluation_functions: the functions used to evaluate the recommendation performance
68 |     :param look_ahead: number of ground truth elements to consider. If look_ahead = 'all', the whole remaining sequence is used as ground truth
69 |     :return: the performance of the recommender
70 | """
71 | # safety checks
72 | if given_k < 0:
73 | given_k = len(seq) + given_k
74 |
75 | user_profile = seq[:given_k]
76 | ground_truth = seq[given_k:]
77 |
78 | # restrict ground truth to look_ahead
79 | ground_truth = ground_truth[:look_ahead] if look_ahead != 'all' else ground_truth
80 | ground_truth = list(map(lambda x: [x], ground_truth)) # list of list format
81 |
82 | if not user_profile or not ground_truth:
83 |         # if either of the two is missing, all evaluation metrics are 0
84 | return np.zeros(len(evaluation_functions))
85 |
86 | r = recommender.recommend(user_profile, user)[:top_n]
87 |
88 | if not r:
89 | # no recommendation found
90 | return np.zeros(len(evaluation_functions))
91 | reco_list = recommender.get_recommendation_list(r)
92 |
93 | tmp_results = []
94 | for f in evaluation_functions:
95 | tmp_results.append(f(ground_truth, reco_list))
96 | return np.array(tmp_results)
97 |
98 |
99 | def sequence_sequential_evaluation(recommender, seq, evaluation_functions, user, given_k, look_ahead, top_n, step):
100 | if given_k < 0:
101 | given_k = len(seq) + given_k
102 |
103 | eval_res = 0.0
104 | eval_cnt = 0
105 | for gk in range(given_k, len(seq), step):
106 | eval_res += evaluate_sequence(recommender, seq, evaluation_functions, user, gk, look_ahead, top_n)
107 | eval_cnt += 1
108 | return eval_res / eval_cnt
109 |
--------------------------------------------------------------------------------
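A minimal end-to-end sketch of sequential_evaluation above, using the popularity baseline and a hypothetical metric with the f(ground_truth, recommendations) signature expected by evaluate_sequence (the repository's util.metrics module provides real metrics; the toy data is made up):

import pandas as pd

from recommenders.PopularityRecommender import PopularityRecommender
from util import evaluation

train_data = pd.DataFrame({'sequence': [['1', '2', '3'], ['2', '3'], ['1', '3', '4']]})
test_sequences = [['1', '2', '3'], ['3', '4']]

recommender = PopularityRecommender()
recommender.fit(train_data)

def hit_rate(ground_truth, recommendations):
    # fraction of ground-truth items that appear in the recommendation list
    return sum(1 for g in ground_truth if g in recommendations) / len(ground_truth)

results = evaluation.sequential_evaluation(
    recommender,
    test_sequences=test_sequences,
    evaluation_functions=[hit_rate],
    given_k=1,      # reveal the first interaction of each test sequence
    look_ahead=1,   # evaluate against the single next interaction
    top_n=10,
    scroll=True,    # then scroll the revealed profile forward `step` interactions at a time
    step=1)
print(results)      # array with the average value of each metric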
/util/fpmc/FPMC.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pickle
3 | import random
4 |
5 | from util.fpmc.utils import *
6 |
7 |
8 | class FPMC:
9 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10 | logger = logging.getLogger()
11 |
12 | def __init__(self, n_user, n_item, n_factor, learn_rate, regular):
13 | self.user_set = set()
14 | self.item_set = set()
15 |
16 | self.n_user = n_user
17 | self.n_item = n_item
18 |
19 | self.n_factor = n_factor
20 | self.learn_rate = learn_rate
21 | self.regular = regular
22 |
23 | @staticmethod
24 | def dump(fpmcObj, fname):
25 | pickle.dump(fpmcObj, open(fname, 'wb'))
26 |
27 | @staticmethod
28 | def load(fname):
29 | return pickle.load(open(fname, 'rb'))
30 |
31 | def init_model(self, std=0.01):
32 | self.VUI = np.random.normal(0, std, size=(self.n_user, self.n_factor))
33 | self.VIU = np.random.normal(0, std, size=(self.n_item, self.n_factor))
34 | self.VIL = np.random.normal(0, std, size=(self.n_item, self.n_factor))
35 | self.VLI = np.random.normal(0, std, size=(self.n_item, self.n_factor))
36 | self.VUI_m_VIU = np.dot(self.VUI, self.VIU.T)
37 | self.VIL_m_VLI = np.dot(self.VIL, self.VLI.T)
38 |
39 | def compute_x(self, u, i, b_tm1):
40 | acc_val = 0.0
41 | for l in b_tm1:
42 | acc_val += np.dot(self.VIL[i], self.VLI[l])
43 | return (np.dot(self.VUI[u], self.VIU[i]) + (acc_val / len(b_tm1)))
44 |
45 | def compute_x_batch(self, u, b_tm1):
46 | former = self.VUI_m_VIU[u]
47 | latter = np.mean(self.VIL_m_VLI[:, b_tm1], axis=1).T
48 | return (former + latter)
49 |
50 | def evaluation(self, data_list):
51 | np.dot(self.VUI, self.VIU.T, out=self.VUI_m_VIU)
52 | np.dot(self.VIL, self.VLI.T, out=self.VIL_m_VLI)
53 |
54 | correct_count = 0
55 | rr_list = []
56 | for (u, i, b_tm1) in data_list:
57 | scores = self.compute_x_batch(u, b_tm1)
58 |
59 | if i == scores.argmax():
60 | correct_count += 1
61 |
62 | rank = len(np.where(scores > scores[i])[0]) + 1
63 | rr = 1.0 / rank
64 | rr_list.append(rr)
65 |
66 | try:
67 | acc = correct_count / len(rr_list)
68 | mrr = (sum(rr_list) / len(rr_list))
69 | return (acc, mrr)
70 |         except ZeroDivisionError:
71 | return (0.0, 0.0)
72 |
73 | def learn_epoch(self, tr_data, neg_batch_size):
74 | for iter_idx in range(len(tr_data)):
75 | (u, i, b_tm1) = random.choice(tr_data)
76 |
77 | exclu_set = self.item_set - set([i])
78 | j_list = random.sample(exclu_set, neg_batch_size)
79 |
80 | z1 = self.compute_x(u, i, b_tm1)
81 | for j in j_list:
82 | z2 = self.compute_x(u, j, b_tm1)
83 | delta = 1 - sigmoid(z1 - z2)
84 |
85 | VUI_update = self.learn_rate * (delta * (self.VIU[i] - self.VIU[j]) - self.regular * self.VUI[u])
86 | VIUi_update = self.learn_rate * (delta * self.VUI[u] - self.regular * self.VIU[i])
87 | VIUj_update = self.learn_rate * (-delta * self.VUI[u] - self.regular * self.VIU[j])
88 |
89 | self.VUI[u] += VUI_update
90 | self.VIU[i] += VIUi_update
91 | self.VIU[j] += VIUj_update
92 |
93 | eta = np.mean(self.VLI[b_tm1], axis=0)
94 | VILi_update = self.learn_rate * (delta * eta - self.regular * self.VIL[i])
95 | VILj_update = self.learn_rate * (-delta * eta - self.regular * self.VIL[j])
96 | VLI_update = self.learn_rate * (
97 | (delta * (self.VIL[i] - self.VIL[j]) / len(b_tm1)) - self.regular * self.VLI[b_tm1])
98 |
99 | self.VIL[i] += VILi_update
100 | self.VIL[j] += VILj_update
101 | self.VLI[b_tm1] += VLI_update
102 |
103 | def learnSBPR_FPMC(self, tr_data, n_epoch=10, neg_batch_size=10):
104 | for epoch in range(n_epoch):
105 | self.learn_epoch(tr_data, neg_batch_size=neg_batch_size)
106 | self.logger.info('epoch %d done' % epoch)
107 | # if eval_per_epoch == True:
108 | # acc_in, mrr_in = self.evaluation(tr_data)
109 | # if te_data != None:
110 | # acc_out, mrr_out = self.evaluation(te_data)
111 | # self.logger.info ('In sample:%.4f\t%.4f \t Out sample:%.4f\t%.4f' % (acc_in, mrr_in, acc_out, mrr_out))
112 | # else:
113 | # self.logger.info ('In sample:%.4f\t%.4f' % (acc_in, mrr_in))
114 | # else:
115 | #
116 |
117 | # if eval_per_epoch == False:
118 | # acc_in, mrr_in = self.evaluation(tr_data)
119 | # if te_data != None:
120 | # acc_out, mrr_out = self.evaluation(te_data)
121 | # print ('In sample:%.4f\t%.4f \t Out sample:%.4f\t%.4f' % (acc_in, mrr_in, acc_out, mrr_out))
122 | # else:
123 | # print ('In sample:%.4f\t%.4f' % (acc_in, mrr_in))
124 | #
125 | # if te_data != None:
126 | # return (acc_out, mrr_out)
127 | # else:
128 | # return None
129 |
--------------------------------------------------------------------------------
/util/fpmc/FPMC_numba.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from numba import jit
4 |
5 | from util.fpmc import FPMC as FPMC_basic
6 | from util.fpmc.utils import *
7 |
8 |
9 | class FPMC(FPMC_basic.FPMC):
10 |
11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
12 | logger = logging.getLogger()
13 |
14 | def __init__(self, n_user, n_item, n_factor, learn_rate, regular):
15 | super(FPMC, self).__init__(n_user, n_item, n_factor, learn_rate, regular)
16 |
17 | def evaluation(self, data_3_list):
18 | np.dot(self.VUI, self.VIU.T, out=self.VUI_m_VIU)
19 | np.dot(self.VIL, self.VLI.T, out=self.VIL_m_VLI)
20 | acc, mrr = evaluation_jit(data_3_list[0], data_3_list[1], data_3_list[2], self.VUI_m_VIU, self.VIL_m_VLI)
21 |
22 | return acc, mrr
23 |
24 | def evaluation_recommender(self, user, user_profile):
25 | np.dot(self.VUI, self.VIU.T, out=self.VUI_m_VIU)
26 | np.dot(self.VIL, self.VLI.T, out=self.VIL_m_VLI)
27 | scores = evaluation_jit_recommender(user, user_profile, self.VUI_m_VIU, self.VIL_m_VLI)
28 | return sorted(range(len(scores)), key=lambda x: -scores[x]), sorted(scores, reverse=True)
29 |
30 | def learn_epoch(self, data_3_list, neg_batch_size):
31 | VUI, VIU, VLI, VIL = learn_epoch_jit(data_3_list[0], data_3_list[1], data_3_list[2], neg_batch_size,
32 | np.array(list(self.item_set)), self.VUI, self.VIU, self.VLI, self.VIL,
33 | self.learn_rate, self.regular)
34 | self.VUI = VUI
35 | self.VIU = VIU
36 | self.VLI = VLI
37 | self.VIL = VIL
38 |
39 | def learnSBPR_FPMC(self, tr_data, n_epoch=10, neg_batch_size=10):
40 | tr_3_list = data_to_3_list(tr_data)
41 |
42 | for epoch in range(n_epoch):
43 | self.learn_epoch(tr_3_list, neg_batch_size)
44 | self.logger.info('epoch %d done' % epoch)
45 |
46 | # if eval_per_epoch == False:
47 | # acc_in, mrr_in = self.evaluation(tr_3_list)
48 | # if te_data != None:
49 | # acc_out, mrr_out = self.evaluation(te_3_list)
50 | # print ('In sample:%.4f\t%.4f \t Out sample:%.4f\t%.4f' % (acc_in, mrr_in, acc_out, mrr_out))
51 | # else:
52 | # print ('In sample:%.4f\t%.4f' % (acc_in, mrr_in))
53 | #
54 | #
55 | # if te_data != None:
56 | # if ret_in_score:
57 | # return (acc_in, mrr_in, acc_out, mrr_out)
58 | # else:
59 | # return (acc_out, mrr_out)
60 | # else:
61 | # return None
62 |
63 |
64 | @jit(nopython=True)
65 | def compute_x_jit(u, i, b_tm1, VUI, VIU, VLI, VIL):
66 | acc_val = 0.0
67 | for l in b_tm1:
68 | acc_val += np.dot(VIL[i], VLI[l])
69 | return (np.dot(VUI[u], VIU[i]) + (acc_val / len(b_tm1)))
70 |
71 |
72 | @jit(nopython=True)
73 | def learn_epoch_jit(u_list, i_list, b_tm1_list, neg_batch_size, item_set, VUI, VIU, VLI, VIL, learn_rate, regular):
74 | for iter_idx in range(len(u_list)):
75 | d_idx = np.random.randint(0, len(u_list))
76 | u = u_list[d_idx]
77 | i = i_list[d_idx]
78 | b_tm1 = b_tm1_list[d_idx][b_tm1_list[d_idx] != -1]
79 |
80 | j_list = np.random.choice(item_set, size=neg_batch_size, replace=False)
81 | z1 = compute_x_jit(u, i, b_tm1, VUI, VIU, VLI, VIL)
82 | for j in j_list:
83 | z2 = compute_x_jit(u, j, b_tm1, VUI, VIU, VLI, VIL)
84 | delta = 1 - sigmoid_jit(z1 - z2)
85 |
86 | VUI_update = learn_rate * (delta * (VIU[i] - VIU[j]) - regular * VUI[u])
87 | VIUi_update = learn_rate * (delta * VUI[u] - regular * VIU[i])
88 | VIUj_update = learn_rate * (-delta * VUI[u] - regular * VIU[j])
89 |
90 | VUI[u] += VUI_update
91 | VIU[i] += VIUi_update
92 | VIU[j] += VIUj_update
93 |
94 | eta = np.zeros(VLI.shape[1])
95 | for l in b_tm1:
96 | eta += VLI[l]
97 | eta = eta / len(b_tm1)
98 |
99 | VILi_update = learn_rate * (delta * eta - regular * VIL[i])
100 | VILj_update = learn_rate * (-delta * eta - regular * VIL[j])
101 | VLI_updates = np.zeros((len(b_tm1), VLI.shape[1]))
102 | for idx, l in enumerate(b_tm1):
103 | VLI_updates[idx] = learn_rate * ((delta * (VIL[i] - VIL[j]) / len(b_tm1)) - regular * VLI[l])
104 |
105 | VIL[i] += VILi_update
106 | VIL[j] += VILj_update
107 | for idx, l in enumerate(b_tm1):
108 | VLI[l] += VLI_updates[idx]
109 |
110 | return VUI, VIU, VLI, VIL
111 |
112 |
113 | @jit(nopython=True)
114 | def sigmoid_jit(x):
115 | if x >= 0:
116 | return math.exp(-np.logaddexp(0, -x))
117 | else:
118 | return math.exp(x - np.logaddexp(x, 0))
119 |
120 |
121 | @jit(nopython=True)
122 | def compute_x_batch_jit(u, b_tm1, VUI_m_VIU, VIL_m_VLI):
123 | former = VUI_m_VIU[u]
124 | latter = np.zeros(VIL_m_VLI.shape[0])
125 | for idx in range(VIL_m_VLI.shape[0]):
126 | for l in b_tm1:
127 | latter[idx] += VIL_m_VLI[idx, l]
128 | latter = latter / len(b_tm1)
129 |
130 | return (former + latter)
131 |
132 |
133 | @jit(nopython=True)
134 | def evaluation_jit(u_list, i_list, b_tm1_list, VUI_m_VIU, VIL_m_VLI):
135 | correct_count = 0
136 | acc_rr = 0
137 | for d_idx in range(len(u_list)):
138 | u = u_list[d_idx]
139 | i = i_list[d_idx]
140 | b_tm1 = b_tm1_list[d_idx][b_tm1_list[d_idx] != -1]
141 | scores = compute_x_batch_jit(u, b_tm1, VUI_m_VIU, VIL_m_VLI)
142 |
143 | if i == scores.argmax():
144 | correct_count += 1
145 |
146 | rank = len(np.where(scores > scores[i])[0]) + 1
147 | rr = 1.0 / rank
148 | acc_rr += rr
149 |
150 | acc = correct_count / len(u_list)
151 | mrr = acc_rr / len(u_list)
152 | return (acc, mrr)
153 |
154 |
155 | @jit(nopython=True)
156 | def evaluation_jit_recommender(user, b_tm1_list, VUI_m_VIU, VIL_m_VLI):
157 | u = user
158 | # b_tm1 = [x for x in b_tm1_list if x!=-1]
159 | b_tm1 = b_tm1_list
160 | scores = compute_x_batch_jit(u, b_tm1, VUI_m_VIU, VIL_m_VLI)
161 |
162 | return scores
163 |
--------------------------------------------------------------------------------
/util/fpmc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/fpmc/__init__.py
--------------------------------------------------------------------------------
/util/fpmc/utils.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import math
3 |
4 | import numpy as np
5 |
6 |
7 | def sigmoid(x):
8 | if x >= 0:
9 | return math.exp(-np.logaddexp(0, -x))
10 | else:
11 | return math.exp(x - np.logaddexp(x, 0))
12 |
13 |
14 | def load_data_from_dir(dirname):
15 | fname_user_idxseq = dirname + '/' + 'idxseq.txt'
16 | fname_user_list = dirname + '/' + 'user_idx_list.txt'
17 | fname_item_list = dirname + '/' + 'item_idx_list.txt'
18 | user_set = load_idx_list_file(fname_user_list)
19 | item_set = load_idx_list_file(fname_item_list)
20 |
21 | data_list = []
22 | with open(fname_user_idxseq, 'r') as f:
23 | for l in f:
24 | l = [int(s) for s in l.strip().split()]
25 | user = l[0]
26 | b_tm1 = list(set(l[1:-1]))
27 | label = l[-1]
28 |
29 | data_list.append((user, label, b_tm1))
30 |
31 | return data_list, user_set, item_set
32 |
33 |
34 | def load_idx_list_file(fname, delimiter=','):
35 | idx_set = set()
36 | with open(fname, 'r') as f:
37 |         # discard the header line
38 | f.readline()
39 |
40 | for l in csv.reader(f, delimiter=delimiter, quotechar='"'):
41 | idx = int(l[0])
42 | idx_set.add(idx)
43 | return idx_set
44 |
45 |
46 | def data_to_3_list(data_list):
47 | u_list = []
48 | i_list = []
49 | b_tm1_list = []
50 | max_l = 0
51 | for d in data_list:
52 | u_list.append(d[0])
53 | i_list.append(d[1])
54 | b_tm1_list.append(d[2])
55 | if len(d[2]) > max_l:
56 | max_l = len(d[2])
57 | for b_tm1 in b_tm1_list:
58 | b_tm1.extend([-1 for i in range(max_l - len(b_tm1))])
59 | b_tm1_list = np.array(b_tm1_list)
60 |
61 | return (u_list, i_list, b_tm1_list)
62 |
--------------------------------------------------------------------------------
/util/knn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/knn/__init__.py
--------------------------------------------------------------------------------
/util/knn/iknn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jun 26 11:57:27 2015
4 | @author: Balázs Hidasi
5 | """
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 |
11 | class ItemKNN:
12 | '''
13 | ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time')
14 |
15 |     Item-to-item predictor that computes the similarity of all items to the given item.
16 |
17 | Similarity of two items is given by:
18 |
19 | .. math::
20 | s_{i,j}=\sum_{s}I\{(s,i)\in D & (s,j)\in D\} / (supp_i+\\lambda)^{\\alpha}(supp_j+\\lambda)^{1-\\alpha}
21 |
22 | Parameters
23 | --------
24 | n_sims : int
25 |         Only give back non-zero scores to the N most similar items. Should be higher than or equal to the cut-off of your evaluation. (Default value: 100)
26 | lmbd : float
27 | Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20)
28 | alpha : float
29 | Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules).
30 | session_key : string
31 | header of the session ID column in the input file (default: 'SessionId')
32 | item_key : string
33 | header of the item ID column in the input file (default: 'ItemId')
34 | time_key : string
35 | header of the timestamp column in the input file (default: 'Time')
36 |
37 | '''
38 |
39 | def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'):
40 | self.n_sims = n_sims
41 | self.lmbd = lmbd
42 | self.alpha = alpha
43 | self.item_key = item_key
44 | self.session_key = session_key
45 | self.time_key = time_key
46 |
47 | def fit(self, data):
48 | '''
49 | Trains the predictor.
50 |
51 | Parameters
52 | --------
53 | data: pandas.DataFrame
54 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
55 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
56 |
57 | '''
58 | data.set_index(np.arange(len(data)), inplace=True)
59 | self.itemids = data[self.item_key].unique()
60 | n_items = len(self.itemids)
61 | data = pd.merge(data, pd.DataFrame({self.item_key: self.itemids, 'ItemIdx': np.arange(len(self.itemids))}),
62 | on=self.item_key, how='inner')
63 | sessionids = data[self.session_key].unique()
64 | data = pd.merge(data, pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))}),
65 | on=self.session_key, how='inner')
66 | supp = data.groupby('SessionIdx').size()
67 | session_offsets = np.zeros(len(supp) + 1, dtype=np.int32)
68 | session_offsets[1:] = supp.cumsum()
69 | index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values
70 | supp = data.groupby('ItemIdx').size()
71 | item_offsets = np.zeros(n_items + 1, dtype=np.int32)
72 | item_offsets[1:] = supp.cumsum()
73 | index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values
74 | self.sims = dict()
75 | for i in range(n_items):
76 | iarray = np.zeros(n_items)
77 | start = item_offsets[i]
78 | end = item_offsets[i + 1]
79 | for e in index_by_items[start:end]:
80 | uidx = data.SessionIdx.values[e]
81 | ustart = session_offsets[uidx]
82 | uend = session_offsets[uidx + 1]
83 | user_events = index_by_sessions[ustart:uend]
84 | iarray[data.ItemIdx.values[user_events]] += 1
85 | iarray[i] = 0
86 | norm = np.power((supp[i] + self.lmbd), self.alpha) * np.power((supp.values + self.lmbd), (1.0 - self.alpha))
87 | norm[norm == 0] = 1
88 | iarray = iarray / norm
89 | indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1]
90 | self.sims[self.itemids[i]] = pd.Series(data=iarray[indices], index=self.itemids[indices])
91 |
92 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
93 | '''
94 |         Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
95 |
96 | Parameters
97 | --------
98 | session_id : int or string
99 |             The session ID of the event.
100 | input_item_id : int or string
101 | The item ID of the event. Must be in the set of item IDs of the training set.
102 | predict_for_item_ids : 1D array
103 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
104 |
105 | Returns
106 | --------
107 | out : pandas.Series
108 |             Prediction scores for the selected items, indicating how likely each is to be the next item of this session. Indexed by the item IDs.
109 |
110 | '''
111 | if predict_for_item_ids is None:
112 | predict_for_item_ids = self.itemids
113 | preds = np.zeros(len(predict_for_item_ids))
114 | sim_list = self.sims[input_item_id]
115 | mask = np.in1d(predict_for_item_ids, sim_list.index)
116 | preds[mask] = sim_list[predict_for_item_ids[mask]]
117 | return pd.Series(data=preds, index=predict_for_item_ids)
118 |
--------------------------------------------------------------------------------
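A minimal usage sketch for the ItemKNN predictor listed above; the toy DataFrame and the parameter values are illustrative assumptions, and the repository root is assumed to be importable:

    import pandas as pd
    from util.knn.iknn import ItemKNN

    # two toy sessions that share item 1
    train = pd.DataFrame({'SessionId': [1, 1, 1, 2, 2],
                          'ItemId':    [1, 2, 3, 1, 4],
                          'Time':      [0, 1, 2, 3, 4]})

    model = ItemKNN(n_sims=10, lmbd=20, alpha=0.5)
    model.fit(train)

    # co-occurrence based scores for the items most likely to follow item 1
    print(model.predict_next(session_id=1, input_item_id=1).sort_values(ascending=False).head())
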
/util/knn/sfsknn.py:
--------------------------------------------------------------------------------
1 | from operator import itemgetter
2 | from math import sqrt
3 | import random
4 | import time
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 |
10 | class SeqFilterSessionKNN:
11 | '''
12 |     SeqFilterSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0, session_key='SessionId', item_key='ItemId')
13 |
14 | Parameters
15 | -----------
16 | k : int
17 | Number of neighboring session to calculate the item scores from. (Default value: 100)
18 | sample_size : int
19 |         Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 1000)
20 | sampling : string
21 | String to define the sampling method for sessions (recent, random). (default: recent)
22 | similarity : string
23 | String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard)
24 | remind : bool
25 | Should the last items of the current session be boosted to the top as reminders
26 | pop_boost : int
27 | Push popular items in the neighbor sessions by this factor. (default: 0 to leave out)
28 | extend : bool
29 | Add evaluated sessions to the maps
30 | normalize : bool
31 | Normalize the scores in the end
32 | session_key : string
33 | Header of the session ID column in the input file. (default: 'SessionId')
34 | item_key : string
35 | Header of the item ID column in the input file. (default: 'ItemId')
36 | time_key : string
37 | Header of the timestamp column in the input file. (default: 'Time')
38 | '''
39 |
40 | def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0,
41 | extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'):
42 |
43 | self.remind = remind
44 | self.k = k
45 | self.sample_size = sample_size
46 | self.sampling = sampling
47 | self.similarity = similarity
48 | self.pop_boost = pop_boost
49 | self.session_key = session_key
50 | self.item_key = item_key
51 | self.time_key = time_key
52 | self.extend = extend
53 | self.normalize = normalize
54 |
55 | # updated while recommending
56 | self.session = -1
57 | self.session_items = []
58 | self.relevant_sessions = set()
59 |
60 | # cache relations once at startup
61 | self.session_item_map = dict()
62 | self.item_session_map = dict()
63 | self.session_time = dict()
64 | self.followed_by = dict()
65 |
66 | self.sim_time = 0
67 |
68 | def fit(self, train, items=None):
69 | '''
70 | Trains the predictor.
71 |
72 | Parameters
73 | --------
74 | data: pandas.DataFrame
75 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
76 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
77 |
78 | '''
79 |
80 | index_session = train.columns.get_loc(self.session_key)
81 | index_item = train.columns.get_loc(self.item_key)
82 | index_time = train.columns.get_loc(self.time_key)
83 | self.itemids = train[self.item_key].unique()
84 |
85 | session = -1
86 | session_items = set()
87 | last_item = -1
88 | time = -1
89 | # cnt = 0
90 | for row in train.itertuples(index=False):
91 | # cache items of sessions
92 | if row[index_session] != session:
93 | if len(session_items) > 0:
94 | self.session_item_map.update({session: session_items})
95 | # cache the last time stamp of the session
96 | self.session_time.update({session: time})
97 | session = row[index_session]
98 | session_items = set()
99 | else:
100 | if last_item != -1: # fill followed by map for filtering of candidate items
101 | if not last_item in self.followed_by:
102 | self.followed_by[last_item] = set()
103 | self.followed_by[last_item].add(row[index_item])
104 |
105 | time = row[index_time]
106 | session_items.add(row[index_item])
107 |
108 | # cache sessions involving an item
109 | map_is = self.item_session_map.get(row[index_item])
110 | if map_is is None:
111 | map_is = set()
112 | self.item_session_map.update({row[index_item]: map_is})
113 | map_is.add(row[index_session])
114 |
115 | last_item = row[index_item]
116 |
117 | # Add the last tuple
118 | self.session_item_map.update({session: session_items})
119 | self.session_time.update({session: time})
120 |
121 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
122 | '''
123 |         Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
124 |
125 | Parameters
126 | --------
127 | session_id : int or string
128 |             The session ID of the event.
129 | input_item_id : int or string
130 | The item ID of the event. Must be in the set of item IDs of the training set.
131 | predict_for_item_ids : 1D array
132 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
133 |
134 | Returns
135 | --------
136 | out : pandas.Series
137 |             Prediction scores for the selected items, indicating how likely each is to be the next item of this session. Indexed by the item IDs.
138 |
139 | '''
140 |
141 | # gc.collect()
142 | # process = psutil.Process(os.getpid())
143 | # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
144 |
145 | if (self.session != session_id): # new session
146 |
147 | if (self.extend):
148 | item_set = set(self.session_items)
149 | self.session_item_map[self.session] = item_set;
150 | for item in item_set:
151 | map_is = self.item_session_map.get(item)
152 | if map_is is None:
153 | map_is = set()
154 | self.item_session_map.update({item: map_is})
155 | map_is.add(self.session)
156 |
157 | ts = time.time()
158 | self.session_time.update({self.session: ts})
159 |
160 | last_item = -1
161 | for item in self.session_items:
162 | if last_item != -1:
163 | if not last_item in self.followed_by:
164 | self.followed_by[last_item] = set()
165 | self.followed_by[last_item].add(item)
166 | last_item = item
167 |
168 | self.session = session_id
169 | self.session_items = list()
170 | self.relevant_sessions = set()
171 |
172 | if type == 'view':
173 | self.session_items.append(input_item_id)
174 |
175 | if skip:
176 | return
177 |
178 | neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id)
179 | scores = self.score_items(neighbors, input_item_id)
180 |
181 | # add some reminders
182 | if self.remind:
183 |
184 | reminderScore = 5
185 | takeLastN = 3
186 |
187 | cnt = 0
188 | for elem in self.session_items[-takeLastN:]:
189 | cnt = cnt + 1
190 | # reminderScore = reminderScore + (cnt/100)
191 |
192 | oldScore = scores.get(elem)
193 | newScore = 0
194 | if oldScore is None:
195 | newScore = reminderScore
196 | else:
197 | newScore = oldScore + reminderScore
198 | # print 'old score ', oldScore
199 | # update the score and add a small number for the position
200 | newScore = (newScore * reminderScore) + (cnt / 100)
201 |
202 | scores.update({elem: newScore})
203 |
204 | # push popular ones
205 | if self.pop_boost > 0:
206 |
207 | pop = self.item_pop(neighbors)
208 | # Iterate over the item neighbors
209 | # print itemScores
210 | for key in scores:
211 | item_pop = pop.get(key)
212 | # Gives some minimal MRR boost?
213 | scores.update({key: (scores[key] + (self.pop_boost * item_pop))})
214 |
215 | # Create things in the format ..
216 | if predict_for_item_ids is None:
217 | predict_for_item_ids = self.itemids
218 | predictions = np.zeros(len(predict_for_item_ids))
219 | mask = np.in1d(predict_for_item_ids, list(scores.keys()))
220 |
221 | items = predict_for_item_ids[mask]
222 | values = [scores[x] for x in items]
223 | predictions[mask] = values
224 | series = pd.Series(data=predictions, index=predict_for_item_ids)
225 |
226 | if self.normalize:
227 | series = series / series.max()
228 |
229 | return series
230 |
231 | def item_pop(self, sessions):
232 | '''
233 | Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids)
234 |
235 | Parameters
236 | --------
237 | sessions: set
238 |
239 | Returns
240 | --------
241 | out : dict
242 | '''
243 | result = dict()
244 | max_pop = 0
245 | for session, weight in sessions:
246 | items = self.items_for_session(session)
247 | for item in items:
248 |
249 | count = result.get(item)
250 | if count is None:
251 | result.update({item: 1})
252 | else:
253 | result.update({item: count + 1})
254 |
255 | if (result.get(item) > max_pop):
256 | max_pop = result.get(item)
257 |
258 | for key in result:
259 | result.update({key: (result[key] / max_pop)})
260 |
261 | return result
262 |
263 | def jaccard(self, first, second):
264 | '''
265 | Calculates the jaccard index for two sessions
266 |
267 | Parameters
268 | --------
269 | first: Id of a session
270 | second: Id of a session
271 |
272 | Returns
273 | --------
274 | out : float value
275 | '''
276 |         sc = time.perf_counter()
277 | intersection = len(first & second)
278 | union = len(first | second)
279 | res = intersection / union
280 |
281 |         self.sim_time += (time.perf_counter() - sc)
282 |
283 | return res
284 |
285 | def cosine(self, first, second):
286 | '''
287 | Calculates the cosine similarity for two sessions
288 |
289 | Parameters
290 | --------
291 | first: Id of a session
292 | second: Id of a session
293 |
294 | Returns
295 | --------
296 | out : float value
297 | '''
298 | li = len(first & second)
299 | la = len(first)
300 | lb = len(second)
301 |         result = li / (sqrt(la) * sqrt(lb))
302 |
303 | return result
304 |
305 | def tanimoto(self, first, second):
306 | '''
307 |         Calculates the Tanimoto similarity for two sessions
308 |
309 | Parameters
310 | --------
311 | first: Id of a session
312 | second: Id of a session
313 |
314 | Returns
315 | --------
316 | out : float value
317 | '''
318 | li = len(first & second)
319 | la = len(first)
320 | lb = len(second)
321 | result = li / (la + lb - li)
322 |
323 | return result
324 |
325 | def binary(self, first, second):
326 | '''
327 |         Calculates the binary similarity (the 'binary' option) for two sessions
328 |
329 | Parameters
330 | --------
331 | first: Id of a session
332 | second: Id of a session
333 |
334 | Returns
335 | --------
336 | out : float value
337 | '''
338 | a = len(first & second)
339 | b = len(first)
340 | c = len(second)
341 |
342 | result = (2 * a) / ((2 * a) + b + c)
343 |
344 | return result
345 |
346 | def items_for_session(self, session):
347 | '''
348 | Returns all items in the session
349 |
350 | Parameters
351 | --------
352 | session: Id of a session
353 |
354 | Returns
355 | --------
356 | out : set
357 | '''
358 | return self.session_item_map.get(session);
359 |
360 | def sessions_for_item(self, item_id):
361 | '''
362 |         Returns all sessions for an item
363 |
364 | Parameters
365 | --------
366 | item: Id of the item session
367 |
368 | Returns
369 | --------
370 | out : set
371 | '''
372 | return self.item_session_map.get(item_id)
373 |
374 | def most_recent_sessions(self, sessions, number):
375 | '''
376 | Find the most recent sessions in the given set
377 |
378 | Parameters
379 | --------
380 | sessions: set of session ids
381 |
382 | Returns
383 | --------
384 | out : set
385 | '''
386 | sample = set()
387 |
388 | tuples = list()
389 | for session in sessions:
390 | time = self.session_time.get(session)
391 | if time is None:
392 | print(' EMPTY TIMESTAMP!! ', session)
393 | tuples.append((session, time))
394 |
395 | tuples = sorted(tuples, key=itemgetter(1), reverse=True)
396 | # print 'sorted list ', sortedList
397 | cnt = 0
398 | for element in tuples:
399 | cnt = cnt + 1
400 | if cnt > number:
401 | break
402 | sample.add(element[0])
403 | # print 'returning sample of size ', len(sample)
404 | return sample
405 |
406 | def possible_neighbor_sessions(self, session_items, input_item_id, session_id):
407 | '''
408 |         Find a set of sessions to later find neighbors in.
409 | A self.sample_size of 0 uses all sessions in which any item of the current session appears.
410 | self.sampling can be performed with the options "recent" or "random".
411 |         "recent" selects the self.sample_size most recent sessions while "random" just chooses randomly.
412 |
413 | Parameters
414 | --------
415 | sessions: set of session ids
416 |
417 | Returns
418 | --------
419 | out : set
420 | '''
421 |
422 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
423 |
424 | if self.sample_size == 0: # use all session as possible neighbors
425 |
426 |             print('!!!!! running KNN without a sample size (check config)')
427 | return self.relevant_sessions
428 |
429 | else: # sample some sessions
430 |
431 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
432 |
433 | if len(self.relevant_sessions) > self.sample_size:
434 |
435 | if self.sampling == 'recent':
436 | sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size)
437 | elif self.sampling == 'random':
438 | sample = random.sample(self.relevant_sessions, self.sample_size)
439 | else:
440 | sample = self.relevant_sessions[:self.sample_size]
441 |
442 | return sample
443 | else:
444 | return self.relevant_sessions
445 |
446 | def calc_similarity(self, session_items, sessions):
447 | '''
448 | Calculates the configured similarity for the items in session_items and each session in sessions.
449 |
450 | Parameters
451 | --------
452 | session_items: set of item ids
453 | sessions: list of session ids
454 |
455 | Returns
456 | --------
457 | out : list of tuple (session_id,similarity)
458 | '''
459 |
460 | # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
461 | neighbors = []
462 | cnt = 0
463 | for session in sessions:
464 | cnt = cnt + 1
465 | # get items of the session, look up the cache first
466 | session_items_test = self.items_for_session(session)
467 |
468 | similarity = getattr(self, self.similarity)(session_items_test, session_items)
469 | if similarity > 0:
470 | neighbors.append((session, similarity))
471 |
472 | return neighbors
473 |
474 | # -----------------
475 | # Find a set of neighbors, returns a list of tuples (sessionid: similarity)
476 | # -----------------
477 | def find_neighbors(self, session_items, input_item_id, session_id):
478 | '''
479 | Finds the k nearest neighbors for the given session_id and the current item input_item_id.
480 |
481 | Parameters
482 | --------
483 | session_items: set of item ids
484 | input_item_id: int
485 | session_id: int
486 |
487 | Returns
488 | --------
489 | out : list of tuple (session_id, similarity)
490 | '''
491 | possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id)
492 | possible_neighbors = self.calc_similarity(session_items, possible_neighbors)
493 |
494 | possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1])
495 | possible_neighbors = possible_neighbors[:self.k]
496 |
497 | return possible_neighbors
498 |
499 | def score_items(self, neighbors, input_item_id):
500 | '''
501 | Compute a set of scores for all items given a set of neighbors.
502 |
503 | Parameters
504 | --------
505 | neighbors: set of session ids
506 |
507 | Returns
508 | --------
509 | out : list of tuple (item, score)
510 | '''
511 | # now we have the set of relevant items to make predictions
512 | scores = dict()
513 | # iterate over the sessions
514 | for session in neighbors:
515 | # get the items in this session
516 | items = self.items_for_session(session[0])
517 |
518 | for item in items:
519 |
520 | if input_item_id in self.followed_by and item in self.followed_by[
521 | input_item_id]: # hard filter the candidates
522 |
523 | old_score = scores.get(item)
524 | new_score = session[1]
525 |
526 | if old_score is None:
527 | scores.update({item: new_score})
528 | else:
529 | new_score = old_score + new_score
530 | scores.update({item: new_score})
531 |
532 | return scores
533 |
--------------------------------------------------------------------------------
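What distinguishes SeqFilterSessionKNN above from a plain session-based KNN is the followed_by map built during fit: when scoring, a candidate item from a neighbor session is only kept if it was observed directly after the current input item somewhere in the training data. A small sketch on a made-up toy dataset (the import path assumes the repository root is importable):

    import pandas as pd
    from util.knn.sfsknn import SeqFilterSessionKNN

    train = pd.DataFrame({'SessionId': [1, 1, 1, 2, 2, 2],
                          'ItemId':    ['A', 'B', 'C', 'A', 'B', 'D'],
                          'Time':      [0, 1, 2, 3, 4, 5]})

    knn = SeqFilterSessionKNN(k=10, sample_size=500)
    knn.fit(train)

    # scores after seeing 'A' in a fresh session: only items that ever
    # followed 'A' in training ('B' here) can receive a non-zero score
    print(knn.predict_next(session_id=99, input_item_id='A').sort_values(ascending=False).head())
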
/util/knn/sknn.py:
--------------------------------------------------------------------------------
1 | from operator import itemgetter
2 | from math import sqrt
3 | import random
4 | import time
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 |
10 | class SessionKNN:
11 | '''
12 | SessionKNN( k, sample_size=500, sampling='recent', similarity = 'jaccard', remind=False, pop_boost=0, session_key = 'SessionId', item_key= 'ItemId')
13 |
14 | Parameters
15 | -----------
16 | k : int
17 | Number of neighboring session to calculate the item scores from. (Default value: 100)
18 | sample_size : int
19 |         Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 1000)
20 | sampling : string
21 | String to define the sampling method for sessions (recent, random). (default: recent)
22 | similarity : string
23 | String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard)
24 | remind : bool
25 | Should the last items of the current session be boosted to the top as reminders
26 | pop_boost : int
27 | Push popular items in the neighbor sessions by this factor. (default: 0 to leave out)
28 | extend : bool
29 | Add evaluated sessions to the maps
30 | normalize : bool
31 | Normalize the scores in the end
32 | session_key : string
33 | Header of the session ID column in the input file. (default: 'SessionId')
34 | item_key : string
35 | Header of the item ID column in the input file. (default: 'ItemId')
36 | time_key : string
37 | Header of the timestamp column in the input file. (default: 'Time')
38 | '''
39 |
40 | def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0,
41 | extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'):
42 |
43 | self.remind = remind
44 | self.k = k
45 | self.sample_size = sample_size
46 | self.sampling = sampling
47 | self.similarity = similarity
48 | self.pop_boost = pop_boost
49 | self.session_key = session_key
50 | self.item_key = item_key
51 | self.time_key = time_key
52 | self.extend = extend
53 | self.normalize = normalize
54 |
55 | # updated while recommending
56 | self.session = -1
57 | self.session_items = []
58 | self.relevant_sessions = set()
59 |
60 | # cache relations once at startup
61 | self.session_item_map = dict()
62 | self.item_session_map = dict()
63 | self.session_time = dict()
64 |
65 | self.sim_time = 0
66 |
67 | def fit(self, train):
68 | '''
69 | Trains the predictor.
70 |
71 | Parameters
72 | --------
73 | data: pandas.DataFrame
74 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
75 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
76 |
77 | '''
78 |
79 | index_session = train.columns.get_loc(self.session_key)
80 | index_item = train.columns.get_loc(self.item_key)
81 | index_time = train.columns.get_loc(self.time_key)
82 | self.itemids = train[self.item_key].unique()
83 |
84 | session = -1
85 | session_items = set()
86 | time = -1
87 | # cnt = 0
88 | for row in train.itertuples(index=False):
89 | # cache items of sessions
90 | if row[index_session] != session:
91 | if len(session_items) > 0:
92 | self.session_item_map.update({session: session_items})
93 | # cache the last time stamp of the session
94 | self.session_time.update({session: time})
95 | session = row[index_session]
96 | session_items = set()
97 | time = row[index_time]
98 | session_items.add(row[index_item])
99 |
100 | # cache sessions involving an item
101 | map_is = self.item_session_map.get(row[index_item])
102 | if map_is is None:
103 | map_is = set()
104 | self.item_session_map.update({row[index_item]: map_is})
105 | map_is.add(row[index_session])
106 |
107 | # Add the last tuple
108 | self.session_item_map.update({session: session_items})
109 | self.session_time.update({session: time})
110 |
111 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
112 | '''
113 |         Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
114 |
115 | Parameters
116 | --------
117 | session_id : int or string
118 |             The session ID of the event.
119 | input_item_id : int or string
120 | The item ID of the event. Must be in the set of item IDs of the training set.
121 | predict_for_item_ids : 1D array
122 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
123 |
124 | Returns
125 | --------
126 | out : pandas.Series
127 |             Prediction scores for the selected items, indicating how likely each is to be the next item of this session. Indexed by the item IDs.
128 |
129 | '''
130 |
131 | # gc.collect()
132 | # process = psutil.Process(os.getpid())
133 | # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
134 |
135 | if (self.session != session_id): # new session
136 |
137 | if (self.extend):
138 | item_set = set(self.session_items)
139 | self.session_item_map[self.session] = item_set;
140 | for item in item_set:
141 | map_is = self.item_session_map.get(item)
142 | if map_is is None:
143 | map_is = set()
144 | self.item_session_map.update({item: map_is})
145 | map_is.add(self.session)
146 |
147 | ts = time.time()
148 | self.session_time.update({self.session: ts})
149 |
150 | self.session = session_id
151 | self.session_items = list()
152 | self.relevant_sessions = set()
153 |
154 | if type == 'view':
155 | self.session_items.append(input_item_id)
156 |
157 | if skip:
158 | return
159 |
160 | neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id)
161 | scores = self.score_items(neighbors)
162 |
163 | # add some reminders
164 | if self.remind:
165 |
166 | reminderScore = 5
167 | takeLastN = 3
168 |
169 | cnt = 0
170 | for elem in self.session_items[-takeLastN:]:
171 | cnt = cnt + 1
172 | # reminderScore = reminderScore + (cnt/100)
173 |
174 | oldScore = scores.get(elem)
175 | newScore = 0
176 | if oldScore is None:
177 | newScore = reminderScore
178 | else:
179 | newScore = oldScore + reminderScore
180 | # print 'old score ', oldScore
181 | # update the score and add a small number for the position
182 | newScore = (newScore * reminderScore) + (cnt / 100)
183 |
184 | scores.update({elem: newScore})
185 |
186 | # push popular ones
187 | if self.pop_boost > 0:
188 |
189 | pop = self.item_pop(neighbors)
190 | # Iterate over the item neighbors
191 | # print itemScores
192 | for key in scores:
193 | item_pop = pop.get(key)
194 | # Gives some minimal MRR boost?
195 | scores.update({key: (scores[key] + (self.pop_boost * item_pop))})
196 |
197 | # Create things in the format ..
198 | if predict_for_item_ids is None:
199 | predict_for_item_ids = self.itemids
200 | predictions = np.zeros(len(predict_for_item_ids))
201 | mask = np.in1d(predict_for_item_ids, list(scores.keys()))
202 |
203 | items = predict_for_item_ids[mask]
204 | values = [scores[x] for x in items]
205 | predictions[mask] = values
206 | series = pd.Series(data=predictions, index=predict_for_item_ids)
207 |
208 | if self.normalize:
209 | series = series / series.max()
210 |
211 | return series
212 |
213 | def item_pop(self, sessions):
214 | '''
215 | Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids)
216 |
217 | Parameters
218 | --------
219 | sessions: set
220 |
221 | Returns
222 | --------
223 | out : dict
224 | '''
225 | result = dict()
226 | max_pop = 0
227 | for session, weight in sessions:
228 | items = self.items_for_session(session)
229 | for item in items:
230 |
231 | count = result.get(item)
232 | if count is None:
233 | result.update({item: 1})
234 | else:
235 | result.update({item: count + 1})
236 |
237 | if (result.get(item) > max_pop):
238 | max_pop = result.get(item)
239 |
240 | for key in result:
241 | result.update({key: (result[key] / max_pop)})
242 |
243 | return result
244 |
245 | def jaccard(self, first, second):
246 | '''
247 | Calculates the jaccard index for two sessions
248 |
249 | Parameters
250 | --------
251 | first: Id of a session
252 | second: Id of a session
253 |
254 | Returns
255 | --------
256 | out : float value
257 | '''
258 |         sc = time.perf_counter()
259 | intersection = len(first & second)
260 | union = len(first | second)
261 | res = intersection / union
262 |
263 |         self.sim_time += (time.perf_counter() - sc)
264 |
265 | return res
266 |
267 | def cosine(self, first, second):
268 | '''
269 | Calculates the cosine similarity for two sessions
270 |
271 | Parameters
272 | --------
273 | first: Id of a session
274 | second: Id of a session
275 |
276 | Returns
277 | --------
278 | out : float value
279 | '''
280 | li = len(first & second)
281 | la = len(first)
282 | lb = len(second)
283 |         result = li / (sqrt(la) * sqrt(lb))
284 |
285 | return result
286 |
287 | def tanimoto(self, first, second):
288 | '''
289 |         Calculates the Tanimoto similarity for two sessions
290 |
291 | Parameters
292 | --------
293 | first: Id of a session
294 | second: Id of a session
295 |
296 | Returns
297 | --------
298 | out : float value
299 | '''
300 | li = len(first & second)
301 | la = len(first)
302 | lb = len(second)
303 | result = li / (la + lb - li)
304 |
305 | return result
306 |
307 | def binary(self, first, second):
308 | '''
309 |         Calculates the binary similarity (the 'binary' option) for two sessions
310 |
311 | Parameters
312 | --------
313 | first: Id of a session
314 | second: Id of a session
315 |
316 | Returns
317 | --------
318 | out : float value
319 | '''
320 | a = len(first & second)
321 | b = len(first)
322 | c = len(second)
323 |
324 | result = (2 * a) / ((2 * a) + b + c)
325 |
326 | return result
327 |
328 | def random(self, first, second):
329 | '''
330 |         Returns a random score for two sessions (random similarity baseline)
331 |
332 | Parameters
333 | --------
334 | first: Id of a session
335 | second: Id of a session
336 |
337 | Returns
338 | --------
339 | out : float value
340 | '''
341 | return random.random()
342 |
343 | def items_for_session(self, session):
344 | '''
345 | Returns all items in the session
346 |
347 | Parameters
348 | --------
349 | session: Id of a session
350 |
351 | Returns
352 | --------
353 | out : set
354 | '''
355 | return self.session_item_map.get(session);
356 |
357 | def sessions_for_item(self, item_id):
358 | '''
359 |         Returns all sessions for an item
360 |
361 | Parameters
362 | --------
363 | item: Id of the item session
364 |
365 | Returns
366 | --------
367 | out : set
368 | '''
369 | return self.item_session_map.get(item_id)
370 |
371 | def most_recent_sessions(self, sessions, number):
372 | '''
373 | Find the most recent sessions in the given set
374 |
375 | Parameters
376 | --------
377 | sessions: set of session ids
378 |
379 | Returns
380 | --------
381 | out : set
382 | '''
383 | sample = set()
384 |
385 | tuples = list()
386 | for session in sessions:
387 | time = self.session_time.get(session)
388 | if time is None:
389 | print(' EMPTY TIMESTAMP!! ', session)
390 | tuples.append((session, time))
391 |
392 | tuples = sorted(tuples, key=itemgetter(1), reverse=True)
393 | # print 'sorted list ', sortedList
394 | cnt = 0
395 | for element in tuples:
396 | cnt = cnt + 1
397 | if cnt > number:
398 | break
399 | sample.add(element[0])
400 | # print 'returning sample of size ', len(sample)
401 | return sample
402 |
403 | def possible_neighbor_sessions(self, session_items, input_item_id, session_id):
404 | '''
405 |         Find a set of sessions to later find neighbors in.
406 | A self.sample_size of 0 uses all sessions in which any item of the current session appears.
407 | self.sampling can be performed with the options "recent" or "random".
408 |         "recent" selects the self.sample_size most recent sessions while "random" just chooses randomly.
409 |
410 | Parameters
411 | --------
412 | sessions: set of session ids
413 |
414 | Returns
415 | --------
416 | out : set
417 | '''
418 |
419 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
420 |
421 | if self.sample_size == 0: # use all session as possible neighbors
422 |
423 |             print('!!!!! running KNN without a sample size (check config)')
424 | return self.relevant_sessions
425 |
426 | else: # sample some sessions
427 |
428 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
429 |
430 | if len(self.relevant_sessions) > self.sample_size:
431 |
432 | if self.sampling == 'recent':
433 | sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size)
434 | elif self.sampling == 'random':
435 | sample = random.sample(self.relevant_sessions, self.sample_size)
436 | else:
437 | sample = self.relevant_sessions[:self.sample_size]
438 |
439 | return sample
440 | else:
441 | return self.relevant_sessions
442 |
443 | def calc_similarity(self, session_items, sessions):
444 | '''
445 | Calculates the configured similarity for the items in session_items and each session in sessions.
446 |
447 | Parameters
448 | --------
449 | session_items: set of item ids
450 | sessions: list of session ids
451 |
452 | Returns
453 | --------
454 | out : list of tuple (session_id,similarity)
455 | '''
456 |
457 | # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
458 | neighbors = []
459 | cnt = 0
460 | for session in sessions:
461 | cnt = cnt + 1
462 | # get items of the session, look up the cache first
463 | session_items_test = self.items_for_session(session)
464 |
465 | similarity = getattr(self, self.similarity)(session_items_test, session_items)
466 | if similarity > 0:
467 | neighbors.append((session, similarity))
468 |
469 | return neighbors
470 |
471 | # -----------------
472 | # Find a set of neighbors, returns a list of tuples (sessionid: similarity)
473 | # -----------------
474 | def find_neighbors(self, session_items, input_item_id, session_id):
475 | '''
476 | Finds the k nearest neighbors for the given session_id and the current item input_item_id.
477 |
478 | Parameters
479 | --------
480 | session_items: set of item ids
481 | input_item_id: int
482 | session_id: int
483 |
484 | Returns
485 | --------
486 | out : list of tuple (session_id, similarity)
487 | '''
488 | possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id)
489 | possible_neighbors = self.calc_similarity(session_items, possible_neighbors)
490 |
491 | possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1])
492 | possible_neighbors = possible_neighbors[:self.k]
493 |
494 | return possible_neighbors
495 |
496 | def score_items(self, neighbors):
497 | '''
498 | Compute a set of scores for all items given a set of neighbors.
499 |
500 | Parameters
501 | --------
502 | neighbors: set of session ids
503 |
504 | Returns
505 | --------
506 | out : list of tuple (item, score)
507 | '''
508 | # now we have the set of relevant items to make predictions
509 | scores = dict()
510 | # iterate over the sessions
511 | for session in neighbors:
512 | # get the items in this session
513 | items = self.items_for_session(session[0])
514 |
515 | for item in items:
516 | old_score = scores.get(item)
517 | new_score = session[1]
518 |
519 | if old_score is None:
520 | scores.update({item: new_score})
521 | else:
522 | new_score = old_score + new_score
523 | scores.update({item: new_score})
524 |
525 | return scores
526 |
--------------------------------------------------------------------------------
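Despite what the parameter docstrings above suggest, the similarity helpers of SessionKNN (jaccard, cosine, binary, tanimoto) operate on two sets of item IDs, not on session IDs. A quick sketch of the values they produce for two toy item sets (the cosine value assumes the parenthesised expression from the fix above):

    from util.knn.sknn import SessionKNN

    knn = SessionKNN(k=10)
    a, b = {1, 2, 3}, {2, 3, 4}
    print(knn.jaccard(a, b))   # 2 / 4                  = 0.5
    print(knn.cosine(a, b))    # 2 / (sqrt(3)*sqrt(3))  ~ 0.667
    print(knn.tanimoto(a, b))  # 2 / (3 + 3 - 2)        = 0.5 (same as Jaccard on sets)
    print(knn.binary(a, b))    # 4 / (4 + 3 + 3)        = 0.4
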
/util/knn/ssknn.py:
--------------------------------------------------------------------------------
1 | from operator import itemgetter
2 | from math import sqrt
3 | import random
4 | import time
5 | from math import log10
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 |
11 | class SeqSessionKNN:
12 | '''
13 | SeqSessionKNN( k, sample_size=500, sampling='recent', similarity = 'jaccard', remind=False, pop_boost=0, session_key = 'SessionId', item_key= 'ItemId')
14 |
15 | Parameters
16 | -----------
17 | k : int
18 | Number of neighboring session to calculate the item scores from. (Default value: 100)
19 | sample_size : int
20 |         Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 1000)
21 | sampling : string
22 | String to define the sampling method for sessions (recent, random). (default: recent)
23 | similarity : string
24 | String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard)
25 | remind : bool
26 | Should the last items of the current session be boosted to the top as reminders
27 | pop_boost : int
28 | Push popular items in the neighbor sessions by this factor. (default: 0 to leave out)
29 | extend : bool
30 | Add evaluated sessions to the maps
31 | normalize : bool
32 | Normalize the scores in the end
33 | session_key : string
34 | Header of the session ID column in the input file. (default: 'SessionId')
35 | item_key : string
36 | Header of the item ID column in the input file. (default: 'ItemId')
37 | time_key : string
38 | Header of the timestamp column in the input file. (default: 'Time')
39 | '''
40 |
41 | def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div', remind=False,
42 | pop_boost=0, extend=False, normalize=True, session_key='SessionId', item_key='ItemId',
43 | time_key='Time'):
44 |
45 | self.remind = remind
46 | self.k = k
47 | self.sample_size = sample_size
48 | self.sampling = sampling
49 | self.weighting = weighting
50 | self.similarity = similarity
51 | self.pop_boost = pop_boost
52 | self.session_key = session_key
53 | self.item_key = item_key
54 | self.time_key = time_key
55 | self.extend = extend
56 | self.normalize = normalize
57 |
58 | # updated while recommending
59 | self.session = -1
60 | self.session_items = []
61 | self.relevant_sessions = set()
62 |
63 | # cache relations once at startup
64 | self.session_item_map = dict()
65 | self.item_session_map = dict()
66 | self.session_time = dict()
67 |
68 | self.sim_time = 0
69 |
70 | def fit(self, train, items=None):
71 | '''
72 | Trains the predictor.
73 |
74 | Parameters
75 | --------
76 | data: pandas.DataFrame
77 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
78 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
79 |
80 | '''
81 |
82 | index_session = train.columns.get_loc(self.session_key)
83 | index_item = train.columns.get_loc(self.item_key)
84 | index_time = train.columns.get_loc(self.time_key)
85 | self.itemids = train[self.item_key].unique()
86 |
87 | session = -1
88 | session_items = set()
89 | time = -1
90 | # cnt = 0
91 | for row in train.itertuples(index=False):
92 | # cache items of sessions
93 | if row[index_session] != session:
94 | if len(session_items) > 0:
95 | self.session_item_map.update({session: session_items})
96 | # cache the last time stamp of the session
97 | self.session_time.update({session: time})
98 | session = row[index_session]
99 | session_items = set()
100 | time = row[index_time]
101 | session_items.add(row[index_item])
102 |
103 | # cache sessions involving an item
104 | map_is = self.item_session_map.get(row[index_item])
105 | if map_is is None:
106 | map_is = set()
107 | self.item_session_map.update({row[index_item]: map_is})
108 | map_is.add(row[index_session])
109 |
110 | # Add the last tuple
111 | self.session_item_map.update({session: session_items})
112 | self.session_time.update({session: time})
113 |
114 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
115 | '''
116 |         Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
117 |
118 | Parameters
119 | --------
120 | session_id : int or string
121 |             The session ID of the event.
122 | input_item_id : int or string
123 | The item ID of the event. Must be in the set of item IDs of the training set.
124 | predict_for_item_ids : 1D array
125 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
126 |
127 | Returns
128 | --------
129 | out : pandas.Series
130 |             Prediction scores for the selected items, indicating how likely each is to be the next item of this session. Indexed by the item IDs.
131 |
132 | '''
133 |
134 | # gc.collect()
135 | # process = psutil.Process(os.getpid())
136 | # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
137 |
138 | if (self.session != session_id): # new session
139 |
140 | if (self.extend):
141 | item_set = set(self.session_items)
142 | self.session_item_map[self.session] = item_set
143 | for item in item_set:
144 | map_is = self.item_session_map.get(item)
145 | if map_is is None:
146 | map_is = set()
147 | self.item_session_map.update({item: map_is})
148 | map_is.add(self.session)
149 |
150 | ts = time.time()
151 | self.session_time.update({self.session: ts})
152 |
153 | self.session = session_id
154 | self.session_items = list()
155 | self.relevant_sessions = set()
156 |
157 | if type == 'view':
158 | self.session_items.append(input_item_id)
159 |
160 | if skip:
161 | return
162 |
163 | neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id)
164 | scores = self.score_items(neighbors, self.session_items)
165 |
166 | # add some reminders
167 | if self.remind:
168 |
169 | reminderScore = 5
170 | takeLastN = 3
171 |
172 | cnt = 0
173 | for elem in self.session_items[-takeLastN:]:
174 | cnt = cnt + 1
175 | # reminderScore = reminderScore + (cnt/100)
176 |
177 | oldScore = scores.get(elem)
178 | newScore = 0
179 | if oldScore is None:
180 | newScore = reminderScore
181 | else:
182 | newScore = oldScore + reminderScore
183 | # print 'old score ', oldScore
184 | # update the score and add a small number for the position
185 | newScore = (newScore * reminderScore) + (cnt / 100)
186 |
187 | scores.update({elem: newScore})
188 |
189 | # push popular ones
190 | if self.pop_boost > 0:
191 |
192 | pop = self.item_pop(neighbors)
193 | # Iterate over the item neighbors
194 | # print itemScores
195 | for key in scores:
196 | item_pop = pop.get(key)
197 | # Gives some minimal MRR boost?
198 | scores.update({key: (scores[key] + (self.pop_boost * item_pop))})
199 |
200 | # Create things in the format ..
201 | if predict_for_item_ids is None:
202 | predict_for_item_ids = self.itemids
203 | predictions = np.zeros(len(predict_for_item_ids))
204 | mask = np.in1d(predict_for_item_ids, list(scores.keys()))
205 |
206 | items = predict_for_item_ids[mask]
207 | values = [scores[x] for x in items]
208 | predictions[mask] = values
209 | series = pd.Series(data=predictions, index=predict_for_item_ids)
210 |
211 | if self.normalize:
212 | series = series / series.max()
213 |
214 | return series
215 |
216 | def item_pop(self, sessions):
217 | '''
218 | Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids)
219 |
220 | Parameters
221 | --------
222 | sessions: set
223 |
224 | Returns
225 | --------
226 | out : dict
227 | '''
228 | result = dict()
229 | max_pop = 0
230 | for session, weight in sessions:
231 | items = self.items_for_session(session)
232 | for item in items:
233 |
234 | count = result.get(item)
235 | if count is None:
236 | result.update({item: 1})
237 | else:
238 | result.update({item: count + 1})
239 |
240 | if (result.get(item) > max_pop):
241 | max_pop = result.get(item)
242 |
243 | for key in result:
244 | result.update({key: (result[key] / max_pop)})
245 |
246 | return result
247 |
248 | def jaccard(self, first, second):
249 | '''
250 | Calculates the jaccard index for two sessions
251 |
252 | Parameters
253 | --------
254 | first: Id of a session
255 | second: Id of a session
256 |
257 | Returns
258 | --------
259 | out : float value
260 | '''
261 |         sc = time.perf_counter()
262 | intersection = len(first & second)
263 | union = len(first | second)
264 | res = intersection / union
265 |
266 |         self.sim_time += (time.perf_counter() - sc)
267 |
268 | return res
269 |
270 | def cosine(self, first, second):
271 | '''
272 | Calculates the cosine similarity for two sessions
273 |
274 | Parameters
275 | --------
276 | first: Id of a session
277 | second: Id of a session
278 |
279 | Returns
280 | --------
281 | out : float value
282 | '''
283 | li = len(first & second)
284 | la = len(first)
285 | lb = len(second)
286 |         result = li / (sqrt(la) * sqrt(lb))
287 |
288 | return result
289 |
290 | def tanimoto(self, first, second):
291 | '''
292 |         Calculates the Tanimoto similarity for two sessions
293 |
294 | Parameters
295 | --------
296 | first: Id of a session
297 | second: Id of a session
298 |
299 | Returns
300 | --------
301 | out : float value
302 | '''
303 | li = len(first & second)
304 | la = len(first)
305 | lb = len(second)
306 | result = li / (la + lb - li)
307 |
308 | return result
309 |
310 | def binary(self, first, second):
311 | '''
312 |         Calculates the binary similarity (the 'binary' option) for two sessions
313 |
314 | Parameters
315 | --------
316 | first: Id of a session
317 | second: Id of a session
318 |
319 | Returns
320 | --------
321 | out : float value
322 | '''
323 | a = len(first & second)
324 | b = len(first)
325 | c = len(second)
326 |
327 | result = (2 * a) / ((2 * a) + b + c)
328 |
329 | return result
330 |
331 | def items_for_session(self, session):
332 | '''
333 | Returns all items in the session
334 |
335 | Parameters
336 | --------
337 | session: Id of a session
338 |
339 | Returns
340 | --------
341 | out : set
342 | '''
343 | return self.session_item_map.get(session);
344 |
345 | def sessions_for_item(self, item_id):
346 | '''
347 |         Returns all sessions for an item
348 |
349 | Parameters
350 | --------
351 | item: Id of the item session
352 |
353 | Returns
354 | --------
355 | out : set
356 | '''
357 | return self.item_session_map.get(item_id)
358 |
359 | def most_recent_sessions(self, sessions, number):
360 | '''
361 | Find the most recent sessions in the given set
362 |
363 | Parameters
364 | --------
365 | sessions: set of session ids
366 |
367 | Returns
368 | --------
369 | out : set
370 | '''
371 | sample = set()
372 |
373 | tuples = list()
374 | for session in sessions:
375 | time = self.session_time.get(session)
376 | if time is None:
377 | print(' EMPTY TIMESTAMP!! ', session)
378 | tuples.append((session, time))
379 |
380 | tuples = sorted(tuples, key=itemgetter(1), reverse=True)
381 | # print 'sorted list ', sortedList
382 | cnt = 0
383 | for element in tuples:
384 | cnt = cnt + 1
385 | if cnt > number:
386 | break
387 | sample.add(element[0])
388 | # print 'returning sample of size ', len(sample)
389 | return sample
390 |
391 | def possible_neighbor_sessions(self, session_items, input_item_id, session_id):
392 | '''
393 |         Find a set of sessions to later find neighbors in.
394 | A self.sample_size of 0 uses all sessions in which any item of the current session appears.
395 | self.sampling can be performed with the options "recent" or "random".
396 |         "recent" selects the self.sample_size most recent sessions while "random" just chooses randomly.
397 |
398 | Parameters
399 | --------
400 | sessions: set of session ids
401 |
402 | Returns
403 | --------
404 | out : set
405 | '''
406 |
407 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
408 |
409 | if self.sample_size == 0: # use all session as possible neighbors
410 |
411 |             print('!!!!! running KNN without a sample size (check config)')
412 | return self.relevant_sessions
413 |
414 | else: # sample some sessions
415 |
416 | if len(self.relevant_sessions) > self.sample_size:
417 |
418 | if self.sampling == 'recent':
419 | sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size)
420 | elif self.sampling == 'random':
421 | sample = random.sample(self.relevant_sessions, self.sample_size)
422 | else:
423 | sample = self.relevant_sessions[:self.sample_size]
424 |
425 | return sample
426 | else:
427 | return self.relevant_sessions
428 |
429 | def calc_similarity(self, session_items, sessions):
430 | '''
431 | Calculates the configured similarity for the items in session_items and each session in sessions.
432 |
433 | Parameters
434 | --------
435 | session_items: set of item ids
436 | sessions: list of session ids
437 |
438 | Returns
439 | --------
440 | out : list of tuple (session_id,similarity)
441 | '''
442 |
443 | # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
444 | neighbors = []
445 | cnt = 0
446 | for session in sessions:
447 | cnt = cnt + 1
448 | # get items of the session, look up the cache first
449 | session_items_test = self.items_for_session(session)
450 |
451 | similarity = getattr(self, self.similarity)(session_items_test, session_items)
452 | if similarity > 0:
453 | neighbors.append((session, similarity))
454 |
455 | return neighbors
456 |
457 | # -----------------
458 | # Find a set of neighbors, returns a list of tuples (sessionid: similarity)
459 | # -----------------
460 | def find_neighbors(self, session_items, input_item_id, session_id):
461 | '''
462 | Finds the k nearest neighbors for the given session_id and the current item input_item_id.
463 |
464 | Parameters
465 | --------
466 | session_items: set of item ids
467 | input_item_id: int
468 | session_id: int
469 |
470 | Returns
471 | --------
472 | out : list of tuple (session_id, similarity)
473 | '''
474 | possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id)
475 | possible_neighbors = self.calc_similarity(session_items, possible_neighbors)
476 |
477 | possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1])
478 | possible_neighbors = possible_neighbors[:self.k]
479 |
480 | return possible_neighbors
481 |
482 | def score_items(self, neighbors, current_session):
483 | '''
484 | Compute a set of scores for all items given a set of neighbors.
485 |
486 | Parameters
487 | --------
488 | neighbors: set of session ids
489 |
490 | Returns
491 | --------
492 | out : list of tuple (item, score)
493 | '''
494 | # now we have the set of relevant items to make predictions
495 | scores = dict()
496 | # iterate over the sessions
497 | for session in neighbors:
498 | # get the items in this session
499 | items = self.items_for_session(session[0])
500 | step = 1
501 |
502 | for item in reversed(current_session):
503 | if item in items:
504 | decay = getattr(self, self.weighting)(step)
505 | break
506 | step += 1
507 |
508 | for item in items:
509 | old_score = scores.get(item)
510 | similarity = session[1]
511 |
512 | if old_score is None:
513 | scores.update({item: (similarity * decay)})
514 | else:
515 | new_score = old_score + (similarity * decay)
516 | scores.update({item: new_score})
517 |
518 | return scores
519 |
520 | def linear(self, i):
521 | return 1 - (0.1 * i) if i <= 100 else 0
522 |
523 | def same(self, i):
524 | return 1
525 |
526 | def div(self, i):
527 | return 1 / i
528 |
529 | def log(self, i):
530 | return 1 / (log10(i + 1.7))
531 |
532 | def quadratic(self, i):
533 | return 1 / (i * i)
534 |
--------------------------------------------------------------------------------
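SeqSessionKNN.score_items above weights each neighbor's similarity by a decay factor that depends on how far back (in step positions from the end of the current session) the most recent item shared with that neighbor occurs; the weighting parameter selects the decay function. A small sketch comparing the decay functions on their own (parameter values are illustrative):

    from util.knn.ssknn import SeqSessionKNN

    knn = SeqSessionKNN(k=10)  # weighting='div' is the default
    for step in range(1, 6):
        print(step,
              round(knn.linear(step), 3),
              round(knn.div(step), 3),
              round(knn.log(step), 3),
              round(knn.quadratic(step), 3))
    # every decay gives its largest weight at step=1, i.e. to neighbors that
    # match the most recent click of the current session
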
/util/knn/vmsknn.py:
--------------------------------------------------------------------------------
1 | from operator import itemgetter
2 | from math import sqrt
3 | import random
4 | import time
5 | from math import log10
6 | from datetime import datetime as dt
7 | from datetime import timedelta as td
8 |
9 | import numpy as np
10 | import pandas as pd
11 |
12 |
13 | class VMSessionKNN:
14 | '''
15 | VMSessionKNN( k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score', weighting_time=False, normalize=True, session_key = 'SessionId', item_key= 'ItemId', time_key= 'Time')
16 |
17 | Parameters
18 | -----------
19 | k : int
20 | Number of neighboring session to calculate the item scores from. (Default value: 100)
21 | sample_size : int
22 |         Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 1000)
23 | sampling : string
24 | String to define the sampling method for sessions (recent, random). (default: recent)
25 | similarity : string
26 |         String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: cosine)
27 | weighting : string
28 | Decay function to determine the importance/weight of individual actions in the current session (linear, same, div, log, quadratic). (default: div)
29 | weighting_score : string
30 |         Decay function to lower the score of candidate items coming from neighboring sessions that were selected by less recently clicked items in the current session (linear, same, div, log, quadratic). (default: div_score)
31 | weighting_time : boolean
32 | Experimental function to give less weight to items from older sessions (default: False)
33 | dwelling_time : boolean
34 | Experimental function to use the dwelling time for item view actions as a weight in the similarity calculation. (default: False)
35 | last_n_days : int
36 | Use only data from the last N days. (default: None)
37 | last_n_clicks : int
38 | Use only the last N clicks of the current session when recommending. (default: None)
39 | extend : bool
40 | Add evaluated sessions to the maps.
41 | normalize : bool
42 | Normalize the scores in the end.
43 | session_key : string
44 | Header of the session ID column in the input file. (default: 'SessionId')
45 | item_key : string
46 | Header of the item ID column in the input file. (default: 'ItemId')
47 | time_key : string
48 | Header of the timestamp column in the input file. (default: 'Time')
49 | '''
50 |
51 | def __init__(self, k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div',
52 | dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score',
53 | weighting_time=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'):
54 |
55 | self.k = k
56 | self.sample_size = sample_size
57 | self.sampling = sampling
58 | self.weighting = weighting
59 | self.dwelling_time = dwelling_time
60 | self.weighting_score = weighting_score
61 | self.weighting_time = weighting_time
62 | self.similarity = similarity
63 | self.session_key = session_key
64 | self.item_key = item_key
65 | self.time_key = time_key
66 | self.extend = extend
67 | self.normalize = normalize
68 | self.last_n_days = last_n_days
69 | self.last_n_clicks = last_n_clicks
70 |
71 | # updated while recommending
72 | self.session = -1
73 | self.session_items = []
74 | self.relevant_sessions = set()
75 |
76 | # cache relations once at startup
77 | self.session_item_map = dict()
78 | self.item_session_map = dict()
79 | self.session_time = dict()
80 | self.min_time = -1
81 |
82 | self.sim_time = 0
83 |
84 | def fit(self, data, items=None):
85 | '''
86 | Trains the predictor.
87 |
88 | Parameters
89 | --------
90 | data: pandas.DataFrame
91 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
92 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
93 |
94 | '''
95 |
96 | if self.last_n_days != None:
97 |
98 | max_time = dt.fromtimestamp(data[self.time_key].max())
99 | date_threshold = max_time.date() - td(self.last_n_days)
100 | stamp = dt.combine(date_threshold, dt.min.time()).timestamp()
101 | train = data[data[self.time_key] >= stamp]
102 |
103 | else:
104 | train = data
105 |
106 | self.num_items = train[self.item_key].max()
107 |
108 | index_session = train.columns.get_loc(self.session_key)
109 | index_item = train.columns.get_loc(self.item_key)
110 | index_time = train.columns.get_loc(self.time_key)
111 | self.itemids = train[self.item_key].unique()
112 |
113 | session = -1
114 | session_items = set()
115 | time = -1
116 | # cnt = 0
117 | for row in train.itertuples(index=False):
118 | # cache items of sessions
119 | if row[index_session] != session:
120 | if len(session_items) > 0:
121 | self.session_item_map.update({session: session_items})
122 | # cache the last time stamp of the session
123 | self.session_time.update({session: time})
124 | if time < self.min_time:
125 | self.min_time = time
126 | session = row[index_session]
127 | session_items = set()
128 | time = row[index_time]
129 | session_items.add(row[index_item])
130 |
131 | # cache sessions involving an item
132 | map_is = self.item_session_map.get(row[index_item])
133 | if map_is is None:
134 | map_is = set()
135 | self.item_session_map.update({row[index_item]: map_is})
136 | map_is.add(row[index_session])
137 |
138 | # Add the last tuple
139 | self.session_item_map.update({session: session_items})
140 | self.session_time.update({session: time})
141 |
142 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
143 | '''
144 | Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
145 |
146 | Parameters
147 | --------
148 | session_id : int or string
149 | The session ID of the event.
150 | input_item_id : int or string
151 | The item ID of the event. Must be in the set of item IDs of the training set.
152 | predict_for_item_ids : 1D array
153 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
154 |
155 | Returns
156 | --------
157 | out : pandas.Series
158 | Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
159 |
160 | '''
161 |
162 | # gc.collect()
163 | # process = psutil.Process(os.getpid())
164 | # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
165 |
166 | if (self.session != session_id): # new session
167 |
168 | if (self.extend):
169 | item_set = set(self.session_items)
170 | self.session_item_map[self.session] = item_set
171 | for item in item_set:
172 | map_is = self.item_session_map.get(item)
173 | if map_is is None:
174 | map_is = set()
175 | self.item_session_map.update({item: map_is})
176 | map_is.add(self.session)
177 |
178 | ts = time.time()
179 | self.session_time.update({self.session: ts})
180 |
181 | self.last_ts = -1
182 | self.session = session_id
183 | self.session_items = list()
184 | self.dwelling_times = list()
185 | self.relevant_sessions = set()
186 |
187 | if type == 'view':
188 | self.session_items.append(input_item_id)
189 | if self.dwelling_time:
190 | if self.last_ts > 0:
191 | self.dwelling_times.append(timestamp - self.last_ts)
192 | self.last_ts = timestamp
193 |
194 | if skip:
195 | return
196 |
197 | items = self.session_items if self.last_n_clicks is None else self.session_items[-self.last_n_clicks:]
198 | neighbors = self.find_neighbors(items, input_item_id, session_id, self.dwelling_times, timestamp)
199 | scores = self.score_items(neighbors, items, timestamp)
200 |
201 | # Assemble the prediction scores into a pandas Series indexed by item ID
202 | if predict_for_item_ids is None:
203 | predict_for_item_ids = self.itemids
204 | predictions = np.zeros(len(predict_for_item_ids))
205 | mask = np.in1d(predict_for_item_ids, list(scores.keys()))
206 |
207 | items = predict_for_item_ids[mask]
208 | values = [scores[x] for x in items]
209 | predictions[mask] = values
210 | series = pd.Series(data=predictions, index=predict_for_item_ids)
211 |
212 | if self.normalize:
213 | series = series / series.max()
214 |
215 | return series
216 |
217 | def item_pop(self, sessions):
218 | '''
219 | Returns a dict(item, score) with the normalized item popularity for the given sessions
220 |
221 | Parameters
222 | --------
223 | sessions: iterable of (session_id, weight) tuples
224 |
225 | Returns
226 | --------
227 | out : dict
228 | '''
229 | result = dict()
230 | max_pop = 0
231 | for session, weight in sessions:
232 | items = self.items_for_session(session)
233 | for item in items:
234 |
235 | count = result.get(item)
236 | if count is None:
237 | result.update({item: 1})
238 | else:
239 | result.update({item: count + 1})
240 |
241 | if (result.get(item) > max_pop):
242 | max_pop = result.get(item)
243 |
244 | for key in result:
245 | result.update({key: (result[key] / max_pop)})
246 |
247 | return result
248 |
249 | def jaccard(self, first, second):
250 | '''
251 | Calculates the jaccard index for two sessions
252 |
253 | Parameters
254 | --------
255 | first: set of items of the first session
256 | second: set of items of the second session
257 |
258 | Returns
259 | --------
260 | out : float value
261 | '''
262 | sc = time.perf_counter()  # time.clock() was removed in Python 3.8
263 | intersection = len(first & second)
264 | union = len(first | second)
265 | res = intersection / union
266 |
267 | self.sim_time += (time.perf_counter() - sc)
268 |
269 | return res
270 |
271 | def cosine(self, first, second):
272 | '''
273 | Calculates the cosine similarity for two sessions
274 |
275 | Parameters
276 | --------
277 | first: set of items of the first session
278 | second: set of items of the second session
279 |
280 | Returns
281 | --------
282 | out : float value
283 | '''
284 | li = len(first & second)
285 | la = len(first)
286 | lb = len(second)
287 | result = li / sqrt(la * lb)
288 |
289 | return result
290 |
291 | def tanimoto(self, first, second):
292 | '''
293 | Calculates the Tanimoto similarity for two sessions
294 |
295 | Parameters
296 | --------
297 | first: set of items of the first session
298 | second: set of items of the second session
299 |
300 | Returns
301 | --------
302 | out : float value
303 | '''
304 | li = len(first & second)
305 | la = len(first)
306 | lb = len(second)
307 | result = li / (la + lb - li)
308 |
309 | return result
310 |
311 | def binary(self, first, second):
312 | '''
313 | Calculates a binary overlap similarity for two sessions
314 |
315 | Parameters
316 | --------
317 | first: set of items of the first session
318 | second: set of items of the second session
319 |
320 | Returns
321 | --------
322 | out : float value
323 | '''
324 | a = len(first & second)
325 | b = len(first)
326 | c = len(second)
327 |
328 | result = (2 * a) / ((2 * a) + b + c)
329 |
330 | return result
331 |
332 | def vec(self, first, second, map):
333 | '''
334 | Calculates a position-weighted overlap similarity for two sessions
335 |
336 | Parameters
337 | --------
338 | first: set of items of the first session
339 | second: set of items of the second session (shared items weighted by the position weights in map)
340 |
341 | Returns
342 | --------
343 | out : float value
344 | '''
345 | a = first & second
346 | sum = 0
347 | for i in a:
348 | sum += map[i]
349 |
350 | result = sum / len(map)
351 |
352 | return result
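# Worked example for the similarity functions above (illustrative, not part of
# the original module). With two toy sessions seen as item sets
#   first = {1, 2, 3} and second = {2, 3, 4}  (intersection 2, union 4):
#   jaccard  -> 2 / 4                     = 0.5
#   cosine   -> 2 / sqrt(3 * 3)           ~ 0.667
#   tanimoto -> 2 / (3 + 3 - 2)           = 0.5
#   binary   -> (2 * 2) / (2 * 2 + 3 + 3) = 0.4
#   vec(first, second, {1: 1/3, 2: 2/3, 3: 1.0}) -> (2/3 + 1.0) / 3 ~ 0.556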
353 |
354 | def items_for_session(self, session):
355 | '''
356 | Returns all items in the session
357 |
358 | Parameters
359 | --------
360 | session: Id of a session
361 |
362 | Returns
363 | --------
364 | out : set
365 | '''
366 | return self.session_item_map.get(session)
367 |
368 | def vec_for_session(self, session):
369 | '''
370 | Returns the vector cached for the session in session_vec_map
371 |
372 | Parameters
373 | --------
374 | session: Id of a session
375 |
376 | Returns
377 | --------
378 | out : set
379 | '''
380 | return self.session_vec_map.get(session)
381 |
382 | def sessions_for_item(self, item_id):
383 | '''
384 | Returns all sessions for an item
385 |
386 | Parameters
387 | --------
388 | item_id: Id of the item
389 |
390 | Returns
391 | --------
392 | out : set
393 | '''
394 | return self.item_session_map.get(item_id) if item_id in self.item_session_map else set()
395 |
396 | def most_recent_sessions(self, sessions, number):
397 | '''
398 | Find the most recent sessions in the given set
399 |
400 | Parameters
401 | --------
402 | sessions: set of session ids
403 |
404 | Returns
405 | --------
406 | out : set
407 | '''
408 | sample = set()
409 |
410 | tuples = list()
411 | for session in sessions:
412 | time = self.session_time.get(session)
413 | if time is None:
414 | print(' EMPTY TIMESTAMP!! ', session)
415 | tuples.append((session, time))
416 |
417 | tuples = sorted(tuples, key=itemgetter(1), reverse=True)
418 | # print 'sorted list ', sortedList
419 | cnt = 0
420 | for element in tuples:
421 | cnt = cnt + 1
422 | if cnt > number:
423 | break
424 | sample.add(element[0])
425 | # print 'returning sample of size ', len(sample)
426 | return sample
427 |
428 | def possible_neighbor_sessions(self, session_items, input_item_id, session_id):
429 | '''
430 | Find a set of sessions to later search for neighbors in.
431 | A self.sample_size of 0 uses all sessions in which any item of the current session appears.
432 | self.sampling can be performed with the options "recent" or "random".
433 | "recent" selects the self.sample_size most recent sessions, while "random" simply chooses sessions at random.
434 |
435 | Parameters
436 | --------
437 | sessions: set of session ids
438 |
439 | Returns
440 | --------
441 | out : set
442 | '''
443 |
444 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id)
445 |
446 | if self.sample_size == 0: # use all session as possible neighbors
447 |
448 | print('!!!!! running KNN without a sample size (check config)')
449 | return self.relevant_sessions
450 |
451 | else: # sample some sessions
452 |
453 | if len(self.relevant_sessions) > self.sample_size:
454 |
455 | if self.sampling == 'recent':
456 | sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size)
457 | elif self.sampling == 'random':
458 | sample = random.sample(list(self.relevant_sessions), self.sample_size)
459 | else:
460 | sample = list(self.relevant_sessions)[:self.sample_size]  # sets cannot be sliced directly
461 |
462 | return sample
463 | else:
464 | return self.relevant_sessions
465 |
466 | def calc_similarity(self, session_items, sessions, dwelling_times, timestamp):
467 | '''
468 | Calculates the configured similarity for the items in session_items and each session in sessions.
469 |
470 | Parameters
471 | --------
472 | session_items: set of item ids
473 | sessions: list of session ids
474 |
475 | Returns
476 | --------
477 | out : list of tuple (session_id,similarity)
478 | '''
479 |
480 | pos_map = {}
481 | length = len(session_items)
482 |
483 | count = 1
484 | for item in session_items:
485 | if self.weighting is not None:
486 | pos_map[item] = getattr(self, self.weighting)(count, length)
487 | count += 1
488 | else:
489 | pos_map[item] = 1
490 |
491 | dt = dwelling_times.copy()
492 | dt.append(0)
493 | dt = pd.Series(dt, index=session_items)
494 | dt = dt / dt.max()
495 | # dt[session_items[-1]] = dt.mean() if len(session_items) > 1 else 1
496 | dt[session_items[-1]] = 1
497 |
498 | if self.dwelling_time:
499 | # print(dt)
500 | for i in range(len(dt)):
501 | pos_map[session_items[i]] *= dt.iloc[i]
502 | # print(pos_map)
503 | # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
504 | items = set(session_items)
505 | neighbors = []
506 | cnt = 0
507 | for session in sessions:
508 | cnt = cnt + 1
509 | # get items of the session, look up the cache first
510 | n_items = self.items_for_session(session)
511 | sts = self.session_time[session]
512 |
513 | similarity = self.vec(items, n_items, pos_map)
514 | if similarity > 0:
515 |
516 | if self.weighting_time:
517 | diff = timestamp - sts
518 | days = round(diff / 60 / 60 / 24)
519 | decay = pow(7 / 8, days)
520 | similarity *= decay
521 |
522 | # print("days:",days," => ",decay)
523 |
524 | neighbors.append((session, similarity))
525 |
526 | return neighbors
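# Illustrative note (assumption: weighting='div'): for a current session of
# length 4, calc_similarity assigns the position weights
#   pos_map = {item_1: 1/4, item_2: 2/4, item_3: 3/4, item_4: 4/4}
# so overlap on recently clicked items contributes more to the similarity than
# overlap on items clicked early in the session.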
527 |
528 | # -----------------
529 | # Find a set of neighbors, returns a list of tuples (sessionid: similarity)
530 | # -----------------
531 | def find_neighbors(self, session_items, input_item_id, session_id, dwelling_times, timestamp):
532 | '''
533 | Finds the k nearest neighbors for the given session_id and the current item input_item_id.
534 |
535 | Parameters
536 | --------
537 | session_items: set of item ids
538 | input_item_id: int
539 | session_id: int
540 |
541 | Returns
542 | --------
543 | out : list of tuple (session_id, similarity)
544 | '''
545 | possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id)
546 | possible_neighbors = self.calc_similarity(session_items, possible_neighbors, dwelling_times, timestamp)
547 |
548 | possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1])
549 | possible_neighbors = possible_neighbors[:self.k]
550 |
551 | return possible_neighbors
552 |
553 | def score_items(self, neighbors, current_session, timestamp):
554 | '''
555 | Compute a set of scores for all items given a set of neighbors.
556 |
557 | Parameters
558 | --------
559 | neighbors: list of tuples (session_id, similarity)
560 |
561 | Returns
562 | --------
563 | out : dict of item -> score
564 | '''
565 | # now we have the set of relevant items to make predictions
566 | scores = dict()
567 | # iterate over the sessions
568 | for session in neighbors:
569 | # get the items in this session
570 | items = self.items_for_session(session[0])
571 | step = 1
572 |
573 | for item in reversed(current_session):
574 | if item in items:
575 | decay = getattr(self, self.weighting_score)(step)
576 | break
577 | step += 1
578 |
579 | for item in items:
580 | old_score = scores.get(item)
581 | similarity = session[1]
582 |
583 | if old_score is None:
584 | scores.update({item: (similarity * decay)})
585 | else:
586 | new_score = old_score + (similarity * decay)
587 | scores.update({item: new_score})
588 |
589 | return scores
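# Illustrative note (assumption: weighting_score='div_score'): if the most
# recent item shared with a neighbor session sits 3 steps back in the current
# session, every item of that neighbor is credited with similarity * 1/3.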
590 |
591 | def linear_score(self, i):
592 | return 1 - (0.1 * i) if i <= 100 else 0
593 |
594 | def same_score(self, i):
595 | return 1
596 |
597 | def div_score(self, i):
598 | return 1 / i
599 |
600 | def log_score(self, i):
601 | return 1 / (log10(i + 1.7))
602 |
603 | def quadratic_score(self, i):
604 | return 1 / (i * i)
605 |
606 | def linear(self, i, length):
607 | return 1 - (0.1 * (length - i)) if i <= 10 else 0
608 |
609 | def same(self, i, length):
610 | return 1
611 |
612 | def div(self, i, length):
613 | return i / length
614 |
615 | def log(self, i, length):
616 | return 1 / (log10((length - i) + 1.7))
617 |
618 | def quadratic(self, i, length):
619 | return (i / length) ** 2
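# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; 'SessionKNN' is a hypothetical stand-in for
# the KNN class defined in this module, and the column names follow the
# constructor defaults):
#
#   import pandas as pd
#   train = pd.DataFrame({'SessionId': [1, 1, 2, 2],
#                         'ItemId':    [10, 11, 10, 12],
#                         'Time':      [1000, 1010, 2000, 2010]})
#   knn = SessionKNN(k=100, sample_size=500)
#   knn.fit(train)
#   # feed the events of a new session one by one; each call returns a
#   # pandas Series of scores indexed by item id (scaled so the top score
#   # is 1 when normalize=True)
#   scores = knn.predict_next(session_id=3, input_item_id=10, timestamp=3000)
#   print(scores.sort_values(ascending=False).head())
# ---------------------------------------------------------------------------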
620 |
--------------------------------------------------------------------------------
/util/markov/Markov.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from functools import reduce
3 |
4 | import networkx as nx
5 |
6 | from util.tree.Tree import SmartTree
7 |
8 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
9 |
10 |
11 | def add_nodes_to_graph(seqs, last_k):
12 | t = SmartTree()
13 | rootNode = t.set_root()
14 |
15 | countDict = {}
16 | G = nx.DiGraph()
17 | for s in seqs:
18 | nearHistory = tuple(s[-(last_k):])
19 | if nearHistory in countDict:
20 | # increment count
21 | countDict[nearHistory] += 1
22 | else:
23 | # init count
24 | countDict[nearHistory] = 1
25 | # add seq to sequence tree
26 | t.add_path(rootNode, list(nearHistory))
27 | # add node to graph
28 | G.add_node(nearHistory)
29 |
30 | ## I also have to save the sequence of length k+1 because otherwise I cannot calculate the count
31 | ## from state x to state y. So the sequences of length k+1 are in the tree but not in the states
32 | nearHistoryLong = tuple(
33 | s[-(last_k + 1):]) # +1 because I need one more element to calculate the transition prob
34 | if nearHistory != nearHistoryLong: # otherwise short seq are counted double
35 | if nearHistoryLong in countDict:
36 | # increment count
37 | countDict[nearHistoryLong] += 1
38 | else:
39 | # init count
40 | countDict[nearHistoryLong] = 1
41 | return (t, countDict, G)
42 |
43 |
44 | def add_edges(t, countDict, G, last_k):
45 | """
46 | :param t: tree of the sequences available as states
47 | :param countDict: dictionary counting the occurrences of each sequence
48 | :param G: the graph containing the states (each one is a sequence)
49 | :param last_k: the number of recent items considered
50 | :return: the same graph G, with edges connecting states
51 | """
52 | # add links
53 | rootNode = t.get_root()
54 | for node in G.nodes_iter():
55 | # if the sequence is shorter than the state length, the next state has the whole sequence as prefix
56 | next_state_prefix = node[1:] if len(node) == last_k else node
57 | p = t.find_path(rootNode, next_state_prefix)
58 | if t.path_is_valid(p):
59 | children = t.get_nodes_tag(t[p].fpointer)
60 | for c in children:
61 | # the tree may suggest a child which is not a state of the graph because it was part of a longer
62 | # sequence; in that case no edge has to be added
63 | if next_state_prefix + (c,) in G.nodes():
64 | if countDict.get(node + (c,), 0) != 0: # do not add edge if count is 0
65 | G.add_edge(node, next_state_prefix + (c,), {'count': countDict.get(node + (c,), 0)})
66 | return G
67 |
68 |
69 | def apply_skipping(G, last_k, seqs):
70 | # iterate over seqs to add skipping count
71 | window = last_k
72 |
73 | for us in seqs:
74 | s = tuple(us)
75 | for i in range(len(s) - window):
76 | previous_state = s[i:i + window]
77 | next_state_prefix = previous_state[1:]
78 | for j in range(i + window + 1, len(s)):
79 | fractional_count = 1 / (2 ** (j - (i + window)))
80 | next_state = next_state_prefix + (s[j],)
81 | # update count
82 | old_count = G.get_edge_data(previous_state, next_state, {}).get('count', 0)
83 | if G.has_edge(previous_state, next_state):
84 | G[previous_state][next_state]['count'] = old_count + fractional_count
85 | else:
86 | G.add_edge(previous_state, next_state, {'count': fractional_count})
87 | # print('updating '+str(previous_state)+'->'+str(next_state)+' from '+str(old_count)+' to '+str(old_count+fractional_count))
88 |
89 | # normalize
90 | for n in G.nodes_iter():
91 | edges = G.out_edges(n)
92 | countSum = reduce(lambda x, y: x + y, [G[x[0]][x[1]]['count'] for x in edges], 0)
93 | for e in edges:
94 | G[e[0]][e[1]]['count'] = G[e[0]][e[1]]['count'] / float(countSum) if countSum else 0
95 |
96 | return G
97 |
98 |
99 | def apply_clustering(G):
100 | ##clustering
101 | def sequence_similarity(s, t):
102 | sum = 0
103 | for i in range(min(len(s), len(t))):
104 | sum += 0 if s[i] != t[i] else (i + 2)
105 | return sum
106 |
107 | similarity_dict = {}
108 | # for each state in the graph, calculate similarity
109 | for node in G.nodes_iter():
110 | for deno in G.nodes_iter():
111 | if node == deno or (node, deno) in similarity_dict:
112 | continue # skip if same or already done
113 | else:
114 | sim = sequence_similarity(node, deno)
115 | if sim: # save only if different from zero
116 | similarity_dict[node, deno] = similarity_dict[deno, node] = sim
117 |
118 | similarity_count_dict = {}
119 |
120 | for node in G.nodes_iter():
121 | for deno in G.nodes_iter():
122 | if node == deno: continue
123 | sum = 0
124 | for in_edge in G.in_edges_iter([deno]):
125 | intermediate_node = in_edge[0]
126 | if intermediate_node != node: # I want to count the effect of going through Other nodes
127 | sum += similarity_dict.get((node, intermediate_node), 0) * G[intermediate_node][deno]['count']
128 | if sum:
129 | similarity_count_dict[node, deno] = sum
130 |
131 | def compute_normalization_similarity_count(G, node):
132 | normalization_sum = 0
133 | for other_state in G.nodes_iter():
134 | # no need to skip the node itself: its self-similarity is 0 by construction of similarity_dict
135 | normalization_sum += similarity_count_dict.get((node, other_state), 0)
136 | return normalization_sum
137 |
138 | ##update transition probability
139 | ### this could be made faster if I stored the adjacency matrix where nodes are connected if
140 | # there is a probability due to the clustering (i.e. there is an entry in similarity_count_dict);
141 | # in this way I would only have to check those edges. It is already pretty optimized anyway.
142 | ALPHA = 0.5
143 | for node in G.nodes_iter():
144 | normalization_sum = compute_normalization_similarity_count(G, node)
145 |
146 | # first half the original transition prob
147 | for u, v in G.out_edges_iter([node]):
148 | G[u][v]['count'] *= ALPHA
149 |
150 | # if there is similarity probability somewhere
151 | if normalization_sum:
152 | # add similarity probability
153 | for deno in G.nodes_iter():
154 | # skip if same node or there is nothing that can be added to that node
155 | if node == deno or similarity_count_dict.get((node, deno), 0) == 0: continue
156 |
157 | partial_prob = (1 - ALPHA) * similarity_count_dict.get((node, deno), 0) / normalization_sum
158 |
159 | if G.has_edge(node, deno):
160 | G[node][deno]['count'] += partial_prob
161 | elif partial_prob: # there wasn't an edge but now there is partial prob from other nodes
162 | G.add_edge(node, deno, {'count': partial_prob})
163 |
164 | return G, similarity_dict, similarity_count_dict
165 |
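# Illustrative usage sketch (not part of the original module). It assumes the
# networkx 1.x API already used above (nodes_iter, dict-style add_edge) and toy
# sequences instead of the tutorial datasets.
if __name__ == '__main__':
    toy_seqs = [[1, 2, 3], [1, 2, 4], [2, 3, 4]]
    k = 2
    tree, counts, graph = add_nodes_to_graph(toy_seqs, k)  # states = last-k item tuples
    graph = add_edges(tree, counts, graph, k)               # connect states that can follow each other
    graph = apply_skipping(graph, k, toy_seqs)              # add fractional counts for skipped items, then normalize
    graph, sim, sim_count = apply_clustering(graph)         # blend in similarity-based transition mass
    for u, v in graph.edges():
        print(u, '->', v, graph[u][v]['count'])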
--------------------------------------------------------------------------------
/util/metrics.py:
--------------------------------------------------------------------------------
1 | def precision(ground_truth, prediction):
2 | """
3 | Compute Precision metric
4 | :param ground_truth: the ground truth set or sequence
5 | :param prediction: the predicted set or sequence
6 | :return: the value of the metric
7 | """
8 | ground_truth = remove_duplicates(ground_truth)
9 | prediction = remove_duplicates(prediction)
10 | precision_score = count_a_in_b_unique(prediction, ground_truth) / float(len(prediction))
11 | assert 0 <= precision_score <= 1
12 | return precision_score
13 |
14 |
15 | def recall(ground_truth, prediction):
16 | """
17 | Compute Recall metric
18 | :param ground_truth: the ground truth set or sequence
19 | :param prediction: the predicted set or sequence
20 | :return: the value of the metric
21 | """
22 | ground_truth = remove_duplicates(ground_truth)
23 | prediction = remove_duplicates(prediction)
24 | recall_score = 0 if len(prediction) == 0 else count_a_in_b_unique(prediction, ground_truth) / float(
25 | len(ground_truth))
26 | assert 0 <= recall_score <= 1
27 | return recall_score
28 |
29 |
30 | def mrr(ground_truth, prediction):
31 | """
32 | Compute the Mean Reciprocal Rank metric. The Reciprocal Rank is set to 0 if no predicted item is contained in the ground truth.
33 | :param ground_truth: the ground truth set or sequence
34 | :param prediction: the predicted set or sequence
35 | :return: the value of the metric
36 | """
37 | rr = 0.
38 | for rank, p in enumerate(prediction):
39 | if p in ground_truth:
40 | rr = 1. / (rank + 1)
41 | break
42 | return rr
43 |
44 |
45 | def count_a_in_b_unique(a, b):
46 | """
47 | :param a: list of lists
48 | :param b: list of lists
49 | :return: number of elements of a in b
50 | """
51 | count = 0
52 | for el in a:
53 | if el in b:
54 | count += 1
55 | return count
56 |
57 |
58 | def remove_duplicates(l):
59 | return [list(x) for x in set(tuple(x) for x in l)]
60 |
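# Illustrative check (not part of the original module): ground truth and
# prediction are lists of lists, as expected by remove_duplicates and
# count_a_in_b_unique above.
if __name__ == '__main__':
    ground_truth = [['a'], ['b'], ['c']]
    prediction = [['a'], ['d']]
    print(precision(ground_truth, prediction))  # 1 of 2 predicted items is relevant -> 0.5
    print(recall(ground_truth, prediction))     # 1 of 3 relevant items is predicted -> ~0.333
    print(mrr(ground_truth, prediction))        # first hit at rank 1 -> 1.0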
--------------------------------------------------------------------------------
/util/rnn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/rnn/__init__.py
--------------------------------------------------------------------------------
/util/rnn/gpu_ops.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Nov 10 14:17:58 2017
4 |
5 | @author: Balázs Hidasi
6 | """
7 |
8 | import theano
9 | from theano import tensor as T
10 |
11 | def gpu_diag_wide(X):
12 | E = T.eye(*X.shape)
13 | return T.sum(X*E, axis=1)
14 |
15 | def gpu_diag_tall(X):
16 | E = T.eye(*X.shape)
17 | return T.sum(X*E, axis=0)
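
# Minimal check (illustrative, assuming a working Theano installation):
if __name__ == '__main__':
    import numpy as np
    X = T.matrix('X')
    diag_wide = theano.function([X], gpu_diag_wide(X))
    print(diag_wide(np.arange(6).reshape(2, 3).astype(theano.config.floatX)))  # -> [0. 4.]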
--------------------------------------------------------------------------------
/util/split.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | from scipy.sparse import find
4 |
5 |
6 | def random_holdout(dataset, perc=0.8, seed=1234):
7 | """
8 | Split sequence dataset randomly
9 | :param dataset: the sequence dataset
10 | :param perc: the fraction of sequences assigned to the training split
11 | :param seed: the random seed
12 | :return: the training and test splits
13 | """
14 | dataset = dataset.sample(frac=1, random_state=seed)
15 | nseqs = len(dataset)
16 | train_size = int(nseqs * perc)
17 | # split data according to the shuffled index and the holdout size
18 | train_split = dataset[:train_size]
19 | test_split = dataset[train_size:]
20 |
21 | return train_split, test_split
22 |
23 |
24 | def temporal_holdout(dataset, ts_threshold):
25 | """
26 | Split sequence dataset using timestamps
27 | :param dataset: the sequence dataset
28 | :param ts_threshold: the timestamp from which test sequences will start
29 | :return: the training and test splits
30 | """
31 | train = dataset.loc[dataset['ts'] < ts_threshold]
32 | test = dataset.loc[dataset['ts'] >= ts_threshold]
33 | train, test = clean_split(train, test)
34 |
35 | return train, test
36 |
37 |
38 | def last_session_out_split(data,
39 | user_key='user_id',
40 | session_key='session_id',
41 | time_key='ts'):
42 | """
43 | Assign the last session of every user to the test set and the remaining ones to the training set
44 | """
45 | sessions = data.sort_values(by=[user_key, time_key]).groupby(user_key)[session_key]
46 | last_session = sessions.last()
47 | train = data[~data[session_key].isin(last_session.values)].copy()
48 | test = data[data[session_key].isin(last_session.values)].copy()
49 | train, test = clean_split(train, test)
50 | return train, test
51 |
52 |
53 | def clean_split(train, test):
54 | """
55 | Remove new items from the test set.
56 | :param train: The training set.
57 | :param test: The test set.
58 | :return: The cleaned training and test sets.
59 | """
60 | train_items = set()
61 | train['sequence'].apply(lambda seq: train_items.update(set(seq)))
62 | test['sequence'] = test['sequence'].apply(lambda seq: [it for it in seq if it in train_items])
63 | return train, test
64 |
65 |
66 | def balance_dataset(x, y):
67 | number_of_elements = y.shape[0]
68 | nnz = set(find(y)[0])
69 | zero = set(range(number_of_elements)).difference(nnz)
70 |
71 | max_samples = min(len(zero), len(nnz))
72 |
73 | nnz_indices = random.sample(sorted(nnz), max_samples)
74 | zero_indices = random.sample(sorted(zero), max_samples)
75 | indices = nnz_indices + zero_indices
76 |
77 | return x[indices, :], y[indices, :]
78 |
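# Illustrative sketch (not part of the original module) with a toy sequence
# dataset. Column names follow the defaults used above ('user_id', 'session_id',
# 'ts'), plus the 'sequence' column required by clean_split().
if __name__ == '__main__':
    import pandas as pd
    toy = pd.DataFrame({
        'user_id':    [1, 1, 2, 2],
        'session_id': [11, 12, 21, 22],
        'ts':         [100, 200, 150, 250],
        'sequence':   [['a', 'b'], ['b', 'c'], ['a', 'c'], ['c', 'd']],
    })
    train, test = last_session_out_split(toy)
    print(train['session_id'].tolist())  # earlier sessions per user: [11, 21]
    print(test['sequence'].tolist())     # last sessions, with items unseen in training removed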
--------------------------------------------------------------------------------
/util/tree/Tree.py:
--------------------------------------------------------------------------------
1 | import uuid
2 |
3 | import treelib
4 |
5 |
6 | class SmartTree(treelib.Tree):
7 | _PATH_NOT_FOUND = -1
8 |
9 | def find_path(self, origin, path):
10 | """
11 | Takes the nodeId from which to start the path search and the path to look for.
12 | :returns: -1 if the path is not found, the nodeId of its last node if the path is found
13 | """
14 |
15 | if not path:
16 | # path found
17 | return origin
18 |
19 | res = self._PATH_NOT_FOUND
20 |
21 | for nodeId in self[origin].fpointer:
22 | node = self[nodeId]
23 | if node.tag == path[0]:
24 | res = self.find_path(nodeId, path[1:])
25 | break
26 |
27 | if res is None:
28 | # path not found
29 | return self._PATH_NOT_FOUND
30 | else:
31 | return res
32 |
33 | def longest_subpath(self, origin, path):
34 | """
35 | Takes the nodeId from which to start the path search and the path to look for.
36 | :returns: the nodeId of the node where the path breaks and the number of elements missing to complete the path
37 | """
38 |
39 | if not path: # path empty, all nodes matched
40 | # path found
41 | return origin, 0
42 |
43 | res = ()
44 |
45 | for nodeId in self[origin].fpointer:
46 | node = self[nodeId]
47 | if node.tag == path[0]:
48 | res = self.longest_subpath(nodeId, path[1:])
49 | break
50 |
51 | if res == ():
52 | # path not found
53 | return origin, len(path)
54 | else:
55 | return res
56 |
57 | def add_path(self, origin, path, support=None):
58 | """add a path, starting from origin"""
59 | sub = self.longest_subpath(origin, path)
60 | if sub[1] == 0:
61 | # path already exists, updating support
62 | self[sub[0]].data = {'support': support}
63 |
64 | else:
65 | # add what's missing
66 | missingPath = path[-sub[1]:]
67 |
68 | par = sub[0]
69 | for item in missingPath:
70 | itemId = uuid.uuid4()
71 | self.create_node(item, itemId, parent=par, data={'support': support})
72 | par = itemId
73 |
74 | def path_is_valid(self, path):
75 | return path != self._PATH_NOT_FOUND
76 |
77 | def create_node(self, tag=None, identifier=None, parent=None, data=None):
78 | """override to get a random id if none provided"""
79 | id = uuid.uuid4() if identifier is None else identifier
80 | if id == self._PATH_NOT_FOUND:
81 | raise NameError("Cannot create a node with special id " + str(self._PATH_NOT_FOUND))
82 | super(SmartTree, self).create_node(tag, id, parent, data)
83 |
84 | def set_root(self, root_tag=None, root_id=None):
85 | id = uuid.uuid4()
86 | root_id = root_id if root_id is not None else id
87 | root_tag = root_tag if root_tag is not None else 'root'
88 | self.create_node(root_tag, root_id)
89 | self.root = root_id
90 | return root_id
91 |
92 | def get_root(self):
93 | try:
94 | return self.root
95 | except AttributeError:
96 | return None
97 |
98 | def find_n_length_paths(self, origin, length, exclude_origin=True):
99 |
100 | if length == 0:
101 | return [[]] if exclude_origin else [[origin]]
102 |
103 | else:
104 | children = self[origin].fpointer
105 | paths = []
106 | for c in children:
107 | children_paths = self.find_n_length_paths(c, length - 1, False)
108 | # if there are no children, the whole partial path is dropped;
109 | # that is how only paths of exactly the wanted length are kept
110 | l = list(map(lambda x: [] + x, children_paths)) if exclude_origin else list(
111 | map(lambda x: [origin] + x, children_paths))
112 | for el in l:
113 | paths.append(el)
114 | return paths
115 |
116 | def get_paths_tag(self, list_of_paths):
117 | return list(map(lambda x: self.get_nodes_tag(x), list_of_paths))
118 |
119 | def get_nodes_tag(self, list_of_nids):
120 | return list(map(lambda y: self[y].tag, list_of_nids))
121 |
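# Small illustrative sketch of the SmartTree API defined above (not part of the
# original module).
if __name__ == '__main__':
    t = SmartTree()
    root = t.set_root()
    t.add_path(root, ['a', 'b', 'c'], support=2)
    t.add_path(root, ['a', 'b', 'd'], support=1)

    node = t.find_path(root, ['a', 'b'])
    print(t.path_is_valid(node))              # True: the prefix 'a' -> 'b' exists
    print(t.get_nodes_tag(t[node].fpointer))  # children tags of that node, e.g. ['c', 'd'] (order may vary)

    # tag paths of exactly length 2 below the root
    print(t.get_paths_tag(t.find_n_length_paths(root, 2)))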
--------------------------------------------------------------------------------
/util/tree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/tree/__init__.py
--------------------------------------------------------------------------------