├── .gitignore ├── 00_TopPopular.ipynb ├── 01_FrequentSeqMining.ipynb ├── 02_MarkovChain.ipynb ├── 03_FPMC.ipynb ├── 04_Prod2Vec.ipynb ├── 05_SessionBasedRNN.ipynb ├── 06_PersonalizedRNN.ipynb ├── 07_KNN.ipynb ├── LICENSE ├── README.md ├── datasets └── sessions.zip ├── environment.yml ├── gifs └── sequential_eval.gif ├── images ├── fpmc.png ├── gru4rec.png ├── hgru4rec.png ├── prod2vec.png ├── running_notebooks_1.png ├── running_notebooks_2.png └── running_notebooks_3.png ├── recommenders ├── FPMCRecommender.py ├── FSMRecommender.py ├── ISeqRecommender.py ├── KNNRecommender.py ├── MarkovChainRecommender.py ├── MixedMarkovRecommender.py ├── PopularityRecommender.py ├── Prod2VecRecommender.py ├── RNNRecommender.py ├── SupervisedRecommender.py └── __init__.py ├── slides ├── TheWebConf2019_01_Introduction.pdf ├── TheWebConf2019_02_Algorithms.pdf └── TheWebConf2019_03_Evaluation.pdf ├── spmf └── spmf.jar └── util ├── SPMFinterface.py ├── __init__.py ├── data_expansion.py ├── data_utils.py ├── evaluation.py ├── fpmc ├── FPMC.py ├── FPMC_numba.py ├── __init__.py └── utils.py ├── knn ├── __init__.py ├── iknn.py ├── sfsknn.py ├── sknn.py ├── ssknn.py └── vmsknn.py ├── markov └── Markov.py ├── metrics.py ├── rnn ├── __init__.py ├── gpu_ops.py ├── gru4rec.py └── hgru4rec.py ├── split.py └── tree ├── Tree.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea/* 91 | .idea/ 92 | recpy/.idea/ 93 | .DS_Store 94 | 95 | # custom 96 | datasets/ 97 | -------------------------------------------------------------------------------- /00_TopPopular.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Table of contents\n", 8 | "\n", 9 | "1. [Load the dataset](#load_the_dataset)\n", 10 | "2. [Split the dataset](#split_the_dataset)\n", 11 | "3. [Fitting the recommender](#fitting)\n", 12 | "4. 
[Sequential evaluation](#seq_evaluation) \n", 13 | " 4.1 [Evaluation with sequentially revaeled user profiles](#eval_seq_rev) \n", 14 | " 4.2 [Evaluation with \"static\" user profiles](#eval_static) \n", 15 | "5. [Analysis of next-item recommendation](#next-item) \n", 16 | " 5.1 [Evaluation with different recommendation list lengths](#next-item_list_length) \n", 17 | " 5.2 [Evaluation with different user profile lengths](#next-item_profile_length)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "from util.data_utils import create_seq_db_filter_top_k\n", 39 | "from util.split import last_session_out_split\n", 40 | "from util.metrics import precision, recall, mrr\n", 41 | "from util import evaluation\n", 42 | "from recommenders.PopularityRecommender import PopularityRecommender" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import datetime" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "def get_test_sequences(test_data, given_k):\n", 61 | " # we can run evaluation only over sequences longer than abs(LAST_K)\n", 62 | " test_sequences = test_data.loc[test_data['sequence'].map(len) > abs(given_k), 'sequence'].values\n", 63 | " return test_sequences" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# 1. Load the dataset\n", 78 | "\n", 79 | "For this hands-on session we will use a dataset of user-listening sessions crawled from [last.fm](https://www.last.fm/). In detail, we will use a subset of the following dataset:\n", 80 | "\n", 81 | "* 30Music listening and playlists dataset, Turrin et al., ACM RecSys 2015 ([paper](https://home.deib.polimi.it/pagano/portfolio/papers/30Musiclisteningandplaylistsdataset.pdf))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# unzip the dataset, if you haven't already done it\n", 91 | "# ! unzip datasets/sessions.zip -d datasets" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "! 
ls datasets/" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "dataset_path = 'datasets/sessions.csv'\n", 110 | "# load this sample if you experience a severe slowdown with the previous dataset\n", 111 | "dataset_path = 'datasets/sessions_sample_10.csv'\n", 112 | "\n", 113 | "# for the sake of speed, let's keep only the top-1k most popular items in the last month\n", 114 | "dataset = create_seq_db_filter_top_k(path=dataset_path, topk=1000, last_months=1) " 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Let's see at how the dataset looks like" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "dataset.head()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "Let's show some statistics about the dataset" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from collections import Counter\n", 147 | "cnt = Counter()\n", 148 | "dataset.sequence.map(cnt.update);" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "sequence_length = dataset.sequence.map(len).values\n", 158 | "n_sessions_per_user = dataset.groupby('user_id').size()\n", 159 | "\n", 160 | "print('Number of items: {}'.format(len(cnt)))\n", 161 | "print('Number of users: {}'.format(dataset.user_id.nunique()))\n", 162 | "print('Number of sessions: {}'.format(len(dataset)) )\n", 163 | "\n", 164 | "print('\\nSession length:\\n\\tAverage: {:.2f}\\n\\tMedian: {}\\n\\tMin: {}\\n\\tMax: {}'.format(\n", 165 | " sequence_length.mean(), \n", 166 | " np.quantile(sequence_length, 0.5), \n", 167 | " sequence_length.min(), \n", 168 | " sequence_length.max()))\n", 169 | "\n", 170 | "print('Sessions per user:\\n\\tAverage: {:.2f}\\n\\tMedian: {}\\n\\tMin: {}\\n\\tMax: {}'.format(\n", 171 | " n_sessions_per_user.mean(), \n", 172 | " np.quantile(n_sessions_per_user, 0.5), \n", 173 | " n_sessions_per_user.min(), \n", 174 | " n_sessions_per_user.max()))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "print('Most popular items: {}'.format(cnt.most_common(5)))" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "# 2. Split the dataset" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "For simplicity, let's split the dataset by assigning the **last session** of every user to the **test set**, and **all the previous** ones to the **training set**." 
205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "train_data, test_data = last_session_out_split(dataset)\n", 214 | "print(\"Train sessions: {} - Test sessions: {}\".format(len(train_data), len(test_data)))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "# 3. Fitting the recommender\n", 229 | "\n", 230 | "Here we fit the recommedation algorithm over the sessions in the training set.\n", 231 | "\n", 232 | "`PopularityRecommender` simply recommends items ordered by their popularity in the training set. \n", 233 | "`PopularityRecommender` doesn't have any hyper-parameter, so we can move on!" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "recommender = PopularityRecommender()\n", 243 | "recommender.fit(train_data)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "\n" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "# 4. Sequential evaluation\n", 258 | "\n", 259 | "In the evaluation of sequence-aware recommenders, each sequence in the test set is split into:\n", 260 | "- the _user profile_, used to compute recommendations, is composed by the first *k* events in the sequence;\n", 261 | "- the _ground truth_, used for performance evaluation, is composed by the remainder of the sequence.\n", 262 | "\n", 263 | "In the cells below, you can control the dimension of the _user profile_ by assigning a **positive** value to `GIVEN_K`, which correspond to the number of events from the beginning of the sequence that will be assigned to the initial user profile. This ensures that each user profile in the test set will have exactly the same initial size, but the size of the ground truth will change for every sequence.\n", 264 | "\n", 265 | "Alternatively, by assigning a **negative** value to `GIVEN_K`, you will set the initial size of the _ground truth_. In this way the _ground truth_ will have the same size for all sequences, but the dimension of the user profile will differ." 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "METRICS = {'precision':precision, \n", 275 | " 'recall':recall,\n", 276 | " 'mrr': mrr}\n", 277 | "TOPN=100 # length of the recommendation list" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "## 4.1 Evaluation with sequentially revealed user-profiles\n", 292 | "\n", 293 | "Here we evaluate the quality of the recommendations in a setting in which user profiles are revealed _sequentially_.\n", 294 | "\n", 295 | "The _user profile_ starts from the first `GIVEN_K` events (or, alternatively, from the last `-GIVEN_K` events if `GIVEN_K<0`). \n", 296 | "The recommendations are evaluated against the next `LOOK_AHEAD` events (the _ground truth_). 
\n", 297 | "The _user profile_ is next expanded to the next `STEP` events, the ground truth is scrolled forward accordingly, and the evaluation continues until the sequence ends.\n", 298 | "\n", 299 | "In typical **next-item recommendation**, we start with `GIVEN_K=1`, generate a set of **alternatives** that will evaluated against the next event in the sequence (`LOOK_AHEAD=1`), move forward of one step (`STEP=1`) and repeat until the sequence ends.\n", 300 | "\n", 301 | "You can set the `LOOK_AHEAD='all'` to see what happens if you had to recommend a **whole sequence** instead of a set of a set of alternatives to a user.\n", 302 | "\n", 303 | "NOTE: Metrics are averaged over each sequence first, then averaged over all test sequences.\n", 304 | "\n", 305 | "** (TODO) Try out with different evaluation settings to see how the recommandation quality changes. **\n", 306 | "\n", 307 | "\n", 308 | "![](gifs/sequential_eval.gif)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "# GIVEN_K=1, LOOK_AHEAD=1, STEP=1 corresponds to the classical next-item evaluation\n", 318 | "GIVEN_K = 1\n", 319 | "LOOK_AHEAD = 1\n", 320 | "STEP=1" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n", 330 | "print('{} sequences available for evaluation'.format(len(test_sequences)))\n", 331 | "\n", 332 | "results = evaluation.sequential_evaluation(recommender,\n", 333 | " test_sequences=test_sequences,\n", 334 | " given_k=GIVEN_K,\n", 335 | " look_ahead=LOOK_AHEAD,\n", 336 | " evaluation_functions=METRICS.values(),\n", 337 | " top_n=TOPN,\n", 338 | " scroll=True, # scrolling averages metrics over all profile lengths\n", 339 | " step=STEP)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))\n", 349 | "for mname, mvalue in zip(METRICS.keys(), results):\n", 350 | " print('\\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "## 4.2 Evaluation with \"static\" user-profiles\n", 365 | "\n", 366 | "Here we evaluate the quality of the recommendations in a setting in which user profiles are instead _static_.\n", 367 | "\n", 368 | "The _user profile_ starts from the first `GIVEN_K` events (or, alternatively, from the last `-GIVEN_K` events if `GIVEN_K<0`). \n", 369 | "The recommendations are evaluated against the next `LOOK_AHEAD` events (the _ground truth_). 
\n",
370 | "\n",
371 | "The user profile is *not extended* and the ground truth *doesn't move forward*.\n",
372 | "This allows us to obtain \"snapshots\" of the recommendation performance for different user profile and ground truth lengths.\n",
373 | "\n",
374 | "Also here you can set `LOOK_AHEAD='all'` to see what happens if you had to recommend a **whole sequence** instead of a set of alternatives to a user.\n",
375 | "\n",
376 | "**(TODO) Try out different evaluation settings to see how the recommendation quality changes.**"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "GIVEN_K = 1\n",
386 | "LOOK_AHEAD = 'all'\n",
387 | "STEP=1"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n",
397 | "print('{} sequences available for evaluation'.format(len(test_sequences)))\n",
398 | "\n",
399 | "results = evaluation.sequential_evaluation(recommender,\n",
400 | "                                           test_sequences=test_sequences,\n",
401 | "                                           given_k=GIVEN_K,\n",
402 | "                                           look_ahead=LOOK_AHEAD,\n",
403 | "                                           evaluation_functions=METRICS.values(),\n",
404 | "                                           top_n=TOPN,\n",
405 | "                                           scroll=False # notice that scrolling is disabled!\n",
406 | "                                           ) "
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": null,
412 | "metadata": {},
413 | "outputs": [],
414 | "source": [
415 | "print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))\n",
416 | "for mname, mvalue in zip(METRICS.keys(), results):\n",
417 | "    print('\\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))"
418 | ]
419 | },
420 | {
421 | "cell_type": "markdown",
422 | "metadata": {},
423 | "source": [
424 | ""
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | "## 5. Analysis of next-item recommendation\n",
432 | "\n",
433 | "Here we propose to analyse the performance of the recommender system in the scenario of *next-item recommendation* over the following dimensions:\n",
434 | "\n",
435 | "* the *length* of the **recommendation list**, and\n",
436 | "* the *length* of the **user profile**.\n",
437 | "\n",
438 | "NOTE: This evaluation is by no means exhaustive, as the hyper-parameters of the recommendation algorithm should be *carefully tuned* before drawing any conclusions. Unfortunately, given the time constraints for this tutorial, we had to leave hyper-parameter tuning out. 
A very useful reference about careful evaluation of (session-based) recommenders can be found at:\n", 439 | "\n", 440 | "* Evaluation of Session-based Recommendation Algorithms, Ludewig and Jannach, 2018 ([paper](https://arxiv.org/abs/1803.09587))" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "### 5.1 Evaluation for different recommendation list lengths" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "GIVEN_K = 1\n", 464 | "LOOK_AHEAD = 1\n", 465 | "STEP = 1\n", 466 | "topn_list = [1, 5, 10, 20, 50, 100]" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "# ensure that all sequences have the same minimum length \n", 476 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n", 477 | "print('{} sequences available for evaluation'.format(len(test_sequences)))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "res_list = []\n", 487 | "\n", 488 | "for topn in topn_list:\n", 489 | " print('Evaluating recommendation lists with length: {}'.format(topn))\n", 490 | " res_tmp = evaluation.sequential_evaluation(recommender,\n", 491 | " test_sequences=test_sequences,\n", 492 | " given_k=GIVEN_K,\n", 493 | " look_ahead=LOOK_AHEAD,\n", 494 | " evaluation_functions=METRICS.values(),\n", 495 | " top_n=topn,\n", 496 | " scroll=True, # here we average over all profile lengths\n", 497 | " step=STEP)\n", 498 | " mvalues = list(zip(METRICS.keys(), res_tmp))\n", 499 | " res_list.append((topn, mvalues))" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "# show separate plots per metric\n", 509 | "fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))\n", 510 | "res_list_t = list(zip(*res_list))\n", 511 | "for midx, metric in enumerate(METRICS):\n", 512 | " mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]\n", 513 | " ax = axes[midx]\n", 514 | " ax.plot(topn_list, mvalues)\n", 515 | " ax.set_title(metric)\n", 516 | " ax.set_xticks(topn_list)\n", 517 | " ax.set_xlabel('List length')" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "### 5.2 Evaluation for different user profile lengths" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "given_k_list = [1, 2, 3, 4]\n", 541 | "LOOK_AHEAD = 1\n", 542 | "STEP = 1\n", 543 | "TOPN = 20" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# ensure that all sequences have the same minimum length \n", 553 | "test_sequences = get_test_sequences(test_data, max(given_k_list))\n", 554 | "print('{} sequences available for evaluation'.format(len(test_sequences)))" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "res_list = []\n", 
564 | "\n", 565 | "for gk in given_k_list:\n", 566 | " print('Evaluating profiles having length: {}'.format(gk))\n", 567 | " res_tmp = evaluation.sequential_evaluation(recommender,\n", 568 | " test_sequences=test_sequences,\n", 569 | " given_k=gk,\n", 570 | " look_ahead=LOOK_AHEAD,\n", 571 | " evaluation_functions=METRICS.values(),\n", 572 | " top_n=TOPN,\n", 573 | " scroll=False, # here we stop at each profile length\n", 574 | " step=STEP)\n", 575 | " mvalues = list(zip(METRICS.keys(), res_tmp))\n", 576 | " res_list.append((gk, mvalues))" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "# show separate plots per metric\n", 586 | "fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))\n", 587 | "res_list_t = list(zip(*res_list))\n", 588 | "for midx, metric in enumerate(METRICS):\n", 589 | " mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]\n", 590 | " ax = axes[midx]\n", 591 | " ax.plot(given_k_list, mvalues)\n", 592 | " ax.set_title(metric)\n", 593 | " ax.set_xticks(given_k_list)\n", 594 | " ax.set_xlabel('Profile length')" 595 | ] 596 | } 597 | ], 598 | "metadata": { 599 | "kernelspec": { 600 | "display_name": "srs", 601 | "language": "python", 602 | "name": "srs" 603 | }, 604 | "language_info": { 605 | "codemirror_mode": { 606 | "name": "ipython", 607 | "version": 3 608 | }, 609 | "file_extension": ".py", 610 | "mimetype": "text/x-python", 611 | "name": "python", 612 | "nbconvert_exporter": "python", 613 | "pygments_lexer": "ipython3", 614 | "version": "3.6.6" 615 | } 616 | }, 617 | "nbformat": 4, 618 | "nbformat_minor": 2 619 | } 620 | -------------------------------------------------------------------------------- /02_MarkovChain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Table of contents\n", 8 | "\n", 9 | "1. [Load the dataset](#load_the_dataset)\n", 10 | "2. [Split the dataset](#split_the_dataset)\n", 11 | "3. [Fitting the recommender](#fitting)\n", 12 | "4. [Sequential evaluation](#seq_evaluation) \n", 13 | " 4.1 [Evaluation with sequentially revaeled user profiles](#eval_seq_rev) \n", 14 | " 4.2 [Evaluation with \"static\" user profiles](#eval_static) \n", 15 | "5. 
[Analysis of next-item recommendation](#next-item) \n", 16 | " 5.1 [Evaluation with different recommendation list lengths](#next-item_list_length) \n", 17 | " 5.2 [Evaluation with different user profile lengths](#next-item_profile_length)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "from util.data_utils import create_seq_db_filter_top_k, sequences_to_spfm_format\n", 39 | "from util.split import last_session_out_split\n", 40 | "from util.metrics import precision, recall, mrr\n", 41 | "from util import evaluation\n", 42 | "from recommenders.MixedMarkovRecommender import MixedMarkovChainRecommender" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import datetime" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "def get_test_sequences(test_data, given_k):\n", 61 | " # we can run evaluation only over sequences longer than abs(LAST_K)\n", 62 | " test_sequences = test_data.loc[test_data['sequence'].map(len) > abs(given_k), 'sequence'].values\n", 63 | " return test_sequences" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# 1. Load the dataset\n", 78 | "\n", 79 | "For this hands-on session we will use a dataset of user-listening sessions crawled from [last.fm](https://www.last.fm/). In detail, we will use a subset of the following dataset:\n", 80 | "\n", 81 | "* 30Music listening and playlists dataset, Turrin et al., ACM RecSys 2015 ([paper](https://home.deib.polimi.it/pagano/portfolio/papers/30Musiclisteningandplaylistsdataset.pdf))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# unzip the dataset, if you haven't already done it\n", 91 | "# ! 
unzip datasets/sessions.zip -d datasets" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "dataset_path = 'datasets/sessions.csv'\n", 101 | "# load this sample if you experience a severe slowdown with the previous dataset\n", 102 | "#dataset_path = 'datasets/sessions_sample_10.csv'\n", 103 | "\n", 104 | "# for the sake of speed, let's keep only the top-1k most popular items in the last month\n", 105 | "dataset = create_seq_db_filter_top_k(path=dataset_path, topk=1000, last_months=1) " 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Let's see at how the dataset looks like" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "dataset.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Let's show some statistics about the dataset" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from collections import Counter\n", 138 | "cnt = Counter()\n", 139 | "dataset.sequence.map(cnt.update);" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "sequence_length = dataset.sequence.map(len).values\n", 149 | "n_sessions_per_user = dataset.groupby('user_id').size()\n", 150 | "\n", 151 | "print('Number of items: {}'.format(len(cnt)))\n", 152 | "print('Number of users: {}'.format(dataset.user_id.nunique()))\n", 153 | "print('Number of sessions: {}'.format(len(dataset)) )\n", 154 | "\n", 155 | "print('\\nSession length:\\n\\tAverage: {:.2f}\\n\\tMedian: {}\\n\\tMin: {}\\n\\tMax: {}'.format(\n", 156 | " sequence_length.mean(), \n", 157 | " np.quantile(sequence_length, 0.5), \n", 158 | " sequence_length.min(), \n", 159 | " sequence_length.max()))\n", 160 | "\n", 161 | "print('Sessions per user:\\n\\tAverage: {:.2f}\\n\\tMedian: {}\\n\\tMin: {}\\n\\tMax: {}'.format(\n", 162 | " n_sessions_per_user.mean(), \n", 163 | " np.quantile(n_sessions_per_user, 0.5), \n", 164 | " n_sessions_per_user.min(), \n", 165 | " n_sessions_per_user.max()))" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "print('Most popular items: {}'.format(cnt.most_common(5)))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "# 2. Split the dataset" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "For simplicity, let's split the dataset by assigning the **last session** of every user to the **test set**, and **all the previous** ones to the **training set**." 
196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "train_data, test_data = last_session_out_split(dataset)\n", 205 | "print(\"Train sessions: {} - Test sessions: {}\".format(len(train_data), len(test_data)))" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "# 3. Fitting the recommender\n", 220 | "\n", 221 | "Here we fit the recommedation algorithm over the sessions in the training set. \n", 222 | "This recommender is based on the `MarkovChainRecommender` implemented from:\n", 223 | "\n", 224 | "_Shani, Guy, David Heckerman, and Ronen I. Brafman. \"An MDP-based recommender system.\" Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. Chapter 3-4_\n", 225 | "\n", 226 | "This recommender computes the item transition matrices for any Markov Chain having order in `[min_order, max_order]`. Each individual Markov Chain model employes some heristics like skipping or clustering to deal better with data sparsity. Recommendations are generated by sorting items by their transition probability to being next, given the user profile. The scores coming from different MC are weighted _inversely_ wrt to their order.\n", 227 | "\n", 228 | "The class `MixedMarkovChainRecommender` has the following initialization hyper-parameters:\n", 229 | "* `min_order`: the minimum order of the Mixed Markov Chain\n", 230 | "* `max_order`: the maximum order of the Mixed Markov Chain\n" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# You can try with max_order=2 or higher too, but it will take some time to complete though due to slow heristic computations\n", 240 | "recommender = MixedMarkovChainRecommender(min_order=1, \n", 241 | " max_order=1)\n", 242 | "recommender.fit(train_data)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "\n" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# 4. Sequential evaluation\n", 257 | "\n", 258 | "In the evaluation of sequence-aware recommenders, each sequence in the test set is split into:\n", 259 | "- the _user profile_, used to compute recommendations, is composed by the first *k* events in the sequence;\n", 260 | "- the _ground truth_, used for performance evaluation, is composed by the remainder of the sequence.\n", 261 | "\n", 262 | "In the cells below, you can control the dimension of the _user profile_ by assigning a **positive** value to `GIVEN_K`, which correspond to the number of events from the beginning of the sequence that will be assigned to the initial user profile. This ensures that each user profile in the test set will have exactly the same initial size, but the size of the ground truth will change for every sequence.\n", 263 | "\n", 264 | "Alternatively, by assigning a **negative** value to `GIVEN_K`, you will set the initial size of the _ground truth_. In this way the _ground truth_ will have the same size for all sequences, but the dimension of the user profile will differ." 
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "METRICS = {'precision':precision, \n", 274 | " 'recall':recall,\n", 275 | " 'mrr': mrr}\n", 276 | "TOPN = 10 # length of the recommendation list" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "## 4.1 Evaluation with sequentially revealed user-profiles\n", 291 | "\n", 292 | "Here we evaluate the quality of the recommendations in a setting in which user profiles are revealed _sequentially_.\n", 293 | "\n", 294 | "The _user profile_ starts from the first `GIVEN_K` events (or, alternatively, from the last `-GIVEN_K` events if `GIVEN_K<0`). \n", 295 | "The recommendations are evaluated against the next `LOOK_AHEAD` events (the _ground truth_). \n", 296 | "The _user profile_ is next expanded to the next `STEP` events, the ground truth is scrolled forward accordingly, and the evaluation continues until the sequence ends.\n", 297 | "\n", 298 | "In typical **next-item recommendation**, we start with `GIVEN_K=1`, generate a set of **alternatives** that will evaluated against the next event in the sequence (`LOOK_AHEAD=1`), move forward of one step (`STEP=1`) and repeat until the sequence ends.\n", 299 | "\n", 300 | "You can set the `LOOK_AHEAD='all'` to see what happens if you had to recommend a **whole sequence** instead of a set of a set of alternatives to a user.\n", 301 | "\n", 302 | "NOTE: Metrics are averaged over each sequence first, then averaged over all test sequences.\n", 303 | "\n", 304 | "** (TODO) Try out with different evaluation settings to see how the recommandation quality changes. 
**\n", 305 | "\n", 306 | "\n", 307 | "![](gifs/sequential_eval.gif)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "# GIVEN_K=1, LOOK_AHEAD=1, STEP=1 corresponds to the classical next-item evaluation\n", 317 | "GIVEN_K = 1\n", 318 | "LOOK_AHEAD = 1\n", 319 | "STEP=1" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n", 329 | "print('{} sequences available for evaluation'.format(len(test_sequences)))\n", 330 | "\n", 331 | "results = evaluation.sequential_evaluation(recommender,\n", 332 | " test_sequences=test_sequences,\n", 333 | " given_k=GIVEN_K,\n", 334 | " look_ahead=LOOK_AHEAD,\n", 335 | " evaluation_functions=METRICS.values(),\n", 336 | " top_n=TOPN,\n", 337 | " scroll=True, # scrolling averages metrics over all profile lengths\n", 338 | " step=STEP)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))\n", 348 | "for mname, mvalue in zip(METRICS.keys(), results):\n", 349 | " print('\\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## 4.2 Evaluation with \"static\" user-profiles\n", 364 | "\n", 365 | "Here we evaluate the quality of the recommendations in a setting in which user profiles are instead _static_.\n", 366 | "\n", 367 | "The _user profile_ starts from the first `GIVEN_K` events (or, alternatively, from the last `-GIVEN_K` events if `GIVEN_K<0`). \n", 368 | "The recommendations are evaluated against the next `LOOK_AHEAD` events (the _ground truth_). 
\n",
369 | "\n",
370 | "The user profile is *not extended* and the ground truth *doesn't move forward*.\n",
371 | "This allows us to obtain \"snapshots\" of the recommendation performance for different user profile and ground truth lengths.\n",
372 | "\n",
373 | "Also here you can set `LOOK_AHEAD='all'` to see what happens if you had to recommend a **whole sequence** instead of a set of alternatives to a user.\n",
374 | "\n",
375 | "**(TODO) Try out different evaluation settings to see how the recommendation quality changes.**"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "GIVEN_K = 1\n",
385 | "LOOK_AHEAD = 'all'\n",
386 | "STEP=1"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n",
396 | "print('{} sequences available for evaluation'.format(len(test_sequences)))\n",
397 | "\n",
398 | "results = evaluation.sequential_evaluation(recommender,\n",
399 | "                                           test_sequences=test_sequences,\n",
400 | "                                           given_k=GIVEN_K,\n",
401 | "                                           look_ahead=LOOK_AHEAD,\n",
402 | "                                           evaluation_functions=METRICS.values(),\n",
403 | "                                           top_n=TOPN,\n",
404 | "                                           scroll=False # notice that scrolling is disabled!\n",
405 | "                                           ) "
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))\n",
415 | "for mname, mvalue in zip(METRICS.keys(), results):\n",
416 | "    print('\\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | ""
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "## 5. Analysis of next-item recommendation\n",
431 | "\n",
432 | "Here we propose to analyse the performance of the recommender system in the scenario of *next-item recommendation* over the following dimensions:\n",
433 | "\n",
434 | "* the *length* of the **recommendation list**, and\n",
435 | "* the *length* of the **user profile**.\n",
436 | "\n",
437 | "NOTE: This evaluation is by no means exhaustive, as the hyper-parameters of the recommendation algorithm should be *carefully tuned* before drawing any conclusions. Unfortunately, given the time constraints for this tutorial, we had to leave hyper-parameter tuning out. 
A very useful reference about careful evaluation of (session-based) recommenders can be found at:\n", 438 | "\n", 439 | "* Evaluation of Session-based Recommendation Algorithms, Ludewig and Jannach, 2018 ([paper](https://arxiv.org/abs/1803.09587))" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "### 5.1 Evaluation for different recommendation list lengths" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "GIVEN_K = 1\n", 463 | "LOOK_AHEAD = 1\n", 464 | "STEP = 1\n", 465 | "topn_list = [1, 5, 10, 20, 50, 100]" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "# ensure that all sequences have the same minimum length \n", 475 | "test_sequences = get_test_sequences(test_data, GIVEN_K)\n", 476 | "print('{} sequences available for evaluation'.format(len(test_sequences)))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "res_list = []\n", 486 | "\n", 487 | "for topn in topn_list:\n", 488 | " print('Evaluating recommendation lists with length: {}'.format(topn))\n", 489 | " res_tmp = evaluation.sequential_evaluation(recommender,\n", 490 | " test_sequences=test_sequences,\n", 491 | " given_k=GIVEN_K,\n", 492 | " look_ahead=LOOK_AHEAD,\n", 493 | " evaluation_functions=METRICS.values(),\n", 494 | " top_n=topn,\n", 495 | " scroll=True, # here we average over all profile lengths\n", 496 | " step=STEP)\n", 497 | " mvalues = list(zip(METRICS.keys(), res_tmp))\n", 498 | " res_list.append((topn, mvalues))" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "# show separate plots per metric\n", 508 | "fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))\n", 509 | "res_list_t = list(zip(*res_list))\n", 510 | "for midx, metric in enumerate(METRICS):\n", 511 | " mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]\n", 512 | " ax = axes[midx]\n", 513 | " ax.plot(topn_list, mvalues)\n", 514 | " ax.set_title(metric)\n", 515 | " ax.set_xticks(topn_list)\n", 516 | " ax.set_xlabel('List length')" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "### 5.2 Evaluation for different user profile lengths" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "given_k_list = [1, 2, 3, 4]\n", 540 | "LOOK_AHEAD = 1\n", 541 | "STEP = 1\n", 542 | "TOPN = 20" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "# ensure that all sequences have the same minimum length \n", 552 | "test_sequences = get_test_sequences(test_data, max(given_k_list))\n", 553 | "print('{} sequences available for evaluation'.format(len(test_sequences)))" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "res_list = []\n", 
563 | "\n", 564 | "for gk in given_k_list:\n", 565 | " print('Evaluating profiles having length: {}'.format(gk))\n", 566 | " res_tmp = evaluation.sequential_evaluation(recommender,\n", 567 | " test_sequences=test_sequences,\n", 568 | " given_k=gk,\n", 569 | " look_ahead=LOOK_AHEAD,\n", 570 | " evaluation_functions=METRICS.values(),\n", 571 | " top_n=TOPN,\n", 572 | " scroll=False, # here we stop at each profile length\n", 573 | " step=STEP)\n", 574 | " mvalues = list(zip(METRICS.keys(), res_tmp))\n", 575 | " res_list.append((gk, mvalues))" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [ 584 | "# show separate plots per metric\n", 585 | "fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))\n", 586 | "res_list_t = list(zip(*res_list))\n", 587 | "for midx, metric in enumerate(METRICS):\n", 588 | " mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]\n", 589 | " ax = axes[midx]\n", 590 | " ax.plot(given_k_list, mvalues)\n", 591 | " ax.set_title(metric)\n", 592 | " ax.set_xticks(given_k_list)\n", 593 | " ax.set_xlabel('Profile length')" 594 | ] 595 | } 596 | ], 597 | "metadata": { 598 | "kernelspec": { 599 | "display_name": "srs", 600 | "language": "python", 601 | "name": "srs" 602 | }, 603 | "language_info": { 604 | "codemirror_mode": { 605 | "name": "ipython", 606 | "version": 3 607 | }, 608 | "file_extension": ".py", 609 | "mimetype": "text/x-python", 610 | "name": "python", 611 | "nbconvert_exporter": "python", 612 | "pygments_lexer": "ipython3", 613 | "version": "3.6.6" 614 | } 615 | }, 616 | "nbformat": 4, 617 | "nbformat_minor": 2 618 | } 619 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Massimo Quadrana 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Tutorial on Sequence-Aware Recommender Systems
3 | 
4 | This repository contains the material used in the hands-on session of the tutorials on Sequence-Aware Recommenders we gave
5 | at [TheWebConf 2019](https://www2019.thewebconf.org/tutorials) and [ACM RecSys 2018](https://recsys.acm.org/recsys18/tutorials/#content-tab-1-4-tab).
6 | 
7 | ## ACM CSUR Paper and TheWebConf 2019 Slides
8 | 
9 | ### ACM Computing Surveys (CSUR) Paper
10 | Sequence-Aware Recommender Systems
11 | Massimo Quadrana, Paolo Cremonesi, Dietmar Jannach
12 | ACM Computing Surveys (CSUR), 2018
13 | 
14 | ### TheWebConf 2019 Slides
15 | 1. [Introduction](slides/TheWebConf2019_01_Introduction.pdf)
16 | 2. [Algorithms](slides/TheWebConf2019_02_Algorithms.pdf)
17 | 3. [Evaluation](slides/TheWebConf2019_03_Evaluation.pdf)
18 | 
19 | ## Running the code
20 | 
21 | You have two options to run the code contained in this repository:
22 | 1. Set up a new environment on your local machine and run the code locally (_highly recommended_).
23 | 2. Launch a new Binder instance by clicking on this badge [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/mquad/sars_tutorial/master).
24 | 
25 | While we all know that setting up a new local environment is a slightly tedious process, Binder instances have strict resource limits (1-2GB of memory, max 100 concurrent users per repository).
26 | Also beware that Binder sessions automatically expire after 10 minutes of inactivity!
27 | So we *highly recommend* setting up a new local environment in advance by following the [Setup instructions](#setup-instructions).
28 | 
29 | ### Setup instructions
30 | 
31 | 1. First of all, clone this project to your local machine:
32 | ```bash
33 | git clone https://github.com/mquad/sars_tutorial.git
34 | ```
35 | 
36 | 2. Now you need to set up a new Python 3 environment. We will use Anaconda/Miniconda for doing so.
37 | If you don't have Anaconda/Miniconda already installed on your machine, download [Miniconda](https://conda.io/miniconda.html) or [Anaconda](https://www.anaconda.com/download/) (**Python 3 version**).
38 | 
39 | 3. After that, install the environment for this hands-on by running:
40 | ```bash
41 | cd sars_tutorial/
42 | conda env create --file environment.yml
43 | ```
44 | 
45 | 4. (_Miniconda users only_) If you chose to install Miniconda above, you now have to install Jupyter Notebook on your machine by running `conda install jupyter`.
46 | You can do it in your main Python environment (not necessarily in the `srs` env), as long as you set up the kernel as explained below.
47 | Anaconda users should already have Jupyter Notebook installed, so they can skip this step.
48 | 
49 | 5. Then activate the environment with `source activate srs` or `conda activate srs`, and install a new `ipython` kernel by running:
50 | ```bash
51 | python -m ipykernel install --name srs
52 | ```
53 | If you get a "Permission denied" error with the above command, try
54 | ```bash
55 | python -m ipykernel install --name srs --user
56 | ```
57 | 
58 | 6. Finally, launch the Jupyter Notebook with
59 | ```bash
60 | jupyter notebook --port=8888
61 | ```
62 | and open it in your browser at the address `localhost:8888`. 
63 | (Beware, if port `8888` is already taken by another service, Jupyter Notebook will automatically open on a different one. Check out the startup log!). 64 | 65 | 66 | ### Running the notebooks 67 | 68 | The notebooks used in this hands-on are listed in the main directory of this project, as shown below: 69 | 70 | 71 | 72 | Click on the name of the notebook to open it in a new window. The name of each running notebook is highlighted in green 73 | (in the screen above, the notebook `00_TopPopular` is the only one running). 74 | 75 | Before starting to execute the notebook cells, you have to ensure that the kernel is properly set to `srs`, like in the screen below: 76 | 77 | ![](images/running_notebooks_2.png) 78 | 79 | If it's not your case, change the kernel to `srs` by clicking on `Kernel > Change kernel > srs` in the menu bar, as shown below: 80 | 81 | ![](images/running_notebooks_3.png) 82 | 83 | NOTE: this requires the installation of the `srs` kernel, as explained in the [Setup instructions](#setup-instructions). 84 | 85 | You can now start running the cells in the notebook! Yay! 86 | 87 | 88 | # Acknowledgments 89 | 90 | We want to sincerely thank [Umberto Di Fabrizio](https://www.linkedin.com/in/umbertodifabrizio) for the help in the development of this repository back when he was a MSc student at Politecnico di Milano. Great job Umberto! -------------------------------------------------------------------------------- /datasets/sessions.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/datasets/sessions.zip -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: srs 2 | channels: 3 | - defaults 4 | - conda-forge 5 | - anaconda 6 | dependencies: 7 | - cython 8 | - gensim=3.4.0 9 | - ipykernel=4.9.0 10 | - matplotlib 11 | - mkl-service 12 | - networkx=1.11 13 | - numba=0.39.0 14 | - numpy=1.15.1 15 | - pandas=0.23.4 16 | - scipy=1.1.0 17 | - theano=1.0.3 18 | - tqdm=4.25.0 19 | - pip: 20 | - treelib 21 | - pymining 22 | 23 | -------------------------------------------------------------------------------- /gifs/sequential_eval.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/gifs/sequential_eval.gif -------------------------------------------------------------------------------- /images/fpmc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/fpmc.png -------------------------------------------------------------------------------- /images/gru4rec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/gru4rec.png -------------------------------------------------------------------------------- /images/hgru4rec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/hgru4rec.png -------------------------------------------------------------------------------- /images/prod2vec.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/prod2vec.png -------------------------------------------------------------------------------- /images/running_notebooks_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/running_notebooks_1.png -------------------------------------------------------------------------------- /images/running_notebooks_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/running_notebooks_2.png -------------------------------------------------------------------------------- /images/running_notebooks_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/images/running_notebooks_3.png -------------------------------------------------------------------------------- /recommenders/FPMCRecommender.py: -------------------------------------------------------------------------------- 1 | from recommenders.ISeqRecommender import ISeqRecommender 2 | from util.fpmc.FPMC_numba import FPMC 3 | 4 | 5 | class FPMCRecommender(ISeqRecommender): 6 | """ 7 | Implementation of 8 | Rendle, S., Freudenthaler, C., & Schmidt-Thieme, L. (2010). Factorizing personalized Markov chains for next-basket recommendation. 9 | Proceedings of the 19th International Conference on World Wide Web - WWW ’10, 811 10 | 11 | Based on the implementation available at https://github.com/khesui/FPMC 12 | """ 13 | 14 | def __init__(self, n_factor=32, learn_rate=0.01, regular=0.001, n_epoch=15, n_neg=10): 15 | """ 16 | :param n_factor: (optional) the number of latent factors 17 | :param learn_rate: (optional) the learning rate 18 | :param regular: (optional) the L2 regularization coefficient 19 | :param n_epoch: (optional) the number of training epochs 20 | :param n_neg: (optional) the number of negative samples used in BPR learning 21 | """ 22 | super(FPMCRecommender, self).__init__() 23 | self.n_epoch = n_epoch 24 | self.n_neg = n_neg 25 | self.n_factor = n_factor 26 | self.learn_rate = learn_rate 27 | self.regular = regular 28 | 29 | def __str__(self): 30 | return 'FPMCRecommender(n_epoch={n_epoch}, ' \ 31 | 'n_neg={n_neg}, ' \ 32 | 'n_factor={n_factor}, ' \ 33 | 'learn_rate={learn_rate}, ' \ 34 | 'regular={regular})'.format(**self.__dict__) 35 | 36 | def fit(self, train_data): 37 | self._declare(train_data) 38 | 39 | train_data_supervised = [] 40 | 41 | for i, row in train_data.iterrows(): 42 | u = self.user_mapping[row['user_id']] 43 | 44 | seq = [] 45 | if len(row['sequence']) > 1: # cannot use sequences with length 1 for supervised learning 46 | for item in row['sequence']: 47 | i = self.item_mapping[item] 48 | seq.append(i) 49 | 50 | train_data_supervised.append((u, seq[len(seq) - 1], seq[:len(seq) - 1])) 51 | 52 | self.fpmc = FPMC(n_user=len(self.user_mapping), n_item=len(self.item_mapping), 53 | n_factor=self.n_factor, learn_rate=self.learn_rate, regular=self.regular) 54 | 55 | self.fpmc.user_set = set(self.user_mapping.values()) 56 | self.fpmc.item_set = set(self.item_mapping.values()) 57 | self.fpmc.init_model() 58 | 59 | 
self.fpmc.learnSBPR_FPMC(train_data_supervised, n_epoch=self.n_epoch, neg_batch_size=self.n_neg) 60 | 61 | def recommend(self, user_profile, user_id=None): 62 | context = [] 63 | for item in user_profile: 64 | context.append(self.item_mapping[item]) 65 | 66 | items, scores = self.fpmc.evaluation_recommender(self.user_mapping[user_id], context) 67 | recommendations = [] 68 | 69 | for i, it in enumerate(items): 70 | recommendations.append(([self.reverse_item_mapping[it]], scores[i])) 71 | return recommendations 72 | 73 | def _declare(self, data): 74 | self.user_mapping = {} 75 | self.item_mapping = {} 76 | self.reverse_item_mapping = {} 77 | 78 | user_counter = 0 79 | item_counter = 0 80 | for i, row in data.iterrows(): 81 | if row['user_id'] not in self.user_mapping: 82 | self.user_mapping[row['user_id']] = user_counter 83 | user_counter += 1 84 | 85 | for item in row['sequence']: 86 | if item not in self.item_mapping: 87 | self.item_mapping[item] = item_counter 88 | self.reverse_item_mapping[item_counter] = item 89 | item_counter += 1 90 | -------------------------------------------------------------------------------- /recommenders/FSMRecommender.py: -------------------------------------------------------------------------------- 1 | from pymining import seqmining 2 | 3 | from recommenders.ISeqRecommender import ISeqRecommender 4 | from util.SPMFinterface import callSPMF 5 | from util.tree.Tree import SmartTree 6 | 7 | 8 | class FSMRecommender(ISeqRecommender): 9 | """Frequent Sequence Mining recommender""" 10 | 11 | def __init__(self, minsup, minconf, max_context=1, min_context=1, spmf_path=None, db_path=None): 12 | """ 13 | 14 | :param minsup: the minimum support threshold. It is interpreted as relative count if in [0-1], 15 | otherwise as an absolute count. NOTE: Relative count required for training with SPFM (faster). 16 | :param minconf: the minimum confidence threshold. 17 | :param max_context: (optional) the maximum number of items in the user profile (starting from the last) that will be used 18 | for lookup in the database of frequent sequences. 19 | :param min_context: (optional) the minimum number of items in the user profile (starting from the last) that will be used 20 | for lookup in the database of frequent sequences. 21 | :param spmf_path: (optional) path to SPMF jar file. If provided, SPFM library will be used for pattern extraction (algorithm: Prefix Span). 22 | Otherwise, use pymining, which can be significantly slower depending on the sequence database size. 23 | :param db_path: (optional) path to the sequence database file 24 | """ 25 | 26 | super(FSMRecommender, self).__init__() 27 | self.minsup = minsup 28 | self.minconf = minconf 29 | self.max_context = max_context 30 | self.min_context = min_context 31 | self.recommendation_length = 1 32 | self.db_path = db_path 33 | self.spmf_path = spmf_path 34 | self.spmf_algorithm = "PrefixSpan" 35 | self.output_path = "tmp/tmp_output.txt" 36 | 37 | def __str__(self): 38 | return 'FreqSeqMiningRecommender: ' \ 39 | 'minsup={minsup}, ' \ 40 | 'minconf={minconf}, ' \ 41 | 'max_context={max_context}, ' \ 42 | 'min_context={min_context}, ' \ 43 | 'spmf_path={spmf_path}, ' \ 44 | 'db_path={db_path}'.format(**self.__dict__) 45 | 46 | def fit(self, train_data=None): 47 | """ 48 | Fit the model 49 | :param train_data: (optional) DataFrame with the training sequences, which must be assigned to column "sequence". 50 | If None, run FSM using SPFM over the sequence database stored in `self.db_path`. 
51 | Otherwise, run FSM using `pymining.seqmining` (slower). 52 | """ 53 | 54 | if train_data is None: 55 | if self.spmf_path is None or self.db_path is None: 56 | raise ValueError("You should set db_path and spfm_path before calling fit() without arguments.") 57 | 58 | self.logger.info('Using SPFM (Java) for Frequent Sequence Mining') 59 | if 0 <= self.minsup <= 1: 60 | percentage_min_sup = self.minsup * 100 61 | else: 62 | raise NameError("SPMF only accepts 0<=minsup<=1") 63 | 64 | # call spmf 65 | command = ' '.join([self.spmf_algorithm, self.db_path, self.output_path, str(percentage_min_sup) + '%']) 66 | callSPMF(self.spmf_path, command) 67 | 68 | # parse back output from text file 69 | self._parse_spfm_output() 70 | else: 71 | # use pymining 72 | self.logger.info('Using pymining.seqmining (python) for Frequent Sequence Mining') 73 | sequences = train_data['sequence'].values 74 | msup = int(self.minsup * len(sequences)) if 0 <= self.minsup <= 1 else self.minsup 75 | self.logger.info('Mining frequent sequences (minsup={})'.format(msup)) 76 | self.freq_seqs = seqmining.freq_seq_enum(sequences, msup) 77 | 78 | self.logger.info('{} frequent sequences found'.format(len(self.freq_seqs))) 79 | self.logger.info('Building the prefix tree') 80 | self.tree = SmartTree() 81 | self.root_node = self.tree.set_root() 82 | for pattern, support in self.freq_seqs: 83 | if len(pattern) == 1: 84 | # add node to root 85 | self.tree.create_node(pattern[0], parent=self.root_node, data={"support": support}) 86 | elif len(pattern) > 1: 87 | # add entire path starting from root 88 | self.tree.add_path(self.root_node, pattern, support) 89 | else: 90 | raise ValueError('Frequent sequence of length 0') 91 | self.logger.info('Training completed') 92 | 93 | def recommend(self, user_profile, user_id=None): 94 | n = len(user_profile) 95 | c = min(n, self.max_context) 96 | match = [] 97 | # iterate over decreasing context lengths until a match with sufficient confidence is found 98 | while not match and c >= self.min_context: 99 | q = user_profile[n - c:n] 100 | match = self._find_match(q, self.recommendation_length) 101 | c -= 1 102 | return match 103 | 104 | def _find_match(self, context, recommendation_length): 105 | # search context 106 | lastNode = self.tree.find_path(self.root_node, context) 107 | 108 | if lastNode == -1: 109 | return [] 110 | else: # context matched 111 | context_support = self.tree[lastNode].data['support'] 112 | children = self.tree[lastNode].fpointer 113 | 114 | if not children: 115 | return [] 116 | 117 | # find all path of length recommendation_length from match 118 | paths = self.tree.find_n_length_paths(lastNode, recommendation_length) 119 | return sorted(self._filter_confidence(context_support, paths), key=lambda x: x[1], reverse=True) 120 | 121 | def _filter_confidence(self, context_support, path_list): 122 | goodPaths = [] 123 | for p in path_list: 124 | confidence = self.tree[p[len(p) - 1]].data['support'] / float(context_support) 125 | if confidence >= self.minconf: 126 | goodPaths.append((self.tree.get_nodes_tag(p), confidence)) 127 | return goodPaths 128 | 129 | def _set_tree_debug_only(self, tree): 130 | self.tree = tree 131 | self.root_node = tree.get_root() 132 | 133 | def get_freq_seqs(self): 134 | return self.freq_seqs 135 | 136 | def get_sequence_tree(self): 137 | return self.tree 138 | 139 | def show_tree(self): 140 | self.tree.show() 141 | 142 | def get_confidence_list(self, recommendation): 143 | return list(map(lambda x: x[1], recommendation)) 144 | 145 | def 
_parse_spfm_output(self): 146 | with open(self.output_path, 'r') as fin: 147 | self.freq_seqs = [] 148 | for line in fin: 149 | pieces = line.split('#SUP: ') 150 | support = pieces[1].strip() 151 | items = pieces[0].split(' ') 152 | seq = tuple(x for x in items if x != '' and x != '-1') 153 | seq_and_support = (seq, int(support)) 154 | self.freq_seqs.append(seq_and_support) 155 | -------------------------------------------------------------------------------- /recommenders/ISeqRecommender.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class ISeqRecommender(object): 5 | """Abstract Recommender class""" 6 | 7 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 8 | logger = logging.getLogger() 9 | 10 | def __init__(self): 11 | super(ISeqRecommender, self).__init__() 12 | 13 | def fit(self, train_data): 14 | pass 15 | 16 | def recommend(self, user_profile, user_id=None): 17 | """ 18 | Given the user profile return a list of recommendation 19 | :param user_profile: the user profile as a list of item identifiers 20 | :param user_id: (optional) the user id 21 | :return: list of recommendations e.g. [([2], 0.875), ([6], 1.0)] 22 | """ 23 | pass 24 | 25 | @staticmethod 26 | def get_recommendation_list(recommendation): 27 | return list(map(lambda x: x[0], recommendation)) 28 | 29 | @staticmethod 30 | def get_recommendation_confidence_list(recommendation): 31 | return list(map(lambda x: x[1], recommendation)) 32 | 33 | def activate_debug_print(self): 34 | self.logger.setLevel(logging.DEBUG) 35 | 36 | def deactivate_debug_print(self): 37 | self.logger.setLevel(logging.INFO) 38 | -------------------------------------------------------------------------------- /recommenders/KNNRecommender.py: -------------------------------------------------------------------------------- 1 | from recommenders.ISeqRecommender import ISeqRecommender 2 | from util.data_utils import dataset_to_gru4rec_format 3 | from util.knn.iknn import ItemKNN 4 | from util.knn.sknn import SessionKNN 5 | from util.knn.vmsknn import VMSessionKNN 6 | from util.knn.ssknn import SeqSessionKNN 7 | from util.knn.sfsknn import SeqFilterSessionKNN 8 | 9 | 10 | class KNNRecommender(ISeqRecommender): 11 | """ 12 | Interface to ItemKNN and Session-based KNN methods. Based on: 13 | 14 | Evaluation of Session-based Recommendation Algorithms, Malte Ludewig and Dietmar Jannach 15 | """ 16 | knn_models = { 17 | 'iknn': ItemKNN, 18 | 'sknn': SessionKNN, 19 | 'v-sknn': VMSessionKNN, 20 | 's-sknn': SeqSessionKNN, 21 | 'sf-sknn': SeqFilterSessionKNN 22 | } 23 | 24 | def __init__(self, 25 | model='cknn', 26 | **init_args): 27 | """ 28 | :param model: One among the following KNN models: 29 | - iknn: ItemKNN, item-to-item KNN based on the *last* item in the session to determine the items to be recommended. 30 | - sknn: SessionKNN, compares the *entire* current session with the past sessions in the training data to 31 | determine the items to be recommended. 32 | - v-sknn: VMSessionKNN, use linearly decayed real-valued vectors to encode the current session, 33 | then compares the current session with the past sessions in the training data using the dot-product 34 | to determine the items to be recommended. 35 | - s-sknn: SeqSessionKNN, this variant also puts more weight on elements that appear later in the session by 36 | using a custom scoring function (see the paper by Ludewng and Jannach). 
37 | - sf-sknn: SeqFilterSessionKNN, this variant also puts more weight on elements that appear later in the session 38 | in a more restrictive way by using a custom scoring function (see the paper by Ludewng and Jannach). 39 | 40 | :param init_args: The model initialization arguments. See the following initializations or 41 | check `util.knn` for more details on each model: 42 | - iknn: ItemKNN(n_sims=100, lmbd=20, alpha=0.5) 43 | - sknn: SessionKNN(k, sample_size=500, sampling='recent', similarity='jaccard', remind=False, pop_boost=0) 44 | - v-sknn: VMSessionKNN(k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', 45 | dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score', 46 | weighting_time=False, normalize=True) 47 | - s-knn: SeqSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div', 48 | remind=False, pop_boost=0, extend=False, normalize=True) 49 | - sf-sknn: SeqFilterSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0, 50 | extend=False, normalize=True) 51 | """ 52 | super(KNNRecommender).__init__() 53 | if model not in self.knn_models: 54 | raise ValueError("Unknown KNN model '{}'. The available ones are: {}".format( 55 | model, list(self.knn_models.keys()) 56 | )) 57 | self.init_args = init_args 58 | self.init_args.update(dict(session_key='session_id', 59 | item_key='item_id', 60 | time_key='ts')) 61 | self.model = self.knn_models[model](**self.init_args) 62 | self.pseudo_session_id = 0 63 | 64 | def __str__(self): 65 | return str(self.model) 66 | 67 | def fit(self, train_data): 68 | self.logger.info('Converting training data to GRU4Rec format') 69 | # parse training data to GRU4Rec format 70 | train_data = dataset_to_gru4rec_format(dataset=train_data) 71 | 72 | self.logger.info('Training started') 73 | self.model.fit(train_data) 74 | self.logger.info('Training completed') 75 | self.pseudo_session_id = 0 76 | 77 | def recommend(self, user_profile, user_id=None): 78 | for item in user_profile: 79 | pred = self.model.predict_next(session_id=self.pseudo_session_id, 80 | input_item_id=item) 81 | # sort items by predicted score 82 | pred.sort_values(0, ascending=False, inplace=True) 83 | # increase the psuedo-session id so that future call to recommend() won't be connected 84 | self.pseudo_session_id += 1 85 | # convert to the required output format 86 | return [([x.index], x._2) for x in pred.reset_index().itertuples()] 87 | -------------------------------------------------------------------------------- /recommenders/MarkovChainRecommender.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import logging 3 | 4 | from recommenders.ISeqRecommender import ISeqRecommender 5 | from util.markov.Markov import add_nodes_to_graph, add_edges, apply_skipping, apply_clustering 6 | 7 | 8 | class MarkovChainRecommender(ISeqRecommender): 9 | """ 10 | Implementation from Shani, Guy, David Heckerman, and Ronen I. Brafman. "An MDP-based recommender system." 11 | Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. 
Chapter 3-4 12 | """ 13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 14 | 15 | def __init__(self, order): 16 | """ 17 | :param order: the order of the Markov Chain 18 | """ 19 | super(MarkovChainRecommender, self).__init__() 20 | self.order = order 21 | 22 | def fit(self, train_data): 23 | sequences = train_data['sequence'].values 24 | 25 | logging.info('Building Markov Chain model with k = ' + str(self.order)) 26 | logging.info('Adding nodes') 27 | self.tree, self.count_dict, self.G = add_nodes_to_graph(sequences, self.order) 28 | logging.info('Adding edges') 29 | self.G = add_edges(self.tree, self.count_dict, self.G, self.order) 30 | logging.info('Applying skipping') 31 | self.G = apply_skipping(self.G, self.order, sequences) 32 | logging.info('Applying clustering') 33 | logging.info('{} states in the graph'.format(len(self.G.nodes()))) 34 | self.G, _, _ = apply_clustering(self.G) 35 | # drop not useful resources 36 | self.tree = None 37 | self.count_dict = None 38 | gc.collect() 39 | 40 | def recommend(self, user_profile, user_id=None): 41 | 42 | # if the user profile is longer than the markov order, chop it keeping recent history 43 | state = tuple(user_profile[-self.order:]) 44 | # see if graph has that state 45 | recommendations = [] 46 | if self.G.has_node(state): 47 | # search for recommendations in the forward star 48 | rec_dict = {} 49 | for u, v in self.G.out_edges_iter([state]): 50 | lastElement = tuple(v[-1:]) 51 | if lastElement in rec_dict: 52 | rec_dict[lastElement] += self.G[u][v]['count'] 53 | else: 54 | rec_dict[lastElement] = self.G[u][v]['count'] 55 | for k, v in rec_dict.items(): 56 | recommendations.append((list(k), v)) 57 | 58 | return recommendations 59 | 60 | def _set_graph_debug(self, G): 61 | self.G = G 62 | -------------------------------------------------------------------------------- /recommenders/MixedMarkovRecommender.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from recommenders.ISeqRecommender import ISeqRecommender 4 | from recommenders.MarkovChainRecommender import MarkovChainRecommender 5 | 6 | 7 | class MixedMarkovChainRecommender(ISeqRecommender): 8 | """ 9 | Creates markov models with different values of k, and return recommendation by weighting the list of 10 | recommendation of each model. 11 | 12 | Reference: Shani, Guy, David Heckerman, and Ronen I. Brafman. "An MDP-based recommender system." 13 | Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. 
Chapter 3-4 14 | """ 15 | 16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 17 | 18 | recommenders = {} 19 | 20 | def __init__(self, min_order=1, max_order=1): 21 | """ 22 | :param min_order: the minimum order of the Mixed Markov Chain 23 | :param max_order: the maximum order of the Mixed Markov Chain 24 | """ 25 | super(MixedMarkovChainRecommender, self).__init__() 26 | self.min_order = min_order 27 | self.max_order = max_order 28 | # define the models 29 | for i in range(self.min_order, self.max_order + 1): 30 | self.recommenders[i] = MarkovChainRecommender(i) 31 | 32 | def fit(self, user_profile): 33 | for order in self.recommenders: 34 | self.recommenders[order].fit(user_profile) 35 | 36 | def recommend(self, user_profile, user_id=None): 37 | rec_dict = {} 38 | recommendations = [] 39 | sum_of_weights = 0 40 | for order, r in self.recommenders.items(): 41 | rec_list = r.recommend(user_profile) 42 | sum_of_weights += 1 / order 43 | for i in rec_list: 44 | if tuple(i[0]) in rec_dict: 45 | rec_dict[tuple(i[0])] += 1 / order * i[1] 46 | else: 47 | rec_dict[tuple(i[0])] = 1 / order * i[1] 48 | for k, v in rec_dict.items(): 49 | recommendations.append((list(k), v / sum_of_weights)) 50 | 51 | return recommendations 52 | 53 | def _set_model_debug(self, recommender, order): 54 | self.recommenders[order] = recommender 55 | -------------------------------------------------------------------------------- /recommenders/PopularityRecommender.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | from recommenders.ISeqRecommender import ISeqRecommender 4 | 5 | 6 | class PopularityRecommender(ISeqRecommender): 7 | 8 | def __init__(self): 9 | super(PopularityRecommender, self).__init__() 10 | 11 | def fit(self, train_data): 12 | sequences = train_data['sequence'].values 13 | 14 | count_dict = {} 15 | for s in sequences: 16 | for item in s: 17 | if item not in count_dict: 18 | count_dict[item] = 1 19 | else: 20 | count_dict[item] += 1 21 | 22 | self.top = sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True) 23 | self.top = [([x[0]], x[1]) for x in self.top] 24 | 25 | def recommend(self, user_profile, user_id=None): 26 | return self.top 27 | 28 | def get_popular_list(self): 29 | return self.top 30 | -------------------------------------------------------------------------------- /recommenders/Prod2VecRecommender.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import gensim 4 | 5 | from recommenders.ISeqRecommender import ISeqRecommender 6 | 7 | 8 | class Prod2VecRecommender(ISeqRecommender): 9 | """ 10 | Implementation of the Prod2Vec skipgram model from 11 | Grbovic Mihajlo, Vladan Radosavljevic, Nemanja Djuric, Narayan Bhamidipati, Jaikit Savla, Varun Bhagwan, and Doug Sharp. 12 | "E-commerce in your inbox: Product recommendations at scale." 13 | In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 14 | pp. 1809-1818. ACM, 2015. 15 | """ 16 | 17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 18 | 19 | def __init__(self, min_count=2, size=100, window=5, decay_alpha=0.9, workers=4): 20 | """ 21 | :param min_count: (optional) the minimum item frequency. 
Items less frequent that min_count will be pruned 22 | :param size: (optional) the size of the embeddings 23 | :param window: (optional) the size of the context window 24 | :param decay_alpha: (optional) the exponential decay factor used to discount the similarity scores for items 25 | back in the user profile. Lower values mean higher discounting of past user interactions. Allows values in [0-1]. 26 | :param workers: (optional) the number of threads used for training 27 | """ 28 | super(Prod2VecRecommender, self).__init__() 29 | self.min_count = min_count 30 | self.size = size 31 | self.window = window 32 | self.decay_alpha = decay_alpha 33 | self.workers = workers 34 | 35 | def __str__(self): 36 | return 'Prod2VecRecommender(min_count={min_count}, ' \ 37 | 'size={size}, ' \ 38 | 'window={window}, ' \ 39 | 'decay_alpha={decay_alpha}, ' \ 40 | 'workers={workers})'.format(**self.__dict__) 41 | 42 | def fit(self, train_data): 43 | sequences = train_data['sequence'].values 44 | self.model = gensim.models.Word2Vec(sequences, 45 | min_count=self.min_count, 46 | window=self.window, 47 | hs=1, 48 | size=self.size, 49 | sg=1, 50 | workers=self.workers) 51 | 52 | def recommend(self, user_profile, user_id=None): 53 | user_profile = list(map(str, user_profile)) 54 | rec = [] 55 | try: 56 | # iterate the user profile backwards 57 | for i, item in enumerate(user_profile[::-1]): 58 | ms = self.model.most_similar(positive=item) 59 | # apply exponential decay to the similarity scores 60 | decay = self.decay_alpha ** i 61 | ms = [(x[0], decay * x[1]) for x in ms] 62 | rec.extend(ms) 63 | # sort items by similarity score 64 | rec = sorted(rec, key=lambda x: -x[1]) 65 | except KeyError: 66 | rec = [] 67 | return [([x[0]], x[1]) for x in rec] 68 | -------------------------------------------------------------------------------- /recommenders/RNNRecommender.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from recommenders.ISeqRecommender import ISeqRecommender 4 | from util.data_utils import dataset_to_gru4rec_format 5 | from util.rnn.gru4rec import GRU4Rec 6 | from util.rnn.hgru4rec import HGRU4Rec 7 | 8 | 9 | class RNNRecommender(ISeqRecommender): 10 | """ 11 | A **simplified** interface to Recurrent Neural Network models for Session-based recommendation. 12 | Based on the following two papers: 13 | 14 | * Recurrent Neural Networks with Top-k Gains for Session-based Recommendations, Hidasi and Karatzoglou, CIKM 2018 15 | * Personalizing Session-based Recommendation with Hierarchical Recurrent Neural Networks, Quadrana et al, Recsys 2017 16 | 17 | """ 18 | 19 | def __init__(self, 20 | session_layers, 21 | user_layers=None, 22 | batch_size=32, 23 | learning_rate=0.1, 24 | momentum=0.0, 25 | dropout=None, 26 | epochs=10, 27 | personalized=False): 28 | """ 29 | :param session_layers: number of units per layer used at session level. 30 | It has to be a list of integers for multi-layer networks, or a integer value for single-layer networks. 31 | :param user_layers: number of units per layer used at user level. Required only by personalized models. 32 | It has to be a list of integers for multi-layer networks, or a integer value for single-layer networks. 33 | :param batch_size: the mini-batch size used in training 34 | :param learning_rate: the learning rate used in training (Adagrad optimized) 35 | :param momentum: the momentum coefficient used in training 36 | :param dropout: dropout coefficients. 
37 | If personalized=False, it's a float value for the hidden-layer(s) dropout. 38 | If personalized=True, it's a 3-tuple with the values for the dropout of (user hidden, session hidden, user-to-session hidden) layers. 39 | :param epochs: number of training epochs 40 | :param personalized: whether to train a personalized model using the HRNN model. 41 | It will require user ids at prediction time. 42 | """ 43 | super(RNNRecommender).__init__() 44 | if isinstance(session_layers, int): 45 | session_layers = [session_layers] 46 | if isinstance(user_layers, int): 47 | user_layers = [user_layers] 48 | self.session_layers = session_layers 49 | self.user_layers = user_layers 50 | self.batch_size = batch_size 51 | self.learning_rate = learning_rate 52 | self.momentum = momentum 53 | if dropout is None: 54 | if not personalized: 55 | dropout = 0.0 56 | else: 57 | dropout = (0.0, 0.0, 0.0) 58 | self.dropout = dropout 59 | self.epochs = epochs 60 | self.personalized = personalized 61 | self.pseudo_session_id = 0 62 | 63 | def __str__(self): 64 | return 'RNNRecommender(' \ 65 | 'session_layers={session_layers}, ' \ 66 | 'user_layers={user_layers}, ' \ 67 | 'batch_size={batch_size}, ' \ 68 | 'learning_rate={learning_rate}, ' \ 69 | 'momentum={momentum}, ' \ 70 | 'dropout={dropout}, ' \ 71 | 'epochs={epochs}, ' \ 72 | 'personalized={personalized}, ' \ 73 | ')'.format(**self.__dict__) 74 | 75 | def fit(self, train_data): 76 | self.logger.info('Converting training data to GRU4Rec format') 77 | # parse training data to GRU4Rec format 78 | train_data = dataset_to_gru4rec_format(dataset=train_data) 79 | 80 | if not self.personalized: 81 | # fit GRU4Rec 82 | self.model = GRU4Rec(layers=self.session_layers, 83 | n_epochs=self.epochs, 84 | batch_size=self.batch_size, 85 | learning_rate=self.learning_rate, 86 | momentum=self.momentum, 87 | dropout_p_hidden=self.dropout, 88 | session_key='session_id', 89 | item_key='item_id', 90 | time_key='ts') 91 | else: 92 | if self.user_layers is None: 93 | raise ValueError('You should set the value of user_layers before training the personalized model.') 94 | 95 | if len(self.dropout) != 3: 96 | raise ValueError('dropout should be a 3 tuple with ' 97 | '(user hidden, session hidden, user-to-session hidden) dropout values.') 98 | 99 | self.model = HGRU4Rec(session_layers=self.session_layers, 100 | user_layers=self.user_layers, 101 | batch_size=self.batch_size, 102 | n_epochs=self.epochs, 103 | learning_rate=self.learning_rate, 104 | momentum=self.momentum, 105 | dropout_p_hidden_usr=self.dropout[0], 106 | dropout_p_hidden_ses=self.dropout[1], 107 | dropout_p_init=self.dropout[2], 108 | session_key='session_id', 109 | user_key='user_id', 110 | item_key='item_id', 111 | time_key='ts') 112 | self.logger.info('Training started') 113 | self.model.fit(train_data) 114 | self.logger.info('Training completed') 115 | 116 | def recommend(self, user_profile, user_id=None): 117 | if not self.personalized: 118 | for item in user_profile: 119 | pred = self.model.predict_next_batch(np.array([self.pseudo_session_id]), 120 | np.array([item]), 121 | batch=1) 122 | else: 123 | if user_id is None: 124 | raise ValueError('user_id required by personalized models') 125 | for item in user_profile: 126 | pred = self.model.predict_next_batch(np.array([self.pseudo_session_id]), 127 | np.array([item]), 128 | np.array([user_id]), 129 | batch=1) 130 | # sort items by predicted score 131 | pred.sort_values(0, ascending=False, inplace=True) 132 | # increase the psuedo-session id so that future call to 
recommend() won't be connected 133 | self.pseudo_session_id += 1 134 | # convert to the required output format 135 | return [([x.index], x._2) for x in pred.reset_index().itertuples()] 136 | -------------------------------------------------------------------------------- /recommenders/SupervisedRecommender.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import clone 2 | from sklearn.tree import DecisionTreeClassifier 3 | from tqdm import tqdm 4 | 5 | from recommenders.ISeqRecommender import ISeqRecommender 6 | from util.data_expansion import data_expansion, user_profile_expansion 7 | from util.split import balance_dataset 8 | 9 | 10 | class SupervisedRecommender(ISeqRecommender): 11 | """ 12 | Adapted from Zimdars, Andrew, David Maxwell Chickering, and Christopher Meek. 13 | "Using temporal data for making recommendations." In Proceedings of the Seventeenth conference 14 | on Uncertainty in artificial intelligence, pp. 580-588. Morgan Kaufmann Publishers Inc., 2001. 15 | """ 16 | 17 | def __init__(self, history_length, classifier=DecisionTreeClassifier(), balance=True): 18 | """ 19 | :param history_length: how many recent items to consider 20 | :param classifier: an instance of sklearn classifier (e.g. DecisionTreeClassifier, LogisticRegression) 21 | :param balance : whether to balance or not the training data for each item 22 | :return: 23 | """ 24 | 25 | super(SupervisedRecommender, self).__init__() 26 | self.classifier = classifier 27 | self.history_length = history_length 28 | self.balance = balance 29 | 30 | def fit(self, train_data): 31 | sequences = train_data['sequence'].values 32 | 33 | data, self.mapping = data_expansion(sequences, self.history_length) 34 | self.item_classifier = {} 35 | # for each column i.e. 
item, build a classifier 36 | with tqdm(total=len(self.mapping)) as pbar: 37 | for key, value in self.mapping.items(): 38 | train, test = self._split_train_test(data, value, len(self.mapping)) 39 | if self.balance: 40 | train, test = balance_dataset(train, test) 41 | self.item_classifier[key] = self.classifier.fit(train, test.toarray().ravel()) 42 | # reset classifier 43 | self.classifier = clone(self.classifier) 44 | pbar.update(1) 45 | 46 | def recommend(self, user_profile, user_id=None): 47 | # print('recommending') 48 | data = user_profile_expansion(user_profile, self.history_length, self.mapping) 49 | recommendations = [] 50 | for item, c in self.item_classifier.items(): 51 | if c.predict(data) == [1]: 52 | recommendations.append(item) 53 | return [([x], 1 / len(recommendations)) for x in recommendations] 54 | 55 | def _split_train_test(self, data, col_index, n_unique_items): 56 | test = data[:, col_index] 57 | train = data[:, [x for x in range(data.shape[1]) if x >= n_unique_items]] 58 | return train, test 59 | 60 | def set_classifier(self, classifier): 61 | self.classifier = classifier 62 | -------------------------------------------------------------------------------- /recommenders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/recommenders/__init__.py -------------------------------------------------------------------------------- /slides/TheWebConf2019_01_Introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/slides/TheWebConf2019_01_Introduction.pdf -------------------------------------------------------------------------------- /slides/TheWebConf2019_02_Algorithms.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/slides/TheWebConf2019_02_Algorithms.pdf -------------------------------------------------------------------------------- /slides/TheWebConf2019_03_Evaluation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/slides/TheWebConf2019_03_Evaluation.pdf -------------------------------------------------------------------------------- /spmf/spmf.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/spmf/spmf.jar -------------------------------------------------------------------------------- /util/SPMFinterface.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from datetime import datetime as dt 3 | 4 | 5 | def callSPMF(spmfPath, command): 6 | # java -jar spmf.jar run PrefixSpan contextPrefixSpan.txt output.txt 50% 7 | comm = ' '.join(['java -jar', spmfPath, 'run', command]) 8 | print(comm) 9 | p = subprocess.Popen(comm, 10 | stdout=subprocess.PIPE, 11 | stderr=subprocess.STDOUT, 12 | shell=True) 13 | p.communicate() # wait for completion 14 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/__init__.py -------------------------------------------------------------------------------- /util/data_expansion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csc_matrix 3 | 4 | 5 | def data_expansion(sequences, history_length): 6 | # sequences = [[1,2,3,4],[9,7,4],[3,2,1],[0,4,3,2]] 7 | # history_length = 3 8 | 9 | # store unique elements 10 | # mapping items to incremental integers 11 | 12 | count = 0 13 | items_mapping = {} 14 | for s in sequences: 15 | for i in s: 16 | if i in items_mapping: continue 17 | items_mapping[i] = count 18 | count += 1 19 | 20 | number_of_unique_items = len(items_mapping) 21 | 22 | row = 0 23 | row_indeces = [] 24 | col_indeces = [] 25 | # for each sequence 26 | for s in sequences: 27 | # for each item in the sequence 28 | cached = [] 29 | for i, item in enumerate(s): 30 | index = items_mapping[item] 31 | 32 | # in each row there will be: the taget,the cache 33 | row_indeces += [row] * (1 + len(cached)) 34 | 35 | # add data target 36 | col_indeces.append(index) 37 | 38 | # add history 39 | for l in range(1, history_length + 1): 40 | if i < l: continue # no history available that far 41 | row_indeces.append(row) 42 | l_th_previous_item = s[i - l] 43 | previous_el_index = items_mapping[l_th_previous_item] 44 | col_indeces.append(previous_el_index + number_of_unique_items * l) 45 | 46 | # add cache 47 | col_indeces += cached 48 | cached.append(index + number_of_unique_items * (history_length + 1)) 49 | assert len(row_indeces) == len(col_indeces) 50 | 51 | row += 1 52 | 53 | return csc_matrix((np.ones(len(row_indeces), dtype=np.int8), (row_indeces, col_indeces)), 54 | shape=(row, (history_length + 2) * len(items_mapping))), items_mapping 55 | 56 | 57 | def user_profile_expansion(user_profile, history_length, items_mapping): 58 | number_of_unique_items = len(items_mapping) 59 | 60 | row_indeces = [] 61 | col_indeces = [] 62 | 63 | # for each item in the sequence 64 | cached = [items_mapping[x] + number_of_unique_items * (history_length) for x in user_profile] 65 | last = user_profile[len(user_profile) - 1] 66 | index = items_mapping[last] 67 | 68 | # in each row there will be:the cache 69 | row_indeces += [0] * (len(cached)) 70 | 71 | # add history 72 | for l in range(1, history_length + 1): 73 | if len(user_profile) < l: continue # no history available that far 74 | row_indeces.append(0) 75 | l_th_previous_item = user_profile[len(user_profile) - l] 76 | previous_el_index = items_mapping[l_th_previous_item] 77 | col_indeces.append(previous_el_index + number_of_unique_items * (l - 1)) 78 | 79 | # add cache 80 | col_indeces += cached 81 | 82 | assert len(row_indeces) == len(col_indeces) 83 | 84 | return csc_matrix((np.ones(len(row_indeces)), (row_indeces, col_indeces)), 85 | shape=(1, (history_length + 1) * len(items_mapping))) 86 | -------------------------------------------------------------------------------- /util/data_utils.py: -------------------------------------------------------------------------------- 1 | import calendar 2 | import datetime 3 | import os 4 | import time 5 | from collections import Counter 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | def create_seq_db_filter_top_k(path, topk=0, last_months=0): 12 | file = load_and_adapt(path, last_months=last_months) 13 | 14 | c = Counter(list(file['item_id'])) 15 | 16 | if topk > 1: 17 | keeper = 
set([x[0] for x in c.most_common(topk)]) 18 | file = file[file['item_id'].isin(keeper)] 19 | 20 | # group by session id and concat song_id 21 | groups = file.groupby('session_id') 22 | 23 | # convert item ids to string, then aggregate them to lists 24 | aggregated = groups['item_id'].agg({'sequence': lambda x: list(map(str, x))}) 25 | init_ts = groups['ts'].min() 26 | users = groups['user_id'].min() # it's just fast, min doesn't actually make sense 27 | 28 | result = aggregated.join(init_ts).join(users) 29 | result.reset_index(inplace=True) 30 | return result 31 | 32 | 33 | def dataset_to_gru4rec_format(dataset): 34 | """ 35 | Convert a list of sequences to GRU4Rec format. 36 | Based on this StackOverflow answer: https://stackoverflow.com/a/48532692 37 | 38 | :param dataset: the dataset to be transformed 39 | """ 40 | 41 | lst_col = 'sequence' 42 | df = dataset.reset_index() 43 | unstacked = pd.DataFrame({ 44 | col: np.repeat(df[col].values, df[lst_col].str.len()) for col in df.columns.drop(lst_col)} 45 | ).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns] 46 | # ensure that events in the session have increasing timestamps 47 | unstacked['ts'] = unstacked['ts'] + unstacked.groupby('user_id').cumcount() 48 | unstacked.rename(columns={'sequence': 'item_id'}, inplace=True) 49 | return unstacked 50 | 51 | 52 | def sequences_to_spfm_format(sequences, tmp_path='tmp/sequences.txt'): 53 | """ 54 | Convert a list of sequences to SPFM format and write them to `tmp_path` 55 | :param sequences: the list of sequences 56 | :param tmp_path: the path where sequences will be written in the SPFM format 57 | """ 58 | basedir = os.path.split(tmp_path)[0] 59 | os.makedirs(basedir, exist_ok=True) 60 | with open(tmp_path, 'w') as fout: 61 | for s in sequences: 62 | fout.write(' -1 '.join(map(str, s))) 63 | fout.write(' -2\n') 64 | 65 | 66 | def load_and_adapt(path, last_months=0): 67 | file_ext = os.path.splitext(path)[-1] 68 | if file_ext == '.csv': 69 | data = pd.read_csv(path, header=0) 70 | elif file_ext == '.hdf': 71 | data = pd.read_hdf(path) 72 | else: 73 | raise ValueError('Unsupported file {} having extension {}'.format(path, file_ext)) 74 | 75 | col_names = ['session_id', 'user_id', 'item_id', 'ts'] + data.columns.values.tolist()[4:] 76 | data.columns = col_names 77 | 78 | if last_months > 0: 79 | def add_months(sourcedate, months): 80 | month = sourcedate.month - 1 + months 81 | year = int(sourcedate.year + month / 12) 82 | month = month % 12 + 1 83 | day = min(sourcedate.day, calendar.monthrange(year, month)[1]) 84 | return datetime.date(year, month, day) 85 | 86 | lastdate = datetime.datetime.fromtimestamp(data.ts.max()) 87 | firstdate = add_months(lastdate, -last_months) 88 | initial_unix = time.mktime(firstdate.timetuple()) 89 | 90 | # filter out older interactions 91 | data = data[data['ts'] >= initial_unix] 92 | 93 | return data 94 | -------------------------------------------------------------------------------- /util/evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | 5 | def sequential_evaluation(recommender, 6 | test_sequences, 7 | evaluation_functions, 8 | users=None, 9 | given_k=1, 10 | look_ahead=1, 11 | top_n=10, 12 | scroll=True, 13 | step=1): 14 | """ 15 | Runs sequential evaluation of a recommender over a set of test sequences 16 | :param recommender: the instance of the recommender to test 17 | :param test_sequences: the set of test sequences 18 | :param 
evaluation_functions: list of evaluation metric functions 19 | :param users: (optional) the list of user ids associated to each test sequence. Required by personalized models like FPMC. 20 | :param given_k: (optional) the initial size of each user profile, starting from the first interaction in the sequence. 21 | If <0, start counting from the end of the sequence. It must be != 0. 22 | :param look_ahead: (optional) number of subsequent interactions in the sequence to be considered as ground truth. 23 | It can be any positive number or 'all' to extend the ground truth until the end of the sequence. 24 | :param top_n: (optional) size of the recommendation list 25 | :param scroll: (optional) whether to scroll the ground truth until the end of the sequence. 26 | If True, expand the user profile and move the ground truth forward of `step` interactions. Recompute and evaluate recommendations every time. 27 | If False, evaluate recommendations once per sequence without expanding the user profile. 28 | :param step: (optional) number of interactions that will be added to the user profile at each step of the sequential evaluation. 29 | :return: the list of the average values for each evaluation metric 30 | """ 31 | if given_k == 0: 32 | raise ValueError('given_k must be != 0') 33 | 34 | metrics = np.zeros(len(evaluation_functions)) 35 | with tqdm(total=len(test_sequences)) as pbar: 36 | for i, test_seq in enumerate(test_sequences): 37 | if users is not None: 38 | user = users[i] 39 | else: 40 | user = None 41 | if scroll: 42 | metrics += sequence_sequential_evaluation(recommender, 43 | test_seq, 44 | evaluation_functions, 45 | user, 46 | given_k, 47 | look_ahead, 48 | top_n, 49 | step) 50 | else: 51 | metrics += evaluate_sequence(recommender, 52 | test_seq, 53 | evaluation_functions, 54 | user, 55 | given_k, 56 | look_ahead, 57 | top_n) 58 | pbar.update(1) 59 | return metrics / len(test_sequences) 60 | 61 | 62 | def evaluate_sequence(recommender, seq, evaluation_functions, user, given_k, look_ahead, top_n): 63 | """ 64 | :param recommender: which recommender to use 65 | :param seq: the user_profile/ context 66 | :param given_k: last element used as ground truth. NB if <0 it is interpreted as first elements to keep 67 | :param evaluation_functions: which function to use to evaluate the rec performance 68 | :param look_ahead: number of elements in ground truth to consider. 
if look_ahead = 'all' then all the ground_truth sequence is considered 69 | :return: performance of recommender 70 | """ 71 | # safety checks 72 | if given_k < 0: 73 | given_k = len(seq) + given_k 74 | 75 | user_profile = seq[:given_k] 76 | ground_truth = seq[given_k:] 77 | 78 | # restrict ground truth to look_ahead 79 | ground_truth = ground_truth[:look_ahead] if look_ahead != 'all' else ground_truth 80 | ground_truth = list(map(lambda x: [x], ground_truth)) # list of list format 81 | 82 | if not user_profile or not ground_truth: 83 | # if any of the two missing all evaluation functions are 0 84 | return np.zeros(len(evaluation_functions)) 85 | 86 | r = recommender.recommend(user_profile, user)[:top_n] 87 | 88 | if not r: 89 | # no recommendation found 90 | return np.zeros(len(evaluation_functions)) 91 | reco_list = recommender.get_recommendation_list(r) 92 | 93 | tmp_results = [] 94 | for f in evaluation_functions: 95 | tmp_results.append(f(ground_truth, reco_list)) 96 | return np.array(tmp_results) 97 | 98 | 99 | def sequence_sequential_evaluation(recommender, seq, evaluation_functions, user, given_k, look_ahead, top_n, step): 100 | if given_k < 0: 101 | given_k = len(seq) + given_k 102 | 103 | eval_res = 0.0 104 | eval_cnt = 0 105 | for gk in range(given_k, len(seq), step): 106 | eval_res += evaluate_sequence(recommender, seq, evaluation_functions, user, gk, look_ahead, top_n) 107 | eval_cnt += 1 108 | return eval_res / eval_cnt 109 | -------------------------------------------------------------------------------- /util/fpmc/FPMC.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | import random 4 | 5 | from util.fpmc.utils import * 6 | 7 | 8 | class FPMC: 9 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 10 | logger = logging.getLogger() 11 | 12 | def __init__(self, n_user, n_item, n_factor, learn_rate, regular): 13 | self.user_set = set() 14 | self.item_set = set() 15 | 16 | self.n_user = n_user 17 | self.n_item = n_item 18 | 19 | self.n_factor = n_factor 20 | self.learn_rate = learn_rate 21 | self.regular = regular 22 | 23 | @staticmethod 24 | def dump(fpmcObj, fname): 25 | pickle.dump(fpmcObj, open(fname, 'wb')) 26 | 27 | @staticmethod 28 | def load(fname): 29 | return pickle.load(open(fname, 'rb')) 30 | 31 | def init_model(self, std=0.01): 32 | self.VUI = np.random.normal(0, std, size=(self.n_user, self.n_factor)) 33 | self.VIU = np.random.normal(0, std, size=(self.n_item, self.n_factor)) 34 | self.VIL = np.random.normal(0, std, size=(self.n_item, self.n_factor)) 35 | self.VLI = np.random.normal(0, std, size=(self.n_item, self.n_factor)) 36 | self.VUI_m_VIU = np.dot(self.VUI, self.VIU.T) 37 | self.VIL_m_VLI = np.dot(self.VIL, self.VLI.T) 38 | 39 | def compute_x(self, u, i, b_tm1): 40 | acc_val = 0.0 41 | for l in b_tm1: 42 | acc_val += np.dot(self.VIL[i], self.VLI[l]) 43 | return (np.dot(self.VUI[u], self.VIU[i]) + (acc_val / len(b_tm1))) 44 | 45 | def compute_x_batch(self, u, b_tm1): 46 | former = self.VUI_m_VIU[u] 47 | latter = np.mean(self.VIL_m_VLI[:, b_tm1], axis=1).T 48 | return (former + latter) 49 | 50 | def evaluation(self, data_list): 51 | np.dot(self.VUI, self.VIU.T, out=self.VUI_m_VIU) 52 | np.dot(self.VIL, self.VLI.T, out=self.VIL_m_VLI) 53 | 54 | correct_count = 0 55 | rr_list = [] 56 | for (u, i, b_tm1) in data_list: 57 | scores = self.compute_x_batch(u, b_tm1) 58 | 59 | if i == scores.argmax(): 60 | correct_count += 1 61 | 62 | rank = 
len(np.where(scores > scores[i])[0]) + 1 63 | rr = 1.0 / rank 64 | rr_list.append(rr) 65 | 66 | try: 67 | acc = correct_count / len(rr_list) 68 | mrr = (sum(rr_list) / len(rr_list)) 69 | return (acc, mrr) 70 | except: 71 | return (0.0, 0.0) 72 | 73 | def learn_epoch(self, tr_data, neg_batch_size): 74 | for iter_idx in range(len(tr_data)): 75 | (u, i, b_tm1) = random.choice(tr_data) 76 | 77 | exclu_set = self.item_set - set([i]) 78 | j_list = random.sample(exclu_set, neg_batch_size) 79 | 80 | z1 = self.compute_x(u, i, b_tm1) 81 | for j in j_list: 82 | z2 = self.compute_x(u, j, b_tm1) 83 | delta = 1 - sigmoid(z1 - z2) 84 | 85 | VUI_update = self.learn_rate * (delta * (self.VIU[i] - self.VIU[j]) - self.regular * self.VUI[u]) 86 | VIUi_update = self.learn_rate * (delta * self.VUI[u] - self.regular * self.VIU[i]) 87 | VIUj_update = self.learn_rate * (-delta * self.VUI[u] - self.regular * self.VIU[j]) 88 | 89 | self.VUI[u] += VUI_update 90 | self.VIU[i] += VIUi_update 91 | self.VIU[j] += VIUj_update 92 | 93 | eta = np.mean(self.VLI[b_tm1], axis=0) 94 | VILi_update = self.learn_rate * (delta * eta - self.regular * self.VIL[i]) 95 | VILj_update = self.learn_rate * (-delta * eta - self.regular * self.VIL[j]) 96 | VLI_update = self.learn_rate * ( 97 | (delta * (self.VIL[i] - self.VIL[j]) / len(b_tm1)) - self.regular * self.VLI[b_tm1]) 98 | 99 | self.VIL[i] += VILi_update 100 | self.VIL[j] += VILj_update 101 | self.VLI[b_tm1] += VLI_update 102 | 103 | def learnSBPR_FPMC(self, tr_data, n_epoch=10, neg_batch_size=10): 104 | for epoch in range(n_epoch): 105 | self.learn_epoch(tr_data, neg_batch_size=neg_batch_size) 106 | self.logger.info('epoch %d done' % epoch) 107 | # if eval_per_epoch == True: 108 | # acc_in, mrr_in = self.evaluation(tr_data) 109 | # if te_data != None: 110 | # acc_out, mrr_out = self.evaluation(te_data) 111 | # self.logger.info ('In sample:%.4f\t%.4f \t Out sample:%.4f\t%.4f' % (acc_in, mrr_in, acc_out, mrr_out)) 112 | # else: 113 | # self.logger.info ('In sample:%.4f\t%.4f' % (acc_in, mrr_in)) 114 | # else: 115 | # 116 | 117 | # if eval_per_epoch == False: 118 | # acc_in, mrr_in = self.evaluation(tr_data) 119 | # if te_data != None: 120 | # acc_out, mrr_out = self.evaluation(te_data) 121 | # print ('In sample:%.4f\t%.4f \t Out sample:%.4f\t%.4f' % (acc_in, mrr_in, acc_out, mrr_out)) 122 | # else: 123 | # print ('In sample:%.4f\t%.4f' % (acc_in, mrr_in)) 124 | # 125 | # if te_data != None: 126 | # return (acc_out, mrr_out) 127 | # else: 128 | # return None 129 | -------------------------------------------------------------------------------- /util/fpmc/FPMC_numba.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from numba import jit 4 | 5 | from util.fpmc import FPMC as FPMC_basic 6 | from util.fpmc.utils import * 7 | 8 | 9 | class FPMC(FPMC_basic.FPMC): 10 | 11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 12 | logger = logging.getLogger() 13 | 14 | def __init__(self, n_user, n_item, n_factor, learn_rate, regular): 15 | super(FPMC, self).__init__(n_user, n_item, n_factor, learn_rate, regular) 16 | 17 | def evaluation(self, data_3_list): 18 | np.dot(self.VUI, self.VIU.T, out=self.VUI_m_VIU) 19 | np.dot(self.VIL, self.VLI.T, out=self.VIL_m_VLI) 20 | acc, mrr = evaluation_jit(data_3_list[0], data_3_list[1], data_3_list[2], self.VUI_m_VIU, self.VIL_m_VLI) 21 | 22 | return acc, mrr 23 | 24 | def evaluation_recommender(self, user, user_profile): 25 | np.dot(self.VUI, self.VIU.T, 
out=self.VUI_m_VIU) 26 | np.dot(self.VIL, self.VLI.T, out=self.VIL_m_VLI) 27 | scores = evaluation_jit_recommender(user, user_profile, self.VUI_m_VIU, self.VIL_m_VLI) 28 | return sorted(range(len(scores)), key=lambda x: -scores[x]), sorted(scores, reverse=True) 29 | 30 | def learn_epoch(self, data_3_list, neg_batch_size): 31 | VUI, VIU, VLI, VIL = learn_epoch_jit(data_3_list[0], data_3_list[1], data_3_list[2], neg_batch_size, 32 | np.array(list(self.item_set)), self.VUI, self.VIU, self.VLI, self.VIL, 33 | self.learn_rate, self.regular) 34 | self.VUI = VUI 35 | self.VIU = VIU 36 | self.VLI = VLI 37 | self.VIL = VIL 38 | 39 | def learnSBPR_FPMC(self, tr_data, n_epoch=10, neg_batch_size=10): 40 | tr_3_list = data_to_3_list(tr_data) 41 | 42 | for epoch in range(n_epoch): 43 | self.learn_epoch(tr_3_list, neg_batch_size) 44 | self.logger.info('epoch %d done' % epoch) 45 | 46 | # if eval_per_epoch == False: 47 | # acc_in, mrr_in = self.evaluation(tr_3_list) 48 | # if te_data != None: 49 | # acc_out, mrr_out = self.evaluation(te_3_list) 50 | # print ('In sample:%.4f\t%.4f \t Out sample:%.4f\t%.4f' % (acc_in, mrr_in, acc_out, mrr_out)) 51 | # else: 52 | # print ('In sample:%.4f\t%.4f' % (acc_in, mrr_in)) 53 | # 54 | # 55 | # if te_data != None: 56 | # if ret_in_score: 57 | # return (acc_in, mrr_in, acc_out, mrr_out) 58 | # else: 59 | # return (acc_out, mrr_out) 60 | # else: 61 | # return None 62 | 63 | 64 | @jit(nopython=True) 65 | def compute_x_jit(u, i, b_tm1, VUI, VIU, VLI, VIL): 66 | acc_val = 0.0 67 | for l in b_tm1: 68 | acc_val += np.dot(VIL[i], VLI[l]) 69 | return (np.dot(VUI[u], VIU[i]) + (acc_val / len(b_tm1))) 70 | 71 | 72 | @jit(nopython=True) 73 | def learn_epoch_jit(u_list, i_list, b_tm1_list, neg_batch_size, item_set, VUI, VIU, VLI, VIL, learn_rate, regular): 74 | for iter_idx in range(len(u_list)): 75 | d_idx = np.random.randint(0, len(u_list)) 76 | u = u_list[d_idx] 77 | i = i_list[d_idx] 78 | b_tm1 = b_tm1_list[d_idx][b_tm1_list[d_idx] != -1] 79 | 80 | j_list = np.random.choice(item_set, size=neg_batch_size, replace=False) 81 | z1 = compute_x_jit(u, i, b_tm1, VUI, VIU, VLI, VIL) 82 | for j in j_list: 83 | z2 = compute_x_jit(u, j, b_tm1, VUI, VIU, VLI, VIL) 84 | delta = 1 - sigmoid_jit(z1 - z2) 85 | 86 | VUI_update = learn_rate * (delta * (VIU[i] - VIU[j]) - regular * VUI[u]) 87 | VIUi_update = learn_rate * (delta * VUI[u] - regular * VIU[i]) 88 | VIUj_update = learn_rate * (-delta * VUI[u] - regular * VIU[j]) 89 | 90 | VUI[u] += VUI_update 91 | VIU[i] += VIUi_update 92 | VIU[j] += VIUj_update 93 | 94 | eta = np.zeros(VLI.shape[1]) 95 | for l in b_tm1: 96 | eta += VLI[l] 97 | eta = eta / len(b_tm1) 98 | 99 | VILi_update = learn_rate * (delta * eta - regular * VIL[i]) 100 | VILj_update = learn_rate * (-delta * eta - regular * VIL[j]) 101 | VLI_updates = np.zeros((len(b_tm1), VLI.shape[1])) 102 | for idx, l in enumerate(b_tm1): 103 | VLI_updates[idx] = learn_rate * ((delta * (VIL[i] - VIL[j]) / len(b_tm1)) - regular * VLI[l]) 104 | 105 | VIL[i] += VILi_update 106 | VIL[j] += VILj_update 107 | for idx, l in enumerate(b_tm1): 108 | VLI[l] += VLI_updates[idx] 109 | 110 | return VUI, VIU, VLI, VIL 111 | 112 | 113 | @jit(nopython=True) 114 | def sigmoid_jit(x): 115 | if x >= 0: 116 | return math.exp(-np.logaddexp(0, -x)) 117 | else: 118 | return math.exp(x - np.logaddexp(x, 0)) 119 | 120 | 121 | @jit(nopython=True) 122 | def compute_x_batch_jit(u, b_tm1, VUI_m_VIU, VIL_m_VLI): 123 | former = VUI_m_VIU[u] 124 | latter = np.zeros(VIL_m_VLI.shape[0]) 125 | for idx in 
range(VIL_m_VLI.shape[0]): 126 | for l in b_tm1: 127 | latter[idx] += VIL_m_VLI[idx, l] 128 | latter = latter / len(b_tm1) 129 | 130 | return (former + latter) 131 | 132 | 133 | @jit(nopython=True) 134 | def evaluation_jit(u_list, i_list, b_tm1_list, VUI_m_VIU, VIL_m_VLI): 135 | correct_count = 0 136 | acc_rr = 0 137 | for d_idx in range(len(u_list)): 138 | u = u_list[d_idx] 139 | i = i_list[d_idx] 140 | b_tm1 = b_tm1_list[d_idx][b_tm1_list[d_idx] != -1] 141 | scores = compute_x_batch_jit(u, b_tm1, VUI_m_VIU, VIL_m_VLI) 142 | 143 | if i == scores.argmax(): 144 | correct_count += 1 145 | 146 | rank = len(np.where(scores > scores[i])[0]) + 1 147 | rr = 1.0 / rank 148 | acc_rr += rr 149 | 150 | acc = correct_count / len(u_list) 151 | mrr = acc_rr / len(u_list) 152 | return (acc, mrr) 153 | 154 | 155 | @jit(nopython=True) 156 | def evaluation_jit_recommender(user, b_tm1_list, VUI_m_VIU, VIL_m_VLI): 157 | u = user 158 | # b_tm1 = [x for x in b_tm1_list if x!=-1] 159 | b_tm1 = b_tm1_list 160 | scores = compute_x_batch_jit(u, b_tm1, VUI_m_VIU, VIL_m_VLI) 161 | 162 | return scores 163 | -------------------------------------------------------------------------------- /util/fpmc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/fpmc/__init__.py -------------------------------------------------------------------------------- /util/fpmc/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import math 3 | 4 | import numpy as np 5 | 6 | 7 | def sigmoid(x): 8 | if x >= 0: 9 | return math.exp(-np.logaddexp(0, -x)) 10 | else: 11 | return math.exp(x - np.logaddexp(x, 0)) 12 | 13 | 14 | def load_data_from_dir(dirname): 15 | fname_user_idxseq = dirname + '/' + 'idxseq.txt' 16 | fname_user_list = dirname + '/' + 'user_idx_list.txt' 17 | fname_item_list = dirname + '/' + 'item_idx_list.txt' 18 | user_set = load_idx_list_file(fname_user_list) 19 | item_set = load_idx_list_file(fname_item_list) 20 | 21 | data_list = [] 22 | with open(fname_user_idxseq, 'r') as f: 23 | for l in f: 24 | l = [int(s) for s in l.strip().split()] 25 | user = l[0] 26 | b_tm1 = list(set(l[1:-1])) 27 | label = l[-1] 28 | 29 | data_list.append((user, label, b_tm1)) 30 | 31 | return data_list, user_set, item_set 32 | 33 | 34 | def load_idx_list_file(fname, delimiter=','): 35 | idx_set = set() 36 | with open(fname, 'r') as f: 37 | # dicard header 38 | f.readline() 39 | 40 | for l in csv.reader(f, delimiter=delimiter, quotechar='"'): 41 | idx = int(l[0]) 42 | idx_set.add(idx) 43 | return idx_set 44 | 45 | 46 | def data_to_3_list(data_list): 47 | u_list = [] 48 | i_list = [] 49 | b_tm1_list = [] 50 | max_l = 0 51 | for d in data_list: 52 | u_list.append(d[0]) 53 | i_list.append(d[1]) 54 | b_tm1_list.append(d[2]) 55 | if len(d[2]) > max_l: 56 | max_l = len(d[2]) 57 | for b_tm1 in b_tm1_list: 58 | b_tm1.extend([-1 for i in range(max_l - len(b_tm1))]) 59 | b_tm1_list = np.array(b_tm1_list) 60 | 61 | return (u_list, i_list, b_tm1_list) 62 | -------------------------------------------------------------------------------- /util/knn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/knn/__init__.py -------------------------------------------------------------------------------- /util/knn/iknn.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 26 11:57:27 2015 4 | @author: Balázs Hidasi 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | class ItemKNN: 12 | ''' 13 | ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time') 14 | 15 | Item-to-item predictor that computes the the similarity to all items to the given item. 16 | 17 | Similarity of two items is given by: 18 | 19 | .. math:: 20 | s_{i,j}=\sum_{s}I\{(s,i)\in D & (s,j)\in D\} / (supp_i+\\lambda)^{\\alpha}(supp_j+\\lambda)^{1-\\alpha} 21 | 22 | Parameters 23 | -------- 24 | n_sims : int 25 | Only give back non-zero scores to the N most similar items. Should be higher or equal than the cut-off of your evaluation. (Default value: 100) 26 | lmbd : float 27 | Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20) 28 | alpha : float 29 | Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules). 30 | session_key : string 31 | header of the session ID column in the input file (default: 'SessionId') 32 | item_key : string 33 | header of the item ID column in the input file (default: 'ItemId') 34 | time_key : string 35 | header of the timestamp column in the input file (default: 'Time') 36 | 37 | ''' 38 | 39 | def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'): 40 | self.n_sims = n_sims 41 | self.lmbd = lmbd 42 | self.alpha = alpha 43 | self.item_key = item_key 44 | self.session_key = session_key 45 | self.time_key = time_key 46 | 47 | def fit(self, data): 48 | ''' 49 | Trains the predictor. 50 | 51 | Parameters 52 | -------- 53 | data: pandas.DataFrame 54 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). 55 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). 
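        Examples
        --------
        Illustrative sketch; assumes `train_df` is a pandas.DataFrame with
        'SessionId', 'ItemId' and 'Time' columns and that item 42 occurs in it::

            model = ItemKNN(n_sims=100, lmbd=20, alpha=0.5)
            model.fit(train_df)
            # scores is a pandas.Series of similarity scores indexed by item ID
            scores = model.predict_next(session_id=1, input_item_id=42)
            top10 = scores.nlargest(10)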
56 | 57 | ''' 58 | data.set_index(np.arange(len(data)), inplace=True) 59 | self.itemids = data[self.item_key].unique() 60 | n_items = len(self.itemids) 61 | data = pd.merge(data, pd.DataFrame({self.item_key: self.itemids, 'ItemIdx': np.arange(len(self.itemids))}), 62 | on=self.item_key, how='inner') 63 | sessionids = data[self.session_key].unique() 64 | data = pd.merge(data, pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))}), 65 | on=self.session_key, how='inner') 66 | supp = data.groupby('SessionIdx').size() 67 | session_offsets = np.zeros(len(supp) + 1, dtype=np.int32) 68 | session_offsets[1:] = supp.cumsum() 69 | index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values 70 | supp = data.groupby('ItemIdx').size() 71 | item_offsets = np.zeros(n_items + 1, dtype=np.int32) 72 | item_offsets[1:] = supp.cumsum() 73 | index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values 74 | self.sims = dict() 75 | for i in range(n_items): 76 | iarray = np.zeros(n_items) 77 | start = item_offsets[i] 78 | end = item_offsets[i + 1] 79 | for e in index_by_items[start:end]: 80 | uidx = data.SessionIdx.values[e] 81 | ustart = session_offsets[uidx] 82 | uend = session_offsets[uidx + 1] 83 | user_events = index_by_sessions[ustart:uend] 84 | iarray[data.ItemIdx.values[user_events]] += 1 85 | iarray[i] = 0 86 | norm = np.power((supp[i] + self.lmbd), self.alpha) * np.power((supp.values + self.lmbd), (1.0 - self.alpha)) 87 | norm[norm == 0] = 1 88 | iarray = iarray / norm 89 | indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1] 90 | self.sims[self.itemids[i]] = pd.Series(data=iarray[indices], index=self.itemids[indices]) 91 | 92 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0): 93 | ''' 94 | Gives predicton scores for a selected set of items on how likely they be the next item in the session. 95 | 96 | Parameters 97 | -------- 98 | session_id : int or string 99 | The session IDs of the event. 100 | input_item_id : int or string 101 | The item ID of the event. Must be in the set of item IDs of the training set. 102 | predict_for_item_ids : 1D array 103 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. 104 | 105 | Returns 106 | -------- 107 | out : pandas.Series 108 | Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. 109 | 110 | ''' 111 | if predict_for_item_ids is None: 112 | predict_for_item_ids = self.itemids 113 | preds = np.zeros(len(predict_for_item_ids)) 114 | sim_list = self.sims[input_item_id] 115 | mask = np.in1d(predict_for_item_ids, sim_list.index) 116 | preds[mask] = sim_list[predict_for_item_ids[mask]] 117 | return pd.Series(data=preds, index=predict_for_item_ids) 118 | -------------------------------------------------------------------------------- /util/knn/sfsknn.py: -------------------------------------------------------------------------------- 1 | from _operator import itemgetter 2 | from math import sqrt 3 | import random 4 | import time 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | class SeqFilterSessionKNN: 11 | ''' 12 | SessionKNN( k, sample_size=500, sampling='recent', similarity = 'jaccard', remind=False, pop_boost=0, session_key = 'SessionId', item_key= 'ItemId') 13 | 14 | Parameters 15 | ----------- 16 | k : int 17 | Number of neighboring session to calculate the item scores from. 
(Default value: 100) 18 | sample_size : int 19 | Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 500) 20 | sampling : string 21 | String to define the sampling method for sessions (recent, random). (default: recent) 22 | similarity : string 23 | String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard) 24 | remind : bool 25 | Should the last items of the current session be boosted to the top as reminders 26 | pop_boost : int 27 | Push popular items in the neighbor sessions by this factor. (default: 0 to leave out) 28 | extend : bool 29 | Add evaluated sessions to the maps 30 | normalize : bool 31 | Normalize the scores in the end 32 | session_key : string 33 | Header of the session ID column in the input file. (default: 'SessionId') 34 | item_key : string 35 | Header of the item ID column in the input file. (default: 'ItemId') 36 | time_key : string 37 | Header of the timestamp column in the input file. (default: 'Time') 38 | ''' 39 | 40 | def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0, 41 | extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'): 42 | 43 | self.remind = remind 44 | self.k = k 45 | self.sample_size = sample_size 46 | self.sampling = sampling 47 | self.similarity = similarity 48 | self.pop_boost = pop_boost 49 | self.session_key = session_key 50 | self.item_key = item_key 51 | self.time_key = time_key 52 | self.extend = extend 53 | self.normalize = normalize 54 | 55 | # updated while recommending 56 | self.session = -1 57 | self.session_items = [] 58 | self.relevant_sessions = set() 59 | 60 | # cache relations once at startup 61 | self.session_item_map = dict() 62 | self.item_session_map = dict() 63 | self.session_time = dict() 64 | self.followed_by = dict() 65 | 66 | self.sim_time = 0 67 | 68 | def fit(self, train, items=None): 69 | ''' 70 | Trains the predictor. 71 | 72 | Parameters 73 | -------- 74 | data: pandas.DataFrame 75 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). 76 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). 
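
        Example
        --------
        A minimal sketch (the toy data and the import path from the repository
        root are illustrative assumptions)::

            import pandas as pd
            from util.knn.sfsknn import SeqFilterSessionKNN

            train = pd.DataFrame({'SessionId': [1, 1, 1, 2, 2],
                                  'ItemId':    ['a', 'b', 'c', 'a', 'c'],
                                  'Time':      [0, 1, 2, 3, 4]})
            knn = SeqFilterSessionKNN(k=100, sample_size=500)
            knn.fit(train)
            # besides the session/item maps, fit also records which items directly
            # followed which: here followed_by['a'] == {'b', 'c'} and followed_by['b'] == {'c'}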
77 | 78 | ''' 79 | 80 | index_session = train.columns.get_loc(self.session_key) 81 | index_item = train.columns.get_loc(self.item_key) 82 | index_time = train.columns.get_loc(self.time_key) 83 | self.itemids = train[self.item_key].unique() 84 | 85 | session = -1 86 | session_items = set() 87 | last_item = -1 88 | time = -1 89 | # cnt = 0 90 | for row in train.itertuples(index=False): 91 | # cache items of sessions 92 | if row[index_session] != session: 93 | if len(session_items) > 0: 94 | self.session_item_map.update({session: session_items}) 95 | # cache the last time stamp of the session 96 | self.session_time.update({session: time}) 97 | session = row[index_session] 98 | session_items = set() 99 | else: 100 | if last_item != -1: # fill followed by map for filtering of candidate items 101 | if last_item not in self.followed_by: 102 | self.followed_by[last_item] = set() 103 | self.followed_by[last_item].add(row[index_item]) 104 | 105 | time = row[index_time] 106 | session_items.add(row[index_item]) 107 | 108 | # cache sessions involving an item 109 | map_is = self.item_session_map.get(row[index_item]) 110 | if map_is is None: 111 | map_is = set() 112 | self.item_session_map.update({row[index_item]: map_is}) 113 | map_is.add(row[index_session]) 114 | 115 | last_item = row[index_item] 116 | 117 | # Add the last tuple 118 | self.session_item_map.update({session: session_items}) 119 | self.session_time.update({session: time}) 120 | 121 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0): 122 | ''' 123 | Gives prediction scores for a selected set of items on how likely they are to be the next item in the session. 124 | 125 | Parameters 126 | -------- 127 | session_id : int or string 128 | The session ID of the event. 129 | input_item_id : int or string 130 | The item ID of the event. Must be in the set of item IDs of the training set. 131 | predict_for_item_ids : 1D array 132 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. 133 | 134 | Returns 135 | -------- 136 | out : pandas.Series 137 | Prediction scores for the selected items, indicating how likely each is to be the next item of this session. Indexed by the item IDs.
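
        Example
        --------
        Sketch of the intended call pattern, assuming ``knn`` was fitted as in the
        ``fit`` example above; the recommender tracks the current session internally,
        so it is called once per event with the same ``session_id``::

            knn.predict_next(session_id=3, input_item_id='a')            # first event of session 3
            scores = knn.predict_next(session_id=3, input_item_id='b')   # second event
            scores.nlargest(10)                                          # top-10 item IDs with scores

        With the default settings (``remind=False``, ``pop_boost=0``), only items that
        were observed to follow ``input_item_id`` in the training sessions can receive
        a non-zero score, because candidates are filtered through the ``followed_by``
        map in ``score_items``.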
138 | 139 | ''' 140 | 141 | # gc.collect() 142 | # process = psutil.Process(os.getpid()) 143 | # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used') 144 | 145 | if (self.session != session_id): # new session 146 | 147 | if (self.extend): 148 | item_set = set(self.session_items) 149 | self.session_item_map[self.session] = item_set; 150 | for item in item_set: 151 | map_is = self.item_session_map.get(item) 152 | if map_is is None: 153 | map_is = set() 154 | self.item_session_map.update({item: map_is}) 155 | map_is.add(self.session) 156 | 157 | ts = time.time() 158 | self.session_time.update({self.session: ts}) 159 | 160 | last_item = -1 161 | for item in self.session_items: 162 | if last_item != -1: 163 | if not last_item in self.followed_by: 164 | self.followed_by[last_item] = set() 165 | self.followed_by[last_item].add(item) 166 | last_item = item 167 | 168 | self.session = session_id 169 | self.session_items = list() 170 | self.relevant_sessions = set() 171 | 172 | if type == 'view': 173 | self.session_items.append(input_item_id) 174 | 175 | if skip: 176 | return 177 | 178 | neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id) 179 | scores = self.score_items(neighbors, input_item_id) 180 | 181 | # add some reminders 182 | if self.remind: 183 | 184 | reminderScore = 5 185 | takeLastN = 3 186 | 187 | cnt = 0 188 | for elem in self.session_items[-takeLastN:]: 189 | cnt = cnt + 1 190 | # reminderScore = reminderScore + (cnt/100) 191 | 192 | oldScore = scores.get(elem) 193 | newScore = 0 194 | if oldScore is None: 195 | newScore = reminderScore 196 | else: 197 | newScore = oldScore + reminderScore 198 | # print 'old score ', oldScore 199 | # update the score and add a small number for the position 200 | newScore = (newScore * reminderScore) + (cnt / 100) 201 | 202 | scores.update({elem: newScore}) 203 | 204 | # push popular ones 205 | if self.pop_boost > 0: 206 | 207 | pop = self.item_pop(neighbors) 208 | # Iterate over the item neighbors 209 | # print itemScores 210 | for key in scores: 211 | item_pop = pop.get(key) 212 | # Gives some minimal MRR boost? 213 | scores.update({key: (scores[key] + (self.pop_boost * item_pop))}) 214 | 215 | # Create things in the format .. 
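        # i.e. align the scores with the requested item IDs: build a zero vector over
        # predict_for_item_ids, fill the positions of scored items via the boolean mask,
        # wrap the result in a pandas Series indexed by item ID and, if normalize=True,
        # divide by the maximum so the best candidate gets score 1.0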
216 | if predict_for_item_ids is None: 217 | predict_for_item_ids = self.itemids 218 | predictions = np.zeros(len(predict_for_item_ids)) 219 | mask = np.in1d(predict_for_item_ids, list(scores.keys())) 220 | 221 | items = predict_for_item_ids[mask] 222 | values = [scores[x] for x in items] 223 | predictions[mask] = values 224 | series = pd.Series(data=predictions, index=predict_for_item_ids) 225 | 226 | if self.normalize: 227 | series = series / series.max() 228 | 229 | return series 230 | 231 | def item_pop(self, sessions): 232 | ''' 233 | Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids) 234 | 235 | Parameters 236 | -------- 237 | sessions: set 238 | 239 | Returns 240 | -------- 241 | out : dict 242 | ''' 243 | result = dict() 244 | max_pop = 0 245 | for session, weight in sessions: 246 | items = self.items_for_session(session) 247 | for item in items: 248 | 249 | count = result.get(item) 250 | if count is None: 251 | result.update({item: 1}) 252 | else: 253 | result.update({item: count + 1}) 254 | 255 | if (result.get(item) > max_pop): 256 | max_pop = result.get(item) 257 | 258 | for key in result: 259 | result.update({key: (result[key] / max_pop)}) 260 | 261 | return result 262 | 263 | def jaccard(self, first, second): 264 | ''' 265 | Calculates the jaccard index for two sessions 266 | 267 | Parameters 268 | -------- 269 | first: Id of a session 270 | second: Id of a session 271 | 272 | Returns 273 | -------- 274 | out : float value 275 | ''' 276 | sc = time.clock() 277 | intersection = len(first & second) 278 | union = len(first | second) 279 | res = intersection / union 280 | 281 | self.sim_time += (time.clock() - sc) 282 | 283 | return res 284 | 285 | def cosine(self, first, second): 286 | ''' 287 | Calculates the cosine similarity for two sessions 288 | 289 | Parameters 290 | -------- 291 | first: Id of a session 292 | second: Id of a session 293 | 294 | Returns 295 | -------- 296 | out : float value 297 | ''' 298 | li = len(first & second) 299 | la = len(first) 300 | lb = len(second) 301 | result = li / sqrt(la) * sqrt(lb) 302 | 303 | return result 304 | 305 | def tanimoto(self, first, second): 306 | ''' 307 | Calculates the cosine tanimoto similarity for two sessions 308 | 309 | Parameters 310 | -------- 311 | first: Id of a session 312 | second: Id of a session 313 | 314 | Returns 315 | -------- 316 | out : float value 317 | ''' 318 | li = len(first & second) 319 | la = len(first) 320 | lb = len(second) 321 | result = li / (la + lb - li) 322 | 323 | return result 324 | 325 | def binary(self, first, second): 326 | ''' 327 | Calculates the ? 
for 2 sessions 328 | 329 | Parameters 330 | -------- 331 | first: Id of a session 332 | second: Id of a session 333 | 334 | Returns 335 | -------- 336 | out : float value 337 | ''' 338 | a = len(first & second) 339 | b = len(first) 340 | c = len(second) 341 | 342 | result = (2 * a) / ((2 * a) + b + c) 343 | 344 | return result 345 | 346 | def items_for_session(self, session): 347 | ''' 348 | Returns all items in the session 349 | 350 | Parameters 351 | -------- 352 | session: Id of a session 353 | 354 | Returns 355 | -------- 356 | out : set 357 | ''' 358 | return self.session_item_map.get(session); 359 | 360 | def sessions_for_item(self, item_id): 361 | ''' 362 | Returns all session for an item 363 | 364 | Parameters 365 | -------- 366 | item: Id of the item session 367 | 368 | Returns 369 | -------- 370 | out : set 371 | ''' 372 | return self.item_session_map.get(item_id) 373 | 374 | def most_recent_sessions(self, sessions, number): 375 | ''' 376 | Find the most recent sessions in the given set 377 | 378 | Parameters 379 | -------- 380 | sessions: set of session ids 381 | 382 | Returns 383 | -------- 384 | out : set 385 | ''' 386 | sample = set() 387 | 388 | tuples = list() 389 | for session in sessions: 390 | time = self.session_time.get(session) 391 | if time is None: 392 | print(' EMPTY TIMESTAMP!! ', session) 393 | tuples.append((session, time)) 394 | 395 | tuples = sorted(tuples, key=itemgetter(1), reverse=True) 396 | # print 'sorted list ', sortedList 397 | cnt = 0 398 | for element in tuples: 399 | cnt = cnt + 1 400 | if cnt > number: 401 | break 402 | sample.add(element[0]) 403 | # print 'returning sample of size ', len(sample) 404 | return sample 405 | 406 | def possible_neighbor_sessions(self, session_items, input_item_id, session_id): 407 | ''' 408 | Find a set of session to later on find neighbors in. 409 | A self.sample_size of 0 uses all sessions in which any item of the current session appears. 410 | self.sampling can be performed with the options "recent" or "random". 411 | "recent" selects the self.sample_size most recent sessions while "random" just choses randomly. 412 | 413 | Parameters 414 | -------- 415 | sessions: set of session ids 416 | 417 | Returns 418 | -------- 419 | out : set 420 | ''' 421 | 422 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id); 423 | 424 | if self.sample_size == 0: # use all session as possible neighbors 425 | 426 | print('!!!!! runnig KNN without a sample size (check config)') 427 | return self.relevant_sessions 428 | 429 | else: # sample some sessions 430 | 431 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id); 432 | 433 | if len(self.relevant_sessions) > self.sample_size: 434 | 435 | if self.sampling == 'recent': 436 | sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size) 437 | elif self.sampling == 'random': 438 | sample = random.sample(self.relevant_sessions, self.sample_size) 439 | else: 440 | sample = self.relevant_sessions[:self.sample_size] 441 | 442 | return sample 443 | else: 444 | return self.relevant_sessions 445 | 446 | def calc_similarity(self, session_items, sessions): 447 | ''' 448 | Calculates the configured similarity for the items in session_items and each session in sessions. 
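        The measure is resolved by name via ``getattr``: e.g. with ``similarity='jaccard'``
        the item sets {1, 2, 3} and {2, 3, 4} score 2 / 4 = 0.5 (intersection over union).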
449 | 450 | Parameters 451 | -------- 452 | session_items: set of item ids 453 | sessions: list of session ids 454 | 455 | Returns 456 | -------- 457 | out : list of tuple (session_id,similarity) 458 | ''' 459 | 460 | # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric 461 | neighbors = [] 462 | cnt = 0 463 | for session in sessions: 464 | cnt = cnt + 1 465 | # get items of the session, look up the cache first 466 | session_items_test = self.items_for_session(session) 467 | 468 | similarity = getattr(self, self.similarity)(session_items_test, session_items) 469 | if similarity > 0: 470 | neighbors.append((session, similarity)) 471 | 472 | return neighbors 473 | 474 | # ----------------- 475 | # Find a set of neighbors, returns a list of tuples (sessionid: similarity) 476 | # ----------------- 477 | def find_neighbors(self, session_items, input_item_id, session_id): 478 | ''' 479 | Finds the k nearest neighbors for the given session_id and the current item input_item_id. 480 | 481 | Parameters 482 | -------- 483 | session_items: set of item ids 484 | input_item_id: int 485 | session_id: int 486 | 487 | Returns 488 | -------- 489 | out : list of tuple (session_id, similarity) 490 | ''' 491 | possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id) 492 | possible_neighbors = self.calc_similarity(session_items, possible_neighbors) 493 | 494 | possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1]) 495 | possible_neighbors = possible_neighbors[:self.k] 496 | 497 | return possible_neighbors 498 | 499 | def score_items(self, neighbors, input_item_id): 500 | ''' 501 | Compute a set of scores for all items given a set of neighbors. 502 | 503 | Parameters 504 | -------- 505 | neighbors: set of session ids 506 | 507 | Returns 508 | -------- 509 | out : list of tuple (item, score) 510 | ''' 511 | # now we have the set of relevant items to make predictions 512 | scores = dict() 513 | # iterate over the sessions 514 | for session in neighbors: 515 | # get the items in this session 516 | items = self.items_for_session(session[0]) 517 | 518 | for item in items: 519 | 520 | if input_item_id in self.followed_by and item in self.followed_by[ 521 | input_item_id]: # hard filter the candidates 522 | 523 | old_score = scores.get(item) 524 | new_score = session[1] 525 | 526 | if old_score is None: 527 | scores.update({item: new_score}) 528 | else: 529 | new_score = old_score + new_score 530 | scores.update({item: new_score}) 531 | 532 | return scores 533 | -------------------------------------------------------------------------------- /util/knn/sknn.py: -------------------------------------------------------------------------------- 1 | from _operator import itemgetter 2 | from math import sqrt 3 | import random 4 | import time 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | class SessionKNN: 11 | ''' 12 | SessionKNN( k, sample_size=500, sampling='recent', similarity = 'jaccard', remind=False, pop_boost=0, session_key = 'SessionId', item_key= 'ItemId') 13 | 14 | Parameters 15 | ----------- 16 | k : int 17 | Number of neighboring session to calculate the item scores from. (Default value: 100) 18 | sample_size : int 19 | Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 500) 20 | sampling : string 21 | String to define the sampling method for sessions (recent, random). 
(default: recent) 22 | similarity : string 23 | String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard) 24 | remind : bool 25 | Should the last items of the current session be boosted to the top as reminders 26 | pop_boost : int 27 | Push popular items in the neighbor sessions by this factor. (default: 0 to leave out) 28 | extend : bool 29 | Add evaluated sessions to the maps 30 | normalize : bool 31 | Normalize the scores in the end 32 | session_key : string 33 | Header of the session ID column in the input file. (default: 'SessionId') 34 | item_key : string 35 | Header of the item ID column in the input file. (default: 'ItemId') 36 | time_key : string 37 | Header of the timestamp column in the input file. (default: 'Time') 38 | ''' 39 | 40 | def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0, 41 | extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'): 42 | 43 | self.remind = remind 44 | self.k = k 45 | self.sample_size = sample_size 46 | self.sampling = sampling 47 | self.similarity = similarity 48 | self.pop_boost = pop_boost 49 | self.session_key = session_key 50 | self.item_key = item_key 51 | self.time_key = time_key 52 | self.extend = extend 53 | self.normalize = normalize 54 | 55 | # updated while recommending 56 | self.session = -1 57 | self.session_items = [] 58 | self.relevant_sessions = set() 59 | 60 | # cache relations once at startup 61 | self.session_item_map = dict() 62 | self.item_session_map = dict() 63 | self.session_time = dict() 64 | 65 | self.sim_time = 0 66 | 67 | def fit(self, train): 68 | ''' 69 | Trains the predictor. 70 | 71 | Parameters 72 | -------- 73 | data: pandas.DataFrame 74 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). 75 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). 76 | 77 | ''' 78 | 79 | index_session = train.columns.get_loc(self.session_key) 80 | index_item = train.columns.get_loc(self.item_key) 81 | index_time = train.columns.get_loc(self.time_key) 82 | self.itemids = train[self.item_key].unique() 83 | 84 | session = -1 85 | session_items = set() 86 | time = -1 87 | # cnt = 0 88 | for row in train.itertuples(index=False): 89 | # cache items of sessions 90 | if row[index_session] != session: 91 | if len(session_items) > 0: 92 | self.session_item_map.update({session: session_items}) 93 | # cache the last time stamp of the session 94 | self.session_time.update({session: time}) 95 | session = row[index_session] 96 | session_items = set() 97 | time = row[index_time] 98 | session_items.add(row[index_item]) 99 | 100 | # cache sessions involving an item 101 | map_is = self.item_session_map.get(row[index_item]) 102 | if map_is is None: 103 | map_is = set() 104 | self.item_session_map.update({row[index_item]: map_is}) 105 | map_is.add(row[index_session]) 106 | 107 | # Add the last tuple 108 | self.session_item_map.update({session: session_items}) 109 | self.session_time.update({session: time}) 110 | 111 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0): 112 | ''' 113 | Gives predicton scores for a selected set of items on how likely they be the next item in the session. 
114 | 115 | Parameters 116 | -------- 117 | session_id : int or string 118 | The session IDs of the event. 119 | input_item_id : int or string 120 | The item ID of the event. Must be in the set of item IDs of the training set. 121 | predict_for_item_ids : 1D array 122 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. 123 | 124 | Returns 125 | -------- 126 | out : pandas.Series 127 | Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. 128 | 129 | ''' 130 | 131 | # gc.collect() 132 | # process = psutil.Process(os.getpid()) 133 | # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used') 134 | 135 | if (self.session != session_id): # new session 136 | 137 | if (self.extend): 138 | item_set = set(self.session_items) 139 | self.session_item_map[self.session] = item_set; 140 | for item in item_set: 141 | map_is = self.item_session_map.get(item) 142 | if map_is is None: 143 | map_is = set() 144 | self.item_session_map.update({item: map_is}) 145 | map_is.add(self.session) 146 | 147 | ts = time.time() 148 | self.session_time.update({self.session: ts}) 149 | 150 | self.session = session_id 151 | self.session_items = list() 152 | self.relevant_sessions = set() 153 | 154 | if type == 'view': 155 | self.session_items.append(input_item_id) 156 | 157 | if skip: 158 | return 159 | 160 | neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id) 161 | scores = self.score_items(neighbors) 162 | 163 | # add some reminders 164 | if self.remind: 165 | 166 | reminderScore = 5 167 | takeLastN = 3 168 | 169 | cnt = 0 170 | for elem in self.session_items[-takeLastN:]: 171 | cnt = cnt + 1 172 | # reminderScore = reminderScore + (cnt/100) 173 | 174 | oldScore = scores.get(elem) 175 | newScore = 0 176 | if oldScore is None: 177 | newScore = reminderScore 178 | else: 179 | newScore = oldScore + reminderScore 180 | # print 'old score ', oldScore 181 | # update the score and add a small number for the position 182 | newScore = (newScore * reminderScore) + (cnt / 100) 183 | 184 | scores.update({elem: newScore}) 185 | 186 | # push popular ones 187 | if self.pop_boost > 0: 188 | 189 | pop = self.item_pop(neighbors) 190 | # Iterate over the item neighbors 191 | # print itemScores 192 | for key in scores: 193 | item_pop = pop.get(key) 194 | # Gives some minimal MRR boost? 195 | scores.update({key: (scores[key] + (self.pop_boost * item_pop))}) 196 | 197 | # Create things in the format .. 
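        # i.e. turn the neighbor-based score dict into a dense pandas Series over
        # predict_for_item_ids: unscored items stay at 0.0, scored items are filled in
        # through the in1d mask, and with normalize=True the series is scaled by its maximum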
198 | if predict_for_item_ids is None: 199 | predict_for_item_ids = self.itemids 200 | predictions = np.zeros(len(predict_for_item_ids)) 201 | mask = np.in1d(predict_for_item_ids, list(scores.keys())) 202 | 203 | items = predict_for_item_ids[mask] 204 | values = [scores[x] for x in items] 205 | predictions[mask] = values 206 | series = pd.Series(data=predictions, index=predict_for_item_ids) 207 | 208 | if self.normalize: 209 | series = series / series.max() 210 | 211 | return series 212 | 213 | def item_pop(self, sessions): 214 | ''' 215 | Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids) 216 | 217 | Parameters 218 | -------- 219 | sessions: set 220 | 221 | Returns 222 | -------- 223 | out : dict 224 | ''' 225 | result = dict() 226 | max_pop = 0 227 | for session, weight in sessions: 228 | items = self.items_for_session(session) 229 | for item in items: 230 | 231 | count = result.get(item) 232 | if count is None: 233 | result.update({item: 1}) 234 | else: 235 | result.update({item: count + 1}) 236 | 237 | if (result.get(item) > max_pop): 238 | max_pop = result.get(item) 239 | 240 | for key in result: 241 | result.update({key: (result[key] / max_pop)}) 242 | 243 | return result 244 | 245 | def jaccard(self, first, second): 246 | ''' 247 | Calculates the jaccard index for two sessions 248 | 249 | Parameters 250 | -------- 251 | first: Id of a session 252 | second: Id of a session 253 | 254 | Returns 255 | -------- 256 | out : float value 257 | ''' 258 | sc = time.clock() 259 | intersection = len(first & second) 260 | union = len(first | second) 261 | res = intersection / union 262 | 263 | self.sim_time += (time.clock() - sc) 264 | 265 | return res 266 | 267 | def cosine(self, first, second): 268 | ''' 269 | Calculates the cosine similarity for two sessions 270 | 271 | Parameters 272 | -------- 273 | first: Id of a session 274 | second: Id of a session 275 | 276 | Returns 277 | -------- 278 | out : float value 279 | ''' 280 | li = len(first & second) 281 | la = len(first) 282 | lb = len(second) 283 | result = li / sqrt(la) * sqrt(lb) 284 | 285 | return result 286 | 287 | def tanimoto(self, first, second): 288 | ''' 289 | Calculates the cosine tanimoto similarity for two sessions 290 | 291 | Parameters 292 | -------- 293 | first: Id of a session 294 | second: Id of a session 295 | 296 | Returns 297 | -------- 298 | out : float value 299 | ''' 300 | li = len(first & second) 301 | la = len(first) 302 | lb = len(second) 303 | result = li / (la + lb - li) 304 | 305 | return result 306 | 307 | def binary(self, first, second): 308 | ''' 309 | Calculates the ? for 2 sessions 310 | 311 | Parameters 312 | -------- 313 | first: Id of a session 314 | second: Id of a session 315 | 316 | Returns 317 | -------- 318 | out : float value 319 | ''' 320 | a = len(first & second) 321 | b = len(first) 322 | c = len(second) 323 | 324 | result = (2 * a) / ((2 * a) + b + c) 325 | 326 | return result 327 | 328 | def random(self, first, second): 329 | ''' 330 | Calculates the ? 
for 2 sessions 331 | 332 | Parameters 333 | -------- 334 | first: Id of a session 335 | second: Id of a session 336 | 337 | Returns 338 | -------- 339 | out : float value 340 | ''' 341 | return random.random() 342 | 343 | def items_for_session(self, session): 344 | ''' 345 | Returns all items in the session 346 | 347 | Parameters 348 | -------- 349 | session: Id of a session 350 | 351 | Returns 352 | -------- 353 | out : set 354 | ''' 355 | return self.session_item_map.get(session); 356 | 357 | def sessions_for_item(self, item_id): 358 | ''' 359 | Returns all session for an item 360 | 361 | Parameters 362 | -------- 363 | item: Id of the item session 364 | 365 | Returns 366 | -------- 367 | out : set 368 | ''' 369 | return self.item_session_map.get(item_id) 370 | 371 | def most_recent_sessions(self, sessions, number): 372 | ''' 373 | Find the most recent sessions in the given set 374 | 375 | Parameters 376 | -------- 377 | sessions: set of session ids 378 | 379 | Returns 380 | -------- 381 | out : set 382 | ''' 383 | sample = set() 384 | 385 | tuples = list() 386 | for session in sessions: 387 | time = self.session_time.get(session) 388 | if time is None: 389 | print(' EMPTY TIMESTAMP!! ', session) 390 | tuples.append((session, time)) 391 | 392 | tuples = sorted(tuples, key=itemgetter(1), reverse=True) 393 | # print 'sorted list ', sortedList 394 | cnt = 0 395 | for element in tuples: 396 | cnt = cnt + 1 397 | if cnt > number: 398 | break 399 | sample.add(element[0]) 400 | # print 'returning sample of size ', len(sample) 401 | return sample 402 | 403 | def possible_neighbor_sessions(self, session_items, input_item_id, session_id): 404 | ''' 405 | Find a set of session to later on find neighbors in. 406 | A self.sample_size of 0 uses all sessions in which any item of the current session appears. 407 | self.sampling can be performed with the options "recent" or "random". 408 | "recent" selects the self.sample_size most recent sessions while "random" just choses randomly. 409 | 410 | Parameters 411 | -------- 412 | sessions: set of session ids 413 | 414 | Returns 415 | -------- 416 | out : set 417 | ''' 418 | 419 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id); 420 | 421 | if self.sample_size == 0: # use all session as possible neighbors 422 | 423 | print('!!!!! runnig KNN without a sample size (check config)') 424 | return self.relevant_sessions 425 | 426 | else: # sample some sessions 427 | 428 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id); 429 | 430 | if len(self.relevant_sessions) > self.sample_size: 431 | 432 | if self.sampling == 'recent': 433 | sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size) 434 | elif self.sampling == 'random': 435 | sample = random.sample(self.relevant_sessions, self.sample_size) 436 | else: 437 | sample = self.relevant_sessions[:self.sample_size] 438 | 439 | return sample 440 | else: 441 | return self.relevant_sessions 442 | 443 | def calc_similarity(self, session_items, sessions): 444 | ''' 445 | Calculates the configured similarity for the items in session_items and each session in sessions. 
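        The measure named by ``similarity`` is resolved via ``getattr``: e.g.
        ``tanimoto`` scores {1, 2, 3} against {2, 3, 4} as 2 / (3 + 3 - 2) = 0.5,
        which on plain item sets coincides with the Jaccard index.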
446 | 447 | Parameters 448 | -------- 449 | session_items: set of item ids 450 | sessions: list of session ids 451 | 452 | Returns 453 | -------- 454 | out : list of tuple (session_id,similarity) 455 | ''' 456 | 457 | # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric 458 | neighbors = [] 459 | cnt = 0 460 | for session in sessions: 461 | cnt = cnt + 1 462 | # get items of the session, look up the cache first 463 | session_items_test = self.items_for_session(session) 464 | 465 | similarity = getattr(self, self.similarity)(session_items_test, session_items) 466 | if similarity > 0: 467 | neighbors.append((session, similarity)) 468 | 469 | return neighbors 470 | 471 | # ----------------- 472 | # Find a set of neighbors, returns a list of tuples (sessionid: similarity) 473 | # ----------------- 474 | def find_neighbors(self, session_items, input_item_id, session_id): 475 | ''' 476 | Finds the k nearest neighbors for the given session_id and the current item input_item_id. 477 | 478 | Parameters 479 | -------- 480 | session_items: set of item ids 481 | input_item_id: int 482 | session_id: int 483 | 484 | Returns 485 | -------- 486 | out : list of tuple (session_id, similarity) 487 | ''' 488 | possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id) 489 | possible_neighbors = self.calc_similarity(session_items, possible_neighbors) 490 | 491 | possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1]) 492 | possible_neighbors = possible_neighbors[:self.k] 493 | 494 | return possible_neighbors 495 | 496 | def score_items(self, neighbors): 497 | ''' 498 | Compute a set of scores for all items given a set of neighbors. 499 | 500 | Parameters 501 | -------- 502 | neighbors: set of session ids 503 | 504 | Returns 505 | -------- 506 | out : list of tuple (item, score) 507 | ''' 508 | # now we have the set of relevant items to make predictions 509 | scores = dict() 510 | # iterate over the sessions 511 | for session in neighbors: 512 | # get the items in this session 513 | items = self.items_for_session(session[0]) 514 | 515 | for item in items: 516 | old_score = scores.get(item) 517 | new_score = session[1] 518 | 519 | if old_score is None: 520 | scores.update({item: new_score}) 521 | else: 522 | new_score = old_score + new_score 523 | scores.update({item: new_score}) 524 | 525 | return scores 526 | -------------------------------------------------------------------------------- /util/knn/ssknn.py: -------------------------------------------------------------------------------- 1 | from _operator import itemgetter 2 | from math import sqrt 3 | import random 4 | import time 5 | from math import log10 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | class SeqSessionKNN: 12 | ''' 13 | SeqSessionKNN( k, sample_size=500, sampling='recent', similarity = 'jaccard', remind=False, pop_boost=0, session_key = 'SessionId', item_key= 'ItemId') 14 | 15 | Parameters 16 | ----------- 17 | k : int 18 | Number of neighboring session to calculate the item scores from. (Default value: 100) 19 | sample_size : int 20 | Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 500) 21 | sampling : string 22 | String to define the sampling method for sessions (recent, random). (default: recent) 23 | similarity : string 24 | String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). 
(default: jaccard) 25 | remind : bool 26 | Should the last items of the current session be boosted to the top as reminders 27 | pop_boost : int 28 | Push popular items in the neighbor sessions by this factor. (default: 0 to leave out) 29 | extend : bool 30 | Add evaluated sessions to the maps 31 | normalize : bool 32 | Normalize the scores in the end 33 | session_key : string 34 | Header of the session ID column in the input file. (default: 'SessionId') 35 | item_key : string 36 | Header of the item ID column in the input file. (default: 'ItemId') 37 | time_key : string 38 | Header of the timestamp column in the input file. (default: 'Time') 39 | ''' 40 | 41 | def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div', remind=False, 42 | pop_boost=0, extend=False, normalize=True, session_key='SessionId', item_key='ItemId', 43 | time_key='Time'): 44 | 45 | self.remind = remind 46 | self.k = k 47 | self.sample_size = sample_size 48 | self.sampling = sampling 49 | self.weighting = weighting 50 | self.similarity = similarity 51 | self.pop_boost = pop_boost 52 | self.session_key = session_key 53 | self.item_key = item_key 54 | self.time_key = time_key 55 | self.extend = extend 56 | self.normalize = normalize 57 | 58 | # updated while recommending 59 | self.session = -1 60 | self.session_items = [] 61 | self.relevant_sessions = set() 62 | 63 | # cache relations once at startup 64 | self.session_item_map = dict() 65 | self.item_session_map = dict() 66 | self.session_time = dict() 67 | 68 | self.sim_time = 0 69 | 70 | def fit(self, train, items=None): 71 | ''' 72 | Trains the predictor. 73 | 74 | Parameters 75 | -------- 76 | data: pandas.DataFrame 77 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). 78 | It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). 79 | 80 | ''' 81 | 82 | index_session = train.columns.get_loc(self.session_key) 83 | index_item = train.columns.get_loc(self.item_key) 84 | index_time = train.columns.get_loc(self.time_key) 85 | self.itemids = train[self.item_key].unique() 86 | 87 | session = -1 88 | session_items = set() 89 | time = -1 90 | # cnt = 0 91 | for row in train.itertuples(index=False): 92 | # cache items of sessions 93 | if row[index_session] != session: 94 | if len(session_items) > 0: 95 | self.session_item_map.update({session: session_items}) 96 | # cache the last time stamp of the session 97 | self.session_time.update({session: time}) 98 | session = row[index_session] 99 | session_items = set() 100 | time = row[index_time] 101 | session_items.add(row[index_item]) 102 | 103 | # cache sessions involving an item 104 | map_is = self.item_session_map.get(row[index_item]) 105 | if map_is is None: 106 | map_is = set() 107 | self.item_session_map.update({row[index_item]: map_is}) 108 | map_is.add(row[index_session]) 109 | 110 | # Add the last tuple 111 | self.session_item_map.update({session: session_items}) 112 | self.session_time.update({session: time}) 113 | 114 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0): 115 | ''' 116 | Gives predicton scores for a selected set of items on how likely they be the next item in the session. 
117 | 118 | Parameters 119 | -------- 120 | session_id : int or string 121 | The session IDs of the event. 122 | input_item_id : int or string 123 | The item ID of the event. Must be in the set of item IDs of the training set. 124 | predict_for_item_ids : 1D array 125 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. 126 | 127 | Returns 128 | -------- 129 | out : pandas.Series 130 | Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. 131 | 132 | ''' 133 | 134 | # gc.collect() 135 | # process = psutil.Process(os.getpid()) 136 | # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used') 137 | 138 | if (self.session != session_id): # new session 139 | 140 | if (self.extend): 141 | item_set = set(self.session_items) 142 | self.session_item_map[self.session] = item_set 143 | for item in item_set: 144 | map_is = self.item_session_map.get(item) 145 | if map_is is None: 146 | map_is = set() 147 | self.item_session_map.update({item: map_is}) 148 | map_is.add(self.session) 149 | 150 | ts = time.time() 151 | self.session_time.update({self.session: ts}) 152 | 153 | self.session = session_id 154 | self.session_items = list() 155 | self.relevant_sessions = set() 156 | 157 | if type == 'view': 158 | self.session_items.append(input_item_id) 159 | 160 | if skip: 161 | return 162 | 163 | neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id) 164 | scores = self.score_items(neighbors, self.session_items) 165 | 166 | # add some reminders 167 | if self.remind: 168 | 169 | reminderScore = 5 170 | takeLastN = 3 171 | 172 | cnt = 0 173 | for elem in self.session_items[-takeLastN:]: 174 | cnt = cnt + 1 175 | # reminderScore = reminderScore + (cnt/100) 176 | 177 | oldScore = scores.get(elem) 178 | newScore = 0 179 | if oldScore is None: 180 | newScore = reminderScore 181 | else: 182 | newScore = oldScore + reminderScore 183 | # print 'old score ', oldScore 184 | # update the score and add a small number for the position 185 | newScore = (newScore * reminderScore) + (cnt / 100) 186 | 187 | scores.update({elem: newScore}) 188 | 189 | # push popular ones 190 | if self.pop_boost > 0: 191 | 192 | pop = self.item_pop(neighbors) 193 | # Iterate over the item neighbors 194 | # print itemScores 195 | for key in scores: 196 | item_pop = pop.get(key) 197 | # Gives some minimal MRR boost? 198 | scores.update({key: (scores[key] + (self.pop_boost * item_pop))}) 199 | 200 | # Create things in the format .. 
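        # i.e. project the scores onto the full item ID vector as a pandas Series;
        # note that at this point each score already includes the positional decay
        # applied in score_items via the configured weighting function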
201 | if predict_for_item_ids is None: 202 | predict_for_item_ids = self.itemids 203 | predictions = np.zeros(len(predict_for_item_ids)) 204 | mask = np.in1d(predict_for_item_ids, list(scores.keys())) 205 | 206 | items = predict_for_item_ids[mask] 207 | values = [scores[x] for x in items] 208 | predictions[mask] = values 209 | series = pd.Series(data=predictions, index=predict_for_item_ids) 210 | 211 | if self.normalize: 212 | series = series / series.max() 213 | 214 | return series 215 | 216 | def item_pop(self, sessions): 217 | ''' 218 | Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids) 219 | 220 | Parameters 221 | -------- 222 | sessions: set 223 | 224 | Returns 225 | -------- 226 | out : dict 227 | ''' 228 | result = dict() 229 | max_pop = 0 230 | for session, weight in sessions: 231 | items = self.items_for_session(session) 232 | for item in items: 233 | 234 | count = result.get(item) 235 | if count is None: 236 | result.update({item: 1}) 237 | else: 238 | result.update({item: count + 1}) 239 | 240 | if (result.get(item) > max_pop): 241 | max_pop = result.get(item) 242 | 243 | for key in result: 244 | result.update({key: (result[key] / max_pop)}) 245 | 246 | return result 247 | 248 | def jaccard(self, first, second): 249 | ''' 250 | Calculates the jaccard index for two sessions 251 | 252 | Parameters 253 | -------- 254 | first: Id of a session 255 | second: Id of a session 256 | 257 | Returns 258 | -------- 259 | out : float value 260 | ''' 261 | sc = time.clock() 262 | intersection = len(first & second) 263 | union = len(first | second) 264 | res = intersection / union 265 | 266 | self.sim_time += (time.clock() - sc) 267 | 268 | return res 269 | 270 | def cosine(self, first, second): 271 | ''' 272 | Calculates the cosine similarity for two sessions 273 | 274 | Parameters 275 | -------- 276 | first: Id of a session 277 | second: Id of a session 278 | 279 | Returns 280 | -------- 281 | out : float value 282 | ''' 283 | li = len(first & second) 284 | la = len(first) 285 | lb = len(second) 286 | result = li / sqrt(la) * sqrt(lb) 287 | 288 | return result 289 | 290 | def tanimoto(self, first, second): 291 | ''' 292 | Calculates the cosine tanimoto similarity for two sessions 293 | 294 | Parameters 295 | -------- 296 | first: Id of a session 297 | second: Id of a session 298 | 299 | Returns 300 | -------- 301 | out : float value 302 | ''' 303 | li = len(first & second) 304 | la = len(first) 305 | lb = len(second) 306 | result = li / (la + lb - li) 307 | 308 | return result 309 | 310 | def binary(self, first, second): 311 | ''' 312 | Calculates the ? 
for 2 sessions 313 | 314 | Parameters 315 | -------- 316 | first: Id of a session 317 | second: Id of a session 318 | 319 | Returns 320 | -------- 321 | out : float value 322 | ''' 323 | a = len(first & second) 324 | b = len(first) 325 | c = len(second) 326 | 327 | result = (2 * a) / ((2 * a) + b + c) 328 | 329 | return result 330 | 331 | def items_for_session(self, session): 332 | ''' 333 | Returns all items in the session 334 | 335 | Parameters 336 | -------- 337 | session: Id of a session 338 | 339 | Returns 340 | -------- 341 | out : set 342 | ''' 343 | return self.session_item_map.get(session); 344 | 345 | def sessions_for_item(self, item_id): 346 | ''' 347 | Returns all session for an item 348 | 349 | Parameters 350 | -------- 351 | item: Id of the item session 352 | 353 | Returns 354 | -------- 355 | out : set 356 | ''' 357 | return self.item_session_map.get(item_id) 358 | 359 | def most_recent_sessions(self, sessions, number): 360 | ''' 361 | Find the most recent sessions in the given set 362 | 363 | Parameters 364 | -------- 365 | sessions: set of session ids 366 | 367 | Returns 368 | -------- 369 | out : set 370 | ''' 371 | sample = set() 372 | 373 | tuples = list() 374 | for session in sessions: 375 | time = self.session_time.get(session) 376 | if time is None: 377 | print(' EMPTY TIMESTAMP!! ', session) 378 | tuples.append((session, time)) 379 | 380 | tuples = sorted(tuples, key=itemgetter(1), reverse=True) 381 | # print 'sorted list ', sortedList 382 | cnt = 0 383 | for element in tuples: 384 | cnt = cnt + 1 385 | if cnt > number: 386 | break 387 | sample.add(element[0]) 388 | # print 'returning sample of size ', len(sample) 389 | return sample 390 | 391 | def possible_neighbor_sessions(self, session_items, input_item_id, session_id): 392 | ''' 393 | Find a set of session to later on find neighbors in. 394 | A self.sample_size of 0 uses all sessions in which any item of the current session appears. 395 | self.sampling can be performed with the options "recent" or "random". 396 | "recent" selects the self.sample_size most recent sessions while "random" just choses randomly. 397 | 398 | Parameters 399 | -------- 400 | sessions: set of session ids 401 | 402 | Returns 403 | -------- 404 | out : set 405 | ''' 406 | 407 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id); 408 | 409 | if self.sample_size == 0: # use all session as possible neighbors 410 | 411 | print('!!!!! runnig KNN without a sample size (check config)') 412 | return self.relevant_sessions 413 | 414 | else: # sample some sessions 415 | 416 | if len(self.relevant_sessions) > self.sample_size: 417 | 418 | if self.sampling == 'recent': 419 | sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size) 420 | elif self.sampling == 'random': 421 | sample = random.sample(self.relevant_sessions, self.sample_size) 422 | else: 423 | sample = self.relevant_sessions[:self.sample_size] 424 | 425 | return sample 426 | else: 427 | return self.relevant_sessions 428 | 429 | def calc_similarity(self, session_items, sessions): 430 | ''' 431 | Calculates the configured similarity for the items in session_items and each session in sessions. 
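        As in the other session-KNN variants the measure is resolved by name via
        ``getattr``; the sequence-aware part of SeqSessionKNN is applied afterwards in
        ``score_items``, where each neighbor's similarity is scaled by the configured
        ``weighting`` decay (e.g. ``weighting='div'`` contributes 1 / step for a match
        found ``step`` positions back in the current session).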
432 | 433 | Parameters 434 | -------- 435 | session_items: set of item ids 436 | sessions: list of session ids 437 | 438 | Returns 439 | -------- 440 | out : list of tuple (session_id,similarity) 441 | ''' 442 | 443 | # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric 444 | neighbors = [] 445 | cnt = 0 446 | for session in sessions: 447 | cnt = cnt + 1 448 | # get items of the session, look up the cache first 449 | session_items_test = self.items_for_session(session) 450 | 451 | similarity = getattr(self, self.similarity)(session_items_test, session_items) 452 | if similarity > 0: 453 | neighbors.append((session, similarity)) 454 | 455 | return neighbors 456 | 457 | # ----------------- 458 | # Find a set of neighbors, returns a list of tuples (sessionid: similarity) 459 | # ----------------- 460 | def find_neighbors(self, session_items, input_item_id, session_id): 461 | ''' 462 | Finds the k nearest neighbors for the given session_id and the current item input_item_id. 463 | 464 | Parameters 465 | -------- 466 | session_items: set of item ids 467 | input_item_id: int 468 | session_id: int 469 | 470 | Returns 471 | -------- 472 | out : list of tuple (session_id, similarity) 473 | ''' 474 | possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id) 475 | possible_neighbors = self.calc_similarity(session_items, possible_neighbors) 476 | 477 | possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1]) 478 | possible_neighbors = possible_neighbors[:self.k] 479 | 480 | return possible_neighbors 481 | 482 | def score_items(self, neighbors, current_session): 483 | ''' 484 | Compute a set of scores for all items given a set of neighbors. 485 | 486 | Parameters 487 | -------- 488 | neighbors: set of session ids 489 | 490 | Returns 491 | -------- 492 | out : list of tuple (item, score) 493 | ''' 494 | # now we have the set of relevant items to make predictions 495 | scores = dict() 496 | # iterate over the sessions 497 | for session in neighbors: 498 | # get the items in this session 499 | items = self.items_for_session(session[0]) 500 | step = 1 501 | 502 | for item in reversed(current_session): 503 | if item in items: 504 | decay = getattr(self, self.weighting)(step) 505 | break 506 | step += 1 507 | 508 | for item in items: 509 | old_score = scores.get(item) 510 | similarity = session[1] 511 | 512 | if old_score is None: 513 | scores.update({item: (similarity * decay)}) 514 | else: 515 | new_score = old_score + (similarity * decay) 516 | scores.update({item: new_score}) 517 | 518 | return scores 519 | 520 | def linear(self, i): 521 | return 1 - (0.1 * i) if i <= 100 else 0 522 | 523 | def same(self, i): 524 | return 1 525 | 526 | def div(self, i): 527 | return 1 / i 528 | 529 | def log(self, i): 530 | return 1 / (log10(i + 1.7)) 531 | 532 | def quadratic(self, i): 533 | return 1 / (i * i) 534 | -------------------------------------------------------------------------------- /util/knn/vmsknn.py: -------------------------------------------------------------------------------- 1 | from _operator import itemgetter 2 | from math import sqrt 3 | import random 4 | import time 5 | from math import log10 6 | from datetime import datetime as dt 7 | from datetime import timedelta as td 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | 13 | class VMSessionKNN: 14 | ''' 15 | VMSessionKNN( k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', dwelling_time=False, last_n_days=None, 
last_n_clicks=None, extend=False, weighting_score='div_score', weighting_time=False, normalize=True, session_key = 'SessionId', item_key= 'ItemId', time_key= 'Time') 16 | 17 | Parameters 18 | ----------- 19 | k : int 20 | Number of neighboring session to calculate the item scores from. (Default value: 100) 21 | sample_size : int 22 | Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 500) 23 | sampling : string 24 | String to define the sampling method for sessions (recent, random). (default: recent) 25 | similarity : string 26 | String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard) 27 | weighting : string 28 | Decay function to determine the importance/weight of individual actions in the current session (linear, same, div, log, quadratic). (default: div) 29 | weighting_score : string 30 | Decay function to lower the score of candidate items from a neighboring sessions that were selected by less recently clicked items in the current session. (linear, same, div, log, quadratic). (default: div_score) 31 | weighting_time : boolean 32 | Experimental function to give less weight to items from older sessions (default: False) 33 | dwelling_time : boolean 34 | Experimental function to use the dwelling time for item view actions as a weight in the similarity calculation. (default: False) 35 | last_n_days : int 36 | Use only data from the last N days. (default: None) 37 | last_n_clicks : int 38 | Use only the last N clicks of the current session when recommending. (default: None) 39 | extend : bool 40 | Add evaluated sessions to the maps. 41 | normalize : bool 42 | Normalize the scores in the end. 43 | session_key : string 44 | Header of the session ID column in the input file. (default: 'SessionId') 45 | item_key : string 46 | Header of the item ID column in the input file. (default: 'ItemId') 47 | time_key : string 48 | Header of the timestamp column in the input file. (default: 'Time') 49 | ''' 50 | 51 | def __init__(self, k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', 52 | dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score', 53 | weighting_time=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'): 54 | 55 | self.k = k 56 | self.sample_size = sample_size 57 | self.sampling = sampling 58 | self.weighting = weighting 59 | self.dwelling_time = dwelling_time 60 | self.weighting_score = weighting_score 61 | self.weighting_time = weighting_time 62 | self.similarity = similarity 63 | self.session_key = session_key 64 | self.item_key = item_key 65 | self.time_key = time_key 66 | self.extend = extend 67 | self.normalize = normalize 68 | self.last_n_days = last_n_days 69 | self.last_n_clicks = last_n_clicks 70 | 71 | # updated while recommending 72 | self.session = -1 73 | self.session_items = [] 74 | self.relevant_sessions = set() 75 | 76 | # cache relations once at startup 77 | self.session_item_map = dict() 78 | self.item_session_map = dict() 79 | self.session_time = dict() 80 | self.min_time = -1 81 | 82 | self.sim_time = 0 83 | 84 | def fit(self, data, items=None): 85 | ''' 86 | Trains the predictor. 87 | 88 | Parameters 89 | -------- 90 | data: pandas.DataFrame 91 | Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). 92 | It must have a header. 
Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). 93 | 94 | ''' 95 | 96 | if self.last_n_days != None: 97 | 98 | max_time = dt.fromtimestamp(data[self.time_key].max()) 99 | date_threshold = max_time.date() - td(self.last_n_days) 100 | stamp = dt.combine(date_threshold, dt.min.time()).timestamp() 101 | train = data[data[self.time_key] >= stamp] 102 | 103 | else: 104 | train = data 105 | 106 | self.num_items = train[self.item_key].max() 107 | 108 | index_session = train.columns.get_loc(self.session_key) 109 | index_item = train.columns.get_loc(self.item_key) 110 | index_time = train.columns.get_loc(self.time_key) 111 | self.itemids = train[self.item_key].unique() 112 | 113 | session = -1 114 | session_items = set() 115 | time = -1 116 | # cnt = 0 117 | for row in train.itertuples(index=False): 118 | # cache items of sessions 119 | if row[index_session] != session: 120 | if len(session_items) > 0: 121 | self.session_item_map.update({session: session_items}) 122 | # cache the last time stamp of the session 123 | self.session_time.update({session: time}) 124 | if time < self.min_time: 125 | self.min_time = time 126 | session = row[index_session] 127 | session_items = set() 128 | time = row[index_time] 129 | session_items.add(row[index_item]) 130 | 131 | # cache sessions involving an item 132 | map_is = self.item_session_map.get(row[index_item]) 133 | if map_is is None: 134 | map_is = set() 135 | self.item_session_map.update({row[index_item]: map_is}) 136 | map_is.add(row[index_session]) 137 | 138 | # Add the last tuple 139 | self.session_item_map.update({session: session_items}) 140 | self.session_time.update({session: time}) 141 | 142 | def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0): 143 | ''' 144 | Gives predicton scores for a selected set of items on how likely they be the next item in the session. 145 | 146 | Parameters 147 | -------- 148 | session_id : int or string 149 | The session IDs of the event. 150 | input_item_id : int or string 151 | The item ID of the event. Must be in the set of item IDs of the training set. 152 | predict_for_item_ids : 1D array 153 | IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. 154 | 155 | Returns 156 | -------- 157 | out : pandas.Series 158 | Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. 
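
        Example
        --------
        A minimal sketch, assuming ``knn`` was created as
        ``VMSessionKNN(k=100, sample_size=500, dwelling_time=True)`` and already fitted
        on a training DataFrame with the default column names; timestamps are unix
        epochs so that dwelling times (and the optional time decay) can be derived::

            knn.predict_next(session_id=9, input_item_id='a', timestamp=1420000000)
            scores = knn.predict_next(session_id=9, input_item_id='b', timestamp=1420000060)
            scores.nlargest(10)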
159 | 160 | ''' 161 | 162 | # gc.collect() 163 | # process = psutil.Process(os.getpid()) 164 | # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used') 165 | 166 | if (self.session != session_id): # new session 167 | 168 | if (self.extend): 169 | item_set = set(self.session_items) 170 | self.session_item_map[self.session] = item_set; 171 | for item in item_set: 172 | map_is = self.item_session_map.get(item) 173 | if map_is is None: 174 | map_is = set() 175 | self.item_session_map.update({item: map_is}) 176 | map_is.add(self.session) 177 | 178 | ts = time.time() 179 | self.session_time.update({self.session: ts}) 180 | 181 | self.last_ts = -1 182 | self.session = session_id 183 | self.session_items = list() 184 | self.dwelling_times = list() 185 | self.relevant_sessions = set() 186 | 187 | if type == 'view': 188 | self.session_items.append(input_item_id) 189 | if self.dwelling_time: 190 | if self.last_ts > 0: 191 | self.dwelling_times.append(timestamp - self.last_ts) 192 | self.last_ts = timestamp 193 | 194 | if skip: 195 | return 196 | 197 | items = self.session_items if self.last_n_clicks is None else self.session_items[-self.last_n_clicks:] 198 | neighbors = self.find_neighbors(items, input_item_id, session_id, self.dwelling_times, timestamp) 199 | scores = self.score_items(neighbors, items, timestamp) 200 | 201 | # Create things in the format .. 202 | if predict_for_item_ids is None: 203 | predict_for_item_ids = self.itemids 204 | predictions = np.zeros(len(predict_for_item_ids)) 205 | mask = np.in1d(predict_for_item_ids, list(scores.keys())) 206 | 207 | items = predict_for_item_ids[mask] 208 | values = [scores[x] for x in items] 209 | predictions[mask] = values 210 | series = pd.Series(data=predictions, index=predict_for_item_ids) 211 | 212 | if self.normalize: 213 | series = series / series.max() 214 | 215 | return series 216 | 217 | def item_pop(self, sessions): 218 | ''' 219 | Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids) 220 | 221 | Parameters 222 | -------- 223 | sessions: set 224 | 225 | Returns 226 | -------- 227 | out : dict 228 | ''' 229 | result = dict() 230 | max_pop = 0 231 | for session, weight in sessions: 232 | items = self.items_for_session(session) 233 | for item in items: 234 | 235 | count = result.get(item) 236 | if count is None: 237 | result.update({item: 1}) 238 | else: 239 | result.update({item: count + 1}) 240 | 241 | if (result.get(item) > max_pop): 242 | max_pop = result.get(item) 243 | 244 | for key in result: 245 | result.update({key: (result[key] / max_pop)}) 246 | 247 | return result 248 | 249 | def jaccard(self, first, second): 250 | ''' 251 | Calculates the jaccard index for two sessions 252 | 253 | Parameters 254 | -------- 255 | first: Id of a session 256 | second: Id of a session 257 | 258 | Returns 259 | -------- 260 | out : float value 261 | ''' 262 | sc = time.clock() 263 | intersection = len(first & second) 264 | union = len(first | second) 265 | res = intersection / union 266 | 267 | self.sim_time += (time.clock() - sc) 268 | 269 | return res 270 | 271 | def cosine(self, first, second): 272 | ''' 273 | Calculates the cosine similarity for two sessions 274 | 275 | Parameters 276 | -------- 277 | first: Id of a session 278 | second: Id of a session 279 | 280 | Returns 281 | -------- 282 | out : float value 283 | ''' 284 | li = len(first & second) 285 | la = len(first) 286 | lb = len(second) 287 | result = li / sqrt(la) * sqrt(lb) 288 | 289 | return result 290 | 291 | def 
tanimoto(self, first, second): 292 | ''' 293 | Calculates the cosine tanimoto similarity for two sessions 294 | 295 | Parameters 296 | -------- 297 | first: Id of a session 298 | second: Id of a session 299 | 300 | Returns 301 | -------- 302 | out : float value 303 | ''' 304 | li = len(first & second) 305 | la = len(first) 306 | lb = len(second) 307 | result = li / (la + lb - li) 308 | 309 | return result 310 | 311 | def binary(self, first, second): 312 | ''' 313 | Calculates the ? for 2 sessions 314 | 315 | Parameters 316 | -------- 317 | first: Id of a session 318 | second: Id of a session 319 | 320 | Returns 321 | -------- 322 | out : float value 323 | ''' 324 | a = len(first & second) 325 | b = len(first) 326 | c = len(second) 327 | 328 | result = (2 * a) / ((2 * a) + b + c) 329 | 330 | return result 331 | 332 | def vec(self, first, second, map): 333 | ''' 334 | Calculates the ? for 2 sessions 335 | 336 | Parameters 337 | -------- 338 | first: Id of a session 339 | second: Id of a session 340 | 341 | Returns 342 | -------- 343 | out : float value 344 | ''' 345 | a = first & second 346 | sum = 0 347 | for i in a: 348 | sum += map[i] 349 | 350 | result = sum / len(map) 351 | 352 | return result 353 | 354 | def items_for_session(self, session): 355 | ''' 356 | Returns all items in the session 357 | 358 | Parameters 359 | -------- 360 | session: Id of a session 361 | 362 | Returns 363 | -------- 364 | out : set 365 | ''' 366 | return self.session_item_map.get(session); 367 | 368 | def vec_for_session(self, session): 369 | ''' 370 | Returns all items in the session 371 | 372 | Parameters 373 | -------- 374 | session: Id of a session 375 | 376 | Returns 377 | -------- 378 | out : set 379 | ''' 380 | return self.session_vec_map.get(session); 381 | 382 | def sessions_for_item(self, item_id): 383 | ''' 384 | Returns all session for an item 385 | 386 | Parameters 387 | -------- 388 | item: Id of the item session 389 | 390 | Returns 391 | -------- 392 | out : set 393 | ''' 394 | return self.item_session_map.get(item_id) if item_id in self.item_session_map else set() 395 | 396 | def most_recent_sessions(self, sessions, number): 397 | ''' 398 | Find the most recent sessions in the given set 399 | 400 | Parameters 401 | -------- 402 | sessions: set of session ids 403 | 404 | Returns 405 | -------- 406 | out : set 407 | ''' 408 | sample = set() 409 | 410 | tuples = list() 411 | for session in sessions: 412 | time = self.session_time.get(session) 413 | if time is None: 414 | print(' EMPTY TIMESTAMP!! ', session) 415 | tuples.append((session, time)) 416 | 417 | tuples = sorted(tuples, key=itemgetter(1), reverse=True) 418 | # print 'sorted list ', sortedList 419 | cnt = 0 420 | for element in tuples: 421 | cnt = cnt + 1 422 | if cnt > number: 423 | break 424 | sample.add(element[0]) 425 | # print 'returning sample of size ', len(sample) 426 | return sample 427 | 428 | def possible_neighbor_sessions(self, session_items, input_item_id, session_id): 429 | ''' 430 | Find a set of session to later on find neighbors in. 431 | A self.sample_size of 0 uses all sessions in which any item of the current session appears. 432 | self.sampling can be performed with the options "recent" or "random". 433 | "recent" selects the self.sample_size most recent sessions while "random" just choses randomly. 
434 | 435 | Parameters 436 | -------- 437 | sessions: set of session ids 438 | 439 | Returns 440 | -------- 441 | out : set 442 | ''' 443 | 444 | self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id) 445 | 446 | if self.sample_size == 0: # use all session as possible neighbors 447 | 448 | print('!!!!! runnig KNN without a sample size (check config)') 449 | return self.relevant_sessions 450 | 451 | else: # sample some sessions 452 | 453 | if len(self.relevant_sessions) > self.sample_size: 454 | 455 | if self.sampling == 'recent': 456 | sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size) 457 | elif self.sampling == 'random': 458 | sample = random.sample(self.relevant_sessions, self.sample_size) 459 | else: 460 | sample = self.relevant_sessions[:self.sample_size] 461 | 462 | return sample 463 | else: 464 | return self.relevant_sessions 465 | 466 | def calc_similarity(self, session_items, sessions, dwelling_times, timestamp): 467 | ''' 468 | Calculates the configured similarity for the items in session_items and each session in sessions. 469 | 470 | Parameters 471 | -------- 472 | session_items: set of item ids 473 | sessions: list of session ids 474 | 475 | Returns 476 | -------- 477 | out : list of tuple (session_id,similarity) 478 | ''' 479 | 480 | pos_map = {} 481 | length = len(session_items) 482 | 483 | count = 1 484 | for item in session_items: 485 | if self.weighting is not None: 486 | pos_map[item] = getattr(self, self.weighting)(count, length) 487 | count += 1 488 | else: 489 | pos_map[item] = 1 490 | 491 | dt = dwelling_times.copy() 492 | dt.append(0) 493 | dt = pd.Series(dt, index=session_items) 494 | dt = dt / dt.max() 495 | # dt[session_items[-1]] = dt.mean() if len(session_items) > 1 else 1 496 | dt[session_items[-1]] = 1 497 | 498 | if self.dwelling_time: 499 | # print(dt) 500 | for i in range(len(dt)): 501 | pos_map[session_items[i]] *= dt.iloc[i] 502 | # print(pos_map) 503 | # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric 504 | items = set(session_items) 505 | neighbors = [] 506 | cnt = 0 507 | for session in sessions: 508 | cnt = cnt + 1 509 | # get items of the session, look up the cache first 510 | n_items = self.items_for_session(session) 511 | sts = self.session_time[session] 512 | 513 | similarity = self.vec(items, n_items, pos_map) 514 | if similarity > 0: 515 | 516 | if self.weighting_time: 517 | diff = timestamp - sts 518 | days = round(diff / 60 / 60 / 24) 519 | decay = pow(7 / 8, days) 520 | similarity *= decay 521 | 522 | # print("days:",days," => ",decay) 523 | 524 | neighbors.append((session, similarity)) 525 | 526 | return neighbors 527 | 528 | # ----------------- 529 | # Find a set of neighbors, returns a list of tuples (sessionid: similarity) 530 | # ----------------- 531 | def find_neighbors(self, session_items, input_item_id, session_id, dwelling_times, timestamp): 532 | ''' 533 | Finds the k nearest neighbors for the given session_id and the current item input_item_id. 
534 | 535 | Parameters 536 | -------- 537 | session_items: set of item ids 538 | input_item_id: int 539 | session_id: int 540 | 541 | Returns 542 | -------- 543 | out : list of tuple (session_id, similarity) 544 | ''' 545 | possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id) 546 | possible_neighbors = self.calc_similarity(session_items, possible_neighbors, dwelling_times, timestamp) 547 | 548 | possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1]) 549 | possible_neighbors = possible_neighbors[:self.k] 550 | 551 | return possible_neighbors 552 | 553 | def score_items(self, neighbors, current_session, timestamp): 554 | ''' 555 | Compute a set of scores for all items given a set of neighbors. 556 | 557 | Parameters 558 | -------- 559 | neighbors: set of session ids 560 | 561 | Returns 562 | -------- 563 | out : list of tuple (item, score) 564 | ''' 565 | # now we have the set of relevant items to make predictions 566 | scores = dict() 567 | # iterate over the sessions 568 | for session in neighbors: 569 | # get the items in this session 570 | items = self.items_for_session(session[0]) 571 | step = 1 572 | 573 | for item in reversed(current_session): 574 | if item in items: 575 | decay = getattr(self, self.weighting_score)(step) 576 | break 577 | step += 1 578 | 579 | for item in items: 580 | old_score = scores.get(item) 581 | similarity = session[1] 582 | 583 | if old_score is None: 584 | scores.update({item: (similarity * decay)}) 585 | else: 586 | new_score = old_score + (similarity * decay) 587 | scores.update({item: new_score}) 588 | 589 | return scores 590 | 591 | def linear_score(self, i): 592 | return 1 - (0.1 * i) if i <= 100 else 0 593 | 594 | def same_score(self, i): 595 | return 1 596 | 597 | def div_score(self, i): 598 | return 1 / i 599 | 600 | def log_score(self, i): 601 | return 1 / (log10(i + 1.7)) 602 | 603 | def quadratic_score(self, i): 604 | return 1 / (i * i) 605 | 606 | def linear(self, i, length): 607 | return 1 - (0.1 * (length - i)) if i <= 10 else 0 608 | 609 | def same(self, i, length): 610 | return 1 611 | 612 | def div(self, i, length): 613 | return i / length 614 | 615 | def log(self, i, length): 616 | return 1 / (log10((length - i) + 1.7)) 617 | 618 | def quadratic(self, i, length): 619 | return (i / length) ** 2 620 | -------------------------------------------------------------------------------- /util/markov/Markov.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from functools import reduce 3 | 4 | import networkx as nx 5 | 6 | from util.tree.Tree import SmartTree 7 | 8 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') 9 | 10 | 11 | def add_nodes_to_graph(seqs, last_k): 12 | t = SmartTree() 13 | rootNode = t.set_root() 14 | 15 | countDict = {} 16 | G = nx.DiGraph() 17 | for s in seqs: 18 | nearHistory = tuple(s[-(last_k):]) 19 | if nearHistory in countDict: 20 | # increment count 21 | countDict[nearHistory] += 1 22 | else: 23 | # init count 24 | countDict[nearHistory] = 1 25 | # add seq to sequence tree 26 | t.add_path(rootNode, list(nearHistory)) 27 | # add node to graph 28 | G.add_node(nearHistory) 29 | 30 | ## i also have to save the sequence of length k+1 because otherwise I cannot calculate the count 31 | ## from state x to state y. 
So the seqeunces of length k+1 are in the tree but not in the states 32 | nearHistoryLong = tuple( 33 | s[-(last_k + 1):]) # +1 because I need one more element to calculate the transition prob 34 | if nearHistory != nearHistoryLong: # otherwise short seq are counted double 35 | if nearHistoryLong in countDict: 36 | # increment count 37 | countDict[nearHistoryLong] += 1 38 | else: 39 | # init count 40 | countDict[nearHistoryLong] = 1 41 | return (t, countDict, G) 42 | 43 | 44 | def add_edges(t, countDict, G, last_k): 45 | """ 46 | :param t: Tree of the sequnces available as states 47 | :param countDict: dicionary counting the occurence for each sequence 48 | :param G: the graph containing the states (each one is a sequence) 49 | :param last_k: the number of recent item considered 50 | :return: the same graph G, with edges connecting states 51 | """ 52 | # add links 53 | rootNode = t.get_root() 54 | for node in G.nodes_iter(): 55 | # if the sequence is shorter than states's len, the next state has all the sequence as prefix 56 | next_state_prefix = node[1:] if len(node) == last_k else node 57 | p = t.find_path(rootNode, next_state_prefix) 58 | if t.path_is_valid(p): 59 | children = t.get_nodes_tag(t[p].fpointer) 60 | for c in children: 61 | # the tree may suggest a children which is not a state of the graph, because it was part of a longer 62 | # sequence, in that case no edge has to be added 63 | if next_state_prefix + (c,) in G.nodes(): 64 | if countDict.get(node + (c,), 0) != 0: # do not add edge if count is 0 65 | G.add_edge(node, next_state_prefix + (c,), {'count': countDict.get(node + (c,), 0)}) 66 | return G 67 | 68 | 69 | def apply_skipping(G, last_k, seqs): 70 | # iterate over seqs to add skipping count 71 | window = last_k 72 | 73 | for us in seqs: 74 | s = tuple(us) 75 | for i in range(len(s) - window): 76 | previous_state = s[i:i + window] 77 | next_state_prefix = previous_state[1:] 78 | for j in range(i + window + 1, len(s)): 79 | fractional_count = 1 / (2 ** (j - (i + window))) 80 | next_state = next_state_prefix + (s[j],) 81 | # update count 82 | old_count = G.get_edge_data(previous_state, next_state, {}).get('count', 0) 83 | if G.has_edge(previous_state, next_state): 84 | G[previous_state][next_state]['count'] = old_count + fractional_count 85 | else: 86 | G.add_edge(previous_state, next_state, {'count': fractional_count}) 87 | # print('updating '+str(previous_state)+'->'+str(next_state)+' from '+str(old_count)+' to '+str(old_count+fractional_count)) 88 | 89 | # normalize 90 | for n in G.nodes_iter(): 91 | edges = G.out_edges(n) 92 | countSum = reduce(lambda x, y: x + y, [G[x[0]][x[1]]['count'] for x in edges], 0) 93 | for e in edges: 94 | G[e[0]][e[1]]['count'] = G[e[0]][e[1]]['count'] / float(countSum) if countSum else 0 95 | 96 | return G 97 | 98 | 99 | def apply_clustering(G): 100 | ##clustering 101 | def sequence_similarity(s, t): 102 | sum = 0 103 | for i in range(min(len(s), len(t))): 104 | sum += 0 if s[i] != t[i] else (i + 2) 105 | return sum 106 | 107 | similarity_dict = {} 108 | # for each state in the graph, calculate similarity 109 | for node in G.nodes_iter(): 110 | for deno in G.nodes_iter(): 111 | if node == deno or (node, deno) in similarity_dict: 112 | continue # skip if same or already done 113 | else: 114 | sim = sequence_similarity(node, deno) 115 | if sim: # save only if different from zero 116 | similarity_dict[node, deno] = similarity_dict[deno, node] = sim 117 | 118 | similarity_count_dict = {} 119 | 120 | for node in G.nodes_iter(): 121 | for deno in 
G.nodes_iter():
122 | if node == deno: continue
123 | sum = 0
124 | for in_edge in G.in_edges_iter([deno]):
125 | intermediate_node = in_edge[0]
126 | if intermediate_node != node: # only count the effect of going through other nodes
127 | sum += similarity_dict.get((node, intermediate_node), 0) * G[intermediate_node][deno]['count']
128 | if sum:
129 | similarity_count_dict[node, deno] = sum
130 | 
131 | def compute_normalization_similarity_count(G, node):
132 | normalization_sum = 0
133 | for other_state in G.nodes_iter():
134 | # no need to skip the node itself: self-pairs are never added to similarity_count_dict
135 | normalization_sum += similarity_count_dict.get((node, other_state), 0)
136 | return normalization_sum
137 | 
138 | ## update transition probabilities
139 | # this could be made faster by storing an adjacency matrix where nodes are connected whenever
140 | # there is a probability due to the clustering (i.e. there is an entry in similarity_count_dict);
141 | # that way only those edges would need to be checked. It is already reasonably optimized anyway.
142 | ALPHA = 0.5
143 | for node in G.nodes_iter():
144 | normalization_sum = compute_normalization_similarity_count(G, node)
145 | 
146 | # first, scale down the original transition probabilities by ALPHA
147 | for u, v in G.out_edges_iter([node]):
148 | G[u][v]['count'] *= ALPHA
149 | 
150 | # if there is similarity probability somewhere
151 | if normalization_sum:
152 | # add similarity probability
153 | for deno in G.nodes_iter():
154 | # skip if same node or there is nothing that can be added to that node
155 | if node == deno or similarity_count_dict.get((node, deno), 0) == 0: continue
156 | 
157 | partial_prob = (1 - ALPHA) * similarity_count_dict.get((node, deno), 0) / normalization_sum
158 | 
159 | if G.has_edge(node, deno):
160 | G[node][deno]['count'] += partial_prob
161 | elif partial_prob: # there wasn't an edge but now there is partial prob from other nodes
162 | G.add_edge(node, deno, {'count': partial_prob})
163 | 
164 | return G, similarity_dict, similarity_count_dict
165 | -------------------------------------------------------------------------------- /util/metrics.py: --------------------------------------------------------------------------------
1 | def precision(ground_truth, prediction):
2 | """
3 | Compute Precision metric
4 | :param ground_truth: the ground truth set or sequence
5 | :param prediction: the predicted set or sequence
6 | :return: the value of the metric
7 | """
8 | ground_truth = remove_duplicates(ground_truth)
9 | prediction = remove_duplicates(prediction)
10 | precision_score = count_a_in_b_unique(prediction, ground_truth) / float(len(prediction))
11 | assert 0 <= precision_score <= 1
12 | return precision_score
13 | 
14 | 
15 | def recall(ground_truth, prediction):
16 | """
17 | Compute Recall metric
18 | :param ground_truth: the ground truth set or sequence
19 | :param prediction: the predicted set or sequence
20 | :return: the value of the metric
21 | """
22 | ground_truth = remove_duplicates(ground_truth)
23 | prediction = remove_duplicates(prediction)
24 | recall_score = 0 if len(prediction) == 0 else count_a_in_b_unique(prediction, ground_truth) / float(
25 | len(ground_truth))
26 | assert 0 <= recall_score <= 1
27 | return recall_score
28 | 
29 | 
30 | def mrr(ground_truth, prediction):
31 | """
32 | Compute Mean Reciprocal Rank metric. Reciprocal Rank is set to 0 if no predicted item is contained in the ground truth.
33 | :param ground_truth: the ground truth set or sequence 34 | :param prediction: the predicted set or sequence 35 | :return: the value of the metric 36 | """ 37 | rr = 0. 38 | for rank, p in enumerate(prediction): 39 | if p in ground_truth: 40 | rr = 1. / (rank + 1) 41 | break 42 | return rr 43 | 44 | 45 | def count_a_in_b_unique(a, b): 46 | """ 47 | :param a: list of lists 48 | :param b: list of lists 49 | :return: number of elements of a in b 50 | """ 51 | count = 0 52 | for el in a: 53 | if el in b: 54 | count += 1 55 | return count 56 | 57 | 58 | def remove_duplicates(l): 59 | return [list(x) for x in set(tuple(x) for x in l)] 60 | -------------------------------------------------------------------------------- /util/rnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/rnn/__init__.py -------------------------------------------------------------------------------- /util/rnn/gpu_ops.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 10 14:17:58 2017 4 | 5 | @author: Balázs Hidasi 6 | """ 7 | 8 | import theano 9 | from theano import tensor as T 10 | 11 | def gpu_diag_wide(X): 12 | E = T.eye(*X.shape) 13 | return T.sum(X*E, axis=1) 14 | 15 | def gpu_diag_tall(X): 16 | E = T.eye(*X.shape) 17 | return T.sum(X*E, axis=0) -------------------------------------------------------------------------------- /util/split.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from scipy.sparse import find 4 | 5 | 6 | def random_holdout(dataset, perc=0.8, seed=1234): 7 | """ 8 | Split sequence dataset randomly 9 | :param dataset: the sequence dataset 10 | :param perc: the training percentange 11 | :param seed: the random seed 12 | :return: the training and test splits 13 | """ 14 | dataset = dataset.sample(frac=1, random_state=seed) 15 | nseqs = len(dataset) 16 | train_size = int(nseqs * perc) 17 | # split data according to the shuffled index and the holdout size 18 | train_split = dataset[:train_size] 19 | test_split = dataset[train_size:] 20 | 21 | return train_split, test_split 22 | 23 | 24 | def temporal_holdout(dataset, ts_threshold): 25 | """ 26 | Split sequence dataset using timestamps 27 | :param dataset: the sequence dataset 28 | :param ts_threshold: the timestamp from which test sequences will start 29 | :return: the training and test splits 30 | """ 31 | train = dataset.loc[dataset['ts'] < ts_threshold] 32 | test = dataset.loc[dataset['ts'] >= ts_threshold] 33 | train, test = clean_split(train, test) 34 | 35 | return train, test 36 | 37 | 38 | def last_session_out_split(data, 39 | user_key='user_id', 40 | session_key='session_id', 41 | time_key='ts'): 42 | """ 43 | Assign the last session of every user to the test set and the remaining ones to the training set 44 | """ 45 | sessions = data.sort_values(by=[user_key, time_key]).groupby(user_key)[session_key] 46 | last_session = sessions.last() 47 | train = data[~data.session_id.isin(last_session.values)].copy() 48 | test = data[data.session_id.isin(last_session.values)].copy() 49 | train, test = clean_split(train, test) 50 | return train, test 51 | 52 | 53 | def clean_split(train, test): 54 | """ 55 | Remove new items from the test set. 56 | :param train: The training set. 57 | :param test: The test set. 58 | :return: The cleaned training and test sets. 
59 | """ 60 | train_items = set() 61 | train['sequence'].apply(lambda seq: train_items.update(set(seq))) 62 | test['sequence'] = test['sequence'].apply(lambda seq: [it for it in seq if it in train_items]) 63 | return train, test 64 | 65 | 66 | def balance_dataset(x, y): 67 | number_of_elements = y.shape[0] 68 | nnz = set(find(y)[0]) 69 | zero = set(range(number_of_elements)).difference(nnz) 70 | 71 | max_samples = min(len(zero), len(nnz)) 72 | 73 | nnz_indices = random.sample(nnz, max_samples) 74 | zero_indeces = random.sample(zero, max_samples) 75 | indeces = nnz_indices + zero_indeces 76 | 77 | return x[indeces, :], y[indeces, :] 78 | -------------------------------------------------------------------------------- /util/tree/Tree.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import treelib 4 | 5 | 6 | class SmartTree(treelib.Tree): 7 | _PATH_NOT_FOUND = -1 8 | 9 | def find_path(self, origin, path): 10 | """ 11 | Takes the nodeId where to start the path search and the path to look for, 12 | :returns -1 if path not found, nodeId of the last node if path found 13 | """ 14 | 15 | if not path: 16 | # path found 17 | return origin 18 | 19 | res = self._PATH_NOT_FOUND 20 | 21 | for nodeId in self[origin].fpointer: 22 | node = self[nodeId] 23 | if node.tag == path[0]: 24 | res = self.find_path(nodeId, path[1:]) 25 | break 26 | 27 | if res is None: 28 | # path not found 29 | return self._PATH_NOT_FOUND 30 | else: 31 | return res 32 | 33 | def longest_subpath(self, origin, path): 34 | """ 35 | Takes the nodeId where to start the path search and the path to look for, 36 | :returns the nodeId of the node where the path is broken and the number of missing element for the complete path 37 | """ 38 | 39 | if not path: # path empty, all nodes matched 40 | # path found 41 | return origin, 0 42 | 43 | res = () 44 | 45 | for nodeId in self[origin].fpointer: 46 | node = self[nodeId] 47 | if node.tag == path[0]: 48 | res = self.longest_subpath(nodeId, path[1:]) 49 | break 50 | 51 | if res == (): 52 | # path not found 53 | return origin, len(path) 54 | else: 55 | return res 56 | 57 | def add_path(self, origin, path, support=None): 58 | """add a path, starting from origin""" 59 | sub = self.longest_subpath(origin, path) 60 | if sub[1] == 0: 61 | # path already exists, updating support 62 | self[sub[0]].data = {'support': support} 63 | 64 | else: 65 | # add what's missing 66 | missingPath = path[-sub[1]:] 67 | 68 | par = sub[0] 69 | for item in missingPath: 70 | itemId = uuid.uuid4() 71 | self.create_node(item, itemId, parent=par, data={'support': support}) 72 | par = itemId 73 | 74 | def path_is_valid(self, path): 75 | return path != self._PATH_NOT_FOUND 76 | 77 | def create_node(self, tag=None, identifier=None, parent=None, data=None): 78 | """override to get a random id if none provided""" 79 | id = uuid.uuid4() if identifier is None else identifier 80 | if id == self._PATH_NOT_FOUND: 81 | raise NameError("Cannot create a node with special id " + str(self._PATH_NOT_FOUND)) 82 | super(SmartTree, self).create_node(tag, id, parent, data) 83 | 84 | def set_root(self, root_tag=None, root_id=None): 85 | id = uuid.uuid4() 86 | root_id = root_id if root_id is not None else id 87 | root_tag = root_tag if root_tag is not None else 'root' 88 | self.create_node(root_tag, root_id) 89 | self.root = root_id 90 | return root_id 91 | 92 | def get_root(self): 93 | try: 94 | return self.root 95 | except AttributeError: 96 | return None 97 | 98 | def 
find_n_length_paths(self, origin, length, exclude_origin=True): 99 | 100 | if length == 0: 101 | return [[]] if exclude_origin else [[origin]] 102 | 103 | else: 104 | children = self[origin].fpointer 105 | paths = [] 106 | for c in children: 107 | children_paths = self.find_n_length_paths(c, length - 1, False) 108 | # this line is magic, if there are no children the all path gets lost, 109 | # that's how i get paths of exactly length wanted 110 | l = list(map(lambda x: [] + x, children_paths)) if exclude_origin else list( 111 | map(lambda x: [origin] + x, children_paths)) 112 | for el in l: 113 | paths.append(el) 114 | return paths 115 | 116 | def get_paths_tag(self, list_of_paths): 117 | return list(map(lambda x: self.get_nodes_tag(x), list_of_paths)) 118 | 119 | def get_nodes_tag(self, list_of_nids): 120 | return list(map(lambda y: self[y].tag, list_of_nids)) 121 | -------------------------------------------------------------------------------- /util/tree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mquad/sars_tutorial/b9cffab7d5a4dd4a9920ef6d755fd961a58a8b50/util/tree/__init__.py --------------------------------------------------------------------------------
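The SmartTree class listed above is the helper that util/markov/Markov.py uses to index the recent-item sequences that become Markov states (via add_path, and later find_path, path_is_valid and get_nodes_tag in add_edges). The following is a minimal usage sketch, not part of the repository; the integer item ids and the sample sequences are made up for illustration, while every method and import path comes from util/tree/Tree.py as shown above.

from util.tree.Tree import SmartTree

t = SmartTree()
root = t.set_root()

# index two short item sequences, as add_nodes_to_graph does with the last_k recent items
t.add_path(root, [1, 2, 3])
t.add_path(root, [1, 2, 4])

# exact lookup: find_path returns the node id of the last element, or -1 if the path is absent
node = t.find_path(root, [1, 2, 3])
print(t.path_is_valid(node))                        # True
print(t.path_is_valid(t.find_path(root, [2, 3])))   # False: paths are matched starting from the origin

# longest_subpath returns where the match breaks and how many elements are still missing
last_matched, missing = t.longest_subpath(root, [1, 2, 5])
print(missing)                                      # 1: only the trailing item 5 is not in the tree

# enumerate all paths of exactly length 2 below the root and map the node ids back to item tags
paths = t.find_n_length_paths(root, 2)
print(t.get_paths_tag(paths))                       # [[1, 2]]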