├── LICENSE.txt ├── README.md ├── conf └── base.conf ├── dat └── filmtrust │ ├── README.md │ ├── pro │ ├── network.tsv │ ├── test.tsv │ ├── train.tsv │ └── validation.tsv │ ├── process.py │ └── raw │ ├── ratings.txt │ ├── readme.txt │ └── trust.txt ├── scripts ├── README.md ├── adjust_amplification.py ├── aggregate_amp_results.py ├── aggregate_results.py ├── amplification_check.py ├── amplify_data.py ├── deamplify_data.py ├── get_time.py ├── process_data.py ├── process_data_Nusers.py ├── process_time_data.py ├── setup.sh ├── sim_data.sh ├── study.sh ├── to_librec_form.py ├── to_list_form.py └── to_sorec_list_form.py └── src ├── Makefile ├── data.cpp ├── data.h ├── eval.cpp ├── eval.h ├── librec.cpp ├── main.cpp ├── mf.cpp ├── model.h ├── popularity.cpp ├── random.cpp ├── spf.cpp ├── spf.h ├── utils.cpp └── utils.h /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2016 Allison J.B. Chaney 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | 23 | -- 24 | Taken on Mar 2, 2015 from http://opensource.org/licenses/MIT 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Social Poisson Factorization (SPF) 2 | 3 | (C) Copyright 2014-2016, Allison J.B. Chaney 4 | 5 | This software is distributed under the MIT license. See `LICENSE.txt` for details. 6 | 7 | #### Repository Contents 8 | - `conf` contains a base configuration file for running LibRec to do model comparisons 9 | - `scripts` bash and python scripts for data processing and running experiments 10 | - `src` C++ source code 11 | - `dat` example data 12 | - `README.md` this file 13 | 14 | 15 | ## Data 16 | The input format for data is tab-separated files with integer values: 17 | ``` 18 | user id item id rating 19 | ``` 20 | The ratings should be separated into training, testing, and validation data; `scripts/process_data.py` 21 | helps divide data into these different sets. This script also culls the user network such that only 22 | connections that have at least one item in common are included. 23 | ``` 24 | python process_data.py [ratings-file] [network-file] [output-dir] 25 | ``` 26 | 27 | Alternatively, data with time information (as shown below) can be processed with 28 | `process_time_data.py` which takes the same arguments as `process_data.py`. This 29 | will split the data according to time; ratings are implicit and therefore binary. 30 | ``` 31 | user id item id unix time 32 | ``` 33 | 34 | 35 | ## Running SPF 36 | 1. Clone the repo: 37 | `git clone https://github.com/ajbc/spf.git` 38 | 2. Navigate to the `spf/src` directory 39 | 3. 
Compile with `make` 40 | 4. Run the executable, e.g.: 41 | `./spf --data ~/my-data/ --out my-fit` 42 | 43 | #### SPF Options 44 | |Option|Arguments|Help|Default| 45 | |---|---|---|---| 46 | |help||print help information|| 47 | |verbose||print extra information while running|off| 48 | |out|dir|save directory, required|| 49 | |data|dir|data directory, required|| 50 | |svi||use stochastic VI (instead of batch VI)|off for < 10M ratings in training| 51 | |batch||use batch VI (instead of SVI)|on for < 10M ratings in training| 52 | |a_theta|a|shape hyperparameter to theta (user preferences)|0.3| 53 | |b_theta|b|rate hyperparameter to theta (user preferences)|0.3| 54 | |a_beta|a|shape hyperparameter to beta (item attributes)|0.3| 55 | |b_beta|b|rate hyperparameter to beta (item attributes)|0.3| 56 | |a_tau|a|shape hyperparameter to tau (user influence)|2| 57 | |b_tau|b|rate hyperparameter to tau (user influence)|5| 58 | |a_delta|a|shape hyperparameter to delta (item bias)|0.3| 59 | |b_delta|b|rate hyperparameter to delta (item bias)|0.3| 60 | |social-only||only consider social aspect of factorization (SF)|include factors| 61 | |factor-only||only consider general factors (no social; PF)|include social| 62 | |bias||include a bias term for each item|no bias| 63 | |binary||assume ratings are binary|integer| 64 | |directed||assume network is directed|undirected| 65 | |seed|seed|the random seed|time| 66 | |save_freq|f|the saving frequency. Negative value means no saving of intermediate results.|20| 67 | |eval_freq|f|the intermediate evaluating frequency. 
Negative means no evaluation of intermediate results.|-1| 68 | |conv_freq|f|the convergence check frequency|10| 69 | |max_iter|max|the max number of iterations|300| 70 | |min_iter|min|the min number of iterations|30| 71 | |converge|c|the change in rating log likelihood required for convergence|1e-6| 72 | |final_pass||do a final pass on all users and items|no final pass| 73 | |sample|sample_size|the stochastic sample size|1000| 74 | |svi_delay|tau|SVI delay >= 0 to down-weight early samples|1024| 75 | |svi_forget|kappa|SVI forgetting rate (0.5,1]|0.75| 76 | |K|K|the number of general factors|100| 77 | 78 | 79 | ## Running an Experiment 80 | 1. Download and compile code for comparison models: 81 | `cd scripts/; ./setup.sh; cd ..` 82 | 2. Kick off fits for multiple models with the script (from `scripts` directory): 83 | ``` 84 | ./study.sh [data-dir] [output-dir] [K] [directed/undirected] 85 | ``` 86 | -------------------------------------------------------------------------------- /conf/base.conf: -------------------------------------------------------------------------------- 1 | ################################################### General Settings ############################################# 2 | # dataset: item ratings and user social information 3 | #dataset.training.lins=/home/statler/achaney/spf/src/spf-mmm/c-revamp/librec/filmtrust-ratings.txt 4 | #dataset.social.lins=/home/statler/achaney/spf/src/spf-mmm/c-revamp/librec/filmtrust-trust.txt 5 | 6 | # in case you use separate testing files 7 | #dataset.testing.lins=/home/statler/achaney/spf/src/spf-mmm/c-revamp/librec/filmtrust-test.txt 8 | 9 | # Rating threshold to convert original rating values to binary ones (during data loading); 10 | # Note that not all (item) recommendation models require binary ratings; -1: disable conversion 11 | val.binary.threshold=-1 12 | 13 | # MISC 14 | is.verbose=on 15 | #is.verbose=off 16 | num.rand.seed=11 17 | is.prediction.out=on 18 | 19 | # Guava cache configuration 20 | 
guava.cache.spec=maximumSize=200,expireAfterAccess=2m 21 | 22 | # Email notification 23 | is.email.notify=off 24 | mail.smtp.host=smtp.gmail.com 25 | mail.smtp.port=465 26 | mail.smtp.auth=true 27 | mail.smtp.user=xxx@gmail.com 28 | mail.smtp.password=xxxx 29 | mail.to=xxx@email.address 30 | 31 | # validation priority: 1. cross-validation; 2. (training) ratio; 3. given n; 4. given ratio 32 | is.cross.validation=off 33 | is.parallel.folds=on 34 | num.kfold=5 35 | val.ratio=0.2 36 | num.given.n=5 37 | val.given.ratio=0.8 38 | 39 | # testing view of rating predictions: all/-1, cold-start; 40 | rating.pred.view=all 41 | 42 | # item recommendations 43 | is.ranking.pred=off 44 | is.diverse.used=off 45 | num.reclist.len=-1 46 | num.ignore.items=-1 47 | 48 | # baseline & exts: GlobalAvg, UserAvg, ItemAvg, Random, Constant, MostPop; NMF, SlopeOne, Hybrid, PD, AR, PRankD; 49 | # ranking: CLiMF, WRMF, BPR, GBPR, SBPR, RankALS, RankSGD, FISMrmse, FISMauc; 50 | # rating: UserKNN, ItemKNN, RegSVD, BiasedMF, PMF, BPMF, SVD++, SocialMF, RSTE, TrustMF, SoRec, SoReg, TrustSVD; 51 | #recommender=SoRec 52 | 53 | ################################################### Model-based Methods ########################################## 54 | # general parameters for matrix factorization 55 | #val.learn.rate=0.01 56 | max.learn.rate=-1 57 | 58 | val.reg.user=0.35 59 | val.reg.item=0.35 60 | val.reg.bias=0.01 61 | #val.reg.social=1 62 | 63 | #num.factors=10 64 | #num.max.iter=100 65 | val.momentum=0.8 66 | 67 | # learn rate update: first check if bold driver, then consider constant decay; if neither, keep unchanged. 68 | is.bold.driver=on 69 | is.undo.change=off 70 | val.decay.rate=-1 71 | 72 | # save learned model to files under the folder "Results/#algorithm#". 
73 | is.save.model=off 74 | 75 | ################################################### Memory-based Methods ######################################### 76 | # similarity method: PCC, COS, COS-Binary, MSD, CPC, exJaccard; -1 to disable shrinkage; Note: case insensitive 77 | similarity=PCC 78 | num.shrinkage=-1 79 | 80 | # neighborhood size for memory-based methods; -1 to use as many as possible. 81 | num.neighbors=50 82 | 83 | ################################################### Method-specific Settings ####################################### 84 | FISM.rho=100 85 | FISM.alpha=0.5 86 | 87 | GBPR.rho=0.8 88 | GBPR.group.size=5 89 | 90 | Hybrid.lambda=0.5 91 | 92 | PD.sigma=2.5 93 | PRankD.alpha=20 94 | 95 | RankALS.is.sw=on 96 | RSTE.alpha=0.4 97 | 98 | SLIM.reg.l1=1.0 99 | SLIM.reg.l2=5.0 100 | 101 | SoRec.reg.c=1 102 | SoRec.reg.z=0.001 103 | SoReg.beta=0.01 104 | 105 | # options: Tr, Te, T; 106 | TrustMF.model=T 107 | 108 | WRMF.alpha=1 109 | -------------------------------------------------------------------------------- /dat/filmtrust/README.md: -------------------------------------------------------------------------------- 1 | # FilmTrust example data 2 | 3 | The raw FilmTrust data was taken from the [LibRec website](http://www.librec.net/datasets.html#filmtrust). 
4 | 5 | ## Organization 6 | 7 | - `pro` processed form of the data 8 | - `raw` the original data 9 | - `process.py` the Python script used to process the data from raw format to the form used by SPF, including dividing it into train, test, and validation sets 10 | -------------------------------------------------------------------------------- /dat/filmtrust/pro/network.tsv: -------------------------------------------------------------------------------- 1 | 1094 1459 2 | 88 862 3 | 269 962 4 | 1208 227 5 | 1439 428 6 | 782 79 7 | 584 1415 8 | 509 334 9 | 146 264 10 | 188 312 11 | 1153 332 12 | 285 702 13 | 273 222 14 | 1196 883 15 | 1360 1212 16 | 483 38 17 | 452 628 18 | 1159 628 19 | 428 423 20 | 965 141 21 | 1468 372 22 | 261 1504 23 | 26 1179 24 | 436 406 25 | 509 1128 26 | 628 120 27 | 60 733 28 | 1228 606 29 | 898 1288 30 | 1196 428 31 | 188 1398 32 | 1112 825 33 | 210 6 34 | 546 931 35 | 1078 509 36 | 568 1420 37 | 433 154 38 | 632 1304 39 | 298 410 40 | 411 169 41 | 892 550 42 | 509 892 43 | 282 355 44 | 267 509 45 | 931 489 46 | 1125 1205 47 | 1366 1342 48 | 1187 777 49 | 1088 285 50 | 282 782 51 | 489 931 52 | 272 1111 53 | 188 436 54 | 1024 606 55 | 1360 524 56 | 146 23 57 | 812 1204 58 | 616 694 59 | 999 312 60 | 969 298 61 | 628 617 62 | 410 89 63 | 1398 29 64 | 298 509 65 | 1424 1272 66 | 1064 825 67 | 485 873 68 | 628 1398 69 | 825 1064 70 | 918 223 71 | 578 410 72 | 1149 849 73 | 1415 516 74 | 1223 593 75 | 1355 146 76 | 387 1039 77 | 353 537 78 | 452 782 79 | 319 1059 80 | 433 420 81 | 1379 35 82 | 1202 1423 83 | 672 1272 84 | 468 170 85 | 1479 1182 86 | 824 13 87 | 490 265 88 | 1094 741 89 | 1014 702 90 | 1019 695 91 | 509 617 92 | 478 406 93 | 355 863 94 | 1187 489 95 | 232 477 96 | 285 739 97 | 397 489 98 | 1411 1403 99 | 616 397 100 | 769 433 101 | 618 1217 102 | 1229 320 103 | 256 1505 104 | 1074 983 105 | 1065 319 106 | 353 445 107 | 873 1062 108 | 640 1217 109 | 969 509 110 | 965 58 111 | 509 238 112 | 1057 607 113 | 1220 509 114 | 938 402 115 
| 188 628 116 | 29 568 117 | 1249 1153 118 | 485 1271 119 | 1398 546 120 | 506 739 121 | 1212 481 122 | 1187 965 123 | 1244 115 124 | 499 433 125 | 1338 1157 126 | 399 1299 127 | 759 1208 128 | 215 1200 129 | 1173 1333 130 | 508 272 131 | 716 282 132 | 509 628 133 | 420 433 134 | 918 965 135 | 1094 857 136 | 898 433 137 | 702 300 138 | 1041 1418 139 | 825 812 140 | 628 362 141 | 1398 298 142 | 671 528 143 | 725 406 144 | 843 84 145 | 1147 892 146 | 842 618 147 | 716 1159 148 | 1346 1406 149 | 617 509 150 | 1157 193 151 | 938 508 152 | 1147 1398 153 | 499 584 154 | 120 509 155 | 1228 1024 156 | 1202 403 157 | 999 188 158 | 240 683 159 | 1494 1039 160 | 738 116 161 | 1140 490 162 | 1398 509 163 | 188 397 164 | 859 1088 165 | 95 1232 166 | 1348 716 167 | 340 1033 168 | 546 1187 169 | 488 764 170 | 629 628 171 | 300 795 172 | 979 1168 173 | 1468 1288 174 | 423 106 175 | 317 857 176 | 1178 484 177 | 503 327 178 | 773 1041 179 | 1269 774 180 | 182 61 181 | 578 315 182 | 999 628 183 | 310 300 184 | 808 629 185 | 1329 724 186 | 489 223 187 | 716 1398 188 | 300 161 189 | 1333 91 190 | 711 716 191 | 986 509 192 | 36 1229 193 | 1402 740 194 | 47 1033 195 | 618 640 196 | 1504 188 197 | 791 1041 198 | 1168 830 199 | 452 282 200 | 1415 165 201 | 537 353 202 | 1196 448 203 | 905 188 204 | 188 1137 205 | 628 452 206 | 1273 905 207 | 509 436 208 | 420 782 209 | 1142 741 210 | 666 546 211 | 1019 485 212 | 433 1018 213 | 1288 516 214 | 1398 957 215 | 378 771 216 | 628 999 217 | 1466 1417 218 | 231 873 219 | 1202 1167 220 | 228 1168 221 | 423 428 222 | 1288 1468 223 | 1147 188 224 | 735 1015 225 | 1458 663 226 | 1165 1249 227 | 380 1041 228 | 242 1159 229 | 544 838 230 | 417 96 231 | 163 1435 232 | 61 263 233 | 814 319 234 | 584 1018 235 | 965 1187 236 | 477 232 237 | 593 1223 238 | 227 196 239 | 1149 837 240 | 491 395 241 | 516 165 242 | 1147 403 243 | 774 1269 244 | 154 1288 245 | 892 321 246 | 509 580 247 | 1288 165 248 | 824 1201 249 | 1142 718 250 | 1147 1488 251 | 1327 552 252 | 
1432 1398 253 | 165 516 254 | 298 1355 255 | 1416 478 256 | 618 1149 257 | 941 1162 258 | 716 1504 259 | 1398 716 260 | 242 1435 261 | 490 1102 262 | 397 509 263 | 129 298 264 | 1435 188 265 | 29 892 266 | 1278 986 267 | 1344 114 268 | 1202 112 269 | 716 1348 270 | 15 883 271 | 628 432 272 | 222 315 273 | 313 509 274 | 282 628 275 | 16 509 276 | 436 509 277 | 847 989 278 | 1149 473 279 | 807 1347 280 | 263 61 281 | 989 402 282 | 422 1249 283 | 725 1416 284 | 436 1413 285 | 1329 1360 286 | 282 716 287 | 970 234 288 | 853 319 289 | 200 507 290 | 716 188 291 | 313 188 292 | 114 1344 293 | 546 489 294 | 1002 298 295 | 38 483 296 | 616 509 297 | 804 702 298 | 380 339 299 | 272 300 300 | 1202 1420 301 | 29 298 302 | 1333 877 303 | 782 764 304 | 436 918 305 | 842 945 306 | 1469 979 307 | 26 60 308 | 433 1388 309 | 433 861 310 | 1094 329 311 | 397 1368 312 | 223 1398 313 | 407 623 314 | 1392 1336 315 | 898 154 316 | 1249 79 317 | 1195 151 318 | 278 509 319 | 1288 898 320 | 298 1159 321 | 301 16 322 | 489 240 323 | 353 1056 324 | 426 739 325 | 298 188 326 | 782 863 327 | 535 445 328 | 95 1292 329 | 193 1157 330 | 432 628 331 | 1249 1072 332 | 261 1342 333 | 578 134 334 | 1111 272 335 | 165 499 336 | 516 1018 337 | 938 782 338 | 989 188 339 | 628 364 340 | 95 1252 341 | 1494 524 342 | 1048 195 343 | 60 26 344 | 853 782 345 | 80 1187 346 | 29 1398 347 | 564 810 348 | 452 312 349 | 842 215 350 | 509 1508 351 | 1458 448 352 | 1187 931 353 | 1249 315 354 | 1159 242 355 | 847 865 356 | 1080 1154 357 | 1371 319 358 | 965 188 359 | 508 441 360 | 509 80 361 | 36 320 362 | 683 240 363 | 1504 716 364 | 249 984 365 | 341 509 366 | 842 79 367 | 849 300 368 | 1168 979 369 | 89 509 370 | 1413 683 371 | 739 1249 372 | 1205 1149 373 | 1481 509 374 | 694 616 375 | 914 1183 376 | 863 355 377 | 1469 1492 378 | 725 580 379 | 1014 658 380 | 1048 395 381 | 509 986 382 | 1157 509 383 | 403 897 384 | 80 965 385 | 1395 302 386 | 199 1043 387 | 1187 1200 388 | 1410 1411 389 | 618 509 390 | 965 29 391 
| 278 812 392 | 1406 509 393 | 1249 422 394 | 989 312 395 | 702 484 396 | 499 165 397 | 648 911 398 | 1332 995 399 | 508 938 400 | 161 272 401 | 1416 436 402 | 739 863 403 | 1466 396 404 | 776 649 405 | 739 355 406 | 191 1284 407 | 317 969 408 | 568 452 409 | 659 312 410 | 1504 166 411 | 201 395 412 | 725 711 413 | 905 1273 414 | 790 909 415 | 683 546 416 | 162 1398 417 | 269 125 418 | 355 79 419 | 1137 188 420 | 1379 1377 421 | 1420 452 422 | 782 1348 423 | 1174 63 424 | 402 938 425 | 918 436 426 | 546 1202 427 | 892 969 428 | 892 1398 429 | 1507 1288 430 | 1278 114 431 | 430 1334 432 | 1220 626 433 | 1200 1187 434 | 929 805 435 | 310 1192 436 | 618 837 437 | 965 298 438 | 550 892 439 | 1398 528 440 | 825 1033 441 | 188 402 442 | 1204 812 443 | 452 1202 444 | 1212 562 445 | 1415 109 446 | 6 1192 447 | 1223 1243 448 | 79 863 449 | 1488 1147 450 | 509 546 451 | 762 1202 452 | 509 317 453 | 509 1427 454 | 110 1491 455 | 222 578 456 | 154 361 457 | 804 739 458 | 931 509 459 | 1249 362 460 | 29 509 461 | 1160 312 462 | 320 1299 463 | 782 402 464 | 395 188 465 | 1081 476 466 | 397 694 467 | 716 628 468 | 361 516 469 | 966 2 470 | 307 905 471 | 628 1192 472 | 1191 1327 473 | 313 716 474 | 578 938 475 | 27 74 476 | 1154 1061 477 | 992 1149 478 | 312 426 479 | 1435 242 480 | 1342 261 481 | 1123 638 482 | 738 553 483 | 509 616 484 | 1147 66 485 | 298 969 486 | 1398 436 487 | 240 489 488 | 395 628 489 | 938 578 490 | 618 1125 491 | 1249 428 492 | 1478 285 493 | 420 282 494 | 361 1288 495 | 1187 509 496 | 1020 71 497 | 467 164 498 | 1179 26 499 | 1398 160 500 | 188 478 501 | 906 999 502 | 509 1463 503 | 191 810 504 | 485 702 505 | 1444 1159 506 | 718 1168 507 | 1420 1202 508 | 1325 1355 509 | 95 96 510 | 584 499 511 | 640 618 512 | 188 1420 513 | 1014 355 514 | 863 79 515 | 79 1153 516 | 1147 80 517 | 618 1205 518 | 969 892 519 | 1147 628 520 | 35 1377 521 | 1065 764 522 | 298 1147 523 | 1508 1187 524 | 546 965 525 | 516 1415 526 | 436 1398 527 | 222 782 528 | 63 1274 529 | 
161 1298 530 | 129 1414 531 | 319 188 532 | 782 938 533 | 1249 1165 534 | 1208 759 535 | 1466 938 536 | 1212 524 537 | 1017 94 538 | 361 1507 539 | 509 394 540 | 63 409 541 | 965 919 542 | 782 1192 543 | 983 1074 544 | 188 491 545 | 433 769 546 | 436 188 547 | 256 554 548 | 553 116 549 | 80 509 550 | 154 898 551 | 1001 682 552 | 552 188 553 | 1278 1344 554 | 1417 1466 555 | 433 636 556 | 1015 928 557 | 1360 1329 558 | 918 546 559 | 29 969 560 | 285 938 561 | 129 89 562 | 165 752 563 | 702 782 564 | 298 1398 565 | 1056 445 566 | 315 739 567 | 1262 1360 568 | 1443 433 569 | 918 240 570 | 1398 965 571 | 79 188 572 | 716 989 573 | 1469 1498 574 | 397 1060 575 | 1427 509 576 | 509 1206 577 | 628 716 578 | 725 215 579 | 285 1398 580 | 182 263 581 | 807 906 582 | 1415 433 583 | 29 1061 584 | 1276 883 585 | 319 764 586 | 508 433 587 | 161 825 588 | 1232 95 589 | 777 1187 590 | 490 1140 591 | 948 147 592 | 748 1040 593 | 1223 1166 594 | 1094 274 595 | 685 393 596 | 12 234 597 | 95 1021 598 | 1398 1432 599 | 66 1147 600 | 1212 226 601 | 509 1392 602 | 1508 509 603 | 516 499 604 | 795 300 605 | 739 1153 606 | 812 825 607 | 452 1420 608 | 1504 1342 609 | 1131 272 610 | 1336 946 611 | 1415 499 612 | 883 849 613 | 313 903 614 | 1165 188 615 | 1187 348 616 | 969 965 617 | 773 905 618 | 1094 253 619 | 1094 1500 620 | 716 312 621 | 1140 224 622 | 965 546 623 | 752 177 624 | 1269 175 625 | 1125 618 626 | 1398 1147 627 | 1334 430 628 | 661 1354 629 | 535 353 630 | 546 240 631 | 481 676 632 | 188 552 633 | 282 1355 634 | 1142 1500 635 | 1056 353 636 | 509 298 637 | 1289 509 638 | 300 702 639 | 1149 739 640 | 1176 1398 641 | 165 361 642 | 188 1350 643 | 1359 1387 644 | 1212 676 645 | 1072 79 646 | 1065 1178 647 | 1171 1170 648 | 406 478 649 | 188 591 650 | 1295 425 651 | 670 628 652 | 1125 640 653 | 361 1468 654 | 312 1348 655 | 345 554 656 | 716 436 657 | 420 134 658 | 284 1174 659 | 782 315 660 | 1299 399 661 | 227 1208 662 | 273 938 663 | 963 535 664 | 1135 1235 665 | 568 319 666 | 
628 428 667 | 1187 223 668 | 1056 537 669 | 162 1034 670 | 188 1504 671 | 1147 223 672 | 188 716 673 | 949 911 674 | 1024 1228 675 | 716 426 676 | 1507 433 677 | 220 825 678 | 546 188 679 | 1243 1223 680 | 312 282 681 | 957 1398 682 | 448 196 683 | 201 1197 684 | 1153 315 685 | 361 154 686 | 312 999 687 | 1094 32 688 | 1149 640 689 | 1056 535 690 | 502 1187 691 | 644 164 692 | 1202 762 693 | 99 712 694 | 546 1398 695 | 1321 147 696 | 1034 162 697 | 739 79 698 | 969 1187 699 | 1178 1065 700 | 436 160 701 | 1199 754 702 | 1157 1194 703 | 546 628 704 | 782 134 705 | 629 1159 706 | 427 1104 707 | 433 584 708 | 509 1033 709 | 35 637 710 | 1104 509 711 | 433 752 712 | 1398 1176 713 | 1398 1357 714 | 353 535 715 | 227 448 716 | 764 938 717 | 1149 1125 718 | 1000 623 719 | 965 1311 720 | 188 905 721 | 1018 584 722 | 659 272 723 | 436 546 724 | 1413 436 725 | 1072 1249 726 | 227 457 727 | 965 918 728 | 900 458 729 | 1200 215 730 | 1018 1415 731 | 752 433 732 | 1094 159 733 | 1065 464 734 | 509 1187 735 | 1464 883 736 | 1002 683 737 | 826 96 738 | 452 568 739 | 80 683 740 | 528 671 741 | 397 868 742 | 477 577 743 | 552 1327 744 | 1493 490 745 | 244 1078 746 | 1205 1217 747 | 961 624 748 | 1481 188 749 | 1153 426 750 | 1159 298 751 | 298 29 752 | 983 147 753 | 1034 842 754 | 1249 426 755 | 509 1220 756 | 1398 223 757 | 546 403 758 | 725 616 759 | 670 702 760 | 1479 188 761 | 60 775 762 | 1187 1398 763 | 83 208 764 | 315 188 765 | 892 188 766 | 188 166 767 | 509 1388 768 | 629 362 769 | 830 1168 770 | 426 1153 771 | 1350 188 772 | 632 737 773 | 445 537 774 | 484 702 775 | 509 658 776 | 1412 1075 777 | 38 795 778 | 1402 578 779 | 1101 707 780 | 863 782 781 | 428 1439 782 | 1018 516 783 | 862 88 784 | 489 777 785 | 1468 1507 786 | 312 782 787 | 978 732 788 | 813 867 789 | 116 738 790 | 1162 1223 791 | 1106 188 792 | 1333 1208 793 | 712 99 794 | 824 1000 795 | 661 1344 796 | 314 918 797 | 426 716 798 | 616 617 799 | 79 315 800 | 1101 1091 801 | 1013 1371 802 | 94 1495 803 | 628 
683 804 | 1178 428 805 | 1292 324 806 | 1406 51 807 | 165 1288 808 | 1165 1348 809 | 949 648 810 | 509 114 811 | 640 1205 812 | 842 653 813 | 240 546 814 | 1192 938 815 | 238 509 816 | 167 628 817 | 1202 1357 818 | 1437 1039 819 | 1149 509 820 | 222 273 821 | 272 508 822 | 509 694 823 | 989 716 824 | 1301 93 825 | 263 182 826 | 546 123 827 | 883 1033 828 | 445 1056 829 | 825 161 830 | 1359 873 831 | 764 402 832 | 1149 705 833 | 509 1289 834 | 1014 580 835 | 1388 433 836 | 1398 1187 837 | 319 1213 838 | 282 1249 839 | 645 1103 840 | 1187 80 841 | 1333 681 842 | 1187 628 843 | 782 312 844 | 666 812 845 | 568 313 846 | 402 188 847 | 261 188 848 | 1387 825 849 | 1333 1079 850 | 509 29 851 | 166 188 852 | 1420 568 853 | 1330 1223 854 | 1149 1205 855 | 89 628 856 | 109 499 857 | 188 307 858 | 1348 999 859 | 965 80 860 | 1371 1013 861 | 1292 95 862 | 1366 1504 863 | 1039 231 864 | 620 938 865 | 269 927 866 | 1507 361 867 | 509 1149 868 | 716 1147 869 | 79 739 870 | 1062 1495 871 | 1002 80 872 | 29 1147 873 | 319 678 874 | 1463 509 875 | 837 618 876 | 628 436 877 | 478 1416 878 | 574 509 879 | 1298 489 880 | 509 1157 881 | 1187 502 882 | 892 29 883 | 89 403 884 | 509 341 885 | 222 938 886 | 188 782 887 | 842 1125 888 | 2 966 889 | 1202 546 890 | 1406 1346 891 | 1333 29 892 | 535 1056 893 | 1217 618 894 | 79 782 895 | 975 965 896 | 1040 493 897 | 782 188 898 | 402 312 899 | 188 150 900 | 509 223 901 | 863 282 902 | 188 79 903 | 1491 110 904 | 898 752 905 | 282 1302 906 | 1065 508 907 | 628 1159 908 | 1157 1338 909 | 313 628 910 | 629 1355 911 | 427 313 912 | 782 509 913 | 89 298 914 | 436 716 915 | 918 123 916 | 499 1415 917 | 499 1018 918 | 478 546 919 | 1196 509 920 | 120 628 921 | 433 1415 922 | 1327 188 923 | 307 631 924 | 90 109 925 | 810 191 926 | 1508 938 927 | 1021 95 928 | 1232 1505 929 | 628 167 930 | 524 1212 931 | 640 1149 932 | 591 188 933 | 1078 1114 934 | 509 1336 935 | 969 1147 936 | 568 29 937 | 965 1398 938 | 38 272 939 | 426 863 940 | 759 307 941 | 227 
345 942 | 1094 696 943 | 300 272 944 | 1187 240 945 | 965 892 946 | 867 813 947 | 1039 387 948 | 670 546 949 | 272 571 950 | 897 403 951 | 628 6 952 | 188 1435 953 | 728 298 954 | 168 1216 955 | 867 628 956 | 228 718 957 | 857 256 958 | 307 1273 959 | 380 379 960 | 716 509 961 | 1178 877 962 | 568 188 963 | 281 718 964 | 1206 509 965 | 509 965 966 | 537 535 967 | 464 1065 968 | 965 975 969 | 508 1065 970 | 509 267 971 | 436 29 972 | 402 989 973 | 628 188 974 | 764 488 975 | 29 965 976 | 426 79 977 | 165 584 978 | 1014 503 979 | 965 1147 980 | 272 1180 981 | 154 433 982 | 1361 188 983 | 564 62 984 | 1022 188 985 | 1402 464 986 | 355 315 987 | 769 361 988 | 472 509 989 | 772 210 990 | 1255 999 991 | 1019 1232 992 | 89 129 993 | 1104 313 994 | 805 929 995 | 313 892 996 | 1147 509 997 | 355 1014 998 | 1147 362 999 | 1187 294 1000 | 1200 240 1001 | 1275 532 1002 | 1165 355 1003 | 489 1187 1004 | 782 999 1005 | 509 278 1006 | 716 402 1007 | 1327 1191 1008 | 426 1249 1009 | 857 1149 1010 | 564 1003 1011 | 490 1493 1012 | 509 120 1013 | 201 319 1014 | 188 1479 1015 | 1415 584 1016 | 1162 1330 1017 | 223 1147 1018 | 1103 645 1019 | 1252 1089 1020 | 683 80 1021 | 319 201 1022 | 1043 199 1023 | 752 1507 1024 | 509 1398 1025 | 298 129 1026 | 892 313 1027 | 1415 1018 1028 | 528 957 1029 | 546 436 1030 | 89 1287 1031 | 514 1207 1032 | 489 509 1033 | 897 595 1034 | 1507 1468 1035 | 1288 284 1036 | 1014 416 1037 | 1153 739 1038 | 812 278 1039 | 790 448 1040 | 282 863 1041 | 146 1355 1042 | 406 436 1043 | 164 644 1044 | 631 307 1045 | 285 1481 1046 | 436 628 1047 | 1306 1500 1048 | 1018 752 1049 | 842 546 1050 | 174 872 1051 | 533 336 1052 | 1255 1147 1053 | 655 93 1054 | 509 472 1055 | 79 1072 1056 | 741 680 1057 | 489 546 1058 | 86 43 1059 | 361 165 1060 | 490 883 1061 | 1159 1444 1062 | 1147 965 1063 | 773 272 1064 | 223 489 1065 | 478 436 1066 | 452 791 1067 | 1212 1248 1068 | 1494 1360 1069 | 231 1039 1070 | 165 433 1071 | 306 1212 1072 | 842 1034 1073 | 1187 496 1074 | 51 
1406 1075 | 403 595 1076 | 857 473 1077 | 969 29 1078 | 534 167 1079 | 300 310 1080 | 240 777 1081 | 499 516 1082 | 1202 452 1083 | 1223 1330 1084 | 1298 161 1085 | 684 1269 1086 | 725 436 1087 | 628 670 1088 | 1033 764 1089 | 300 849 1090 | 282 452 1091 | 782 355 1092 | 825 188 1093 | 1160 782 1094 | 938 702 1095 | 188 1022 1096 | 1459 1094 1097 | 1187 215 1098 | 481 1212 1099 | 425 1295 1100 | 410 578 1101 | 984 249 1102 | 1197 201 1103 | 1333 1168 1104 | 317 509 1105 | 628 509 1106 | 29 188 1107 | 15 1239 1108 | 1249 282 1109 | 1159 1147 1110 | 61 182 1111 | 1149 876 1112 | 85 1055 1113 | 1371 222 1114 | 863 426 1115 | 396 1466 1116 | 493 1040 1117 | 556 319 1118 | 1342 1366 1119 | 965 509 1120 | 628 89 1121 | 1289 628 1122 | 546 683 1123 | 1333 509 1124 | 1282 320 1125 | 436 1416 1126 | 307 188 1127 | 395 1355 1128 | 1196 986 1129 | 725 478 1130 | 63 1174 1131 | 79 842 1132 | 188 495 1133 | 1142 1205 1134 | 1088 859 1135 | 1272 345 1136 | 537 1056 1137 | 109 1415 1138 | 1495 1062 1139 | 1355 1325 1140 | 485 1225 1141 | 433 1288 1142 | 364 628 1143 | 496 1187 1144 | 1217 1205 1145 | 782 634 1146 | 509 931 1147 | 863 739 1148 | 1202 628 1149 | 1192 6 1150 | 355 739 1151 | 584 165 1152 | 1137 452 1153 | 509 956 1154 | 188 509 1155 | 812 666 1156 | 617 616 1157 | 282 188 1158 | 509 1368 1159 | 739 506 1160 | 1458 554 1161 | 1357 1398 1162 | 1094 295 1163 | 892 509 1164 | 397 580 1165 | 1014 509 1166 | 320 36 1167 | 16 301 1168 | 1153 1249 1169 | 315 1153 1170 | 320 1229 1171 | 838 544 1172 | 129 1160 1173 | 1333 845 1174 | 312 188 1175 | 256 857 1176 | 969 1398 1177 | 1307 626 1178 | 1147 716 1179 | 947 985 1180 | 1298 1500 1181 | 509 1060 1182 | 938 764 1183 | 433 508 1184 | 240 1187 1185 | 716 1350 1186 | 1403 1411 1187 | 445 535 1188 | 637 35 1189 | 584 433 1190 | 892 1159 1191 | 628 1147 1192 | 535 537 1193 | 29 436 1194 | 857 1369 1195 | 80 628 1196 | 1061 29 1197 | 606 1024 1198 | 315 79 1199 | 485 633 1200 | 1398 188 1201 | 1187 546 1202 | 676 1212 1203 | 
862 489 1204 | 188 1481 1205 | 562 1212 1206 | 1178 1357 1207 | 355 1165 1208 | 1281 272 1209 | 509 1078 1210 | 114 509 1211 | 188 1361 1212 | 1398 628 1213 | 345 94 1214 | 208 83 1215 | 928 1015 1216 | 1060 509 1217 | 718 228 1218 | 1288 433 1219 | 79 426 1220 | 397 616 1221 | 771 378 1222 | 1147 29 1223 | 188 965 1224 | 1149 1042 1225 | 1157 323 1226 | 537 445 1227 | 223 509 1228 | 853 764 1229 | 679 1441 1230 | 315 863 1231 | 1018 499 1232 | 320 1282 1233 | 892 965 1234 | 489 397 1235 | 312 402 1236 | 931 1187 1237 | 1173 91 1238 | 402 782 1239 | 452 1182 1240 | 1125 1149 1241 | 509 1104 1242 | 467 1158 1243 | 1229 1348 1244 | 328 337 1245 | 240 918 1246 | 1039 873 1247 | 825 272 1248 | 1201 1193 1249 | 188 29 1250 | 79 355 1251 | 217 1242 1252 | 1504 261 1253 | 196 448 1254 | 509 574 1255 | 298 89 1256 | 478 918 1257 | 546 653 1258 | 1039 1437 1259 | 509 313 1260 | 628 1423 1261 | 1153 79 1262 | 735 928 1263 | 655 1301 1264 | 1232 1335 1265 | 116 553 1266 | 938 1508 1267 | 546 478 1268 | 1226 147 1269 | 1180 272 1270 | 1342 1504 1271 | 739 426 1272 | 420 578 1273 | 1252 95 1274 | 739 315 1275 | 306 226 1276 | 509 489 1277 | 716 711 1278 | 509 1147 1279 | 188 1106 1280 | 1479 792 1281 | 509 188 1282 | 1418 1041 1283 | 956 509 1284 | 1392 509 1285 | 428 1196 1286 | 433 1507 1287 | 969 436 1288 | 509 1481 1289 | 485 1019 1290 | 1154 1080 1291 | 457 294 1292 | 632 1428 1293 | 361 898 1294 | 911 949 1295 | 1147 1159 1296 | 1220 1335 1297 | 336 533 1298 | 978 92 1299 | 862 1307 1300 | 1232 1019 1301 | 89 410 1302 | 433 1397 1303 | 452 1479 1304 | 149 343 1305 | 284 1288 1306 | 999 1348 1307 | 1149 1217 1308 | 752 1018 1309 | 509 618 1310 | 1469 384 1311 | 1147 298 1312 | 1149 471 1313 | 1018 433 1314 | 564 706 1315 | 452 1137 1316 | 58 965 1317 | 1168 718 1318 | 1002 898 1319 | 402 716 1320 | 938 999 1321 | 564 480 1322 | 215 1187 1323 | 509 397 1324 | 1507 752 1325 | 629 428 1326 | 593 1166 1327 | 315 355 1328 | 135 436 1329 | 546 509 1330 | 433 1383 1331 | 436 725 
1332 | 1042 1149 1333 | 420 542 1334 | 1361 591 1335 | 1183 914 1336 | 546 223 1337 | 141 965 1338 | 188 1327 1339 | 1128 509 1340 | 1288 1507 1341 | 1147 312 1342 | 1207 514 1343 | 782 6 1344 | 1269 684 1345 | 1065 315 1346 | 445 353 1347 | 1014 1417 1348 | 892 1147 1349 | 1355 629 1350 | 509 89 1351 | 539 502 1352 | 1278 1505 1353 | 165 1415 1354 | 436 478 1355 | 1288 372 1356 | 612 909 1357 | 99 188 1358 | 1424 1242 1359 | 436 135 1360 | 313 436 1361 | 298 160 1362 | 1480 1360 1363 | 546 1147 1364 | 616 1434 1365 | 898 361 1366 | 1369 857 1367 | 361 752 1368 | 1104 628 1369 | 1288 806 1370 | 1336 509 1371 | 1187 123 1372 | 672 1360 1373 | 546 716 1374 | 733 1456 1375 | 1015 42 1376 | 969 188 1377 | 702 938 1378 | 1333 616 1379 | 938 273 1380 | 1149 618 1381 | 774 684 1382 | 433 499 1383 | 1355 395 1384 | 521 598 1385 | 188 99 1386 | 528 1398 1387 | 853 938 1388 | 1288 154 1389 | 441 508 1390 | 1249 791 1391 | 1288 361 1392 | 938 853 1393 | 298 436 1394 | 1223 1162 1395 | 1406 4 1396 | 1173 877 1397 | 1195 1169 1398 | 791 1137 1399 | 1149 382 1400 | 96 95 1401 | 1147 546 1402 | 824 61 1403 | 313 568 1404 | 695 1019 1405 | 420 315 1406 | 524 1360 1407 | 1178 1132 1408 | 313 29 1409 | 1289 320 1410 | 1168 228 1411 | 1388 509 1412 | 892 745 1413 | 485 392 1414 | 649 776 1415 | 499 177 1416 | 1249 739 1417 | 911 648 1418 | 79 1249 1419 | 161 300 1420 | 1212 1360 1421 | 546 918 1422 | 74 27 1423 | 918 478 1424 | 298 965 1425 | 546 777 1426 | 345 1272 1427 | 989 865 1428 | 1348 312 1429 | 272 1131 1430 | 188 989 1431 | 397 188 1432 | 623 776 1433 | 402 764 1434 | 698 1201 1435 | 168 630 1436 | 776 1457 1437 | 1147 969 1438 | -------------------------------------------------------------------------------- /dat/filmtrust/pro/validation.tsv: -------------------------------------------------------------------------------- 1 | 1 1 2 2 | 3 84 3 3 | 19 17 4 4 | 21 251 4 5 | 23 8 3 6 | 26 214 0 7 | 27 252 1 8 | 29 288 3 9 | 30 84 4 10 | 33 257 3 11 | 35 310 4 12 | 37 257 3 13 
| 38 213 0 14 | 39 9 4 15 | 51 318 2 16 | 56 84 4 17 | 71 10 3 18 | 72 245 4 19 | 79 483 1 20 | 79 439 1 21 | 81 13 2 22 | 82 12 3 23 | 83 13 4 24 | 88 10 2 25 | 90 215 2 26 | 96 241 2 27 | 99 213 4 28 | 105 17 3 29 | 126 9 4 30 | 127 1 1 31 | 128 248 1 32 | 133 5 3 33 | 137 210 2 34 | 138 249 3 35 | 144 207 3 36 | 150 661 3 37 | 156 217 3 38 | 158 11 4 39 | 161 236 3 40 | 163 217 3 41 | 164 773 3 42 | 167 246 3 43 | 168 3 3 44 | 169 3 2 45 | 171 8 4 46 | 174 234 2 47 | 176 250 4 48 | 181 10 4 49 | 188 490 2 50 | 190 9 3 51 | 192 17 3 52 | 196 11 3 53 | 199 518 1 54 | 199 861 1 55 | 199 891 3 56 | 202 206 3 57 | 206 234 3 58 | 223 210 1 59 | 225 219 3 60 | 243 216 2 61 | 245 205 2 62 | 250 17 2 63 | 256 2 3 64 | 259 3 2 65 | 262 13 4 66 | 265 253 3 67 | 271 220 4 68 | 272 659 3 69 | 272 739 3 70 | 272 11 3 71 | 276 251 4 72 | 285 7 3 73 | 291 6 3 74 | 293 239 3 75 | 295 255 3 76 | 308 1115 4 77 | 314 787 3 78 | 319 234 2 79 | 320 2 3 80 | 323 805 3 81 | 330 248 4 82 | 335 249 2 83 | 340 7 2 84 | 349 217 3 85 | 352 213 3 86 | 353 205 2 87 | 355 491 3 88 | 359 219 4 89 | 365 3 2 90 | 367 8 3 91 | 368 1 2 92 | 374 215 4 93 | 381 2 2 94 | 384 256 2 95 | 385 215 2 96 | 386 121 3 97 | 389 11 4 98 | 391 4 3 99 | 393 216 3 100 | 397 251 4 101 | 398 207 1 102 | 408 211 4 103 | 411 241 4 104 | 412 239 3 105 | 413 236 3 106 | 420 250 2 107 | 424 254 1 108 | 425 236 4 109 | 429 236 4 110 | 432 2 4 111 | 433 1262 4 112 | 437 250 3 113 | 442 239 3 114 | 448 843 2 115 | 452 6 3 116 | 454 1 2 117 | 457 234 2 118 | 467 12 4 119 | 468 8 1 120 | 477 234 1 121 | 485 214 3 122 | 489 211 3 123 | 491 236 4 124 | 494 217 2 125 | 500 211 4 126 | 502 211 3 127 | 504 4 3 128 | 505 17 3 129 | 506 7 3 130 | 507 4 4 131 | 509 1326 1 132 | 513 236 3 133 | 514 239 3 134 | 517 254 3 135 | 523 215 1 136 | 526 1 2 137 | 528 391 4 138 | 533 250 3 139 | 536 220 1 140 | 537 207 3 141 | 539 251 4 142 | 546 251 3 143 | 547 210 0 144 | 548 5 1 145 | 550 7 2 146 | 557 246 3 147 | 559 212 0 148 | 564 13 3 
149 | 566 212 3 150 | 567 235 3 151 | 571 216 3 152 | 576 220 4 153 | 578 143 3 154 | 579 7 3 155 | 587 638 2 156 | 591 592 2 157 | 601 241 3 158 | 606 211 1 159 | 607 7 4 160 | 613 245 3 161 | 617 1 4 162 | 618 4 3 163 | 622 8 3 164 | 625 247 4 165 | 626 1388 4 166 | 629 239 2 167 | 631 121 2 168 | 633 251 3 169 | 645 241 2 170 | 650 620 4 171 | 651 205 4 172 | 657 121 2 173 | 661 239 2 174 | 662 8 4 175 | 663 11 4 176 | 666 211 1 177 | 670 57 3 178 | 670 236 4 179 | 674 220 4 180 | 675 11 4 181 | 678 12 1 182 | 681 236 3 183 | 684 11 3 184 | 685 84 0 185 | 686 10 4 186 | 702 1491 3 187 | 702 1449 4 188 | 718 235 2 189 | 719 8 2 190 | 723 2 4 191 | 725 595 4 192 | 726 205 3 193 | 729 239 1 194 | 730 84 3 195 | 734 247 3 196 | 735 6 4 197 | 745 207 3 198 | 748 17 3 199 | 756 84 3 200 | 760 4 3 201 | 764 484 4 202 | 770 236 3 203 | 778 3 1 204 | 782 233 3 205 | 790 236 2 206 | 797 251 3 207 | 812 1544 4 208 | 815 236 0 209 | 821 256 4 210 | 825 752 4 211 | 825 242 4 212 | 830 144 4 213 | 835 7 4 214 | 836 216 3 215 | 838 207 4 216 | 839 805 4 217 | 842 415 4 218 | 845 379 3 219 | 847 662 4 220 | 849 17 1 221 | 851 234 2 222 | 852 257 2 223 | 859 9 3 224 | 862 215 4 225 | 863 253 4 226 | 864 551 2 227 | 867 683 3 228 | 878 255 1 229 | 879 1 3 230 | 896 219 4 231 | 899 257 2 232 | 900 217 2 233 | 908 245 2 234 | 909 5 4 235 | 914 251 3 236 | 917 7 3 237 | 918 4 3 238 | 920 233 2 239 | 927 13 3 240 | 932 10 3 241 | 944 234 3 242 | 946 234 2 243 | 950 257 2 244 | 958 214 3 245 | 965 7 3 246 | 968 9 2 247 | 969 955 1 248 | 969 1687 3 249 | 971 216 2 250 | 977 7 3 251 | 981 216 2 252 | 984 207 4 253 | 995 121 3 254 | 998 1778 1 255 | 1000 11 2 256 | 1008 205 4 257 | 1009 17 4 258 | 1013 234 4 259 | 1017 1601 3 260 | 1019 347 3 261 | 1025 257 3 262 | 1039 240 2 263 | 1047 233 4 264 | 1047 245 3 265 | 1060 1221 4 266 | 1061 3 3 267 | 1062 236 4 268 | 1067 213 3 269 | 1067 214 3 270 | 1089 8 3 271 | 1104 649 3 272 | 1104 647 3 273 | 1105 253 3 274 | 1109 207 2 275 | 1111 583 
"""Split the raw FilmTrust data into train / test / validation sets.

Usage:
    python process.py [raw-dir] [output-dir]

Reads ``ratings.txt`` (user, item, rating) and ``trust.txt`` (user, friend,
value) from ``raw-dir`` and writes ``train.tsv``, ``test.tsv``,
``validation.tsv`` and ``network.tsv`` into ``output-dir``.  Each user's
ratings are shuffled and divided independently, and only trust links whose
two users rated at least one item in common are kept in ``network.tsv``.
"""
import sys
import scipy.io  # NOTE(review): nothing in this script appears to use scipy
from collections import defaultdict
import os
from os.path import join, exists
import random


### split math

# Relative weights of the three splits; main() normalises them to fractions.
TRAIN_WEIGHT = 89
TEST_WEIGHT = 10
VALID_WEIGHT = 1


### read in everything

def read_ratings(path):
    """Parse a whitespace-separated ratings file into {user: {item: rating}}.

    Every field is parsed as a float and truncated to an int.  Blank lines
    are skipped; any other line that does not hold exactly three fields
    raises ValueError.
    """
    user_ratings = defaultdict(dict)
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue
            user, item, rating = [int(float(x)) for x in fields]
            user_ratings[user][item] = rating
    return user_ratings


def read_network(path):
    """Parse a whitespace-separated trust file into a set of (user, friend).

    The third column (trust value) is read but not kept.  Blank lines are
    skipped.
    """
    network = set()
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue
            user, friend, _value = [int(x) for x in fields]
            network.add((user, friend))
    return network


### write out everything

def main(raw_dir, output_dir):
    """Run the whole split and return the (train, test, validation) counts.

    raw_dir    -- directory holding ratings.txt and trust.txt
    output_dir -- directory for the four .tsv files; created if missing
    """
    total = float(TRAIN_WEIGHT + TEST_WEIGHT + VALID_WEIGHT)
    train = TRAIN_WEIGHT / total
    test = TEST_WEIGHT / total
    valid = VALID_WEIGHT / total

    print (train, test, valid)

    random.seed(11)

    user_ratings = read_ratings(join(raw_dir, "ratings.txt"))
    network = read_network(join(raw_dir, "trust.txt"))

    # Items rated by each user, used to filter the trust network.
    rated_items = dict((user, set(items))
                       for user, items in user_ratings.items())

    if not exists(output_dir):
        # makedirs (not mkdir) so that nested output paths also work
        os.makedirs(output_dir)

    a = 0  # ratings written to train
    b = 0  # ratings written to test
    c = 0  # ratings written to validation
    with open(join(output_dir, "train.tsv"), 'w') as train_file, \
            open(join(output_dir, "validation.tsv"), 'w') as valid_file, \
            open(join(output_dir, "test.tsv"), 'w') as test_file, \
            open(join(output_dir, "network.tsv"), 'w') as network_file:
        for user in user_ratings:
            # items() is a non-shufflable view on Python 3; copy to a list
            ratings = list(user_ratings[user].items())
            random.shuffle(ratings)
            R = len(ratings)
            test_cut = test * R
            valid_cut = (test + valid) * R
            for i, (item, rating) in enumerate(ratings):
                r = i
                # A rating whose index straddles a cut-off is assigned at
                # random, so fractional quotas are honoured in expectation.
                if (r < test_cut and not (r + 1) < test_cut) or \
                        (r < valid_cut and not (r + 1) < valid_cut):
                    r += random.random()

                if r < test_cut:
                    test_file.write("%d\t%d\t%d\n" % (user, item, rating))
                    b += 1
                elif r < valid_cut:
                    valid_file.write("%d\t%d\t%d\n" % (user, item, rating))
                    c += 1
                else:
                    train_file.write("%d\t%d\t%d\n" % (user, item, rating))
                    a += 1

        # .get() keeps users without ratings from being added to the mapping
        no_items = frozenset()
        for user, friend in network:
            if rated_items.get(user, no_items) & rated_items.get(friend, no_items):
                network_file.write("%d\t%d\n" % (user, friend))

    total = float(a + b + c)
    if total:
        # skipped when there were no ratings, to avoid dividing by zero
        print (a/total, b/total, c/total)
    return a, b, c


### command line args

if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: python process.py [raw-dir] [output-dir]")
    main(sys.argv[1], sys.argv[2])
61 263 1 68 | 63 1524 1 69 | 63 1174 1 70 | 63 409 1 71 | 63 1274 1 72 | 1525 1327 1 73 | 1525 552 1 74 | 66 1147 1 75 | 74 27 1 76 | 78 1526 1 77 | 78 291 1 78 | 79 1153 1 79 | 79 739 1 80 | 79 1072 1 81 | 79 1249 1 82 | 79 188 1 83 | 79 315 1 84 | 79 426 1 85 | 79 355 1 86 | 79 863 1 87 | 79 842 1 88 | 79 782 1 89 | 79 282 1 90 | 1527 94 1 91 | 80 509 1 92 | 80 683 1 93 | 80 1187 1 94 | 80 628 1 95 | 80 965 1 96 | 80 1528 1 97 | 83 208 1 98 | 85 1055 1 99 | 86 43 1 100 | 1529 271 1 101 | 88 862 1 102 | 89 129 1 103 | 89 403 1 104 | 89 509 1 105 | 89 1530 1 106 | 89 410 1 107 | 89 298 1 108 | 89 1287 1 109 | 89 628 1 110 | 90 109 1 111 | 1531 1532 1 112 | 94 1495 1 113 | 95 1252 1 114 | 95 1533 1 115 | 95 1292 1 116 | 95 1232 1 117 | 95 1021 1 118 | 95 96 1 119 | 96 95 1 120 | 99 188 1 121 | 99 712 1 122 | 103 1505 1 123 | 103 368 1 124 | 109 1415 1 125 | 109 499 1 126 | 110 1491 1 127 | 114 509 1 128 | 114 1344 1 129 | 116 738 1 130 | 116 553 1 131 | 120 509 1 132 | 120 628 1 133 | 1534 316 1 134 | 129 298 1 135 | 129 1160 1 136 | 129 1414 1 137 | 129 29 1 138 | 129 89 1 139 | 135 436 1 140 | 1532 80 1 141 | 1532 188 1 142 | 1532 241 1 143 | 1532 1481 1 144 | 1532 892 1 145 | 1535 417 1 146 | 1535 826 1 147 | 141 965 1 148 | 143 514 1 149 | 143 1207 1 150 | 1509 230 1 151 | 146 23 1 152 | 146 264 1 153 | 146 1355 1 154 | 149 343 1 155 | 154 1288 1 156 | 154 898 1 157 | 154 372 1 158 | 154 1331 1 159 | 154 361 1 160 | 154 433 1 161 | 154 806 1 162 | 1536 272 1 163 | 161 300 1 164 | 161 825 1 165 | 161 272 1 166 | 161 1298 1 167 | 162 1398 1 168 | 162 1034 1 169 | 163 1435 1 170 | 163 1537 1 171 | 164 1538 1 172 | 164 644 1 173 | 165 584 1 174 | 165 499 1 175 | 165 1415 1 176 | 165 752 1 177 | 165 361 1 178 | 165 516 1 179 | 165 433 1 180 | 165 1288 1 181 | 166 188 1 182 | 167 628 1 183 | 1539 425 1 184 | 168 1216 1 185 | 168 630 1 186 | 171 1540 1 187 | 173 1541 1 188 | 174 872 1 189 | 1542 1327 1 190 | 182 263 1 191 | 182 61 1 192 | 188 1479 1 193 | 188 221 1 194 
| 188 495 1 195 | 188 1022 1 196 | 188 509 1 197 | 188 905 1 198 | 188 1420 1 199 | 188 716 1 200 | 188 591 1 201 | 188 312 1 202 | 188 965 1 203 | 188 166 1 204 | 188 1398 1 205 | 188 1504 1 206 | 188 99 1 207 | 188 1361 1 208 | 188 402 1 209 | 188 989 1 210 | 188 478 1 211 | 188 29 1 212 | 188 782 1 213 | 188 1350 1 214 | 188 1327 1 215 | 188 552 1 216 | 188 150 1 217 | 188 491 1 218 | 188 1106 1 219 | 188 397 1 220 | 188 79 1 221 | 188 628 1 222 | 188 1532 1 223 | 188 1137 1 224 | 188 1481 1 225 | 188 307 1 226 | 188 1435 1 227 | 188 436 1 228 | 191 1284 1 229 | 191 810 1 230 | 192 300 1 231 | 193 1157 1 232 | 196 448 1 233 | 199 1043 1 234 | 200 507 1 235 | 200 1543 1 236 | 201 1197 1 237 | 201 395 1 238 | 201 319 1 239 | 208 83 1 240 | 1544 969 1 241 | 210 6 1 242 | 213 1057 1 243 | 215 1200 1 244 | 215 1187 1 245 | 217 1242 1 246 | 220 825 1 247 | 221 188 1 248 | 221 1420 1 249 | 222 938 1 250 | 222 578 1 251 | 222 315 1 252 | 222 273 1 253 | 222 782 1 254 | 223 509 1 255 | 223 1147 1 256 | 223 489 1 257 | 223 1398 1 258 | 223 636 1 259 | 227 1208 1 260 | 227 448 1 261 | 227 1545 1 262 | 227 457 1 263 | 227 345 1 264 | 227 196 1 265 | 228 718 1 266 | 228 1168 1 267 | 231 873 1 268 | 231 1039 1 269 | 232 477 1 270 | 238 509 1 271 | 240 1187 1 272 | 240 777 1 273 | 240 918 1 274 | 240 489 1 275 | 240 546 1 276 | 240 683 1 277 | 241 1532 1 278 | 242 1435 1 279 | 242 1159 1 280 | 244 1078 1 281 | 244 1546 1 282 | 249 984 1 283 | 254 298 1 284 | 256 1505 1 285 | 256 103 1 286 | 256 554 1 287 | 256 857 1 288 | 1547 1299 1 289 | 1547 320 1 290 | 261 1342 1 291 | 261 188 1 292 | 261 1504 1 293 | 263 182 1 294 | 263 61 1 295 | 267 1518 1 296 | 267 509 1 297 | 268 1548 1 298 | 268 436 1 299 | 268 1417 1 300 | 269 962 1 301 | 269 927 1 302 | 269 125 1 303 | 272 1536 1 304 | 272 1111 1 305 | 272 1131 1 306 | 272 300 1 307 | 272 571 1 308 | 272 1541 1 309 | 272 1180 1 310 | 272 508 1 311 | 273 938 1 312 | 273 222 1 313 | 278 812 1 314 | 278 509 1 315 | 281 718 1 316 | 282 
1249 1 317 | 282 79 1 318 | 282 628 1 319 | 282 355 1 320 | 282 315 1 321 | 282 6 1 322 | 282 1355 1 323 | 282 188 1 324 | 282 739 1 325 | 282 782 1 326 | 282 1549 1 327 | 282 1302 1 328 | 282 863 1 329 | 282 452 1 330 | 282 716 1 331 | 282 1192 1 332 | 284 1288 1 333 | 284 1174 1 334 | 285 702 1 335 | 285 938 1 336 | 285 739 1 337 | 285 1398 1 338 | 285 1481 1 339 | 291 78 1 340 | 297 509 1 341 | 298 129 1 342 | 298 436 1 343 | 298 965 1 344 | 298 1550 1 345 | 298 509 1 346 | 298 969 1 347 | 298 1551 1 348 | 298 1159 1 349 | 298 254 1 350 | 298 188 1 351 | 298 160 1 352 | 298 89 1 353 | 298 1147 1 354 | 298 29 1 355 | 298 410 1 356 | 298 1398 1 357 | 298 1355 1 358 | 300 272 1 359 | 300 1466 1 360 | 300 161 1 361 | 300 849 1 362 | 300 795 1 363 | 300 310 1 364 | 300 764 1 365 | 300 702 1 366 | 300 192 1 367 | 300 938 1 368 | 301 16 1 369 | 304 798 1 370 | 305 683 1 371 | 306 226 1 372 | 306 1212 1 373 | 307 905 1 374 | 307 188 1 375 | 307 1273 1 376 | 307 631 1 377 | 310 300 1 378 | 310 1192 1 379 | 312 782 1 380 | 312 999 1 381 | 312 402 1 382 | 312 188 1 383 | 312 1348 1 384 | 312 282 1 385 | 312 426 1 386 | 312 858 1 387 | 313 29 1 388 | 313 716 1 389 | 313 892 1 390 | 313 903 1 391 | 313 568 1 392 | 313 509 1 393 | 313 188 1 394 | 313 628 1 395 | 313 436 1 396 | 314 918 1 397 | 315 79 1 398 | 315 188 1 399 | 315 1549 1 400 | 315 355 1 401 | 315 1153 1 402 | 315 739 1 403 | 315 863 1 404 | 316 1534 1 405 | 317 969 1 406 | 317 509 1 407 | 317 857 1 408 | 319 764 1 409 | 319 1552 1 410 | 319 201 1 411 | 319 188 1 412 | 319 1059 1 413 | 319 1213 1 414 | 319 678 1 415 | 320 1299 1 416 | 320 1282 1 417 | 320 1229 1 418 | 320 36 1 419 | 322 1553 1 420 | 328 337 1 421 | 336 533 1 422 | 340 1033 1 423 | 341 509 1 424 | 345 1272 1 425 | 345 554 1 426 | 345 94 1 427 | 353 445 1 428 | 353 535 1 429 | 353 1056 1 430 | 353 537 1 431 | 1545 452 1 432 | 355 863 1 433 | 355 1014 1 434 | 355 1165 1 435 | 355 739 1 436 | 355 315 1 437 | 355 79 1 438 | 361 806 1 439 | 361 1331 1 
440 | 361 1468 1 441 | 361 752 1 442 | 361 516 1 443 | 361 433 1 444 | 361 1288 1 445 | 361 165 1 446 | 361 1507 1 447 | 361 372 1 448 | 361 898 1 449 | 361 154 1 450 | 1549 1062 1 451 | 364 628 1 452 | 372 433 1 453 | 372 516 1 454 | 372 361 1 455 | 373 1482 1 456 | 378 1554 1 457 | 378 771 1 458 | 380 379 1 459 | 380 1041 1 460 | 380 339 1 461 | 387 1039 1 462 | 395 188 1 463 | 395 628 1 464 | 395 1355 1 465 | 396 1466 1 466 | 397 616 1 467 | 397 868 1 468 | 397 580 1 469 | 397 694 1 470 | 397 188 1 471 | 397 489 1 472 | 397 1368 1 473 | 397 509 1 474 | 397 1060 1 475 | 399 1299 1 476 | 1555 556 1 477 | 1553 322 1 478 | 402 858 1 479 | 402 782 1 480 | 402 938 1 481 | 402 188 1 482 | 402 764 1 483 | 402 989 1 484 | 402 716 1 485 | 402 312 1 486 | 403 595 1 487 | 403 897 1 488 | 406 436 1 489 | 406 478 1 490 | 407 623 1 491 | 410 89 1 492 | 410 578 1 493 | 411 169 1 494 | 417 1556 1 495 | 417 96 1 496 | 417 1535 1 497 | 420 433 1 498 | 420 578 1 499 | 420 134 1 500 | 420 282 1 501 | 420 315 1 502 | 420 542 1 503 | 420 782 1 504 | 422 1249 1 505 | 423 428 1 506 | 423 106 1 507 | 425 1539 1 508 | 425 1295 1 509 | 1557 447 1 510 | 426 79 1 511 | 426 716 1 512 | 426 1249 1 513 | 426 1153 1 514 | 426 863 1 515 | 426 739 1 516 | 427 1104 1 517 | 427 313 1 518 | 428 423 1 519 | 428 1196 1 520 | 428 1439 1 521 | 430 1334 1 522 | 432 628 1 523 | 433 1507 1 524 | 433 516 1 525 | 433 154 1 526 | 433 1558 1 527 | 433 636 1 528 | 433 769 1 529 | 433 1331 1 530 | 433 508 1 531 | 433 752 1 532 | 433 420 1 533 | 433 1388 1 534 | 433 1397 1 535 | 433 1415 1 536 | 433 499 1 537 | 433 1288 1 538 | 433 1018 1 539 | 433 861 1 540 | 433 361 1 541 | 433 1383 1 542 | 433 584 1 543 | 433 372 1 544 | 433 806 1 545 | 436 135 1 546 | 436 918 1 547 | 436 1416 1 548 | 436 716 1 549 | 436 29 1 550 | 436 478 1 551 | 436 509 1 552 | 436 188 1 553 | 436 725 1 554 | 436 546 1 555 | 436 628 1 556 | 436 160 1 557 | 436 1413 1 558 | 436 1398 1 559 | 436 268 1 560 | 436 406 1 561 | 441 508 1 562 | 445 
535 1 563 | 445 353 1 564 | 445 537 1 565 | 445 1056 1 566 | 447 1557 1 567 | 448 196 1 568 | 452 1420 1 569 | 452 1182 1 570 | 452 628 1 571 | 452 1549 1 572 | 452 568 1 573 | 452 282 1 574 | 452 221 1 575 | 452 782 1 576 | 452 312 1 577 | 452 1559 1 578 | 452 791 1 579 | 452 1202 1 580 | 452 1479 1 581 | 452 1137 1 582 | 452 1560 1 583 | 457 294 1 584 | 463 698 1 585 | 464 1065 1 586 | 467 164 1 587 | 467 1158 1 588 | 468 170 1 589 | 472 509 1 590 | 477 232 1 591 | 477 577 1 592 | 478 546 1 593 | 478 918 1 594 | 478 436 1 595 | 478 1416 1 596 | 478 406 1 597 | 481 676 1 598 | 481 1212 1 599 | 483 38 1 600 | 484 702 1 601 | 485 633 1 602 | 485 1271 1 603 | 485 1561 1 604 | 485 1562 1 605 | 485 1225 1 606 | 485 1563 1 607 | 485 873 1 608 | 485 1019 1 609 | 485 1564 1 610 | 485 1565 1 611 | 485 702 1 612 | 485 392 1 613 | 488 764 1 614 | 489 240 1 615 | 489 223 1 616 | 489 1187 1 617 | 489 397 1 618 | 489 509 1 619 | 489 931 1 620 | 489 1566 1 621 | 489 546 1 622 | 489 777 1 623 | 490 1102 1 624 | 490 883 1 625 | 490 265 1 626 | 490 1140 1 627 | 490 1493 1 628 | 491 395 1 629 | 493 1040 1 630 | 496 1187 1 631 | 499 1018 1 632 | 499 584 1 633 | 499 177 1 634 | 499 516 1 635 | 499 1415 1 636 | 499 433 1 637 | 499 165 1 638 | 502 1187 1 639 | 503 327 1 640 | 506 739 1 641 | 508 938 1 642 | 508 433 1 643 | 508 1065 1 644 | 508 1407 1 645 | 508 272 1 646 | 508 441 1 647 | 509 188 1 648 | 509 1567 1 649 | 509 617 1 650 | 509 278 1 651 | 509 1033 1 652 | 509 1427 1 653 | 509 489 1 654 | 509 114 1 655 | 509 89 1 656 | 509 1220 1 657 | 509 694 1 658 | 509 1147 1 659 | 509 618 1 660 | 509 574 1 661 | 509 965 1 662 | 509 931 1 663 | 509 80 1 664 | 509 1289 1 665 | 509 1078 1 666 | 509 297 1 667 | 509 223 1 668 | 509 1481 1 669 | 509 1276 1 670 | 509 1060 1 671 | 509 120 1 672 | 509 317 1 673 | 509 1463 1 674 | 509 1157 1 675 | 509 313 1 676 | 509 892 1 677 | 509 1206 1 678 | 509 1388 1 679 | 509 238 1 680 | 509 1336 1 681 | 509 1187 1 682 | 509 616 1 683 | 509 956 1 684 | 509 
628 1 685 | 509 436 1 686 | 509 394 1 687 | 509 334 1 688 | 509 29 1 689 | 509 341 1 690 | 509 1508 1 691 | 509 658 1 692 | 509 1149 1 693 | 509 986 1 694 | 509 472 1 695 | 509 1398 1 696 | 509 546 1 697 | 509 298 1 698 | 509 1392 1 699 | 509 1128 1 700 | 509 397 1 701 | 509 1104 1 702 | 509 1368 1 703 | 509 580 1 704 | 509 267 1 705 | 509 1568 1 706 | 510 1569 1 707 | 514 143 1 708 | 514 1207 1 709 | 516 372 1 710 | 516 165 1 711 | 516 1018 1 712 | 516 433 1 713 | 516 1415 1 714 | 516 499 1 715 | 521 598 1 716 | 524 1212 1 717 | 524 1360 1 718 | 528 671 1 719 | 528 957 1 720 | 528 1398 1 721 | 532 1570 1 722 | 533 336 1 723 | 534 1571 1 724 | 534 167 1 725 | 535 537 1 726 | 535 1056 1 727 | 535 353 1 728 | 535 445 1 729 | 537 1056 1 730 | 537 353 1 731 | 537 535 1 732 | 537 445 1 733 | 539 502 1 734 | 1560 452 1 735 | 543 1572 1 736 | 543 1573 1 737 | 544 838 1 738 | 546 509 1 739 | 546 1202 1 740 | 546 965 1 741 | 546 1187 1 742 | 546 653 1 743 | 546 29 1 744 | 546 683 1 745 | 546 969 1 746 | 546 403 1 747 | 546 188 1 748 | 546 240 1 749 | 546 1147 1 750 | 546 478 1 751 | 546 489 1 752 | 546 436 1 753 | 546 223 1 754 | 546 918 1 755 | 546 628 1 756 | 546 716 1 757 | 546 777 1 758 | 546 123 1 759 | 546 931 1 760 | 546 1398 1 761 | 550 892 1 762 | 552 1525 1 763 | 552 1327 1 764 | 552 188 1 765 | 552 1574 1 766 | 553 116 1 767 | 556 319 1 768 | 556 1555 1 769 | 561 1575 1 770 | 562 1212 1 771 | 564 706 1 772 | 564 480 1 773 | 564 1576 1 774 | 564 62 1 775 | 564 810 1 776 | 564 1003 1 777 | 568 29 1 778 | 568 313 1 779 | 568 319 1 780 | 568 221 1 781 | 568 188 1 782 | 568 1420 1 783 | 568 452 1 784 | 574 509 1 785 | 574 1577 1 786 | 578 938 1 787 | 578 315 1 788 | 578 410 1 789 | 578 134 1 790 | 581 1104 1 791 | 584 165 1 792 | 584 1415 1 793 | 584 1018 1 794 | 584 433 1 795 | 584 499 1 796 | 1578 1034 1 797 | 1578 272 1 798 | 1578 1398 1 799 | 1578 825 1 800 | 591 188 1 801 | 593 1330 1 802 | 593 1166 1 803 | 593 1223 1 804 | 605 618 1 805 | 605 1149 1 806 | 605 
1217 1 807 | 605 1205 1 808 | 606 1024 1 809 | 612 909 1 810 | 616 694 1 811 | 616 509 1 812 | 616 1434 1 813 | 616 397 1 814 | 616 617 1 815 | 617 509 1 816 | 617 616 1 817 | 618 1149 1 818 | 618 837 1 819 | 618 1205 1 820 | 618 605 1 821 | 618 640 1 822 | 618 509 1 823 | 618 1125 1 824 | 618 1217 1 825 | 619 114 1 826 | 620 938 1 827 | 1579 1580 1 828 | 623 776 1 829 | 624 942 1 830 | 628 1147 1 831 | 628 683 1 832 | 628 6 1 833 | 628 362 1 834 | 628 364 1 835 | 628 452 1 836 | 628 999 1 837 | 628 716 1 838 | 628 1423 1 839 | 628 89 1 840 | 628 120 1 841 | 628 1192 1 842 | 628 436 1 843 | 628 167 1 844 | 628 428 1 845 | 628 188 1 846 | 628 432 1 847 | 628 617 1 848 | 628 670 1 849 | 628 1159 1 850 | 628 509 1 851 | 628 1398 1 852 | 629 1159 1 853 | 629 628 1 854 | 629 428 1 855 | 629 1355 1 856 | 629 362 1 857 | 631 307 1 858 | 632 737 1 859 | 632 1304 1 860 | 632 1581 1 861 | 632 1428 1 862 | 634 1481 1 863 | 636 223 1 864 | 636 1398 1 865 | 636 123 1 866 | 637 35 1 867 | 640 1205 1 868 | 640 1217 1 869 | 640 618 1 870 | 640 1149 1 871 | 640 605 1 872 | 644 164 1 873 | 645 1103 1 874 | 648 911 1 875 | 649 776 1 876 | 655 1301 1 877 | 655 93 1 878 | 659 272 1 879 | 659 312 1 880 | 1582 1242 1 881 | 1582 1514 1 882 | 1582 986 1 883 | 661 1354 1 884 | 661 1344 1 885 | 665 1569 1 886 | 666 546 1 887 | 666 812 1 888 | 670 628 1 889 | 670 702 1 890 | 670 546 1 891 | 670 804 1 892 | 671 528 1 893 | 672 1272 1 894 | 672 1360 1 895 | 1583 1109 1 896 | 676 1212 1 897 | 679 1441 1 898 | 683 80 1 899 | 683 240 1 900 | 683 546 1 901 | 683 305 1 902 | 684 1269 1 903 | 685 393 1 904 | 694 616 1 905 | 695 1019 1 906 | 698 1201 1 907 | 698 463 1 908 | 702 782 1 909 | 702 300 1 910 | 702 484 1 911 | 702 938 1 912 | 711 716 1 913 | 712 99 1 914 | 716 1398 1 915 | 716 1159 1 916 | 716 436 1 917 | 716 312 1 918 | 716 1147 1 919 | 716 188 1 920 | 716 1481 1 921 | 716 509 1 922 | 716 402 1 923 | 716 1350 1 924 | 716 1504 1 925 | 716 711 1 926 | 716 426 1 927 | 716 282 1 928 | 716 628 
1 929 | 716 1348 1 930 | 716 989 1 931 | 716 29 1 932 | 718 228 1 933 | 718 1168 1 934 | 721 1294 1 935 | 725 580 1 936 | 725 711 1 937 | 725 478 1 938 | 725 1416 1 939 | 725 436 1 940 | 725 616 1 941 | 725 406 1 942 | 725 215 1 943 | 728 298 1 944 | 1584 548 1 945 | 733 1456 1 946 | 733 1585 1 947 | 735 1015 1 948 | 735 928 1 949 | 1520 1519 1 950 | 738 553 1 951 | 738 116 1 952 | 739 79 1 953 | 739 863 1 954 | 739 1249 1 955 | 739 315 1 956 | 739 506 1 957 | 739 1153 1 958 | 739 282 1 959 | 739 426 1 960 | 739 355 1 961 | 741 680 1 962 | 743 1586 1 963 | 748 1040 1 964 | 748 337 1 965 | 752 806 1 966 | 752 1507 1 967 | 752 1018 1 968 | 752 433 1 969 | 752 1587 1 970 | 752 177 1 971 | 759 1208 1 972 | 759 307 1 973 | 762 1202 1 974 | 764 488 1 975 | 764 938 1 976 | 764 402 1 977 | 764 858 1 978 | 764 300 1 979 | 1588 262 1 980 | 769 433 1 981 | 769 361 1 982 | 771 378 1 983 | 772 210 1 984 | 773 905 1 985 | 773 272 1 986 | 773 1041 1 987 | 774 684 1 988 | 774 1269 1 989 | 776 649 1 990 | 776 1457 1 991 | 777 1187 1 992 | 781 1542 1 993 | 782 858 1 994 | 782 634 1 995 | 782 355 1 996 | 782 315 1 997 | 782 863 1 998 | 782 1348 1 999 | 782 312 1 1000 | 782 6 1 1001 | 782 188 1 1002 | 782 764 1 1003 | 782 79 1 1004 | 782 938 1 1005 | 782 134 1 1006 | 782 402 1 1007 | 782 1192 1 1008 | 782 509 1 1009 | 782 999 1 1010 | 788 1589 1 1011 | 790 448 1 1012 | 790 909 1 1013 | 791 1137 1 1014 | 791 1041 1 1015 | 795 300 1 1016 | 798 563 1 1017 | 798 304 1 1018 | 804 702 1 1019 | 804 739 1 1020 | 805 929 1 1021 | 806 1587 1 1022 | 806 1076 1 1023 | 806 154 1 1024 | 806 752 1 1025 | 806 1507 1 1026 | 807 1347 1 1027 | 807 906 1 1028 | 808 629 1 1029 | 810 191 1 1030 | 812 825 1 1031 | 812 1204 1 1032 | 812 666 1 1033 | 812 278 1 1034 | 813 867 1 1035 | 814 319 1 1036 | 821 1539 1 1037 | 824 61 1 1038 | 824 1000 1 1039 | 824 1201 1 1040 | 824 13 1 1041 | 824 702 1 1042 | 1537 163 1 1043 | 825 188 1 1044 | 825 812 1 1045 | 825 1064 1 1046 | 825 161 1 1047 | 825 272 1 1048 | 825 
1033 1 1049 | 826 96 1 1050 | 828 720 1 1051 | 830 1168 1 1052 | 1564 196 1 1053 | 1564 485 1 1054 | 837 618 1 1055 | 838 544 1 1056 | 842 215 1 1057 | 842 1034 1 1058 | 842 653 1 1059 | 842 546 1 1060 | 842 945 1 1061 | 842 79 1 1062 | 842 1125 1 1063 | 842 618 1 1064 | 843 1541 1 1065 | 843 84 1 1066 | 843 1590 1 1067 | 847 865 1 1068 | 847 1591 1 1069 | 847 989 1 1070 | 849 300 1 1071 | 849 402 1 1072 | 853 319 1 1073 | 853 315 1 1074 | 853 764 1 1075 | 853 782 1 1076 | 853 938 1 1077 | 857 256 1 1078 | 857 605 1 1079 | 857 1149 1 1080 | 857 1369 1 1081 | 857 473 1 1082 | 859 1088 1 1083 | 862 489 1 1084 | 862 88 1 1085 | 862 1307 1 1086 | 863 79 1 1087 | 863 782 1 1088 | 863 355 1 1089 | 863 426 1 1090 | 863 282 1 1091 | 863 739 1 1092 | 867 813 1 1093 | 867 628 1 1094 | 873 1062 1 1095 | 883 1033 1 1096 | 883 849 1 1097 | 892 509 1 1098 | 892 969 1 1099 | 892 1532 1 1100 | 892 188 1 1101 | 892 1147 1 1102 | 892 1159 1 1103 | 892 550 1 1104 | 892 1398 1 1105 | 892 313 1 1106 | 892 436 1 1107 | 892 745 1 1108 | 892 1016 1 1109 | 892 965 1 1110 | 892 1481 1 1111 | 892 29 1 1112 | 892 321 1 1113 | 897 403 1 1114 | 897 595 1 1115 | 898 433 1 1116 | 898 1288 1 1117 | 898 752 1 1118 | 898 361 1 1119 | 898 1507 1 1120 | 898 806 1 1121 | 898 154 1 1122 | 900 458 1 1123 | 905 188 1 1124 | 905 1273 1 1125 | 906 1592 1 1126 | 906 999 1 1127 | 906 1593 1 1128 | 911 648 1 1129 | 911 949 1 1130 | 914 1183 1 1131 | 915 1594 1 1132 | 918 478 1 1133 | 918 436 1 1134 | 918 240 1 1135 | 918 123 1 1136 | 918 546 1 1137 | 918 223 1 1138 | 918 965 1 1139 | 1595 1223 1 1140 | 926 1565 1 1141 | 1567 509 1 1142 | 928 1015 1 1143 | 1541 272 1 1144 | 929 805 1 1145 | 931 489 1 1146 | 931 1187 1 1147 | 931 509 1 1148 | 938 853 1 1149 | 938 300 1 1150 | 938 1033 1 1151 | 938 402 1 1152 | 938 508 1 1153 | 938 702 1 1154 | 938 578 1 1155 | 938 273 1 1156 | 938 1160 1 1157 | 938 764 1 1158 | 938 999 1 1159 | 938 1508 1 1160 | 938 782 1 1161 | 938 858 1 1162 | 941 1162 1 1163 | 942 755 1 1164 
| 942 961 1 1165 | 942 624 1 1166 | 947 985 1 1167 | 948 147 1 1168 | 949 648 1 1169 | 949 911 1 1170 | 956 803 1 1171 | 956 562 1 1172 | 956 509 1 1173 | 957 1398 1 1174 | 961 942 1 1175 | 961 624 1 1176 | 963 535 1 1177 | 1558 516 1 1178 | 1558 372 1 1179 | 1558 433 1 1180 | 965 975 1 1181 | 965 141 1 1182 | 965 1311 1 1183 | 965 58 1 1184 | 965 1596 1 1185 | 965 1597 1 1186 | 965 29 1 1187 | 965 509 1 1188 | 965 918 1 1189 | 965 298 1 1190 | 965 919 1 1191 | 965 221 1 1192 | 965 892 1 1193 | 965 546 1 1194 | 965 1398 1 1195 | 965 1147 1 1196 | 965 188 1 1197 | 965 80 1 1198 | 965 1187 1 1199 | 966 1567 1 1200 | 966 2 1 1201 | 969 628 1 1202 | 969 1147 1 1203 | 969 1598 1 1204 | 969 29 1 1205 | 969 188 1 1206 | 969 1159 1 1207 | 969 298 1 1208 | 969 1398 1 1209 | 969 509 1 1210 | 969 436 1 1211 | 969 892 1 1212 | 969 1187 1 1213 | 969 546 1 1214 | 969 965 1 1215 | 969 716 1 1216 | 969 1544 1 1217 | 969 1435 1 1218 | 970 234 1 1219 | 974 1174 1 1220 | 975 965 1 1221 | 978 977 1 1222 | 978 92 1 1223 | 978 732 1 1224 | 978 1556 1 1225 | 978 29 1 1226 | 978 1599 1 1227 | 979 1168 1 1228 | 983 830 1 1229 | 983 1074 1 1230 | 983 1168 1 1231 | 983 228 1 1232 | 983 147 1 1233 | 984 249 1 1234 | 986 509 1 1235 | 989 716 1 1236 | 989 1591 1 1237 | 989 312 1 1238 | 989 402 1 1239 | 989 188 1 1240 | 989 858 1 1241 | 989 865 1 1242 | 992 1149 1 1243 | 999 312 1 1244 | 999 1600 1 1245 | 999 188 1 1246 | 999 1348 1 1247 | 999 628 1 1248 | 1000 623 1 1249 | 1000 1551 1 1250 | 1001 682 1 1251 | 1002 298 1 1252 | 1002 898 1 1253 | 1002 683 1 1254 | 1002 80 1 1255 | 1013 1371 1 1256 | 1014 580 1 1257 | 1014 1417 1 1258 | 1014 503 1 1259 | 1014 355 1 1260 | 1014 1556 1 1261 | 1014 509 1 1262 | 1014 416 1 1263 | 1014 1533 1 1264 | 1014 1601 1 1265 | 1014 658 1 1266 | 1014 702 1 1267 | 1015 928 1 1268 | 1015 42 1 1269 | 1016 892 1 1270 | 1017 94 1 1271 | 1018 516 1 1272 | 1018 584 1 1273 | 1018 499 1 1274 | 1018 752 1 1275 | 1018 433 1 1276 | 1018 1415 1 1277 | 1019 1232 1 1278 | 1019 
695 1 1279 | 1019 485 1 1280 | 1020 71 1 1281 | 1020 100 1 1282 | 1021 95 1 1283 | 1021 1602 1 1284 | 1022 188 1 1285 | 1024 606 1 1286 | 1024 1228 1 1287 | 1033 1518 1 1288 | 1033 764 1 1289 | 1033 938 1 1290 | 1034 1578 1 1291 | 1034 162 1 1292 | 1034 842 1 1293 | 1039 231 1 1294 | 1039 873 1 1295 | 1039 1437 1 1296 | 1039 387 1 1297 | 1040 493 1 1298 | 1041 1418 1 1299 | 1041 1481 1 1300 | 1041 1603 1 1301 | 1042 1149 1 1302 | 1042 605 1 1303 | 1043 1536 1 1304 | 1043 199 1 1305 | 1048 195 1 1306 | 1048 395 1 1307 | 1056 353 1 1308 | 1056 535 1 1309 | 1056 537 1 1310 | 1056 445 1 1311 | 1057 607 1 1312 | 1060 509 1 1313 | 1061 29 1 1314 | 1062 1549 1 1315 | 1062 1495 1 1316 | 1064 825 1 1317 | 1065 464 1 1318 | 1065 764 1 1319 | 1065 1178 1 1320 | 1065 319 1 1321 | 1065 508 1 1322 | 1065 315 1 1323 | 1072 1249 1 1324 | 1072 79 1 1325 | 1074 983 1 1326 | 1076 806 1 1327 | 1078 509 1 1328 | 1078 1114 1 1329 | 1080 1154 1 1330 | 1081 476 1 1331 | 1088 859 1 1332 | 1088 285 1 1333 | 1094 274 1 1334 | 1094 1459 1 1335 | 1094 1604 1 1336 | 1094 696 1 1337 | 1094 1500 1 1338 | 1094 1605 1 1339 | 1094 253 1 1340 | 1094 741 1 1341 | 1094 1565 1 1342 | 1094 857 1 1343 | 1094 159 1 1344 | 1094 329 1 1345 | 1094 295 1 1346 | 1094 703 1 1347 | 1094 32 1 1348 | 1094 1344 1 1349 | 1101 1091 1 1350 | 1101 707 1 1351 | 1103 645 1 1352 | 1104 509 1 1353 | 1104 581 1 1354 | 1104 1606 1 1355 | 1104 628 1 1356 | 1104 313 1 1357 | 1513 1607 1 1358 | 1513 27 1 1359 | 1106 188 1 1360 | 1106 29 1 1361 | 1109 1608 1 1362 | 1109 1545 1 1363 | 1109 1583 1 1364 | 1109 1609 1 1365 | 1111 272 1 1366 | 1112 825 1 1367 | 1117 1610 1 1368 | 1123 638 1 1369 | 1125 1205 1 1370 | 1125 618 1 1371 | 1125 640 1 1372 | 1125 1149 1 1373 | 1128 509 1 1374 | 1131 272 1 1375 | 1611 1612 1 1376 | 1135 1235 1 1377 | 1137 188 1 1378 | 1137 452 1 1379 | 1137 1560 1 1380 | 1137 1479 1 1381 | 1606 1104 1 1382 | 1140 224 1 1383 | 1140 490 1 1384 | 1142 1205 1 1385 | 1142 718 1 1386 | 1142 741 1 1387 | 1142 1500 1 
1388 | 1577 574 1 1389 | 1147 546 1 1390 | 1147 969 1 1391 | 1147 628 1 1392 | 1147 403 1 1393 | 1147 1159 1 1394 | 1147 188 1 1395 | 1147 298 1 1396 | 1147 312 1 1397 | 1147 29 1 1398 | 1147 965 1 1399 | 1147 509 1 1400 | 1147 716 1 1401 | 1147 66 1 1402 | 1147 1398 1 1403 | 1147 1481 1 1404 | 1147 223 1 1405 | 1147 362 1 1406 | 1147 80 1 1407 | 1147 1488 1 1408 | 1147 892 1 1409 | 1149 705 1 1410 | 1149 640 1 1411 | 1149 1205 1 1412 | 1149 849 1 1413 | 1149 254 1 1414 | 1149 1042 1 1415 | 1149 618 1 1416 | 1149 1217 1 1417 | 1149 605 1 1418 | 1149 739 1 1419 | 1149 1125 1 1420 | 1149 876 1 1421 | 1149 382 1 1422 | 1149 473 1 1423 | 1149 837 1 1424 | 1149 471 1 1425 | 1149 509 1 1426 | 1153 739 1 1427 | 1153 1249 1 1428 | 1153 315 1 1429 | 1153 332 1 1430 | 1153 426 1 1431 | 1153 79 1 1432 | 1154 1080 1 1433 | 1154 1061 1 1434 | 1157 323 1 1435 | 1157 509 1 1436 | 1157 1194 1 1437 | 1157 1338 1 1438 | 1157 193 1 1439 | 1159 1147 1 1440 | 1159 969 1 1441 | 1159 242 1 1442 | 1159 298 1 1443 | 1159 1444 1 1444 | 1159 29 1 1445 | 1159 628 1 1446 | 1160 938 1 1447 | 1160 782 1 1448 | 1160 312 1 1449 | 1162 1223 1 1450 | 1162 1330 1 1451 | 1165 1348 1 1452 | 1165 355 1 1453 | 1165 188 1 1454 | 1165 1249 1 1455 | 1168 979 1 1456 | 1168 228 1 1457 | 1168 830 1 1458 | 1168 718 1 1459 | 1171 1170 1 1460 | 1173 91 1 1461 | 1173 877 1 1462 | 1173 1333 1 1463 | 1174 63 1 1464 | 1176 1398 1 1465 | 1176 1613 1 1466 | 1178 1357 1 1467 | 1178 1065 1 1468 | 1178 428 1 1469 | 1178 1132 1 1470 | 1178 484 1 1471 | 1178 877 1 1472 | 1179 26 1 1473 | 1180 272 1 1474 | 1607 1513 1 1475 | 1183 914 1 1476 | 1185 1614 1 1477 | 1185 20 1 1478 | 1185 18 1 1479 | 1187 496 1 1480 | 1187 240 1 1481 | 1187 215 1 1482 | 1187 509 1 1483 | 1187 777 1 1484 | 1187 294 1 1485 | 1187 123 1 1486 | 1187 502 1 1487 | 1187 628 1 1488 | 1187 931 1 1489 | 1187 1200 1 1490 | 1187 348 1 1491 | 1187 80 1 1492 | 1187 965 1 1493 | 1187 1615 1 1494 | 1187 546 1 1495 | 1187 489 1 1496 | 1187 223 1 1497 | 1187 1398 1 
1498 | 1191 1327 1 1499 | 1192 6 1 1500 | 1192 938 1 1501 | 1195 1169 1 1502 | 1195 151 1 1503 | 1196 986 1 1504 | 1196 883 1 1505 | 1196 428 1 1506 | 1196 448 1 1507 | 1196 509 1 1508 | 1197 201 1 1509 | 1199 754 1 1510 | 1200 1187 1 1511 | 1200 215 1 1512 | 1200 240 1 1513 | 1201 1193 1 1514 | 1202 1357 1 1515 | 1202 403 1 1516 | 1202 1600 1 1517 | 1202 221 1 1518 | 1202 112 1 1519 | 1202 1528 1 1520 | 1202 1420 1 1521 | 1202 762 1 1522 | 1202 546 1 1523 | 1202 452 1 1524 | 1202 628 1 1525 | 1202 1167 1 1526 | 1202 1423 1 1527 | 1204 812 1 1528 | 1205 1217 1 1529 | 1205 1149 1 1530 | 1206 509 1 1531 | 1207 514 1 1532 | 1208 227 1 1533 | 1208 759 1 1534 | 1212 481 1 1535 | 1212 562 1 1536 | 1212 226 1 1537 | 1212 524 1 1538 | 1212 1360 1 1539 | 1212 676 1 1540 | 1212 1248 1 1541 | 1214 1560 1 1542 | 1217 618 1 1543 | 1217 1205 1 1544 | 1220 509 1 1545 | 1220 29 1 1546 | 1220 626 1 1547 | 1220 1335 1 1548 | 1223 593 1 1549 | 1223 1330 1 1550 | 1223 1166 1 1551 | 1223 1162 1 1552 | 1223 1243 1 1553 | 1223 1595 1 1554 | 1616 1016 1 1555 | 1226 147 1 1556 | 1228 1024 1 1557 | 1228 606 1 1558 | 1617 511 1 1559 | 1229 320 1 1560 | 1229 1348 1 1561 | 1232 1335 1 1562 | 1232 1505 1 1563 | 1232 95 1 1564 | 1232 1019 1 1565 | 1233 1618 1 1566 | 1538 164 1 1567 | 1243 1223 1 1568 | 1244 115 1 1569 | 1249 1072 1 1570 | 1249 79 1 1571 | 1249 791 1 1572 | 1249 282 1 1573 | 1249 362 1 1574 | 1249 1153 1 1575 | 1249 739 1 1576 | 1249 426 1 1577 | 1249 428 1 1578 | 1249 422 1 1579 | 1249 315 1 1580 | 1249 1165 1 1581 | 1252 95 1 1582 | 1252 1089 1 1583 | 1255 999 1 1584 | 1255 1147 1 1585 | 1591 989 1 1586 | 1619 147 1 1587 | 1262 1360 1 1588 | 1269 175 1 1589 | 1269 684 1 1590 | 1269 1620 1 1591 | 1269 774 1 1592 | 1272 345 1 1593 | 1273 905 1 1594 | 1275 532 1 1595 | 1621 1622 1 1596 | 1276 883 1 1597 | 1276 509 1 1598 | 1276 1203 1 1599 | 1278 1623 1 1600 | 1278 832 1 1601 | 1278 241 1 1602 | 1278 986 1 1603 | 1278 1624 1 1604 | 1278 1344 1 1605 | 1278 1625 1 1606 | 1278 114 1 
1607 | 1278 1505 1 1608 | 1278 1531 1 1609 | 1281 272 1 1610 | 1281 1541 1 1611 | 1282 320 1 1612 | 1288 806 1 1613 | 1288 154 1 1614 | 1288 361 1 1615 | 1288 433 1 1616 | 1288 1331 1 1617 | 1288 284 1 1618 | 1288 898 1 1619 | 1288 1507 1 1620 | 1288 165 1 1621 | 1288 372 1 1622 | 1288 516 1 1623 | 1288 1468 1 1624 | 1289 320 1 1625 | 1289 509 1 1626 | 1289 628 1 1627 | 1292 324 1 1628 | 1292 95 1 1629 | 1295 425 1 1630 | 1298 1500 1 1631 | 1298 161 1 1632 | 1298 489 1 1633 | 1299 1626 1 1634 | 1299 399 1 1635 | 1301 93 1 1636 | 1306 1186 1 1637 | 1306 1500 1 1638 | 1307 626 1 1639 | 1321 147 1 1640 | 1325 1355 1 1641 | 1327 188 1 1642 | 1327 1191 1 1643 | 1327 552 1 1644 | 1327 1525 1 1645 | 1329 724 1 1646 | 1329 1545 1 1647 | 1329 1360 1 1648 | 1329 1627 1 1649 | 1330 1223 1 1650 | 1331 1288 1 1651 | 1331 1018 1 1652 | 1331 154 1 1653 | 1331 433 1 1654 | 1332 995 1 1655 | 1333 29 1 1656 | 1333 616 1 1657 | 1333 877 1 1658 | 1333 845 1 1659 | 1333 1515 1 1660 | 1333 1079 1 1661 | 1333 509 1 1662 | 1333 1168 1 1663 | 1333 681 1 1664 | 1333 1208 1 1665 | 1333 91 1 1666 | 1334 430 1 1667 | 1336 1032 1 1668 | 1336 509 1 1669 | 1336 946 1 1670 | 1338 1157 1 1671 | 1342 1366 1 1672 | 1342 1504 1 1673 | 1342 1628 1 1674 | 1342 261 1 1675 | 1628 1342 1 1676 | 1344 114 1 1677 | 1346 1406 1 1678 | 1348 716 1 1679 | 1348 999 1 1680 | 1348 312 1 1681 | 1350 188 1 1682 | 1355 1325 1 1683 | 1355 395 1 1684 | 1355 629 1 1685 | 1355 146 1 1686 | 1357 1398 1 1687 | 1359 1387 1 1688 | 1359 873 1 1689 | 1360 1329 1 1690 | 1360 524 1 1691 | 1360 1212 1 1692 | 1361 591 1 1693 | 1361 188 1 1694 | 1362 1624 1 1695 | 1366 1504 1 1696 | 1366 1342 1 1697 | 1369 857 1 1698 | 1371 1013 1 1699 | 1371 319 1 1700 | 1371 222 1 1701 | 1379 1629 1 1702 | 1379 1377 1 1703 | 1379 35 1 1704 | 1598 969 1 1705 | 1387 825 1 1706 | 1388 433 1 1707 | 1388 509 1 1708 | 1392 1336 1 1709 | 1392 509 1 1710 | 1395 302 1 1711 | 1398 1176 1 1712 | 1398 188 1 1713 | 1398 1578 1 1714 | 1398 1357 1 1715 | 1398 716 
1 1716 | 1398 636 1 1717 | 1398 1147 1 1718 | 1398 546 1 1719 | 1398 509 1 1720 | 1398 528 1 1721 | 1398 965 1 1722 | 1398 29 1 1723 | 1398 223 1 1724 | 1398 957 1 1725 | 1398 1481 1 1726 | 1398 1432 1 1727 | 1398 436 1 1728 | 1398 160 1 1729 | 1398 1187 1 1730 | 1398 628 1 1731 | 1398 298 1 1732 | 1400 1630 1 1733 | 1400 1631 1 1734 | 1402 740 1 1735 | 1402 578 1 1736 | 1402 464 1 1737 | 1403 1411 1 1738 | 1632 1412 1 1739 | 1406 51 1 1740 | 1406 1346 1 1741 | 1406 4 1 1742 | 1406 509 1 1743 | 1407 441 1 1744 | 1407 508 1 1745 | 1410 1411 1 1746 | 1411 1403 1 1747 | 1412 1075 1 1748 | 1412 1632 1 1749 | 1413 436 1 1750 | 1413 683 1 1751 | 1415 109 1 1752 | 1415 584 1 1753 | 1415 499 1 1754 | 1415 516 1 1755 | 1415 433 1 1756 | 1415 1018 1 1757 | 1415 165 1 1758 | 1416 478 1 1759 | 1416 436 1 1760 | 1417 1466 1 1761 | 1418 1041 1 1762 | 1420 1202 1 1763 | 1420 568 1 1764 | 1420 221 1 1765 | 1420 452 1 1766 | 1422 1373 1 1767 | 1424 1016 1 1768 | 1424 1242 1 1769 | 1424 1272 1 1770 | 1427 509 1 1771 | 1432 1398 1 1772 | 1433 1633 1 1773 | 1435 242 1 1774 | 1435 29 1 1775 | 1435 969 1 1776 | 1435 1566 1 1777 | 1435 188 1 1778 | 1543 200 1 1779 | 1437 1039 1 1780 | 1620 1269 1 1781 | 1439 428 1 1782 | 1634 1101 1 1783 | 1634 707 1 1784 | 1634 1091 1 1785 | 1443 433 1 1786 | 1444 1159 1 1787 | 1452 1635 1 1788 | 1454 1636 1 1789 | 1637 854 1 1790 | 1458 554 1 1791 | 1458 663 1 1792 | 1458 448 1 1793 | 1459 1094 1 1794 | 1463 509 1 1795 | 1464 883 1 1796 | 1466 1417 1 1797 | 1466 396 1 1798 | 1466 300 1 1799 | 1466 938 1 1800 | 1468 1507 1 1801 | 1468 372 1 1802 | 1468 1288 1 1803 | 1469 979 1 1804 | 1469 1492 1 1805 | 1469 1498 1 1806 | 1469 384 1 1807 | 1473 1638 1 1808 | 1474 1639 1 1809 | 1475 1640 1 1810 | 1641 285 1 1811 | 1478 1641 1 1812 | 1478 285 1 1813 | 1479 792 1 1814 | 1479 1137 1 1815 | 1479 188 1 1816 | 1479 1182 1 1817 | 1480 1360 1 1818 | 1481 634 1 1819 | 1481 188 1 1820 | 1481 1041 1 1821 | 1481 1147 1 1822 | 1481 29 1 1823 | 1481 509 1 1824 | 1481 
716 1 1825 | 1481 436 1 1826 | 1488 1147 1 1827 | 1491 110 1 1828 | 1493 490 1 1829 | 1494 1360 1 1830 | 1494 1039 1 1831 | 1494 1545 1 1832 | 1494 524 1 1833 | 1494 1570 1 1834 | 1494 1642 1 1835 | 1495 1062 1 1836 | 1516 38 1 1837 | 1516 645 1 1838 | 1504 261 1 1839 | 1504 716 1 1840 | 1504 188 1 1841 | 1504 1342 1 1842 | 1504 221 1 1843 | 1504 166 1 1844 | 1507 372 1 1845 | 1507 752 1 1846 | 1507 1468 1 1847 | 1507 1288 1 1848 | 1507 433 1 1849 | 1507 806 1 1850 | 1507 361 1 1851 | 1508 1187 1 1852 | 1508 509 1 1853 | 1508 938 1 1854 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | **Main scripts** 2 | - `process_data.py` process the data into 89% train, 10% test, and 1% validation sets 3 | - This script culls the user network such that only connections that have at least one item in common are included. 4 | - **Use:** `python process_data.py [ratings-file] [network-file] [output-dir]` 5 | - `setup.sh` download code for comparison models and compile; run from scripts dir 6 | - **Use:** `./setup.sh` 7 | - `study.sh` run SPF and comparison models on a specified dataset; run from scripts dir 8 | - **Use:** `./study.sh [data-dir] [output-dir] [K] [directed/undirected]` 9 | - `aggregate_results.py` aggregate results of a study into a single comma-separated file 10 | - **Use:** `python aggregate_results.py [study-dir] [output-file] [K]` 11 | 12 | **Alternate data processing** 13 | - `process_time_data.py` used to process binary data that has timestamps (ratings are 14 | userID/itemID/unixTime instead of userID/itemID/rating) 15 | - **Use:** `python process_time_data.py [ratings-file] [network-file] [output-dir]` 16 | 17 | **Convert data to the form expected by comparison models** 18 | - `to_librec_form.py` process standard data form into form for LibRec; default directed network 19 | - **Use:** `python to_librec_form.py [data-dir] 
[optional:undirected]` 20 | - `to_list_form.py` process standard data form into form for CTR/MF; same use as above 21 | - `to_sorec_list_form.py` process standard data form into form for SoRec using CTR/MF; same use as above 22 | 23 | **Amplification studies** (older) 24 | - `adjust_amplification.py` create a new dataset from the src dataset, 25 | adjusting the percentage of items any given user shares with their 26 | friends 27 | - **Use:** `python adjust_amplification.py [ data src dir ] [ new data dir ] [ % shared ]` 28 | - `amplify_data.py` same as above, but users will only increase their % shared, never decrease 29 | - `deamplify_data.py` same as above, but users will only decrease their % shared, never increase 30 | - `amplification_check.py` print out the percent of items shared with friends, averaged across all users 31 | - **Use:** `python amplification_check.py [data-dir]` 32 | - `sim_data.sh` create a set of datasets, each with the same seed data, but different amplification settings 33 | - **Use:** `./sim_data.sh [data-dir]` 34 | - `aggregate_amp_results.py` aggregate results of an amplification study (on a range of amplification settings) 35 | - **Use:** `python aggregate_amp_results.py [fits-dir] [out-filename]` 36 | -------------------------------------------------------------------------------- /scripts/adjust_amplification.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | from random import randint, shuffle, sample 4 | 5 | dir = sys.argv[1] 6 | out = sys.argv[2] 7 | per = float(sys.argv[3]) / 100 8 | 9 | # read in network 10 | #print "* reading network data" 11 | 12 | network = defaultdict(set) 13 | for line in open(dir +'/network.tsv'): 14 | if ',' in line: 15 | a, b = [int(x) for x in line.strip().split(',')] 16 | else: 17 | a, b = [int(x) for x in line.strip().split('\t')] 18 | network[a].add(b) 19 | network[b].add(a) 20 | 21 | #print "* reading train, 
test, and validation data" 22 | 23 | user_items = defaultdict(set) 24 | train = defaultdict(set) 25 | test = defaultdict(set) 26 | valid = defaultdict(set) 27 | all_items = set() 28 | for line in open(dir +'/train.tsv'): 29 | u,i,r = [int(x) for x in line.strip().split('\t')] 30 | if len(network[u]) == 0: 31 | continue 32 | user_items[u].add(i) 33 | train[u].add(i) 34 | all_items.add(i) 35 | for line in open(dir +'/test.tsv'): 36 | u,i,r = [int(x) for x in line.strip().split('\t')] 37 | if len(network[u]) == 0: 38 | continue 39 | user_items[u].add(i) 40 | test[u].add(i) 41 | all_items.add(i) 42 | for line in open(dir +'/validation.tsv'): 43 | u,i,r = [int(x) for x in line.strip().split('\t')] 44 | if len(network[u]) == 0: 45 | continue 46 | user_items[u].add(i) 47 | valid[u].add(i) 48 | all_items.add(i) 49 | 50 | #print " %d unique items in original data" % len(all_items) 51 | 52 | #print "* exchanging items for each user" 53 | changed_amp = 0 54 | changed_deamp = 0 55 | # start with those with the fewest number of items 56 | for user in sorted(user_items.keys(), key=lambda x: len(user_items[x])): 57 | candidates = defaultdict(int) 58 | omit = set() 59 | shared = set() 60 | for friend in network[user]: 61 | for item in user_items[friend]: 62 | if item not in user_items[user]: 63 | candidates[item] += 1 64 | omit.add(item) 65 | else: 66 | shared.add(item) 67 | 68 | ### does this user have too many or too few items? 69 | # too many! 
70 | #if len(shared) * 1.0 / len(user_items[user]) >= (per+0.05): 71 | if len(shared) * 1.0 / len(user_items[user]) > per: 72 | candidates = all_items - omit - shared 73 | items = list(user_items[user] - set(candidates)) 74 | shuffle(items) 75 | 76 | for item in items: 77 | #if len(shared) * 1.0 / len(user_items[user]) < (per+0.05) or \ 78 | if len(shared) * 1.0 / len(user_items[user]) <= per or \ 79 | len(candidates) == 0: 80 | break 81 | 82 | pick = sample(candidates, 1)[0] 83 | changed_deamp += 1 84 | 85 | if item in train[user]: 86 | train[user].remove(item) 87 | train[user].add(pick) 88 | if item in test[user]: 89 | test[user].remove(item) 90 | test[user].add(pick) 91 | if item in valid[user]: 92 | valid[user].remove(item) 93 | valid[user].add(pick) 94 | 95 | candidates.remove(pick) 96 | user_items[user].remove(item) 97 | user_items[user].add(pick) 98 | 99 | 100 | # too few! 101 | elif len(shared) * 1.0 / len(user_items[user]) < per: 102 | #elif len(shared) * 1.0 / len(user_items[user]) <= (per-0.05): 103 | 104 | items = list(user_items[user] - set(candidates)) 105 | shuffle(items) 106 | for item in items: 107 | #if len(shared) * 1.0 / len(user_items[user]) > (per-0.05) or \ 108 | if len(shared) * 1.0 / len(user_items[user]) >= per or \ 109 | len(candidates) == 0: 110 | break 111 | 112 | pickv = randint(1, sum(candidates.values())) 113 | for c in candidates: 114 | if pickv <= candidates[c]: 115 | pick = c 116 | break 117 | pickv -= candidates[c] 118 | 119 | shared.add(pick) 120 | changed_amp += 1 121 | 122 | if item in train[user]: 123 | train[user].remove(item) 124 | train[user].add(pick) 125 | if item in test[user]: 126 | test[user].remove(item) 127 | test[user].add(pick) 128 | if item in valid[user]: 129 | valid[user].remove(item) 130 | valid[user].add(pick) 131 | 132 | del candidates[pick] 133 | user_items[user].remove(item) 134 | user_items[user].add(pick) 135 | 136 | 137 | #print "user", user, "has", (len(shared) * 100.0 / len(user_items[user])), "%% 
items shared" 138 | all_items = set() 139 | for user in user_items.keys(): 140 | all_items = all_items | user_items[user] 141 | #print " %d unique items in amplified data" % len(all_items) 142 | print (changed_amp + changed_deamp), "changed items (", changed_amp, \ 143 | changed_deamp, ")" 144 | 145 | f = open(out +'/train.tsv', 'w+') 146 | for user in train: 147 | for item in train[user]: 148 | f.write('%d\t%d\t1\n' % (user,item)) 149 | f.close() 150 | #print "* done writing out training data" 151 | 152 | f = open(out +'/test.tsv', 'w+') 153 | for user in test: 154 | for item in test[user]: 155 | f.write('%d\t%d\t1\n' % (user,item)) 156 | f.close() 157 | #print "* done writing out testing data" 158 | 159 | f = open(out +'/validation.tsv', 'w+') 160 | for user in valid: 161 | for item in valid[user]: 162 | f.write('%d\t%d\t1\n' % (user,item)) 163 | f.close() 164 | #print "* done writing out validation data" 165 | -------------------------------------------------------------------------------- /scripts/aggregate_amp_results.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | from os.path import join, isdir, isfile 3 | import sys 4 | 5 | fits = sys.argv[1] 6 | out = sys.argv[2] 7 | 8 | fout = open(out, 'w+') 9 | 10 | for dir in sorted(listdir(fits)): 11 | dirp = join(fits,dir) 12 | if not isdir(dirp): 13 | continue 14 | print dir 15 | for model in sorted(listdir(dirp)): 16 | modelp = join(dirp,model) 17 | if not isdir(modelp): 18 | continue 19 | if isfile(join(modelp, 'summary_eval.dat')): 20 | print "\t", model 21 | f = open(join(modelp, 'summary_eval.dat')) 22 | for line in f: 23 | metric, val = line.strip().split('\t') 24 | fout.write("%s,%s,%s,%s\n" % (dir, model, metric, val)) 25 | f.close() 26 | else: 27 | print "\t**", model 28 | fout.close() 29 | -------------------------------------------------------------------------------- /scripts/aggregate_results.py: 
-------------------------------------------------------------------------------- 1 | import fnmatch 2 | import os, sys 3 | from os.path import isdir, join 4 | 5 | m = sys.argv[1] 6 | outfile = sys.argv[2] 7 | k = int(sys.argv[3]) 8 | fout = open(outfile, 'w+') 9 | fout.write("model,k,metric,value\n") 10 | 11 | for model in os.listdir(m): 12 | if not isdir(join(m, model)): 13 | continue 14 | for file in os.listdir(join(m, model)): 15 | if isdir(join(join(m, model), file)): 16 | for f in os.listdir(join(join(m, model), file)): 17 | if f == 'eval_summary_final.dat': 18 | fname = join(join(join(m, model), file), f) 19 | for line in open(fname).readlines()[1:]: 20 | tokens = line.split('\t') 21 | fout.write("%s,%d,%s,%s\n" % (file, \ 22 | k if model==0 else int(model), \ 23 | tokens[0], tokens[1])) 24 | 25 | continue 26 | if file == 'eval_summary_final.dat': 27 | fname = join(join(m, model), file) 28 | for line in open(fname).readlines()[1:]: 29 | tokens = line.split('\t') 30 | fout.write("%s,%d,%s,%s\n" % (model, k, tokens[0], tokens[1])) 31 | fout.close() 32 | -------------------------------------------------------------------------------- /scripts/amplification_check.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | from random import randint, shuffle 4 | 5 | dir = sys.argv[1] 6 | 7 | # read in network 8 | print "* reading network data" 9 | 10 | network = defaultdict(set) 11 | for line in open(dir +'/network.tsv'): 12 | if ',' in line: 13 | a, b = [int(x) for x in line.strip().split(',')] 14 | else: 15 | a, b = [int(x) for x in line.strip().split('\t')] 16 | network[a].add(b) 17 | network[b].add(a) 18 | 19 | print "* reading train, test, and validation data" 20 | 21 | user_items = defaultdict(set) 22 | train = defaultdict(set) 23 | test = defaultdict(set) 24 | valid = defaultdict(set) 25 | all_items = set() 26 | for line in open(dir +'/train.tsv'): 27 | u,i,r = [int(x) for x 
in line.strip().split('\t')] 28 | if len(network[u]) == 0: 29 | continue 30 | user_items[u].add(i) 31 | train[u].add(i) 32 | all_items.add(i) 33 | for line in open(dir +'/test.tsv'): 34 | u,i,r = [int(x) for x in line.strip().split('\t')] 35 | if len(network[u]) == 0: 36 | continue 37 | user_items[u].add(i) 38 | test[u].add(i) 39 | all_items.add(i) 40 | for line in open(dir +'/validation.tsv'): 41 | u,i,r = [int(x) for x in line.strip().split('\t')] 42 | if len(network[u]) == 0: 43 | continue 44 | user_items[u].add(i) 45 | valid[u].add(i) 46 | all_items.add(i) 47 | 48 | print " %d unique items in original data" % len(all_items) 49 | 50 | print "* finding shared" 51 | 52 | # start with those with the fewest number of items 53 | shared_sum = 0 54 | U = 0 55 | for user in sorted(user_items.keys(), key=lambda x: len(user_items[x])): 56 | shared = set() 57 | for friend in network[user]: 58 | for item in user_items[friend]: 59 | if item in user_items[user]: 60 | shared.add(item) 61 | 62 | shared_sum += len(shared)*100.0/ len(user_items[user]) 63 | U += 1 64 | print (shared_sum / U), "% shared" 65 | -------------------------------------------------------------------------------- /scripts/amplify_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | from random import randint, shuffle 4 | 5 | dir = sys.argv[1] 6 | out = sys.argv[2] 7 | per = float(sys.argv[3]) / 100 8 | 9 | # read in network 10 | #print "* reading network data" 11 | 12 | network = defaultdict(set) 13 | for line in open(dir +'/network.tsv'): 14 | if ',' in line: 15 | a, b = [int(x) for x in line.strip().split(',')] 16 | else: 17 | a, b = [int(x) for x in line.strip().split('\t')] 18 | network[a].add(b) 19 | network[b].add(a) 20 | 21 | #print "* reading train, test, and validation data" 22 | 23 | user_items = defaultdict(set) 24 | train = defaultdict(set) 25 | test = defaultdict(set) 26 | valid = defaultdict(set) 
27 | all_items = set() 28 | for line in open(dir +'/train.tsv'): 29 | u,i,r = [int(x) for x in line.strip().split('\t')] 30 | if len(network[u]) == 0: 31 | continue 32 | user_items[u].add(i) 33 | train[u].add(i) 34 | all_items.add(i) 35 | for line in open(dir +'/test.tsv'): 36 | u,i,r = [int(x) for x in line.strip().split('\t')] 37 | if len(network[u]) == 0: 38 | continue 39 | user_items[u].add(i) 40 | test[u].add(i) 41 | all_items.add(i) 42 | for line in open(dir +'/validation.tsv'): 43 | u,i,r = [int(x) for x in line.strip().split('\t')] 44 | if len(network[u]) == 0: 45 | continue 46 | user_items[u].add(i) 47 | valid[u].add(i) 48 | all_items.add(i) 49 | 50 | #print " %d unique items in original data" % len(all_items) 51 | 52 | #print "* exchanging items for each user" 53 | changed = 0 54 | # start with those with the fewest number of items 55 | for user in sorted(user_items.keys(), key=lambda x: len(user_items[x])): 56 | candidates = defaultdict(int) 57 | shared = set() 58 | for friend in network[user]: 59 | for item in user_items[friend]: 60 | if item not in user_items[user]: 61 | candidates[item] += 1 62 | else: 63 | shared.add(item) 64 | 65 | items = list(user_items[user] - set(candidates)) 66 | shuffle(items) 67 | #if len(shared) != 0: 68 | # #print "user", user, "has", (len(shared) * 100.0 / len(user_items[user])), "%% items shared" 69 | for item in items: 70 | if len(shared) * 1.0 / len(user_items[user]) >= per or len(candidates) == 0: 71 | break 72 | 73 | pickv = randint(1, sum(candidates.values())) 74 | for c in candidates: 75 | if pickv <= candidates[c]: 76 | pick = c 77 | break 78 | pickv -= candidates[c] 79 | 80 | shared.add(pick) 81 | changed += 1 82 | 83 | if item in train[user]: 84 | train[user].remove(item) 85 | train[user].add(pick) 86 | if item in test[user]: 87 | test[user].remove(item) 88 | test[user].add(pick) 89 | if item in valid[user]: 90 | valid[user].remove(item) 91 | valid[user].add(pick) 92 | 93 | del candidates[pick] 94 | 
user_items[user].remove(item) 95 | user_items[user].add(pick) 96 | 97 | 98 | #print "user", user, "has", (len(shared) * 100.0 / len(user_items[user])), "%% items shared" 99 | all_items = set() 100 | for user in user_items.keys(): 101 | all_items = all_items | user_items[user] 102 | #print " %d unique items in amplified data" % len(all_items) 103 | print changed, "changed items" 104 | 105 | f = open(out +'/train.tsv', 'w+') 106 | for user in train: 107 | for item in train[user]: 108 | f.write('%d\t%d\t1\n' % (user,item)) 109 | f.close() 110 | #print "* done writing out training data" 111 | 112 | f = open(out +'/test.tsv', 'w+') 113 | for user in test: 114 | for item in test[user]: 115 | f.write('%d\t%d\t1\n' % (user,item)) 116 | f.close() 117 | #print "* done writing out testing data" 118 | 119 | f = open(out +'/validation.tsv', 'w+') 120 | for user in valid: 121 | for item in valid[user]: 122 | f.write('%d\t%d\t1\n' % (user,item)) 123 | f.close() 124 | #print "* done writing out validation data" 125 | -------------------------------------------------------------------------------- /scripts/deamplify_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | from random import randint, shuffle, sample 4 | 5 | dir = sys.argv[1] 6 | out = sys.argv[2] 7 | per = float(sys.argv[3]) / 100 8 | 9 | # read in network 10 | #print "* reading network data" 11 | 12 | network = defaultdict(set) 13 | for line in open(dir +'/network.tsv'): 14 | if ',' in line: 15 | a, b = [int(x) for x in line.strip().split(',')] 16 | else: 17 | a, b = [int(x) for x in line.strip().split('\t')] 18 | network[a].add(b) 19 | network[b].add(a) 20 | 21 | #print "* reading train, test, and validation data" 22 | 23 | user_items = defaultdict(set) 24 | train = defaultdict(set) 25 | test = defaultdict(set) 26 | valid = defaultdict(set) 27 | all_items = set() 28 | for line in open(dir +'/train.tsv'): 29 | u,i,r = 
[int(x) for x in line.strip().split('\t')] 30 | if len(network[u]) == 0: 31 | continue 32 | user_items[u].add(i) 33 | train[u].add(i) 34 | all_items.add(i) 35 | for line in open(dir +'/test.tsv'): 36 | u,i,r = [int(x) for x in line.strip().split('\t')] 37 | if len(network[u]) == 0: 38 | continue 39 | user_items[u].add(i) 40 | test[u].add(i) 41 | all_items.add(i) 42 | for line in open(dir +'/validation.tsv'): 43 | u,i,r = [int(x) for x in line.strip().split('\t')] 44 | if len(network[u]) == 0: 45 | continue 46 | user_items[u].add(i) 47 | valid[u].add(i) 48 | all_items.add(i) 49 | 50 | #print " %d unique items in original data" % len(all_items) 51 | 52 | #print "* exchanging items for each user" 53 | changed = 0 54 | # start with those with the largest number of items 55 | for user in sorted(user_items.keys(), key=lambda x: -len(user_items[x])): 56 | shared = set() 57 | omit = set() 58 | for friend in network[user]: 59 | for item in user_items[friend]: 60 | if item in user_items[user]: 61 | shared.add(item) 62 | else: 63 | omit.add(item) 64 | 65 | candidates = all_items - omit - shared 66 | items = list(user_items[user] - set(candidates)) 67 | shuffle(items) 68 | #if len(shared) != 0: 69 | # print "user", user, "has", (len(shared) * 100.0 / len(user_items[user])), "%% items shared" 70 | for item in items: 71 | if len(shared) * 1.0 / len(user_items[user]) <= per or len(candidates) == 0: 72 | break 73 | 74 | 75 | pick = sample(candidates, 1)[0] 76 | changed += 1 77 | 78 | if item in train[user]: 79 | train[user].remove(item) 80 | train[user].add(pick) 81 | if item in test[user]: 82 | test[user].remove(item) 83 | test[user].add(pick) 84 | if item in valid[user]: 85 | valid[user].remove(item) 86 | valid[user].add(pick) 87 | 88 | candidates.remove(pick) 89 | user_items[user].remove(item) 90 | user_items[user].add(pick) 91 | 92 | 93 | #print "user", user, "has", (len(shared) * 100.0 / len(user_items[user])), "%% items shared" 94 | all_items = set() 95 | for user in 
user_items.keys(): 96 | all_items = all_items | user_items[user] 97 | #print " %d unique items in amplified data" % len(all_items) 98 | print changed, "changed items" 99 | 100 | f = open(out +'/train.tsv', 'w+') 101 | for user in train: 102 | for item in train[user]: 103 | f.write('%d\t%d\t1\n' % (user,item)) 104 | f.close() 105 | #print "* done writing out training data" 106 | 107 | f = open(out +'/test.tsv', 'w+') 108 | for user in test: 109 | for item in test[user]: 110 | f.write('%d\t%d\t1\n' % (user,item)) 111 | f.close() 112 | #print "* done writing out testing data" 113 | 114 | f = open(out +'/validation.tsv', 'w+') 115 | for user in valid: 116 | for item in valid[user]: 117 | f.write('%d\t%d\t1\n' % (user,item)) 118 | f.close() 119 | #print "* done writing out validation data" 120 | -------------------------------------------------------------------------------- /scripts/get_time.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | t = 0 4 | for line in open(sys.argv[1]): 5 | toks = line.split('\t') 6 | if toks[0] == "user" or toks[0] == "sys": 7 | m,s = [float(i) for i in toks[1][:-2].split('m')] 8 | t += m*60 + s 9 | print t 10 | -------------------------------------------------------------------------------- /scripts/process_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import scipy.io 3 | from collections import defaultdict 4 | import os 5 | from os.path import join, exists 6 | import random 7 | 8 | ### command line args 9 | 10 | ratings_file = sys.argv[1] 11 | network_file = sys.argv[2] 12 | output_dir = sys.argv[3] 13 | 14 | splitchar = '\t' 15 | 16 | 17 | ### split math 18 | 19 | train = 89 20 | test = 10 21 | valid = 1 22 | 23 | total = float(train + test + valid) 24 | train /= total 25 | test /= total 26 | valid /= total 27 | 28 | print (train, test, valid) 29 | 30 | random.seed(11) 31 | 32 | 33 | ### read in everything 34 | 35 | 
# Parse the ratings file: one integer "user, item, rating" triple per line.
# user_ratings keeps each user's (item, rating) pairs; ur keeps the set of
# items each user rated (used further down to cull the network).
user_ratings = defaultdict(list)
ur = defaultdict(set)
ratings = open(ratings_file, 'r')
for line in ratings:
    user, item, rating = map(int, line.strip().split(splitchar))
    ur[user].add(item)
    user_ratings[user].append((item, rating))
ratings.close()

# Parse the network file, storing each connection once: a pair is skipped
# when its reverse has already been recorded.
network = set()
trustnetwork = open(network_file, 'r')
for line in trustnetwork:
    user, friend = map(int, line.strip().split(splitchar))
    if (friend, user) in network:
        continue
    network.add((user, friend))
trustnetwork.close()


### write out everything

if not exists(output_dir):
    os.mkdir(output_dir)

train_out = open(join(output_dir, "train.tsv"), 'w+')
valid_out = open(join(output_dir, "validation.tsv"), 'w+')
test_out = open(join(output_dir, "test.tsv"), 'w+')
network_out = open(join(output_dir, "network.tsv"), 'w+')

n_train = 0
n_test = 0
n_valid = 0
for user in user_ratings:
    shuffled = user_ratings[user]
    random.shuffle(shuffled)
    R = len(shuffled)
    test_cut = test * R
    valid_cut = (test + valid) * R
    for position, (item, rating) in enumerate(shuffled):
        # zero ratings are dropped from every split
        if rating == 0:
            continue

        # A position whose unit interval [r, r+1] straddles a cut point is
        # nudged by a uniform draw, so fractional cut points are honoured
        # in expectation rather than always rounded the same way.
        r = position
        if (r < test_cut and r + 1 >= test_cut) or \
                (r < valid_cut and r + 1 >= valid_cut):
            r += random.random()

        record = "%d\t%d\t%d\n" % (user, item, rating)
        if r < test_cut:
            test_out.write(record)
            n_test += 1
        elif r < valid_cut:
            valid_out.write(record)
            n_valid += 1
        else:
            train_out.write(record)
            n_train += 1

# Keep only connections whose two endpoints have at least one item in common.
for user, friend in network:
    if ur[user] & ur[friend]:
        network_out.write("%d\t%d\n" % (user, friend))


for handle in (train_out, valid_out, test_out, network_out):
    handle.close()

# Report the split proportions that were actually achieved.
total = float(n_train + n_test + n_valid)
print (n_train/total, n_test/total, n_valid/total)
"""scripts/process_data_Nusers.py -- split ratings, testing on only N users.

Usage:
    python process_data_Nusers.py [ratings-file] [network-file] [output-dir] [N]

Input files are tab-separated integers: ``user item rating`` for ratings and
``user friend`` for the network.  N users are chosen at random; their ratings
are split 50% train / 49% test / 1% validation.  Every other user's ratings
are split 99% train / 1% validation, with nothing held out for testing.  The
network is culled so that only connections whose two users share at least one
item are written.

Writes train.tsv, test.tsv, validation.tsv and network.tsv to the output
directory, which is created if it does not exist.
"""
import os
import random
import sys
from collections import defaultdict
from os.path import exists, join

SPLITCHAR = '\t'
SEED = 11

# (test, validation) fractions; whatever remains is training data.
HELD_OUT_SPLIT = (49 / 100.0, 1 / 100.0)    # the N randomly chosen users
DEFAULT_SPLIT = (0.0, 1.0 / 100.0)          # everybody else


def read_ratings(path, sep=SPLITCHAR):
    """Read ``user item rating`` lines from ``path``.

    Returns ``(user_ratings, user_items)``: a dict mapping each user to a
    list of ``(item, rating)`` pairs, and a dict mapping each user to the set
    of items they rated.  Blank lines are ignored; malformed lines raise
    ``ValueError``.
    """
    user_ratings = defaultdict(list)
    user_items = defaultdict(set)
    with open(path, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            user, item, rating = [int(x) for x in line.split(sep)]
            user_ratings[user].append((item, rating))
            user_items[user].add(item)
    return user_ratings, user_items


def read_network(path, sep=SPLITCHAR):
    """Read ``user friend`` lines from ``path`` into a set of pairs.

    A pair is skipped when its reverse has already been recorded, so each
    connection is stored once.  Blank lines are ignored.
    """
    network = set()
    with open(path, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            user, friend = [int(x) for x in line.split(sep)]
            if (friend, user) not in network:
                network.add((user, friend))
    return network


def split_bucket(position, count, test_frac, valid_frac):
    """Pick the split for the rating at ``position`` of ``count`` shuffled ratings.

    Positions below ``test_frac * count`` go to ``'test'``, the next
    ``valid_frac * count`` to ``'validation'`` and the rest to ``'train'``.
    A position whose interval ``[position, position + 1]`` straddles a cut
    point is nudged by one ``random.random()`` draw first, so fractional cut
    points are honoured in expectation.
    """
    test_cut = test_frac * count
    valid_cut = (test_frac + valid_frac) * count
    r = position
    if (r < test_cut and not (r + 1) < test_cut) or \
            (r < valid_cut and not (r + 1) < valid_cut):
        r += random.random()
    if r < test_cut:
        return 'test'
    if r < valid_cut:
        return 'validation'
    return 'train'


def main(argv=None):
    """Run the split; ``argv`` defaults to ``sys.argv``.  Returns an exit code."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 5:
        sys.stderr.write("usage: python process_data_Nusers.py "
                         "[ratings-file] [network-file] [output-dir] [N]\n")
        return 1
    ratings_path, network_path, output_dir = argv[1], argv[2], argv[3]
    n_users = int(argv[4])

    test, valid = HELD_OUT_SPLIT
    print(str((1.0 - test - valid, test, valid)))

    random.seed(SEED)

    user_ratings, user_items = read_ratings(ratings_path)
    network = read_network(network_path)

    if not exists(output_dir):
        os.mkdir(output_dir)

    # list() so the shuffle also works on Python 3, where keys() is a view
    all_users = list(user_ratings.keys())
    random.shuffle(all_users)
    test_users = set(all_users[:n_users])
    test_items = set()

    counts = {'train': 0, 'test': 0, 'validation': 0}
    # Context managers guarantee the four outputs are closed on any error.
    with open(join(output_dir, "train.tsv"), 'w+') as train_file, \
            open(join(output_dir, "validation.tsv"), 'w+') as valid_file, \
            open(join(output_dir, "test.tsv"), 'w+') as test_file, \
            open(join(output_dir, "network.tsv"), 'w+') as network_file:
        sinks = {'train': train_file, 'test': test_file,
                 'validation': valid_file}

        for user in user_ratings:
            ratings = user_ratings[user]
            random.shuffle(ratings)
            held_out = user in test_users
            test_frac, valid_frac = HELD_OUT_SPLIT if held_out else DEFAULT_SPLIT
            count = len(ratings)
            for position, (item, rating) in enumerate(ratings):
                # zero ratings are dropped from every split
                if rating == 0:
                    continue
                bucket = split_bucket(position, count, test_frac, valid_frac)
                sinks[bucket].write("%d\t%d\t%d\n" % (user, item, rating))
                counts[bucket] += 1
                if held_out and bucket == 'test':
                    test_items.add(item)

        # keep only connections whose users have an item in common
        for user, friend in network:
            if user_items[user] & user_items[friend]:
                network_file.write("%d\t%d\n" % (user, friend))

    # "or 1.0" avoids a ZeroDivisionError when no rating was written
    total = float(sum(counts.values())) or 1.0
    print(str((counts['train'] / total, counts['test'] / total,
               counts['validation'] / total)))
    print("%d x %d = %d" % (len(test_users), len(test_items),
                            len(test_users) * len(test_items)))
    return 0


if __name__ == '__main__':
    sys.exit(main())
-------------------------------------------------------------------------------- /scripts/process_time_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import scipy.io 3 | from collections import defaultdict 4 | import os 5 | from os.path import join, exists 6 | import random 7 | 8 | ### command line args 9 | 10 | ratings_file = sys.argv[1] 11 | network_file = sys.argv[2] 12 | output_dir = sys.argv[3] 13 | 14 | splitchar = '\t' 15 | 16 | 17 | ### split math 18 | 19 | train = 89 20 | test = 10 21 | valid = 1 22 | 23 | total = float(train + test + valid) 24 | train /= total 25 | test /= total 26 | valid /= total 27 | 28 | print (train, test, valid) 29 | 30 | random.seed(11) 31 | 32 | 33 | ### read in everything 34 | 35 | ratings = open(ratings_file, 'r') 36 | user_ratings = defaultdict(list) 37 | ur = defaultdict(set) 38 | pop = defaultdict(int) 39 | times = defaultdict(int) 40 | total = 0 41 | for line in ratings: 42 | user, item, time = [int(x) for x in line.strip().split(splitchar)] 43 | user_ratings[user].append((item, time)) 44 | times[time] += 1 45 | ur[user].add(item) 46 | pop[item] += 1 47 | total += 1 48 | ratings.close() 49 | 50 | trustnetwork = open(network_file, 'r') 51 | network = set() 52 | for line in trustnetwork: 53 | user, friend = [int(x) for x in line.strip().split(splitchar)] 54 | if (friend, user) not in network: 55 | network.add((user, friend)) 56 | trustnetwork.close() 57 | 58 | 59 | ### write out everything 60 | 61 | if not exists(output_dir): 62 | os.mkdir(output_dir) 63 | 64 | validation_start = 0 65 | test_start = 0 66 | cur = 0.0 67 | for time in sorted(times.keys(),): 68 | cur += times[time] 69 | if cur / total >= train and validation_start == 0: 70 | validation_start = time 71 | continue 72 | elif cur / total >= train + valid and test_start == 0 and time != validation_start: 73 | test_start = time 74 | break 75 | print validation_start, test_start 76 | train_file = 
open(join(output_dir, "train.tsv"), 'w+') 77 | valid_file = open(join(output_dir, "validation.tsv"), 'w+') 78 | test_file = open(join(output_dir, "test.tsv"), 'w+') 79 | network_file = open(join(output_dir, "network.tsv"), 'w+') 80 | 81 | a = 0 82 | b = 0 83 | c = 0 84 | uinc = set() 85 | for user in user_ratings: 86 | ratings = user_ratings[user] 87 | R = len(ratings) 88 | for i in range(R): 89 | item, time = ratings[i] 90 | tr = 0 91 | va = 0 92 | te = 0 93 | if time < validation_start: 94 | tr += 1 95 | elif time >= validation_start and time < test_start: 96 | va += 1 97 | else: 98 | te += 1 99 | 100 | if tr < 1: # user must have at least one training item 101 | print "user %d has no training items" % user 102 | continue 103 | uinc.add(user) 104 | 105 | for i in range(R): 106 | item, time = ratings[i] 107 | if pop[item] < 2: 108 | continue 109 | if time < validation_start: 110 | train_file.write("%d\t%d\t1\n" % (user, item)) 111 | a += 1 112 | elif time >= validation_start and time < test_start: 113 | valid_file.write("%d\t%d\t1\n" % (user, item)) 114 | c += 1 115 | else: 116 | test_file.write("%d\t%d\t1\n" % (user, item)) 117 | b += 1 118 | 119 | for user, friend in network: 120 | if user not in uinc or friend not in uinc: 121 | continue 122 | if len(ur[user] & ur[friend]) != 0: 123 | network_file.write("%d\t%d\n" % (user, friend)) 124 | 125 | 126 | train_file.close() 127 | valid_file.close() 128 | test_file.close() 129 | network_file.close() 130 | 131 | total = float(a + b + c) 132 | print a, b, c, total 133 | print (a/total, b/total, c/total) 134 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -ne 0 ]; then 4 | echo "usage: ./setup.sh" 5 | exit 6 | fi 7 | 8 | echo "compiling SPF and popularity baseline code" 9 | cd ../src 10 | make clean 11 | make 12 | make pop 13 | 14 | echo "setting up 
Chong's MF code (from CTR)" 15 | echo " getting src" 16 | rm -rf ctr 17 | mkdir ctr 18 | cd ctr 19 | wget http://www.cs.cmu.edu/~chongw/software/ctr.tar.gz 20 | tar -xvzf ctr.tar.gz 21 | cd ../ 22 | 23 | echo " compiling" 24 | cd ctr; make; cd ../ 25 | make mf 26 | 27 | echo "" 28 | echo "downloading LibRec" 29 | rm -rf librec 30 | mkdir librec; cd librec 31 | wget http://www.librec.net/release/librec-v1.2.zip 32 | unzip librec-v1.2.zip 33 | cd ../ 34 | 35 | echo "compiling LibRec eval code" 36 | make librec_eval 37 | 38 | echo "all done!" 39 | -------------------------------------------------------------------------------- /scripts/sim_data.sh: -------------------------------------------------------------------------------- 1 | data=$1 2 | #thresh=$2 3 | 4 | mkdir $1/amp 5 | for ((i=0; i <= 100 ; i=i+10)); do 6 | mkdir $1/amp/$i 7 | echo "(de)amp $i" 8 | python adjust_amplification.py $1 $1/amp/$i $i 9 | #if [ $i -lt $thresh ]; then 10 | # echo "deamp $i" 11 | # python deamplify_data.py $1 $1/amp/$i $i 12 | #else 13 | # echo "amp $i" 14 | # python amplify_data.py $1 $1/amp/$i $i 15 | #fi 16 | ln -s $1/network.tsv $1/amp/$i/network.tsv 17 | done 18 | -------------------------------------------------------------------------------- /scripts/study.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -ne 4 ]; then 4 | echo "usage: ./study [data-dir] [output-dir] [K] [directed/undirected]" 5 | exit 6 | fi 7 | 8 | 9 | datadir=$(readlink -f $1) 10 | outdir=$(readlink -f $2) 11 | K=$3 12 | directed=$4 13 | iter=100 14 | 15 | echo "creating directory structure" 16 | if [ -d $outdir ]; then 17 | rm -rf $outdir 18 | fi 19 | mkdir $outdir 20 | 21 | mkdir $outdir/spf 22 | mkdir $outdir/pf 23 | mkdir $outdir/sf 24 | mkdir $outdir/pop 25 | mkdir $outdir/rand 26 | 27 | seed=948237247 28 | 29 | cd ../src 30 | 31 | echo " * initializing study of main model (this will launch multiple processes" 32 | echo " that will 
continue living after this bash script has completed)" 33 | # frequencies and iteration bounds passed to every ./spf run below 34 | convf=100 35 | savef=1000 36 | mini=100 37 | maxi=1000 38 | 39 | if [ "$directed" = "directed" ]; then 40 | # directed 41 | (time (./spf --data $datadir --out $outdir/spf --directed --svi --K $K --seed $seed --save_freq $savef --conv_freq $convf --min_iter $mini --max_iter $maxi --final_pass > $outdir/spf.out 2> $outdir/spf.err) > $outdir/spf.time.out 2> $outdir/spf.time.err &) 42 | (time (./spf --data $datadir --out $outdir/pf --directed --svi --K $K --seed $seed --save_freq $savef --conv_freq $convf --factor_only --min_iter $mini --max_iter $maxi --final_pass > $outdir/pf.out 2> $outdir/pf.err) > $outdir/pf.time.out 2> $outdir/pf.time.err &) 43 | (time (./spf --data $datadir --out $outdir/sf --directed --svi --K $K --seed $seed --save_freq $savef --conv_freq $convf --social_only --min_iter $mini --max_iter $maxi --final_pass > $outdir/sf.out 2> $outdir/sf.err) > $outdir/sf.time.out 2> $outdir/sf.time.err &) 44 | else 45 | # undirected (same flags as the directed branch, without --directed) 46 | (time (./spf --data $datadir --out $outdir/spf --svi --K $K --seed $seed --save_freq $savef --conv_freq $convf --min_iter $mini --max_iter $maxi --final_pass > $outdir/spf.out 2> $outdir/spf.err) > $outdir/spf.time.out 2> $outdir/spf.time.err &) 47 | (time (./spf --data $datadir --out $outdir/pf --svi --K $K --seed $seed --save_freq $savef --conv_freq $convf --factor_only --min_iter $mini --max_iter $maxi --final_pass > $outdir/pf.out 2> $outdir/pf.err) > $outdir/pf.time.out 2> $outdir/pf.time.err &) 48 | (time (./spf --data $datadir --out $outdir/sf --svi --K $K --seed $seed --save_freq $savef --conv_freq $convf --social_only --min_iter $mini --max_iter $maxi --final_pass > $outdir/sf.out 2> $outdir/sf.err) > $outdir/sf.time.out 2> $outdir/sf.time.err &) 49 | fi 50 | 51 | (time (./pop --data $datadir --out $outdir/pop > $outdir/pop.out 2> $outdir/pop.err) > $outdir/pop.time.out 2> $outdir/pop.time.err &) 52 | (time (./rand --data $datadir --out $outdir/rand >
$outdir/rand.out 2> $outdir/rand.err) > $outdir/rand.time.out 2> $outdir/rand.time.err &) 53 | 54 | 55 | 56 | echo " * reformatting input for MF comparisons" 57 | python ../scripts/to_list_form.py $datadir 58 | if [ "$directed" = "directed" ]; then 59 | # directed 60 | python ../scripts/to_sorec_list_form.py $datadir 61 | else 62 | # undirected 63 | python ../scripts/to_sorec_list_form.py $datadir undir 64 | fi 65 | 66 | echo " * fitting MF comparisons" 67 | mkdir $outdir/MF 68 | mkdir $outdir/SoRec 69 | time (./ctr/ctr --directory $outdir/MF --user $datadir/users.dat --item $datadir/items.dat --num_factors $K --b 1 --random_seed $seed) > $outdir/MF.time.out 2> $outdir.MF.time.err 70 | time (./ctr/ctr --directory $outdir/SoRec --user $datadir/users_sorec.dat --item $datadir/items_sorec.dat --num_factors $K --b 1 --random_seed $seed) > $outdir/SoRec-ctr.time.out 2> $outdir/SoRec-ctr.time.err 71 | 72 | echo " * evaluating MF comparisons" 73 | make mf 74 | time (./mf --data $datadir --out $outdir/MF --K $K) >$outdir.MF.eval.time.out 2>$outdir.MF.eval.time.err 75 | time (./mf --data $datadir --out $outdir/SoRec --K $K) >$outdir.SoRec-ctr.eval.time.out 2>$outdir.SoRec-ctr.eval.time.err 76 | 77 | mv $outdir/SoRec $outdir/SoRec-ctr 78 | 79 | echo "" 80 | 81 | echo "" 82 | echo " * getting data ready for librec comparisons" 83 | if [ "$directed" = "directed" ]; then 84 | # directed 85 | python ../scripts/to_librec_form.py $datadir 86 | else 87 | # undirected 88 | python ../scripts/to_librec_form.py $datadir undir 89 | fi 90 | 91 | echo " * fitting librec comparisons" 92 | 93 | #for model in SoRec SocialMF TrustMF SoReg RSTE PMF TrustSVD BiasedMF "SVD++" 94 | numtest=1 95 | for model in SoRec SocialMF TrustMF RSTE TrustSVD 96 | do 97 | rm $outdir/$model/ratings.dat 98 | for testidx in $(seq -f "%02g" 1 $numtest) 99 | do 100 | echo -e "$model\t(test section $testidx)" 101 | echo "dataset.training.lins=$datadir/ratings.dat" > tmp 102 | echo 
"dataset.social.lins=$datadir/network.dat" >> tmp 103 | echo "dataset.testing.lins=$datadir/test-$testidx.dat" >> tmp 104 | echo "recommender=$model" >> tmp 105 | echo "num.factors=$K" >> tmp 106 | echo "num.max.iter=$iter" >> tmp 107 | 108 | if [ "$model" = "TrustSVD" ]; then 109 | echo "val.reg.social=0.5" >> tmp 110 | echo "val.learn.rate=0.001" >> tmp 111 | else 112 | echo "val.reg.social=1.0" >> tmp 113 | echo "val.learn.rate=0.01" >> tmp 114 | fi 115 | 116 | cat tmp ../conf/base.conf > ../conf/tmp.conf 117 | echo "" 118 | time java -jar librec/librec.jar -c ../conf/tmp.conf 2> $outdir/$model.fit.time.err 119 | mkdir $outdir/$model 120 | tail -n +2 Results/$model*prediction.txt >> $outdir/$model/ratings.dat 121 | 122 | done 123 | 124 | LINECOUNT=`wc -l $outdir/$model/ratings.dat | cut -f1 -d' '` 125 | 126 | if [[ $LINECOUNT != 0 ]]; then 127 | time ./librec_eval --data $datadir --out $outdir/$model 2> $outdir/$model.eval.time.err 128 | fi 129 | done 130 | 131 | echo "all done!" 132 | -------------------------------------------------------------------------------- /scripts/to_librec_form.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | import random 4 | 5 | random.seed(11) 6 | 7 | 8 | path = sys.argv[1] 9 | undir = (len(sys.argv) == 3) 10 | 11 | users = set() 12 | items = set() 13 | fout = open(path +'/ratings.dat', 'w+') 14 | user_items = defaultdict(set) 15 | user_counts = defaultdict(int) 16 | for line in open(path + '/train.tsv'): 17 | user, item, rating = [int(x.strip()) for x in line.split('\t')] 18 | fout.write("%d\t%d\t%d\n" % (user, item, rating)) 19 | items.add(item) 20 | users.add(user) 21 | user_items[user].add(item) 22 | user_counts[user] += 1 23 | for line in open(path + '/validation.tsv'): 24 | user, item, rating = [int(x.strip()) for x in line.split('\t')] 25 | if user in users and item in items: 26 | fout.write("%d\t%d\t%d\n" % (user, item, rating)) 
27 | user_items[user].add(item) 28 | user_counts[user] += 1 29 | test_users = set() 30 | test_items = set() 31 | ratings = dict() 32 | for line in open(path + '/test.tsv'): 33 | user, item, rating = [int(x.strip()) for x in line.split('\t')] 34 | if user in users and item in items: 35 | test_users.add(user) 36 | test_items.add(item) 37 | ratings[(user,item)] = rating 38 | user_items[user].add(item) 39 | 40 | test_max = 60000000 41 | test_index = 1 42 | test_count = 0 43 | fout_test = open(path +'/test-%02d.dat' % test_index, 'w+') 44 | 45 | print "starting test batch 1" 46 | for user in test_users: 47 | all_items = list(items) 48 | random.shuffle(all_items) 49 | while user_counts[user] > 0 and len(all_items) != 0: 50 | item = all_items.pop() 51 | if item not in user_items: 52 | fout.write("%d\t%d\t0\n" % (user, item)) 53 | user_counts[user] -= 1 54 | 55 | for item in test_items: 56 | if (user,item) in ratings: 57 | fout_test.write("%d\t%d\t%d\n" % (user, item, rating)) 58 | else: 59 | fout_test.write("%d\t%d\t0\n" % (user, item)) 60 | test_count += 1 61 | 62 | if test_count > test_max: 63 | fout_test.close() 64 | test_index += 1 65 | fout_test = open(path +'/test-%02d.dat' % test_index, 'w+') 66 | print "starting test batch %d" % test_index 67 | test_count = 0 68 | 69 | fout.close() 70 | fout_test.close() 71 | 72 | fout = open(path +'/network.dat', 'w+') 73 | for line in open(path + '/network.tsv'): 74 | user, friend = [int(x.strip()) for x in line.split('\t')] 75 | if user in users and friend in users: 76 | fout.write("%d\t%d\t1\n" % (user, friend)) 77 | if undir: 78 | fout.write("%d\t%d\t1\n" % (friend, user)) 79 | fout.close() 80 | -------------------------------------------------------------------------------- /scripts/to_list_form.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | 4 | path = sys.argv[1] 5 | 6 | fin = open(path +'/train.tsv') 7 | fout_items = open(path 
+'/items.dat', 'w+') 8 | fout_users = open(path +'/users.dat', 'w+') 9 | 10 | users = defaultdict(list) 11 | items = defaultdict(list) 12 | for line in fin: 13 | user, item, rating = [int(x.strip()) for x in line.split('\t')] 14 | users[user].append(item) 15 | items[item].append(user) 16 | 17 | umap = {} 18 | imap = {} 19 | fmap_items = open(path +'/item_map.dat', 'w+') 20 | fmap_users = open(path +'/user_map.dat', 'w+') 21 | for user in users: 22 | umap[user] = len(umap) 23 | fmap_users.write("%d,%d\n" % (user, umap[user])) 24 | for item in items: 25 | imap[item] = len(imap) 26 | fmap_items.write("%d,%d\n" % (item, imap[item])) 27 | fmap_users.close() 28 | fmap_items.close() 29 | 30 | for user in sorted(users, key=lambda x: umap[x]): 31 | line = str(len(users[user])) 32 | for item in users[user]: 33 | line += ' ' + str(imap[item]) 34 | fout_users.write(line + '\n') 35 | 36 | for item in sorted(items, key=lambda x: imap[x]): 37 | line = str(len(items[item])) 38 | for user in items[item]: 39 | line += ' ' + str(umap[user]) 40 | fout_items.write(line + '\n') 41 | 42 | fout_items.close() 43 | fout_users.close() 44 | 45 | -------------------------------------------------------------------------------- /scripts/to_sorec_list_form.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | 4 | path = sys.argv[1] 5 | undir = (len(sys.argv) == 3) 6 | 7 | fin = open(path +'/train.tsv') 8 | finn = open(path +'/network.tsv') 9 | fout_items = open(path +'/items_sorec.dat', 'w+') 10 | fout_users = open(path +'/users_sorec.dat', 'w+') 11 | 12 | users = defaultdict(list) 13 | items = set() 14 | for line in fin: 15 | user, item, rating = [int(x.strip()) for x in line.split('\t')] 16 | users[user].append(item) 17 | items.add(item) 18 | 19 | 20 | umap = {} 21 | imap = {} 22 | fmap_items = open(path +'/item_map_sorec.dat', 'w+') 23 | fmap_users = open(path +'/user_map_sorec.dat', 'w+') # this should 
be the same 24 | for user in users: 25 | umap[user] = len(umap) 26 | fmap_users.write("%d,%d\n" % (user, umap[user])) 27 | for item in items: 28 | imap[item] = len(imap) 29 | fmap_items.write("%d,%d\n" % (item, imap[item])) 30 | fmap_users.close() 31 | fmap_items.close() 32 | 33 | 34 | user_data = defaultdict(list) 35 | item_data = defaultdict(list) 36 | for user in users: 37 | for item in users[user]: 38 | user_data[umap[user]].append(imap[item]) 39 | item_data[imap[item]].append(umap[user]) 40 | 41 | for line in finn: 42 | user, friend = [int(x.strip()) for x in line.split('\t')] 43 | if user not in umap or friend not in umap: 44 | continue 45 | 46 | user_data[umap[user]].append(len(imap) + umap[friend]) 47 | item_data[len(imap) + umap[friend]].append(umap[user]) 48 | 49 | # undirected 50 | if undir: 51 | user_data[umap[friend]].append(len(imap) + umap[user]) 52 | item_data[len(imap) + umap[user]].append(umap[friend]) 53 | 54 | for item in sorted(items, key=lambda x: imap[x]): 55 | line = str(len(item_data[imap[item]])) 56 | for user in item_data[imap[item]]: 57 | line += ' ' + str(user) 58 | fout_items.write(line + '\n') 59 | 60 | for user in sorted(users, key=lambda x: umap[x]): 61 | line = str(len(user_data[umap[user]])) 62 | for item in user_data[umap[user]]: 63 | line += ' ' + str(item) 64 | fout_users.write(line + '\n') 65 | 66 | line = str(len(item_data[len(imap) + umap[user]])) 67 | for user in item_data[len(imap) + umap[user]]: 68 | line += ' ' + str(user) 69 | fout_items.write(line + '\n') 70 | 71 | fout_items.close() 72 | fout_users.close() 73 | 74 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ -O3 -larmadillo -lgsl -Wall -static-libstdc++ 2 | 3 | LSOURCE = main.cpp utils.cpp data.cpp spf.cpp eval.cpp 4 | CSOURCE = utils.cpp data.cpp eval.cpp 5 | 6 | 7 | # main model 8 | spf: $(LSOURCE) 9 | $(CC) $(LSOURCE) -o 
spf 10 | 11 | profile: $(LSOURCE) 12 | $(CC) $(LSOURCE) -o spf -pg 13 | 14 | 15 | # comparison methods 16 | pop: popularity.cpp $(CSOURCE) 17 | $(CC) popularity.cpp $(CSOURCE) -o pop 18 | 19 | rand: random.cpp $(CSOURCE) 20 | $(CC) random.cpp $(CSOURCE) -o rand 21 | 22 | mf: mf.cpp $(CSOURCE) 23 | $(CC) mf.cpp $(CSOURCE) -o mf 24 | 25 | librec_eval: librec.cpp $(CSOURCE) 26 | $(CC) librec.cpp $(CSOURCE) -o librec_eval 27 | 28 | 29 | # cleanup 30 | clean: 31 | -rm -f spf pop mf librec_eval rand 32 | -------------------------------------------------------------------------------- /src/data.cpp: -------------------------------------------------------------------------------- 1 | #include "data.h" 2 | 3 | Data::Data(bool bin, bool dir) { 4 | binary = bin; 5 | directed = dir; 6 | has_network = false; 7 | } 8 | 9 | void Data::read_ratings(string filename) { 10 | // read in training data 11 | FILE* fileptr = fopen(filename.c_str(), "r"); 12 | mean_rating = 0; 13 | 14 | int user, item, rating; 15 | set dupe_checker; 16 | while ((fscanf(fileptr, "%d\t%d\t%d\n", &user, &item, &rating) != EOF)) { 17 | // look for duplicate entries; this is not a perfect check, but it's ok 18 | unsigned long long dupe_id = item * 1000000000 + user * 100 + rating; 19 | if (dupe_checker.count(dupe_id) != 0) 20 | continue; 21 | dupe_checker.insert(dupe_id); 22 | 23 | // map user and item ids 24 | if (user_ids.count(user) == 0) { 25 | user_ids[user] = user_count() - 1; 26 | reverse_user_ids[user_ids[user]] = user; 27 | user_ave_ratings[user] = 0; 28 | } 29 | if (item_ids.count(item) == 0) { 30 | item_ids[item] = item_count() - 1; 31 | reverse_item_ids[item_ids[item]] = item; 32 | item_popularity[item_ids[item]] = 0; 33 | item_ave_ratings[item] = 0; 34 | } 35 | 36 | if (rating != 0) { 37 | train_users.push_back(user_ids[user]); 38 | train_items.push_back(item_ids[item]); 39 | train_ratings.push_back(binary ? 
1 : rating); 40 | item_popularity[item_ids[item]] += 1; 41 | 42 | user_ave_ratings[user] += rating; 43 | item_ave_ratings[item] += rating; 44 | mean_rating += rating; 45 | } 46 | } 47 | fclose(fileptr); 48 | 49 | umat locations = umat(2, num_training()); 50 | fcolvec values = fcolvec(num_training()); 51 | user_items = new vector[user_count()]; 52 | for (int i = 0; i < num_training(); i++) { 53 | locations(0,i) = train_users[i]; // row 54 | locations(1,i) = train_items[i]; // col 55 | values(i) = train_ratings[i]; 56 | user_items[train_users[i]].push_back(train_items[i]); 57 | } 58 | ratings = sp_fmat(locations, values, user_count(), item_count()); 59 | 60 | mean_rating /= num_training(); 61 | for (int user = 0; user < user_count(); user++) { 62 | user_ave_ratings[reverse_user_ids[user]] /= user_items[user].size(); 63 | } 64 | for (int item = 0; item < item_count(); item++) { 65 | item_ave_ratings[reverse_item_ids[item]] /= item_popularity[item]; 66 | } 67 | } 68 | 69 | void Data::read_network(string filename) { 70 | // initialize network data structures 71 | network = new vector[user_count()]; 72 | has_network = true; 73 | 74 | // read in network data from file 75 | FILE* fileptr = fopen(filename.c_str(), "r"); 76 | 77 | int user, neighbor, u, n; 78 | int network_count = 0; 79 | while ((fscanf(fileptr, "%d\t%d\n", &user, &neighbor) != EOF)) { 80 | // skip connections in which either user or neighbor is seen in training 81 | if (user_ids.count(user) == 0 || user_ids.count(neighbor) == 0) 82 | continue; 83 | 84 | u = user_ids[user]; 85 | n = user_ids[neighbor]; 86 | 87 | if (!has_connection_init(u, n)) { 88 | network[u].push_back(n); 89 | network_count++; 90 | } 91 | if (!directed && !has_connection_init(n, u)) { 92 | network[n].push_back(u); 93 | network_count++; 94 | } 95 | } 96 | 97 | fclose(fileptr); 98 | 99 | umat locations = umat(2, network_count); 100 | fcolvec values = fcolvec(network_count); 101 | network_count = 0; 102 | for (user = 0; user < user_count(); 
user++) { 103 | for (n = 0; n < neighbor_count(user); n++) { 104 | neighbor = get_neighbor(user, n); 105 | 106 | locations(0, network_count) = neighbor; // row 107 | locations(1, network_count) = user; // col 108 | values(network_count) = 1; 109 | network_count++; 110 | } 111 | } 112 | 113 | network_spmat = sp_fmat(locations, values, user_count(), user_count()); 114 | 115 | } 116 | 117 | void Data::read_validation(string filename) { 118 | // read in validation data 119 | FILE* fileptr = fopen(filename.c_str(), "r"); 120 | 121 | int user, item, rating; 122 | set dupe_checker; 123 | while ((fscanf(fileptr, "%d\t%d\t%d\n", &user, &item, &rating) != EOF)) { 124 | // look for duplicate entries; this is not a perfect check, but it's ok 125 | long dupe_id = item * 100000 + user * 100 + rating; 126 | if (dupe_checker.count(dupe_id) != 0) 127 | continue; 128 | dupe_checker.insert(dupe_id); 129 | 130 | // map user and item ids 131 | if (user_ids.count(user) == 0 || item_ids.count(item) == 0) 132 | continue; 133 | 134 | validation_users.push_back(user_ids[user]); 135 | validation_items.push_back(item_ids[item]); 136 | if (binary) 137 | validation_ratings.push_back(rating != 0 ? 1 : 0); 138 | else 139 | validation_ratings.push_back(binary ? 
1 : rating); 140 | //TODO: all evaluation assumes no zero held out (see above lines for problem) 141 | } 142 | 143 | fclose(fileptr); 144 | 145 | umat locations = umat(2, num_validation()); 146 | fcolvec values = fcolvec(num_validation()); 147 | for (int i = 0; i < num_validation(); i++) { 148 | locations(0, i) = validation_users[i]; 149 | locations(1, i) = validation_items[i]; 150 | values(i) = validation_ratings[i]; 151 | } 152 | 153 | validation_ratings_matrix = sp_fmat(locations, values, user_count(), item_count()); 154 | } 155 | 156 | void Data::read_test(string filename) { 157 | // read in test data 158 | FILE* fileptr = fopen(filename.c_str(), "r"); 159 | 160 | int user, item, rating, u, i; 161 | test_ratings = sp_umat(user_count(), item_count()); 162 | while ((fscanf(fileptr, "%d\t%d\t%d\n", &user, &item, &rating) != EOF)) { 163 | // map user and item ids 164 | if (user_ids.count(user) == 0 || item_ids.count(item) == 0) 165 | continue; 166 | u = user_ids[user]; 167 | i = item_ids[item]; 168 | if (ratings(u, i) != 0 || validation_ratings_matrix(u, i) != 0) 169 | continue; 170 | 171 | if (binary) 172 | rating = rating != 0 ? 
1: 0; 173 | 174 | test_users.insert(u); 175 | test_items.insert(i); 176 | 177 | test_ratings(u, i) = rating; 178 | test_count[u]++; 179 | test_count_item[i]++; 180 | test_count[-1]++; 181 | } 182 | 183 | fclose(fileptr); 184 | } 185 | 186 | void Data::save_summary(string filename) { 187 | FILE* file = fopen(filename.c_str(), "w"); 188 | 189 | fprintf(file, "num users:\t%d\n", user_count()); 190 | fprintf(file, "num items:\t%d\n", item_count()); 191 | fprintf(file, "num ratings:\t%d\t%d\t%d\n", num_training(), num_validation(), num_test()); 192 | 193 | if (has_network) { 194 | int nc = 0; 195 | for (int user = 0; user < user_count(); user++) 196 | nc += network[user].size(); 197 | if (directed) { 198 | fprintf(file, "network connections:\t%d directed\n", nc); 199 | } else { 200 | fprintf(file, "network connections:\t%d undirected\n", (nc/2)); 201 | } 202 | } 203 | fclose(file); // always close (and flush) the summary, even when no network was loaded 204 | } 205 | 206 | int Data::user_count() { 207 | return user_ids.size(); 208 | } 209 | 210 | bool Data::has_connection_init(int user, int neighbor) { 211 | for (unsigned int i = 0; i < network[user].size(); i++) { 212 | if (network[user][i] == neighbor) 213 | return true; 214 | } 215 | return false; 216 | } 217 | 218 | bool Data::has_connection(int user, int neighbor) { 219 | if (network_spmat(neighbor, user) == 1) 220 | return true; 221 | return false; 222 | } 223 | 224 | int Data::item_count() { 225 | return item_ids.size(); 226 | } 227 | 228 | int Data::neighbor_count(int user) { 229 | return network[user].size(); 230 | } 231 | 232 | int Data::get_neighbor(int user, int n) { 233 | return network[user][n]; 234 | } 235 | 236 | int Data::connectivity(int user) { 237 | int connections = 0; 238 | unsigned int i, j; 239 | for (i = 0; i < network[user].size(); i++) { 240 | for (j = 0; j < network[user].size(); j++) { 241 | if (has_connection(network[user][i], network[user][j])) 242 | connections++; 243 | } 244 | } 245 | return connections; 246 | } 247 | 248 | int Data::item_count(int user) {
249 | return user_items[user].size(); 250 | } 251 | 252 | int Data::get_item(int user, int i) { 253 | return user_items[user][i]; 254 | } 255 | 256 | int Data::user_id(int user) { 257 | return reverse_user_ids[user]; 258 | } 259 | 260 | int Data::item_id(int item) { 261 | return reverse_item_ids[item]; 262 | } 263 | 264 | int Data::popularity(int item) { 265 | return item_popularity[item]; 266 | } 267 | 268 | float Data::ave_rating() { 269 | return mean_rating; 270 | } 271 | 272 | float Data::item_ave_rating(int item) { 273 | return item_ave_ratings[item]; 274 | } 275 | 276 | float Data::user_ave_rating(int user) { 277 | return user_ave_ratings[user]; 278 | } 279 | 280 | // training data 281 | int Data::num_training() { 282 | return train_ratings.size(); 283 | } 284 | 285 | int Data::get_train_user(int i) { 286 | return train_users[i]; 287 | } 288 | 289 | int Data::get_train_item(int i) { 290 | return train_items[i]; 291 | } 292 | 293 | int Data::get_train_rating(int i) { 294 | return train_ratings[i]; 295 | } 296 | 297 | // validation data 298 | int Data::num_validation() { 299 | return validation_ratings.size(); 300 | } 301 | 302 | int Data::get_validation_user(int i) { 303 | return validation_users[i]; 304 | } 305 | 306 | int Data::get_validation_item(int i) { 307 | return validation_items[i]; 308 | } 309 | 310 | int Data::get_validation_rating(int i) { 311 | return validation_ratings[i]; 312 | } 313 | 314 | bool Data::in_validation(int user, int item) { 315 | return validation_ratings_matrix(user, item) != 0; 316 | } 317 | 318 | // test data 319 | int Data::num_test() { 320 | return test_count[-1]; 321 | } 322 | 323 | int Data::num_test(int user) { 324 | return test_count[user]; 325 | } 326 | 327 | int Data::num_test_item(int item) { 328 | return test_count_item[item]; 329 | } 330 | 331 | -------------------------------------------------------------------------------- /src/data.h: -------------------------------------------------------------------------------- 
1 | #ifndef DATA_H 2 | #define DATA_H 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define ARMA_64BIT_WORD 12 | #include 13 | 14 | using namespace std; 15 | using namespace arma; 16 | 17 | class Data { 18 | private: 19 | bool binary; 20 | bool directed; 21 | bool has_network; 22 | 23 | map user_ids; 24 | map item_ids; 25 | map reverse_user_ids; 26 | map reverse_item_ids; 27 | 28 | vector* network; 29 | vector* user_items; 30 | 31 | map item_popularity; 32 | 33 | // training data 34 | vector train_users; 35 | vector train_items; 36 | vector train_ratings; 37 | 38 | // validation data 39 | vector validation_users; 40 | vector validation_items; 41 | vector validation_ratings; 42 | sp_fmat validation_ratings_matrix; 43 | 44 | // test data 45 | map test_count; 46 | map test_count_item; 47 | 48 | // for use in initializing the network data structures only 49 | bool has_connection_init(int user, int neighbor); 50 | 51 | // simple summaries 52 | float mean_rating; 53 | map item_ave_ratings; 54 | map user_ave_ratings; 55 | 56 | public: 57 | sp_fmat ratings; 58 | sp_fmat network_spmat; 59 | 60 | Data(bool bin, bool dir); 61 | void read_ratings(string filename); 62 | void read_network(string filename); 63 | void read_validation(string filename); 64 | void read_test(string filename); 65 | void save_summary(string filename); 66 | 67 | int user_count(); 68 | int item_count(); 69 | 70 | int neighbor_count(int user); 71 | int get_neighbor(int user, int n); 72 | 73 | int connectivity(int user); 74 | 75 | int item_count(int user); 76 | int get_item(int user, int i); 77 | 78 | bool has_connection(int user, int neighbor); 79 | 80 | int user_id(int user); 81 | int item_id(int item); 82 | 83 | int popularity(int item); 84 | float ave_rating(); 85 | float item_ave_rating(int item); 86 | float user_ave_rating(int user); 87 | 88 | // training data 89 | int num_training(); 90 | int get_train_user(int i); 91 | int get_train_item(int i); 92 | int 
get_train_rating(int i); 93 | 94 | // validation data 95 | int num_validation(); 96 | int get_validation_user(int i); 97 | int get_validation_item(int i); 98 | int get_validation_rating(int i); 99 | bool in_validation(int user, int item); 100 | 101 | // test data 102 | set test_users; 103 | set test_items; 104 | sp_umat test_ratings; 105 | int num_test(); 106 | int num_test(int user); 107 | int num_test_item(int item); 108 | }; 109 | 110 | #endif 111 | -------------------------------------------------------------------------------- /src/eval.cpp: -------------------------------------------------------------------------------- 1 | #include "eval.h" 2 | 3 | // random generator to break ties 4 | gsl_rng* rand_gen = gsl_rng_alloc(gsl_rng_taus); 5 | 6 | // helper function to write out per-user info 7 | void log_user(FILE* file, Data *data, int user, int heldout, double rmse, double mae, 8 | double rank, int first, double crr, double ncrr, double ndcg, bool stats) { 9 | if (stats) 10 | fprintf(file, "%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", user, 11 | data->user_id(user), heldout, data->item_count(user), 12 | data->neighbor_count(user), data->connectivity(user), 13 | rmse, mae, rank, first, crr, ncrr, ndcg); 14 | else 15 | fprintf(file, "%d\t%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", user, 16 | data->user_id(user), 17 | rmse, mae, rank, first, crr, ncrr, ndcg); 18 | return; 19 | } 20 | 21 | void log_item(FILE* file, Data *data, int item, int heldout, double rmse, double mae, 22 | double rank, int first, double crr, double ncrr, double ndcg, bool stats) { 23 | if (stats) 24 | fprintf(file, "%d\t%d\t%d\t%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", item, 25 | data->item_id(item), data->popularity(item), heldout, 26 | rmse, mae, rank, first, crr, ncrr, ndcg); 27 | else 28 | fprintf(file, "%d\t%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", item, 29 | data->item_id(item), 30 | rmse, mae, rank, first, crr, ncrr, ndcg); 31 | return; 32 | } 33 | 34 | // helper function to sort predictions 
properly 35 | bool prediction_compare(const pair& itemA, 36 | const pair& itemB) { 37 | // if the two values are equal, sort by popularity! 38 | if (itemA.first == itemB.first) { 39 | return gsl_rng_uniform_int(rand_gen, 2) != 0; 40 | } 41 | return itemA.first > itemB.first; 42 | } 43 | 44 | 45 | // take a prediction function as an argument 46 | void eval(Model* model, double (Model::*prediction)(int,int), string outdir, Data* data, bool stats, 47 | unsigned long int seed, bool verbose, string label, bool write_rankings, bool mapped_ids) { 48 | // random generator to break ties 49 | gsl_rng_set(rand_gen, seed); 50 | 51 | // test the final model fit 52 | printf("evaluating model on held-out data\n"); 53 | 54 | FILE* file = fopen((outdir+"/rankings_" + label + ".tsv").c_str(), "w"); 55 | if (write_rankings) 56 | fprintf(file, "user.map\tuser.id\titem.map\titem.id\tpred\trank\trating\n"); 57 | 58 | FILE* user_file = fopen((outdir+"/user_eval_" + label + ".tsv").c_str(), "w"); 59 | if (stats) 60 | fprintf(user_file, "user.map\tuser.id\tnum.heldout\tnum.train\tdegree\tconnectivity\trmse\tmae\tave.rank\tfirst\tcrr\tncrr\tndcg\n"); 61 | else 62 | fprintf(user_file, "user.map\tuser.id\trmse\tmae\tave.rank\tfirst\tcrr\tncrr\tndcg\n"); 63 | 64 | FILE* item_file = fopen((outdir+"/item_eval_" + label + ".tsv").c_str(), "w"); 65 | fprintf(item_file, "item.map\titem.id\tpopularity\theldout\trmse\tmae\tave.rank\tfirst\tcrr\tncrr\tndcg\n"); 66 | 67 | // overall metrics to track 68 | double rmse = 0; 69 | double mae = 0; 70 | double aggr_rank = 0; 71 | double crr = 0; 72 | double user_sum_rmse = 0; 73 | double user_sum_mae = 0; 74 | double user_sum_rank = 0; 75 | double user_sum_first = 0; 76 | double user_sum_crr = 0; 77 | double user_sum_ncrr = 0; 78 | double user_sum_ndcg = 0; 79 | 80 | // per user attibutes 81 | double user_rmse = 0; 82 | double user_mae = 0; 83 | int user_heldout = 0; 84 | double user_rank = 0; 85 | int first = 0; 86 | double user_crr = 0; 87 | double 
user_ncrr = 0; 88 | double user_ncrr_normalizer = 0; 89 | double user_ndcg = 0; 90 | double user_ndcg_normalizer = 0; 91 | 92 | // helper var for evaluation (used for mulitple metrics) 93 | double local_metric; 94 | 95 | // helper var to hold predicted rating 96 | double pred; 97 | 98 | // overall attributes to track 99 | int user_count = 0; 100 | int heldout_count = 0; 101 | 102 | int user, item, rating, rank; 103 | list > ratings; 104 | int total_pred = 0; 105 | 106 | for (set::iterator iter_user = data->test_users.begin(); 107 | iter_user != data->test_users.end(); 108 | iter_user++){ 109 | 110 | user = *iter_user; 111 | if (verbose) { 112 | printf("user %d\n", user); 113 | } 114 | user_count++; 115 | 116 | user_rmse = 0; 117 | user_mae = 0; 118 | user_rank = 0; 119 | first = 0; 120 | user_crr = 0; 121 | user_ncrr_normalizer = 0; 122 | user_ndcg = 0; 123 | user_ndcg_normalizer = 0; 124 | user_heldout = 0; 125 | 126 | for (set::iterator iter_item = data->test_items.begin(); 127 | iter_item != data->test_items.end(); 128 | iter_item++){ 129 | 130 | item = *iter_item; 131 | 132 | // don't rank items that we've already seen 133 | if (data->ratings(user, item) != 0 || 134 | data->in_validation(user, item)) 135 | continue; 136 | 137 | total_pred++; 138 | 139 | double p = 0; 140 | if (mapped_ids) { 141 | p = (model->*prediction)(data->user_id(user),data->item_id(item)); 142 | } else { 143 | p = (model->*prediction)(user, item); 144 | } 145 | ratings.push_back(make_pair(p, item)); 146 | } 147 | 148 | ratings.sort(prediction_compare); 149 | 150 | rank = 0; 151 | int test_count = data->num_test(user); 152 | while (user_heldout < test_count && !ratings.empty()) { 153 | pair pred_set = ratings.front(); 154 | item = pred_set.second; 155 | rating = data->test_ratings(user, item); 156 | pred = pred_set.first; 157 | rank++; 158 | if (rank <= 1000 && write_rankings) { // TODO: make this threshold a command line arg 159 | fprintf(file, "%d\t%d\t%d\t%d\t%f\t%d\t%d\n", user, 
data->user_id(user), 160 | item, data->item_id(item), pred, rank, rating); 161 | } 162 | 163 | // compute metrics only on held-out items 164 | if (rating != 0) { 165 | user_heldout++; 166 | heldout_count++; 167 | 168 | local_metric = pow(rating - pred, 2); 169 | rmse += local_metric; 170 | user_rmse += local_metric; 171 | 172 | local_metric = abs(rating - pred); 173 | mae += local_metric; 174 | user_mae += local_metric; 175 | 176 | aggr_rank += rank; 177 | user_rank += rank; 178 | 179 | local_metric = 1.0 / rank; 180 | user_crr += local_metric; 181 | crr += local_metric; 182 | user_ncrr_normalizer += 1.0 / user_heldout; 183 | 184 | user_ndcg += rating / log(rank + 1); 185 | user_ndcg_normalizer += rating / log(user_heldout + 1); 186 | 187 | if (first == 0) 188 | first = rank; 189 | } 190 | 191 | ratings.pop_front(); 192 | } 193 | while (!ratings.empty()){ 194 | ratings.pop_front(); 195 | } 196 | 197 | // log this user's metrics 198 | user_rmse = sqrt(user_rmse / user_heldout); 199 | user_mae /= user_heldout; 200 | user_rank /= user_heldout; 201 | user_ncrr = user_crr / user_ncrr_normalizer; 202 | user_ndcg /= user_ndcg_normalizer; 203 | 204 | log_user(user_file, data, user, user_heldout, user_rmse, 205 | user_mae, user_rank, first, user_crr, user_ncrr, user_ndcg, stats); 206 | 207 | // add this user's metrics to overall metrics 208 | user_sum_rmse += user_rmse; 209 | user_sum_mae += user_mae; 210 | user_sum_rank += user_rank; 211 | user_sum_first += first; 212 | user_sum_crr += user_crr; 213 | user_sum_ncrr += user_ncrr; 214 | user_sum_ndcg += user_ndcg; 215 | } 216 | fclose(user_file); 217 | fclose(file); 218 | if (!write_rankings) 219 | remove((outdir+"/rankings_" + label + ".tsv").c_str()); 220 | 221 | 222 | // per item attibutes 223 | double item_rmse = 0; 224 | double item_mae = 0; 225 | int item_heldout = 0; 226 | double item_rank = 0; 227 | double item_crr = 0; 228 | double item_ncrr = 0; 229 | double item_ncrr_normalizer = 0; 230 | double item_ndcg = 0; 231 
| double item_ndcg_normalizer = 0; 232 | 233 | for (set::iterator iter_item = data->test_items.begin(); 234 | iter_item != data->test_items.end(); 235 | iter_item++){ 236 | 237 | item = *iter_item; 238 | if (verbose) { 239 | printf("item %d\n", item); 240 | } 241 | 242 | item_rmse = 0; 243 | item_mae = 0; 244 | item_rank = 0; 245 | first = 0; 246 | item_crr = 0; 247 | item_ncrr_normalizer = 0; 248 | item_ndcg = 0; 249 | item_ndcg_normalizer = 0; 250 | item_heldout = 0; 251 | 252 | for (set::iterator iter_user = data->test_users.begin(); 253 | iter_user != data->test_users.end(); 254 | iter_user++){ 255 | 256 | user = *iter_user; 257 | 258 | // don't rank items that we've already seen 259 | if (data->ratings(user, item) != 0 || 260 | data->in_validation(user, item)) 261 | continue; 262 | 263 | total_pred++; 264 | 265 | ratings.push_back(make_pair((model->*prediction)(user, item), user)); 266 | } 267 | 268 | ratings.sort(prediction_compare); 269 | 270 | rank = 0; 271 | int test_count = data->num_test_item(item); 272 | while (item_heldout < test_count && !ratings.empty()) { 273 | pair pred_set = ratings.front(); 274 | user = pred_set.second; 275 | rating = data->test_ratings(user, item); 276 | pred = pred_set.first; 277 | rank++; 278 | 279 | // compute metrics only on held-out items 280 | if (rating != 0) { 281 | item_heldout++; 282 | 283 | item_rmse += pow(rating - pred, 2); 284 | item_mae += abs(rating - pred); 285 | item_rank += rank; 286 | item_crr += 1.0 / rank; 287 | item_ncrr_normalizer += 1.0 / item_heldout; 288 | 289 | item_ndcg += rating / log(rank + 1); 290 | item_ndcg_normalizer += rating / log(item_heldout + 1); 291 | 292 | if (first == 0) 293 | first = rank; 294 | } 295 | 296 | ratings.pop_front(); 297 | } 298 | while (!ratings.empty()){ 299 | ratings.pop_front(); 300 | } 301 | 302 | // log this item's metrics 303 | item_rmse = sqrt(item_rmse / item_heldout); 304 | item_mae /= item_heldout; 305 | item_rank /= item_heldout; 306 | item_ncrr = item_crr / 
item_ncrr_normalizer; 307 | item_ndcg /= item_ndcg_normalizer; 308 | 309 | log_item(item_file, data, item, item_heldout, item_rmse, 310 | item_mae, item_rank, first, item_crr, item_ncrr, item_ndcg, stats); 311 | } 312 | fclose(item_file); 313 | 314 | // write out results 315 | file = fopen((outdir+"/eval_summary_" + label + ".dat").c_str(), "w"); 316 | fprintf(file, "metric\tuser average\theldout pair average\n"); 317 | fprintf(file, "RMSE\t%f\t%f\n", user_sum_rmse/user_count, 318 | sqrt(rmse/heldout_count)); 319 | fprintf(file, "MAE\t%f\t%f\n", user_sum_mae/user_count, mae/heldout_count); 320 | fprintf(file, "rank\t%f\t%f\n", user_sum_rank/user_count, 321 | aggr_rank/heldout_count); 322 | fprintf(file, "first\t%f\t---\n", user_sum_first/user_count); 323 | fprintf(file, "CRR\t%f\t%f\n", user_sum_crr/user_count, crr/heldout_count); 324 | fprintf(file, "NCRR\t%f\t---\n", user_sum_ncrr/user_count); 325 | fprintf(file, "NDCG\t%f\t---\n", user_sum_ndcg/user_count); 326 | fclose(file); 327 | } 328 | -------------------------------------------------------------------------------- /src/eval.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | //#include "utils.h" 6 | #include "data.h" 7 | #include "model.h" 8 | 9 | // helper function to write out per-user info 10 | void log_user(FILE* file, Data *data, int user, int heldout, double rmse, double mae, 11 | double rank, int first, double crr, double ncrr, double ndcg, bool stats); 12 | 13 | void log_item(FILE* file, Data *data, int item, int heldout, double rmse, double mae, 14 | double rank, int first, double crr, double ncrr, double ndcg, bool stats); 15 | 16 | // helper function to sort predictions properly 17 | bool prediction_compare(const pair& itemA, 18 | const pair& itemB); 19 | 20 | // take a prediction function as an argument 21 | void eval(Model* model, double (Model::*prediction)(int,int), string outdir, Data* data, bool stats, 
unsigned long int seed, bool verbose, string label, bool write_rankings, bool mapped_ids); 22 | -------------------------------------------------------------------------------- /src/librec.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "utils.h" 6 | #include "data.h" 7 | #include "eval.h" 8 | 9 | void print_usage_and_exit() { 10 | // print usage information 11 | printf("*************************** Predict by Poupularity *****************************\n"); 12 | printf("(c) Copyright 2014-2015 Allison J.B. Chaney ( achaney@cs.princeton.edu )\n"); 13 | printf("Distributed under MIT License; see LICENSE file for details.\n"); 14 | 15 | printf("\nusage:\n"); 16 | printf(" ./librec_eval [options]\n"); 17 | printf(" --help print help information\n"); 18 | printf(" --verbose print extra information while running\n"); 19 | 20 | printf("\n"); 21 | printf(" --out {dir} save directory, required\n"); 22 | printf(" --data {dir} data directory, required\n"); 23 | 24 | printf("********************************************************************************\n"); 25 | 26 | exit(0); 27 | } 28 | 29 | class LibRec: protected Model { 30 | private: 31 | map > preds; 32 | 33 | public: 34 | LibRec(Data* d) { 35 | data = d; 36 | } 37 | 38 | void read_preds(string outdir) { 39 | int user, item; 40 | float r, prediction; 41 | 42 | FILE* fileptr = fopen((outdir+"/ratings.dat").c_str(), "r"); 43 | printf("about to read ratings from %s\n", (outdir+"/ratings.dat").c_str()); 44 | while (fscanf(fileptr, "%d %d %f %f\n", &user, &item, &r, &prediction) != EOF) { 45 | preds[user][item] = prediction; 46 | } 47 | fclose(fileptr); 48 | } 49 | 50 | double predict(int user, int item) { 51 | return preds[user][item]; 52 | } 53 | 54 | void evaluate(string outdir, bool verbose) { 55 | eval(this, &Model::predict, outdir, data, false, 11, verbose, "final", true, true); 56 | } 57 | }; 58 | 59 | 60 | int 
main(int argc, char* argv[]) { 61 | if (argc < 2) print_usage_and_exit(); 62 | 63 | // variables to store command line args + defaults 64 | string outdir = ""; 65 | string datadir = ""; 66 | bool verbose = false; 67 | 68 | // ':' after a character means it takes an argument 69 | const char* const short_options = "hqo:d:"; 70 | const struct option long_options[] = { 71 | {"help", no_argument, NULL, 'h'}, 72 | {"verbose", no_argument, NULL, 'q'}, 73 | {"out", required_argument, NULL, 'o'}, 74 | {"data", required_argument, NULL, 'd'}, 75 | {NULL, 0, NULL, 0}}; 76 | 77 | 78 | int opt = 0; 79 | while(true) { 80 | opt = getopt_long(argc, argv, short_options, long_options, NULL); 81 | switch(opt) { 82 | case 'h': 83 | print_usage_and_exit(); 84 | break; 85 | case 'q': 86 | verbose = true; 87 | break; 88 | case 'o': 89 | outdir = optarg; 90 | break; 91 | case 'd': 92 | datadir = optarg; 93 | break; 94 | case -1: 95 | break; 96 | case '?': 97 | print_usage_and_exit(); 98 | break; 99 | default: 100 | break; 101 | } 102 | if (opt == -1) 103 | break; 104 | } 105 | 106 | // print information 107 | printf("********************************************************************************\n"); 108 | 109 | if (outdir == "") { 110 | printf("No output directory specified. Exiting.\n"); 111 | exit(-1); 112 | } 113 | 114 | printf("output directory: %s\n", outdir.c_str()); 115 | 116 | if (datadir == "") { 117 | printf("No data directory specified. Exiting.\n"); 118 | exit(-1); 119 | } 120 | 121 | if (!dir_exists(datadir)) { 122 | printf("data directory %s doesn't exist! Exiting.\n", datadir.c_str()); 123 | exit(-1); 124 | } 125 | printf("data directory: %s\n", datadir.c_str()); 126 | 127 | if (!file_exists(datadir + "/train.tsv")) { 128 | printf("training data file (train.tsv) doesn't exist! Exiting.\n"); 129 | exit(-1); 130 | } 131 | 132 | if (!file_exists(datadir + "/validation.tsv")) { 133 | printf("validation data file (validation.tsv) doesn't exist! 
Exiting.\n"); 134 | exit(-1); 135 | } 136 | 137 | 138 | // read in the data 139 | printf("********************************************************************************\n"); 140 | printf("reading data\n"); 141 | Data *data = new Data(true, false); 142 | printf("\treading training data\t\t...\t"); 143 | data->read_ratings(datadir + "/train.tsv"); 144 | printf("done\n"); 145 | 146 | printf("\treading validation data\t\t...\t"); 147 | data->read_validation(datadir + "/validation.tsv"); 148 | printf("done\n"); 149 | 150 | if (!file_exists(datadir + "/test.tsv")) { 151 | printf("testing data file (test.tsv) doesn't exist! Exiting.\n"); 152 | exit(-1); 153 | } 154 | printf("\treading testing data\t\t...\t"); 155 | data->read_test(datadir + "/test.tsv"); 156 | printf("done\n"); 157 | 158 | printf("\tsaving data stats\t\t...\t"); 159 | data->save_summary(outdir + "/data_stats.txt"); 160 | printf("done\n"); 161 | 162 | printf("********************************************************************************\n"); 163 | printf("commencing model evaluation\n"); 164 | 165 | // read in the ratings 166 | 167 | LibRec lr = LibRec(data); 168 | 169 | printf("starting to read ratings\n"); 170 | lr.read_preds(outdir); 171 | 172 | lr.evaluate(outdir, verbose); 173 | 174 | delete data; 175 | 176 | return 0; 177 | } 178 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "spf.h" 3 | 4 | 5 | #include 6 | //#include 7 | //#include "ctr.h" 8 | 9 | //gsl_rng * RANDOM_NUMBER = NULL; 10 | 11 | void print_usage_and_exit() { 12 | // print usage information 13 | printf("********************** Social Poisson Factorization (SPF) **********************\n"); 14 | printf("(c) Copyright 2014-2015 Allison J.B. 
Chaney ( achaney@cs.princeton.edu )\n"); 15 | printf("Distributed under MIT License; see LICENSE file for details.\n"); 16 | 17 | printf("\nusage:\n"); 18 | printf(" spf [options]\n"); 19 | printf(" --help print help information\n"); 20 | printf(" --verbose print extra information while running\n"); 21 | 22 | printf("\n"); 23 | printf(" --out {dir} save directory, required\n"); 24 | printf(" --data {dir} data directory, required\n"); 25 | 26 | printf("\n"); 27 | printf(" --svi use stochastic VI (instead of batch VI)\n"); 28 | printf(" default off for < 10M ratings in training\n"); 29 | printf(" --batch use batch VI (instead of SVI)\n"); 30 | printf(" default on for < 10M ratings in training\n"); 31 | 32 | printf("\n"); 33 | printf(" --a_theta {a} shape hyperparamter to theta (user preferences); default 0.3\n"); 34 | printf(" --b_theta {b} rate hyperparamter to theta (user preferences); default 0.3\n"); 35 | printf(" --a_beta {a} shape hyperparamter to beta (item attributes); default 0.3\n"); 36 | printf(" --b_beta {b} rate hyperparamter to beta (item attributes); default 0.3\n"); 37 | printf(" --a_tau {a} shape hyperparamter to tau (user influence); default 2\n"); 38 | printf(" --b_tau {b} rate hyperparamter to tau (user influence); default 5\n"); 39 | printf(" --a_delta {a} shape hyperparamter to delta (item bias); default 0.3\n"); 40 | printf(" --b_delta {b} rate hyperparamter to delta (item bias); default 0.3\n"); 41 | 42 | printf("\n"); 43 | printf(" --social_only only consider social aspect of factorization (SF)\n"); 44 | printf(" --factor_only only consider general factors (no social; PF)\n"); 45 | printf(" --fix_influence fix all user influence to be 1\n"); 46 | printf(" --bias include a bias term for each item\n"); 47 | 48 | printf("\n"); 49 | printf(" --binary assume ratings are binary (instead of default integer)\n"); 50 | printf(" --directed assume network is directed (instead of default undirected)\n"); 51 | 52 | printf("\n"); 53 | printf(" --seed 
{seed} the random seed, default from time\n"); 54 | printf(" --save_freq {f} the saving frequency, default 20. Negative value means\n"); 55 | printf(" no savings for intermediate results.\n"); 56 | printf(" --eval_freq {f} the intermediate evaluating frequency, default -1.\n"); 57 | printf(" Negative means no evaluation for intermediate results.\n"); 58 | printf(" --conv_freq {f} the convergence check frequency, default 10.\n"); 59 | printf(" --max_iter {max} the max number of iterations, default 300\n"); 60 | printf(" --min_iter {min} the min number of iterations, default 30\n"); 61 | printf(" --converge {c} the change in log likelihood required for convergence\n"); 62 | printf(" default 1e-6\n"); 63 | printf(" --final_pass do a final pass on all users and items\n"); 64 | printf(" --final_pass_test do a final pass only on test users\n"); 65 | printf("\n"); 66 | 67 | printf(" --sample {size} the stochastic sample size, default 1000\n"); 68 | printf(" --svi_delay {t} SVI delay >= 0 to down-weight early samples, default 1024\n"); 69 | printf(" --svi_forget {k} SVI forgetting rate (0.5,1], default 0.75\n"); 70 | printf("\n"); 71 | 72 | printf(" --K {K} the number of general factors, default 100\n"); 73 | 74 | printf("********************************************************************************\n"); 75 | 76 | exit(0); 77 | } 78 | 79 | int main(int argc, char* argv[]) { 80 | if (argc < 2) print_usage_and_exit(); 81 | 82 | // variables to store command line args + defaults 83 | string out = ""; 84 | string data = ""; 85 | bool verbose = false; 86 | 87 | bool svi = false; 88 | bool batchvi = false; 89 | 90 | double a_theta = 0.3; 91 | double b_theta = 0.3; 92 | double a_beta = 0.3; 93 | double b_beta = 0.3; 94 | double a_tau = 2; 95 | double b_tau = 5; 96 | double a_delta = 0.3; 97 | double b_delta = 0.3; 98 | 99 | // these are really bools, but typed as integers to play nice with getopt 100 | int social_only = 0; 101 | int factor_only = 0; 102 | int fix_influence = 0; 
103 | bool item_bias = 0; 104 | int binary = 0; 105 | int directed = 0; 106 | bool final_pass = 0; 107 | bool final_pass_test = 0; 108 | 109 | time_t t; time(&t); 110 | long seed = (long) t; 111 | int save_freq = 20; 112 | int eval_freq = -1; 113 | int conv_freq = 10; 114 | int max_iter = 300; 115 | int min_iter = 30; 116 | double converge_delta = 1e-6; 117 | 118 | int sample_size = 1000; 119 | double svi_delay = 1024; 120 | double svi_forget = 0.75; 121 | 122 | int k = 100; 123 | 124 | // ':' after a character means it takes an argument 125 | const char* const short_options = "hqo:d:vb1:2:3:4:5:6:7:8:is:w:j:g:x:m:c:a:e:f:ptk:"; 126 | const struct option long_options[] = { 127 | {"help", no_argument, NULL, 'h'}, 128 | {"verbose", no_argument, NULL, 'q'}, 129 | {"out", required_argument, NULL, 'o'}, 130 | {"data", required_argument, NULL, 'd'}, 131 | {"svi", no_argument, NULL, 'v'}, 132 | {"batch", no_argument, NULL, 'b'}, 133 | {"a_theta", required_argument, NULL, '1'}, 134 | {"b_theta", required_argument, NULL, '2'}, 135 | {"a_beta", required_argument, NULL, '3'}, 136 | {"b_beta", required_argument, NULL, '4'}, 137 | {"a_tau", required_argument, NULL, '5'}, 138 | {"b_tau", required_argument, NULL, '6'}, 139 | {"a_delta", required_argument, NULL, '7'}, 140 | {"b_delta", required_argument, NULL, '8'}, 141 | {"social_only", no_argument, &social_only, 1}, 142 | {"factor_only", no_argument, &factor_only, 1}, 143 | {"fix_influence", no_argument, &fix_influence, 1}, 144 | {"bias", no_argument, NULL, 'i'}, 145 | {"binary", no_argument, &binary, 1}, 146 | {"directed", no_argument, &directed, 1}, 147 | {"seed", required_argument, NULL, 's'}, 148 | {"save_freq", required_argument, NULL, 'w'}, 149 | {"eval_freq", required_argument, NULL, 'j'}, 150 | {"conv_freq", required_argument, NULL, 'g'}, 151 | {"max_iter", required_argument, NULL, 'x'}, 152 | {"min_iter", required_argument, NULL, 'm'}, 153 | {"converge", required_argument, NULL, 'c'}, 154 | {"sample", required_argument, 
NULL, 'a'}, 155 | {"delay", required_argument, NULL, 'e'}, 156 | {"forget", required_argument, NULL, 'f'}, 157 | {"final_pass", no_argument, NULL, 'p'}, 158 | {"final_pass_test", no_argument, NULL, 't'}, 159 | {"K", required_argument, NULL, 'k'}, 160 | {NULL, 0, NULL, 0}}; 161 | 162 | 163 | int opt = 0; 164 | while(true) { 165 | opt = getopt_long(argc, argv, short_options, long_options, NULL); 166 | switch(opt) { 167 | case 'h': 168 | print_usage_and_exit(); 169 | break; 170 | case 'q': 171 | verbose = true; 172 | break; 173 | case 'o': 174 | out = optarg; 175 | break; 176 | case 'd': 177 | data = optarg; 178 | break; 179 | case 'v': 180 | svi = true; 181 | break; 182 | case 'b': 183 | batchvi = true; 184 | break; 185 | case '1': 186 | a_theta = atof(optarg); 187 | break; 188 | case '2': 189 | b_theta = atof(optarg); 190 | break; 191 | case '3': 192 | a_beta = atof(optarg); 193 | break; 194 | case '4': 195 | b_beta = atof(optarg); 196 | break; 197 | case '5': 198 | a_tau = atof(optarg); 199 | break; 200 | case '6': 201 | b_tau = atof(optarg); 202 | break; 203 | case '7': 204 | a_delta = atof(optarg); 205 | break; 206 | case '8': 207 | b_delta = atof(optarg); 208 | break; 209 | case 'i': 210 | item_bias = true; 211 | break; 212 | case 's': 213 | seed = atoi(optarg); 214 | break; 215 | case 'w': 216 | save_freq = atoi(optarg); 217 | break; 218 | case 'j': 219 | eval_freq = atoi(optarg); 220 | break; 221 | case 'g': 222 | conv_freq = atoi(optarg); 223 | break; 224 | case 'x': 225 | max_iter = atoi(optarg); 226 | break; 227 | case 'm': 228 | min_iter = atoi(optarg); 229 | break; 230 | case 'c': 231 | converge_delta = atoi(optarg); 232 | break; 233 | case 'a': 234 | sample_size = atoi(optarg); 235 | break; 236 | case 'e': 237 | svi_delay = atof(optarg); 238 | break; 239 | case 'f': 240 | svi_forget = atof(optarg); 241 | break; 242 | case 'p': 243 | final_pass = true; 244 | break; 245 | case 't': 246 | final_pass_test = true; 247 | break; 248 | case 'k': 249 | k = 
atoi(optarg); 250 | break; 251 | case -1: 252 | break; 253 | case '?': 254 | print_usage_and_exit(); 255 | break; 256 | default: 257 | break; 258 | } 259 | if (opt == -1) 260 | break; 261 | } 262 | 263 | // print information 264 | printf("********************************************************************************\n"); 265 | 266 | if (out == "") { 267 | printf("No output directory specified. Exiting.\n"); 268 | exit(-1); 269 | } 270 | 271 | if (dir_exists(out)) { 272 | string rmout = "rm -rf " + out; 273 | system(rmout.c_str()); 274 | } 275 | make_directory(out); 276 | printf("output directory: %s\n", out.c_str()); 277 | 278 | if (data == "") { 279 | printf("No data directory specified. Exiting.\n"); 280 | exit(-1); 281 | } 282 | 283 | if (!dir_exists(data)) { 284 | printf("data directory %s doesn't exist! Exiting.\n", data.c_str()); 285 | exit(-1); 286 | } 287 | printf("data directory: %s\n", data.c_str()); 288 | 289 | if (!file_exists(data + "/train.tsv")) { 290 | printf("training data file (train.tsv) doesn't exist! Exiting.\n"); 291 | exit(-1); 292 | } 293 | 294 | if (!file_exists(data + "/validation.tsv")) { 295 | printf("validation data file (validation.tsv) doesn't exist! Exiting.\n"); 296 | exit(-1); 297 | } 298 | 299 | if (!factor_only && !file_exists(data + "/network.tsv")) { 300 | printf("network data file (network.tsv) doesn't exist! Exiting.\n"); 301 | exit(-1); 302 | } 303 | 304 | if (social_only && factor_only) { 305 | printf("Model cannot be both social only (SF) and factor only (PF). Exiting.\n"); 306 | exit(-1); 307 | } 308 | 309 | if (final_pass && final_pass_test) { 310 | printf("Model cannot do a final pass both on all users and only on test users. Exiting.\n"); 311 | exit(-1); 312 | } 313 | 314 | if (svi && batchvi) { 315 | printf("Inference method cannot be both stochatic (SVI) and batch. 
Exiting.\n"); 316 | exit(-1); 317 | } 318 | 319 | if (batchvi && (final_pass || final_pass_test)) { 320 | printf("Batch VI doesn't allow for a \"final pass.\" Ignoring this argument.\n"); 321 | final_pass = false; 322 | final_pass_test = false; 323 | } 324 | 325 | printf("\nmodel specification:\n"); 326 | 327 | if (social_only) { 328 | printf("\tsocial factorization (SF) [ social factors only ]\n"); 329 | } else if (factor_only) { 330 | printf("\tPoisson factorization (PF) [ general preference factors only ]\n"); 331 | } else { 332 | printf("\tsocial Poisson factorization (SPF)\n"); 333 | } 334 | 335 | if (fix_influence) { 336 | printf("\tsocial influence parameters fixed to 1\n"); 337 | } 338 | 339 | if (!social_only) { 340 | printf("\tK = %d (number of latent factors for general preferences)\n", k); 341 | } 342 | 343 | printf("\nshape and rate hyperparameters:\n"); 344 | if (!social_only) { 345 | printf("\ttheta (%.2f, %.2f)\n", a_theta, b_theta); 346 | printf("\tbeta (%.2f, %.2f)\n", a_beta, b_beta); 347 | } 348 | if (!factor_only) { 349 | printf("\ttau (%.2f, %.2f)\n", a_tau, b_tau); 350 | } 351 | if (item_bias) { 352 | printf("\tdelta (%.2f, %.2f)\n", a_delta, b_delta); 353 | } 354 | 355 | 356 | printf("\ndata attributes:\n"); 357 | 358 | if (binary) { 359 | printf("\tbinary ratings\n"); 360 | } else { 361 | printf("\tinteger ratings\n"); 362 | } 363 | 364 | if (!factor_only) { 365 | if (directed) { 366 | printf("\tdirected network\n"); 367 | } else { 368 | printf("\tundirected network\n"); 369 | } 370 | } 371 | 372 | printf("\ninference parameters:\n"); 373 | printf("\tseed: %d\n", (int)seed); 374 | printf("\tsave frequency: %d\n", save_freq); 375 | printf("\tevaluation frequency: %d\n", eval_freq); 376 | printf("\tconvergence check frequency: %d\n", conv_freq); 377 | printf("\tmaximum number of iterations: %d\n", max_iter); 378 | printf("\tminimum number of iterations: %d\n", min_iter); 379 | printf("\tchange in log likelihood for convergence: %f\n", 
converge_delta); 380 | printf("\tfinal pass after convergence: %s\n", final_pass ? "all users" : 381 | (final_pass_test ? "test users only" : "none")); 382 | 383 | 384 | if (!batchvi) { 385 | printf("\nStochastic variational inference parameters\n"); 386 | if (!svi) 387 | printf(" (may not be used, pending dataset size)\n"); 388 | printf("\tsample size: %d\n", sample_size); 389 | printf("\tSVI delay (tau): %f\n", svi_delay); 390 | printf("\tSVI forgetting rate (kappa): %f\n", svi_forget); 391 | } else { 392 | printf("\nusing batch variational inference\n"); 393 | } 394 | 395 | 396 | model_settings settings; 397 | settings.set(verbose, out, data, svi, a_theta, b_theta, a_beta, b_beta, a_tau, b_tau, 398 | a_delta, b_delta, 399 | (bool) social_only, (bool) factor_only, item_bias, (bool) binary, (bool) directed, 400 | seed, save_freq, eval_freq, conv_freq, max_iter, min_iter, converge_delta, 401 | final_pass, final_pass_test, sample_size, svi_delay, svi_forget, (bool) fix_influence, k); 402 | 403 | // read in the data 404 | printf("********************************************************************************\n"); 405 | printf("reading data\n"); 406 | Data *dataset = new Data(settings.binary, settings.directed); 407 | printf("\treading training data\t\t...\t"); 408 | dataset->read_ratings(settings.datadir + "/train.tsv"); 409 | printf("done\n"); 410 | 411 | if (!factor_only) { 412 | printf("\treading network data\t\t...\t"); 413 | dataset->read_network(settings.datadir + "/network.tsv"); 414 | printf("done\n"); 415 | } 416 | printf("\treading validation data\t\t...\t"); 417 | dataset->read_validation(settings.datadir + "/validation.tsv"); 418 | printf("done\n"); 419 | 420 | if (!file_exists(data + "/test.tsv")) { 421 | printf("testing data file (test.tsv) doesn't exist! 
Exiting.\n"); 422 | exit(-1); 423 | } 424 | printf("\treading testing data\t\t...\t"); 425 | dataset->read_test(settings.datadir + "/test.tsv"); 426 | printf("done\n"); 427 | 428 | printf("\tsaving data stats\t\t...\t"); 429 | dataset->save_summary(out + "/data_stats.txt"); 430 | printf("done\n"); 431 | 432 | // save the run settings 433 | printf("Saving settings\n"); 434 | if (!svi && !batchvi) { 435 | if (dataset->num_training() > 10000000) { 436 | settings.set_stochastic_inference(true); 437 | printf("using SVI (based on dataset size)\n"); 438 | } else { 439 | printf("using batch VI (based on dataset size)\n"); 440 | } 441 | } 442 | printf("user count %d\n", dataset->user_count()); 443 | if (!settings.svi) 444 | settings.set_sample_size(dataset->user_count()); 445 | printf("sample size %d\n", settings.sample_size); 446 | 447 | settings.save(out + "/settings.txt"); 448 | 449 | // TODO: make this/evaluate below optional (--test_only, --no_test) 450 | printf("********************************************************************************\n"); 451 | printf("commencing model evaluation\n"); 452 | 453 | // create model instance; learn! 
454 | printf("\ncreating model instance\n"); 455 | SPF *model = new SPF(&settings, dataset); 456 | printf("commencing model inference\n"); 457 | model->learn(); 458 | 459 | // test the final model fit 460 | printf("evaluating model on held-out data\n"); 461 | model->evaluate(); 462 | 463 | delete model; 464 | delete dataset; 465 | 466 | return 0; 467 | } 468 | -------------------------------------------------------------------------------- /src/mf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "utils.h" 5 | #include "data.h" 6 | #include "eval.h" 7 | 8 | void print_usage_and_exit() { 9 | // print usage information 10 | printf("*************************** Predict by Poupularity *****************************\n"); 11 | printf("(c) Copyright 2014-2015 Allison J.B. Chaney ( achaney@cs.princeton.edu )\n"); 12 | printf("Distributed under MIT License; see LICENSE file for details.\n"); 13 | 14 | printf("\nusage:\n"); 15 | printf(" ./mf [options]\n"); 16 | printf(" --help print help information\n"); 17 | printf(" --verbose print extra information while running\n"); 18 | 19 | printf("\n"); 20 | printf(" --out {dir} save directory, required\n"); 21 | printf(" --data {dir} data directory, required\n"); 22 | printf(" --K {K} the number of general factors, default 100\n"); 23 | 24 | printf("********************************************************************************\n"); 25 | 26 | exit(0); 27 | } 28 | 29 | class MF: protected Model { 30 | private: 31 | fmat* theta; 32 | fmat* beta; 33 | 34 | public: 35 | MF(Data* d, fmat* t, fmat* b) { 36 | data = d; 37 | theta = t; 38 | beta = b; 39 | } 40 | 41 | double predict(int user, int item) { 42 | return accu(theta->col(user) % beta->col(item)); 43 | } 44 | 45 | void evaluate(string outdir, bool verbose) { 46 | eval(this, &Model::predict, outdir, data, false, 11, verbose, "final", true, false); 47 | } 48 | }; 49 | 50 | 51 | int main(int argc, 
char* argv[]) { 52 | if (argc < 2) print_usage_and_exit(); 53 | 54 | // variables to store command line args + defaults 55 | string outdir = ""; 56 | string datadir = ""; 57 | bool verbose = false; 58 | int K = 100; 59 | 60 | // ':' after a character means it takes an argument 61 | const char* const short_options = "hqo:d:k:"; 62 | const struct option long_options[] = { 63 | {"help", no_argument, NULL, 'h'}, 64 | {"verbose", no_argument, NULL, 'q'}, 65 | {"out", required_argument, NULL, 'o'}, 66 | {"data", required_argument, NULL, 'd'}, 67 | {"K", required_argument, NULL, 'k'}, 68 | {NULL, 0, NULL, 0}}; 69 | 70 | 71 | int opt = 0; 72 | while(true) { 73 | opt = getopt_long(argc, argv, short_options, long_options, NULL); 74 | switch(opt) { 75 | case 'h': 76 | print_usage_and_exit(); 77 | break; 78 | case 'q': 79 | verbose = true; 80 | break; 81 | case 'o': 82 | outdir = optarg; 83 | break; 84 | case 'd': 85 | datadir = optarg; 86 | break; 87 | case 'k': 88 | K = atoi(optarg); 89 | break; 90 | case -1: 91 | break; 92 | case '?': 93 | print_usage_and_exit(); 94 | break; 95 | default: 96 | break; 97 | } 98 | if (opt == -1) 99 | break; 100 | } 101 | 102 | // print information 103 | printf("********************************************************************************\n"); 104 | 105 | if (outdir == "") { 106 | printf("No output directory specified. Exiting.\n"); 107 | exit(-1); 108 | } 109 | 110 | printf("output directory: %s\n", outdir.c_str()); 111 | 112 | if (datadir == "") { 113 | printf("No data directory specified. Exiting.\n"); 114 | exit(-1); 115 | } 116 | 117 | if (!dir_exists(datadir)) { 118 | printf("data directory %s doesn't exist! Exiting.\n", datadir.c_str()); 119 | exit(-1); 120 | } 121 | printf("data directory: %s\n", datadir.c_str()); 122 | 123 | if (!file_exists(datadir + "/train.tsv")) { 124 | printf("training data file (train.tsv) doesn't exist! 
Exiting.\n"); 125 | exit(-1); 126 | } 127 | 128 | if (!file_exists(datadir + "/validation.tsv")) { 129 | printf("validation data file (validation.tsv) doesn't exist! Exiting.\n"); 130 | exit(-1); 131 | } 132 | 133 | 134 | // read in the data 135 | printf("********************************************************************************\n"); 136 | printf("reading data\n"); 137 | Data *data = new Data(true, false); 138 | printf("\treading training data\t\t...\t"); 139 | data->read_ratings(datadir + "/train.tsv"); 140 | printf("done\n"); 141 | 142 | printf("\treading validation data\t\t...\t"); 143 | data->read_validation(datadir + "/validation.tsv"); 144 | printf("done\n"); 145 | 146 | if (!file_exists(datadir + "/test.tsv")) { 147 | printf("testing data file (test.tsv) doesn't exist! Exiting.\n"); 148 | exit(-1); 149 | } 150 | printf("\treading testing data\t\t...\t"); 151 | data->read_test(datadir + "/test.tsv"); 152 | printf("done\n"); 153 | 154 | printf("\tsaving data stats\t\t...\t"); 155 | data->save_summary(outdir + "/data_stats.txt"); 156 | printf("done\n"); 157 | 158 | printf("********************************************************************************\n"); 159 | printf("commencing model evaluation\n"); 160 | 161 | // read in the model 162 | printf("starting to read model\n"); 163 | fmat theta = fmat(K, data->user_count()); 164 | fmat beta = fmat(K, data->item_count()); 165 | 166 | int k=0, i=0; 167 | float value; 168 | 169 | FILE* fileptr = fopen((outdir+"/final-U.dat").c_str(), "r"); 170 | printf("bout to read theta from %s\n", (outdir+"/final-U.dat").c_str()); 171 | while (fscanf(fileptr, "%e", &value) != EOF) { 172 | //printf("theta i%d, k%d\n", i, k); 173 | theta(k,i) = value; 174 | k++; 175 | if (k >= K) { 176 | k = 0; 177 | i++; 178 | } 179 | if (i >= data->user_count()) 180 | break; 181 | } 182 | fclose(fileptr); 183 | 184 | fileptr = fopen((outdir+"/final-V.dat").c_str(), "r"); 185 | i = 0; 186 | k = 0; 187 | while (fscanf(fileptr, "%e", &value) 
!= EOF) { 188 | //printf("beta i%d, k%d\n", i, k); 189 | beta(k,i) = value; 190 | k++; 191 | if (k >= K) { 192 | k = 0; 193 | i++; 194 | } 195 | if (i >= data->item_count()) 196 | break; 197 | } 198 | fclose(fileptr); 199 | 200 | MF mf = MF(data, &theta, &beta); 201 | mf.evaluate(outdir, verbose); 202 | 203 | delete data; 204 | 205 | return 0; 206 | } 207 | -------------------------------------------------------------------------------- /src/model.h: -------------------------------------------------------------------------------- 1 | class Model { 2 | protected: 3 | Data* data; 4 | 5 | public: 6 | virtual double predict(int user, int item) { return 0; }; 7 | }; 8 | -------------------------------------------------------------------------------- /src/popularity.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "utils.h" 5 | #include "data.h" 6 | #include "eval.h" 7 | 8 | void print_usage_and_exit() { 9 | // print usage information 10 | printf("*************************** Predict by Poupularity *****************************\n"); 11 | printf("(c) Copyright 2014-2015 Allison J.B. 
Chaney ( achaney@cs.princeton.edu )\n"); 12 | printf("Distributed under MIT License; see LICENSE file for details.\n"); 13 | 14 | printf("\nusage:\n"); 15 | printf(" ./pop [options]\n"); 16 | printf(" --help print help information\n"); 17 | printf(" --verbose print extra information while running\n"); 18 | 19 | printf("\n"); 20 | printf(" --out {dir} save directory, required\n"); 21 | printf(" --data {dir} data directory, required\n"); 22 | printf(" --seed {seed} the random seed, default from time\n"); 23 | 24 | printf("********************************************************************************\n"); 25 | 26 | exit(0); 27 | } 28 | 29 | // helper function to write out per-user info 30 | void log_user(FILE* file, Data *data, int user, int heldout, double rmse, double mae, 31 | double rank, int first, double crr, double ncrr, double ndcg) { 32 | fprintf(file, "%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", user, 33 | data->user_id(user), heldout, data->item_count(user), 34 | data->neighbor_count(user), data->connectivity(user), 35 | rmse, mae, rank, first, crr, ncrr, ndcg); 36 | return; 37 | } 38 | 39 | void log_item(FILE* file, Data *data, int item, int heldout, double rmse, double mae, 40 | double rank, int first, double crr, double ncrr, double ndcg) { 41 | fprintf(file, "%d\t%d\t%d\t%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", item, 42 | data->item_id(item), data->popularity(item), heldout, 43 | rmse, mae, rank, first, crr, ncrr, ndcg); 44 | return; 45 | } 46 | 47 | class Popularity: protected Model { 48 | public: 49 | double predict(int user, int item) { 50 | return data->popularity(item) * 5 / data->item_count(); 51 | } 52 | 53 | void evaluate(Data* d, string outdir, bool verbose, long seed) { 54 | data = d; 55 | eval(this, &Model::predict, outdir, data, true, seed, verbose, "final", true, false); 56 | } 57 | }; 58 | 59 | int main(int argc, char* argv[]) { 60 | if (argc < 2) print_usage_and_exit(); 61 | 62 | // variables to store command line args + defaults 
63 | string outdir = ""; 64 | string datadir = ""; 65 | bool verbose = false; 66 | long seed = 11; 67 | 68 | // ':' after a character means it takes an argument 69 | const char* const short_options = "hqo:d:s:"; 70 | const struct option long_options[] = { 71 | {"help", no_argument, NULL, 'h'}, 72 | {"verbose", no_argument, NULL, 'q'}, 73 | {"out", required_argument, NULL, 'o'}, 74 | {"data", required_argument, NULL, 'd'}, 75 | {"seed", required_argument, NULL, 's'}, 76 | {NULL, 0, NULL, 0}}; 77 | 78 | 79 | int opt = 0; 80 | while(true) { 81 | opt = getopt_long(argc, argv, short_options, long_options, NULL); 82 | switch(opt) { 83 | case 'h': 84 | print_usage_and_exit(); 85 | break; 86 | case 'q': 87 | verbose = true; 88 | break; 89 | case 'o': 90 | outdir = optarg; 91 | break; 92 | case 'd': 93 | datadir = optarg; 94 | break; 95 | case 's': 96 | seed = atoi(optarg); 97 | break; 98 | case -1: 99 | break; 100 | case '?': 101 | print_usage_and_exit(); 102 | break; 103 | default: 104 | break; 105 | } 106 | if (opt == -1) 107 | break; 108 | } 109 | 110 | // print information 111 | printf("********************************************************************************\n"); 112 | 113 | if (outdir == "") { 114 | printf("No output directory specified. Exiting.\n"); 115 | exit(-1); 116 | } 117 | 118 | if (dir_exists(outdir)) { 119 | string rmout = "rm -rf " + outdir; 120 | system(rmout.c_str()); 121 | } 122 | make_directory(outdir); 123 | printf("output directory: %s\n", outdir.c_str()); 124 | 125 | if (datadir == "") { 126 | printf("No data directory specified. Exiting.\n"); 127 | exit(-1); 128 | } 129 | 130 | if (!dir_exists(datadir)) { 131 | printf("data directory %s doesn't exist! Exiting.\n", datadir.c_str()); 132 | exit(-1); 133 | } 134 | printf("data directory: %s\n", datadir.c_str()); 135 | 136 | if (!file_exists(datadir + "/train.tsv")) { 137 | printf("training data file (train.tsv) doesn't exist! 
Exiting.\n"); 138 | exit(-1); 139 | } 140 | 141 | if (!file_exists(datadir + "/validation.tsv")) { 142 | printf("validation data file (validation.tsv) doesn't exist! Exiting.\n"); 143 | exit(-1); 144 | } 145 | 146 | 147 | // read in the data 148 | printf("********************************************************************************\n"); 149 | printf("reading data\n"); 150 | Data *data = new Data(true, false); 151 | printf("\treading training data\t\t...\t"); 152 | data->read_ratings(datadir + "/train.tsv"); 153 | printf("done\n"); 154 | 155 | // read in the network for data stats only 156 | printf("\treading network data\t\t...\t"); 157 | data->read_network(datadir + "/network.tsv"); 158 | printf("done\n"); 159 | 160 | printf("\treading validation data\t\t...\t"); 161 | data->read_validation(datadir + "/validation.tsv"); 162 | printf("done\n"); 163 | 164 | if (!file_exists(datadir + "/test.tsv")) { 165 | printf("testing data file (test.tsv) doesn't exist! Exiting.\n"); 166 | exit(-1); 167 | } 168 | printf("\treading testing data\t\t...\t"); 169 | data->read_test(datadir + "/test.tsv"); 170 | printf("done\n"); 171 | 172 | printf("\tsaving data stats\t\t...\t"); 173 | data->save_summary(outdir + "/data_stats.txt"); 174 | printf("done\n"); 175 | 176 | printf("********************************************************************************\n"); 177 | printf("commencing model evaluation\n"); 178 | 179 | // test the final model fit 180 | Popularity pop = Popularity(); 181 | pop.evaluate(data, outdir, verbose, seed); 182 | 183 | delete data; 184 | 185 | return 0; 186 | } 187 | -------------------------------------------------------------------------------- /src/random.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "utils.h" 5 | #include "data.h" 6 | #include "eval.h" 7 | 8 | void print_usage_and_exit() { 9 | // print usage information 10 | printf("*************************** 
Predict by Poupularity *****************************\n"); 11 | printf("(c) Copyright 2014-2015 Allison J.B. Chaney ( achaney@cs.princeton.edu )\n"); 12 | printf("Distributed under MIT License; see LICENSE file for details.\n"); 13 | 14 | printf("\nusage:\n"); 15 | printf(" ./pop [options]\n"); 16 | printf(" --help print help information\n"); 17 | printf(" --verbose print extra information while running\n"); 18 | 19 | printf("\n"); 20 | printf(" --out {dir} save directory, required\n"); 21 | printf(" --data {dir} data directory, required\n"); 22 | printf(" --seed {seed} the random seed, default 11\n"); 23 | 24 | printf("********************************************************************************\n"); 25 | 26 | exit(0); 27 | } 28 | 29 | // helper function to write out per-user info 30 | void log_user(FILE* file, Data *data, int user, int heldout, double rmse, double mae, 31 | double rank, int first, double crr, double ncrr, double ndcg) { 32 | fprintf(file, "%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", user, 33 | data->user_id(user), heldout, data->item_count(user), 34 | data->neighbor_count(user), data->connectivity(user), 35 | rmse, mae, rank, first, crr, ncrr, ndcg); 36 | return; 37 | } 38 | 39 | void log_item(FILE* file, Data *data, int item, int heldout, double rmse, double mae, 40 | double rank, int first, double crr, double ncrr, double ndcg) { 41 | fprintf(file, "%d\t%d\t%d\t%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", item, 42 | data->item_id(item), data->popularity(item), heldout, 43 | rmse, mae, rank, first, crr, ncrr, ndcg); 44 | return; 45 | } 46 | 47 | class Random: protected Model { 48 | public: 49 | double predict(int user, int item) { 50 | return 0; 51 | } 52 | 53 | void evaluate(Data* d, string outdir, bool verbose, long seed) { 54 | data = d; 55 | eval(this, &Model::predict, outdir, data, true, seed, verbose, "final", true, false); 56 | } 57 | }; 58 | 59 | int main(int argc, char* argv[]) { 60 | if (argc < 2) print_usage_and_exit(); 61 | 62 | 
// variables to store command line args + defaults 63 | string outdir = ""; 64 | string datadir = ""; 65 | bool verbose = false; 66 | long seed = 11; 67 | 68 | // ':' after a character means it takes an argument 69 | const char* const short_options = "hqo:d:s:"; 70 | const struct option long_options[] = { 71 | {"help", no_argument, NULL, 'h'}, 72 | {"verbose", no_argument, NULL, 'q'}, 73 | {"out", required_argument, NULL, 'o'}, 74 | {"data", required_argument, NULL, 'd'}, 75 | {"seed", required_argument, NULL, 's'}, 76 | {NULL, 0, NULL, 0}}; 77 | 78 | 79 | int opt = 0; 80 | while(true) { 81 | opt = getopt_long(argc, argv, short_options, long_options, NULL); 82 | switch(opt) { 83 | case 'h': 84 | print_usage_and_exit(); 85 | break; 86 | case 'q': 87 | verbose = true; 88 | break; 89 | case 'o': 90 | outdir = optarg; 91 | break; 92 | case 'd': 93 | datadir = optarg; 94 | break; 95 | case 's': 96 | seed = atoi(optarg); 97 | break; 98 | case -1: 99 | break; 100 | case '?': 101 | print_usage_and_exit(); 102 | break; 103 | default: 104 | break; 105 | } 106 | if (opt == -1) 107 | break; 108 | } 109 | 110 | // print information 111 | printf("********************************************************************************\n"); 112 | 113 | if (outdir == "") { 114 | printf("No output directory specified. Exiting.\n"); 115 | exit(-1); 116 | } 117 | 118 | if (dir_exists(outdir)) { 119 | string rmout = "rm -rf " + outdir; 120 | system(rmout.c_str()); 121 | } 122 | make_directory(outdir); 123 | printf("output directory: %s\n", outdir.c_str()); 124 | 125 | if (datadir == "") { 126 | printf("No data directory specified. Exiting.\n"); 127 | exit(-1); 128 | } 129 | 130 | if (!dir_exists(datadir)) { 131 | printf("data directory %s doesn't exist! Exiting.\n", datadir.c_str()); 132 | exit(-1); 133 | } 134 | printf("data directory: %s\n", datadir.c_str()); 135 | 136 | if (!file_exists(datadir + "/train.tsv")) { 137 | printf("training data file (train.tsv) doesn't exist! 
Exiting.\n"); 138 | exit(-1); 139 | } 140 | 141 | if (!file_exists(datadir + "/validation.tsv")) { 142 | printf("validation data file (validation.tsv) doesn't exist! Exiting.\n"); 143 | exit(-1); 144 | } 145 | 146 | printf("\tseed: %d\n", (int)seed); 147 | 148 | 149 | // read in the data 150 | printf("********************************************************************************\n"); 151 | printf("reading data\n"); 152 | Data *data = new Data(true, false); 153 | printf("\treading training data\t\t...\t"); 154 | data->read_ratings(datadir + "/train.tsv"); 155 | printf("done\n"); 156 | 157 | // read in the network for data stats only 158 | printf("\treading network data\t\t...\t"); 159 | data->read_network(datadir + "/network.tsv"); 160 | printf("done\n"); 161 | 162 | printf("\treading validation data\t\t...\t"); 163 | data->read_validation(datadir + "/validation.tsv"); 164 | printf("done\n"); 165 | 166 | if (!file_exists(datadir + "/test.tsv")) { 167 | printf("testing data file (test.tsv) doesn't exist! 
Exiting.\n"); 168 | exit(-1); 169 | } 170 | printf("\treading testing data\t\t...\t"); 171 | data->read_test(datadir + "/test.tsv"); 172 | printf("done\n"); 173 | 174 | printf("\tsaving data stats\t\t...\t"); 175 | data->save_summary(outdir + "/data_stats.txt"); 176 | printf("done\n"); 177 | 178 | printf("********************************************************************************\n"); 179 | printf("commencing model evaluation\n"); 180 | 181 | // test the final model fit 182 | Random rand = Random(); 183 | rand.evaluate(data, outdir, verbose, seed); 184 | 185 | delete data; 186 | 187 | return 0; 188 | } 189 | -------------------------------------------------------------------------------- /src/spf.cpp: -------------------------------------------------------------------------------- 1 | #include "spf.h" 2 | 3 | SPF::SPF(model_settings* model_set, Data* dataset) { 4 | settings = model_set; 5 | data = dataset; 6 | 7 | // user influence 8 | printf("\tinitializing user influence (tau)\n"); 9 | tau = sp_fmat(data->user_count(), data->user_count()); 10 | logtau = sp_fmat(data->user_count(), data->user_count()); 11 | a_tau = sp_fmat(data->user_count(), data->user_count()); 12 | b_tau = sp_fmat(data->user_count(), data->user_count()); 13 | 14 | // user preferences 15 | printf("\tinitializing user preferences (theta)\n"); 16 | theta = fmat(settings->k, data->user_count()); 17 | logtheta = fmat(settings->k, data->user_count()); 18 | a_theta = fmat(settings->k, data->user_count()); 19 | b_theta = fmat(settings->k, data->user_count()); 20 | 21 | // item attributes 22 | printf("\tinitializing item attributes (beta)\n"); 23 | printf("\t%d users and %d items\n", data->user_count(), data->item_count()); 24 | beta = fmat(settings->k, data->item_count()); 25 | logbeta = fmat(settings->k, data->item_count()); 26 | a_beta = fmat(settings->k, data->item_count()); 27 | a_beta_user = fmat(settings->k, data->item_count()); 28 | b_beta = fmat(settings->k, data->item_count()); 29 | 30 | 
delta = fvec(data->item_count()); 31 | a_delta = fvec(data->item_count()); 32 | b_delta = settings->b_delta + data->user_count(); 33 | a_delta_user = fvec(data->item_count()); 34 | 35 | // keep track of old a_beta and a_delta for SVI 36 | a_beta_old = fmat(settings->k, data->item_count()); 37 | a_beta_old.fill(settings->a_beta); 38 | a_delta_old = fvec(data->item_count()); 39 | a_delta_old.fill(settings->a_delta); 40 | 41 | printf("\tsetting random seed\n"); 42 | rand_gen = gsl_rng_alloc(gsl_rng_taus); 43 | gsl_rng_set(rand_gen, (long) settings->seed); // init the seed 44 | 45 | initialize_parameters(); 46 | 47 | scale = settings->svi ? float(data->user_count()) / float(settings->sample_size) : 1; 48 | } 49 | 50 | void SPF::learn() { 51 | double old_likelihood, delta_likelihood, likelihood = -1e10; 52 | int likelihood_decreasing_count = 0; 53 | time_t start_time, end_time; 54 | 55 | int iteration = 0; 56 | char iter_as_str[4]; 57 | bool converged = false; 58 | bool on_final_pass = false; 59 | 60 | while (!converged) { 61 | time(&start_time); 62 | iteration++; 63 | printf("iteration %d\n", iteration); 64 | 65 | reset_helper_params(); 66 | 67 | // update rate for user preferences 68 | b_theta.each_col() += sum(beta, 1); 69 | 70 | set items; 71 | int user = -1, item, rating; 72 | for (int i = 0; i < settings->sample_size; i++) { 73 | if (on_final_pass && settings->final_pass_test) { 74 | user++; 75 | while (data->test_users.count(user)==0) { 76 | user++; 77 | } 78 | } else if (settings->svi) { 79 | user = gsl_rng_uniform_int(rand_gen, data->user_count()); 80 | } else { 81 | user = i; 82 | } 83 | 84 | bool user_converged = false; 85 | int user_iters = 0; 86 | while (!user_converged) { 87 | user_iters++; 88 | a_beta_user.zeros(); 89 | a_delta_user.zeros(); 90 | 91 | // look at all the user's items 92 | for (int j = 0; j < data->item_count(user); j++) { 93 | item = data->get_item(user, j); 94 | items.insert(item); 95 | rating = 1; 96 | //TODO: rating = 
data->get_train_rating(i); 97 | update_shape(user, item, rating); 98 | } 99 | 100 | // update per-user parameters 101 | double user_change = 0; 102 | if (!settings->factor_only && !settings->fix_influence) 103 | user_change += update_tau(user); 104 | if (!settings->social_only) 105 | user_change += update_theta(user); 106 | if (!settings->social_only && !settings->factor_only && !settings->fix_influence) { 107 | user_change /= 2; 108 | 109 | // if the updates are less than 1% change or over 10 local iterations, 110 | // declare the local params to be converged 111 | if (user_change < 0.01 || user_iters > 10) 112 | user_converged = true; 113 | 114 | } else { 115 | // if we're only looking at social or factor (not combined) 116 | // then the user parameters will always have converged with 117 | // a single pass (since there's nothing to balance against) 118 | user_converged = true; 119 | } 120 | } 121 | if (settings->verbose) 122 | printf("%d\tuser %d took %d iters to converge\n", iteration, user, user_iters); 123 | a_beta += a_beta_user; 124 | a_delta += a_delta_user; 125 | } 126 | 127 | if (!settings->social_only) { 128 | // update rate for item attributes 129 | b_beta.each_col() += sum(theta, 1); 130 | 131 | // update per-item parameters 132 | set::iterator it; 133 | for (it = items.begin(); it != items.end(); it++) { 134 | item = *it; 135 | if (iter_count[item] == 0) 136 | iter_count[item] = 0; 137 | iter_count[item]++; 138 | update_beta(item); 139 | if (settings->item_bias) 140 | update_delta(item); 141 | } 142 | } else if (settings->item_bias) { 143 | set::iterator it; 144 | for (it = items.begin(); it != items.end(); it++) { 145 | item = *it; 146 | if (iter_count[item] == 0) 147 | iter_count[item] = 0; 148 | iter_count[item]++; 149 | if (settings->item_bias) 150 | update_delta(item); 151 | } 152 | } 153 | 154 | 155 | // check for convergence 156 | if (on_final_pass) { 157 | printf("Final pass complete\n"); 158 | converged = true; 159 | 160 | old_likelihood = 
likelihood; 161 | likelihood = get_ave_log_likelihood(); 162 | delta_likelihood = abs((old_likelihood - likelihood) / 163 | old_likelihood); 164 | log_convergence(iteration, likelihood, delta_likelihood); 165 | } else if (iteration >= settings->max_iter) { 166 | printf("Reached maximum number of iterations.\n"); 167 | converged = true; 168 | 169 | old_likelihood = likelihood; 170 | likelihood = get_ave_log_likelihood(); 171 | delta_likelihood = abs((old_likelihood - likelihood) / 172 | old_likelihood); 173 | log_convergence(iteration, likelihood, delta_likelihood); 174 | } else if (iteration % settings->conv_freq == 0) { 175 | old_likelihood = likelihood; 176 | likelihood = get_ave_log_likelihood(); 177 | 178 | if (likelihood < old_likelihood) 179 | likelihood_decreasing_count += 1; 180 | else 181 | likelihood_decreasing_count = 0; 182 | delta_likelihood = abs((old_likelihood - likelihood) / 183 | old_likelihood); 184 | log_convergence(iteration, likelihood, delta_likelihood); 185 | if (settings->verbose) { 186 | printf("delta: %f\n", delta_likelihood); 187 | printf("old: %f\n", old_likelihood); 188 | printf("new: %f\n", likelihood); 189 | } 190 | if (iteration >= settings->min_iter && 191 | delta_likelihood < settings->likelihood_delta) { 192 | printf("Model converged.\n"); 193 | converged = true; 194 | } else if (iteration >= settings->min_iter && 195 | likelihood_decreasing_count >= 2) { 196 | printf("Likelihood decreasing.\n"); 197 | converged = true; 198 | } 199 | } 200 | 201 | // save intermediate results 202 | if (!converged && settings->save_freq > 0 && 203 | iteration % settings->save_freq == 0) { 204 | printf(" saving\n"); 205 | sprintf(iter_as_str, "%04d", iteration); 206 | save_parameters(iter_as_str); 207 | } 208 | 209 | // intermediate evaluation 210 | if (!converged && settings->eval_freq > 0 && 211 | iteration % settings->eval_freq == 0) { 212 | sprintf(iter_as_str, "%04d", iteration); 213 | evaluate(iter_as_str); 214 | } 215 | 216 | 
time(&end_time); 217 | log_time(iteration, difftime(end_time, start_time)); 218 | 219 | if (converged && !on_final_pass && 220 | (settings->final_pass || settings->final_pass_test)) { 221 | printf("final pass on all users.\n"); 222 | on_final_pass = true; 223 | converged = false; 224 | 225 | // we need to modify some settings for the final pass 226 | // things should look exactly like batch for all users 227 | if (settings->final_pass) { 228 | settings->set_stochastic_inference(false); 229 | settings->set_sample_size(data->user_count()); 230 | scale = 1; 231 | } else { 232 | settings->set_sample_size(data->test_users.size()); 233 | scale = data->user_count() / settings->sample_size; 234 | } 235 | } 236 | } 237 | 238 | save_parameters("final"); 239 | } 240 | 241 | double SPF::predict(int user, int item) { 242 | double prediction = settings->social_only ? 1e-10 : 0; 243 | 244 | prediction += accu(tau.col(user) % data->ratings.col(item)); 245 | 246 | if (!settings->social_only) { 247 | prediction += accu(theta.col(user) % beta.col(item)); 248 | } 249 | 250 | if (settings->item_bias) { 251 | prediction += delta(item); 252 | } 253 | 254 | return prediction; 255 | } 256 | 257 | // helper function to sort predictions properly 258 | bool prediction_compare(const pair, int>& itemA, 259 | const pair, int>& itemB) { 260 | // if the two values are equal, sort by popularity! 
261 | if (itemA.first.first == itemB.first.first) { 262 | if (itemA.first.second == itemB.first.second) 263 | return itemA.second < itemB.second; 264 | return itemA.first.second > itemB.first.second; 265 | } 266 | return itemA.first.first > itemB.first.first; 267 | } 268 | 269 | void SPF::evaluate() { 270 | evaluate("final", true); 271 | } 272 | 273 | void SPF::evaluate(string label) { 274 | evaluate(label, false); 275 | } 276 | 277 | void SPF::evaluate(string label, bool write_rankings) { 278 | time_t start_time, end_time; 279 | time(&start_time); 280 | 281 | eval(this, &Model::predict, settings->outdir, data, false, settings->seed, 282 | settings->verbose, label, write_rankings, false); 283 | 284 | time(&end_time); 285 | log_time(-1, difftime(end_time, start_time)); 286 | } 287 | 288 | 289 | 290 | /* PRIVATE */ 291 | 292 | void SPF::initialize_parameters() { 293 | int user, neighbor, n, item, i, k; 294 | if (!settings->factor_only) { 295 | for (user = 0; user < data->user_count(); user++) { 296 | // user influence 297 | for (n = 0; n < data->neighbor_count(user); n++) { 298 | neighbor = data->get_neighbor(user, n); 299 | tau(neighbor, user) = 1.0; 300 | logtau(neighbor, user) = log(1.0 + 1e-5); 301 | 302 | double all = settings->b_tau; 303 | for (i = 0; i < data->item_count(neighbor); i++) { 304 | item = data->get_item(neighbor, i); 305 | all += data->ratings(neighbor, item); 306 | } //TODO: this doeesn't need to be done as much... 
only one time per user (U), not UxU times 307 | b_tau(neighbor, user) = all; 308 | } 309 | } 310 | } 311 | 312 | if (!settings->social_only) { 313 | // user preferences 314 | for (user = 0; user < data->user_count(); user++) { 315 | for (k = 0; k < settings->k; k++) { 316 | theta(k, user) = (settings->a_theta + 317 | gsl_rng_uniform_pos(rand_gen)) 318 | / (settings->b_theta); 319 | logtheta(k, user) = log(theta(k, user)); 320 | } 321 | theta.col(user) /= accu(theta.col(user)); 322 | } 323 | 324 | // item attributes 325 | for (item = 0; item < data->item_count(); item++) { 326 | for (k = 0; k < settings->k; k++) { 327 | beta(k, item) = (settings->a_beta + 328 | gsl_rng_uniform_pos(rand_gen)) 329 | / (settings->b_beta); 330 | logbeta(k, item) = log(beta(k, item)); 331 | } 332 | beta.col(item) /= accu(beta.col(item)); 333 | } 334 | } 335 | 336 | if (settings->item_bias) { 337 | for (item = 0; item < data->item_count(); item++) { 338 | delta(item) = data->popularity(item); 339 | } 340 | } 341 | } 342 | 343 | void SPF::reset_helper_params() { 344 | a_tau = data->network_spmat * settings->a_tau; 345 | 346 | a_theta.fill(settings->a_theta); 347 | b_theta.fill(settings->b_theta); 348 | a_beta.fill(settings->a_beta); 349 | b_beta.fill(settings->b_beta); 350 | a_delta.fill(settings->a_delta); 351 | } 352 | 353 | void SPF::save_parameters(string label) { 354 | FILE* file; 355 | if (!settings->factor_only & !settings->fix_influence) { 356 | // save tau 357 | file = fopen((settings->outdir+"/tau-"+label+".dat").c_str(), "w"); 358 | fprintf(file, "uid\torig.uid\tvid\torig.vid\ttau\n"); 359 | int user, neighbor, n; 360 | double tau_uv; 361 | for (user = 0; user < data->user_count(); user++) { 362 | for (n = 0; n < data->neighbor_count(user); n++) { 363 | neighbor = data->get_neighbor(user, n); 364 | tau_uv = tau(neighbor, user); 365 | fprintf(file, "%d\t%d\t%d\t%d\t%e\n", user, data->user_id(user), 366 | neighbor, data->user_id(neighbor), tau_uv); 367 | } 368 | } 369 | 
fclose(file); 370 | } 371 | 372 | if (!settings->social_only) { 373 | int k; 374 | 375 | // write out theta 376 | file = fopen((settings->outdir+"/theta-"+label+".dat").c_str(), "w"); 377 | for (int user = 0; user < data->user_count(); user++) { 378 | fprintf(file, "%d\t%d", user, data->user_id(user)); 379 | for (k = 0; k < settings->k; k++) 380 | fprintf(file, "\t%e", theta(k, user)); 381 | fprintf(file, "\n"); 382 | } 383 | fclose(file); 384 | 385 | // write out beta 386 | file = fopen((settings->outdir+"/beta-"+label+".dat").c_str(), "w"); 387 | for (int item = 0; item < data->item_count(); item++) { 388 | fprintf(file, "%d\t%d", item, data->item_id(item)); 389 | for (k = 0; k < settings->k; k++) 390 | fprintf(file, "\t%e", beta(k, item)); 391 | fprintf(file, "\n"); 392 | } 393 | fclose(file); 394 | } 395 | 396 | if (settings->item_bias) { 397 | // write out bias delta 398 | file = fopen((settings->outdir+"/delta-"+label+".dat").c_str(), "w"); 399 | for (int item = 0; item < data->item_count(); item++) { 400 | fprintf(file, "%d\t%d\t%e", item, data->item_id(item), delta(item)); 401 | } 402 | fclose(file); 403 | } 404 | } 405 | 406 | void SPF::update_shape(int user, int item, int rating) { 407 | sp_fmat phi_SF = logtau.col(user) % data->ratings.col(item); 408 | 409 | double phi_sum = accu(phi_SF); 410 | 411 | fmat phi_MF; 412 | float phi_B = 0; 413 | // we don't need to do a similar check for factor only because 414 | // sparse matrices play nice when empty 415 | if (!settings->social_only) { 416 | phi_MF = exp(logtheta.col(user) + logbeta.col(item)); 417 | phi_sum += accu(phi_MF); 418 | } 419 | 420 | if (settings->item_bias) { 421 | phi_B = delta(item); 422 | phi_sum += phi_B; 423 | } 424 | 425 | if (phi_sum == 0) 426 | return; 427 | 428 | if (!settings->factor_only & !settings->fix_influence) { 429 | phi_SF /= phi_sum * rating; 430 | int neighbor; 431 | for (int n = 0; n < data->neighbor_count(user); n++) { 432 | neighbor = data->get_neighbor(user, n); 433 | 
a_tau(neighbor, user) += phi_SF(neighbor, 0); 434 | } 435 | } 436 | 437 | if (!settings->social_only) { 438 | phi_MF /= phi_sum * rating; 439 | a_theta.col(user) += phi_MF; 440 | a_beta_user.col(item) += phi_MF * scale; 441 | } 442 | 443 | if (settings->item_bias) { 444 | a_delta(item) += (phi_B / (phi_sum * rating)) * scale; 445 | } 446 | } 447 | 448 | double SPF::update_tau(int user) { 449 | int neighbor, n; 450 | double old, change, total; 451 | change = 0; 452 | total = 0; 453 | for (n = 0; n < data->neighbor_count(user); n++) { 454 | neighbor = data->get_neighbor(user, n); 455 | 456 | old = tau(neighbor, user); 457 | total += tau(neighbor, user); 458 | 459 | tau(neighbor, user) = a_tau(neighbor, user) / b_tau(neighbor, user); 460 | // fake log! 461 | logtau(neighbor, user) = exp(gsl_sf_psi(a_tau(neighbor, user)) - log(b_tau(neighbor, user))); 462 | 463 | change += abs(old - tau(neighbor, user)); 464 | } 465 | 466 | return total==0 ? 0 : change / total; 467 | } 468 | 469 | double SPF::update_theta(int user) { 470 | double change = accu(abs(theta(user) - (a_theta(user) / b_theta(user)))); 471 | double total = accu(theta(user)); 472 | 473 | for (int k = 0; k < settings->k; k++) { 474 | theta(k, user) = a_theta(k, user) / b_theta(k, user); 475 | logtheta(k, user) = gsl_sf_psi(a_theta(k, user)); 476 | } 477 | logtheta(user) = logtheta(user) - log(b_theta(user)); 478 | 479 | return change / total; 480 | } 481 | 482 | void SPF::update_beta(int item) { 483 | if (settings->svi) { 484 | double rho = pow(iter_count[item] + settings->delay, 485 | -1 * settings->forget); 486 | a_beta(item) = (1 - rho) * a_beta_old(item) + rho * a_beta(item); 487 | a_beta_old(item) = a_beta(item); 488 | } 489 | 490 | for (int k = 0; k < settings->k; k++) { 491 | beta(k, item) = a_beta(k, item) / b_beta(k, item); 492 | logbeta(k, item) = gsl_sf_psi(a_beta(k, item)); 493 | } 494 | logbeta(item) = logbeta(item) - log(b_beta(item)); 495 | } 496 | 497 | void SPF::update_delta(int item) { 498 | 
if (settings->svi) { 499 | double rho = pow(iter_count[item] + settings->delay, 500 | -1 * settings->forget); 501 | a_delta(item) = (1 - rho) * a_delta_old(item) + rho * a_delta(item); 502 | a_delta_old(item) = a_delta(item); 503 | } 504 | delta(item) = a_delta(item) / b_delta; 505 | } 506 | 507 | double SPF::get_ave_log_likelihood() { 508 | double prediction, likelihood = 0; 509 | int user, item, rating; 510 | for (int i = 0; i < data->num_validation(); i++) { 511 | user = data->get_validation_user(i); 512 | item = data->get_validation_item(i); 513 | rating = data->get_validation_rating(i); 514 | 515 | prediction = predict(user, item); 516 | 517 | likelihood += 518 | log(prediction) * rating - log(factorial(rating)) - prediction; 519 | } 520 | 521 | return likelihood / data->num_validation(); 522 | } 523 | 524 | void SPF::log_convergence(int iteration, double ave_ll, double delta_ll) { 525 | FILE* file = fopen((settings->outdir+"/log_likelihood.dat").c_str(), "a"); 526 | fprintf(file, "%d\t%f\t%f\n", iteration, ave_ll, delta_ll); 527 | fclose(file); 528 | } 529 | 530 | void SPF::log_time(int iteration, double duration) { 531 | FILE* file = fopen((settings->outdir+"/time_log.dat").c_str(), "a"); 532 | fprintf(file, "%d\t%.f\n", iteration, duration); 533 | fclose(file); 534 | } 535 | 536 | void SPF::log_user(FILE* file, int user, int heldout, double rmse, double mae, 537 | double rank, int first, double crr, double ncrr, double ndcg) { 538 | fprintf(file, "%d\t%f\t%f\t%f\t%d\t%f\t%f\t%f\n", user, 539 | rmse, mae, rank, first, crr, ncrr, ndcg); 540 | } 541 | -------------------------------------------------------------------------------- /src/spf.h: -------------------------------------------------------------------------------- 1 | #include 2 | #define ARMA_64BIT_WORD 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "utils.h" 9 | #include "eval.h" 10 | 11 | using namespace std; 12 | using namespace arma; 13 | #include "data.h" 14 | 15 | struct 
model_settings { 16 | bool verbose; 17 | 18 | string outdir; 19 | string datadir; 20 | 21 | double a_theta; 22 | double b_theta; 23 | double a_beta; 24 | double b_beta; 25 | double a_tau; 26 | double b_tau; 27 | double a_delta; 28 | double b_delta; 29 | 30 | bool social_only; 31 | bool factor_only; 32 | bool fix_influence; 33 | bool item_bias; 34 | bool binary; 35 | bool directed; 36 | 37 | long seed; 38 | int save_freq; 39 | int eval_freq; 40 | int conv_freq; 41 | int max_iter; 42 | int min_iter; 43 | double likelihood_delta; 44 | 45 | bool svi; 46 | bool final_pass; 47 | bool final_pass_test; 48 | int sample_size; 49 | double delay; 50 | double forget; 51 | 52 | int k; 53 | 54 | 55 | void set(bool print, string out, string data, bool use_svi, 56 | double athe, double bthe, double abet, double bbet, 57 | double atau, double btau, double adelta, double bdelta, 58 | bool social, bool factor, bool bias, bool bin, bool dir, 59 | long rand, int savef, int evalf, int convf, 60 | int iter_max, int iter_min, double delta, 61 | bool finalpass, bool finalpasstest, 62 | int sample, double svi_delay, double svi_forget, 63 | bool fix, int num_factors) { 64 | verbose = print; 65 | 66 | outdir = out; 67 | datadir = data; 68 | 69 | svi = use_svi; 70 | 71 | a_theta = athe; 72 | b_theta = bthe; 73 | a_beta = abet; 74 | b_beta = bbet; 75 | a_tau = atau; 76 | b_tau = btau; 77 | a_delta = adelta; 78 | b_delta = bdelta; 79 | 80 | social_only = social; 81 | factor_only = factor; 82 | fix_influence = fix; 83 | item_bias = bias; 84 | binary = bin; 85 | directed = dir; 86 | 87 | seed = rand; 88 | save_freq = savef; 89 | eval_freq = evalf; 90 | conv_freq = convf; 91 | max_iter = iter_max; 92 | min_iter = iter_min; 93 | likelihood_delta = delta; 94 | 95 | final_pass = finalpass; 96 | final_pass_test = finalpasstest; 97 | sample_size = sample; 98 | delay = svi_delay; 99 | forget = svi_forget; 100 | 101 | k = num_factors; 102 | } 103 | 104 | void set_stochastic_inference(bool setting) { 105 | 
svi = setting; 106 | } 107 | 108 | void set_sample_size(int setting) { 109 | sample_size = setting; 110 | } 111 | 112 | void save(string filename) { 113 | FILE* file = fopen(filename.c_str(), "w"); 114 | 115 | fprintf(file, "data directory: %s\n", datadir.c_str()); 116 | 117 | fprintf(file, "\nmodel specification:\n"); 118 | if (social_only) { 119 | fprintf(file, "\tsocial factorization (SF) [ social factors only ]\n"); 120 | } else if (factor_only) { 121 | fprintf(file, "\tPoisson factorization (PF) [ general preference factors only ]\n"); 122 | } else { 123 | fprintf(file, "\tsocial Poisson factorization (SPF)\n"); 124 | } 125 | 126 | if (fix_influence) { 127 | fprintf(file,"\tsocial influence parameters fixed to 1\n"); 128 | } 129 | 130 | if (!social_only) { 131 | fprintf(file, "\tK = %d (number of latent factors for general preferences)\n", k); 132 | } 133 | 134 | fprintf(file, "\nshape and rate hyperparameters:\n"); 135 | if (!social_only) { 136 | fprintf(file, "\ttheta (%f, %f)\n", a_theta, b_theta); 137 | fprintf(file, "\tbeta (%f, %f)\n", a_beta, b_beta); 138 | } 139 | if (!factor_only) { 140 | fprintf(file, "\ttau (%f, %f)\n", a_tau, b_tau); 141 | } 142 | if (item_bias) { 143 | fprintf(file, "\tdelta (%.2f, %.2f)\n", a_delta, b_delta); 144 | } 145 | 146 | 147 | fprintf(file, "\ndata attributes:\n"); 148 | 149 | if (binary) { 150 | fprintf(file, "\tbinary ratings\n"); 151 | } else { 152 | fprintf(file, "\tinteger ratings\n"); 153 | } 154 | 155 | if (!factor_only) { 156 | if (directed) { 157 | fprintf(file, "\tdirected network\n"); 158 | } else { 159 | fprintf(file, "\tundirected network\n"); 160 | } 161 | } 162 | 163 | 164 | fprintf(file, "\ninference parameters:\n"); 165 | fprintf(file, "\tseed: %d\n", (int)seed); 166 | fprintf(file, "\tsave frequency: %d\n", save_freq); 167 | fprintf(file, "\tevaluation frequency: %d\n", eval_freq); 168 | fprintf(file, "\tconvergence check frequency: %d\n", conv_freq); 169 | fprintf(file, "\tmaximum number of iterations: 
%d\n", max_iter); 170 | fprintf(file, "\tminimum number of iterations: %d\n", min_iter); 171 | fprintf(file, "\tchange in log likelihood for convergence: %f\n", likelihood_delta); 172 | fprintf(file, "\tfinal pass after convergence: %s\n", final_pass ? "all users" : 173 | (final_pass_test ? "test users only" : "none")); 174 | 175 | if (svi) { 176 | fprintf(file, "\nStochastic variational inference parameters\n"); 177 | fprintf(file, "\tsample size: %d\n", sample_size); 178 | fprintf(file, "\tSVI delay (tau): %f\n", delay); 179 | fprintf(file, "\tSVI forgetting rate (kappa): %f\n", forget); 180 | } else { 181 | fprintf(file, "\nusing batch variational inference\n"); 182 | } 183 | 184 | fclose(file); 185 | } 186 | }; 187 | 188 | class SPF: protected Model { 189 | private: 190 | model_settings* settings; 191 | Data* data; 192 | 193 | // model parameters 194 | sp_fmat tau; // user influence 195 | sp_fmat logtau; // fake "log" user influence 196 | // it's really exp(E[log(tau)]) which != E[tau] 197 | fmat theta; // user preferences 198 | fmat beta; // item attributes 199 | fmat logtheta; // log variant of above 200 | fmat logbeta; // ditto 201 | fvec delta; 202 | 203 | // helper parameters 204 | sp_fmat a_tau; 205 | sp_fmat b_tau; 206 | fmat a_theta; 207 | fmat b_theta; 208 | fmat a_beta; 209 | fmat a_beta_user; 210 | fmat a_beta_old; 211 | fmat b_beta; 212 | fvec a_delta; 213 | float b_delta; 214 | fvec a_delta_user; 215 | fvec a_delta_old; 216 | 217 | // random number generator 218 | gsl_rng* rand_gen; 219 | 220 | void initialize_parameters(); 221 | void reset_helper_params(); 222 | void save_parameters(string label); 223 | 224 | // parameter updates 225 | void update_shape(int user, int item, int rating); 226 | double update_tau(int user); 227 | double update_theta(int user); 228 | void update_beta(int item); 229 | void update_delta(int item); 230 | 231 | double get_ave_log_likelihood(); 232 | void log_convergence(int iteration, double ave_ll, double delta_ll); 233 | 
void log_time(int iteration, double duration); 234 | void log_params(int iteration, double tau_change, double theta_change); 235 | void log_user(FILE* file, int user, int heldout, double rmse, 236 | double mae, double rank, int first, double crr, double ncrr, 237 | double ndcg); 238 | 239 | // define how to scale updates (training / sample size) (for SVI) 240 | double scale; 241 | 242 | // counts of number of times an item has been seen in a sample (for SVI) 243 | map iter_count; 244 | 245 | void evaluate(string label); 246 | void evaluate(string label, bool write_rankings); 247 | 248 | 249 | public: 250 | SPF(model_settings* model_set, Data* dataset); 251 | void learn(); 252 | double predict(int user, int item); 253 | void evaluate(); 254 | 255 | }; 256 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | // check if file exisits 4 | bool file_exists(string filename) { 5 | if ( 0 == access(filename.c_str(), R_OK)) 6 | return true; 7 | return false; 8 | } 9 | 10 | // check if a directory exists 11 | int dir_exists(string dname) { 12 | struct stat st; 13 | int ret; 14 | 15 | if (stat(dname.c_str(),&st) != 0) { 16 | return 0; 17 | } 18 | 19 | ret = S_ISDIR(st.st_mode); 20 | 21 | /*if(!ret) { 22 | errno = ENOTDIR; 23 | }*/ 24 | 25 | return ret; 26 | } 27 | 28 | void make_directory(string name) { 29 | mkdir(name.c_str(), S_IRUSR|S_IWUSR|S_IXUSR); 30 | } 31 | 32 | void remove_directory(string name) { 33 | rmdir(name.c_str()); 34 | } 35 | 36 | double factorial(int x) { 37 | if (x == 0) 38 | return 1; 39 | return x * factorial(x - 1); 40 | } 41 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | using namespace std; 5 | #include 6 | #include 7 | 
#include 8 | 9 | 10 | /* 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | 31 | #define outlog(format, args...) \ 32 | fprintf(stderr, format, args); \ 33 | fprintf(stderr, "\n"); 34 | 35 | int compare (const void * a, const void * b); 36 | 37 | inline double safe_log(double x) { 38 | if (x <= 0) 39 | return(-10000); 40 | else 41 | return(log(x)); 42 | } 43 | double log_sum(double, double); 44 | 45 | inline double vget(const gsl_vector* v, int i) { return(gsl_vector_get(v, i)); } 46 | 47 | inline void vset(gsl_vector* v, int i, double x) { gsl_vector_set(v, i, x); } 48 | 49 | // Increment a vector element by a double. 50 | inline void vinc(gsl_vector* v, int i, double x) { 51 | vset(v, i, vget(v, i) + x); 52 | } 53 | 54 | inline double mget(const gsl_matrix* m, int i, int j) 55 | { return(gsl_matrix_get(m, i, j)); } 56 | 57 | inline void mset(gsl_matrix* m, int i, int j, double x) 58 | { gsl_matrix_set(m, i, j, x); } 59 | 60 | // Increment a matrix element by a double. 
61 | void minc(gsl_matrix*, int, int, double); 62 | 63 | void col_sum(const gsl_matrix*, gsl_vector*); 64 | void row_sum(const gsl_matrix*, gsl_vector*); 65 | 66 | void vct_fprintf(FILE* file, const gsl_vector* v); 67 | void mtx_fprintf(FILE* file, const gsl_matrix* m); 68 | void mtx_fscanf(FILE* file, gsl_matrix* m); 69 | 70 | inline bool check_sym(const gsl_matrix *m) { 71 | for (size_t i = 0; i < m->size1-1; i ++) 72 | for (size_t j=i; j < m->size2; j ++) 73 | if (mget(m, i, j) != mget(m, j, i)) { 74 | printf("not sym\n"); 75 | return false; 76 | } 77 | return true; 78 | } 79 | 80 | double log_det(const gsl_matrix*); 81 | 82 | void matrix_inverse(const gsl_matrix*, gsl_matrix*); 83 | void matrix_vector_solve(const gsl_matrix* m, const gsl_vector* b, gsl_vector* v); 84 | 85 | void sym_eigen(gsl_matrix*, gsl_vector*, gsl_matrix*); 86 | 87 | inline double vsum(const gsl_vector* v) { 88 | double val = 0; 89 | int i, size = v->size; 90 | for (i = 0; i < size; i++) 91 | val += vget(v, i); 92 | return(val); 93 | } 94 | 95 | double vnorm(const gsl_vector * v); 96 | 97 | void gsl_vector_apply(gsl_vector* x, double(*fun)(double)); 98 | void vct_log(gsl_vector* v); 99 | void mtx_log(gsl_matrix* x); 100 | void vct_exp(gsl_vector* x); 101 | void mtx_exp(gsl_matrix* x); 102 | 103 | double mahalanobis_distance(const gsl_matrix * m, const gsl_vector* u, const gsl_vector* v); 104 | double mahalanobis_prod(const gsl_matrix * m, const gsl_vector* u, const gsl_vector* v); 105 | double matrix_dot_prod(const gsl_matrix * m1, const gsl_matrix* m2); 106 | 107 | void choose_k_from_n(int k, int n, int* result, int* src); 108 | void sample_k_from_n(int k, int n, int* result, int* src); 109 | 110 | double log_normalize(gsl_vector* x); 111 | double vnormalize(gsl_vector* x); 112 | */ 113 | int dir_exists(string dname); 114 | bool file_exists(string filename); 115 | void make_directory(string name); 116 | void remove_directory(string name); 117 | 118 | double factorial(int x); 119 | /* 120 | 
double digamma(double x); 121 | unsigned int rmultinomial(const gsl_vector* v); 122 | double rgamma(double a, double b); 123 | double rbeta(double a, double b); 124 | unsigned int rbernoulli(double p); 125 | double runiform(); 126 | void rshuffle (void* base, size_t n, size_t size); 127 | unsigned long int runiform_int(unsigned long int n); 128 | 129 | // new and free random number generator 130 | gsl_rng* new_random_number_generator(long seed); 131 | void free_random_number_generator(gsl_rng * random_number_generator); 132 | */ 133 | #endif 134 | --------------------------------------------------------------------------------