├── .gitignore ├── LICENSE ├── README.md ├── README_poisoning_attacks.md ├── __init__.py ├── algorithms ├── DBGD │ ├── __init__.py │ ├── neural │ │ ├── __init__.py │ │ └── pdbgd.py │ ├── pdbgd.py │ ├── pdbgd_dsp.py │ ├── pmgd.py │ ├── pmgd_dsp.py │ ├── tdNSGD.py │ ├── tdNSGD_dsp.py │ ├── tddbgd.py │ └── tdmgd.py ├── PDGD │ ├── __init__.py │ ├── deeppdgd.py │ └── pdgd.py ├── __init__.py ├── baselines │ ├── __init__.py │ └── pairwise.py └── basiconlineranker.py ├── attack.sh ├── attack_graph.py ├── attacker_avg_summarize.py ├── attacker_weights ├── Weights_mq2007.txt ├── Weights_td2003.txt ├── Weights_web10k.txt └── Weights_yahoo.txt ├── graphs ├── makeaverages.py ├── makegraphs.py └── maketables.py ├── models ├── __init__.py ├── evolutionneuralmodel.py ├── linearmodel.py ├── neuralmodel.py └── neuralnet.py ├── multileaving ├── PairwisePreferenceMultileave.py ├── ProbabilisticMultileave.py ├── TeamDraftMultileave.py └── __init__.py ├── scripts ├── CIKM2018.py ├── Poisoning_attacks │ ├── attack_DBGD_99_lr.py │ ├── attack_DBGD_base_lr.py │ ├── attack_MGD_99_lr.py │ └── attack_MGD_base_lr.py ├── SIGIR2018.py ├── SIGIR2019.py ├── SIGIR2019_nsgd.py ├── __init__.py └── slurm │ └── SIGIR2019 │ ├── 0708.slurm │ ├── np.slurm │ ├── nsgd │ ├── 0708.slurm │ ├── np.slurm │ ├── web10k.slurm │ └── webscope1.slurm │ ├── web10k.slurm │ └── webscope1.slurm └── utils ├── __init__.py ├── argparsers ├── __init__.py └── simulationargparser.py ├── attackeraverager.py ├── attackeroutput.py ├── attacksimulation.py ├── averageoutput.py ├── clicks.py ├── dataset.py ├── datasetcollections.py ├── datasimulation.py ├── evaluate.py ├── rankings.py └── simulationoutput.py /.gitignore: -------------------------------------------------------------------------------- 1 | tmp/*.pickle 2 | 3 | # Compiled source # 4 | ################### 5 | *.pyc 6 | *.egg-info 7 | build/ 8 | dist/ 9 | 10 | # Files generated by eclipse # 11 | ############################## 12 | .coverage 13 | .project 14 | .pydevproject 15 | 16 | # OS generated files # 17 | ###################### 18 | .DS_Store 19 | .DS_Store? 20 | ._* 21 | .*.swp 22 | .Spotlight-V100 23 | .Trashes 24 | Icon? 25 | ehthumbs.db 26 | Thumbs.db 27 | 28 | # Other files / directories 29 | exp 30 | gurobi.log 31 | 32 | # Files generated by the runner script (click models) # 33 | outdir 34 | pdf 35 | pdf_test 36 | log_folder 37 | 38 | # Files generated by PyCharm 39 | .idea 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 H.R. Oosterhuis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Null Space Gradient Descent (NSGD) and Document Space Projected Dueling Bandit Gradient Descent (DBGD-DSP) 2 | This repository contains the code used to produce the experimental results found in "Efficient Exploration of Gradient Space for Online Learning to Rank" and "Variance Reduction in Gradient Exploration for Online Learning to Rank", published at SIGIR 2018 and SIGIR 2019, respectively. It was forked from Harrie Oosterhuis's repository for "Differentiable Unbiased Online Learning to Rank", published at CIKM 2018, at https://github.com/HarrieO/OnlineLearningToRank. 3 | 4 | NSGD Algorithm 5 | ------- 6 | This algorithm was developed to increase the efficiency of gradient-space exploration in online learning to rank. It does this in three steps. First, the null space of previously poorly performing directions is computed, and new directions are sampled from within this null space, which avoids repeatedly exploring unpromising directions. Second, a candidate preselection step chooses, for evaluation, the sampled directions that best differentiate the current query's documents. Third, in the event of a tie, a tie-breaking mechanism reevaluates the tied candidates on historically difficult queries and chooses a winner. 7 | 8 | DBGD-DSP Algorithm 9 | ------- 10 | This algorithm acts as a wrapper around other DBGD-style algorithms to reduce their ranker variance and improve overall performance in online learning to rank. DBGD-DSP works by modifying the winning ranker after the interleaved test. In particular, it projects the winning gradient onto the space spanned by the query-document feature vectors associated with the given query. This reduces the variance in gradient exploration by removing the component of the winning gradient that is orthogonal to the document space, and which therefore contributes nothing to the loss function or the true gradient estimate. 11 |
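To make these two ideas concrete, here is a minimal NumPy sketch of NSGD's null-space sampling and DBGD-DSP's document-space projection. It is illustrative only: the repository's actual logic lives in models/linearmodel.py (see `sample_candidates_null_space` and `update_to_mean_winners`), and the function names and array shapes below are assumptions made for the example.

```
import numpy as np

def sample_null_space_direction(bad_gradients):
  # NSGD step 1 (sketch): sample an exploration direction from the null
  # space of previously poorly performing directions. bad_gradients is
  # assumed to be an (m, d) array with one direction per row and m < d.
  _, s, vt = np.linalg.svd(bad_gradients)
  # Rows of vt beyond the numerical rank form an orthonormal basis of
  # the null space of bad_gradients.
  rank = int(np.sum(s > 1e-10))
  null_basis = vt[rank:]
  # Random combination of the basis vectors, normalized to unit length.
  direction = np.random.randn(null_basis.shape[0]).dot(null_basis)
  return direction / np.linalg.norm(direction)

def project_onto_document_space(gradient, doc_features):
  # DBGD-DSP projection (sketch): keep only the component of the winning
  # gradient (shape (d,)) that lies in the span of the examined document
  # feature vectors (shape (k, d)).
  coef = np.linalg.lstsq(doc_features.T, gradient, rcond=None)[0]
  return doc_features.T.dot(coef)
```

In this sketch the projection is computed with a least-squares solve rather than an explicit projection matrix, which is cheap when only the handful of documents the user examined is involved.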
12 | Usage 13 | ------- 14 | To run the code to generate experimental results like those found in our papers, you will need to run a command in the following format, using Python 2 (SIGIR2018.py, SIGIR2019.py, and SIGIR2019_nsgd.py are all run similarly): 15 | 16 | ``` 17 | python scripts/SIGIR2019.py [-h] [--n_runs N_RUNS] [--n_impr N_IMPRESSIONS] [--vali] 18 | [--vali_in_train] --data_sets DATA_SETS [DATA_SETS ...] 19 | [--output_folder OUTPUT_FOLDER] [--log_folder LOG_FOLDER] 20 | [--average_folder AVERAGE_FOLDER] [--small_dataset] 21 | --click_models CLICK_MODELS [CLICK_MODELS ...] 22 | [--print_freq PRINT_FREQ] [--print_logscale] 23 | [--print_output] [--max_folds MAX_FOLDS] 24 | [--n_proc N_PROCESSING] [--no_run_details] 25 | [--n_results N_RESULTS] [--skip_read_bin_data] 26 | [--skip_store_bin_data] [--train_only] [--all_train] 27 | [--nonrel_test] 28 | ``` 29 | 30 | In the command above, parameters within square brackets are optional. In our papers, we used the MQ2007 and MQ2008 datasets from LETOR 4.0, the Yahoo! Learning to Rank Challenge dataset, and the MSLR-WEB10K dataset. The possible click models are described in our papers: inf = informational, nav = navigational, and per = perfect. 31 | 32 | Poisoning Attacks 33 | ------- 34 | This repository also contains the code that we used to show the robustness of DBGD/MGD-based algorithms. Further details can be found here: [link](README_poisoning_attacks.md) 35 | 36 | Citation 37 | -------- 38 | 39 | If you use this code to produce results for your scientific publication, please refer to our SIGIR 2019 paper and/or SIGIR 2018 paper: 40 | 41 | ``` 42 | @inproceedings{wang2019variance, 43 | title={Variance Reduction in Gradient Exploration for Online Learning to Rank}, 44 | author={Wang, Huazheng and Kim, Sonwoo and McCord-Snook, Eric and Wu, Qingyun and Wang, Hongning}, 45 | booktitle={The 42nd International ACM SIGIR Conference on Research \& Development in Information Retrieval}, 46 | year={2019}, 47 | organization={ACM} 48 | } 49 | 50 | @inproceedings{wang2018efficient, 51 | title={Efficient exploration of gradient space for online learning to rank}, 52 | author={Wang, Huazheng and Langley, Ramsey and Kim, Sonwoo and McCord-Snook, Eric and Wang, Hongning}, 53 | booktitle={The 41st International ACM SIGIR Conference on Research \& Development in Information Retrieval}, 54 | year={2018}, 55 | organization={ACM} 56 | } 57 | ``` 58 | 59 | License 60 | ------- 61 | 62 | The contents of this repository are licensed under the [MIT license](LICENSE). If you modify its contents in any way, please link back to this repository. 63 | -------------------------------------------------------------------------------- /README_poisoning_attacks.md: -------------------------------------------------------------------------------- 1 | ## Poisoning Attacks on Online Learning to Rank 2 | 3 | This repository contains the code that we used to show the robustness of DBGD/MGD-based algorithms. 4 | 5 | The **attacker_weights** folder contains the attacker's weight files for 4 datasets. Additional weight files can be added here depending on the dataset (number of features). These files are read in **utils/attacksimulation.py**. 6 | 7 | Usage 8 | ------- 9 | To run the code to generate experimental results, you can simply run the attack.sh script. This script in turn calls another script present in the scripts/Poisoning_attacks directory. Four scripts are provided there, depending on the algorithm and the learning rate decay. 10 | 11 | An example of such a script invocation is given: 12 | ``` 13 | python2 scripts/Poisoning_attacks/attack_DBGD_base_lr.py --data_sets local_MQ2007 --attacker_click_model frequency_attack\ 14 | --click_models exper1 --log_folder ./log --output_folder ./output --average_folder ./average \ 15 | --n_impr 10000 --n_runs 10 --n_proc 10 --n_results 10 --start 0 --end 1 --which 1 --mf 5 --sd_const 2.0 --num_attacker_relevant 5 16 | ``` 17 | 18 | For the details of each of the arguments, see the **utils/argparsers/simulationargparser.py** file. 19 | 20 | Additionally, after running the experiments, two folders will be created, namely **attackeroutput** and **attackeraverage**. Graphs can be generated via the **attack_graph.py** script on the averaged output file.
Here is an example: 21 | 22 | ``` 23 | python3 attack_graph.py attackeraverage/MQ2007/attack/TD_DBGD_frequency_attack_10_res_0_start_1_end_1_half_10000_impressions0.9999977_lrdecay.out 24 | ``` 25 | 26 | 27 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/__init__.py -------------------------------------------------------------------------------- /algorithms/DBGD/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/DBGD/__init__.py -------------------------------------------------------------------------------- /algorithms/DBGD/neural/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/DBGD/neural/__init__.py -------------------------------------------------------------------------------- /algorithms/DBGD/neural/pdbgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import utils.rankings as rnk 7 | from models.evolutionneuralmodel import EvolutionNeuralModel 8 | from algorithms.DBGD.pdbgd import P_DBGD 9 | 10 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 11 | class Neural_P_DBGD(P_DBGD): 12 | 13 | def __init__(self, learning_rate, learning_rate_decay, 14 | hidden_layers, *args, **kargs): 15 | super(Neural_P_DBGD, self).__init__(learning_rate = learning_rate, 16 | learning_rate_decay = learning_rate_decay, 17 | *args, **kargs) 18 | self.model = EvolutionNeuralModel( 19 | n_features = self.n_features, 20 | learning_rate = learning_rate, 21 | n_candidates = 1, 22 | learning_rate_decay = learning_rate_decay, 23 | hidden_layers = hidden_layers) 24 | 25 | @staticmethod 26 | def default_parameters(): 27 | parent_parameters = P_DBGD.default_parameters() 28 | parent_parameters.update({ 29 | 'learning_rate': 0.01, 30 | 'learning_rate_decay': 1.0, 31 | 'PM_n_samples': 10000, 32 | 'PM_tau': 3.0, 33 | 'hidden_layers': [64], 34 | }) 35 | return parent_parameters 36 | -------------------------------------------------------------------------------- /algorithms/DBGD/pdbgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import utils.rankings as rnk 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | from multileaving.ProbabilisticMultileave import ProbabilisticMultileave 9 | 10 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 11 | class P_DBGD(TD_DBGD): 12 | 13 | def __init__(self, PM_n_samples, PM_tau, *args, **kargs): 14 | super(P_DBGD, self).__init__(*args, **kargs) 15 | self.multileaving = ProbabilisticMultileave( 16 | n_samples = PM_n_samples, 17 | tau = PM_tau, 18 | n_results=self.n_results) 19 | 20 | @staticmethod 21 | def default_parameters(): 22 | parent_parameters = TD_DBGD.default_parameters() 23 | parent_parameters.update({ 24 | 'learning_rate': 0.01, 25 | 'learning_rate_decay': 1.0, 26 | 
'PM_n_samples': 10000, 27 | 'PM_tau': 3.0, 28 | }) 29 | return parent_parameters 30 | 31 | def _create_train_ranking(self, query_id, query_feat, inverted): 32 | assert inverted==False 33 | self.model.sample_candidates() 34 | scores = self.model.candidate_score(query_feat) 35 | inverted_rankings = rnk.rank_single_query(scores, 36 | inverted=True, 37 | n_results=None) 38 | multileaved_list = self.multileaving.make_multileaving(inverted_rankings) 39 | return multileaved_list 40 | -------------------------------------------------------------------------------- /algorithms/DBGD/pdbgd_dsp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import utils.rankings as rnk 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | from multileaving.ProbabilisticMultileave import ProbabilisticMultileave 9 | import numpy as np 10 | import math 11 | 12 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 13 | class P_DBGD_DSP(TD_DBGD): 14 | 15 | def __init__(self, k_initial, k_increase, PM_n_samples, PM_tau, prev_qeury_len=None, docspace=[False,0], *args, **kargs): 16 | super(P_DBGD_DSP, self).__init__(*args, **kargs) 17 | 18 | self.multileaving = ProbabilisticMultileave( 19 | n_samples = PM_n_samples, 20 | tau = PM_tau, 21 | n_results=self.n_results) 22 | 23 | self.k_initial = k_initial 24 | self.k_increase = k_increase 25 | 26 | self.prev_qeury_len = prev_qeury_len # queue size of features from previous queries 27 | if prev_qeury_len: 28 | self.prev_feat_list = [] 29 | # for document space length experiment 30 | # docspace=[True,3] means use a superset of the document space the user examined, with three additional documents. 31 | self.docspace = docspace 32 | 33 | @staticmethod 34 | def default_parameters(): 35 | parent_parameters = TD_DBGD.default_parameters() 36 | parent_parameters.update({ 37 | 'learning_rate': 0.01, 38 | 'learning_rate_decay': 1.0, 39 | 'PM_n_samples': 10000, 40 | 'PM_tau': 3.0, 41 | }) 42 | return parent_parameters 43 | 44 | def _create_train_ranking(self, query_id, query_feat, inverted): 45 | # Save query_id to get access to query_feat when updating 46 | self.query_id = query_id 47 | assert inverted==False 48 | self.model.sample_candidates() 49 | scores = self.model.candidate_score(query_feat) 50 | inverted_rankings = rnk.rank_single_query(scores, 51 | inverted=True, 52 | n_results=None) 53 | multileaved_list = self.multileaving.make_multileaving(inverted_rankings) 54 | return multileaved_list 55 | 56 | 57 | def update_to_interaction(self, clicks, stop_index=None): 58 | 59 | winners = self.multileaving.winning_rankers(clicks) 60 | ############################################################### 61 | if True in clicks: 62 | # For projection 63 | # keep track of feature vectors of doc list 64 | viewed_list = [] 65 | # index of last click 66 | last_click = max(loc for loc, val in enumerate(clicks) if val == True) 67 | # prevent last_click+k from exceeding interleaved list length 68 | k_current = self.k_initial 69 | if self.k_increase: 70 | # gradually increase k 71 | k_current += int(self.n_interactions/1000) 72 | last_doc_index = min(last_click+k_current, len(self._last_ranking)) 73 | 74 | if self.docspace[0] and stop_index is not None: # for document space length experiment 75 | # create a sub/superset of the perfect document space the user examined. 76 | # user-examined documents come from ccm, which determines where the user leaves.
77 | last_doc_index = stop_index + self.docspace[1] + 1 # 1 added for stopping document, which has been examined. 78 | last_doc_index = max(last_doc_index,1) # At least 1 79 | last_doc_index = min(last_doc_index,len(self._last_ranking)) # At most length of current list 80 | 81 | query_feat = self.get_query_features(self.query_id, 82 | self._train_features, 83 | self._train_query_ranges) 84 | for i in range(last_doc_index): 85 | docid = self._last_ranking[i] 86 | feature = query_feat[docid] 87 | viewed_list.append(feature) 88 | add_list = viewed_list 89 | 90 | # Append feature vectors from previous queries 91 | if self.prev_qeury_len: 92 | if len(self.prev_feat_list) > 0: 93 | viewed_list = np.append(viewed_list,self.prev_feat_list, axis=0) 94 | 95 | # Add examined feature vectors of current query to be used in later iterations 96 | for i in add_list: 97 | if len(self.prev_feat_list) >= self.prev_qeury_len : 98 | self.prev_feat_list.pop(0) # Remove oldest document feature. 99 | # if prev_feat_list is not filled up, add current list 100 | self.prev_feat_list.append(i) 101 | 102 | self.model.update_to_mean_winners(winners,viewed_list) 103 | ############################################################### 104 | else: 105 | self.model.update_to_mean_winners(winners) 106 | 107 | -------------------------------------------------------------------------------- /algorithms/DBGD/pmgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from algorithms.DBGD.pdbgd import P_DBGD 7 | from models.linearmodel import LinearModel 8 | 9 | 10 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 11 | class P_MGD(P_DBGD): 12 | 13 | def __init__(self, n_candidates, *args, **kargs): 14 | super(P_MGD, self).__init__(*args, **kargs) 15 | self.n_candidates = n_candidates 16 | self.model = LinearModel(n_features = self.n_features, 17 | learning_rate = self.learning_rate, 18 | n_candidates = self.n_candidates) 19 | 20 | 21 | @staticmethod 22 | def default_parameters(): 23 | parent_parameters = P_DBGD.default_parameters() 24 | parent_parameters.update({ 25 | 'n_candidates': 49, 26 | }) 27 | return parent_parameters 28 | -------------------------------------------------------------------------------- /algorithms/DBGD/pmgd_dsp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from algorithms.DBGD.pdbgd import P_DBGD 7 | import utils.rankings as rnk 8 | from models.linearmodel import LinearModel 9 | import numpy as np 10 | import math 11 | 12 | 13 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 14 | class P_MGD_DSP(P_DBGD): 15 | 16 | def __init__(self, k_initial, k_increase, n_candidates, prev_qeury_len=None, docspace=[False,0], *args, **kargs): 17 | super(P_MGD_DSP, self).__init__(*args, **kargs) 18 | self.n_candidates = n_candidates 19 | self.model = LinearModel(n_features = self.n_features, 20 | learning_rate = self.learning_rate, 21 | n_candidates = self.n_candidates) 22 | 23 | self.k_initial = k_initial 24 | self.k_increase = k_increase 25 | 26 | self.prev_qeury_len = prev_qeury_len # queue size of features from previous queries 27 | if prev_qeury_len: 28 | self.prev_feat_list = [] 29 | # for document space length experiment 30 | # docspace=[True,3] means use 
a superset of the document space the user examined, with three additional documents. 31 | self.docspace = docspace 32 | 33 | @staticmethod 34 | def default_parameters(): 35 | parent_parameters = P_DBGD.default_parameters() 36 | parent_parameters.update({ 37 | 'n_candidates': 49, 38 | }) 39 | return parent_parameters 40 | 41 | def _create_train_ranking(self, query_id, query_feat, inverted): 42 | # Save query_id to get access to query_feat when updating 43 | self.query_id = query_id 44 | assert inverted==False 45 | self.model.sample_candidates() 46 | scores = self.model.candidate_score(query_feat) 47 | inverted_rankings = rnk.rank_single_query(scores, 48 | inverted=True, 49 | n_results=None) 50 | multileaved_list = self.multileaving.make_multileaving(inverted_rankings) 51 | return multileaved_list 52 | 53 | def update_to_interaction(self, clicks, stop_index=None): 54 | 55 | winners = self.multileaving.winning_rankers(clicks) 56 | ############################################################### 57 | if True in clicks: 58 | # For projection 59 | # keep track of feature vectors of doc list 60 | viewed_list = [] 61 | # index of last click 62 | last_click = max(loc for loc, val in enumerate(clicks) if val == True) 63 | # prevent last_click+k from exceeding interleaved list length 64 | k_current = self.k_initial 65 | if self.k_increase: 66 | # gradually increase k 67 | k_current += int(self.n_interactions/1000) 68 | last_doc_index = min(last_click+k_current, len(self._last_ranking)) 69 | 70 | if self.docspace[0] and stop_index is not None: # for document space length experiment 71 | # create a sub/superset of the perfect document space the user examined. 72 | # user-examined documents come from ccm, which determines where the user leaves. 73 | last_doc_index = stop_index + self.docspace[1] + 1 # 1 added for stopping document, which has been examined. 74 | last_doc_index = max(last_doc_index,1) # At least 1 75 | last_doc_index = min(last_doc_index,len(self._last_ranking)) # At most length of current list 76 | 77 | query_feat = self.get_query_features(self.query_id, 78 | self._train_features, 79 | self._train_query_ranges) 80 | for i in range(last_doc_index): 81 | docid = self._last_ranking[i] 82 | feature = query_feat[docid] 83 | viewed_list.append(feature) 84 | add_list = viewed_list 85 | 86 | # Append feature vectors from previous queries 87 | if self.prev_qeury_len: 88 | if len(self.prev_feat_list) > 0: 89 | viewed_list = np.append(viewed_list,self.prev_feat_list, axis=0) 90 | 91 | # Add examined feature vectors of current query to be used in later iterations 92 | for i in add_list: 93 | if len(self.prev_feat_list) >= self.prev_qeury_len : 94 | self.prev_feat_list.pop(0) # Remove oldest document feature.
95 | # if prev_feat_list is not filled up, add current list 96 | self.prev_feat_list.append(i) 97 | 98 | self.model.update_to_mean_winners(winners,viewed_list) 99 | ############################################################### 100 | else: 101 | self.model.update_to_mean_winners(winners) -------------------------------------------------------------------------------- /algorithms/DBGD/tdNSGD.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from models.linearmodel import LinearModel 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | import numpy as np 9 | from sys import maxint 10 | import copy 11 | from scipy.spatial.distance import cosine 12 | import utils.rankings as rnk 13 | # Dueling Bandit Gradient Descent 14 | class TD_NSGD(TD_DBGD): 15 | 16 | def __init__(self, n_candidates, GRAD_SIZE, EXP_SIZE, TB_QUEUE_SIZE=None, TB_WINDOW_SIZE=None, *args, **kargs): 17 | super(TD_NSGD, self).__init__(*args, **kargs) 18 | self.model = LinearModel(n_features = self.n_features, 19 | learning_rate = self.learning_rate, 20 | n_candidates = n_candidates) 21 | self.GRAD_SIZE = GRAD_SIZE 22 | self.EXP_SIZE = EXP_SIZE 23 | self.TB_QUEUE_SIZE = TB_QUEUE_SIZE 24 | self.TB_WINDOW_SIZE = TB_WINDOW_SIZE 25 | self.sample_basis = True 26 | self.clicklist = np.empty([self.GRAD_SIZE,1], dtype=int) #click array 27 | self.grad = np.zeros([self.GRAD_SIZE,self.n_features], dtype=float) 28 | self.gradCol = 0 29 | 30 | # DQ tie-break related lists 31 | self.difficult_NDCG =[] 32 | self.difficult_queries =[] 33 | self.difficult_document =[] 34 | self.difficult_time =[] 35 | self.query_id = 0 36 | 37 | @staticmethod 38 | def default_parameters(): 39 | parent_parameters = TD_DBGD.default_parameters() 40 | parent_parameters.update({ 41 | 'n_candidates': 9, 42 | }) 43 | return parent_parameters 44 | 45 | def update_to_interaction(self, clicks, stop_index=None): 46 | winners, ranker_clicks = self.multileaving.winning_rankers_with_clicks(clicks) 47 | 48 | # Fill out recent difficult query queues. 
49 | if self.TB_QUEUE_SIZE > 0: 50 | self.fill_difficult_query(clicks) 51 | # Trigger difficult-query tie-break strategy 52 | if len(self.difficult_queries) < self.TB_QUEUE_SIZE and len(winners) > 1: 53 | winners = self.tieBreak_difficultQuery(winners) 54 | 55 | self.model.update_to_mean_winners(winners) 56 | 57 | cl_sorted = sorted(ranker_clicks) # in ascending order 58 | for i in range(1, len(ranker_clicks)): 59 | # only save subset of rankers (worst 4 out of 9 rankers) 60 | # add if current cl is smaller than or equal to maximum from the set of candidates 61 | if ranker_clicks[i] <= cl_sorted[3] and ranker_clicks[i] max and j not in nums: 80 | max = self.clicklist[j] # The better cl value to be excluded 81 | n = j # index of it 82 | nums.append(n) 83 | 84 | # create subset of gradient matrix 85 | grad_temp = np.zeros([self.EXP_SIZE, self.n_features], dtype=float) 86 | c = 0 87 | for i in range(0,self.GRAD_SIZE): 88 | if i not in nums: 89 | # The worst 'EXP_SIZE' gradients from grad[] added to grad_temp 90 | grad_temp[c] = copy.deepcopy(self.grad[i]) 91 | c = c + 1 92 | 93 | self.model.sample_candidates_null_space(grad_temp, query_feat, self.sample_basis) 94 | scores = self.model.candidate_score(query_feat) 95 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 96 | multileaved_list = self.multileaving.make_multileaving(rankings) 97 | return multileaved_list 98 | 99 | def fill_difficult_query(self, clicks): 100 | # Set up for tie breaker - keep track of difficult queries 101 | # Find the rank of first clicked document 102 | ndcg_current = 0 103 | clickedList = [] 104 | for count, elem in enumerate(clicks): 105 | if elem == 1: # if clicked 106 | ndcg_current += 1 / (count + 1.0) 107 | # Keep track of clicked documents of current query 108 | clickedList.append(self._last_ranking[count]) 109 | 110 | # If the difficult-query queue for tie breaking is not filled up, add the current query 111 | if len(self.difficult_NDCG) < self.TB_QUEUE_SIZE and ndcg_current > 0: 112 | self.difficult_NDCG.append(ndcg_current) 113 | self.difficult_queries.append(self.query_id) 114 | self.difficult_document.append(clickedList) # first clicked doc to follow 115 | self.difficult_time.append(self.n_interactions) 116 | else: 117 | # If already filled up, check if current query is more difficult than any saved query.
118 | if len(self.difficult_NDCG) > 0: 119 | flag = False 120 | for i in range(len(self.difficult_NDCG)): 121 | if self.n_interactions - self.difficult_time[i] > self.TB_WINDOW_SIZE: 122 | # Maintain queries within the window size 123 | flag = True 124 | index = i 125 | break 126 | if not flag and max(self.difficult_NDCG) > ndcg_current and ndcg_current > 0: 127 | # Current query is more difficult than one of the queued ones 128 | flag = True 129 | index = self.difficult_NDCG.index(max(self.difficult_NDCG)) 130 | if flag: 131 | self.difficult_NDCG[index] = ndcg_current 132 | self.difficult_queries[index] = self.query_id 133 | self.difficult_document[index] = clickedList 134 | self.difficult_time[index] = self.n_interactions 135 | 136 | def tieBreak_difficultQuery(self, winners): 137 | # scoreList keeps track of how each tied candidate performs in tie breaking 138 | scoreList = np.zeros(self.model.n_models) 139 | # Iterate through the stored difficult queries 140 | for count_q, diff_query in enumerate(self.difficult_queries): 141 | query_feat = self.get_query_features(diff_query, 142 | self._train_features, 143 | self._train_query_ranges) 144 | scores = self.model.candidate_score(query_feat) 145 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 146 | 147 | # Iterate through tied candidates 148 | for winner in winners: 149 | candidate_NDCG = 0.0 150 | for count_d, doc in enumerate(self.difficult_document[count_q]): 151 | # Calculate NDCG performance in current difficult query 152 | diff_doc_rank = np.where(rankings[winner] == self.difficult_document[count_q][count_d])[0][0] 153 | temp = 1 / (diff_doc_rank + 1.0) 154 | candidate_NDCG += 1 / (diff_doc_rank + 1.0) 155 | 156 | # Add the NDCG value of the difficult query 157 | scoreList[winner] += candidate_NDCG 158 | # Ranker with the highest sum of NDCGs is the winner 159 | maxRank_score = np.max(scoreList[np.nonzero(scoreList)]) 160 | winner = scoreList.tolist().index(maxRank_score) 161 | return [winner] 162 | -------------------------------------------------------------------------------- /algorithms/DBGD/tdNSGD_dsp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from models.linearmodel import LinearModel 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | import numpy as np 9 | from sys import maxint 10 | import copy 11 | from scipy.spatial.distance import cosine 12 | import utils.rankings as rnk 13 | # Dueling Bandit Gradient Descent 14 | class TD_NSGD_DSP(TD_DBGD): 15 | 16 | def __init__(self, n_candidates, GRAD_SIZE, EXP_SIZE, k_initial, k_increase, TB_QUEUE_SIZE=None, TB_WINDOW_SIZE=None, prev_qeury_len=None, *args, **kargs): 17 | super(TD_NSGD_DSP, self).__init__(*args, **kargs) 18 | self.model = LinearModel(n_features = self.n_features, 19 | learning_rate = self.learning_rate, 20 | n_candidates = n_candidates) 21 | self.GRAD_SIZE = GRAD_SIZE 22 | self.EXP_SIZE = EXP_SIZE 23 | self.TB_QUEUE_SIZE = TB_QUEUE_SIZE 24 | self.TB_WINDOW_SIZE = TB_WINDOW_SIZE 25 | self.sample_basis = True 26 | self.clicklist = np.empty([self.GRAD_SIZE,1], dtype=int) #click array 27 | self.grad = np.zeros([self.GRAD_SIZE,self.n_features], dtype=float) 28 | self.gradCol = 0 29 | 30 | # DQ tie-break related lists 31 | self.difficult_NDCG =[] 32 | self.difficult_queries =[] 33 | self.difficult_document =[] 34 | self.difficult_time =[] 35 | self.query_id = 0 36 |
self.k_initial = k_initial 38 | self.k_increase = k_increase 39 | 40 | # Secondary techniques 41 | self.prev_qeury_len = prev_qeury_len 42 | if prev_qeury_len: 43 | self.prev_feat_list = [] 44 | 45 | @staticmethod 46 | def default_parameters(): 47 | parent_parameters = TD_DBGD.default_parameters() 48 | parent_parameters.update({ 49 | 'n_candidates': 9, 50 | }) 51 | return parent_parameters 52 | 53 | def update_to_interaction(self, clicks, stop_index=None): 54 | 55 | winners, ranker_clicks = self.multileaving.winning_rankers_with_clicks(clicks) 56 | 57 | # Fill out recent difficult query queues. 58 | if self.TB_QUEUE_SIZE > 0: 59 | self.fill_difficult_query(clicks) 60 | # Trigger difficult-query tie-break strategy 61 | if len(self.difficult_queries) < self.TB_QUEUE_SIZE and len(winners) > 1: 62 | winners = self.tieBreak_difficultQuery(winners) 63 | 64 | 65 | ############################################################### 66 | if True in clicks: 67 | # For projection 68 | # keep track of feature vectors of doc list 69 | viewed_list = [] 70 | # index of last click 71 | last_click = max(loc for loc, val in enumerate(clicks) if val == True) 72 | # prevent last_click+k from exceeding interleaved list length 73 | k_current = self.k_initial 74 | if self.k_increase: 75 | # gradually increase k 76 | k_current += int(self.n_interactions/1000) 77 | last_doc_index = min(last_click+k_current, len(self._last_ranking)-1) 78 | 79 | query_feat = self.get_query_features(self.query_id, 80 | self._train_features, 81 | self._train_query_ranges) 82 | for i in range(last_doc_index): 83 | docid = self._last_ranking[i] 84 | feature = query_feat[docid] 85 | viewed_list.append(feature) 86 | self.model.update_to_mean_winners(winners,viewed_list) 87 | ############################################################### 88 | else: 89 | self.model.update_to_mean_winners(winners) 90 | 91 | cl_sorted = sorted(ranker_clicks) # in ascending order 92 | for i in range(1, len(ranker_clicks)): 93 | # only save subset of rankers (worst 4 out of 9 rankers) 94 | # add if current cl is smaller than or equal to maximum from the set of candidates 95 | if ranker_clicks[i] <= cl_sorted[3] and ranker_clicks[i] max and j not in nums: 114 | max = self.clicklist[j] # The better cl value to be excluded 115 | n = j # index of it 116 | nums.append(n) 117 | 118 | # create subset of gradient matrix 119 | grad_temp = np.zeros([self.EXP_SIZE, self.n_features], dtype=float) 120 | c = 0 121 | for i in range(0,self.GRAD_SIZE): 122 | if i not in nums: 123 | # The worst 'EXP_SIZE' gradients from grad[] added to grad_temp 124 | grad_temp[c] = copy.deepcopy(self.grad[i]) 125 | c = c + 1 126 | 127 | self.model.sample_candidates_null_space(grad_temp, query_feat, self.sample_basis) 128 | scores = self.model.candidate_score(query_feat) 129 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 130 | multileaved_list = self.multileaving.make_multileaving(rankings) 131 | return multileaved_list 132 | 133 | def fill_difficult_query(self, clicks): 134 | # Set up for tie breaker - keep track of difficult queries 135 | # Find the rank of first clicked document 136 | ndcg_current = 0 137 | clickedList = [] 138 | for count, elem in enumerate(clicks): 139 | if elem == 1: # if clicked 140 | ndcg_current += 1 / (count + 1.0) 141 | # Keep track of clicked documents of current query 142 | clickedList.append(self._last_ranking[count]) 143 | 144 | # If the difficult-query queue for tie breaking is not filled up, add the current query 145 | if
len(self.difficult_NDCG) < self.TB_QUEUE_SIZE and ndcg_current > 0: 146 | self.difficult_NDCG.append(ndcg_current) 147 | self.difficult_queries.append(self.query_id) 148 | self.difficult_document.append(clickedList) # first clicked doc to follow 149 | self.difficult_time.append(self.n_interactions) 150 | else: 151 | # If already filled up, check if current query is more difficult than any saved query. 152 | if len(self.difficult_NDCG) > 0: 153 | flag = False 154 | for i in range(len(self.difficult_NDCG)): 155 | if self.n_interactions - self.difficult_time[i] > self.TB_WINDOW_SIZE: 156 | # Maintain queries within the window size 157 | flag = True 158 | index = i 159 | break 160 | if not flag and max(self.difficult_NDCG) > ndcg_current and ndcg_current > 0: 161 | # Current query is more difficult than one of the queued ones 162 | flag = True 163 | index = self.difficult_NDCG.index(max(self.difficult_NDCG)) 164 | if flag: 165 | self.difficult_NDCG[index] = ndcg_current 166 | self.difficult_queries[index] = self.query_id 167 | self.difficult_document[index] = clickedList 168 | self.difficult_time[index] = self.n_interactions 169 | 170 | def tieBreak_difficultQuery(self, winners): 171 | # scoreList keeps track of how each tied candidate performs in tie breaking 172 | scoreList = np.zeros(self.model.n_models) 173 | # Iterate through the stored difficult queries 174 | for count_q, diff_query in enumerate(self.difficult_queries): 175 | query_feat = self.get_query_features(diff_query, 176 | self._train_features, 177 | self._train_query_ranges) 178 | scores = self.model.candidate_score(query_feat) 179 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 180 | 181 | # Iterate through tied candidates 182 | for winner in winners: 183 | candidate_NDCG = 0.0 184 | for count_d, doc in enumerate(self.difficult_document[count_q]): 185 | # Calculate NDCG performance in current difficult query 186 | diff_doc_rank = np.where(rankings[winner] == self.difficult_document[count_q][count_d])[0][0] 187 | temp = 1 / (diff_doc_rank + 1.0) 188 | candidate_NDCG += 1 / (diff_doc_rank + 1.0) 189 | 190 | # Add the NDCG value of diff.
query 191 | scoreList[winner] += candidate_NDCG 192 | # Ranker with the highest sum of NDCGs is the winner 193 | maxRank_score = np.max(scoreList[np.nonzero(scoreList)]) 194 | winner = scoreList.tolist().index(maxRank_score) 195 | return [winner] 196 | -------------------------------------------------------------------------------- /algorithms/DBGD/tddbgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | from models.linearmodel import LinearModel 9 | from algorithms.basiconlineranker import BasicOnlineRanker 10 | from multileaving.TeamDraftMultileave import TeamDraftMultileave 11 | 12 | # Dueling Bandit Gradient Descent 13 | class TD_DBGD(BasicOnlineRanker): 14 | 15 | def __init__(self, learning_rate, learning_rate_decay, 16 | *args, **kargs): 17 | super(TD_DBGD, self).__init__(*args, **kargs) 18 | self.learning_rate = learning_rate 19 | self.model = LinearModel(n_features = self.n_features, 20 | learning_rate = learning_rate, 21 | n_candidates = 1, 22 | learning_rate_decay = learning_rate_decay) 23 | self.multileaving = TeamDraftMultileave( 24 | n_results=self.n_results) 25 | 26 | 27 | @staticmethod 28 | def default_parameters(): 29 | parent_parameters = BasicOnlineRanker.default_parameters() 30 | parent_parameters.update({ 31 | 'learning_rate': 0.01, 32 | 'learning_rate_decay': 1.0, 33 | }) 34 | return parent_parameters 35 | 36 | def get_test_rankings(self, features, 37 | query_ranges, inverted=True): 38 | scores = self.model.score(features) 39 | return rnk.rank_multiple_queries( 40 | scores, 41 | query_ranges, 42 | inverted=inverted, 43 | n_results=self.n_results) 44 | 45 | def _create_train_ranking(self, query_id, query_feat, inverted): 46 | assert inverted == False 47 | self.model.sample_candidates() 48 | scores = self.model.candidate_score(query_feat) 49 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 50 | multileaved_list = self.multileaving.make_multileaving(rankings) 51 | return multileaved_list 52 | 53 | def update_to_interaction(self, clicks): 54 | winners = self.multileaving.winning_rankers(clicks) 55 | self.model.update_to_mean_winners(winners) 56 | -------------------------------------------------------------------------------- /algorithms/DBGD/tdmgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from models.linearmodel import LinearModel 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | 9 | 10 | # Dueling Bandit Gradient Descent 11 | class TD_MGD(TD_DBGD): 12 | 13 | def __init__(self, n_candidates, *args, **kargs): 14 | super(TD_MGD, self).__init__(*args, **kargs) 15 | self.model = LinearModel(n_features = self.n_features, 16 | learning_rate = self.learning_rate, 17 | n_candidates = n_candidates, 18 | learning_rate_decay = self.model.learning_rate_decay) 19 | 20 | @staticmethod 21 | def default_parameters(): 22 | parent_parameters = TD_DBGD.default_parameters() 23 | parent_parameters.update({ 24 | 'n_candidates': 9, 25 | }) 26 | return parent_parameters 27 | -------------------------------------------------------------------------------- /algorithms/PDGD/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/PDGD/__init__.py -------------------------------------------------------------------------------- /algorithms/PDGD/deeppdgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | from models.neuralmodel import NeuralModel 9 | from algorithms.PDGD.pdgd import PDGD 10 | 11 | # Pairwise Differentiable Gradient Descent 12 | class DeepPDGD(PDGD): 13 | 14 | def __init__(self, hidden_layers, *args, **kargs): 15 | super(DeepPDGD, self).__init__(*args, **kargs) 16 | self.model = NeuralModel(n_features = self.n_features, 17 | learning_rate = self.learning_rate, 18 | learning_rate_decay = self.learning_rate_decay, 19 | hidden_layers = hidden_layers) 20 | 21 | @staticmethod 22 | def default_parameters(): 23 | parent_parameters = PDGD.default_parameters() 24 | parent_parameters.update({ 25 | 'learning_rate': 0.01, 26 | 'hidden_layers': [64], 27 | }) 28 | return parent_parameters 29 | -------------------------------------------------------------------------------- /algorithms/PDGD/pdgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | from models.linearmodel import LinearModel 9 | from algorithms.basiconlineranker import BasicOnlineRanker 10 | 11 | # Pairwise Differentiable Gradient Descent 12 | class PDGD(BasicOnlineRanker): 13 | 14 | def __init__(self, learning_rate, learning_rate_decay, 15 | *args, **kargs): 16 | super(PDGD, self).__init__(*args, **kargs) 17 | self.learning_rate = learning_rate 18 | self.learning_rate_decay = learning_rate_decay 19 | self.model = LinearModel(n_features = self.n_features, 20 | learning_rate = learning_rate, 21 | learning_rate_decay = learning_rate_decay, 22 | n_candidates = 1) 23 | 24 | 25 | @staticmethod 26 | def default_parameters(): 27 | parent_parameters = BasicOnlineRanker.default_parameters() 28 | parent_parameters.update({ 29 | 'learning_rate': 0.1, 30 | 'learning_rate_decay': 1.0, 31 | }) 32 | return parent_parameters 33 | 34 | def get_test_rankings(self, features, 35 | query_ranges, inverted=True): 36 | scores = -self.model.score(features) 37 | return rnk.rank_multiple_queries( 38 | scores, 39 | query_ranges, 40 | inverted=inverted, 41 | n_results=self.n_results) 42 | 43 | def _create_train_ranking(self, query_id, query_feat, inverted): 44 | assert inverted == False 45 | n_docs = query_feat.shape[0] 46 | k = np.minimum(self.n_results, n_docs) 47 | self.doc_scores = self.model.score(query_feat) 48 | self.doc_scores += 18 - np.amax(self.doc_scores) 49 | self.ranking = self._recursive_choice(np.copy(self.doc_scores), 50 | np.array([], dtype=np.int32), 51 | k) 52 | self._last_query_feat = query_feat 53 | return self.ranking 54 | 55 | def _recursive_choice(self, scores, incomplete_ranking, k_left): 56 | n_docs = scores.shape[0] 57 | scores[incomplete_ranking] = np.amin(scores) 58 | scores += 18 - np.amax(scores) 59 | exp_scores = np.exp(scores) 60 | exp_scores[incomplete_ranking] = 0 61 | probs = exp_scores/np.sum(exp_scores) 62 | safe_n = np.sum(probs > 10**(-4)/n_docs) 63 | safe_k = np.minimum(safe_n, k_left) 64 
| 65 | next_ranking = np.random.choice(np.arange(n_docs), 66 | replace=False, 67 | p=probs, 68 | size=safe_k) 69 | ranking = np.concatenate((incomplete_ranking, next_ranking)) 70 | 71 | k_left = k_left - safe_k 72 | if k_left > 0: 73 | return self._recursive_choice(scores, ranking, k_left) 74 | else: 75 | return ranking 76 | 77 | def update_to_interaction(self, clicks): 78 | if np.any(clicks): 79 | self._update_to_clicks(clicks) 80 | 81 | def _update_to_clicks(self, clicks): 82 | n_docs = self.ranking.shape[0] 83 | cur_k = np.minimum(n_docs, self.n_results) 84 | 85 | included = np.ones(cur_k, dtype=np.int32) 86 | if not clicks[-1]: 87 | included[1:] = np.cumsum(clicks[::-1])[:0:-1] 88 | neg_ind = np.where(np.logical_xor(clicks, included))[0] 89 | pos_ind = np.where(clicks)[0] 90 | 91 | n_pos = pos_ind.shape[0] 92 | n_neg = neg_ind.shape[0] 93 | n_pairs = n_pos*n_neg 94 | 95 | if n_pairs == 0: 96 | return 97 | 98 | pos_r_ind = self.ranking[pos_ind] 99 | neg_r_ind = self.ranking[neg_ind] 100 | 101 | pos_scores = self.doc_scores[pos_r_ind] 102 | neg_scores = self.doc_scores[neg_r_ind] 103 | 104 | log_pair_pos = np.tile(pos_scores, n_neg) 105 | log_pair_neg = np.repeat(neg_scores, n_pos) 106 | 107 | pair_trans = 18 - np.maximum(log_pair_pos, log_pair_neg) 108 | exp_pair_pos = np.exp(log_pair_pos + pair_trans) 109 | exp_pair_neg = np.exp(log_pair_neg + pair_trans) 110 | 111 | pair_denom = (exp_pair_pos + exp_pair_neg) 112 | pair_w = np.maximum(exp_pair_pos, exp_pair_neg) 113 | pair_w /= pair_denom 114 | pair_w /= pair_denom 115 | pair_w *= np.minimum(exp_pair_pos, exp_pair_neg) 116 | 117 | pair_w *= self._calculate_unbias_weights(pos_ind, neg_ind) 118 | 119 | reshaped = np.reshape(pair_w, (n_neg, n_pos)) 120 | pos_w = np.sum(reshaped, axis=0) 121 | neg_w = -np.sum(reshaped, axis=1) 122 | 123 | all_w = np.concatenate([pos_w, neg_w]) 124 | all_ind = np.concatenate([pos_r_ind, neg_r_ind]) 125 | 126 | self.model.update_to_documents(all_ind, 127 | all_w) 128 | 129 | def _calculate_unbias_weights(self, pos_ind, neg_ind): 130 | ranking_prob = self._calculate_observed_prob(pos_ind, neg_ind, 131 | self.doc_scores) 132 | flipped_prob = self._calculate_flipped_prob(pos_ind, neg_ind, 133 | self.doc_scores) 134 | return flipped_prob / (ranking_prob + flipped_prob) 135 | 136 | def _calculate_observed_prob(self, pos_ind, neg_ind, doc_scores): 137 | n_pos = pos_ind.shape[0] 138 | n_neg = neg_ind.shape[0] 139 | n_pairs = n_pos * n_neg 140 | n_results = self.ranking.shape[0] 141 | n_docs = doc_scores.shape[0] 142 | 143 | results_i = np.arange(n_results) 144 | pair_i = np.arange(n_pairs) 145 | doc_i = np.arange(n_docs) 146 | 147 | pos_pair_i = np.tile(pos_ind, n_neg) 148 | neg_pair_i = np.repeat(neg_ind, n_pos) 149 | 150 | min_pair_i = np.minimum(pos_pair_i, neg_pair_i) 151 | max_pair_i = np.maximum(pos_pair_i, neg_pair_i) 152 | range_mask = np.logical_and(min_pair_i[:, None] <= results_i, 153 | max_pair_i[:, None] >= results_i) 154 | 155 | safe_log = np.tile(doc_scores[None, :], 156 | [n_results, 1]) 157 | 158 | mask = np.zeros((n_results, n_docs)) 159 | mask[results_i[1:], self.ranking[:-1]] = True 160 | mask = np.cumsum(mask, axis=0).astype(bool) 161 | 162 | safe_log[mask] = np.amin(safe_log) 163 | safe_max = np.amax(safe_log, axis=1) 164 | safe_log -= safe_max[:, None] - 18 165 | safe_exp = np.exp(safe_log) 166 | safe_exp[mask] = 0 167 | 168 | ranking_log = doc_scores[self.ranking] - safe_max + 18 169 | ranking_exp = np.exp(ranking_log) 170 | 171 | safe_denom = np.sum(safe_exp, axis=1) 172 | ranking_prob = 
ranking_exp/safe_denom 173 | 174 | tiled_prob = np.tile(ranking_prob[None, :], [n_pairs, 1]) 175 | 176 | safe_prob = np.ones((n_pairs, n_results)) 177 | safe_prob[range_mask] = tiled_prob[range_mask] 178 | 179 | safe_pair_prob = np.prod(safe_prob, axis=1) 180 | 181 | return safe_pair_prob 182 | 183 | def _calculate_flipped_prob(self, pos_ind, neg_ind, doc_scores): 184 | n_pos = pos_ind.shape[0] 185 | n_neg = neg_ind.shape[0] 186 | n_pairs = n_pos * n_neg 187 | n_results = self.ranking.shape[0] 188 | n_docs = doc_scores.shape[0] 189 | 190 | results_i = np.arange(n_results) 191 | pair_i = np.arange(n_pairs) 192 | doc_i = np.arange(n_docs) 193 | 194 | pos_pair_i = np.tile(pos_ind, n_neg) 195 | neg_pair_i = np.repeat(neg_ind, n_pos) 196 | 197 | flipped_rankings = np.tile(self.ranking[None, :], 198 | [n_pairs, 1]) 199 | flipped_rankings[pair_i, pos_pair_i] = self.ranking[neg_pair_i] 200 | flipped_rankings[pair_i, neg_pair_i] = self.ranking[pos_pair_i] 201 | 202 | min_pair_i = np.minimum(pos_pair_i, neg_pair_i) 203 | max_pair_i = np.maximum(pos_pair_i, neg_pair_i) 204 | range_mask = np.logical_and(min_pair_i[:, None] <= results_i, 205 | max_pair_i[:, None] >= results_i) 206 | 207 | flipped_log = doc_scores[flipped_rankings] 208 | 209 | safe_log = np.tile(doc_scores[None, None, :], 210 | [n_pairs, n_results, 1]) 211 | 212 | results_ij = np.tile(results_i[None, 1:], [n_pairs, 1]) 213 | pair_ij = np.tile(pair_i[:, None], [1, n_results-1]) 214 | mask = np.zeros((n_pairs, n_results, n_docs)) 215 | mask[pair_ij, results_ij, flipped_rankings[:, :-1]] = True 216 | mask = np.cumsum(mask, axis=1).astype(bool) 217 | 218 | safe_log[mask] = np.amin(safe_log) 219 | safe_max = np.amax(safe_log, axis=2) 220 | safe_log -= safe_max[:, :, None] - 18 221 | flipped_log -= safe_max - 18 222 | flipped_exp = np.exp(flipped_log) 223 | 224 | safe_exp = np.exp(safe_log) 225 | safe_exp[mask] = 0 226 | safe_denom = np.sum(safe_exp, axis=2) 227 | safe_prob = np.ones((n_pairs, n_results)) 228 | safe_prob[range_mask] = (flipped_exp/safe_denom)[range_mask] 229 | 230 | safe_pair_prob = np.prod(safe_prob, axis=1) 231 | 232 | return safe_pair_prob 233 | 234 | -------------------------------------------------------------------------------- /algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/__init__.py -------------------------------------------------------------------------------- /algorithms/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/baselines/__init__.py -------------------------------------------------------------------------------- /algorithms/baselines/pairwise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | from algorithms.PDGD.pdgd import PDGD 9 | 10 | # Pairwise Baseline from Hofmann 11 | class Pairwise(PDGD): 12 | 13 | def __init__(self, epsilon, 14 | *args, **kargs): 15 | super(Pairwise, self).__init__(*args, **kargs) 16 | self.epsilon = epsilon 17 | 18 | def _create_train_ranking(self, query_id, query_feat, inverted): 19 | assert inverted == False 20 | n_docs 
= query_feat.shape[0] 21 | k = np.minimum(self.n_results, n_docs) 22 | self.doc_scores = self.model.score(query_feat) 23 | 24 | exploit = rnk.rank_query(self.doc_scores, inverted=False, n_results=k) 25 | explore = np.random.permutation(np.arange(n_docs)) 26 | coinflips = np.random.uniform(size=k) > self.epsilon 27 | 28 | self.ranking = -np.ones(k, dtype=np.int32) 29 | exploit_i = 0 30 | explore_i = 0 31 | for i in range(k): 32 | if coinflips[i]: 33 | while exploit[exploit_i] in self.ranking: 34 | exploit_i += 1 35 | self.ranking[i] = exploit[exploit_i] 36 | exploit_i += 1 37 | else: 38 | while explore[explore_i] in self.ranking: 39 | explore_i += 1 40 | self.ranking[i] = explore[explore_i] 41 | explore_i += 1 42 | 43 | self._last_query_feat = query_feat 44 | return self.ranking 45 | 46 | def _update_to_clicks(self, clicks): 47 | n_docs = self.ranking.shape[0] 48 | cur_k = np.minimum(n_docs, self.n_results) 49 | 50 | included = np.ones(cur_k, dtype=np.int32) 51 | if not clicks[-1]: 52 | included[1:] = np.cumsum(clicks[::-1])[:0:-1] 53 | neg_ind = np.where(np.logical_xor(clicks, included))[0] 54 | pos_ind = np.where(clicks)[0] 55 | 56 | n_pos = pos_ind.shape[0] 57 | n_neg = neg_ind.shape[0] 58 | n_pairs = n_pos*n_neg 59 | 60 | if n_pairs == 0: 61 | return 62 | 63 | pos_r_ind = self.ranking[pos_ind] 64 | neg_r_ind = self.ranking[neg_ind] 65 | 66 | all_w = np.zeros(n_pos + n_neg) 67 | all_w[:n_pos] = n_neg 68 | all_w[n_pos:] = -n_pos 69 | 70 | all_ind = np.concatenate([pos_r_ind, neg_r_ind]) 71 | 72 | self.model.update_to_documents(all_ind, 73 | all_w) -------------------------------------------------------------------------------- /algorithms/basiconlineranker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | 9 | class BasicOnlineRanker(object): 10 | 11 | def __init__(self, n_results, n_features): 12 | self.n_features = n_features 13 | self.n_results = n_results 14 | 15 | self.n_interactions = 0 16 | self.model_updates = 0 17 | self._messages = {} 18 | self._default_messages = {} 19 | 20 | self._train_features = None 21 | self._train_query_ranges = None 22 | 23 | @staticmethod 24 | def default_parameters(): 25 | '''Return all parameter values for this ranker. 
26 | Used for logging purposes.''' 27 | return {} 28 | 29 | def add_message(self, name, default_value=0): 30 | self._default_messages[name] = default_value 31 | 32 | def remove_message(self, name): 33 | del self._default_messages[name] 34 | 35 | def set_message(self, name, value): 36 | self._messages[name] = value 37 | 38 | def get_messages(self): 39 | messages = self._default_messages.copy() 40 | messages.update(self._messages) 41 | return messages 42 | 43 | def reset_messages(self): 44 | self._messages.clear() 45 | 46 | def setup(self, train_features, train_query_ranges): 47 | self._train_features = train_features 48 | self._train_query_ranges = train_query_ranges 49 | 50 | def clean(self): 51 | del self._train_features 52 | del self._train_query_ranges 53 | 54 | def get_test_rankings(self, features, 55 | query_ranges, inverted=True): 56 | return rnk.rank_multiple_queries( 57 | np.zeros(features.shape[0]), 58 | query_ranges, 59 | inverted=inverted, 60 | n_results=self.n_results) 61 | 62 | def get_query_features(self, query_id, features, 63 | query_ranges): 64 | start_i = query_ranges[query_id] 65 | end_i = query_ranges[query_id + 1] 66 | return features[start_i:end_i, :] 67 | 68 | def get_query_label(self, query_id, label_vector, 69 | query_ranges): 70 | start_i = query_ranges[query_id] 71 | end_i = query_ranges[query_id + 1] 72 | return label_vector[start_i:end_i] 73 | 74 | def get_query_size(self, query_id, query_ranges): 75 | return query_ranges[query_id+1] - query_ranges[query_id] 76 | 77 | def get_train_query_ranking(self, query_id, inverted=False): 78 | self._last_query_id = query_id 79 | query_feat = self.get_query_features(query_id, 80 | self._train_features, 81 | self._train_query_ranges) 82 | self._last_ranking = self._create_train_ranking( 83 | query_id, 84 | query_feat, 85 | inverted)[:self.n_results] 86 | return self._last_ranking 87 | 88 | def _create_train_ranking(self, query_id, query_feat, inverted): 89 | n_docs = self.get_query_size(query_id, 90 | self._train_query_ranges) 91 | return rnk.rank_single_query(np.zeros(n_docs), 92 | inverted=inverted, 93 | n_results=self.n_results)[:self.n_results] 94 | 95 | def process_clicks(self, clicks): 96 | self.update_to_interaction(clicks) 97 | self.n_interactions += 1 98 | 99 | def update_to_interaction(self, clicks): 100 | pass -------------------------------------------------------------------------------- /attack.sh: -------------------------------------------------------------------------------- 1 | python2 scripts/Poisoning_attacks/attack_DBGD_base_lr.py --data_sets local_MQ2007 --attacker_click_model frequency_attack\ 2 | --click_models exper1 --log_folder ./log --output_folder ./output --average_folder ./average \ 3 | --n_impr 10000 --n_runs 10 --n_proc 10 --n_results 10 --start 0 --end 1 --which 1 --mf 5 --sd_const 2.0 --num_attacker_relevant 5 4 | -------------------------------------------------------------------------------- /attack_graph.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import json 3 | import sys 4 | 5 | 6 | DBGD_output = open(sys.argv[1], 'r') 7 | print(sys.argv[1]) 8 | DBGD_output_lines = DBGD_output.readlines() 9 | # Extract macro here 10 | macro = json.loads(DBGD_output_lines[0]) 11 | # print(macro) 12 | 13 | DBGD_output_lines = DBGD_output_lines[0:] 14 | 15 | NDCG_attack = [] 16 | NDCG_label = [] 17 | LR = [] 18 | 19 | iterations = [] 20 | graph_title = "" 21 | attack_name = "" 22 | 23 | label_lr = "" 24 | 25
| if "TD2003" in sys.argv[1]: 26 | label_lr = "TD2003" 27 | elif "MQ2007" in sys.argv[1]: 28 | label_lr = "MQ2007" 29 | elif "Yahoo" in sys.argv[1]: 30 | label_lr = "Yahoo" 31 | elif "MSLR" in sys.argv[1]: 32 | label_lr = "MSLR" 33 | else: 34 | print("Wrong name of dataset!!!") 35 | 36 | file_name = sys.argv[1] 37 | 38 | if "frequency" in file_name: 39 | attack_name = "frequency_attack" 40 | else: 41 | attack_name = "naive_intersection_attack" 42 | 43 | for line in DBGD_output_lines: 44 | run_details = json.loads(line)['simulation_arguments'] 45 | graph_title = run_details['simulation_arguments']['attacker_click_model'] 46 | Tau = [] 47 | num_clicks= [] 48 | 49 | 50 | run_results = json.loads(line)['results'] 51 | print(graph_title) 52 | run_results = json.loads(line)['results']["NDCG_attack"][attack_name]["mean"] 53 | it = 0 54 | for val in run_results: 55 | NDCG_attack.append(val) 56 | 57 | run_results = json.loads(line)['results']["NDCG_label"][attack_name]["mean"] 58 | 59 | for val in run_results: 60 | NDCG_label.append(val) 61 | iterations.append(it) 62 | it += 1 63 | 64 | run_results = json.loads(line)['results']["LR"][attack_name]["mean"] 65 | 66 | for val in run_results: 67 | LR.append(float('%.8f'%val)) 68 | 69 | fig_ndcg_attack, ax_ndcg_attack = plt.subplots() 70 | ax_ndcg_attack.plot(iterations, NDCG_attack, '#FF0000', label="NDCG attacker") 71 | ax_ndcg_attack.plot(iterations, NDCG_label, '#d79232', label="NDCG ground truth") 72 | 73 | 74 | # ax_ndcg_attack.set_title("") 75 | ax_ndcg_attack.set_xlabel("Iteration") 76 | ax_ndcg_attack.set_ylabel("NDCG@10") 77 | plt.legend() 78 | 79 | 80 | fig_LR, ax_LR = plt.subplots() 81 | 82 | ax_LR.plot(iterations, LR, '#FF0000', label=label_lr) 83 | ax_LR.set_xlabel("Iteration") 84 | ax_LR.set_ylabel("Learning Rate") 85 | plt.legend() 86 | 87 | plt.show() 88 | -------------------------------------------------------------------------------- /attacker_avg_summarize.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import json 3 | import sys 4 | 5 | 6 | DBGD_output = open(sys.argv[1], 'r') 7 | DBGD_output_lines = DBGD_output.readlines() 8 | # Extract macro here 9 | macro = json.loads(DBGD_output_lines[0]) 10 | # print(macro) 11 | 12 | DBGD_output_lines = DBGD_output_lines[0:] 13 | 14 | NDCG_attack = [] 15 | NDCG_label = [] 16 | iterations = [] 17 | lr = [] 18 | # graph_title = "" 19 | attack_name = "" 20 | 21 | file_name = sys.argv[1] 22 | 23 | if "new_freq_attack" in file_name: 24 | attack_name = "new_freq_attack" 25 | elif "freq_attack" in file_name: 26 | attack_name = "freq_attack" 27 | else: 28 | attack_name = "click_kth_doc" 29 | 30 | for line in DBGD_output_lines: 31 | run_details = json.loads(line)['simulation_arguments'] 32 | # graph_title = run_details['simulation_arguments']['click_models'][0] 33 | Tau = [] 34 | num_clicks= [] 35 | 36 | 37 | run_results = json.loads(line)['results'] 38 | # print(graph_title) 39 | run_results = json.loads(line)['results']["NDCG_attack"][attack_name]["mean"] 40 | 41 | count = 0 42 | for val in run_results: 43 | if count > 0 and (count % 1000 == 0 or count == 9999): 44 | NDCG_attack.append(round(val,5)) 45 | # count = 0 46 | count += 1 47 | 48 | 49 | run_results = json.loads(line)['results']["NDCG_label"][attack_name]["mean"] 50 | 51 | count = 0 52 | 53 | for val in run_results: 54 | if count % 1000 == 0: 55 | NDCG_label.append(val) 56 | count = 0 57 | count += 1 58 | 59 | run_results = 
json.loads(line)['results']["LR"][attack_name]["mean"] 60 | 61 | count = 0 62 | 63 | for val in run_results: 64 | 65 | if count > 0 and (count % 1000 == 0 or count == 9999): 66 | lr.append(round(val, 6)) 67 | # count = 0 68 | 69 | count += 1 70 | 71 | 72 | 73 | print("NDCG attack: ", NDCG_attack) 74 | print("NDCG label: ", NDCG_label) 75 | print("LR: ", lr) -------------------------------------------------------------------------------- /attacker_weights/Weights_mq2007.txt: -------------------------------------------------------------------------------- 1 | 0.9166919478487141, 1.7784039695501537, -1.6769130412370732, -1.0041922049927996, -0.8730556165490367, -0.5939161488734054, -0.5250025451437097, 0.4753711525834708, -0.5744830378084759, -0.21007874901643958, 0.2787353823389643, -1.1002740024900035, -0.0930665557464917, 1.2507969372208942, -0.20785952917382847, 0.8229744423351555, -1.304110655709577, -1.0500020375389558, -0.6129241024409119, 0.8218856002635212, -0.7786836027940605, -0.8236289206938172, -0.5583321490637034, 0.3216787960938945, -0.7612135724352947, -0.5285954958295925, -1.03892181829786, -0.6746550701750351, 2.2008087591667973, -0.8902641264291119, -1.2921437618361649, -0.5826275584144579, -0.23792784910165976, 0.930849688497572, -0.48438708507058614, -1.0634970423248107, 0.6851423496958496, -0.2709910615850057, 1.0658650984145353, 0.5796336844374246, -0.008724844951078671 -------------------------------------------------------------------------------- /attacker_weights/Weights_td2003.txt: -------------------------------------------------------------------------------- 1 | -1.169315284362955, 0.004743552831948161, -0.2716495992544119, 1.7206214619901572, -1.0389289334618654, 1.828498381246963, 0.0957685844512115, 1.7768675284866982, -1.6085950004464, 1.9536578518010819, 0.5094803636777727, -1.8219560517709135, 0.16672947533285498, 0.774492228899978, -0.8000438873312938, 1.371752163283971, -1.5809555938819688, -1.8824776396802596, -1.847353085460993, 0.5933365199674299, -1.907365651885614, 0.5583549275711559, -1.6907993175212224, -0.5080409113792914, 0.18593325863932542, 1.237691768891898, 0.6506285466671793, -1.7486763705096213, 1.7676765442887667, -1.1493297913235412, 1.713217839000456, 0.07349645474949318, -1.6365281453037421, 1.566718953268857, -1.0427090424702952, 0.5214338726741836, -1.624595369683914, -0.14110808140185593, 0.3916497592951984, 0.714693244430086, 1.7629424730034238, -1.068917955159976, -1.0842509460396972, -0.26477504051836576, 0.5966841002862706, 0.19709130808314024, 1.8285894190582885, 1.4269690742660384, 0.5673460795171632, -1.9117583834847198, -1.5526961096228091, 0.5510562739930802, 0.1885894577134044, -1.2706942541153308, 1.2984730857636881, -1.6075491808497815, 1.7841811332702404, -0.28785427123698915, -1.0599049535064315 -------------------------------------------------------------------------------- /attacker_weights/Weights_web10k.txt: -------------------------------------------------------------------------------- 1 | -1.2918894660631546, -1.626611786778359, 0.5224666753764322, 0.5810908362813563, 1.5591812870538133, -0.3897725398372405, -1.8555490026700565, -1.1999196575008906, 0.5683077838197774, -1.55611596365868, -1.3731589625585863, 0.24971913117971045, 0.6229948454285941, -1.755683893046522, -1.970372354334914, 1.51808954518985, -1.2689018117232043, -0.1244493450428017, -0.12707349829113834, 0.601557266657065, 1.5189488517419276, -1.8228510762995822, 1.915942639946798, -1.186623174158215, -1.5692845785392566, 1.3117780143659368, 
-1.6419754006702578, -0.9388693258093666, -1.8619918426706539, -0.8970273743878558, 1.6048588774490882, -1.0347278661427284, -0.5084919797241403, -1.1020718152602575, -0.11254386823654627, -1.169315284362955, 0.004743552831948161, -0.2716495992544119, 1.7206214619901572, -1.0389289334618654, 1.828498381246963, 0.0957685844512115, 1.7768675284866982, -1.6085950004464, 1.9536578518010819, 0.5094803636777727, -1.8219560517709135, 0.16672947533285498, 0.774492228899978, -0.8000438873312938, 1.371752163283971, -1.5809555938819688, -1.8824776396802596, -1.847353085460993, 0.5933365199674299, -1.907365651885614, 0.5583549275711559, -1.6907993175212224, -0.5080409113792914, 0.18593325863932542, 1.237691768891898, 0.6506285466671793, -1.7486763705096213, 1.7676765442887667, -1.1493297913235412, 1.713217839000456, 0.07349645474949318, -1.6365281453037421, 1.566718953268857, -1.0427090424702952, 0.5214338726741836, -1.624595369683914, -0.14110808140185593, 0.3916497592951984, 0.714693244430086, 1.7629424730034238, -1.068917955159976, -1.0842509460396972, -0.26477504051836576, 0.5966841002862706, 0.19709130808314024, 1.8285894190582885, 1.4269690742660384, 0.5673460795171632, -1.9117583834847198, -1.5526961096228091, 0.5510562739930802, 0.1885894577134044, -1.2706942541153308, 1.2984730857636881, -1.6075491808497815, 1.7841811332702404, -0.28785427123698915, -1.0599049535064315, -0.46672715103695017, 0.9760471739089973, 0.5182538515265533, -1.4841453081143805, 0.5345699957426095, -0.07243785525036195, -1.0976245689168653, -0.1328041368477333, 0.7902408772271423, 1.6857623914133857, 1.7747436843321513, -1.2889580954141597, 0.13278396787662938, 1.3757499237546664, -1.4550758167451048, -0.209482063393017, -0.09582896694006227, 0.9972243100422764, -1.284074783621219, 1.6978357398119277, -0.42506051073460727, -1.1385528982028035, 0.050697046416877445, 0.03805650495055435, -0.7833642314164155, 1.2149669516161428, -1.6688119146299236, 1.2047288451589453, -0.8473391050511214, -0.4178580973738488, -0.7082477204136652, -0.5502155330456628, -1.9785959207009323, 1.1978313172157908, 0.931166672864471, -0.9981402273986739, -0.15518281873065654, -1.8281038641142415, 0.948831822932735, 1.3942131498150339, 0.3504681105627463, 1.2912271256664138 2 | -------------------------------------------------------------------------------- /attacker_weights/Weights_yahoo.txt: -------------------------------------------------------------------------------- 1 | 1.5828946583539185, 1.9896038399218923, 0.6903950307337174, 1.7265872898190806, -0.5673661301794262, 0.5012445462851272, -1.0176798372911664, -1.1147385167744202, 0.569615856135774, 1.0914865921290646, -0.4083644053404387, 1.431872593427824, -0.3165083687788717, -0.02071922775356816, 0.36531090086424367, -0.5812145888066582, 0.5401116470385774, -0.5567385469458195, 1.0999557430966638, -0.7522970546809051, -0.24258155684170157, 1.6095655365653823, -0.495432704415411, 0.9812915904282806, -1.3571112232995102, -0.5694170605811664, 1.4408160489284638, 1.198749623503545, 0.17678943445361694, 0.9899131247299726, -0.49123166389979644, -1.0102285305862027, -1.0222279771179021, -0.6988529508814483, 1.9830174247132546, 0.7680577259824659, 0.04428325680278178, -1.982466373800416, -0.6216758205168484, -0.34553091872124986, 1.0703811595523036, 0.19466497289529627, 1.6748671045085275, 1.1579013773581628, 0.4734479615276741, 1.2949038815266416, -0.2262939040791756, -0.12134363029522932, -0.584271306103938, -1.2340454385069024, -0.3709687242872981, -1.6252325396555025, -1.3273254874836256, 
1.660208339842653, 0.8445205653584136, 1.0522097102453256, 0.32437903678098134, -0.04369092410539199, -1.019900994734221, 1.5030438064529248, -0.6262174165776573, -1.337426258070995, 0.05790758632172466, 1.1616185843989655, 0.9865444967325976, -1.9689584729288288, -1.3673409944879427, -0.2323226160421754, 1.756855515256066, -1.6329132114432388, 1.6810686760270404, 1.523863321033577, 0.3341087085554739, 0.01251870387303855, -1.520355595494593, 0.05158593260779121, -1.8363071680646548, 0.3544610585861232, -1.4609222058688283, 1.6570873177915186, -0.35355514975952795, 1.2768487133345734, 1.1270250623047988, -1.7373555723626781, 0.6201712115975986, -0.14328842975151757, -0.6202868614496646, 0.2929374888827603, 0.1543347391885943, -0.22580114584327227, -0.030009554293235485, -0.844563194994647, 0.8532502430776314, -1.964740898252682, 1.862527897225942, -1.1511537395097737, -1.957623597155553, 1.5643035750385001, -0.2629609249062659, -0.11529454550180684, -0.9156302224973576, 0.9809225521465756, -0.9391761633550115, 1.6144204800999948, -0.6167659260981746, 0.8199348837327669, -0.5645903075409491, 0.6701136295645975, -0.7207852300149447, 1.5644006590494235, 0.047542032155412084, -0.7743020319593392, 1.3024253370452183, -0.32257759233297056, -0.2653684165347707, -1.705376617703465, -0.29225604791257664, 1.7866790810205861, -0.3440038915779855, -0.8120108370738617, 0.47277629174343483, -1.7158853314792855, 1.0144326295143986, -0.9481464554897157, -1.5421360767973207, -0.7555847659277592, 0.3100026511307621, -0.5720214447732257, -0.28507818281132824, 1.0700013469896938, -1.84060893460033, -0.6243906861291646, -1.5093895897250174, -0.7582252913522076, 0.29868114005878343, 1.1242641126228494, -1.9275054878645106, -0.7935491845925737, 1.9444810724415866, -0.7885068995178601, 1.720083276582765, -0.5456404868961227, -1.4843126265896198, -0.0852665349195787, 1.7531127640107504, -0.9034328849212088, 1.2532447437853156, -1.138192700448998, 0.43259996123315814, -1.1326716963407177, 0.5154155910510738, 1.3762496383770695, 1.484337533500864, 0.1254875771806554, 0.13785285017743965, -0.4252156532712972, -0.9679581777086623, 0.08877206905167334, 1.8827770605057887, -1.1511743442546924, -0.22407019757360613, -0.5554089679536274, 1.043720642338997, -0.14476396070838327, -1.1379292602522617, -1.053414049604418, 0.5712496707022741, -1.3556056697878929, 0.9277167224787939, 1.0748793432643637, 1.1879275944973076, -1.0656952347304238, -0.6125922042736751, 1.549902631539395, -0.3127637237797405, 0.1268740449597976, -0.09320954158997452, 1.4918756053696072, 0.9170225490768424, -0.7737274061749262, 1.6465497279655037, -0.12175246775849002, -1.0351966911724513, -1.2250554282728432, -0.38598686880333366, -1.4525893796956244, -1.6575478737162967, -0.05841645185379862, -1.1801285301041986, -1.9935458062570452, 1.2653960452034463, -0.4376209111222189, 1.778940252010754, 1.0568236723421642, 1.0609730176510097, -1.7694398942846306, -1.1027174922437335, 0.1907571459797639, -1.9973111804388308, 1.7302820934988996, -1.0233206460263848, -1.0693722825394727, 0.13830539428591315, 0.09749597008309019, 0.39693869553495764, 1.6753471909078117, 1.1100917252451405, -0.022886731222536927, -1.0174851157214686, 0.8093110149828324, 1.8308322691127534, -1.4631667086795144, 1.1403359556774624, 0.6165032572192892, -0.9915479627077937, -0.42306747423758706, 0.6089194862168927, -0.3339073532105532, -1.6751340785415314, -1.077268064707991, 0.5309622287607842, 1.2097810024061366, 1.7130677951176465, 0.850399524404351, -0.7889327679983027, 
1.9970289539492851, 0.5841399568912546, 1.000510514502039, -1.0423683864980173, 1.1117832309464784, 1.565090338794616, -1.3949737855981441, 0.13321744083431053, -0.004087178226273913, 1.7828274198391822, 1.1089143098353924, -1.7146183016112762, -0.8195862313368325, -1.0907562204765346, -0.8587895168377733, -0.5348287917303716, 1.843415421353665, -0.6387579384164002, -1.5702980028357065, -1.6540384540895472, -0.2701083463295717, -1.9792069923464575, 0.19330891156846963, -1.4425438564203046, 1.8851598589536205, -1.7304859492865572, -0.6130699752752817, 0.9396696569171352, -1.2767007580634488, 0.08092415952288823, -0.45045914712704826, 1.780481765212603, 1.4601197699006812, -1.1756804189810226, 1.8063987075511632, -1.2993785047414512, 0.4480558288739722, -1.3447492273930242, -0.06606767207107023, -0.055299145645124614, -0.15076790471094048, 0.027293702793865116, -0.49068074913516435, 0.2316048505777597, 1.1750924370825206, 0.28093802282622926, 0.8434098649337485, -1.212562256405012, 1.1405883212626065, 1.2108625977387457, 0.0006311322932543995, 1.1514055419858722, -0.1497238888482859, 0.5253301685473266, -1.6357101262352347, 0.739187600165951, 1.7452858736736339, -0.05694967811036733, -0.5902905536639507, -0.2910081412783443, -1.0419783635008573, 1.2830579096399135, 1.5866641542660078, 1.8960528869400322, 0.2991709923548451, 0.09455398910363533, -0.8486995131400437, 0.44716667945208943, -1.7466714452963235, -0.2209038351210899, 1.4340767872740328, -0.43853518289321247, -0.012143574358134845, 0.5796990147749583, -1.402581667628482, -0.5940185802575431, 1.6863913174878786, -1.7905846762400524, 0.09927742380893134, -1.3715820851432836, -0.5904978271760086, -1.6074204921905175, -0.17658669653065884, 1.34814344311977, 0.7732540099715735, -1.6399096224471275, 0.16963043419254298, 1.6372794610575534, 1.3232638748512846, 1.1471480707726829, -1.356814732761611, 0.43095655802624666, 1.5231973496505158, -1.7947071329669297, 1.367294567007899, 0.2962058290694629, 1.3320198707142121, 1.1032911696184242, -0.2622610918539259, -1.9689055374741984, -1.6793287819557485, 0.30505943254416756, -1.7011786160615454, 0.2889292230291489, 1.5668246665300392, -0.8325975131322494, -1.1667747685209342, 1.274008090744128, -1.4252408079846908, 1.3879558521051183, 0.27119146564549057, 0.22162343539545537, -1.387269133622461, 0.22686969678477498, 1.9775638570949696, -1.6194905749173487, -1.8122336967288106, -0.7722299727978967, 0.20350341515147985, 0.788599888591007, 1.3347378655454691, 1.8928793073567882, -1.4423607462433483, 1.6918896567871062, -0.16227429215387623, -1.0581819862393296, -0.15298462789657208, 1.0488200210605565, -0.8978623154396255, -0.7617461537229508, 1.0060814326299412, 1.5052314729678717, -0.678559819292301, -1.8202384096007855, -0.4335477347058889, 0.08204711116213037, -1.559419667887901, -0.7484819288014726, 0.7718381868713982, -0.5222522771317846, 0.03343352262806576, -1.2776222072409071, 1.084013006181038, 0.1183781511350408, 1.7327294314353425, -0.7412182048163358, -0.7679486932240263, -0.9517602673642425, -0.19215971701925794, -0.4728740310794697, -0.08101536489291039, -0.575822496828382, 0.8823675848291259, -1.804071635066987, 0.5132034930666505, -1.9260823692373013, -1.3553401671180532, -0.6473435358470363, 0.13121780817202788, 1.2173900891904528, 0.27812229455913773, -0.7040843921465427, 1.6521477107346652, -0.8177612727499106, 0.28173168446905583, 0.8915261228209559, 1.7933499628244824, 0.45601708960831466, 0.08509227626799154, 1.96655402058714, -0.03530891470880837, -1.5488198559209838, 
-0.8910437252187413, -0.7541194312168864, -1.016059417754188, 0.20690585226077962, 0.9893378756376543, 0.5307779389239049, 0.19382010234386637, 0.3333854057715575, 0.5847986975318973, -1.5072335033397448, 0.733552992086596, 1.665418492839505, 1.6313908135679815, 1.143207655528002, 1.705854449675209, -0.4855115744309195, 0.5474581796636397, -1.5293471520030977, 0.1908761110982069, 1.8704594909030932, -0.15808306818152928, 0.3553044421278635, -0.5551170510452992, 0.8105398676140503, -0.7888491919253307, -0.1690798742491859, 0.9138195410920544, 1.3013893175562519, 1.030464241395121, 1.18103903202661, 1.2258704862314591, 0.4393827647241464, 1.1491861945934656, -1.5030973573598643, -1.0287032689637567, 0.7890139601181851, 1.9149139173245886, 1.4305729705855526, -0.4172611996692641, -0.06243960752638067, 1.7274376716103426, 1.6429528088252523, -1.2066703632069689, -0.08547222510290098, 1.2990762307237143, 1.5185273782177853, 0.280518576815449, 0.7082207313722688, 1.7515408320336099, 0.4250894807887975, -1.4173003756200724, 0.0222778292419199, 1.8107244661253468, -1.6521511334085193, 1.672877410613868, -1.619638449115278, 1.2376727518757096, 1.2473813520150152, 0.03300340009262026, 1.7871944806030067, 1.989457176381341, 1.824772973024825, -0.5988810774781861, 0.014994389894054105, -1.8915113336170588, -1.9265968320997544, 0.22839740678069287, -0.4661631128284234, 0.5989752634688199, 1.17377615289231, -1.463511630252626, -0.8078723951568931, 1.2311478643831428, -0.48143088151913593 2 | -------------------------------------------------------------------------------- /graphs/makeaverages.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import json 5 | import os 6 | import sys 7 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 8 | from utils.averageoutput import IndependentOutputAverager 9 | 10 | def create_folders(filename): 11 | if not os.path.exists(os.path.dirname(filename)): 12 | os.makedirs(os.path.dirname(filename)) 13 | 14 | 15 | description = 'Script for averaging over full run output files.' 
16 | parser = argparse.ArgumentParser(description=description) 17 | 18 | parser.add_argument('--average_folder', dest='average_folder', type=str, 19 | required=True, default=None, 20 | help='Folder to output pdfs into.') 21 | 22 | parser.add_argument('--fullrun_prefix', dest='fullrun_prefix', type=str, 23 | required=True, default=None, 24 | help='Prefix for folders of full runs of the same dataset.') 25 | 26 | parser.add_argument('output_files', type=str, nargs='+', 27 | help='Output files to be parsed.') 28 | 29 | args = parser.parse_args() 30 | 31 | 32 | def create_folders(filename): 33 | if not os.path.exists(os.path.dirname(filename)): 34 | os.makedirs(os.path.dirname(filename)) 35 | 36 | def process_run_name(name): 37 | name = name.replace('_', '\\_') 38 | return name 39 | 40 | 41 | average_folder = args.average_folder 42 | averager = IndependentOutputAverager(average_folder) 43 | 44 | path_pairs = [] 45 | for output_file in args.output_files: 46 | prefix = args.fullrun_prefix 47 | assert prefix in output_file 48 | average_file_name = output_file[output_file.find(prefix) + len(prefix):] 49 | while average_file_name[0] == '/': 50 | average_file_name = average_file_name[1:] 51 | average_dest = '%s/%s' % (average_folder, average_file_name) 52 | path_pairs.append((output_file, average_dest)) 53 | 54 | failed_paths = [] 55 | success_paths = [] 56 | for source, dest in path_pairs: 57 | success = True 58 | try: 59 | average_results = averager.average_results(source) 60 | except KeyboardInterrupt: 61 | raise 62 | except: 63 | success = False 64 | print 'Failed: ', source 65 | failed_paths.append(source) 66 | 67 | if success: 68 | print 'Success:', source, ' -> ', dest 69 | 70 | create_folders(dest) 71 | with open(dest, 'w') as w: 72 | w.write(json.dumps(average_results)) 73 | 74 | success_paths.append(source) 75 | 76 | print 77 | print 'Done processing.' 78 | print 79 | print 'Successfully averaged the following files:' 80 | print 81 | print ' '.join(success_paths) 82 | print 83 | print 'Failed averaging the following files:' 84 | print 85 | print ' '.join(failed_paths) 86 | print 87 | -------------------------------------------------------------------------------- /graphs/makegraphs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pylab as plt 4 | import numpy as np 5 | import random 6 | import argparse 7 | import os 8 | import json 9 | import datetime 10 | 11 | def create_folders(filename): 12 | if not os.path.exists(os.path.dirname(filename)): 13 | os.makedirs(os.path.dirname(filename)) 14 | 15 | 16 | description = 'Script for displaying graphs from output files.' 
17 | parser = argparse.ArgumentParser(description=description) 18 | 19 | parser.add_argument('--pdf_folder', dest='pdf_folder', type=str, required=False, default=None, 20 | help='Folder to output pdfs into.') 21 | 22 | parser.add_argument('--folder_prefix', dest='folder_prefix', type=str, required=False, 23 | default=None, help='Prefix for folders of the same dataset.') 24 | 25 | parser.add_argument('plot_name', type=str, help='Name to save plots under.') 26 | 27 | parser.add_argument('output_files', type=str, help='Output files to be parsed.', nargs='+') 28 | 29 | args = parser.parse_args() 30 | 31 | 32 | def create_folders(filename): 33 | if not os.path.exists(os.path.dirname(filename)): 34 | os.makedirs(os.path.dirname(filename)) 35 | 36 | def process_run_name(name): 37 | name = name.replace('_', '\\_') 38 | name = name.replace('DeepP-DBGD', 'DBGD (neural)') 39 | name = name.replace('P-DBGD', 'DBGD') 40 | name = name.replace('P-MGD', 'MGD') 41 | name = name.replace('DeepPDGD', 'PDGD (neural)') 42 | return name 43 | 44 | 45 | pdf_folder = args.pdf_folder 46 | prefix_plot_name = args.plot_name 47 | 48 | folder_structure = {} 49 | if args.folder_prefix: 50 | for output_file in args.output_files: 51 | prefix = args.folder_prefix 52 | assert prefix in output_file 53 | average_file_name = output_file[output_file.find(prefix) + len(prefix):] 54 | while average_file_name[0] == '/': 55 | average_file_name = average_file_name[1:] 56 | data_folder = average_file_name[:average_file_name.find('/')] 57 | if data_folder not in folder_structure: 58 | folder_structure[data_folder] = [] 59 | folder_structure[data_folder].append(output_file) 60 | else: 61 | folder_structure[None] = args.output_files 62 | 63 | to_plot = [ 64 | ('offline', 'heldout'), 65 | ] 66 | 67 | for data_folder in sorted(folder_structure.keys()): 68 | output_files = folder_structure[data_folder] 69 | data = {} 70 | file_names = [] 71 | click_models = [] 72 | value_names = [] 73 | if data_folder is None: 74 | print 'No data folders found, outputting directly.' 
75 | else: 76 | print 'Found data folder: %s' % data_folder 77 | for output_file in output_files: 78 | print 'reading', output_file 79 | file_name = output_file.split('/')[-1] 80 | if file_name[-4:] == '.out': 81 | file_name = file_name[:-4] 82 | assert file_name not in data 83 | data[file_name] = {} 84 | file_names.append(file_name) 85 | with open(output_file) as f: 86 | output = json.load(f) 87 | for name, value in output['runtimes'].items(): 88 | print name, 89 | print datetime.timedelta(seconds=value), 90 | print '(%d seconds)' % value 91 | data[file_name] = output['results'] 92 | for v_name in output['results']: 93 | if v_name not in value_names: 94 | value_names.append(v_name) 95 | for c_m in output['results'][v_name]: 96 | if c_m == 'indices': 97 | continue 98 | if c_m not in click_models: 99 | click_models.append(c_m) 100 | 101 | print 102 | 103 | print 'finished reading, found the following value types:' 104 | for name in value_names: 105 | print name 106 | print 107 | print 'start plotting' 108 | 109 | # params = { 110 | # 'text.latex.preamble': r"\usepackage{lmodern}", 111 | # 'text.usetex': True, 112 | # 'font.size': 26, 113 | # 'font.family': 'lmodern', 114 | # 'text.latex.unicode': True, 115 | # } 116 | # plt.rcParams.update(params) 117 | 118 | colours = [ 119 | 'black', 120 | 'r', 121 | 'b', 122 | 'g', 123 | 'y', 124 | 'c', 125 | 'orange', 126 | 'purple', 127 | 'pink', 128 | 'gray', 129 | ] * 30 130 | 131 | for plot_name, v_name in to_plot: 132 | for click_model in click_models: 133 | fig = plt.figure(figsize=(10.5, 6), linewidth=0.1) 134 | # fig = plt.figure(figsize=(10.5, 4), linewidth=0.1) 135 | plt.ioff() 136 | plt.ylabel('NDCG') 137 | plt.xlabel('impressions') 138 | plt.gca().yaxis.set_ticks_position('both') 139 | 140 | labels = [] 141 | max_ind = np.NINF 142 | for i, file_name in enumerate(file_names): 143 | file_dict = data[file_name] 144 | colour = colours[i] 145 | 146 | if v_name not in file_dict: 147 | if v_name == 'heldout' and 'held-out' in file_dict: 148 | v_name = 'held-out' 149 | elif v_name == 'held-out' and 'heldout' in file_dict: 150 | v_name = 'heldout' 151 | else: 152 | print 'not found', v_name, file_dict.keys() 153 | continue 154 | v_dict = file_dict[v_name] 155 | ind = np.array(v_dict['indices']) 156 | if click_model not in v_dict: 157 | print 'not found', click_model, v_dict.keys() 158 | continue 159 | c_dict = v_dict[click_model] 160 | 161 | max_ind = max(max_ind, np.max(ind)) 162 | mean = np.array(c_dict['mean']) 163 | std = np.array(c_dict['std']) 164 | 165 | plt.fill_between(ind, mean-std, mean+std, color=colour, alpha=0.2) 166 | plt.plot(ind, mean, color=colour) 167 | labels.append(process_run_name(file_name)) 168 | 169 | if len(labels) > 0: 170 | # if v_ind == "TEST INDICES": 171 | # plt.ylim(.6,.8) 172 | plt.ylim(.2,0.5) 173 | plt.xlim(-5, 30000) 174 | plt.xlim(-500, 1000000) 175 | # plt.xlim(-5, max_ind) 176 | # plt.xlim(-5, 100000) 177 | plt.annotate(click_model, xy=(0.02, 0.90), xycoords='axes fraction') 178 | if click_model == 'perfect': 179 | plt.legend(labels, loc=4, fontsize=16, frameon=False, ncol=1) 180 | # plt.legend(labels, loc=0, fontsize=26, frameon=False, ncol=1) 181 | 182 | if not pdf_folder: 183 | plt.show() 184 | else: 185 | plot_file_name = '%s_%s_%s.pdf' % (prefix_plot_name, plot_name, click_model) 186 | if not data_folder is None: 187 | plot_file_name = os.path.join(data_folder, plot_file_name) 188 | create_folders(os.path.join(pdf_folder, plot_file_name)) 189 | plt.savefig(os.path.join(pdf_folder, plot_file_name), 
bbox_inches='tight') 190 | print 'saved', plot_file_name 191 | plt.close(fig) 192 | print 193 | -------------------------------------------------------------------------------- /graphs/maketables.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pylab as plt 4 | import numpy as np 5 | import random 6 | import argparse 7 | import os 8 | import json 9 | import datetime 10 | 11 | description = 'Script for displaying graphs from output files.' 12 | parser = argparse.ArgumentParser(description=description) 13 | 14 | parser.add_argument('--table_folder', dest='table_folder', type=str, required=False, default=None, 15 | help='Folder to output pdfs into.') 16 | 17 | parser.add_argument('--baselines', dest='baselines', type=str, required=False, default=None, 18 | help='Folder to output pdfs into.', nargs='+') 19 | 20 | parser.add_argument('--folder_prefix', dest='folder_prefix', type=str, required=False, 21 | default=None, help='Prefix for folders of the same dataset.') 22 | 23 | parser.add_argument('plot_name', type=str, help='Name to save plots under.') 24 | 25 | parser.add_argument('output_files', type=str, help='Output files to be parsed.', nargs='+') 26 | 27 | args = parser.parse_args() 28 | 29 | def create_folders(filename): 30 | if not os.path.exists(os.path.dirname(filename)): 31 | os.makedirs(os.path.dirname(filename)) 32 | 33 | def get_significance(mean_1, mean_2, std_1, std_2, n): 34 | significance = '' 35 | ste_1 = std_1 / np.sqrt(n) 36 | ste_2 = std_2 / np.sqrt(n) 37 | t = (mean_1 - mean_2) / np.sqrt(ste_1 ** 2 + ste_2 ** 2) 38 | # treatment is worse than baseline 39 | # values used are for 120 degrees of freedom 40 | # (http://changingminds.org/explanations/research/analysis/ 41 | # t-test_table.htm) 42 | significance = '\\hphantom{\\tiny \\dubbelneer}' 43 | if mean_1 > mean_2: 44 | if abs(t) >= 2.62: 45 | significance = '{\\tiny \\dubbelneer}' 46 | elif abs(t) >= 1.98: 47 | significance = '{\\tiny \\enkelneer}' 48 | else: 49 | if abs(t) >= 2.62: 50 | significance = '{\\tiny \\dubbelop}' 51 | elif abs(t) >= 1.98: 52 | significance = '{\\tiny \\enkelop}' 53 | return significance 54 | 55 | class OutputTable(object): 56 | 57 | def __init__(self, table_name, table_folder): 58 | self._closed = False 59 | self.output_path = '%s/%s.tex' % (table_folder, table_name) 60 | print 'creating file at %s' % self.output_path 61 | create_folders(self.output_path) 62 | self._output_file = open(self.output_path, 'w') 63 | self.writeline('% !TEX root = ../main.tex') 64 | 65 | def writeline(self, *line): 66 | full_line = ' '.join(line) 67 | self._output_file.write(full_line + '\n') 68 | print full_line 69 | 70 | def write(self, *line): 71 | full_line = ' '.join(line) 72 | self._output_file.write(full_line + ' ') 73 | print full_line, 74 | 75 | def close(self): 76 | self._closed = True 77 | self._output_file.close() 78 | print 'Finished writing to and closed:', self.output_path 79 | 80 | def process_run_name(name): 81 | name = name.replace('_', '\\_') 82 | name = name.replace('DeepP-DBGD', 'DBGD (neural)') 83 | name = name.replace('P-DBGD', 'DBGD (linear)') 84 | name = name.replace('P-MGD', 'MGD (linear)') 85 | name = name.replace('PDGD', 'PDGD (linear)') 86 | name = name.replace('DeepPDGD (linear)', 'PDGD (neural)') 87 | name = name.replace('Pairwise', 'Pairwise (linear)') 88 | return name 89 | 90 | def process_folder_name(name): 91 | name = name.replace('_', '\\_') 92 | name = name.replace('Webscope\\_C14\\_Set1', 'Yahoo') 93 | 
94 | return name 95 | 96 | prefix_plot_name = args.plot_name 97 | folder_structure = {} 98 | if args.folder_prefix: 99 | for output_file in args.output_files + args.baselines: 100 | prefix = args.folder_prefix 101 | assert prefix in output_file 102 | average_file_name = output_file[output_file.find(prefix) + len(prefix):] 103 | while average_file_name[0] == '/': 104 | average_file_name = average_file_name[1:] 105 | data_folder = average_file_name[:average_file_name.find('/')] 106 | data_folder = process_folder_name(data_folder) 107 | if data_folder not in folder_structure: 108 | folder_structure[data_folder] = [] 109 | folder_structure[data_folder].append(output_file) 110 | else: 111 | folder_structure[None] = args.output_files 112 | 113 | to_table = [ 114 | # ('offline', 'heldout', 10000), 115 | ('online', 'cumulative-display', 10000), 116 | ] 117 | 118 | baselines = [] 119 | methods = [] 120 | 121 | all_data = {} 122 | for data_folder in sorted(folder_structure.keys()): 123 | output_files = folder_structure[data_folder] 124 | data = {} 125 | all_data[data_folder] = data 126 | file_names = [] 127 | click_models = [] 128 | value_names = [] 129 | if data_folder is None: 130 | print 'No data folders found, outputting directly.' 131 | else: 132 | print 'Found data folder: %s' % data_folder 133 | for output_file in output_files: 134 | print 'reading', output_file 135 | file_name = output_file.split('/')[-1] 136 | if file_name[-4:] == '.out': 137 | file_name = file_name[:-4] 138 | file_name = process_run_name(file_name) 139 | if output_file in args.baselines and file_name not in baselines: 140 | baselines.append(file_name) 141 | elif output_file not in args.baselines and file_name not in methods: 142 | methods.append(file_name) 143 | assert file_name not in data, '%s already in %s' % (file_name, data.keys()) 144 | data[file_name] = {} 145 | file_names.append(file_name) 146 | with open(output_file) as f: 147 | output = json.load(f) 148 | for name, value in output['runtimes'].items(): 149 | print name, 150 | print datetime.timedelta(seconds=value), 151 | print '(%d seconds)' % value 152 | data[file_name] = output['results'] 153 | for v_name in output['results']: 154 | if v_name not in value_names: 155 | value_names.append(v_name) 156 | for c_m in output['results'][v_name]: 157 | if c_m == 'indices': 158 | continue 159 | if c_m not in click_models: 160 | click_models.append(c_m) 161 | 162 | print 163 | 164 | print 'finished reading, found the following value types:' 165 | for name in value_names: 166 | print name 167 | print 168 | 169 | click_models = ['perfect', 'navigational', 'informational'] 170 | 171 | folder_order = sorted(folder_structure.keys()) 172 | for table_name, table_value, table_ind in to_table: 173 | table_data = {} 174 | for folder_name in folder_order: 175 | all_f_data = all_data[folder_name] 176 | f_data = {} 177 | table_data[folder_name] = f_data 178 | 179 | for c_m in click_models: 180 | c_data = {} 181 | max_v = np.NINF 182 | f_data[c_m] = c_data 183 | for b_name in baselines: 184 | b_data = all_data[folder_name][b_name][table_value] 185 | b_ind = np.array(b_data['indices']) 186 | if np.any(b_ind == table_ind): 187 | v_i = np.where(b_ind == table_ind)[0][0] 188 | else: 189 | diff = b_ind - table_ind 190 | v_i = np.argmax(diff[diff<=0]) 191 | v_mean = b_data[c_m]['mean'][v_i] 192 | v_std = b_data[c_m]['std'][v_i] 193 | 194 | max_v = max(max_v, v_mean) 195 | c_data[b_name] = (v_mean, v_std, None) 196 | 197 | for m_name in methods: 198 | m_data = 
all_data[folder_name][m_name][table_value] 199 | m_ind = np.array(m_data['indices']) 200 | if np.any(m_ind == table_ind): 201 | v_i = np.where(m_ind == table_ind)[0][0] 202 | else: 203 | diff = b=m_ind - table_ind 204 | v_i = np.argmax(diff[diff<=0]) 205 | v_mean = m_data[c_m]['mean'][v_i] 206 | v_std = m_data[c_m]['std'][v_i] 207 | 208 | sig = [] 209 | for b_name in baselines: 210 | b_mean, b_std, _ = c_data[b_name] 211 | sig.append(get_significance(b_mean, v_mean, b_std, v_std, 125)) 212 | 213 | max_v = max(max_v, v_mean) 214 | c_data[m_name] = (v_mean, v_std, sig) 215 | 216 | c_data['maximum'] = max_v 217 | 218 | out = OutputTable(table_name, args.table_folder) 219 | out.writeline('\\begin{tabular*}{\\textwidth}{@{\\extracolsep{\\fill} } l ', 'l ' 220 | * len(folder_order), '}') 221 | out.writeline('\\toprule') 222 | 223 | for data_folder in folder_order: 224 | out.write(' & { \\small \\textbf{%s}}' % data_folder) 225 | out.writeline('\\\\') 226 | 227 | for click_model in click_models: 228 | out.writeline('\\midrule') 229 | out.writeline('& \\multicolumn{%d}{|c|}{\\textit{%s}} \\\\' % (len(folder_order), click_model)) 230 | out.writeline('\\midrule') 231 | 232 | for name in baselines + methods: 233 | out.write(name) 234 | 235 | for folder in folder_order: 236 | v_max = round(table_data[folder][click_model]['maximum'], 1) 237 | v_mean, v_std, v_sig = table_data[folder][click_model][name] 238 | out.write('&') 239 | 240 | if round(v_mean, 1) >= v_max: 241 | out.write('\\bf') 242 | 243 | out.write('%0.01f {\\tiny (%0.01f)}' % (v_mean, v_std)) 244 | if not (v_sig is None): 245 | out.write(' '.join(v_sig)) 246 | 247 | out.writeline('\\\\') 248 | 249 | 250 | 251 | 252 | out.writeline('\\bottomrule') 253 | out.writeline('\\end{tabular*}') 254 | out.close() 255 | 256 | print 257 | print 258 | print 259 | 260 | 261 | 262 | 263 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/models/__init__.py -------------------------------------------------------------------------------- /models/evolutionneuralmodel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class EvolutionNeuralModel(object): 4 | 5 | def __init__(self, learning_rate, 6 | learning_rate_decay, 7 | hidden_layers, n_features, 8 | n_candidates): 9 | def normal(init, shape): 10 | safe_shape = (self.n_models,) + shape 11 | return np.random.normal(0., init, safe_shape) 12 | 13 | self.n_models = n_candidates + 1 14 | self.learning_rate = learning_rate 15 | self.hidden_layer_nodes = hidden_layers 16 | self.hidden_layers = [] 17 | self.biases = [] 18 | self.n_nodes = 0 19 | prev_units = n_features 20 | for n_units in hidden_layers: 21 | init = 1./prev_units 22 | self.hidden_layers.append(normal(init, (prev_units, n_units))) 23 | self.biases.append(normal(init, (1, n_units,))) 24 | self.n_nodes += (prev_units+1)*n_units 25 | prev_units = n_units 26 | self.hidden_layers.append(normal(1./prev_units, (prev_units, 1))) 27 | self.n_nodes += prev_units 28 | self.learning_rate_decay = learning_rate_decay 29 | 30 | def sample_candidates(self): 31 | assert self.n_models > 1 32 | n_cand = self.n_models-1 33 | vectors = np.random.randn(self.n_models-1, self.n_nodes) 34 | vector_norms = np.sum(vectors ** 2, axis=1) ** (1. 
/ 2) 35 | vectors /= vector_norms[:, None] 36 | vec_i = 0 37 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 38 | h_shape = hidden_layer.shape[1:3] 39 | n_matrix = np.prod(h_shape) 40 | n_bias = h_shape[1] 41 | matrix_noise = np.reshape(vectors[:, vec_i:vec_i+n_matrix], 42 | (n_cand, h_shape[0], h_shape[1])) 43 | vec_i += n_matrix 44 | bias_noise = np.reshape(vectors[:, vec_i:vec_i+n_bias], 45 | (n_cand, n_bias)) 46 | vec_i += n_bias 47 | 48 | hidden_layer[1:,:,:] = hidden_layer[0, None,:,:] + matrix_noise 49 | bias[1:, :] = bias[0, None, :] + bias_noise 50 | 51 | matrix_noise = vectors[:,vec_i:,None] 52 | self.hidden_layers[-1][1:,:,:] = self.hidden_layers[-1][0,None,:,:] + matrix_noise 53 | 54 | def score(self, features): 55 | return self._score(features, 0) 56 | 57 | def _score(self, features, model_i): 58 | prev_layer = features 59 | self.input = features 60 | self.activations = [prev_layer] 61 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 62 | prev_layer = np.dot(prev_layer, hidden_layer[model_i, :]) 63 | prev_layer += bias[model_i,:] 64 | prev_layer = 1./(1. + np.exp(-prev_layer)) 65 | self.activations.append(prev_layer) 66 | result = np.dot(prev_layer, self.hidden_layers[-1][model_i,: ]) 67 | self.activations.append(result) 68 | return result[:, 0] 69 | 70 | def candidate_score(self, features): 71 | scores = [] 72 | for i in range(self.n_models): 73 | scores.append(self._score(features, i)) 74 | return np.stack(scores, axis=0) 75 | 76 | def update_to_mean_winners(self, winners): 77 | assert self.n_models > 1 78 | if len(winners) > 0: 79 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 80 | average_layer = np.mean(hidden_layer[winners,:,:], axis=0) 81 | average_bias = np.mean(bias[winners,:], axis=0) 82 | 83 | layer_gradient = (average_layer - hidden_layer[0,:,:]) 84 | bias_gradient = (average_bias - bias[0,:]) 85 | 86 | hidden_layer[0,:,:] += self.learning_rate*layer_gradient 87 | bias[0,:] += self.learning_rate*bias_gradient 88 | 89 | average_layer = np.mean(self.hidden_layers[-1][winners,:,:], axis=0) 90 | layer_gradient = (average_layer - self.hidden_layers[-1][0,:,:]) 91 | self.hidden_layers[-1][0,:,:] += self.learning_rate*layer_gradient 92 | 93 | self.learning_rate *= self.learning_rate_decay 94 | -------------------------------------------------------------------------------- /models/linearmodel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sympy import Matrix 3 | from scipy.linalg import norm 4 | def sample_with_basis(M): 5 | weight = np.random.normal(0, 1, len(M)) 6 | v = weight.dot(M) 7 | # print(v) 8 | v /= norm(v) 9 | return v 10 | 11 | class LinearModel(object): 12 | def __init__(self, n_features, learning_rate, 13 | n_candidates=0, learning_rate_decay=1.0): 14 | self.n_features = n_features 15 | self.learning_rate = learning_rate 16 | self.n_models = n_candidates + 1 17 | self.weights = np.zeros((n_features, self.n_models)) 18 | self.learning_rate_decay = learning_rate_decay 19 | 20 | def copy(self): 21 | copy = LinearModel(n_features = self.n_features, 22 | learning_rate = self.learning_rate, 23 | n_candidates = self.n_models-1) 24 | copy.weights = self.weights.copy() 25 | return copy 26 | 27 | def candidate_score(self, features): 28 | self._last_features = features 29 | return np.dot(features, self.weights).T 30 | 31 | def score(self, features): 32 | self._last_features = features 33 | return np.dot(features, 
self.weights[:,0:1])[:,0] 34 | 35 | def sample_candidates(self): 36 | assert self.n_models > 1 37 | vectors = np.random.randn(self.n_features, self.n_models-1) 38 | vector_norms = np.sum(vectors ** 2, axis=0) ** (1. / 2) 39 | vectors /= vector_norms[None, :] 40 | self.weights[:, 1:] = self.weights[:, 0, None] + vectors 41 | 42 | def update_to_mean_winners(self, winners, viewed_list=None): 43 | assert self.n_models > 1 44 | if len(winners) > 0: 45 | # print 'winners:', winners 46 | gradient = np.mean(self.weights[:, winners], axis=1) - self.weights[:, 0] 47 | 48 | # Added for projection 49 | if viewed_list is not None and len(viewed_list)>0: 50 | gradient = self.project_to_viewed_doc(gradient,viewed_list) 51 | 52 | self.weights[:, 0] += self.learning_rate * gradient 53 | self.learning_rate *= self.learning_rate_decay 54 | 55 | def update_to_documents(self, doc_ind, doc_weights): 56 | weighted_docs = self._last_features[doc_ind, :] * doc_weights[:, None] 57 | gradient = np.sum(weighted_docs, axis=0) 58 | self.weights[:, 0] += self.learning_rate * gradient 59 | self.learning_rate *= self.learning_rate_decay 60 | 61 | 62 | def project_to_viewed_doc(self, winning_gradient, viewed_list): 63 | # Make projections to each of viewed document as basis vector 64 | gradient_proj = np.zeros(self.n_features) 65 | 66 | # viewed_list has each row as the basis, so it is the transpose of columnspace M 67 | basis_trans = np.matrix.transpose(np.asarray(viewed_list)) 68 | 69 | # SVD decomposition, column of both u_ and vh_ is orthogonal basis of columnspace of input 70 | # Use u matrix for basis, as u_ is 'document-to-concept' simialrity 71 | # vh_ is 'feature-to-concept' similarity 72 | u_,s_,vh_ = np.linalg.svd(np.asarray(basis_trans), full_matrices=False) 73 | # transpose to row space 74 | basis_list = np.matrix.transpose(np.asarray(u_)) 75 | 76 | 77 | # proj_g onto x = dot(x,g)/|x|^2 x 78 | for basis in basis_list: 79 | len_basis = np.sqrt(basis.dot(basis)) 80 | # len_basis = np.sqrt(sum(k*k for k in basis)) # could take out np.sqrt and square in next line 81 | gradient_proj += np.dot(basis, winning_gradient) / (len_basis * len_basis) * basis 82 | 83 | # Normalize 84 | norm = np.linalg.norm(gradient_proj) 85 | if norm > 0: 86 | gradient_proj = gradient_proj / norm 87 | 88 | return gradient_proj 89 | 90 | 91 | # sample candidate from null space for NSGD 92 | def sample_candidates_null_space(self, grads, features, withBasis=False): 93 | assert self.n_models > 1 94 | # vectors = np.random.randn(self.n_features, self.n_models-1) 95 | # vector_norms = np.sum(vectors ** 2, axis=0) ** (1. 
/ 2) 96 | # vectors /= vector_norms[None, :] 97 | # self.weights[:, 1:] = self.weights[:, 0, None] + vectors 98 | 99 | N = Matrix(grads).nullspace() # get null space of gradient matrix 100 | newN = np.array(N).astype(np.float64) 101 | for i in range(0, len(newN)): 102 | norm = np.linalg.norm(newN[i]) 103 | if norm > 0: 104 | newN[i] = newN[i]/norm 105 | 106 | # sample vectors normally from the nullspace 107 | if withBasis: 108 | # sample with basis 109 | nsVecs = [sample_with_basis(newN) for i in range(2*self.n_models)] 110 | else: 111 | # Directly sample from null space 112 | nsVecs = [newN[randint(0, len(N) - 1)] for i in range(2*self.n_models)] 113 | 114 | # get average candidate document feature vector 115 | avgdocfeat = [sum(feat)/len(feat) for feat in zip(*features)] 116 | # sort vectors by dot product (decreasing absolute value) 117 | nsVecs = sorted(nsVecs, key=lambda vec: abs(np.dot(vec, avgdocfeat)), reverse=True) 118 | 119 | self.gs = np.array(nsVecs[:self.n_models-1]) 120 | self.weights[:, 1:] = self.weights[:, 0, None] + self.gs.T -------------------------------------------------------------------------------- /models/neuralmodel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class NeuralModel(object): 4 | 5 | def __init__(self, learning_rate, 6 | learning_rate_decay, 7 | hidden_layers, n_features): 8 | def normal(init, shape): 9 | return np.random.normal(0., init, shape) 10 | 11 | self.learning_rate = learning_rate 12 | self.hidden_layer_nodes = hidden_layers 13 | self.hidden_layers = [] 14 | self.biases = [] 15 | prev_units = n_features 16 | for n_units in hidden_layers: 17 | init = 1./prev_units 18 | self.hidden_layers.append(normal(init, (prev_units, n_units))) 19 | self.biases.append(normal(init, n_units)[None, :]) 20 | prev_units = n_units 21 | self.hidden_layers.append(normal(1./prev_units, (prev_units, 1))) 22 | self.learning_rate_decay = learning_rate_decay 23 | 24 | def score(self, features): 25 | prev_layer = features 26 | self.input = features 27 | self.activations = [prev_layer] 28 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 29 | prev_layer = np.dot(prev_layer, hidden_layer) 30 | prev_layer += bias 31 | prev_layer = 1./(1. 
+ np.exp(-prev_layer)) 32 | self.activations.append(prev_layer) 33 | result = np.dot(prev_layer, self.hidden_layers[-1]) 34 | self.activations.append(result) 35 | return result[:, 0] 36 | 37 | def backpropagate(self, doc_ind, doc_weights): 38 | activations = [a[doc_ind, :] for a in self.activations] 39 | doc_weights = np.expand_dims(doc_weights, axis=1) 40 | cur_der = (np.dot(activations[-2].T, doc_weights), None) 41 | derivatives = [cur_der] 42 | prev_der = doc_weights 43 | for i in range(len(self.hidden_layers)-1): 44 | prev_der = np.dot(prev_der, self.hidden_layers[-i-1].T) 45 | prev_der *= activations[-i-2]*(1.-activations[-i-2]) 46 | 47 | w_der = np.dot(activations[-i-3].T, prev_der) 48 | b_der = np.sum(prev_der, axis=0, keepdims=True) 49 | 50 | derivatives.append((w_der, b_der)) 51 | 52 | return derivatives 53 | 54 | def debugstr(self): 55 | for i, hd in enumerate(self.hidden_layers[:-1]): 56 | print 'layer %d:' % i, hd 57 | print 'bias %d:' % i, self.biases[i] 58 | print 'final hidden:', self.hidden_layers[-1] 59 | 60 | 61 | def update_to_documents(self, doc_ind, doc_weights): 62 | derivatives = self.backpropagate(doc_ind, doc_weights) 63 | 64 | first_wd = derivatives[0][0] 65 | self.hidden_layers[-1] += first_wd * self.learning_rate 66 | for i, (wd, bd) in enumerate(derivatives[1:], 2): 67 | self.hidden_layers[-i] += wd * self.learning_rate 68 | self.biases[-i + 1] += bd * self.learning_rate 69 | self.learning_rate *= self.learning_rate_decay 70 | -------------------------------------------------------------------------------- /models/neuralnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class NeuralModel(object): 4 | 5 | def __init__(self, learning_rate, hidden_layers, n_features, 6 | regularize_rate=0., n_output=1): 7 | self.learning_rate = learning_rate 8 | self.regularize_rate = regularize_rate 9 | self.hidden_layer_nodes = hidden_layers 10 | self.hidden_layers = [] 11 | self.biases = [] 12 | prev_units = n_features 13 | for n_units in hidden_layers: 14 | self.hidden_layers.append(np.random.normal(0., 1./prev_units, (prev_units, n_units))) 15 | self.biases.append(np.random.normal(0., 1./prev_units, n_units)[None, :]) 16 | prev_units = n_units 17 | self.hidden_layers.append(np.random.normal(0., 1./prev_units, (prev_units, n_output))) 18 | 19 | def score(self, input): 20 | prev_layer = input.T 21 | self.input = input 22 | self.activations = [prev_layer] 23 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 24 | prev_layer = np.dot(prev_layer, hidden_layer) 25 | prev_layer += bias 26 | # prev_layer = np.maximum(0., prev_layer) 27 | prev_layer = 1./(1. + np.exp(-prev_layer)) 28 | self.activations.append(prev_layer) 29 | result = np.dot(prev_layer, self.hidden_layers[-1]) 30 | self.activations.append(result) 31 | return result 32 | 33 | # def predict(self, input): 34 | # prev_layer = input.T 35 | # for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 36 | # prev_layer = np.dot(prev_layer, hidden_layer) 37 | # prev_layer += bias 38 | # # prev_layer = np.maximum(0, prev_layer) 39 | # prev_layer = 1./(1. 
+ np.exp(-prev_layer)) 40 | # return np.dot(prev_layer, self.hidden_layers[-1]) 41 | 42 | def backpropagate(self, doc_weights): 43 | activations = self.activations 44 | doc_weights = np.expand_dims(doc_weights, axis=1) 45 | cur_der = (np.dot(activations[-2].T, doc_weights), None) 46 | derivatives = [cur_der] 47 | prev_der = doc_weights 48 | for i in range(len(self.hidden_layers)-1): 49 | prev_der = np.dot(prev_der, self.hidden_layers[-i-1].T) 50 | # prev_der[activations[-i-2] <= 0] = 0 51 | prev_der *= activations[-i-2]*(1.-activations[-i-2]) 52 | 53 | w_der = np.dot(activations[-i-3].T, prev_der) 54 | b_der = np.sum(prev_der, axis=0, keepdims=True) 55 | 56 | derivatives.append((w_der, b_der)) 57 | 58 | return derivatives 59 | 60 | def debugstr(self): 61 | for i, hd in enumerate(self.hidden_layers[:-1]): 62 | print 'layer %d:' % i, hd 63 | print 'bias %d:' % i, self.biases[i] 64 | print 'final hidden:', self.hidden_layers[-1] 65 | 66 | 67 | def update_to_documents(self, doc_weights): 68 | derivatives = self.backpropagate(doc_weights) 69 | 70 | first_wd = derivatives[0][0] 71 | self.hidden_layers[-1] += first_wd * self.learning_rate 72 | for i, (wd, bd) in enumerate(derivatives[1:], 2): 73 | self.hidden_layers[-i] += wd * self.learning_rate 74 | self.biases[-i + 1] += bd * self.learning_rate 75 | 76 | 77 | # def regularize_update(self): 78 | # rate = self.regularize_rate 79 | # if rate != 0: 80 | # self.hidden_layers[-1] -= rate * self.hidden_layers[-1] 81 | # for i in range(len(self.hidden_layers) - 1): 82 | # self.hidden_layers[i] -= rate * self.hidden_layers[i] 83 | # # self.biases[i] -= rate * self.biases[i] * 0.1 84 | 85 | -------------------------------------------------------------------------------- /multileaving/PairwisePreferenceMultileave.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | 6 | class PairwisePreferenceMultileave(object): 7 | 8 | def __init__(self, num_data_features, k=10): 9 | self._name = 'Pairwise Preferences Multileave' 10 | self._k = k 11 | self.needs_inverted = True 12 | self.needs_descending = True 13 | self.needs_oracle = False 14 | self.vector_aggregation = False 15 | 16 | def clean(self): 17 | del self._last_inverted_rankings 18 | 19 | def top_rank(self, multileaving, top_docs): 20 | n_disp = multileaving.shape[0] 21 | top_rank = np.zeros(n_disp, dtype=np.int32) 22 | top_rank[:] = n_disp 23 | for i in range(n_disp): 24 | in_rank = np.in1d(multileaving, top_docs[:,i]) 25 | top_rank[in_rank] = np.minimum(top_rank[in_rank],i) 26 | return top_rank 27 | 28 | def make_multileaving(self, descending_rankings, inverted_rankings): 29 | self._last_inverted_rankings = inverted_rankings 30 | self._last_descending_rankings = descending_rankings 31 | self._last_n_rankers = inverted_rankings.shape[0] 32 | 33 | n_docs = descending_rankings.shape[1] 34 | n_rankers = descending_rankings.shape[0] 35 | length = min(self._k,n_docs) 36 | multileaving = np.zeros(length, dtype=np.int32) 37 | previous_set = np.array([], dtype=np.int32) 38 | previous_results = {} 39 | self._last_choice_sizes = np.zeros(length) 40 | for i in range(length): 41 | full_set = np.unique(descending_rankings[:,:i+1]) 42 | cur_set = np.setdiff1d(full_set, multileaving[:i], assume_unique=True) 43 | multileaving[i] = np.random.choice(cur_set,1) 44 | self._last_choice_sizes[i] = cur_set.shape[0] 45 | self._last_top_ranks = self.top_rank(multileaving, descending_rankings) 46 | return multileaving 47 | 48 | def 
infer_preferences(self, result_list, clicked_docs): 49 | if np.any(clicked_docs): 50 | return self.preferences_of_list(result_list, clicked_docs.astype(bool)) 51 | else: 52 | return np.zeros((self._last_n_rankers, self._last_n_rankers)) 53 | 54 | def preferences_of_list(self, result_list, clicked_docs): 55 | n_disp = result_list.shape[0] 56 | n_rankers = self._last_n_rankers 57 | included = np.ones(min(self._k, clicked_docs.shape[0])) 58 | if not clicked_docs[-1]: 59 | included[1:] = np.cumsum(clicked_docs[::-1])[:0:-1] 60 | neg_pref = np.where(np.logical_xor(clicked_docs, included))[0] 61 | pos_pref = np.where(clicked_docs)[0] 62 | 63 | pair_neg = np.repeat(neg_pref, pos_pref.shape[0]) 64 | pair_pos = np.tile(pos_pref, neg_pref.shape[0]) 65 | 66 | pair_min_pos = np.minimum(pair_pos, pair_neg) 67 | pair_max_rank = np.maximum(self._last_top_ranks[pair_neg], self._last_top_ranks[pair_pos]) 68 | allowed_pairs = pair_min_pos >= pair_max_rank 69 | 70 | n_allowed_pairs = np.sum(allowed_pairs) 71 | if n_allowed_pairs > 0: 72 | pos_allow = pair_pos[allowed_pairs] 73 | neg_allow = pair_neg[allowed_pairs] 74 | pair_ind_pos = result_list[pos_allow] 75 | pair_ind_neg = result_list[neg_allow] 76 | 77 | pair_prob_comp = np.zeros(n_allowed_pairs) 78 | for i in range(n_allowed_pairs): 79 | pair_top = sorted([self._last_top_ranks[pos_allow[i]],self._last_top_ranks[neg_allow[i]]]) 80 | pair_prob_comp[i] = 1./np.prod(1.-1./self._last_choice_sizes[pair_top[0]:pair_top[1]]) 81 | 82 | correct_pairs = self._last_inverted_rankings[:, pair_ind_neg] \ 83 | - self._last_inverted_rankings[:, pair_ind_pos] > 0 84 | 85 | total_correct = np.sum(correct_pairs * pair_prob_comp, axis=1) \ 86 | / n_allowed_pairs 87 | 88 | else: 89 | total_correct = np.zeros(self._last_inverted_rankings.shape[0]) 90 | 91 | return total_correct[:,None] - total_correct[None,:] 92 | -------------------------------------------------------------------------------- /multileaving/ProbabilisticMultileave.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | 6 | class ProbabilisticMultileave(object): 7 | 8 | def __init__(self, n_samples=10000, n_results=10, tau=3.0): 9 | self._name = 'Probabilistic Multileave' 10 | self._k = n_results 11 | self._tau = tau 12 | self._n_samples = n_samples 13 | self.uses_inverted_rankings = True 14 | self.needs_inverted = True 15 | self.needs_descending = False 16 | self.needs_oracle = False 17 | self.vector_aggregation = False 18 | 19 | def clean(self): 20 | del self._last_inverted_rankings 21 | 22 | def make_multileaving(self, inverted_rankings): 23 | ''' 24 | ARGS: (all np.array of docids) 25 | - inverted_rankings: matrix (rankers x documents) where [x,y] corresponds to the rank of doc y in ranker x 26 | 27 | RETURNS 28 | - ranking of indices corresponding to inverted_rankings 29 | ''' 30 | self._last_inverted_rankings = inverted_rankings 31 | self._last_n_rankers = inverted_rankings.shape[0] 32 | n = inverted_rankings.shape[1] 33 | k = min(n, self._k) 34 | 35 | unnorm_probs = 1. 
/ (inverted_rankings + 1) ** self._tau
36 |         denom = np.sum(unnorm_probs, axis=1)
37 | 
38 |         ranking = np.empty(k, dtype=np.int32)
39 |         ind = np.arange(n)
40 |         for i in range(k):
41 |             norm_probs = unnorm_probs / denom[:, None]
42 |             probs = np.mean(norm_probs, axis=0)
43 |             choice = np.random.choice(ind, p=probs, replace=False)
44 |             ranking[i] = choice
45 |             denom -= unnorm_probs[:, choice]
46 |             unnorm_probs[:, choice] = 0
47 | 
48 |         self._last_ranking = ranking
49 |         return ranking
50 | 
51 |     def infer_preferences(self, clicked_docs):
52 |         if np.any(clicked_docs):
53 |             return self.preferences_of_list(self.probability_of_list(self._last_ranking,
54 |                                             self._last_inverted_rankings,
55 |                                             clicked_docs.astype(bool), self._tau), self._n_samples)
56 |         else:
57 |             return np.zeros((self._last_n_rankers, self._last_n_rankers))
58 | 
59 |     def winning_rankers(self, clicked_docs):
60 |         match = self.infer_preferences(clicked_docs)
61 |         return np.where(match[:, 0] > 0)[0]
62 | 
63 |     def probability_of_list(self, result_list, inverted_rankings, clicked_docs, tau):
64 |         '''
65 |         ARGS: (all np.array of docids)
66 |         - result_list: the multileaved list
67 |         - inverted_rankings: matrix (rankers x documents) where [x,y] corresponds to the rank of doc y in ranker x
68 |         - clicked_docs: boolean array of result_list length indicating clicks
69 | 
70 |         RETURNS
71 |         - sigmas: matrix (rankers x clicked_docs) with the probability that each ranker added each clicked doc
72 |         '''
73 |         n_docs = inverted_rankings.shape[1]
74 |         n_rankers = inverted_rankings.shape[0]
75 | 
76 |         click_doc_ind = result_list[clicked_docs]
77 | 
78 |         # normalization denominator for the complete ranking
79 |         sigmoid_total = np.sum(float(1) / (np.arange(n_docs) + 1) ** self._tau)
80 | 
81 | 
82 |         # cumsum is used to renormalize the probs, it contains the part of
83 |         # the denominator that has to be removed due to previously added docs
84 |         cumsum = np.zeros((n_rankers, result_list.shape[0]))
85 |         cumsum[:, 1:] = np.cumsum(float(1) / (inverted_rankings[:, result_list[:-1]] + 1.)
86 |                                   ** self._tau, axis=1)
87 | 
88 |         # make sure inverted rankings is of dtype float
89 |         sigmas = 1 / (inverted_rankings[:, click_doc_ind].T + 1.) 
** self._tau 90 | sigmas /= sigmoid_total - cumsum[:, clicked_docs].T 91 | 92 | return sigmas / np.sum(sigmas, axis=1)[:, None] 93 | 94 | def preferences_of_list(self, probs, n_samples): 95 | ''' 96 | ARGS: 97 | -probs: clicked docs x rankers matrix with probabilities ranker added clicked doc (use probability_of_list) 98 | -n_samples: number of samples to base preference matrix on 99 | 100 | RETURNS: 101 | - preference matrix: matrix (rankers x rankers) in this matrix [x,y] > 0 means x won over y and [x,y] < 0 means x lost from y 102 | the value is analogous to the (average) degree of preference 103 | ''' 104 | 105 | n_clicks = probs.shape[0] 106 | n_rankers = probs.shape[1] 107 | # determine upper bounds for each ranker (to see prob distribution as set of ranges) 108 | upper = np.cumsum(probs, axis=1) 109 | 110 | # determine lower bounds 111 | lower = np.zeros(probs.shape) 112 | # lower[:,0] = 0 113 | lower[:, 1:] += upper[:, :-1] 114 | 115 | # flip coins, coins fall between lower and upper 116 | coinflips = np.random.rand(n_clicks, self._n_samples) 117 | # make copies for each sample and each ranker 118 | comps = coinflips[:, :, None] 119 | # determine where each coin landed 120 | log_assign = np.logical_and(comps > lower[:, None, :], comps <= upper[:, None, :]) 121 | # click count per ranker (samples x rankers) 122 | click_count = np.sum(log_assign, axis=0) 123 | # the preference matrix for each sample 124 | prefs = np.sign(click_count[:, :, None] - click_count[:, None, :]) 125 | 126 | # the preferences are averaged for each pair 127 | # in this matrix [x,y] > 0 means x won over y and [x,y] < 0 means x lost from y 128 | # the value is analogous to the (average) degree of preference 129 | return np.sum(prefs, axis=0) / float(self._n_samples) 130 | -------------------------------------------------------------------------------- /multileaving/TeamDraftMultileave.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | 6 | class TeamDraftMultileave(object): 7 | 8 | def __init__(self, n_results=10): 9 | self._name = 'Team-Draft Multileave' 10 | self._k = n_results 11 | self.uses_inverted_rankings = False 12 | self.needs_inverted = False 13 | self.needs_descending = True 14 | self.needs_oracle = False 15 | self.vector_aggregation = False 16 | 17 | def clean(self): 18 | del self.teams 19 | 20 | def next_index_to_add(self, inter_result, inter_n, ranking, index): 21 | while index < ranking.shape[0] and np.any(ranking[index] == inter_result[:inter_n]): 22 | index += 1 23 | return index 24 | 25 | def make_multileaving(self, descending_rankings): 26 | 27 | rankings = descending_rankings 28 | 29 | n_rankings = rankings.shape[0] 30 | k = min(self._k, rankings.shape[1]) 31 | teams = np.zeros(k, dtype=np.int32) 32 | multileaved = np.zeros(k, dtype=np.int32) 33 | 34 | multi_i = 0 35 | while multi_i < k and np.all(rankings[1:, multi_i] == rankings[0, multi_i]): 36 | multileaved[multi_i] = rankings[0][multi_i] 37 | teams[multi_i] = -1 38 | multi_i += 1 39 | 40 | indices = np.zeros(n_rankings, dtype=np.int32) + multi_i 41 | assignment = np.arange(n_rankings) 42 | assign_i = n_rankings 43 | while multi_i < k: 44 | if assign_i == n_rankings: 45 | np.random.shuffle(assignment) 46 | assign_i = 0 47 | 48 | rank_i = assignment[assign_i] 49 | indices[rank_i] = self.next_index_to_add(multileaved, multi_i, 50 | rankings[rank_i,:], 51 | indices[rank_i]) 52 | multileaved[multi_i] = rankings[rank_i, indices[rank_i]] 53 | 
teams[multi_i] = rank_i 54 | indices[rank_i] += 1 55 | multi_i += 1 56 | assign_i += 1 57 | 58 | self.teams = teams 59 | self.n_rankers = n_rankings 60 | return multileaved 61 | 62 | def infer_preferences(self, clicked_docs): 63 | clicked_docs = clicked_docs.astype(bool) 64 | assigned_clicks = np.sum(np.arange(self.n_rankers)[:,None] == self.teams[clicked_docs][None,:],axis=1) 65 | return np.sign(assigned_clicks[:,None] - assigned_clicks[None,:]) 66 | 67 | def winning_rankers(self, clicked_docs): 68 | ranker_range = np.arange(self.n_rankers) 69 | match_matrix = ranker_range[:,None] == self.teams[clicked_docs][None,:] 70 | ranker_clicks = np.sum(match_matrix.astype(np.int32), axis=1) 71 | # print self.teams, clicked_docs.astype(int), 72 | # print ranker_range[ranker_clicks[0] < ranker_clicks] 73 | return ranker_range[ranker_clicks[0] < ranker_clicks] 74 | 75 | def winning_rankers_with_clicks(self, clicked_docs): 76 | # Return click info as well 77 | ranker_range = np.arange(self.n_rankers) 78 | match_matrix = ranker_range[:,None] == self.teams[clicked_docs][None,:] 79 | ranker_clicks = np.sum(match_matrix.astype(np.int32), axis=1) 80 | # print self.teams, clicked_docs.astype(int), 81 | # print ranker_range[ranker_clicks[0] < ranker_clicks] 82 | return ranker_range[ranker_clicks[0] < ranker_clicks], ranker_clicks 83 | -------------------------------------------------------------------------------- /multileaving/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/multileaving/__init__.py -------------------------------------------------------------------------------- /scripts/CIKM2018.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | 17 | description = 'Run script for testing framework.' 
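# All run scripts in scripts/ follow the same pattern: build a parser, call
# parse_all_args() once per hyper-parameter configuration, and collect
# (run_name, RankerClass, other_args) tuples before starting the simulation.
# A minimal sketch of that pattern (the ranker choice and parameter values
# here are illustrative only, not an additional experiment):
#
#     parser = SimulationArgumentParser(description='minimal example')
#     sim_args, other_args = parser.parse_all_args({'learning_rate_decay': 0.9999977})
#     rankers = [('example/TD-DBGD', TD_DBGD, other_args)]
#     DataSimulation(sim_args).run(rankers)
#
# parse_all_args returns an argparse.Namespace of simulation-level options and
# a dict of ranker arguments (remaining command-line values merged with the
# overrides passed in); see utils/argparsers/simulationargparser.py.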
18 | parser = SimulationArgumentParser(description=description) 19 | 20 | rankers = [] 21 | 22 | ranker_params = { 23 | 'learning_rate_decay': 0.9999977} 24 | sim_args, other_args = parser.parse_all_args(ranker_params) 25 | 26 | # run_name = 'speedtest/TD-DBGD' 27 | # rankers.append((run_name, TD_DBGD, other_args)) 28 | 29 | run_name = 'CIKM2018/P-DBGD' 30 | rankers.append((run_name, P_DBGD, other_args)) 31 | 32 | run_name = 'CIKM2018/DeepP-DBGD' 33 | rankers.append((run_name, Neural_P_DBGD, other_args)) 34 | 35 | # run_name = 'speedtest/TD-MGD' 36 | # rankers.append((run_name, TD_MGD, other_args)) 37 | 38 | run_name = 'CIKM2018/P-MGD' 39 | rankers.append((run_name, P_MGD, other_args)) 40 | 41 | ranker_params = { 42 | 'learning_rate_decay': 0.9999977, 43 | 'epsilon': 0.8} 44 | sim_args, other_args = parser.parse_all_args(ranker_params) 45 | 46 | run_name = 'CIKM2018/Pairwise' 47 | rankers.append((run_name, Pairwise, other_args)) 48 | 49 | ranker_params = { 50 | 'learning_rate': 0.1, 51 | 'learning_rate_decay': 0.9999977, 52 | } 53 | sim_args, other_args = parser.parse_all_args(ranker_params) 54 | 55 | run_name = 'CIKM2018/PDGD' 56 | rankers.append((run_name, PDGD, other_args)) 57 | 58 | run_name = 'CIKM2018/DeepPDGD' 59 | rankers.append((run_name, DeepPDGD, other_args)) 60 | 61 | sim = DataSimulation(sim_args) 62 | sim.run(rankers) -------------------------------------------------------------------------------- /scripts/Poisoning_attacks/attack_DBGD_99_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
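# Example invocation of this attack script (the dataset name and click model
# are placeholders; --data_sets and --click_models are required, while the
# attack-specific flags shown are optional and default to the values defined
# in utils/argparsers/simulationargparser.py):
#
#     python scripts/Poisoning_attacks/attack_DBGD_99_lr.py \
#         --data_sets MQ2007 --click_models inf \
#         --attacker_click_model naive_intersection_attack \
#         --start 0 --end 5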
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.99} 29 | 30 | sim_args, other_args = parser.parse_all_args(ranker_params) 31 | run_name = 'attack/TD_DBGD' 32 | 33 | rankers.append((run_name, TD_DBGD, other_args)) 34 | 35 | sim = DataSimulation(sim_args) 36 | sim.run(rankers) 37 | -------------------------------------------------------------------------------- /scripts/Poisoning_attacks/attack_DBGD_base_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977} 29 | 30 | sim_args, other_args = parser.parse_all_args(ranker_params) 31 | run_name = 'attack/TD_DBGD' 32 | 33 | rankers.append((run_name, TD_DBGD, other_args)) 34 | 35 | sim = DataSimulation(sim_args) 36 | sim.run(rankers) 37 | -------------------------------------------------------------------------------- /scripts/Poisoning_attacks/attack_MGD_99_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
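# The '99' in this script's name refers to the aggressive learning-rate decay
# set below (0.99), versus the 0.9999977 used by the *_base_lr variants.
# Assuming the decay is applied once per impression, the rough effect on the
# step-size factor over a 10,000-impression run is:
#
#     0.99      ** 10000  ~  2e-44   (updates vanish almost immediately)
#     0.9999977 ** 10000  ~  0.977   (step size stays nearly constant)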
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.99} 29 | 30 | sim_args, other_args = parser.parse_all_args(ranker_params) 31 | run_name = 'attack/TD_MGD' 32 | 33 | rankers.append((run_name, TD_MGD, other_args)) 34 | 35 | sim = DataSimulation(sim_args) 36 | sim.run(rankers) 37 | -------------------------------------------------------------------------------- /scripts/Poisoning_attacks/attack_MGD_base_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977} 29 | 30 | sim_args, other_args = parser.parse_all_args(ranker_params) 31 | run_name = 'attack/TD_MGD' 32 | 33 | rankers.append((run_name, TD_MGD, other_args)) 34 | 35 | sim = DataSimulation(sim_args) 36 | sim.run(rankers) 37 | -------------------------------------------------------------------------------- /scripts/SIGIR2018.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
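# The NSGD-specific parameters below map onto the three NSGD steps summarized
# in the README (this mapping is our reading of the parameter names, not a
# statement from the paper): GRAD_SIZE bounds the pool of past poorly
# performing directions whose null space is computed, EXP_SIZE is the number
# of candidate directions sampled from that null space, and TB_QUEUE_SIZE /
# TB_WINDOW_SIZE size the queue of historically difficult queries and the
# history window used by the tie-breaking step.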
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977} 29 | sim_args, other_args = parser.parse_all_args(ranker_params) 30 | 31 | run_name = 'SIGIR2018/TD-DBGD' 32 | rankers.append((run_name, TD_DBGD, other_args)) 33 | 34 | run_name = 'SIGIR2018/TD-MGD' 35 | rankers.append((run_name, TD_MGD, other_args)) 36 | 37 | ranker_params = { 38 | 'learning_rate_decay': 0.9999977, 39 | 'GRAD_SIZE':60, 40 | 'EXP_SIZE':25, 41 | 'TB_QUEUE_SIZE':10, 42 | 'TB_WINDOW_SIZE':50} 43 | sim_args, other_args = parser.parse_all_args(ranker_params) 44 | 45 | run_name = 'SIGIR2018/TD_NSGD' 46 | rankers.append((run_name, TD_NSGD, other_args)) 47 | 48 | 49 | 50 | 51 | sim = DataSimulation(sim_args) 52 | sim.run(rankers) -------------------------------------------------------------------------------- /scripts/SIGIR2019.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
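# Note on the DSP parameter blocks below: 'k_initial' and 'k_increase'
# presumably control the number of document feature vectors spanning the
# projection space, and the key 'prev_qeury_len' is passed through verbatim to
# the DSP rankers, so its spelling should only be changed together with the
# ranker implementations that consume it.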
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977} 29 | sim_args, other_args = parser.parse_all_args(ranker_params) 30 | 31 | run_name = 'SIGIR2019/P-DBGD' 32 | rankers.append((run_name, P_DBGD, other_args)) 33 | 34 | run_name = 'SIGIR2019/P-MGD' 35 | rankers.append((run_name, P_MGD, other_args)) 36 | 37 | ranker_params = { 38 | 'learning_rate_decay': 0.9999977, 39 | 'GRAD_SIZE':60, 40 | 'EXP_SIZE':25, 41 | 'TB_QUEUE_SIZE':10, 42 | 'TB_WINDOW_SIZE':50} 43 | sim_args, other_args = parser.parse_all_args(ranker_params) 44 | 45 | run_name = 'SIGIR2019/TD_NSGD' 46 | rankers.append((run_name, TD_NSGD, other_args)) 47 | 48 | 49 | # DBGD based algorithms with document space projection 50 | ranker_params = { 51 | 'learning_rate_decay': 0.9999977, 52 | 'k_initial': 3, 53 | 'k_increase': False, 54 | 'prev_qeury_len': 10} 55 | sim_args, other_args = parser.parse_all_args(ranker_params) 56 | 57 | run_name = 'SIGIR2019/P_DBGD_DSP' 58 | rankers.append((run_name, P_DBGD_DSP, other_args)) 59 | 60 | run_name = 'SIGIR2019/P_MGD_DSP' 61 | rankers.append((run_name, P_MGD_DSP, other_args)) 62 | 63 | 64 | 65 | # NSGD with document space projection 66 | ranker_params = { 67 | 'learning_rate_decay': 0.9999977, 68 | 'k_initial': 3, 69 | 'k_increase': False, 70 | 'GRAD_SIZE':60, 71 | 'EXP_SIZE':25, 72 | 'TB_QUEUE_SIZE':10, 73 | 'TB_WINDOW_SIZE':50, 74 | 'prev_qeury_len': 10} 75 | sim_args, other_args = parser.parse_all_args(ranker_params) 76 | 77 | run_name = 'SIGIR2019/TD_NSGD_DSP' 78 | rankers.append((run_name, TD_NSGD_DSP, other_args)) 79 | 80 | 81 | 82 | sim = DataSimulation(sim_args) 83 | sim.run(rankers) -------------------------------------------------------------------------------- /scripts/SIGIR2019_nsgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
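# This script re-runs only the NSGD subset of scripts/SIGIR2019.py (TD_NSGD
# and its DSP variant); the slurm files under scripts/slurm/SIGIR2019/nsgd/
# submit it per dataset. A typical submission from the repository root:
#
#     sbatch scripts/slurm/SIGIR2019/nsgd/0708.slurm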
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977, 29 | 'GRAD_SIZE':60, 30 | 'EXP_SIZE':25, 31 | 'TB_QUEUE_SIZE':10, 32 | 'TB_WINDOW_SIZE':50} 33 | sim_args, other_args = parser.parse_all_args(ranker_params) 34 | 35 | run_name = 'SIGIR2019/TD_NSGD' 36 | rankers.append((run_name, TD_NSGD, other_args)) 37 | 38 | 39 | 40 | # NSGD with document space projection 41 | ranker_params = { 42 | 'learning_rate_decay': 0.9999977, 43 | 'k_initial': 3, 44 | 'k_increase': False, 45 | 'GRAD_SIZE':60, 46 | 'EXP_SIZE':25, 47 | 'TB_QUEUE_SIZE':10, 48 | 'TB_WINDOW_SIZE':50, 49 | 'prev_qeury_len': 10} 50 | sim_args, other_args = parser.parse_all_args(ranker_params) 51 | 52 | run_name = 'SIGIR2019/DSGD_TD_NSGD' 53 | rankers.append((run_name, TD_NSGD_DSP, other_args)) 54 | 55 | 56 | 57 | sim = DataSimulation(sim_args) 58 | sim.run(rankers) -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/0708.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019.py --data_sets local_MQ2007 local_MQ2008 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/np.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019.py --data_sets local_NP2003 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/nsgd/0708.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python 
scripts/SIGIR2019_nsgd.py --data_sets local_MQ2007 local_MQ2008 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/nsgd/np.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019_nsgd.py --data_sets local_NP2003 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/nsgd/web10k.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019_nsgd.py --data_sets local_MSLR-WEB10K --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/nsgd/webscope1.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019_nsgd.py --data_sets local_Webscope_C14_Set1 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 5 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/web10k.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019.py --data_sets local_MSLR-WEB10K 
--click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/webscope1.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019.py --data_sets local_Webscope_C14_Set1 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 5 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/utils/__init__.py -------------------------------------------------------------------------------- /utils/argparsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/utils/argparsers/__init__.py -------------------------------------------------------------------------------- /utils/argparsers/simulationargparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import time 5 | import json 6 | 7 | class SimulationArgumentParser(argparse.ArgumentParser): 8 | 9 | def __init__(self, description=None, set_arguments={}): 10 | self._description = description 11 | self._initial_set_arguments = set_arguments.copy() 12 | self._set_arguments = set_arguments 13 | self._initial_arguments = {} 14 | self._simulation_arguments = [] 15 | self._arguments_initialized = False 16 | 17 | super(SimulationArgumentParser, self).__init__(description=description) 18 | 19 | self._sim_add_argument('--n_runs', dest='n_runs', default=125, type=int, 20 | help='Number of runs to be simulated over a Dataset.') 21 | 22 | self._sim_add_argument('--n_impr', dest='n_impressions', default=1000, type=int, 23 | help='Number of impressions per simulated run.') 24 | 25 | self._sim_add_argument('--vali', dest='validation', action='store_true', 26 | help='Use of validation set instead of testset.') 27 | 28 | self._sim_add_argument('--vali_in_train', dest='validation_in_train', action='store_true', 29 | help='Prevents validation set being added to training set.') 30 | 31 | self._sim_add_argument('--data_sets', dest='data_sets', type=str, required=True, 32 | help='Paths to folders where the data-folds are stored.', nargs='+') 33 | 34 | self._sim_add_argument('--output_folder', dest='output_folder', type=str, required=False, 35 | help='Path to folders where outputs should be stored, if not given output will be printed.' 
36 |                                , default='./output')
37 | 
38 |         self._sim_add_argument('--log_folder', dest='log_folder', type=str, required=False,
39 |                                help='Path to folders where run log and errors will be stored.',
40 |                                default='./log/')
41 | 
42 |         self._sim_add_argument('--average_folder', dest='average_folder', type=str, required=False,
43 |                                help='Path to folders where averaged output of runs will be stored.',
44 |                                default='./average')
45 | 
46 |         self._sim_add_argument('--attacker_average_folder', dest='attacker_average_folder', type=str, required=False,
47 |                                help='Path to folders where averaged attacker output of runs will be stored.',
48 |                                default='./attackeraverage')
49 | 
50 |         self._sim_add_argument('--attacker_folder', dest='attacker_folder', type=str, required=False,
51 |                                help='Path to folders where attacker output of runs will be stored.',
52 |                                default='./attackeroutput')
53 | 
54 |         self._sim_add_argument('--small_dataset', dest='small_dataset', action='store_false',
55 |                                help='Set true if dataset is small and memory is never a concern.')
56 | 
57 |         self._sim_add_argument('--click_models', dest='click_models', default='exper1', type=str, required=True,
58 |                                help='Click models to be used.', nargs='+')
59 | 
60 |         self._sim_add_argument('--print_freq', dest='print_freq', type=int, required=False,
61 |                                help='The number of steps taken before another one is printed after the first batch.'
62 |                                , default=10)
63 | 
64 |         self._sim_add_argument('--print_logscale', dest='print_logscale', action='store_true',
65 |                                help='Decrease print frequency semi-logarithmically.')
66 | 
67 |         self._sim_add_argument('--print_output', dest='print_output', action='store_true',
68 |                                help='Set true if outputs should be printed and not stored.')
69 | 
70 |         self._sim_add_argument('--max_folds', dest='max_folds', type=int, required=False,
71 |                                help='The maximum number of folds that may be loaded at any time, default is unlimited.' 
72 |                                , default=None)
73 | 
74 |         self._sim_add_argument('--n_proc', dest='n_processing', default=1, type=int,
75 |                                help='Max number of work-processes to run in parallel.')
76 | 
77 |         self._sim_add_argument('--no_run_details', dest='no_run_details', action='store_true',
78 |                                help='Do not print all run arguments at the start of the simulation.')
79 | 
80 |         self._sim_add_argument('--n_results', dest='n_results', default=10, type=int,
81 |                                help='Number of results shown after each query.')
82 | 
83 |         self._sim_add_argument('--skip_read_bin_data', dest='read_binarized_data', action='store_false')
84 |         self._sim_add_argument('--skip_store_bin_data', dest='store_binarized_data_after_read',
85 |                                action='store_false')
86 | 
87 |         self._sim_add_argument('--train_only', dest='train_only', action='store_true',
88 |                                help='Only calculate train NDCG.')
89 | 
90 |         self._sim_add_argument('--all_train', dest='all_train', action='store_false',
91 |                                help='Stop simulation from printing train NDCG at every step.')
92 | 
93 |         self._sim_add_argument('--nonrel_test', dest='purge_test_set', action='store_false',
94 |                                help='Include non-relevant queries in evaluation on test-set.')
95 | 
96 |         # Additional arguments added by Rishab
97 |         self._sim_add_argument('--mf', dest='mf', default=5, type=int,
98 |                                help='Number of most frequent documents to look at.')
99 | 
100 |         self._sim_add_argument('--sd_const', dest='sd_const', default=2.0, type=float,
101 |                                help='How many standard deviations away to look.')
102 | 
103 |         self._sim_add_argument('--start', dest='start', default=0, type=int,
104 |                                help='Which documents to intersect (start)')
105 | 
106 |         self._sim_add_argument('--end', dest='end', default=5, type=int,
107 |                                help='Which documents to intersect (end)')
108 | 
109 |         self._sim_add_argument('--which', dest='which', default=-1, type=int,
110 |                                help='Which portion of the run to attack (first 2000, second 2000, etc.). Possible values include 0 (attack all), 1, 2, 3, 4, 5')
111 | 
112 |         self._sim_add_argument('--attacker_click_model', dest='attacker_click_model', default='naive_intersection_attack', type=str,
113 |                                help="Name of the attacker's click model. 
Possible names include (naive_intersection_attack, frequency_attack)") 114 | 115 | self._sim_add_argument('--num_attacker_relevant', dest='num_attacker_relevant', default=5, type=int, 116 | help='How many documents in attacker ranking are relevant to the attacker') 117 | 118 | 119 | 120 | self._arguments_initialized = False 121 | 122 | def reset_arguments(self): 123 | self._set_arguments = self._initial_set_arguments.copy() 124 | 125 | def set_argument(self, name, value): 126 | self._set_arguments[name] = value 127 | 128 | def remove_argument(self, name): 129 | del self._set_arguments[name] 130 | 131 | def _sim_add_argument(self, *args, **kargs): 132 | if 'dest' in kargs: 133 | name = kargs['dest'] 134 | elif args[0][:2] == '--': 135 | name = args[0][2:] 136 | else: 137 | assert args[0][:1] == '-' 138 | name = args[0][1:] 139 | 140 | assert name != 'description' 141 | if not name in self._set_arguments: 142 | super(SimulationArgumentParser, self).add_argument(*args, **kargs) 143 | 144 | assert name not in self._simulation_arguments 145 | self._simulation_arguments.append(name) 146 | 147 | def parse_sim_args(self): 148 | args = vars(self.parse_args()) 149 | sim_args = { 150 | 'description': self._description, 151 | } 152 | for name, value in args.items(): 153 | if name in self._simulation_arguments: 154 | sim_args[name] = value 155 | return argparse.Namespace(**sim_args) 156 | 157 | def parse_other_args(self, ranker_args=None, ranker=None): 158 | args = vars(self.parse_args()) 159 | other_args = {} 160 | if ranker: 161 | other_args.update( 162 | ranker.default_ranker_parameters() 163 | ) 164 | for name, value in args.items(): 165 | if name not in self._simulation_arguments: 166 | other_args[name] = value 167 | if ranker_args: 168 | other_args.update(ranker_args) 169 | return other_args 170 | 171 | def parse_all_args(self, ranker_args=None, ranker=None): 172 | return (self.parse_sim_args(), 173 | self.parse_other_args( 174 | ranker_args = ranker_args, 175 | ranker = ranker)) 176 | -------------------------------------------------------------------------------- /utils/attackeraverager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import os 5 | import traceback 6 | import json 7 | 8 | 9 | def cumulative(ranking, discount=0.9995): 10 | return np.cumsum(discount ** np.arange(ranking.shape[0]) * ranking) 11 | 12 | 13 | def convert_time(time_in_seconds): 14 | seconds = time_in_seconds % 60 15 | minutes = time_in_seconds / 60 % 60 16 | hours = time_in_seconds / 3600 17 | return '%02d:%02d:%02d' % (hours, minutes, seconds) 18 | 19 | 20 | def print_array(array): 21 | return ' '.join([str(x) for x in array] + ['\n']) 22 | 23 | 24 | def create_folders(filename): 25 | if not os.path.exists(os.path.dirname(filename)): 26 | os.makedirs(os.path.dirname(filename)) 27 | 28 | class AttackerAverager(object): 29 | 30 | def __init__(self, simulation_arguments): 31 | self.attacker_average_folder = simulation_arguments.attacker_average_folder 32 | self._average_index = 0 33 | 34 | # def click_model_name(self, full_name): 35 | # return str(full_name[:full_name.rfind('_')]) 36 | 37 | def average_results(self, output_path): 38 | with open(output_path, 'r') as f: 39 | sim_args = json.loads(f.readline()) 40 | first_run = json.loads(f.readline()) 41 | run_details = first_run['run_details'] 42 | 43 | cur_click_model = run_details['attacker_click_model'] 44 | # self.click_model_name( 45 | # run_details['click model']) 46 | 
runtimes = { 47 | cur_click_model: [float(run_details['runtime'])], 48 | } 49 | 50 | all_ind = {} 51 | first_val = {} 52 | for event in first_run['run_results']: 53 | iteration = event['iteration'] 54 | for name, val in event.items(): 55 | if name == 'iteration': 56 | continue 57 | if name not in all_ind: 58 | all_ind[name] = [] 59 | first_val[name] = [] 60 | all_ind[name].append(iteration) 61 | first_val[name].append(val) 62 | 63 | all_val = {} 64 | for name in all_ind: 65 | all_ind[name] = np.array(all_ind[name], 66 | dtype=np.int32) 67 | all_val[name] = { 68 | cur_click_model: [np.array(first_val[name], 69 | dtype=float)] 70 | } 71 | 72 | for line in f: 73 | events = json.loads(line) 74 | 75 | run_details = events['run_details'] 76 | cur_click_model = run_details['attacker_click_model'] 77 | # cur_click_model = self.click_model_name( 78 | # run_details['click model']) 79 | if cur_click_model not in runtimes: 80 | runtimes[cur_click_model] = [] 81 | 82 | runtimes[cur_click_model].append( 83 | float(run_details['runtime'])) 84 | 85 | cur_i = {} 86 | cur_val = {} 87 | for name, val in all_ind.items(): 88 | cur_i[name] = 0 89 | cur_val[name] = np.zeros(val.shape) 90 | if cur_click_model not in all_val[name]: 91 | all_val[name][cur_click_model] = [] 92 | all_val[name][cur_click_model].append(cur_val[name]) 93 | 94 | for event in events['run_results']: 95 | iteration = event['iteration'] 96 | for name, val in event.items(): 97 | if name != 'iteration': 98 | c_i = cur_i[name] 99 | assert all_ind[name][c_i] == iteration 100 | cur_val[name][c_i] = val 101 | cur_i[name] += 1 102 | 103 | for name, val in all_ind.items(): 104 | if name != 'iteration': 105 | assert cur_i[name] == val.shape[0] 106 | 107 | average_runtimes = {} 108 | for click_model, values in runtimes.items(): 109 | average_runtimes[click_model] = np.mean(values).tolist() 110 | 111 | results = {} 112 | for name, cur_ind in all_ind.items(): 113 | cur_results = { 114 | 'indices': cur_ind.tolist() 115 | } 116 | results[name] = cur_results 117 | for click_model, lists in all_val[name].items(): 118 | stacked = np.stack(lists) 119 | cm_mean = np.mean(stacked, axis=0) 120 | cm_std = np.std(stacked, axis=0) 121 | cur_results[click_model] = { 122 | 'mean': cm_mean.tolist(), 123 | 'std': cm_std.tolist(), 124 | } 125 | 126 | output = { 127 | 'simulation_arguments': sim_args, 128 | 'runtimes': average_runtimes, 129 | 'results': results 130 | } 131 | 132 | return output 133 | 134 | def create_average_file(self, sim_output): 135 | print "opening %s" % sim_output.output_path 136 | output = self.average_results(sim_output.output_path) 137 | 138 | self.dataset_path = '%s/%s' % (self.attacker_average_folder, sim_output.dataset_name) 139 | self.output_path = '%s/%s.out' % (self.dataset_path, sim_output.simulation_name+sim_output.additional_file_name) 140 | 141 | # print "Output path inside averager: ", self.output_path 142 | create_folders(self.dataset_path) 143 | create_folders(self.output_path) 144 | with open(self.output_path, 'w') as w: 145 | w.write(json.dumps(output)) 146 | print 'Closed %d: %s on %s was averaged and stored.' 
% (self._average_index, 147 | sim_output.simulation_name+sim_output.additional_file_name, sim_output.dataset_name) 148 | 149 | self._average_index += 1 150 | 151 | class IndependentAttackerAverager(AttackerAverager): 152 | def __init__(self, attacker_average_folder): 153 | self.attacker_average_folder = attacker_average_folder 154 | self._average_index = 0 155 | -------------------------------------------------------------------------------- /utils/attackeroutput.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import os 5 | import sys 6 | import time 7 | from datetime import timedelta 8 | 9 | def create_folders(filename): 10 | if not os.path.exists(os.path.dirname(filename)): 11 | os.makedirs(os.path.dirname(filename)) 12 | 13 | class AttackerFileOutput(object): 14 | 15 | def __init__(self, output_file_path, output_header=None, close_between_writes=False, 16 | also_print=False, write_date=False): 17 | self._output_file_path = output_file_path 18 | self._close_between_writes = close_between_writes 19 | self._also_print = also_print 20 | self._original_stdout = sys.stdout 21 | self.write_date = write_date 22 | create_folders(self._output_file_path) 23 | self._output_file = open(self._output_file_path, 'w') 24 | self._file_open = True 25 | self._new_line = True 26 | self._closed = False 27 | if not output_header is None: 28 | self.write(output_header) 29 | self._end_write() 30 | 31 | def _open_file(self): 32 | if not self._file_open: 33 | self._output_file = open(self._output_file_path, 'a') 34 | self._file_open = True 35 | 36 | def _close_file(self): 37 | self._output_file.close() 38 | self._file_open = False 39 | 40 | def _end_write(self): 41 | if self._close_between_writes: 42 | self._close_file() 43 | 44 | def _write_str_to_file(self, output_str): 45 | self._output_file.write(output_str) 46 | self._new_line = output_str[-1] == '\n' 47 | 48 | def flush(self): 49 | if self._also_print: 50 | self._original_stdout.flush() 51 | self._output_file.flush() 52 | 53 | def write(self, output, skip_write_date=False): 54 | assert not self._closed 55 | self._open_file() 56 | for line in output: 57 | if self.write_date and self._new_line and not skip_write_date: 58 | line = '%s: %s' % (time.strftime('%c'), str(line)) 59 | # assert type(line) is str, 'Output element %s is not a str' % line 60 | self._write_str_to_file(str(line)) 61 | if self._also_print: 62 | self._original_stdout.write(line) 63 | self._end_write() 64 | 65 | def close(self): 66 | self._close_file() 67 | self._closed = True 68 | if self._also_print: 69 | self._original_stdout.write('\n') 70 | 71 | 72 | class AttackerOutput(object): 73 | 74 | """ 75 | Class designed to manage the multiprocessing of simulations over multiple datasets. 
76 | """ 77 | 78 | def __init__(self, simulation_arguments, simulation_name, dataset, num_click_models, 79 | ranker_arguments, attacker_averager): 80 | self._start_time = time.time() 81 | self.run_index = 0 82 | self.attacker_output_folder = simulation_arguments.attacker_folder 83 | self.simulation_name = simulation_name 84 | self.dataset_name = dataset.name 85 | self.attacker_averager = attacker_averager 86 | self.print_output = simulation_arguments.print_output 87 | self._expected_runs = dataset.num_runs_per_fold * dataset.num_folds * num_click_models 88 | self._closed = False 89 | 90 | 91 | self.additional_file_name = "" 92 | 93 | if "freq" in simulation_arguments.click_models[0]: 94 | self.additional_file_name = "_"+simulation_arguments.attacker_click_model+"_"+str(simulation_arguments.n_results)+"_res_" \ 95 | +str(simulation_arguments.start)+"_start_"+str(simulation_arguments.end)+"_end_"+str(simulation_arguments.mf)+"_mf_"+str(simulation_arguments.sd_const)+"_sd_" \ 96 | +str(simulation_arguments.which)+"_half_"+str(simulation_arguments.n_impressions)+"_impressions"+str(ranker_arguments['learning_rate_decay'])+"_lrdecay" 97 | 98 | else: 99 | self.additional_file_name = "_"+simulation_arguments.attacker_click_model+"_"+str(simulation_arguments.n_results)+"_res_"+str(simulation_arguments.start)+"_start_"+str(simulation_arguments.end)+"_end_" \ 100 | +str(simulation_arguments.which)+"_half_"+str(simulation_arguments.n_impressions)+"_impressions"+str(ranker_arguments['learning_rate_decay'])+"_lrdecay" 101 | 102 | self.output_path = '%s/%s/%s.out' % (self.attacker_output_folder, self.dataset_name, 103 | self.simulation_name+self.additional_file_name) 104 | print "output path: ", self.output_path 105 | combined_args = { 106 | 'simulation_arguments': vars(simulation_arguments), 107 | 'ranker_arguments': ranker_arguments, 108 | } 109 | if self.print_output: 110 | output_header = json.dumps(combined_args, sort_keys=True, 111 | indent=4, separators=(',', ': ')) 112 | self.attacker_file_output = AttackerBufferPrintOutput(output_header) 113 | else: 114 | output_header = json.dumps(combined_args, separators=(',',':')) 115 | self.attacker_file_output = AttackerFileOutput(self.output_path, output_header, 116 | close_between_writes=True, also_print=False, 117 | write_date=False) 118 | 119 | def expected_runs(self): 120 | return self._expected_runs 121 | 122 | def finished(self): 123 | return self._closed and self.run_index == self._expected_runs 124 | 125 | def write_run_output(self, run_output): 126 | assert not self._closed, 'Simulation Output (%s) written to after being closed.' 
\ 127 | % self.output_path 128 | 129 | if self.print_output: 130 | # self.file_output.write(json.dumps(run_output, sort_keys=True, 131 | # indent=4, separators=(',', ': '))) 132 | self.attacker_file_output.pretty_run_write(self.run_index, run_output) 133 | else: 134 | self.attacker_file_output.write('\n%s' % json.dumps(run_output)) 135 | 136 | self.run_index += 1 137 | if self.run_index >= self._expected_runs: 138 | self.close() 139 | 140 | def close(self, output_file=None): 141 | self.attacker_file_output.close() 142 | self._closed = True 143 | if not self.print_output: 144 | self.attacker_averager.create_average_file(self) 145 | 146 | 147 | class AttackerBufferPrintOutput(object): 148 | 149 | def __init__(self, output_header=None): 150 | self._closed = False 151 | self._output_list = [] 152 | if not output_header is None: 153 | self.write(output_header) 154 | 155 | def flush(self): 156 | pass 157 | 158 | def write(self, output): 159 | assert not self._closed 160 | assert type(output) is str, 'Wrong output format %s' % type(output) 161 | self._output_list.append(output) 162 | 163 | def pretty_run_write(self, run_index, run_output): 164 | run_details = run_output['run_details'] 165 | run_lines = [ 166 | "RUN: %d" % run_index, 167 | "DATAFOLD: %s" % run_details['data folder'], 168 | "CLICK MODEL: %s" % run_details['click model'], 169 | "ATTACKER CLICK MODEL: %s" % run_details['attacker_click_model'], 170 | "RUN TIME: %s (%.02f seconds)" % (timedelta(seconds=run_details['runtime']), 171 | run_details['runtime']) 172 | ] 173 | tag = run_details['held-out data'] 174 | for event in run_output['run_results']: 175 | str_line = str(event['iteration']) 176 | if 'display' in event: 177 | str_line += ' DISPLAY: %0.3f' % event['display'] 178 | if 'heldout' in event: 179 | str_line += ' %s: %0.3f' % (tag, event['heldout']) 180 | run_lines.append(str_line) 181 | for line in run_lines: 182 | self.write(line) 183 | 184 | def close(self): 185 | self._closed = True 186 | print 'Run Output\n' + '\n'.join(self._output_list) 187 | self._output_list = [] 188 | -------------------------------------------------------------------------------- /utils/averageoutput.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import os 5 | import traceback 6 | import json 7 | 8 | 9 | def cumulative(ranking, discount=0.9995): 10 | return np.cumsum(discount ** np.arange(ranking.shape[0]) * ranking) 11 | 12 | 13 | def convert_time(time_in_seconds): 14 | seconds = time_in_seconds % 60 15 | minutes = time_in_seconds / 60 % 60 16 | hours = time_in_seconds / 3600 17 | return '%02d:%02d:%02d' % (hours, minutes, seconds) 18 | 19 | 20 | def print_array(array): 21 | return ' '.join([str(x) for x in array] + ['\n']) 22 | 23 | 24 | def create_folders(filename): 25 | if not os.path.exists(os.path.dirname(filename)): 26 | os.makedirs(os.path.dirname(filename)) 27 | 28 | class OutputAverager(object): 29 | 30 | def __init__(self, simulation_arguments): 31 | self.average_folder = simulation_arguments.average_folder 32 | self._average_index = 0 33 | 34 | def click_model_name(self, full_name): 35 | return str(full_name[:full_name.rfind('_')]) 36 | 37 | def average_results(self, output_path): 38 | with open(output_path, 'r') as f: 39 | sim_args = json.loads(f.readline()) 40 | first_run = json.loads(f.readline()) 41 | run_details = first_run['run_details'] 42 | 43 | cur_click_model = self.click_model_name( 44 | run_details['click model']) 45 | runtimes = { 
46 | cur_click_model: [float(run_details['runtime'])], 47 | } 48 | 49 | all_ind = {} 50 | first_val = {} 51 | for event in first_run['run_results']: 52 | iteration = event['iteration'] 53 | for name, val in event.items(): 54 | if name == 'iteration': 55 | continue 56 | if name not in all_ind: 57 | all_ind[name] = [] 58 | first_val[name] = [] 59 | all_ind[name].append(iteration) 60 | first_val[name].append(val) 61 | 62 | all_val = {} 63 | for name in all_ind: 64 | all_ind[name] = np.array(all_ind[name], 65 | dtype=np.int32) 66 | all_val[name] = { 67 | cur_click_model: [np.array(first_val[name], 68 | dtype=float)] 69 | } 70 | 71 | for line in f: 72 | events = json.loads(line) 73 | 74 | run_details = events['run_details'] 75 | cur_click_model = self.click_model_name( 76 | run_details['click model']) 77 | if cur_click_model not in runtimes: 78 | runtimes[cur_click_model] = [] 79 | 80 | runtimes[cur_click_model].append( 81 | float(run_details['runtime'])) 82 | 83 | cur_i = {} 84 | cur_val = {} 85 | for name, val in all_ind.items(): 86 | cur_i[name] = 0 87 | cur_val[name] = np.zeros(val.shape) 88 | if cur_click_model not in all_val[name]: 89 | all_val[name][cur_click_model] = [] 90 | all_val[name][cur_click_model].append(cur_val[name]) 91 | 92 | for event in events['run_results']: 93 | iteration = event['iteration'] 94 | for name, val in event.items(): 95 | if name != 'iteration': 96 | c_i = cur_i[name] 97 | assert all_ind[name][c_i] == iteration 98 | cur_val[name][c_i] = val 99 | cur_i[name] += 1 100 | 101 | for name, val in all_ind.items(): 102 | if name != 'iteration': 103 | assert cur_i[name] == val.shape[0] 104 | 105 | average_runtimes = {} 106 | for click_model, values in runtimes.items(): 107 | average_runtimes[click_model] = np.mean(values).tolist() 108 | 109 | results = {} 110 | for name, cur_ind in all_ind.items(): 111 | cur_results = { 112 | 'indices': cur_ind.tolist() 113 | } 114 | results[name] = cur_results 115 | for click_model, lists in all_val[name].items(): 116 | stacked = np.stack(lists) 117 | cm_mean = np.mean(stacked, axis=0) 118 | cm_std = np.std(stacked, axis=0) 119 | cur_results[click_model] = { 120 | 'mean': cm_mean.tolist(), 121 | 'std': cm_std.tolist(), 122 | } 123 | 124 | output = { 125 | 'simulation_arguments': sim_args, 126 | 'runtimes': average_runtimes, 127 | 'results': results 128 | } 129 | 130 | return output 131 | 132 | def create_average_file(self, sim_output): 133 | print "opening %s" % sim_output.output_path 134 | output = self.average_results(sim_output.output_path) 135 | 136 | self.dataset_path = '%s/%s' % (self.average_folder, sim_output.dataset_name) 137 | self.output_path = '%s/%s.out' % (self.dataset_path, sim_output.simulation_name) 138 | create_folders(self.dataset_path) 139 | create_folders(self.output_path) 140 | with open(self.output_path, 'w') as w: 141 | w.write(json.dumps(output)) 142 | print 'Closed %d: %s on %s was averaged and stored.' 
% (self._average_index, 143 | sim_output.simulation_name, sim_output.dataset_name) 144 | 145 | self._average_index += 1 146 | 147 | class IndependentOutputAverager(OutputAverager): 148 | def __init__(self, average_folder): 149 | self.average_folder = average_folder 150 | self._average_index = 0 151 | -------------------------------------------------------------------------------- /utils/clicks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import operator 3 | import random 4 | 5 | 6 | class ClickModel(object): 7 | 8 | ''' 9 | Class for cascading click-models used to simulate clicks. 10 | ''' 11 | 12 | def __init__(self, name, data_type, PCLICK, PSTOP): 13 | ''' 14 | Name is used for logging, data_type denotes the degrees of relevance the data uses. 15 | PCLICK and PSTOP the probabilities used by the model. 16 | ''' 17 | self.name = name 18 | self.type = data_type 19 | self.PCLICK = PCLICK 20 | self.PSTOP = PSTOP 21 | 22 | def get_name(self): 23 | ''' 24 | Name that can be used for logging. 25 | ''' 26 | return self.name + '_' + self.type 27 | 28 | def generate_clicks(self, ranking, all_labels): 29 | ''' 30 | Generates clicks for a given ranking and relevance labels. 31 | ranking: np array of indices which correspond with all_labels 32 | all_labels: np array of integers 33 | ''' 34 | labels = all_labels[ranking] 35 | coinflips = np.random.rand(*ranking.shape) 36 | clicks = coinflips < self.PCLICK[labels] 37 | coinflips = np.random.rand(*ranking.shape) 38 | stops = coinflips < self.PSTOP[labels] 39 | stopped_clicks = np.zeros(ranking.shape, dtype=bool) 40 | if np.any(stops): 41 | clicks_before_stop = np.logical_and(clicks, np.arange(ranking.shape[0]) 42 | <= np.where(stops)[0][0]) 43 | stopped_clicks[clicks_before_stop] = True 44 | return stopped_clicks 45 | else: 46 | return np.zeros(ranking.shape, dtype=bool) + clicks 47 | 48 | 49 | class ExamineClickModel(object): 50 | 51 | ''' 52 | Class for cascading click-models used to simulate clicks. 53 | ''' 54 | 55 | def __init__(self, name, data_type, PCLICK, eta): 56 | ''' 57 | Name is used for logging, data_type denotes the degrees of relevance the data uses. 58 | PCLICK and PSTOP the probabilities used by the model. 59 | ''' 60 | self.name = name 61 | self.type = data_type 62 | self.PCLICK = PCLICK 63 | self.eta = eta 64 | 65 | def get_name(self): 66 | ''' 67 | Name that can be used for logging. 68 | ''' 69 | return self.name + '_' + self.type 70 | 71 | def generate_clicks(self, ranking, all_labels): 72 | ''' 73 | Generates clicks for a given ranking and relevance labels. 74 | ranking: np array of indices which correspond with all_labels 75 | all_labels: np array of integers 76 | ''' 77 | n_results = ranking.shape[0] 78 | examine_prob = (1./(np.arange(n_results)+1))**self.eta 79 | stop_prob = np.ones(n_results) 80 | stop_prob[1:] -= examine_prob[1:]/examine_prob[:-1] 81 | stop_prob[0] = 0. 
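        # stop_prob is constructed so that, if the user stopped independently
        # at each rank, the marginal probability of examining rank i would
        # telescope to examine_prob[i] = (1/(i+1))**eta: rank 0 is always
        # examined, and the user continues from rank i-1 to rank i with
        # probability examine_prob[i]/examine_prob[i-1]. For example, with
        # eta = 1: examine_prob = [1, 1/2, 1/3, ...] and
        # stop_prob = [0, 1/2, 1/3, ...]. Below, stopping is additionally
        # conditioned on the document being clicked.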
82 | 
83 |         labels = all_labels[ranking]
84 |         coinflips = np.random.rand(*ranking.shape)
85 |         clicks = coinflips < self.PCLICK[labels]
86 |         coinflips = np.random.rand(n_results)
87 |         stops = coinflips < stop_prob
88 |         stops = np.logical_and(stops, clicks)
89 |         stopped_clicks = np.zeros(ranking.shape, dtype=bool)
90 |         if np.any(stops):
91 |             clicks_before_stop = np.logical_and(clicks, np.arange(ranking.shape[0])
92 |                                                 <= np.where(stops)[0][0])
93 |             stopped_clicks[clicks_before_stop] = True
94 |             return stopped_clicks
95 |         else:
96 |             return np.zeros(ranking.shape, dtype=bool) + clicks
97 | 
98 | class MaliciousClickModel(object):
99 | 
100 |     '''
101 |     Class for click models used to simulate malicious (poisoning) clicks.
102 |     '''
103 | 
104 |     def __init__(self, name, data_type):
105 |         '''
106 |         Name is used for logging and identifying the attack type.
107 |         '''
108 |         self.name = name
109 |         self.type = data_type
110 | 
111 |     def get_name(self):
112 |         '''
113 |         Name that can be used for logging.
114 |         '''
115 |         return self.name + '_' + self.type
116 | 
117 |     def generate_clicks(self, train_ranking, attacker_ranking, start, end, freq, mf, sd_const):
118 | 
119 |         if self.name == "naive_intersection_attack":
120 |             return self.naive_intersection_attack(train_ranking, attacker_ranking, start, end)
121 |         elif self.name == "frequency_attack":
122 |             return self.frequency_attack(train_ranking, attacker_ranking, freq, mf, start, end, sd_const)
123 |         else:
124 |             print("Attack name is incorrect. Only 'naive_intersection_attack' and 'frequency_attack' are supported!\n")
125 | 
126 | 
127 |     def naive_intersection_attack(self, train_ranking, attacker_ranking, start, end):
128 |         '''
129 |         Generates malicious clicks based on the intersection of train_ranking and attacker_ranking.
130 |         The intersection is guided by the start and end hyper-parameters.
131 |         '''
132 |         clicks = []
133 | 
134 |         for i in range(0, len(train_ranking)):
135 | 
136 |             if (len(attacker_ranking) >= end and train_ranking[i] in attacker_ranking[start:end]):
137 |                 clicks.append(True)
138 |             else:
139 |                 clicks.append(False)
140 | 
141 |         return np.zeros(train_ranking.shape, dtype=bool) + clicks
142 | 
143 | 
144 |     def frequency_attack(self, train_ranking, attacker_ranking, freq, mf, start, end, sd_const):
145 |         '''
146 |         Generates malicious clicks based on the intersection of train_ranking and attacker_ranking.
147 |         The intersection is guided by the start and end hyper-parameters.
148 |         mf controls which documents get clicked in the intersection.
149 |         mf: number of most frequent docs that the attacker assumes come from the current ranker.
150 |         freq: frequency table containing (doc, freq) pairs.
151 |         '''
152 | 
153 |         # Sort the frequency table by frequency, most frequent first
154 |         sorted_freqs = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
155 | 
156 |         # Break the table into docs and frequencies. The top_k_docs can be considered a proxy for the current ranker's ranking
157 |         i = 0
158 |         top_k_docs = []
159 |         top_k_freq = []
160 |         while (i < len(sorted_freqs) and i < mf):
161 |             top_k_docs.append(sorted_freqs[i][0])
162 |             top_k_freq.append(sorted_freqs[i][1])
163 |             i += 1
164 | 
165 |         clicks = []
166 | 
167 |         # Standard deviation of the top-k frequencies, used below to locate a frequency gap
168 |         sd = np.std(top_k_freq) if (len(top_k_freq) > 0) else 0
169 | 
170 |         # Find the first position whose frequency is at least sd_const standard deviations above the next one; if such a position is found it overrides mf, otherwise mf is kept.
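        # Illustrative example (hypothetical numbers): with top_k_freq = [95, 90, 40, 38]
        # and sd_const = 1, sd is roughly 26.8; the first gap of at least sd_const*sd is
        # 90 - 40 = 50, so ind = 2 below and only the two most frequent documents are
        # treated as the current ranker's results (and excluded from clicking).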
171 |         ind = len(top_k_freq)
172 |         for index in range(0, len(top_k_freq)-1):
173 |             if sd_const*sd <= top_k_freq[index] - top_k_freq[index+1]:
174 |                 ind = index+1
175 |                 break
176 | 
177 |         # Generate the clicks using the intersection, also making sure that the document is not one of the most frequent documents
178 |         for i in range(0, len(train_ranking)):
179 |             if train_ranking[i] not in top_k_docs[0:ind] and train_ranking[i] in attacker_ranking[start:end]:
180 |                 clicks.append(True)
181 |             else:
182 |                 clicks.append(False)
183 | 
184 |         return np.zeros(train_ranking.shape, dtype=bool) + clicks
185 | 
186 | 
187 | # create synonyms for keywords to ease command line use
188 | syn_tuples = [
189 |     ('ex_per_1', ['exper1']),
190 |     ('navigational', ['nav', 'navi', 'navig', 'navigat']),
191 |     ('informational', ['inf', 'info', 'infor', 'informat']),
192 |     ('perfect', ['per', 'perf']),
193 |     ('almost_random', [
194 |         'alm',
195 |         'almost',
196 |         'alra',
197 |         'arand',
198 |         'almostrandom',
199 |         'almrand',
200 |     ]),
201 |     ('random', ['ran', 'rand']),
202 |     ('binary', ['bin']),
203 |     ('short', []),
204 |     ('long', []),
205 | ]
206 | attack_tuples = [
207 |     ('naive_intersection_attack', []),
208 |     ('frequency_attack', []),
209 | ]
210 | synonyms = {}
211 | for full, abrv_list in syn_tuples:
212 |     assert full not in synonyms or synonyms[full] == full
213 |     synonyms[full] = full
214 |     for abrv in abrv_list:
215 |         assert abrv not in synonyms or synonyms[abrv] == full
216 |         synonyms[abrv] = full
217 | 
218 | attack_synonyms = {}
219 | for full, abrv_list in attack_tuples:
220 |     assert full not in attack_synonyms or attack_synonyms[full] == full
221 |     attack_synonyms[full] = full
222 |     for abrv in abrv_list:
223 |         assert abrv not in attack_synonyms or attack_synonyms[abrv] == full
224 |         attack_synonyms[abrv] = full
225 | 
226 | bin_models = {}
227 | bin_models['navigational'] = np.array([.05, .95]), np.array([.2, .9])
228 | bin_models['informational'] = np.array([.4, .9]), np.array([.1, .5])
229 | bin_models['perfect'] = np.array([.0, 1.]), np.array([.0, .0])
230 | bin_models['almost_random'] = np.array([.4, .6]), np.array([.5, .5])
231 | bin_models['random'] = np.array([.5, .5]), np.array([.0, .0])
232 | bin_models['ex_per_1'] = np.array([.0, 1.]), 1.0
233 | 
234 | short_models = {}
235 | short_models['navigational'] = np.array([.05, .5, .95]), np.array([.2, .5, .9])
236 | short_models['informational'] = np.array([.4, .7, .9]), np.array([.1, .3, .5])
237 | short_models['perfect'] = np.array([.0, .5, 1.]), np.array([.0, .0, .0])
238 | short_models['almost_random'] = np.array([.4, .5, .6]), np.array([.5, .5, .5])
239 | short_models['random'] = np.array([.5, .5, .5]), np.array([.0, .0, .0])
240 | short_models['ex_per_1'] = np.array([.0, .5, 1.]), 1.0
241 | 
242 | long_models = {}
243 | long_models['navigational'] = np.array([.05, .3, .5, .7, .95]), np.array([.2, .3, .5, .7, .9])
244 | long_models['informational'] = np.array([.4, .6, .7, .8, .9]), np.array([.1, .2, .3, .4, .5])
245 | long_models['perfect'] = np.array([.0, .2, .4, .8, 1.]), np.array([.0, .0, .0, .0, .0])
246 | long_models['almost_random'] = np.array([.4, .45, .5, .55, .6]), np.array([.5, .5, .5, .5, .5])
247 | long_models['random'] = np.array([.5, .5, .5, .5, .5]), np.array([.0, .0, .0, .0, .0])
248 | long_models['ex_per_1'] = np.array([.0, .2, .4, .8, 1.]), 1.0
249 | 
250 | all_models = {'short': short_models, 'binary': bin_models, 'long': long_models}
251 | 
252 | def get_click_models(keywords):
253 |     '''
254 |     Convenience function which returns click
models corresponding with keywords. 255 | only returns click functions for one data type: (bin,short,long) 256 | ''' 257 | type_name = None 258 | type_keyword = None 259 | # print("Keywords: ", keywords) 260 | for keyword in keywords: 261 | assert (keyword in synonyms) or (keyword in attack_synonyms) 262 | if keyword in synonyms and synonyms[keyword] in all_models: 263 | type_name = synonyms[keyword] 264 | type_keyword = keyword 265 | break 266 | assert type_name is not None and type_keyword is not None 267 | 268 | models_type = all_models[type_name] 269 | full_names = [] 270 | for key in keywords: 271 | if key in synonyms and key != type_keyword: 272 | full_names.append(synonyms[key]) 273 | if key in attack_synonyms: 274 | full_names.append(attack_synonyms[key]) 275 | 276 | click_models = [] 277 | 278 | for full in full_names: 279 | if full in attack_synonyms: 280 | c_m = MaliciousClickModel(full, type_name) 281 | elif full == 'ex_per_1': 282 | c_m = ExamineClickModel(full, type_name, *models_type[full]) 283 | else: 284 | c_m = ClickModel(full, type_name, *models_type[full]) 285 | click_models.append(c_m) 286 | 287 | return click_models -------------------------------------------------------------------------------- /utils/datasimulation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import random 4 | import time 5 | import sharedmem 6 | import datetime 7 | import gc 8 | import sys 9 | import numpy as np 10 | from attacksimulation import AttackSimulation 11 | from multiprocessing import Process, Queue 12 | from Queue import Empty 13 | from utils.clicks import get_click_models 14 | from utils.datasetcollections import get_datasets 15 | from utils.simulationoutput import SimulationOutput, get_simulation_report 16 | from utils.attackeroutput import AttackerOutput 17 | from utils.averageoutput import OutputAverager 18 | from utils.attackeraverager import AttackerAverager 19 | 20 | 21 | class DataSimulation(object): 22 | 23 | """ 24 | Class designed to manage the multiprocessing of simulations over multiple datasets. 
25 | """ 26 | 27 | def __init__(self, simulation_arguments): 28 | self.sim_args = simulation_arguments 29 | self.num_proc = simulation_arguments.n_processing 30 | self.n_runs = simulation_arguments.n_runs 31 | 32 | self.output_queue = Queue() 33 | self.single_sims = [] 34 | self.processes = [] 35 | 36 | self.folds_in_mem = 0 37 | self.max_folds = 999 38 | 39 | self.output_averager = OutputAverager(simulation_arguments) 40 | self.attacker_averager = AttackerAverager(simulation_arguments) 41 | self.report_output = get_simulation_report(simulation_arguments) 42 | sys.stdout = self.report_output 43 | sys.stderr = self.report_output 44 | 45 | 46 | def run(self, ranker_tuples): 47 | starttime = time.time() 48 | self.active = 0 49 | self.click_models = {} 50 | self.run_outputs = {} 51 | self.all_launched = {} 52 | self.run_index = 0 53 | self.read_index = 0 54 | self.clean_index = 0 55 | self._launched = 0 56 | self._outputs_found = 0 57 | datasets = list(get_datasets(self.sim_args)) 58 | for dataset in datasets: 59 | self.max_folds = min(self.max_folds, dataset.max_folds) 60 | if not dataset.click_model_type in self.click_models: 61 | missing_type = dataset.click_model_type 62 | missing_models = get_click_models(self.sim_args.click_models 63 | + [dataset.click_model_type]) 64 | self.click_models[missing_type] = missing_models 65 | 66 | for process in self.load_processes(datasets, ranker_tuples): 67 | self._launched += 1 68 | process.start() 69 | while self.update_active() >= self.num_proc: 70 | self.wait_for_output() 71 | 72 | while self._expecting_output(): 73 | self.wait_for_output() 74 | self.update_active() 75 | 76 | seconds_past = time.time() - starttime 77 | print ('Time taken: %s (%d seconds)' % 78 | (str(datetime.timedelta(seconds=seconds_past)), 79 | seconds_past)) 80 | 81 | for output in self.run_outputs.values(): 82 | print 'OP: ', output, ' -> ', output.finished() 83 | assert all(output.finished() for output in self.run_outputs.values()), \ 84 | 'Program exiting but not all outputs were finished.' 85 | 86 | def load_processes(self, datasets, ranker_tuples): 87 | for dataset in datasets: 88 | for datafold in dataset.get_data_folds(self.sim_args): 89 | for proc in self.load_datafold_processes(datafold, ranker_tuples): 90 | yield proc 91 | self.all_launched[datafold] = True 92 | while self.folds_in_mem >= dataset.max_folds: 93 | self.wait_for_output() 94 | 95 | def load_datafold_processes(self, datafold, ranker_tuples): 96 | while self.folds_in_mem >= datafold.max_folds: 97 | self.wait_for_output() 98 | self.update_active() 99 | print 'Read %d: Fold %d of dataset %s.' 
% (self.read_index, 100 | datafold.fold_num + 1, datafold.name) 101 | datafold.read_data() 102 | self.read_index += 1 103 | self.wait_for_output() 104 | self.update_active() 105 | for run_name, r_class, r_new_args in ranker_tuples: 106 | r_args = r_class.default_parameters() 107 | r_args.update(r_new_args) 108 | output_key = run_name, datafold.name 109 | attacker_output_key = run_name, datafold.name, "attacker" 110 | if not output_key in self.run_outputs: 111 | self.run_outputs[output_key] = SimulationOutput( 112 | self.sim_args, run_name, datafold, 113 | len(self.click_models[datafold.click_model_type]), r_args, 114 | self.output_averager) 115 | if not attacker_output_key in self.run_outputs: 116 | self.run_outputs[attacker_output_key] = AttackerOutput( 117 | self.sim_args, run_name, datafold, 118 | len(self.click_models[datafold.click_model_type]), r_args, 119 | self.attacker_averager) 120 | for c_m in self.click_models[datafold.click_model_type]: 121 | sim = AttackSimulation(self.sim_args, self.output_queue, c_m, datafold) 122 | ranker_setup = r_class, r_args 123 | r_args['n_results'] = self.sim_args.n_results 124 | r_args['n_features'] = datafold.num_features 125 | for i in xrange(datafold.num_runs_per_fold): 126 | new_proc = Process(target=self.start_run, args=(sim, output_key, attacker_output_key, ranker_setup, 127 | self.run_index)) 128 | self.processes.append((new_proc, datafold)) 129 | print 'Launch %d: %s %d with click model %s on fold %d from dataset %s.' % ( 130 | self.run_index, 131 | run_name, 132 | i, 133 | c_m.name, 134 | datafold.fold_num + 1, 135 | datafold.name, 136 | ) 137 | self.run_index += 1 138 | self.report_output.flush() 139 | yield new_proc 140 | 141 | def start_run(self, simulation, output_key, attacker_output_key, ranker_setup, seed=0): 142 | """ 143 | Performs a single run. 144 | Random functions get different seeds for each process. 145 | """ 146 | random.seed((time.time(), seed)) 147 | np.random.seed(int(time.time() + seed * 100 + seed)) 148 | rankerclass, ranker_args = ranker_setup 149 | ranker = rankerclass(**ranker_args) 150 | # print("ranker class: ", rankerclass.learning_rate) 151 | simulation.run(ranker, output_key=output_key, attacker_output_key=attacker_output_key) 152 | 153 | def update_active(self): 154 | """ 155 | Checks how many child processes are still active. 156 | """ 157 | dead_processes = [p for p in self.processes if not p[0].is_alive()] 158 | self.processes = [p for p in self.processes if p[0].is_alive()] 159 | alive_folds = {} 160 | for _, datafold in self.processes: 161 | alive_folds[datafold] = True 162 | self.folds_in_mem = len(alive_folds) 163 | 164 | self.max_folds = min([999] + [datafold.max_folds for datafold in alive_folds]) 165 | self.active = len(self.processes) 166 | dead_datafolds = {} 167 | for proc, datafold in dead_processes: 168 | proc.join() 169 | if not datafold in alive_folds and datafold in self.all_launched: 170 | dead_datafolds[datafold] = True 171 | 172 | for datafold in dead_datafolds: 173 | print 'Clean %d: Fold %d of dataset %s.' 
% (self.clean_index, datafold.fold_num + 1,
174 |                 datafold.name)
175 |             datafold.clean_data()
176 |             self.clean_index += 1
177 | 
178 |         # make extra sure that the process is removed from memory
179 |         del dead_processes
180 |         gc.collect()
181 | 
182 |         # print 'Folds %d max folds %d active %d' % (self.folds_in_mem, self.max_folds, self.active)
183 |         return self.active
184 | 
185 |     def wait_for_output(self, timeout=50):
186 |         """
187 |         Prints output for all finished threads.
188 |         """
189 |         found = not self._expecting_output()
190 |         try:
191 |             while True:
192 |                 output_key, run_output = self.output_queue.get(block=not found, timeout=timeout)
193 |                 found = True
194 |                 sim_output = self.run_outputs[output_key]
195 |                 print 'Output %d: %s on dataset %s. (%d/%d)' % (self._outputs_found, output_key[0],
196 |                     output_key[1], sim_output.run_index+1, sim_output.expected_runs())
197 |                 sim_output.write_run_output(run_output)
198 |                 self._outputs_found += 1
199 |         except Empty:
200 |             pass
201 |         self.update_active()
202 | 
203 |     def _expecting_output(self):
204 |         return self._outputs_found < 2*self._launched
205 | 
--------------------------------------------------------------------------------
/utils/evaluate.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from numpy import log2
4 | from random import sample
5 | import numpy as np
6 | import math
7 | import random
8 | 
9 | 
10 | def get_dcg(ordered_labels):
11 |     return np.sum((2 ** ordered_labels - 1) / np.log2(np.arange(ordered_labels.shape[0]) + 2))
12 | 
13 | 
14 | def get_idcg(complete_labels, max_len):
15 |     return get_dcg(np.sort(complete_labels)[:-1 - max_len:-1])
16 | 
17 | 
18 | def get_single_ndcg_for_rankers(descending_rankings, document_labels, max_len, idcg=None):
19 |     if idcg is None:
20 |         idcg = get_idcg(document_labels, max_len)
21 |     if idcg == 0:
22 |         return np.zeros(descending_rankings.shape[0])
23 |     return get_single_dcg_for_rankers(descending_rankings, document_labels, max_len)/idcg
24 | 
25 | 
26 | def get_single_dcg_for_rankers(descending_rankings, document_labels, max_len):
27 |     displayed_rankings = descending_rankings[:, :max_len]
28 |     displayed_labels = document_labels[displayed_rankings]
29 |     return np.sum((2 ** displayed_labels - 1) / np.log2(np.arange(displayed_labels.shape[1])
30 |                                                         + 2)[None, :], axis=1)
31 | 
32 | 
33 | def get_ndcg_with_labels(ranking, labels, max_len):
34 |     '''
35 |     Calculates the NDCG from a single inverted ranking (the position of every document) and the corresponding labels.
36 |     '''
37 |     idcg = get_idcg(np.asarray(labels), max_len)
38 |     if idcg == 0:
39 |         return 0
40 |     nominators = [2. ** label - 1. for label in labels]
41 |     denominators = [math.log(r+2., 2) for r in ranking]
42 |     for i in range(len(ranking)):
43 |         if ranking[i] >= max_len:
44 |             nominators[i] = 0
45 | 
46 |     ndcg = 0
47 |     for i in range(len(nominators)):
48 |         ndcg += nominators[i] / denominators[i] / idcg
49 |     return ndcg
50 | 
51 | 
52 | def get_ndcg_with_ranking(model_ranking, ideal_ranking, num_relevant, max_len):
53 |     '''
54 |     Given the model ranking and the attacker's (ideal) ranking, calculates the NDCG performance.
55 |     This score measures how close the two rankings are.
56 |     '''
57 | 
58 |     # Re-invert the model ranking, e.g. [2,3,4,1,0] => [4,3,0,1,2] (document 0 sits at position 2, so position 2 holds document 0, etc.).
59 |     # This is required because the ideal ranking is not inverted while the model_ranking is.
60 | 
61 |     non_inv_model_ranking = [0 for i in range(len(model_ranking))]
62 | 
63 |     for i in range(len(model_ranking)):
64 |         if model_ranking[i] < len(non_inv_model_ranking):
65 |             non_inv_model_ranking[model_ranking[i]] = i
66 | 
67 |     # Creating labels for the attacker. The num_relevant documents in the ideal ranking (attacker's ranking) are relevant (1), the others are not (0).
68 |     labels = [0 for i in range(len(model_ranking))]
69 |     relevant_ideal_ranking = ideal_ranking[:num_relevant]
70 | 
71 |     for document in non_inv_model_ranking:
72 |         if document in relevant_ideal_ranking and document < len(labels):
73 |             labels[document] = 1
74 | 
75 |     return get_ndcg_with_labels(model_ranking, labels, max_len)
76 | 
77 | 
78 | def evaluate(inverted_rankings, label_vector, idcg_vector, n_queries, max_len):
79 |     '''
80 |     Calculates the average NDCG over all queries.
81 |     inverted_rankings: the position of every document
82 |     label_vector: the relevance label of every document
83 |     idcg_vector: precomputed ideal DCG, spread per document (get_idcg_list with spread=True)
84 |     '''
85 | 
86 |     # gain of every document based on its relevance label
87 |     nominators = 2. ** label_vector - 1.
88 | 
89 |     # position-based discount (positions start at 0)
90 |     denominators = np.log2(inverted_rankings + 2.)
91 | 
92 |     # documents ranked beyond the display cutoff do not contribute to the DCG
93 |     nominators[inverted_rankings >= max_len] = 0
94 | 
95 |     idcg_copy = np.copy(idcg_vector)
96 |     idcg_copy[idcg_vector == 0] = 1
97 |     return np.sum(nominators / denominators / idcg_copy) / n_queries
98 | 
99 | 
100 | def get_dcg_from_matrix(label_matrix, n_vector, max_len):
101 |     label_matrix = label_matrix[:, :max_len]
102 | 
103 |     nominators = 2 ** label_matrix - 1
104 |     nominators[np.arange(max_len)[None, :] >= n_vector[:, None]] = 0
105 | 
106 |     denominator = np.log2(np.arange(max_len) + 2)
107 |     idcg_vector = np.sum(nominators / denominator[None, :], axis=1)
108 | 
109 |     return idcg_vector
110 | 
111 | 
112 | def get_idcg_list(label_vector, qptr, max_len, spread=False):
113 | 
114 |     n = qptr[1:] - qptr[:-1]
115 |     max_documents = np.max(n)
116 | 
117 |     starts = np.zeros(n.shape[0] + 1, dtype=np.int32)
118 |     starts[1:] = np.cumsum(n)
119 | 
120 |     ind = starts[:-1, None] + np.arange(0, max_documents)[None, :]
121 |     ind = np.minimum(ind, starts[1:, None] - 1)
122 | 
123 |     label_matrix = label_vector[ind]
124 |     label_matrix[np.arange(max_documents)[None, :] >= n[:, None]] = 0
125 |     label_matrix = np.sort(label_matrix, axis=1)[:, ::-1]
126 | 
127 |     idcg_list = get_dcg_from_matrix(label_matrix, n, max_len)
128 | 
129 |     if spread:
130 |         spread_ind = np.zeros(qptr[-1], dtype=np.int32)
131 |         spread_ind[qptr[1:-1]] = 1
132 |         spread_ind = np.cumsum(spread_ind)
133 | 
134 |         return idcg_list[spread_ind]
135 |     else:
136 |         return idcg_list
137 | 
--------------------------------------------------------------------------------
/utils/rankings.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def invert_rankings(rankings, dtype=None):
4 |     '''
5 |     Invert indices in a matrix of rankings, one ranking per row.
6 |     '''
7 |     if dtype is None:
8 |         inverted = np.zeros(rankings.shape)
9 |     else:
10 |         inverted = np.zeros(rankings.shape, dtype=dtype)
11 |     inverted[np.arange(rankings.shape[0])[:,None],rankings] = np.arange(rankings.shape[1])[None,:]
12 |     return inverted
13 | 
14 | def invert_ranking(ranking, dtype=None):
15 |     """
16 |     'Inverts' a ranking: each element gets the index it has in the ranking.
17 |     [2,0,1] becomes [1,2,0]
18 |     """
19 |     if dtype is None:
20 |         inverted = np.zeros(ranking.shape)
21 |     else:
22 |         inverted = np.zeros(ranking.shape, dtype=dtype)
23 |     inverted[ranking] = np.arange(ranking.shape[0])
24 |     return inverted
25 | 
26 | def tiebreak_sort(unranked, n_results=None, full_sort=False):
27 |     if full_sort or n_results is None:
28 |         n_results = unranked.shape[-1]
29 |     return _tiebreak_sort(unranked, n_results)
30 | 
31 | def _tiebreak_sort(unranked, n_results):
32 |     """
33 |     Sorts rows of a matrix using tiebreakers, along the last axis.
34 |     """
35 | 
36 |     n_axis = len(unranked.shape)
37 |     assert (n_axis == 1 or n_axis == 2)
38 | 
39 |     tiebreakers = np.random.random(unranked.shape)
40 |     complex_predictions = np.empty(unranked.shape, dtype=complex)
41 |     complex_predictions.real = unranked  # scores
42 |     complex_predictions.imag = tiebreakers  # random numbers to break ties
43 | 
44 |     max_n_docs = unranked.shape[-1]
45 |     max_part = np.minimum(n_results, max_n_docs)
46 |     if max_part == max_n_docs:
47 |         return np.argsort(complex_predictions, axis=-1)
48 | 
49 |     part = np.argpartition(complex_predictions, max_part-1, axis=-1)
50 |     slice_ind = (slice(None),) * (len(unranked.shape)-1)
51 |     slice_ind += (slice(0,max_part),)
52 | 
53 |     if n_axis == 1:
54 |         part_pred = complex_predictions[part[slice_ind]]
55 |         front_sort = np.argsort(part_pred, axis=-1)
56 |         part[slice_ind] = part[slice_ind][front_sort]
57 |     else:
58 |         # row indices are needed to index the two-dimensional case
59 |         part_sliced = part[slice_ind]
60 |         extra_ind = np.empty(part_sliced.shape, dtype=np.int32)
61 |         extra_ind[:,:] = np.arange(unranked.shape[0])[:,None]
62 |         part_pred = complex_predictions[extra_ind, part[slice_ind]]
63 |         front_sort = np.argsort(part_pred, axis=-1)  # index array from lowest prediction score to highest
64 |         part_sliced[:, :] = part_sliced[extra_ind, front_sort]
65 | 
66 |     return part
67 | 
68 | def get_score_rankings(weights,feature_matrix,qptr,max_documents=None, inverted=False):
69 |     """
70 |     Given weights and a feature matrix, the documents are scored by their dot product and ranked accordingly.
71 |     """
72 |     # minus to reverse the ranking
73 |     predictions = -np.squeeze(np.dot(weights.T,feature_matrix))
74 |     return rank_queries(predictions,qptr,max_documents=max_documents,inverted=inverted)
75 | 
76 | def rank_queries(predictions, qptr, max_documents=None, inverted=False):
77 |     """
78 |     Given predicted scores for queries, rankings are generated and returned.
79 |     """
80 | 
81 |     max_value = np.max(predictions)
82 |     # vector with the length of each document list
83 |     n = qptr[1:]-qptr[:-1]
84 |     if not max_documents:
85 |         max_documents = np.max(n)
86 | 
87 |     # the vector of documents is reshaped into a matrix
88 |     # with a document list on every row
89 |     ind = qptr[:-1,None] + np.arange(0,max_documents)[None,:]
90 |     ind = np.minimum(ind,qptr[1:,None]-1)
91 |     # warped is now a matrix of size n_queries x max_documents
92 |     warped = predictions[ind]
93 |     # every document that appears in a row but not in the query list
94 |     # (due to n_query_list < max_documents) gets the worst score of all documents
95 |     # this makes sure they do not appear in the final ranking
96 |     warped[np.arange(max_documents)[None,:] >= n[:,None]] = max_value + 1
97 | 
98 |     # tiebreak sort uses numpy to rank every row in the matrix
99 |     # this is faster than ranking the rows with separate calls
100 |     rankings = tiebreak_sort(warped)
101 |     if inverted:
102 |         inverted = invert_rankings(rankings,dtype=np.int32)
103 |         return inverted[np.arange(max_documents)[None,:] < n[:,None]]
104 | 
105 |     else:
106 |         return rankings[np.arange(max_documents)[None,:] < n[:,None]]
107 | 
108 | def rank_query(predictions, inverted=False, n_results=None):
109 |     """
110 |     Given predicted scores of a single query, returns a ranking.
111 | """ 112 | ranking = tiebreak_sort(predictions, n_results) 113 | if inverted: 114 | if len(ranking.shape) == 1: 115 | return invert_ranking(ranking,dtype=np.int32) 116 | else: 117 | return invert_rankings(ranking,dtype=np.int32) 118 | else: 119 | return ranking 120 | 121 | def rank_candidate_queries(weights,feature_matrix,qptr,n_results=None,inverted=False): 122 | n_docs = feature_matrix.shape[1] 123 | scores = -np.dot(weights,feature_matrix) 124 | qid_per_doc = np.zeros(n_docs, dtype=np.int32) 125 | qid_per_doc[qptr[1:-1]] = 1 126 | qid_per_doc = np.cumsum(qid_per_doc) 127 | 128 | index_offset = np.zeros(n_docs, dtype=np.int32) 129 | index_offset[:] = qptr[qid_per_doc] 130 | 131 | score_offset = (np.max(np.abs(scores),axis=1)+1.)[:,None]*qid_per_doc[None,:] 132 | scores += score_offset 133 | 134 | descending = rank_query(scores, n_results=n_results) 135 | 136 | if not inverted: 137 | descending -= index_offset[None,:] 138 | return descending, None 139 | else: 140 | inverted = invert_rankings(descending, dtype=np.int64) 141 | descending -= index_offset[None,:] 142 | inverted -= index_offset[None,:] 143 | return descending, inverted 144 | 145 | def get_query_scores(weights, feature_matrix, qptr, ranking_i): 146 | return -np.dot(weights.T,feature_matrix[:,qptr[ranking_i]:qptr[ranking_i+1]]) 147 | 148 | def get_candidate_score_rankings(weights, feature_matrix, qptr, ranking_i, inverted=False): 149 | scores = -np.dot(weights.T,feature_matrix[:,qptr[ranking_i]:qptr[ranking_i+1]]) 150 | return rank_query(scores,inverted) 151 | 152 | def get_candidate_score_ranking(weights,query_feature_matrix,inverted=False): 153 | scores = -np.dot(weights.T,query_feature_matrix) 154 | return rank_query(scores,inverted) 155 | 156 | def rank_single_query(predictions, inverted=False, n_results=None): 157 | """ 158 | Given predicted scores of a single query returns rankings. 159 | """ 160 | ranking = tiebreak_sort(predictions, n_results=n_results) 161 | if inverted: 162 | if len(ranking.shape) == 1: 163 | return invert_ranking(ranking, dtype=np.int32) 164 | else: 165 | return invert_rankings(ranking, dtype=np.int32) 166 | else: 167 | return ranking 168 | 169 | def rank_multiple_queries(predictions, qptr, max_documents=None, 170 | inverted=False, n_results=None): 171 | """ 172 | Given predicted scores for queries rankings are generated and returned. 
173 |     """
174 | 
175 |     max_value = np.max(predictions)
176 |     # vector with the length of each document list
177 |     n = qptr[1:]-qptr[:-1]
178 |     if not max_documents:
179 |         max_documents = np.max(n)
180 | 
181 |     # the vector of documents is reshaped into a matrix
182 |     # with a document list on every row
183 |     ind = qptr[:-1,None] + np.arange(0,max_documents)[None,:]
184 |     ind = np.minimum(ind,qptr[1:,None]-1)
185 |     # warped is now a matrix of size n_queries x max_documents
186 |     warped = predictions[ind]
187 |     # every document that appears in a row but not in the query list
188 |     # (due to n_query_list < max_documents) gets the worst score of all documents
189 |     # this makes sure they do not appear in the final ranking
190 |     warped[np.arange(max_documents)[None,:] >= n[:,None]] = max_value + 1
191 | 
192 |     # tiebreak sort uses numpy to rank every row in the matrix
193 |     # this is faster than ranking the rows with separate calls
194 |     rankings = tiebreak_sort(warped, n_results=n_results)
195 |     if inverted:
196 |         inverted = invert_rankings(rankings, dtype=np.int32)  # index is the document id, content is its rank: inverted[10] = 0 means document 10 has the highest score
197 |         return inverted[np.arange(max_documents)[None,:] < n[:,None]]
198 |     else:
199 |         return rankings[np.arange(max_documents)[None,:] < n[:,None]]
--------------------------------------------------------------------------------
/utils/simulationoutput.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import json
4 | import os
5 | import sys
6 | import time
7 | from datetime import timedelta
8 | 
9 | def create_folders(filename):
10 |     if not os.path.exists(os.path.dirname(filename)):
11 |         os.makedirs(os.path.dirname(filename))
12 | 
13 | class FileOutput(object):
14 | 
15 |     def __init__(self, output_file_path, output_header=None, close_between_writes=False,
16 |                  also_print=False, write_date=False):
17 |         self._output_file_path = output_file_path
18 |         self._close_between_writes = close_between_writes
19 |         self._also_print = also_print
20 |         self._original_stdout = sys.stdout
21 |         self.write_date = write_date
22 |         create_folders(self._output_file_path)
23 |         self._output_file = open(self._output_file_path, 'w')
24 |         self._file_open = True
25 |         self._new_line = True
26 |         self._closed = False
27 |         if output_header is not None:
28 |             self.write(output_header)
29 |             self._end_write()
30 | 
31 |     def _open_file(self):
32 |         if not self._file_open:
33 |             self._output_file = open(self._output_file_path, 'a')
34 |             self._file_open = True
35 | 
36 |     def _close_file(self):
37 |         self._output_file.close()
38 |         self._file_open = False
39 | 
40 |     def _end_write(self):
41 |         if self._close_between_writes:
42 |             self._close_file()
43 | 
44 |     def _write_str_to_file(self, output_str):
45 |         self._output_file.write(output_str)
46 |         self._new_line = output_str[-1] == '\n'
47 | 
48 |     def flush(self):
49 |         if self._also_print:
50 |             self._original_stdout.flush()
51 |         self._output_file.flush()
52 | 
53 |     def write(self, output, skip_write_date=False):
54 |         assert not self._closed
55 |         # if isinstance(output, str):
56 |         #     output = [output]
57 |         # elif isinstance(output, list):
58 |         #     output = [line + '\n' for line in output]
59 |         # assert type(output) is list, 'Expected output to be list, found %s' % type(output)
60 |         self._open_file()
61 |         for line in output:
62 |             if self.write_date and self._new_line and not skip_write_date:
63 |                 line = '%s: %s' % (time.strftime('%c'), str(line))
64 |             # assert type(line) is str, 'Output element %s is not a str' % line
65 |             self._write_str_to_file(str(line))
66 |             if self._also_print:
67 |                 self._original_stdout.write(line)
68 |         self._end_write()
69 | 
70 |     def close(self):
71 |         self._close_file()
72 |         self._closed = True
73 |         if self._also_print:
74 |             self._original_stdout.write('\n')
75 | 
76 | 
77 | class PrintOutput(object):
78 | 
79 |     def __init__(self, output_header=None, write_date=False):
80 |         self.write_date = write_date
81 |         if output_header is not None:
82 |             self.write(output_header)
83 | 
84 |     def write(self, output):
85 |         if type(output) is str:
86 |             output = [output]
87 |         assert type(output) is list, 'Expected output to be list, found %s' % type(output)
88 |         for line in output:
89 |             if self.write_date:
90 |                 line = '%s: %s' % (time.strftime('%c'), line)
91 |             print line
92 | 
93 |     def close(self):
94 |         pass
95 | 
96 | 
97 | def get_simulation_report(simulation_arguments):
98 |     file_name = sys.argv[0]
99 |     if file_name[-3:] == ".py":
100 |         file_name = file_name[:-3].split("/")[-1]
101 |     date_str = file_name + "-" + time.strftime('Log-%y-%m-%d-%X')
102 | 
103 |     if simulation_arguments.log_folder is not None \
104 |             and os.path.isdir(simulation_arguments.log_folder):
105 |         output_path = simulation_arguments.log_folder + '/' + date_str.replace(' ', '-') + '.txt'
106 |         header = ['Starting simulation at %s.' % date_str, 'Log is also stored in output file at %s'
107 |                   % output_path]
108 |         return FileOutput(output_path, output_header=header, also_print=True, write_date=True)
109 |     else:
110 |         header = ['Starting simulation.',
111 |                   'WARNING: No log folder found, log is not stored elsewhere.']
112 |         return PrintOutput(output_header=header, write_date=True)
113 | 
114 | 
115 | class SimulationOutput(object):
116 | 
117 |     """
118 |     Class that collects and stores the output of all runs of one simulation on one dataset.
119 |     """
120 | 
121 |     def __init__(self, simulation_arguments, simulation_name, dataset, num_click_models,
122 |                  ranker_arguments, output_averager):
123 |         self._start_time = time.time()
124 |         self.run_index = 0
125 |         self.output_folder = simulation_arguments.output_folder
126 |         self.simulation_name = simulation_name
127 |         self.dataset_name = dataset.name
128 |         self.output_averager = output_averager
129 |         self.print_output = simulation_arguments.print_output
130 |         self._expected_runs = dataset.num_runs_per_fold * dataset.num_folds * num_click_models
131 |         self._closed = False
132 |         self.output_path = '%s/%s/%s.out' % (self.output_folder, self.dataset_name,
133 |                                              self.simulation_name)
134 |         combined_args = {
135 |             'simulation_arguments': vars(simulation_arguments),
136 |             'ranker_arguments': ranker_arguments,
137 |         }
138 |         if self.print_output:
139 |             output_header = json.dumps(combined_args, sort_keys=True,
140 |                                        indent=4, separators=(',', ': '))
141 |             self.file_output = BufferPrintOutput(output_header=output_header)
142 |         else:
143 |             output_header = json.dumps(combined_args, separators=(',',':'))
144 |             self.file_output = FileOutput(self.output_path, output_header=output_header,
145 |                                           close_between_writes=True, also_print=False,
146 |                                           write_date=False)
147 | 
148 |     def expected_runs(self):
149 |         return self._expected_runs
150 | 
151 |     def finished(self):
152 |         return self._closed and self.run_index == self._expected_runs
153 | 
154 |     def write_run_output(self, run_output):
155 |         assert not self._closed, 'Simulation Output (%s) written to after being closed.' \
\ 156 | % self.output_path 157 | 158 | if self.print_output: 159 | # self.file_output.write(json.dumps(run_output, sort_keys=True, 160 | # indent=4, separators=(',', ': '))) 161 | self.file_output.pretty_run_write(self.run_index, run_output) 162 | else: 163 | self.file_output.write('\n%s' % json.dumps(run_output)) 164 | 165 | self.run_index += 1 166 | if self.run_index >= self._expected_runs: 167 | self.close() 168 | 169 | def close(self, output_file=None): 170 | # self.file_output.write(['--------END--------']) 171 | # total_time = time.time() - self._start_time 172 | # seconds = total_time % 60 173 | # minutes = total_time / 60 % 60 174 | # hours = total_time / 3600 175 | # self.file_output.write(['Total time taken %02d:%02d:%02d' % (hours, minutes, seconds)]) 176 | self.file_output.close() 177 | self._closed = True 178 | if not self.print_output: 179 | self.output_averager.create_average_file(self) 180 | 181 | 182 | class BufferPrintOutput(object): 183 | 184 | def __init__(self, output_header=None): 185 | self._closed = False 186 | self._output_list = [] 187 | if not output_header is None: 188 | self.write(output_header) 189 | 190 | def flush(self): 191 | pass 192 | 193 | def write(self, output): 194 | assert not self._closed 195 | assert type(output) is str, 'Wrong output format %s' % type(output) 196 | self._output_list.append(output) 197 | 198 | def pretty_run_write(self, run_index, run_output): 199 | run_details = run_output['run_details'] 200 | run_lines = [ 201 | "RUN: %d" % run_index, 202 | "DATAFOLD: %s" % run_details['data folder'], 203 | "CLICK MODEL: %s" % run_details['click model'], 204 | "RUN TIME: %s (%.02f seconds)" % (timedelta(seconds=run_details['runtime']), 205 | run_details['runtime']) 206 | ] 207 | tag = run_details['held-out data'] 208 | for event in run_output['run_results']: 209 | str_line = str(event['iteration']) 210 | if 'display' in event: 211 | str_line += ' DISPLAY: %0.3f' % event['display'] 212 | if 'heldout' in event: 213 | str_line += ' %s: %0.3f' % (tag, event['heldout']) 214 | run_lines.append(str_line) 215 | for line in run_lines: 216 | self.write(line) 217 | 218 | def close(self): 219 | self._closed = True 220 | print 'Run Output\n' + '\n'.join(self._output_list) 221 | self._output_list = [] 222 | --------------------------------------------------------------------------------