├── .gitignore ├── LICENSE ├── README.md ├── README_poisoning_attacks.md ├── __init__.py ├── algorithms ├── DBGD │ ├── __init__.py │ ├── neural │ │ ├── __init__.py │ │ └── pdbgd.py │ ├── pdbgd.py │ ├── pdbgd_dsp.py │ ├── pmgd.py │ ├── pmgd_dsp.py │ ├── tdNSGD.py │ ├── tdNSGD_dsp.py │ ├── tddbgd.py │ └── tdmgd.py ├── PDGD │ ├── __init__.py │ ├── deeppdgd.py │ └── pdgd.py ├── __init__.py ├── baselines │ ├── __init__.py │ └── pairwise.py └── basiconlineranker.py ├── attack.sh ├── attack_graph.py ├── attacker_avg_summarize.py ├── attacker_weights ├── Weights_mq2007.txt ├── Weights_td2003.txt ├── Weights_web10k.txt └── Weights_yahoo.txt ├── graphs ├── makeaverages.py ├── makegraphs.py └── maketables.py ├── models ├── __init__.py ├── evolutionneuralmodel.py ├── linearmodel.py ├── neuralmodel.py └── neuralnet.py ├── multileaving ├── PairwisePreferenceMultileave.py ├── ProbabilisticMultileave.py ├── TeamDraftMultileave.py └── __init__.py ├── scripts ├── CIKM2018.py ├── Poisoning_attacks │ ├── attack_DBGD_99_lr.py │ ├── attack_DBGD_base_lr.py │ ├── attack_MGD_99_lr.py │ └── attack_MGD_base_lr.py ├── SIGIR2018.py ├── SIGIR2019.py ├── SIGIR2019_nsgd.py ├── __init__.py └── slurm │ └── SIGIR2019 │ ├── 0708.slurm │ ├── np.slurm │ ├── nsgd │ ├── 0708.slurm │ ├── np.slurm │ ├── web10k.slurm │ └── webscope1.slurm │ ├── web10k.slurm │ └── webscope1.slurm └── utils ├── __init__.py ├── argparsers ├── __init__.py └── simulationargparser.py ├── attackeraverager.py ├── attackeroutput.py ├── attacksimulation.py ├── averageoutput.py ├── clicks.py ├── dataset.py ├── datasetcollections.py ├── datasimulation.py ├── evaluate.py ├── rankings.py └── simulationoutput.py /.gitignore: -------------------------------------------------------------------------------- 1 | tmp/*.pickle 2 | 3 | # Compiled source # 4 | ################### 5 | *.pyc 6 | *.egg-info 7 | build/ 8 | dist/ 9 | 10 | # Files generated by eclipse # 11 | ############################## 12 | .coverage 13 | .project 14 | .pydevproject 15 | 16 | # OS generated files # 17 | ###################### 18 | .DS_Store 19 | .DS_Store? 20 | ._* 21 | .*.swp 22 | .Spotlight-V100 23 | .Trashes 24 | Icon? 25 | ehthumbs.db 26 | Thumbs.db 27 | 28 | # Other files / directories 29 | exp 30 | gurobi.log 31 | 32 | # Files generated by the runner script (click models) # 33 | outdir 34 | pdf 35 | pdf_test 36 | log_folder 37 | 38 | # Files generated by PyCharm 39 | .idea 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 H.R. Oosterhuis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Null Space Gradient Descent (NSGD) and Document Space Projected Dueling Bandit Gradient Descent (DBGD-DSP) 2 | This repository contains the code used to produce the experimental results found in "Efficient Exploration of Gradient Space for Online Learning to Rank" and "Variance Reduction in Gradient Exploration for Online Learning to Rank", published at SIGIR 2018 and SIGIR 2019, respectively. It was forked from Harrie Oosterhuis's repository for "Differentiable Unbiased Online Learning to Rank", published at CIKM 2018, at https://github.com/HarrieO/OnlineLearningToRank. 3 | 4 | NSGD Algorithm 5 | ------- 6 | This algorithm was developed to increase the efficiency of gradient-space exploration in online learning to rank. It does this in three steps. First, the null space of previously poorly performing directions is computed, and new directions are sampled from within this null space, which avoids repeatedly exploring unpromising directions. Second, a candidate preselection step chooses, for evaluation, the sampled directions that best differentiate the current query's documents. Third, in the event of a tie, a tie-breaking mechanism reevaluates the tied candidates on historically difficult queries and chooses a winner. 7 | 8 | DBGD-DSP Algorithm 9 | ------- 10 | This algorithm acts as a wrapper around other DBGD-style algorithms to reduce their ranker variance and improve overall performance in online learning to rank. DBGD-DSP works by modifying the winning ranker after the interleaved test. In particular, it projects the winning gradient onto the space spanned by the query-document feature vectors associated with the given query. This reduces the variance in gradient exploration by removing the component of the winning gradient that is orthogonal to the document space, and which therefore contributes nothing to the loss function or the true gradient estimate. 11 |
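To make these two ideas concrete, here is a minimal NumPy sketch of NSGD's null-space sampling and DBGD-DSP's document-space projection. It is illustrative only: the repository's actual logic lives in models/linearmodel.py (see `sample_candidates_null_space` and `update_to_mean_winners`), and the function names and array shapes below are assumptions made for the example.

```
import numpy as np

def sample_null_space_direction(bad_gradients):
  # NSGD step 1 (sketch): sample an exploration direction from the null
  # space of previously poorly performing directions. bad_gradients is
  # assumed to be an (m, d) array with one direction per row and m < d.
  _, s, vt = np.linalg.svd(bad_gradients)
  # Rows of vt beyond the numerical rank form an orthonormal basis of
  # the null space of bad_gradients.
  rank = int(np.sum(s > 1e-10))
  null_basis = vt[rank:]
  # Random combination of the basis vectors, normalized to unit length.
  direction = np.random.randn(null_basis.shape[0]).dot(null_basis)
  return direction / np.linalg.norm(direction)

def project_onto_document_space(gradient, doc_features):
  # DBGD-DSP projection (sketch): keep only the component of the winning
  # gradient (shape (d,)) that lies in the span of the examined document
  # feature vectors (shape (k, d)).
  coef = np.linalg.lstsq(doc_features.T, gradient, rcond=None)[0]
  return doc_features.T.dot(coef)
```

In this sketch the projection is computed with a least-squares solve rather than an explicit projection matrix, which is cheap when only the handful of documents the user examined is involved.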
12 | Usage 13 | ------- 14 | To run the code to generate experimental results like those found in our papers, you will need to run a command in the following format, using Python 2 (SIGIR2018.py, SIGIR2019.py, and SIGIR2019_nsgd.py are all run similarly): 15 | 16 | ``` 17 | python scripts/SIGIR2019.py [-h] [--n_runs N_RUNS] [--n_impr N_IMPRESSIONS] [--vali] 18 | [--vali_in_train] --data_sets DATA_SETS [DATA_SETS ...] 19 | [--output_folder OUTPUT_FOLDER] [--log_folder LOG_FOLDER] 20 | [--average_folder AVERAGE_FOLDER] [--small_dataset] 21 | --click_models CLICK_MODELS [CLICK_MODELS ...] 22 | [--print_freq PRINT_FREQ] [--print_logscale] 23 | [--print_output] [--max_folds MAX_FOLDS] 24 | [--n_proc N_PROCESSING] [--no_run_details] 25 | [--n_results N_RESULTS] [--skip_read_bin_data] 26 | [--skip_store_bin_data] [--train_only] [--all_train] 27 | [--nonrel_test] 28 | ``` 29 | 30 | In the command above, parameters within square brackets are optional. In our papers, we used the MQ2007 and MQ2008 datasets from LETOR 4.0, the Yahoo! Learning to Rank Challenge dataset, and the MSLR-WEB10K dataset. The possible click models are described in our papers: inf = informational, nav = navigational, and per = perfect. 31 | 32 | Poisoning Attacks 33 | ------- 34 | This repository also contains the code that we used to show the robustness of DBGD/MGD-based algorithms. Further details can be found here: [link](README_poisoning_attacks.md) 35 | 36 | Citation 37 | -------- 38 | 39 | If you use this code to produce results for your scientific publication, please refer to our SIGIR 2019 paper and/or SIGIR 2018 paper: 40 | 41 | ``` 42 | @inproceedings{wang2019variance, 43 | title={Variance Reduction in Gradient Exploration for Online Learning to Rank}, 44 | author={Wang, Huazheng and Kim, Sonwoo and McCord-Snook, Eric and Wu, Qingyun and Wang, Hongning}, 45 | booktitle={The 42nd International ACM SIGIR Conference on Research \& Development in Information Retrieval}, 46 | year={2019}, 47 | organization={ACM} 48 | } 49 | 50 | @inproceedings{wang2018efficient, 51 | title={Efficient exploration of gradient space for online learning to rank}, 52 | author={Wang, Huazheng and Langley, Ramsey and Kim, Sonwoo and McCord-Snook, Eric and Wang, Hongning}, 53 | booktitle={The 41st International ACM SIGIR Conference on Research \& Development in Information Retrieval}, 54 | year={2018}, 55 | organization={ACM} 56 | } 57 | ``` 58 | 59 | License 60 | ------- 61 | 62 | The contents of this repository are licensed under the [MIT license](LICENSE). If you modify its contents in any way, please link back to this repository. 63 | -------------------------------------------------------------------------------- /README_poisoning_attacks.md: -------------------------------------------------------------------------------- 1 | ## Poisoning Attacks on Online Learning to Rank 2 | 3 | This repository contains the code that we used to show the robustness of DBGD/MGD-based algorithms. 4 | 5 | The **attacker_weights** folder contains the attacker's weight files for 4 datasets. Additional weight files can be added here depending on the dataset (number of features). These files are read in **utils/attacksimulation.py**. 6 | 7 | Usage 8 | ------- 9 | To run the code to generate experimental results, you can simply run the attack.sh script. This script in turn calls another script present in the scripts/Poisoning_attacks directory. Four scripts are provided there, depending on the algorithm and the learning rate decay. 10 | 11 | An example of such a script invocation is given: 12 | ``` 13 | python2 scripts/Poisoning_attacks/attack_DBGD_base_lr.py --data_sets local_MQ2007 --attacker_click_model frequency_attack\ 14 | --click_models exper1 --log_folder ./log --output_folder ./output --average_folder ./average \ 15 | --n_impr 10000 --n_runs 10 --n_proc 10 --n_results 10 --start 0 --end 1 --which 1 --mf 5 --sd_const 2.0 --num_attacker_relevant 5 16 | ``` 17 | 18 | For the details of each of the arguments, see the **utils/argparsers/simulationargparser.py** file. 19 | 20 | Additionally, after running the experiments, two folders will be created, namely **attackeroutput** and **attackeraverage**. Graphs can be generated via the **attack_graph.py** script on the averaged output file.
Here is an example: 21 | 22 | ``` 23 | python3 attack_graph.py attackeraverage/MQ2007/attack/TD_DBGD_frequency_attack_10_res_0_start_1_end_1_half_10000_impressions0.9999977_lrdecay.out 24 | ``` 25 | 26 | 27 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/__init__.py -------------------------------------------------------------------------------- /algorithms/DBGD/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/DBGD/__init__.py -------------------------------------------------------------------------------- /algorithms/DBGD/neural/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/DBGD/neural/__init__.py -------------------------------------------------------------------------------- /algorithms/DBGD/neural/pdbgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import utils.rankings as rnk 7 | from models.evolutionneuralmodel import EvolutionNeuralModel 8 | from algorithms.DBGD.pdbgd import P_DBGD 9 | 10 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 11 | class Neural_P_DBGD(P_DBGD): 12 | 13 | def __init__(self, learning_rate, learning_rate_decay, 14 | hidden_layers, *args, **kargs): 15 | super(Neural_P_DBGD, self).__init__(learning_rate = learning_rate, 16 | learning_rate_decay = learning_rate_decay, 17 | *args, **kargs) 18 | self.model = EvolutionNeuralModel( 19 | n_features = self.n_features, 20 | learning_rate = learning_rate, 21 | n_candidates = 1, 22 | learning_rate_decay = learning_rate_decay, 23 | hidden_layers = hidden_layers) 24 | 25 | @staticmethod 26 | def default_parameters(): 27 | parent_parameters = P_DBGD.default_parameters() 28 | parent_parameters.update({ 29 | 'learning_rate': 0.01, 30 | 'learning_rate_decay': 1.0, 31 | 'PM_n_samples': 10000, 32 | 'PM_tau': 3.0, 33 | 'hidden_layers': [64], 34 | }) 35 | return parent_parameters 36 | -------------------------------------------------------------------------------- /algorithms/DBGD/pdbgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import utils.rankings as rnk 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | from multileaving.ProbabilisticMultileave import ProbabilisticMultileave 9 | 10 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 11 | class P_DBGD(TD_DBGD): 12 | 13 | def __init__(self, PM_n_samples, PM_tau, *args, **kargs): 14 | super(P_DBGD, self).__init__(*args, **kargs) 15 | self.multileaving = ProbabilisticMultileave( 16 | n_samples = PM_n_samples, 17 | tau = PM_tau, 18 | n_results=self.n_results) 19 | 20 | @staticmethod 21 | def default_parameters(): 22 | parent_parameters = TD_DBGD.default_parameters() 23 | parent_parameters.update({ 24 | 'learning_rate': 0.01, 25 | 'learning_rate_decay': 1.0, 26 | 
'PM_n_samples': 10000, 27 | 'PM_tau': 3.0, 28 | }) 29 | return parent_parameters 30 | 31 | def _create_train_ranking(self, query_id, query_feat, inverted): 32 | assert inverted==False 33 | self.model.sample_candidates() 34 | scores = self.model.candidate_score(query_feat) 35 | inverted_rankings = rnk.rank_single_query(scores, 36 | inverted=True, 37 | n_results=None) 38 | multileaved_list = self.multileaving.make_multileaving(inverted_rankings) 39 | return multileaved_list 40 | -------------------------------------------------------------------------------- /algorithms/DBGD/pdbgd_dsp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import utils.rankings as rnk 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | from multileaving.ProbabilisticMultileave import ProbabilisticMultileave 9 | import numpy as np 10 | import math 11 | 12 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 13 | class P_DBGD_DSP(TD_DBGD): 14 | 15 | def __init__(self, k_initial, k_increase, PM_n_samples, PM_tau, prev_qeury_len=None, docspace=[False,0], *args, **kargs): 16 | super(P_DBGD_DSP, self).__init__(*args, **kargs) 17 | 18 | self.multileaving = ProbabilisticMultileave( 19 | n_samples = PM_n_samples, 20 | tau = PM_tau, 21 | n_results=self.n_results) 22 | 23 | self.k_initial = k_initial 24 | self.k_increase = k_increase 25 | 26 | self.prev_qeury_len = prev_qeury_len # queue size of features from previous queries 27 | if prev_qeury_len: 28 | self.prev_feat_list = [] 29 | # for document space length experiment 30 | # docspace=[True,3] means use a superset of the document space the user examined, with three additional documents. 31 | self.docspace = docspace 32 | 33 | @staticmethod 34 | def default_parameters(): 35 | parent_parameters = TD_DBGD.default_parameters() 36 | parent_parameters.update({ 37 | 'learning_rate': 0.01, 38 | 'learning_rate_decay': 1.0, 39 | 'PM_n_samples': 10000, 40 | 'PM_tau': 3.0, 41 | }) 42 | return parent_parameters 43 | 44 | def _create_train_ranking(self, query_id, query_feat, inverted): 45 | # Save query_id to get access to query_feat when updating 46 | self.query_id = query_id 47 | assert inverted==False 48 | self.model.sample_candidates() 49 | scores = self.model.candidate_score(query_feat) 50 | inverted_rankings = rnk.rank_single_query(scores, 51 | inverted=True, 52 | n_results=None) 53 | multileaved_list = self.multileaving.make_multileaving(inverted_rankings) 54 | return multileaved_list 55 | 56 | 57 | def update_to_interaction(self, clicks, stop_index=None): 58 | 59 | winners = self.multileaving.winning_rankers(clicks) 60 | ############################################################### 61 | if True in clicks: 62 | # For projection 63 | # keep track of feature vectors of doc list 64 | viewed_list = [] 65 | # index of last click 66 | last_click = max(loc for loc, val in enumerate(clicks) if val == True) 67 | # prevent last_click+k from exceeding interleaved list length 68 | k_current = self.k_initial 69 | if self.k_increase: 70 | # gradually increase k 71 | k_current += int(self.n_interactions/1000) 72 | last_doc_index = min(last_click+k_current, len(self._last_ranking)) 73 | 74 | if self.docspace[0] and stop_index is not None: # for document space length experiment 75 | # create a sub/superset of the perfect document space the user examined. 76 | # user-examined documents come from ccm, which determines where the user leaves.
77 | last_doc_index = stop_index + self.docspace[1] + 1 # 1 added for stopping document, which has been examined. 78 | last_doc_index = max(last_doc_index,1) # At least 1 79 | last_doc_index = min(last_doc_index,len(self._last_ranking)) # At most length of current list 80 | 81 | query_feat = self.get_query_features(self.query_id, 82 | self._train_features, 83 | self._train_query_ranges) 84 | for i in range(last_doc_index): 85 | docid = self._last_ranking[i] 86 | feature = query_feat[docid] 87 | viewed_list.append(feature) 88 | add_list = viewed_list 89 | 90 | # Append feature vectors from previous queries 91 | if self.prev_qeury_len: 92 | if len(self.prev_feat_list) > 0: 93 | viewed_list = np.append(viewed_list,self.prev_feat_list, axis=0) 94 | 95 | # Add examined feature vectors of current query to be used in later iterations 96 | for i in add_list: 97 | if len(self.prev_feat_list) >= self.prev_qeury_len : 98 | self.prev_feat_list.pop(0) # Remove oldest document feature. 99 | # if prev_feat_list is not filled up, add current list 100 | self.prev_feat_list.append(i) 101 | 102 | self.model.update_to_mean_winners(winners,viewed_list) 103 | ############################################################### 104 | else: 105 | self.model.update_to_mean_winners(winners) 106 | 107 | -------------------------------------------------------------------------------- /algorithms/DBGD/pmgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from algorithms.DBGD.pdbgd import P_DBGD 7 | from models.linearmodel import LinearModel 8 | 9 | 10 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 11 | class P_MGD(P_DBGD): 12 | 13 | def __init__(self, n_candidates, *args, **kargs): 14 | super(P_MGD, self).__init__(*args, **kargs) 15 | self.n_candidates = n_candidates 16 | self.model = LinearModel(n_features = self.n_features, 17 | learning_rate = self.learning_rate, 18 | n_candidates = self.n_candidates) 19 | 20 | 21 | @staticmethod 22 | def default_parameters(): 23 | parent_parameters = P_DBGD.default_parameters() 24 | parent_parameters.update({ 25 | 'n_candidates': 49, 26 | }) 27 | return parent_parameters 28 | -------------------------------------------------------------------------------- /algorithms/DBGD/pmgd_dsp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from algorithms.DBGD.pdbgd import P_DBGD 7 | import utils.rankings as rnk 8 | from models.linearmodel import LinearModel 9 | import numpy as np 10 | import math 11 | 12 | 13 | # Probabilistic Interleaving Dueling Bandit Gradient Descent 14 | class P_MGD_DSP(P_DBGD): 15 | 16 | def __init__(self, k_initial, k_increase, n_candidates, prev_qeury_len=None, docspace=[False,0], *args, **kargs): 17 | super(P_MGD_DSP, self).__init__(*args, **kargs) 18 | self.n_candidates = n_candidates 19 | self.model = LinearModel(n_features = self.n_features, 20 | learning_rate = self.learning_rate, 21 | n_candidates = self.n_candidates) 22 | 23 | self.k_initial = k_initial 24 | self.k_increase = k_increase 25 | 26 | self.prev_qeury_len = prev_qeury_len # queue size of features from previous queries 27 | if prev_qeury_len: 28 | self.prev_feat_list = [] 29 | # for document space length experiment 30 | # docspace=[True,3] means use 
a superset of the document space the user examined, with three additional documents. 31 | self.docspace = docspace 32 | 33 | @staticmethod 34 | def default_parameters(): 35 | parent_parameters = P_DBGD.default_parameters() 36 | parent_parameters.update({ 37 | 'n_candidates': 49, 38 | }) 39 | return parent_parameters 40 | 41 | def _create_train_ranking(self, query_id, query_feat, inverted): 42 | # Save query_id to get access to query_feat when updating 43 | self.query_id = query_id 44 | assert inverted==False 45 | self.model.sample_candidates() 46 | scores = self.model.candidate_score(query_feat) 47 | inverted_rankings = rnk.rank_single_query(scores, 48 | inverted=True, 49 | n_results=None) 50 | multileaved_list = self.multileaving.make_multileaving(inverted_rankings) 51 | return multileaved_list 52 | 53 | def update_to_interaction(self, clicks, stop_index=None): 54 | 55 | winners = self.multileaving.winning_rankers(clicks) 56 | ############################################################### 57 | if True in clicks: 58 | # For projection 59 | # keep track of feature vectors of doc list 60 | viewed_list = [] 61 | # index of last click 62 | last_click = max(loc for loc, val in enumerate(clicks) if val == True) 63 | # prevent last_click+k from exceeding interleaved list length 64 | k_current = self.k_initial 65 | if self.k_increase: 66 | # gradually increase k 67 | k_current += int(self.n_interactions/1000) 68 | last_doc_index = min(last_click+k_current, len(self._last_ranking)) 69 | 70 | if self.docspace[0] and stop_index is not None: # for document space length experiment 71 | # create a sub/superset of the perfect document space the user examined. 72 | # user-examined documents come from ccm, which determines where the user leaves. 73 | last_doc_index = stop_index + self.docspace[1] + 1 # 1 added for stopping document, which has been examined. 74 | last_doc_index = max(last_doc_index,1) # At least 1 75 | last_doc_index = min(last_doc_index,len(self._last_ranking)) # At most length of current list 76 | 77 | query_feat = self.get_query_features(self.query_id, 78 | self._train_features, 79 | self._train_query_ranges) 80 | for i in range(last_doc_index): 81 | docid = self._last_ranking[i] 82 | feature = query_feat[docid] 83 | viewed_list.append(feature) 84 | add_list = viewed_list 85 | 86 | # Append feature vectors from previous queries 87 | if self.prev_qeury_len: 88 | if len(self.prev_feat_list) > 0: 89 | viewed_list = np.append(viewed_list,self.prev_feat_list, axis=0) 90 | 91 | # Add examined feature vectors of current query to be used in later iterations 92 | for i in add_list: 93 | if len(self.prev_feat_list) >= self.prev_qeury_len : 94 | self.prev_feat_list.pop(0) # Remove oldest document feature.
95 | # if prev_feat_list is not filled up, add current list 96 | self.prev_feat_list.append(i) 97 | 98 | self.model.update_to_mean_winners(winners,viewed_list) 99 | ############################################################### 100 | else: 101 | self.model.update_to_mean_winners(winners) -------------------------------------------------------------------------------- /algorithms/DBGD/tdNSGD.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from models.linearmodel import LinearModel 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | import numpy as np 9 | from sys import maxint 10 | import copy 11 | from scipy.spatial.distance import cosine 12 | import utils.rankings as rnk 13 | # Dueling Bandit Gradient Descent 14 | class TD_NSGD(TD_DBGD): 15 | 16 | def __init__(self, n_candidates, GRAD_SIZE, EXP_SIZE, TB_QUEUE_SIZE=None, TB_WINDOW_SIZE=None, *args, **kargs): 17 | super(TD_NSGD, self).__init__(*args, **kargs) 18 | self.model = LinearModel(n_features = self.n_features, 19 | learning_rate = self.learning_rate, 20 | n_candidates = n_candidates) 21 | self.GRAD_SIZE = GRAD_SIZE 22 | self.EXP_SIZE = EXP_SIZE 23 | self.TB_QUEUE_SIZE = TB_QUEUE_SIZE 24 | self.TB_WINDOW_SIZE = TB_WINDOW_SIZE 25 | self.sample_basis = True 26 | self.clicklist = np.empty([self.GRAD_SIZE,1], dtype=int) #click array 27 | self.grad = np.zeros([self.GRAD_SIZE,self.n_features], dtype=float) 28 | self.gradCol = 0 29 | 30 | # DQ tie-break related lists 31 | self.difficult_NDCG =[] 32 | self.difficult_queries =[] 33 | self.difficult_document =[] 34 | self.difficult_time =[] 35 | self.query_id = 0 36 | 37 | @staticmethod 38 | def default_parameters(): 39 | parent_parameters = TD_DBGD.default_parameters() 40 | parent_parameters.update({ 41 | 'n_candidates': 9, 42 | }) 43 | return parent_parameters 44 | 45 | def update_to_interaction(self, clicks, stop_index=None): 46 | winners, ranker_clicks = self.multileaving.winning_rankers_with_clicks(clicks) 47 | 48 | # Fill out recent difficult query queues. 
49 | if self.TB_QUEUE_SIZE > 0: 50 | self.fill_difficult_query(clicks) 51 | # Trigger difficult-query tie-break strategy 52 | if len(self.difficult_queries) < self.TB_QUEUE_SIZE and len(winners) > 1: 53 | winners = self.tieBreak_difficultQuery(winners) 54 | 55 | self.model.update_to_mean_winners(winners) 56 | 57 | cl_sorted = sorted(ranker_clicks) # in ascending order 58 | for i in range(1, len(ranker_clicks)): 59 | # only save subset of rankers (worst 4 out of 9 rankers) 60 | # add if current cl is smaller than or equal to maximum from the set of candidates 61 | if ranker_clicks[i] <= cl_sorted[3] and ranker_clicks[i] max and j not in nums: 80 | max = self.clicklist[j] # The better cl value to be excluded 81 | n = j # index of it 82 | nums.append(n) 83 | 84 | # create subset of gradient matrix 85 | grad_temp = np.zeros([self.EXP_SIZE, self.n_features], dtype=float) 86 | c = 0 87 | for i in range(0,self.GRAD_SIZE): 88 | if i not in nums: 89 | # The worst 'EXP_SIZE' gradients from grad[] added to grad_temp 90 | grad_temp[c] = copy.deepcopy(self.grad[i]) 91 | c = c + 1 92 | 93 | self.model.sample_candidates_null_space(grad_temp, query_feat, self.sample_basis) 94 | scores = self.model.candidate_score(query_feat) 95 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 96 | multileaved_list = self.multileaving.make_multileaving(rankings) 97 | return multileaved_list 98 | 99 | def fill_difficult_query(self, clicks): 100 | # Set up for tie breaker - keep track of difficult queries 101 | # Find the rank of first clicked document 102 | ndcg_current = 0 103 | clickedList = [] 104 | for count, elem in enumerate(clicks): 105 | if elem == 1: # if clicked 106 | ndcg_current += 1 / (count + 1.0) 107 | # Keep track of clicked documents of current query 108 | clickedList.append(self._last_ranking[count]) 109 | 110 | # If the difficult-query queue for tie breaking is not filled up, add the current query 111 | if len(self.difficult_NDCG) < self.TB_QUEUE_SIZE and ndcg_current > 0: 112 | self.difficult_NDCG.append(ndcg_current) 113 | self.difficult_queries.append(self.query_id) 114 | self.difficult_document.append(clickedList) # first clicked doc to follow 115 | self.difficult_time.append(self.n_interactions) 116 | else: 117 | # If already filled up, check if current query is more difficult than any saved query.
118 | if len(self.difficult_NDCG) > 0: 119 | flag = False 120 | for i in range(len(self.difficult_NDCG)): 121 | if self.n_interactions - self.difficult_time[i] > self.TB_WINDOW_SIZE: 122 | # Maintain queries within the window size 123 | flag = True 124 | index = i 125 | break 126 | if not flag and max(self.difficult_NDCG) > ndcg_current and ndcg_current > 0: 127 | # Current query is more difficult than one of the queued ones 128 | flag = True 129 | index = self.difficult_NDCG.index(max(self.difficult_NDCG)) 130 | if flag: 131 | self.difficult_NDCG[index] = ndcg_current 132 | self.difficult_queries[index] = self.query_id 133 | self.difficult_document[index] = clickedList 134 | self.difficult_time[index] = self.n_interactions 135 | 136 | def tieBreak_difficultQuery(self, winners): 137 | # scoreList keeps track of how each tied candidate performs in tie breaking 138 | scoreList = np.zeros(self.model.n_models) 139 | # Iterate through the stored difficult queries 140 | for count_q, diff_query in enumerate(self.difficult_queries): 141 | query_feat = self.get_query_features(diff_query, 142 | self._train_features, 143 | self._train_query_ranges) 144 | scores = self.model.candidate_score(query_feat) 145 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 146 | 147 | # Iterate through tied candidates 148 | for winner in winners: 149 | candidate_NDCG = 0.0 150 | for count_d, doc in enumerate(self.difficult_document[count_q]): 151 | # Calculate NDCG performance in current difficult query 152 | diff_doc_rank = np.where(rankings[winner] == self.difficult_document[count_q][count_d])[0][0] 153 | temp = 1 / (diff_doc_rank + 1.0) 154 | candidate_NDCG += 1 / (diff_doc_rank + 1.0) 155 | 156 | # Add the NDCG value of the difficult query 157 | scoreList[winner] += candidate_NDCG 158 | # Ranker with the highest sum of NDCGs is the winner 159 | maxRank_score = np.max(scoreList[np.nonzero(scoreList)]) 160 | winner = scoreList.tolist().index(maxRank_score) 161 | return [winner] 162 | -------------------------------------------------------------------------------- /algorithms/DBGD/tdNSGD_dsp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from models.linearmodel import LinearModel 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | import numpy as np 9 | from sys import maxint 10 | import copy 11 | from scipy.spatial.distance import cosine 12 | import utils.rankings as rnk 13 | # Dueling Bandit Gradient Descent 14 | class TD_NSGD_DSP(TD_DBGD): 15 | 16 | def __init__(self, n_candidates, GRAD_SIZE, EXP_SIZE, k_initial, k_increase, TB_QUEUE_SIZE=None, TB_WINDOW_SIZE=None, prev_qeury_len=None, *args, **kargs): 17 | super(TD_NSGD_DSP, self).__init__(*args, **kargs) 18 | self.model = LinearModel(n_features = self.n_features, 19 | learning_rate = self.learning_rate, 20 | n_candidates = n_candidates) 21 | self.GRAD_SIZE = GRAD_SIZE 22 | self.EXP_SIZE = EXP_SIZE 23 | self.TB_QUEUE_SIZE = TB_QUEUE_SIZE 24 | self.TB_WINDOW_SIZE = TB_WINDOW_SIZE 25 | self.sample_basis = True 26 | self.clicklist = np.empty([self.GRAD_SIZE,1], dtype=int) #click array 27 | self.grad = np.zeros([self.GRAD_SIZE,self.n_features], dtype=float) 28 | self.gradCol = 0 29 | 30 | # DQ tie-break related lists 31 | self.difficult_NDCG =[] 32 | self.difficult_queries =[] 33 | self.difficult_document =[] 34 | self.difficult_time =[] 35 | self.query_id = 0 36 |
self.k_initial = k_initial 38 | self.k_increase = k_increase 39 | 40 | # Secondary techniques 41 | self.prev_qeury_len = prev_qeury_len 42 | if prev_qeury_len: 43 | self.prev_feat_list = [] 44 | 45 | @staticmethod 46 | def default_parameters(): 47 | parent_parameters = TD_DBGD.default_parameters() 48 | parent_parameters.update({ 49 | 'n_candidates': 9, 50 | }) 51 | return parent_parameters 52 | 53 | def update_to_interaction(self, clicks, stop_index=None): 54 | 55 | winners, ranker_clicks = self.multileaving.winning_rankers_with_clicks(clicks) 56 | 57 | # Fill out recent difficult query queues. 58 | if self.TB_QUEUE_SIZE > 0: 59 | self.fill_difficult_query(clicks) 60 | # Trigger difficult-query tie-break strategy 61 | if len(self.difficult_queries) < self.TB_QUEUE_SIZE and len(winners) > 1: 62 | winners = self.tieBreak_difficultQuery(winners) 63 | 64 | 65 | ############################################################### 66 | if True in clicks: 67 | # For projection 68 | # keep track of feature vectors of doc list 69 | viewed_list = [] 70 | # index of last click 71 | last_click = max(loc for loc, val in enumerate(clicks) if val == True) 72 | # prevent last_click+k from exceeding interleaved list length 73 | k_current = self.k_initial 74 | if self.k_increase: 75 | # gradually increase k 76 | k_current += int(self.n_interactions/1000) 77 | last_doc_index = min(last_click+k_current, len(self._last_ranking)-1) 78 | 79 | query_feat = self.get_query_features(self.query_id, 80 | self._train_features, 81 | self._train_query_ranges) 82 | for i in range(last_doc_index): 83 | docid = self._last_ranking[i] 84 | feature = query_feat[docid] 85 | viewed_list.append(feature) 86 | self.model.update_to_mean_winners(winners,viewed_list) 87 | ############################################################### 88 | else: 89 | self.model.update_to_mean_winners(winners) 90 | 91 | cl_sorted = sorted(ranker_clicks) # in ascending order 92 | for i in range(1, len(ranker_clicks)): 93 | # only save subset of rankers (worst 4 out of 9 rankers) 94 | # add if current cl is smaller than or equal to maximum from the set of candidates 95 | if ranker_clicks[i] <= cl_sorted[3] and ranker_clicks[i] max and j not in nums: 114 | max = self.clicklist[j] # The better cl value to be excluded 115 | n = j # index of it 116 | nums.append(n) 117 | 118 | # create subset of gradient matrix 119 | grad_temp = np.zeros([self.EXP_SIZE, self.n_features], dtype=float) 120 | c = 0 121 | for i in range(0,self.GRAD_SIZE): 122 | if i not in nums: 123 | # The worst 'EXP_SIZE' gradients from grad[] added to grad_temp 124 | grad_temp[c] = copy.deepcopy(self.grad[i]) 125 | c = c + 1 126 | 127 | self.model.sample_candidates_null_space(grad_temp, query_feat, self.sample_basis) 128 | scores = self.model.candidate_score(query_feat) 129 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 130 | multileaved_list = self.multileaving.make_multileaving(rankings) 131 | return multileaved_list 132 | 133 | def fill_difficult_query(self, clicks): 134 | # Set up for tie breaker - keep track of difficult queries 135 | # Find the rank of first clicked document 136 | ndcg_current = 0 137 | clickedList = [] 138 | for count, elem in enumerate(clicks): 139 | if elem == 1: # if clicked 140 | ndcg_current += 1 / (count + 1.0) 141 | # Keep track of clicked documents of current query 142 | clickedList.append(self._last_ranking[count]) 143 | 144 | # If the difficult-query queue for tie breaking is not filled up, add the current query 145 | if
len(self.difficult_NDCG) < self.TB_QUEUE_SIZE and ndcg_current > 0: 146 | self.difficult_NDCG.append(ndcg_current) 147 | self.difficult_queries.append(self.query_id) 148 | self.difficult_document.append(clickedList) # first clicked doc to follow 149 | self.difficult_time.append(self.n_interactions) 150 | else: 151 | # If already filled up, check if current query is more difficult than any saved query. 152 | if len(self.difficult_NDCG) > 0: 153 | flag = False 154 | for i in range(len(self.difficult_NDCG)): 155 | if self.n_interactions - self.difficult_time[i] > self.TB_WINDOW_SIZE: 156 | # Maintain queries within the window size 157 | flag = True 158 | index = i 159 | break 160 | if not flag and max(self.difficult_NDCG) > ndcg_current and ndcg_current > 0: 161 | # Current query is more difficult than one of the queued ones 162 | flag = True 163 | index = self.difficult_NDCG.index(max(self.difficult_NDCG)) 164 | if flag: 165 | self.difficult_NDCG[index] = ndcg_current 166 | self.difficult_queries[index] = self.query_id 167 | self.difficult_document[index] = clickedList 168 | self.difficult_time[index] = self.n_interactions 169 | 170 | def tieBreak_difficultQuery(self, winners): 171 | # scoreList keeps track of how each tied candidate performs in tie breaking 172 | scoreList = np.zeros(self.model.n_models) 173 | # Iterate through the stored difficult queries 174 | for count_q, diff_query in enumerate(self.difficult_queries): 175 | query_feat = self.get_query_features(diff_query, 176 | self._train_features, 177 | self._train_query_ranges) 178 | scores = self.model.candidate_score(query_feat) 179 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 180 | 181 | # Iterate through tied candidates 182 | for winner in winners: 183 | candidate_NDCG = 0.0 184 | for count_d, doc in enumerate(self.difficult_document[count_q]): 185 | # Calculate NDCG performance in current difficult query 186 | diff_doc_rank = np.where(rankings[winner] == self.difficult_document[count_q][count_d])[0][0] 187 | temp = 1 / (diff_doc_rank + 1.0) 188 | candidate_NDCG += 1 / (diff_doc_rank + 1.0) 189 | 190 | # Add the NDCG value of diff.
query 191 | scoreList[winner] += candidate_NDCG 192 | # Ranker with the highest sum of NDCGs is the winner 193 | maxRank_score = np.max(scoreList[np.nonzero(scoreList)]) 194 | winner = scoreList.tolist().index(maxRank_score) 195 | return [winner] 196 | -------------------------------------------------------------------------------- /algorithms/DBGD/tddbgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | from models.linearmodel import LinearModel 9 | from algorithms.basiconlineranker import BasicOnlineRanker 10 | from multileaving.TeamDraftMultileave import TeamDraftMultileave 11 | 12 | # Dueling Bandit Gradient Descent 13 | class TD_DBGD(BasicOnlineRanker): 14 | 15 | def __init__(self, learning_rate, learning_rate_decay, 16 | *args, **kargs): 17 | super(TD_DBGD, self).__init__(*args, **kargs) 18 | self.learning_rate = learning_rate 19 | self.model = LinearModel(n_features = self.n_features, 20 | learning_rate = learning_rate, 21 | n_candidates = 1, 22 | learning_rate_decay = learning_rate_decay) 23 | self.multileaving = TeamDraftMultileave( 24 | n_results=self.n_results) 25 | 26 | 27 | @staticmethod 28 | def default_parameters(): 29 | parent_parameters = BasicOnlineRanker.default_parameters() 30 | parent_parameters.update({ 31 | 'learning_rate': 0.01, 32 | 'learning_rate_decay': 1.0, 33 | }) 34 | return parent_parameters 35 | 36 | def get_test_rankings(self, features, 37 | query_ranges, inverted=True): 38 | scores = self.model.score(features) 39 | return rnk.rank_multiple_queries( 40 | scores, 41 | query_ranges, 42 | inverted=inverted, 43 | n_results=self.n_results) 44 | 45 | def _create_train_ranking(self, query_id, query_feat, inverted): 46 | assert inverted == False 47 | self.model.sample_candidates() 48 | scores = self.model.candidate_score(query_feat) 49 | rankings = rnk.rank_single_query(scores, inverted=False, n_results=self.n_results) 50 | multileaved_list = self.multileaving.make_multileaving(rankings) 51 | return multileaved_list 52 | 53 | def update_to_interaction(self, clicks): 54 | winners = self.multileaving.winning_rankers(clicks) 55 | self.model.update_to_mean_winners(winners) 56 | -------------------------------------------------------------------------------- /algorithms/DBGD/tdmgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from models.linearmodel import LinearModel 7 | from algorithms.DBGD.tddbgd import TD_DBGD 8 | 9 | 10 | # Dueling Bandit Gradient Descent 11 | class TD_MGD(TD_DBGD): 12 | 13 | def __init__(self, n_candidates, *args, **kargs): 14 | super(TD_MGD, self).__init__(*args, **kargs) 15 | self.model = LinearModel(n_features = self.n_features, 16 | learning_rate = self.learning_rate, 17 | n_candidates = n_candidates, 18 | learning_rate_decay = self.model.learning_rate_decay) 19 | 20 | @staticmethod 21 | def default_parameters(): 22 | parent_parameters = TD_DBGD.default_parameters() 23 | parent_parameters.update({ 24 | 'n_candidates': 9, 25 | }) 26 | return parent_parameters 27 | -------------------------------------------------------------------------------- /algorithms/PDGD/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/PDGD/__init__.py -------------------------------------------------------------------------------- /algorithms/PDGD/deeppdgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | from models.neuralmodel import NeuralModel 9 | from algorithms.PDGD.pdgd import PDGD 10 | 11 | # Pairwise Differentiable Gradient Descent 12 | class DeepPDGD(PDGD): 13 | 14 | def __init__(self, hidden_layers, *args, **kargs): 15 | super(DeepPDGD, self).__init__(*args, **kargs) 16 | self.model = NeuralModel(n_features = self.n_features, 17 | learning_rate = self.learning_rate, 18 | learning_rate_decay = self.learning_rate_decay, 19 | hidden_layers = hidden_layers) 20 | 21 | @staticmethod 22 | def default_parameters(): 23 | parent_parameters = PDGD.default_parameters() 24 | parent_parameters.update({ 25 | 'learning_rate': 0.01, 26 | 'hidden_layers': [64], 27 | }) 28 | return parent_parameters 29 | -------------------------------------------------------------------------------- /algorithms/PDGD/pdgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | from models.linearmodel import LinearModel 9 | from algorithms.basiconlineranker import BasicOnlineRanker 10 | 11 | # Pairwise Differentiable Gradient Descent 12 | class PDGD(BasicOnlineRanker): 13 | 14 | def __init__(self, learning_rate, learning_rate_decay, 15 | *args, **kargs): 16 | super(PDGD, self).__init__(*args, **kargs) 17 | self.learning_rate = learning_rate 18 | self.learning_rate_decay = learning_rate_decay 19 | self.model = LinearModel(n_features = self.n_features, 20 | learning_rate = learning_rate, 21 | learning_rate_decay = learning_rate_decay, 22 | n_candidates = 1) 23 | 24 | 25 | @staticmethod 26 | def default_parameters(): 27 | parent_parameters = BasicOnlineRanker.default_parameters() 28 | parent_parameters.update({ 29 | 'learning_rate': 0.1, 30 | 'learning_rate_decay': 1.0, 31 | }) 32 | return parent_parameters 33 | 34 | def get_test_rankings(self, features, 35 | query_ranges, inverted=True): 36 | scores = -self.model.score(features) 37 | return rnk.rank_multiple_queries( 38 | scores, 39 | query_ranges, 40 | inverted=inverted, 41 | n_results=self.n_results) 42 | 43 | def _create_train_ranking(self, query_id, query_feat, inverted): 44 | assert inverted == False 45 | n_docs = query_feat.shape[0] 46 | k = np.minimum(self.n_results, n_docs) 47 | self.doc_scores = self.model.score(query_feat) 48 | self.doc_scores += 18 - np.amax(self.doc_scores) 49 | self.ranking = self._recursive_choice(np.copy(self.doc_scores), 50 | np.array([], dtype=np.int32), 51 | k) 52 | self._last_query_feat = query_feat 53 | return self.ranking 54 | 55 | def _recursive_choice(self, scores, incomplete_ranking, k_left): 56 | n_docs = scores.shape[0] 57 | scores[incomplete_ranking] = np.amin(scores) 58 | scores += 18 - np.amax(scores) 59 | exp_scores = np.exp(scores) 60 | exp_scores[incomplete_ranking] = 0 61 | probs = exp_scores/np.sum(exp_scores) 62 | safe_n = np.sum(probs > 10**(-4)/n_docs) 63 | safe_k = np.minimum(safe_n, k_left) 64 
| 65 | next_ranking = np.random.choice(np.arange(n_docs), 66 | replace=False, 67 | p=probs, 68 | size=safe_k) 69 | ranking = np.concatenate((incomplete_ranking, next_ranking)) 70 | 71 | k_left = k_left - safe_k 72 | if k_left > 0: 73 | return self._recursive_choice(scores, ranking, k_left) 74 | else: 75 | return ranking 76 | 77 | def update_to_interaction(self, clicks): 78 | if np.any(clicks): 79 | self._update_to_clicks(clicks) 80 | 81 | def _update_to_clicks(self, clicks): 82 | n_docs = self.ranking.shape[0] 83 | cur_k = np.minimum(n_docs, self.n_results) 84 | 85 | included = np.ones(cur_k, dtype=np.int32) 86 | if not clicks[-1]: 87 | included[1:] = np.cumsum(clicks[::-1])[:0:-1] 88 | neg_ind = np.where(np.logical_xor(clicks, included))[0] 89 | pos_ind = np.where(clicks)[0] 90 | 91 | n_pos = pos_ind.shape[0] 92 | n_neg = neg_ind.shape[0] 93 | n_pairs = n_pos*n_neg 94 | 95 | if n_pairs == 0: 96 | return 97 | 98 | pos_r_ind = self.ranking[pos_ind] 99 | neg_r_ind = self.ranking[neg_ind] 100 | 101 | pos_scores = self.doc_scores[pos_r_ind] 102 | neg_scores = self.doc_scores[neg_r_ind] 103 | 104 | log_pair_pos = np.tile(pos_scores, n_neg) 105 | log_pair_neg = np.repeat(neg_scores, n_pos) 106 | 107 | pair_trans = 18 - np.maximum(log_pair_pos, log_pair_neg) 108 | exp_pair_pos = np.exp(log_pair_pos + pair_trans) 109 | exp_pair_neg = np.exp(log_pair_neg + pair_trans) 110 | 111 | pair_denom = (exp_pair_pos + exp_pair_neg) 112 | pair_w = np.maximum(exp_pair_pos, exp_pair_neg) 113 | pair_w /= pair_denom 114 | pair_w /= pair_denom 115 | pair_w *= np.minimum(exp_pair_pos, exp_pair_neg) 116 | 117 | pair_w *= self._calculate_unbias_weights(pos_ind, neg_ind) 118 | 119 | reshaped = np.reshape(pair_w, (n_neg, n_pos)) 120 | pos_w = np.sum(reshaped, axis=0) 121 | neg_w = -np.sum(reshaped, axis=1) 122 | 123 | all_w = np.concatenate([pos_w, neg_w]) 124 | all_ind = np.concatenate([pos_r_ind, neg_r_ind]) 125 | 126 | self.model.update_to_documents(all_ind, 127 | all_w) 128 | 129 | def _calculate_unbias_weights(self, pos_ind, neg_ind): 130 | ranking_prob = self._calculate_observed_prob(pos_ind, neg_ind, 131 | self.doc_scores) 132 | flipped_prob = self._calculate_flipped_prob(pos_ind, neg_ind, 133 | self.doc_scores) 134 | return flipped_prob / (ranking_prob + flipped_prob) 135 | 136 | def _calculate_observed_prob(self, pos_ind, neg_ind, doc_scores): 137 | n_pos = pos_ind.shape[0] 138 | n_neg = neg_ind.shape[0] 139 | n_pairs = n_pos * n_neg 140 | n_results = self.ranking.shape[0] 141 | n_docs = doc_scores.shape[0] 142 | 143 | results_i = np.arange(n_results) 144 | pair_i = np.arange(n_pairs) 145 | doc_i = np.arange(n_docs) 146 | 147 | pos_pair_i = np.tile(pos_ind, n_neg) 148 | neg_pair_i = np.repeat(neg_ind, n_pos) 149 | 150 | min_pair_i = np.minimum(pos_pair_i, neg_pair_i) 151 | max_pair_i = np.maximum(pos_pair_i, neg_pair_i) 152 | range_mask = np.logical_and(min_pair_i[:, None] <= results_i, 153 | max_pair_i[:, None] >= results_i) 154 | 155 | safe_log = np.tile(doc_scores[None, :], 156 | [n_results, 1]) 157 | 158 | mask = np.zeros((n_results, n_docs)) 159 | mask[results_i[1:], self.ranking[:-1]] = True 160 | mask = np.cumsum(mask, axis=0).astype(bool) 161 | 162 | safe_log[mask] = np.amin(safe_log) 163 | safe_max = np.amax(safe_log, axis=1) 164 | safe_log -= safe_max[:, None] - 18 165 | safe_exp = np.exp(safe_log) 166 | safe_exp[mask] = 0 167 | 168 | ranking_log = doc_scores[self.ranking] - safe_max + 18 169 | ranking_exp = np.exp(ranking_log) 170 | 171 | safe_denom = np.sum(safe_exp, axis=1) 172 | ranking_prob = 
ranking_exp/safe_denom 173 | 174 | tiled_prob = np.tile(ranking_prob[None, :], [n_pairs, 1]) 175 | 176 | safe_prob = np.ones((n_pairs, n_results)) 177 | safe_prob[range_mask] = tiled_prob[range_mask] 178 | 179 | safe_pair_prob = np.prod(safe_prob, axis=1) 180 | 181 | return safe_pair_prob 182 | 183 | def _calculate_flipped_prob(self, pos_ind, neg_ind, doc_scores): 184 | n_pos = pos_ind.shape[0] 185 | n_neg = neg_ind.shape[0] 186 | n_pairs = n_pos * n_neg 187 | n_results = self.ranking.shape[0] 188 | n_docs = doc_scores.shape[0] 189 | 190 | results_i = np.arange(n_results) 191 | pair_i = np.arange(n_pairs) 192 | doc_i = np.arange(n_docs) 193 | 194 | pos_pair_i = np.tile(pos_ind, n_neg) 195 | neg_pair_i = np.repeat(neg_ind, n_pos) 196 | 197 | flipped_rankings = np.tile(self.ranking[None, :], 198 | [n_pairs, 1]) 199 | flipped_rankings[pair_i, pos_pair_i] = self.ranking[neg_pair_i] 200 | flipped_rankings[pair_i, neg_pair_i] = self.ranking[pos_pair_i] 201 | 202 | min_pair_i = np.minimum(pos_pair_i, neg_pair_i) 203 | max_pair_i = np.maximum(pos_pair_i, neg_pair_i) 204 | range_mask = np.logical_and(min_pair_i[:, None] <= results_i, 205 | max_pair_i[:, None] >= results_i) 206 | 207 | flipped_log = doc_scores[flipped_rankings] 208 | 209 | safe_log = np.tile(doc_scores[None, None, :], 210 | [n_pairs, n_results, 1]) 211 | 212 | results_ij = np.tile(results_i[None, 1:], [n_pairs, 1]) 213 | pair_ij = np.tile(pair_i[:, None], [1, n_results-1]) 214 | mask = np.zeros((n_pairs, n_results, n_docs)) 215 | mask[pair_ij, results_ij, flipped_rankings[:, :-1]] = True 216 | mask = np.cumsum(mask, axis=1).astype(bool) 217 | 218 | safe_log[mask] = np.amin(safe_log) 219 | safe_max = np.amax(safe_log, axis=2) 220 | safe_log -= safe_max[:, :, None] - 18 221 | flipped_log -= safe_max - 18 222 | flipped_exp = np.exp(flipped_log) 223 | 224 | safe_exp = np.exp(safe_log) 225 | safe_exp[mask] = 0 226 | safe_denom = np.sum(safe_exp, axis=2) 227 | safe_prob = np.ones((n_pairs, n_results)) 228 | safe_prob[range_mask] = (flipped_exp/safe_denom)[range_mask] 229 | 230 | safe_pair_prob = np.prod(safe_prob, axis=1) 231 | 232 | return safe_pair_prob 233 | 234 | -------------------------------------------------------------------------------- /algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/__init__.py -------------------------------------------------------------------------------- /algorithms/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/algorithms/baselines/__init__.py -------------------------------------------------------------------------------- /algorithms/baselines/pairwise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | from algorithms.PDGD.pdgd import PDGD 9 | 10 | # Pairwise Baseline from Hofmann 11 | class Pairwise(PDGD): 12 | 13 | def __init__(self, epsilon, 14 | *args, **kargs): 15 | super(Pairwise, self).__init__(*args, **kargs) 16 | self.epsilon = epsilon 17 | 18 | def _create_train_ranking(self, query_id, query_feat, inverted): 19 | assert inverted == False 20 | n_docs 
= query_feat.shape[0] 21 | k = np.minimum(self.n_results, n_docs) 22 | self.doc_scores = self.model.score(query_feat) 23 | 24 | exploit = rnk.rank_query(self.doc_scores, inverted=False, n_results=k) 25 | explore = np.random.permutation(np.arange(n_docs)) 26 | coinflips = np.random.uniform(size=k) > self.epsilon 27 | 28 | self.ranking = -np.ones(k, dtype=np.int32) 29 | exploit_i = 0 30 | explore_i = 0 31 | for i in range(k): 32 | if coinflips[i]: 33 | while exploit[exploit_i] in self.ranking: 34 | exploit_i += 1 35 | self.ranking[i] = exploit[exploit_i] 36 | exploit_i += 1 37 | else: 38 | while explore[explore_i] in self.ranking: 39 | explore_i += 1 40 | self.ranking[i] = explore[explore_i] 41 | explore_i += 1 42 | 43 | self._last_query_feat = query_feat 44 | return self.ranking 45 | 46 | def _update_to_clicks(self, clicks): 47 | n_docs = self.ranking.shape[0] 48 | cur_k = np.minimum(n_docs, self.n_results) 49 | 50 | included = np.ones(cur_k, dtype=np.int32) 51 | if not clicks[-1]: 52 | included[1:] = np.cumsum(clicks[::-1])[:0:-1] 53 | neg_ind = np.where(np.logical_xor(clicks, included))[0] 54 | pos_ind = np.where(clicks)[0] 55 | 56 | n_pos = pos_ind.shape[0] 57 | n_neg = neg_ind.shape[0] 58 | n_pairs = n_pos*n_neg 59 | 60 | if n_pairs == 0: 61 | return 62 | 63 | pos_r_ind = self.ranking[pos_ind] 64 | neg_r_ind = self.ranking[neg_ind] 65 | 66 | all_w = np.zeros(n_pos + n_neg) 67 | all_w[:n_pos] = n_neg 68 | all_w[n_pos:] = -n_pos 69 | 70 | all_ind = np.concatenate([pos_r_ind, neg_r_ind]) 71 | 72 | self.model.update_to_documents(all_ind, 73 | all_w) -------------------------------------------------------------------------------- /algorithms/basiconlineranker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | import numpy as np 7 | import utils.rankings as rnk 8 | 9 | class BasicOnlineRanker(object): 10 | 11 | def __init__(self, n_results, n_features): 12 | self.n_features = n_features 13 | self.n_results = n_results 14 | 15 | self.n_interactions = 0 16 | self.model_updates = 0 17 | self._messages = {} 18 | self._default_messages = {} 19 | 20 | self._train_features = None 21 | self._train_query_ranges = None 22 | 23 | @staticmethod 24 | def default_parameters(): 25 | '''Return all parameter values for this ranker. 
26 | Used for logging purposes.''' 27 | return {} 28 | 29 | def add_message(self, name, default_value=0): 30 | self._default_messages[name] = default_value 31 | 32 | def remove_message(self, name): 33 | del self._default_messages[name] 34 | 35 | def set_message(self, name, value): 36 | self._messages[name] = value 37 | 38 | def get_messages(self): 39 | messages = self._default_messages.copy() 40 | messages.update(self._messages) 41 | return messages 42 | 43 | def reset_messages(self): 44 | self._messages.clear() 45 | 46 | def setup(self, train_features, train_query_ranges): 47 | self._train_features = train_features 48 | self._train_query_ranges = train_query_ranges 49 | 50 | def clean(self): 51 | del self._train_features 52 | del self._train_query_ranges 53 | 54 | def get_test_rankings(self, features, 55 | query_ranges, inverted=True): 56 | return rnk.rank_multiple_queries( 57 | np.zeros(features.shape[0]), 58 | query_ranges, 59 | inverted=inverted, 60 | n_results=self.n_results) 61 | 62 | def get_query_features(self, query_id, features, 63 | query_ranges): 64 | start_i = query_ranges[query_id] 65 | end_i = query_ranges[query_id + 1] 66 | return features[start_i:end_i, :] 67 | 68 | def get_query_label(self, query_id, label_vector, 69 | query_ranges): 70 | start_i = query_ranges[query_id] 71 | end_i = query_ranges[query_id + 1] 72 | return label_vector[start_i:end_i] 73 | 74 | def get_query_size(self, query_id, query_ranges): 75 | return query_ranges[query_id+1] - query_ranges[query_id] 76 | 77 | def get_train_query_ranking(self, query_id, inverted=False): 78 | self._last_query_id = query_id 79 | query_feat = self.get_query_features(query_id, 80 | self._train_features, 81 | self._train_query_ranges) 82 | self._last_ranking = self._create_train_ranking( 83 | query_id, 84 | query_feat, 85 | inverted)[:self.n_results] 86 | return self._last_ranking 87 | 88 | def _create_train_ranking(self, query_id, query_feat, inverted): 89 | n_docs = self.get_query_size(query_id, 90 | self._train_query_ranges) 91 | return rnk.rank_single_query(np.zeros(n_docs), 92 | inverted=inverted, 93 | n_results=self.n_results)[:self.n_results] 94 | 95 | def process_clicks(self, clicks): 96 | self.update_to_interaction(clicks) 97 | self.n_interactions += 1 98 | 99 | def update_to_interaction(self, clicks): 100 | pass -------------------------------------------------------------------------------- /attack.sh: -------------------------------------------------------------------------------- 1 | python2 scripts/Poisoning_attacks/attack_DBGD_base_lr.py --data_sets local_MQ2007 --attacker_click_model frequency_attack\ 2 | --click_models exper1 --log_folder ./log --output_folder ./output --average_folder ./average \ 3 | --n_impr 10000 --n_runs 10 --n_proc 10 --n_results 10 --start 0 --end 1 --which 1 --mf 5 --sd_const 2.0 --num_attacker_relevant 5 4 | -------------------------------------------------------------------------------- /attack_graph.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import json 3 | import sys 4 | 5 | 6 | DBGD_output = open(sys.argv[1], 'r') 7 | print(sys.argv[1]) 8 | DBGD_output_lines = DBGD_output.readlines() 9 | # Extract macro here 10 | macro = json.loads(DBGD_output_lines[0]) 11 | # print(macro) 12 | 13 | DBGD_output_lines = DBGD_output_lines[0:] 14 | 15 | NDCG_attack = [] 16 | NDCG_label = [] 17 | LR = [] 18 | 19 | iterations = [] 20 | graph_title = "" 21 | attack_name = "" 22 | 23 | label_lr = "" 24 | 25
| if "TD2003" in sys.argv[1]: 26 | label_lr = "TD2003" 27 | elif "MQ2007" in sys.argv[1]: 28 | label_lr = "MQ2007" 29 | elif "Yahoo" in sys.argv[1]: 30 | label_lr = "Yahoo" 31 | elif "MSLR" in sys.argv[1]: 32 | label_lr = "MSLR" 33 | else: 34 | print("Wrong name of dataset!!!") 35 | 36 | file_name = sys.argv[1] 37 | 38 | if "frequency" in file_name: 39 | attack_name = "frequency_attack" 40 | else: 41 | attack_name = "naive_intersection_attack" 42 | 43 | for line in DBGD_output_lines: 44 | run_details = json.loads(line)['simulation_arguments'] 45 | graph_title = run_details['simulation_arguments']['attacker_click_model'] 46 | Tau = [] 47 | num_clicks= [] 48 | 49 | 50 | run_results = json.loads(line)['results'] 51 | print(graph_title) 52 | run_results = json.loads(line)['results']["NDCG_attack"][attack_name]["mean"] 53 | it = 0 54 | for val in run_results: 55 | NDCG_attack.append(val) 56 | 57 | run_results = json.loads(line)['results']["NDCG_label"][attack_name]["mean"] 58 | 59 | for val in run_results: 60 | NDCG_label.append(val) 61 | iterations.append(it) 62 | it += 1 63 | 64 | run_results = json.loads(line)['results']["LR"][attack_name]["mean"] 65 | 66 | for val in run_results: 67 | LR.append(float('%.8f'%val)) 68 | 69 | fig_ndcg_attack, ax_ndcg_attack = plt.subplots() 70 | ax_ndcg_attack.plot(iterations, NDCG_attack, '#FF0000', label="NDCG attacker") 71 | ax_ndcg_attack.plot(iterations, NDCG_label, '#d79232', label="NDCG ground truth") 72 | 73 | 74 | # ax_ndcg_attack.set_title("") 75 | ax_ndcg_attack.set_xlabel("Iteration") 76 | ax_ndcg_attack.set_ylabel("NDCG@10") 77 | plt.legend() 78 | 79 | 80 | fig_LR, ax_LR = plt.subplots() 81 | 82 | ax_LR.plot(iterations, LR, '#FF0000', label=label_lr) 83 | ax_LR.set_xlabel("Iteration") 84 | ax_LR.set_ylabel("Learning Rate") 85 | plt.legend() 86 | 87 | plt.show() 88 | -------------------------------------------------------------------------------- /attacker_avg_summarize.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import json 3 | import sys 4 | 5 | 6 | DBGD_output = open(sys.argv[1], 'r') 7 | DBGD_output_lines = DBGD_output.readlines() 8 | # Extract macro here 9 | macro = json.loads(DBGD_output_lines[0]) 10 | # print(macro) 11 | 12 | DBGD_output_lines = DBGD_output_lines[0:] 13 | 14 | NDCG_attack = [] 15 | NDCG_label = [] 16 | iterations = [] 17 | lr = [] 18 | # graph_title = "" 19 | attack_name = "" 20 | 21 | file_name = sys.argv[1] 22 | 23 | if "new_freq_attack" in file_name: 24 | attack_name = "new_freq_attack" 25 | elif "freq_attack" in file_name: 26 | attack_name = "freq_attack" 27 | else: 28 | attack_name = "click_kth_doc" 29 | 30 | for line in DBGD_output_lines: 31 | run_details = json.loads(line)['simulation_arguments'] 32 | # graph_title = run_details['simulation_arguments']['click_models'][0] 33 | Tau = [] 34 | num_clicks= [] 35 | 36 | 37 | run_results = json.loads(line)['results'] 38 | # print(graph_title) 39 | run_results = json.loads(line)['results']["NDCG_attack"][attack_name]["mean"] 40 | 41 | count = 0 42 | for val in run_results: 43 | if count > 0 and (count % 1000 == 0 or count == 9999): 44 | NDCG_attack.append(round(val,5)) 45 | # count = 0 46 | count += 1 47 | 48 | 49 | run_results = json.loads(line)['results']["NDCG_label"][attack_name]["mean"] 50 | 51 | count = 0 52 | 53 | for val in run_results: 54 | if count % 1000 == 0: 55 | NDCG_label.append(val) 56 | count = 0 57 | count += 1 58 | 59 | run_results = 
json.loads(line)['results']["LR"][attack_name]["mean"] 60 | 61 | count = 0 62 | 63 | for val in run_results: 64 | 65 | if count > 0 and (count % 1000 == 0 or count == 9999): 66 | lr.append(round(val, 6)) 67 | # count = 0 68 | 69 | count += 1 70 | 71 | 72 | 73 | print("NDCG attack: ", NDCG_attack) 74 | print("NDCG label: ", NDCG_label) 75 | print("LR: ", lr) -------------------------------------------------------------------------------- /attacker_weights/Weights_mq2007.txt: -------------------------------------------------------------------------------- 1 | 0.9166919478487141, 1.7784039695501537, -1.6769130412370732, -1.0041922049927996, -0.8730556165490367, -0.5939161488734054, -0.5250025451437097, 0.4753711525834708, -0.5744830378084759, -0.21007874901643958, 0.2787353823389643, -1.1002740024900035, -0.0930665557464917, 1.2507969372208942, -0.20785952917382847, 0.8229744423351555, -1.304110655709577, -1.0500020375389558, -0.6129241024409119, 0.8218856002635212, -0.7786836027940605, -0.8236289206938172, -0.5583321490637034, 0.3216787960938945, -0.7612135724352947, -0.5285954958295925, -1.03892181829786, -0.6746550701750351, 2.2008087591667973, -0.8902641264291119, -1.2921437618361649, -0.5826275584144579, -0.23792784910165976, 0.930849688497572, -0.48438708507058614, -1.0634970423248107, 0.6851423496958496, -0.2709910615850057, 1.0658650984145353, 0.5796336844374246, -0.008724844951078671 -------------------------------------------------------------------------------- /attacker_weights/Weights_td2003.txt: -------------------------------------------------------------------------------- 1 | -1.169315284362955, 0.004743552831948161, -0.2716495992544119, 1.7206214619901572, -1.0389289334618654, 1.828498381246963, 0.0957685844512115, 1.7768675284866982, -1.6085950004464, 1.9536578518010819, 0.5094803636777727, -1.8219560517709135, 0.16672947533285498, 0.774492228899978, -0.8000438873312938, 1.371752163283971, -1.5809555938819688, -1.8824776396802596, -1.847353085460993, 0.5933365199674299, -1.907365651885614, 0.5583549275711559, -1.6907993175212224, -0.5080409113792914, 0.18593325863932542, 1.237691768891898, 0.6506285466671793, -1.7486763705096213, 1.7676765442887667, -1.1493297913235412, 1.713217839000456, 0.07349645474949318, -1.6365281453037421, 1.566718953268857, -1.0427090424702952, 0.5214338726741836, -1.624595369683914, -0.14110808140185593, 0.3916497592951984, 0.714693244430086, 1.7629424730034238, -1.068917955159976, -1.0842509460396972, -0.26477504051836576, 0.5966841002862706, 0.19709130808314024, 1.8285894190582885, 1.4269690742660384, 0.5673460795171632, -1.9117583834847198, -1.5526961096228091, 0.5510562739930802, 0.1885894577134044, -1.2706942541153308, 1.2984730857636881, -1.6075491808497815, 1.7841811332702404, -0.28785427123698915, -1.0599049535064315 -------------------------------------------------------------------------------- /attacker_weights/Weights_web10k.txt: -------------------------------------------------------------------------------- 1 | -1.2918894660631546, -1.626611786778359, 0.5224666753764322, 0.5810908362813563, 1.5591812870538133, -0.3897725398372405, -1.8555490026700565, -1.1999196575008906, 0.5683077838197774, -1.55611596365868, -1.3731589625585863, 0.24971913117971045, 0.6229948454285941, -1.755683893046522, -1.970372354334914, 1.51808954518985, -1.2689018117232043, -0.1244493450428017, -0.12707349829113834, 0.601557266657065, 1.5189488517419276, -1.8228510762995822, 1.915942639946798, -1.186623174158215, -1.5692845785392566, 1.3117780143659368, 
-1.6419754006702578, -0.9388693258093666, -1.8619918426706539, -0.8970273743878558, 1.6048588774490882, -1.0347278661427284, -0.5084919797241403, -1.1020718152602575, -0.11254386823654627, -1.169315284362955, 0.004743552831948161, -0.2716495992544119, 1.7206214619901572, -1.0389289334618654, 1.828498381246963, 0.0957685844512115, 1.7768675284866982, -1.6085950004464, 1.9536578518010819, 0.5094803636777727, -1.8219560517709135, 0.16672947533285498, 0.774492228899978, -0.8000438873312938, 1.371752163283971, -1.5809555938819688, -1.8824776396802596, -1.847353085460993, 0.5933365199674299, -1.907365651885614, 0.5583549275711559, -1.6907993175212224, -0.5080409113792914, 0.18593325863932542, 1.237691768891898, 0.6506285466671793, -1.7486763705096213, 1.7676765442887667, -1.1493297913235412, 1.713217839000456, 0.07349645474949318, -1.6365281453037421, 1.566718953268857, -1.0427090424702952, 0.5214338726741836, -1.624595369683914, -0.14110808140185593, 0.3916497592951984, 0.714693244430086, 1.7629424730034238, -1.068917955159976, -1.0842509460396972, -0.26477504051836576, 0.5966841002862706, 0.19709130808314024, 1.8285894190582885, 1.4269690742660384, 0.5673460795171632, -1.9117583834847198, -1.5526961096228091, 0.5510562739930802, 0.1885894577134044, -1.2706942541153308, 1.2984730857636881, -1.6075491808497815, 1.7841811332702404, -0.28785427123698915, -1.0599049535064315, -0.46672715103695017, 0.9760471739089973, 0.5182538515265533, -1.4841453081143805, 0.5345699957426095, -0.07243785525036195, -1.0976245689168653, -0.1328041368477333, 0.7902408772271423, 1.6857623914133857, 1.7747436843321513, -1.2889580954141597, 0.13278396787662938, 1.3757499237546664, -1.4550758167451048, -0.209482063393017, -0.09582896694006227, 0.9972243100422764, -1.284074783621219, 1.6978357398119277, -0.42506051073460727, -1.1385528982028035, 0.050697046416877445, 0.03805650495055435, -0.7833642314164155, 1.2149669516161428, -1.6688119146299236, 1.2047288451589453, -0.8473391050511214, -0.4178580973738488, -0.7082477204136652, -0.5502155330456628, -1.9785959207009323, 1.1978313172157908, 0.931166672864471, -0.9981402273986739, -0.15518281873065654, -1.8281038641142415, 0.948831822932735, 1.3942131498150339, 0.3504681105627463, 1.2912271256664138 2 | -------------------------------------------------------------------------------- /attacker_weights/Weights_yahoo.txt: -------------------------------------------------------------------------------- 1 | 1.5828946583539185, 1.9896038399218923, 0.6903950307337174, 1.7265872898190806, -0.5673661301794262, 0.5012445462851272, -1.0176798372911664, -1.1147385167744202, 0.569615856135774, 1.0914865921290646, -0.4083644053404387, 1.431872593427824, -0.3165083687788717, -0.02071922775356816, 0.36531090086424367, -0.5812145888066582, 0.5401116470385774, -0.5567385469458195, 1.0999557430966638, -0.7522970546809051, -0.24258155684170157, 1.6095655365653823, -0.495432704415411, 0.9812915904282806, -1.3571112232995102, -0.5694170605811664, 1.4408160489284638, 1.198749623503545, 0.17678943445361694, 0.9899131247299726, -0.49123166389979644, -1.0102285305862027, -1.0222279771179021, -0.6988529508814483, 1.9830174247132546, 0.7680577259824659, 0.04428325680278178, -1.982466373800416, -0.6216758205168484, -0.34553091872124986, 1.0703811595523036, 0.19466497289529627, 1.6748671045085275, 1.1579013773581628, 0.4734479615276741, 1.2949038815266416, -0.2262939040791756, -0.12134363029522932, -0.584271306103938, -1.2340454385069024, -0.3709687242872981, -1.6252325396555025, -1.3273254874836256, 
1.660208339842653, 0.8445205653584136, 1.0522097102453256, 0.32437903678098134, -0.04369092410539199, -1.019900994734221, 1.5030438064529248, -0.6262174165776573, -1.337426258070995, 0.05790758632172466, 1.1616185843989655, 0.9865444967325976, -1.9689584729288288, -1.3673409944879427, -0.2323226160421754, 1.756855515256066, -1.6329132114432388, 1.6810686760270404, 1.523863321033577, 0.3341087085554739, 0.01251870387303855, -1.520355595494593, 0.05158593260779121, -1.8363071680646548, 0.3544610585861232, -1.4609222058688283, 1.6570873177915186, -0.35355514975952795, 1.2768487133345734, 1.1270250623047988, -1.7373555723626781, 0.6201712115975986, -0.14328842975151757, -0.6202868614496646, 0.2929374888827603, 0.1543347391885943, -0.22580114584327227, -0.030009554293235485, -0.844563194994647, 0.8532502430776314, -1.964740898252682, 1.862527897225942, -1.1511537395097737, -1.957623597155553, 1.5643035750385001, -0.2629609249062659, -0.11529454550180684, -0.9156302224973576, 0.9809225521465756, -0.9391761633550115, 1.6144204800999948, -0.6167659260981746, 0.8199348837327669, -0.5645903075409491, 0.6701136295645975, -0.7207852300149447, 1.5644006590494235, 0.047542032155412084, -0.7743020319593392, 1.3024253370452183, -0.32257759233297056, -0.2653684165347707, -1.705376617703465, -0.29225604791257664, 1.7866790810205861, -0.3440038915779855, -0.8120108370738617, 0.47277629174343483, -1.7158853314792855, 1.0144326295143986, -0.9481464554897157, -1.5421360767973207, -0.7555847659277592, 0.3100026511307621, -0.5720214447732257, -0.28507818281132824, 1.0700013469896938, -1.84060893460033, -0.6243906861291646, -1.5093895897250174, -0.7582252913522076, 0.29868114005878343, 1.1242641126228494, -1.9275054878645106, -0.7935491845925737, 1.9444810724415866, -0.7885068995178601, 1.720083276582765, -0.5456404868961227, -1.4843126265896198, -0.0852665349195787, 1.7531127640107504, -0.9034328849212088, 1.2532447437853156, -1.138192700448998, 0.43259996123315814, -1.1326716963407177, 0.5154155910510738, 1.3762496383770695, 1.484337533500864, 0.1254875771806554, 0.13785285017743965, -0.4252156532712972, -0.9679581777086623, 0.08877206905167334, 1.8827770605057887, -1.1511743442546924, -0.22407019757360613, -0.5554089679536274, 1.043720642338997, -0.14476396070838327, -1.1379292602522617, -1.053414049604418, 0.5712496707022741, -1.3556056697878929, 0.9277167224787939, 1.0748793432643637, 1.1879275944973076, -1.0656952347304238, -0.6125922042736751, 1.549902631539395, -0.3127637237797405, 0.1268740449597976, -0.09320954158997452, 1.4918756053696072, 0.9170225490768424, -0.7737274061749262, 1.6465497279655037, -0.12175246775849002, -1.0351966911724513, -1.2250554282728432, -0.38598686880333366, -1.4525893796956244, -1.6575478737162967, -0.05841645185379862, -1.1801285301041986, -1.9935458062570452, 1.2653960452034463, -0.4376209111222189, 1.778940252010754, 1.0568236723421642, 1.0609730176510097, -1.7694398942846306, -1.1027174922437335, 0.1907571459797639, -1.9973111804388308, 1.7302820934988996, -1.0233206460263848, -1.0693722825394727, 0.13830539428591315, 0.09749597008309019, 0.39693869553495764, 1.6753471909078117, 1.1100917252451405, -0.022886731222536927, -1.0174851157214686, 0.8093110149828324, 1.8308322691127534, -1.4631667086795144, 1.1403359556774624, 0.6165032572192892, -0.9915479627077937, -0.42306747423758706, 0.6089194862168927, -0.3339073532105532, -1.6751340785415314, -1.077268064707991, 0.5309622287607842, 1.2097810024061366, 1.7130677951176465, 0.850399524404351, -0.7889327679983027, 
1.9970289539492851, 0.5841399568912546, 1.000510514502039, -1.0423683864980173, 1.1117832309464784, 1.565090338794616, -1.3949737855981441, 0.13321744083431053, -0.004087178226273913, 1.7828274198391822, 1.1089143098353924, -1.7146183016112762, -0.8195862313368325, -1.0907562204765346, -0.8587895168377733, -0.5348287917303716, 1.843415421353665, -0.6387579384164002, -1.5702980028357065, -1.6540384540895472, -0.2701083463295717, -1.9792069923464575, 0.19330891156846963, -1.4425438564203046, 1.8851598589536205, -1.7304859492865572, -0.6130699752752817, 0.9396696569171352, -1.2767007580634488, 0.08092415952288823, -0.45045914712704826, 1.780481765212603, 1.4601197699006812, -1.1756804189810226, 1.8063987075511632, -1.2993785047414512, 0.4480558288739722, -1.3447492273930242, -0.06606767207107023, -0.055299145645124614, -0.15076790471094048, 0.027293702793865116, -0.49068074913516435, 0.2316048505777597, 1.1750924370825206, 0.28093802282622926, 0.8434098649337485, -1.212562256405012, 1.1405883212626065, 1.2108625977387457, 0.0006311322932543995, 1.1514055419858722, -0.1497238888482859, 0.5253301685473266, -1.6357101262352347, 0.739187600165951, 1.7452858736736339, -0.05694967811036733, -0.5902905536639507, -0.2910081412783443, -1.0419783635008573, 1.2830579096399135, 1.5866641542660078, 1.8960528869400322, 0.2991709923548451, 0.09455398910363533, -0.8486995131400437, 0.44716667945208943, -1.7466714452963235, -0.2209038351210899, 1.4340767872740328, -0.43853518289321247, -0.012143574358134845, 0.5796990147749583, -1.402581667628482, -0.5940185802575431, 1.6863913174878786, -1.7905846762400524, 0.09927742380893134, -1.3715820851432836, -0.5904978271760086, -1.6074204921905175, -0.17658669653065884, 1.34814344311977, 0.7732540099715735, -1.6399096224471275, 0.16963043419254298, 1.6372794610575534, 1.3232638748512846, 1.1471480707726829, -1.356814732761611, 0.43095655802624666, 1.5231973496505158, -1.7947071329669297, 1.367294567007899, 0.2962058290694629, 1.3320198707142121, 1.1032911696184242, -0.2622610918539259, -1.9689055374741984, -1.6793287819557485, 0.30505943254416756, -1.7011786160615454, 0.2889292230291489, 1.5668246665300392, -0.8325975131322494, -1.1667747685209342, 1.274008090744128, -1.4252408079846908, 1.3879558521051183, 0.27119146564549057, 0.22162343539545537, -1.387269133622461, 0.22686969678477498, 1.9775638570949696, -1.6194905749173487, -1.8122336967288106, -0.7722299727978967, 0.20350341515147985, 0.788599888591007, 1.3347378655454691, 1.8928793073567882, -1.4423607462433483, 1.6918896567871062, -0.16227429215387623, -1.0581819862393296, -0.15298462789657208, 1.0488200210605565, -0.8978623154396255, -0.7617461537229508, 1.0060814326299412, 1.5052314729678717, -0.678559819292301, -1.8202384096007855, -0.4335477347058889, 0.08204711116213037, -1.559419667887901, -0.7484819288014726, 0.7718381868713982, -0.5222522771317846, 0.03343352262806576, -1.2776222072409071, 1.084013006181038, 0.1183781511350408, 1.7327294314353425, -0.7412182048163358, -0.7679486932240263, -0.9517602673642425, -0.19215971701925794, -0.4728740310794697, -0.08101536489291039, -0.575822496828382, 0.8823675848291259, -1.804071635066987, 0.5132034930666505, -1.9260823692373013, -1.3553401671180532, -0.6473435358470363, 0.13121780817202788, 1.2173900891904528, 0.27812229455913773, -0.7040843921465427, 1.6521477107346652, -0.8177612727499106, 0.28173168446905583, 0.8915261228209559, 1.7933499628244824, 0.45601708960831466, 0.08509227626799154, 1.96655402058714, -0.03530891470880837, -1.5488198559209838, 
-0.8910437252187413, -0.7541194312168864, -1.016059417754188, 0.20690585226077962, 0.9893378756376543, 0.5307779389239049, 0.19382010234386637, 0.3333854057715575, 0.5847986975318973, -1.5072335033397448, 0.733552992086596, 1.665418492839505, 1.6313908135679815, 1.143207655528002, 1.705854449675209, -0.4855115744309195, 0.5474581796636397, -1.5293471520030977, 0.1908761110982069, 1.8704594909030932, -0.15808306818152928, 0.3553044421278635, -0.5551170510452992, 0.8105398676140503, -0.7888491919253307, -0.1690798742491859, 0.9138195410920544, 1.3013893175562519, 1.030464241395121, 1.18103903202661, 1.2258704862314591, 0.4393827647241464, 1.1491861945934656, -1.5030973573598643, -1.0287032689637567, 0.7890139601181851, 1.9149139173245886, 1.4305729705855526, -0.4172611996692641, -0.06243960752638067, 1.7274376716103426, 1.6429528088252523, -1.2066703632069689, -0.08547222510290098, 1.2990762307237143, 1.5185273782177853, 0.280518576815449, 0.7082207313722688, 1.7515408320336099, 0.4250894807887975, -1.4173003756200724, 0.0222778292419199, 1.8107244661253468, -1.6521511334085193, 1.672877410613868, -1.619638449115278, 1.2376727518757096, 1.2473813520150152, 0.03300340009262026, 1.7871944806030067, 1.989457176381341, 1.824772973024825, -0.5988810774781861, 0.014994389894054105, -1.8915113336170588, -1.9265968320997544, 0.22839740678069287, -0.4661631128284234, 0.5989752634688199, 1.17377615289231, -1.463511630252626, -0.8078723951568931, 1.2311478643831428, -0.48143088151913593 2 | -------------------------------------------------------------------------------- /graphs/makeaverages.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import json 5 | import os 6 | import sys 7 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 8 | from utils.averageoutput import IndependentOutputAverager 9 | 10 | def create_folders(filename): 11 | if not os.path.exists(os.path.dirname(filename)): 12 | os.makedirs(os.path.dirname(filename)) 13 | 14 | 15 | description = 'Script for averaging over full run output files.' 
16 | parser = argparse.ArgumentParser(description=description) 17 | 18 | parser.add_argument('--average_folder', dest='average_folder', type=str, 19 | required=True, default=None, 20 | help='Folder to output pdfs into.') 21 | 22 | parser.add_argument('--fullrun_prefix', dest='fullrun_prefix', type=str, 23 | required=True, default=None, 24 | help='Prefix for folders of full runs of the same dataset.') 25 | 26 | parser.add_argument('output_files', type=str, nargs='+', 27 | help='Output files to be parsed.') 28 | 29 | args = parser.parse_args() 30 | 31 | 32 | def create_folders(filename): 33 | if not os.path.exists(os.path.dirname(filename)): 34 | os.makedirs(os.path.dirname(filename)) 35 | 36 | def process_run_name(name): 37 | name = name.replace('_', '\\_') 38 | return name 39 | 40 | 41 | average_folder = args.average_folder 42 | averager = IndependentOutputAverager(average_folder) 43 | 44 | path_pairs = [] 45 | for output_file in args.output_files: 46 | prefix = args.fullrun_prefix 47 | assert prefix in output_file 48 | average_file_name = output_file[output_file.find(prefix) + len(prefix):] 49 | while average_file_name[0] == '/': 50 | average_file_name = average_file_name[1:] 51 | average_dest = '%s/%s' % (average_folder, average_file_name) 52 | path_pairs.append((output_file, average_dest)) 53 | 54 | failed_paths = [] 55 | success_paths = [] 56 | for source, dest in path_pairs: 57 | success = True 58 | try: 59 | average_results = averager.average_results(source) 60 | except KeyboardInterrupt: 61 | raise 62 | except: 63 | success = False 64 | print 'Failed: ', source 65 | failed_paths.append(source) 66 | 67 | if success: 68 | print 'Success:', source, ' -> ', dest 69 | 70 | create_folders(dest) 71 | with open(dest, 'w') as w: 72 | w.write(json.dumps(average_results)) 73 | 74 | success_paths.append(source) 75 | 76 | print 77 | print 'Done processing.' 78 | print 79 | print 'Successfully averaged the following files:' 80 | print 81 | print ' '.join(success_paths) 82 | print 83 | print 'Failed averaging the following files:' 84 | print 85 | print ' '.join(failed_paths) 86 | print 87 | -------------------------------------------------------------------------------- /graphs/makegraphs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pylab as plt 4 | import numpy as np 5 | import random 6 | import argparse 7 | import os 8 | import json 9 | import datetime 10 | 11 | def create_folders(filename): 12 | if not os.path.exists(os.path.dirname(filename)): 13 | os.makedirs(os.path.dirname(filename)) 14 | 15 | 16 | description = 'Script for displaying graphs from output files.' 
17 | parser = argparse.ArgumentParser(description=description) 18 | 19 | parser.add_argument('--pdf_folder', dest='pdf_folder', type=str, required=False, default=None, 20 | help='Folder to output pdfs into.') 21 | 22 | parser.add_argument('--folder_prefix', dest='folder_prefix', type=str, required=False, 23 | default=None, help='Prefix for folders of the same dataset.') 24 | 25 | parser.add_argument('plot_name', type=str, help='Name to save plots under.') 26 | 27 | parser.add_argument('output_files', type=str, help='Output files to be parsed.', nargs='+') 28 | 29 | args = parser.parse_args() 30 | 31 | 32 | def create_folders(filename): 33 | if not os.path.exists(os.path.dirname(filename)): 34 | os.makedirs(os.path.dirname(filename)) 35 | 36 | def process_run_name(name): 37 | name = name.replace('_', '\\_') 38 | name = name.replace('DeepP-DBGD', 'DBGD (neural)') 39 | name = name.replace('P-DBGD', 'DBGD') 40 | name = name.replace('P-MGD', 'MGD') 41 | name = name.replace('DeepPDGD', 'PDGD (neural)') 42 | return name 43 | 44 | 45 | pdf_folder = args.pdf_folder 46 | prefix_plot_name = args.plot_name 47 | 48 | folder_structure = {} 49 | if args.folder_prefix: 50 | for output_file in args.output_files: 51 | prefix = args.folder_prefix 52 | assert prefix in output_file 53 | average_file_name = output_file[output_file.find(prefix) + len(prefix):] 54 | while average_file_name[0] == '/': 55 | average_file_name = average_file_name[1:] 56 | data_folder = average_file_name[:average_file_name.find('/')] 57 | if data_folder not in folder_structure: 58 | folder_structure[data_folder] = [] 59 | folder_structure[data_folder].append(output_file) 60 | else: 61 | folder_structure[None] = args.output_files 62 | 63 | to_plot = [ 64 | ('offline', 'heldout'), 65 | ] 66 | 67 | for data_folder in sorted(folder_structure.keys()): 68 | output_files = folder_structure[data_folder] 69 | data = {} 70 | file_names = [] 71 | click_models = [] 72 | value_names = [] 73 | if data_folder is None: 74 | print 'No data folders found, outputting directly.' 
75 | else: 76 | print 'Found data folder: %s' % data_folder 77 | for output_file in output_files: 78 | print 'reading', output_file 79 | file_name = output_file.split('/')[-1] 80 | if file_name[-4:] == '.out': 81 | file_name = file_name[:-4] 82 | assert file_name not in data 83 | data[file_name] = {} 84 | file_names.append(file_name) 85 | with open(output_file) as f: 86 | output = json.load(f) 87 | for name, value in output['runtimes'].items(): 88 | print name, 89 | print datetime.timedelta(seconds=value), 90 | print '(%d seconds)' % value 91 | data[file_name] = output['results'] 92 | for v_name in output['results']: 93 | if v_name not in value_names: 94 | value_names.append(v_name) 95 | for c_m in output['results'][v_name]: 96 | if c_m == 'indices': 97 | continue 98 | if c_m not in click_models: 99 | click_models.append(c_m) 100 | 101 | print 102 | 103 | print 'finished reading, found the following value types:' 104 | for name in value_names: 105 | print name 106 | print 107 | print 'start plotting' 108 | 109 | # params = { 110 | # 'text.latex.preamble': r"\usepackage{lmodern}", 111 | # 'text.usetex': True, 112 | # 'font.size': 26, 113 | # 'font.family': 'lmodern', 114 | # 'text.latex.unicode': True, 115 | # } 116 | # plt.rcParams.update(params) 117 | 118 | colours = [ 119 | 'black', 120 | 'r', 121 | 'b', 122 | 'g', 123 | 'y', 124 | 'c', 125 | 'orange', 126 | 'purple', 127 | 'pink', 128 | 'gray', 129 | ] * 30 130 | 131 | for plot_name, v_name in to_plot: 132 | for click_model in click_models: 133 | fig = plt.figure(figsize=(10.5, 6), linewidth=0.1) 134 | # fig = plt.figure(figsize=(10.5, 4), linewidth=0.1) 135 | plt.ioff() 136 | plt.ylabel('NDCG') 137 | plt.xlabel('impressions') 138 | plt.gca().yaxis.set_ticks_position('both') 139 | 140 | labels = [] 141 | max_ind = np.NINF 142 | for i, file_name in enumerate(file_names): 143 | file_dict = data[file_name] 144 | colour = colours[i] 145 | 146 | if v_name not in file_dict: 147 | if v_name == 'heldout' and 'held-out' in file_dict: 148 | v_name = 'held-out' 149 | elif v_name == 'held-out' and 'heldout' in file_dict: 150 | v_name = 'heldout' 151 | else: 152 | print 'not found', v_name, file_dict.keys() 153 | continue 154 | v_dict = file_dict[v_name] 155 | ind = np.array(v_dict['indices']) 156 | if click_model not in v_dict: 157 | print 'not found', click_model, v_dict.keys() 158 | continue 159 | c_dict = v_dict[click_model] 160 | 161 | max_ind = max(max_ind, np.max(ind)) 162 | mean = np.array(c_dict['mean']) 163 | std = np.array(c_dict['std']) 164 | 165 | plt.fill_between(ind, mean-std, mean+std, color=colour, alpha=0.2) 166 | plt.plot(ind, mean, color=colour) 167 | labels.append(process_run_name(file_name)) 168 | 169 | if len(labels) > 0: 170 | # if v_ind == "TEST INDICES": 171 | # plt.ylim(.6,.8) 172 | plt.ylim(.2,0.5) 173 | plt.xlim(-5, 30000) 174 | plt.xlim(-500, 1000000) 175 | # plt.xlim(-5, max_ind) 176 | # plt.xlim(-5, 100000) 177 | plt.annotate(click_model, xy=(0.02, 0.90), xycoords='axes fraction') 178 | if click_model == 'perfect': 179 | plt.legend(labels, loc=4, fontsize=16, frameon=False, ncol=1) 180 | # plt.legend(labels, loc=0, fontsize=26, frameon=False, ncol=1) 181 | 182 | if not pdf_folder: 183 | plt.show() 184 | else: 185 | plot_file_name = '%s_%s_%s.pdf' % (prefix_plot_name, plot_name, click_model) 186 | if not data_folder is None: 187 | plot_file_name = os.path.join(data_folder, plot_file_name) 188 | create_folders(os.path.join(pdf_folder, plot_file_name)) 189 | plt.savefig(os.path.join(pdf_folder, plot_file_name), 
bbox_inches='tight') 190 | print 'saved', plot_file_name 191 | plt.close(fig) 192 | print 193 | -------------------------------------------------------------------------------- /graphs/maketables.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pylab as plt 4 | import numpy as np 5 | import random 6 | import argparse 7 | import os 8 | import json 9 | import datetime 10 | 11 | description = 'Script for displaying graphs from output files.' 12 | parser = argparse.ArgumentParser(description=description) 13 | 14 | parser.add_argument('--table_folder', dest='table_folder', type=str, required=False, default=None, 15 | help='Folder to output pdfs into.') 16 | 17 | parser.add_argument('--baselines', dest='baselines', type=str, required=False, default=None, 18 | help='Folder to output pdfs into.', nargs='+') 19 | 20 | parser.add_argument('--folder_prefix', dest='folder_prefix', type=str, required=False, 21 | default=None, help='Prefix for folders of the same dataset.') 22 | 23 | parser.add_argument('plot_name', type=str, help='Name to save plots under.') 24 | 25 | parser.add_argument('output_files', type=str, help='Output files to be parsed.', nargs='+') 26 | 27 | args = parser.parse_args() 28 | 29 | def create_folders(filename): 30 | if not os.path.exists(os.path.dirname(filename)): 31 | os.makedirs(os.path.dirname(filename)) 32 | 33 | def get_significance(mean_1, mean_2, std_1, std_2, n): 34 | significance = '' 35 | ste_1 = std_1 / np.sqrt(n) 36 | ste_2 = std_2 / np.sqrt(n) 37 | t = (mean_1 - mean_2) / np.sqrt(ste_1 ** 2 + ste_2 ** 2) 38 | # treatment is worse than baseline 39 | # values used are for 120 degrees of freedom 40 | # (http://changingminds.org/explanations/research/analysis/ 41 | # t-test_table.htm) 42 | significance = '\\hphantom{\\tiny \\dubbelneer}' 43 | if mean_1 > mean_2: 44 | if abs(t) >= 2.62: 45 | significance = '{\\tiny \\dubbelneer}' 46 | elif abs(t) >= 1.98: 47 | significance = '{\\tiny \\enkelneer}' 48 | else: 49 | if abs(t) >= 2.62: 50 | significance = '{\\tiny \\dubbelop}' 51 | elif abs(t) >= 1.98: 52 | significance = '{\\tiny \\enkelop}' 53 | return significance 54 | 55 | class OutputTable(object): 56 | 57 | def __init__(self, table_name, table_folder): 58 | self._closed = False 59 | self.output_path = '%s/%s.tex' % (table_folder, table_name) 60 | print 'creating file at %s' % self.output_path 61 | create_folders(self.output_path) 62 | self._output_file = open(self.output_path, 'w') 63 | self.writeline('% !TEX root = ../main.tex') 64 | 65 | def writeline(self, *line): 66 | full_line = ' '.join(line) 67 | self._output_file.write(full_line + '\n') 68 | print full_line 69 | 70 | def write(self, *line): 71 | full_line = ' '.join(line) 72 | self._output_file.write(full_line + ' ') 73 | print full_line, 74 | 75 | def close(self): 76 | self._closed = True 77 | self._output_file.close() 78 | print 'Finished writing to and closed:', self.output_path 79 | 80 | def process_run_name(name): 81 | name = name.replace('_', '\\_') 82 | name = name.replace('DeepP-DBGD', 'DBGD (neural)') 83 | name = name.replace('P-DBGD', 'DBGD (linear)') 84 | name = name.replace('P-MGD', 'MGD (linear)') 85 | name = name.replace('PDGD', 'PDGD (linear)') 86 | name = name.replace('DeepPDGD (linear)', 'PDGD (neural)') 87 | name = name.replace('Pairwise', 'Pairwise (linear)') 88 | return name 89 | 90 | def process_folder_name(name): 91 | name = name.replace('_', '\\_') 92 | name = name.replace('Webscope\\_C14\\_Set1', 'Yahoo') 93 | 
94 | return name 95 | 96 | prefix_plot_name = args.plot_name 97 | folder_structure = {} 98 | if args.folder_prefix: 99 | for output_file in args.output_files + args.baselines: 100 | prefix = args.folder_prefix 101 | assert prefix in output_file 102 | average_file_name = output_file[output_file.find(prefix) + len(prefix):] 103 | while average_file_name[0] == '/': 104 | average_file_name = average_file_name[1:] 105 | data_folder = average_file_name[:average_file_name.find('/')] 106 | data_folder = process_folder_name(data_folder) 107 | if data_folder not in folder_structure: 108 | folder_structure[data_folder] = [] 109 | folder_structure[data_folder].append(output_file) 110 | else: 111 | folder_structure[None] = args.output_files 112 | 113 | to_table = [ 114 | # ('offline', 'heldout', 10000), 115 | ('online', 'cumulative-display', 10000), 116 | ] 117 | 118 | baselines = [] 119 | methods = [] 120 | 121 | all_data = {} 122 | for data_folder in sorted(folder_structure.keys()): 123 | output_files = folder_structure[data_folder] 124 | data = {} 125 | all_data[data_folder] = data 126 | file_names = [] 127 | click_models = [] 128 | value_names = [] 129 | if data_folder is None: 130 | print 'No data folders found, outputting directly.' 131 | else: 132 | print 'Found data folder: %s' % data_folder 133 | for output_file in output_files: 134 | print 'reading', output_file 135 | file_name = output_file.split('/')[-1] 136 | if file_name[-4:] == '.out': 137 | file_name = file_name[:-4] 138 | file_name = process_run_name(file_name) 139 | if output_file in args.baselines and file_name not in baselines: 140 | baselines.append(file_name) 141 | elif output_file not in args.baselines and file_name not in methods: 142 | methods.append(file_name) 143 | assert file_name not in data, '%s already in %s' % (file_name, data.keys()) 144 | data[file_name] = {} 145 | file_names.append(file_name) 146 | with open(output_file) as f: 147 | output = json.load(f) 148 | for name, value in output['runtimes'].items(): 149 | print name, 150 | print datetime.timedelta(seconds=value), 151 | print '(%d seconds)' % value 152 | data[file_name] = output['results'] 153 | for v_name in output['results']: 154 | if v_name not in value_names: 155 | value_names.append(v_name) 156 | for c_m in output['results'][v_name]: 157 | if c_m == 'indices': 158 | continue 159 | if c_m not in click_models: 160 | click_models.append(c_m) 161 | 162 | print 163 | 164 | print 'finished reading, found the following value types:' 165 | for name in value_names: 166 | print name 167 | print 168 | 169 | click_models = ['perfect', 'navigational', 'informational'] 170 | 171 | folder_order = sorted(folder_structure.keys()) 172 | for table_name, table_value, table_ind in to_table: 173 | table_data = {} 174 | for folder_name in folder_order: 175 | all_f_data = all_data[folder_name] 176 | f_data = {} 177 | table_data[folder_name] = f_data 178 | 179 | for c_m in click_models: 180 | c_data = {} 181 | max_v = np.NINF 182 | f_data[c_m] = c_data 183 | for b_name in baselines: 184 | b_data = all_data[folder_name][b_name][table_value] 185 | b_ind = np.array(b_data['indices']) 186 | if np.any(b_ind == table_ind): 187 | v_i = np.where(b_ind == table_ind)[0][0] 188 | else: 189 | diff = b_ind - table_ind 190 | v_i = np.argmax(diff[diff<=0]) 191 | v_mean = b_data[c_m]['mean'][v_i] 192 | v_std = b_data[c_m]['std'][v_i] 193 | 194 | max_v = max(max_v, v_mean) 195 | c_data[b_name] = (v_mean, v_std, None) 196 | 197 | for m_name in methods: 198 | m_data = 
all_data[folder_name][m_name][table_value] 199 | m_ind = np.array(m_data['indices']) 200 | if np.any(m_ind == table_ind): 201 | v_i = np.where(m_ind == table_ind)[0][0] 202 | else: 203 | diff = b=m_ind - table_ind 204 | v_i = np.argmax(diff[diff<=0]) 205 | v_mean = m_data[c_m]['mean'][v_i] 206 | v_std = m_data[c_m]['std'][v_i] 207 | 208 | sig = [] 209 | for b_name in baselines: 210 | b_mean, b_std, _ = c_data[b_name] 211 | sig.append(get_significance(b_mean, v_mean, b_std, v_std, 125)) 212 | 213 | max_v = max(max_v, v_mean) 214 | c_data[m_name] = (v_mean, v_std, sig) 215 | 216 | c_data['maximum'] = max_v 217 | 218 | out = OutputTable(table_name, args.table_folder) 219 | out.writeline('\\begin{tabular*}{\\textwidth}{@{\\extracolsep{\\fill} } l ', 'l ' 220 | * len(folder_order), '}') 221 | out.writeline('\\toprule') 222 | 223 | for data_folder in folder_order: 224 | out.write(' & { \\small \\textbf{%s}}' % data_folder) 225 | out.writeline('\\\\') 226 | 227 | for click_model in click_models: 228 | out.writeline('\\midrule') 229 | out.writeline('& \\multicolumn{%d}{|c|}{\\textit{%s}} \\\\' % (len(folder_order), click_model)) 230 | out.writeline('\\midrule') 231 | 232 | for name in baselines + methods: 233 | out.write(name) 234 | 235 | for folder in folder_order: 236 | v_max = round(table_data[folder][click_model]['maximum'], 1) 237 | v_mean, v_std, v_sig = table_data[folder][click_model][name] 238 | out.write('&') 239 | 240 | if round(v_mean, 1) >= v_max: 241 | out.write('\\bf') 242 | 243 | out.write('%0.01f {\\tiny (%0.01f)}' % (v_mean, v_std)) 244 | if not (v_sig is None): 245 | out.write(' '.join(v_sig)) 246 | 247 | out.writeline('\\\\') 248 | 249 | 250 | 251 | 252 | out.writeline('\\bottomrule') 253 | out.writeline('\\end{tabular*}') 254 | out.close() 255 | 256 | print 257 | print 258 | print 259 | 260 | 261 | 262 | 263 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/models/__init__.py -------------------------------------------------------------------------------- /models/evolutionneuralmodel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class EvolutionNeuralModel(object): 4 | 5 | def __init__(self, learning_rate, 6 | learning_rate_decay, 7 | hidden_layers, n_features, 8 | n_candidates): 9 | def normal(init, shape): 10 | safe_shape = (self.n_models,) + shape 11 | return np.random.normal(0., init, safe_shape) 12 | 13 | self.n_models = n_candidates + 1 14 | self.learning_rate = learning_rate 15 | self.hidden_layer_nodes = hidden_layers 16 | self.hidden_layers = [] 17 | self.biases = [] 18 | self.n_nodes = 0 19 | prev_units = n_features 20 | for n_units in hidden_layers: 21 | init = 1./prev_units 22 | self.hidden_layers.append(normal(init, (prev_units, n_units))) 23 | self.biases.append(normal(init, (1, n_units,))) 24 | self.n_nodes += (prev_units+1)*n_units 25 | prev_units = n_units 26 | self.hidden_layers.append(normal(1./prev_units, (prev_units, 1))) 27 | self.n_nodes += prev_units 28 | self.learning_rate_decay = learning_rate_decay 29 | 30 | def sample_candidates(self): 31 | assert self.n_models > 1 32 | n_cand = self.n_models-1 33 | vectors = np.random.randn(self.n_models-1, self.n_nodes) 34 | vector_norms = np.sum(vectors ** 2, axis=1) ** (1. 
/ 2) 35 | vectors /= vector_norms[:, None] 36 | vec_i = 0 37 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 38 | h_shape = hidden_layer.shape[1:3] 39 | n_matrix = np.prod(h_shape) 40 | n_bias = h_shape[1] 41 | matrix_noise = np.reshape(vectors[:, vec_i:vec_i+n_matrix], 42 | (n_cand, h_shape[0], h_shape[1])) 43 | vec_i += n_matrix 44 | bias_noise = np.reshape(vectors[:, vec_i:vec_i+n_bias], 45 | (n_cand, n_bias)) 46 | vec_i += n_bias 47 | 48 | hidden_layer[1:,:,:] = hidden_layer[0, None,:,:] + matrix_noise 49 | bias[1:, :] = bias[0, None, :] + bias_noise 50 | 51 | matrix_noise = vectors[:,vec_i:,None] 52 | self.hidden_layers[-1][1:,:,:] = self.hidden_layers[-1][0,None,:,:] + matrix_noise 53 | 54 | def score(self, features): 55 | return self._score(features, 0) 56 | 57 | def _score(self, features, model_i): 58 | prev_layer = features 59 | self.input = features 60 | self.activations = [prev_layer] 61 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 62 | prev_layer = np.dot(prev_layer, hidden_layer[model_i, :]) 63 | prev_layer += bias[model_i,:] 64 | prev_layer = 1./(1. + np.exp(-prev_layer)) 65 | self.activations.append(prev_layer) 66 | result = np.dot(prev_layer, self.hidden_layers[-1][model_i,: ]) 67 | self.activations.append(result) 68 | return result[:, 0] 69 | 70 | def candidate_score(self, features): 71 | scores = [] 72 | for i in range(self.n_models): 73 | scores.append(self._score(features, i)) 74 | return np.stack(scores, axis=0) 75 | 76 | def update_to_mean_winners(self, winners): 77 | assert self.n_models > 1 78 | if len(winners) > 0: 79 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 80 | average_layer = np.mean(hidden_layer[winners,:,:], axis=0) 81 | average_bias = np.mean(bias[winners,:], axis=0) 82 | 83 | layer_gradient = (average_layer - hidden_layer[0,:,:]) 84 | bias_gradient = (average_bias - bias[0,:]) 85 | 86 | hidden_layer[0,:,:] += self.learning_rate*layer_gradient 87 | bias[0,:] += self.learning_rate*bias_gradient 88 | 89 | average_layer = np.mean(self.hidden_layers[-1][winners,:,:], axis=0) 90 | layer_gradient = (average_layer - self.hidden_layers[-1][0,:,:]) 91 | self.hidden_layers[-1][0,:,:] += self.learning_rate*layer_gradient 92 | 93 | self.learning_rate *= self.learning_rate_decay 94 | -------------------------------------------------------------------------------- /models/linearmodel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sympy import Matrix 3 | from scipy.linalg import norm 4 | def sample_with_basis(M): 5 | weight = np.random.normal(0, 1, len(M)) 6 | v = weight.dot(M) 7 | # print(v) 8 | v /= norm(v) 9 | return v 10 | 11 | class LinearModel(object): 12 | def __init__(self, n_features, learning_rate, 13 | n_candidates=0, learning_rate_decay=1.0): 14 | self.n_features = n_features 15 | self.learning_rate = learning_rate 16 | self.n_models = n_candidates + 1 17 | self.weights = np.zeros((n_features, self.n_models)) 18 | self.learning_rate_decay = learning_rate_decay 19 | 20 | def copy(self): 21 | copy = LinearModel(n_features = self.n_features, 22 | learning_rate = self.learning_rate, 23 | n_candidates = self.n_models-1) 24 | copy.weights = self.weights.copy() 25 | return copy 26 | 27 | def candidate_score(self, features): 28 | self._last_features = features 29 | return np.dot(features, self.weights).T 30 | 31 | def score(self, features): 32 | self._last_features = features 33 | return np.dot(features, 
self.weights[:,0:1])[:,0] 34 | 35 | def sample_candidates(self): 36 | assert self.n_models > 1 37 | vectors = np.random.randn(self.n_features, self.n_models-1) 38 | vector_norms = np.sum(vectors ** 2, axis=0) ** (1. / 2) 39 | vectors /= vector_norms[None, :] 40 | self.weights[:, 1:] = self.weights[:, 0, None] + vectors 41 | 42 | def update_to_mean_winners(self, winners, viewed_list=None): 43 | assert self.n_models > 1 44 | if len(winners) > 0: 45 | # print 'winners:', winners 46 | gradient = np.mean(self.weights[:, winners], axis=1) - self.weights[:, 0] 47 | 48 | # Added for projection 49 | if viewed_list is not None and len(viewed_list)>0: 50 | gradient = self.project_to_viewed_doc(gradient,viewed_list) 51 | 52 | self.weights[:, 0] += self.learning_rate * gradient 53 | self.learning_rate *= self.learning_rate_decay 54 | 55 | def update_to_documents(self, doc_ind, doc_weights): 56 | weighted_docs = self._last_features[doc_ind, :] * doc_weights[:, None] 57 | gradient = np.sum(weighted_docs, axis=0) 58 | self.weights[:, 0] += self.learning_rate * gradient 59 | self.learning_rate *= self.learning_rate_decay 60 | 61 | 62 | def project_to_viewed_doc(self, winning_gradient, viewed_list): 63 | # Make projections to each of viewed document as basis vector 64 | gradient_proj = np.zeros(self.n_features) 65 | 66 | # viewed_list has each row as the basis, so it is the transpose of columnspace M 67 | basis_trans = np.matrix.transpose(np.asarray(viewed_list)) 68 | 69 | # SVD decomposition, column of both u_ and vh_ is orthogonal basis of columnspace of input 70 | # Use u matrix for basis, as u_ is 'document-to-concept' simialrity 71 | # vh_ is 'feature-to-concept' similarity 72 | u_,s_,vh_ = np.linalg.svd(np.asarray(basis_trans), full_matrices=False) 73 | # transpose to row space 74 | basis_list = np.matrix.transpose(np.asarray(u_)) 75 | 76 | 77 | # proj_g onto x = dot(x,g)/|x|^2 x 78 | for basis in basis_list: 79 | len_basis = np.sqrt(basis.dot(basis)) 80 | # len_basis = np.sqrt(sum(k*k for k in basis)) # could take out np.sqrt and square in next line 81 | gradient_proj += np.dot(basis, winning_gradient) / (len_basis * len_basis) * basis 82 | 83 | # Normalize 84 | norm = np.linalg.norm(gradient_proj) 85 | if norm > 0: 86 | gradient_proj = gradient_proj / norm 87 | 88 | return gradient_proj 89 | 90 | 91 | # sample candidate from null space for NSGD 92 | def sample_candidates_null_space(self, grads, features, withBasis=False): 93 | assert self.n_models > 1 94 | # vectors = np.random.randn(self.n_features, self.n_models-1) 95 | # vector_norms = np.sum(vectors ** 2, axis=0) ** (1. 
/ 2) 96 | # vectors /= vector_norms[None, :] 97 | # self.weights[:, 1:] = self.weights[:, 0, None] + vectors 98 | 99 | N = Matrix(grads).nullspace() # get null space of gradient matrix 100 | newN = np.array(N).astype(np.float64) 101 | for i in range(0, len(newN)): 102 | norm = np.linalg.norm(newN[i]) 103 | if norm > 0: 104 | newN[i] = newN[i]/norm 105 | 106 | # sample vectors normally from the nullspace 107 | if withBasis: 108 | # sample with basis 109 | nsVecs = [sample_with_basis(newN) for i in range(2*self.n_models)] 110 | else: 111 | # Directly sample from null space 112 | nsVecs = [newN[randint(0, len(N) - 1)] for i in range(2*self.n_models)] 113 | 114 | # get average candidate document feature vector 115 | avgdocfeat = [sum(feat)/len(feat) for feat in zip(*features)] 116 | # sort vectors by dot product (decreasing absolute value) 117 | nsVecs = sorted(nsVecs, key=lambda vec: abs(np.dot(vec, avgdocfeat)), reverse=True) 118 | 119 | self.gs = np.array(nsVecs[:self.n_models-1]) 120 | self.weights[:, 1:] = self.weights[:, 0, None] + self.gs.T -------------------------------------------------------------------------------- /models/neuralmodel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class NeuralModel(object): 4 | 5 | def __init__(self, learning_rate, 6 | learning_rate_decay, 7 | hidden_layers, n_features): 8 | def normal(init, shape): 9 | return np.random.normal(0., init, shape) 10 | 11 | self.learning_rate = learning_rate 12 | self.hidden_layer_nodes = hidden_layers 13 | self.hidden_layers = [] 14 | self.biases = [] 15 | prev_units = n_features 16 | for n_units in hidden_layers: 17 | init = 1./prev_units 18 | self.hidden_layers.append(normal(init, (prev_units, n_units))) 19 | self.biases.append(normal(init, n_units)[None, :]) 20 | prev_units = n_units 21 | self.hidden_layers.append(normal(1./prev_units, (prev_units, 1))) 22 | self.learning_rate_decay = learning_rate_decay 23 | 24 | def score(self, features): 25 | prev_layer = features 26 | self.input = features 27 | self.activations = [prev_layer] 28 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 29 | prev_layer = np.dot(prev_layer, hidden_layer) 30 | prev_layer += bias 31 | prev_layer = 1./(1. 
+ np.exp(-prev_layer)) 32 | self.activations.append(prev_layer) 33 | result = np.dot(prev_layer, self.hidden_layers[-1]) 34 | self.activations.append(result) 35 | return result[:, 0] 36 | 37 | def backpropagate(self, doc_ind, doc_weights): 38 | activations = [a[doc_ind, :] for a in self.activations] 39 | doc_weights = np.expand_dims(doc_weights, axis=1) 40 | cur_der = (np.dot(activations[-2].T, doc_weights), None) 41 | derivatives = [cur_der] 42 | prev_der = doc_weights 43 | for i in range(len(self.hidden_layers)-1): 44 | prev_der = np.dot(prev_der, self.hidden_layers[-i-1].T) 45 | prev_der *= activations[-i-2]*(1.-activations[-i-2]) 46 | 47 | w_der = np.dot(activations[-i-3].T, prev_der) 48 | b_der = np.sum(prev_der, axis=0, keepdims=True) 49 | 50 | derivatives.append((w_der, b_der)) 51 | 52 | return derivatives 53 | 54 | def debugstr(self): 55 | for i, hd in enumerate(self.hidden_layers[:-1]): 56 | print 'layer %d:' % i, hd 57 | print 'bias %d:' % i, self.biases[i] 58 | print 'final hidden:', self.hidden_layers[-1] 59 | 60 | 61 | def update_to_documents(self, doc_ind, doc_weights): 62 | derivatives = self.backpropagate(doc_ind, doc_weights) 63 | 64 | first_wd = derivatives[0][0] 65 | self.hidden_layers[-1] += first_wd * self.learning_rate 66 | for i, (wd, bd) in enumerate(derivatives[1:], 2): 67 | self.hidden_layers[-i] += wd * self.learning_rate 68 | self.biases[-i + 1] += bd * self.learning_rate 69 | self.learning_rate *= self.learning_rate_decay 70 | -------------------------------------------------------------------------------- /models/neuralnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class NeuralModel(object): 4 | 5 | def __init__(self, learning_rate, hidden_layers, n_features, 6 | regularize_rate=0., n_output=1): 7 | self.learning_rate = learning_rate 8 | self.regularize_rate = regularize_rate 9 | self.hidden_layer_nodes = hidden_layers 10 | self.hidden_layers = [] 11 | self.biases = [] 12 | prev_units = n_features 13 | for n_units in hidden_layers: 14 | self.hidden_layers.append(np.random.normal(0., 1./prev_units, (prev_units, n_units))) 15 | self.biases.append(np.random.normal(0., 1./prev_units, n_units)[None, :]) 16 | prev_units = n_units 17 | self.hidden_layers.append(np.random.normal(0., 1./prev_units, (prev_units, n_output))) 18 | 19 | def score(self, input): 20 | prev_layer = input.T 21 | self.input = input 22 | self.activations = [prev_layer] 23 | for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 24 | prev_layer = np.dot(prev_layer, hidden_layer) 25 | prev_layer += bias 26 | # prev_layer = np.maximum(0., prev_layer) 27 | prev_layer = 1./(1. + np.exp(-prev_layer)) 28 | self.activations.append(prev_layer) 29 | result = np.dot(prev_layer, self.hidden_layers[-1]) 30 | self.activations.append(result) 31 | return result 32 | 33 | # def predict(self, input): 34 | # prev_layer = input.T 35 | # for hidden_layer, bias in zip(self.hidden_layers[:-1], self.biases): 36 | # prev_layer = np.dot(prev_layer, hidden_layer) 37 | # prev_layer += bias 38 | # # prev_layer = np.maximum(0, prev_layer) 39 | # prev_layer = 1./(1. 
+ np.exp(-prev_layer)) 40 | # return np.dot(prev_layer, self.hidden_layers[-1]) 41 | 42 | def backpropagate(self, doc_weights): 43 | activations = self.activations 44 | doc_weights = np.expand_dims(doc_weights, axis=1) 45 | cur_der = (np.dot(activations[-2].T, doc_weights), None) 46 | derivatives = [cur_der] 47 | prev_der = doc_weights 48 | for i in range(len(self.hidden_layers)-1): 49 | prev_der = np.dot(prev_der, self.hidden_layers[-i-1].T) 50 | # prev_der[activations[-i-2] <= 0] = 0 51 | prev_der *= activations[-i-2]*(1.-activations[-i-2]) 52 | 53 | w_der = np.dot(activations[-i-3].T, prev_der) 54 | b_der = np.sum(prev_der, axis=0, keepdims=True) 55 | 56 | derivatives.append((w_der, b_der)) 57 | 58 | return derivatives 59 | 60 | def debugstr(self): 61 | for i, hd in enumerate(self.hidden_layers[:-1]): 62 | print 'layer %d:' % i, hd 63 | print 'bias %d:' % i, self.biases[i] 64 | print 'final hidden:', self.hidden_layers[-1] 65 | 66 | 67 | def update_to_documents(self, doc_weights): 68 | derivatives = self.backpropagate(doc_weights) 69 | 70 | first_wd = derivatives[0][0] 71 | self.hidden_layers[-1] += first_wd * self.learning_rate 72 | for i, (wd, bd) in enumerate(derivatives[1:], 2): 73 | self.hidden_layers[-i] += wd * self.learning_rate 74 | self.biases[-i + 1] += bd * self.learning_rate 75 | 76 | 77 | # def regularize_update(self): 78 | # rate = self.regularize_rate 79 | # if rate != 0: 80 | # self.hidden_layers[-1] -= rate * self.hidden_layers[-1] 81 | # for i in range(len(self.hidden_layers) - 1): 82 | # self.hidden_layers[i] -= rate * self.hidden_layers[i] 83 | # # self.biases[i] -= rate * self.biases[i] * 0.1 84 | 85 | -------------------------------------------------------------------------------- /multileaving/PairwisePreferenceMultileave.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | 6 | class PairwisePreferenceMultileave(object): 7 | 8 | def __init__(self, num_data_features, k=10): 9 | self._name = 'Pairwise Preferences Multileave' 10 | self._k = k 11 | self.needs_inverted = True 12 | self.needs_descending = True 13 | self.needs_oracle = False 14 | self.vector_aggregation = False 15 | 16 | def clean(self): 17 | del self._last_inverted_rankings 18 | 19 | def top_rank(self, multileaving, top_docs): 20 | n_disp = multileaving.shape[0] 21 | top_rank = np.zeros(n_disp, dtype=np.int32) 22 | top_rank[:] = n_disp 23 | for i in range(n_disp): 24 | in_rank = np.in1d(multileaving, top_docs[:,i]) 25 | top_rank[in_rank] = np.minimum(top_rank[in_rank],i) 26 | return top_rank 27 | 28 | def make_multileaving(self, descending_rankings, inverted_rankings): 29 | self._last_inverted_rankings = inverted_rankings 30 | self._last_descending_rankings = descending_rankings 31 | self._last_n_rankers = inverted_rankings.shape[0] 32 | 33 | n_docs = descending_rankings.shape[1] 34 | n_rankers = descending_rankings.shape[0] 35 | length = min(self._k,n_docs) 36 | multileaving = np.zeros(length, dtype=np.int32) 37 | previous_set = np.array([], dtype=np.int32) 38 | previous_results = {} 39 | self._last_choice_sizes = np.zeros(length) 40 | for i in range(length): 41 | full_set = np.unique(descending_rankings[:,:i+1]) 42 | cur_set = np.setdiff1d(full_set, multileaving[:i], assume_unique=True) 43 | multileaving[i] = np.random.choice(cur_set,1) 44 | self._last_choice_sizes[i] = cur_set.shape[0] 45 | self._last_top_ranks = self.top_rank(multileaving, descending_rankings) 46 | return multileaving 47 | 48 | def 
infer_preferences(self, result_list, clicked_docs): 49 | if np.any(clicked_docs): 50 | return self.preferences_of_list(result_list, clicked_docs.astype(bool)) 51 | else: 52 | return np.zeros((self._last_n_rankers, self._last_n_rankers)) 53 | 54 | def preferences_of_list(self, result_list, clicked_docs): 55 | n_disp = result_list.shape[0] 56 | n_rankers = self._last_n_rankers 57 | included = np.ones(min(self._k, clicked_docs.shape[0])) 58 | if not clicked_docs[-1]: 59 | included[1:] = np.cumsum(clicked_docs[::-1])[:0:-1] 60 | neg_pref = np.where(np.logical_xor(clicked_docs, included))[0] 61 | pos_pref = np.where(clicked_docs)[0] 62 | 63 | pair_neg = np.repeat(neg_pref, pos_pref.shape[0]) 64 | pair_pos = np.tile(pos_pref, neg_pref.shape[0]) 65 | 66 | pair_min_pos = np.minimum(pair_pos, pair_neg) 67 | pair_max_rank = np.maximum(self._last_top_ranks[pair_neg], self._last_top_ranks[pair_pos]) 68 | allowed_pairs = pair_min_pos >= pair_max_rank 69 | 70 | n_allowed_pairs = np.sum(allowed_pairs) 71 | if n_allowed_pairs > 0: 72 | pos_allow = pair_pos[allowed_pairs] 73 | neg_allow = pair_neg[allowed_pairs] 74 | pair_ind_pos = result_list[pos_allow] 75 | pair_ind_neg = result_list[neg_allow] 76 | 77 | pair_prob_comp = np.zeros(n_allowed_pairs) 78 | for i in range(n_allowed_pairs): 79 | pair_top = sorted([self._last_top_ranks[pos_allow[i]],self._last_top_ranks[neg_allow[i]]]) 80 | pair_prob_comp[i] = 1./np.prod(1.-1./self._last_choice_sizes[pair_top[0]:pair_top[1]]) 81 | 82 | correct_pairs = self._last_inverted_rankings[:, pair_ind_neg] \ 83 | - self._last_inverted_rankings[:, pair_ind_pos] > 0 84 | 85 | total_correct = np.sum(correct_pairs * pair_prob_comp, axis=1) \ 86 | / n_allowed_pairs 87 | 88 | else: 89 | total_correct = np.zeros(self._last_inverted_rankings.shape[0]) 90 | 91 | return total_correct[:,None] - total_correct[None,:] 92 | -------------------------------------------------------------------------------- /multileaving/ProbabilisticMultileave.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | 6 | class ProbabilisticMultileave(object): 7 | 8 | def __init__(self, n_samples=10000, n_results=10, tau=3.0): 9 | self._name = 'Probabilistic Multileave' 10 | self._k = n_results 11 | self._tau = tau 12 | self._n_samples = n_samples 13 | self.uses_inverted_rankings = True 14 | self.needs_inverted = True 15 | self.needs_descending = False 16 | self.needs_oracle = False 17 | self.vector_aggregation = False 18 | 19 | def clean(self): 20 | del self._last_inverted_rankings 21 | 22 | def make_multileaving(self, inverted_rankings): 23 | ''' 24 | ARGS: (all np.array of docids) 25 | - inverted_rankings: matrix (rankers x documents) where [x,y] corresponds to the rank of doc y in ranker x 26 | 27 | RETURNS 28 | - ranking of indices corresponding to inverted_rankings 29 | ''' 30 | self._last_inverted_rankings = inverted_rankings 31 | self._last_n_rankers = inverted_rankings.shape[0] 32 | n = inverted_rankings.shape[1] 33 | k = min(n, self._k) 34 | 35 | unnorm_probs = 1. 
/ (inverted_rankings + 1) ** self._tau
36 |         denom = np.sum(unnorm_probs, axis=1)
37 | 
38 |         ranking = np.empty(k, dtype=np.int32)
39 |         ind = np.arange(n)
40 |         for i in range(k):
41 |             norm_probs = unnorm_probs / denom[:, None]
42 |             probs = np.mean(norm_probs, axis=0)
43 |             choice = np.random.choice(ind, p=probs, replace=False)
44 |             ranking[i] = choice
45 |             denom -= unnorm_probs[:, choice]
46 |             unnorm_probs[:, choice] = 0
47 | 
48 |         self._last_ranking = ranking
49 |         return ranking
50 | 
51 |     def infer_preferences(self, clicked_docs):
52 |         if np.any(clicked_docs):
53 |             return self.preferences_of_list(self.probability_of_list(self._last_ranking,
54 |                                             self._last_inverted_rankings,
55 |                                             clicked_docs.astype(bool), self._tau), self._n_samples)
56 |         else:
57 |             return np.zeros((self._last_n_rankers, self._last_n_rankers))
58 | 
59 |     def winning_rankers(self, clicked_docs):
60 |         match = self.infer_preferences(clicked_docs)
61 |         return np.where(match[:, 0] > 0)[0]
62 | 
63 |     def probability_of_list(self, result_list, inverted_rankings, clicked_docs, tau):
64 |         '''
65 |         ARGS: (all np.array of docids)
66 |         - result_list: the multileaved list
67 |         - inverted_rankings: matrix (rankers x documents) where [x,y] corresponds to the rank of doc y in ranker x
68 |         - clicked_docs: boolean array of result_list length indicating clicks
69 | 
70 |         RETURNS
71 |         - sigmas: matrix (rankers x clicked_docs) with the probability that each ranker added each clicked doc
72 |         '''
73 |         n_docs = inverted_rankings.shape[1]
74 |         n_rankers = inverted_rankings.shape[0]
75 | 
76 |         click_doc_ind = result_list[clicked_docs]
77 | 
78 |         # normalization denominator for the complete ranking
79 |         sigmoid_total = np.sum(float(1) / (np.arange(n_docs) + 1) ** self._tau)
80 | 
81 | 
82 |         # cumsum is used to renormalize the probs, it contains the part of
83 |         # the denominator that has to be removed due to previously added docs
84 |         cumsum = np.zeros((n_rankers, result_list.shape[0]))
85 |         cumsum[:, 1:] = np.cumsum(float(1) / (inverted_rankings[:, result_list[:-1]] + 1.)
86 |                                   ** self._tau, axis=1)
87 | 
88 |         # make sure inverted rankings is of dtype float
89 |         sigmas = 1 / (inverted_rankings[:, click_doc_ind].T + 1.) 
** self._tau 90 | sigmas /= sigmoid_total - cumsum[:, clicked_docs].T 91 | 92 | return sigmas / np.sum(sigmas, axis=1)[:, None] 93 | 94 | def preferences_of_list(self, probs, n_samples): 95 | ''' 96 | ARGS: 97 | -probs: clicked docs x rankers matrix with probabilities ranker added clicked doc (use probability_of_list) 98 | -n_samples: number of samples to base preference matrix on 99 | 100 | RETURNS: 101 | - preference matrix: matrix (rankers x rankers) in this matrix [x,y] > 0 means x won over y and [x,y] < 0 means x lost from y 102 | the value is analogous to the (average) degree of preference 103 | ''' 104 | 105 | n_clicks = probs.shape[0] 106 | n_rankers = probs.shape[1] 107 | # determine upper bounds for each ranker (to see prob distribution as set of ranges) 108 | upper = np.cumsum(probs, axis=1) 109 | 110 | # determine lower bounds 111 | lower = np.zeros(probs.shape) 112 | # lower[:,0] = 0 113 | lower[:, 1:] += upper[:, :-1] 114 | 115 | # flip coins, coins fall between lower and upper 116 | coinflips = np.random.rand(n_clicks, self._n_samples) 117 | # make copies for each sample and each ranker 118 | comps = coinflips[:, :, None] 119 | # determine where each coin landed 120 | log_assign = np.logical_and(comps > lower[:, None, :], comps <= upper[:, None, :]) 121 | # click count per ranker (samples x rankers) 122 | click_count = np.sum(log_assign, axis=0) 123 | # the preference matrix for each sample 124 | prefs = np.sign(click_count[:, :, None] - click_count[:, None, :]) 125 | 126 | # the preferences are averaged for each pair 127 | # in this matrix [x,y] > 0 means x won over y and [x,y] < 0 means x lost from y 128 | # the value is analogous to the (average) degree of preference 129 | return np.sum(prefs, axis=0) / float(self._n_samples) 130 | -------------------------------------------------------------------------------- /multileaving/TeamDraftMultileave.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | 6 | class TeamDraftMultileave(object): 7 | 8 | def __init__(self, n_results=10): 9 | self._name = 'Team-Draft Multileave' 10 | self._k = n_results 11 | self.uses_inverted_rankings = False 12 | self.needs_inverted = False 13 | self.needs_descending = True 14 | self.needs_oracle = False 15 | self.vector_aggregation = False 16 | 17 | def clean(self): 18 | del self.teams 19 | 20 | def next_index_to_add(self, inter_result, inter_n, ranking, index): 21 | while index < ranking.shape[0] and np.any(ranking[index] == inter_result[:inter_n]): 22 | index += 1 23 | return index 24 | 25 | def make_multileaving(self, descending_rankings): 26 | 27 | rankings = descending_rankings 28 | 29 | n_rankings = rankings.shape[0] 30 | k = min(self._k, rankings.shape[1]) 31 | teams = np.zeros(k, dtype=np.int32) 32 | multileaved = np.zeros(k, dtype=np.int32) 33 | 34 | multi_i = 0 35 | while multi_i < k and np.all(rankings[1:, multi_i] == rankings[0, multi_i]): 36 | multileaved[multi_i] = rankings[0][multi_i] 37 | teams[multi_i] = -1 38 | multi_i += 1 39 | 40 | indices = np.zeros(n_rankings, dtype=np.int32) + multi_i 41 | assignment = np.arange(n_rankings) 42 | assign_i = n_rankings 43 | while multi_i < k: 44 | if assign_i == n_rankings: 45 | np.random.shuffle(assignment) 46 | assign_i = 0 47 | 48 | rank_i = assignment[assign_i] 49 | indices[rank_i] = self.next_index_to_add(multileaved, multi_i, 50 | rankings[rank_i,:], 51 | indices[rank_i]) 52 | multileaved[multi_i] = rankings[rank_i, indices[rank_i]] 53 | 
teams[multi_i] = rank_i 54 | indices[rank_i] += 1 55 | multi_i += 1 56 | assign_i += 1 57 | 58 | self.teams = teams 59 | self.n_rankers = n_rankings 60 | return multileaved 61 | 62 | def infer_preferences(self, clicked_docs): 63 | clicked_docs = clicked_docs.astype(bool) 64 | assigned_clicks = np.sum(np.arange(self.n_rankers)[:,None] == self.teams[clicked_docs][None,:],axis=1) 65 | return np.sign(assigned_clicks[:,None] - assigned_clicks[None,:]) 66 | 67 | def winning_rankers(self, clicked_docs): 68 | ranker_range = np.arange(self.n_rankers) 69 | match_matrix = ranker_range[:,None] == self.teams[clicked_docs][None,:] 70 | ranker_clicks = np.sum(match_matrix.astype(np.int32), axis=1) 71 | # print self.teams, clicked_docs.astype(int), 72 | # print ranker_range[ranker_clicks[0] < ranker_clicks] 73 | return ranker_range[ranker_clicks[0] < ranker_clicks] 74 | 75 | def winning_rankers_with_clicks(self, clicked_docs): 76 | # Return click info as well 77 | ranker_range = np.arange(self.n_rankers) 78 | match_matrix = ranker_range[:,None] == self.teams[clicked_docs][None,:] 79 | ranker_clicks = np.sum(match_matrix.astype(np.int32), axis=1) 80 | # print self.teams, clicked_docs.astype(int), 81 | # print ranker_range[ranker_clicks[0] < ranker_clicks] 82 | return ranker_range[ranker_clicks[0] < ranker_clicks], ranker_clicks 83 | -------------------------------------------------------------------------------- /multileaving/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/multileaving/__init__.py -------------------------------------------------------------------------------- /scripts/CIKM2018.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | 17 | description = 'Run script for testing framework.' 
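# All run scripts in scripts/ follow the same pattern: build a parser, call
# parse_all_args() once per hyper-parameter configuration, and collect
# (run_name, RankerClass, other_args) tuples before starting the simulation.
# A minimal sketch of that pattern (the ranker choice and parameter values
# here are illustrative only, not an additional experiment):
#
#     parser = SimulationArgumentParser(description='minimal example')
#     sim_args, other_args = parser.parse_all_args({'learning_rate_decay': 0.9999977})
#     rankers = [('example/TD-DBGD', TD_DBGD, other_args)]
#     DataSimulation(sim_args).run(rankers)
#
# parse_all_args returns an argparse.Namespace of simulation-level options and
# a dict of ranker arguments (remaining command-line values merged with the
# overrides passed in); see utils/argparsers/simulationargparser.py.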
18 | parser = SimulationArgumentParser(description=description) 19 | 20 | rankers = [] 21 | 22 | ranker_params = { 23 | 'learning_rate_decay': 0.9999977} 24 | sim_args, other_args = parser.parse_all_args(ranker_params) 25 | 26 | # run_name = 'speedtest/TD-DBGD' 27 | # rankers.append((run_name, TD_DBGD, other_args)) 28 | 29 | run_name = 'CIKM2018/P-DBGD' 30 | rankers.append((run_name, P_DBGD, other_args)) 31 | 32 | run_name = 'CIKM2018/DeepP-DBGD' 33 | rankers.append((run_name, Neural_P_DBGD, other_args)) 34 | 35 | # run_name = 'speedtest/TD-MGD' 36 | # rankers.append((run_name, TD_MGD, other_args)) 37 | 38 | run_name = 'CIKM2018/P-MGD' 39 | rankers.append((run_name, P_MGD, other_args)) 40 | 41 | ranker_params = { 42 | 'learning_rate_decay': 0.9999977, 43 | 'epsilon': 0.8} 44 | sim_args, other_args = parser.parse_all_args(ranker_params) 45 | 46 | run_name = 'CIKM2018/Pairwise' 47 | rankers.append((run_name, Pairwise, other_args)) 48 | 49 | ranker_params = { 50 | 'learning_rate': 0.1, 51 | 'learning_rate_decay': 0.9999977, 52 | } 53 | sim_args, other_args = parser.parse_all_args(ranker_params) 54 | 55 | run_name = 'CIKM2018/PDGD' 56 | rankers.append((run_name, PDGD, other_args)) 57 | 58 | run_name = 'CIKM2018/DeepPDGD' 59 | rankers.append((run_name, DeepPDGD, other_args)) 60 | 61 | sim = DataSimulation(sim_args) 62 | sim.run(rankers) -------------------------------------------------------------------------------- /scripts/Poisoning_attacks/attack_DBGD_99_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
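# Example invocation of this attack script (the dataset name and click model
# are placeholders; --data_sets and --click_models are required, while the
# attack-specific flags shown are optional and default to the values defined
# in utils/argparsers/simulationargparser.py):
#
#     python scripts/Poisoning_attacks/attack_DBGD_99_lr.py \
#         --data_sets MQ2007 --click_models inf \
#         --attacker_click_model naive_intersection_attack \
#         --start 0 --end 5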
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.99} 29 | 30 | sim_args, other_args = parser.parse_all_args(ranker_params) 31 | run_name = 'attack/TD_DBGD' 32 | 33 | rankers.append((run_name, TD_DBGD, other_args)) 34 | 35 | sim = DataSimulation(sim_args) 36 | sim.run(rankers) 37 | -------------------------------------------------------------------------------- /scripts/Poisoning_attacks/attack_DBGD_base_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977} 29 | 30 | sim_args, other_args = parser.parse_all_args(ranker_params) 31 | run_name = 'attack/TD_DBGD' 32 | 33 | rankers.append((run_name, TD_DBGD, other_args)) 34 | 35 | sim = DataSimulation(sim_args) 36 | sim.run(rankers) 37 | -------------------------------------------------------------------------------- /scripts/Poisoning_attacks/attack_MGD_99_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
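# The '99' in this script's name refers to the aggressive learning-rate decay
# set below (0.99), versus the 0.9999977 used by the *_base_lr variants.
# Assuming the decay is applied once per impression, the rough effect on the
# step-size factor over a 10,000-impression run is:
#
#     0.99      ** 10000  ~  2e-44   (updates vanish almost immediately)
#     0.9999977 ** 10000  ~  0.977   (step size stays nearly constant)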
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.99} 29 | 30 | sim_args, other_args = parser.parse_all_args(ranker_params) 31 | run_name = 'attack/TD_MGD' 32 | 33 | rankers.append((run_name, TD_MGD, other_args)) 34 | 35 | sim = DataSimulation(sim_args) 36 | sim.run(rankers) 37 | -------------------------------------------------------------------------------- /scripts/Poisoning_attacks/attack_MGD_base_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977} 29 | 30 | sim_args, other_args = parser.parse_all_args(ranker_params) 31 | run_name = 'attack/TD_MGD' 32 | 33 | rankers.append((run_name, TD_MGD, other_args)) 34 | 35 | sim = DataSimulation(sim_args) 36 | sim.run(rankers) 37 | -------------------------------------------------------------------------------- /scripts/SIGIR2018.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
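# The NSGD-specific parameters below map onto the three NSGD steps summarized
# in the README (this mapping is our reading of the parameter names, not a
# statement from the paper): GRAD_SIZE bounds the pool of past poorly
# performing directions whose null space is computed, EXP_SIZE is the number
# of candidate directions sampled from that null space, and TB_QUEUE_SIZE /
# TB_WINDOW_SIZE size the queue of historically difficult queries and the
# history window used by the tie-breaking step.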
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977} 29 | sim_args, other_args = parser.parse_all_args(ranker_params) 30 | 31 | run_name = 'SIGIR2018/TD-DBGD' 32 | rankers.append((run_name, TD_DBGD, other_args)) 33 | 34 | run_name = 'SIGIR2018/TD-MGD' 35 | rankers.append((run_name, TD_MGD, other_args)) 36 | 37 | ranker_params = { 38 | 'learning_rate_decay': 0.9999977, 39 | 'GRAD_SIZE':60, 40 | 'EXP_SIZE':25, 41 | 'TB_QUEUE_SIZE':10, 42 | 'TB_WINDOW_SIZE':50} 43 | sim_args, other_args = parser.parse_all_args(ranker_params) 44 | 45 | run_name = 'SIGIR2018/TD_NSGD' 46 | rankers.append((run_name, TD_NSGD, other_args)) 47 | 48 | 49 | 50 | 51 | sim = DataSimulation(sim_args) 52 | sim.run(rankers) -------------------------------------------------------------------------------- /scripts/SIGIR2019.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
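# Note on the DSP parameter blocks below: 'k_initial' and 'k_increase'
# presumably control the number of document feature vectors spanning the
# projection space, and the key 'prev_qeury_len' is passed through verbatim to
# the DSP rankers, so its spelling should only be changed together with the
# ranker implementations that consume it.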
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977} 29 | sim_args, other_args = parser.parse_all_args(ranker_params) 30 | 31 | run_name = 'SIGIR2019/P-DBGD' 32 | rankers.append((run_name, P_DBGD, other_args)) 33 | 34 | run_name = 'SIGIR2019/P-MGD' 35 | rankers.append((run_name, P_MGD, other_args)) 36 | 37 | ranker_params = { 38 | 'learning_rate_decay': 0.9999977, 39 | 'GRAD_SIZE':60, 40 | 'EXP_SIZE':25, 41 | 'TB_QUEUE_SIZE':10, 42 | 'TB_WINDOW_SIZE':50} 43 | sim_args, other_args = parser.parse_all_args(ranker_params) 44 | 45 | run_name = 'SIGIR2019/TD_NSGD' 46 | rankers.append((run_name, TD_NSGD, other_args)) 47 | 48 | 49 | # DBGD based algorithms with document space projection 50 | ranker_params = { 51 | 'learning_rate_decay': 0.9999977, 52 | 'k_initial': 3, 53 | 'k_increase': False, 54 | 'prev_qeury_len': 10} 55 | sim_args, other_args = parser.parse_all_args(ranker_params) 56 | 57 | run_name = 'SIGIR2019/P_DBGD_DSP' 58 | rankers.append((run_name, P_DBGD_DSP, other_args)) 59 | 60 | run_name = 'SIGIR2019/P_MGD_DSP' 61 | rankers.append((run_name, P_MGD_DSP, other_args)) 62 | 63 | 64 | 65 | # NSGD with document space projection 66 | ranker_params = { 67 | 'learning_rate_decay': 0.9999977, 68 | 'k_initial': 3, 69 | 'k_increase': False, 70 | 'GRAD_SIZE':60, 71 | 'EXP_SIZE':25, 72 | 'TB_QUEUE_SIZE':10, 73 | 'TB_WINDOW_SIZE':50, 74 | 'prev_qeury_len': 10} 75 | sim_args, other_args = parser.parse_all_args(ranker_params) 76 | 77 | run_name = 'SIGIR2019/TD_NSGD_DSP' 78 | rankers.append((run_name, TD_NSGD_DSP, other_args)) 79 | 80 | 81 | 82 | sim = DataSimulation(sim_args) 83 | sim.run(rankers) -------------------------------------------------------------------------------- /scripts/SIGIR2019_nsgd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 6 | from utils.datasimulation import DataSimulation 7 | from utils.argparsers.simulationargparser import SimulationArgumentParser 8 | from algorithms.PDGD.pdgd import PDGD 9 | from algorithms.PDGD.deeppdgd import DeepPDGD 10 | from algorithms.DBGD.tddbgd import TD_DBGD 11 | from algorithms.DBGD.pdbgd import P_DBGD 12 | from algorithms.DBGD.tdmgd import TD_MGD 13 | from algorithms.DBGD.pmgd import P_MGD 14 | from algorithms.baselines.pairwise import Pairwise 15 | from algorithms.DBGD.neural.pdbgd import Neural_P_DBGD 16 | from algorithms.DBGD.pdbgd_dsp import P_DBGD_DSP 17 | from algorithms.DBGD.pmgd_dsp import P_MGD_DSP 18 | from algorithms.DBGD.tdNSGD import TD_NSGD 19 | from algorithms.DBGD.tdNSGD_dsp import TD_NSGD_DSP 20 | 21 | description = 'Run script for testing framework.' 
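# This script re-runs only the NSGD subset of scripts/SIGIR2019.py (TD_NSGD
# and its DSP variant); the slurm files under scripts/slurm/SIGIR2019/nsgd/
# submit it per dataset. A typical submission from the repository root:
#
#     sbatch scripts/slurm/SIGIR2019/nsgd/0708.slurm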
22 | parser = SimulationArgumentParser(description=description) 23 | 24 | rankers = [] 25 | 26 | # Baselines 27 | ranker_params = { 28 | 'learning_rate_decay': 0.9999977, 29 | 'GRAD_SIZE':60, 30 | 'EXP_SIZE':25, 31 | 'TB_QUEUE_SIZE':10, 32 | 'TB_WINDOW_SIZE':50} 33 | sim_args, other_args = parser.parse_all_args(ranker_params) 34 | 35 | run_name = 'SIGIR2019/TD_NSGD' 36 | rankers.append((run_name, TD_NSGD, other_args)) 37 | 38 | 39 | 40 | # NSGD with document space projection 41 | ranker_params = { 42 | 'learning_rate_decay': 0.9999977, 43 | 'k_initial': 3, 44 | 'k_increase': False, 45 | 'GRAD_SIZE':60, 46 | 'EXP_SIZE':25, 47 | 'TB_QUEUE_SIZE':10, 48 | 'TB_WINDOW_SIZE':50, 49 | 'prev_qeury_len': 10} 50 | sim_args, other_args = parser.parse_all_args(ranker_params) 51 | 52 | run_name = 'SIGIR2019/DSGD_TD_NSGD' 53 | rankers.append((run_name, TD_NSGD_DSP, other_args)) 54 | 55 | 56 | 57 | sim = DataSimulation(sim_args) 58 | sim.run(rankers) -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/0708.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019.py --data_sets local_MQ2007 local_MQ2008 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/np.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019.py --data_sets local_NP2003 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/nsgd/0708.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python 
scripts/SIGIR2019_nsgd.py --data_sets local_MQ2007 local_MQ2008 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/nsgd/np.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019_nsgd.py --data_sets local_NP2003 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/nsgd/web10k.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019_nsgd.py --data_sets local_MSLR-WEB10K --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/nsgd/webscope1.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019_nsgd.py --data_sets local_Webscope_C14_Set1 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 5 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/web10k.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019.py --data_sets local_MSLR-WEB10K 
--click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 10 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /scripts/slurm/SIGIR2019/webscope1.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --cpus-per-task=28 4 | #SBATCH --time=120:00:00 5 | #SBATCH -p standard 6 | #SBATCH --output=job_output/out 7 | #SBATCH --error=job_output/error 8 | #SBATCH --mail-type=ALL 9 | #SBATCH --mail-user=sak2km@virginia.edu 10 | #SBATCH -A hcdm 11 | 12 | module load anaconda/5.2.0-py2.7 13 | python scripts/SIGIR2019.py --data_sets local_Webscope_C14_Set1 --click_models inf nav per --log_folder ~/../../../../scratch/sak2km/l2r_data/log_folder --average_folder ~/../../../../scratch/sak2km/l2r_data/outdir/average --output_folder ~/../../../../scratch/sak2km/l2r_data/outdir/fullruns/ --n_runs 5 --n_proc 10 --n_impr 10000 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/utils/__init__.py -------------------------------------------------------------------------------- /utils/argparsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sak2km/OnlineLearningToRank/866890112ee0971c330467e892614a58807012be/utils/argparsers/__init__.py -------------------------------------------------------------------------------- /utils/argparsers/simulationargparser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import time 5 | import json 6 | 7 | class SimulationArgumentParser(argparse.ArgumentParser): 8 | 9 | def __init__(self, description=None, set_arguments={}): 10 | self._description = description 11 | self._initial_set_arguments = set_arguments.copy() 12 | self._set_arguments = set_arguments 13 | self._initial_arguments = {} 14 | self._simulation_arguments = [] 15 | self._arguments_initialized = False 16 | 17 | super(SimulationArgumentParser, self).__init__(description=description) 18 | 19 | self._sim_add_argument('--n_runs', dest='n_runs', default=125, type=int, 20 | help='Number of runs to be simulated over a Dataset.') 21 | 22 | self._sim_add_argument('--n_impr', dest='n_impressions', default=1000, type=int, 23 | help='Number of impressions per simulated run.') 24 | 25 | self._sim_add_argument('--vali', dest='validation', action='store_true', 26 | help='Use of validation set instead of testset.') 27 | 28 | self._sim_add_argument('--vali_in_train', dest='validation_in_train', action='store_true', 29 | help='Prevents validation set being added to training set.') 30 | 31 | self._sim_add_argument('--data_sets', dest='data_sets', type=str, required=True, 32 | help='Paths to folders where the data-folds are stored.', nargs='+') 33 | 34 | self._sim_add_argument('--output_folder', dest='output_folder', type=str, required=False, 35 | help='Path to folders where outputs should be stored, if not given output will be printed.' 
36 |                                , default='./output')
37 | 
38 |         self._sim_add_argument('--log_folder', dest='log_folder', type=str, required=False,
39 |                                help='Path to folders where run log and errors will be stored.',
40 |                                default='./log/')
41 | 
42 |         self._sim_add_argument('--average_folder', dest='average_folder', type=str, required=False,
43 |                                help='Path to folders where averaged output of runs will be stored.',
44 |                                default='./average')
45 | 
46 |         self._sim_add_argument('--attacker_average_folder', dest='attacker_average_folder', type=str, required=False,
47 |                                help='Path to folders where averaged attacker output of runs will be stored.',
48 |                                default='./attackeraverage')
49 | 
50 |         self._sim_add_argument('--attacker_folder', dest='attacker_folder', type=str, required=False,
51 |                                help='Path to folders where attacker output of runs will be stored.',
52 |                                default='./attackeroutput')
53 | 
54 |         self._sim_add_argument('--small_dataset', dest='small_dataset', action='store_false',
55 |                                help='Set true if dataset is small and memory is never a concern.')
56 | 
57 |         self._sim_add_argument('--click_models', dest='click_models', default='exper1', type=str, required=True,
58 |                                help='Click models to be used.', nargs='+')
59 | 
60 |         self._sim_add_argument('--print_freq', dest='print_freq', type=int, required=False,
61 |                                help='The number of steps taken before another one is printed after the first batch.'
62 |                                , default=10)
63 | 
64 |         self._sim_add_argument('--print_logscale', dest='print_logscale', action='store_true',
65 |                                help='Decrease print frequency semi-logarithmically.')
66 | 
67 |         self._sim_add_argument('--print_output', dest='print_output', action='store_true',
68 |                                help='Set true if outputs should be printed and not stored.')
69 | 
70 |         self._sim_add_argument('--max_folds', dest='max_folds', type=int, required=False,
71 |                                help='The maximum number of folds that may be loaded at any time, default is unlimited.' 
72 |                                , default=None)
73 | 
74 |         self._sim_add_argument('--n_proc', dest='n_processing', default=1, type=int,
75 |                                help='Max number of work-processes to run in parallel.')
76 | 
77 |         self._sim_add_argument('--no_run_details', dest='no_run_details', action='store_true',
78 |                                help='Do not print all run arguments at the start of the simulation.')
79 | 
80 |         self._sim_add_argument('--n_results', dest='n_results', default=10, type=int,
81 |                                help='Number of results shown after each query.')
82 | 
83 |         self._sim_add_argument('--skip_read_bin_data', dest='read_binarized_data', action='store_false')
84 |         self._sim_add_argument('--skip_store_bin_data', dest='store_binarized_data_after_read',
85 |                                action='store_false')
86 | 
87 |         self._sim_add_argument('--train_only', dest='train_only', action='store_true',
88 |                                help='Only calculate train NDCG.')
89 | 
90 |         self._sim_add_argument('--all_train', dest='all_train', action='store_false',
91 |                                help='Stop simulation from printing train NDCG at every step.')
92 | 
93 |         self._sim_add_argument('--nonrel_test', dest='purge_test_set', action='store_false',
94 |                                help='Include non-relevant queries in evaluation on test-set.')
95 | 
96 |         # Additional arguments added by Rishab
97 |         self._sim_add_argument('--mf', dest='mf', default=5, type=int,
98 |                                help='Number of most frequent documents to look at.')
99 | 
100 |         self._sim_add_argument('--sd_const', dest='sd_const', default=2.0, type=float,
101 |                                help='How many standard deviations away to look.')
102 | 
103 |         self._sim_add_argument('--start', dest='start', default=0, type=int,
104 |                                help='Which documents to intersect (start)')
105 | 
106 |         self._sim_add_argument('--end', dest='end', default=5, type=int,
107 |                                help='Which documents to intersect (end)')
108 | 
109 |         self._sim_add_argument('--which', dest='which', default=-1, type=int,
110 |                                help='Which portion of the run to attack (first 2000, second 2000, etc.). Possible values include 0 (attack all), 1, 2, 3, 4, 5')
111 | 
112 |         self._sim_add_argument('--attacker_click_model', dest='attacker_click_model', default='naive_intersection_attack', type=str,
113 |                                help="Name of the attacker's click model. 
Possible names include (naive_intersection_attack, frequency_attack)") 114 | 115 | self._sim_add_argument('--num_attacker_relevant', dest='num_attacker_relevant', default=5, type=int, 116 | help='How many documents in attacker ranking are relevant to the attacker') 117 | 118 | 119 | 120 | self._arguments_initialized = False 121 | 122 | def reset_arguments(self): 123 | self._set_arguments = self._initial_set_arguments.copy() 124 | 125 | def set_argument(self, name, value): 126 | self._set_arguments[name] = value 127 | 128 | def remove_argument(self, name): 129 | del self._set_arguments[name] 130 | 131 | def _sim_add_argument(self, *args, **kargs): 132 | if 'dest' in kargs: 133 | name = kargs['dest'] 134 | elif args[0][:2] == '--': 135 | name = args[0][2:] 136 | else: 137 | assert args[0][:1] == '-' 138 | name = args[0][1:] 139 | 140 | assert name != 'description' 141 | if not name in self._set_arguments: 142 | super(SimulationArgumentParser, self).add_argument(*args, **kargs) 143 | 144 | assert name not in self._simulation_arguments 145 | self._simulation_arguments.append(name) 146 | 147 | def parse_sim_args(self): 148 | args = vars(self.parse_args()) 149 | sim_args = { 150 | 'description': self._description, 151 | } 152 | for name, value in args.items(): 153 | if name in self._simulation_arguments: 154 | sim_args[name] = value 155 | return argparse.Namespace(**sim_args) 156 | 157 | def parse_other_args(self, ranker_args=None, ranker=None): 158 | args = vars(self.parse_args()) 159 | other_args = {} 160 | if ranker: 161 | other_args.update( 162 | ranker.default_ranker_parameters() 163 | ) 164 | for name, value in args.items(): 165 | if name not in self._simulation_arguments: 166 | other_args[name] = value 167 | if ranker_args: 168 | other_args.update(ranker_args) 169 | return other_args 170 | 171 | def parse_all_args(self, ranker_args=None, ranker=None): 172 | return (self.parse_sim_args(), 173 | self.parse_other_args( 174 | ranker_args = ranker_args, 175 | ranker = ranker)) 176 | -------------------------------------------------------------------------------- /utils/attackeraverager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import os 5 | import traceback 6 | import json 7 | 8 | 9 | def cumulative(ranking, discount=0.9995): 10 | return np.cumsum(discount ** np.arange(ranking.shape[0]) * ranking) 11 | 12 | 13 | def convert_time(time_in_seconds): 14 | seconds = time_in_seconds % 60 15 | minutes = time_in_seconds / 60 % 60 16 | hours = time_in_seconds / 3600 17 | return '%02d:%02d:%02d' % (hours, minutes, seconds) 18 | 19 | 20 | def print_array(array): 21 | return ' '.join([str(x) for x in array] + ['\n']) 22 | 23 | 24 | def create_folders(filename): 25 | if not os.path.exists(os.path.dirname(filename)): 26 | os.makedirs(os.path.dirname(filename)) 27 | 28 | class AttackerAverager(object): 29 | 30 | def __init__(self, simulation_arguments): 31 | self.attacker_average_folder = simulation_arguments.attacker_average_folder 32 | self._average_index = 0 33 | 34 | # def click_model_name(self, full_name): 35 | # return str(full_name[:full_name.rfind('_')]) 36 | 37 | def average_results(self, output_path): 38 | with open(output_path, 'r') as f: 39 | sim_args = json.loads(f.readline()) 40 | first_run = json.loads(f.readline()) 41 | run_details = first_run['run_details'] 42 | 43 | cur_click_model = run_details['attacker_click_model'] 44 | # self.click_model_name( 45 | # run_details['click model']) 46 | 
runtimes = { 47 | cur_click_model: [float(run_details['runtime'])], 48 | } 49 | 50 | all_ind = {} 51 | first_val = {} 52 | for event in first_run['run_results']: 53 | iteration = event['iteration'] 54 | for name, val in event.items(): 55 | if name == 'iteration': 56 | continue 57 | if name not in all_ind: 58 | all_ind[name] = [] 59 | first_val[name] = [] 60 | all_ind[name].append(iteration) 61 | first_val[name].append(val) 62 | 63 | all_val = {} 64 | for name in all_ind: 65 | all_ind[name] = np.array(all_ind[name], 66 | dtype=np.int32) 67 | all_val[name] = { 68 | cur_click_model: [np.array(first_val[name], 69 | dtype=float)] 70 | } 71 | 72 | for line in f: 73 | events = json.loads(line) 74 | 75 | run_details = events['run_details'] 76 | cur_click_model = run_details['attacker_click_model'] 77 | # cur_click_model = self.click_model_name( 78 | # run_details['click model']) 79 | if cur_click_model not in runtimes: 80 | runtimes[cur_click_model] = [] 81 | 82 | runtimes[cur_click_model].append( 83 | float(run_details['runtime'])) 84 | 85 | cur_i = {} 86 | cur_val = {} 87 | for name, val in all_ind.items(): 88 | cur_i[name] = 0 89 | cur_val[name] = np.zeros(val.shape) 90 | if cur_click_model not in all_val[name]: 91 | all_val[name][cur_click_model] = [] 92 | all_val[name][cur_click_model].append(cur_val[name]) 93 | 94 | for event in events['run_results']: 95 | iteration = event['iteration'] 96 | for name, val in event.items(): 97 | if name != 'iteration': 98 | c_i = cur_i[name] 99 | assert all_ind[name][c_i] == iteration 100 | cur_val[name][c_i] = val 101 | cur_i[name] += 1 102 | 103 | for name, val in all_ind.items(): 104 | if name != 'iteration': 105 | assert cur_i[name] == val.shape[0] 106 | 107 | average_runtimes = {} 108 | for click_model, values in runtimes.items(): 109 | average_runtimes[click_model] = np.mean(values).tolist() 110 | 111 | results = {} 112 | for name, cur_ind in all_ind.items(): 113 | cur_results = { 114 | 'indices': cur_ind.tolist() 115 | } 116 | results[name] = cur_results 117 | for click_model, lists in all_val[name].items(): 118 | stacked = np.stack(lists) 119 | cm_mean = np.mean(stacked, axis=0) 120 | cm_std = np.std(stacked, axis=0) 121 | cur_results[click_model] = { 122 | 'mean': cm_mean.tolist(), 123 | 'std': cm_std.tolist(), 124 | } 125 | 126 | output = { 127 | 'simulation_arguments': sim_args, 128 | 'runtimes': average_runtimes, 129 | 'results': results 130 | } 131 | 132 | return output 133 | 134 | def create_average_file(self, sim_output): 135 | print "opening %s" % sim_output.output_path 136 | output = self.average_results(sim_output.output_path) 137 | 138 | self.dataset_path = '%s/%s' % (self.attacker_average_folder, sim_output.dataset_name) 139 | self.output_path = '%s/%s.out' % (self.dataset_path, sim_output.simulation_name+sim_output.additional_file_name) 140 | 141 | # print "Output path inside averager: ", self.output_path 142 | create_folders(self.dataset_path) 143 | create_folders(self.output_path) 144 | with open(self.output_path, 'w') as w: 145 | w.write(json.dumps(output)) 146 | print 'Closed %d: %s on %s was averaged and stored.' 
% (self._average_index, 147 | sim_output.simulation_name+sim_output.additional_file_name, sim_output.dataset_name) 148 | 149 | self._average_index += 1 150 | 151 | class IndependentAttackerAverager(AttackerAverager): 152 | def __init__(self, attacker_average_folder): 153 | self.attacker_average_folder = attacker_average_folder 154 | self._average_index = 0 155 | -------------------------------------------------------------------------------- /utils/attackeroutput.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import os 5 | import sys 6 | import time 7 | from datetime import timedelta 8 | 9 | def create_folders(filename): 10 | if not os.path.exists(os.path.dirname(filename)): 11 | os.makedirs(os.path.dirname(filename)) 12 | 13 | class AttackerFileOutput(object): 14 | 15 | def __init__(self, output_file_path, output_header=None, close_between_writes=False, 16 | also_print=False, write_date=False): 17 | self._output_file_path = output_file_path 18 | self._close_between_writes = close_between_writes 19 | self._also_print = also_print 20 | self._original_stdout = sys.stdout 21 | self.write_date = write_date 22 | create_folders(self._output_file_path) 23 | self._output_file = open(self._output_file_path, 'w') 24 | self._file_open = True 25 | self._new_line = True 26 | self._closed = False 27 | if not output_header is None: 28 | self.write(output_header) 29 | self._end_write() 30 | 31 | def _open_file(self): 32 | if not self._file_open: 33 | self._output_file = open(self._output_file_path, 'a') 34 | self._file_open = True 35 | 36 | def _close_file(self): 37 | self._output_file.close() 38 | self._file_open = False 39 | 40 | def _end_write(self): 41 | if self._close_between_writes: 42 | self._close_file() 43 | 44 | def _write_str_to_file(self, output_str): 45 | self._output_file.write(output_str) 46 | self._new_line = output_str[-1] == '\n' 47 | 48 | def flush(self): 49 | if self._also_print: 50 | self._original_stdout.flush() 51 | self._output_file.flush() 52 | 53 | def write(self, output, skip_write_date=False): 54 | assert not self._closed 55 | self._open_file() 56 | for line in output: 57 | if self.write_date and self._new_line and not skip_write_date: 58 | line = '%s: %s' % (time.strftime('%c'), str(line)) 59 | # assert type(line) is str, 'Output element %s is not a str' % line 60 | self._write_str_to_file(str(line)) 61 | if self._also_print: 62 | self._original_stdout.write(line) 63 | self._end_write() 64 | 65 | def close(self): 66 | self._close_file() 67 | self._closed = True 68 | if self._also_print: 69 | self._original_stdout.write('\n') 70 | 71 | 72 | class AttackerOutput(object): 73 | 74 | """ 75 | Class designed to manage the multiprocessing of simulations over multiple datasets. 
76 | """ 77 | 78 | def __init__(self, simulation_arguments, simulation_name, dataset, num_click_models, 79 | ranker_arguments, attacker_averager): 80 | self._start_time = time.time() 81 | self.run_index = 0 82 | self.attacker_output_folder = simulation_arguments.attacker_folder 83 | self.simulation_name = simulation_name 84 | self.dataset_name = dataset.name 85 | self.attacker_averager = attacker_averager 86 | self.print_output = simulation_arguments.print_output 87 | self._expected_runs = dataset.num_runs_per_fold * dataset.num_folds * num_click_models 88 | self._closed = False 89 | 90 | 91 | self.additional_file_name = "" 92 | 93 | if "freq" in simulation_arguments.click_models[0]: 94 | self.additional_file_name = "_"+simulation_arguments.attacker_click_model+"_"+str(simulation_arguments.n_results)+"_res_" \ 95 | +str(simulation_arguments.start)+"_start_"+str(simulation_arguments.end)+"_end_"+str(simulation_arguments.mf)+"_mf_"+str(simulation_arguments.sd_const)+"_sd_" \ 96 | +str(simulation_arguments.which)+"_half_"+str(simulation_arguments.n_impressions)+"_impressions"+str(ranker_arguments['learning_rate_decay'])+"_lrdecay" 97 | 98 | else: 99 | self.additional_file_name = "_"+simulation_arguments.attacker_click_model+"_"+str(simulation_arguments.n_results)+"_res_"+str(simulation_arguments.start)+"_start_"+str(simulation_arguments.end)+"_end_" \ 100 | +str(simulation_arguments.which)+"_half_"+str(simulation_arguments.n_impressions)+"_impressions"+str(ranker_arguments['learning_rate_decay'])+"_lrdecay" 101 | 102 | self.output_path = '%s/%s/%s.out' % (self.attacker_output_folder, self.dataset_name, 103 | self.simulation_name+self.additional_file_name) 104 | print "output path: ", self.output_path 105 | combined_args = { 106 | 'simulation_arguments': vars(simulation_arguments), 107 | 'ranker_arguments': ranker_arguments, 108 | } 109 | if self.print_output: 110 | output_header = json.dumps(combined_args, sort_keys=True, 111 | indent=4, separators=(',', ': ')) 112 | self.attacker_file_output = AttackerBufferPrintOutput(output_header) 113 | else: 114 | output_header = json.dumps(combined_args, separators=(',',':')) 115 | self.attacker_file_output = AttackerFileOutput(self.output_path, output_header, 116 | close_between_writes=True, also_print=False, 117 | write_date=False) 118 | 119 | def expected_runs(self): 120 | return self._expected_runs 121 | 122 | def finished(self): 123 | return self._closed and self.run_index == self._expected_runs 124 | 125 | def write_run_output(self, run_output): 126 | assert not self._closed, 'Simulation Output (%s) written to after being closed.' 
\ 127 | % self.output_path 128 | 129 | if self.print_output: 130 | # self.file_output.write(json.dumps(run_output, sort_keys=True, 131 | # indent=4, separators=(',', ': '))) 132 | self.attacker_file_output.pretty_run_write(self.run_index, run_output) 133 | else: 134 | self.attacker_file_output.write('\n%s' % json.dumps(run_output)) 135 | 136 | self.run_index += 1 137 | if self.run_index >= self._expected_runs: 138 | self.close() 139 | 140 | def close(self, output_file=None): 141 | self.attacker_file_output.close() 142 | self._closed = True 143 | if not self.print_output: 144 | self.attacker_averager.create_average_file(self) 145 | 146 | 147 | class AttackerBufferPrintOutput(object): 148 | 149 | def __init__(self, output_header=None): 150 | self._closed = False 151 | self._output_list = [] 152 | if not output_header is None: 153 | self.write(output_header) 154 | 155 | def flush(self): 156 | pass 157 | 158 | def write(self, output): 159 | assert not self._closed 160 | assert type(output) is str, 'Wrong output format %s' % type(output) 161 | self._output_list.append(output) 162 | 163 | def pretty_run_write(self, run_index, run_output): 164 | run_details = run_output['run_details'] 165 | run_lines = [ 166 | "RUN: %d" % run_index, 167 | "DATAFOLD: %s" % run_details['data folder'], 168 | "CLICK MODEL: %s" % run_details['click model'], 169 | "ATTACKER CLICK MODEL: %s" % run_details['attacker_click_model'], 170 | "RUN TIME: %s (%.02f seconds)" % (timedelta(seconds=run_details['runtime']), 171 | run_details['runtime']) 172 | ] 173 | tag = run_details['held-out data'] 174 | for event in run_output['run_results']: 175 | str_line = str(event['iteration']) 176 | if 'display' in event: 177 | str_line += ' DISPLAY: %0.3f' % event['display'] 178 | if 'heldout' in event: 179 | str_line += ' %s: %0.3f' % (tag, event['heldout']) 180 | run_lines.append(str_line) 181 | for line in run_lines: 182 | self.write(line) 183 | 184 | def close(self): 185 | self._closed = True 186 | print 'Run Output\n' + '\n'.join(self._output_list) 187 | self._output_list = [] 188 | -------------------------------------------------------------------------------- /utils/averageoutput.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import os 5 | import traceback 6 | import json 7 | 8 | 9 | def cumulative(ranking, discount=0.9995): 10 | return np.cumsum(discount ** np.arange(ranking.shape[0]) * ranking) 11 | 12 | 13 | def convert_time(time_in_seconds): 14 | seconds = time_in_seconds % 60 15 | minutes = time_in_seconds / 60 % 60 16 | hours = time_in_seconds / 3600 17 | return '%02d:%02d:%02d' % (hours, minutes, seconds) 18 | 19 | 20 | def print_array(array): 21 | return ' '.join([str(x) for x in array] + ['\n']) 22 | 23 | 24 | def create_folders(filename): 25 | if not os.path.exists(os.path.dirname(filename)): 26 | os.makedirs(os.path.dirname(filename)) 27 | 28 | class OutputAverager(object): 29 | 30 | def __init__(self, simulation_arguments): 31 | self.average_folder = simulation_arguments.average_folder 32 | self._average_index = 0 33 | 34 | def click_model_name(self, full_name): 35 | return str(full_name[:full_name.rfind('_')]) 36 | 37 | def average_results(self, output_path): 38 | with open(output_path, 'r') as f: 39 | sim_args = json.loads(f.readline()) 40 | first_run = json.loads(f.readline()) 41 | run_details = first_run['run_details'] 42 | 43 | cur_click_model = self.click_model_name( 44 | run_details['click model']) 45 | runtimes = { 
46 | cur_click_model: [float(run_details['runtime'])], 47 | } 48 | 49 | all_ind = {} 50 | first_val = {} 51 | for event in first_run['run_results']: 52 | iteration = event['iteration'] 53 | for name, val in event.items(): 54 | if name == 'iteration': 55 | continue 56 | if name not in all_ind: 57 | all_ind[name] = [] 58 | first_val[name] = [] 59 | all_ind[name].append(iteration) 60 | first_val[name].append(val) 61 | 62 | all_val = {} 63 | for name in all_ind: 64 | all_ind[name] = np.array(all_ind[name], 65 | dtype=np.int32) 66 | all_val[name] = { 67 | cur_click_model: [np.array(first_val[name], 68 | dtype=float)] 69 | } 70 | 71 | for line in f: 72 | events = json.loads(line) 73 | 74 | run_details = events['run_details'] 75 | cur_click_model = self.click_model_name( 76 | run_details['click model']) 77 | if cur_click_model not in runtimes: 78 | runtimes[cur_click_model] = [] 79 | 80 | runtimes[cur_click_model].append( 81 | float(run_details['runtime'])) 82 | 83 | cur_i = {} 84 | cur_val = {} 85 | for name, val in all_ind.items(): 86 | cur_i[name] = 0 87 | cur_val[name] = np.zeros(val.shape) 88 | if cur_click_model not in all_val[name]: 89 | all_val[name][cur_click_model] = [] 90 | all_val[name][cur_click_model].append(cur_val[name]) 91 | 92 | for event in events['run_results']: 93 | iteration = event['iteration'] 94 | for name, val in event.items(): 95 | if name != 'iteration': 96 | c_i = cur_i[name] 97 | assert all_ind[name][c_i] == iteration 98 | cur_val[name][c_i] = val 99 | cur_i[name] += 1 100 | 101 | for name, val in all_ind.items(): 102 | if name != 'iteration': 103 | assert cur_i[name] == val.shape[0] 104 | 105 | average_runtimes = {} 106 | for click_model, values in runtimes.items(): 107 | average_runtimes[click_model] = np.mean(values).tolist() 108 | 109 | results = {} 110 | for name, cur_ind in all_ind.items(): 111 | cur_results = { 112 | 'indices': cur_ind.tolist() 113 | } 114 | results[name] = cur_results 115 | for click_model, lists in all_val[name].items(): 116 | stacked = np.stack(lists) 117 | cm_mean = np.mean(stacked, axis=0) 118 | cm_std = np.std(stacked, axis=0) 119 | cur_results[click_model] = { 120 | 'mean': cm_mean.tolist(), 121 | 'std': cm_std.tolist(), 122 | } 123 | 124 | output = { 125 | 'simulation_arguments': sim_args, 126 | 'runtimes': average_runtimes, 127 | 'results': results 128 | } 129 | 130 | return output 131 | 132 | def create_average_file(self, sim_output): 133 | print "opening %s" % sim_output.output_path 134 | output = self.average_results(sim_output.output_path) 135 | 136 | self.dataset_path = '%s/%s' % (self.average_folder, sim_output.dataset_name) 137 | self.output_path = '%s/%s.out' % (self.dataset_path, sim_output.simulation_name) 138 | create_folders(self.dataset_path) 139 | create_folders(self.output_path) 140 | with open(self.output_path, 'w') as w: 141 | w.write(json.dumps(output)) 142 | print 'Closed %d: %s on %s was averaged and stored.' 
% (self._average_index, 143 | sim_output.simulation_name, sim_output.dataset_name) 144 | 145 | self._average_index += 1 146 | 147 | class IndependentOutputAverager(OutputAverager): 148 | def __init__(self, average_folder): 149 | self.average_folder = average_folder 150 | self._average_index = 0 151 | -------------------------------------------------------------------------------- /utils/clicks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import operator 3 | import random 4 | 5 | 6 | class ClickModel(object): 7 | 8 | ''' 9 | Class for cascading click-models used to simulate clicks. 10 | ''' 11 | 12 | def __init__(self, name, data_type, PCLICK, PSTOP): 13 | ''' 14 | Name is used for logging, data_type denotes the degrees of relevance the data uses. 15 | PCLICK and PSTOP the probabilities used by the model. 16 | ''' 17 | self.name = name 18 | self.type = data_type 19 | self.PCLICK = PCLICK 20 | self.PSTOP = PSTOP 21 | 22 | def get_name(self): 23 | ''' 24 | Name that can be used for logging. 25 | ''' 26 | return self.name + '_' + self.type 27 | 28 | def generate_clicks(self, ranking, all_labels): 29 | ''' 30 | Generates clicks for a given ranking and relevance labels. 31 | ranking: np array of indices which correspond with all_labels 32 | all_labels: np array of integers 33 | ''' 34 | labels = all_labels[ranking] 35 | coinflips = np.random.rand(*ranking.shape) 36 | clicks = coinflips < self.PCLICK[labels] 37 | coinflips = np.random.rand(*ranking.shape) 38 | stops = coinflips < self.PSTOP[labels] 39 | stopped_clicks = np.zeros(ranking.shape, dtype=bool) 40 | if np.any(stops): 41 | clicks_before_stop = np.logical_and(clicks, np.arange(ranking.shape[0]) 42 | <= np.where(stops)[0][0]) 43 | stopped_clicks[clicks_before_stop] = True 44 | return stopped_clicks 45 | else: 46 | return np.zeros(ranking.shape, dtype=bool) + clicks 47 | 48 | 49 | class ExamineClickModel(object): 50 | 51 | ''' 52 | Class for cascading click-models used to simulate clicks. 53 | ''' 54 | 55 | def __init__(self, name, data_type, PCLICK, eta): 56 | ''' 57 | Name is used for logging, data_type denotes the degrees of relevance the data uses. 58 | PCLICK and PSTOP the probabilities used by the model. 59 | ''' 60 | self.name = name 61 | self.type = data_type 62 | self.PCLICK = PCLICK 63 | self.eta = eta 64 | 65 | def get_name(self): 66 | ''' 67 | Name that can be used for logging. 68 | ''' 69 | return self.name + '_' + self.type 70 | 71 | def generate_clicks(self, ranking, all_labels): 72 | ''' 73 | Generates clicks for a given ranking and relevance labels. 74 | ranking: np array of indices which correspond with all_labels 75 | all_labels: np array of integers 76 | ''' 77 | n_results = ranking.shape[0] 78 | examine_prob = (1./(np.arange(n_results)+1))**self.eta 79 | stop_prob = np.ones(n_results) 80 | stop_prob[1:] -= examine_prob[1:]/examine_prob[:-1] 81 | stop_prob[0] = 0. 
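        # stop_prob is constructed so that, if the user stopped independently
        # at each rank, the marginal probability of examining rank i would
        # telescope to examine_prob[i] = (1/(i+1))**eta: rank 0 is always
        # examined, and the user continues from rank i-1 to rank i with
        # probability examine_prob[i]/examine_prob[i-1]. For example, with
        # eta = 1: examine_prob = [1, 1/2, 1/3, ...] and
        # stop_prob = [0, 1/2, 1/3, ...]. Below, stopping is additionally
        # conditioned on the document being clicked.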
82 | 
83 |         labels = all_labels[ranking]
84 |         coinflips = np.random.rand(*ranking.shape)
85 |         clicks = coinflips < self.PCLICK[labels]
86 |         coinflips = np.random.rand(n_results)
87 |         stops = coinflips < stop_prob
88 |         stops = np.logical_and(stops, clicks)
89 |         stopped_clicks = np.zeros(ranking.shape, dtype=bool)
90 |         if np.any(stops):
91 |             clicks_before_stop = np.logical_and(clicks, np.arange(ranking.shape[0])
92 |                                                 <= np.where(stops)[0][0])
93 |             stopped_clicks[clicks_before_stop] = True
94 |             return stopped_clicks
95 |         else:
96 |             return np.zeros(ranking.shape, dtype=bool) + clicks
97 | 
98 | class MaliciousClickModel(object):
99 | 
100 |     '''
101 |     Class for click models used to simulate malicious (poisoning) clicks.
102 |     '''
103 | 
104 |     def __init__(self, name, data_type):
105 |         '''
106 |         Name is used for logging and identifying the attack type.
107 |         '''
108 |         self.name = name
109 |         self.type = data_type
110 | 
111 |     def get_name(self):
112 |         '''
113 |         Name that can be used for logging.
114 |         '''
115 |         return self.name + '_' + self.type
116 | 
117 |     def generate_clicks(self, train_ranking, attacker_ranking, start, end, freq, mf, sd_const):
118 | 
119 |         if self.name == "naive_intersection_attack":
120 |             return self.naive_intersection_attack(train_ranking, attacker_ranking, start, end)
121 |         elif self.name == "frequency_attack":
122 |             return self.frequency_attack(train_ranking, attacker_ranking, freq, mf, start, end, sd_const)
123 |         else:
124 |             print("Attack name is incorrect. Only 'naive_intersection_attack' and 'frequency_attack' are supported!\n")
125 | 
126 | 
127 |     def naive_intersection_attack(self, train_ranking, attacker_ranking, start, end):
128 |         '''
129 |         Generates malicious clicks based on the intersection of train_ranking and attacker_ranking.
130 |         The intersection is guided by the start and end hyper-parameters.
131 |         '''
132 |         clicks = []
133 | 
134 |         for i in range(0, len(train_ranking)):
135 | 
136 |             if (len(attacker_ranking) >= end and train_ranking[i] in attacker_ranking[start:end]):
137 |                 clicks.append(True)
138 |             else:
139 |                 clicks.append(False)
140 | 
141 |         return np.zeros(train_ranking.shape, dtype=bool) + clicks
142 | 
143 | 
144 |     def frequency_attack(self, train_ranking, attacker_ranking, freq, mf, start, end, sd_const):
145 |         '''
146 |         Generates malicious clicks based on the intersection of train_ranking and attacker_ranking.
147 |         The intersection is guided by the start and end hyper-parameters.
148 |         mf controls which documents get clicked in the intersection.
149 |         mf: number of most frequent docs that the attacker assumes come from the current ranker.
150 |         freq: frequency table containing (doc, freq) pairs.
151 |         '''
152 | 
153 |         # Sort the frequency table by frequency, most frequent first
154 |         sorted_freqs = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
155 | 
156 |         # Break the table into docs and frequencies. The top_k_docs can be considered a proxy for the current ranker's ranking
157 |         i = 0
158 |         top_k_docs = []
159 |         top_k_freq = []
160 |         while (i < len(sorted_freqs) and i < mf):
161 |             top_k_docs.append(sorted_freqs[i][0])
162 |             top_k_freq.append(sorted_freqs[i][1])
163 |             i += 1
164 | 
165 |         clicks = []
166 | 
167 |         # Standard deviation of the top-k frequencies, used below to locate a frequency gap
168 |         sd = np.std(top_k_freq) if (len(top_k_freq) > 0) else 0
169 | 
170 |         # Find the first position whose frequency is at least sd_const standard deviations above the next one; if such a position is found it overrides mf, otherwise mf is kept.
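        # Illustrative example (hypothetical numbers): with top_k_freq = [95, 90, 40, 38]
        # and sd_const = 1, sd is roughly 26.8; the first gap of at least sd_const*sd is
        # 90 - 40 = 50, so ind = 2 below and only the two most frequent documents are
        # treated as the current ranker's results (and excluded from clicking).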
171 |         ind = len(top_k_freq)
172 |         for index in range(0, len(top_k_freq)-1):
173 |             if sd_const*sd <= top_k_freq[index] - top_k_freq[index+1]:
174 |                 ind = index+1
175 |                 break
176 | 
177 |         # Generate the clicks using the intersection, also making sure that the document is not one of the most frequent documents
178 |         for i in range(0, len(train_ranking)):
179 |             if train_ranking[i] not in top_k_docs[0:ind] and train_ranking[i] in attacker_ranking[start:end]:
180 |                 clicks.append(True)
181 |             else:
182 |                 clicks.append(False)
183 | 
184 |         return np.zeros(train_ranking.shape, dtype=bool) + clicks
185 | 
186 | 
187 | # create synonyms for keywords to ease command line use
188 | syn_tuples = [
189 |     ('ex_per_1', ['exper1']),
190 |     ('navigational', ['nav', 'navi', 'navig', 'navigat']),
191 |     ('informational', ['inf', 'info', 'infor', 'informat']),
192 |     ('perfect', ['per', 'perf']),
193 |     ('almost_random', [
194 |         'alm',
195 |         'almost',
196 |         'alra',
197 |         'arand',
198 |         'almostrandom',
199 |         'almrand',
200 |     ]),
201 |     ('random', ['ran', 'rand']),
202 |     ('binary', ['bin']),
203 |     ('short', []),
204 |     ('long', []),
205 | ]
206 | attack_tuples = [
207 |     ('naive_intersection_attack', []),
208 |     ('frequency_attack', []),
209 | ]
210 | synonyms = {}
211 | for full, abrv_list in syn_tuples:
212 |     assert full not in synonyms or synonyms[full] == full
213 |     synonyms[full] = full
214 |     for abrv in abrv_list:
215 |         assert abrv not in synonyms or synonyms[abrv] == full
216 |         synonyms[abrv] = full
217 | 
218 | attack_synonyms = {}
219 | for full, abrv_list in attack_tuples:
220 |     assert full not in attack_synonyms or attack_synonyms[full] == full
221 |     attack_synonyms[full] = full
222 |     for abrv in abrv_list:
223 |         assert abrv not in attack_synonyms or attack_synonyms[abrv] == full
224 |         attack_synonyms[abrv] = full
225 | 
226 | bin_models = {}
227 | bin_models['navigational'] = np.array([.05, .95]), np.array([.2, .9])
228 | bin_models['informational'] = np.array([.4, .9]), np.array([.1, .5])
229 | bin_models['perfect'] = np.array([.0, 1.]), np.array([.0, .0])
230 | bin_models['almost_random'] = np.array([.4, .6]), np.array([.5, .5])
231 | bin_models['random'] = np.array([.5, .5]), np.array([.0, .0])
232 | bin_models['ex_per_1'] = np.array([.0, 1.]), 1.0
233 | 
234 | short_models = {}
235 | short_models['navigational'] = np.array([.05, .5, .95]), np.array([.2, .5, .9])
236 | short_models['informational'] = np.array([.4, .7, .9]), np.array([.1, .3, .5])
237 | short_models['perfect'] = np.array([.0, .5, 1.]), np.array([.0, .0, .0])
238 | short_models['almost_random'] = np.array([.4, .5, .6]), np.array([.5, .5, .5])
239 | short_models['random'] = np.array([.5, .5, .5]), np.array([.0, .0, .0])
240 | short_models['ex_per_1'] = np.array([.0, .5, 1.]), 1.0
241 | 
242 | long_models = {}
243 | long_models['navigational'] = np.array([.05, .3, .5, .7, .95]), np.array([.2, .3, .5, .7, .9])
244 | long_models['informational'] = np.array([.4, .6, .7, .8, .9]), np.array([.1, .2, .3, .4, .5])
245 | long_models['perfect'] = np.array([.0, .2, .4, .8, 1.]), np.array([.0, .0, .0, .0, .0])
246 | long_models['almost_random'] = np.array([.4, .45, .5, .55, .6]), np.array([.5, .5, .5, .5, .5])
247 | long_models['random'] = np.array([.5, .5, .5, .5, .5]), np.array([.0, .0, .0, .0, .0])
248 | long_models['ex_per_1'] = np.array([.0, .2, .4, .8, 1.]), 1.0
249 | 
250 | all_models = {'short': short_models, 'binary': bin_models, 'long': long_models}
251 | 
252 | def get_click_models(keywords):
253 |     '''
254 |     Convenience function which returns click
models corresponding with keywords. 255 | only returns click functions for one data type: (bin,short,long) 256 | ''' 257 | type_name = None 258 | type_keyword = None 259 | # print("Keywords: ", keywords) 260 | for keyword in keywords: 261 | assert (keyword in synonyms) or (keyword in attack_synonyms) 262 | if keyword in synonyms and synonyms[keyword] in all_models: 263 | type_name = synonyms[keyword] 264 | type_keyword = keyword 265 | break 266 | assert type_name is not None and type_keyword is not None 267 | 268 | models_type = all_models[type_name] 269 | full_names = [] 270 | for key in keywords: 271 | if key in synonyms and key != type_keyword: 272 | full_names.append(synonyms[key]) 273 | if key in attack_synonyms: 274 | full_names.append(attack_synonyms[key]) 275 | 276 | click_models = [] 277 | 278 | for full in full_names: 279 | if full in attack_synonyms: 280 | c_m = MaliciousClickModel(full, type_name) 281 | elif full == 'ex_per_1': 282 | c_m = ExamineClickModel(full, type_name, *models_type[full]) 283 | else: 284 | c_m = ClickModel(full, type_name, *models_type[full]) 285 | click_models.append(c_m) 286 | 287 | return click_models -------------------------------------------------------------------------------- /utils/datasimulation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import random 4 | import time 5 | import sharedmem 6 | import datetime 7 | import gc 8 | import sys 9 | import numpy as np 10 | from attacksimulation import AttackSimulation 11 | from multiprocessing import Process, Queue 12 | from Queue import Empty 13 | from utils.clicks import get_click_models 14 | from utils.datasetcollections import get_datasets 15 | from utils.simulationoutput import SimulationOutput, get_simulation_report 16 | from utils.attackeroutput import AttackerOutput 17 | from utils.averageoutput import OutputAverager 18 | from utils.attackeraverager import AttackerAverager 19 | 20 | 21 | class DataSimulation(object): 22 | 23 | """ 24 | Class designed to manage the multiprocessing of simulations over multiple datasets. 
25 | """ 26 | 27 | def __init__(self, simulation_arguments): 28 | self.sim_args = simulation_arguments 29 | self.num_proc = simulation_arguments.n_processing 30 | self.n_runs = simulation_arguments.n_runs 31 | 32 | self.output_queue = Queue() 33 | self.single_sims = [] 34 | self.processes = [] 35 | 36 | self.folds_in_mem = 0 37 | self.max_folds = 999 38 | 39 | self.output_averager = OutputAverager(simulation_arguments) 40 | self.attacker_averager = AttackerAverager(simulation_arguments) 41 | self.report_output = get_simulation_report(simulation_arguments) 42 | sys.stdout = self.report_output 43 | sys.stderr = self.report_output 44 | 45 | 46 | def run(self, ranker_tuples): 47 | starttime = time.time() 48 | self.active = 0 49 | self.click_models = {} 50 | self.run_outputs = {} 51 | self.all_launched = {} 52 | self.run_index = 0 53 | self.read_index = 0 54 | self.clean_index = 0 55 | self._launched = 0 56 | self._outputs_found = 0 57 | datasets = list(get_datasets(self.sim_args)) 58 | for dataset in datasets: 59 | self.max_folds = min(self.max_folds, dataset.max_folds) 60 | if not dataset.click_model_type in self.click_models: 61 | missing_type = dataset.click_model_type 62 | missing_models = get_click_models(self.sim_args.click_models 63 | + [dataset.click_model_type]) 64 | self.click_models[missing_type] = missing_models 65 | 66 | for process in self.load_processes(datasets, ranker_tuples): 67 | self._launched += 1 68 | process.start() 69 | while self.update_active() >= self.num_proc: 70 | self.wait_for_output() 71 | 72 | while self._expecting_output(): 73 | self.wait_for_output() 74 | self.update_active() 75 | 76 | seconds_past = time.time() - starttime 77 | print ('Time taken: %s (%d seconds)' % 78 | (str(datetime.timedelta(seconds=seconds_past)), 79 | seconds_past)) 80 | 81 | for output in self.run_outputs.values(): 82 | print 'OP: ', output, ' -> ', output.finished() 83 | assert all(output.finished() for output in self.run_outputs.values()), \ 84 | 'Program exiting but not all outputs were finished.' 85 | 86 | def load_processes(self, datasets, ranker_tuples): 87 | for dataset in datasets: 88 | for datafold in dataset.get_data_folds(self.sim_args): 89 | for proc in self.load_datafold_processes(datafold, ranker_tuples): 90 | yield proc 91 | self.all_launched[datafold] = True 92 | while self.folds_in_mem >= dataset.max_folds: 93 | self.wait_for_output() 94 | 95 | def load_datafold_processes(self, datafold, ranker_tuples): 96 | while self.folds_in_mem >= datafold.max_folds: 97 | self.wait_for_output() 98 | self.update_active() 99 | print 'Read %d: Fold %d of dataset %s.' 
% (self.read_index, 100 | datafold.fold_num + 1, datafold.name) 101 | datafold.read_data() 102 | self.read_index += 1 103 | self.wait_for_output() 104 | self.update_active() 105 | for run_name, r_class, r_new_args in ranker_tuples: 106 | r_args = r_class.default_parameters() 107 | r_args.update(r_new_args) 108 | output_key = run_name, datafold.name 109 | attacker_output_key = run_name, datafold.name, "attacker" 110 | if not output_key in self.run_outputs: 111 | self.run_outputs[output_key] = SimulationOutput( 112 | self.sim_args, run_name, datafold, 113 | len(self.click_models[datafold.click_model_type]), r_args, 114 | self.output_averager) 115 | if not attacker_output_key in self.run_outputs: 116 | self.run_outputs[attacker_output_key] = AttackerOutput( 117 | self.sim_args, run_name, datafold, 118 | len(self.click_models[datafold.click_model_type]), r_args, 119 | self.attacker_averager) 120 | for c_m in self.click_models[datafold.click_model_type]: 121 | sim = AttackSimulation(self.sim_args, self.output_queue, c_m, datafold) 122 | ranker_setup = r_class, r_args 123 | r_args['n_results'] = self.sim_args.n_results 124 | r_args['n_features'] = datafold.num_features 125 | for i in xrange(datafold.num_runs_per_fold): 126 | new_proc = Process(target=self.start_run, args=(sim, output_key, attacker_output_key, ranker_setup, 127 | self.run_index)) 128 | self.processes.append((new_proc, datafold)) 129 | print 'Launch %d: %s %d with click model %s on fold %d from dataset %s.' % ( 130 | self.run_index, 131 | run_name, 132 | i, 133 | c_m.name, 134 | datafold.fold_num + 1, 135 | datafold.name, 136 | ) 137 | self.run_index += 1 138 | self.report_output.flush() 139 | yield new_proc 140 | 141 | def start_run(self, simulation, output_key, attacker_output_key, ranker_setup, seed=0): 142 | """ 143 | Performs a single run. 144 | Random functions get different seeds for each process. 145 | """ 146 | random.seed((time.time(), seed)) 147 | np.random.seed(int(time.time() + seed * 100 + seed)) 148 | rankerclass, ranker_args = ranker_setup 149 | ranker = rankerclass(**ranker_args) 150 | # print("ranker class: ", rankerclass.learning_rate) 151 | simulation.run(ranker, output_key=output_key, attacker_output_key=attacker_output_key) 152 | 153 | def update_active(self): 154 | """ 155 | Checks how many child processes are still active. 156 | """ 157 | dead_processes = [p for p in self.processes if not p[0].is_alive()] 158 | self.processes = [p for p in self.processes if p[0].is_alive()] 159 | alive_folds = {} 160 | for _, datafold in self.processes: 161 | alive_folds[datafold] = True 162 | self.folds_in_mem = len(alive_folds) 163 | 164 | self.max_folds = min([999] + [datafold.max_folds for datafold in alive_folds]) 165 | self.active = len(self.processes) 166 | dead_datafolds = {} 167 | for proc, datafold in dead_processes: 168 | proc.join() 169 | if not datafold in alive_folds and datafold in self.all_launched: 170 | dead_datafolds[datafold] = True 171 | 172 | for datafold in dead_datafolds: 173 | print 'Clean %d: Fold %d of dataset %s.' 
% (self.clean_index, datafold.fold_num + 1,
174 |                 datafold.name)
175 |             datafold.clean_data()
176 |             self.clean_index += 1
177 | 
178 |         # make extra sure that the process is removed from memory
179 |         del dead_processes
180 |         gc.collect()
181 | 
182 |         # print 'Folds %d max folds %d active %d' % (self.folds_in_mem, self.max_folds, self.active)
183 |         return self.active
184 | 
185 |     def wait_for_output(self, timeout=50):
186 |         """
187 |         Prints output for all finished threads.
188 |         """
189 |         found = not self._expecting_output()
190 |         try:
191 |             while True:
192 |                 output_key, run_output = self.output_queue.get(block=not found, timeout=timeout)
193 |                 found = True
194 |                 sim_output = self.run_outputs[output_key]
195 |                 print 'Output %d: %s on dataset %s. (%d/%d)' % (self._outputs_found, output_key[0],
196 |                     output_key[1], sim_output.run_index+1, sim_output.expected_runs())
197 |                 sim_output.write_run_output(run_output)
198 |                 self._outputs_found += 1
199 |         except Empty:
200 |             pass
201 |         self.update_active()
202 | 
203 |     def _expecting_output(self):
204 |         return self._outputs_found < 2*self._launched
205 | 
--------------------------------------------------------------------------------
/utils/evaluate.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from numpy import log2
4 | from random import sample
5 | import numpy as np
6 | import math
7 | import random
8 | 
9 | 
10 | def get_dcg(ordered_labels):
11 |     return np.sum((2 ** ordered_labels - 1) / np.log2(np.arange(ordered_labels.shape[0]) + 2))
12 | 
13 | 
14 | def get_idcg(complete_labels, max_len):
15 |     return get_dcg(np.sort(complete_labels)[:-1 - max_len:-1])
16 | 
17 | 
18 | def get_single_ndcg_for_rankers(descending_rankings, document_labels, max_len, idcg=None):
19 |     if idcg is None:
20 |         idcg = get_idcg(document_labels, max_len)
21 |     if idcg == 0:
22 |         return np.zeros(descending_rankings.shape[0])
23 |     return get_single_dcg_for_rankers(descending_rankings, document_labels, max_len)/idcg
24 | 
25 | 
26 | def get_single_dcg_for_rankers(descending_rankings, document_labels, max_len):
27 |     displayed_rankings = descending_rankings[:, :max_len]
28 |     displayed_labels = document_labels[displayed_rankings]
29 |     return np.sum((2 ** displayed_labels - 1) / np.log2(np.arange(displayed_labels.shape[1])
30 |                                                         + 2)[None, :], axis=1)
31 | 
32 | 
33 | def get_ndcg_with_labels(ranking, labels, max_len):
34 |     '''
35 |     Calculates the NDCG from a single inverted ranking (the position of every document) and the corresponding labels.
36 |     '''
37 |     idcg = get_idcg(np.asarray(labels), max_len)
38 |     if idcg == 0:
39 |         return 0
40 |     nominators = [2. ** label - 1. for label in labels]
41 |     denominators = [math.log(r+2., 2) for r in ranking]
42 |     for i in range(len(ranking)):
43 |         if ranking[i] >= max_len:
44 |             nominators[i] = 0
45 | 
46 |     ndcg = 0
47 |     for i in range(len(nominators)):
48 |         ndcg += nominators[i] / denominators[i] / idcg
49 |     return ndcg
50 | 
51 | 
52 | def get_ndcg_with_ranking(model_ranking, ideal_ranking, num_relevant, max_len):
53 |     '''
54 |     Given the model ranking and the attacker's (ideal) ranking, calculates the NDCG performance.
55 |     This score measures how close the two rankings are.
56 |     '''
57 | 
58 |     # Re-invert the model ranking, e.g. [2,3,4,1,0] => [4,3,0,1,2] (document 0 sits at position 2, so position 2 holds document 0, etc.).
59 |     # This is required because the ideal ranking is not inverted while the model_ranking is.
60 | 
61 |     non_inv_model_ranking = [0 for i in range(len(model_ranking))]
62 | 
63 |     for i in range(len(model_ranking)):
64 |         if model_ranking[i] < len(non_inv_model_ranking):
65 |             non_inv_model_ranking[model_ranking[i]] = i
66 | 
67 |     # Creating labels for the attacker. The num_relevant documents in the ideal ranking (attacker's ranking) are relevant (1), the others are not (0).
68 |     labels = [0 for i in range(len(model_ranking))]
69 |     relevant_ideal_ranking = ideal_ranking[:num_relevant]
70 | 
71 |     for document in non_inv_model_ranking:
72 |         if document in relevant_ideal_ranking and document < len(labels):
73 |             labels[document] = 1
74 | 
75 |     return get_ndcg_with_labels(model_ranking, labels, max_len)
76 | 
77 | 
78 | def evaluate(inverted_rankings, label_vector, idcg_vector, n_queries, max_len):
79 |     '''
80 |     Calculates the average NDCG over all queries.
81 |     inverted_rankings: the position of every document
82 |     label_vector: the relevance label of every document
83 |     idcg_vector: precomputed ideal DCG, spread per document (get_idcg_list with spread=True)
84 |     '''
85 | 
86 |     # gain of every document based on its relevance label
87 |     nominators = 2. ** label_vector - 1.
88 | 
89 |     # position-based discount (positions start at 0)
90 |     denominators = np.log2(inverted_rankings + 2.)
91 | 
92 |     # documents ranked beyond the display cutoff do not contribute to the DCG
93 |     nominators[inverted_rankings >= max_len] = 0
94 | 
95 |     idcg_copy = np.copy(idcg_vector)
96 |     idcg_copy[idcg_vector == 0] = 1
97 |     return np.sum(nominators / denominators / idcg_copy) / n_queries
98 | 
99 | 
100 | def get_dcg_from_matrix(label_matrix, n_vector, max_len):
101 |     label_matrix = label_matrix[:, :max_len]
102 | 
103 |     nominators = 2 ** label_matrix - 1
104 |     nominators[np.arange(max_len)[None, :] >= n_vector[:, None]] = 0
105 | 
106 |     denominator = np.log2(np.arange(max_len) + 2)
107 |     idcg_vector = np.sum(nominators / denominator[None, :], axis=1)
108 | 
109 |     return idcg_vector
110 | 
111 | 
112 | def get_idcg_list(label_vector, qptr, max_len, spread=False):
113 | 
114 |     n = qptr[1:] - qptr[:-1]
115 |     max_documents = np.max(n)
116 | 
117 |     starts = np.zeros(n.shape[0] + 1, dtype=np.int32)
118 |     starts[1:] = np.cumsum(n)
119 | 
120 |     ind = starts[:-1, None] + np.arange(0, max_documents)[None, :]
121 |     ind = np.minimum(ind, starts[1:, None] - 1)
122 | 
123 |     label_matrix = label_vector[ind]
124 |     label_matrix[np.arange(max_documents)[None, :] >= n[:, None]] = 0
125 |     label_matrix = np.sort(label_matrix, axis=1)[:, ::-1]
126 | 
127 |     idcg_list = get_dcg_from_matrix(label_matrix, n, max_len)
128 | 
129 |     if spread:
130 |         spread_ind = np.zeros(qptr[-1], dtype=np.int32)
131 |         spread_ind[qptr[1:-1]] = 1
132 |         spread_ind = np.cumsum(spread_ind)
133 | 
134 |         return idcg_list[spread_ind]
135 |     else:
136 |         return idcg_list
137 | 
--------------------------------------------------------------------------------
/utils/rankings.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def invert_rankings(rankings, dtype=None):
4 |     '''
5 |     Invert indices in a matrix of rankings, one ranking per row.
6 |     '''
7 |     if dtype is None:
8 |         inverted = np.zeros(rankings.shape)
9 |     else:
10 |         inverted = np.zeros(rankings.shape, dtype=dtype)
11 |     inverted[np.arange(rankings.shape[0])[:,None],rankings] = np.arange(rankings.shape[1])[None,:]
12 |     return inverted
13 | 
14 | def invert_ranking(ranking, dtype=None):
15 |     """
16 |     'Inverts' a ranking: each element gets the index it has in the ranking.
17 |     [2,0,1] becomes [1,2,0]
18 |     """
19 |     if dtype is None:
20 |         inverted = np.zeros(ranking.shape)
21 |     else:
22 |         inverted = np.zeros(ranking.shape, dtype=dtype)
23 |     inverted[ranking] = np.arange(ranking.shape[0])
24 |     return inverted
25 | 
26 | def tiebreak_sort(unranked, n_results=None, full_sort=False):
27 |     if full_sort or n_results is None:
28 |         n_results = unranked.shape[-1]
29 |     return _tiebreak_sort(unranked, n_results)
30 | 
31 | def _tiebreak_sort(unranked, n_results):
32 |     """
33 |     Sorts rows of a matrix using tiebreakers, along the last axis.
34 |     """
35 | 
36 |     n_axis = len(unranked.shape)
37 |     assert (n_axis == 1 or n_axis == 2)
38 | 
39 |     tiebreakers = np.random.random(unranked.shape)
40 |     complex_predictions = np.empty(unranked.shape, dtype=complex)
41 |     complex_predictions.real = unranked  # scores
42 |     complex_predictions.imag = tiebreakers  # random numbers to break ties
43 | 
44 |     max_n_docs = unranked.shape[-1]
45 |     max_part = np.minimum(n_results, max_n_docs)
46 |     if max_part == max_n_docs:
47 |         return np.argsort(complex_predictions, axis=-1)
48 | 
49 |     part = np.argpartition(complex_predictions, max_part-1, axis=-1)
50 |     slice_ind = (slice(None),) * (len(unranked.shape)-1)
51 |     slice_ind += (slice(0,max_part),)
52 | 
53 |     if n_axis == 1:
54 |         part_pred = complex_predictions[part[slice_ind]]
55 |         front_sort = np.argsort(part_pred, axis=-1)
56 |         part[slice_ind] = part[slice_ind][front_sort]
57 |     else:
58 |         # row indices are needed to index the two-dimensional case
59 |         part_sliced = part[slice_ind]
60 |         extra_ind = np.empty(part_sliced.shape, dtype=np.int32)
61 |         extra_ind[:,:] = np.arange(unranked.shape[0])[:,None]
62 |         part_pred = complex_predictions[extra_ind, part[slice_ind]]
63 |         front_sort = np.argsort(part_pred, axis=-1)  # index array from lowest prediction score to highest
64 |         part_sliced[:, :] = part_sliced[extra_ind, front_sort]
65 | 
66 |     return part
67 | 
68 | def get_score_rankings(weights,feature_matrix,qptr,max_documents=None, inverted=False):
69 |     """
70 |     Given weights and a feature matrix, the documents are scored by their dot product and ranked accordingly.
71 |     """
72 |     # minus to reverse the ranking
73 |     predictions = -np.squeeze(np.dot(weights.T,feature_matrix))
74 |     return rank_queries(predictions,qptr,max_documents=max_documents,inverted=inverted)
75 | 
76 | def rank_queries(predictions, qptr, max_documents=None, inverted=False):
77 |     """
78 |     Given predicted scores for queries, rankings are generated and returned.
79 |     """
80 | 
81 |     max_value = np.max(predictions)
82 |     # vector with the length of each document list
83 |     n = qptr[1:]-qptr[:-1]
84 |     if not max_documents:
85 |         max_documents = np.max(n)
86 | 
87 |     # the vector of documents is reshaped into a matrix
88 |     # with a document list on every row
89 |     ind = qptr[:-1,None] + np.arange(0,max_documents)[None,:]
90 |     ind = np.minimum(ind,qptr[1:,None]-1)
91 |     # warped is now a matrix of size n_queries x max_documents
92 |     warped = predictions[ind]
93 |     # every document that appears in a row but not in the query list
94 |     # (due to n_query_list < max_documents) gets the worst score of all documents
95 |     # this makes sure they do not appear in the final ranking
96 |     warped[np.arange(max_documents)[None,:] >= n[:,None]] = max_value + 1
97 | 
98 |     # tiebreak sort uses numpy to rank every row in the matrix
99 |     # this is faster than ranking the rows with separate calls
100 |     rankings = tiebreak_sort(warped)
101 |     if inverted:
102 |         inverted = invert_rankings(rankings,dtype=np.int32)
103 |         return inverted[np.arange(max_documents)[None,:] < n[:,None]]
104 | 
105 |     else:
106 |         return rankings[np.arange(max_documents)[None,:] < n[:,None]]
107 | 
108 | def rank_query(predictions, inverted=False, n_results=None):
109 |     """
110 |     Given predicted scores of a single query, returns a ranking.
111 | """ 112 | ranking = tiebreak_sort(predictions, n_results) 113 | if inverted: 114 | if len(ranking.shape) == 1: 115 | return invert_ranking(ranking,dtype=np.int32) 116 | else: 117 | return invert_rankings(ranking,dtype=np.int32) 118 | else: 119 | return ranking 120 | 121 | def rank_candidate_queries(weights,feature_matrix,qptr,n_results=None,inverted=False): 122 | n_docs = feature_matrix.shape[1] 123 | scores = -np.dot(weights,feature_matrix) 124 | qid_per_doc = np.zeros(n_docs, dtype=np.int32) 125 | qid_per_doc[qptr[1:-1]] = 1 126 | qid_per_doc = np.cumsum(qid_per_doc) 127 | 128 | index_offset = np.zeros(n_docs, dtype=np.int32) 129 | index_offset[:] = qptr[qid_per_doc] 130 | 131 | score_offset = (np.max(np.abs(scores),axis=1)+1.)[:,None]*qid_per_doc[None,:] 132 | scores += score_offset 133 | 134 | descending = rank_query(scores, n_results=n_results) 135 | 136 | if not inverted: 137 | descending -= index_offset[None,:] 138 | return descending, None 139 | else: 140 | inverted = invert_rankings(descending, dtype=np.int64) 141 | descending -= index_offset[None,:] 142 | inverted -= index_offset[None,:] 143 | return descending, inverted 144 | 145 | def get_query_scores(weights, feature_matrix, qptr, ranking_i): 146 | return -np.dot(weights.T,feature_matrix[:,qptr[ranking_i]:qptr[ranking_i+1]]) 147 | 148 | def get_candidate_score_rankings(weights, feature_matrix, qptr, ranking_i, inverted=False): 149 | scores = -np.dot(weights.T,feature_matrix[:,qptr[ranking_i]:qptr[ranking_i+1]]) 150 | return rank_query(scores,inverted) 151 | 152 | def get_candidate_score_ranking(weights,query_feature_matrix,inverted=False): 153 | scores = -np.dot(weights.T,query_feature_matrix) 154 | return rank_query(scores,inverted) 155 | 156 | def rank_single_query(predictions, inverted=False, n_results=None): 157 | """ 158 | Given predicted scores of a single query returns rankings. 159 | """ 160 | ranking = tiebreak_sort(predictions, n_results=n_results) 161 | if inverted: 162 | if len(ranking.shape) == 1: 163 | return invert_ranking(ranking, dtype=np.int32) 164 | else: 165 | return invert_rankings(ranking, dtype=np.int32) 166 | else: 167 | return ranking 168 | 169 | def rank_multiple_queries(predictions, qptr, max_documents=None, 170 | inverted=False, n_results=None): 171 | """ 172 | Given predicted scores for queries rankings are generated and returned. 
173 |     """
174 | 
175 |     max_value = np.max(predictions)
176 |     # vector with the length of each document list
177 |     n = qptr[1:]-qptr[:-1]
178 |     if not max_documents:
179 |         max_documents = np.max(n)
180 | 
181 |     # the vector of documents is reshaped into a matrix
182 |     # with a document list on every row
183 |     ind = qptr[:-1,None] + np.arange(0,max_documents)[None,:]
184 |     ind = np.minimum(ind,qptr[1:,None]-1)
185 |     # warped is now a matrix of size n_queries x max_documents
186 |     warped = predictions[ind]
187 |     # every document that appears in a row but not in the query list
188 |     # (due to n_query_list < max_documents) gets the worst score of all documents
189 |     # this makes sure they do not appear in the final ranking
190 |     warped[np.arange(max_documents)[None,:] >= n[:,None]] = max_value + 1
191 | 
192 |     # tiebreak sort uses numpy to rank every row in the matrix
193 |     # this is faster than ranking the rows with separate calls
194 |     rankings = tiebreak_sort(warped, n_results=n_results)
195 |     if inverted:
196 |         inverted = invert_rankings(rankings, dtype=np.int32)  # index is the document id, content is its rank: inverted[10] = 0 means document 10 has the highest score
197 |         return inverted[np.arange(max_documents)[None,:] < n[:,None]]
198 |     else:
199 |         return rankings[np.arange(max_documents)[None,:] < n[:,None]]
--------------------------------------------------------------------------------
/utils/simulationoutput.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import json
4 | import os
5 | import sys
6 | import time
7 | from datetime import timedelta
8 | 
9 | def create_folders(filename):
10 |     if not os.path.exists(os.path.dirname(filename)):
11 |         os.makedirs(os.path.dirname(filename))
12 | 
13 | class FileOutput(object):
14 | 
15 |     def __init__(self, output_file_path, output_header=None, close_between_writes=False,
16 |                  also_print=False, write_date=False):
17 |         self._output_file_path = output_file_path
18 |         self._close_between_writes = close_between_writes
19 |         self._also_print = also_print
20 |         self._original_stdout = sys.stdout
21 |         self.write_date = write_date
22 |         create_folders(self._output_file_path)
23 |         self._output_file = open(self._output_file_path, 'w')
24 |         self._file_open = True
25 |         self._new_line = True
26 |         self._closed = False
27 |         if output_header is not None:
28 |             self.write(output_header)
29 |             self._end_write()
30 | 
31 |     def _open_file(self):
32 |         if not self._file_open:
33 |             self._output_file = open(self._output_file_path, 'a')
34 |             self._file_open = True
35 | 
36 |     def _close_file(self):
37 |         self._output_file.close()
38 |         self._file_open = False
39 | 
40 |     def _end_write(self):
41 |         if self._close_between_writes:
42 |             self._close_file()
43 | 
44 |     def _write_str_to_file(self, output_str):
45 |         self._output_file.write(output_str)
46 |         self._new_line = output_str[-1] == '\n'
47 | 
48 |     def flush(self):
49 |         if self._also_print:
50 |             self._original_stdout.flush()
51 |         self._output_file.flush()
52 | 
53 |     def write(self, output, skip_write_date=False):
54 |         assert not self._closed
55 |         # if isinstance(output, str):
56 |         #     output = [output]
57 |         # elif isinstance(output, list):
58 |         #     output = [line + '\n' for line in output]
59 |         # assert type(output) is list, 'Expected output to be list, found %s' % type(output)
60 |         self._open_file()
61 |         for line in output:
62 |             if self.write_date and self._new_line and not skip_write_date:
63 |                 line = '%s: %s' % (time.strftime('%c'), str(line))
64 |             # assert type(line) is str, 'Output element %s is not a str' % line
65 |             self._write_str_to_file(str(line))
66 |             if self._also_print:
67 |                 self._original_stdout.write(line)
68 |         self._end_write()
69 | 
70 |     def close(self):
71 |         self._close_file()
72 |         self._closed = True
73 |         if self._also_print:
74 |             self._original_stdout.write('\n')
75 | 
76 | 
77 | class PrintOutput(object):
78 | 
79 |     def __init__(self, output_header=None, write_date=False):
80 |         self.write_date = write_date
81 |         if output_header is not None:
82 |             self.write(output_header)
83 | 
84 |     def write(self, output):
85 |         if type(output) is str:
86 |             output = [output]
87 |         assert type(output) is list, 'Expected output to be list, found %s' % type(output)
88 |         for line in output:
89 |             if self.write_date:
90 |                 line = '%s: %s' % (time.strftime('%c'), line)
91 |             print line
92 | 
93 |     def close(self):
94 |         pass
95 | 
96 | 
97 | def get_simulation_report(simulation_arguments):
98 |     file_name = sys.argv[0]
99 |     if file_name[-3:] == ".py":
100 |         file_name = file_name[:-3].split("/")[-1]
101 |     date_str = file_name + "-" + time.strftime('Log-%y-%m-%d-%X')
102 | 
103 |     if simulation_arguments.log_folder is not None \
104 |             and os.path.isdir(simulation_arguments.log_folder):
105 |         output_path = simulation_arguments.log_folder + '/' + date_str.replace(' ', '-') + '.txt'
106 |         header = ['Starting simulation at %s.' % date_str, 'Log is also stored in output file at %s'
107 |                   % output_path]
108 |         return FileOutput(output_path, output_header=header, also_print=True, write_date=True)
109 |     else:
110 |         header = ['Starting simulation.',
111 |                   'WARNING: No log folder found, log is not stored elsewhere.']
112 |         return PrintOutput(output_header=header, write_date=True)
113 | 
114 | 
115 | class SimulationOutput(object):
116 | 
117 |     """
118 |     Class that collects and stores the output of all runs of one simulation on one dataset.
119 |     """
120 | 
121 |     def __init__(self, simulation_arguments, simulation_name, dataset, num_click_models,
122 |                  ranker_arguments, output_averager):
123 |         self._start_time = time.time()
124 |         self.run_index = 0
125 |         self.output_folder = simulation_arguments.output_folder
126 |         self.simulation_name = simulation_name
127 |         self.dataset_name = dataset.name
128 |         self.output_averager = output_averager
129 |         self.print_output = simulation_arguments.print_output
130 |         self._expected_runs = dataset.num_runs_per_fold * dataset.num_folds * num_click_models
131 |         self._closed = False
132 |         self.output_path = '%s/%s/%s.out' % (self.output_folder, self.dataset_name,
133 |                                              self.simulation_name)
134 |         combined_args = {
135 |             'simulation_arguments': vars(simulation_arguments),
136 |             'ranker_arguments': ranker_arguments,
137 |         }
138 |         if self.print_output:
139 |             output_header = json.dumps(combined_args, sort_keys=True,
140 |                                        indent=4, separators=(',', ': '))
141 |             self.file_output = BufferPrintOutput(output_header=output_header)
142 |         else:
143 |             output_header = json.dumps(combined_args, separators=(',',':'))
144 |             self.file_output = FileOutput(self.output_path, output_header=output_header,
145 |                                           close_between_writes=True, also_print=False,
146 |                                           write_date=False)
147 | 
148 |     def expected_runs(self):
149 |         return self._expected_runs
150 | 
151 |     def finished(self):
152 |         return self._closed and self.run_index == self._expected_runs
153 | 
154 |     def write_run_output(self, run_output):
155 |         assert not self._closed, 'Simulation Output (%s) written to after being closed.' \
\ 156 | % self.output_path 157 | 158 | if self.print_output: 159 | # self.file_output.write(json.dumps(run_output, sort_keys=True, 160 | # indent=4, separators=(',', ': '))) 161 | self.file_output.pretty_run_write(self.run_index, run_output) 162 | else: 163 | self.file_output.write('\n%s' % json.dumps(run_output)) 164 | 165 | self.run_index += 1 166 | if self.run_index >= self._expected_runs: 167 | self.close() 168 | 169 | def close(self, output_file=None): 170 | # self.file_output.write(['--------END--------']) 171 | # total_time = time.time() - self._start_time 172 | # seconds = total_time % 60 173 | # minutes = total_time / 60 % 60 174 | # hours = total_time / 3600 175 | # self.file_output.write(['Total time taken %02d:%02d:%02d' % (hours, minutes, seconds)]) 176 | self.file_output.close() 177 | self._closed = True 178 | if not self.print_output: 179 | self.output_averager.create_average_file(self) 180 | 181 | 182 | class BufferPrintOutput(object): 183 | 184 | def __init__(self, output_header=None): 185 | self._closed = False 186 | self._output_list = [] 187 | if not output_header is None: 188 | self.write(output_header) 189 | 190 | def flush(self): 191 | pass 192 | 193 | def write(self, output): 194 | assert not self._closed 195 | assert type(output) is str, 'Wrong output format %s' % type(output) 196 | self._output_list.append(output) 197 | 198 | def pretty_run_write(self, run_index, run_output): 199 | run_details = run_output['run_details'] 200 | run_lines = [ 201 | "RUN: %d" % run_index, 202 | "DATAFOLD: %s" % run_details['data folder'], 203 | "CLICK MODEL: %s" % run_details['click model'], 204 | "RUN TIME: %s (%.02f seconds)" % (timedelta(seconds=run_details['runtime']), 205 | run_details['runtime']) 206 | ] 207 | tag = run_details['held-out data'] 208 | for event in run_output['run_results']: 209 | str_line = str(event['iteration']) 210 | if 'display' in event: 211 | str_line += ' DISPLAY: %0.3f' % event['display'] 212 | if 'heldout' in event: 213 | str_line += ' %s: %0.3f' % (tag, event['heldout']) 214 | run_lines.append(str_line) 215 | for line in run_lines: 216 | self.write(line) 217 | 218 | def close(self): 219 | self._closed = True 220 | print 'Run Output\n' + '\n'.join(self._output_list) 221 | self._output_list = [] 222 | --------------------------------------------------------------------------------