├── CreateCompetitionData
│   └── createCompetitionData.py
├── LICENSE.md
├── PostgresIngest
│   ├── exploratoryQueries.sql
│   └── ingest.sql
├── PythonBenchmark
│   ├── SETTINGS.json
│   ├── data_io.py
│   ├── data_io.pyc
│   ├── predict.py
│   ├── randomBenchmark.py
│   ├── reverse_predictions.py
│   ├── testOrderBenchmark.py
│   └── train.py
└── README.md

/CreateCompetitionData/createCompetitionData.py:
--------------------------------------------------------------------------------
from __future__ import division
import csv
import heapq
from itertools import ifilter
import json
import numpy as np
import os

def expedia_personalized_sort_data_split_seed():
    seed_path = os.environ["SeedPath"]
    seeds = json.loads(open(seed_path).read())
    return seeds["ExpediaPersonalizedSortDataSplitSeed"]

def get_ids(raw_path):
    ids_dict = { "srch_ids"        : set()
               , "site_ids"        : set()
               , "country_ids"     : set()
               , "property_ids"    : set()
               , "destination_ids" : set()}
    f = open(raw_path)
    # skip the header
    f.readline()
    reader = csv.reader(f, delimiter="\t")
    for row in reader:
        ids_dict["srch_ids"].add(row[0])
        ids_dict["site_ids"].add(row[2])
        ids_dict["country_ids"].add(row[3])
        ids_dict["country_ids"].add(row[6])
        ids_dict["property_ids"].add(row[7])
        ids_dict["destination_ids"].add(row[17])
    f.close()
    return ids_dict

def remap_ids(ids_dict):
    ids_map = {}
    for key in ids_dict:
        unique_ids = list(ids_dict[key])
        np.random.shuffle(unique_ids)
        ids_map[key] = {old_id: str(new_id+1)
                        for new_id, old_id in enumerate(unique_ids)}
    return ids_map

def save_ids_map(ids_map, release_path):
    for key in ids_map:
        ids_list = [(old_id, ids_map[key][old_id]) for old_id in ids_map[key]]
        f = open(os.path.join(release_path, "%s_id_map.csv" % key), "w")
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["OldId", "NewId"])
        writer.writerows(ids_list)
        f.close()

def remap_row(ids_map, row):
    row[0] = ids_map["srch_ids"][row[0]]
    row[2] = ids_map["site_ids"][row[2]]
    row[3] = ids_map["country_ids"][row[3]]
    row[6] = ids_map["country_ids"][row[6]]
    row[7] = ids_map["property_ids"][row[7]]
    row[17] = ids_map["destination_ids"][row[17]]

def row_reader(ids_map, raw_reader):
    for row in raw_reader:
        remap_row(ids_map, row)
        yield row

def split_search_ids(search_ids, train_frac, valid_frac):
    index = np.arange(len(search_ids))
    np.random.shuffle(index)
    train_end = int(train_frac*len(search_ids))
    valid_end = int((train_frac+valid_frac)*len(search_ids))
    train = index[:train_end]
    valid = index[train_end:valid_end]
    test = index[valid_end:]

    search_ids_split = dict()

    for i in train:
        search_ids_split[search_ids[i]] = "train"
    for i in valid:
        search_ids_split[search_ids[i]] = "valid"
    for i in test:
        search_ids_split[search_ids[i]] = "test"

    return search_ids_split

def row_order_key(row):
    return (int(row[0]), int(row[7]))

def row_filter(row, min_row=None):
    if min_row is None:
        return True
    return row_order_key(row) > row_order_key(min_row)

def create_competition_data():
    data_path = os.path.join(os.environ["DataPath"], "ExpediaPersonalizedSort")
    raw_path = os.path.join(data_path, "Raw", "ExpediaRaw.tsv")
    release_path = os.path.join(data_path, "Release 1")
    train_path = os.path.join(release_path, "train.csv")
    test_path = os.path.join(release_path, "test.csv")
    solution_path = os.path.join(release_path, "solution.csv")
    position_benchmark_path = os.path.join(data_path, "Submissions", "positionBenchmark.csv")

    f_train = open(train_path, "w")
    f_test = open(test_path, "w")
    f_solution = open(solution_path, "w")
    f_position_benchmark = open(position_benchmark_path, "w")

    train_writer = csv.writer(f_train, lineterminator="\n")
    test_writer = csv.writer(f_test, lineterminator="\n")
    solution_writer = csv.writer(f_solution, lineterminator="\n")
    position_writer = csv.writer(f_position_benchmark, lineterminator="\n")

    ids_dict = get_ids(raw_path)
    ids_map = remap_ids(ids_dict)
    save_ids_map(ids_map, release_path)
    search_ids_split = split_search_ids(ids_map["srch_ids"].values(), 0.6, 0.1)

    f_raw = open(raw_path)
    raw_reader = csv.reader(f_raw, delimiter="\t")
    header = raw_reader.next()
    reader = row_reader(ids_map, raw_reader)
    train_writer.writerow(header)
    test_writer.writerow(header[:14] + header[15:51])
    solution_writer.writerow(["SearchId", "PropertyId", "Relevance", "Usage"])
    position_writer.writerow(["SearchId", "PropertyId"])

    rows_remaining = True
    min_row = None
    i = 0
    batch_size = 500000

    # Ugly & slow way to do the sort with limited memory, but it works.
    # If this were on Unix, we would save to a temp file and then use Unix sort.
    while rows_remaining:
        remaining_rows = ifilter(lambda r: row_filter(r, min_row), reader)
        nsmallest = heapq.nsmallest(batch_size, remaining_rows, key=row_order_key)
        solution_set = [row for row in nsmallest if search_ids_split[row[0]] != "train"]
        for row in sorted(solution_set, key=lambda row: (int(row[0]), int(row[14]))):
            position_writer.writerow([row[0], row[7]])
        for row in nsmallest:
            if search_ids_split[row[0]] == "train":
                train_writer.writerow(row)
            else:
                test_writer.writerow(row[:14]+row[15:51])
                # relevance: 5 for a booked property, 1 for a click without a
                # booking, 0 otherwise (min caps a click plus a booking at 5)
                relevance = str(min(5, 5*int(row[-1])+int(row[-3])))
                usage = "Public" if search_ids_split[row[0]] == "valid" else "Private"

                solution_row = [row[0], row[7], relevance, usage]
                solution_writer.writerow(solution_row)
        i += len(nsmallest)
        print("%dk rows processed" % int(i/1000))
        if len(nsmallest) == batch_size:
            min_row = nsmallest[-1]
        else:
            rows_remaining = False
        f_raw.close()
        f_raw = open(raw_path)
        raw_reader = csv.reader(f_raw, delimiter="\t")
        header = raw_reader.next()
        reader = row_reader(ids_map, raw_reader)

    f_train.flush()
    f_test.flush()
    f_solution.flush()
    f_position_benchmark.flush()

    f_raw.close()
    f_train.close()
    f_test.close()
    f_solution.close()
    f_position_benchmark.close()

if __name__ == "__main__":
    np.random.seed(expedia_personalized_sort_data_split_seed())
    create_competition_data()
--------------------------------------------------------------------------------
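The batched `heapq.nsmallest` loop above re-scans the raw file once per 500,000-row batch. As the in-code comment notes, a temp file plus Unix sort is the more direct route; a minimal sketch of that alternative (hypothetical, not part of the original pipeline; it assumes GNU sort is on the PATH and that a full remapped copy of the data fits on disk):

    import csv
    import subprocess
    import tempfile

    def external_sort(reader, sorted_path):
        # write the remapped rows out once, then let GNU sort order them
        # numerically by srch_id (field 1) and prop_id (field 8)
        with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False) as tmp:
            csv.writer(tmp, lineterminator="\n").writerows(reader)
            tmp_path = tmp.name
        with open(sorted_path, "w") as out:
            subprocess.check_call(["sort", "-t", ",", "-k1,1n", "-k8,8n", tmp_path],
                                  stdout=out)

A single pass through `sort` replaces the repeated full scans, at the cost of one temp-file copy of the data.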
/LICENSE.md:
--------------------------------------------------------------------------------
Copyright (c) 2013, Kaggle
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

Neither the name of Kaggle nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------

/PostgresIngest/exploratoryQueries.sql:
--------------------------------------------------------------------------------
SELECT srch_id,
       COUNT(srch_id)
FROM UserSearch
GROUP BY srch_id
ORDER BY COUNT(srch_id) DESC;

SELECT srch_id,
       SUM(click_bool) num_clicks,
       SUM(booking_bool) num_bookings
FROM UserSearch
GROUP BY srch_id
ORDER BY SUM(click_bool) DESC;

SELECT srch_id,
       SUM(click_bool) num_clicks,
       SUM(booking_bool) num_bookings
FROM UserSearch
GROUP BY srch_id
ORDER BY SUM(booking_bool) DESC;

SELECT SUM(booking_bool) num_bookings,
       COUNT(DISTINCT srch_id) num_searches
FROM UserSearch;

SELECT random_bool,
       COUNT(random_bool)
FROM UserSearch
GROUP BY random_bool;

SELECT min(date_time),
       max(date_time)
FROM UserSearch;

SELECT srch_id, COUNT(DISTINCT date_time)
FROM UserSearch
GROUP BY srch_id
ORDER BY COUNT(DISTINCT date_time) DESC;

SELECT site_id, COUNT(site_id)
FROM UserSearch
GROUP BY site_id;

SELECT site_id, COUNT(site_id)
FROM UserSearch
GROUP BY site_id
ORDER BY COUNT(site_id) DESC;

SELECT prop_country_id, COUNT(prop_country_id)
FROM UserSearch
GROUP BY prop_country_id
ORDER BY COUNT(prop_country_id) DESC;

SELECT prop_id, COUNT(prop_id)
FROM UserSearch
GROUP BY prop_id
ORDER BY COUNT(prop_id) DESC;

SELECT visitor_location_country_id, COUNT(visitor_location_country_id)
FROM UserSearch
GROUP BY visitor_location_country_id
ORDER BY COUNT(visitor_location_country_id) DESC;

SELECT DISTINCT prop_starrating
FROM UserSearch;
--------------------------------------------------------------------------------

/PostgresIngest/ingest.sql:
--------------------------------------------------------------------------------
CREATE TABLE UserSearch (
    srch_id BIGINT,
    date_time TIMESTAMP WITH TIME ZONE,
    site_id BIGINT,
    visitor_location_country_id BIGINT,
    visitor_hist_starrating DOUBLE PRECISION,
    visitor_hist_adr_usd DOUBLE PRECISION,
    prop_country_id BIGINT,
    prop_id BIGINT,
    prop_starrating BIGINT,
    prop_review_score DOUBLE PRECISION,
    prop_brand_bool BIGINT,
    prop_location_score1 DOUBLE PRECISION,
    prop_location_score2 DOUBLE PRECISION,
    prop_log_historical_price DOUBLE PRECISION,
    position BIGINT,
    price_usd DOUBLE PRECISION,
    promotion_flag BIGINT,
    srch_destination_id BIGINT,
    srch_length_of_stay BIGINT,
    srch_booking_window BIGINT,
    srch_adults_count BIGINT,
    srch_children_count BIGINT,
    srch_room_count BIGINT,
    srch_saturday_night_bool BIGINT,
    srch_query_affinity_score DOUBLE PRECISION,
    orig_destination_distance DOUBLE PRECISION,
    random_bool BIGINT,
    comp1_rate DOUBLE PRECISION,
    comp1_inv DOUBLE PRECISION,
    comp1_rate_percent_diff DOUBLE PRECISION,
    comp2_rate DOUBLE PRECISION,
    comp2_inv DOUBLE PRECISION,
    comp2_rate_percent_diff DOUBLE PRECISION,
    comp3_rate DOUBLE PRECISION,
    comp3_inv DOUBLE PRECISION,
    comp3_rate_percent_diff DOUBLE PRECISION,
    comp4_rate DOUBLE PRECISION,
    comp4_inv DOUBLE PRECISION,
    comp4_rate_percent_diff DOUBLE PRECISION,
    comp5_rate DOUBLE PRECISION,
    comp5_inv DOUBLE PRECISION,
    comp5_rate_percent_diff DOUBLE PRECISION,
    comp6_rate DOUBLE PRECISION,
    comp6_inv DOUBLE PRECISION,
    comp6_rate_percent_diff DOUBLE PRECISION,
    comp7_rate DOUBLE PRECISION,
    comp7_inv DOUBLE PRECISION,
    comp7_rate_percent_diff DOUBLE PRECISION,
    comp8_rate DOUBLE PRECISION,
    comp8_inv DOUBLE PRECISION,
    comp8_rate_percent_diff DOUBLE PRECISION,
    click_bool BIGINT,
    gross_bookings_usd DOUBLE PRECISION,
    booking_bool BIGINT);

-- E'\t' specifies the tab delimiter for the raw TSV; adjust the file path to
-- your own checkout. If the server cannot read the file directly, psql's
-- client-side \copy takes the same options.
COPY UserSearch FROM 'C:\Users\ben_000\Dropbox\Data\ExpediaPersonalizedSort\Raw\ExpediaRaw.tsv' WITH (FORMAT csv, DELIMITER E'\t', HEADER true, NULL 'NULL');
--------------------------------------------------------------------------------

/PythonBenchmark/SETTINGS.json:
--------------------------------------------------------------------------------
{
  "model_path": "$DataPath/ExpediaPersonalizedSort/Models/basicPythonBenchmark.pickle",
  "submission_path": "$DataPath/ExpediaPersonalizedSort/Submissions/basicPythonBenchmark.csv",
  "train_path": "$DataPath/ExpediaPersonalizedSort/Release 1/train.csv",
  "test_path": "$DataPath/ExpediaPersonalizedSort/Release 1/test.csv"
}
--------------------------------------------------------------------------------

/PythonBenchmark/data_io.py:
--------------------------------------------------------------------------------
import csv
from operator import itemgetter
import os
import json
import pickle
import pandas as pd

def get_paths():
    paths = json.loads(open("SETTINGS.json").read())
    for key in paths:
        paths[key] = os.path.expandvars(paths[key])
    return paths

def read_train():
    train_path = get_paths()["train_path"]
    return pd.read_csv(train_path)

def read_test():
    test_path = get_paths()["test_path"]
    return pd.read_csv(test_path)

def save_model(model):
    out_path = get_paths()["model_path"]
    pickle.dump(model, open(out_path, "wb"))

def load_model():
    in_path = get_paths()["model_path"]
    return pickle.load(open(in_path, "rb"))

def write_submission(recommendations, submission_file=None):
    if submission_file is None:
        submission_path = get_paths()["submission_path"]
    else:
        path, file_name = os.path.split(get_paths()["submission_path"])
        submission_path = os.path.join(path, submission_file)
    rows = [(srch_id, prop_id)
            for srch_id, prop_id, rank_float
            in sorted(recommendations, key=itemgetter(0, 2))]
    writer = csv.writer(open(submission_path, "w"), lineterminator="\n")
    writer.writerow(("SearchId", "PropertyId"))
    writer.writerows(rows)
--------------------------------------------------------------------------------
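`write_submission` orders rows by `(srch_id, rank_float)` ascending, so within each search the lowest rank value is listed first. A minimal usage sketch with made-up values:

    import data_io

    # (srch_id, prop_id, rank_float); a lower rank_float means shown earlier
    recommendations = [(1, 7, 0.4), (1, 3, 0.1), (2, 9, 0.2)]
    data_io.write_submission(recommendations, "toyExample.csv")
    # toyExample.csv then lists (1, 3) before (1, 7), followed by (2, 9)

This is why predict.py negates its probabilities: ascending order on the negated scores puts the most likely bookings first.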
/PythonBenchmark/data_io.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benhamner/ExpediaPersonalizedSortCompetition/432cfee41ea2949328d56972cc0bfc82b3b81157/PythonBenchmark/data_io.pyc
--------------------------------------------------------------------------------

/PythonBenchmark/predict.py:
--------------------------------------------------------------------------------
import data_io

def main():
    print("Reading test data")
    test = data_io.read_test()
    test.fillna(0, inplace=True)

    feature_names = list(test.columns)
    feature_names.remove("date_time")

    features = test[feature_names].values

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:,1]
    # negate so that write_submission's ascending sort puts the most likely
    # bookings at the top of each search
    predictions = list(-1.0*predictions)
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------

/PythonBenchmark/randomBenchmark.py:
--------------------------------------------------------------------------------
import data_io
import numpy as np

def main():
    print("Reading test data")
    test = data_io.read_test()

    np.random.seed(1)
    ordinals = np.arange(len(test))
    np.random.shuffle(ordinals)

    recommendations = zip(test["srch_id"], test["prop_id"], ordinals)

    print("Writing predictions to file")
    data_io.write_submission(recommendations, "randomBenchmark.csv")

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------

/PythonBenchmark/reverse_predictions.py:
--------------------------------------------------------------------------------
import csv
import data_io

def main():
    submission_path = data_io.get_paths()["submission_path"]
    reader = csv.reader(open(submission_path))
    reader.next()  # skipping the header
    recommendations = [(int(row[0]), int(row[1]), -i)
                       for i, row in enumerate(reader)]
    out_path = submission_path[:-4] + "Reversed.csv"
    data_io.write_submission(recommendations, submission_file=out_path)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------

/PythonBenchmark/testOrderBenchmark.py:
--------------------------------------------------------------------------------
import data_io
import numpy as np

def main():
    print("Reading test data")
    test = data_io.read_test()

    ordinals = np.arange(len(test))

    recommendations = zip(test["srch_id"], test["prop_id"], ordinals)

    print("Writing predictions to file")
    data_io.write_submission(recommendations, "testOrderBenchmark.csv")

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------

/PythonBenchmark/train.py:
--------------------------------------------------------------------------------
import data_io
from sklearn.ensemble import RandomForestClassifier

def main():
    print("Reading training data")
    train = data_io.read_train()
    train.fillna(0, inplace=True)

    train_sample = train[:100000]

    feature_names = list(train_sample.columns)
    # drop the outcome columns and position, which is only known after the
    # results page has been shown, so the model sees no leakage
    feature_names.remove("click_bool")
    feature_names.remove("booking_bool")
    feature_names.remove("gross_bookings_usd")
    feature_names.remove("date_time")
    feature_names.remove("position")

    features = train_sample[feature_names].values
    target = train_sample["booking_bool"].values

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
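Once train.py has run, the pickled forest can be inspected. A small sketch (not part of the benchmark) that ranks features by the classifier's `feature_importances_`, rebuilding the same feature list train.py used:

    import data_io

    classifier = data_io.load_model()
    train = data_io.read_train()
    dropped = ("click_bool", "booking_bool", "gross_bookings_usd",
               "date_time", "position")
    feature_names = [c for c in train.columns if c not in dropped]

    # pair each feature with its importance and print the strongest first
    ranked = sorted(zip(feature_names, classifier.feature_importances_),
                    key=lambda pair: -pair[1])
    for name, importance in ranked[:10]:
        print("%-30s %.4f" % (name, importance))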
/README.md:
--------------------------------------------------------------------------------
Expedia Personalized Sort Competition
=====================================

This repo contains a benchmark and sample code in Python for the [Expedia Personalized Sort Competition](https://www.kaggle.com/c/expedia-personalized-sort), a machine learning challenge hosted by [Kaggle](https://www.kaggle.com) in conjunction with [Expedia](http://www.expedia.com/).

It also contains, in the CreateCompetitionData directory, the transformation code used to create the competition data files from the raw data. This code is provided for your information only (it does not need to be looked at or run by competition participants).

This version of the repo contains the **Basic Python Benchmark**. Future benchmarks may be included here as well and will be marked with git tags.

This benchmark is intended to provide a simple example of reading the data and creating the submission file, not a state-of-the-art benchmark on this problem.

Executing this benchmark requires Python 2.7, along with the Python packages sklearn version 0.13 and pandas version 0.10.1 (other versions may work, but this has not been tested).

To run the benchmark:

1. [Download data.zip from the competition page](https://www.kaggle.com/c/expedia-personalized-sort/data). This contains the dataset as two csv files, train.csv and test.csv.
2. Switch to the "PythonBenchmark" directory
3. Modify SETTINGS.json to include the paths to the data files, as well as a place to save the trained model and a place to save the submission
4. Train the model by running `python train.py`
5. Make predictions on the test set by running `python predict.py`
6. [Make a submission](https://www.kaggle.com/c/expedia-personalized-sort/team/select) with the output file

This benchmark took less than 5 minutes to execute on a Windows 8 laptop with 8GB of RAM and 4 cores at 2.7GHz.
--------------------------------------------------------------------------------
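Before uploading, it can help to confirm the output is well-formed. A minimal check (a hypothetical helper, not part of the repo) that the submission has the expected header and one integer pair per row:

    import csv

    def check_submission(path):
        reader = csv.reader(open(path))
        assert reader.next() == ["SearchId", "PropertyId"]
        for row in reader:
            assert len(row) == 2
            int(row[0]), int(row[1])  # raises ValueError on non-integer ids
        print("%s looks well-formed" % path)

    check_submission("basicPythonBenchmark.csv")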