├── .DS_Store ├── stack ├── StackNet.jar ├── start.sh ├── parse.py ├── params.txt └── utils.py ├── ppt └── AIC-Sharing-11-19.pptx ├── README.md ├── classifiers.py ├── modelTraining.py └── preprocess.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/HEAD/.DS_Store -------------------------------------------------------------------------------- /stack/StackNet.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/HEAD/stack/StackNet.jar -------------------------------------------------------------------------------- /ppt/AIC-Sharing-11-19.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/HEAD/ppt/AIC-Sharing-11-19.pptx -------------------------------------------------------------------------------- /stack/start.sh: -------------------------------------------------------------------------------- 1 | java -Xmx3048m -jar StackNet.jar train train_file=train_stacknet.csv test_file=test_stacknet.csv params=params.txt pred_file=sigma_stack_pred.csv test_target=true verbose=true Threads=4 stackdata=false folds=5 seed=1 metric=logloss 2 | -------------------------------------------------------------------------------- /stack/parse.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | 4 | with open("result.txt", "r") as f: 5 | raw = "".join(f.readlines()) 6 | 7 | str_res = re.findall(pattern="logloss : 0\.[0-9]+", string=raw) 8 | res = [float(x.split(" : ")[1]) for x in str_res] 9 | results = {i: [] for i in range(len(res) // 5)} 10 | for i in range(len(res)): 11 | results[i % (len(res) // 5)].append(res[i]) 12 | results = {i: np.mean(results[i]) for i in results} 13 | for item in sorted(results.items(), key=lambda x: x[1]): 14 | print(item) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | * ```preprocess.py```: data cleaning, feature engineering 4 | * ```modelTraining.py```: cross validation, submission generating, stacking preparing 5 | * ```classifiers.py```: my encapsulation of xgboost 6 | * stack 7 | * ```StackNet.jar```: stacking tools shared by KazAnova, repo is [here](https://github.com/kaz-Anova/StackNet) 8 | * ```parse.py```: tools for evaluate the cv scores during stacking. 
9 | * ```utils.py```: generating submission after StackNet 10 | * ```start.sh```: commands for executing StackNet 11 | * ```params.txt```: my params for stacking 12 | 13 | ### links: 14 | * [Kaggle:Rental Listing Inquireies](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries) 15 | * [Summary of getting a silver medal in kaggle](http://scarletpan.github.io/summary-of-get-a-silver-medal-in-kaggle/) 16 | * [Kaggle 首战拿银总结 | 入门指导 (长文、干货) -- 知乎专栏](https://zhuanlan.zhihu.com/p/26645088) 17 | * [AI Challenge 分享会PPT](https://github.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/blob/master/ppt/AIC-Sharing-11-19.pptx) 18 | 19 | -------------------------------------------------------------------------------- /stack/params.txt: -------------------------------------------------------------------------------- 1 | LogisticRegression Type:Liblinear C:6.1 threads:1 usescale:True maxim_Iteration:200 seed:1 verbose:false 2 | GradientBoostingForestClassifier estimators:300 shrinkage:0.18 threads:1 offset:0.00001 max_depth:3 max_features:0.65 min_leaf:2.0 min_split:7.0 Objective:RMSE row_subsample:1.0 seed:1 verbose:false 3 | LibFmClassifier maxim_Iteration:70 C:0.0041 C2:0.00120 lfeatures:1 seed:1 usescale:True init_values:0.046 learn_rate:0.05 smooth:0.1 threads:1 verbose:false 4 | softmaxnnclassifier usescale:True seed:1 Type:SGD maxim_Iteration:50 C:0.0000008 shuffle:false tolerance:0.01 learn_rate:0.0065 smooth:0.1 h1:40 h2:35 connection_nonlinearity:Relu init_values:0.020 verbose:false 5 | RandomForestClassifier bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.95 seed:1 verbose:false 6 | AdaboostRandomForestClassifier bootsrap:false weight_thresold:0.95 estimators:100 threads:1 max_depth:6 max_features:0.5 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.9 seed:1 verbose:false 7 | GradientBoostingForestRegressor bootsrap:false estimators:300 shrinkage:0.1 threads:1 offset:0.00001 max_depth:3 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.9 seed:1 verbose:false 8 | RandomForestRegressor bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.95 seed:1 verbose:false 9 | LibFmRegressor maxim_Iteration:70 C:0.0001 C2:0.0009 lfeatures:2 seed:1 usescale:True init_values:0.1 learn_rate:0.1 threads:1 verbose:false 10 | 11 | RandomForestClassifier bootsrap:false estimators:500 threads:3 offset:0.00001 max_depth:5 max_features:0.3 min_leaf:1.0 min_split:5.0 Objective:ENTROPY row_subsample:0.8 seed:1 verbose:false -------------------------------------------------------------------------------- /classifiers.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | import numpy as np 3 | from sklearn.metrics import log_loss 4 | 5 | 6 | class xgboostClassifier(): 7 | def __init__(self, **params): 8 | self.clf = None 9 | self.progress = {} 10 | self.params = params 11 | 12 | def fit(self, X, y): 13 | xg_train = xgb.DMatrix(X, label=y) 14 | self.clf = xgb.train(self.params, xg_train, self.params['num_rounds']) 15 | 16 | def fit_CV(self, X_train, X_val, y_train, y_val): 17 | xg_train = xgb.DMatrix(X_train, label=y_train) 18 | xg_val = xgb.DMatrix(X_val, label=y_val) 19 | watchlist = [(xg_train, 'train'), (xg_val, 'eval')] 20 | self.clf = xgb.train(self.params, xg_train, self.params['num_rounds'], 21 | watchlist, 
early_stopping_rounds=200, evals_result=self.progress) 22 | 23 | def get_eval_res(self): 24 | return self.progress 25 | 26 | def score(self, X, y): 27 | Y = self.predict_proba(X) 28 | return 1 / log_loss(y, Y) 29 | 30 | def predict_proba(self, X_test): 31 | res = self.clf.predict(xgb.DMatrix(X_test)) 32 | return res.astype(np.float32) 33 | 34 | def predict(self, X_test): 35 | res = np.argmax(self.clf.predict(xgb.DMatrix(X_test)), axis=1) 36 | return res 37 | 38 | def get_params(self, **params): 39 | return self.params 40 | 41 | def set_params(self, **params): 42 | self.params.update(params) 43 | 44 | def getSortedImportance(self, features): 45 | with open('xgb.fmap', 'w') as f: 46 | for i in range(len(features)): 47 | f.write('{0}\t{1}\tq\n'.format(i, features[i])) 48 | importance = self.clf.get_fscore(fmap='xgb.fmap') 49 | importance = sorted(importance.items(), key=operator.itemgetter(1)) 50 | #print(importance) 51 | return importance 52 | 53 | class BaseClassifier(object): 54 | def __init__(self, clf, seed=0, params=None): 55 | params['random_state'] = seed 56 | self.clf = clf(**params) 57 | 58 | def train(self, x_train, y_train): 59 | self.clf.fit(x_train, y_train) 60 | 61 | def predict(self, x): 62 | return self.clf.predict(x) 63 | 64 | def predict_proba(self, x): 65 | return self.clf.predict_proba(x) 66 | 67 | def fit(self,x,y): 68 | return self.clf.fit(x,y) 69 | 70 | def set_params(self, **params): 71 | self.params.update(params) 72 | -------------------------------------------------------------------------------- /stack/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def getAvgSub(subs_in): 6 | subs = [] 7 | for sub in subs_in: 8 | sub = sub.sort_values(by=["listing_id"]).reset_index() 9 | subs.append(sub) 10 | n = len(subs) 11 | new_sub = subs[0].copy() 12 | for i in range(1, n): 13 | sub = subs[i] 14 | new_sub["high"] = new_sub["high"] + sub["high"] 15 | new_sub["medium"] = new_sub["medium"] + sub["medium"] 16 | new_sub["low"] = new_sub["low"] + sub["low"] 17 | new_sub["high"] = new_sub["high"] / n 18 | new_sub["medium"] = new_sub["medium"] / n 19 | new_sub["low"] = new_sub["low"] / n 20 | del new_sub["index"] 21 | return new_sub 22 | 23 | def getWeightedAvgSub(subs_in, weights): 24 | assert np.sum(weights) == 1, "Sum of weights need to be 1" 25 | subs = [] 26 | for sub in subs_in: 27 | sub = sub.sort_values(by=["listing_id"]).reset_index() 28 | subs.append(sub) 29 | n = len(subs) 30 | new_sub = subs[0].copy() 31 | new_sub["high"] = new_sub["high"] * weights[0] 32 | new_sub["medium"] = new_sub["medium"] * weights[0] 33 | new_sub["low"] = new_sub["low"] * weights[0] 34 | for i in range(1, n): 35 | sub = subs[i] 36 | new_sub["high"] = new_sub["high"] + sub["high"] * weights[i] 37 | new_sub["medium"] = new_sub["medium"] + sub["medium"] * weights[i] 38 | new_sub["low"] = new_sub["low"] + sub["low"] * weights[i] 39 | del new_sub["index"] 40 | return new_sub 41 | 42 | def generateStackSub(test_file_name, sub_file_name): 43 | test_array = np.loadtxt(test_file_name, delimiter=",") 44 | test = pd.DataFrame(test_array) 45 | sub_array = np.loadtxt(sub_file_name, delimiter=",") 46 | sub = pd.DataFrame(sub_array) 47 | sub.columns = ["high", "medium", "low"] 48 | sub["listing_id"] = test.iloc[:, 0].apply(lambda x: int(x)) 49 | sub.to_csv("new_sub.csv", index=False) 50 | 51 | 52 | 53 | 54 | def correct(df): 55 | interest_levels = ['low', 'medium', 'high'] 56 | 57 | tau = { 58 | 'low': 
0.69195995, 59 | 'medium': 0.23108864, 60 | 'high': 0.07695141, 61 | } 62 | 63 | y = df[interest_levels].mean() 64 | a = [tau[k] / y[k] for k in interest_levels] 65 | print(a) 66 | 67 | def f(p): 68 | for k in range(len(interest_levels)): 69 | p[k] *= a[k] 70 | return p / p.sum() 71 | 72 | df_correct = df.copy() 73 | df_correct[interest_levels] = df_correct[interest_levels].apply(f, axis=1) 74 | 75 | y = df_correct[interest_levels].mean() 76 | a = [tau[k] / y[k] for k in interest_levels] 77 | print(a) 78 | 79 | return df_correct -------------------------------------------------------------------------------- /modelTraining.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import random 4 | from collections import defaultdict 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.model_selection import KFold, StratifiedKFold, train_test_split 8 | from sklearn.metrics import log_loss 9 | from preprocess import coreProcess 10 | from classifiers import xgboostClassifier 11 | 12 | TRAIN_FILE_NAME = '~/Kaggle/RLI/input/train.json' 13 | TEST_FILE_NAME = '~/Kaggle/RLI/input/test.json' 14 | target_num_map = {'high': 0, 'medium': 1, 'low': 2} 15 | train_data = pd.read_json(TRAIN_FILE_NAME).reset_index() 16 | test_data = pd.read_json(TEST_FILE_NAME).reset_index() 17 | list_img_time = pd.read_csv("~/Kaggle/RLI/input/listing_image_time.csv") 18 | train_data = train_data.merge(list_img_time, left_on="listing_id", right_on="Listing_Id", how='inner') 19 | test_data = test_data.merge(list_img_time, left_on="listing_id", right_on="Listing_Id", how='inner') 20 | RS = 2016 21 | random.seed(RS) 22 | np.random.seed(RS) 23 | # RS = 0 24 | 25 | def validation_score(early_stop=False): 26 | clf = xgboostClassifier( 27 | objective = 'multi:softprob', 28 | eval_metric = 'mlogloss', 29 | num_class = 3, 30 | nthread = 3, 31 | eta = 0.04, 32 | max_depth = 6, 33 | subsample = 0.7, 34 | colsample_bytree = 1.0, 35 | colsample_bylevel = 0.7, 36 | min_child_weight=1, 37 | silent = 1, 38 | num_rounds = 700, 39 | seed = RS, 40 | ) 41 | print("*** Validation start ***") 42 | data = train_data.copy() 43 | y = data["interest_level"].apply(lambda x: target_num_map[x]) 44 | del data["interest_level"] 45 | 46 | # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True) 47 | skf = StratifiedKFold(n_splits=3, shuffle=False) 48 | cv_scores = [] 49 | i = 0 50 | for train_idx, val_idx in skf.split(data, y): 51 | i += 1 52 | X = data.copy() 53 | y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] 54 | X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) 55 | clf.fit(X_train, y_train) 56 | # clf.fit_CV(X_train, X_val, y_train, y_val) 57 | y_val_pred = clf.predict_proba(X_val) 58 | loss = log_loss(y_val, y_val_pred) 59 | print("Iteration {}'s loss: {}".format(i, loss)) 60 | cv_scores.append(loss) 61 | if early_stop: 62 | break 63 | print("*** Validation finished ***\n") 64 | return cv_scores 65 | 66 | 67 | def validation_avg_score(clfs): 68 | print("*** Validation start ***") 69 | data = train_data.copy() 70 | y = data["interest_level"].apply(lambda x: target_num_map[x]) 71 | del data["interest_level"] 72 | 73 | # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True) 74 | skf = StratifiedKFold(n_splits=3) 75 | cv_scores = {i:[] for i in range(len(clfs))} 76 | cv_scores["Avg"] = [] 77 | i = 0 78 | for train_idx, val_idx in skf.split(data, y): 79 | i += 1 80 | X = data.copy() 81 | y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] 
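# coreProcess is re-fit inside every fold: the TF-IDF/CountVectorizer models
# and the target-based high-cardinality encodings are fit on the training
# indices of this split only, which keeps those features free of information
# from the validation fold.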
82 | X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) 83 | tmp = [] 84 | preds = [] 85 | j = 0 86 | for clf in clfs: 87 | clf.fit(X_train, y_train) 88 | y_val_pred = clf.predict_proba(X_val) 89 | tmp.append(y_val_pred) 90 | loss = log_loss(y_val, y_val_pred) 91 | cv_scores[j].append(loss) 92 | preds.append(y_val_pred) 93 | j += 1 94 | print("clf_{}, Iteration {}'s loss: {}".format(j, i, loss)) 95 | preds = np.array(preds) 96 | avg_pred = np.mean(preds, axis=0) 97 | loss = log_loss(y_val, avg_pred) 98 | cv_scores["Avg"].append(loss) 99 | print("Iteration {}'s Avg loss: {}".format(i, loss)) 100 | for i in range(len(clfs)): 101 | print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i]))) 102 | print("Average validation loss : {}".format(np.mean(cv_scores["Avg"]))) 103 | print("*** Validation finished ***\n") 104 | return cv_scores["Avg"] 105 | 106 | 107 | def paramSearch(clf, param_dict): 108 | 109 | def outer_join(left, right): 110 | if left == []: 111 | return right 112 | if right == []: 113 | return left 114 | res = [] 115 | for i in left: 116 | for j in right: 117 | if isinstance(i, list): 118 | tmp = i[:] 119 | tmp.append(j) 120 | res.append(tmp) 121 | else: 122 | res.append([i, j]) 123 | return res 124 | # Creating list of param_dict 125 | param_list = sorted(param_dict.items(), key=lambda x: x[0]) 126 | param_keys = [ item[0] for item in param_list ] 127 | param_vals = [ item[1] for item in param_list ] 128 | all_vals = [] 129 | for val in param_vals: 130 | all_vals = outer_join(all_vals, val) 131 | all_param_lists = [] 132 | for vals in all_vals: 133 | all_param_lists.append(dict(zip(param_keys, vals))) 134 | # for item in all_param_lists: 135 | # print(item) 136 | 137 | # Searching 138 | best_score = float('inf') 139 | best_params = None 140 | scores = [] 141 | i = 0 142 | for params in all_param_lists: 143 | print("\n" + "-" * 70) 144 | for param_name in params.keys(): 145 | print("{} : {}".format(param_name, params[param_name])) 146 | clf.set_params(**params) 147 | score = np.mean(validation_score(clf)) 148 | if score < best_score: 149 | best_score = score 150 | best_params = params 151 | i += 1 152 | print("{} / {}, Done".format(i, len(all_param_lists))) 153 | print("Score: ", score) 154 | scores.append(score) 155 | print(scores) 156 | print("Best parameters:") 157 | for param_name in best_params.keys(): 158 | print("{} : {}".format(param_name, best_params[param_name])) 159 | print("Score: ", best_score) 160 | 161 | 162 | def gen_sub(): 163 | train = train_data.copy() 164 | train_idx = [i for i in range(train.shape[0])] 165 | test = test_data.copy() 166 | test_idx = [i + train.shape[0] for i in range(test.shape[0])] 167 | y = train["interest_level"].apply(lambda x: target_num_map[x]) 168 | del train["interest_level"] 169 | data = pd.concat([train, test]).reset_index() 170 | X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) 171 | xgb_clf = xgboostClassifier( 172 | objective = 'multi:softprob', 173 | eval_metric = 'mlogloss', 174 | num_class = 3, 175 | nthread = 12, 176 | eta = 0.02, 177 | max_depth = 6, 178 | subsample = 0.8, 179 | colsample_bytree = 1.0, 180 | colsample_bylevel = 0.8, 181 | min_child_weight=1, 182 | silent = 1, 183 | num_rounds = 1700, 184 | seed = RS, 185 | ) 186 | print("Trainning:...") 187 | xgb_clf.fit(X_train, y) 188 | 189 | preds = xgb_clf.predict_proba(X_test) 190 | sub = pd.DataFrame(preds) 191 | # sub.columns = ["high", "medium", "low"] 192 | sub.columns = [ "high", "medium", "low"] 193 | sub["listing_id"] = 
test.listing_id.values 194 | sub.to_csv("submission.csv", index=False) 195 | 196 | 197 | def genAvgSub(clfs): 198 | train = train_data.copy() 199 | train_idx = [i for i in range(train.shape[0])] 200 | test = test_data.copy() 201 | test_idx = [i + train.shape[0] for i in range(test.shape[0])] 202 | y = train["interest_level"].apply(lambda x: target_num_map[x]) 203 | del train["interest_level"] 204 | data = pd.concat([train, test]).reset_index() 205 | X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) 206 | print("Trainning:...") 207 | preds = [] 208 | for i in range(len(clfs)): 209 | print("Clf_{} fiting".format(i)) 210 | clfs[i].fit(X_train, y) 211 | print("Clf_{} predicting".format(i)) 212 | pred = clfs[i].predict_proba(X_test) 213 | preds.append(pred) 214 | sub = pd.DataFrame(np.mean(preds, axis=0)) 215 | # sub.columns = ["high", "medium", "low"] 216 | sub.columns = [ "high", "medium", "low"] 217 | sub["listing_id"] = test.listing_id.values 218 | sub.to_csv("submission.csv", index=False) 219 | print("Train done.") 220 | 221 | 222 | def validate(clfs): 223 | cv_scores = validation_avg_score(clfs) 224 | return cv_scores 225 | 226 | 227 | def search(): 228 | param_dict = { 229 | 'eta' : [0.02], 230 | 'max_depth' : [6], 231 | 'subsample' : [0.8], 232 | 'colsample_bylevel' : [0.7], 233 | 'num_rounds' : [1400, 1500, 1600, 1650], 234 | } 235 | clf = xgboostClassifier( 236 | objective = 'multi:softprob', 237 | eval_metric = 'mlogloss', 238 | num_class = 3, 239 | nthread = 12, 240 | eta = 0.04, 241 | max_depth = 6, 242 | subsample = 0.7, 243 | colsample_bytree = 1.0, 244 | colsample_bylevel = 1.0, 245 | min_child_weight=1, 246 | silent = 1, 247 | num_rounds = 700, 248 | seed = RS, 249 | ) 250 | paramSearch(clf, param_dict) 251 | 252 | 253 | def write2file(cv_scores, val_desc=None): 254 | print("*" * 50) 255 | print("Cross validation loss: ", np.mean(cv_scores)) 256 | with open("results.log", "a") as fp: 257 | fp.write(time.strftime("%m/%d/%Y %H:%M") + '\n') 258 | if(val_desc is not None): 259 | fp.write(val_desc + '\n') 260 | for score in cv_scores: 261 | fp.write(str(score) + " ") 262 | fp.write("\nCross Validation: {}\n".format(np.array(cv_scores).mean())) 263 | fp.write("*" * 50 + "\n") 264 | 265 | 266 | def stacking(clfs): 267 | print("Stacking") 268 | train = train_data.copy() 269 | test = test_data.copy() 270 | y = train["interest_level"].apply(lambda x: target_num_map[x]) 271 | del train["interest_level"] 272 | train_stackers = [] 273 | for RS in [0, 1, 2, 64, 128, 256, 512, 1024, 2048, 4096]: 274 | skf = StratifiedKFold(n_splits=10, random_state=RS, shuffle=True) 275 | #Create Arrays for meta 276 | train_stacker = [[0.0 for s in range(3)] for k in range (0,(train.shape[0]))] 277 | cv_scores = {i:[] for i in range(len(clfs))} 278 | cv_scores["Avg"] = [] 279 | print("Begin 10-flod cross validation") 280 | cnt = 0 281 | for train_idx, val_idx in skf.split(train, y): 282 | cnt += 1 283 | X = train.copy() 284 | y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] 285 | X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) 286 | X_train.toarray() 287 | preds = [] 288 | k = 0 289 | for clf in clfs: 290 | clf.fit(X_train, y_train) 291 | y_val_pred = clf.predict_proba(X_val) 292 | loss = log_loss(y_val, y_val_pred) 293 | preds.append(y_val_pred) 294 | cv_scores[k].append(loss) 295 | k += 1 296 | print("Clf_{} iteration {}'s loss: {}".format(k, cnt, loss)) 297 | preds = np.array(preds) 298 | avg_pred = np.mean(preds, axis=0) 299 | avg_loss = log_loss(y_val, avg_pred) 
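# The fold-averaged class probabilities computed above do double duty: they are
# scored against the validation fold here, and a few lines below they are
# copied into train_stacker as the out-of-fold meta-features that StackNet
# later reads from train_stacknet.csv.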
300 | cv_scores["Avg"].append(avg_loss) 301 | print("Iteration {}'s Avg loss: {}".format(cnt, avg_loss)) 302 | no = 0 303 | for real_idx in val_idx: 304 | for i in range(3): 305 | train_stacker[real_idx][i] = avg_pred[no][i] 306 | no += 1 307 | for i in range(len(clfs)): 308 | print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i]))) 309 | print("Average validation loss : {}".format(np.mean(cv_scores["Avg"]))) 310 | train_stackers.append(train_stacker) 311 | train_stacker = np.mean(train_stackers, axis=0) 312 | print("*** Validation finished ***\n") 313 | 314 | test_stacker = [[0.0 for s in range(3)] for k in range (0,(test.shape[0]))] 315 | train_idx = [i for i in range(train.shape[0])] 316 | test_idx = [i + train.shape[0] for i in range(test.shape[0])] 317 | data = pd.concat([train, test]).reset_index() 318 | X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) 319 | print(X_train.shape, len(train_stacker)) 320 | print("Begin predicting") 321 | preds = [] 322 | for i in range(len(clfs)): 323 | print("Clf_{} fiting".format(i)) 324 | clfs[i].fit(X_train, y) 325 | print("Clf_{} predicting".format(i)) 326 | pred = clfs[i].predict_proba(X_test) 327 | preds.append(pred) 328 | preds = np.mean(preds, axis=0) 329 | for pr in range (0, len(preds)): 330 | for d in range (0,3): 331 | test_stacker[pr][d]=(preds[pr][d]) 332 | print ("merging columns") 333 | #stack xgboost predictions 334 | X_train = np.column_stack((X_train.toarray(),train_stacker)) 335 | # stack id to test 336 | X_test = np.column_stack((X_test.toarray(),test_stacker)) 337 | # stack target to train 338 | X = np.column_stack((y,X_train)) 339 | ids = test.listing_id.values 340 | X_test = np.column_stack((ids, X_test)) 341 | np.savetxt("./train_stacknet.csv", X, delimiter=",", fmt='%.5f') 342 | np.savetxt("./test_stacknet.csv", X_test, delimiter=",", fmt='%.5f') 343 | print("Write results...") 344 | output_file = "submission_{}.csv".format(np.mean(cv_scores["Avg"])) 345 | print("Writing submission to %s" % output_file) 346 | f = open(output_file, "w") 347 | f.write("listing_id,high,medium,low\n")# the header 348 | for g in range(0, len(test_stacker)) : 349 | f.write("%s" % (ids[g])) 350 | for prediction in test_stacker[g]: 351 | f.write(",%f" % (prediction)) 352 | f.write("\n") 353 | f.close() 354 | print("Done.") 355 | 356 | 357 | if __name__ == "__main__": 358 | clfs = [] 359 | # clfs.append(xgboostClassifier( 360 | # objective = 'multi:softprob', 361 | # eval_metric = 'mlogloss', 362 | # num_class = 3, 363 | # nthread = 6, 364 | # eta = 0.04, 365 | # max_depth = 6, 366 | # subsample = 0.7, 367 | # colsample_bytree = 1.0, 368 | # colsample_bylevel = 0.7, 369 | # min_child_weight=1, 370 | # silent = 1, 371 | # num_rounds = 700, 372 | # seed = 0, 373 | # )) 374 | # clfs.append(xgboostClassifier( 375 | # objective = 'multi:softprob', 376 | # eval_metric = 'mlogloss', 377 | # num_class = 3, 378 | # nthread = 6, 379 | # eta = 0.02, 380 | # max_depth = 6, 381 | # subsample = 0.8, 382 | # colsample_bytree = 1.0, 383 | # colsample_bylevel = 0.8, 384 | # min_child_weight=1, 385 | # silent = 1, 386 | # num_rounds = 1700, 387 | # seed = 0, 388 | # )) 389 | clfs.append(xgboostClassifier( 390 | objective = 'multi:softprob', 391 | eval_metric = 'mlogloss', 392 | num_class = 3, 393 | nthread = 9, 394 | eta = 0.02, 395 | max_depth = 6, 396 | subsample = 0.8, 397 | colsample_bytree = 1.0, 398 | colsample_bylevel = 0.7, 399 | min_child_weight=1, 400 | silent = 1, 401 | num_rounds = 1500, 402 | seed = 0, 403 | )) 404 | 
clfs.append(xgboostClassifier( 405 | objective = 'multi:softprob', 406 | eval_metric = 'mlogloss', 407 | num_class = 3, 408 | nthread = 9, 409 | eta = 0.02, 410 | max_depth = 6, 411 | subsample = 0.8, 412 | colsample_bytree = 1.0, 413 | colsample_bylevel = 0.8, 414 | min_child_weight=1, 415 | silent = 1, 416 | num_rounds = 1500, 417 | seed = 128, 418 | )) 419 | clfs.append(xgboostClassifier( 420 | objective = 'multi:softprob', 421 | eval_metric = 'mlogloss', 422 | num_class = 3, 423 | nthread = 9, 424 | eta = 0.02, 425 | max_depth = 6, 426 | subsample = 0.8, 427 | colsample_bytree = 1.0, 428 | colsample_bylevel = 0.8, 429 | min_child_weight=1, 430 | silent = 1, 431 | num_rounds = 1500, 432 | seed = 512, 433 | )) 434 | clfs.append(xgboostClassifier( 435 | objective = 'multi:softprob', 436 | eval_metric = 'mlogloss', 437 | num_class = 3, 438 | nthread = 9, 439 | eta = 0.02, 440 | max_depth = 6, 441 | subsample = 0.8, 442 | colsample_bytree = 1.0, 443 | colsample_bylevel = 0.8, 444 | min_child_weight=1, 445 | silent = 1, 446 | num_rounds = 1500, 447 | seed = 1024, 448 | )) 449 | clfs.append(xgboostClassifier( 450 | objective = 'multi:softprob', 451 | eval_metric = 'mlogloss', 452 | num_class = 3, 453 | nthread = 9, 454 | eta = 0.02, 455 | max_depth = 6, 456 | subsample = 0.8, 457 | colsample_bytree = 1.0, 458 | colsample_bylevel = 0.8, 459 | min_child_weight=1, 460 | silent = 1, 461 | num_rounds = 1500, 462 | seed = 2048, 463 | )) 464 | if len(sys.argv) == 1: 465 | cv_scores = validate(clfs) 466 | write2file(cv_scores) 467 | elif len(sys.argv) == 2: 468 | if sys.argv[1] == '-v': 469 | cv_scores = validate(clfs) 470 | write2file(cv_scores) 471 | elif sys.argv[1] == '-g': 472 | gen_sub() 473 | elif sys.argv[1] == '-s': 474 | search() 475 | elif sys.argv[1] == '-ga': 476 | genAvgSub(clfs) 477 | elif sys.argv[1] == '-stack': 478 | stacking(clfs) 479 | elif sys.argv[1] == '-v3': 480 | cv_scores = validate(clfs) 481 | val_desc = sys.argv[2] 482 | write2file(cv_scores, val_desc) 483 | elif len(sys.argv) == 3: 484 | if sys.argv[1] == '-v': 485 | cv_scores = validate(clfs) 486 | val_desc = sys.argv[2] 487 | write2file(cv_scores, val_desc) 488 | elif sys.argv[1] == '-g': 489 | gen_sub() 490 | elif sys.argv[1] == '-v3': 491 | cv_scores = validation_score() 492 | val_desc = sys.argv[2] 493 | write2file(cv_scores, val_desc) 494 | 495 | 496 | 497 | 498 | 499 | 500 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*- encoding: utf-8 -*- 3 | import sys 4 | import random 5 | import operator 6 | import datetime 7 | import time 8 | from collections import defaultdict, Counter 9 | import pandas as pd 10 | import numpy as np 11 | from scipy import sparse 12 | import xgboost as xgb 13 | from sklearn import preprocessing 14 | from sklearn.model_selection import train_test_split, GridSearchCV, KFold 15 | from sklearn.metrics import log_loss 16 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 17 | from sklearn.cluster import KMeans 18 | from sklearn.cross_validation import StratifiedKFold 19 | from sklearn.preprocessing import StandardScaler 20 | from nltk.metrics import distance as distance 21 | 22 | 23 | FEATURE_NOT_USE = ['created','description','features','photos', 'index']# ,'bathrooms', 'bedrooms''listing_id', 24 | FEATURE_NOT_USE.append('display_address') 25 | FEATURE_NOT_USE.extend(['low_build_frac', 'high_build_frac', 
'medium_build_frac', 'build_count'])# 26 | FEATURE_NOT_USE.extend(['low_manager_frac', 'high_manager_frac', 'medium_manager_frac','manager_count'])# 27 | FEATURE_NOT_USE.extend(['Listing_Id', 'img_created']) # , 'time_stamp' 28 | 29 | def bedroomProcess(data, train_idx, test_idx): 30 | # Some basic feature from bedrooms 31 | data["no_bedroom"] = data["bedrooms"].apply(lambda x: 1 if x == 0 else 0) 32 | data["more_than_5_bedroom"] = data["bedrooms"].apply(lambda x: 1 if x >= 5 else 0) 33 | data.loc[data["bedrooms"] + data["bathrooms"] == 0, "bedrooms"] = 0.001 34 | train = data.iloc[train_idx, :].copy() 35 | test = data.iloc[test_idx, :].copy() 36 | # remove null value (ugly code) 37 | train.loc[data["bedrooms"] == 0.001, "bathrooms"] = train["bathrooms"].mean() 38 | test.loc[data["bedrooms"] == 0.001, "bathrooms"] = test["bathrooms"].mean() 39 | data.iloc[train_idx, :] = train 40 | data.iloc[test_idx, :] = test 41 | data["bedroom_per_room"] = data["bedrooms"] / (data["bedrooms"] + data["bathrooms"]) 42 | data.loc[data["bedrooms"] == 0.001, "bathrooms"] = 0 43 | data.loc[data["bedrooms"] == 0.001, "bedrooms"] = 0 44 | return data 45 | 46 | 47 | def bathroomProcess(data, train_idx, test_idx): 48 | # Some basic feature from bathrooms 49 | data.loc[data["bathrooms"] == 112, "bathrooms"] = 1.5 50 | data.loc[data["bathrooms"] == 10, "bathrooms"] = 1 51 | data.loc[data["bathrooms"] == 20, "bathrooms"] = 2 52 | data["1_to_2_bathrooms"] = data["bathrooms"].apply(lambda x : 1if x != 0 and x <= 2 else 0) 53 | data.loc[data["bedrooms"] + data["bathrooms"] == 0, "bathrooms"] = 0.001 54 | train = data.iloc[train_idx, :].copy() 55 | test = data.iloc[test_idx, :].copy() 56 | # remove null value (ugly code) 57 | train.loc[data["bathrooms"] == 0.001, "bedrooms"] = train["bedrooms"].mean() 58 | test.loc[data["bathrooms"] == 0.001, "bedrooms"] = test["bedrooms"].mean() 59 | data.iloc[train_idx, :] = train 60 | data.iloc[test_idx, :] = test 61 | data["bathoom_per_room"] = data["bathrooms"] / (data["bedrooms"] + data["bathrooms"]) 62 | data.loc[data["bathrooms"] == 0.001, "bedrooms"] = 0 63 | data.loc[data["bathrooms"] == 0.001, "bathrooms"] = 0 64 | return data 65 | 66 | 67 | def buildingIdProcess(data, y, train_idx, test_idx): 68 | # Have tried some ideas but failed 69 | return data 70 | 71 | 72 | def createdProcess(data): 73 | # Some basic features from created 74 | data["created"] = pd.to_datetime(data['created']) 75 | data["latest"] = (data["created"]- data["created"].min()) 76 | data["latest"] = data["latest"].apply(lambda x: x.total_seconds()) 77 | data["passed"] = (data["created"].max()- data["created"]) 78 | data["passed"] = data["passed"].apply(lambda x: x.total_seconds()) 79 | # year is weird 80 | data["year"] = data["created"].dt.year 81 | data['month'] = data['created'].dt.month 82 | data['day'] = data['created'].dt.day 83 | data['hour'] = data['created'].dt.hour 84 | data['weekday'] = data['created'].dt.weekday 85 | data['week'] = data['created'].dt.week 86 | data['quarter'] = data['created'].dt.quarter 87 | data['weekend'] = ((data['weekday'] == 5) & (data['weekday'] == 6)) 88 | data['weekend'] = data['weekend'].apply(int) 89 | # data["created_stamp"] = data["created"].apply(lambda x: time.mktime(x.timetuple())) 90 | #* 91 | data["latest_list_rank"] = data["latest"] / data["listing_id"] 92 | # data["diff_rank_2"] = data["passed"] / data["listing_id"] 93 | #* 94 | 95 | # image time after leak 96 | data.loc[data["time_stamp"] > 1490000000, "time_stamp"] = 1478524550 97 | data["img_created"] = 
data["time_stamp"].apply(lambda x: datetime.datetime.fromtimestamp(x)) 98 | data["img_latest"] = (data["img_created"]- data["img_created"].min()) 99 | data["img_latest"] = data["img_latest"].apply(lambda x: x.total_seconds()) 100 | data["img_passed"] = (data["img_created"].max()- data["img_created"]) 101 | data["img_passed"] = data["img_passed"].apply(lambda x: x.total_seconds()) 102 | data["img_year"] = data["img_created"].dt.year 103 | data['img_month'] = data['img_created'].dt.month 104 | data['img_day'] = data['img_created'].dt.day 105 | data['img_hour'] = data['img_created'].dt.hour 106 | # data['img_weekday'] = data['img_created'].dt.weekday 107 | # data['img_week'] = data['img_created'].dt.week 108 | # data['img_quarter'] = data['img_created'].dt.quarter 109 | # data['img_weekend'] = ((data['img_weekday'] == 5) & (data['img_weekday'] == 6)) 110 | # data['img_weekend'] = data['img_weekend'].apply(int) 111 | data["img_latest_list_rank"] = data["img_latest"] / data["listing_id"] 112 | 113 | return data 114 | 115 | 116 | def descriptionProcess(data, train_idx, test_idx): 117 | data["description_words_num"] = data["description"].apply(lambda x: len(x.split(' '))) 118 | data["description_len"] = data["description"].apply(len) 119 | # Some info from descriptions 120 | desc_feats = { 121 | 'bedroom_mentions': ['br ', '---', "", "

"], 122 | 'html_tag_1':["", "

  • ", "
  • ", "", "-->", "2 else "nulldesc") 128 | # Tf-idf Encode 129 | tfidfdesc=TfidfVectorizer(min_df=20, max_features=50, strip_accents='unicode',lowercase =True, 130 | analyzer='word', token_pattern=r'\w{16,}', ngram_range=(1, 2), use_idf=False,smooth_idf=False, 131 | sublinear_tf=True, stop_words = 'english') 132 | tr_sparsed = tfidfdesc.fit_transform (data.iloc[train_idx, :]["description"]) 133 | te_sparsed = tfidfdesc.transform(data.iloc[test_idx, :]["description"]) 134 | feats_names = ["desc_" + x for x in tfidfdesc.get_feature_names()] 135 | return data, tr_sparsed, te_sparsed, feats_names 136 | 137 | 138 | def displayAddrProcess(data): 139 | # disp_price_dict = dict(data.groupby('display_address')['price'].mean()) 140 | # data["mean_disp_price"] = data.apply(lambda row: disp_price_dict[row["display_address"]], axis=1) 141 | # data["addr_sim"] = data.apply(lambda row: distance.edit_distance(row["display_address"].lower(), row["street_address"].lower()), axis=1) 142 | return data 143 | 144 | 145 | def featuresProcess(data, train_idx, test_idx): 146 | def afterRemoveStr(l, s): 147 | while s in l: 148 | l.remove(s) 149 | return l 150 | 151 | def afterRemoveFirstSpace(l): 152 | res = [] 153 | for s in l: 154 | res.append(s.strip()) 155 | return res 156 | 157 | data["features_num"] = data["features"].apply(len) 158 | mark = "#+-+#" 159 | data["features"] = data["features"].apply(lambda x: mark.join([i for i in x])) 160 | data["features"] = data["features"].apply(lambda x: x.lower()) 161 | 162 | # Deal with list like data 163 | data["features"] = data["features"].apply(lambda x: mark.join([i for i in x.split(" * ")])) 164 | data["features"] = data["features"].apply(lambda x: mark.join([i for i in x.split("**")])) 165 | data['features']=data['features'].str.replace("✓ hardwood floor ✓ high ceilings ✓ dishwasher", 166 | "hardwood floor" + mark + "high ceilings" + mark + "dishwasher") 167 | data['features']=data['features'].str.replace( 168 | "• on-site lifestyle concierge by luxury attaché " + 169 | "•24/7 doorman " + 170 | "• state of the art cardiovascular and weight training equipment " + 171 | "• 24-hour valet parking garage " + 172 | "• valet services including dry cleaning", 173 | "on-site lifestyle concierge by luxury attaché" + mark + 174 | "24/7 doorman" + mark + 175 | "state of the art cardiovascular and weight training equipment" + mark + 176 | "24-hour valet parking garage" + mark + 177 | "valet services including dry cleaning") 178 | data['features']=data['features'].str.replace( 179 | '{ 0 = "laundry in unit"; ' + 180 | '1 = "cats allowed"; '+ 181 | '10 = hardwood; '+ 182 | '11 = "high ceilings"; '+ 183 | '12 = renovated; '+ 184 | '13 = "marble bath"; '+ 185 | '14 = "granite kitchen"; '+ 186 | '15 = light; '+ 187 | '16 = "no fee"; '+ 188 | '17 = "walk-in closet"; '+ 189 | '2 = "dogs allowed"; '+ 190 | '3 = elevator; '+ 191 | '4 = exclusive; '+ 192 | '6 = laundry; '+ 193 | '7 = subway; '+ 194 | '8 = dishwasher; '+ 195 | '9 = washer; }', 196 | "laundry in unit" + mark + "cats allowed" + mark + "hardwood" + 197 | "high ceilings" + mark + "renovated" + mark + "marble bath" + 198 | "granite kitchen" + mark + "light" + mark + "no fee" + 199 | "walk-in closet" + mark + "dogs allowed" + mark + "elevator" + 200 | "exclusive" + mark + "laundry" + mark + "subway"+ 201 | "dishwasher" + mark + "washer") 202 | data['features']=data['features'].str.replace("windowed air-conditioned and monitored laundry room", 203 | "windowed air-conditioned" + mark + "monitored laundry room") 204 | 
data['features']=data['features'].str.replace("wall of windows. huge bedrooms", 205 | "wall of windows" + mark + "huge bedrooms") 206 | data['features']=data['features'].str.replace("to relax and recharge. this spacious 3 bedroom/2 bath residence also features oak hardwood flooring", 207 | "spacious" + mark + "3 bedroom" + mark + "2 bath" + mark + "residence" + mark + "oak hardwood flooring") 208 | data['features']=data['features'].str.replace("stunning 3 bedroom apartment with a terrace! east harlem! the best deal out now! get it now!!!!", 209 | "stunning" + mark + "3 bedroom" + mark + "a terrace" + mark + "east harlem" + mark + "the best deal out now! get it now!!!!") 210 | data['features']=data['features'].str.replace("ss appliances - d/w - m/w - recessed lighting - hardwood floors - high ceilings - marble bath", 211 | "ss appliances - d/w - m/w - " + mark + "recessed lighting" + mark + "hardwood floors" + mark + "high ceilings" + mark + "marble bath") 212 | data['features']=data['features'].str.replace("spacious living room for any kind of entertainment. prime location in theater distric", 213 | "spacious living room for any kind of entertainment." + mark + "prime location in theater distric") 214 | data['features']=data['features'].str.replace("spacious living room + home office", 215 | "spacious living room" + mark + "home office") 216 | data['features']=data['features'].str.replace("spacious and sunny 1st floor apartment "+ 217 | "overlooking the garden " + 218 | "*great williamsburg location* "+ 219 | "steps from shopping and cafes "+ 220 | "and 5 minute walk to graham avenue l train (3rd stop from manhattan) "+ 221 | "*shared back yard * "+ 222 | "large box style rooms * "+ 223 | "huge living room with high ceilings * "+ 224 | "nice bathroom with granite floor & ceramic tile * "+ 225 | "beautiful kitchen with granite counter tops lots of closet spacehardwood floors *"+ 226 | " heat included in the rent "+ 227 | "clean quiet building "+ 228 | "cat ok "+ 229 | "great location close to shopping", 230 | "spacious"+ mark +"sunny 1st floor"+ mark+ 231 | "overlooking the garden" + mark+ 232 | "great williamsburg location"+ mark+ 233 | "steps from shopping and cafes"+ mark+ 234 | "5 minute walk to graham avenue"+ mark +"train (3rd stop from manhattan)"+ mark+ 235 | "shared back yard"+mark+ 236 | "large box style rooms"+mark+ 237 | "huge living room " + mark + "high ceilings"+ mark+ 238 | "nice bathroom" + mark +"granite floor" + mark +"ceramic tile * "+mark+ 239 | "beautiful kitchen" + mark +"granite counter tops" + mark +"closet " + mark +"spacehardwood floors"+mark+ 240 | "heat included in the rent"+mark+ 241 | "clean quiet building"+mark+ 242 | "cat ok"+mark+ 243 | "close to shopping") 244 | data['features']=data['features'].str.replace("residents-only " + 245 | "fitness center " + 246 | "and aerobic room " + 247 | "professionally outfitted with a full complement of strength and cardio-training equipment", 248 | "residents-only"+ mark +"itness center"+ mark+ 249 | "and aerobic room" + mark+ 250 | "cardio-training equipment") 251 | data['features']=data['features'].str.replace("owner occupied - " + 252 | "3 family townhouse - " + 253 | "no realtor fees -"+ 254 | " this beautiful apt is offered below market rate", 255 | "owner occupied"+ mark +"3 family townhouse"+ mark+ 256 | "no realtor fees" + mark+ 257 | "this beautiful apt is offered below market rate") 258 | data['features']=data['features'].str.replace("newly renovated "+ 259 | "w/ oak wood floors "+ 260 | "mid century modern 
style interior "+ 261 | "large closets in every bedroom "+ 262 | "extra storage space in hall. "+ 263 | "large living room", 264 | "newly renovated"+ mark +"oak wood floors"+ mark+ 265 | "mid century modern style interior" + mark+ 266 | "large closets in every bedroom" + mark+ 267 | "extra storage space in hall"+ mark +"large living room") 268 | data['features']=data['features'].str.replace("live-in super package room "+ 269 | "smoke-free "+ 270 | "storage available "+ 271 | "virtual doorman "+ 272 | "guarantors accepted", 273 | 274 | "live-in super package room"+ mark +"smoke-free"+ mark+ 275 | "storage available" + mark+ 276 | "virtual doorman" + mark+ 277 | "guarantors accepted") 278 | data['features']=data['features'].str.replace("live-in super package room "+ 279 | "smoke-free "+ 280 | "storage available "+ 281 | "virtual doorman "+ 282 | "guarantors accepted", 283 | 284 | "live-in super package room"+ mark +"smoke-free"+ mark+ 285 | "storage available" + mark+ 286 | "virtual doorman" + mark+ 287 | "guarantors accepted") 288 | 289 | # Merging some features 290 | data['features']=data['features'].str.replace("washer/dyer combo","washer/dyer") 291 | data['features']=data['features'].str.replace("washer/dryer inside the unit","washer/dyer") 292 | data['features']=data['features'].str.replace("washer/dryer in-unit","washer/dyer") 293 | data['features']=data['features'].str.replace("washer/dryer in unit","washer/dyer") 294 | data['features']=data['features'].str.replace("washer/dryer in building","washer/dyer") 295 | data['features']=data['features'].str.replace("washer/dryer in bldg","washer/dyer") 296 | data['features']=data['features'].str.replace("washer/dryer hookup","washer/dyer") 297 | data['features']=data['features'].str.replace("washer/dryer stove/oven","washer/dyer") 298 | data['features']=data['features'].str.replace("washer/drier hookups","washer/dyer") 299 | data['features']=data['features'].str.replace("washer/ dryer in unit","washer/dyer") 300 | data['features']=data['features'].str.replace("washer/ dryer hookups","washer/dyer") 301 | data['features']=data['features'].str.replace("washer-dryer in unit","washer/dyer") 302 | data['features']=data['features'].str.replace("washer-dryer hookups","washer/dyer") 303 | data['features']=data['features'].str.replace("washer in unit","washer/dyer") 304 | data['features']=data['features'].str.replace("washer dryer in unit","washer/dyer") 305 | data['features']=data['features'].str.replace("washer dryer hookup","washer/dyer") 306 | data['features']=data['features'].str.replace("washer dryer hook up","washer/dyer") 307 | data['features']=data['features'].str.replace("washer and dryer in unit","washer/dyer") 308 | data['features']=data['features'].str.replace("washer and dryer in the unit","washer/dyer") 309 | data['features']=data['features'].str.replace("washer and dryer","washer/dyer") 310 | data['features']=data['features'].str.replace("washer / dryer in unit","washer/dyer") 311 | data['features']=data['features'].str.replace("washer / dryer (hookup only)","washer/dyer") 312 | data['features']=data['features'].str.replace("washer / dryer","washer/dyer") 313 | data['features']=data['features'].str.replace("washer & dryer.","washer/dyer") 314 | data['features']=data['features'].str.replace("washer","washer/dyer") 315 | data['features']=data['features'].str.replace("wash/dryer","washer/dyer") 316 | 317 | 318 | data['features']=data['features'].str.replace("pets: cats/small dogs","pet-friendly") 319 | 
data['features']=data['features'].str.replace("pets welcome","pet-friendly") 320 | data['features']=data['features'].str.replace("pets upon approval","pet-friendly") 321 | data['features']=data['features'].str.replace("pets on approval","pet-friendly") 322 | data['features']=data['features'].str.replace("pets ok.","pet-friendly") 323 | data['features']=data['features'].str.replace("pets ok","pet-friendly") 324 | data['features']=data['features'].str.replace("pets are welcome","pet-friendly") 325 | data['features']=data['features'].str.replace("pets allowed","pet-friendly") 326 | data['features']=data['features'].str.replace("pets accepted (on approval)","pet-friendly") 327 | data['features']=data['features'].str.replace("pets","pet-friendly") 328 | data['features']=data['features'].str.replace("pet grooming room","pet-friendly") 329 | data['features']=data['features'].str.replace("pet friendly building","pet-friendly") 330 | data['features']=data['features'].str.replace("pet friendly ( case by case )","pet-friendly") 331 | data['features']=data['features'].str.replace("pet friendly","pet-friendly") 332 | data['features']=data['features'].str.replace("pet friendly building","pet-friendly") 333 | data['features']=data['features'].str.replace("pet friendly building","pet-friendly") 334 | 335 | data['features']=data['features'].str.replace("garden/patio","garden") 336 | data['features']=data['features'].str.replace("patio","garden") 337 | data['features']=data['features'].str.replace("residents_garden","garden") 338 | data['features']=data['features'].str.replace("common garden","garden") 339 | 340 | data['features']=data['features'].str.replace("wifi access","wifi") 341 | data['features']=data['features'].str.replace("wifi included","wifi") 342 | data['features']=data['features'].str.replace("wifi in resident lounge","wifi") 343 | data['features']=data['features'].str.replace("wifi + utilities","wifi") 344 | data['features']=data['features'].str.replace("wi fi work lounge","wifi") 345 | data['features']=data['features'].str.replace("wi-fi access","wifi") 346 | 347 | data['features']=data['features'].str.replace("24/7","24") 348 | data['features']=data['features'].str.replace("24-hour","24") 349 | data['features']=data['features'].str.replace("24hr","24") 350 | data['features']=data['features'].str.replace("concierge","doorman") 351 | data['features']=data['features'].str.replace("ft doorman","doorman") 352 | data['features']=data['features'].str.replace("24 doorman","doorman") 353 | data['features']=data['features'].str.replace("24 hr doorman","doorman") 354 | data['features']=data['features'].str.replace("doorman service","doorman") 355 | data['features']=data['features'].str.replace("full-time doorman","doorman") 356 | 357 | data['features']=data['features'].str.replace("gym/fitness","fitness") 358 | data['features']=data['features'].str.replace("fitness room","fitness") 359 | 360 | data['features']=data['features'].str.replace("washer","laundry") 361 | data['features']=data['features'].str.replace("laundry in bldg","laundry") 362 | data['features']=data['features'].str.replace("laundry in building","laundry") 363 | data['features']=data['features'].str.replace("laundry in building/dryer","laundry") 364 | data['features']=data['features'].str.replace("laundry in building_&_dryer","laundry") 365 | data['features']=data['features'].str.replace("laundry room","laundry") 366 | data['features']=data['features'].str.replace("laundry & housekeeping","laundry") 367 | 
data['features']=data['features'].str.replace("laundry in unit","laundry") 368 | data['features']=data['features'].str.replace("laundry in-unit","laundry") 369 | data['features']=data['features'].str.replace("laundry on every floor","laundry") 370 | data['features']=data['features'].str.replace("laundry on floor","laundry") 371 | data['features']=data['features'].str.replace("in-unit laundry/dryer","laundry") 372 | data['features']=data['features'].str.replace("on-site laundry","laundry") 373 | data['features']=data['features'].str.replace("laundry/dryer","laundry") 374 | 375 | data['features']=data['features'].str.replace("high-speed internet","high_speed_internet") 376 | data['features']=data['features'].str.replace("high speed internet available","high_speed_internet") 377 | 378 | data['features']=data['features'].str.replace("parking available","parking") 379 | data['features']=data['features'].str.replace("parking space","parking") 380 | data['features']=data['features'].str.replace("on-site garage","parking") 381 | data['features']=data['features'].str.replace("on-site parking","parking") 382 | data['features']=data['features'].str.replace("on-site parking lot","parking") 383 | data['features']=data['features'].str.replace("full service garage","parking") 384 | data['features']=data['features'].str.replace("common parking/garage","parking") 385 | data['features']=data['features'].str.replace("garage","parking") 386 | data['features']=data['features'].str.replace("assigned-parking-space","private_parking") 387 | 388 | data['features']=data['features'].str.replace("storage available","storage") 389 | data['features']=data['features'].str.replace("storage facilities available","storage") 390 | data['features']=data['features'].str.replace("storage space","storage") 391 | data['features']=data['features'].str.replace("storage room","storage") 392 | data['features']=data['features'].str.replace("common storage","storage") 393 | 394 | data['features']=data['features'].str.replace("central a/c","central_air") 395 | data['features']=data['features'].str.replace("central ac","central_air") 396 | data['features']=data['features'].str.replace("air conditioning","central_air") 397 | 398 | data['features']=data['features'].str.replace("close to subway","subway") 399 | 400 | data['features']=data['features'].str.replace("roofdeck","roof-deck") 401 | data['features']=data['features'].str.replace("roof-deck","roof-deck") 402 | data['features']=data['features'].str.replace("rooftop terrace","roof-deck") 403 | data['features']=data['features'].str.replace("rooftop deck","roof-deck") 404 | data['features']=data['features'].str.replace("roof access","roof-deck") 405 | data['features']=data['features'].str.replace("common roof deck","roof-deck") 406 | data['features']=data['features'].str.replace("roof decks","roof-deck") 407 | data['features']=data['features'].str.replace("roof grilling area","roof-deck") 408 | data['features']=data['features'].str.replace("roof garden and lounge","roof-deck") 409 | data['features']=data['features'].str.replace("roof deck with stunning view","roof-deck") 410 | data['features']=data['features'].str.replace("roof deck with real grass","roof-deck") 411 | data['features']=data['features'].str.replace("roof deck with grills","roof-deck") 412 | data['features']=data['features'].str.replace("roof deck w/ grills","roof-deck") 413 | data['features']=data['features'].str.replace("roof deck / sun deck","roof-deck") 414 | data['features']=data['features'].str.replace("roof 
deck","roof-deck") 415 | 416 | data['features']=data['features'].str.replace("swimming pool","pool") 417 | data['features']=data['features'].str.replace("indoor pool","pool") 418 | 419 | data['features']=data['features'].str.replace("deco fireplace","fireplaces") 420 | data['features']=data['features'].str.replace("decorative fireplace","fireplaces") 421 | 422 | data['features']=data['features'].str.replace("yoga/pilates studio","yoga") 423 | data['features']=data['features'].str.replace("yoga studio","yoga") 424 | data['features']=data['features'].str.replace("yoga room","yoga") 425 | data['features']=data['features'].str.replace("yoga classes","yoga") 426 | data['features']=data['features'].str.replace("yoga and spin studios","yoga") 427 | data['features']=data['features'].str.replace("yoga an pilates class","yoga") 428 | data['features']=data['features'].str.replace("yoga / dance studio","yoga") 429 | 430 | 431 | # data["features"] = data["features"].apply(lambda x: afterRemoveStr(x, '')) 432 | # data["features"] = data["features"].apply(lambda x: afterRemoveFirstSpace(x)) 433 | data["features"] = data["features"].apply(lambda x: x.split(mark)) 434 | data["features"] = data["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x])) 435 | tfidf = CountVectorizer(stop_words="english", max_features=200) 436 | tr_sparse_feats = tfidf.fit_transform(data.iloc[train_idx, :]["features"]) 437 | te_sparse_feats = tfidf.transform(data.iloc[test_idx, :]["features"]) 438 | feats_names = ["features_" + x for x in tfidf.get_feature_names()] 439 | return data, tr_sparse_feats, te_sparse_feats, feats_names 440 | 441 | 442 | def locationProcess(data, train_idx, test_idx): 443 | # Clustering 444 | 445 | # train_x = data.iloc[train_idx,:][['new_latitude', 'new_longitude']] 446 | # stest_x = data.iloc[test_idx,:][['new_latitude', 'new_longitude']] 447 | train_x = data.iloc[train_idx, :][['latitude', 'longitude']] 448 | test_x = data.iloc[test_idx, :][['latitude', 'longitude']] 449 | kmeans_cluster = KMeans(n_clusters=20) 450 | res = kmeans_cluster.fit(train_x) 451 | res = kmeans_cluster.predict(pd.concat([train_x, test_x])) 452 | d = dict(zip(data['listing_id'], res)) 453 | data['cenroid'] = data['listing_id'].apply(lambda x: d[x]) 454 | # Manhattan distance 455 | center = [data.iloc[train_idx, :]['latitude'].mean(), data.iloc[train_idx, :]['longitude'].mean()] 456 | data['distance'] = abs(data['latitude'] - center[0]) + abs(data['longitude'] - center[1]) 457 | # data['distance_2'] = np.sqrt((data['latitude'] - center[0]) ** 2 + (data['longitude'] - center[1]) ** 2) 458 | 459 | return data 460 | 461 | 462 | def managerIdProcess(data, y, train_idx, test_idx): 463 | manager_lgt_dict = dict(data.groupby('manager_id')['longitude'].mean()) 464 | manager_ltt_dict = dict(data.groupby('manager_id')['latitude'].mean()) 465 | 466 | # Group manager_id with location info 467 | data["mean_man_longitude"] = data.apply(lambda row: manager_lgt_dict[row["manager_id"]], axis=1) 468 | data["mean_man_latitude"] = data.apply(lambda row: manager_ltt_dict[row["manager_id"]], axis=1) 469 | 470 | # Group manager_id with time info 471 | data = group_with_time_features(data, "manager_id") 472 | data = group_with_img_time_features(data, "manager_id") 473 | manager_stamp_dict = dict(data.groupby('manager_id')['time_stamp'].mean()) 474 | data["mean_man_timestamp"] = data.apply(lambda row: manager_stamp_dict[row["manager_id"]], axis=1) 475 | # manager_stamp_dict = dict(data.groupby('manager_id')['created_stamp'].mean()) 
476 | # data["mean_man_createdstamp"] = data.apply(lambda row: manager_stamp_dict[row["manager_id"]], axis=1) 477 | return data 478 | 479 | 480 | def photoProcess(data): 481 | data["photo_num"] = data["photos"].apply(len) 482 | return data 483 | 484 | 485 | def priceProcess(data): 486 | #data["out_price"] = data["price"].apply(lambda x: 1 if x < 700 or x > 15000 else 0) 487 | # Clean the outlier 488 | ulimit = 15000#np.percentile(data.price.values, 99) 489 | data.loc[data["price"] > ulimit, "price"] = ulimit 490 | dlimit = 350 491 | data.loc[data["price"] < dlimit, "price"] = dlimit 492 | data["price_per_room"] = data["price"] / (data["bedrooms"] + data["bathrooms"] + 1.0) 493 | data["price_per_bed"] = data["price"] / (data["bedrooms"] + 1.0) 494 | #* 495 | # data.loc[~np.isfinite(data["price_per_room"]), "price_per_room"] = 0 496 | # data.loc[~np.isfinite(data["price_per_bed"]), "price_per_bed"] = 0 497 | data["price_latitude"] = data["price"] / (data["latitude"] + 1.0) 498 | data["price_longitude"] = data["price"] / (data["longitude"] + 1.0) 499 | 500 | # Grouping price with size or build 501 | median_list = ['bedrooms', 'bathrooms', 'building_id'] 502 | # median_list = ['month', 'day', 'hour', 'weekday', 'quarter', 'week', 'passed', 'latest'] 503 | for col in median_list: 504 | median_price = data[[col, 'price']].groupby(col)['price'].median() 505 | median_price = median_price[data[col]].values.astype(float) 506 | data['median_' + col] = median_price 507 | data['ratio_' + col] = data['price'] / median_price 508 | data['median_' + col] = data['median_' + col].apply(lambda x: np.log(x)) 509 | # data["price"] = data["price"].apply(lambda x: np.log(x)) 510 | return data 511 | 512 | 513 | def streetAddrProcess(data): 514 | #data["new_addr"] = data["street_address"].apply(lambda x: ' '.join([x.split()[i] for i in range(1, len(x.split()))])) 515 | #data["new_addr"] = preprocessing.LabelEncoder().fit_transform(data["new_addr"]) 516 | # data["street_address"] = data["street_address"].apply(lambda x: x.replace('\u00a0', '').strip().lower) 517 | return data 518 | 519 | 520 | def listingIdProcess(data): 521 | # It's weird。 522 | data["listing_id"] = data["listing_id"] - 68119576.0 523 | return data 524 | 525 | 526 | def coreProcess(data, y_train, train_idx, test_idx): 527 | data = listingIdProcess(data) 528 | data = bedroomProcess(data, train_idx, test_idx) 529 | data = bathroomProcess(data, train_idx, test_idx) 530 | data["room_diff"] = data["bathrooms"] - data["bedrooms"] 531 | data["room_num"] = data["bedrooms"] + data["bathrooms"] 532 | data = createdProcess(data) 533 | data = buildingIdProcess(data, y_train, train_idx, test_idx) 534 | data, tr_sparsed, te_sparsed, feats_sparsed = descriptionProcess(data, train_idx, test_idx) 535 | data = displayAddrProcess(data) 536 | data, tr_sparse, te_sparse, feats_sparse = featuresProcess(data, train_idx, test_idx) 537 | data = locationProcess(data, train_idx, test_idx) 538 | data = managerIdProcess(data, y_train, train_idx, test_idx) 539 | data = photoProcess(data) 540 | data = priceProcess(data) 541 | data = streetAddrProcess(data) 542 | 543 | categorical = ["display_address", "manager_id", "building_id", "street_address"] 544 | for f in categorical: 545 | if data[f].dtype=='object': 546 | cases=defaultdict(int) 547 | temp=np.array(data[f]).tolist() 548 | for k in temp: 549 | cases[k]+=1 550 | # print(f, len(cases)) 551 | data[f] = data[f].apply(lambda x: cases[x]) 552 | 553 | feats_in_use = [col for col in data.columns if col not in FEATURE_NOT_USE] 554 
555 |     data_train = np.array(data.iloc[train_idx, :][feats_in_use])
556 |     data_test = np.array(data.iloc[test_idx, :][feats_in_use])
557 |     # Feature scaling: fit the scaler on the training rows only, then apply it to the test rows
558 |     stda = StandardScaler()
559 |     data_train = stda.fit_transform(data_train)
560 |     data_test = stda.transform(data_test)
561 |     # High-cardinality features
562 |     high_card_feats = ["building_id", "manager_id", "longitude", "room_diff"]
563 |     # C0 = [3, 12, 0, 4]
564 |     C0 = [feats_in_use.index(f) for f in high_card_feats]
565 |     W_train, W_cv = convert_to_avg(data_train, y_train, data_test, seed=1, cvals=5, roundings=2, columns=C0)
566 |     # Add the sparse features and the target-mean-encoded high-cardinality columns
567 |     data_train = sparse.hstack([data_train, tr_sparse, tr_sparsed, W_train[:, C0]]).tocsr()
568 |     data_test = sparse.hstack([data_test, te_sparse, te_sparsed, W_cv[:, C0]]).tocsr()
569 |     feats_in_use.extend(feats_sparse)
570 |     feats_in_use.extend(feats_sparsed)
571 |     feats_in_use.extend(["build_high_card", "manager_high_card", "longitude_high_card", "room_diff_high_card"])  # one name per column of W_train[:, C0]
572 |     # print(len(feats_in_use))
573 |     # print(tr_sparse.toarray().shape, tr_sparsed.toarray().shape, len(feats_in_use), data_train.shape)
574 |     return data_train, data_test, feats_in_use
575 | 
576 | 
577 | # Copied from KazAnova's starter code: out-of-fold target-mean ("likelihood") encoding
578 | def convert_dataset_to_avg(xc, yc, xt, rounding=2, cols=None):
579 |     xc = xc.tolist()
580 |     xt = xt.tolist()
581 |     yc = yc.tolist()
582 |     if cols is None:
583 |         cols = [k for k in range(0, len(xc[0]))]
584 |     woe = [[0.0 for k in range(0, len(cols))] for g in range(0, len(xt))]
585 |     good = []
586 |     bads = []
587 |     for col in cols:
588 |         dictsgoods = defaultdict(int)
589 |         dictsbads = defaultdict(int)
590 |         good.append(dictsgoods)
591 |         bads.append(dictsbads)
592 |     total_count = 0.0
593 |     total_sum = 0.0
594 | 
595 |     for a in range(0, len(xc)):
596 |         target = yc[a]
597 |         total_sum += target
598 |         total_count += 1.0
599 |         for j in range(0, len(cols)):
600 |             col = cols[j]
601 |             good[j][round(xc[a][col], rounding)] += target
602 |             bads[j][round(xc[a][col], rounding)] += 1.0
603 |     # print(total_goods, total_bads)
604 | 
605 |     for a in range(0, len(xt)):
606 |         for j in range(0, len(cols)):
607 |             col = cols[j]
608 |             if round(xt[a][col], rounding) in good[j]:
609 |                 woe[a][j] = float(good[j][round(xt[a][col], rounding)]) / float(bads[j][round(xt[a][col], rounding)])
610 |             else:
611 |                 woe[a][j] = round(total_sum / total_count)
612 |     return woe
613 | 
614 | 
615 | def convert_to_avg(X, y, Xt, seed=1, cvals=5, roundings=2, columns=None):
616 | 
617 |     if columns is None:
618 |         columns = [k for k in range(0, (X.shape[1]))]
619 |         # print("it is not!!")
620 |     X = X.tolist()
621 |     Xt = Xt.tolist()
622 |     woetrain = [[0.0 for k in range(0, len(X[0]))] for g in range(0, len(X))]
623 |     woetest = [[0.0 for k in range(0, len(X[0]))] for g in range(0, len(Xt))]
624 | 
625 |     kfolder = StratifiedKFold(y, n_folds=cvals, shuffle=True, random_state=seed)  # old sklearn.cross_validation API
626 |     for train_index, test_index in kfolder:
627 |         # training / validation split for this fold
628 |         X_train, X_cv = np.array(X)[train_index], np.array(X)[test_index]
629 |         y_train = np.array(y)[train_index]
630 | 
631 |         woecv = convert_dataset_to_avg(X_train, y_train, X_cv, rounding=roundings, cols=columns)
632 |         X_cv = X_cv.tolist()
633 |         no = 0
634 |         for real_index in test_index:
635 |             for j in range(0, len(X_cv[0])):
636 |                 woetrain[real_index][j] = X_cv[no][j]
637 |             no += 1
638 |         no = 0
639 |         for real_index in test_index:
640 |             for j in range(0, len(columns)):
641 |                 col = columns[j]
642 |                 woetrain[real_index][col] = woecv[no][j]
643 |             no += 1
644 |     woefinal = convert_dataset_to_avg(np.array(X), np.array(y), np.array(Xt), rounding=roundings, cols=columns)
645 | 
646 |     for real_index in range(0, len(Xt)):
647 |         for j in range(0, len(Xt[0])):
648 |             woetest[real_index][j] = Xt[real_index][j]
649 | 
650 |     for real_index in range(0, len(Xt)):
651 |         for j in range(0, len(columns)):
652 |             col = columns[j]
653 |             woetest[real_index][col] = woefinal[real_index][j]
654 | 
655 |     return np.array(woetrain), np.array(woetest)
656 | 
657 | 
658 | # Grouping (very important)
659 | def group_with_time_features(data, g_feat):
660 |     mean_month_dict = dict(data.groupby(g_feat)['month'].mean())
661 |     data["mean_" + g_feat + "_month"] = data.apply(lambda row: mean_month_dict[row[g_feat]], axis=1)
662 |     mean_day_dict = dict(data.groupby(g_feat)['day'].mean())
663 |     data["mean_" + g_feat + "_day"] = data.apply(lambda row: mean_day_dict[row[g_feat]], axis=1)
664 |     mean_hour_dict = dict(data.groupby(g_feat)['hour'].mean())
665 |     data["mean_" + g_feat + "_hour"] = data.apply(lambda row: mean_hour_dict[row[g_feat]], axis=1)
666 |     mean_weekday_dict = dict(data.groupby(g_feat)['weekday'].mean())
667 |     data["mean_" + g_feat + "_weekday"] = data.apply(lambda row: mean_weekday_dict[row[g_feat]], axis=1)
668 |     mean_quarter_dict = dict(data.groupby(g_feat)['quarter'].mean())
669 |     data["mean_" + g_feat + "_quater"] = data.apply(lambda row: mean_quarter_dict[row[g_feat]], axis=1)
670 |     mean_week_dict = dict(data.groupby(g_feat)['week'].mean())
671 |     data["mean_" + g_feat + "_week"] = data.apply(lambda row: mean_week_dict[row[g_feat]], axis=1)
672 |     mean_passed_dict = dict(data.groupby(g_feat)['passed'].mean())
673 |     data["mean_" + g_feat + "_passed"] = data.apply(lambda row: mean_passed_dict[row[g_feat]], axis=1)
674 |     mean_latest_dict = dict(data.groupby(g_feat)['latest'].mean())
675 |     data["mean_" + g_feat + "_latest"] = data.apply(lambda row: mean_latest_dict[row[g_feat]], axis=1)
676 | 
677 |     return data
678 | 
679 | 
680 | def group_with_img_time_features(data, g_feat):
681 |     mean_month_dict = dict(data.groupby(g_feat)['img_month'].mean())
682 |     data["mean_" + g_feat + "_img_month"] = data.apply(lambda row: mean_month_dict[row[g_feat]], axis=1)
683 |     mean_day_dict = dict(data.groupby(g_feat)['img_day'].mean())
684 |     data["mean_" + g_feat + "_img_day"] = data.apply(lambda row: mean_day_dict[row[g_feat]], axis=1)
685 |     mean_hour_dict = dict(data.groupby(g_feat)['img_hour'].mean())
686 |     data["mean_" + g_feat + "_img_hour"] = data.apply(lambda row: mean_hour_dict[row[g_feat]], axis=1)
687 |     # mean_weekday_dict = dict(data.groupby(g_feat)['img_weekday'].mean())
688 |     # data["mean_" + g_feat + "_img_weekday"] = data.apply(lambda row: mean_weekday_dict[row[g_feat]], axis=1)
689 |     # mean_quarter_dict = dict(data.groupby(g_feat)['img_quarter'].mean())
690 |     # data["mean_" + g_feat + "_img_quater"] = data.apply(lambda row: mean_quarter_dict[row[g_feat]], axis=1)
691 |     # mean_week_dict = dict(data.groupby(g_feat)['img_week'].mean())
692 |     # data["mean_" + g_feat + "_img_week"] = data.apply(lambda row: mean_week_dict[row[g_feat]], axis=1)
693 |     mean_passed_dict = dict(data.groupby(g_feat)['img_passed'].mean())
694 |     data["mean_" + g_feat + "_img_passed"] = data.apply(lambda row: mean_passed_dict[row[g_feat]], axis=1)
695 |     mean_latest_dict = dict(data.groupby(g_feat)['img_latest'].mean())
696 |     data["mean_" + g_feat + "_img_latest"] = data.apply(lambda row: mean_latest_dict[row[g_feat]], axis=1)
697 |     return data
698 | 
699 | 
700 | 
701 | 
702 | 
703 | 
704 | 
--------------------------------------------------------------------------------
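
A minimal usage sketch for the out-of-fold target-mean encoding above (convert_to_avg). The toy arrays, the single encoded column index [1], the 3-fold setting, and the import line are illustrative assumptions only; coreProcess calls the same function with the C0 indices of its high-cardinality columns. The function relies on the old StratifiedKFold(y, n_folds=...) interface from sklearn.cross_validation, so the sketch assumes a matching scikit-learn version.

import numpy as np
# from preprocess import convert_to_avg  # assumed import path

# Toy data: column 1 plays the role of a high-cardinality feature (values 10.0 / 20.0).
X = np.array([[0.11, 10.0],
              [0.25, 10.0],
              [0.37, 20.0],
              [0.41, 20.0],
              [0.52, 10.0],
              [0.66, 20.0]])
y = np.array([0, 1, 1, 0, 1, 0])
Xt = np.array([[0.70, 10.0],
               [0.83, 30.0]])  # 30.0 never occurs in X, so it falls back to the rounded global target mean

# Column 1 is replaced by out-of-fold target means on the training side and by
# full-data target means on the test side; the other columns pass through unchanged.
W_train, W_test = convert_to_avg(X, y, Xt, seed=1, cvals=3, roundings=2, columns=[1])
print(W_train[:, 1])
print(W_test[:, 1])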