├── .DS_Store ├── stack ├── StackNet.jar ├── start.sh ├── parse.py ├── params.txt └── utils.py ├── ppt └── AIC-Sharing-11-19.pptx ├── README.md ├── classifiers.py ├── modelTraining.py └── preprocess.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/HEAD/.DS_Store -------------------------------------------------------------------------------- /stack/StackNet.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/HEAD/stack/StackNet.jar -------------------------------------------------------------------------------- /ppt/AIC-Sharing-11-19.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/HEAD/ppt/AIC-Sharing-11-19.pptx -------------------------------------------------------------------------------- /stack/start.sh: -------------------------------------------------------------------------------- 1 | java -Xmx3048m -jar StackNet.jar train train_file=train_stacknet.csv test_file=test_stacknet.csv params=params.txt pred_file=sigma_stack_pred.csv test_target=true verbose=true Threads=4 stackdata=false folds=5 seed=1 metric=logloss 2 | -------------------------------------------------------------------------------- /stack/parse.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | 4 | with open("result.txt", "r") as f: 5 | raw = "".join(f.readlines()) 6 | 7 | str_res = re.findall(pattern="logloss : 0\.[0-9]+", string=raw) 8 | res = [float(x.split(" : ")[1]) for x in str_res] 9 | results = {i: [] for i in range(len(res) // 5)} 10 | for i in range(len(res)): 11 | results[i % (len(res) // 5)].append(res[i]) 12 | results = {i: np.mean(results[i]) for i in results} 13 | for item in sorted(results.items(), key=lambda x: x[1]): 14 | print(item) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | * ```preprocess.py```: data cleaning, feature engineering 4 | * ```modelTraining.py```: cross validation, submission generating, stacking preparing 5 | * ```classifiers.py```: my encapsulation of xgboost 6 | * stack 7 | * ```StackNet.jar```: stacking tools shared by KazAnova, repo is [here](https://github.com/kaz-Anova/StackNet) 8 | * ```parse.py```: tools for evaluate the cv scores during stacking. 
9 | * ```utils.py```: generating submission after StackNet 10 | * ```start.sh```: commands for executing StackNet 11 | * ```params.txt```: my params for stacking 12 | 13 | ### links: 14 | * [Kaggle:Rental Listing Inquireies](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries) 15 | * [Summary of getting a silver medal in kaggle](http://scarletpan.github.io/summary-of-get-a-silver-medal-in-kaggle/) 16 | * [Kaggle 首战拿银总结 | 入门指导 (长文、干货) -- 知乎专栏](https://zhuanlan.zhihu.com/p/26645088) 17 | * [AI Challenge 分享会PPT](https://github.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/blob/master/ppt/AIC-Sharing-11-19.pptx) 18 | 19 | -------------------------------------------------------------------------------- /stack/params.txt: -------------------------------------------------------------------------------- 1 | LogisticRegression Type:Liblinear C:6.1 threads:1 usescale:True maxim_Iteration:200 seed:1 verbose:false 2 | GradientBoostingForestClassifier estimators:300 shrinkage:0.18 threads:1 offset:0.00001 max_depth:3 max_features:0.65 min_leaf:2.0 min_split:7.0 Objective:RMSE row_subsample:1.0 seed:1 verbose:false 3 | LibFmClassifier maxim_Iteration:70 C:0.0041 C2:0.00120 lfeatures:1 seed:1 usescale:True init_values:0.046 learn_rate:0.05 smooth:0.1 threads:1 verbose:false 4 | softmaxnnclassifier usescale:True seed:1 Type:SGD maxim_Iteration:50 C:0.0000008 shuffle:false tolerance:0.01 learn_rate:0.0065 smooth:0.1 h1:40 h2:35 connection_nonlinearity:Relu init_values:0.020 verbose:false 5 | RandomForestClassifier bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.95 seed:1 verbose:false 6 | AdaboostRandomForestClassifier bootsrap:false weight_thresold:0.95 estimators:100 threads:1 max_depth:6 max_features:0.5 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.9 seed:1 verbose:false 7 | GradientBoostingForestRegressor bootsrap:false estimators:300 shrinkage:0.1 threads:1 offset:0.00001 max_depth:3 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.9 seed:1 verbose:false 8 | RandomForestRegressor bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.95 seed:1 verbose:false 9 | LibFmRegressor maxim_Iteration:70 C:0.0001 C2:0.0009 lfeatures:2 seed:1 usescale:True init_values:0.1 learn_rate:0.1 threads:1 verbose:false 10 | 11 | RandomForestClassifier bootsrap:false estimators:500 threads:3 offset:0.00001 max_depth:5 max_features:0.3 min_leaf:1.0 min_split:5.0 Objective:ENTROPY row_subsample:0.8 seed:1 verbose:false -------------------------------------------------------------------------------- /classifiers.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | import numpy as np 3 | from sklearn.metrics import log_loss 4 | 5 | 6 | class xgboostClassifier(): 7 | def __init__(self, **params): 8 | self.clf = None 9 | self.progress = {} 10 | self.params = params 11 | 12 | def fit(self, X, y): 13 | xg_train = xgb.DMatrix(X, label=y) 14 | self.clf = xgb.train(self.params, xg_train, self.params['num_rounds']) 15 | 16 | def fit_CV(self, X_train, X_val, y_train, y_val): 17 | xg_train = xgb.DMatrix(X_train, label=y_train) 18 | xg_val = xgb.DMatrix(X_val, label=y_val) 19 | watchlist = [(xg_train, 'train'), (xg_val, 'eval')] 20 | self.clf = xgb.train(self.params, xg_train, self.params['num_rounds'], 21 | watchlist, 
early_stopping_rounds=200, evals_result=self.progress) 22 | 23 | def get_eval_res(self): 24 | return self.progress 25 | 26 | def score(self, X, y): 27 | Y = self.predict_proba(X) 28 | return 1 / log_loss(y, Y) 29 | 30 | def predict_proba(self, X_test): 31 | res = self.clf.predict(xgb.DMatrix(X_test)) 32 | return res.astype(np.float32) 33 | 34 | def predict(self, X_test): 35 | res = np.argmax(self.clf.predict(xgb.DMatrix(X_test)), axis=1) 36 | return res 37 | 38 | def get_params(self, **params): 39 | return self.params 40 | 41 | def set_params(self, **params): 42 | self.params.update(params) 43 | 44 | def getSortedImportance(self, features): 45 | with open('xgb.fmap', 'w') as f: 46 | for i in range(len(features)): 47 | f.write('{0}\t{1}\tq\n'.format(i, features[i])) 48 | importance = self.clf.get_fscore(fmap='xgb.fmap') 49 | importance = sorted(importance.items(), key=operator.itemgetter(1)) 50 | #print(importance) 51 | return importance 52 | 53 | class BaseClassifier(object): 54 | def __init__(self, clf, seed=0, params=None): 55 | params['random_state'] = seed 56 | self.clf = clf(**params) 57 | 58 | def train(self, x_train, y_train): 59 | self.clf.fit(x_train, y_train) 60 | 61 | def predict(self, x): 62 | return self.clf.predict(x) 63 | 64 | def predict_proba(self, x): 65 | return self.clf.predict_proba(x) 66 | 67 | def fit(self,x,y): 68 | return self.clf.fit(x,y) 69 | 70 | def set_params(self, **params): 71 | self.params.update(params) 72 | -------------------------------------------------------------------------------- /stack/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def getAvgSub(subs_in): 6 | subs = [] 7 | for sub in subs_in: 8 | sub = sub.sort_values(by=["listing_id"]).reset_index() 9 | subs.append(sub) 10 | n = len(subs) 11 | new_sub = subs[0].copy() 12 | for i in range(1, n): 13 | sub = subs[i] 14 | new_sub["high"] = new_sub["high"] + sub["high"] 15 | new_sub["medium"] = new_sub["medium"] + sub["medium"] 16 | new_sub["low"] = new_sub["low"] + sub["low"] 17 | new_sub["high"] = new_sub["high"] / n 18 | new_sub["medium"] = new_sub["medium"] / n 19 | new_sub["low"] = new_sub["low"] / n 20 | del new_sub["index"] 21 | return new_sub 22 | 23 | def getWeightedAvgSub(subs_in, weights): 24 | assert np.sum(weights) == 1, "Sum of weights need to be 1" 25 | subs = [] 26 | for sub in subs_in: 27 | sub = sub.sort_values(by=["listing_id"]).reset_index() 28 | subs.append(sub) 29 | n = len(subs) 30 | new_sub = subs[0].copy() 31 | new_sub["high"] = new_sub["high"] * weights[0] 32 | new_sub["medium"] = new_sub["medium"] * weights[0] 33 | new_sub["low"] = new_sub["low"] * weights[0] 34 | for i in range(1, n): 35 | sub = subs[i] 36 | new_sub["high"] = new_sub["high"] + sub["high"] * weights[i] 37 | new_sub["medium"] = new_sub["medium"] + sub["medium"] * weights[i] 38 | new_sub["low"] = new_sub["low"] + sub["low"] * weights[i] 39 | del new_sub["index"] 40 | return new_sub 41 | 42 | def generateStackSub(test_file_name, sub_file_name): 43 | test_array = np.loadtxt(test_file_name, delimiter=",") 44 | test = pd.DataFrame(test_array) 45 | sub_array = np.loadtxt(sub_file_name, delimiter=",") 46 | sub = pd.DataFrame(sub_array) 47 | sub.columns = ["high", "medium", "low"] 48 | sub["listing_id"] = test.iloc[:, 0].apply(lambda x: int(x)) 49 | sub.to_csv("new_sub.csv", index=False) 50 | 51 | 52 | 53 | 54 | def correct(df): 55 | interest_levels = ['low', 'medium', 'high'] 56 | 57 | tau = { 58 | 'low': 
0.69195995, 59 | 'medium': 0.23108864, 60 | 'high': 0.07695141, 61 | } 62 | 63 | y = df[interest_levels].mean() 64 | a = [tau[k] / y[k] for k in interest_levels] 65 | print(a) 66 | 67 | def f(p): 68 | for k in range(len(interest_levels)): 69 | p[k] *= a[k] 70 | return p / p.sum() 71 | 72 | df_correct = df.copy() 73 | df_correct[interest_levels] = df_correct[interest_levels].apply(f, axis=1) 74 | 75 | y = df_correct[interest_levels].mean() 76 | a = [tau[k] / y[k] for k in interest_levels] 77 | print(a) 78 | 79 | return df_correct -------------------------------------------------------------------------------- /modelTraining.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import random 4 | from collections import defaultdict 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.model_selection import KFold, StratifiedKFold, train_test_split 8 | from sklearn.metrics import log_loss 9 | from preprocess import coreProcess 10 | from classifiers import xgboostClassifier 11 | 12 | TRAIN_FILE_NAME = '~/Kaggle/RLI/input/train.json' 13 | TEST_FILE_NAME = '~/Kaggle/RLI/input/test.json' 14 | target_num_map = {'high': 0, 'medium': 1, 'low': 2} 15 | train_data = pd.read_json(TRAIN_FILE_NAME).reset_index() 16 | test_data = pd.read_json(TEST_FILE_NAME).reset_index() 17 | list_img_time = pd.read_csv("~/Kaggle/RLI/input/listing_image_time.csv") 18 | train_data = train_data.merge(list_img_time, left_on="listing_id", right_on="Listing_Id", how='inner') 19 | test_data = test_data.merge(list_img_time, left_on="listing_id", right_on="Listing_Id", how='inner') 20 | RS = 2016 21 | random.seed(RS) 22 | np.random.seed(RS) 23 | # RS = 0 24 | 25 | def validation_score(early_stop=False): 26 | clf = xgboostClassifier( 27 | objective = 'multi:softprob', 28 | eval_metric = 'mlogloss', 29 | num_class = 3, 30 | nthread = 3, 31 | eta = 0.04, 32 | max_depth = 6, 33 | subsample = 0.7, 34 | colsample_bytree = 1.0, 35 | colsample_bylevel = 0.7, 36 | min_child_weight=1, 37 | silent = 1, 38 | num_rounds = 700, 39 | seed = RS, 40 | ) 41 | print("*** Validation start ***") 42 | data = train_data.copy() 43 | y = data["interest_level"].apply(lambda x: target_num_map[x]) 44 | del data["interest_level"] 45 | 46 | # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True) 47 | skf = StratifiedKFold(n_splits=3, shuffle=False) 48 | cv_scores = [] 49 | i = 0 50 | for train_idx, val_idx in skf.split(data, y): 51 | i += 1 52 | X = data.copy() 53 | y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] 54 | X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) 55 | clf.fit(X_train, y_train) 56 | # clf.fit_CV(X_train, X_val, y_train, y_val) 57 | y_val_pred = clf.predict_proba(X_val) 58 | loss = log_loss(y_val, y_val_pred) 59 | print("Iteration {}'s loss: {}".format(i, loss)) 60 | cv_scores.append(loss) 61 | if early_stop: 62 | break 63 | print("*** Validation finished ***\n") 64 | return cv_scores 65 | 66 | 67 | def validation_avg_score(clfs): 68 | print("*** Validation start ***") 69 | data = train_data.copy() 70 | y = data["interest_level"].apply(lambda x: target_num_map[x]) 71 | del data["interest_level"] 72 | 73 | # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True) 74 | skf = StratifiedKFold(n_splits=3) 75 | cv_scores = {i:[] for i in range(len(clfs))} 76 | cv_scores["Avg"] = [] 77 | i = 0 78 | for train_idx, val_idx in skf.split(data, y): 79 | i += 1 80 | X = data.copy() 81 | y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] 
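# coreProcess is re-fit inside every fold: the TF-IDF/CountVectorizer models
# and the target-based high-cardinality encodings are fit on the training
# indices of this split only, which keeps those features free of information
# from the validation fold.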
82 | X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) 83 | tmp = [] 84 | preds = [] 85 | j = 0 86 | for clf in clfs: 87 | clf.fit(X_train, y_train) 88 | y_val_pred = clf.predict_proba(X_val) 89 | tmp.append(y_val_pred) 90 | loss = log_loss(y_val, y_val_pred) 91 | cv_scores[j].append(loss) 92 | preds.append(y_val_pred) 93 | j += 1 94 | print("clf_{}, Iteration {}'s loss: {}".format(j, i, loss)) 95 | preds = np.array(preds) 96 | avg_pred = np.mean(preds, axis=0) 97 | loss = log_loss(y_val, avg_pred) 98 | cv_scores["Avg"].append(loss) 99 | print("Iteration {}'s Avg loss: {}".format(i, loss)) 100 | for i in range(len(clfs)): 101 | print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i]))) 102 | print("Average validation loss : {}".format(np.mean(cv_scores["Avg"]))) 103 | print("*** Validation finished ***\n") 104 | return cv_scores["Avg"] 105 | 106 | 107 | def paramSearch(clf, param_dict): 108 | 109 | def outer_join(left, right): 110 | if left == []: 111 | return right 112 | if right == []: 113 | return left 114 | res = [] 115 | for i in left: 116 | for j in right: 117 | if isinstance(i, list): 118 | tmp = i[:] 119 | tmp.append(j) 120 | res.append(tmp) 121 | else: 122 | res.append([i, j]) 123 | return res 124 | # Creating list of param_dict 125 | param_list = sorted(param_dict.items(), key=lambda x: x[0]) 126 | param_keys = [ item[0] for item in param_list ] 127 | param_vals = [ item[1] for item in param_list ] 128 | all_vals = [] 129 | for val in param_vals: 130 | all_vals = outer_join(all_vals, val) 131 | all_param_lists = [] 132 | for vals in all_vals: 133 | all_param_lists.append(dict(zip(param_keys, vals))) 134 | # for item in all_param_lists: 135 | # print(item) 136 | 137 | # Searching 138 | best_score = float('inf') 139 | best_params = None 140 | scores = [] 141 | i = 0 142 | for params in all_param_lists: 143 | print("\n" + "-" * 70) 144 | for param_name in params.keys(): 145 | print("{} : {}".format(param_name, params[param_name])) 146 | clf.set_params(**params) 147 | score = np.mean(validation_score(clf)) 148 | if score < best_score: 149 | best_score = score 150 | best_params = params 151 | i += 1 152 | print("{} / {}, Done".format(i, len(all_param_lists))) 153 | print("Score: ", score) 154 | scores.append(score) 155 | print(scores) 156 | print("Best parameters:") 157 | for param_name in best_params.keys(): 158 | print("{} : {}".format(param_name, best_params[param_name])) 159 | print("Score: ", best_score) 160 | 161 | 162 | def gen_sub(): 163 | train = train_data.copy() 164 | train_idx = [i for i in range(train.shape[0])] 165 | test = test_data.copy() 166 | test_idx = [i + train.shape[0] for i in range(test.shape[0])] 167 | y = train["interest_level"].apply(lambda x: target_num_map[x]) 168 | del train["interest_level"] 169 | data = pd.concat([train, test]).reset_index() 170 | X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) 171 | xgb_clf = xgboostClassifier( 172 | objective = 'multi:softprob', 173 | eval_metric = 'mlogloss', 174 | num_class = 3, 175 | nthread = 12, 176 | eta = 0.02, 177 | max_depth = 6, 178 | subsample = 0.8, 179 | colsample_bytree = 1.0, 180 | colsample_bylevel = 0.8, 181 | min_child_weight=1, 182 | silent = 1, 183 | num_rounds = 1700, 184 | seed = RS, 185 | ) 186 | print("Trainning:...") 187 | xgb_clf.fit(X_train, y) 188 | 189 | preds = xgb_clf.predict_proba(X_test) 190 | sub = pd.DataFrame(preds) 191 | # sub.columns = ["high", "medium", "low"] 192 | sub.columns = [ "high", "medium", "low"] 193 | sub["listing_id"] = 
test.listing_id.values 194 | sub.to_csv("submission.csv", index=False) 195 | 196 | 197 | def genAvgSub(clfs): 198 | train = train_data.copy() 199 | train_idx = [i for i in range(train.shape[0])] 200 | test = test_data.copy() 201 | test_idx = [i + train.shape[0] for i in range(test.shape[0])] 202 | y = train["interest_level"].apply(lambda x: target_num_map[x]) 203 | del train["interest_level"] 204 | data = pd.concat([train, test]).reset_index() 205 | X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) 206 | print("Trainning:...") 207 | preds = [] 208 | for i in range(len(clfs)): 209 | print("Clf_{} fiting".format(i)) 210 | clfs[i].fit(X_train, y) 211 | print("Clf_{} predicting".format(i)) 212 | pred = clfs[i].predict_proba(X_test) 213 | preds.append(pred) 214 | sub = pd.DataFrame(np.mean(preds, axis=0)) 215 | # sub.columns = ["high", "medium", "low"] 216 | sub.columns = [ "high", "medium", "low"] 217 | sub["listing_id"] = test.listing_id.values 218 | sub.to_csv("submission.csv", index=False) 219 | print("Train done.") 220 | 221 | 222 | def validate(clfs): 223 | cv_scores = validation_avg_score(clfs) 224 | return cv_scores 225 | 226 | 227 | def search(): 228 | param_dict = { 229 | 'eta' : [0.02], 230 | 'max_depth' : [6], 231 | 'subsample' : [0.8], 232 | 'colsample_bylevel' : [0.7], 233 | 'num_rounds' : [1400, 1500, 1600, 1650], 234 | } 235 | clf = xgboostClassifier( 236 | objective = 'multi:softprob', 237 | eval_metric = 'mlogloss', 238 | num_class = 3, 239 | nthread = 12, 240 | eta = 0.04, 241 | max_depth = 6, 242 | subsample = 0.7, 243 | colsample_bytree = 1.0, 244 | colsample_bylevel = 1.0, 245 | min_child_weight=1, 246 | silent = 1, 247 | num_rounds = 700, 248 | seed = RS, 249 | ) 250 | paramSearch(clf, param_dict) 251 | 252 | 253 | def write2file(cv_scores, val_desc=None): 254 | print("*" * 50) 255 | print("Cross validation loss: ", np.mean(cv_scores)) 256 | with open("results.log", "a") as fp: 257 | fp.write(time.strftime("%m/%d/%Y %H:%M") + '\n') 258 | if(val_desc is not None): 259 | fp.write(val_desc + '\n') 260 | for score in cv_scores: 261 | fp.write(str(score) + " ") 262 | fp.write("\nCross Validation: {}\n".format(np.array(cv_scores).mean())) 263 | fp.write("*" * 50 + "\n") 264 | 265 | 266 | def stacking(clfs): 267 | print("Stacking") 268 | train = train_data.copy() 269 | test = test_data.copy() 270 | y = train["interest_level"].apply(lambda x: target_num_map[x]) 271 | del train["interest_level"] 272 | train_stackers = [] 273 | for RS in [0, 1, 2, 64, 128, 256, 512, 1024, 2048, 4096]: 274 | skf = StratifiedKFold(n_splits=10, random_state=RS, shuffle=True) 275 | #Create Arrays for meta 276 | train_stacker = [[0.0 for s in range(3)] for k in range (0,(train.shape[0]))] 277 | cv_scores = {i:[] for i in range(len(clfs))} 278 | cv_scores["Avg"] = [] 279 | print("Begin 10-flod cross validation") 280 | cnt = 0 281 | for train_idx, val_idx in skf.split(train, y): 282 | cnt += 1 283 | X = train.copy() 284 | y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] 285 | X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) 286 | X_train.toarray() 287 | preds = [] 288 | k = 0 289 | for clf in clfs: 290 | clf.fit(X_train, y_train) 291 | y_val_pred = clf.predict_proba(X_val) 292 | loss = log_loss(y_val, y_val_pred) 293 | preds.append(y_val_pred) 294 | cv_scores[k].append(loss) 295 | k += 1 296 | print("Clf_{} iteration {}'s loss: {}".format(k, cnt, loss)) 297 | preds = np.array(preds) 298 | avg_pred = np.mean(preds, axis=0) 299 | avg_loss = log_loss(y_val, avg_pred) 
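# The fold-averaged class probabilities computed above do double duty: they are
# scored against the validation fold here, and a few lines below they are
# copied into train_stacker as the out-of-fold meta-features that StackNet
# later reads from train_stacknet.csv.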
300 | cv_scores["Avg"].append(avg_loss) 301 | print("Iteration {}'s Avg loss: {}".format(cnt, avg_loss)) 302 | no = 0 303 | for real_idx in val_idx: 304 | for i in range(3): 305 | train_stacker[real_idx][i] = avg_pred[no][i] 306 | no += 1 307 | for i in range(len(clfs)): 308 | print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i]))) 309 | print("Average validation loss : {}".format(np.mean(cv_scores["Avg"]))) 310 | train_stackers.append(train_stacker) 311 | train_stacker = np.mean(train_stackers, axis=0) 312 | print("*** Validation finished ***\n") 313 | 314 | test_stacker = [[0.0 for s in range(3)] for k in range (0,(test.shape[0]))] 315 | train_idx = [i for i in range(train.shape[0])] 316 | test_idx = [i + train.shape[0] for i in range(test.shape[0])] 317 | data = pd.concat([train, test]).reset_index() 318 | X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) 319 | print(X_train.shape, len(train_stacker)) 320 | print("Begin predicting") 321 | preds = [] 322 | for i in range(len(clfs)): 323 | print("Clf_{} fiting".format(i)) 324 | clfs[i].fit(X_train, y) 325 | print("Clf_{} predicting".format(i)) 326 | pred = clfs[i].predict_proba(X_test) 327 | preds.append(pred) 328 | preds = np.mean(preds, axis=0) 329 | for pr in range (0, len(preds)): 330 | for d in range (0,3): 331 | test_stacker[pr][d]=(preds[pr][d]) 332 | print ("merging columns") 333 | #stack xgboost predictions 334 | X_train = np.column_stack((X_train.toarray(),train_stacker)) 335 | # stack id to test 336 | X_test = np.column_stack((X_test.toarray(),test_stacker)) 337 | # stack target to train 338 | X = np.column_stack((y,X_train)) 339 | ids = test.listing_id.values 340 | X_test = np.column_stack((ids, X_test)) 341 | np.savetxt("./train_stacknet.csv", X, delimiter=",", fmt='%.5f') 342 | np.savetxt("./test_stacknet.csv", X_test, delimiter=",", fmt='%.5f') 343 | print("Write results...") 344 | output_file = "submission_{}.csv".format(np.mean(cv_scores["Avg"])) 345 | print("Writing submission to %s" % output_file) 346 | f = open(output_file, "w") 347 | f.write("listing_id,high,medium,low\n")# the header 348 | for g in range(0, len(test_stacker)) : 349 | f.write("%s" % (ids[g])) 350 | for prediction in test_stacker[g]: 351 | f.write(",%f" % (prediction)) 352 | f.write("\n") 353 | f.close() 354 | print("Done.") 355 | 356 | 357 | if __name__ == "__main__": 358 | clfs = [] 359 | # clfs.append(xgboostClassifier( 360 | # objective = 'multi:softprob', 361 | # eval_metric = 'mlogloss', 362 | # num_class = 3, 363 | # nthread = 6, 364 | # eta = 0.04, 365 | # max_depth = 6, 366 | # subsample = 0.7, 367 | # colsample_bytree = 1.0, 368 | # colsample_bylevel = 0.7, 369 | # min_child_weight=1, 370 | # silent = 1, 371 | # num_rounds = 700, 372 | # seed = 0, 373 | # )) 374 | # clfs.append(xgboostClassifier( 375 | # objective = 'multi:softprob', 376 | # eval_metric = 'mlogloss', 377 | # num_class = 3, 378 | # nthread = 6, 379 | # eta = 0.02, 380 | # max_depth = 6, 381 | # subsample = 0.8, 382 | # colsample_bytree = 1.0, 383 | # colsample_bylevel = 0.8, 384 | # min_child_weight=1, 385 | # silent = 1, 386 | # num_rounds = 1700, 387 | # seed = 0, 388 | # )) 389 | clfs.append(xgboostClassifier( 390 | objective = 'multi:softprob', 391 | eval_metric = 'mlogloss', 392 | num_class = 3, 393 | nthread = 9, 394 | eta = 0.02, 395 | max_depth = 6, 396 | subsample = 0.8, 397 | colsample_bytree = 1.0, 398 | colsample_bylevel = 0.7, 399 | min_child_weight=1, 400 | silent = 1, 401 | num_rounds = 1500, 402 | seed = 0, 403 | )) 404 | 
clfs.append(xgboostClassifier( 405 | objective = 'multi:softprob', 406 | eval_metric = 'mlogloss', 407 | num_class = 3, 408 | nthread = 9, 409 | eta = 0.02, 410 | max_depth = 6, 411 | subsample = 0.8, 412 | colsample_bytree = 1.0, 413 | colsample_bylevel = 0.8, 414 | min_child_weight=1, 415 | silent = 1, 416 | num_rounds = 1500, 417 | seed = 128, 418 | )) 419 | clfs.append(xgboostClassifier( 420 | objective = 'multi:softprob', 421 | eval_metric = 'mlogloss', 422 | num_class = 3, 423 | nthread = 9, 424 | eta = 0.02, 425 | max_depth = 6, 426 | subsample = 0.8, 427 | colsample_bytree = 1.0, 428 | colsample_bylevel = 0.8, 429 | min_child_weight=1, 430 | silent = 1, 431 | num_rounds = 1500, 432 | seed = 512, 433 | )) 434 | clfs.append(xgboostClassifier( 435 | objective = 'multi:softprob', 436 | eval_metric = 'mlogloss', 437 | num_class = 3, 438 | nthread = 9, 439 | eta = 0.02, 440 | max_depth = 6, 441 | subsample = 0.8, 442 | colsample_bytree = 1.0, 443 | colsample_bylevel = 0.8, 444 | min_child_weight=1, 445 | silent = 1, 446 | num_rounds = 1500, 447 | seed = 1024, 448 | )) 449 | clfs.append(xgboostClassifier( 450 | objective = 'multi:softprob', 451 | eval_metric = 'mlogloss', 452 | num_class = 3, 453 | nthread = 9, 454 | eta = 0.02, 455 | max_depth = 6, 456 | subsample = 0.8, 457 | colsample_bytree = 1.0, 458 | colsample_bylevel = 0.8, 459 | min_child_weight=1, 460 | silent = 1, 461 | num_rounds = 1500, 462 | seed = 2048, 463 | )) 464 | if len(sys.argv) == 1: 465 | cv_scores = validate(clfs) 466 | write2file(cv_scores) 467 | elif len(sys.argv) == 2: 468 | if sys.argv[1] == '-v': 469 | cv_scores = validate(clfs) 470 | write2file(cv_scores) 471 | elif sys.argv[1] == '-g': 472 | gen_sub() 473 | elif sys.argv[1] == '-s': 474 | search() 475 | elif sys.argv[1] == '-ga': 476 | genAvgSub(clfs) 477 | elif sys.argv[1] == '-stack': 478 | stacking(clfs) 479 | elif sys.argv[1] == '-v3': 480 | cv_scores = validate(clfs) 481 | val_desc = sys.argv[2] 482 | write2file(cv_scores, val_desc) 483 | elif len(sys.argv) == 3: 484 | if sys.argv[1] == '-v': 485 | cv_scores = validate(clfs) 486 | val_desc = sys.argv[2] 487 | write2file(cv_scores, val_desc) 488 | elif sys.argv[1] == '-g': 489 | gen_sub() 490 | elif sys.argv[1] == '-v3': 491 | cv_scores = validation_score() 492 | val_desc = sys.argv[2] 493 | write2file(cv_scores, val_desc) 494 | 495 | 496 | 497 | 498 | 499 | 500 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*- encoding: utf-8 -*- 3 | import sys 4 | import random 5 | import operator 6 | import datetime 7 | import time 8 | from collections import defaultdict, Counter 9 | import pandas as pd 10 | import numpy as np 11 | from scipy import sparse 12 | import xgboost as xgb 13 | from sklearn import preprocessing 14 | from sklearn.model_selection import train_test_split, GridSearchCV, KFold 15 | from sklearn.metrics import log_loss 16 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 17 | from sklearn.cluster import KMeans 18 | from sklearn.cross_validation import StratifiedKFold 19 | from sklearn.preprocessing import StandardScaler 20 | from nltk.metrics import distance as distance 21 | 22 | 23 | FEATURE_NOT_USE = ['created','description','features','photos', 'index']# ,'bathrooms', 'bedrooms''listing_id', 24 | FEATURE_NOT_USE.append('display_address') 25 | FEATURE_NOT_USE.extend(['low_build_frac', 'high_build_frac', 
'medium_build_frac', 'build_count'])# 26 | FEATURE_NOT_USE.extend(['low_manager_frac', 'high_manager_frac', 'medium_manager_frac','manager_count'])# 27 | FEATURE_NOT_USE.extend(['Listing_Id', 'img_created']) # , 'time_stamp' 28 | 29 | def bedroomProcess(data, train_idx, test_idx): 30 | # Some basic feature from bedrooms 31 | data["no_bedroom"] = data["bedrooms"].apply(lambda x: 1 if x == 0 else 0) 32 | data["more_than_5_bedroom"] = data["bedrooms"].apply(lambda x: 1 if x >= 5 else 0) 33 | data.loc[data["bedrooms"] + data["bathrooms"] == 0, "bedrooms"] = 0.001 34 | train = data.iloc[train_idx, :].copy() 35 | test = data.iloc[test_idx, :].copy() 36 | # remove null value (ugly code) 37 | train.loc[data["bedrooms"] == 0.001, "bathrooms"] = train["bathrooms"].mean() 38 | test.loc[data["bedrooms"] == 0.001, "bathrooms"] = test["bathrooms"].mean() 39 | data.iloc[train_idx, :] = train 40 | data.iloc[test_idx, :] = test 41 | data["bedroom_per_room"] = data["bedrooms"] / (data["bedrooms"] + data["bathrooms"]) 42 | data.loc[data["bedrooms"] == 0.001, "bathrooms"] = 0 43 | data.loc[data["bedrooms"] == 0.001, "bedrooms"] = 0 44 | return data 45 | 46 | 47 | def bathroomProcess(data, train_idx, test_idx): 48 | # Some basic feature from bathrooms 49 | data.loc[data["bathrooms"] == 112, "bathrooms"] = 1.5 50 | data.loc[data["bathrooms"] == 10, "bathrooms"] = 1 51 | data.loc[data["bathrooms"] == 20, "bathrooms"] = 2 52 | data["1_to_2_bathrooms"] = data["bathrooms"].apply(lambda x : 1if x != 0 and x <= 2 else 0) 53 | data.loc[data["bedrooms"] + data["bathrooms"] == 0, "bathrooms"] = 0.001 54 | train = data.iloc[train_idx, :].copy() 55 | test = data.iloc[test_idx, :].copy() 56 | # remove null value (ugly code) 57 | train.loc[data["bathrooms"] == 0.001, "bedrooms"] = train["bedrooms"].mean() 58 | test.loc[data["bathrooms"] == 0.001, "bedrooms"] = test["bedrooms"].mean() 59 | data.iloc[train_idx, :] = train 60 | data.iloc[test_idx, :] = test 61 | data["bathoom_per_room"] = data["bathrooms"] / (data["bedrooms"] + data["bathrooms"]) 62 | data.loc[data["bathrooms"] == 0.001, "bedrooms"] = 0 63 | data.loc[data["bathrooms"] == 0.001, "bathrooms"] = 0 64 | return data 65 | 66 | 67 | def buildingIdProcess(data, y, train_idx, test_idx): 68 | # Have tried some ideas but failed 69 | return data 70 | 71 | 72 | def createdProcess(data): 73 | # Some basic features from created 74 | data["created"] = pd.to_datetime(data['created']) 75 | data["latest"] = (data["created"]- data["created"].min()) 76 | data["latest"] = data["latest"].apply(lambda x: x.total_seconds()) 77 | data["passed"] = (data["created"].max()- data["created"]) 78 | data["passed"] = data["passed"].apply(lambda x: x.total_seconds()) 79 | # year is weird 80 | data["year"] = data["created"].dt.year 81 | data['month'] = data['created'].dt.month 82 | data['day'] = data['created'].dt.day 83 | data['hour'] = data['created'].dt.hour 84 | data['weekday'] = data['created'].dt.weekday 85 | data['week'] = data['created'].dt.week 86 | data['quarter'] = data['created'].dt.quarter 87 | data['weekend'] = ((data['weekday'] == 5) & (data['weekday'] == 6)) 88 | data['weekend'] = data['weekend'].apply(int) 89 | # data["created_stamp"] = data["created"].apply(lambda x: time.mktime(x.timetuple())) 90 | #* 91 | data["latest_list_rank"] = data["latest"] / data["listing_id"] 92 | # data["diff_rank_2"] = data["passed"] / data["listing_id"] 93 | #* 94 | 95 | # image time after leak 96 | data.loc[data["time_stamp"] > 1490000000, "time_stamp"] = 1478524550 97 | data["img_created"] = 
data["time_stamp"].apply(lambda x: datetime.datetime.fromtimestamp(x)) 98 | data["img_latest"] = (data["img_created"]- data["img_created"].min()) 99 | data["img_latest"] = data["img_latest"].apply(lambda x: x.total_seconds()) 100 | data["img_passed"] = (data["img_created"].max()- data["img_created"]) 101 | data["img_passed"] = data["img_passed"].apply(lambda x: x.total_seconds()) 102 | data["img_year"] = data["img_created"].dt.year 103 | data['img_month'] = data['img_created'].dt.month 104 | data['img_day'] = data['img_created'].dt.day 105 | data['img_hour'] = data['img_created'].dt.hour 106 | # data['img_weekday'] = data['img_created'].dt.weekday 107 | # data['img_week'] = data['img_created'].dt.week 108 | # data['img_quarter'] = data['img_created'].dt.quarter 109 | # data['img_weekend'] = ((data['img_weekday'] == 5) & (data['img_weekday'] == 6)) 110 | # data['img_weekend'] = data['img_weekend'].apply(int) 111 | data["img_latest_list_rank"] = data["img_latest"] / data["listing_id"] 112 | 113 | return data 114 | 115 | 116 | def descriptionProcess(data, train_idx, test_idx): 117 | data["description_words_num"] = data["description"].apply(lambda x: len(x.split(' '))) 118 | data["description_len"] = data["description"].apply(len) 119 | # Some info from descriptions 120 | desc_feats = { 121 | 'bedroom_mentions': ['br ', '---', "", "

"], 122 | 'html_tag_1':["", "

  • ", "
  • ", "", "-->", "2 else "nulldesc") 128 | # Tf-idf Encode 129 | tfidfdesc=TfidfVectorizer(min_df=20, max_features=50, strip_accents='unicode',lowercase =True, 130 | analyzer='word', token_pattern=r'\w{16,}', ngram_range=(1, 2), use_idf=False,smooth_idf=False, 131 | sublinear_tf=True, stop_words = 'english') 132 | tr_sparsed = tfidfdesc.fit_transform (data.iloc[train_idx, :]["description"]) 133 | te_sparsed = tfidfdesc.transform(data.iloc[test_idx, :]["description"]) 134 | feats_names = ["desc_" + x for x in tfidfdesc.get_feature_names()] 135 | return data, tr_sparsed, te_sparsed, feats_names 136 | 137 | 138 | def displayAddrProcess(data): 139 | # disp_price_dict = dict(data.groupby('display_address')['price'].mean()) 140 | # data["mean_disp_price"] = data.apply(lambda row: disp_price_dict[row["display_address"]], axis=1) 141 | # data["addr_sim"] = data.apply(lambda row: distance.edit_distance(row["display_address"].lower(), row["street_address"].lower()), axis=1) 142 | return data 143 | 144 | 145 | def featuresProcess(data, train_idx, test_idx): 146 | def afterRemoveStr(l, s): 147 | while s in l: 148 | l.remove(s) 149 | return l 150 | 151 | def afterRemoveFirstSpace(l): 152 | res = [] 153 | for s in l: 154 | res.append(s.strip()) 155 | return res 156 | 157 | data["features_num"] = data["features"].apply(len) 158 | mark = "#+-+#" 159 | data["features"] = data["features"].apply(lambda x: mark.join([i for i in x])) 160 | data["features"] = data["features"].apply(lambda x: x.lower()) 161 | 162 | # Deal with list like data 163 | data["features"] = data["features"].apply(lambda x: mark.join([i for i in x.split(" * ")])) 164 | data["features"] = data["features"].apply(lambda x: mark.join([i for i in x.split("**")])) 165 | data['features']=data['features'].str.replace("✓ hardwood floor ✓ high ceilings ✓ dishwasher", 166 | "hardwood floor" + mark + "high ceilings" + mark + "dishwasher") 167 | data['features']=data['features'].str.replace( 168 | "• on-site lifestyle concierge by luxury attaché " + 169 | "•24/7 doorman " + 170 | "• state of the art cardiovascular and weight training equipment " + 171 | "• 24-hour valet parking garage " + 172 | "• valet services including dry cleaning", 173 | "on-site lifestyle concierge by luxury attaché" + mark + 174 | "24/7 doorman" + mark + 175 | "state of the art cardiovascular and weight training equipment" + mark + 176 | "24-hour valet parking garage" + mark + 177 | "valet services including dry cleaning") 178 | data['features']=data['features'].str.replace( 179 | '{ 0 = "laundry in unit"; ' + 180 | '1 = "cats allowed"; '+ 181 | '10 = hardwood; '+ 182 | '11 = "high ceilings"; '+ 183 | '12 = renovated; '+ 184 | '13 = "marble bath"; '+ 185 | '14 = "granite kitchen"; '+ 186 | '15 = light; '+ 187 | '16 = "no fee"; '+ 188 | '17 = "walk-in closet"; '+ 189 | '2 = "dogs allowed"; '+ 190 | '3 = elevator; '+ 191 | '4 = exclusive; '+ 192 | '6 = laundry; '+ 193 | '7 = subway; '+ 194 | '8 = dishwasher; '+ 195 | '9 = washer; }', 196 | "laundry in unit" + mark + "cats allowed" + mark + "hardwood" + 197 | "high ceilings" + mark + "renovated" + mark + "marble bath" + 198 | "granite kitchen" + mark + "light" + mark + "no fee" + 199 | "walk-in closet" + mark + "dogs allowed" + mark + "elevator" + 200 | "exclusive" + mark + "laundry" + mark + "subway"+ 201 | "dishwasher" + mark + "washer") 202 | data['features']=data['features'].str.replace("windowed air-conditioned and monitored laundry room", 203 | "windowed air-conditioned" + mark + "monitored laundry room") 204 | 
data['features']=data['features'].str.replace("wall of windows. huge bedrooms", 205 | "wall of windows" + mark + "huge bedrooms") 206 | data['features']=data['features'].str.replace("to relax and recharge. this spacious 3 bedroom/2 bath residence also features oak hardwood flooring", 207 | "spacious" + mark + "3 bedroom" + mark + "2 bath" + mark + "residence" + mark + "oak hardwood flooring") 208 | data['features']=data['features'].str.replace("stunning 3 bedroom apartment with a terrace! east harlem! the best deal out now! get it now!!!!", 209 | "stunning" + mark + "3 bedroom" + mark + "a terrace" + mark + "east harlem" + mark + "the best deal out now! get it now!!!!") 210 | data['features']=data['features'].str.replace("ss appliances - d/w - m/w - recessed lighting - hardwood floors - high ceilings - marble bath", 211 | "ss appliances - d/w - m/w - " + mark + "recessed lighting" + mark + "hardwood floors" + mark + "high ceilings" + mark + "marble bath") 212 | data['features']=data['features'].str.replace("spacious living room for any kind of entertainment. prime location in theater distric", 213 | "spacious living room for any kind of entertainment." + mark + "prime location in theater distric") 214 | data['features']=data['features'].str.replace("spacious living room + home office", 215 | "spacious living room" + mark + "home office") 216 | data['features']=data['features'].str.replace("spacious and sunny 1st floor apartment "+ 217 | "overlooking the garden " + 218 | "*great williamsburg location* "+ 219 | "steps from shopping and cafes "+ 220 | "and 5 minute walk to graham avenue l train (3rd stop from manhattan) "+ 221 | "*shared back yard * "+ 222 | "large box style rooms * "+ 223 | "huge living room with high ceilings * "+ 224 | "nice bathroom with granite floor & ceramic tile * "+ 225 | "beautiful kitchen with granite counter tops lots of closet spacehardwood floors *"+ 226 | " heat included in the rent "+ 227 | "clean quiet building "+ 228 | "cat ok "+ 229 | "great location close to shopping", 230 | "spacious"+ mark +"sunny 1st floor"+ mark+ 231 | "overlooking the garden" + mark+ 232 | "great williamsburg location"+ mark+ 233 | "steps from shopping and cafes"+ mark+ 234 | "5 minute walk to graham avenue"+ mark +"train (3rd stop from manhattan)"+ mark+ 235 | "shared back yard"+mark+ 236 | "large box style rooms"+mark+ 237 | "huge living room " + mark + "high ceilings"+ mark+ 238 | "nice bathroom" + mark +"granite floor" + mark +"ceramic tile * "+mark+ 239 | "beautiful kitchen" + mark +"granite counter tops" + mark +"closet " + mark +"spacehardwood floors"+mark+ 240 | "heat included in the rent"+mark+ 241 | "clean quiet building"+mark+ 242 | "cat ok"+mark+ 243 | "close to shopping") 244 | data['features']=data['features'].str.replace("residents-only " + 245 | "fitness center " + 246 | "and aerobic room " + 247 | "professionally outfitted with a full complement of strength and cardio-training equipment", 248 | "residents-only"+ mark +"itness center"+ mark+ 249 | "and aerobic room" + mark+ 250 | "cardio-training equipment") 251 | data['features']=data['features'].str.replace("owner occupied - " + 252 | "3 family townhouse - " + 253 | "no realtor fees -"+ 254 | " this beautiful apt is offered below market rate", 255 | "owner occupied"+ mark +"3 family townhouse"+ mark+ 256 | "no realtor fees" + mark+ 257 | "this beautiful apt is offered below market rate") 258 | data['features']=data['features'].str.replace("newly renovated "+ 259 | "w/ oak wood floors "+ 260 | "mid century modern 
style interior "+ 261 | "large closets in every bedroom "+ 262 | "extra storage space in hall. "+ 263 | "large living room", 264 | "newly renovated"+ mark +"oak wood floors"+ mark+ 265 | "mid century modern style interior" + mark+ 266 | "large closets in every bedroom" + mark+ 267 | "extra storage space in hall"+ mark +"large living room") 268 | data['features']=data['features'].str.replace("live-in super package room "+ 269 | "smoke-free "+ 270 | "storage available "+ 271 | "virtual doorman "+ 272 | "guarantors accepted", 273 | 274 | "live-in super package room"+ mark +"smoke-free"+ mark+ 275 | "storage available" + mark+ 276 | "virtual doorman" + mark+ 277 | "guarantors accepted") 278 | data['features']=data['features'].str.replace("live-in super package room "+ 279 | "smoke-free "+ 280 | "storage available "+ 281 | "virtual doorman "+ 282 | "guarantors accepted", 283 | 284 | "live-in super package room"+ mark +"smoke-free"+ mark+ 285 | "storage available" + mark+ 286 | "virtual doorman" + mark+ 287 | "guarantors accepted") 288 | 289 | # Merging some features 290 | data['features']=data['features'].str.replace("washer/dyer combo","washer/dyer") 291 | data['features']=data['features'].str.replace("washer/dryer inside the unit","washer/dyer") 292 | data['features']=data['features'].str.replace("washer/dryer in-unit","washer/dyer") 293 | data['features']=data['features'].str.replace("washer/dryer in unit","washer/dyer") 294 | data['features']=data['features'].str.replace("washer/dryer in building","washer/dyer") 295 | data['features']=data['features'].str.replace("washer/dryer in bldg","washer/dyer") 296 | data['features']=data['features'].str.replace("washer/dryer hookup","washer/dyer") 297 | data['features']=data['features'].str.replace("washer/dryer stove/oven","washer/dyer") 298 | data['features']=data['features'].str.replace("washer/drier hookups","washer/dyer") 299 | data['features']=data['features'].str.replace("washer/ dryer in unit","washer/dyer") 300 | data['features']=data['features'].str.replace("washer/ dryer hookups","washer/dyer") 301 | data['features']=data['features'].str.replace("washer-dryer in unit","washer/dyer") 302 | data['features']=data['features'].str.replace("washer-dryer hookups","washer/dyer") 303 | data['features']=data['features'].str.replace("washer in unit","washer/dyer") 304 | data['features']=data['features'].str.replace("washer dryer in unit","washer/dyer") 305 | data['features']=data['features'].str.replace("washer dryer hookup","washer/dyer") 306 | data['features']=data['features'].str.replace("washer dryer hook up","washer/dyer") 307 | data['features']=data['features'].str.replace("washer and dryer in unit","washer/dyer") 308 | data['features']=data['features'].str.replace("washer and dryer in the unit","washer/dyer") 309 | data['features']=data['features'].str.replace("washer and dryer","washer/dyer") 310 | data['features']=data['features'].str.replace("washer / dryer in unit","washer/dyer") 311 | data['features']=data['features'].str.replace("washer / dryer (hookup only)","washer/dyer") 312 | data['features']=data['features'].str.replace("washer / dryer","washer/dyer") 313 | data['features']=data['features'].str.replace("washer & dryer.","washer/dyer") 314 | data['features']=data['features'].str.replace("washer","washer/dyer") 315 | data['features']=data['features'].str.replace("wash/dryer","washer/dyer") 316 | 317 | 318 | data['features']=data['features'].str.replace("pets: cats/small dogs","pet-friendly") 319 | 
data['features']=data['features'].str.replace("pets welcome","pet-friendly") 320 | data['features']=data['features'].str.replace("pets upon approval","pet-friendly") 321 | data['features']=data['features'].str.replace("pets on approval","pet-friendly") 322 | data['features']=data['features'].str.replace("pets ok.","pet-friendly") 323 | data['features']=data['features'].str.replace("pets ok","pet-friendly") 324 | data['features']=data['features'].str.replace("pets are welcome","pet-friendly") 325 | data['features']=data['features'].str.replace("pets allowed","pet-friendly") 326 | data['features']=data['features'].str.replace("pets accepted (on approval)","pet-friendly") 327 | data['features']=data['features'].str.replace("pets","pet-friendly") 328 | data['features']=data['features'].str.replace("pet grooming room","pet-friendly") 329 | data['features']=data['features'].str.replace("pet friendly building","pet-friendly") 330 | data['features']=data['features'].str.replace("pet friendly ( case by case )","pet-friendly") 331 | data['features']=data['features'].str.replace("pet friendly","pet-friendly") 332 | data['features']=data['features'].str.replace("pet friendly building","pet-friendly") 333 | data['features']=data['features'].str.replace("pet friendly building","pet-friendly") 334 | 335 | data['features']=data['features'].str.replace("garden/patio","garden") 336 | data['features']=data['features'].str.replace("patio","garden") 337 | data['features']=data['features'].str.replace("residents_garden","garden") 338 | data['features']=data['features'].str.replace("common garden","garden") 339 | 340 | data['features']=data['features'].str.replace("wifi access","wifi") 341 | data['features']=data['features'].str.replace("wifi included","wifi") 342 | data['features']=data['features'].str.replace("wifi in resident lounge","wifi") 343 | data['features']=data['features'].str.replace("wifi + utilities","wifi") 344 | data['features']=data['features'].str.replace("wi fi work lounge","wifi") 345 | data['features']=data['features'].str.replace("wi-fi access","wifi") 346 | 347 | data['features']=data['features'].str.replace("24/7","24") 348 | data['features']=data['features'].str.replace("24-hour","24") 349 | data['features']=data['features'].str.replace("24hr","24") 350 | data['features']=data['features'].str.replace("concierge","doorman") 351 | data['features']=data['features'].str.replace("ft doorman","doorman") 352 | data['features']=data['features'].str.replace("24 doorman","doorman") 353 | data['features']=data['features'].str.replace("24 hr doorman","doorman") 354 | data['features']=data['features'].str.replace("doorman service","doorman") 355 | data['features']=data['features'].str.replace("full-time doorman","doorman") 356 | 357 | data['features']=data['features'].str.replace("gym/fitness","fitness") 358 | data['features']=data['features'].str.replace("fitness room","fitness") 359 | 360 | data['features']=data['features'].str.replace("washer","laundry") 361 | data['features']=data['features'].str.replace("laundry in bldg","laundry") 362 | data['features']=data['features'].str.replace("laundry in building","laundry") 363 | data['features']=data['features'].str.replace("laundry in building/dryer","laundry") 364 | data['features']=data['features'].str.replace("laundry in building_&_dryer","laundry") 365 | data['features']=data['features'].str.replace("laundry room","laundry") 366 | data['features']=data['features'].str.replace("laundry & housekeeping","laundry") 367 | 
data['features']=data['features'].str.replace("laundry in unit","laundry") 368 | data['features']=data['features'].str.replace("laundry in-unit","laundry") 369 | data['features']=data['features'].str.replace("laundry on every floor","laundry") 370 | data['features']=data['features'].str.replace("laundry on floor","laundry") 371 | data['features']=data['features'].str.replace("in-unit laundry/dryer","laundry") 372 | data['features']=data['features'].str.replace("on-site laundry","laundry") 373 | data['features']=data['features'].str.replace("laundry/dryer","laundry") 374 | 375 | data['features']=data['features'].str.replace("high-speed internet","high_speed_internet") 376 | data['features']=data['features'].str.replace("high speed internet available","high_speed_internet") 377 | 378 | data['features']=data['features'].str.replace("parking available","parking") 379 | data['features']=data['features'].str.replace("parking space","parking") 380 | data['features']=data['features'].str.replace("on-site garage","parking") 381 | data['features']=data['features'].str.replace("on-site parking","parking") 382 | data['features']=data['features'].str.replace("on-site parking lot","parking") 383 | data['features']=data['features'].str.replace("full service garage","parking") 384 | data['features']=data['features'].str.replace("common parking/garage","parking") 385 | data['features']=data['features'].str.replace("garage","parking") 386 | data['features']=data['features'].str.replace("assigned-parking-space","private_parking") 387 | 388 | data['features']=data['features'].str.replace("storage available","storage") 389 | data['features']=data['features'].str.replace("storage facilities available","storage") 390 | data['features']=data['features'].str.replace("storage space","storage") 391 | data['features']=data['features'].str.replace("storage room","storage") 392 | data['features']=data['features'].str.replace("common storage","storage") 393 | 394 | data['features']=data['features'].str.replace("central a/c","central_air") 395 | data['features']=data['features'].str.replace("central ac","central_air") 396 | data['features']=data['features'].str.replace("air conditioning","central_air") 397 | 398 | data['features']=data['features'].str.replace("close to subway","subway") 399 | 400 | data['features']=data['features'].str.replace("roofdeck","roof-deck") 401 | data['features']=data['features'].str.replace("roof-deck","roof-deck") 402 | data['features']=data['features'].str.replace("rooftop terrace","roof-deck") 403 | data['features']=data['features'].str.replace("rooftop deck","roof-deck") 404 | data['features']=data['features'].str.replace("roof access","roof-deck") 405 | data['features']=data['features'].str.replace("common roof deck","roof-deck") 406 | data['features']=data['features'].str.replace("roof decks","roof-deck") 407 | data['features']=data['features'].str.replace("roof grilling area","roof-deck") 408 | data['features']=data['features'].str.replace("roof garden and lounge","roof-deck") 409 | data['features']=data['features'].str.replace("roof deck with stunning view","roof-deck") 410 | data['features']=data['features'].str.replace("roof deck with real grass","roof-deck") 411 | data['features']=data['features'].str.replace("roof deck with grills","roof-deck") 412 | data['features']=data['features'].str.replace("roof deck w/ grills","roof-deck") 413 | data['features']=data['features'].str.replace("roof deck / sun deck","roof-deck") 414 | data['features']=data['features'].str.replace("roof 
deck","roof-deck") 415 | 416 | data['features']=data['features'].str.replace("swimming pool","pool") 417 | data['features']=data['features'].str.replace("indoor pool","pool") 418 | 419 | data['features']=data['features'].str.replace("deco fireplace","fireplaces") 420 | data['features']=data['features'].str.replace("decorative fireplace","fireplaces") 421 | 422 | data['features']=data['features'].str.replace("yoga/pilates studio","yoga") 423 | data['features']=data['features'].str.replace("yoga studio","yoga") 424 | data['features']=data['features'].str.replace("yoga room","yoga") 425 | data['features']=data['features'].str.replace("yoga classes","yoga") 426 | data['features']=data['features'].str.replace("yoga and spin studios","yoga") 427 | data['features']=data['features'].str.replace("yoga an pilates class","yoga") 428 | data['features']=data['features'].str.replace("yoga / dance studio","yoga") 429 | 430 | 431 | # data["features"] = data["features"].apply(lambda x: afterRemoveStr(x, '')) 432 | # data["features"] = data["features"].apply(lambda x: afterRemoveFirstSpace(x)) 433 | data["features"] = data["features"].apply(lambda x: x.split(mark)) 434 | data["features"] = data["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x])) 435 | tfidf = CountVectorizer(stop_words="english", max_features=200) 436 | tr_sparse_feats = tfidf.fit_transform(data.iloc[train_idx, :]["features"]) 437 | te_sparse_feats = tfidf.transform(data.iloc[test_idx, :]["features"]) 438 | feats_names = ["features_" + x for x in tfidf.get_feature_names()] 439 | return data, tr_sparse_feats, te_sparse_feats, feats_names 440 | 441 | 442 | def locationProcess(data, train_idx, test_idx): 443 | # Clustering 444 | 445 | # train_x = data.iloc[train_idx,:][['new_latitude', 'new_longitude']] 446 | # stest_x = data.iloc[test_idx,:][['new_latitude', 'new_longitude']] 447 | train_x = data.iloc[train_idx, :][['latitude', 'longitude']] 448 | test_x = data.iloc[test_idx, :][['latitude', 'longitude']] 449 | kmeans_cluster = KMeans(n_clusters=20) 450 | res = kmeans_cluster.fit(train_x) 451 | res = kmeans_cluster.predict(pd.concat([train_x, test_x])) 452 | d = dict(zip(data['listing_id'], res)) 453 | data['cenroid'] = data['listing_id'].apply(lambda x: d[x]) 454 | # Manhattan distance 455 | center = [data.iloc[train_idx, :]['latitude'].mean(), data.iloc[train_idx, :]['longitude'].mean()] 456 | data['distance'] = abs(data['latitude'] - center[0]) + abs(data['longitude'] - center[1]) 457 | # data['distance_2'] = np.sqrt((data['latitude'] - center[0]) ** 2 + (data['longitude'] - center[1]) ** 2) 458 | 459 | return data 460 | 461 | 462 | def managerIdProcess(data, y, train_idx, test_idx): 463 | manager_lgt_dict = dict(data.groupby('manager_id')['longitude'].mean()) 464 | manager_ltt_dict = dict(data.groupby('manager_id')['latitude'].mean()) 465 | 466 | # Group manager_id with location info 467 | data["mean_man_longitude"] = data.apply(lambda row: manager_lgt_dict[row["manager_id"]], axis=1) 468 | data["mean_man_latitude"] = data.apply(lambda row: manager_ltt_dict[row["manager_id"]], axis=1) 469 | 470 | # Group manager_id with time info 471 | data = group_with_time_features(data, "manager_id") 472 | data = group_with_img_time_features(data, "manager_id") 473 | manager_stamp_dict = dict(data.groupby('manager_id')['time_stamp'].mean()) 474 | data["mean_man_timestamp"] = data.apply(lambda row: manager_stamp_dict[row["manager_id"]], axis=1) 475 | # manager_stamp_dict = dict(data.groupby('manager_id')['created_stamp'].mean()) 
476 | # data["mean_man_createdstamp"] = data.apply(lambda row: manager_stamp_dict[row["manager_id"]], axis=1) 477 | return data 478 | 479 | 480 | def photoProcess(data): 481 | data["photo_num"] = data["photos"].apply(len) 482 | return data 483 | 484 | 485 | def priceProcess(data): 486 | #data["out_price"] = data["price"].apply(lambda x: 1 if x < 700 or x > 15000 else 0) 487 | # Clean the outlier 488 | ulimit = 15000#np.percentile(data.price.values, 99) 489 | data.loc[data["price"] > ulimit, "price"] = ulimit 490 | dlimit = 350 491 | data.loc[data["price"] < dlimit, "price"] = dlimit 492 | data["price_per_room"] = data["price"] / (data["bedrooms"] + data["bathrooms"] + 1.0) 493 | data["price_per_bed"] = data["price"] / (data["bedrooms"] + 1.0) 494 | #* 495 | # data.loc[~np.isfinite(data["price_per_room"]), "price_per_room"] = 0 496 | # data.loc[~np.isfinite(data["price_per_bed"]), "price_per_bed"] = 0 497 | data["price_latitude"] = data["price"] / (data["latitude"] + 1.0) 498 | data["price_longitude"] = data["price"] / (data["longitude"] + 1.0) 499 | 500 | # Grouping price with size or build 501 | median_list = ['bedrooms', 'bathrooms', 'building_id'] 502 | # median_list = ['month', 'day', 'hour', 'weekday', 'quarter', 'week', 'passed', 'latest'] 503 | for col in median_list: 504 | median_price = data[[col, 'price']].groupby(col)['price'].median() 505 | median_price = median_price[data[col]].values.astype(float) 506 | data['median_' + col] = median_price 507 | data['ratio_' + col] = data['price'] / median_price 508 | data['median_' + col] = data['median_' + col].apply(lambda x: np.log(x)) 509 | # data["price"] = data["price"].apply(lambda x: np.log(x)) 510 | return data 511 | 512 | 513 | def streetAddrProcess(data): 514 | #data["new_addr"] = data["street_address"].apply(lambda x: ' '.join([x.split()[i] for i in range(1, len(x.split()))])) 515 | #data["new_addr"] = preprocessing.LabelEncoder().fit_transform(data["new_addr"]) 516 | # data["street_address"] = data["street_address"].apply(lambda x: x.replace('\u00a0', '').strip().lower) 517 | return data 518 | 519 | 520 | def listingIdProcess(data): 521 | # It's weird。 522 | data["listing_id"] = data["listing_id"] - 68119576.0 523 | return data 524 | 525 | 526 | def coreProcess(data, y_train, train_idx, test_idx): 527 | data = listingIdProcess(data) 528 | data = bedroomProcess(data, train_idx, test_idx) 529 | data = bathroomProcess(data, train_idx, test_idx) 530 | data["room_diff"] = data["bathrooms"] - data["bedrooms"] 531 | data["room_num"] = data["bedrooms"] + data["bathrooms"] 532 | data = createdProcess(data) 533 | data = buildingIdProcess(data, y_train, train_idx, test_idx) 534 | data, tr_sparsed, te_sparsed, feats_sparsed = descriptionProcess(data, train_idx, test_idx) 535 | data = displayAddrProcess(data) 536 | data, tr_sparse, te_sparse, feats_sparse = featuresProcess(data, train_idx, test_idx) 537 | data = locationProcess(data, train_idx, test_idx) 538 | data = managerIdProcess(data, y_train, train_idx, test_idx) 539 | data = photoProcess(data) 540 | data = priceProcess(data) 541 | data = streetAddrProcess(data) 542 | 543 | categorical = ["display_address", "manager_id", "building_id", "street_address"] 544 | for f in categorical: 545 | if data[f].dtype=='object': 546 | cases=defaultdict(int) 547 | temp=np.array(data[f]).tolist() 548 | for k in temp: 549 | cases[k]+=1 550 | # print(f, len(cases)) 551 | data[f] = data[f].apply(lambda x: cases[x]) 552 | 553 | feats_in_use = [col for col in data.columns if col not in FEATURE_NOT_USE] 554 
555 |     data_train = np.array(data.iloc[train_idx, :][feats_in_use])
556 |     data_test = np.array(data.iloc[test_idx, :][feats_in_use])
557 |     # Feature scaling: fit the scaler on the training rows only, then apply it to the test rows
558 |     stda = StandardScaler()
559 |     data_train = stda.fit_transform(data_train)
560 |     data_test = stda.transform(data_test)
561 |     # High-cardinality features
562 |     high_card_feats = ["building_id", "manager_id", "longitude", "room_diff"]
563 |     # C0 = [3, 12, 0, 4]
564 |     C0 = [feats_in_use.index(f) for f in high_card_feats]
565 |     W_train, W_cv = convert_to_avg(data_train, y_train, data_test, seed=1, cvals=5, roundings=2, columns=C0)
566 |     # Add the sparse features and the target-mean-encoded high-cardinality columns
567 |     data_train = sparse.hstack([data_train, tr_sparse, tr_sparsed, W_train[:, C0]]).tocsr()
568 |     data_test = sparse.hstack([data_test, te_sparse, te_sparsed, W_cv[:, C0]]).tocsr()
569 |     feats_in_use.extend(feats_sparse)
570 |     feats_in_use.extend(feats_sparsed)
571 |     feats_in_use.extend(["build_high_card", "manager_high_card", "longitude_high_card", "room_diff_high_card"])  # one name per column of W_train[:, C0]
572 |     # print(len(feats_in_use))
573 |     # print(tr_sparse.toarray().shape, tr_sparsed.toarray().shape, len(feats_in_use), data_train.shape)
574 |     return data_train, data_test, feats_in_use
575 | 
576 | 
577 | # Copied from KazAnova's starter code: out-of-fold target-mean ("likelihood") encoding
578 | def convert_dataset_to_avg(xc, yc, xt, rounding=2, cols=None):
579 |     xc = xc.tolist()
580 |     xt = xt.tolist()
581 |     yc = yc.tolist()
582 |     if cols is None:
583 |         cols = [k for k in range(0, len(xc[0]))]
584 |     woe = [[0.0 for k in range(0, len(cols))] for g in range(0, len(xt))]
585 |     good = []
586 |     bads = []
587 |     for col in cols:
588 |         dictsgoods = defaultdict(int)
589 |         dictsbads = defaultdict(int)
590 |         good.append(dictsgoods)
591 |         bads.append(dictsbads)
592 |     total_count = 0.0
593 |     total_sum = 0.0
594 | 
595 |     for a in range(0, len(xc)):
596 |         target = yc[a]
597 |         total_sum += target
598 |         total_count += 1.0
599 |         for j in range(0, len(cols)):
600 |             col = cols[j]
601 |             good[j][round(xc[a][col], rounding)] += target
602 |             bads[j][round(xc[a][col], rounding)] += 1.0
603 |     # print(total_goods, total_bads)
604 | 
605 |     for a in range(0, len(xt)):
606 |         for j in range(0, len(cols)):
607 |             col = cols[j]
608 |             if round(xt[a][col], rounding) in good[j]:
609 |                 woe[a][j] = float(good[j][round(xt[a][col], rounding)]) / float(bads[j][round(xt[a][col], rounding)])
610 |             else:
611 |                 woe[a][j] = round(total_sum / total_count)
612 |     return woe
613 | 
614 | 
615 | def convert_to_avg(X, y, Xt, seed=1, cvals=5, roundings=2, columns=None):
616 | 
617 |     if columns is None:
618 |         columns = [k for k in range(0, (X.shape[1]))]
619 |         # print("it is not!!")
620 |     X = X.tolist()
621 |     Xt = Xt.tolist()
622 |     woetrain = [[0.0 for k in range(0, len(X[0]))] for g in range(0, len(X))]
623 |     woetest = [[0.0 for k in range(0, len(X[0]))] for g in range(0, len(Xt))]
624 | 
625 |     kfolder = StratifiedKFold(y, n_folds=cvals, shuffle=True, random_state=seed)  # old sklearn.cross_validation API
626 |     for train_index, test_index in kfolder:
627 |         # training / validation split for this fold
628 |         X_train, X_cv = np.array(X)[train_index], np.array(X)[test_index]
629 |         y_train = np.array(y)[train_index]
630 | 
631 |         woecv = convert_dataset_to_avg(X_train, y_train, X_cv, rounding=roundings, cols=columns)
632 |         X_cv = X_cv.tolist()
633 |         no = 0
634 |         for real_index in test_index:
635 |             for j in range(0, len(X_cv[0])):
636 |                 woetrain[real_index][j] = X_cv[no][j]
637 |             no += 1
638 |         no = 0
639 |         for real_index in test_index:
640 |             for j in range(0, len(columns)):
641 |                 col = columns[j]
642 |                 woetrain[real_index][col] = woecv[no][j]
643 |             no += 1
644 |     woefinal = convert_dataset_to_avg(np.array(X), np.array(y), np.array(Xt), rounding=roundings, cols=columns)
645 | 
646 |     for real_index in range(0, len(Xt)):
647 |         for j in range(0, len(Xt[0])):
648 |             woetest[real_index][j] = Xt[real_index][j]
649 | 
650 |     for real_index in range(0, len(Xt)):
651 |         for j in range(0, len(columns)):
652 |             col = columns[j]
653 |             woetest[real_index][col] = woefinal[real_index][j]
654 | 
655 |     return np.array(woetrain), np.array(woetest)
656 | 
657 | 
658 | # Grouping (very important)
659 | def group_with_time_features(data, g_feat):
660 |     mean_month_dict = dict(data.groupby(g_feat)['month'].mean())
661 |     data["mean_" + g_feat + "_month"] = data.apply(lambda row: mean_month_dict[row[g_feat]], axis=1)
662 |     mean_day_dict = dict(data.groupby(g_feat)['day'].mean())
663 |     data["mean_" + g_feat + "_day"] = data.apply(lambda row: mean_day_dict[row[g_feat]], axis=1)
664 |     mean_hour_dict = dict(data.groupby(g_feat)['hour'].mean())
665 |     data["mean_" + g_feat + "_hour"] = data.apply(lambda row: mean_hour_dict[row[g_feat]], axis=1)
666 |     mean_weekday_dict = dict(data.groupby(g_feat)['weekday'].mean())
667 |     data["mean_" + g_feat + "_weekday"] = data.apply(lambda row: mean_weekday_dict[row[g_feat]], axis=1)
668 |     mean_quarter_dict = dict(data.groupby(g_feat)['quarter'].mean())
669 |     data["mean_" + g_feat + "_quater"] = data.apply(lambda row: mean_quarter_dict[row[g_feat]], axis=1)
670 |     mean_week_dict = dict(data.groupby(g_feat)['week'].mean())
671 |     data["mean_" + g_feat + "_week"] = data.apply(lambda row: mean_week_dict[row[g_feat]], axis=1)
672 |     mean_passed_dict = dict(data.groupby(g_feat)['passed'].mean())
673 |     data["mean_" + g_feat + "_passed"] = data.apply(lambda row: mean_passed_dict[row[g_feat]], axis=1)
674 |     mean_latest_dict = dict(data.groupby(g_feat)['latest'].mean())
675 |     data["mean_" + g_feat + "_latest"] = data.apply(lambda row: mean_latest_dict[row[g_feat]], axis=1)
676 | 
677 |     return data
678 | 
679 | 
680 | def group_with_img_time_features(data, g_feat):
681 |     mean_month_dict = dict(data.groupby(g_feat)['img_month'].mean())
682 |     data["mean_" + g_feat + "_img_month"] = data.apply(lambda row: mean_month_dict[row[g_feat]], axis=1)
683 |     mean_day_dict = dict(data.groupby(g_feat)['img_day'].mean())
684 |     data["mean_" + g_feat + "_img_day"] = data.apply(lambda row: mean_day_dict[row[g_feat]], axis=1)
685 |     mean_hour_dict = dict(data.groupby(g_feat)['img_hour'].mean())
686 |     data["mean_" + g_feat + "_img_hour"] = data.apply(lambda row: mean_hour_dict[row[g_feat]], axis=1)
687 |     # mean_weekday_dict = dict(data.groupby(g_feat)['img_weekday'].mean())
688 |     # data["mean_" + g_feat + "_img_weekday"] = data.apply(lambda row: mean_weekday_dict[row[g_feat]], axis=1)
689 |     # mean_quarter_dict = dict(data.groupby(g_feat)['img_quarter'].mean())
690 |     # data["mean_" + g_feat + "_img_quater"] = data.apply(lambda row: mean_quarter_dict[row[g_feat]], axis=1)
691 |     # mean_week_dict = dict(data.groupby(g_feat)['img_week'].mean())
692 |     # data["mean_" + g_feat + "_img_week"] = data.apply(lambda row: mean_week_dict[row[g_feat]], axis=1)
693 |     mean_passed_dict = dict(data.groupby(g_feat)['img_passed'].mean())
694 |     data["mean_" + g_feat + "_img_passed"] = data.apply(lambda row: mean_passed_dict[row[g_feat]], axis=1)
695 |     mean_latest_dict = dict(data.groupby(g_feat)['img_latest'].mean())
696 |     data["mean_" + g_feat + "_img_latest"] = data.apply(lambda row: mean_latest_dict[row[g_feat]], axis=1)
697 |     return data
698 | 
699 | 
700 | 
701 | 
702 | 
703 | 
704 | 
--------------------------------------------------------------------------------
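
A minimal usage sketch for the out-of-fold target-mean encoding above (convert_to_avg). The toy arrays, the single encoded column index [1], the 3-fold setting, and the import line are illustrative assumptions only; coreProcess calls the same function with the C0 indices of its high-cardinality columns. The function relies on the old StratifiedKFold(y, n_folds=...) interface from sklearn.cross_validation, so the sketch assumes a matching scikit-learn version.

import numpy as np
# from preprocess import convert_to_avg  # assumed import path

# Toy data: column 1 plays the role of a high-cardinality feature (values 10.0 / 20.0).
X = np.array([[0.11, 10.0],
              [0.25, 10.0],
              [0.37, 20.0],
              [0.41, 20.0],
              [0.52, 10.0],
              [0.66, 20.0]])
y = np.array([0, 1, 1, 0, 1, 0])
Xt = np.array([[0.70, 10.0],
               [0.83, 30.0]])  # 30.0 never occurs in X, so it falls back to the rounded global target mean

# Column 1 is replaced by out-of-fold target means on the training side and by
# full-data target means on the test side; the other columns pass through unchanged.
W_train, W_test = convert_to_avg(X, y, Xt, seed=1, cvals=3, roundings=2, columns=[1])
print(W_train[:, 1])
print(W_test[:, 1])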