├── .gitignore ├── README.md ├── embed_engineering.py ├── lgb_models.py ├── logconfig.py ├── main.py ├── optimize.py ├── resources └── stop_words.txt ├── stacking.py ├── stat_engineering.py ├── utils.py ├── w2v.py └── w2v_engineering.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Flask stuff: 56 | instance/ 57 | .webassets-cache 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # Jupyter Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # SageMath parsed files 78 | *.sage.py 79 | 80 | # Environments 81 | .env 82 | .venv 83 | env/ 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | .spyproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # jetbrains IDEA Project files 95 | .idea/ 96 | 97 | # VS Code 98 | .vscode/ 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # reference 107 | reference/ 108 | 109 | # data 110 | *.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TianChi_OGeek 2 | 3 | - 数据问题:train部分数据第1815102行漏了个引号,手动删除或不上引号即可 4 | 5 | 6 | -------------------------------------------------------------------------------- /embed_engineering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from logconfig import config_logging 5 | from utils import char_cleaner, char_list_cheaner 6 | import logging 7 | import warnings 8 | 9 | config_logging() 10 | logger = logging.getLogger('embed_features') 11 | 12 | warnings.filterwarnings('ignore') 13 | 14 | -------------------------------------------------------------------------------- /lgb_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import os 6 | import warnings 7 | 8 | import lightgbm as lgb 9 | import numpy as np 10 | import pandas as pd 11 | from scipy import sparse 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.metrics import f1_score 14 | from sklearn.metrics import log_loss 15 | from sklearn.model_selection import StratifiedKFold 16 | from sklearn.preprocessing import MinMaxScaler 17 | from sklearn.preprocessing import OneHotEncoder 18 
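# NOTE: the README above flags that one line of the raw train file is missing a
# quotation mark. A minimal, hedged workaround is to read the raw TSV with quote
# handling disabled so a stray or missing quote cannot break parsing. The helper
# below is only an illustrative sketch; the path and column list are assumptions
# that mirror the _get_data() loaders elsewhere in this repo.
def _read_raw_oppo_txt(path, has_label=True):
    """Read one raw OPPO round-1 file without quote interpretation."""
    import csv

    import pandas as pd

    columns = ['prefix', 'query_prediction', 'title', 'tag']
    if has_label:
        columns = columns + ['label']
    return pd.read_csv(path, sep="\t", names=columns, header=None,
                       encoding="utf-8", quoting=csv.QUOTE_NONE)
# example (hypothetical path): _read_raw_oppo_txt("data/RawData/oppo_round1_train.txt")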
| 19 | from logconfig import config_logging 20 | 21 | warnings.filterwarnings('ignore') 22 | 23 | config_logging() 24 | logger = logging.getLogger('models') 25 | 26 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 27 | ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData") 28 | 29 | 30 | def get_data(name): 31 | etl_path = os.path.join("data", "EtlData") 32 | 33 | if name == "train": 34 | file_name = "train.csv" 35 | elif name == "validate": 36 | file_name = "validate.csv" 37 | elif name == "test": 38 | file_name = "test.csv" 39 | else: 40 | raise FileNotFoundError() 41 | 42 | data_name = os.path.join(etl_path, file_name) 43 | 44 | df = pd.read_csv(data_name, header=0) 45 | 46 | one_hot_columns = ['tag', 'prefix_kmeans', 'title_kmeans', 'complete_prefix_kmeans'] 47 | df = pd.get_dummies(df, columns=one_hot_columns) 48 | 49 | return df 50 | 51 | 52 | def combine(): 53 | names = ['train', 'test', 'validate'] 54 | for name in names: 55 | stat_name = os.path.join(ETL_DATA_PATH, '{}_stat.csv'.format(name)) 56 | stat_df = pd.read_csv(stat_name) 57 | 58 | w2v_name = os.path.join(ETL_DATA_PATH, '{}_w2v.csv'.format(name)) 59 | w2v_df = pd.read_csv(w2v_name) 60 | 61 | df = pd.concat([stat_df, w2v_df], axis=1) 62 | 63 | df_name = os.path.join(ETL_DATA_PATH, '{}.csv'.format(name)) 64 | df.to_csv(df_name, index=False) 65 | 66 | 67 | def lgb_model(train_data, validate_data, test_data, parms, threshold, n_folds=5): 68 | columns = train_data.columns 69 | remove_columns = ["label"] 70 | features_columns = [column for column in columns if column not in remove_columns] 71 | 72 | train_data = pd.concat([train_data, validate_data], axis=0, ignore_index=True, sort=False) 73 | train_features = train_data[features_columns] 74 | train_labels = train_data["label"] 75 | 76 | validate_labels = validate_data["label"] 77 | 78 | test_data = pd.concat([validate_data, test_data], axis=0, ignore_index=True, sort=False) 79 | validate_data_length = validate_data.shape[0] 80 | test_features = test_data[features_columns] 81 | 82 | kfolder = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2018) 83 | kfold = kfolder.split(train_features, train_labels) 84 | 85 | preds_list = list() 86 | for train_index, test_index in kfold: 87 | k_x_train = train_features.loc[train_index] 88 | k_y_train = train_labels.loc[train_index] 89 | k_x_test = train_features.loc[test_index] 90 | k_y_test = train_labels.loc[test_index] 91 | 92 | gbm = lgb.LGBMClassifier(**parms) 93 | gbm = gbm.fit(k_x_train, k_y_train, 94 | eval_metric="logloss", 95 | eval_set=[(k_x_train, k_y_train), 96 | (k_x_test, k_y_test)], 97 | eval_names=["train", "valid"], 98 | early_stopping_rounds=100, 99 | verbose=True) 100 | 101 | preds = gbm.predict_proba(test_features, num_iteration=gbm.best_iteration_)[:, 1] 102 | 103 | preds_list.append(preds) 104 | 105 | length = len(preds_list) 106 | preds_columns = ["preds_{id}".format(id=i) for i in range(length)] 107 | 108 | preds_df = pd.DataFrame(data=preds_list) 109 | preds_df = preds_df.T 110 | preds_df.columns = preds_columns 111 | preds_df["mean"] = preds_df.mean(axis=1) 112 | 113 | preds_df["mean"] = preds_df["mean"].apply(lambda item: 1 if item >= threshold else 0) 114 | 115 | validate_preds = preds_df[:validate_data_length] 116 | test_preds = preds_df[validate_data_length:] 117 | 118 | logger.info('the avg of test is {}'.format(np.mean(test_preds["mean"]))) 119 | 120 | f_score = f1_score(validate_labels, validate_preds["mean"]) 121 | logger.info('validate f_score is {}'.format(f_score)) 122 | 
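# NOTE: a hedged sketch, not used by the code in this function: since the final
# metric is F1 and `threshold` is fixed by the caller (0.4 in model_main below),
# one could instead sweep cut-offs on the fold-averaged validate probabilities
# and keep the best one. Names such as raw_validate_probs are illustrative only.
def _best_f1_threshold(y_true, probs, lo=0.2, hi=0.6, step=0.01):
    import numpy as np
    from sklearn.metrics import f1_score

    candidates = np.arange(lo, hi, step)
    scores = [f1_score(y_true, (probs >= t).astype(int)) for t in candidates]
    return candidates[int(np.argmax(scores))]
# usage sketch: best_t = _best_f1_threshold(validate_labels, raw_validate_probs)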
logger.info('validate the avg of validate is {}'.format(np.mean(validate_preds["mean"]))) 123 | 124 | predictions = pd.DataFrame({"predicted_score": test_preds["mean"]}) 125 | 126 | predictions.to_csv("predict.csv", index=False, header=False) 127 | 128 | 129 | def lgb_lr_model(train_data, validate_data, test_data, threshold, n_folds=5): 130 | columns = train_data.columns 131 | remove_columns = ["label"] 132 | features_columns = [column for column in columns if column not in remove_columns] 133 | 134 | validate_data_length = validate_data.shape[0] 135 | 136 | train_data = pd.concat([train_data, validate_data], axis=0, ignore_index=True, sort=False) 137 | train_features = train_data[features_columns] 138 | train_labels = train_data["label"] 139 | 140 | validate_labels = validate_data["label"] 141 | 142 | test_data = pd.concat([validate_data, test_data], axis=0, ignore_index=True, sort=False) 143 | test_features = test_data[features_columns] 144 | 145 | gbm = lgb.LGBMClassifier(boosting_type='gbdt', 146 | num_leaves=127, 147 | reg_alpha=3, 148 | reg_lambda=5, 149 | max_depth=-1, 150 | n_estimators=80, 151 | objective='binary', 152 | subsample=0.8, 153 | colsample_bytree=0.8, 154 | subsample_freq=1, 155 | min_child_weight=0.1, 156 | learning_rate=0.1, 157 | random_state=2018, 158 | n_jobs=-1, 159 | min_child_samples=200) 160 | 161 | gbm.fit(train_features, train_labels, eval_metric='binary_logloss', early_stopping_rounds=100) 162 | 163 | lgb_train_leaf = gbm.predict(train_features, pred_leaf=True) 164 | lgb_test_leaf = gbm.predict(test_features, pred_leaf=True) 165 | 166 | leaf_columns = ['leaf_{}'.format(i) for i in range(lgb_train_leaf.shape[1])] 167 | 168 | train_leaf_df = pd.DataFrame(lgb_train_leaf, columns=leaf_columns) 169 | test_leaf_df = pd.DataFrame(lgb_test_leaf, columns=leaf_columns) 170 | 171 | train_features = pd.concat([train_features, train_leaf_df], axis=1) 172 | test_features = pd.concat([test_features, test_leaf_df], axis=1) 173 | 174 | df_features = pd.concat([train_features, test_features], ignore_index=True, sort=False, axis=0) 175 | cate_columns = ['tag', 'prefix_kmeans', 'title_kmeans', 'complete_kmeans'] 176 | cate_columns.extend(leaf_columns) 177 | 178 | df_columns = df_features.columns 179 | num_columns = [column for column in df_columns if column not in cate_columns] 180 | 181 | train_csr = sparse.csr_matrix(train_features.shape[0], 0) 182 | test_csr = sparse.csr_matrix(test_features.shape[0], 0) 183 | 184 | # cate columns one-hot 185 | one_hot_encoder = OneHotEncoder() 186 | for col in cate_columns: 187 | one_hot_encoder.fit(df_features[col].values.reshape(-1, 1)) 188 | 189 | train_encoder = one_hot_encoder.transform(train_features[col].values.reshape(-1, 1)) 190 | train_csr = sparse.hstack((train_csr, train_encoder), 'csr', 'bool') 191 | 192 | test_encoder = one_hot_encoder.transform(test_features[col].values.reshape(-1, 1)) 193 | test_csr = sparse.hstack((test_csr, test_encoder), 'csr', 'bool') 194 | 195 | # num columns min-max scaler 196 | min_max_scaler = MinMaxScaler() 197 | for col in num_columns: 198 | df_features[col].fillna(0, inplace=True) 199 | train_features[col].fillna(0, inplace=True) 200 | test_features[col].fillna(0, inplace=True) 201 | 202 | min_max_scaler.fit(np.array(df_features[col].values.tolist()).reshape(-1, 1)) 203 | 204 | train_features[col] = min_max_scaler.transform(np.array(train_features[col].values.tolist()).reshape(-1, 1)) 205 | test_features[col] = min_max_scaler.transform(np.array(test_features[col].values.tolist()).reshape(-1, 1)) 
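# NOTE: a minimal, self-contained sketch of the GBDT+LR idea this function
# implements: every sample is described by the leaf it lands in for each tree,
# the leaf indices are one-hot encoded, and a logistic regression is fit on the
# resulting sparse matrix. Synthetic data and names below are illustrative only.
def _gbdt_lr_toy_example():
    import lightgbm as lgb
    import numpy as np
    from scipy import sparse as sp
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import OneHotEncoder

    rng = np.random.RandomState(2018)
    x = rng.rand(500, 10)
    y = (x[:, 0] + x[:, 1] > 1).astype(int)

    booster = lgb.LGBMClassifier(n_estimators=20, num_leaves=15).fit(x, y)
    leaves = booster.predict(x, pred_leaf=True)    # shape: (n_samples, n_trees)

    encoder = OneHotEncoder(handle_unknown='ignore')
    leaf_onehot = encoder.fit_transform(leaves)    # sparse one-hot leaf features
    features = sp.hstack([leaf_onehot, sp.csr_matrix(x)], format='csr')

    return LogisticRegression(max_iter=200).fit(features, y)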
206 | 207 | # combine num features 208 | train_csr = sparse.hstack(sparse.csr_matrix(train_features[num_columns], train_csr), 'csr').astype('float32') 209 | test_csr = sparse.hstack(sparse.csr_matrix(test_features[num_columns], test_csr), 'csr').astype('float32') 210 | 211 | lr_clf = LogisticRegression(penalty='l2', solver='sag', C=0.1, n_jobs=-1) 212 | 213 | kfolder = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2018) 214 | kfold = kfolder.split(train_csr, train_labels) 215 | 216 | preds_list = list() 217 | for train_index, test_index in kfold: 218 | k_x_train = train_csr.loc[train_index] 219 | k_y_train = train_labels.loc[train_index] 220 | k_x_test = train_csr.loc[test_index] 221 | k_y_test = train_labels.loc[test_index] 222 | 223 | lr_clf.fit(k_x_train, k_y_train) 224 | 225 | eval_pred = lr_clf.predict_proba(k_x_test)[:, 1] 226 | eval_loss = log_loss(k_y_test, eval_pred) 227 | logger.info('eval log loss: {}'.format(eval_loss)) 228 | 229 | test_preds = lr_clf.predict_proba(test_csr)[:, 1] 230 | preds_list.append(test_preds) 231 | 232 | length = len(preds_list) 233 | preds_columns = ["preds_{id}".format(id=i) for i in range(length)] 234 | 235 | preds_df = pd.DataFrame(data=preds_list) 236 | preds_df = preds_df.T 237 | preds_df.columns = preds_columns 238 | preds_df["mean"] = preds_df.mean(axis=1) 239 | 240 | preds_df["mean"] = preds_df["mean"].apply(lambda item: 1 if item >= threshold else 0) 241 | 242 | validate_preds = preds_df[:validate_data_length] 243 | test_preds = preds_df[validate_data_length:] 244 | 245 | logger.info('the avg of test is {}'.format(np.mean(test_preds["mean"]))) 246 | 247 | f_score = f1_score(validate_labels, validate_preds["mean"]) 248 | logger.info('validate f_score is {}'.format(f_score)) 249 | logger.info('validate the avg of validate is {}'.format(np.mean(validate_preds["mean"]))) 250 | 251 | predictions = pd.DataFrame({"predicted_score": test_preds["mean"]}) 252 | 253 | predictions.to_csv("predict.csv", index=False, header=False) 254 | 255 | 256 | def model_main(model='lgb', threshold=0.5): 257 | lgb_parms = { 258 | "boosting_type": "gbdt", 259 | "num_leaves": 127, 260 | "max_depth": -1, 261 | "learning_rate": 0.05, 262 | "n_estimators": 10000, 263 | "max_bin": 425, 264 | "subsample_for_bin": 20000, 265 | "objective": 'binary', 266 | "metric": 'logloss', 267 | "min_split_gain": 0, 268 | "min_child_weight": 0.001, 269 | "min_child_samples": 20, 270 | "subsample": 0.8, 271 | "subsample_freq": 1, 272 | "colsample_bytree": 0.8, 273 | "reg_alpha": 3, 274 | "reg_lambda": 5, 275 | "seed": 2018, 276 | "n_jobs": -1, 277 | "verbose": 1, 278 | "silent": False 279 | } 280 | 281 | train_df = get_data(name="train") 282 | validate_df = get_data(name="validate") 283 | test_df = get_data(name="test") 284 | 285 | if model == 'lgb': 286 | lgb_model(train_df, validate_df, test_df, lgb_parms, threshold=threshold) 287 | elif model == 'lgb_lr': 288 | lgb_lr_model(train_df, validate_df, test_df, threshold=threshold) 289 | else: 290 | raise ValueError() 291 | 292 | 293 | if __name__ == "__main__": 294 | combine() # features combine, ignore it if features not change 295 | model_main(model='lgb', threshold=0.4) 296 | -------------------------------------------------------------------------------- /logconfig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | from logging.config import dictConfig 6 | 7 | 8 | def config_logging(): 9 | logging_config = { 10 | 
'version': 1, 11 | 'formatters': { 12 | 'default': {'format': '%(asctime)s %(name)s : %(levelname)s %(message)s', 13 | 'datefmt': '%Y-%m-%d %H:%M:%S'} 14 | }, 15 | 'handlers': { 16 | 'console': { 17 | 'level': logging.DEBUG, 18 | 'class': 'logging.StreamHandler', 19 | 'formatter': 'default', 20 | }, 21 | 'file': { 22 | 'level': logging.DEBUG, 23 | 'class': 'logging.handlers.RotatingFileHandler', 24 | 'formatter': 'default', 25 | 'filename': 'ogeek.log', 26 | 'maxBytes': 1024 * 1024 * 10, 27 | 'backupCount': 1 28 | } 29 | }, 30 | 'loggers': { 31 | 'test': { 32 | 'level': logging.DEBUG, 33 | 'handlers': ['console', 'file'] 34 | } 35 | }, 36 | 'root': { 37 | 'level': logging.DEBUG, 38 | 'handlers': ['console', 'file'] 39 | }, 40 | 'disable_existing_loggers': False 41 | } 42 | dictConfig(logging_config) 43 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from stat_engineering import Processing as Sp 5 | from w2v_engineering import Procossing as Wp 6 | 7 | if __name__ == "__main__": 8 | # 统计特征 9 | stat_processing = Sp() 10 | stat_processing.get_processing() 11 | 12 | # w2v特征 13 | w2v_processing = Wp(force=False, size=100) 14 | w2v_processing.get_processing() 15 | -------------------------------------------------------------------------------- /optimize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import warnings 6 | from collections import namedtuple 7 | 8 | import lightgbm as lgb 9 | import numpy as np 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.model_selection import RandomizedSearchCV 12 | 13 | from logconfig import config_logging 14 | 15 | warnings.filterwarnings('ignore') 16 | 17 | config_logging() 18 | logger = logging.getLogger('optimize') 19 | 20 | Property = namedtuple('Property', ['min', 'max', 'type']) 21 | 22 | 23 | class Optimize(object): 24 | _num_leaves = None 25 | _learning_rate = None 26 | _n_estimators = None 27 | _min_child_weight = None 28 | _min_child_samples = None 29 | _reg_alpha = None 30 | _reg_lambda = None 31 | _colsample_bytree = None 32 | _subsample = None 33 | 34 | def __init__(self, x_train, y_train, params, grid_params, iter_num=1): 35 | self.x_train = x_train 36 | self.y_train = y_train 37 | self.params = params 38 | self.grid_params = grid_params 39 | self.iter_num = iter_num 40 | 41 | # init property 42 | self.num_leaves = None 43 | self.learning_rate = None 44 | self.n_estimators = None 45 | self.min_child_weight = None 46 | self.min_child_samples = None 47 | self.reg_alpha = None 48 | self.reg_lambda = None 49 | self.colsample_bytree = None 50 | self.subsample = None 51 | 52 | # zip property as a dict 53 | self.property_dict = dict( 54 | num_leaves=self.num_leaves, 55 | learning_rate=self.learning_rate, 56 | n_estimators=self.n_estimators, 57 | min_child_weight=self.min_child_weight, 58 | min_child_samples=self.min_child_samples, 59 | reg_alpha=self.reg_alpha, 60 | reg_lambda=self.reg_lambda, 61 | colsample_bytree=self.colsample_bytree, 62 | subsample=self.subsample 63 | ) 64 | 65 | @property 66 | def num_leaves(self): 67 | return self._num_leaves 68 | 69 | @num_leaves.setter 70 | def num_leaves(self, value=None): 71 | default = [10, 1000, 'int'] 72 | 73 | if value is None: 74 | self._num_leaves = Property._make(default) 75 | 
elif isinstance(value, list): 76 | self._num_leaves = Property._make(value) 77 | elif isinstance(value, dict): 78 | self._num_leaves = Property(**value) 79 | else: 80 | raise ValueError() 81 | 82 | @property 83 | def learning_rate(self): 84 | return self._learning_rate 85 | 86 | @learning_rate.setter 87 | def learning_rate(self, value=None): 88 | default = [0.01, 0.5, 'float'] 89 | 90 | if value is None: 91 | self._learning_rate = Property._make(default) 92 | elif isinstance(value, list): 93 | self._learning_rate = Property._make(value) 94 | elif isinstance(value, dict): 95 | self._learning_rate = Property(**value) 96 | else: 97 | raise ValueError() 98 | 99 | @property 100 | def n_estimators(self): 101 | return self._n_estimators 102 | 103 | @n_estimators.setter 104 | def n_estimators(self, value=None): 105 | default = [500, 20000, 'int'] 106 | 107 | if value is None: 108 | self._n_estimators = Property._make(default) 109 | elif isinstance(value, list): 110 | self._n_estimators = Property._make(value) 111 | elif isinstance(value, dict): 112 | self._n_estimators = Property(**value) 113 | else: 114 | raise ValueError() 115 | 116 | @property 117 | def min_child_weight(self): 118 | return self._min_child_weight 119 | 120 | @min_child_weight.setter 121 | def min_child_weight(self, value=None): 122 | default = [0.1, 10, 'float'] 123 | 124 | if value is None: 125 | self._min_child_weight = Property._make(default) 126 | elif isinstance(value, list): 127 | self._min_child_weight = Property._make(value) 128 | elif isinstance(value, dict): 129 | self._min_child_weight = Property(**value) 130 | else: 131 | raise ValueError() 132 | 133 | @property 134 | def min_child_samples(self): 135 | return self._min_child_samples 136 | 137 | @min_child_samples.setter 138 | def min_child_samples(self, value=None): 139 | default = [50, 1000, 'int'] 140 | 141 | if value is None: 142 | self._min_child_samples = Property._make(default) 143 | elif isinstance(value, list): 144 | self._min_child_samples = Property._make(value) 145 | elif isinstance(value, dict): 146 | self._min_child_samples = Property(**value) 147 | else: 148 | raise ValueError() 149 | 150 | @property 151 | def reg_alpha(self): 152 | return self._reg_alpha 153 | 154 | @reg_alpha.setter 155 | def reg_alpha(self, value=None): 156 | default = [0, 10, 'float'] 157 | 158 | if value is None: 159 | self._reg_alpha = Property._make(default) 160 | elif isinstance(value, list): 161 | self._reg_alpha = Property._make(value) 162 | elif isinstance(value, dict): 163 | self._reg_alpha = Property(**value) 164 | else: 165 | raise ValueError() 166 | 167 | @property 168 | def reg_lambda(self): 169 | return self._reg_lambda 170 | 171 | @reg_lambda.setter 172 | def reg_lambda(self, value=None): 173 | default = [0, 10, 'float'] 174 | 175 | if value is None: 176 | self._reg_lambda = Property._make(default) 177 | elif isinstance(value, list): 178 | self._reg_lambda = Property._make(value) 179 | elif isinstance(value, dict): 180 | self._reg_lambda = Property(**value) 181 | else: 182 | raise ValueError() 183 | 184 | @property 185 | def colsample_bytree(self): 186 | return self._colsample_bytree 187 | 188 | @colsample_bytree.setter 189 | def colsample_bytree(self, value=None): 190 | default = [0.5, 1, 'float'] 191 | 192 | if value is None: 193 | self._colsample_bytree = Property._make(default) 194 | elif isinstance(value, list): 195 | self._colsample_bytree = Property._make(value) 196 | elif isinstance(value, dict): 197 | self._colsample_bytree = Property(**value) 198 | else: 199 
| raise ValueError() 200 | 201 | @property 202 | def subsample(self): 203 | return self._subsample 204 | 205 | @subsample.setter 206 | def subsample(self, value=None): 207 | default = [0.5, 1, 'float'] 208 | 209 | if value is None: 210 | self._subsample = Property._make(default) 211 | elif isinstance(value, list): 212 | self._subsample = Property._make(value) 213 | elif isinstance(value, dict): 214 | self._subsample = Property(**value) 215 | else: 216 | raise ValueError() 217 | 218 | @staticmethod 219 | def _get_values_list(low, high, dtype, size): 220 | linspace = np.linspace(low, high, size, dtype=dtype) 221 | 222 | if dtype == 'float': 223 | linspace = list(map(lambda item: round(item, 4), linspace)) 224 | 225 | return linspace 226 | 227 | def _get_grid_params(self, values, key, best_value, size): 228 | max_value = max(values) 229 | min_value = min(values) 230 | 231 | property_item = self.property_dict[key] 232 | 233 | if best_value == max_value: 234 | if best_value == property_item.max: 235 | return [best_value] 236 | low = best_value 237 | high = property_item.max 238 | linspace = self._get_values_list(low, high, property_item.type, size) 239 | elif best_value == min_value: 240 | if best_value == property_item.min: 241 | return [best_value] 242 | low = min_value 243 | high = best_value 244 | linspace = self._get_values_list(low, high, property_item.type, size) 245 | else: 246 | best_index = values.index(best_value) 247 | low = values[best_index - 1] 248 | high = values[best_index + 1] 249 | linspace = self._get_values_list(low, high, property_item.type, size) 250 | 251 | linspace = list(set(linspace)) 252 | return linspace 253 | 254 | def _update_params(self, best_params): 255 | for key, value in best_params.items(): 256 | self.params[key] = value 257 | 258 | def _update_grid_params(self, best_params, size=4): 259 | for key, value in best_params.items(): 260 | values = self.grid_params[key] 261 | 262 | values_list = self._get_grid_params(values, key, value, size) 263 | self.grid_params[key] = values_list 264 | 265 | def _optimize(self, params, grid_params): 266 | clf = lgb.LGBMClassifier(**params) 267 | grid_clf = GridSearchCV(clf, grid_params, cv=5, scoring='neg_log_loss', n_jobs=1, verbose=100) 268 | grid_clf.fit(self.x_train, self.y_train) 269 | return grid_clf 270 | 271 | def optimize(self): 272 | best_params = None 273 | 274 | while self.iter_num > 0: 275 | grid_clf = self._optimize(self.params, self.grid_params) 276 | 277 | best_params = grid_clf.best_params_ 278 | best_score = grid_clf.best_score_ 279 | 280 | logger.info('iter_num: {} best_params: {}'.format(self.iter_num, best_params)) 281 | logger.info('iter_num: {} best_score: {}'.format(self.iter_num, best_score)) 282 | 283 | self._update_params(best_params) 284 | self._update_grid_params(best_params) 285 | 286 | self.iter_num -= 1 287 | 288 | return best_params 289 | 290 | 291 | class SimpleOptimize(object): 292 | def __init__(self, x_train, y_train, params, opt_params): 293 | self.x_train = x_train 294 | self.y_train = y_train 295 | self.params = params 296 | self.opt_params = opt_params 297 | 298 | def _update_params(self, best_params): 299 | for key, value in best_params.items(): 300 | self.params[key] = value 301 | 302 | def optimize(self, grid=True, random=False): 303 | gbm = lgb.LGBMClassifier(**self.params) 304 | if grid: 305 | opt_gbm = GridSearchCV(gbm, self.opt_params, cv=5, scoring='neg_log_loss', refit="binary_logloss", 306 | n_jobs=1, verbose=100) 307 | elif random: 308 | opt_gbm = RandomizedSearchCV(gbm, 
self.opt_params, cv=5, scoring='neg_log_loss', refit="binary_logloss", 309 | n_jobs=1, verbose=100) 310 | else: 311 | raise ValueError() 312 | 313 | opt_gbm.fit(self.x_train, self.y_train) 314 | best_params = opt_gbm.best_params_ 315 | best_score = opt_gbm.best_score_ 316 | 317 | logger.info('best_params: {}'.format(best_params)) 318 | logger.info('best_score: {}'.format(best_score)) 319 | 320 | self._update_params(best_params) 321 | 322 | logger.info('update best params: {}'.format(self.params)) 323 | return self.params 324 | -------------------------------------------------------------------------------- /resources/stop_words.txt: -------------------------------------------------------------------------------- 1 | 一 2 | 一些 3 | 一何 4 | 一切 5 | 一则 6 | 一方面 7 | 一旦 8 | 一来 9 | 一样 10 | 一般 11 | 一转眼 12 | 七 13 | 万一 14 | 三 15 | 上 16 | 上下 17 | 下 18 | 不 19 | 不仅 20 | 不但 21 | 不光 22 | 不单 23 | 不只 24 | 不外乎 25 | 不如 26 | 不妨 27 | 不尽 28 | 不尽然 29 | 不得 30 | 不怕 31 | 不惟 32 | 不成 33 | 不拘 34 | 不料 35 | 不是 36 | 不比 37 | 不然 38 | 不特 39 | 不独 40 | 不管 41 | 不至于 42 | 不若 43 | 不论 44 | 不过 45 | 不问 46 | 与 47 | 与其 48 | 与其说 49 | 与否 50 | 与此同时 51 | 且 52 | 且不说 53 | 且说 54 | 两者 55 | 个 56 | 个别 57 | 中 58 | 临 59 | 为 60 | 为了 61 | 为什么 62 | 为何 63 | 为止 64 | 为此 65 | 为着 66 | 乃 67 | 乃至 68 | 乃至于 69 | 么 70 | 之 71 | 之一 72 | 之所以 73 | 之类 74 | 乌乎 75 | 乎 76 | 乘 77 | 九 78 | 也 79 | 也好 80 | 也罢 81 | 了 82 | 二 83 | 二来 84 | 于 85 | 于是 86 | 于是乎 87 | 云云 88 | 云尔 89 | 五 90 | 些 91 | 亦 92 | 人 93 | 人们 94 | 人家 95 | 什 96 | 什么 97 | 什么样 98 | 今 99 | 介于 100 | 仍 101 | 仍旧 102 | 从 103 | 从此 104 | 从而 105 | 他 106 | 他人 107 | 他们 108 | 他们们 109 | 以 110 | 以上 111 | 以为 112 | 以便 113 | 以免 114 | 以及 115 | 以故 116 | 以期 117 | 以来 118 | 以至 119 | 以至于 120 | 以致 121 | 们 122 | 任 123 | 任何 124 | 任凭 125 | 会 126 | 似的 127 | 但 128 | 但凡 129 | 但是 130 | 何 131 | 何以 132 | 何况 133 | 何处 134 | 何时 135 | 余外 136 | 作为 137 | 你 138 | 你们 139 | 使 140 | 使得 141 | 例如 142 | 依 143 | 依据 144 | 依照 145 | 便于 146 | 俺 147 | 俺们 148 | 倘 149 | 倘使 150 | 倘或 151 | 倘然 152 | 倘若 153 | 借 154 | 借傥然 155 | 假使 156 | 假如 157 | 假若 158 | 做 159 | 像 160 | 儿 161 | 先不先 162 | 光是 163 | 全体 164 | 全部 165 | 八 166 | 六 167 | 兮 168 | 共 169 | 关于 170 | 关于具体地说 171 | 其 172 | 其一 173 | 其中 174 | 其二 175 | 其他 176 | 其余 177 | 其它 178 | 其次 179 | 具体地说 180 | 具体说来 181 | 兼之 182 | 内 183 | 再 184 | 再其次 185 | 再则 186 | 再有 187 | 再者 188 | 再者说 189 | 再说 190 | 冒 191 | 冲 192 | 况且 193 | 几 194 | 几时 195 | 凡 196 | 凡是 197 | 凭 198 | 凭借 199 | 出于 200 | 出来 201 | 分 202 | 分别 203 | 则 204 | 则甚 205 | 别 206 | 别人 207 | 别处 208 | 别是 209 | 别的 210 | 别管 211 | 别说 212 | 到 213 | 前后 214 | 前此 215 | 前者 216 | 加之 217 | 加以 218 | 即 219 | 即令 220 | 即使 221 | 即便 222 | 即如 223 | 即或 224 | 即若 225 | 却 226 | 去 227 | 又 228 | 又及 229 | 及 230 | 及其 231 | 及至 232 | 反之 233 | 反而 234 | 反过来 235 | 反过来说 236 | 受到 237 | 另 238 | 另一方面 239 | 另外 240 | 另悉 241 | 只 242 | 只当 243 | 只怕 244 | 只是 245 | 只有 246 | 只消 247 | 只要 248 | 只限 249 | 叫 250 | 叮咚 251 | 可 252 | 可以 253 | 可是 254 | 可见 255 | 各 256 | 各个 257 | 各位 258 | 各种 259 | 各自 260 | 同 261 | 同时 262 | 后 263 | 后者 264 | 向 265 | 向使 266 | 向着 267 | 吓 268 | 吗 269 | 否则 270 | 吧 271 | 吧哒 272 | 含 273 | 吱 274 | 呀 275 | 呃 276 | 呕 277 | 呗 278 | 呜 279 | 呜呼 280 | 呢 281 | 呵 282 | 呵呵 283 | 呸 284 | 呼哧 285 | 咋 286 | 和 287 | 咚 288 | 咦 289 | 咧 290 | 咱 291 | 咱们 292 | 咳 293 | 哇 294 | 哈 295 | 哈哈 296 | 哉 297 | 哎 298 | 哎呀 299 | 哎哟 300 | 哗 301 | 哟 302 | 哦 303 | 哩 304 | 哪 305 | 哪个 306 | 哪些 307 | 哪儿 308 | 哪天 309 | 哪年 310 | 哪怕 311 | 哪样 312 | 哪边 313 | 哪里 314 | 哼 315 | 哼唷 316 | 唉 317 | 唯有 318 | 啊 319 | 啐 320 | 啥 321 | 啦 322 | 啪达 323 | 啷当 324 | 喂 325 | 喏 326 | 喔唷 327 | 喽 328 | 嗡 329 | 嗡嗡 330 | 嗬 331 | 嗯 332 | 嗳 333 | 嘎 334 | 嘎登 335 | 嘘 336 | 嘛 337 
| 嘻 338 | 嘿 339 | 嘿嘿 340 | 四 341 | 因 342 | 因为 343 | 因了 344 | 因此 345 | 因着 346 | 因而 347 | 固然 348 | 在 349 | 在下 350 | 在于 351 | 地 352 | 基于 353 | 处在 354 | 多 355 | 多么 356 | 多少 357 | 大 358 | 大家 359 | 她 360 | 她们 361 | 好 362 | 如 363 | 如上 364 | 如上所述 365 | 如下 366 | 如何 367 | 如其 368 | 如同 369 | 如是 370 | 如果 371 | 如此 372 | 如若 373 | 始而 374 | 孰料 375 | 孰知 376 | 宁 377 | 宁可 378 | 宁愿 379 | 宁肯 380 | 它 381 | 它们 382 | 对 383 | 对于 384 | 对待 385 | 对方 386 | 对比 387 | 将 388 | 小 389 | 尔 390 | 尔后 391 | 尔尔 392 | 尚且 393 | 就 394 | 就是 395 | 就是了 396 | 就是说 397 | 就算 398 | 就要 399 | 尽 400 | 尽管 401 | 尽管如此 402 | 岂但 403 | 己 404 | 已 405 | 已矣 406 | 巴 407 | 巴巴 408 | 年 409 | 并 410 | 并且 411 | 庶乎 412 | 庶几 413 | 开外 414 | 开始 415 | 归 416 | 归齐 417 | 当 418 | 当地 419 | 当然 420 | 当着 421 | 彼 422 | 彼时 423 | 彼此 424 | 往 425 | 待 426 | 很 427 | 得 428 | 得了 429 | 怎 430 | 怎么 431 | 怎么办 432 | 怎么样 433 | 怎奈 434 | 怎样 435 | 总之 436 | 总的来看 437 | 总的来说 438 | 总的说来 439 | 总而言之 440 | 恰恰相反 441 | 您 442 | 惟其 443 | 慢说 444 | 我 445 | 我们 446 | 或 447 | 或则 448 | 或是 449 | 或曰 450 | 或者 451 | 截至 452 | 所 453 | 所以 454 | 所在 455 | 所幸 456 | 所有 457 | 才 458 | 才能 459 | 打 460 | 打从 461 | 把 462 | 抑或 463 | 拿 464 | 按 465 | 按照 466 | 换句话说 467 | 换言之 468 | 据 469 | 据此 470 | 接着 471 | 故 472 | 故此 473 | 故而 474 | 旁人 475 | 无 476 | 无宁 477 | 无论 478 | 既 479 | 既往 480 | 既是 481 | 既然 482 | 日 483 | 时 484 | 时候 485 | 是 486 | 是以 487 | 是的 488 | 更 489 | 曾 490 | 替 491 | 替代 492 | 最 493 | 月 494 | 有 495 | 有些 496 | 有关 497 | 有及 498 | 有时 499 | 有的 500 | 望 501 | 朝 502 | 朝着 503 | 本 504 | 本人 505 | 本地 506 | 本着 507 | 本身 508 | 来 509 | 来着 510 | 来自 511 | 来说 512 | 极了 513 | 果然 514 | 果真 515 | 某 516 | 某个 517 | 某些 518 | 某某 519 | 根据 520 | 欤 521 | 正值 522 | 正如 523 | 正巧 524 | 正是 525 | 此 526 | 此地 527 | 此处 528 | 此外 529 | 此时 530 | 此次 531 | 此间 532 | 毋宁 533 | 每 534 | 每当 535 | 比 536 | 比及 537 | 比如 538 | 比方 539 | 没奈何 540 | 沿 541 | 沿着 542 | 漫说 543 | 焉 544 | 然则 545 | 然后 546 | 然而 547 | 照 548 | 照着 549 | 犹且 550 | 犹自 551 | 甚且 552 | 甚么 553 | 甚或 554 | 甚而 555 | 甚至 556 | 甚至于 557 | 用 558 | 用来 559 | 由 560 | 由于 561 | 由是 562 | 由此 563 | 由此可见 564 | 的 565 | 的确 566 | 的话 567 | 直到 568 | 相对而言 569 | 省得 570 | 看 571 | 眨眼 572 | 着 573 | 着呢 574 | 矣 575 | 矣乎 576 | 矣哉 577 | 离 578 | 秒 579 | 竟而 580 | 第 581 | 等 582 | 等到 583 | 等等 584 | 简言之 585 | 管 586 | 类如 587 | 紧接着 588 | 纵 589 | 纵令 590 | 纵使 591 | 纵然 592 | 经 593 | 经过 594 | 结果 595 | 给 596 | 继之 597 | 继后 598 | 继而 599 | 综上所述 600 | 罢了 601 | 者 602 | 而 603 | 而且 604 | 而况 605 | 而后 606 | 而外 607 | 而已 608 | 而是 609 | 而言 610 | 能 611 | 能否 612 | 腾 613 | 自 614 | 自个儿 615 | 自从 616 | 自各儿 617 | 自后 618 | 自家 619 | 自己 620 | 自打 621 | 自身 622 | 至 623 | 至于 624 | 至今 625 | 至若 626 | 致 627 | 般的 628 | 若 629 | 若夫 630 | 若是 631 | 若果 632 | 若非 633 | 莫不然 634 | 莫如 635 | 莫若 636 | 虽 637 | 虽则 638 | 虽然 639 | 虽说 640 | 被 641 | 要 642 | 要不 643 | 要不是 644 | 要不然 645 | 要么 646 | 要是 647 | 譬喻 648 | 譬如 649 | 让 650 | 许多 651 | 论 652 | 设使 653 | 设或 654 | 设若 655 | 诚如 656 | 诚然 657 | 该 658 | 说 659 | 说来 660 | 请 661 | 诸 662 | 诸位 663 | 诸如 664 | 谁 665 | 谁人 666 | 谁料 667 | 谁知 668 | 贼死 669 | 赖以 670 | 赶 671 | 起 672 | 起见 673 | 趁 674 | 趁着 675 | 越是 676 | 距 677 | 跟 678 | 较 679 | 较之 680 | 边 681 | 过 682 | 还 683 | 还是 684 | 还有 685 | 还要 686 | 这 687 | 这一来 688 | 这个 689 | 这么 690 | 这么些 691 | 这么样 692 | 这么点儿 693 | 这些 694 | 这会儿 695 | 这儿 696 | 这就是说 697 | 这时 698 | 这样 699 | 这次 700 | 这般 701 | 这边 702 | 这里 703 | 进而 704 | 连 705 | 连同 706 | 逐步 707 | 通过 708 | 遵循 709 | 遵照 710 | 那 711 | 那个 712 | 那么 713 | 那么些 714 | 那么样 715 | 那些 716 | 那会儿 717 | 那儿 718 | 那时 719 | 那样 720 | 那般 721 | 那边 722 | 那里 723 | 都 724 | 鄙人 725 | 鉴于 726 | 针对 727 | 阿 728 | 除 729 | 除了 730 | 除外 731 | 除开 732 | 除此之外 733 | 除非 734 | 随 735 | 随后 736 | 随时 737 | 随着 738 | 难道说 739 
| 零 740 | 非 741 | 非但 742 | 非徒 743 | 非特 744 | 非独 745 | 靠 746 | 顺 747 | 顺着 748 | 首先 -------------------------------------------------------------------------------- /stacking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | class Stacking(object): 9 | def __init__(self, kflod, df, train_df_length): 10 | self.kflod = kflod 11 | self.df = df.copy() 12 | self.train_df_length = train_df_length 13 | 14 | def _get_kflod(self, list_): 15 | list_array = np.array(list_) 16 | np.random.shuffle(list_array) 17 | 18 | list_part = np.array_split(list_array, self.kflod) 19 | for idx, list_item in enumerate(list_part): 20 | list_part_copy = list_part.copy() 21 | list_part_copy.pop(idx) 22 | 23 | other_list_part = np.concatenate(list_part_copy).ravel() 24 | 25 | yield other_list_part, list_item 26 | 27 | def get_stacking_df(self, columns=None): 28 | if columns is None: 29 | columns = ['prefix', 'title', 'tag', 'prefix_title', 'prefix_tag', 'title_tag'] 30 | 31 | train_df = self.df[:self.train_df_length] 32 | train_df_index = train_df.index 33 | 34 | validate_test_df = self.df[self.train_df_length:] 35 | 36 | stacking_df = pd.DataFrame() 37 | stacking_columns = ['stacking_{}'.format(column) for column in columns] 38 | 39 | kfloder = self._get_kflod(train_df_index) 40 | kflod_list = list() 41 | for kflod_item in kfloder: 42 | kflod_list.append(kflod_item) 43 | 44 | for column in columns: 45 | stacking_train_df = pd.DataFrame() 46 | stacking_test_list = list() 47 | 48 | for train_index, test_index in kflod_list: 49 | k_train_df = train_df.loc[train_index] 50 | k_test_df = train_df.loc[test_index] 51 | 52 | click_column = "{column}_click".format(column=column) 53 | count_column = "{column}_count".format(column=column) 54 | stacking_column = "{column}_stacking".format(column=column) 55 | 56 | agg_dict = {click_column: "sum", count_column: "count"} 57 | _stacking_df = k_train_df.groupby(column, as_index=False)["label"].agg(agg_dict) 58 | _stacking_df[stacking_column] = _stacking_df[click_column] / (_stacking_df[count_column] + 5) 59 | 60 | k_test_df = pd.merge(k_test_df, _stacking_df, how='left', on=column) 61 | stacking_train_df = pd.concat([stacking_train_df, k_test_df[stacking_column]], 62 | axis=0, ignore_index=False, sort=False) 63 | 64 | temp_df = pd.merge(validate_test_df, _stacking_df, how='left', on=column) 65 | temp_column_list = temp_df[stacking_column].tolist() 66 | stacking_test_list.append(temp_column_list) 67 | 68 | # train data 69 | stacking_train_df.sort_index(inplace=True) 70 | 71 | # validate + test data 72 | length = len(stacking_test_list) 73 | stacking_test_columns = ["stacking_{id}".format(id=i) for i in range(length)] 74 | stacking_test_df = pd.DataFrame(data=stacking_test_list) 75 | stacking_test_df = stacking_test_df.T 76 | stacking_test_df.columns = stacking_test_columns 77 | stacking_test_df['mean'] = stacking_test_df.mean(axis=1) 78 | 79 | # contact train validate test 80 | column_stacking_df = pd.concat([stacking_train_df, stacking_test_df['mean']], 81 | axis=0, ignore_index=True, sort=False) 82 | 83 | # contact column to stacking df 84 | stacking_df = pd.concat([stacking_df, column_stacking_df], axis=1) 85 | 86 | stacking_df.columns = stacking_columns 87 | return stacking_df 88 | -------------------------------------------------------------------------------- /stat_engineering.py: 
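# NOTE on the Stacking class above (stacking.py), a hedged usage sketch with
# illustrative names: it expects a frame whose training rows come first and
# which already holds 'label' plus the key columns to encode; each key gets an
# out-of-fold smoothed click ratio, so a training row never sees its own label.
#
#   df = pd.concat([train_df, validate_df, test_df], ignore_index=True, sort=False)
#   stacker = Stacking(5, df, train_df.shape[0])
#   stacking_features = stacker.get_stacking_df(columns=['prefix', 'title', 'tag'])
#   df = pd.concat([df, stacking_features], axis=1)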
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import json 6 | import logging 7 | import os 8 | import time 9 | import warnings 10 | from operator import itemgetter 11 | 12 | import jieba 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.preprocessing import LabelEncoder 16 | 17 | from logconfig import config_logging 18 | from utils import char_cleaner 19 | 20 | config_logging() 21 | logger = logging.getLogger('stat_features') 22 | 23 | warnings.filterwarnings('ignore') 24 | np.random.seed(2018) 25 | 26 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 27 | RAW_DATA_PATH = os.path.join(BASE_PATH, "RawData") 28 | ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData") 29 | 30 | 31 | class PrefixProcessing(object): 32 | @staticmethod 33 | def _is_in_title(item): 34 | prefix = item["prefix"] 35 | title = item["title"] 36 | 37 | if not isinstance(prefix, str): 38 | prefix = "null" 39 | 40 | if prefix in title: 41 | return 1 42 | return 0 43 | 44 | @staticmethod 45 | def _levenshtein_distance(item): 46 | str1 = item["prefix"] 47 | str2 = item["title"] 48 | 49 | if not isinstance(str1, str): 50 | str1 = "null" 51 | 52 | x_size = len(str1) + 1 53 | y_size = len(str2) + 1 54 | 55 | matrix = np.zeros((x_size, y_size), dtype=np.int_) 56 | 57 | for x in range(x_size): 58 | matrix[x, 0] = x 59 | 60 | for y in range(y_size): 61 | matrix[0, y] = y 62 | 63 | for x in range(1, x_size): 64 | for y in range(1, y_size): 65 | if str1[x - 1] == str2[y - 1]: 66 | matrix[x, y] = min(matrix[x - 1, y] + 1, matrix[x - 1, y - 1], matrix[x, y - 1] + 1) 67 | else: 68 | matrix[x, y] = min(matrix[x - 1, y] + 1, matrix[x - 1, y - 1] + 1, matrix[x, y - 1] + 1) 69 | 70 | return matrix[x_size - 1, y_size - 1] 71 | 72 | @staticmethod 73 | def _distince_rate(item): 74 | str1 = item["prefix"] 75 | str2 = item["title"] 76 | leven_distance = item["leven_distance"] 77 | 78 | if not isinstance(str1, str): 79 | str1 = "null" 80 | 81 | length = max(len(str1), len(str2)) 82 | 83 | return leven_distance / (length + 5) # 平滑 84 | 85 | def get_prefix_df(self, df): 86 | prefix_df = pd.DataFrame() 87 | 88 | prefix_df[["prefix", "title"]] = df[["prefix", "title"]] 89 | prefix_df["is_in_title"] = prefix_df.apply(self._is_in_title, axis=1) 90 | prefix_df["leven_distance"] = prefix_df.apply(self._levenshtein_distance, axis=1) 91 | prefix_df["distance_rate"] = prefix_df.apply(self._distince_rate, axis=1) 92 | 93 | return prefix_df 94 | 95 | 96 | class QueryProcessing(object): 97 | @staticmethod 98 | def _get_query_dict(item): 99 | item_dict = dict() 100 | 101 | query_predict = item["query_prediction"] 102 | 103 | if not query_predict: 104 | item_dict["query_length"] = 0 105 | item_dict["prob_sum"] = None 106 | item_dict["prob_max"] = None 107 | item_dict["prob_mean"] = None 108 | return item_dict 109 | 110 | prob_list = list() 111 | for _, prob in query_predict.items(): 112 | prob = float(prob) 113 | prob_list.append(prob) 114 | 115 | item_dict["query_length"] = len(prob_list) 116 | item_dict["prob_sum"] = np.sum(prob_list) 117 | item_dict["prob_max"] = np.max(prob_list) 118 | item_dict["prob_mean"] = np.mean(prob_list) 119 | 120 | return item_dict 121 | 122 | def get_query_df(self, df): 123 | query_df = pd.DataFrame() 124 | 125 | query_df["item_dict"] = df.apply(self._get_query_dict, axis=1) 126 | query_df["query_length"] = query_df["item_dict"].apply(lambda item: item.get("query_length")) 127 | query_df["prob_sum"] = 
query_df["item_dict"].apply(lambda item: item.get("prob_sum")) 128 | query_df["prob_max"] = query_df["item_dict"].apply(lambda item: item.get("prob_max")) 129 | query_df["prob_mean"] = query_df["item_dict"].apply(lambda item: item.get("prob_mean")) 130 | query_df = query_df.drop(columns=["item_dict"]) 131 | 132 | return query_df 133 | 134 | 135 | class Processing(object): 136 | 137 | @staticmethod 138 | def _get_data(name): 139 | if name == "test": 140 | columns = ['prefix', 'query_prediction', 'title', 'tag'] 141 | else: 142 | columns = ['prefix', 'query_prediction', 'title', 'tag', 'label'] 143 | 144 | data_name = os.path.join(RAW_DATA_PATH, "oppo_round1_{}.txt".format(name)) 145 | df = pd.read_csv(data_name, names=columns, sep="\t", header=None, encoding="utf-8") 146 | 147 | return df 148 | 149 | @staticmethod 150 | def _loads(item): 151 | try: 152 | return json.loads(item) 153 | except (json.JSONDecodeError, TypeError): 154 | return json.loads("{}") 155 | 156 | @staticmethod 157 | def _get_apriori_df(df, train_df_length, columns=None): 158 | df = df.copy() 159 | 160 | train_df = df[:train_df_length] 161 | 162 | if columns is None: 163 | columns = ['prefix', 'complete_prefix', 'title', 'tag'] 164 | 165 | ctr_columns = columns.copy() 166 | ctr_columns.extend(['prefix_title', 'prefix_tag', 'complete_prefix_title', 'complete_prefix_tag', 'title_tag']) 167 | apriori_df = df[ctr_columns] 168 | 169 | # click count and ctr 170 | for idx, column in enumerate(ctr_columns): 171 | click_column = "{column}_click".format(column=column) 172 | count_column = "{column}_count".format(column=column) 173 | ctr_column = "{column}_ctr".format(column=column) 174 | 175 | agg_dict = {click_column: "sum", count_column: "count"} 176 | column_apriori_df = train_df.groupby(column, as_index=False)["label"].agg(agg_dict) 177 | column_apriori_df[ctr_column] = column_apriori_df[click_column] / (column_apriori_df[count_column] + 5) 178 | apriori_df = pd.merge(apriori_df, column_apriori_df, how='left', on=column) 179 | 180 | length = apriori_df.shape[0] 181 | all_columns = apriori_df.columns 182 | 183 | # apriori 184 | for column1 in columns: 185 | for column2 in columns: 186 | if column1 == column2: 187 | continue 188 | 189 | if column1 in column2: 190 | continue 191 | 192 | if column2 in column1: 193 | continue 194 | 195 | temp_click_column = "{}_{}_click".format(column1, column2) 196 | if temp_click_column in all_columns: 197 | click_column = temp_click_column 198 | else: 199 | click_column = "{}_{}_click".format(column2, column1) 200 | 201 | temp_count_column = "{}_{}_count".format(column1, column2) 202 | if temp_count_column in all_columns: 203 | count_column = temp_count_column 204 | else: 205 | count_column = "{}_{}_count".format(column2, column1) 206 | 207 | click_column1 = "{column}_click".format(column=column1) 208 | count_column1 = "{column}_count".format(column=column1) 209 | click_column2 = "{column}_click".format(column=column2) 210 | count_column2 = "{column}_count".format(column=column2) 211 | 212 | click_confidence_column = "click_{}_{}_confidence".format(column1, column2) 213 | count_confidence_column = "count_{}_{}_confidence".format(column1, column2) 214 | click_lift_column = "click_{}_{}_lift".format(column1, column2) 215 | count_lift_column = "count_{}_{}_lift".format(column1, column2) 216 | 217 | # confidence = P(A&B)/P(A) 218 | apriori_df[click_confidence_column] = apriori_df[click_column] * 100 / (apriori_df[click_column1] + 5) 219 | apriori_df[count_confidence_column] = apriori_df[count_column] 
* 100 / (apriori_df[count_column1] + 5) 220 | 221 | # lift = P(A&B)/(P(A)*P(B)) 222 | apriori_df[click_lift_column] = (apriori_df[click_column] / length) / ( 223 | (apriori_df[click_column1] * apriori_df[click_column2]) / (length * length)) 224 | apriori_df[count_lift_column] = (apriori_df[count_column] / length) / ( 225 | (apriori_df[count_column1] * apriori_df[count_column2]) / (length * length)) 226 | 227 | apriori_df = apriori_df.drop(columns=ctr_columns) 228 | return apriori_df 229 | 230 | @staticmethod 231 | def _get_expose_df(df, columns=None): 232 | df = df.copy() 233 | 234 | if columns is None: 235 | columns = ['prefix', 'complete_prefix', 'title', 'tag'] 236 | 237 | expose_df = df[columns] 238 | 239 | for column1 in columns: 240 | for column2 in columns: 241 | 242 | if column1 == column2: 243 | continue 244 | 245 | nunique_column_name = "{}_{}_nunique".format(column1, column2) 246 | temp_df = expose_df.groupby(column1)[column2].nunique().reset_index().rename( 247 | columns={column2: nunique_column_name}) 248 | expose_df = pd.merge(expose_df, temp_df, how='left', on=column1) 249 | 250 | expose_df = expose_df.drop(columns=columns) 251 | return expose_df 252 | 253 | @staticmethod 254 | def _get_complete_prefix(item): 255 | prefix = item['prefix'] 256 | query_prediction = item['query_prediction'] 257 | 258 | if not query_prediction: 259 | return prefix 260 | 261 | predict_word_dict = dict() 262 | prefix = str(prefix) 263 | 264 | for query_item, query_ratio in query_prediction.items(): 265 | query_item_cut = jieba.lcut(query_item) 266 | item_word = "" 267 | for item in query_item_cut: 268 | if prefix not in item_word: 269 | item_word += item 270 | else: 271 | if item_word not in predict_word_dict.keys(): 272 | predict_word_dict[item_word] = 0.0 273 | predict_word_dict[item_word] += float(query_ratio) 274 | break 275 | 276 | if not predict_word_dict: 277 | return prefix 278 | 279 | predict_word_dict = sorted(predict_word_dict.items(), key=itemgetter(1), reverse=True) 280 | complete_prefix = predict_word_dict[0][0] 281 | return complete_prefix 282 | 283 | @staticmethod 284 | def _get_max_query_ratio(item): 285 | query_prediction = item['query_prediction'] 286 | title = item['title'] 287 | 288 | if not query_prediction: 289 | return 0 290 | 291 | for query_wrod, ratio in query_prediction.items(): 292 | if title == query_wrod: 293 | if float(ratio) > 0.1: 294 | return 1 295 | 296 | return 0 297 | 298 | @staticmethod 299 | def _get_word_length(item): 300 | item = str(item) 301 | 302 | word_cut = jieba.lcut(item) 303 | length = len(word_cut) 304 | return length 305 | 306 | @staticmethod 307 | def _get_small_query_num(item): 308 | small_query_num = 0 309 | 310 | for _, ratio in item.items(): 311 | if float(ratio) <= 0.08: 312 | small_query_num += 1 313 | 314 | return small_query_num 315 | 316 | def _get_length_df(self, df): 317 | df = df.copy() 318 | 319 | columns = ['query_prediction', 'prefix', 'title'] 320 | length_df = df[columns] 321 | 322 | length_df['max_query_ratio'] = length_df.apply(self._get_max_query_ratio, axis=1) 323 | length_df['prefix_word_num'] = length_df['prefix'].apply(self._get_word_length) 324 | length_df['title_word_num'] = length_df['title'].apply(self._get_word_length) 325 | length_df['title_len'] = length_df['title'].apply(len) 326 | length_df['small_query_num'] = length_df['query_prediction'].apply(self._get_small_query_num) 327 | 328 | length_df = length_df.drop(columns=columns) 329 | return length_df 330 | 331 | def get_processing(self): 332 | train_df = 
self._get_data(name="train") 333 | validate_df = self._get_data(name="vali") 334 | test_df = self._get_data(name="test") 335 | logger.info('finish load data!') 336 | 337 | train_df_length = train_df.shape[0] 338 | validate_df_length = validate_df.shape[0] 339 | df = pd.concat([train_df, validate_df, test_df], axis=0, ignore_index=True, sort=False) 340 | 341 | # make query prediction to json 342 | df["query_prediction"] = df["query_prediction"].apply(self._loads) 343 | 344 | # complete prefix 345 | df['complete_prefix'] = df[['prefix', 'query_prediction']].apply(self._get_complete_prefix, axis=1) 346 | logger.info('finish get complete prefix!') 347 | 348 | length_df = self._get_length_df(df) 349 | logger.info('finish get length df!') 350 | 351 | # clearn prefix and title 352 | df["prefix"] = df["prefix"].apply(char_cleaner) 353 | df["title"] = df["title"].apply(char_cleaner) 354 | df["complete_prefix"] = df["complete_prefix"].apply(char_cleaner) 355 | logger.info('finish clearn columns') 356 | 357 | # combine columns 358 | df['prefix_title'] = df[['prefix', 'title']].apply(lambda item: '_'.join(map(str, item)), axis=1) 359 | df['prefix_tag'] = df[['prefix', 'tag']].apply(lambda item: '_'.join(map(str, item)), axis=1) 360 | df['complete_prefix_title'] = df[['complete_prefix', 'title']].apply(lambda item: '_'.join(map(str, item)), 361 | axis=1) 362 | df['complete_prefix_tag'] = df[['complete_prefix', 'tag']].apply(lambda item: '_'.join(map(str, item)), axis=1) 363 | df['title_tag'] = df[['title', 'tag']].apply(lambda item: '_'.join(map(str, item)), axis=1) 364 | logger.info('finish combine columns') 365 | 366 | apriori_df = self._get_apriori_df(df, train_df_length) 367 | logger.info('finish get apriori df!') 368 | 369 | drop_columns = ['prefix_title', 'prefix_tag', 'title_tag', 'complete_prefix_title', 'complete_prefix_tag'] 370 | df = df.drop(columns=drop_columns) 371 | 372 | expose_df = self._get_expose_df(df) 373 | logger.info('finish get expose df!') 374 | 375 | prefix_processing = PrefixProcessing() 376 | prefix_df = prefix_processing.get_prefix_df(df) 377 | logger.info('finish get prefix df!') 378 | 379 | query_processing = QueryProcessing() 380 | query_df = query_processing.get_query_df(df) 381 | logger.info('finish get query df!') 382 | 383 | df = pd.concat([df, length_df, apriori_df, expose_df, prefix_df, query_df], axis=1) 384 | logger.info('finish combine all df!') 385 | 386 | drop_columns = ['prefix', 'complete_prefix', 'query_prediction', 'title'] 387 | df = df.drop(columns=drop_columns) 388 | 389 | # label encoder 390 | label_encoder = LabelEncoder() 391 | df['tag'] = label_encoder.fit_transform(df['tag']) 392 | logger.info('finish label encoder tag!') 393 | 394 | train_data = df[:train_df_length] 395 | train_data["label"] = train_data["label"].apply(int) 396 | 397 | validate_data = df[train_df_length:train_df_length + validate_df_length] 398 | validate_data["label"] = validate_data["label"].apply(int) 399 | 400 | test_data = df[train_df_length + validate_df_length:] 401 | test_data = test_data.drop(columns=["label"]) 402 | 403 | train_data_name = os.path.join(ETL_DATA_PATH, "train_stat.csv") 404 | validate_data_name = os.path.join(ETL_DATA_PATH, "validate_stat.csv") 405 | test_data_name = os.path.join(ETL_DATA_PATH, "test_stat.csv") 406 | 407 | train_data.to_csv(train_data_name, index=False) 408 | validate_data.to_csv(validate_data_name, index=False) 409 | test_data.to_csv(test_data_name, index=False) 410 | 411 | 412 | if __name__ == "__main__": 413 | t0 = time.time() 414 | 
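# NOTE: a hedged, toy illustration of the smoothed click ratio computed in
# _get_apriori_df above; clicks are divided by (impressions + 5) so that rare
# keys are pulled toward zero instead of getting a noisy 0-or-1 ratio. The toy
# frame and column names are illustrative only.
#
#   toy = pd.DataFrame({"tag": ["app", "app", "web"], "label": [1, 0, 1]})
#   stats = toy.groupby("tag", as_index=False)["label"].agg(
#       {"tag_click": "sum", "tag_count": "count"})
#   stats["tag_ctr"] = stats["tag_click"] / (stats["tag_count"] + 5)
#   # app -> 1 / (2 + 5) = 0.1429..., web -> 1 / (1 + 5) = 0.1667...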
processing = Processing() 415 | processing.get_processing() 416 | print(time.time() - t0) 417 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import re 5 | 6 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 7 | ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData") 8 | RESOURCE_PATH = os.path.join('resources') 9 | 10 | 11 | def get_stop_words(): 12 | stop_wrods_name = os.path.join(RESOURCE_PATH, 'stop_words.txt') 13 | _stop_words_list = list() 14 | with open(stop_wrods_name, encoding='utf-8') as f: 15 | for line in f: 16 | _stop_words_list.append(line.strip()) 17 | 18 | _stop_words_set = set(_stop_words_list) 19 | return _stop_words_set 20 | 21 | 22 | stop_words_set = get_stop_words() 23 | 24 | 25 | def char_cleaner(char): 26 | if not isinstance(char, str): 27 | char = "null" 28 | pattern = re.compile("[^0-9a-zA-Z\u4E00-\u9FA5 ]") 29 | char = re.sub(pattern, "", char) 30 | char = char.lower() 31 | return char 32 | 33 | 34 | def char_list_cheaner(char_list): 35 | new_char_list = list() 36 | for char in char_list: 37 | if len(char) == 0: 38 | continue 39 | if char in stop_words_set: 40 | continue 41 | new_char_list.append(char) 42 | 43 | return new_char_list 44 | -------------------------------------------------------------------------------- /w2v.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import json 4 | import logging 5 | import os 6 | import time 7 | 8 | import jieba 9 | from gensim.models import Word2Vec 10 | 11 | from logconfig import config_logging 12 | from utils import char_cleaner, char_list_cheaner 13 | 14 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 15 | RawData = os.path.join(BASE_PATH, "RawData") 16 | 17 | config_logging() 18 | logger = logging.getLogger('w2v') 19 | 20 | 21 | def get_sentence(name): 22 | if isinstance(name, list): 23 | name_list = name 24 | else: 25 | name_list = [name] 26 | 27 | for name in name_list: 28 | name = "oppo_round1_{fname}.txt".format(fname=name) 29 | file_path = os.path.join(RawData, name) 30 | if not os.path.exists(file_path): 31 | raise FileNotFoundError("{} Not Found!".format(file_path)) 32 | 33 | with open(file_path, "r", encoding="utf-8") as f: 34 | line = f.readline() 35 | 36 | while line: 37 | line_arr = line.split("\t") 38 | 39 | query_prediction = line_arr[1] 40 | try: 41 | sentences = json.loads(query_prediction) 42 | except json.JSONDecodeError: 43 | sentences = json.loads("{}") 44 | 45 | for sentence in sentences: 46 | yield sentence 47 | 48 | title = line_arr[2] 49 | yield title 50 | 51 | line = f.readline() 52 | 53 | 54 | class MySentence(object): 55 | def __init__(self, fname): 56 | self.fname = fname 57 | 58 | def __iter__(self): 59 | for sentence in get_sentence(self.fname): 60 | sentence = char_cleaner(sentence) 61 | seg_list = jieba.lcut(sentence) 62 | seg_list = char_list_cheaner(seg_list) 63 | 64 | if seg_list: 65 | yield seg_list 66 | 67 | 68 | def build_model(fname, size): 69 | sentences = MySentence(fname) 70 | model_name = "w2v_{}.bin".format(size) 71 | model_path = os.path.join("resources", model_name) 72 | my_model = Word2Vec(sentences, size=size, window=5, sg=1, hs=1, min_count=5, workers=10) 73 | my_model.wv.save_word2vec_format(model_path, binary=True) 74 | 75 | 76 | if __name__ == "__main__": 77 | 
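# NOTE: a hedged sketch of how the saved vectors can be queried later; this is
# the same load call w2v_engineering.py uses. The sample token is illustrative,
# and words seen fewer than min_count=5 times are absent and raise KeyError.
#
#   from gensim.models.keyedvectors import KeyedVectors
#   kv = KeyedVectors.load_word2vec_format("resources/w2v_100.bin",
#                                           binary=True, unicode_errors="ignore")
#   vector = kv["手机"]                    # one 100-dimensional word vector
#   print(kv.most_similar("手机", topn=5))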
t0 = time.time() 78 | build_model(fname=['train', 'vali', 'test'], size=100) 79 | print(time.time() - t0) 80 | -------------------------------------------------------------------------------- /w2v_engineering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import gc 4 | import json 5 | import logging 6 | import os 7 | import time 8 | import warnings 9 | from operator import itemgetter 10 | 11 | import jieba 12 | import numpy as np 13 | import pandas as pd 14 | from gensim import matutils 15 | from gensim.models.keyedvectors import KeyedVectors 16 | from sklearn.cluster import MiniBatchKMeans 17 | from sklearn.decomposition import PCA 18 | from tqdm import tqdm 19 | 20 | from logconfig import config_logging 21 | from utils import char_cleaner, char_list_cheaner 22 | from w2v import build_model 23 | 24 | config_logging() 25 | logger = logging.getLogger('w2v_features') 26 | 27 | warnings.filterwarnings('ignore') 28 | 29 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 30 | RAW_DATA_PATH = os.path.join(BASE_PATH, "RawData") 31 | ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData") 32 | TEMP_DATA_PATH = os.path.join(BASE_PATH, "TempData") 33 | 34 | 35 | class PreProcessing(object): 36 | def __init__(self, size, w2v_model): 37 | self.size = size 38 | self.w2v_model = w2v_model 39 | 40 | def to_csv(self, df, col): 41 | file_name = '{col}_w2v.csv'.format(col=col) 42 | file_path = os.path.join(TEMP_DATA_PATH, file_name) 43 | if os.path.exists(file_path): 44 | os.remove(file_path) 45 | 46 | columns = ['w2v_{}'.format(i) for i in range(self.size)] 47 | 48 | with open(file_path, 'a', encoding='utf-8') as f: 49 | # write columns 50 | f.write(','.join(columns) + '\n') 51 | 52 | for idx, item in tqdm(df[col].items()): 53 | item = char_cleaner(item) 54 | if item == 'null': 55 | item_list = [''] * self.size 56 | elif not item: 57 | item_list = [''] * self.size 58 | else: 59 | seg_cut = jieba.lcut(str(item)) 60 | seg_cut = char_list_cheaner(seg_cut) 61 | 62 | w2v_array = list() 63 | for word in seg_cut: 64 | try: 65 | similar_list = self.w2v_model[word] 66 | w2v_array.append(similar_list) 67 | except KeyError: 68 | pass 69 | 70 | if not w2v_array: 71 | item_list = [''] * self.size 72 | else: 73 | item_list = matutils.unitvec(np.array(w2v_array).mean(axis=0)) 74 | 75 | f.write(','.join(map(str, item_list)) + '\n') 76 | 77 | 78 | class Procossing(object): 79 | def __init__(self, size, force): 80 | self.size = size 81 | self.force = force 82 | self.w2v_model = self._get_w2v_model() 83 | 84 | def _get_w2v_model(self): 85 | w2v_model_name = "w2v_{}.bin".format(self.size) 86 | w2v_model_path = os.path.join("resources", w2v_model_name) 87 | if not os.path.exists(w2v_model_path): 88 | build_model(fname=['train', 'vali', 'test'], size=self.size) 89 | w2v_model = KeyedVectors.load_word2vec_format(w2v_model_path, binary=True, unicode_errors="ignore") 90 | return w2v_model 91 | 92 | @staticmethod 93 | def _get_data(name): 94 | if name == "test": 95 | columns = ['prefix', 'query_prediction', 'title', 'tag'] 96 | else: 97 | columns = ['prefix', 'query_prediction', 'title', 'tag', 'label'] 98 | 99 | data_name = os.path.join(RAW_DATA_PATH, "oppo_round1_{}.txt".format(name)) 100 | df = pd.read_csv(data_name, names=columns, sep="\t", header=None, encoding="utf-8") 101 | 102 | return df 103 | 104 | def _get_jieba_array(self, words): 105 | words = char_cleaner(words) 106 | seg_cut = jieba.lcut(words) 107 | seg_cut = 
char_list_cheaner(seg_cut) 108 | 109 | w2v_array = list() 110 | for word in seg_cut: 111 | try: 112 | similar_list = self.w2v_model[word] 113 | w2v_array.append(similar_list) 114 | except KeyError: 115 | continue 116 | 117 | if not w2v_array: 118 | w2v_array = [None] * self.size 119 | else: 120 | w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0)) 121 | 122 | return w2v_array 123 | 124 | def _get_w2v_similar(self, item): 125 | item_dict = dict() 126 | 127 | query_predict = item["query_prediction"] 128 | title = item['title'] 129 | 130 | if not query_predict: 131 | item_dict["max_similar"] = None 132 | item_dict["mean_similar"] = None 133 | item_dict["weight_similar"] = None 134 | return item_dict 135 | 136 | query_predict = sorted(query_predict.items(), key=itemgetter(1), reverse=True) 137 | query_predict = query_predict[:3] 138 | 139 | similar_list = list() 140 | weight_similar_list = list() 141 | 142 | title_array = self._get_jieba_array(title) 143 | for key, value in query_predict: 144 | query_cut_array = self._get_jieba_array(key) 145 | 146 | try: 147 | w2v_similar = np.dot(query_cut_array, title_array) 148 | except (KeyError, ZeroDivisionError, TypeError): 149 | w2v_similar = np.nan 150 | 151 | similar_list.append(w2v_similar) 152 | weight_w2v_similar = w2v_similar * float(value) 153 | weight_similar_list.append(weight_w2v_similar) 154 | 155 | max_similar = np.nanmax(similar_list) 156 | mean_similar = np.nanmean(similar_list) 157 | weight_similar = np.nansum(weight_similar_list) 158 | 159 | item_dict["max_similar"] = max_similar 160 | item_dict["mean_similar"] = mean_similar 161 | item_dict["weight_similar"] = weight_similar 162 | 163 | return item_dict 164 | 165 | @staticmethod 166 | def _get_help_flag(item): 167 | if np.isnan(item): 168 | return 0 169 | return 1 170 | 171 | def _get_w2v_df(self, df, col): 172 | file_name = '{col}_w2v.csv'.format(col=col) 173 | file_path = os.path.join(TEMP_DATA_PATH, file_name) 174 | 175 | if os.path.exists(file_path) and not self.force: 176 | pass 177 | else: 178 | pre_processing = PreProcessing(self.size, self.w2v_model) 179 | pre_processing.to_csv(df, col) 180 | 181 | w2v_df = pd.read_csv(file_path, header=0) 182 | w2v_df['help_index'] = w2v_df.index 183 | w2v_df['help_flag'] = w2v_df['w2v_0'].apply(self._get_help_flag) 184 | 185 | return w2v_df 186 | 187 | def _get_query_df(self, df): 188 | query_df = pd.DataFrame() 189 | 190 | query_df["item_dict"] = df[['query_prediction', 'title']].apply(self._get_w2v_similar, axis=1) 191 | query_df["max_similar"] = query_df["item_dict"].apply(lambda item: item.get("max_similar")) 192 | query_df["mean_similar"] = query_df["item_dict"].apply(lambda item: item.get("mean_similar")) 193 | query_df["weight_similar"] = query_df["item_dict"].apply(lambda item: item.get("weight_similar")) 194 | query_df = query_df.drop(columns=["item_dict"]) 195 | 196 | return query_df 197 | 198 | @staticmethod 199 | def _get_prefix_df(prefix_w2v_df, title_w2v_df, col_name): 200 | prefix_df = pd.DataFrame() 201 | 202 | remove_columns = ['help_index', 'help_flag'] 203 | 204 | prefix_w2v_df = prefix_w2v_df.copy() 205 | prefix_w2v_df = prefix_w2v_df.drop(columns=remove_columns) 206 | 207 | title_w2v_df = title_w2v_df.copy() 208 | title_w2v_df = title_w2v_df.drop(columns=remove_columns) 209 | 210 | prefix_w2v_list = list() 211 | for idx, prefix in prefix_w2v_df.iterrows(): 212 | if np.isnan(prefix[0]): 213 | prefix_w2v_list.append(None) 214 | continue 215 | 216 | title = title_w2v_df.loc[idx] 217 | if np.isnan(title[0]): 218 | 
prefix_w2v_list.append(None) 219 | continue 220 | 221 | similar = np.dot(prefix, title) 222 | prefix_w2v_list.append(similar) 223 | 224 | prefix_df[col_name] = prefix_w2v_list 225 | return prefix_df 226 | 227 | @staticmethod 228 | def _get_kmeans_dict(df, size=20): 229 | df = df.copy() 230 | df = df[df['help_flag'] == 1] 231 | help_index = df['help_index'].tolist() 232 | 233 | df = df.drop(columns=['help_index', 'help_flag']) 234 | 235 | kmeans = MiniBatchKMeans(n_clusters=size, reassignment_ratio=0.001) 236 | preds = kmeans.fit_predict(df) 237 | 238 | kmeans_dict = dict(zip(help_index, preds)) 239 | return kmeans_dict 240 | 241 | @staticmethod 242 | def _loads(item): 243 | try: 244 | return json.loads(item) 245 | except (json.JSONDecodeError, TypeError): 246 | return json.loads("{}") 247 | 248 | @staticmethod 249 | def _mapping_kmeans(item, mapping_dict): 250 | return mapping_dict.get(item, -1) 251 | 252 | @staticmethod 253 | def _get_ctr_df(df, train_df_length, columns=None): 254 | df = df.copy() 255 | 256 | train_df = df[:train_df_length] 257 | 258 | if columns is None: 259 | columns = ['prefix_kmeans', 'title_kmeans', 'complete_prefix_kmeans'] 260 | 261 | ctr_df = df[columns] 262 | 263 | # click count and ctr 264 | for idx, column in enumerate(columns): 265 | click_column = "{column}_click".format(column=column) 266 | count_column = "{column}_count".format(column=column) 267 | ctr_column = "{column}_ctr".format(column=column) 268 | 269 | agg_dict = {click_column: "sum", count_column: "count"} 270 | column_apriori_df = train_df.groupby(column, as_index=False)["label"].agg(agg_dict) 271 | column_apriori_df[ctr_column] = column_apriori_df[click_column] / (column_apriori_df[count_column] + 5) 272 | ctr_df = pd.merge(ctr_df, column_apriori_df, how='left', on=column) 273 | 274 | ctr_df = ctr_df.drop(columns=columns) 275 | 276 | return ctr_df 277 | 278 | @staticmethod 279 | def _get_pca_df(df, name, n_components=5): 280 | df = df.copy() 281 | 282 | remove_columns = ['help_flag', 'help_index'] 283 | 284 | df_effective = df[df['help_flag'] == 1] 285 | df_invalid = df[df['help_flag'] == 0] 286 | 287 | df_effective = df_effective.drop(columns=remove_columns) 288 | df_invalid = df_invalid.drop(columns=remove_columns) 289 | 290 | pca_columns = ['{}_pca_{}'.format(name, i) for i in range(n_components)] 291 | 292 | pca = PCA(n_components=n_components) 293 | 294 | pca_data = pca.fit_transform(df_effective) 295 | pca_df = pd.DataFrame(pca_data, index=df_effective.index, columns=pca_columns) 296 | none_df = pd.DataFrame(index=df_invalid.index, columns=pca_columns) 297 | 298 | pca_df = pd.concat([pca_df, none_df], axis=0, ignore_index=False, sort=False) 299 | pca_df = pca_df.sort_index() 300 | 301 | return pca_df 302 | 303 | @staticmethod 304 | def _get_complete_prefix(item): 305 | prefix = item['prefix'] 306 | query_prediction = item['query_prediction'] 307 | 308 | if not query_prediction: 309 | return prefix 310 | 311 | predict_word_dict = dict() 312 | prefix = str(prefix) 313 | 314 | for query_item, query_ratio in query_prediction.items(): 315 | query_item_cut = jieba.lcut(query_item) 316 | item_word = "" 317 | for item in query_item_cut: 318 | if prefix not in item_word: 319 | item_word += item 320 | else: 321 | if item_word not in predict_word_dict.keys(): 322 | predict_word_dict[item_word] = 0.0 323 | predict_word_dict[item_word] += float(query_ratio) 324 | 325 | if not predict_word_dict: 326 | return prefix 327 | 328 | predict_word_dict = sorted(predict_word_dict.items(), key=itemgetter(1), 
reverse=True) 329 | complete_prefix = predict_word_dict[0][0] 330 | return complete_prefix 331 | 332 | def get_processing(self): 333 | train_df = self._get_data(name="train") 334 | validate_df = self._get_data(name="vali") 335 | test_df = self._get_data(name="test") 336 | logger.info('finish load data!') 337 | 338 | train_df_length = train_df.shape[0] 339 | validate_df_length = validate_df.shape[0] 340 | df = pd.concat([train_df, validate_df, test_df], axis=0, ignore_index=True, sort=False) 341 | 342 | del train_df, validate_df, test_df 343 | gc.collect() 344 | 345 | # make query prediction to json 346 | df["query_prediction"] = df["query_prediction"].apply(self._loads) 347 | 348 | # complete prefix 349 | df['complete_prefix'] = df[['prefix', 'query_prediction']].apply(self._get_complete_prefix, axis=1) 350 | 351 | # clean prefix and title 352 | df["prefix"] = df["prefix"].apply(char_cleaner) 353 | df["title"] = df["title"].apply(char_cleaner) 354 | df["complete_prefix"] = df["complete_prefix"].apply(char_cleaner) 355 | 356 | w2v_df = df[['label']] 357 | 358 | prefix_w2v_df = self._get_w2v_df(df, col='prefix') 359 | title_w2v_df = self._get_w2v_df(df, col='title') 360 | complete_prefix_w2v_df = self._get_w2v_df(df, col='complete_prefix') 361 | logger.info('finish get prefix and title w2v df!') 362 | 363 | prefix_pca_df = self._get_pca_df(prefix_w2v_df, 'prefix') 364 | title_pca_df = self._get_pca_df(title_w2v_df, 'title') 365 | complete_prefix_pca_df = self._get_pca_df(complete_prefix_w2v_df, 'complete_prefix') 366 | w2v_df = pd.concat([w2v_df, prefix_pca_df, title_pca_df, complete_prefix_pca_df], axis=1) 367 | 368 | del prefix_pca_df, title_pca_df, complete_prefix_pca_df 369 | gc.collect() 370 | 371 | prefix_kmeans_dict = self._get_kmeans_dict(prefix_w2v_df) 372 | title_kmeans_dict = self._get_kmeans_dict(title_w2v_df) 373 | complete_prefix_kmeans_dict = self._get_kmeans_dict(complete_prefix_w2v_df) 374 | logger.info('finish make kmeans!') 375 | 376 | w2v_df['prefix_kmeans'] = prefix_w2v_df['help_index'].apply(self._mapping_kmeans, args=(prefix_kmeans_dict,)) 377 | w2v_df['title_kmeans'] = title_w2v_df['help_index'].apply(self._mapping_kmeans, args=(title_kmeans_dict,)) 378 | w2v_df['complete_prefix_kmeans'] = complete_prefix_w2v_df['help_index'].apply( 379 | self._mapping_kmeans, args=(complete_prefix_kmeans_dict,)) 380 | 381 | ctr_df = self._get_ctr_df(w2v_df, train_df_length) 382 | w2v_df = pd.concat([w2v_df, ctr_df], axis=1) 383 | 384 | del ctr_df, prefix_kmeans_dict, title_kmeans_dict, complete_prefix_kmeans_dict 385 | gc.collect() 386 | 387 | prefix_df = self._get_prefix_df(prefix_w2v_df, title_w2v_df, 'prefix_w2v') 388 | complete_prefix_df = self._get_prefix_df(complete_prefix_w2v_df, title_w2v_df, 'complete_prefix_w2v') 389 | logger.info('finish get prefix df!') 390 | w2v_df = pd.concat([w2v_df, prefix_df, complete_prefix_df], axis=1) 391 | 392 | del prefix_df, complete_prefix_df, prefix_w2v_df, title_w2v_df 393 | gc.collect() 394 | 395 | query_df = self._get_query_df(df) 396 | logger.info('finish get query_df!') 397 | w2v_df = pd.concat([w2v_df, query_df], axis=1) 398 | 399 | w2v_df = w2v_df.drop(columns=['label']) 400 | 401 | train_data = w2v_df[:train_df_length] 402 | validate_data = w2v_df[train_df_length:train_df_length + validate_df_length] 403 | test_data = w2v_df[train_df_length + validate_df_length:] 404 | 405 | train_data_name = os.path.join(ETL_DATA_PATH, "train_w2v.csv") 406 | validate_data_name = os.path.join(ETL_DATA_PATH, "validate_w2v.csv") 407 | test_data_name = 
os.path.join(ETL_DATA_PATH, "test_w2v.csv") 408 | 409 | train_data.to_csv(train_data_name, index=False) 410 | validate_data.to_csv(validate_data_name, index=False) 411 | test_data.to_csv(test_data_name, index=False) 412 | 413 | 414 | if __name__ == "__main__": 415 | t0 = time.time() 416 | processing = Procossing(size=100, force=False) 417 | processing.get_processing() 418 | print(time.time() - t0) 419 | --------------------------------------------------------------------------------
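Note on the similarity features above: each text field is segmented, its word vectors are averaged, and the mean is unit-normalized with gensim's matutils.unitvec, so the later np.dot of two such vectors is a cosine similarity in [-1, 1]. Below is a minimal numpy-only sketch of that scheme; the helper names (unitvec, mean_unit_vector) and the toy vectors are illustrative stand-ins for w2v_model[word], not part of the repo.

# Illustrative sketch: why unit-normalizing the mean word vector lets
# np.dot act as a cosine similarity (same idea as _get_jieba_array /
# _get_w2v_similar above). Toy 3-d vectors stand in for real embeddings.
import numpy as np

def unitvec(v):
    # scale a vector to unit L2 norm (mirrors gensim.matutils.unitvec)
    norm = np.linalg.norm(v)
    return v / norm if norm > 0 else v

def mean_unit_vector(word_vectors):
    # average the per-word vectors, then normalize the mean
    return unitvec(np.array(word_vectors).mean(axis=0))

title_vecs = [np.array([1.0, 0.0, 1.0]), np.array([0.0, 1.0, 1.0])]
query_vecs = [np.array([1.0, 1.0, 0.0])]

title_arr = mean_unit_vector(title_vecs)
query_arr = mean_unit_vector(query_vecs)

# dot product of two unit vectors == cosine similarity
print(np.dot(query_arr, title_arr))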