├── .gitignore ├── README.md ├── embed_engineering.py ├── lgb_models.py ├── logconfig.py ├── main.py ├── optimize.py ├── resources └── stop_words.txt ├── stacking.py ├── stat_engineering.py ├── utils.py ├── w2v.py └── w2v_engineering.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Flask stuff: 56 | instance/ 57 | .webassets-cache 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # Jupyter Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # SageMath parsed files 78 | *.sage.py 79 | 80 | # Environments 81 | .env 82 | .venv 83 | env/ 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | .spyproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # jetbrains IDEA Project files 95 | .idea/ 96 | 97 | # VS Code 98 | .vscode/ 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # reference 107 | reference/ 108 | 109 | # data 110 | *.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TianChi_OGeek 2 | 3 | - 数据问题:train部分数据第1815102行漏了个引号,手动删除或不上引号即可 4 | 5 | 6 | -------------------------------------------------------------------------------- /embed_engineering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from logconfig import config_logging 5 | from utils import char_cleaner, char_list_cheaner 6 | import logging 7 | import warnings 8 | 9 | config_logging() 10 | logger = logging.getLogger('embed_features') 11 | 12 | warnings.filterwarnings('ignore') 13 | 14 | -------------------------------------------------------------------------------- /lgb_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import os 6 | import warnings 7 | 8 | import lightgbm as lgb 9 | import numpy as np 10 | import pandas as pd 11 | from scipy import sparse 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.metrics import f1_score 14 | from sklearn.metrics import log_loss 15 | from sklearn.model_selection import StratifiedKFold 16 | from sklearn.preprocessing import MinMaxScaler 17 | from sklearn.preprocessing import OneHotEncoder 18 
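# NOTE: the README above flags that one line of the raw train file is missing a
# quotation mark. A minimal, hedged workaround is to read the raw TSV with quote
# handling disabled so a stray or missing quote cannot break parsing. The helper
# below is only an illustrative sketch; the path and column list are assumptions
# that mirror the _get_data() loaders elsewhere in this repo.
def _read_raw_oppo_txt(path, has_label=True):
    """Read one raw OPPO round-1 file without quote interpretation."""
    import csv

    import pandas as pd

    columns = ['prefix', 'query_prediction', 'title', 'tag']
    if has_label:
        columns = columns + ['label']
    return pd.read_csv(path, sep="\t", names=columns, header=None,
                       encoding="utf-8", quoting=csv.QUOTE_NONE)
# example (hypothetical path): _read_raw_oppo_txt("data/RawData/oppo_round1_train.txt")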
| 19 | from logconfig import config_logging 20 | 21 | warnings.filterwarnings('ignore') 22 | 23 | config_logging() 24 | logger = logging.getLogger('models') 25 | 26 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 27 | ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData") 28 | 29 | 30 | def get_data(name): 31 | etl_path = os.path.join("data", "EtlData") 32 | 33 | if name == "train": 34 | file_name = "train.csv" 35 | elif name == "validate": 36 | file_name = "validate.csv" 37 | elif name == "test": 38 | file_name = "test.csv" 39 | else: 40 | raise FileNotFoundError() 41 | 42 | data_name = os.path.join(etl_path, file_name) 43 | 44 | df = pd.read_csv(data_name, header=0) 45 | 46 | one_hot_columns = ['tag', 'prefix_kmeans', 'title_kmeans', 'complete_prefix_kmeans'] 47 | df = pd.get_dummies(df, columns=one_hot_columns) 48 | 49 | return df 50 | 51 | 52 | def combine(): 53 | names = ['train', 'test', 'validate'] 54 | for name in names: 55 | stat_name = os.path.join(ETL_DATA_PATH, '{}_stat.csv'.format(name)) 56 | stat_df = pd.read_csv(stat_name) 57 | 58 | w2v_name = os.path.join(ETL_DATA_PATH, '{}_w2v.csv'.format(name)) 59 | w2v_df = pd.read_csv(w2v_name) 60 | 61 | df = pd.concat([stat_df, w2v_df], axis=1) 62 | 63 | df_name = os.path.join(ETL_DATA_PATH, '{}.csv'.format(name)) 64 | df.to_csv(df_name, index=False) 65 | 66 | 67 | def lgb_model(train_data, validate_data, test_data, parms, threshold, n_folds=5): 68 | columns = train_data.columns 69 | remove_columns = ["label"] 70 | features_columns = [column for column in columns if column not in remove_columns] 71 | 72 | train_data = pd.concat([train_data, validate_data], axis=0, ignore_index=True, sort=False) 73 | train_features = train_data[features_columns] 74 | train_labels = train_data["label"] 75 | 76 | validate_labels = validate_data["label"] 77 | 78 | test_data = pd.concat([validate_data, test_data], axis=0, ignore_index=True, sort=False) 79 | validate_data_length = validate_data.shape[0] 80 | test_features = test_data[features_columns] 81 | 82 | kfolder = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2018) 83 | kfold = kfolder.split(train_features, train_labels) 84 | 85 | preds_list = list() 86 | for train_index, test_index in kfold: 87 | k_x_train = train_features.loc[train_index] 88 | k_y_train = train_labels.loc[train_index] 89 | k_x_test = train_features.loc[test_index] 90 | k_y_test = train_labels.loc[test_index] 91 | 92 | gbm = lgb.LGBMClassifier(**parms) 93 | gbm = gbm.fit(k_x_train, k_y_train, 94 | eval_metric="logloss", 95 | eval_set=[(k_x_train, k_y_train), 96 | (k_x_test, k_y_test)], 97 | eval_names=["train", "valid"], 98 | early_stopping_rounds=100, 99 | verbose=True) 100 | 101 | preds = gbm.predict_proba(test_features, num_iteration=gbm.best_iteration_)[:, 1] 102 | 103 | preds_list.append(preds) 104 | 105 | length = len(preds_list) 106 | preds_columns = ["preds_{id}".format(id=i) for i in range(length)] 107 | 108 | preds_df = pd.DataFrame(data=preds_list) 109 | preds_df = preds_df.T 110 | preds_df.columns = preds_columns 111 | preds_df["mean"] = preds_df.mean(axis=1) 112 | 113 | preds_df["mean"] = preds_df["mean"].apply(lambda item: 1 if item >= threshold else 0) 114 | 115 | validate_preds = preds_df[:validate_data_length] 116 | test_preds = preds_df[validate_data_length:] 117 | 118 | logger.info('the avg of test is {}'.format(np.mean(test_preds["mean"]))) 119 | 120 | f_score = f1_score(validate_labels, validate_preds["mean"]) 121 | logger.info('validate f_score is {}'.format(f_score)) 122 | 
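# NOTE: a hedged sketch, not used by the code in this function: since the final
# metric is F1 and `threshold` is fixed by the caller (0.4 in model_main below),
# one could instead sweep cut-offs on the fold-averaged validate probabilities
# and keep the best one. Names such as raw_validate_probs are illustrative only.
def _best_f1_threshold(y_true, probs, lo=0.2, hi=0.6, step=0.01):
    import numpy as np
    from sklearn.metrics import f1_score

    candidates = np.arange(lo, hi, step)
    scores = [f1_score(y_true, (probs >= t).astype(int)) for t in candidates]
    return candidates[int(np.argmax(scores))]
# usage sketch: best_t = _best_f1_threshold(validate_labels, raw_validate_probs)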
logger.info('validate the avg of validate is {}'.format(np.mean(validate_preds["mean"]))) 123 | 124 | predictions = pd.DataFrame({"predicted_score": test_preds["mean"]}) 125 | 126 | predictions.to_csv("predict.csv", index=False, header=False) 127 | 128 | 129 | def lgb_lr_model(train_data, validate_data, test_data, threshold, n_folds=5): 130 | columns = train_data.columns 131 | remove_columns = ["label"] 132 | features_columns = [column for column in columns if column not in remove_columns] 133 | 134 | validate_data_length = validate_data.shape[0] 135 | 136 | train_data = pd.concat([train_data, validate_data], axis=0, ignore_index=True, sort=False) 137 | train_features = train_data[features_columns] 138 | train_labels = train_data["label"] 139 | 140 | validate_labels = validate_data["label"] 141 | 142 | test_data = pd.concat([validate_data, test_data], axis=0, ignore_index=True, sort=False) 143 | test_features = test_data[features_columns] 144 | 145 | gbm = lgb.LGBMClassifier(boosting_type='gbdt', 146 | num_leaves=127, 147 | reg_alpha=3, 148 | reg_lambda=5, 149 | max_depth=-1, 150 | n_estimators=80, 151 | objective='binary', 152 | subsample=0.8, 153 | colsample_bytree=0.8, 154 | subsample_freq=1, 155 | min_child_weight=0.1, 156 | learning_rate=0.1, 157 | random_state=2018, 158 | n_jobs=-1, 159 | min_child_samples=200) 160 | 161 | gbm.fit(train_features, train_labels, eval_metric='binary_logloss', early_stopping_rounds=100) 162 | 163 | lgb_train_leaf = gbm.predict(train_features, pred_leaf=True) 164 | lgb_test_leaf = gbm.predict(test_features, pred_leaf=True) 165 | 166 | leaf_columns = ['leaf_{}'.format(i) for i in range(lgb_train_leaf.shape[1])] 167 | 168 | train_leaf_df = pd.DataFrame(lgb_train_leaf, columns=leaf_columns) 169 | test_leaf_df = pd.DataFrame(lgb_test_leaf, columns=leaf_columns) 170 | 171 | train_features = pd.concat([train_features, train_leaf_df], axis=1) 172 | test_features = pd.concat([test_features, test_leaf_df], axis=1) 173 | 174 | df_features = pd.concat([train_features, test_features], ignore_index=True, sort=False, axis=0) 175 | cate_columns = ['tag', 'prefix_kmeans', 'title_kmeans', 'complete_kmeans'] 176 | cate_columns.extend(leaf_columns) 177 | 178 | df_columns = df_features.columns 179 | num_columns = [column for column in df_columns if column not in cate_columns] 180 | 181 | train_csr = sparse.csr_matrix(train_features.shape[0], 0) 182 | test_csr = sparse.csr_matrix(test_features.shape[0], 0) 183 | 184 | # cate columns one-hot 185 | one_hot_encoder = OneHotEncoder() 186 | for col in cate_columns: 187 | one_hot_encoder.fit(df_features[col].values.reshape(-1, 1)) 188 | 189 | train_encoder = one_hot_encoder.transform(train_features[col].values.reshape(-1, 1)) 190 | train_csr = sparse.hstack((train_csr, train_encoder), 'csr', 'bool') 191 | 192 | test_encoder = one_hot_encoder.transform(test_features[col].values.reshape(-1, 1)) 193 | test_csr = sparse.hstack((test_csr, test_encoder), 'csr', 'bool') 194 | 195 | # num columns min-max scaler 196 | min_max_scaler = MinMaxScaler() 197 | for col in num_columns: 198 | df_features[col].fillna(0, inplace=True) 199 | train_features[col].fillna(0, inplace=True) 200 | test_features[col].fillna(0, inplace=True) 201 | 202 | min_max_scaler.fit(np.array(df_features[col].values.tolist()).reshape(-1, 1)) 203 | 204 | train_features[col] = min_max_scaler.transform(np.array(train_features[col].values.tolist()).reshape(-1, 1)) 205 | test_features[col] = min_max_scaler.transform(np.array(test_features[col].values.tolist()).reshape(-1, 1)) 
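# NOTE: a minimal, self-contained sketch of the GBDT+LR idea this function
# implements: every sample is described by the leaf it lands in for each tree,
# the leaf indices are one-hot encoded, and a logistic regression is fit on the
# resulting sparse matrix. Synthetic data and names below are illustrative only.
def _gbdt_lr_toy_example():
    import lightgbm as lgb
    import numpy as np
    from scipy import sparse as sp
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import OneHotEncoder

    rng = np.random.RandomState(2018)
    x = rng.rand(500, 10)
    y = (x[:, 0] + x[:, 1] > 1).astype(int)

    booster = lgb.LGBMClassifier(n_estimators=20, num_leaves=15).fit(x, y)
    leaves = booster.predict(x, pred_leaf=True)    # shape: (n_samples, n_trees)

    encoder = OneHotEncoder(handle_unknown='ignore')
    leaf_onehot = encoder.fit_transform(leaves)    # sparse one-hot leaf features
    features = sp.hstack([leaf_onehot, sp.csr_matrix(x)], format='csr')

    return LogisticRegression(max_iter=200).fit(features, y)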
206 | 207 | # combine num features 208 | train_csr = sparse.hstack(sparse.csr_matrix(train_features[num_columns], train_csr), 'csr').astype('float32') 209 | test_csr = sparse.hstack(sparse.csr_matrix(test_features[num_columns], test_csr), 'csr').astype('float32') 210 | 211 | lr_clf = LogisticRegression(penalty='l2', solver='sag', C=0.1, n_jobs=-1) 212 | 213 | kfolder = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2018) 214 | kfold = kfolder.split(train_csr, train_labels) 215 | 216 | preds_list = list() 217 | for train_index, test_index in kfold: 218 | k_x_train = train_csr.loc[train_index] 219 | k_y_train = train_labels.loc[train_index] 220 | k_x_test = train_csr.loc[test_index] 221 | k_y_test = train_labels.loc[test_index] 222 | 223 | lr_clf.fit(k_x_train, k_y_train) 224 | 225 | eval_pred = lr_clf.predict_proba(k_x_test)[:, 1] 226 | eval_loss = log_loss(k_y_test, eval_pred) 227 | logger.info('eval log loss: {}'.format(eval_loss)) 228 | 229 | test_preds = lr_clf.predict_proba(test_csr)[:, 1] 230 | preds_list.append(test_preds) 231 | 232 | length = len(preds_list) 233 | preds_columns = ["preds_{id}".format(id=i) for i in range(length)] 234 | 235 | preds_df = pd.DataFrame(data=preds_list) 236 | preds_df = preds_df.T 237 | preds_df.columns = preds_columns 238 | preds_df["mean"] = preds_df.mean(axis=1) 239 | 240 | preds_df["mean"] = preds_df["mean"].apply(lambda item: 1 if item >= threshold else 0) 241 | 242 | validate_preds = preds_df[:validate_data_length] 243 | test_preds = preds_df[validate_data_length:] 244 | 245 | logger.info('the avg of test is {}'.format(np.mean(test_preds["mean"]))) 246 | 247 | f_score = f1_score(validate_labels, validate_preds["mean"]) 248 | logger.info('validate f_score is {}'.format(f_score)) 249 | logger.info('validate the avg of validate is {}'.format(np.mean(validate_preds["mean"]))) 250 | 251 | predictions = pd.DataFrame({"predicted_score": test_preds["mean"]}) 252 | 253 | predictions.to_csv("predict.csv", index=False, header=False) 254 | 255 | 256 | def model_main(model='lgb', threshold=0.5): 257 | lgb_parms = { 258 | "boosting_type": "gbdt", 259 | "num_leaves": 127, 260 | "max_depth": -1, 261 | "learning_rate": 0.05, 262 | "n_estimators": 10000, 263 | "max_bin": 425, 264 | "subsample_for_bin": 20000, 265 | "objective": 'binary', 266 | "metric": 'logloss', 267 | "min_split_gain": 0, 268 | "min_child_weight": 0.001, 269 | "min_child_samples": 20, 270 | "subsample": 0.8, 271 | "subsample_freq": 1, 272 | "colsample_bytree": 0.8, 273 | "reg_alpha": 3, 274 | "reg_lambda": 5, 275 | "seed": 2018, 276 | "n_jobs": -1, 277 | "verbose": 1, 278 | "silent": False 279 | } 280 | 281 | train_df = get_data(name="train") 282 | validate_df = get_data(name="validate") 283 | test_df = get_data(name="test") 284 | 285 | if model == 'lgb': 286 | lgb_model(train_df, validate_df, test_df, lgb_parms, threshold=threshold) 287 | elif model == 'lgb_lr': 288 | lgb_lr_model(train_df, validate_df, test_df, threshold=threshold) 289 | else: 290 | raise ValueError() 291 | 292 | 293 | if __name__ == "__main__": 294 | combine() # features combine, ignore it if features not change 295 | model_main(model='lgb', threshold=0.4) 296 | -------------------------------------------------------------------------------- /logconfig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | from logging.config import dictConfig 6 | 7 | 8 | def config_logging(): 9 | logging_config = { 10 | 
'version': 1, 11 | 'formatters': { 12 | 'default': {'format': '%(asctime)s %(name)s : %(levelname)s %(message)s', 13 | 'datefmt': '%Y-%m-%d %H:%M:%S'} 14 | }, 15 | 'handlers': { 16 | 'console': { 17 | 'level': logging.DEBUG, 18 | 'class': 'logging.StreamHandler', 19 | 'formatter': 'default', 20 | }, 21 | 'file': { 22 | 'level': logging.DEBUG, 23 | 'class': 'logging.handlers.RotatingFileHandler', 24 | 'formatter': 'default', 25 | 'filename': 'ogeek.log', 26 | 'maxBytes': 1024 * 1024 * 10, 27 | 'backupCount': 1 28 | } 29 | }, 30 | 'loggers': { 31 | 'test': { 32 | 'level': logging.DEBUG, 33 | 'handlers': ['console', 'file'] 34 | } 35 | }, 36 | 'root': { 37 | 'level': logging.DEBUG, 38 | 'handlers': ['console', 'file'] 39 | }, 40 | 'disable_existing_loggers': False 41 | } 42 | dictConfig(logging_config) 43 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from stat_engineering import Processing as Sp 5 | from w2v_engineering import Procossing as Wp 6 | 7 | if __name__ == "__main__": 8 | # 统计特征 9 | stat_processing = Sp() 10 | stat_processing.get_processing() 11 | 12 | # w2v特征 13 | w2v_processing = Wp(force=False, size=100) 14 | w2v_processing.get_processing() 15 | -------------------------------------------------------------------------------- /optimize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import warnings 6 | from collections import namedtuple 7 | 8 | import lightgbm as lgb 9 | import numpy as np 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.model_selection import RandomizedSearchCV 12 | 13 | from logconfig import config_logging 14 | 15 | warnings.filterwarnings('ignore') 16 | 17 | config_logging() 18 | logger = logging.getLogger('optimize') 19 | 20 | Property = namedtuple('Property', ['min', 'max', 'type']) 21 | 22 | 23 | class Optimize(object): 24 | _num_leaves = None 25 | _learning_rate = None 26 | _n_estimators = None 27 | _min_child_weight = None 28 | _min_child_samples = None 29 | _reg_alpha = None 30 | _reg_lambda = None 31 | _colsample_bytree = None 32 | _subsample = None 33 | 34 | def __init__(self, x_train, y_train, params, grid_params, iter_num=1): 35 | self.x_train = x_train 36 | self.y_train = y_train 37 | self.params = params 38 | self.grid_params = grid_params 39 | self.iter_num = iter_num 40 | 41 | # init property 42 | self.num_leaves = None 43 | self.learning_rate = None 44 | self.n_estimators = None 45 | self.min_child_weight = None 46 | self.min_child_samples = None 47 | self.reg_alpha = None 48 | self.reg_lambda = None 49 | self.colsample_bytree = None 50 | self.subsample = None 51 | 52 | # zip property as a dict 53 | self.property_dict = dict( 54 | num_leaves=self.num_leaves, 55 | learning_rate=self.learning_rate, 56 | n_estimators=self.n_estimators, 57 | min_child_weight=self.min_child_weight, 58 | min_child_samples=self.min_child_samples, 59 | reg_alpha=self.reg_alpha, 60 | reg_lambda=self.reg_lambda, 61 | colsample_bytree=self.colsample_bytree, 62 | subsample=self.subsample 63 | ) 64 | 65 | @property 66 | def num_leaves(self): 67 | return self._num_leaves 68 | 69 | @num_leaves.setter 70 | def num_leaves(self, value=None): 71 | default = [10, 1000, 'int'] 72 | 73 | if value is None: 74 | self._num_leaves = Property._make(default) 75 | 
elif isinstance(value, list): 76 | self._num_leaves = Property._make(value) 77 | elif isinstance(value, dict): 78 | self._num_leaves = Property(**value) 79 | else: 80 | raise ValueError() 81 | 82 | @property 83 | def learning_rate(self): 84 | return self._learning_rate 85 | 86 | @learning_rate.setter 87 | def learning_rate(self, value=None): 88 | default = [0.01, 0.5, 'float'] 89 | 90 | if value is None: 91 | self._learning_rate = Property._make(default) 92 | elif isinstance(value, list): 93 | self._learning_rate = Property._make(value) 94 | elif isinstance(value, dict): 95 | self._learning_rate = Property(**value) 96 | else: 97 | raise ValueError() 98 | 99 | @property 100 | def n_estimators(self): 101 | return self._n_estimators 102 | 103 | @n_estimators.setter 104 | def n_estimators(self, value=None): 105 | default = [500, 20000, 'int'] 106 | 107 | if value is None: 108 | self._n_estimators = Property._make(default) 109 | elif isinstance(value, list): 110 | self._n_estimators = Property._make(value) 111 | elif isinstance(value, dict): 112 | self._n_estimators = Property(**value) 113 | else: 114 | raise ValueError() 115 | 116 | @property 117 | def min_child_weight(self): 118 | return self._min_child_weight 119 | 120 | @min_child_weight.setter 121 | def min_child_weight(self, value=None): 122 | default = [0.1, 10, 'float'] 123 | 124 | if value is None: 125 | self._min_child_weight = Property._make(default) 126 | elif isinstance(value, list): 127 | self._min_child_weight = Property._make(value) 128 | elif isinstance(value, dict): 129 | self._min_child_weight = Property(**value) 130 | else: 131 | raise ValueError() 132 | 133 | @property 134 | def min_child_samples(self): 135 | return self._min_child_samples 136 | 137 | @min_child_samples.setter 138 | def min_child_samples(self, value=None): 139 | default = [50, 1000, 'int'] 140 | 141 | if value is None: 142 | self._min_child_samples = Property._make(default) 143 | elif isinstance(value, list): 144 | self._min_child_samples = Property._make(value) 145 | elif isinstance(value, dict): 146 | self._min_child_samples = Property(**value) 147 | else: 148 | raise ValueError() 149 | 150 | @property 151 | def reg_alpha(self): 152 | return self._reg_alpha 153 | 154 | @reg_alpha.setter 155 | def reg_alpha(self, value=None): 156 | default = [0, 10, 'float'] 157 | 158 | if value is None: 159 | self._reg_alpha = Property._make(default) 160 | elif isinstance(value, list): 161 | self._reg_alpha = Property._make(value) 162 | elif isinstance(value, dict): 163 | self._reg_alpha = Property(**value) 164 | else: 165 | raise ValueError() 166 | 167 | @property 168 | def reg_lambda(self): 169 | return self._reg_lambda 170 | 171 | @reg_lambda.setter 172 | def reg_lambda(self, value=None): 173 | default = [0, 10, 'float'] 174 | 175 | if value is None: 176 | self._reg_lambda = Property._make(default) 177 | elif isinstance(value, list): 178 | self._reg_lambda = Property._make(value) 179 | elif isinstance(value, dict): 180 | self._reg_lambda = Property(**value) 181 | else: 182 | raise ValueError() 183 | 184 | @property 185 | def colsample_bytree(self): 186 | return self._colsample_bytree 187 | 188 | @colsample_bytree.setter 189 | def colsample_bytree(self, value=None): 190 | default = [0.5, 1, 'float'] 191 | 192 | if value is None: 193 | self._colsample_bytree = Property._make(default) 194 | elif isinstance(value, list): 195 | self._colsample_bytree = Property._make(value) 196 | elif isinstance(value, dict): 197 | self._colsample_bytree = Property(**value) 198 | else: 199 
| raise ValueError() 200 | 201 | @property 202 | def subsample(self): 203 | return self._subsample 204 | 205 | @subsample.setter 206 | def subsample(self, value=None): 207 | default = [0.5, 1, 'float'] 208 | 209 | if value is None: 210 | self._subsample = Property._make(default) 211 | elif isinstance(value, list): 212 | self._subsample = Property._make(value) 213 | elif isinstance(value, dict): 214 | self._subsample = Property(**value) 215 | else: 216 | raise ValueError() 217 | 218 | @staticmethod 219 | def _get_values_list(low, high, dtype, size): 220 | linspace = np.linspace(low, high, size, dtype=dtype) 221 | 222 | if dtype == 'float': 223 | linspace = list(map(lambda item: round(item, 4), linspace)) 224 | 225 | return linspace 226 | 227 | def _get_grid_params(self, values, key, best_value, size): 228 | max_value = max(values) 229 | min_value = min(values) 230 | 231 | property_item = self.property_dict[key] 232 | 233 | if best_value == max_value: 234 | if best_value == property_item.max: 235 | return [best_value] 236 | low = best_value 237 | high = property_item.max 238 | linspace = self._get_values_list(low, high, property_item.type, size) 239 | elif best_value == min_value: 240 | if best_value == property_item.min: 241 | return [best_value] 242 | low = min_value 243 | high = best_value 244 | linspace = self._get_values_list(low, high, property_item.type, size) 245 | else: 246 | best_index = values.index(best_value) 247 | low = values[best_index - 1] 248 | high = values[best_index + 1] 249 | linspace = self._get_values_list(low, high, property_item.type, size) 250 | 251 | linspace = list(set(linspace)) 252 | return linspace 253 | 254 | def _update_params(self, best_params): 255 | for key, value in best_params.items(): 256 | self.params[key] = value 257 | 258 | def _update_grid_params(self, best_params, size=4): 259 | for key, value in best_params.items(): 260 | values = self.grid_params[key] 261 | 262 | values_list = self._get_grid_params(values, key, value, size) 263 | self.grid_params[key] = values_list 264 | 265 | def _optimize(self, params, grid_params): 266 | clf = lgb.LGBMClassifier(**params) 267 | grid_clf = GridSearchCV(clf, grid_params, cv=5, scoring='neg_log_loss', n_jobs=1, verbose=100) 268 | grid_clf.fit(self.x_train, self.y_train) 269 | return grid_clf 270 | 271 | def optimize(self): 272 | best_params = None 273 | 274 | while self.iter_num > 0: 275 | grid_clf = self._optimize(self.params, self.grid_params) 276 | 277 | best_params = grid_clf.best_params_ 278 | best_score = grid_clf.best_score_ 279 | 280 | logger.info('iter_num: {} best_params: {}'.format(self.iter_num, best_params)) 281 | logger.info('iter_num: {} best_score: {}'.format(self.iter_num, best_score)) 282 | 283 | self._update_params(best_params) 284 | self._update_grid_params(best_params) 285 | 286 | self.iter_num -= 1 287 | 288 | return best_params 289 | 290 | 291 | class SimpleOptimize(object): 292 | def __init__(self, x_train, y_train, params, opt_params): 293 | self.x_train = x_train 294 | self.y_train = y_train 295 | self.params = params 296 | self.opt_params = opt_params 297 | 298 | def _update_params(self, best_params): 299 | for key, value in best_params.items(): 300 | self.params[key] = value 301 | 302 | def optimize(self, grid=True, random=False): 303 | gbm = lgb.LGBMClassifier(**self.params) 304 | if grid: 305 | opt_gbm = GridSearchCV(gbm, self.opt_params, cv=5, scoring='neg_log_loss', refit="binary_logloss", 306 | n_jobs=1, verbose=100) 307 | elif random: 308 | opt_gbm = RandomizedSearchCV(gbm, 
self.opt_params, cv=5, scoring='neg_log_loss', refit="binary_logloss", 309 | n_jobs=1, verbose=100) 310 | else: 311 | raise ValueError() 312 | 313 | opt_gbm.fit(self.x_train, self.y_train) 314 | best_params = opt_gbm.best_params_ 315 | best_score = opt_gbm.best_score_ 316 | 317 | logger.info('best_params: {}'.format(best_params)) 318 | logger.info('best_score: {}'.format(best_score)) 319 | 320 | self._update_params(best_params) 321 | 322 | logger.info('update best params: {}'.format(self.params)) 323 | return self.params 324 | -------------------------------------------------------------------------------- /resources/stop_words.txt: -------------------------------------------------------------------------------- 1 | 一 2 | 一些 3 | 一何 4 | 一切 5 | 一则 6 | 一方面 7 | 一旦 8 | 一来 9 | 一样 10 | 一般 11 | 一转眼 12 | 七 13 | 万一 14 | 三 15 | 上 16 | 上下 17 | 下 18 | 不 19 | 不仅 20 | 不但 21 | 不光 22 | 不单 23 | 不只 24 | 不外乎 25 | 不如 26 | 不妨 27 | 不尽 28 | 不尽然 29 | 不得 30 | 不怕 31 | 不惟 32 | 不成 33 | 不拘 34 | 不料 35 | 不是 36 | 不比 37 | 不然 38 | 不特 39 | 不独 40 | 不管 41 | 不至于 42 | 不若 43 | 不论 44 | 不过 45 | 不问 46 | 与 47 | 与其 48 | 与其说 49 | 与否 50 | 与此同时 51 | 且 52 | 且不说 53 | 且说 54 | 两者 55 | 个 56 | 个别 57 | 中 58 | 临 59 | 为 60 | 为了 61 | 为什么 62 | 为何 63 | 为止 64 | 为此 65 | 为着 66 | 乃 67 | 乃至 68 | 乃至于 69 | 么 70 | 之 71 | 之一 72 | 之所以 73 | 之类 74 | 乌乎 75 | 乎 76 | 乘 77 | 九 78 | 也 79 | 也好 80 | 也罢 81 | 了 82 | 二 83 | 二来 84 | 于 85 | 于是 86 | 于是乎 87 | 云云 88 | 云尔 89 | 五 90 | 些 91 | 亦 92 | 人 93 | 人们 94 | 人家 95 | 什 96 | 什么 97 | 什么样 98 | 今 99 | 介于 100 | 仍 101 | 仍旧 102 | 从 103 | 从此 104 | 从而 105 | 他 106 | 他人 107 | 他们 108 | 他们们 109 | 以 110 | 以上 111 | 以为 112 | 以便 113 | 以免 114 | 以及 115 | 以故 116 | 以期 117 | 以来 118 | 以至 119 | 以至于 120 | 以致 121 | 们 122 | 任 123 | 任何 124 | 任凭 125 | 会 126 | 似的 127 | 但 128 | 但凡 129 | 但是 130 | 何 131 | 何以 132 | 何况 133 | 何处 134 | 何时 135 | 余外 136 | 作为 137 | 你 138 | 你们 139 | 使 140 | 使得 141 | 例如 142 | 依 143 | 依据 144 | 依照 145 | 便于 146 | 俺 147 | 俺们 148 | 倘 149 | 倘使 150 | 倘或 151 | 倘然 152 | 倘若 153 | 借 154 | 借傥然 155 | 假使 156 | 假如 157 | 假若 158 | 做 159 | 像 160 | 儿 161 | 先不先 162 | 光是 163 | 全体 164 | 全部 165 | 八 166 | 六 167 | 兮 168 | 共 169 | 关于 170 | 关于具体地说 171 | 其 172 | 其一 173 | 其中 174 | 其二 175 | 其他 176 | 其余 177 | 其它 178 | 其次 179 | 具体地说 180 | 具体说来 181 | 兼之 182 | 内 183 | 再 184 | 再其次 185 | 再则 186 | 再有 187 | 再者 188 | 再者说 189 | 再说 190 | 冒 191 | 冲 192 | 况且 193 | 几 194 | 几时 195 | 凡 196 | 凡是 197 | 凭 198 | 凭借 199 | 出于 200 | 出来 201 | 分 202 | 分别 203 | 则 204 | 则甚 205 | 别 206 | 别人 207 | 别处 208 | 别是 209 | 别的 210 | 别管 211 | 别说 212 | 到 213 | 前后 214 | 前此 215 | 前者 216 | 加之 217 | 加以 218 | 即 219 | 即令 220 | 即使 221 | 即便 222 | 即如 223 | 即或 224 | 即若 225 | 却 226 | 去 227 | 又 228 | 又及 229 | 及 230 | 及其 231 | 及至 232 | 反之 233 | 反而 234 | 反过来 235 | 反过来说 236 | 受到 237 | 另 238 | 另一方面 239 | 另外 240 | 另悉 241 | 只 242 | 只当 243 | 只怕 244 | 只是 245 | 只有 246 | 只消 247 | 只要 248 | 只限 249 | 叫 250 | 叮咚 251 | 可 252 | 可以 253 | 可是 254 | 可见 255 | 各 256 | 各个 257 | 各位 258 | 各种 259 | 各自 260 | 同 261 | 同时 262 | 后 263 | 后者 264 | 向 265 | 向使 266 | 向着 267 | 吓 268 | 吗 269 | 否则 270 | 吧 271 | 吧哒 272 | 含 273 | 吱 274 | 呀 275 | 呃 276 | 呕 277 | 呗 278 | 呜 279 | 呜呼 280 | 呢 281 | 呵 282 | 呵呵 283 | 呸 284 | 呼哧 285 | 咋 286 | 和 287 | 咚 288 | 咦 289 | 咧 290 | 咱 291 | 咱们 292 | 咳 293 | 哇 294 | 哈 295 | 哈哈 296 | 哉 297 | 哎 298 | 哎呀 299 | 哎哟 300 | 哗 301 | 哟 302 | 哦 303 | 哩 304 | 哪 305 | 哪个 306 | 哪些 307 | 哪儿 308 | 哪天 309 | 哪年 310 | 哪怕 311 | 哪样 312 | 哪边 313 | 哪里 314 | 哼 315 | 哼唷 316 | 唉 317 | 唯有 318 | 啊 319 | 啐 320 | 啥 321 | 啦 322 | 啪达 323 | 啷当 324 | 喂 325 | 喏 326 | 喔唷 327 | 喽 328 | 嗡 329 | 嗡嗡 330 | 嗬 331 | 嗯 332 | 嗳 333 | 嘎 334 | 嘎登 335 | 嘘 336 | 嘛 337 
| 嘻 338 | 嘿 339 | 嘿嘿 340 | 四 341 | 因 342 | 因为 343 | 因了 344 | 因此 345 | 因着 346 | 因而 347 | 固然 348 | 在 349 | 在下 350 | 在于 351 | 地 352 | 基于 353 | 处在 354 | 多 355 | 多么 356 | 多少 357 | 大 358 | 大家 359 | 她 360 | 她们 361 | 好 362 | 如 363 | 如上 364 | 如上所述 365 | 如下 366 | 如何 367 | 如其 368 | 如同 369 | 如是 370 | 如果 371 | 如此 372 | 如若 373 | 始而 374 | 孰料 375 | 孰知 376 | 宁 377 | 宁可 378 | 宁愿 379 | 宁肯 380 | 它 381 | 它们 382 | 对 383 | 对于 384 | 对待 385 | 对方 386 | 对比 387 | 将 388 | 小 389 | 尔 390 | 尔后 391 | 尔尔 392 | 尚且 393 | 就 394 | 就是 395 | 就是了 396 | 就是说 397 | 就算 398 | 就要 399 | 尽 400 | 尽管 401 | 尽管如此 402 | 岂但 403 | 己 404 | 已 405 | 已矣 406 | 巴 407 | 巴巴 408 | 年 409 | 并 410 | 并且 411 | 庶乎 412 | 庶几 413 | 开外 414 | 开始 415 | 归 416 | 归齐 417 | 当 418 | 当地 419 | 当然 420 | 当着 421 | 彼 422 | 彼时 423 | 彼此 424 | 往 425 | 待 426 | 很 427 | 得 428 | 得了 429 | 怎 430 | 怎么 431 | 怎么办 432 | 怎么样 433 | 怎奈 434 | 怎样 435 | 总之 436 | 总的来看 437 | 总的来说 438 | 总的说来 439 | 总而言之 440 | 恰恰相反 441 | 您 442 | 惟其 443 | 慢说 444 | 我 445 | 我们 446 | 或 447 | 或则 448 | 或是 449 | 或曰 450 | 或者 451 | 截至 452 | 所 453 | 所以 454 | 所在 455 | 所幸 456 | 所有 457 | 才 458 | 才能 459 | 打 460 | 打从 461 | 把 462 | 抑或 463 | 拿 464 | 按 465 | 按照 466 | 换句话说 467 | 换言之 468 | 据 469 | 据此 470 | 接着 471 | 故 472 | 故此 473 | 故而 474 | 旁人 475 | 无 476 | 无宁 477 | 无论 478 | 既 479 | 既往 480 | 既是 481 | 既然 482 | 日 483 | 时 484 | 时候 485 | 是 486 | 是以 487 | 是的 488 | 更 489 | 曾 490 | 替 491 | 替代 492 | 最 493 | 月 494 | 有 495 | 有些 496 | 有关 497 | 有及 498 | 有时 499 | 有的 500 | 望 501 | 朝 502 | 朝着 503 | 本 504 | 本人 505 | 本地 506 | 本着 507 | 本身 508 | 来 509 | 来着 510 | 来自 511 | 来说 512 | 极了 513 | 果然 514 | 果真 515 | 某 516 | 某个 517 | 某些 518 | 某某 519 | 根据 520 | 欤 521 | 正值 522 | 正如 523 | 正巧 524 | 正是 525 | 此 526 | 此地 527 | 此处 528 | 此外 529 | 此时 530 | 此次 531 | 此间 532 | 毋宁 533 | 每 534 | 每当 535 | 比 536 | 比及 537 | 比如 538 | 比方 539 | 没奈何 540 | 沿 541 | 沿着 542 | 漫说 543 | 焉 544 | 然则 545 | 然后 546 | 然而 547 | 照 548 | 照着 549 | 犹且 550 | 犹自 551 | 甚且 552 | 甚么 553 | 甚或 554 | 甚而 555 | 甚至 556 | 甚至于 557 | 用 558 | 用来 559 | 由 560 | 由于 561 | 由是 562 | 由此 563 | 由此可见 564 | 的 565 | 的确 566 | 的话 567 | 直到 568 | 相对而言 569 | 省得 570 | 看 571 | 眨眼 572 | 着 573 | 着呢 574 | 矣 575 | 矣乎 576 | 矣哉 577 | 离 578 | 秒 579 | 竟而 580 | 第 581 | 等 582 | 等到 583 | 等等 584 | 简言之 585 | 管 586 | 类如 587 | 紧接着 588 | 纵 589 | 纵令 590 | 纵使 591 | 纵然 592 | 经 593 | 经过 594 | 结果 595 | 给 596 | 继之 597 | 继后 598 | 继而 599 | 综上所述 600 | 罢了 601 | 者 602 | 而 603 | 而且 604 | 而况 605 | 而后 606 | 而外 607 | 而已 608 | 而是 609 | 而言 610 | 能 611 | 能否 612 | 腾 613 | 自 614 | 自个儿 615 | 自从 616 | 自各儿 617 | 自后 618 | 自家 619 | 自己 620 | 自打 621 | 自身 622 | 至 623 | 至于 624 | 至今 625 | 至若 626 | 致 627 | 般的 628 | 若 629 | 若夫 630 | 若是 631 | 若果 632 | 若非 633 | 莫不然 634 | 莫如 635 | 莫若 636 | 虽 637 | 虽则 638 | 虽然 639 | 虽说 640 | 被 641 | 要 642 | 要不 643 | 要不是 644 | 要不然 645 | 要么 646 | 要是 647 | 譬喻 648 | 譬如 649 | 让 650 | 许多 651 | 论 652 | 设使 653 | 设或 654 | 设若 655 | 诚如 656 | 诚然 657 | 该 658 | 说 659 | 说来 660 | 请 661 | 诸 662 | 诸位 663 | 诸如 664 | 谁 665 | 谁人 666 | 谁料 667 | 谁知 668 | 贼死 669 | 赖以 670 | 赶 671 | 起 672 | 起见 673 | 趁 674 | 趁着 675 | 越是 676 | 距 677 | 跟 678 | 较 679 | 较之 680 | 边 681 | 过 682 | 还 683 | 还是 684 | 还有 685 | 还要 686 | 这 687 | 这一来 688 | 这个 689 | 这么 690 | 这么些 691 | 这么样 692 | 这么点儿 693 | 这些 694 | 这会儿 695 | 这儿 696 | 这就是说 697 | 这时 698 | 这样 699 | 这次 700 | 这般 701 | 这边 702 | 这里 703 | 进而 704 | 连 705 | 连同 706 | 逐步 707 | 通过 708 | 遵循 709 | 遵照 710 | 那 711 | 那个 712 | 那么 713 | 那么些 714 | 那么样 715 | 那些 716 | 那会儿 717 | 那儿 718 | 那时 719 | 那样 720 | 那般 721 | 那边 722 | 那里 723 | 都 724 | 鄙人 725 | 鉴于 726 | 针对 727 | 阿 728 | 除 729 | 除了 730 | 除外 731 | 除开 732 | 除此之外 733 | 除非 734 | 随 735 | 随后 736 | 随时 737 | 随着 738 | 难道说 739 
| 零 740 | 非 741 | 非但 742 | 非徒 743 | 非特 744 | 非独 745 | 靠 746 | 顺 747 | 顺着 748 | 首先 -------------------------------------------------------------------------------- /stacking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | class Stacking(object): 9 | def __init__(self, kflod, df, train_df_length): 10 | self.kflod = kflod 11 | self.df = df.copy() 12 | self.train_df_length = train_df_length 13 | 14 | def _get_kflod(self, list_): 15 | list_array = np.array(list_) 16 | np.random.shuffle(list_array) 17 | 18 | list_part = np.array_split(list_array, self.kflod) 19 | for idx, list_item in enumerate(list_part): 20 | list_part_copy = list_part.copy() 21 | list_part_copy.pop(idx) 22 | 23 | other_list_part = np.concatenate(list_part_copy).ravel() 24 | 25 | yield other_list_part, list_item 26 | 27 | def get_stacking_df(self, columns=None): 28 | if columns is None: 29 | columns = ['prefix', 'title', 'tag', 'prefix_title', 'prefix_tag', 'title_tag'] 30 | 31 | train_df = self.df[:self.train_df_length] 32 | train_df_index = train_df.index 33 | 34 | validate_test_df = self.df[self.train_df_length:] 35 | 36 | stacking_df = pd.DataFrame() 37 | stacking_columns = ['stacking_{}'.format(column) for column in columns] 38 | 39 | kfloder = self._get_kflod(train_df_index) 40 | kflod_list = list() 41 | for kflod_item in kfloder: 42 | kflod_list.append(kflod_item) 43 | 44 | for column in columns: 45 | stacking_train_df = pd.DataFrame() 46 | stacking_test_list = list() 47 | 48 | for train_index, test_index in kflod_list: 49 | k_train_df = train_df.loc[train_index] 50 | k_test_df = train_df.loc[test_index] 51 | 52 | click_column = "{column}_click".format(column=column) 53 | count_column = "{column}_count".format(column=column) 54 | stacking_column = "{column}_stacking".format(column=column) 55 | 56 | agg_dict = {click_column: "sum", count_column: "count"} 57 | _stacking_df = k_train_df.groupby(column, as_index=False)["label"].agg(agg_dict) 58 | _stacking_df[stacking_column] = _stacking_df[click_column] / (_stacking_df[count_column] + 5) 59 | 60 | k_test_df = pd.merge(k_test_df, _stacking_df, how='left', on=column) 61 | stacking_train_df = pd.concat([stacking_train_df, k_test_df[stacking_column]], 62 | axis=0, ignore_index=False, sort=False) 63 | 64 | temp_df = pd.merge(validate_test_df, _stacking_df, how='left', on=column) 65 | temp_column_list = temp_df[stacking_column].tolist() 66 | stacking_test_list.append(temp_column_list) 67 | 68 | # train data 69 | stacking_train_df.sort_index(inplace=True) 70 | 71 | # validate + test data 72 | length = len(stacking_test_list) 73 | stacking_test_columns = ["stacking_{id}".format(id=i) for i in range(length)] 74 | stacking_test_df = pd.DataFrame(data=stacking_test_list) 75 | stacking_test_df = stacking_test_df.T 76 | stacking_test_df.columns = stacking_test_columns 77 | stacking_test_df['mean'] = stacking_test_df.mean(axis=1) 78 | 79 | # contact train validate test 80 | column_stacking_df = pd.concat([stacking_train_df, stacking_test_df['mean']], 81 | axis=0, ignore_index=True, sort=False) 82 | 83 | # contact column to stacking df 84 | stacking_df = pd.concat([stacking_df, column_stacking_df], axis=1) 85 | 86 | stacking_df.columns = stacking_columns 87 | return stacking_df 88 | -------------------------------------------------------------------------------- /stat_engineering.py: 
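# NOTE on the Stacking class above (stacking.py), a hedged usage sketch with
# illustrative names: it expects a frame whose training rows come first and
# which already holds 'label' plus the key columns to encode; each key gets an
# out-of-fold smoothed click ratio, so a training row never sees its own label.
#
#   df = pd.concat([train_df, validate_df, test_df], ignore_index=True, sort=False)
#   stacker = Stacking(5, df, train_df.shape[0])
#   stacking_features = stacker.get_stacking_df(columns=['prefix', 'title', 'tag'])
#   df = pd.concat([df, stacking_features], axis=1)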
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import json 6 | import logging 7 | import os 8 | import time 9 | import warnings 10 | from operator import itemgetter 11 | 12 | import jieba 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.preprocessing import LabelEncoder 16 | 17 | from logconfig import config_logging 18 | from utils import char_cleaner 19 | 20 | config_logging() 21 | logger = logging.getLogger('stat_features') 22 | 23 | warnings.filterwarnings('ignore') 24 | np.random.seed(2018) 25 | 26 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 27 | RAW_DATA_PATH = os.path.join(BASE_PATH, "RawData") 28 | ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData") 29 | 30 | 31 | class PrefixProcessing(object): 32 | @staticmethod 33 | def _is_in_title(item): 34 | prefix = item["prefix"] 35 | title = item["title"] 36 | 37 | if not isinstance(prefix, str): 38 | prefix = "null" 39 | 40 | if prefix in title: 41 | return 1 42 | return 0 43 | 44 | @staticmethod 45 | def _levenshtein_distance(item): 46 | str1 = item["prefix"] 47 | str2 = item["title"] 48 | 49 | if not isinstance(str1, str): 50 | str1 = "null" 51 | 52 | x_size = len(str1) + 1 53 | y_size = len(str2) + 1 54 | 55 | matrix = np.zeros((x_size, y_size), dtype=np.int_) 56 | 57 | for x in range(x_size): 58 | matrix[x, 0] = x 59 | 60 | for y in range(y_size): 61 | matrix[0, y] = y 62 | 63 | for x in range(1, x_size): 64 | for y in range(1, y_size): 65 | if str1[x - 1] == str2[y - 1]: 66 | matrix[x, y] = min(matrix[x - 1, y] + 1, matrix[x - 1, y - 1], matrix[x, y - 1] + 1) 67 | else: 68 | matrix[x, y] = min(matrix[x - 1, y] + 1, matrix[x - 1, y - 1] + 1, matrix[x, y - 1] + 1) 69 | 70 | return matrix[x_size - 1, y_size - 1] 71 | 72 | @staticmethod 73 | def _distince_rate(item): 74 | str1 = item["prefix"] 75 | str2 = item["title"] 76 | leven_distance = item["leven_distance"] 77 | 78 | if not isinstance(str1, str): 79 | str1 = "null" 80 | 81 | length = max(len(str1), len(str2)) 82 | 83 | return leven_distance / (length + 5) # 平滑 84 | 85 | def get_prefix_df(self, df): 86 | prefix_df = pd.DataFrame() 87 | 88 | prefix_df[["prefix", "title"]] = df[["prefix", "title"]] 89 | prefix_df["is_in_title"] = prefix_df.apply(self._is_in_title, axis=1) 90 | prefix_df["leven_distance"] = prefix_df.apply(self._levenshtein_distance, axis=1) 91 | prefix_df["distance_rate"] = prefix_df.apply(self._distince_rate, axis=1) 92 | 93 | return prefix_df 94 | 95 | 96 | class QueryProcessing(object): 97 | @staticmethod 98 | def _get_query_dict(item): 99 | item_dict = dict() 100 | 101 | query_predict = item["query_prediction"] 102 | 103 | if not query_predict: 104 | item_dict["query_length"] = 0 105 | item_dict["prob_sum"] = None 106 | item_dict["prob_max"] = None 107 | item_dict["prob_mean"] = None 108 | return item_dict 109 | 110 | prob_list = list() 111 | for _, prob in query_predict.items(): 112 | prob = float(prob) 113 | prob_list.append(prob) 114 | 115 | item_dict["query_length"] = len(prob_list) 116 | item_dict["prob_sum"] = np.sum(prob_list) 117 | item_dict["prob_max"] = np.max(prob_list) 118 | item_dict["prob_mean"] = np.mean(prob_list) 119 | 120 | return item_dict 121 | 122 | def get_query_df(self, df): 123 | query_df = pd.DataFrame() 124 | 125 | query_df["item_dict"] = df.apply(self._get_query_dict, axis=1) 126 | query_df["query_length"] = query_df["item_dict"].apply(lambda item: item.get("query_length")) 127 | query_df["prob_sum"] = 
query_df["item_dict"].apply(lambda item: item.get("prob_sum")) 128 | query_df["prob_max"] = query_df["item_dict"].apply(lambda item: item.get("prob_max")) 129 | query_df["prob_mean"] = query_df["item_dict"].apply(lambda item: item.get("prob_mean")) 130 | query_df = query_df.drop(columns=["item_dict"]) 131 | 132 | return query_df 133 | 134 | 135 | class Processing(object): 136 | 137 | @staticmethod 138 | def _get_data(name): 139 | if name == "test": 140 | columns = ['prefix', 'query_prediction', 'title', 'tag'] 141 | else: 142 | columns = ['prefix', 'query_prediction', 'title', 'tag', 'label'] 143 | 144 | data_name = os.path.join(RAW_DATA_PATH, "oppo_round1_{}.txt".format(name)) 145 | df = pd.read_csv(data_name, names=columns, sep="\t", header=None, encoding="utf-8") 146 | 147 | return df 148 | 149 | @staticmethod 150 | def _loads(item): 151 | try: 152 | return json.loads(item) 153 | except (json.JSONDecodeError, TypeError): 154 | return json.loads("{}") 155 | 156 | @staticmethod 157 | def _get_apriori_df(df, train_df_length, columns=None): 158 | df = df.copy() 159 | 160 | train_df = df[:train_df_length] 161 | 162 | if columns is None: 163 | columns = ['prefix', 'complete_prefix', 'title', 'tag'] 164 | 165 | ctr_columns = columns.copy() 166 | ctr_columns.extend(['prefix_title', 'prefix_tag', 'complete_prefix_title', 'complete_prefix_tag', 'title_tag']) 167 | apriori_df = df[ctr_columns] 168 | 169 | # click count and ctr 170 | for idx, column in enumerate(ctr_columns): 171 | click_column = "{column}_click".format(column=column) 172 | count_column = "{column}_count".format(column=column) 173 | ctr_column = "{column}_ctr".format(column=column) 174 | 175 | agg_dict = {click_column: "sum", count_column: "count"} 176 | column_apriori_df = train_df.groupby(column, as_index=False)["label"].agg(agg_dict) 177 | column_apriori_df[ctr_column] = column_apriori_df[click_column] / (column_apriori_df[count_column] + 5) 178 | apriori_df = pd.merge(apriori_df, column_apriori_df, how='left', on=column) 179 | 180 | length = apriori_df.shape[0] 181 | all_columns = apriori_df.columns 182 | 183 | # apriori 184 | for column1 in columns: 185 | for column2 in columns: 186 | if column1 == column2: 187 | continue 188 | 189 | if column1 in column2: 190 | continue 191 | 192 | if column2 in column1: 193 | continue 194 | 195 | temp_click_column = "{}_{}_click".format(column1, column2) 196 | if temp_click_column in all_columns: 197 | click_column = temp_click_column 198 | else: 199 | click_column = "{}_{}_click".format(column2, column1) 200 | 201 | temp_count_column = "{}_{}_count".format(column1, column2) 202 | if temp_count_column in all_columns: 203 | count_column = temp_count_column 204 | else: 205 | count_column = "{}_{}_count".format(column2, column1) 206 | 207 | click_column1 = "{column}_click".format(column=column1) 208 | count_column1 = "{column}_count".format(column=column1) 209 | click_column2 = "{column}_click".format(column=column2) 210 | count_column2 = "{column}_count".format(column=column2) 211 | 212 | click_confidence_column = "click_{}_{}_confidence".format(column1, column2) 213 | count_confidence_column = "count_{}_{}_confidence".format(column1, column2) 214 | click_lift_column = "click_{}_{}_lift".format(column1, column2) 215 | count_lift_column = "count_{}_{}_lift".format(column1, column2) 216 | 217 | # confidence = P(A&B)/P(A) 218 | apriori_df[click_confidence_column] = apriori_df[click_column] * 100 / (apriori_df[click_column1] + 5) 219 | apriori_df[count_confidence_column] = apriori_df[count_column] 
* 100 / (apriori_df[count_column1] + 5) 220 | 221 | # lift = P(A&B)/(P(A)*P(B)) 222 | apriori_df[click_lift_column] = (apriori_df[click_column] / length) / ( 223 | (apriori_df[click_column1] * apriori_df[click_column2]) / (length * length)) 224 | apriori_df[count_lift_column] = (apriori_df[count_column] / length) / ( 225 | (apriori_df[count_column1] * apriori_df[count_column2]) / (length * length)) 226 | 227 | apriori_df = apriori_df.drop(columns=ctr_columns) 228 | return apriori_df 229 | 230 | @staticmethod 231 | def _get_expose_df(df, columns=None): 232 | df = df.copy() 233 | 234 | if columns is None: 235 | columns = ['prefix', 'complete_prefix', 'title', 'tag'] 236 | 237 | expose_df = df[columns] 238 | 239 | for column1 in columns: 240 | for column2 in columns: 241 | 242 | if column1 == column2: 243 | continue 244 | 245 | nunique_column_name = "{}_{}_nunique".format(column1, column2) 246 | temp_df = expose_df.groupby(column1)[column2].nunique().reset_index().rename( 247 | columns={column2: nunique_column_name}) 248 | expose_df = pd.merge(expose_df, temp_df, how='left', on=column1) 249 | 250 | expose_df = expose_df.drop(columns=columns) 251 | return expose_df 252 | 253 | @staticmethod 254 | def _get_complete_prefix(item): 255 | prefix = item['prefix'] 256 | query_prediction = item['query_prediction'] 257 | 258 | if not query_prediction: 259 | return prefix 260 | 261 | predict_word_dict = dict() 262 | prefix = str(prefix) 263 | 264 | for query_item, query_ratio in query_prediction.items(): 265 | query_item_cut = jieba.lcut(query_item) 266 | item_word = "" 267 | for item in query_item_cut: 268 | if prefix not in item_word: 269 | item_word += item 270 | else: 271 | if item_word not in predict_word_dict.keys(): 272 | predict_word_dict[item_word] = 0.0 273 | predict_word_dict[item_word] += float(query_ratio) 274 | break 275 | 276 | if not predict_word_dict: 277 | return prefix 278 | 279 | predict_word_dict = sorted(predict_word_dict.items(), key=itemgetter(1), reverse=True) 280 | complete_prefix = predict_word_dict[0][0] 281 | return complete_prefix 282 | 283 | @staticmethod 284 | def _get_max_query_ratio(item): 285 | query_prediction = item['query_prediction'] 286 | title = item['title'] 287 | 288 | if not query_prediction: 289 | return 0 290 | 291 | for query_wrod, ratio in query_prediction.items(): 292 | if title == query_wrod: 293 | if float(ratio) > 0.1: 294 | return 1 295 | 296 | return 0 297 | 298 | @staticmethod 299 | def _get_word_length(item): 300 | item = str(item) 301 | 302 | word_cut = jieba.lcut(item) 303 | length = len(word_cut) 304 | return length 305 | 306 | @staticmethod 307 | def _get_small_query_num(item): 308 | small_query_num = 0 309 | 310 | for _, ratio in item.items(): 311 | if float(ratio) <= 0.08: 312 | small_query_num += 1 313 | 314 | return small_query_num 315 | 316 | def _get_length_df(self, df): 317 | df = df.copy() 318 | 319 | columns = ['query_prediction', 'prefix', 'title'] 320 | length_df = df[columns] 321 | 322 | length_df['max_query_ratio'] = length_df.apply(self._get_max_query_ratio, axis=1) 323 | length_df['prefix_word_num'] = length_df['prefix'].apply(self._get_word_length) 324 | length_df['title_word_num'] = length_df['title'].apply(self._get_word_length) 325 | length_df['title_len'] = length_df['title'].apply(len) 326 | length_df['small_query_num'] = length_df['query_prediction'].apply(self._get_small_query_num) 327 | 328 | length_df = length_df.drop(columns=columns) 329 | return length_df 330 | 331 | def get_processing(self): 332 | train_df = 
self._get_data(name="train") 333 | validate_df = self._get_data(name="vali") 334 | test_df = self._get_data(name="test") 335 | logger.info('finish load data!') 336 | 337 | train_df_length = train_df.shape[0] 338 | validate_df_length = validate_df.shape[0] 339 | df = pd.concat([train_df, validate_df, test_df], axis=0, ignore_index=True, sort=False) 340 | 341 | # make query prediction to json 342 | df["query_prediction"] = df["query_prediction"].apply(self._loads) 343 | 344 | # complete prefix 345 | df['complete_prefix'] = df[['prefix', 'query_prediction']].apply(self._get_complete_prefix, axis=1) 346 | logger.info('finish get complete prefix!') 347 | 348 | length_df = self._get_length_df(df) 349 | logger.info('finish get length df!') 350 | 351 | # clearn prefix and title 352 | df["prefix"] = df["prefix"].apply(char_cleaner) 353 | df["title"] = df["title"].apply(char_cleaner) 354 | df["complete_prefix"] = df["complete_prefix"].apply(char_cleaner) 355 | logger.info('finish clearn columns') 356 | 357 | # combine columns 358 | df['prefix_title'] = df[['prefix', 'title']].apply(lambda item: '_'.join(map(str, item)), axis=1) 359 | df['prefix_tag'] = df[['prefix', 'tag']].apply(lambda item: '_'.join(map(str, item)), axis=1) 360 | df['complete_prefix_title'] = df[['complete_prefix', 'title']].apply(lambda item: '_'.join(map(str, item)), 361 | axis=1) 362 | df['complete_prefix_tag'] = df[['complete_prefix', 'tag']].apply(lambda item: '_'.join(map(str, item)), axis=1) 363 | df['title_tag'] = df[['title', 'tag']].apply(lambda item: '_'.join(map(str, item)), axis=1) 364 | logger.info('finish combine columns') 365 | 366 | apriori_df = self._get_apriori_df(df, train_df_length) 367 | logger.info('finish get apriori df!') 368 | 369 | drop_columns = ['prefix_title', 'prefix_tag', 'title_tag', 'complete_prefix_title', 'complete_prefix_tag'] 370 | df = df.drop(columns=drop_columns) 371 | 372 | expose_df = self._get_expose_df(df) 373 | logger.info('finish get expose df!') 374 | 375 | prefix_processing = PrefixProcessing() 376 | prefix_df = prefix_processing.get_prefix_df(df) 377 | logger.info('finish get prefix df!') 378 | 379 | query_processing = QueryProcessing() 380 | query_df = query_processing.get_query_df(df) 381 | logger.info('finish get query df!') 382 | 383 | df = pd.concat([df, length_df, apriori_df, expose_df, prefix_df, query_df], axis=1) 384 | logger.info('finish combine all df!') 385 | 386 | drop_columns = ['prefix', 'complete_prefix', 'query_prediction', 'title'] 387 | df = df.drop(columns=drop_columns) 388 | 389 | # label encoder 390 | label_encoder = LabelEncoder() 391 | df['tag'] = label_encoder.fit_transform(df['tag']) 392 | logger.info('finish label encoder tag!') 393 | 394 | train_data = df[:train_df_length] 395 | train_data["label"] = train_data["label"].apply(int) 396 | 397 | validate_data = df[train_df_length:train_df_length + validate_df_length] 398 | validate_data["label"] = validate_data["label"].apply(int) 399 | 400 | test_data = df[train_df_length + validate_df_length:] 401 | test_data = test_data.drop(columns=["label"]) 402 | 403 | train_data_name = os.path.join(ETL_DATA_PATH, "train_stat.csv") 404 | validate_data_name = os.path.join(ETL_DATA_PATH, "validate_stat.csv") 405 | test_data_name = os.path.join(ETL_DATA_PATH, "test_stat.csv") 406 | 407 | train_data.to_csv(train_data_name, index=False) 408 | validate_data.to_csv(validate_data_name, index=False) 409 | test_data.to_csv(test_data_name, index=False) 410 | 411 | 412 | if __name__ == "__main__": 413 | t0 = time.time() 414 | 
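# NOTE: a hedged, toy illustration of the smoothed click ratio computed in
# _get_apriori_df above; clicks are divided by (impressions + 5) so that rare
# keys are pulled toward zero instead of getting a noisy 0-or-1 ratio. The toy
# frame and column names are illustrative only.
#
#   toy = pd.DataFrame({"tag": ["app", "app", "web"], "label": [1, 0, 1]})
#   stats = toy.groupby("tag", as_index=False)["label"].agg(
#       {"tag_click": "sum", "tag_count": "count"})
#   stats["tag_ctr"] = stats["tag_click"] / (stats["tag_count"] + 5)
#   # app -> 1 / (2 + 5) = 0.1429..., web -> 1 / (1 + 5) = 0.1667...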
processing = Processing() 415 | processing.get_processing() 416 | print(time.time() - t0) 417 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import re 5 | 6 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 7 | ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData") 8 | RESOURCE_PATH = os.path.join('resources') 9 | 10 | 11 | def get_stop_words(): 12 | stop_wrods_name = os.path.join(RESOURCE_PATH, 'stop_words.txt') 13 | _stop_words_list = list() 14 | with open(stop_wrods_name, encoding='utf-8') as f: 15 | for line in f: 16 | _stop_words_list.append(line.strip()) 17 | 18 | _stop_words_set = set(_stop_words_list) 19 | return _stop_words_set 20 | 21 | 22 | stop_words_set = get_stop_words() 23 | 24 | 25 | def char_cleaner(char): 26 | if not isinstance(char, str): 27 | char = "null" 28 | pattern = re.compile("[^0-9a-zA-Z\u4E00-\u9FA5 ]") 29 | char = re.sub(pattern, "", char) 30 | char = char.lower() 31 | return char 32 | 33 | 34 | def char_list_cheaner(char_list): 35 | new_char_list = list() 36 | for char in char_list: 37 | if len(char) == 0: 38 | continue 39 | if char in stop_words_set: 40 | continue 41 | new_char_list.append(char) 42 | 43 | return new_char_list 44 | -------------------------------------------------------------------------------- /w2v.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import json 4 | import logging 5 | import os 6 | import time 7 | 8 | import jieba 9 | from gensim.models import Word2Vec 10 | 11 | from logconfig import config_logging 12 | from utils import char_cleaner, char_list_cheaner 13 | 14 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 15 | RawData = os.path.join(BASE_PATH, "RawData") 16 | 17 | config_logging() 18 | logger = logging.getLogger('w2v') 19 | 20 | 21 | def get_sentence(name): 22 | if isinstance(name, list): 23 | name_list = name 24 | else: 25 | name_list = [name] 26 | 27 | for name in name_list: 28 | name = "oppo_round1_{fname}.txt".format(fname=name) 29 | file_path = os.path.join(RawData, name) 30 | if not os.path.exists(file_path): 31 | raise FileNotFoundError("{} Not Found!".format(file_path)) 32 | 33 | with open(file_path, "r", encoding="utf-8") as f: 34 | line = f.readline() 35 | 36 | while line: 37 | line_arr = line.split("\t") 38 | 39 | query_prediction = line_arr[1] 40 | try: 41 | sentences = json.loads(query_prediction) 42 | except json.JSONDecodeError: 43 | sentences = json.loads("{}") 44 | 45 | for sentence in sentences: 46 | yield sentence 47 | 48 | title = line_arr[2] 49 | yield title 50 | 51 | line = f.readline() 52 | 53 | 54 | class MySentence(object): 55 | def __init__(self, fname): 56 | self.fname = fname 57 | 58 | def __iter__(self): 59 | for sentence in get_sentence(self.fname): 60 | sentence = char_cleaner(sentence) 61 | seg_list = jieba.lcut(sentence) 62 | seg_list = char_list_cheaner(seg_list) 63 | 64 | if seg_list: 65 | yield seg_list 66 | 67 | 68 | def build_model(fname, size): 69 | sentences = MySentence(fname) 70 | model_name = "w2v_{}.bin".format(size) 71 | model_path = os.path.join("resources", model_name) 72 | my_model = Word2Vec(sentences, size=size, window=5, sg=1, hs=1, min_count=5, workers=10) 73 | my_model.wv.save_word2vec_format(model_path, binary=True) 74 | 75 | 76 | if __name__ == "__main__": 77 | 
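# NOTE: a hedged sketch of how the saved vectors can be queried later; this is
# the same load call w2v_engineering.py uses. The sample token is illustrative,
# and words seen fewer than min_count=5 times are absent and raise KeyError.
#
#   from gensim.models.keyedvectors import KeyedVectors
#   kv = KeyedVectors.load_word2vec_format("resources/w2v_100.bin",
#                                           binary=True, unicode_errors="ignore")
#   vector = kv["手机"]                    # one 100-dimensional word vector
#   print(kv.most_similar("手机", topn=5))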
t0 = time.time() 78 | build_model(fname=['train', 'vali', 'test'], size=100) 79 | print(time.time() - t0) 80 | -------------------------------------------------------------------------------- /w2v_engineering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import gc 4 | import json 5 | import logging 6 | import os 7 | import time 8 | import warnings 9 | from operator import itemgetter 10 | 11 | import jieba 12 | import numpy as np 13 | import pandas as pd 14 | from gensim import matutils 15 | from gensim.models.keyedvectors import KeyedVectors 16 | from sklearn.cluster import MiniBatchKMeans 17 | from sklearn.decomposition import PCA 18 | from tqdm import tqdm 19 | 20 | from logconfig import config_logging 21 | from utils import char_cleaner, char_list_cheaner 22 | from w2v import build_model 23 | 24 | config_logging() 25 | logger = logging.getLogger('w2v_features') 26 | 27 | warnings.filterwarnings('ignore') 28 | 29 | BASE_PATH = os.path.join(os.path.dirname(__file__), "data") 30 | RAW_DATA_PATH = os.path.join(BASE_PATH, "RawData") 31 | ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData") 32 | TEMP_DATA_PATH = os.path.join(BASE_PATH, "TempData") 33 | 34 | 35 | class PreProcessing(object): 36 | def __init__(self, size, w2v_model): 37 | self.size = size 38 | self.w2v_model = w2v_model 39 | 40 | def to_csv(self, df, col): 41 | file_name = '{col}_w2v.csv'.format(col=col) 42 | file_path = os.path.join(TEMP_DATA_PATH, file_name) 43 | if os.path.exists(file_path): 44 | os.remove(file_path) 45 | 46 | columns = ['w2v_{}'.format(i) for i in range(self.size)] 47 | 48 | with open(file_path, 'a', encoding='utf-8') as f: 49 | # write columns 50 | f.write(','.join(columns) + '\n') 51 | 52 | for idx, item in tqdm(df[col].items()): 53 | item = char_cleaner(item) 54 | if item == 'null': 55 | item_list = [''] * self.size 56 | elif not item: 57 | item_list = [''] * self.size 58 | else: 59 | seg_cut = jieba.lcut(str(item)) 60 | seg_cut = char_list_cheaner(seg_cut) 61 | 62 | w2v_array = list() 63 | for word in seg_cut: 64 | try: 65 | similar_list = self.w2v_model[word] 66 | w2v_array.append(similar_list) 67 | except KeyError: 68 | pass 69 | 70 | if not w2v_array: 71 | item_list = [''] * self.size 72 | else: 73 | item_list = matutils.unitvec(np.array(w2v_array).mean(axis=0)) 74 | 75 | f.write(','.join(map(str, item_list)) + '\n') 76 | 77 | 78 | class Procossing(object): 79 | def __init__(self, size, force): 80 | self.size = size 81 | self.force = force 82 | self.w2v_model = self._get_w2v_model() 83 | 84 | def _get_w2v_model(self): 85 | w2v_model_name = "w2v_{}.bin".format(self.size) 86 | w2v_model_path = os.path.join("resources", w2v_model_name) 87 | if not os.path.exists(w2v_model_path): 88 | build_model(fname=['train', 'vali', 'test'], size=self.size) 89 | w2v_model = KeyedVectors.load_word2vec_format(w2v_model_path, binary=True, unicode_errors="ignore") 90 | return w2v_model 91 | 92 | @staticmethod 93 | def _get_data(name): 94 | if name == "test": 95 | columns = ['prefix', 'query_prediction', 'title', 'tag'] 96 | else: 97 | columns = ['prefix', 'query_prediction', 'title', 'tag', 'label'] 98 | 99 | data_name = os.path.join(RAW_DATA_PATH, "oppo_round1_{}.txt".format(name)) 100 | df = pd.read_csv(data_name, names=columns, sep="\t", header=None, encoding="utf-8") 101 | 102 | return df 103 | 104 | def _get_jieba_array(self, words): 105 | words = char_cleaner(words) 106 | seg_cut = jieba.lcut(words) 107 | seg_cut = 
char_list_cheaner(seg_cut) 108 | 109 | w2v_array = list() 110 | for word in seg_cut: 111 | try: 112 | similar_list = self.w2v_model[word] 113 | w2v_array.append(similar_list) 114 | except KeyError: 115 | continue 116 | 117 | if not w2v_array: 118 | w2v_array = [None] * self.size 119 | else: 120 | w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0)) 121 | 122 | return w2v_array 123 | 124 | def _get_w2v_similar(self, item): 125 | item_dict = dict() 126 | 127 | query_predict = item["query_prediction"] 128 | title = item['title'] 129 | 130 | if not query_predict: 131 | item_dict["max_similar"] = None 132 | item_dict["mean_similar"] = None 133 | item_dict["weight_similar"] = None 134 | return item_dict 135 | 136 | query_predict = sorted(query_predict.items(), key=itemgetter(1), reverse=True) 137 | query_predict = query_predict[:3] 138 | 139 | similar_list = list() 140 | weight_similar_list = list() 141 | 142 | title_array = self._get_jieba_array(title) 143 | for key, value in query_predict: 144 | query_cut_array = self._get_jieba_array(key) 145 | 146 | try: 147 | w2v_similar = np.dot(query_cut_array, title_array) 148 | except (KeyError, ZeroDivisionError, TypeError): 149 | w2v_similar = np.nan 150 | 151 | similar_list.append(w2v_similar) 152 | weight_w2v_similar = w2v_similar * float(value) 153 | weight_similar_list.append(weight_w2v_similar) 154 | 155 | max_similar = np.nanmax(similar_list) 156 | mean_similar = np.nanmean(similar_list) 157 | weight_similar = np.nansum(weight_similar_list) 158 | 159 | item_dict["max_similar"] = max_similar 160 | item_dict["mean_similar"] = mean_similar 161 | item_dict["weight_similar"] = weight_similar 162 | 163 | return item_dict 164 | 165 | @staticmethod 166 | def _get_help_flag(item): 167 | if np.isnan(item): 168 | return 0 169 | return 1 170 | 171 | def _get_w2v_df(self, df, col): 172 | file_name = '{col}_w2v.csv'.format(col=col) 173 | file_path = os.path.join(TEMP_DATA_PATH, file_name) 174 | 175 | if os.path.exists(file_path) and not self.force: 176 | pass 177 | else: 178 | pre_processing = PreProcessing(self.size, self.w2v_model) 179 | pre_processing.to_csv(df, col) 180 | 181 | w2v_df = pd.read_csv(file_path, header=0) 182 | w2v_df['help_index'] = w2v_df.index 183 | w2v_df['help_flag'] = w2v_df['w2v_0'].apply(self._get_help_flag) 184 | 185 | return w2v_df 186 | 187 | def _get_query_df(self, df): 188 | query_df = pd.DataFrame() 189 | 190 | query_df["item_dict"] = df[['query_prediction', 'title']].apply(self._get_w2v_similar, axis=1) 191 | query_df["max_similar"] = query_df["item_dict"].apply(lambda item: item.get("max_similar")) 192 | query_df["mean_similar"] = query_df["item_dict"].apply(lambda item: item.get("mean_similar")) 193 | query_df["weight_similar"] = query_df["item_dict"].apply(lambda item: item.get("weight_similar")) 194 | query_df = query_df.drop(columns=["item_dict"]) 195 | 196 | return query_df 197 | 198 | @staticmethod 199 | def _get_prefix_df(prefix_w2v_df, title_w2v_df, col_name): 200 | prefix_df = pd.DataFrame() 201 | 202 | remove_columns = ['help_index', 'help_flag'] 203 | 204 | prefix_w2v_df = prefix_w2v_df.copy() 205 | prefix_w2v_df = prefix_w2v_df.drop(columns=remove_columns) 206 | 207 | title_w2v_df = title_w2v_df.copy() 208 | title_w2v_df = title_w2v_df.drop(columns=remove_columns) 209 | 210 | prefix_w2v_list = list() 211 | for idx, prefix in prefix_w2v_df.iterrows(): 212 | if np.isnan(prefix[0]): 213 | prefix_w2v_list.append(None) 214 | continue 215 | 216 | title = title_w2v_df.loc[idx] 217 | if np.isnan(title[0]): 218 | 
prefix_w2v_list.append(None) 219 | continue 220 | 221 | similar = np.dot(prefix, title) 222 | prefix_w2v_list.append(similar) 223 | 224 | prefix_df[col_name] = prefix_w2v_list 225 | return prefix_df 226 | 227 | @staticmethod 228 | def _get_kmeans_dict(df, size=20): 229 | df = df.copy() 230 | df = df[df['help_flag'] == 1] 231 | help_index = df['help_index'].tolist() 232 | 233 | df = df.drop(columns=['help_index', 'help_flag']) 234 | 235 | kmeans = MiniBatchKMeans(n_clusters=size, reassignment_ratio=0.001) 236 | preds = kmeans.fit_predict(df) 237 | 238 | kmeans_dict = dict(zip(help_index, preds)) 239 | return kmeans_dict 240 | 241 | @staticmethod 242 | def _loads(item): 243 | try: 244 | return json.loads(item) 245 | except (json.JSONDecodeError, TypeError): 246 | return json.loads("{}") 247 | 248 | @staticmethod 249 | def _mapping_kmeans(item, mapping_dict): 250 | return mapping_dict.get(item, -1) 251 | 252 | @staticmethod 253 | def _get_ctr_df(df, train_df_length, columns=None): 254 | df = df.copy() 255 | 256 | train_df = df[:train_df_length] 257 | 258 | if columns is None: 259 | columns = ['prefix_kmeans', 'title_kmeans', 'complete_prefix_kmeans'] 260 | 261 | ctr_df = df[columns] 262 | 263 | # click count and ctr 264 | for idx, column in enumerate(columns): 265 | click_column = "{column}_click".format(column=column) 266 | count_column = "{column}_count".format(column=column) 267 | ctr_column = "{column}_ctr".format(column=column) 268 | 269 | agg_dict = {click_column: "sum", count_column: "count"} 270 | column_apriori_df = train_df.groupby(column, as_index=False)["label"].agg(agg_dict) 271 | column_apriori_df[ctr_column] = column_apriori_df[click_column] / (column_apriori_df[count_column] + 5) 272 | ctr_df = pd.merge(ctr_df, column_apriori_df, how='left', on=column) 273 | 274 | ctr_df = ctr_df.drop(columns=columns) 275 | 276 | return ctr_df 277 | 278 | @staticmethod 279 | def _get_pca_df(df, name, n_components=5): 280 | df = df.copy() 281 | 282 | remove_columns = ['help_flag', 'help_index'] 283 | 284 | df_effective = df[df['help_flag'] == 1] 285 | df_invalid = df[df['help_flag'] == 0] 286 | 287 | df_effective = df_effective.drop(columns=remove_columns) 288 | df_invalid = df_invalid.drop(columns=remove_columns) 289 | 290 | pca_columns = ['{}_pca_{}'.format(name, i) for i in range(n_components)] 291 | 292 | pca = PCA(n_components=n_components) 293 | 294 | pca_data = pca.fit_transform(df_effective) 295 | pca_df = pd.DataFrame(pca_data, index=df_effective.index, columns=pca_columns) 296 | none_df = pd.DataFrame(index=df_invalid.index, columns=pca_columns) 297 | 298 | pca_df = pd.concat([pca_df, none_df], axis=0, ignore_index=False, sort=False) 299 | pca_df = pca_df.sort_index() 300 | 301 | return pca_df 302 | 303 | @staticmethod 304 | def _get_complete_prefix(item): 305 | prefix = item['prefix'] 306 | query_prediction = item['query_prediction'] 307 | 308 | if not query_prediction: 309 | return prefix 310 | 311 | predict_word_dict = dict() 312 | prefix = str(prefix) 313 | 314 | for query_item, query_ratio in query_prediction.items(): 315 | query_item_cut = jieba.lcut(query_item) 316 | item_word = "" 317 | for item in query_item_cut: 318 | if prefix not in item_word: 319 | item_word += item 320 | else: 321 | if item_word not in predict_word_dict.keys(): 322 | predict_word_dict[item_word] = 0.0 323 | predict_word_dict[item_word] += float(query_ratio) 324 | 325 | if not predict_word_dict: 326 | return prefix 327 | 328 | predict_word_dict = sorted(predict_word_dict.items(), key=itemgetter(1), 
reverse=True) 329 | complete_prefix = predict_word_dict[0][0] 330 | return complete_prefix 331 | 332 | def get_processing(self): 333 | train_df = self._get_data(name="train") 334 | validate_df = self._get_data(name="vali") 335 | test_df = self._get_data(name="test") 336 | logger.info('finish load data!') 337 | 338 | train_df_length = train_df.shape[0] 339 | validate_df_length = validate_df.shape[0] 340 | df = pd.concat([train_df, validate_df, test_df], axis=0, ignore_index=True, sort=False) 341 | 342 | del train_df, validate_df, test_df 343 | gc.collect() 344 | 345 | # make query prediction to json 346 | df["query_prediction"] = df["query_prediction"].apply(self._loads) 347 | 348 | # complete prefix 349 | df['complete_prefix'] = df[['prefix', 'query_prediction']].apply(self._get_complete_prefix, axis=1) 350 | 351 | # clean prefix and title 352 | df["prefix"] = df["prefix"].apply(char_cleaner) 353 | df["title"] = df["title"].apply(char_cleaner) 354 | df["complete_prefix"] = df["complete_prefix"].apply(char_cleaner) 355 | 356 | w2v_df = df[['label']] 357 | 358 | prefix_w2v_df = self._get_w2v_df(df, col='prefix') 359 | title_w2v_df = self._get_w2v_df(df, col='title') 360 | complete_prefix_w2v_df = self._get_w2v_df(df, col='complete_prefix') 361 | logger.info('finish get prefix and title w2v df!') 362 | 363 | prefix_pca_df = self._get_pca_df(prefix_w2v_df, 'prefix') 364 | title_pca_df = self._get_pca_df(title_w2v_df, 'title') 365 | complete_prefix_pca_df = self._get_pca_df(complete_prefix_w2v_df, 'complete_prefix') 366 | w2v_df = pd.concat([w2v_df, prefix_pca_df, title_pca_df, complete_prefix_pca_df], axis=1) 367 | 368 | del prefix_pca_df, title_pca_df, complete_prefix_pca_df 369 | gc.collect() 370 | 371 | prefix_kmeans_dict = self._get_kmeans_dict(prefix_w2v_df) 372 | title_kmeans_dict = self._get_kmeans_dict(title_w2v_df) 373 | complete_prefix_kmeans_dict = self._get_kmeans_dict(complete_prefix_w2v_df) 374 | logger.info('finish make kmeans!') 375 | 376 | w2v_df['prefix_kmeans'] = prefix_w2v_df['help_index'].apply(self._mapping_kmeans, args=(prefix_kmeans_dict,)) 377 | w2v_df['title_kmeans'] = title_w2v_df['help_index'].apply(self._mapping_kmeans, args=(title_kmeans_dict,)) 378 | w2v_df['complete_prefix_kmeans'] = complete_prefix_w2v_df['help_index'].apply( 379 | self._mapping_kmeans, args=(complete_prefix_kmeans_dict,)) 380 | 381 | ctr_df = self._get_ctr_df(w2v_df, train_df_length) 382 | w2v_df = pd.concat([w2v_df, ctr_df], axis=1) 383 | 384 | del ctr_df, prefix_kmeans_dict, title_kmeans_dict, complete_prefix_kmeans_dict 385 | gc.collect() 386 | 387 | prefix_df = self._get_prefix_df(prefix_w2v_df, title_w2v_df, 'prefix_w2v') 388 | complete_prefix_df = self._get_prefix_df(complete_prefix_w2v_df, title_w2v_df, 'complete_prefix_w2v') 389 | logger.info('finish get prefix df!') 390 | w2v_df = pd.concat([w2v_df, prefix_df, complete_prefix_df], axis=1) 391 | 392 | del prefix_df, complete_prefix_df, prefix_w2v_df, title_w2v_df 393 | gc.collect() 394 | 395 | query_df = self._get_query_df(df) 396 | logger.info('finish get query_df!') 397 | w2v_df = pd.concat([w2v_df, query_df], axis=1) 398 | 399 | w2v_df = w2v_df.drop(columns=['label']) 400 | 401 | train_data = w2v_df[:train_df_length] 402 | validate_data = w2v_df[train_df_length:train_df_length + validate_df_length] 403 | test_data = w2v_df[train_df_length + validate_df_length:] 404 | 405 | train_data_name = os.path.join(ETL_DATA_PATH, "train_w2v.csv") 406 | validate_data_name = os.path.join(ETL_DATA_PATH, "validate_w2v.csv") 407 | test_data_name = 
os.path.join(ETL_DATA_PATH, "test_w2v.csv") 408 | 409 | train_data.to_csv(train_data_name, index=False) 410 | validate_data.to_csv(validate_data_name, index=False) 411 | test_data.to_csv(test_data_name, index=False) 412 | 413 | 414 | if __name__ == "__main__": 415 | t0 = time.time() 416 | processing = Procossing(size=100, force=False) 417 | processing.get_processing() 418 | print(time.time() - t0) 419 | --------------------------------------------------------------------------------
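Note on the similarity features above: each text field is segmented, its word vectors are averaged, and the mean is unit-normalized with gensim's matutils.unitvec, so the later np.dot of two such vectors is a cosine similarity in [-1, 1]. Below is a minimal numpy-only sketch of that scheme; the helper names (unitvec, mean_unit_vector) and the toy vectors are illustrative stand-ins for w2v_model[word], not part of the repo.

# Illustrative sketch: why unit-normalizing the mean word vector lets
# np.dot act as a cosine similarity (same idea as _get_jieba_array /
# _get_w2v_similar above). Toy 3-d vectors stand in for real embeddings.
import numpy as np

def unitvec(v):
    # scale a vector to unit L2 norm (mirrors gensim.matutils.unitvec)
    norm = np.linalg.norm(v)
    return v / norm if norm > 0 else v

def mean_unit_vector(word_vectors):
    # average the per-word vectors, then normalize the mean
    return unitvec(np.array(word_vectors).mean(axis=0))

title_vecs = [np.array([1.0, 0.0, 1.0]), np.array([0.0, 1.0, 1.0])]
query_vecs = [np.array([1.0, 1.0, 0.0])]

title_arr = mean_unit_vector(title_vecs)
query_arr = mean_unit_vector(query_vecs)

# dot product of two unit vectors == cosine similarity
print(np.dot(query_arr, title_arr))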