├── DealData.py
├── GetData.py
├── Model.py
├── PrintData.py
├── README.md
├── VisualData.py
├── data
│   └── 可以从云盘下载文件到这里.txt
├── main.py
└── tool
    └── nlp_basic.py
/DealData.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import re
4 | import json
5 | import jieba
6 | import Levenshtein
7 | import logging
8 | import warnings
9 | import pickle
10 | warnings.filterwarnings('ignore')
11 |
12 | logging.basicConfig(level=logging.INFO,format="[%(asctime)s] %(message)s",datefmt="%Y-%m-%d %H:%M:%S",)
13 | pd.set_option('display.max_columns', 1000)
14 | pd.set_option('display.width', 1000)
15 |
16 | # ----- Abbreviation -----
17 | # ttv_data: train_test_validate_data
18 | # tpn_data: train_positive_negative_data
19 | # -------------------------
20 |
21 |
22 | # ----- deal data function -----
23 | def deal_data_label(data):  # clean the label column (drop dirty rows)
24 | data['label'] = data['label'].astype(str)
25 | judge = data['label'] == '音乐'
26 | data = data[~judge]
27 | data.reset_index(inplace=True, drop=True)
28 | data['label'] = data['label'].astype(int)
29 | return data
30 |
31 |
32 | def deal_data_flag(data, flag):
33 |     # add a flag marking the data split: 0 = train, 1 = test, 2 = validate
34 | data['data_flag'] = flag
35 | return data
36 |
37 |
38 | def deal_ttv_data_flag(train_data, test_data, validate_data):  # set split flags for the ttv data
39 | train_data = deal_data_flag(train_data, 0)
40 | test_data = deal_data_flag(test_data, 1)
41 | validate_data = deal_data_flag(validate_data, 2)
42 | return train_data, test_data, validate_data
43 |
44 |
45 | def deal_ttv_data_by_func(train_data, test_data, validate_data, deal_func):
46 |     # apply deal_func to each of the ttv data sets
47 | train_data = deal_func(train_data)
48 | test_data = deal_func(test_data)
49 | validate_data = deal_func(validate_data)
50 | return train_data, test_data, validate_data
51 |
52 |
53 | def deal_data_col_type(data):  # cast column types
54 | data['label'] = data['label'].astype(int)
55 | data['prefix'] = data['prefix'].astype(str)
56 | data['title'] = data['title'].astype(str)
57 | data['query_prediction'] = data['query_prediction'].astype(str)
58 | return data
59 |
60 |
61 | def deal_data_col_len(data):  # length features
62 |     data['prefix_len'] = data['prefix'].apply(lambda x: len(x))  # prefix length
63 |     data['title_len'] = data['title'].apply(lambda x: len(x))  # title length
64 | data['title_diff_prefix_len'] = data['title_len'] - data['prefix_len']
65 | return data
66 |
67 |
68 | #################################################################
69 | # query
70 | def move_useless_char(s):
71 |     # strip useless punctuation characters
72 | return re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+??!,。??、~@#¥%……&*()]+", "", s)
73 |
74 |
75 | def query_prediction_text(query_prediction):
76 | if (query_prediction == "{}") | (query_prediction == "") | pd.isna(query_prediction) | (query_prediction == "nan"):
77 | return ["PAD"]
78 | json_data = json.loads(query_prediction)
79 | result = sorted(json_data.items(), key=lambda d: d[1], reverse=True)
80 | texts = [move_useless_char(item[0]) for item in result]
81 | return texts
82 |
83 |
84 | def query_prediction_score(query_prediction):
85 | if (query_prediction == "{}") | (query_prediction == "") | pd.isna(query_prediction) | (query_prediction == "nan"):
86 | return [0]
87 | json_data = json.loads(query_prediction)
88 | result = sorted(json_data.items(), key=lambda d: d[1], reverse=True)
89 | scores = [float(item[1]) for item in result]
90 | return scores
91 |
92 |
93 | def deal_data_query_score(data):
94 | data['query_score'] = data['query_prediction'].apply(lambda x: query_prediction_score(x))
95 | data['query_score_max'] = data['query_score'].apply(lambda x: np.max(x))
96 | data['query_score_min'] = data['query_score'].apply(lambda x: np.min(x))
97 | data['query_score_mean'] = data['query_score'].apply(lambda x: np.mean(x))
98 | data['query_score_median'] = data['query_score'].apply(lambda x: np.median(x))
99 | data['query_score_sum'] = data['query_score'].apply(lambda x: np.sum(x))
100 | data['query_score_std'] = data['query_score'].apply(lambda x: np.std(x))
101 | data['query_score'] = data['query_score'].apply(lambda x: sorted(x, reverse=True))
102 | data['query_score'] = data['query_score'].apply(lambda x: x+[0 for _ in range(10-len(x))])
103 | for i in range(10):
104 | data['query_score_'+str(i)] = data['query_score'].apply(lambda x: x[i])
105 | data = data.drop(['query_score'], axis =1)
106 | return data
107 |
108 |
109 | def get_word_vector():
110 | word2vec = dict()
111 | with open("data/zh_word_vectors.txt", 'r', encoding="utf-8") as f:
112 | for line in f:
113 | line = line.strip()
114 | if not line:
115 | continue
116 | tokens = line.split()
117 | word = tokens[0]
118 | vecs = tokens[1:]
119 | tmp = []
120 | for vec in vecs:
121 | try:
122 | tmp.append(float(vec))
123 | except:
124 | pass
125 | word2vec[word] = np.array(tmp)
126 | return word2vec
127 |
128 |
129 | def get_text_vector(x, word2vec, default_vec):
130 | try:
131 | return word2vec[x]
132 | except:
133 | return default_vec
134 |
135 |
136 | def deal_data_query_word(data):
137 | data['query_word'] = data['query_prediction'].apply(lambda x: query_prediction_text(x))
138 | data['query_len'] = data['query_word'].apply(lambda x: len(x))
139 | temp_data = data['query_word'].apply(lambda x: [len(_x) for _x in x])
140 | data['query_word_max_len'] = temp_data.apply(lambda x: np.max(x) if len(x) > 0 else 0)
141 | data['query_word_min_len'] = temp_data.apply(lambda x: np.min(x) if len(x) > 0 else 0)
142 | data['query_word_mean_len'] = temp_data.apply(lambda x: np.mean(x) if len(x) > 0 else 0)
143 | data['query_word_median_len'] = temp_data.apply(lambda x: np.median(x) if len(x) > 0 else 0)
144 | data['query_word_sum_len'] = temp_data.apply(lambda x: np.sum(x) if len(x) > 0 else 0)
145 | data['query_word_std_len'] = temp_data.apply(lambda x: np.std(x) if len(x) > 0 else 0)
146 | data['query_word'] = data['query_word'].apply(lambda x: x+['PAD' for _ in range(10-len(x))])
147 |
148 | # word2vec = get_word_vector()
149 | # default_vec = np.array([0.0 for _ in range(len(word2vec[list(word2vec.keys())[0]]))])
150 | # temp_data = data[['prefix', 'query_word']].drop_duplicates('prefix')
151 | # for i in range(10):
152 | # temp_data['query_word_seg_'+str(i)] = temp_data['query_word'].apply(lambda x: "|".join(jieba.cut(str(x[i]))))
153 | # tmp_vec = temp_data['query_word_seg_'+str(i)].str.split("|").apply(lambda x: [get_text_vector(_x, word2vec, default_vec) for _x in x])
154 | # temp_data['query_word_seg_' + str(i) + '_vec'] = tmp_vec.apply(lambda x: np.sum(x, axis=0))
155 | # temp_data.drop(columns=['query_word'], inplace= True)
156 | # data = data.merge(temp_data, on='prefix', how='left')
157 | return data
158 |
159 |
160 | def deal_eig_value(similarity_matrix):
161 |     # similarity_matrix: symmetric matrix
162 | similarity_matrix = np.array(similarity_matrix)
163 | similarity_matrix = similarity_matrix + similarity_matrix.T
164 | similarity_matrix[np.eye(similarity_matrix.shape[0]) == 1] = 1
165 | eig_value = np.linalg.eig(similarity_matrix)[0]
166 | eig_value = [float(x) for x in eig_value]
167 | eig_value = sorted(eig_value, reverse=True) + [0 for _ in range(10 - len(eig_value))]
168 | return eig_value
169 |
170 |
171 | def deal_query_word_mutual_text_eig_vector(sub_word):
172 |     # eigenvalues of the containment-relation similarity matrix over query_word phrases
173 | sub_word = [x for x in sub_word if x != ""]
174 | if len(sub_word) > 0:
175 | similarity_matrix = []
176 | for _sw in sub_word:
177 | similarity = [1-(len(sw)-len(_sw))/max([len(sw), len(_sw)]) if _sw in sw else 0 for sw in sub_word ]
178 | similarity_matrix.append(similarity)
179 |         eig_value = deal_eig_value(similarity_matrix)  # compute eigenvalues
180 | else:
181 | eig_value = [0 for _ in range(10)]
182 | return eig_value
183 |
184 |
185 | def deal_query_word_levenshtein_ratio_eig_vector(sub_word):
186 |     # Levenshtein-ratio similarity between query_word phrases
187 | sub_word = [x for x in sub_word if x != ""]
188 | if len(sub_word) > 0:
189 | similarity_matrix = []
190 | for _sw in sub_word:
191 | similarity = [Levenshtein.ratio(_sw, sw) if _sw in sw else 0 for sw in sub_word ]
192 | similarity_matrix.append(similarity)
193 |         eig_value = deal_eig_value(similarity_matrix)  # compute eigenvalues
194 | else:
195 | eig_value = [0 for _ in range(10)]
196 | return eig_value
197 |
198 |
199 | def deal_query_word_levenshtein_distance_eig_vector(sub_word):
200 |     # Levenshtein-distance similarity between query_word phrases
201 | sub_word = [x for x in sub_word if x != ""]
202 | if len(sub_word) > 0:
203 | similarity_matrix = []
204 | for _sw in sub_word:
205 | similarity = [Levenshtein.distance(_sw, sw) if _sw in sw else 0 for sw in sub_word ]
206 | similarity_matrix.append(similarity)
207 |         eig_value = deal_eig_value(similarity_matrix)  # compute eigenvalues
208 | else:
209 | eig_value = [0 for _ in range(10)]
210 | return eig_value
211 |
212 |
213 | def deal_query_word_levenshtein_jaro_eig_vector(sub_word):
214 |     # Levenshtein-Jaro similarity between query_word phrases
215 | sub_word = [x for x in sub_word if x != ""]
216 | if len(sub_word) > 0:
217 | similarity_matrix = []
218 | for _sw in sub_word:
219 | similarity = [Levenshtein.jaro(_sw, sw) if _sw in sw else 0 for sw in sub_word ]
220 | similarity_matrix.append(similarity)
221 |         eig_value = deal_eig_value(similarity_matrix)  # compute eigenvalues
222 | else:
223 | eig_value = [0 for _ in range(10)]
224 | return eig_value
225 |
226 |
227 | def deal_data_query_sub_word_info(x):
228 |     # remove the prefix from each query_word
229 | try:
230 | rst = [re.sub(x['prefix'], "", _x) for _x in x['query_word']] if len(x['query_word']) > 0 else ['NAN']
231 | except:
232 | rst = [_x for _x in x['query_word']]
233 | return rst
234 |
235 |
236 | def deal_data_prefix_is_incomplete_input(detected_word, key_word):
237 | rest_word = detected_word.replace(key_word, "")
238 | if len(rest_word) > 0:
239 | return rest_word[0] == "|"
240 | else:
241 | return False
242 |
243 |
244 | def deal_data_query_word_information(data):
245 | temp_data = data[['prefix', 'query_word', 'prefix_word_seg']].drop_duplicates('prefix')
246 |
247 |     # check whether the prefix was typed as a complete word
248 | temp_data['query_word_seg_0'] = temp_data['query_word'].apply(lambda x: "|".join(jieba.cut(str(x[0]))))
249 | temp_data['prefix_is_incomplete_input'] = temp_data.apply(lambda x: deal_data_prefix_is_incomplete_input(x['query_word_seg_0'], x['prefix_word_seg']), axis=1).astype(int)
250 | data = data.merge(temp_data[['prefix', 'prefix_is_incomplete_input']], on='prefix', how='left')
251 | temp_data = temp_data.drop(['prefix_is_incomplete_input', 'query_word_seg_0', 'prefix_word_seg'], axis=1)
252 |
253 | temp_data['query_sub_word'] = temp_data[['prefix', 'query_word']].apply(lambda x: deal_data_query_sub_word_info(x), axis=1)
254 |     # query_word mutual-text information
255 | eig_values = temp_data['query_sub_word'].apply(lambda x: deal_query_word_mutual_text_eig_vector(x))
256 | for i in range(10):
257 | temp_data['mutual_text_eig_value_'+str(i)] = eig_values.apply(lambda x: x[i])
258 | data = data.merge(temp_data.drop(['query_word', 'query_sub_word'], axis=1), on='prefix', how='left')
259 | temp_data = temp_data[['prefix', 'query_word', 'query_sub_word']]
260 |
261 |     # levenshtein ratio mutual-text information
262 | eig_values = temp_data['query_sub_word'].apply(lambda x: deal_query_word_levenshtein_ratio_eig_vector(x))
263 | for i in range(10):
264 | temp_data['levenshtein_ratio_eig_value_'+str(i)] = eig_values.apply(lambda x: x[i])
265 | data = data.merge(temp_data.drop(['query_word', 'query_sub_word'], axis=1), on='prefix', how='left')
266 | temp_data = temp_data[['prefix', 'query_word', 'query_sub_word']]
267 |
268 |     # levenshtein distance mutual-text information
269 | eig_values = temp_data['query_sub_word'].apply(lambda x: deal_query_word_levenshtein_distance_eig_vector(x))
270 | for i in range(10):
271 | temp_data['levenshtein_distance_eig_value_' + str(i)] = eig_values.apply(lambda x: x[i])
272 | data = data.merge(temp_data.drop(['query_word', 'query_sub_word'], axis=1), on='prefix', how='left')
273 | return data
274 |
275 |
276 | #################################################################
277 | # ----- is-features + prefix -----
278 | def deal_prefix_is_in_title(data):
279 | data['is_prefix_in_title'] = data.apply(lambda x: int(x['prefix'] in x['title']), axis=1)
280 | return data
281 |
282 |
283 | def deal_title_is_in_query_keys(data):
284 | data['is_title_in_query_keys'] = data.apply(lambda x: int(sum([int(x['title'] in _x) for _x in x['query_word']])>0), axis=1)
285 | return data
286 |
287 |
288 | # all Chinese characters?
289 | def deal_prefix_is_all_chinese_word(data):
290 | judge = data['prefix'].apply(lambda x:len(re.findall("[0-9|a-z|A-Z|+??!,。??、~@#¥%……&*()|\s+\.\!\/_,$%^*(+\"\']", x)) == 0)
291 | data['is_all_chinese_word'] = 0
292 | data.loc[judge, 'is_all_chinese_word'] = 1
293 | return data
294 |
295 |
296 | # all digits?
297 | def deal_prefix_is_all_number(data):
298 | judge = data['prefix'].apply(lambda x:len(re.findall("\D", x))==0)
299 | data['is_all_number'] = 0
300 | data.loc[judge, 'is_all_number'] = 1
301 | return data
302 |
303 |
304 | # all English letters?
305 | def deal_prefix_is_all_english(data):
306 |     judge = data['prefix'].apply(lambda x:len(re.findall("[a-z|A-Z]", x)) == len(x))
307 |     # data[judge]
308 | data['is_all_English'] = 0
309 | data.loc[judge, 'is_all_English'] = 1
310 | return data
311 |
312 |
313 | # all upper-case English letters?
314 | def deal_prefix_is_all_upper_english(data):
315 | judge = data['prefix'].apply(lambda x: len(re.findall("[A-Z]", x)) == len(x))
316 | # data[judge]
317 | data['is_all_upper_english'] = 0
318 | data.loc[judge, 'is_all_upper_english'] = 1
319 | return data
320 |
321 |
322 | # all lower-case English letters?
323 | def deal_prefix_is_all_lower_english(data):
324 |     judge = data['prefix'].apply(lambda x: len(re.findall("[a-z]", x)) == len(x))
325 |     data['is_all_lower_english'] = 0  # fixed column name so it matches the assignment below
326 | data.loc[judge, 'is_all_lower_english'] = 1
327 | return data
328 |
329 |
330 | # all special symbols?
331 | def deal_prefix_is_all_symbol(data):
332 | judge = data['prefix'].apply(lambda x:len(re.findall("\w", x))==0)
333 | data['is_all_symbol'] = 0
334 | data.loc[judge, 'is_all_symbol'] = 1
335 | return data
336 |
337 |
338 | # Chinese and English mixed?
339 | def deal_prefix_is_combine_chinese_english(data):
340 | judge = data['prefix'].apply(lambda x: len(re.findall("[\u4e00-\u9fa5]+[a-z|A-Z]+|[a-z|A-Z]+[\u4e00-\u9fa5]+", x))>0)
341 | data['is_combine_chinese_english'] = 0
342 | data.loc[judge, 'is_combine_chinese_english'] = 1
343 | return data
344 |
345 |
346 | # Chinese characters and digits mixed?
347 | def deal_prefix_is_combine_chinese_number(data):
348 | judge = data['prefix'].apply(lambda x: len(re.findall("[\u4e00-\u9fa5]+[0-9]+|[0-9]+[\u4e00-\u9fa5]+", x))>0)
349 | data['is_combine_chinese_number'] = 0
350 | data.loc[judge, 'is_combine_chinese_number'] = 1
351 | return data
352 |
353 |
354 | # English letters and digits mixed?
355 | def deal_prefix_is_combine_english_number(data):
356 | judge = data['prefix'].apply(lambda x: len(re.findall("[0-9]+[a-z|A-Z]+|[a-z|A-Z]+[0-9]+", x))>0)
357 | data['is_combine_english_number'] = 0
358 | data.loc[judge, 'is_combine_english_number'] = 1
359 | return data
360 |
361 |
362 | # is a URL (ends with .com)?
363 | def deal_prefix_is_network_ip(data):
364 | judge = data['prefix'].apply(lambda x: len(re.findall("\.(com)$", x))>0)
365 | data['is_network_ip'] = 0
366 | data.loc[judge, 'is_network_ip'] = 1
367 | return data
368 |
369 |
370 | # number of distinct tags a prefix belongs to
371 | def deal_prefix_belongs_tag_number(data):
372 | temp_data = data.groupby(['prefix', 'tag'], as_index=False)['query_prediction'].agg({'prefix_belongs_tag_count': 'count'})
373 | temp_data = temp_data.groupby('prefix', as_index=False)['prefix_belongs_tag_count'].count()
374 | data = data.merge(temp_data, on='prefix', how='left')
375 | return data
376 |
377 |
378 | # number of distinct titles a prefix belongs to
379 | def deal_prefix_belongs_title_number(data):
380 | temp_data = data.groupby(['prefix', 'title'], as_index=False)['query_prediction'].agg({'prefix_belongs_title_count': 'count'})
381 | temp_data = temp_data.groupby('prefix', as_index=False)['prefix_belongs_title_count'].count()
382 | data = data.merge(temp_data, on='prefix', how='left')
383 | return data
384 |
385 |
386 | def deal_data_title_word(data):
387 | temp_data = data[['title']].drop_duplicates('title')
388 | temp_data['title_word_seg'] = temp_data['title'].apply(lambda x: "|".join(jieba.cut(x)))
389 | temp_data['title_word_seg_len'] = temp_data['title_word_seg'].apply(lambda x: len(x.split("|")))
390 | data = data.merge(temp_data, on='title', how='left')
391 | return data
392 |
393 |
394 | def deal_data_prefix_word(data):
395 | temp_data = data[['prefix']].drop_duplicates('prefix')
396 | temp_data['prefix_word_seg'] = temp_data['prefix'].apply(lambda x: "|".join(jieba.cut(x)))
397 | temp_data['prefix_word_seg_len'] = temp_data['prefix_word_seg'].apply(lambda x: len(x.split("|")))
398 | data = data.merge(temp_data, on='prefix', how='left')
399 | return data
400 |
401 |
402 | # static feature
403 | def get_ctr_feature(cols, data, train_data, is_add_0926_data, is_debug):
404 | ctr_feature_dict = {}
405 | for col in cols:
406 | tmp = train_data.groupby(col, as_index=False)["label"].agg({col + "_click": "sum", col + "_show": "count"})
407 | tmp[col + "_ctr"] = tmp[col + "_click"] / (tmp[col + "_show"] + 3)
408 | for tmp_col in [col + "_show", col + "_click", col + "_ctr"]:
409 |             tmp[tmp_col] = tmp[[col, tmp_col]].apply(lambda x: x[tmp_col] if x[col] != "PAD" else -1, axis=1)  # mask the "PAD" key, as in the pairwise branch below
410 | ctr_feature_dict[col] = tmp
411 | data = pd.merge(data, tmp, on=col, how="left")
412 |
413 | for i in range(len(cols)):
414 | for j in range(i + 1, len(cols)):
415 | group_col = [cols[i], cols[j]]
416 | tmp = train_data.groupby(group_col, as_index=False)["label"].agg(
417 | {"_".join(group_col) + "_click": "sum", "_".join(group_col) + "_show": "count"})
418 | tmp["_".join(group_col) + "_ctr"] = tmp["_".join(group_col) + "_click"] / (
419 | tmp["_".join(group_col) + "_show"] + 3)
420 | for tmp_col in ["_".join(group_col) + "_show", "_".join(group_col) + "_click",
421 | "_".join(group_col) + "_ctr"]:
422 | tmp[tmp_col] = tmp[group_col + [tmp_col]].apply(
423 | lambda x: x[tmp_col] if "PAD" not in x[group_col].values else -1, axis=1)
424 | ctr_feature_dict["_".join(group_col)] = tmp
425 | data = pd.merge(data, tmp, on=group_col, how="left")
426 |
427 | group_col = cols
428 | tmp = train_data.groupby(group_col, as_index=False)["label"].agg({"_".join(group_col) + "_click": "sum", "_".join(group_col) + "_show": "count"})
429 | tmp["_".join(group_col) + "_ctr"] = tmp["_".join(group_col) + "_click"] / (tmp["_".join(group_col) + "_show"] + 3)
430 | ctr_feature_dict["_".join(group_col)] = tmp
431 | data = pd.merge(data, tmp, on=cols, how="left")
432 |     if is_debug == 0:  # in debug mode, do not save the ctr_feature data
433 |         if is_add_0926_data == 1:  # whether the 0926 data is included
434 | with open('data/ctr_feature_dict_0926.pickle', 'wb') as f:
435 | pickle.dump(ctr_feature_dict, f, pickle.HIGHEST_PROTOCOL)
436 | else:
437 | with open('data/ctr_feature_dict.pickle', 'wb') as f:
438 | pickle.dump(ctr_feature_dict, f, pickle.HIGHEST_PROTOCOL)
439 |
440 | data = data.fillna(-1)
441 | return data
442 |
443 |
444 | def deal_static_feature(data, mode='unload', is_add_0926_data=1, is_debug=1):
445 |     if mode == 'load':  # load precomputed ctr features from disk
446 | if is_add_0926_data == 1:
447 | with open("data/ctr_feature_dict_0926.pickle", 'rb') as f:
448 | ctr_feature_dict = pickle.load(f)
449 | else:
450 | with open("data/ctr_feature_dict.pickle", 'rb') as f:
451 | ctr_feature_dict = pickle.load(f)
452 | for key in list(ctr_feature_dict.keys()):
453 | tmp_data = ctr_feature_dict[key]
454 | data = data.merge(tmp_data, on=key.split("_"), how='left')
455 | data = data.fillna(-1)
456 | else:
457 | train_data = data[data['data_flag'] == 0]
458 | train_data.columns.tolist()
459 | cols = ["prefix", "title", "tag"]
460 | data = get_ctr_feature(cols, data, train_data, is_add_0926_data, is_debug)
461 | return data
462 |
463 |
464 | def deal_drop_data(data):
465 | data = data.select_dtypes(include=['number'])
466 |     # -------- divider --------
467 | # data = data.drop([x for x in data.columns.tolist() if 'mutual_text_eig_value' in x], axis=1)
468 | # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_ratio' in x], axis=1)
469 | # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_distance' in x], axis=1)
470 |
471 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_is_incomplete_input' in x], axis=1)
472 | # data = data.drop([x for x in data.columns.tolist() if 'title_word_seg_len' in x], axis=1)
473 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_word_seg_len' in x], axis=1)
474 |
475 |     # for version 6, comment out the statements above and run the statements below
476 | # # data = data.drop([x for x in data.columns.tolist() if 'mutual_text_eig_value' in x], axis=1)
477 | # # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_ratio' in x], axis=1)
478 | # # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_distance' in x], axis=1)
479 | #
480 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_is_incomplete_input' in x], axis=1)
481 | # data = data.drop([x for x in data.columns.tolist() if 'title_word_seg_len' in x], axis=1)
482 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_word_seg_len' in x], axis=1)
483 |
484 | return data
485 |
486 |
487 | def extral_drop_feature(data):
488 | # data = data.drop([x for x in data.columns.tolist() if 'mutual_text_eig_value' in x], axis=1)
489 | # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_ratio' in x], axis=1)
490 | # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_distance' in x], axis=1)
491 |
492 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_is_incomplete_input' in x], axis=1)
493 | # data = data.drop([x for x in data.columns.tolist() if 'title_word_seg_len' in x], axis=1)
494 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_word_seg_len' in x], axis=1)
495 | return data
496 |
497 |
498 | def deal_data_main(data, static_feature_mode="unload", is_add_0926_data=1, is_debug=1):
499 |     # main feature-engineering routine
500 | logging.info("start deal data feature ...")
501 | data = deal_data_label(data)
502 |     data = deal_data_col_type(data)  # cast the types of the key columns
503 | logging.info("col type finish ...")
504 |     data = deal_data_col_len(data)  # length features
505 | logging.info("col len finish ...")
506 |
507 |     data = deal_prefix_is_in_title(data)  # is the prefix contained in the title?
508 |     data = deal_prefix_is_all_chinese_word(data)  # all Chinese characters?
509 |     data = deal_prefix_is_all_number(data)  # all digits?
510 |     data = deal_prefix_is_all_english(data)  # all English letters?
511 |     data = deal_prefix_is_all_upper_english(data)  # all upper-case English letters?
512 |     data = deal_prefix_is_all_lower_english(data)  # all lower-case English letters?
513 |     data = deal_prefix_is_all_symbol(data)  # all special symbols?
514 |     data = deal_prefix_is_combine_chinese_english(data)  # Chinese and English mixed?
515 |     data = deal_prefix_is_combine_chinese_number(data)  # Chinese and digits mixed?
516 |     data = deal_prefix_is_combine_english_number(data)  # English and digits mixed?
517 |     data = deal_prefix_is_network_ip(data)  # is a URL?
518 |     data = data.fillna(0)  # fill missing values with 0
519 | logging.info("is feature finish ...")
520 |
521 |     data = deal_prefix_belongs_tag_number(data)  # number of tags a prefix belongs to
522 |     data = deal_prefix_belongs_title_number(data)  # number of titles a prefix belongs to
523 | logging.info("belongs finish ...")
524 |
525 |     data = deal_data_query_score(data)  # query_score statistics features
526 | logging.info("query score finish ...")
527 |
528 |     data = deal_data_query_word(data)  # query_word features
529 | logging.info("query word finish ...")
530 |
531 |     data = deal_data_title_word(data)  # title segmentation features
532 | logging.info("title word finish...")
533 |
534 | data = deal_data_prefix_word(data)
535 | logging.info("prefix word finish...")
536 |
537 |     data = deal_static_feature(data, static_feature_mode, is_add_0926_data, is_debug)  # CTR statistics features
538 | logging.info("static finish ...")
539 |
540 | data = deal_data_query_word_information(data)
541 | logging.info("query_word_information finish ...")
542 |
543 | data = deal_drop_data(data)
544 |
545 | return data
546 |
547 |
548 | if __name__ == '__main__':
549 | from GetData import *
550 |     is_deal_data = 1  # 1 = process data, 0 = read preprocessed data directly
551 |     is_add_0926_data = 1  # 1 = include the 0926 data, 0 = exclude it
552 |     is_debug = 1  # 1 = debug mode, 0 = full run
553 |
554 |     train_data, test_data, validate_data = get_ttv_data(is_debug, is_add_0926_data, is_test_b=0)  # is_test_b=0: use the A test set
555 |     train_data, test_data, validate_data = deal_ttv_data_flag(train_data, test_data, validate_data)
556 |     all_data = get_merge_data(train_data, test_data, validate_data)  # merge the ttv data
557 |     all_data = deal_data_main(all_data, "unload", is_add_0926_data)  # build features
558 |     print(all_data.columns.tolist())
559 |     # newly added features:
560 |     # ['prefix_is_incomplete_input', 'prefix_word_seg', 'title_word_seg_len', 'prefix_word_seg_len']
--------------------------------------------------------------------------------
/GetData.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | pd.set_option('display.max_columns', 1000)
3 | pd.set_option('display.width', 1000)
4 |
5 |
6 | # ----- Abbreviation -----
7 | # ttv_data: train_test_validate_data
8 | # tpn_data: train_positive_negative_data
9 | # -------------------------
10 |
11 |
12 | # ----- get data function -----
13 | def get_ttv_data(is_debug, is_add_0926_validate_data, is_test_b):  # load the ttv data
14 |     # read the raw competition files
15 | col_names = ['prefix', 'query_prediction', 'title', 'tag', 'label']
16 | if is_debug == 1:
17 | train_data = pd.read_csv("data/oppo_round1_train_20180929.txt", names=col_names, sep="\t", low_memory=False, nrows=250)
18 | test_data = pd.read_csv("data/oppo_round1_test_A_20180929.txt", names=col_names, sep="\t", low_memory=False, nrows=250)
19 | test_data['label'] = '0'
20 | validate_data = pd.read_csv("data/oppo_round1_vali_20180929.txt", names=col_names, sep="\t", low_memory=False, nrows=250)
21 | else:
22 | train_data = pd.read_csv("data/oppo_round1_train_20180929.txt", names=col_names, sep="\t", low_memory=False)
23 |
24 | if is_test_b == 1:
25 | test_data = pd.read_csv("data/oppo_round1_test_B_20181106.txt", names=col_names, sep="\t", low_memory=False)
26 | test_data['label'] = '0'
27 | else:
28 | test_data = pd.read_csv("data/oppo_round1_test_A_20180929.txt", names=col_names, sep="\t", low_memory=False)
29 | test_data['label'] = '0'
30 |
31 | validate_data = pd.read_csv("data/oppo_round1_vali_20180929.txt", names=col_names, sep="\t", low_memory=False)
32 | if is_add_0926_validate_data == 1:
33 | train_0926_data = pd.read_csv("data/oppo_round1_train_20180926.txt", names=col_names, sep="\t", low_memory=False)
34 | train_data = pd.concat([train_data, train_0926_data], ignore_index=True)
35 | validate_0926_data = pd.read_csv("data/oppo_round1_vali_20180926.txt", names=col_names, sep="\t", low_memory=False)
36 | validate_data = pd.concat([validate_data, validate_0926_data], ignore_index=True)
37 | return train_data, test_data, validate_data
38 |
39 |
40 | def get_merge_data(train_data, test_data, validate_data):
41 |     # concatenate the ttv data
42 | data = pd.concat([train_data, test_data, validate_data], ignore_index=True)
43 | return data
44 |
45 |
46 | def get_positive_data(data):
47 |     # positive samples (label == 1)
48 | positive_data = data[data['label'] == 1]
49 | return positive_data
50 |
51 |
52 | def get_negative_data(data):
53 |     # negative samples (label == 0)
54 | negative_data = data[data['label'] == 0]
55 | return negative_data
56 |
57 |
58 | def get_group_data_by_col(data, group_col_name, new_col_name, static_name, static_method):
59 |     # aggregate by a single column
60 | group_data = data[[group_col_name, static_name]].groupby(by=group_col_name, as_index=False).agg({static_name: static_method})
61 | group_data.rename(columns={static_name: new_col_name}, inplace=True)
62 | return group_data
63 |
64 |
65 | def get_group_data_by_collist(data, group_col_name, new_col_name, static_name, static_method):
66 |     # aggregate by a list of columns
67 | group_data = data[group_col_name+[static_name]].groupby(by=group_col_name, as_index=False).agg({static_name: static_method})
68 | group_data.rename(columns={static_name: new_col_name}, inplace=True)
69 | return group_data
70 |
71 |
72 | def get_tpn_group_data_by_col(train_data, positive_data, negative_data, group_col_name, static_name, static_method):
73 |     # aggregate the tpn data by a single column
74 | train_new_col_name = group_col_name+"_"+static_method
75 | positive_new_col_name = 'positive_'+group_col_name+"_"+static_method
76 | negative_new_col_name = 'negative_'+group_col_name+"_"+static_method
77 | group_train_data = get_group_data_by_col(train_data, group_col_name, train_new_col_name, static_name, static_method)
78 | group_positive_data = get_group_data_by_col(positive_data, group_col_name, positive_new_col_name, static_name, static_method)
79 | group_negative_data = get_group_data_by_col(negative_data, group_col_name, negative_new_col_name, static_name, static_method)
80 | group_data = pd.merge(group_train_data, group_positive_data, on=group_col_name, how='left')
81 | group_data = pd.merge(group_data, group_negative_data, on=group_col_name, how='left')
82 | group_data.fillna(0, inplace=True)
83 | group_data = get_data_rate(group_data, positive_new_col_name, train_new_col_name)
84 | group_data.sort_values(by=train_new_col_name, ascending=False, inplace=True)
85 | return group_data
86 |
87 |
88 | def get_merge_col_name(col_list):
89 |     # join the column names in the list into one name
90 | merge_col_name = "".join(["@"+x for x in col_list])
91 | merge_col_name = "merge"+merge_col_name
92 | return merge_col_name
93 |
94 |
95 | def get_tpn_group_data_by_collist(train_data, positive_data, negative_data, group_col_name, static_name, static_method):
96 | merge_group_col_name = get_merge_col_name(group_col_name)
97 | train_new_col_name = merge_group_col_name+"_"+static_method
98 | positive_new_col_name = 'positive_'+merge_group_col_name+"_"+static_method
99 | negative_new_col_name = 'negative_'+merge_group_col_name+"_"+static_method
100 |
101 | group_train_data = get_group_data_by_collist(train_data, group_col_name, train_new_col_name, static_name, static_method)
102 | group_positive_data = get_group_data_by_collist(positive_data, group_col_name, positive_new_col_name, static_name, static_method)
103 | group_negative_data = get_group_data_by_collist(negative_data, group_col_name, negative_new_col_name, static_name, static_method)
104 |
105 | group_data = pd.merge(group_train_data, group_positive_data, on=group_col_name, how='left')
106 | group_data = pd.merge(group_data, group_negative_data, on=group_col_name, how='left')
107 | group_data.fillna(0, inplace=True)
108 | group_data = get_data_rate(group_data, positive_new_col_name, train_new_col_name)
109 | group_data = get_data_idf(group_data, train_new_col_name)
110 | group_data.sort_values(by=train_new_col_name, ascending=False, inplace=True)
111 | return group_data
112 |
113 |
114 | def get_data_rate(data, col_name_1, col_name_2):
115 | data['rate'] = (data[col_name_1]).div(data[col_name_2])
116 | return data
117 |
118 |
119 | def get_data_idf(data, col_name):
120 | sum_value = data[col_name].sum()
121 | data['idf'] = data[col_name]/sum_value
122 | return data
123 |
124 |
125 | def detect_train_validate_distribution(train_data, validate_data):
126 | col_name = 'prefix'
127 | head_n = 10
128 | train_positive_data = get_positive_data(train_data)
129 | train_negative_data = get_negative_data(train_data)
130 | validate_positive_data = get_positive_data(validate_data)
131 | validate_negative_data = get_negative_data(validate_data)
132 |
133 | train_group_data = get_tpn_group_data_by_col(train_data, train_positive_data, train_negative_data, col_name, 'query_prediction', 'count')
134 | validate_group_data = get_tpn_group_data_by_col(validate_data, validate_positive_data, validate_negative_data, col_name, 'query_prediction', 'count')
135 | print(train_group_data.head(head_n))
136 | judge = validate_group_data[col_name].isin(train_group_data[col_name].head(head_n))
137 | print(validate_group_data[judge])
138 |
139 | train_value_rate = train_group_data['rate'].head(head_n)
140 | validate_value_rate = validate_group_data[judge]['rate']
141 | print("mean_value:", [train_value_rate.mean(), validate_value_rate.mean()])
142 | print("std_value:", [train_value_rate.std(), validate_value_rate.std()])
--------------------------------------------------------------------------------
/Model.py:
--------------------------------------------------------------------------------
1 | from GetData import *
2 | import DealData
3 | from itertools import combinations
4 | import pickle
5 | import numpy as np
6 | import logging
7 | from sklearn.metrics import f1_score
8 | from sklearn.model_selection import StratifiedKFold
9 | import lightgbm as lgb
10 | import xgboost as xgb
11 | logging.basicConfig(level=logging.INFO,format="[%(asctime)s] %(message)s",datefmt="%Y-%m-%d %H:%M:%S",)
12 |
13 | # ----- model class -----
14 | class BaseModel:
15 | def __init__(self, data, positive_data, negative_data):
16 | self.data = data
17 | self.positive_data = positive_data
18 | self.negative_data = negative_data
19 | self.__base_model_data = None
20 | self.predict_result = None
21 | self.threshold_value = 1/3
22 | self.f1_score = 0
23 | self.precision = 0
24 | self.recall = 0
25 |
26 |
27 | def set_threshold_value(self, value):
28 | self.threshold_value = value
29 |
30 | def find_best_threshold_value(self, value_list, data):
31 | for value in value_list:
32 | print("#" * 100)
33 | print(value / 100)
34 | self.set_threshold_value(value / 100)
35 | self.train()
36 | self.predict(data)
37 | self.score("BaseModel")
38 |
39 | def train(self):
40 | group_data = get_tpn_group_data_by_col(self.data, self.positive_data, self.negative_data, 'tag', 'prefix', 'count')
41 | group_data['predict_label'] = 0
42 | judge = group_data['rate'] > self.threshold_value
43 | group_data.loc[judge, 'predict_label'] = 1
44 | self.__base_model_data = group_data[['tag', 'predict_label']]
45 |
46 | def predict(self, data):
47 | new_predict_data = pd.merge(data, self.__base_model_data, on='tag', how='left')
48 | self.predict_result = new_predict_data[['label', 'predict_label']]
49 |
50 | def score(self, model_name):
51 | from sklearn.metrics import f1_score
52 | score = f1_score(self.predict_result['label'].astype(int), self.predict_result['predict_label'].astype(int), pos_label=1)
53 | print(model_name+" score:"+str(score))
54 | self.f1_score = score
55 |
56 | def precision_score(self, model_name):
57 | from sklearn.metrics import precision_score
58 | score = precision_score(self.predict_result['label'], self.predict_result['predict_label'], pos_label=1)
59 | self.precision = score
60 | print(model_name + " score:" + str(score))
61 |
62 | def recall_score(self, model_name):
63 | from sklearn.metrics import recall_score
64 | score = recall_score(self.predict_result['label'], self.predict_result['predict_label'], pos_label=1)
65 | self.recall = score
66 | print(model_name + " score:" + str(score))
67 |
68 | def output_result(self, path):
69 | self.predict_result['predict_label'].to_csv(path, header=False, index=False, encoding='utf8')
70 |
71 | def reverse_predict_result(self):
72 | judge1 = self.predict_result['predict_label'] == 0
73 | judge2 = self.predict_result['predict_label'] == 1
74 | self.predict_result.loc[judge1, 'predict_label'] = 1
75 | self.predict_result.loc[judge2, 'predict_label'] = 0
76 |
77 |
78 | # ----- CombSearchModel -----
79 | class CombSearchModel(BaseModel):
80 | def __init__(self, data, positive_data, negative_data):
81 | super().__init__(data, positive_data, negative_data)
82 | self.__cs_model_data = pd.DataFrame()
83 | self.support_num = 50 # the number to guarantee the model stability
84 | self.support_rate = 0.7 # the rate to guarantee the model score
85 | self.candidate_list = ['tag', 'prefix', 'title', 'query_len', 'prefix_len', 'title_len']
86 |         self.comb_num = 4  # maximum size of a feature combination
87 | self.count_num = 0
88 |         self.combined_feature_data = {}  # stores the combined-feature statistics
89 |
90 |     def set_candidate_list(self, candidate_list):  # set the candidate feature list
91 | self.candidate_list = candidate_list
92 |
93 |     def set_support_num(self, support_num):  # set the support count
94 | self.support_num = support_num
95 |
96 |     def set_support_rate(self, support_rate):  # set the support rate
97 | self.support_rate = support_rate
98 |
99 |     def get_train_combined_feature_data(self):  # build the combined-feature statistics
100 | print("CombModel Train")
101 | for comb_len in list(range(1, self.comb_num+1)):
102 | comb_lists = list(combinations(self.candidate_list, comb_len))
103 | for comb_list in comb_lists:
104 | group_col_list = [x for x in comb_list]
105 | merge_col_name = get_merge_col_name(group_col_list)
106 | group_data = get_tpn_group_data_by_collist(self.data, self.positive_data, self.negative_data, group_col_list, 'query_prediction', 'count')
107 | if group_col_list == ["tag"]:
108 | train_group_data = group_data.copy()
109 | else:
110 | judge1 = (group_data[merge_col_name + "_count"] > self.support_num)
111 | judge2 = (group_data['rate'] >= self.support_rate) | (group_data['rate'] <= (1 - self.support_rate))
112 | judge = judge1 & judge2
113 | train_group_data = group_data[judge].copy()
114 | if train_group_data.shape[0] > 0:
115 | self.count_num += train_group_data[merge_col_name + "_count"].sum()
116 | print(group_col_list, 'data shape:', train_group_data.shape[0], "count_num",
117 | train_group_data[merge_col_name + "_count"].sum())
118 | self.combined_feature_data[merge_col_name] = train_group_data
119 |
120 | def train(self):
121 | self.get_train_combined_feature_data()
122 |
123 | def predict(self, data):
124 | keys = list(self.combined_feature_data.keys())
125 | predict_data = data.copy()
126 | for key in keys:
127 | merge_col = key.split("@")[1:]
128 | new_merge_col = merge_col + ['rate']
129 | predict_data = pd.merge(predict_data, self.combined_feature_data[key][new_merge_col], on=merge_col, how='left')
130 |             predict_data.rename(columns={"rate": "rate_"+key}, inplace=True)  # rename so each key keeps its own rate column
131 | rate_judge = ["rate" in x for x in list(predict_data.columns)]
132 | rate_cols = list(predict_data.columns[rate_judge])
133 | predict_data['positive_rate'] = predict_data[rate_cols].max(axis=1)
134 | predict_data['negative_rate'] = 1 - predict_data[rate_cols].min(axis=1)
135 | predict_data['predict_label'] = 1
136 |
137 | judge = predict_data['positive_rate'] < predict_data['negative_rate']
138 | predict_data.loc[judge, 'predict_label'] = 0
139 | predict_data[['positive_rate', 'label', 'predict_label']].sort_values(by='positive_rate')
140 | self.predict_result = predict_data[['label', 'predict_label']]
141 |
142 |
143 | # ----- EnsembleModel -----
144 | class BaseFeatureEnsembleModel(CombSearchModel):
145 | def __init__(self, data, positive_data, negative_data):
146 | super().__init__(data, positive_data, negative_data)
147 | self.ef_model = {}
148 | self.cv_k = 1
149 | self.data_col = []
150 |
151 |     def save_combined_feature_data(self, data_name):  # save the combined-feature data
152 | with open(data_name, 'wb') as f:
153 | pickle.dump(self.combined_feature_data, f, pickle.HIGHEST_PROTOCOL)
154 |
155 |     def update_combined_feature_data(self, data_name):  # reload the combined-feature data
156 |         # load the saved statistics from disk
157 | with open(data_name, 'rb') as f:
158 | self.combined_feature_data = pickle.load(f)
159 |
160 | def get_ef_data(self, data):
161 | _ef_data = data.select_dtypes(include=['number'])
162 | _ef_data = _ef_data.drop(['data_flag'], axis=1)
163 | return _ef_data
164 |
165 |
166 | class LgbFeatureEnsembleModel(BaseFeatureEnsembleModel):
167 | def __init__(self, data, positive_data, negative_data):
168 | super().__init__(data, positive_data, negative_data)
169 | self.cut_value = 0.3
170 |         self.train_device = 'cpu'  # training device
171 |
172 | def f1_score_metric(self, pred, d_valid):
173 | label = d_valid.get_label()
174 | pred = [int(i >= self.cut_value) for i in pred]
175 | return "f1_score", f1_score(label, pred), True
176 |
177 |
178 |     def set_train_device(self, device='cpu'):  # set the training device
179 | self.train_device = device
180 |
181 | def train(self):
182 | self.data_col = self.data.columns.tolist()
183 | _ef_data = self.get_ef_data(self.data)
184 | X = np.array(_ef_data.drop(['label'], axis=1))
185 | y = np.array(_ef_data['label'])
186 | result_logloss = []
187 | skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
188 | if self.train_device == 'cpu':
189 | params = {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss',
190 | 'num_leaves': 32, 'learning_rate': 0.05, 'feature_fraction': 0.3, 'bagging_fraction': 0.8,
191 | 'bagging_freq': 5, 'verbose': -1, 'device': 'cpu', }
192 | else:
193 | params = {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss',
194 | 'num_leaves': 32, 'learning_rate': 0.05, 'feature_fraction': 0.1, 'bagging_fraction': 0.8,
195 | 'bagging_freq': 5, 'verbose': -1, 'device': 'gpu', 'gpu_platform_id': 0,'gpu_device_id': 0,
196 | }
197 | for k, (train_in, test_in) in enumerate(skf.split(X, y)):
198 | if k < self.cv_k:
199 |                 logging.info("train K fold " + str(k))
200 | X_train, X_valid, y_train, y_valid = X[train_in], X[test_in], y[train_in], y[test_in]
201 | lgb_train = lgb.Dataset(X_train, y_train)
202 | lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
203 | gbm = lgb.train(params, lgb_train, num_boost_round=5000, valid_sets=lgb_eval, early_stopping_rounds=500, verbose_eval=250, feval=self.f1_score_metric)
204 | valid_f1_score = f1_score(y_valid, np.where(gbm.predict(X_valid, num_iteration=gbm.best_iteration) > self.cut_value, 1, 0))
205 | print("best_iteration: ", gbm.best_iteration)
206 | print("valid_f1_score: ", valid_f1_score)
207 | result_logloss.append(gbm.best_score['valid_0']['binary_logloss'])
208 | self.ef_model[str(k)] = gbm
209 | feature_importances = sorted(zip(_ef_data.columns.drop('label'), gbm.feature_importance()), key=lambda x: x[1], reverse=True)
210 | print('feature_importances', feature_importances)
211 |
212 | def save_model(self, model_name):
213 | with open(model_name, 'wb') as f:
214 | pickle.dump(self.ef_model, f, pickle.HIGHEST_PROTOCOL)
215 |
216 | def update_model(self, model_name):
217 |         with open(model_name, 'rb') as f:
218 |             self.ef_model = pickle.load(f)
219 |
220 | def predict(self, data):
221 | result_submit = []
222 | _ef_data = self.get_ef_data(data)
223 | for key in self.ef_model.keys():
224 | gbm = self.ef_model[key]
225 | result_submit.append(gbm.predict(_ef_data.drop(columns=['label']), num_iteration=gbm.best_iteration))
226 | self.predict_result = data.copy()
227 | self.predict_result['predict_label'] = list(np.sum(np.array(result_submit), axis=0) / len(result_submit))
228 | self.predict_result['predict_label'] = self.predict_result['predict_label'].apply(lambda x: 1 if x > self.cut_value else 0)
229 | self.predict_result = self.predict_result[['label', 'predict_label']]
230 |
231 |
232 | class LogisticRegression(BaseFeatureEnsembleModel):
233 |
234 | def __init__(self, data, positive_data, negative_data):
235 | super().__init__(data, positive_data, negative_data)
236 |
237 | def train(self):
238 | print('abc')
239 |
240 | def predict(self):
241 | print('abc')
242 |
243 |
244 | class XgbFeatureEnsembleModel(BaseFeatureEnsembleModel):
245 | def __init__(self, data, positive_data, negative_data):
246 | super().__init__(data, positive_data, negative_data)
247 | self.cut_value = 0.3
248 |
249 | def f1_score_metric(self, pred, d_valid):
250 | label = d_valid.get_label()
251 | pred = [int(i >= self.cut_value) for i in pred]
252 | return "f1_score", f1_score(label, pred)
253 |
254 | def train(self):
255 | _ef_data = self.get_ef_data(self.data)
256 | X = np.array(_ef_data.drop(['label'], axis=1))
257 | y = np.array(_ef_data['label'])
258 | skf = StratifiedKFold(n_splits=5, random_state=34, shuffle=True)
259 | params = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'eta': 0.05,
260 | 'max_depth': 5, 'colsample_bytree': 0.8, 'subsample': 0.8, 'alpha':1,
261 | 'min_child_weight': 1, 'seed': 10086, 'silent': 1}
262 | for k, (train_in, test_in) in enumerate(skf.split(X, y)):
263 | if k < self.cv_k:
264 |                 logging.info("train K fold " + str(k))
265 | X_train, X_valid, y_train, y_valid = X[train_in], X[test_in], y[train_in], y[test_in]
266 | dtrain = xgb.DMatrix(X_train, label=y_train)
267 | dvali = xgb.DMatrix(X_valid, label=y_valid)
268 | model = xgb.train(params, dtrain, evals=[(dtrain,"train"), (dvali, "vali")], num_boost_round=5000, early_stopping_rounds=500, verbose_eval=1000, feval=self.f1_score_metric)
269 | feature_importances = sorted(zip(_ef_data.columns.drop('label'), list(model.get_score().values())), key=lambda x: x[1], reverse=True)
270 | self.ef_model[str(k)] = model
271 | print("best_iteration: ", model.best_iteration)
272 | print('feature_importances', feature_importances)
273 |
274 | def predict(self, data):
275 | result_submit = []
276 | _ef_data = self.get_ef_data(data)
277 | X = np.array(_ef_data.drop(['label'], axis=1))
278 | for key in self.ef_model.keys():
279 | model = self.ef_model[key]
280 | result_submit.append(model.predict(xgb.DMatrix(X)))
281 | self.predict_result = data.copy()
282 | self.predict_result['predict_label'] = list(np.sum(np.array(result_submit), axis=0) / len(result_submit))
283 | self.predict_result['predict_label'] = self.predict_result['predict_label'].apply(lambda x: 1 if x > self.cut_value else 0)
284 | self.predict_result = self.predict_result[['label', 'predict_label']]
285 |
286 |
287 | class SaveClassModel():
288 | def __init__(self):
289 | self.model = {}
290 | self.data = {}
291 |
292 | def save_model(self, model_path, model):
293 | with open(model_path, 'wb') as f:
294 | pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
295 |
296 | def load_model(self, model_path):
297 | with open(model_path, 'rb') as f:
298 | self.model = pickle.load(f)
299 |
300 |
301 | class DirectlyPredictResult(SaveClassModel):
302 |     # predict results from a saved model and an input data path
303 | def __init__(self, model_path, data_path, is_ouput_score=1, is_add_0926_data=1, is_debug=1):
304 | super().__init__()
305 |         self.is_ouput_score = is_ouput_score  # 1 = print scores, 0 = do not print scores
306 | self.data_path = data_path
307 | self.is_add_0926_data = is_add_0926_data
308 | self.is_debug = is_debug
309 | self.model_path = model_path
310 | self.main()
311 |
312 | def main(self):
313 | self.load_model(self.model_path)
314 | self.import_data()
315 |
316 | def import_data(self):
317 | col_names = ['prefix', 'query_prediction', 'title', 'tag', 'label']
318 | data = pd.read_csv(self.data_path, names=col_names, sep="\t", low_memory=False)
319 | data.loc[pd.isna(data['label']), 'label'] = 0
320 | data = DealData.deal_data_flag(data, 1)
321 | data = DealData.deal_data_main(data, 'load', self.is_add_0926_data, self.is_debug)
322 | # data = DealData.extral_drop_feature(data)
323 | self.data = data[self.model.data_col]
324 |
325 | def predict(self):
326 | self.model.predict(self.data)
327 | if self.is_ouput_score == 1:
328 | self.model.precision_score("dpr precision score")
329 | self.model.recall_score("dpr recall score")
330 | self.model.score("dpr validate score")
331 |
332 | def output_result(self, output_result_path):
333 | self.model.output_result(output_result_path)
334 |
335 |
336 |
--------------------------------------------------------------------------------
/PrintData.py:
--------------------------------------------------------------------------------
1 | # ----- print function -----
2 | def print_data_num(data, data_name):
3 | print(data_name+" number:"+str(data.shape[0]))
4 |
5 |
6 | def print_ttv_data_num(train_data,test_data,validate_data):
7 | print_data_num(train_data, "train_data")
8 | print_data_num(test_data, "test_data")
9 | print_data_num(validate_data, "validate_data")
10 |
11 |
12 | def print_data_unique_context(data, data_name, col_name):
13 | unique_col = data[col_name].unique()
14 | print(data_name+"'s "+col_name+" unique context is:\n"+str(unique_col))
15 | print("number of "+data_name+"'s "+col_name+" unique context:"+str(len(unique_col)))
16 |
17 |
18 | def print_ttv_data_unique_context(train_data, test_data, validate_data):
19 | print_data_unique_context(train_data, "train_data", "tag")
20 | print_data_unique_context(test_data, "test_data", "tag")
21 | print_data_unique_context(validate_data, "validate_data", "tag")
22 |
23 | print_data_unique_context(train_data, "train_data", "prefix")
24 | print_data_unique_context(test_data, "test_data", "prefix")
25 | print_data_unique_context(validate_data, "validate_data", "prefix")
26 |
27 | print_data_unique_context(train_data, "train_data", "title")
28 | print_data_unique_context(test_data, "test_data", "title")
29 | print_data_unique_context(validate_data, "validate_data", "title")
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tianchi_OGeek
2 | In the search business there is a scenario called instant search: query results are returned in real time while the user is still typing. This task comes from a sub-scenario of OPPO's mobile search ranking optimization, simplified accordingly, and is aimed at query-title semantic matching. After simplification, the task is essentially a query-title CTR prediction problem in an instant-search setting.
3 |
4 | 0 Scores
5 | ======
6 | >(1) Leaderboard A: 0.7347
7 | >(2) Leaderboard B: 0.7335
8 | >(3) Competition page: https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.11409106.5678.1.2c547b6fmKviKy&raceId=231688
9 | >(4) Data download: https://pan.baidu.com/s/1NPUWzt7usUniogCJosWnzw (extraction code: 69xr)
10 |
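A hypothetical illustration of the data format: the downloaded files are tab-separated with the columns `prefix`, `query_prediction`, `title`, `tag`, `label` (as read in `GetData.py`), and `query_prediction` is a JSON dict mapping suggested queries to predicted probabilities. The values below are made up for illustration only:

```python
import json
import pandas as pd

# one made-up sample row, for illustration only (not taken from the competition data)
col_names = ['prefix', 'query_prediction', 'title', 'tag', 'label']
row = ['附近', '{"附近美食": "0.5", "附近酒店": "0.3"}', '附近美食', '美食', 1]
sample = pd.DataFrame([row], columns=col_names)
print(json.loads(sample.loc[0, 'query_prediction']))  # {'附近美食': '0.5', '附近酒店': '0.3'}
```
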
11 | 1 Shared baselines
12 | ======
13 | >(1) Tianchi OGeek challenge baseline (0.7016): https://zhuanlan.zhihu.com/p/46482521
14 | >(2) OGeek challenge code sharing: https://zhuanlan.zhihu.com/p/46479794
15 | >(3) GrinAndBear/OGeek: https://github.com/GrinAndBear/OGeek
16 | >(4) flytoylf/OGeek, an lgb and rnn implementation: https://github.com/flytoylf/OGeek
17 | >(5) https://github.com/search?q=OGeek
18 | >(6) https://github.com/search?q=tianchi_oppo
19 | >(7) https://github.com/luoling1993/TianChi_OGeek/stargazers
20 |
21 |
22 | 2 CTR references
23 | ======
24 | >(1) Recommender systems meet deep learning: https://github.com/princewen/tensorflow_practice
25 | >(2) Designing the f(x) for CTR ranking in recommender systems, DNN edition: https://github.com/nzc/dnn_ctr
26 | >(3) CTR prediction with FM, FFM, DeepFM and practice: https://github.com/milkboylyf/CTR_Prediction
27 | >(4) The MLR algorithm: https://wenku.baidu.com/view/b0e8976f2b160b4e767fcfdc.html
28 |
29 |
30 | 3 NLP references
31 | ======
32 | >(1) Solving large-scale text classification with deep learning (CNN, RNN, Attention), survey and practice: https://zhuanlan.zhihu.com/p/25928551
33 | >(2) How the Zhihu "Kanshan Cup" was won: https://zhuanlan.zhihu.com/p/28923961
34 | >(3) 2017 Zhihu Kanshan Cup, from beginner to 2nd place: https://zhuanlan.zhihu.com/p/29020616
35 | >(4) liuhuanyong: https://github.com/liuhuanyong
36 | >(5) Chinese Word Vectors: https://github.com/Embedding/Chinese-Word-Vectors (note: this link collects word-embedding corpora)
37 |
38 | 4 Other competition write-ups
39 | ======
40 | >(1) ML theory & practice: https://zhuanlan.zhihu.com/c_152307828?tdsourcetag=s_pctim_aiomsg
41 |
42 | 5 Unsorted ideas
43 | ======
44 | >(1) Main line: the CTR approach, building features around the click-through rate (as in the open-source baselines: single-field CTR, combined-field CTR, etc.) (FM / FFM models, see the Tencent social advertising competition?); a minimal CTR-feature sketch is given after this list
45 | >(2) Text-matching approach (Kaggle Quora). Traditional features: extract text-similarity features and quantify distances between fields. https://www.kaggle.com/c/quora-question-pairs https://github.com/qqgeogor/kaggle-quora-solution-8th https://github.com/abhishekkrthakur/is_that_a_duplicate_quora_question
46 | >(3) Deep learning models (1DCNN, ESIM, Decomposable Attention, ELMo, etc.): https://www.kaggle.com/rethfro/1d-cnn-single-model-score-0-14-0-16-or-0-23/notebook https://www.kaggle.com/lamdang/dl-models/comments More text-matching models in the Stanford SNLI paper list: https://nlp.stanford.edu/projects/snli/
47 | >(4) Text-classification angle: mainly, how should the input text be organized? And how should query_prediction be weighted? Traditional features: tfidf, bow, ngram+tfidf, sent2vec, lsi, lda, etc.
48 | >(5) Deep learning models: see the Zhihu Kanshan Cup and the Kaggle Toxic competition
49 | >>https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
50 | >>https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52557
51 | >>https://www.kaggle.com/larryfreeman/toxic-comments-code-for-alexander-s-9872-model/comments
52 | >>https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52702
53 |
54 | >(6) Stacking is ineffective here (limit on the number of models); simple blending or an NN + LightGBM scheme looks more reliable?
55 | >(7) PS1: word vectors can be trained with word2vec or taken from public embeddings: https://github.com/Embedding/Chinese-Word-Vectors PS2: segmentation needs a custom user dictionary; segmentation quality matters a lot for model training!
56 |
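A minimal sketch of the single-field CTR feature from idea (1) above, mirroring what `get_ctr_feature` in `DealData.py` does: click and show counts are computed on the training split only, with the same `+3` smoothing used there. `add_ctr_feature` is just an illustrative helper name, not part of the repo:

```python
import pandas as pd

def add_ctr_feature(data, train_data, col, smooth=3):
    # single-field CTR: clicks / (shows + smooth), statistics computed on the training split only
    grp = train_data.groupby(col)['label']
    stats = pd.DataFrame({col + '_click': grp.sum(),
                          col + '_show': grp.count()}).reset_index()
    stats[col + '_ctr'] = stats[col + '_click'] / (stats[col + '_show'] + smooth)
    # join back onto the full data; keys unseen in training get -1, as in DealData.py
    return data.merge(stats, on=col, how='left').fillna(-1)

# usage with the columns used in this repo; combined-field CTR works the same way
# with groupby(['prefix', 'title']) and so on:
# for col in ['prefix', 'title', 'tag']:
#     all_data = add_ctr_feature(all_data, all_data[all_data['data_flag'] == 0], col)
```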
57 |
58 | 6 Basic considerations
59 | ======
60 | >(1): How to choose classifiers with good generalization -> logistic regression; support vector machine; linear regression
61 | >(2): How to build text features -> NLP analysis
62 | >(3): How to handle feature sparsity -> deep-fm
63 |
--------------------------------------------------------------------------------
/VisualData.py:
--------------------------------------------------------------------------------
1 | from pyecharts import Bar
2 | from pyecharts import Scatter
3 | from GetData import *
4 | from PrintData import *
5 | pd.set_option('display.max_columns', 1000)
6 | pd.set_option('display.width', 1000)
7 |
8 |
9 | def visual_bar_group_data_by_col(group_data, col_name, page):
10 | bar = Bar(col_name)
11 | attr = list(group_data[col_name])
12 | train_value = list(group_data[col_name+'_count'])
13 | positive_value = list(group_data['positive_'+col_name+'_count'])
14 | negative_value = list(group_data['negative_'+col_name+'_count'])
15 | bar.add("train", attr, train_value)
16 | bar.add("positive", attr, positive_value)
17 | bar.add("negative", attr, negative_value)
18 | page.add(bar)
19 | return page
20 |
21 |
22 | def visual_scatter_group_data_by_col(group_data, col_name, page):
23 | x_value = list(group_data[col_name])
24 | y_value = list(group_data['rate'])
25 | scatter = Scatter()
26 | scatter.add(col_name, x_value, y_value)
27 | page.add(scatter)
28 | return page
29 |
30 |
31 | def visual_bar_tpn_data_by_col_list(train_data, positive_data, negative_data, page):
32 | col_list = ['tag', 'prefix', 'title', 'query_len', 'prefix_len', 'title_len', 'query_num_sum', 'query_num_max',
33 | 'query_num_first']
34 | for col_name in col_list:
35 | group_data = get_tpn_group_data_by_col(train_data, positive_data, negative_data, col_name, 'query_prediction', 'count')
36 | print_data_num(group_data, col_name + "_data")
37 | extracted_group_data = group_data.sort_values(by= col_name + '_count', ascending=False)
38 | page = visual_bar_group_data_by_col(extracted_group_data.head(25), col_name, page)
39 | return page
40 |
41 |
42 | def visual_scatter_tpn_data_by_col_list(train_data, positive_data, negative_data, page):
43 | col_list = ['query_num_sum', 'query_num_max', 'query_num_first']
44 | for col_name in col_list:
45 | group_data = get_tpn_group_data_by_col(train_data, positive_data, negative_data, col_name, 'query_prediction', 'count')
46 | print_data_num(group_data, col_name + "_data")
47 | judge = group_data[col_name + "_count"] > group_data[col_name + "_count"].mean()
48 | extracted_group_data = group_data[judge]
49 | extracted_group_data = extracted_group_data.sort_values(by=col_name)
50 | page = visual_scatter_group_data_by_col(extracted_group_data, col_name, page)
51 | return page
52 |
--------------------------------------------------------------------------------
/data/可以从云盘下载文件到这里.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/milkboylyf/tianchi_OGeek/04bb66b9d58b5410105e258ca3ba888f9098154c/data/可以从云盘下载文件到这里.txt
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from DealData import *
2 | from Model import *
3 | import time
4 | import os
5 | import logging
6 | logging.basicConfig(level=logging.INFO,format="[%(asctime)s] %(message)s",datefmt="%Y-%m-%d %H:%M:%S",)
7 |
8 | # ----- Abbreviation -----
9 | # ttv_data: train_test_validate_data
10 | # tpn_data: train_positive_negative_data
11 | # -------------------------
12 |
13 |
14 | if __name__ == '__main__':
15 | print("final")
16 | if not os.path.exists("result"):
17 | os.mkdir("result")
18 | if not os.path.exists("model"):
19 | os.mkdir("model")
20 |     is_test_b = 1  # 1 = round-1 B test set, 0 = A test set
21 |     is_deal_data = 1  # 1 = process data, 0 = read preprocessed data directly
22 |     is_add_0926_data = 0  # 1 = include the 0926 data, 0 = exclude it
23 |     is_debug = 0  # 1 = debug mode, 0 = full run
24 |
25 | if is_deal_data == 1:
26 |         train_data, test_data, validate_data = get_ttv_data(is_debug, is_add_0926_data, is_test_b)
27 | train_data, test_data, validate_data = deal_ttv_data_flag(train_data, test_data, validate_data)
28 |
29 |         all_data = get_merge_data(train_data, test_data, validate_data)  # merge the ttv data
30 |         all_data = deal_data_main(all_data, "unload", is_add_0926_data, is_debug)  # build features
31 |
32 | train_data = all_data[all_data['data_flag'] == 0]
33 | test_data = all_data[all_data['data_flag'] == 1]
34 | validate_data = all_data[all_data['data_flag'] == 2]
35 | if is_debug == 0:
36 | if is_add_0926_data == 1:
37 | train_data.to_csv("data/train_data_add_0926.csv", header=True, index=False, encoding='utf8')
38 | test_data.to_csv("data/test_data_add_0926.csv", header=True, index=False, encoding='utf8')
39 | validate_data.to_csv("data/validate_data_add_0926.csv", header=True, index=False, encoding='utf8')
40 | else:
41 | train_data.to_csv("data/train_data.csv", header=True, index=False, encoding='utf8')
42 | test_data.to_csv("data/test_data.csv", header=True, index=False, encoding='utf8')
43 | validate_data.to_csv("data/validate_data.csv", header=True, index=False, encoding='utf8')
44 | else:
45 | if is_add_0926_data == 1:
46 | train_data = pd.read_csv("data/train_data_add_0926.csv")
47 | test_data = pd.read_csv("data/test_data_add_0926.csv")
48 | validate_data = pd.read_csv("data/validate_data_add_0926.csv")
49 | else:
50 | train_data = pd.read_csv("data/train_data.csv")
51 | test_data = pd.read_csv("data/test_data.csv")
52 | validate_data = pd.read_csv("data/validate_data.csv")
53 |
54 |
55 |     # train_data = pd.read_csv("data/train_data.csv")  # leftover reload; would override the data selected above
56 |     # test_data = pd.read_csv("data/test_data.csv")
57 |     # validate_data = pd.read_csv("data/validate_data.csv")
58 |
59 | train_data = pd.concat([train_data, validate_data], ignore_index=False)
60 |     train_data = extral_drop_feature(train_data)  # helper for experimenting with dropped features
61 |
62 | train_positive_data = get_positive_data(train_data)
63 | train_negative_data = get_negative_data(train_data)
64 | validate_positive_data = get_positive_data(validate_data)
65 | validate_negative_data = get_negative_data(validate_data)
66 |
67 |     time_name = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))  # current time stamp
68 |     save_assistant_name = time_name + "_is_used_0926data_"+str(is_add_0926_data)  # helper string for output file names
69 |
70 | # xgb_fe_model = XgbFeatureEnsembleModel(train_data, train_positive_data, train_negative_data)
71 | # xgb_fe_model.train()
72 | # xgb_fe_model.predict(validate_data)
73 | # xgb_fe_model.precision_score("xgb_fe model precision score")
74 | # xgb_fe_model.recall_score("xgb_fe model recall score")
75 | # xgb_fe_model.score("xgb_fe model validate score")
76 | # xgb_fe_model.output_result("result"+"/validate_xgb_"+save_assistant_name + "_score_" + str(int(xgb_fe_model.f1_score*1000000)) + ".csv")
77 | #
78 | # xgb_fe_model.predict(test_data)
79 | # xgb_fe_model.output_result("result" + "/test_xgb_" + save_assistant_name + "_score_" + str(int(xgb_fe_model.f1_score*1000000)) + ".csv")
80 | #
81 | # xgb_fe_model.data = []
82 | # xgb_fe_model.positive_data = []
83 | # xgb_fe_model.negative_data = []
84 | # SaveClassModel().save_model("model/class_xgb_"+save_assistant_name + "_score_" + str(int(xgb_fe_model.f1_score*1000000)) + ".pickle", xgb_fe_model)
85 |
86 | lgb_fe_model = LgbFeatureEnsembleModel(train_data, train_positive_data, train_negative_data)
87 | lgb_fe_model.set_train_device()
88 | lgb_fe_model.train()
89 | lgb_fe_model.predict(validate_data)
90 | lgb_fe_model.precision_score("lgb_fe model precision score")
91 | lgb_fe_model.recall_score("lgb_fe model recall score")
92 | lgb_fe_model.score("lgb_fe model validate score")
93 | lgb_fe_model.output_result("result"+"/validate_lgb_"+save_assistant_name + "_score_" + str(int(lgb_fe_model.f1_score*1000000)) + ".csv")
94 |
95 | lgb_fe_model.predict(test_data)
96 | lgb_fe_model.output_result("result"+"/test_lgb_"+save_assistant_name + "_score_" + str(int(lgb_fe_model.f1_score*1000000)) + ".csv")
97 |
98 | lgb_fe_model.data = []
99 | lgb_fe_model.positive_data = []
100 | lgb_fe_model.negative_data = []
101 | SaveClassModel().save_model("model/class_lgb_" + save_assistant_name + "_score_" + str(int(lgb_fe_model.f1_score*1000000)) + ".pickle", lgb_fe_model)
102 | logging.info("model finish ...")
103 |
104 | # print("*"*100+"DirectlyPredictResult"+"*"*100)
105 | # model_path = "model/class_lgb_" + save_assistant_name + "_score_" + str(int(lgb_fe_model.f1_score*1000000)) + ".pickle"
106 | # data_path = "data/oppo_round1_vali_20180929.txt"
107 | #
108 | # dpr = DirectlyPredictResult(model_path=model_path, data_path=data_path, is_ouput_score=1, is_add_0926_data=is_add_0926_data, is_debug=is_debug)
109 | # dpr.predict()
110 | # rst_path = "result"+"/vali_dpr_"+save_assistant_name + "_score_" + str(int(dpr.model.f1_score*1000000)) + ".csv"
111 | # dpr.output_result(rst_path)
112 |
113 | # np.sum(dpr.data.drop(['label'],axis=1)==validate_data[lgb_fe_model.data_col].drop(['label'],axis=1),axis=0)
--------------------------------------------------------------------------------
/tool/nlp_basic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on 2018/10/17 9:21
4 | @Author: Johnson
5 | @Email:593956670@qq.com
6 | @Software: PyCharm
7 | """
8 | import os
9 | import sys
10 | import pyltp
11 |
12 | personal_seg_dict = './tmp_file'
13 | ltp_models_dir = 'D:/GithubRepos/gitdata/tcgame_ogeek/ltp_data_v3.4.0'
14 |
15 | model_files = os.listdir(ltp_models_dir)
16 | ltp_models = {os.path.splitext(fname)[0]:os.path.join(ltp_models_dir,fname) for fname in model_files}
17 |
18 | sensplit = pyltp.SentenceSplitter.split
19 | segmentor_ = None
20 | postagger_ = None
21 | ner_ = None
22 | parser_ = None
23 | srl_ = None
24 |
25 |
26 | def segment(sentence):
27 | global segmentor_
28 | if segmentor_ is None:
29 | segmentor_ = pyltp.Segmentor()
30 | #segmentor_.load(ltp_models['cws'])
31 |         # load the model; the second argument is the path to your external lexicon file
32 | segmentor_.load_with_lexicon(ltp_models['cws'], personal_seg_dict)
33 | return segmentor_.segment(sentence)
34 |
35 |
36 | def postag(words):
37 | global postagger_
38 | if postagger_ is None:
39 | postagger_ = pyltp.Postagger()
40 | postagger_.load(ltp_models['pos'])
41 | return postagger_.postag(words)
42 |
43 |
44 | def ner(words, postags):
45 | global ner_
46 | if ner_ is None:
47 | ner_ = pyltp.NamedEntityRecognizer()
48 | ner_.load(ltp_models['ner'])
49 | return ner_.recognize(words, postags)
50 |
51 |
52 | def parse(words, postags):
53 | global parser_
54 | if parser_ is None:
55 | parser_ = pyltp.Parser()
56 | parser_.load(ltp_models['parser'])
57 | return parser_.parse(words, postags)
58 |
59 |
60 | def srl(words, postags, arcs):
61 | global srl_
62 | if srl_ is None:
63 | srl_ = pyltp.SementicRoleLabeller()
64 | srl_.load(ltp_models['pisrl_win'])
65 | return srl_.label(words, postags, arcs)
66 |
67 |
68 | def release():
69 | global segmentor_, postagger_, ner_, parser_, srl_
70 | if segmentor_ is not None:
71 | segmentor_.release()
72 | if postagger_ is not None:
73 | postagger_.release()
74 | if ner_ is not None:
75 | ner_.release()
76 | if parser_ is not None:
77 | parser_.release()
78 | if srl_ is not None:
79 | srl_.release()
--------------------------------------------------------------------------------