├── DealData.py ├── GetData.py ├── Model.py ├── PrintData.py ├── README.md ├── VisualData.py ├── data └── 可以从云盘下载文件到这里.txt ├── main.py └── tool └── nlp_basic.py /DealData.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import re 4 | import json 5 | import jieba 6 | import Levenshtein 7 | import logging 8 | import warnings 9 | import pickle 10 | warnings.filterwarnings('ignore') 11 | 12 | logging.basicConfig(level=logging.INFO,format="[%(asctime)s] %(message)s",datefmt="%Y-%m-%d %H:%M:%S",) 13 | pd.set_option('display.max_columns', 1000) 14 | pd.set_option('display.width', 1000) 15 | 16 | # ----- Abbreviation ----- 17 | # ttv_data: train_test_validate_data 18 | # tpn_data: train_positive_negative_data 19 | # ------------------------- 20 | 21 | 22 | # ----- deal data function ----- 23 | def deal_data_label(data): # 处理数据标签 24 | data['label'] = data['label'].astype(str) 25 | judge = data['label'] == '音乐' 26 | data = data[~judge] 27 | data.reset_index(inplace=True, drop=True) 28 | data['label'] = data['label'].astype(int) 29 | return data 30 | 31 | 32 | def deal_data_flag(data, flag): 33 | # 处理数据标志用于区别数据性质,0表示训练数据 1表示测试数据 2表示验证数据 34 | data['data_flag'] = flag 35 | return data 36 | 37 | 38 | def deal_ttv_data_flag(train_data, test_data, validate_data): # 处理ttv数据标志 39 | train_data = deal_data_flag(train_data, 0) 40 | test_data = deal_data_flag(test_data, 1) 41 | validate_data = deal_data_flag(validate_data, 2) 42 | return train_data, test_data, validate_data 43 | 44 | 45 | def deal_ttv_data_by_func(train_data, test_data, validate_data, deal_func): 46 | # 处理ttv数据 47 | train_data = deal_func(train_data) 48 | test_data = deal_func(test_data) 49 | validate_data = deal_func(validate_data) 50 | return train_data, test_data, validate_data 51 | 52 | 53 | def deal_data_col_type(data): # 处理数据类型 54 | data['label'] = data['label'].astype(int) 55 | data['prefix'] = data['prefix'].astype(str) 56 | data['title'] = data['title'].astype(str) 57 | data['query_prediction'] = data['query_prediction'].astype(str) 58 | return data 59 | 60 | 61 | def deal_data_col_len(data): # 处理数据长度 62 | data['prefix_len'] = data['prefix'].apply(lambda x: len(x)) # 增加prefix长度字段 63 | data['title_len'] = data['title'].apply(lambda x: len(x)) # 增加title长度字段 64 | data['title_diff_prefix_len'] = data['title_len'] - data['prefix_len'] 65 | return data 66 | 67 | 68 | ################################################################# 69 | # query 70 | def move_useless_char(s): 71 | # 提出无效字符 72 | return re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+??!,。??、~@#¥%……&*()]+", "", s) 73 | 74 | 75 | def query_prediction_text(query_prediction): 76 | if (query_prediction == "{}") | (query_prediction == "") | pd.isna(query_prediction) | (query_prediction == "nan"): 77 | return ["PAD"] 78 | json_data = json.loads(query_prediction) 79 | result = sorted(json_data.items(), key=lambda d: d[1], reverse=True) 80 | texts = [move_useless_char(item[0]) for item in result] 81 | return texts 82 | 83 | 84 | def query_prediction_score(query_prediction): 85 | if (query_prediction == "{}") | (query_prediction == "") | pd.isna(query_prediction) | (query_prediction == "nan"): 86 | return [0] 87 | json_data = json.loads(query_prediction) 88 | result = sorted(json_data.items(), key=lambda d: d[1], reverse=True) 89 | scores = [float(item[1]) for item in result] 90 | return scores 91 | 92 | 93 | def deal_data_query_score(data): 94 | data['query_score'] = data['query_prediction'].apply(lambda x: 
query_prediction_score(x)) 95 | data['query_score_max'] = data['query_score'].apply(lambda x: np.max(x)) 96 | data['query_score_min'] = data['query_score'].apply(lambda x: np.min(x)) 97 | data['query_score_mean'] = data['query_score'].apply(lambda x: np.mean(x)) 98 | data['query_score_median'] = data['query_score'].apply(lambda x: np.median(x)) 99 | data['query_score_sum'] = data['query_score'].apply(lambda x: np.sum(x)) 100 | data['query_score_std'] = data['query_score'].apply(lambda x: np.std(x)) 101 | data['query_score'] = data['query_score'].apply(lambda x: sorted(x, reverse=True)) 102 | data['query_score'] = data['query_score'].apply(lambda x: x+[0 for _ in range(10-len(x))]) 103 | for i in range(10): 104 | data['query_score_'+str(i)] = data['query_score'].apply(lambda x: x[i]) 105 | data = data.drop(['query_score'], axis =1) 106 | return data 107 | 108 | 109 | def get_word_vector(): 110 | word2vec = dict() 111 | with open("data/zh_word_vectors.txt", 'r', encoding="utf-8") as f: 112 | for line in f: 113 | line = line.strip() 114 | if not line: 115 | continue 116 | tokens = line.split() 117 | word = tokens[0] 118 | vecs = tokens[1:] 119 | tmp = [] 120 | for vec in vecs: 121 | try: 122 | tmp.append(float(vec)) 123 | except: 124 | pass 125 | word2vec[word] = np.array(tmp) 126 | return word2vec 127 | 128 | 129 | def get_text_vector(x, word2vec, default_vec): 130 | try: 131 | return word2vec[x] 132 | except: 133 | return default_vec 134 | 135 | 136 | def deal_data_query_word(data): 137 | data['query_word'] = data['query_prediction'].apply(lambda x: query_prediction_text(x)) 138 | data['query_len'] = data['query_word'].apply(lambda x: len(x)) 139 | temp_data = data['query_word'].apply(lambda x: [len(_x) for _x in x]) 140 | data['query_word_max_len'] = temp_data.apply(lambda x: np.max(x) if len(x) > 0 else 0) 141 | data['query_word_min_len'] = temp_data.apply(lambda x: np.min(x) if len(x) > 0 else 0) 142 | data['query_word_mean_len'] = temp_data.apply(lambda x: np.mean(x) if len(x) > 0 else 0) 143 | data['query_word_median_len'] = temp_data.apply(lambda x: np.median(x) if len(x) > 0 else 0) 144 | data['query_word_sum_len'] = temp_data.apply(lambda x: np.sum(x) if len(x) > 0 else 0) 145 | data['query_word_std_len'] = temp_data.apply(lambda x: np.std(x) if len(x) > 0 else 0) 146 | data['query_word'] = data['query_word'].apply(lambda x: x+['PAD' for _ in range(10-len(x))]) 147 | 148 | # word2vec = get_word_vector() 149 | # default_vec = np.array([0.0 for _ in range(len(word2vec[list(word2vec.keys())[0]]))]) 150 | # temp_data = data[['prefix', 'query_word']].drop_duplicates('prefix') 151 | # for i in range(10): 152 | # temp_data['query_word_seg_'+str(i)] = temp_data['query_word'].apply(lambda x: "|".join(jieba.cut(str(x[i])))) 153 | # tmp_vec = temp_data['query_word_seg_'+str(i)].str.split("|").apply(lambda x: [get_text_vector(_x, word2vec, default_vec) for _x in x]) 154 | # temp_data['query_word_seg_' + str(i) + '_vec'] = tmp_vec.apply(lambda x: np.sum(x, axis=0)) 155 | # temp_data.drop(columns=['query_word'], inplace= True) 156 | # data = data.merge(temp_data, on='prefix', how='left') 157 | return data 158 | 159 | 160 | def deal_eig_value(similarity_matrix): 161 | # similarity_matrix: 对称矩阵 162 | similarity_matrix = np.array(similarity_matrix) 163 | similarity_matrix = similarity_matrix + similarity_matrix.T 164 | similarity_matrix[np.eye(similarity_matrix.shape[0]) == 1] = 1 165 | eig_value = np.linalg.eig(similarity_matrix)[0] 166 | eig_value = [float(x) for x in eig_value] 167 | eig_value = 
sorted(eig_value, reverse=True) + [0 for _ in range(10 - len(eig_value))] 168 | return eig_value 169 | 170 | 171 | def deal_query_word_mutual_text_eig_vector(sub_word): 172 | # 计算query_word 中词组包含关系信息主向量 173 | sub_word = [x for x in sub_word if x != ""] 174 | if len(sub_word) > 0: 175 | similarity_matrix = [] 176 | for _sw in sub_word: 177 | similarity = [1-(len(sw)-len(_sw))/max([len(sw), len(_sw)]) if _sw in sw else 0 for sw in sub_word ] 178 | similarity_matrix.append(similarity) 179 | eig_value = deal_eig_value(similarity_matrix) # 计算特征向量特征值 180 | else: 181 | eig_value = [0 for _ in range(10)] 182 | return eig_value 183 | 184 | 185 | def deal_query_word_levenshtein_ratio_eig_vector(sub_word): 186 | # 计算query_word的 levenshetein 相似度 187 | sub_word = [x for x in sub_word if x != ""] 188 | if len(sub_word) > 0: 189 | similarity_matrix = [] 190 | for _sw in sub_word: 191 | similarity = [Levenshtein.ratio(_sw, sw) if _sw in sw else 0 for sw in sub_word ] 192 | similarity_matrix.append(similarity) 193 | eig_value = deal_eig_value(similarity_matrix) # 计算特征向量 194 | else: 195 | eig_value = [0 for _ in range(10)] 196 | return eig_value 197 | 198 | 199 | def deal_query_word_levenshtein_distance_eig_vector(sub_word): 200 | # 计算query_word的 levenshetein 相似度 201 | sub_word = [x for x in sub_word if x != ""] 202 | if len(sub_word) > 0: 203 | similarity_matrix = [] 204 | for _sw in sub_word: 205 | similarity = [Levenshtein.distance(_sw, sw) if _sw in sw else 0 for sw in sub_word ] 206 | similarity_matrix.append(similarity) 207 | eig_value = deal_eig_value(similarity_matrix) # 计算特征向量 208 | else: 209 | eig_value = [0 for _ in range(10)] 210 | return eig_value 211 | 212 | 213 | def deal_query_word_levenshtein_jaro_eig_vector(sub_word): 214 | # 计算query_word的 levenshetein 相似度 215 | sub_word = [x for x in sub_word if x != ""] 216 | if len(sub_word) > 0: 217 | similarity_matrix = [] 218 | for _sw in sub_word: 219 | similarity = [Levenshtein.jaro(_sw, sw) if _sw in sw else 0 for sw in sub_word ] 220 | similarity_matrix.append(similarity) 221 | eig_value = deal_eig_value(similarity_matrix) # 计算特征向量 222 | else: 223 | eig_value = [0 for _ in range(10)] 224 | return eig_value 225 | 226 | 227 | def deal_data_query_sub_word_info(x): 228 | # 对每个 query_word 删除 prefix 229 | try: 230 | rst = [re.sub(x['prefix'], "", _x) for _x in x['query_word']] if len(x['query_word']) > 0 else ['NAN'] 231 | except: 232 | rst = [_x for _x in x['query_word']] 233 | return rst 234 | 235 | 236 | def deal_data_prefix_is_incomplete_input(detected_word, key_word): 237 | rest_word = detected_word.replace(key_word, "") 238 | if len(rest_word) > 0: 239 | return rest_word[0] == "|" 240 | else: 241 | return False 242 | 243 | 244 | def deal_data_query_word_information(data): 245 | temp_data = data[['prefix', 'query_word', 'prefix_word_seg']].drop_duplicates('prefix') 246 | 247 | # 判断关键词是否输入完整 248 | temp_data['query_word_seg_0'] = temp_data['query_word'].apply(lambda x: "|".join(jieba.cut(str(x[0])))) 249 | temp_data['prefix_is_incomplete_input'] = temp_data.apply(lambda x: deal_data_prefix_is_incomplete_input(x['query_word_seg_0'], x['prefix_word_seg']), axis=1).astype(int) 250 | data = data.merge(temp_data[['prefix', 'prefix_is_incomplete_input']], on='prefix', how='left') 251 | temp_data = temp_data.drop(['prefix_is_incomplete_input', 'query_word_seg_0', 'prefix_word_seg'], axis=1) 252 | 253 | temp_data['query_sub_word'] = temp_data[['prefix', 'query_word']].apply(lambda x: deal_data_query_sub_word_info(x), axis=1) 254 | # query_word 交互文本信息 255 | 
eig_values = temp_data['query_sub_word'].apply(lambda x: deal_query_word_mutual_text_eig_vector(x)) 256 | for i in range(10): 257 | temp_data['mutual_text_eig_value_'+str(i)] = eig_values.apply(lambda x: x[i]) 258 | data = data.merge(temp_data.drop(['query_word', 'query_sub_word'], axis=1), on='prefix', how='left') 259 | temp_data = temp_data[['prefix', 'query_word', 'query_sub_word']] 260 | 261 | # levenshtein ratio 交互文本信息 262 | eig_values = temp_data['query_sub_word'].apply(lambda x: deal_query_word_levenshtein_ratio_eig_vector(x)) 263 | for i in range(10): 264 | temp_data['levenshtein_ratio_eig_value_'+str(i)] = eig_values.apply(lambda x: x[i]) 265 | data = data.merge(temp_data.drop(['query_word', 'query_sub_word'], axis=1), on='prefix', how='left') 266 | temp_data = temp_data[['prefix', 'query_word', 'query_sub_word']] 267 | 268 | # levenshtein distance 交互文本信息 269 | eig_values = temp_data['query_sub_word'].apply(lambda x: deal_query_word_levenshtein_distance_eig_vector(x)) 270 | for i in range(10): 271 | temp_data['levenshtein_distance_eig_value_' + str(i)] = eig_values.apply(lambda x: x[i]) 272 | data = data.merge(temp_data.drop(['query_word', 'query_sub_word'], axis=1), on='prefix', how='left') 273 | return data 274 | 275 | 276 | ################################################################# 277 | # ----- is特征 + prefix ----- 278 | def deal_prefix_is_in_title(data): 279 | data['is_prefix_in_title'] = data.apply(lambda x: int(x['prefix'] in x['title']), axis=1) 280 | return data 281 | 282 | 283 | def deal_title_is_in_query_keys(data): 284 | data['is_title_in_query_keys'] = data.apply(lambda x: int(sum([int(x['title'] in _x) for _x in x['query_word']])>0), axis=1) 285 | return data 286 | 287 | 288 | # 是否全是中文 289 | def deal_prefix_is_all_chinese_word(data): 290 | judge = data['prefix'].apply(lambda x:len(re.findall("[0-9|a-z|A-Z|+??!,。??、~@#¥%……&*()|\s+\.\!\/_,$%^*(+\"\']", x)) == 0) 291 | data['is_all_chinese_word'] = 0 292 | data.loc[judge, 'is_all_chinese_word'] = 1 293 | return data 294 | 295 | 296 | # 是否全是数字 297 | def deal_prefix_is_all_number(data): 298 | judge = data['prefix'].apply(lambda x:len(re.findall("\D", x))==0) 299 | data['is_all_number'] = 0 300 | data.loc[judge, 'is_all_number'] = 1 301 | return data 302 | 303 | 304 | # 是否全是英文字母 305 | def deal_prefix_is_all_english(data): 306 | judge = data['prefix'].apply(lambda x:len(re.findall("[a-z|A-Z]", x)) == len(x)) 307 | data[judge] 308 | data['is_all_English'] = 0 309 | data.loc[judge, 'is_all_English'] = 1 310 | return data 311 | 312 | 313 | # 是否全是大写英文字母 314 | def deal_prefix_is_all_upper_english(data): 315 | judge = data['prefix'].apply(lambda x: len(re.findall("[A-Z]", x)) == len(x)) 316 | # data[judge] 317 | data['is_all_upper_english'] = 0 318 | data.loc[judge, 'is_all_upper_english'] = 1 319 | return data 320 | 321 | 322 | # 是否全是小写英文字母 323 | def deal_prefix_is_all_lower_english(data): 324 | judge = data['prefix'].apply(lambda x: len(re.findall("[a-z]", x)) == len(x)) 325 | data['is_all_upperEnglish'] = 0 326 | data.loc[judge, 'is_all_lower_english'] = 1 327 | return data 328 | 329 | 330 | # 是否全是特殊符号 331 | def deal_prefix_is_all_symbol(data): 332 | judge = data['prefix'].apply(lambda x:len(re.findall("\w", x))==0) 333 | data['is_all_symbol'] = 0 334 | data.loc[judge, 'is_all_symbol'] = 1 335 | return data 336 | 337 | 338 | # 是否中英文一起出现 339 | def deal_prefix_is_combine_chinese_english(data): 340 | judge = data['prefix'].apply(lambda x: len(re.findall("[\u4e00-\u9fa5]+[a-z|A-Z]+|[a-z|A-Z]+[\u4e00-\u9fa5]+", x))>0) 341 | 
data['is_combine_chinese_english'] = 0 342 | data.loc[judge, 'is_combine_chinese_english'] = 1 343 | return data 344 | 345 | 346 | # 是否中文数字出现 347 | def deal_prefix_is_combine_chinese_number(data): 348 | judge = data['prefix'].apply(lambda x: len(re.findall("[\u4e00-\u9fa5]+[0-9]+|[0-9]+[\u4e00-\u9fa5]+", x))>0) 349 | data['is_combine_chinese_number'] = 0 350 | data.loc[judge, 'is_combine_chinese_number'] = 1 351 | return data 352 | 353 | 354 | # 是否英文和数字一起出现 355 | def deal_prefix_is_combine_english_number(data): 356 | judge = data['prefix'].apply(lambda x: len(re.findall("[0-9]+[a-z|A-Z]+|[a-z|A-Z]+[0-9]+", x))>0) 357 | data['is_combine_english_number'] = 0 358 | data.loc[judge, 'is_combine_english_number'] = 1 359 | return data 360 | 361 | 362 | # 是否网址 # .com 结尾 363 | def deal_prefix_is_network_ip(data): 364 | judge = data['prefix'].apply(lambda x: len(re.findall("\.(com)$", x))>0) 365 | data['is_network_ip'] = 0 366 | data.loc[judge, 'is_network_ip'] = 1 367 | return data 368 | 369 | 370 | # prefix归属于tag个数 371 | def deal_prefix_belongs_tag_number(data): 372 | temp_data = data.groupby(['prefix', 'tag'], as_index=False)['query_prediction'].agg({'prefix_belongs_tag_count': 'count'}) 373 | temp_data = temp_data.groupby('prefix', as_index=False)['prefix_belongs_tag_count'].count() 374 | data = data.merge(temp_data, on='prefix', how='left') 375 | return data 376 | 377 | 378 | # prefix归属于title个数 379 | def deal_prefix_belongs_title_number(data): 380 | temp_data = data.groupby(['prefix', 'title'], as_index=False)['query_prediction'].agg({'prefix_belongs_title_count': 'count'}) 381 | temp_data = temp_data.groupby('prefix', as_index=False)['prefix_belongs_title_count'].count() 382 | data = data.merge(temp_data, on='prefix', how='left') 383 | return data 384 | 385 | 386 | def deal_data_title_word(data): 387 | temp_data = data[['title']].drop_duplicates('title') 388 | temp_data['title_word_seg'] = temp_data['title'].apply(lambda x: "|".join(jieba.cut(x))) 389 | temp_data['title_word_seg_len'] = temp_data['title_word_seg'].apply(lambda x: len(x.split("|"))) 390 | data = data.merge(temp_data, on='title', how='left') 391 | return data 392 | 393 | 394 | def deal_data_prefix_word(data): 395 | temp_data = data[['prefix']].drop_duplicates('prefix') 396 | temp_data['prefix_word_seg'] = temp_data['prefix'].apply(lambda x: "|".join(jieba.cut(x))) 397 | temp_data['prefix_word_seg_len'] = temp_data['prefix_word_seg'].apply(lambda x: len(x.split("|"))) 398 | data = data.merge(temp_data, on='prefix', how='left') 399 | return data 400 | 401 | 402 | # static feature 403 | def get_ctr_feature(cols, data, train_data, is_add_0926_data, is_debug): 404 | ctr_feature_dict = {} 405 | for col in cols: 406 | tmp = train_data.groupby(col, as_index=False)["label"].agg({col + "_click": "sum", col + "_show": "count"}) 407 | tmp[col + "_ctr"] = tmp[col + "_click"] / (tmp[col + "_show"] + 3) 408 | for tmp_col in [col + "_show", col + "_click", col + "_ctr"]: 409 | tmp[tmp_col] = tmp[tmp_col].apply(lambda x: x if x != "PAD" else -1) 410 | ctr_feature_dict[col] = tmp 411 | data = pd.merge(data, tmp, on=col, how="left") 412 | 413 | for i in range(len(cols)): 414 | for j in range(i + 1, len(cols)): 415 | group_col = [cols[i], cols[j]] 416 | tmp = train_data.groupby(group_col, as_index=False)["label"].agg( 417 | {"_".join(group_col) + "_click": "sum", "_".join(group_col) + "_show": "count"}) 418 | tmp["_".join(group_col) + "_ctr"] = tmp["_".join(group_col) + "_click"] / ( 419 | tmp["_".join(group_col) + "_show"] + 3) 420 | for tmp_col 
in ["_".join(group_col) + "_show", "_".join(group_col) + "_click", 421 | "_".join(group_col) + "_ctr"]: 422 | tmp[tmp_col] = tmp[group_col + [tmp_col]].apply( 423 | lambda x: x[tmp_col] if "PAD" not in x[group_col].values else -1, axis=1) 424 | ctr_feature_dict["_".join(group_col)] = tmp 425 | data = pd.merge(data, tmp, on=group_col, how="left") 426 | 427 | group_col = cols 428 | tmp = train_data.groupby(group_col, as_index=False)["label"].agg({"_".join(group_col) + "_click": "sum", "_".join(group_col) + "_show": "count"}) 429 | tmp["_".join(group_col) + "_ctr"] = tmp["_".join(group_col) + "_click"] / (tmp["_".join(group_col) + "_show"] + 3) 430 | ctr_feature_dict["_".join(group_col)] = tmp 431 | data = pd.merge(data, tmp, on=cols, how="left") 432 | if is_debug == 0: # 判断是否调试模式,调试模式不保存ctr_feature数据 433 | if is_add_0926_data == 1: # 判断是否加载0926数据 434 | with open('data/ctr_feature_dict_0926.pickle', 'wb') as f: 435 | pickle.dump(ctr_feature_dict, f, pickle.HIGHEST_PROTOCOL) 436 | else: 437 | with open('data/ctr_feature_dict.pickle', 'wb') as f: 438 | pickle.dump(ctr_feature_dict, f, pickle.HIGHEST_PROTOCOL) 439 | 440 | data = data.fillna(-1) 441 | return data 442 | 443 | 444 | def deal_static_feature(data, mode='unload', is_add_0926_data=1, is_debug=1): 445 | if mode == 'load': # 加载数据模式 446 | if is_add_0926_data == 1: 447 | with open("data/ctr_feature_dict_0926.pickle", 'rb') as f: 448 | ctr_feature_dict = pickle.load(f) 449 | else: 450 | with open("data/ctr_feature_dict.pickle", 'rb') as f: 451 | ctr_feature_dict = pickle.load(f) 452 | for key in list(ctr_feature_dict.keys()): 453 | tmp_data = ctr_feature_dict[key] 454 | data = data.merge(tmp_data, on=key.split("_"), how='left') 455 | data = data.fillna(-1) 456 | else: 457 | train_data = data[data['data_flag'] == 0] 458 | train_data.columns.tolist() 459 | cols = ["prefix", "title", "tag"] 460 | data = get_ctr_feature(cols, data, train_data, is_add_0926_data, is_debug) 461 | return data 462 | 463 | 464 | def deal_drop_data(data): 465 | data = data.select_dtypes(include=['number']) 466 | # -------- 分割线 -------- 467 | # data = data.drop([x for x in data.columns.tolist() if 'mutual_text_eig_value' in x], axis=1) 468 | # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_ratio' in x], axis=1) 469 | # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_distance' in x], axis=1) 470 | 471 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_is_incomplete_input' in x], axis=1) 472 | # data = data.drop([x for x in data.columns.tolist() if 'title_word_seg_len' in x], axis=1) 473 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_word_seg_len' in x], axis=1) 474 | 475 | # 如果是版本6 注释上面语句 执行下面语句 476 | # # data = data.drop([x for x in data.columns.tolist() if 'mutual_text_eig_value' in x], axis=1) 477 | # # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_ratio' in x], axis=1) 478 | # # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_distance' in x], axis=1) 479 | # 480 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_is_incomplete_input' in x], axis=1) 481 | # data = data.drop([x for x in data.columns.tolist() if 'title_word_seg_len' in x], axis=1) 482 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_word_seg_len' in x], axis=1) 483 | 484 | return data 485 | 486 | 487 | def extral_drop_feature(data): 488 | # data = data.drop([x for x in data.columns.tolist() if 'mutual_text_eig_value' in x], axis=1) 489 | # data = data.drop([x 
for x in data.columns.tolist() if 'levenshtein_ratio' in x], axis=1) 490 | # data = data.drop([x for x in data.columns.tolist() if 'levenshtein_distance' in x], axis=1) 491 | 492 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_is_incomplete_input' in x], axis=1) 493 | # data = data.drop([x for x in data.columns.tolist() if 'title_word_seg_len' in x], axis=1) 494 | # data = data.drop([x for x in data.columns.tolist() if 'prefix_word_seg_len' in x], axis=1) 495 | return data 496 | 497 | 498 | def deal_data_main(data, static_feature_mode="unload", is_add_0926_data=1, is_debug=1): 499 | # 处理特征代码主程序 500 | logging.info("start deal data feature ...") 501 | data = deal_data_label(data) 502 | data = deal_data_col_type(data) # 处理指定col的数据类型 503 | logging.info("col type finish ...") 504 | data = deal_data_col_len(data) # 处理指定col的长度 505 | logging.info("col len finish ...") 506 | 507 | data = deal_prefix_is_in_title(data) # 判断prefix是否在title里面 508 | data = deal_prefix_is_all_chinese_word(data) # 判断是否全部中文 509 | data = deal_prefix_is_all_number(data) # 判断是否全部数字 510 | data = deal_prefix_is_all_english(data) # 判断是否全英文 511 | data = deal_prefix_is_all_upper_english(data) # 判断是否全部英文大写 512 | data = deal_prefix_is_all_lower_english(data) # 判断是否全部英文小写 513 | data = deal_prefix_is_all_symbol(data) # 判断是否全部符号 514 | data = deal_prefix_is_combine_chinese_english(data) # 判断是否中英字符结合 515 | data = deal_prefix_is_combine_chinese_number(data) # 判读是否中文数字结合 516 | data = deal_prefix_is_combine_english_number(data) # 判断是否英文数字结合 517 | data = deal_prefix_is_network_ip(data) # 判断是否网址 518 | data = data.fillna(0) # 将缺失值补充为0 519 | logging.info("is feature finish ...") 520 | 521 | data = deal_prefix_belongs_tag_number(data) # 计算prefix归属tag数量 522 | data = deal_prefix_belongs_title_number(data) # 计算prefix归属title数量 523 | logging.info("belongs finish ...") 524 | 525 | data = deal_data_query_score(data) # 处理query_score 分数特征 526 | logging.info("query score finish ...") 527 | 528 | data = deal_data_query_word(data) # 处理query_word 529 | logging.info("query word finish ...") 530 | 531 | data = deal_data_title_word(data) # 处理title分词 532 | logging.info("title word finish...") 533 | 534 | data = deal_data_prefix_word(data) 535 | logging.info("prefix word finish...") 536 | 537 | data = deal_static_feature(data, static_feature_mode, is_add_0926_data, is_debug) # 获取统计特征 538 | logging.info("static finish ...") 539 | 540 | data = deal_data_query_word_information(data) 541 | logging.info("query_word_information finish ...") 542 | 543 | data = deal_drop_data(data) 544 | 545 | return data 546 | 547 | 548 | if __name__ == '__main__': 549 | from GetData import * 550 | is_deal_data = 1 # 1 表示处理数据,0 表示直接读入处理好数据 551 | is_add_0926_data = 1 # 1 表示加入0926数据,0 表示不加入0926数据 552 | is_debug = 1 # 1 表示调试,0表示不调试 553 | 554 | train_data, test_data, validate_data = get_ttv_data(is_debug, is_add_0926_data) 555 | train_data, test_data, validate_data = deal_ttv_data_flag(train_data, test_data, validate_data) 556 | all_data = get_merge_data(train_data, test_data, validate_data) # 合并数据 557 | all_data = deal_data_main(all_data, "unload", is_add_0926_data) # 处理特征 558 | print(all_data.columns.tolist()) 559 | # 新增加指标 560 | ['prefix_is_incomplete_input', 'prefix_word_seg', 'title_word_seg_len', 'prefix_word_seg_len'] -------------------------------------------------------------------------------- /GetData.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | pd.set_option('display.max_columns', 1000) 3 | 
pd.set_option('display.width', 1000) 4 | 5 | 6 | # ----- Abbreviation ----- 7 | # ttv_data: train_test_validate_data 8 | # tpn_data: train_positive_negative_data 9 | # ------------------------- 10 | 11 | 12 | # ----- get data function ----- 13 | def get_ttv_data(is_debug,is_add_0926_validate_data, is_test_b): # 获取ttv数据 14 | # 导入数据 15 | col_names = ['prefix', 'query_prediction', 'title', 'tag', 'label'] 16 | if is_debug == 1: 17 | train_data = pd.read_csv("data/oppo_round1_train_20180929.txt", names=col_names, sep="\t", low_memory=False, nrows=250) 18 | test_data = pd.read_csv("data/oppo_round1_test_A_20180929.txt", names=col_names, sep="\t", low_memory=False, nrows=250) 19 | test_data['label'] = '0' 20 | validate_data = pd.read_csv("data/oppo_round1_vali_20180929.txt", names=col_names, sep="\t", low_memory=False, nrows=250) 21 | else: 22 | train_data = pd.read_csv("data/oppo_round1_train_20180929.txt", names=col_names, sep="\t", low_memory=False) 23 | 24 | if is_test_b == 1: 25 | test_data = pd.read_csv("data/oppo_round1_test_B_20181106.txt", names=col_names, sep="\t", low_memory=False) 26 | test_data['label'] = '0' 27 | else: 28 | test_data = pd.read_csv("data/oppo_round1_test_A_20180929.txt", names=col_names, sep="\t", low_memory=False) 29 | test_data['label'] = '0' 30 | 31 | validate_data = pd.read_csv("data/oppo_round1_vali_20180929.txt", names=col_names, sep="\t", low_memory=False) 32 | if is_add_0926_validate_data == 1: 33 | train_0926_data = pd.read_csv("data/oppo_round1_train_20180926.txt", names=col_names, sep="\t", low_memory=False) 34 | train_data = pd.concat([train_data, train_0926_data], ignore_index=True) 35 | validate_0926_data = pd.read_csv("data/oppo_round1_vali_20180926.txt", names=col_names, sep="\t", low_memory=False) 36 | validate_data = pd.concat([validate_data, validate_0926_data], ignore_index=True) 37 | return train_data, test_data, validate_data 38 | 39 | 40 | def get_merge_data(train_data,test_data,validate_data): 41 | # 合并ttv数据 42 | data = pd.concat([train_data, test_data, validate_data], ignore_index=True) 43 | return data 44 | 45 | 46 | def get_positive_data(data): 47 | # 获取正样本数据 48 | positive_data = data[data['label'] == 1] 49 | return positive_data 50 | 51 | 52 | def get_negative_data(data): 53 | # 获取负样本数据 54 | negative_data = data[data['label'] == 0] 55 | return negative_data 56 | 57 | 58 | def get_group_data_by_col(data, group_col_name, new_col_name, static_name, static_method): 59 | # 按照某一列汇总 60 | group_data = data[[group_col_name, static_name]].groupby(by=group_col_name, as_index=False).agg({static_name: static_method}) 61 | group_data.rename(columns={static_name: new_col_name}, inplace=True) 62 | return group_data 63 | 64 | 65 | def get_group_data_by_collist(data, group_col_name, new_col_name, static_name, static_method): 66 | # 按照列表汇总 67 | group_data = data[group_col_name+[static_name]].groupby(by=group_col_name, as_index=False).agg({static_name: static_method}) 68 | group_data.rename(columns={static_name: new_col_name}, inplace=True) 69 | return group_data 70 | 71 | 72 | def get_tpn_group_data_by_col(train_data, positive_data, negative_data, group_col_name, static_name, static_method): 73 | # 对ttv数据按照某一列汇总 74 | train_new_col_name = group_col_name+"_"+static_method 75 | positive_new_col_name = 'positive_'+group_col_name+"_"+static_method 76 | negative_new_col_name = 'negative_'+group_col_name+"_"+static_method 77 | group_train_data = get_group_data_by_col(train_data, group_col_name, train_new_col_name, static_name, static_method) 78 | 
group_positive_data = get_group_data_by_col(positive_data, group_col_name, positive_new_col_name, static_name, static_method) 79 | group_negative_data = get_group_data_by_col(negative_data, group_col_name, negative_new_col_name, static_name, static_method) 80 | group_data = pd.merge(group_train_data, group_positive_data, on=group_col_name, how='left') 81 | group_data = pd.merge(group_data, group_negative_data, on=group_col_name, how='left') 82 | group_data.fillna(0, inplace=True) 83 | group_data = get_data_rate(group_data, positive_new_col_name, train_new_col_name) 84 | group_data.sort_values(by=train_new_col_name, ascending=False, inplace=True) 85 | return group_data 86 | 87 | 88 | def get_merge_col_name(col_list): 89 | # 合并列表字符元素 90 | merge_col_name = "".join(["@"+x for x in col_list]) 91 | merge_col_name = "merge"+merge_col_name 92 | return merge_col_name 93 | 94 | 95 | def get_tpn_group_data_by_collist(train_data, positive_data, negative_data, group_col_name, static_name, static_method): 96 | merge_group_col_name = get_merge_col_name(group_col_name) 97 | train_new_col_name = merge_group_col_name+"_"+static_method 98 | positive_new_col_name = 'positive_'+merge_group_col_name+"_"+static_method 99 | negative_new_col_name = 'negative_'+merge_group_col_name+"_"+static_method 100 | 101 | group_train_data = get_group_data_by_collist(train_data, group_col_name, train_new_col_name, static_name, static_method) 102 | group_positive_data = get_group_data_by_collist(positive_data, group_col_name, positive_new_col_name, static_name, static_method) 103 | group_negative_data = get_group_data_by_collist(negative_data, group_col_name, negative_new_col_name, static_name, static_method) 104 | 105 | group_data = pd.merge(group_train_data, group_positive_data, on=group_col_name, how='left') 106 | group_data = pd.merge(group_data, group_negative_data, on=group_col_name, how='left') 107 | group_data.fillna(0, inplace=True) 108 | group_data = get_data_rate(group_data, positive_new_col_name, train_new_col_name) 109 | group_data = get_data_idf(group_data, train_new_col_name) 110 | group_data.sort_values(by=train_new_col_name, ascending=False, inplace=True) 111 | return group_data 112 | 113 | 114 | def get_data_rate(data, col_name_1, col_name_2): 115 | data['rate'] = (data[col_name_1]).div(data[col_name_2]) 116 | return data 117 | 118 | 119 | def get_data_idf(data, col_name): 120 | sum_value = data[col_name].sum() 121 | data['idf'] = data[col_name]/sum_value 122 | return data 123 | 124 | 125 | def detect_train_validate_distribution(train_data, validate_data): 126 | col_name = 'prefix' 127 | head_n = 10 128 | train_positive_data = get_positive_data(train_data) 129 | train_negative_data = get_negative_data(train_data) 130 | validate_positive_data = get_positive_data(validate_data) 131 | validate_negative_data = get_negative_data(validate_data) 132 | 133 | train_group_data = get_tpn_group_data_by_col(train_data, train_positive_data, train_negative_data, col_name, 'query_prediction', 'count') 134 | validate_group_data = get_tpn_group_data_by_col(validate_data, validate_positive_data, validate_negative_data, col_name, 'query_prediction', 'count') 135 | print(train_group_data.head(head_n)) 136 | judge = validate_group_data[col_name].isin(train_group_data[col_name].head(head_n)) 137 | print(validate_group_data[judge]) 138 | 139 | train_value_rate = train_group_data['rate'].head(head_n) 140 | validate_value_rate = validate_group_data[judge]['rate'] 141 | print("mean_value:", [train_value_rate.mean(), 
validate_value_rate.mean()]) 142 | print("std_value:", [train_value_rate.std(), validate_value_rate.std()]) -------------------------------------------------------------------------------- /Model.py: -------------------------------------------------------------------------------- 1 | from GetData import * 2 | import DealData 3 | from itertools import combinations 4 | import pickle 5 | import numpy as np 6 | import logging 7 | from sklearn.metrics import f1_score 8 | from sklearn.model_selection import StratifiedKFold 9 | import lightgbm as lgb 10 | import xgboost as xgb 11 | logging.basicConfig(level=logging.INFO,format="[%(asctime)s] %(message)s",datefmt="%Y-%m-%d %H:%M:%S",) 12 | 13 | # ----- model class ----- 14 | class BaseModel: 15 | def __init__(self, data, positive_data, negative_data): 16 | self.data = data 17 | self.positive_data = positive_data 18 | self.negative_data = negative_data 19 | self.__base_model_data = None 20 | self.predict_result = None 21 | self.threshold_value = 1/3 22 | self.f1_score = 0 23 | self.precision = 0 24 | self.recall = 0 25 | 26 | 27 | def set_threshold_value(self, value): 28 | self.threshold_value = value 29 | 30 | def find_best_threshold_value(self, value_list, data): 31 | for value in value_list: 32 | print("#" * 100) 33 | print(value / 100) 34 | self.set_threshold_value(value / 100) 35 | self.train() 36 | self.predict(data) 37 | self.score("BaseModel") 38 | 39 | def train(self): 40 | group_data = get_tpn_group_data_by_col(self.data, self.positive_data, self.negative_data, 'tag', 'prefix', 'count') 41 | group_data['predict_label'] = 0 42 | judge = group_data['rate'] > self.threshold_value 43 | group_data.loc[judge, 'predict_label'] = 1 44 | self.__base_model_data = group_data[['tag', 'predict_label']] 45 | 46 | def predict(self, data): 47 | new_predict_data = pd.merge(data, self.__base_model_data, on='tag', how='left') 48 | self.predict_result = new_predict_data[['label', 'predict_label']] 49 | 50 | def score(self, model_name): 51 | from sklearn.metrics import f1_score 52 | score = f1_score(self.predict_result['label'].astype(int), self.predict_result['predict_label'].astype(int), pos_label=1) 53 | print(model_name+" score:"+str(score)) 54 | self.f1_score = score 55 | 56 | def precision_score(self, model_name): 57 | from sklearn.metrics import precision_score 58 | score = precision_score(self.predict_result['label'], self.predict_result['predict_label'], pos_label=1) 59 | self.precision = score 60 | print(model_name + " score:" + str(score)) 61 | 62 | def recall_score(self, model_name): 63 | from sklearn.metrics import recall_score 64 | score = recall_score(self.predict_result['label'], self.predict_result['predict_label'], pos_label=1) 65 | self.recall = score 66 | print(model_name + " score:" + str(score)) 67 | 68 | def output_result(self, path): 69 | self.predict_result['predict_label'].to_csv(path, header=False, index=False, encoding='utf8') 70 | 71 | def reverse_predict_result(self): 72 | judge1 = self.predict_result['predict_label'] == 0 73 | judge2 = self.predict_result['predict_label'] == 1 74 | self.predict_result.loc[judge1, 'predict_label'] = 1 75 | self.predict_result.loc[judge2, 'predict_label'] = 0 76 | 77 | 78 | # ----- CombSearchModel ----- 79 | class CombSearchModel(BaseModel): 80 | def __init__(self, data, positive_data, negative_data): 81 | super().__init__(data, positive_data, negative_data) 82 | self.__cs_model_data = pd.DataFrame() 83 | self.support_num = 50 # the number to guarantee the model stability 84 | self.support_rate = 
0.7 # the rate to guarantee the model score 85 | self.candidate_list = ['tag', 'prefix', 'title', 'query_len', 'prefix_len', 'title_len'] 86 | self.comb_num = 4 # 特征组合数目 87 | self.count_num = 0 88 | self.combined_feature_data = {} # 存储组合特征数据 89 | 90 | def set_candidate_list(self, candidate_list): # 设置特征候选集合 91 | self.candidate_list = candidate_list 92 | 93 | def set_support_num(self, support_num): # 设置支持数目 94 | self.support_num = support_num 95 | 96 | def set_support_rate(self, support_rate): # 设置支持率 97 | self.support_rate = support_rate 98 | 99 | def get_train_combined_feature_data(self): # 获取组合特征数据 100 | print("CombModel Train") 101 | for comb_len in list(range(1, self.comb_num+1)): 102 | comb_lists = list(combinations(self.candidate_list, comb_len)) 103 | for comb_list in comb_lists: 104 | group_col_list = [x for x in comb_list] 105 | merge_col_name = get_merge_col_name(group_col_list) 106 | group_data = get_tpn_group_data_by_collist(self.data, self.positive_data, self.negative_data, group_col_list, 'query_prediction', 'count') 107 | if group_col_list == ["tag"]: 108 | train_group_data = group_data.copy() 109 | else: 110 | judge1 = (group_data[merge_col_name + "_count"] > self.support_num) 111 | judge2 = (group_data['rate'] >= self.support_rate) | (group_data['rate'] <= (1 - self.support_rate)) 112 | judge = judge1 & judge2 113 | train_group_data = group_data[judge].copy() 114 | if train_group_data.shape[0] > 0: 115 | self.count_num += train_group_data[merge_col_name + "_count"].sum() 116 | print(group_col_list, 'data shape:', train_group_data.shape[0], "count_num", 117 | train_group_data[merge_col_name + "_count"].sum()) 118 | self.combined_feature_data[merge_col_name] = train_group_data 119 | 120 | def train(self): 121 | self.get_train_combined_feature_data() 122 | 123 | def predict(self, data): 124 | keys = list(self.combined_feature_data.keys()) 125 | predict_data = data.copy() 126 | for key in keys: 127 | merge_col = key.split("@")[1:] 128 | new_merge_col = merge_col + ['rate'] 129 | predict_data = pd.merge(predict_data, self.combined_feature_data[key][new_merge_col], on=merge_col, how='left') 130 | predict_data.rename(columns={"rate": "rate_"+key}, inplace=True) # 重新命名 131 | rate_judge = ["rate" in x for x in list(predict_data.columns)] 132 | rate_cols = list(predict_data.columns[rate_judge]) 133 | predict_data['positive_rate'] = predict_data[rate_cols].max(axis=1) 134 | predict_data['negative_rate'] = 1 - predict_data[rate_cols].min(axis=1) 135 | predict_data['predict_label'] = 1 136 | 137 | judge = predict_data['positive_rate'] < predict_data['negative_rate'] 138 | predict_data.loc[judge, 'predict_label'] = 0 139 | predict_data[['positive_rate', 'label', 'predict_label']].sort_values(by='positive_rate') 140 | self.predict_result = predict_data[['label', 'predict_label']] 141 | 142 | 143 | # ----- EnsembleModel ----- 144 | class BaseFeatureEnsembleModel(CombSearchModel): 145 | def __init__(self, data, positive_data, negative_data): 146 | super().__init__(data, positive_data, negative_data) 147 | self.ef_model = {} 148 | self.cv_k = 1 149 | self.data_col = [] 150 | 151 | def save_combined_feature_data(self, data_name): # 保存组合特征数据 152 | with open(data_name, 'wb') as f: 153 | pickle.dump(self.combined_feature_data, f, pickle.HIGHEST_PROTOCOL) 154 | 155 | def update_combined_feature_data(self, data_name): # 更新组合特征数据 156 | # 加载更新模型 157 | with open(data_name, 'rb') as f: 158 | self.combined_feature_data = pickle.load(f) 159 | 160 | def get_ef_data(self, data): 161 | _ef_data = 
data.select_dtypes(include=['number']) 162 | _ef_data = _ef_data.drop(['data_flag'], axis=1) 163 | return _ef_data 164 | 165 | 166 | class LgbFeatureEnsembleModel(BaseFeatureEnsembleModel): 167 | def __init__(self, data, positive_data, negative_data): 168 | super().__init__(data, positive_data, negative_data) 169 | self.cut_value = 0.3 170 | self.train_device = 'cpu' # 设置训练 171 | 172 | def f1_score_metric(self, pred, d_valid): 173 | label = d_valid.get_label() 174 | pred = [int(i >= self.cut_value) for i in pred] 175 | return "f1_score", f1_score(label, pred), True 176 | 177 | 178 | def set_train_device(self, device='cpu'): # 设置训练设备 179 | self.train_device = device 180 | 181 | def train(self): 182 | self.data_col = self.data.columns.tolist() 183 | _ef_data = self.get_ef_data(self.data) 184 | X = np.array(_ef_data.drop(['label'], axis=1)) 185 | y = np.array(_ef_data['label']) 186 | result_logloss = [] 187 | skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True) 188 | if self.train_device == 'cpu': 189 | params = {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss', 190 | 'num_leaves': 32, 'learning_rate': 0.05, 'feature_fraction': 0.3, 'bagging_fraction': 0.8, 191 | 'bagging_freq': 5, 'verbose': -1, 'device': 'cpu', } 192 | else: 193 | params = {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss', 194 | 'num_leaves': 32, 'learning_rate': 0.05, 'feature_fraction': 0.1, 'bagging_fraction': 0.8, 195 | 'bagging_freq': 5, 'verbose': -1, 'device': 'gpu', 'gpu_platform_id': 0,'gpu_device_id': 0, 196 | } 197 | for k, (train_in, test_in) in enumerate(skf.split(X, y)): 198 | if k < self.cv_k: 199 | logging.info("train _K_ flod "+str(k)) 200 | X_train, X_valid, y_train, y_valid = X[train_in], X[test_in], y[train_in], y[test_in] 201 | lgb_train = lgb.Dataset(X_train, y_train) 202 | lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train) 203 | gbm = lgb.train(params, lgb_train, num_boost_round=5000, valid_sets=lgb_eval, early_stopping_rounds=500, verbose_eval=250, feval=self.f1_score_metric) 204 | valid_f1_score = f1_score(y_valid, np.where(gbm.predict(X_valid, num_iteration=gbm.best_iteration) > self.cut_value, 1, 0)) 205 | print("best_iteration: ", gbm.best_iteration) 206 | print("valid_f1_score: ", valid_f1_score) 207 | result_logloss.append(gbm.best_score['valid_0']['binary_logloss']) 208 | self.ef_model[str(k)] = gbm 209 | feature_importances = sorted(zip(_ef_data.columns.drop('label'), gbm.feature_importance()), key=lambda x: x[1], reverse=True) 210 | print('feature_importances', feature_importances) 211 | 212 | def save_model(self, model_name): 213 | with open(model_name, 'wb') as f: 214 | pickle.dump(self.ef_model, f, pickle.HIGHEST_PROTOCOL) 215 | 216 | def update_model(self, model_name): 217 | with open(model_name, 'wb') as f: 218 | pickle.load(self.ef_model, f, pickle.HIGHEST_PROTOCOL) 219 | 220 | def predict(self, data): 221 | result_submit = [] 222 | _ef_data = self.get_ef_data(data) 223 | for key in self.ef_model.keys(): 224 | gbm = self.ef_model[key] 225 | result_submit.append(gbm.predict(_ef_data.drop(columns=['label']), num_iteration=gbm.best_iteration)) 226 | self.predict_result = data.copy() 227 | self.predict_result['predict_label'] = list(np.sum(np.array(result_submit), axis=0) / len(result_submit)) 228 | self.predict_result['predict_label'] = self.predict_result['predict_label'].apply(lambda x: 1 if x > self.cut_value else 0) 229 | self.predict_result = self.predict_result[['label', 'predict_label']] 230 | 231 | 232 | 
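# NOTE: the LogisticRegression class below is an unfinished placeholder; its train() and
# predict() methods only print a stub message, and the class is not referenced in main.py.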
class LogisticRegression(BaseFeatureEnsembleModel): 233 | 234 | def __init__(self, data, positive_data, negative_data): 235 | super().__init__(data, positive_data, negative_data) 236 | 237 | def train(self): 238 | print('abc') 239 | 240 | def predict(self): 241 | print('abc') 242 | 243 | 244 | class XgbFeatureEnsembleModel(BaseFeatureEnsembleModel): 245 | def __init__(self, data, positive_data, negative_data): 246 | super().__init__(data, positive_data, negative_data) 247 | self.cut_value = 0.3 248 | 249 | def f1_score_metric(self, pred, d_valid): 250 | label = d_valid.get_label() 251 | pred = [int(i >= self.cut_value) for i in pred] 252 | return "f1_score", f1_score(label, pred) 253 | 254 | def train(self): 255 | _ef_data = self.get_ef_data(self.data) 256 | X = np.array(_ef_data.drop(['label'], axis=1)) 257 | y = np.array(_ef_data['label']) 258 | skf = StratifiedKFold(n_splits=5, random_state=34, shuffle=True) 259 | params = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'eta': 0.05, 260 | 'max_depth': 5, 'colsample_bytree': 0.8, 'subsample': 0.8, 'alpha':1, 261 | 'min_child_weight': 1, 'seed': 10086, 'silent': 1} 262 | for k, (train_in, test_in) in enumerate(skf.split(X, y)): 263 | if k < self.cv_k: 264 | logging.info("train _K_ flod "+str(k)) 265 | X_train, X_valid, y_train, y_valid = X[train_in], X[test_in], y[train_in], y[test_in] 266 | dtrain = xgb.DMatrix(X_train, label=y_train) 267 | dvali = xgb.DMatrix(X_valid, label=y_valid) 268 | model = xgb.train(params, dtrain, evals=[(dtrain,"train"), (dvali, "vali")], num_boost_round=5000, early_stopping_rounds=500, verbose_eval=1000, feval=self.f1_score_metric) 269 | feature_importances = sorted(zip(_ef_data.columns.drop('label'), list(model.get_score().values())), key=lambda x: x[1], reverse=True) 270 | self.ef_model[str(k)] = model 271 | print("best_iteration: ", model.best_iteration) 272 | print('feature_importances', feature_importances) 273 | 274 | def predict(self, data): 275 | result_submit = [] 276 | _ef_data = self.get_ef_data(data) 277 | X = np.array(_ef_data.drop(['label'], axis=1)) 278 | for key in self.ef_model.keys(): 279 | model = self.ef_model[key] 280 | result_submit.append(model.predict(xgb.DMatrix(X))) 281 | self.predict_result = data.copy() 282 | self.predict_result['predict_label'] = list(np.sum(np.array(result_submit), axis=0) / len(result_submit)) 283 | self.predict_result['predict_label'] = self.predict_result['predict_label'].apply(lambda x: 1 if x > self.cut_value else 0) 284 | self.predict_result = self.predict_result[['label', 'predict_label']] 285 | 286 | 287 | class SaveClassModel(): 288 | def __init__(self): 289 | self.model = {} 290 | self.data = {} 291 | 292 | def save_model(self, model_path, model): 293 | with open(model_path, 'wb') as f: 294 | pickle.dump(model, f, pickle.HIGHEST_PROTOCOL) 295 | 296 | def load_model(self, model_path): 297 | with open(model_path, 'rb') as f: 298 | self.model = pickle.load(f) 299 | 300 | 301 | class DirectlyPredictResult(SaveClassModel): 302 | # 该类用于根据已有模型和输入数据路径进行结果预测 303 | def __init__(self, model_path, data_path, is_ouput_score=1, is_add_0926_data=1, is_debug=1): 304 | super().__init__() 305 | self.is_ouput_score = is_ouput_score # is_ouput_score =1 表示输出分数,is_ouput_score=0 表示不输出分数 306 | self.data_path = data_path 307 | self.is_add_0926_data = is_add_0926_data 308 | self.is_debug = is_debug 309 | self.model_path = model_path 310 | self.main() 311 | 312 | def main(self): 313 | self.load_model(self.model_path) 314 | self.import_data() 315 | 
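# import_data (below): reads the raw tab-separated competition file into the standard columns,
# fills missing labels with 0, marks the rows as test data via deal_data_flag(data, 1),
# rebuilds features in 'load' mode so the pickled ctr_feature_dict from training is reused,
# and finally keeps only the columns the trained model was fit on (self.model.data_col).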
316 | def import_data(self): 317 | col_names = ['prefix', 'query_prediction', 'title', 'tag', 'label'] 318 | data = pd.read_csv(self.data_path, names=col_names, sep="\t", low_memory=False) 319 | data.loc[pd.isna(data['label']), 'label'] = 0 320 | data = DealData.deal_data_flag(data, 1) 321 | data = DealData.deal_data_main(data, 'load', self.is_add_0926_data, self.is_debug) 322 | # data = DealData.extral_drop_feature(data) 323 | self.data = data[self.model.data_col] 324 | 325 | def predict(self): 326 | self.model.predict(self.data) 327 | if self.is_ouput_score == 1: 328 | self.model.precision_score("dpr precision score") 329 | self.model.recall_score("dpr recall score") 330 | self.model.score("dpr validate score") 331 | 332 | def output_result(self, output_result_path): 333 | self.model.output_result(output_result_path) 334 | 335 | 336 | -------------------------------------------------------------------------------- /PrintData.py: -------------------------------------------------------------------------------- 1 | # ----- print function ----- 2 | def print_data_num(data, data_name): 3 | print(data_name+" number:"+str(data.shape[0])) 4 | 5 | 6 | def print_ttv_data_num(train_data,test_data,validate_data): 7 | print_data_num(train_data, "train_data") 8 | print_data_num(test_data, "test_data") 9 | print_data_num(validate_data, "validate_data") 10 | 11 | 12 | def print_data_unique_context(data, data_name, col_name): 13 | unique_col = data[col_name].unique() 14 | print(data_name+"'s "+col_name+" unique context is:\n"+str(unique_col)) 15 | print("number of "+data_name+"'s "+col_name+" unique context:"+str(len(unique_col))) 16 | 17 | 18 | def print_ttv_data_unique_context(train_data, test_data, validate_data): 19 | print_data_unique_context(train_data, "train_data", "tag") 20 | print_data_unique_context(test_data, "test_data", "tag") 21 | print_data_unique_context(validate_data, "validate_data", "tag") 22 | 23 | print_data_unique_context(train_data, "train_data", "prefix") 24 | print_data_unique_context(test_data, "test_data", "prefix") 25 | print_data_unique_context(validate_data, "validate_data", "prefix") 26 | 27 | print_data_unique_context(train_data, "train_data", "title") 28 | print_data_unique_context(test_data, "test_data", "title") 29 | print_data_unique_context(validate_data, "validate_data", "title") 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tianchi_OGeek 2 | 在搜索业务下有一个场景叫实时搜索(Instance Search),就是在用户不断输入过程中,实时返回查询结果。 此次赛题来自OPPO手机搜索排序优化的一个子场景,并做了相应的简化,意在解决query-title语义匹配的问题。简化后,本次题目内容主要为一个实时搜索场景下query-title的ctr预估问题。 3 | 4 | 0 分数 5 | ====== 6 | >(1) A榜:0.7347
7 | >(2) Leaderboard B: 0.7335<br>
8 | >(3) Competition page: https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.11409106.5678.1.2c547b6fmKviKy&raceId=231688<br>
9 | >(4) Data download: https://pan.baidu.com/s/1NPUWzt7usUniogCJosWnzw (extraction code: 69xr)<br>
10 | 11 | 1 Shared baselines 12 | ====== 13 | >(1) Tianchi OGeek challenge baseline (0.7016): https://zhuanlan.zhihu.com/p/46482521<br>
14 | >(2) OGEEK challenge code share: https://zhuanlan.zhihu.com/p/46479794<br>
15 | >(3) GrinAndBear/OGeek: https://github.com/GrinAndBear/OGeek
16 | >(4) flytoylf/OGeek, an lgb and rnn implementation: https://github.com/flytoylf/OGeek<br>
17 | >(5) https://github.com/search?q=OGeek
18 | >(6) https://github.com/search?q=tianchi_oppo
19 | >(7) https://github.com/luoling1993/TianChi_OGeek/stargazers
20 | 21 | 22 | 2 CTR references 23 | ====== 24 | >(1) Recommender systems meet deep learning: https://github.com/princewen/tensorflow_practice<br>
25 | >(2) Designing f(x) for CTR ranking in recommender systems, DNN edition: https://github.com/nzc/dnn_ctr<br>
26 | >(3) CTR prediction with FM, FFM and DeepFM in practice: https://github.com/milkboylyf/CTR_Prediction (the FM prediction function is sketched after this list)<br>
27 | >(4) MLR algorithm: https://wenku.baidu.com/view/b0e8976f2b160b4e767fcfdc.html<br>
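For readers following the FM/FFM/DeepFM links in (3), the core of a degree-2 factorization machine is a factorized pairwise-interaction term. This is the standard formulation from the literature, not code taken from this repository:

```latex
% Degree-2 factorization machine: w_0 is the bias, w_i the linear weights,
% v_i the k-dimensional latent vector of feature i.
\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i
            + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle v_i, v_j \rangle \, x_i x_j
```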
28 | 29 | 30 | 3 NLP references 31 | ====== 32 | >(1) Solving large-scale text classification with deep learning (CNN, RNN, Attention): survey and practice https://zhuanlan.zhihu.com/p/25928551<br>
33 | >(2) How we won the Zhihu "Kanshan Cup": https://zhuanlan.zhihu.com/p/28923961<br>
34 | >(3) 2017 Zhihu Kanshan Cup: from beginner to 2nd place https://zhuanlan.zhihu.com/p/29020616<br>
35 | >(4) liuhuanyong https://github.com/liuhuanyong
36 | >(5) Chinese Word Vectors: https://github.com/Embedding/Chinese-Word-Vectors note: this repository collects pre-trained word-embedding corpora (a small loading/averaging sketch follows this list)<br>
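A minimal sketch of how pre-trained vectors such as these can be turned into a fixed-length text feature, mirroring the commented-out get_word_vector / get_text_vector helpers in DealData.py. The file layout (one token per line followed by its float components, as in data/zh_word_vectors.txt) and the fallback dimension of 300 are assumptions for illustration:

```python
import numpy as np
import jieba

def load_word_vectors(path):
    # Expected layout: "<word> <f1> <f2> ..." per line (same as data/zh_word_vectors.txt).
    word2vec = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.strip().split()
            if len(tokens) > 1:
                word2vec[tokens[0]] = np.array([float(v) for v in tokens[1:]])
    return word2vec

def text_vector(text, word2vec, dim=300):
    # Average the vectors of the jieba tokens; unknown tokens fall back to a zero vector.
    default = np.zeros(dim)
    vecs = [word2vec.get(word, default) for word in jieba.cut(str(text))]
    return np.mean(vecs, axis=0) if vecs else default
```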
37 | 38 | 4 Other competition write-up links 39 | ====== 40 | >(1) ML theory & practice: https://zhuanlan.zhihu.com/c_152307828?tdsourcetag=s_pctim_aiomsg<br>
41 | 42 | 5 Unsorted ideas 43 | ====== 44 | >(1) Main line: a CTR approach built around click-through rate (as in the open-source baselines: single-field CTR, combined-field CTR, etc.); FM/FFM models, see the Tencent social advertising competition (a minimal sketch of the smoothed CTR feature appears at the end of this README)<br>
45 | >(2) Text-matching approach (Kaggle Quora); traditional features: text-similarity features and quantified distances between fields https://www.kaggle.com/c/quora-question-pairs https://github.com/qqgeogor/kaggle-quora-solution-8th https://github.com/abhishekkrthakur/is_that_a_duplicate_quora_question<br>
46 | >(3) Deep learning models (1D-CNN, ESIM, Decomposable Attention, ELMo, etc.): https://www.kaggle.com/rethfro/1d-cnn-single-model-score-0-14-0-16-or-0-23/notebook https://www.kaggle.com/lamdang/dl-models/comments more text-matching models in the Stanford SNLI paper collection: https://nlp.stanford.edu/projects/snli/<br>
47 | >(4) Text-classification angle: mainly how to organize the input text, and how to weight query_prediction; traditional features: tfidf, bow, ngram+tfidf, sent2vec, lsi, lda, etc.<br>
48 | >(5) Deep learning models: see the Zhihu Kanshan Cup and the Kaggle Toxic competition<br>
49 | >>https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
50 | >>https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52557
51 | >>https://www.kaggle.com/larryfreeman/toxic-comments-code-for-alexander-s-9872-model/comments
52 | >>https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52702
53 | 54 | >(6) Stacking did not help (the number of models is limited); simple blending, or an NN + LightGBM combination, seems more reliable?<br>
55 | >(7) PS1: word vectors can be trained with word2vec or taken from public embeddings: https://github.com/Embedding/Chinese-Word-Vectors PS2: word segmentation needs a custom dictionary; segmentation quality matters a lot for model training 56 | 57 | 58 | 6 Basic questions 59 | ====== 60 | >(1): how to choose classifiers with good generalization -> logistic regression; support vector machine; linear regression<br>
61 | >(2): how to construct text features -> NLP analysis<br>
62 | >(3): how to handle feature sparsity -> deep-fm<br>
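As a companion to 5-(1) and 6-(1) above, the backbone feature of this repository is the smoothed click-through-rate statistic computed per field on the training split. The sketch below mirrors the single-column case of get_ctr_feature in DealData.py (the +3 smoothing constant and the fillna(-1) convention come from that function); the toy data frames are only illustrative:

```python
import pandas as pd

def add_ctr_features(data, train_data, col, smooth=3):
    """Smoothed per-value CTR of `col`, computed on the training rows only."""
    stats = (train_data.groupby(col)["label"]
             .agg(click="sum", show="count")
             .reset_index()
             .rename(columns={"click": col + "_click", "show": col + "_show"}))
    stats[col + "_ctr"] = stats[col + "_click"] / (stats[col + "_show"] + smooth)
    # Values never seen during training end up as -1, matching the repo's convention.
    fill = {c: -1 for c in stats.columns if c != col}
    return data.merge(stats, on=col, how="left").fillna(fill)

train = pd.DataFrame({"tag": ["app", "app", "web"], "label": [1, 0, 1]})
full = pd.DataFrame({"tag": ["app", "web", "news"], "label": [0, 1, 0]})
print(add_ctr_features(full, train, "tag"))
```

get_ctr_feature repeats this for every single column, pair and triple of prefix/title/tag and pickles the resulting lookup tables (ctr_feature_dict) so the same statistics can be re-applied in 'load' mode at prediction time.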
63 | -------------------------------------------------------------------------------- /VisualData.py: -------------------------------------------------------------------------------- 1 | from pyecharts import Bar 2 | from pyecharts import Scatter 3 | from GetData import * 4 | from PrintData import * 5 | pd.set_option('display.max_columns', 1000) 6 | pd.set_option('display.width', 1000) 7 | 8 | 9 | def visual_bar_group_data_by_col(group_data, col_name, page): 10 | bar = Bar(col_name) 11 | attr = list(group_data[col_name]) 12 | train_value = list(group_data[col_name+'_count']) 13 | positive_value = list(group_data['positive_'+col_name+'_count']) 14 | negative_value = list(group_data['negative_'+col_name+'_count']) 15 | bar.add("train", attr, train_value) 16 | bar.add("positive", attr, positive_value) 17 | bar.add("negative", attr, negative_value) 18 | page.add(bar) 19 | return page 20 | 21 | 22 | def visual_scatter_group_data_by_col(group_data, col_name, page): 23 | x_value = list(group_data[col_name]) 24 | y_value = list(group_data['rate']) 25 | scatter = Scatter() 26 | scatter.add(col_name, x_value, y_value) 27 | page.add(scatter) 28 | return page 29 | 30 | 31 | def visual_bar_tpn_data_by_col_list(train_data, positive_data, negative_data, page): 32 | col_list = ['tag', 'prefix', 'title', 'query_len', 'prefix_len', 'title_len', 'query_num_sum', 'query_num_max', 33 | 'query_num_first'] 34 | for col_name in col_list: 35 | group_data = get_tpn_group_data_by_col(train_data, positive_data, negative_data, col_name, 'query_prediction', 'count') 36 | print_data_num(group_data, col_name + "_data") 37 | extracted_group_data = group_data.sort_values(by= col_name + '_count', ascending=False) 38 | page = visual_bar_group_data_by_col(extracted_group_data.head(25), col_name, page) 39 | return page 40 | 41 | 42 | def visual_scatter_tpn_data_by_col_list(train_data, positive_data, negative_data, page): 43 | col_list = ['query_num_sum', 'query_num_max', 'query_num_first'] 44 | for col_name in col_list: 45 | group_data = get_tpn_group_data_by_col(train_data, positive_data, negative_data, col_name, 'query_prediction', 'count') 46 | print_data_num(group_data, col_name + "_data") 47 | judge = group_data[col_name + "_count"] > group_data[col_name + "_count"].mean() 48 | extracted_group_data = group_data[judge] 49 | extracted_group_data = extracted_group_data.sort_values(by=col_name) 50 | page = visual_scatter_group_data_by_col(extracted_group_data, col_name, page) 51 | return page 52 | -------------------------------------------------------------------------------- /data/可以从云盘下载文件到这里.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milkboylyf/tianchi_OGeek/04bb66b9d58b5410105e258ca3ba888f9098154c/data/可以从云盘下载文件到这里.txt -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from DealData import * 2 | from Model import * 3 | import time 4 | import os 5 | import logging 6 | logging.basicConfig(level=logging.INFO,format="[%(asctime)s] %(message)s",datefmt="%Y-%m-%d %H:%M:%S",) 7 | 8 | # ----- Abbreviation ----- 9 | # ttv_data: train_test_validate_data 10 | # tpn_data: train_positive_negative_data 11 | # ------------------------- 12 | 13 | 14 | if __name__ == '__main__': 15 | print("final") 16 | if not os.path.exists("result"): 17 | os.mkdir("result") 18 | if not os.path.exists("model"): 19 | os.mkdir("model") 20 | is_test_b = 1 # 
1 表示b榜测试集 0表示a榜测试集 21 | is_deal_data = 1 # 1 表示处理数据,0 表示直接读入处理好数据 22 | is_add_0926_data = 0 # 1 表示加入0926数据,0 表示不加入0926数据 23 | is_debug = 0 # 1 表示调试,0表示不调试 24 | 25 | if is_deal_data == 1: 26 | train_data, test_data, validate_data = get_ttv_data(is_debug, is_add_0926_data,is_test_b ) 27 | train_data, test_data, validate_data = deal_ttv_data_flag(train_data, test_data, validate_data) 28 | 29 | all_data = get_merge_data(train_data, test_data, validate_data) # 合并数据 30 | all_data = deal_data_main(all_data, "unload", is_add_0926_data, is_debug) # 处理特征 31 | 32 | train_data = all_data[all_data['data_flag'] == 0] 33 | test_data = all_data[all_data['data_flag'] == 1] 34 | validate_data = all_data[all_data['data_flag'] == 2] 35 | if is_debug == 0: 36 | if is_add_0926_data == 1: 37 | train_data.to_csv("data/train_data_add_0926.csv", header=True, index=False, encoding='utf8') 38 | test_data.to_csv("data/test_data_add_0926.csv", header=True, index=False, encoding='utf8') 39 | validate_data.to_csv("data/validate_data_add_0926.csv", header=True, index=False, encoding='utf8') 40 | else: 41 | train_data.to_csv("data/train_data.csv", header=True, index=False, encoding='utf8') 42 | test_data.to_csv("data/test_data.csv", header=True, index=False, encoding='utf8') 43 | validate_data.to_csv("data/validate_data.csv", header=True, index=False, encoding='utf8') 44 | else: 45 | if is_add_0926_data == 1: 46 | train_data = pd.read_csv("data/train_data_add_0926.csv") 47 | test_data = pd.read_csv("data/test_data_add_0926.csv") 48 | validate_data = pd.read_csv("data/validate_data_add_0926.csv") 49 | else: 50 | train_data = pd.read_csv("data/train_data.csv") 51 | test_data = pd.read_csv("data/test_data.csv") 52 | validate_data = pd.read_csv("data/validate_data.csv") 53 | 54 | 55 | train_data = pd.read_csv("data/train_data.csv") 56 | test_data = pd.read_csv("data/test_data.csv") 57 | validate_data = pd.read_csv("data/validate_data.csv") 58 | 59 | train_data = pd.concat([train_data, validate_data], ignore_index=False) 60 | train_data = extral_drop_feature(train_data) # 该函数用于调试特征 61 | 62 | train_positive_data = get_positive_data(train_data) 63 | train_negative_data = get_negative_data(train_data) 64 | validate_positive_data = get_positive_data(validate_data) 65 | validate_negative_data = get_negative_data(validate_data) 66 | 67 | time_name = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())) # 获取当前时间 68 | save_assistant_name = time_name + "_is_used_0926data_"+str(is_add_0926_data) # 保存文件辅助变量 69 | 70 | # xgb_fe_model = XgbFeatureEnsembleModel(train_data, train_positive_data, train_negative_data) 71 | # xgb_fe_model.train() 72 | # xgb_fe_model.predict(validate_data) 73 | # xgb_fe_model.precision_score("xgb_fe model precision score") 74 | # xgb_fe_model.recall_score("xgb_fe model recall score") 75 | # xgb_fe_model.score("xgb_fe model validate score") 76 | # xgb_fe_model.output_result("result"+"/validate_xgb_"+save_assistant_name + "_score_" + str(int(xgb_fe_model.f1_score*1000000)) + ".csv") 77 | # 78 | # xgb_fe_model.predict(test_data) 79 | # xgb_fe_model.output_result("result" + "/test_xgb_" + save_assistant_name + "_score_" + str(int(xgb_fe_model.f1_score*1000000)) + ".csv") 80 | # 81 | # xgb_fe_model.data = [] 82 | # xgb_fe_model.positive_data = [] 83 | # xgb_fe_model.negative_data = [] 84 | # SaveClassModel().save_model("model/class_xgb_"+save_assistant_name + "_score_" + str(int(xgb_fe_model.f1_score*1000000)) + ".pickle", xgb_fe_model) 85 | 86 | lgb_fe_model = LgbFeatureEnsembleModel(train_data, 
train_positive_data, train_negative_data) 87 | lgb_fe_model.set_train_device() 88 | lgb_fe_model.train() 89 | lgb_fe_model.predict(validate_data) 90 | lgb_fe_model.precision_score("lgb_fe model precision score") 91 | lgb_fe_model.recall_score("lgb_fe model recall score") 92 | lgb_fe_model.score("lgb_fe model validate score") 93 | lgb_fe_model.output_result("result"+"/validate_lgb_"+save_assistant_name + "_score_" + str(int(lgb_fe_model.f1_score*1000000)) + ".csv") 94 | 95 | lgb_fe_model.predict(test_data) 96 | lgb_fe_model.output_result("result"+"/test_lgb_"+save_assistant_name + "_score_" + str(int(lgb_fe_model.f1_score*1000000)) + ".csv") 97 | 98 | lgb_fe_model.data = [] 99 | lgb_fe_model.positive_data = [] 100 | lgb_fe_model.negative_data = [] 101 | SaveClassModel().save_model("model/class_lgb_" + save_assistant_name + "_score_" + str(int(lgb_fe_model.f1_score*1000000)) + ".pickle", lgb_fe_model) 102 | logging.info("model finish ...") 103 | 104 | # print("*"*100+"DirectlyPredictResult"+"*"*100) 105 | # model_path = "model/class_lgb_" + save_assistant_name + "_score_" + str(int(lgb_fe_model.f1_score*1000000)) + ".pickle" 106 | # data_path = "data/oppo_round1_vali_20180929.txt" 107 | # 108 | # dpr = DirectlyPredictResult(model_path=model_path, data_path=data_path, is_ouput_score=1, is_add_0926_data=is_add_0926_data, is_debug=is_debug) 109 | # dpr.predict() 110 | # rst_path = "result"+"/vali_dpr_"+save_assistant_name + "_score_" + str(int(dpr.model.f1_score*1000000)) + ".csv" 111 | # dpr.output_result(rst_path) 112 | 113 | # np.sum(dpr.data.drop(['label'],axis=1)==validate_data[lgb_fe_model.data_col].drop(['label'],axis=1),axis=0) -------------------------------------------------------------------------------- /tool/nlp_basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2018/10/17 9:21 4 | @Author: Johnson 5 | @Email:593956670@qq.com 6 | @Software: PyCharm 7 | """ 8 | import os 9 | import sys 10 | import pyltp 11 | 12 | personal_seg_dict = './tmp_file' 13 | ltp_models_dir = 'D:/GithubRepos/gitdata/tcgame_ogeek/ltp_data_v3.4.0' 14 | 15 | model_files = os.listdir(ltp_models_dir) 16 | ltp_models = {os.path.splitext(fname)[0]:os.path.join(ltp_models_dir,fname) for fname in model_files} 17 | 18 | sensplit = pyltp.SentenceSplitter.split 19 | segmentor_ = None 20 | postagger_ = None 21 | ner_ = None 22 | parser_ = None 23 | srl_ = None 24 | 25 | 26 | def segment(sentence): 27 | global segmentor_ 28 | if segmentor_ is None: 29 | segmentor_ = pyltp.Segmentor() 30 | #segmentor_.load(ltp_models['cws']) 31 | # 加载模型,第二个参数是您的外部词典文件路径 32 | segmentor_.load_with_lexicon(ltp_models['cws'], personal_seg_dict) 33 | return segmentor_.segment(sentence) 34 | 35 | 36 | def postag(words): 37 | global postagger_ 38 | if postagger_ is None: 39 | postagger_ = pyltp.Postagger() 40 | postagger_.load(ltp_models['pos']) 41 | return postagger_.postag(words) 42 | 43 | 44 | def ner(words, postags): 45 | global ner_ 46 | if ner_ is None: 47 | ner_ = pyltp.NamedEntityRecognizer() 48 | ner_.load(ltp_models['ner']) 49 | return ner_.recognize(words, postags) 50 | 51 | 52 | def parse(words, postags): 53 | global parser_ 54 | if parser_ is None: 55 | parser_ = pyltp.Parser() 56 | parser_.load(ltp_models['parser']) 57 | return parser_.parse(words, postags) 58 | 59 | 60 | def srl(words, postags, arcs): 61 | global srl_ 62 | if srl_ is None: 63 | srl_ = pyltp.SementicRoleLabeller() 64 | srl_.load(ltp_models['pisrl_win']) 65 | return 
srl_.label(words, postags, arcs) 66 | 67 | 68 | def release(): 69 | global segmentor_, postagger_, ner_, parser_, srl_ 70 | if segmentor_ is not None: 71 | segmentor_.release() 72 | if postagger_ is not None: 73 | postagger_.release() 74 | if ner_ is not None: 75 | ner_.release() 76 | if parser_ is not None: 77 | parser_.release() 78 | if srl_ is not None: 79 | srl_.release() --------------------------------------------------------------------------------
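A possible usage sketch for tool/nlp_basic.py, assuming the LTP 3.4.0 model files have been unpacked at ltp_models_dir and the custom segmentation dictionary ./tmp_file exists (both paths are hard-coded at the top of the module); the sample sentence is arbitrary:

```python
# Run from the repository root so `tool` is importable.
from tool import nlp_basic

sentence = "OPPO手机实时搜索场景"
words = list(nlp_basic.segment(sentence))       # lazily loads the cws model plus the custom lexicon
postags = list(nlp_basic.postag(words))         # one part-of-speech tag per word
entities = list(nlp_basic.ner(words, postags))  # one named-entity label per word
arcs = nlp_basic.parse(words, postags)          # dependency arcs, consumed by srl()
roles = nlp_basic.srl(words, postags, arcs)     # semantic role labelling (pisrl_win model)
nlp_basic.release()                             # free every model that was loaded
print(words, postags, entities)
```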