├── fusai
│   ├── result_deal.ipynb
│   ├── fusai_pandas_keng_w2v_total.py
│   ├── pandas_keng_w2v_total_youhua.py
│   ├── pandas_keng_w2v_total_drop.py
│   ├── fusai_pandas_lyeby_w2v_total.py
│   ├── offline_keng_20181122.py
│   └── lake_20181118.py
├── chusai
│   ├── result_ronghe.ipynb
│   ├── ronghe_nn.ipynb
│   ├── result_ronghe_valid.ipynb
│   ├── oppo_online_alldata_model_lgb.ipynb
│   ├── data_analysis.ipynb
│   └── oppo_model_lgb_online.ipynb
├── README.md
└── B_oppo_online_pre.py

/fusai/result_deal.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RHKeng/OGeek/HEAD/fusai/result_deal.ipynb -------------------------------------------------------------------------------- /fusai/fusai_pandas_keng_w2v_total.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import time 6 | import datetime 7 | import gc 8 | import jieba 9 | import functools 10 | from gensim.models import Word2Vec 11 | import json 12 | 13 | # jieba word-segmentation helper 14 | def jieba_sentences(sentence): 15 | seg_list = jieba.cut(sentence) 16 | seg_list = list(seg_list) 17 | return seg_list 18 | 19 | ##----------------------------------------------------------------------------- 20 | if __name__=='__main__': 21 | now = datetime.datetime.now() 22 | now = now.strftime('%m-%d-%H-%M') 23 | print(now) 24 | train_df = pd.read_table('../data/data_train.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 25 | valid_df = pd.read_table('../data/data_vali.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 26 | test_df = pd.read_table('../data/data_test.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 27 | total_df = pd.concat([test_df, valid_df, train_df]) 28 | del test_df 29 | del valid_df 30 | del train_df 31 | gc.collect() 32 | total_df['query_prediction'] = total_df['query_prediction'].map(lambda x : np.nan if x is np.nan else eval(x)) 33 | total_df['query_prediction'] = total_df['query_prediction'].map(lambda x : np.nan if x is np.nan else sorted(x.keys())) 34 | sentence_list = [t for x in total_df['query_prediction'][total_df.query_prediction.notnull()] for t in x] + total_df['title'].tolist() + total_df['prefix'].tolist() 35 | sentence_list = [jieba_sentences(x) for x in sentence_list] 36 | my_model = Word2Vec(sentence_list, size=50, window=5, sg=1, hs=1, min_count=2, workers=1, seed=0) 37 | my_model.save('../data/keng_940129seed0/w2v_total_final_50wei_1.model') 38 | 39 | now = datetime.datetime.now() 40 | now = now.strftime('%m-%d-%H-%M') 41 | print(now) 42 | -------------------------------------------------------------------------------- /fusai/pandas_keng_w2v_total_youhua.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import time 6 | import datetime 7 | import gc 8 | import jieba 9 | import functools 10 | from gensim.models import Word2Vec 11 | import json 12 | 13 | # jieba word-segmentation helper (drops stop words) 14 | def jieba_sentences(sentence, stop_words): 15 | sentence = sentence.replace('%2C', ',') 16 | seg_list = [word for word in jieba.cut(sentence) if (word not in stop_words)] 17 | return seg_list 18 | 19 |
##----------------------------------------------------------------------------- 20 | if __name__=='__main__': 21 | now = datetime.datetime.now() 22 | now = now.strftime('%m-%d-%H-%M') 23 | print(now) 24 | train_df = pd.read_table('../data/data_train.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 25 | valid_df = pd.read_table('../data/data_vali.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 26 | test_df = pd.read_table('../data/data_test.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 27 | total_df = pd.concat([test_df, valid_df, train_df]) 28 | del test_df 29 | del valid_df 30 | del train_df 31 | gc.collect() 32 | total_df['query_prediction'] = total_df['query_prediction'].map(lambda x : np.nan if x is np.nan else eval(x)) 33 | total_df['query_prediction'] = total_df['query_prediction'].map(lambda x : np.nan if x is np.nan else sorted(x.keys())) 34 | jieba.load_userdict('../data/user_dict.dat') 35 | stop_words = [w.replace('\n','') for w in open("../data/user_stopwords.dat", 'r', encoding='utf-8').readlines()] 36 | sentence_list = [t for x in total_df['query_prediction'][total_df.query_prediction.notnull()] for t in x] + total_df['title'].tolist() + total_df['prefix'].tolist() 37 | sentence_list = [jieba_sentences(x, stop_words) for x in sentence_list] 38 | my_model = Word2Vec(sentence_list, size=50, window=5, sg=1, hs=1, min_count=2, workers=1, seed=0) 39 | my_model.save('../data/keng_2018seed0_youhua/w2v_total_final_50wei_1.model') 40 | 41 | now = datetime.datetime.now() 42 | now = now.strftime('%m-%d-%H-%M') 43 | print(now) 44 | -------------------------------------------------------------------------------- /fusai/pandas_keng_w2v_total_drop.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import time 6 | import datetime 7 | import gc 8 | import jieba 9 | import functools 10 | from gensim.models import Word2Vec 11 | import json 12 | 13 | #定义jieba分词函数 14 | def jieba_sentences(sentence): 15 | seg_list = jieba.cut(sentence) 16 | seg_list = list(seg_list) 17 | return seg_list 18 | 19 | ##----------------------------------------------------------------------------- 20 | if __name__=='__main__': 21 | now = datetime.datetime.now() 22 | now = now.strftime('%m-%d-%H-%M') 23 | print(now) 24 | train_df = pd.read_table('../data/data_train.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 25 | valid_df = pd.read_table('../data/data_vali.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 26 | test_df = pd.read_table('../data/data_test.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 27 | total_df = pd.concat([test_df, valid_df, train_df]) 28 | del test_df 29 | del valid_df 30 | del train_df 31 | gc.collect() 32 | total_df_prefix = total_df.drop_duplicates(['prefix']) 33 | total_df_title = total_df.drop_duplicates(['title']) 34 | total_df_query = total_df.drop_duplicates(['query_prediction']) 35 | 
del total_df 36 | gc.collect() 37 | total_df_query['query_prediction'] = total_df_query['query_prediction'].map(lambda x : np.nan if x is np.nan else eval(x)) 38 | total_df_query['query_prediction'] = total_df_query['query_prediction'].map(lambda x : np.nan if x is np.nan else sorted(x.keys())) 39 | sentence_list = [t for x in total_df_query['query_prediction'][total_df_query.query_prediction.notnull()] for t in x] + total_df_title['title'].tolist() + total_df_prefix['prefix'].tolist() 40 | sentence_list = [jieba_sentences(x) for x in sentence_list] 41 | my_model = Word2Vec(sentence_list, size=50, window=5, sg=1, hs=1, min_count=2, workers=1, seed=0) 42 | my_model.save('../data/keng_2018seed0_drop/w2v_total_final_50wei_1.model') 43 | 44 | now = datetime.datetime.now() 45 | now = now.strftime('%m-%d-%H-%M') 46 | print(now) 47 | -------------------------------------------------------------------------------- /fusai/fusai_pandas_lyeby_w2v_total.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import time 6 | import datetime 7 | import gc 8 | import jieba 9 | import functools 10 | from gensim.models import Word2Vec 11 | import json 12 | 13 | #定义jieba分词函数 14 | def jieba_sentences(sentence): 15 | seg_list = jieba.cut(sentence) 16 | seg_list = list(seg_list) 17 | return seg_list 18 | 19 | ##----------------------------------------------------------------------------- 20 | if __name__=='__main__': 21 | now = datetime.datetime.now() 22 | now = now.strftime('%m-%d-%H-%M') 23 | print(now) 24 | 25 | train_df = pd.read_table('../data/data_train.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3) 26 | valid_df = pd.read_table('../data/data_vali.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3) 27 | test_df = pd.read_table('../data/data_test.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3) 28 | total_df = pd.concat([test_df, valid_df, train_df]) 29 | del test_df 30 | del valid_df 31 | del train_df 32 | gc.collect() 33 | total_df['query_prediction'] = total_df['query_prediction'].map(lambda x : np.nan if x is np.nan else eval(x)) 34 | total_df['query_prediction'] = total_df['query_prediction'].map(lambda x : np.nan if x is np.nan else sorted(x.keys())) 35 | sentence_list = [] 36 | temp_sentence_list = [] 37 | i = 0 38 | for query_prediction_jieba_keys, title_jieba, prefix_jieba in total_df[['query_prediction', 'title', 'prefix']].values: 39 | i = i + 1 40 | if (i%2000) == 0: 41 | # print(i) 42 | sentence_list = sentence_list + temp_sentence_list 43 | temp_sentence_list = [] 44 | if query_prediction_jieba_keys is not np.nan: 45 | temp_sentence_list = temp_sentence_list + query_prediction_jieba_keys 46 | temp_sentence_list = temp_sentence_list + [title_jieba] 47 | temp_sentence_list = temp_sentence_list + [prefix_jieba] 48 | 49 | sentence_list = sentence_list + temp_sentence_list 50 | print(len(sentence_list)) 51 | sentence_list = [jieba_sentences(x) for x in sentence_list] 52 | my_model = Word2Vec(sentence_list, size=50, window=5, sg=1, hs=1, min_count=2, workers=1, seed=0) 53 | my_model.save('../data/pandas_w2v_lyeby_3/w2v_total_final_50wei_1.model') 54 | 55 | now = datetime.datetime.now() 56 | now = now.strftime('%m-%d-%H-%M') 57 | print(now) 58 | -------------------------------------------------------------------------------- /chusai/result_ronghe.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np \n", 12 | "import pandas as pd\n", 13 | "import time\n", 14 | "import datetime\n", 15 | "import gc\n", 16 | "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n", 17 | "from sklearn.model_selection import StratifiedKFold\n", 18 | "from sklearn.metrics import roc_auc_score, log_loss\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", 21 | "from sklearn.feature_extraction.text import CountVectorizer\n", 22 | "from sklearn.feature_selection import chi2, SelectPercentile\n", 23 | "import math\n", 24 | "from sklearn.metrics import f1_score\n", 25 | "import jieba\n", 26 | "import jieba.posseg as psg\n", 27 | "from collections import Counter\n", 28 | "import functools\n", 29 | "from gensim.models import word2vec\n", 30 | "import Levenshtein\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | " is_prefix_in_train predicted_score\n", 43 | "0 1 0.060153\n", 44 | "1 1 0.702566\n", 45 | "2 1 0.678097\n", 46 | "3 1 0.286608\n", 47 | "4 1 0.092973\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "keng_result_df = pd.read_csv('../result/keng_score.csv')\n", 53 | "yuna_result_df = pd.read_csv('../result/lgb1_select_pred.csv')\n", 54 | "print(keng_result_df.head())\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "yuna_result_df['is_prefix_in_train'] = keng_result_df['is_prefix_in_train']\n", 66 | "yuna_result_df.rename(columns={'pred':'predicted_score'}, inplace=True)\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "0.4446379748502132\n", 79 | "0.4446454488950832\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "print(np.mean(keng_result_df['predicted_score'][keng_result_df.is_prefix_in_train == 0]))\n", 85 | "print(np.mean(keng_result_df['predicted_score'][keng_result_df.is_prefix_in_train == 1]))\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "original mean : 0.4493946473263437\n", 98 | "0.44464061240463637\n", 99 | "0.44462622176804295\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "yuna_test_prefix0_df = yuna_result_df[yuna_result_df.is_prefix_in_train == 0].copy()\n", 105 | "yuna_test_prefix1_df = yuna_result_df[yuna_result_df.is_prefix_in_train == 1].copy()\n", 106 | "\n", 107 | "#定义调整函数\n", 108 | "def resultAdjustment(result_df, t):\n", 109 | " result_df_temp = result_df.copy()\n", 110 | " result_df_temp['x'] = result_df_temp.predicted_score.map(lambda x: -(math.log(((1 - x) / x), math.e)))\n", 111 | " result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) \n", 112 | " print(result_df_temp['adjust_result'].mean())\n", 113 | " return result_df_temp['adjust_result']\n", 114 | "\n", 115 | "print('original mean : ', 
yuna_test_prefix0_df['predicted_score'].mean())\n", 116 | "yuna_test_df_after0 = resultAdjustment(yuna_test_prefix0_df, -0.0231)\n", 117 | "yuna_test_df_after1 = resultAdjustment(yuna_test_prefix1_df, 0.49635)\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "0.44464061240463637\n", 130 | "0.44462622176804295\n" 131 | ] 132 | }, 133 | { 134 | "name": "stderr", 135 | "output_type": "stream", 136 | "text": [ 137 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 138 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 139 | "\n", 140 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 141 | " \"\"\"Entry point for launching an IPython kernel.\n", 142 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 143 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 144 | "\n", 145 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 146 | " \n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "yuna_result_df['predicted_score'][yuna_result_df.is_prefix_in_train == 0] = yuna_test_df_after0\n", 152 | "yuna_result_df['predicted_score'][yuna_result_df.is_prefix_in_train == 1] = yuna_test_df_after1\n", 153 | "print(np.mean(yuna_result_df['predicted_score'][yuna_result_df.is_prefix_in_train == 0]))\n", 154 | "print(np.mean(yuna_result_df['predicted_score'][yuna_result_df.is_prefix_in_train == 1]))\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 9, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "0.44462748785625167\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "keng_result_df.rename(columns={'predicted_score':'keng_pred'}, inplace=True)\n", 172 | "keng_result_df['yuna_pred'] = yuna_result_df['predicted_score']\n", 173 | "print(np.mean(keng_result_df['yuna_pred']))\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 12, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | " is_prefix_in_train keng_pred yuna_pred predicted_score\n", 186 | "0 1 0.060153 0.060377 0.060377\n", 187 | "1 1 0.702566 0.686186 0.702566\n", 188 | "2 1 0.678097 0.683564 0.683564\n", 189 | "3 1 0.286608 0.264104 0.286608\n", 190 | "4 1 0.092973 0.094173 0.094173\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "def get_max_pred(df):\n", 196 | " keng_pred = df['keng_pred']\n", 197 | " yuna_pred = df['yuna_pred']\n", 198 | " if keng_pred > yuna_pred:\n", 199 | " return keng_pred\n", 200 | " else:\n", 201 | " return yuna_pred\n", 202 | " \n", 203 | "keng_result_df['predicted_score'] = keng_result_df.apply(get_max_pred, axis=1)\n", 204 | "print(keng_result_df.head())\n" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 30, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | " is_prefix_in_train keng_pred yuna_pred predicted_score\n", 217 | "0 1 0.060153 0.060377 0.060265\n", 218 | "1 1 0.702566 0.686186 0.694376\n", 219 | "2 1 0.678097 0.683564 
0.680831\n", 220 | "3 1 0.286608 0.264104 0.275356\n", 221 | "4 1 0.092973 0.094173 0.093573\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "keng_result_df['predicted_score'] = keng_result_df['keng_pred'] * 0.5 + keng_result_df['yuna_pred'] * 0.5\n", 227 | "print(keng_result_df.head())\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 17, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "0.4081\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "keng_result_df['predicted_label'] = keng_result_df['predicted_score'].map(lambda x : 1 if x > 0.519 else 0)\n", 245 | "print(np.mean(keng_result_df['predicted_label']))\n" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 18, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "# 导出预测结果\n", 257 | "def exportResult(df, fileName):\n", 258 | " df.to_csv('../result/%s.csv' % fileName, header=False, index=False)\n", 259 | "\n", 260 | "exportResult(keng_result_df[['predicted_label']], 'keng_yuna_ronghe_11_5')\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [] 271 | } 272 | ], 273 | "metadata": { 274 | "kernelspec": { 275 | "display_name": "Python 3", 276 | "language": "python", 277 | "name": "python3" 278 | }, 279 | "language_info": { 280 | "codemirror_mode": { 281 | "name": "ipython", 282 | "version": 3 283 | }, 284 | "file_extension": ".py", 285 | "mimetype": "text/x-python", 286 | "name": "python", 287 | "nbconvert_exporter": "python", 288 | "pygments_lexer": "ipython3", 289 | "version": "3.6.1" 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 2 294 | } 295 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OGeek 2 | A competition about search results CTR prediction in real-time search scenario ! Rank : 26 / 2888 ! 
3 | The [OGeek算法挑战赛](https://tianchi.aliyun.com/competition/entrance/231688/introduction?spm=5176.12281957.0.0.38b04c2azpDhUa) (OGeek Algorithm Challenge) is a Tianchi competition on CTR prediction of search results in a real-time search scenario, built on millions of recent real user search records. Given the user input prefix (the query-word prefix typed so far), the article title, the article type and other fields, the task is to predict whether the user clicks; the evaluation metric is F1 score.
4 | Final ranking: preliminary round 17 / 2888; final round 26 / 2888.
5 | Repository: [github](https://github.com/RHKeng/OGeek)
6 | 
7 | * 1 **Competition overview**
8 | In the search business there is a scenario called instant search: query results are returned in real time while the user is still typing. The task comes from a sub-scenario of search-ranking optimization on OPPO phones, simplified so that it boils down to query-title semantic matching. After simplification, the problem is query-title CTR prediction in a real-time search scenario: given the user input prefix (the query-word prefix), the article title, the article type and other fields, predict whether the user clicks; the metric is F1 score.
9 | 
10 | * 2 **Problem analysis**
11 | We decomposed the competition into two sub-problems. The first is the traditional CTR view: use historical statistics of the search prefix, title, etc. to build click-rate features and predict the click probability of the current sample. The second is the semantic view: estimate the click probability from the semantic similarity between prefix and title (the more similar, the more likely a click), which can be attacked with text-matching features or with a neural network.
12 | 
13 | * 3 **Data analysis and cleaning**
14 | The data has only a few fields, each with a clear meaning, and a first pass showed little obvious noise, so we did no special cleaning. After building a baseline from the traditional CTR angle, offline validation showed that predictions for new prefixes in the validation set were biased low, which led us to study the data distribution of the whole competition.
15 | (1) Distribution of the training, validation and test sets (preliminary round)
16 | Why are predictions low for new prefixes in the validation set? The statistical features were computed over the whole training set, so every training sample has complete historical statistics. Part of the validation set, however, never appears in the training set, and the model never saw missing statistics during training, so its predictions on new validation data are naturally biased. EDA then showed the following:
17 | 1. For prefix, the mean click rate is 0.37+ for both old and new samples; for title, old samples click at 0.37 but new ones at 0.32, a noticeable gap;
18 | 2. Some samples miss statistics only for prefix, some only for title, some for both. The distribution is shown below (later on we mainly tracked the new-sample rate of prefix and title):
19 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-b423bf65491eb735.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
20 | 3. Our guess about how the organizers built the data: the three sets were randomly sampled from one full database, e.g. 2 million rows for the training set, then 50k for the validation set (which naturally contains samples never seen in training), then 50k for the test set (likewise containing samples absent from the other two). The most telling distribution statistic is the average sample count of old vs. new prefixes and titles.
21 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-77bd6d68a4f62396.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
22 | The average sample count of new data is clearly below the overall mean, exactly what you expect when low-frequency samples are easily missed by sampling (when drawing from a pool, rare items are less likely to be drawn, so new prefixes and titles in the validation and test sets have low average counts).
23 | (2) A conjecture about how the data were generated
24 | In the training records many prefixes appear in clusters: several identical prefixes in a row, identical prefixes interleaved with a few others, or neighbouring prefixes with very similar content. This looks like back-end log records. From our understanding of the scenario, one user search action should generate several records: if three titles are recommended, three samples are produced, of which at most one is clicked (label 1) or none is. If the data really are logs, there is a temporal order in them, which explains why records with the same prefix so often appear next to each other.
25 | 
26 | * 4 **Constructing the training samples**
27 | Building training samples whose distribution is close to the test set was the key to this competition, so studying the distribution and constructing suitable samples ran through our whole entry. The stages of this work, and how we built the samples at each stage, are described below:
28 | (1) Preliminary round
29 | Early on we built a traditional CTR baseline that extracted click-rate features globally over the training set (without yet worrying about label leakage). Predicted probabilities for new prefixes in the validation set came out too low, even though the actual click rate is about 0.37 for both old and new prefixes. So we started looking for a way to construct training data whose distribution matches the validation and test sets, in order to "teach the model" how to score samples with missing statistics.
30 | ***** Simulating new samples inside the training set (a teammate's work) *****
31 | A fixed proportion of samples have their prefix blanked, their title blanked, or both, before conversion rates are extracted, so that samples with missing statistics are imitated. Concretely:
32 | 1. Duplicate the training set into a "copy" set used to build the missing-value features;
33 | 2. Shuffle the copy and, following the proportions of the three cases measured on the validation set, blank the prefix of 42.33% of the samples, the title of 11.7%, and both prefix and title of the remainder;
34 | 3. Compute statistics on the training set and use them to build the features of the copy set;
35 | 4. During training, sample from the copy set according to the missing-sample ratio and merge it into the training set: 30% offline, 15% online;
36 | ***** Global feature extraction leaks labels; borrowing the click-rate extraction used for timestamp-free data in the Tencent competition, we switched to five-fold out-of-fold extraction and found it fits the distribution well *****
37 | 1. How it works: split the training set into five folds; the click-rate features of each fold are computed only from the other four folds, unlike global extraction;
38 | 2. It avoids leakage: a fold's own labels never enter its features, so no sample "sees itself" and no future information is used;
39 | 3. It matches the distribution: as argued in the data analysis, the organizers probably drew 2 million training rows, then 50k validation rows, then 50k test rows. We compute validation features from the training set and test features from train + validation, so new prefixes and titles naturally appear. Five-fold extraction inside the training set produces new prefixes and titles in exactly the same way, so the process is consistent;
40 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-dcd3b1ccf5457498.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
41 | 4. Five folds beat other fold counts: why is 5 best? 40 folds made the score collapse, and 3 and 10 folds were also worse than 5. We suspect it is related to the distribution, but found no clear pattern after analysis.
42 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-66d21552723a449a.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
43 | Comparing the two approaches (simulating new samples vs. five-fold out-of-fold extraction), the five-fold method was somewhat better, so it was used throughout the preliminary round. The final procedure: statistics for the training set come from five-fold out-of-fold extraction, statistics for the validation set from the training set, and statistics for the test set from training + validation. A sketch of this extraction is given below.
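
A minimal sketch of the five-fold out-of-fold click-rate extraction (illustrative only: `train_df` and the `prefix`/`label` column names are placeholders, and the actual feature scripts in this repository are more elaborate):

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def kfold_ctr(train_df, key, label='label', n_splits=5):
    """Out-of-fold click rate for column `key`: each fold's statistics
    are computed only on the other folds, so no row sees its own label."""
    oof = pd.Series(np.nan, index=train_df.index, name=key + '_ctr')
    kf = KFold(n_splits=n_splits, shuffle=False)  # plain KFold; shuffled vs. unshuffled splits are discussed in the final-round section
    for stat_idx, calc_idx in kf.split(train_df):
        stats = train_df.iloc[stat_idx].groupby(key)[label].mean()
        oof.iloc[calc_idx] = train_df.iloc[calc_idx][key].map(stats).values
    return oof  # NaN marks "new" keys never seen in the other folds

# train_df['prefix_ctr'] = kfold_ctr(train_df, 'prefix')
# Validation-set features are computed from the full training set,
# test-set features from training + validation, mirroring the procedure above.
```
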
44 | (2) Final round
45 | As soon as the final-round data arrived we analysed its distribution. The good news: the validation set now shares the test-set distribution, so offline validation is more reliable than in the preliminary round. The bad news: the new-sample rate of the test set exploded, and the new-sample rate produced by five-fold sampling of the training set was now far off, so that scheme no longer applied.
46 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-824fe47ecd9ca3b4.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
47 | The average sample count of new prefixes turned out to be about the same as, or even higher than, that of old ones, which plain random sampling cannot produce. Our guess: the roughly 40% of new-prefix data were deliberately injected by the organizers. Because prefix and title are strongly correlated, this also pushed up the number of new titles, although the average sample counts of old vs. new titles still show the usual sampling dilution. Later we found a very clear 60% boundary between old and new data in the validation and test sets, which supports the guess.
48 | We therefore split the work: some of us merged features into a single model (the competition allowed at most two models), while the teammate who had studied new-sample simulation worked on fitting the distribution. Results:
49 | ***** Imitating the organizers: carve a full set of "new prefix" samples out of the training set *****
50 | 1. Duplicate the train set;
51 | 2. Group the copy by prefix and run the five-fold out-of-fold statistics over prefixes, so that all samples of one prefix fall into the same fold;
52 | 3. The previous step only yields a 2-million-row "new prefix" set. The original train set still uses the ordinary five-fold statistics; to keep its own five-fold "new" rows from distorting the later old/new ratio, rows whose prefix or title statistics are empty are removed, leaving an "old prefix" set of about 1.7 million rows;
53 | 4. Following the 40% new-prefix ratio, randomly sample from the "new prefix" set and merge into the "old prefix" set so that the final new:old prefix ratio is 4:6, giving roughly 2.9 million training rows;
54 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-ae10849c5caee137.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
55 | ***** Shuffled and unshuffled five-fold splits differ a lot *****
56 | While merging features into the main model (a senior teammate's model that scored best in the preliminary round and became our final-round LightGBM), we noticed that its five-fold split used plain KFold, neither StratifiedKFold nor shuffle. At first we saw no reason to change it. After noticing how prefixes cluster in the training set, we began to wonder whether shuffled and unshuffled KFold are really equivalent, and it turned out that the unshuffled five-fold split is also very close to the test distribution:
57 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-20ad822d05e0f3a3.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
58 | ***** Sampling scheme for the online (train + validation) data *****
59 | EDA had shown that the unshuffled five-fold training set is close to the test distribution offline. Does that still hold when the training and validation sets are merged for online training and split with unshuffled five folds? EDA says no: the new-sample rate no longer matches, and the means of several other features also drift a bit more than before.
60 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-10fedea7132b8686.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
61 | Our final scheme: run five-fold out-of-fold statistics on the training set, compute the validation set's statistics from the training set, then concatenate the two. The resulting online training set keeps a distribution close to the test set, and because the identically distributed validation set is kept intact, the merged data is even closer to the test set than the purely five-fold training set.
62 | 
63 | * 5 **Feature engineering**
64 | Because the number of models was restricted, we kept a single main LightGBM model and merged everybody's preliminary-round ideas into it as features. The final feature set falls into the following groups:
65 | (1) Click-rate features (no Bayesian smoothing):
66 | five-fold click rates of prefix, title, query_prediction and tag;
67 | five-fold click rates of the pairwise crosses among prefix, title and tag;
68 | Rationale: click-rate features capture how a given dimension behaved in the past; the same regularity is likely to hold in the current period, so these features help.
69 | (2) Semantic-similarity features:
70 | edit distance, Jaccard similarity and word2vec cosine distance between prefix and title;
71 | the sums over all query_prediction keys of the edit distance, Jaccard similarity and word2vec cosine distance to prefix (each key weighted by its predicted probability);
72 | whether title appears among the keys of query_prediction;
73 | whether prefix is a substring of title;
74 | Rationale: prefix is what the user typed and title is what was recommended; the closer they are in content (semantics), the more likely a click.
75 | (3) Text features:
76 | length of title;
77 | length of prefix;
78 | size of query_prediction;
79 | length of title minus length of prefix;
80 | ratio of prefix length to title length;
81 | mean key length of query_prediction minus prefix length;
82 | ratio of the mean key length of query_prediction to title length;
83 | position of prefix inside title, e.g. prefix '腾讯' in title '欧普大战腾讯' returns 4, the 0-based character index at which prefix first appears (as str.find would give);
84 | 50-dimensional word2vec representations of prefix, title and query_prediction;
85 | (4) Statistical features:
86 | sum, max, min, mean and variance of the edit distances between title and each key of query_prediction;
87 | sum, max, min, mean and variance of the cosine distances between title and each key of query_prediction;
88 | max, min, mean and variance of the predicted probabilities in query_prediction;
89 | (5) Ratio features:
90 | count of (prefix, title) divided by count of prefix;
91 | (6) Rank features:
92 | rank of tag_ctr within each title;
93 | rank of title_ctr within each tag;
94 | 
95 | Appendix: what the distances mean
96 | Edit distance (Levenshtein distance): the minimum number of single-character edits (substitutions, insertions, deletions) needed to turn one string into the other; the smaller it is, the more similar the two strings.
97 | Jaccard similarity: the size of the intersection of two sets divided by the size of their union.
98 | Cosine distance: cosine similarity measures the difference between two items by the cosine of the angle between their vectors; unlike length-based distance metrics, it cares about direction rather than magnitude. A sketch of these similarity features follows.
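
A minimal sketch of the prefix/title similarity features (illustrative only: it assumes one of the word2vec models trained by the scripts under `fusai/`, and the function names are placeholders):

```python
import jieba
import Levenshtein
import numpy as np
from gensim.models import Word2Vec

w2v = Word2Vec.load('../data/keng_940129seed0/w2v_total_final_50wei_1.model')

def sent_vec(text):
    # average the word vectors of the jieba tokens that are in the vocabulary
    vecs = [w2v.wv[w] for w in jieba.cut(text) if w in w2v.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(w2v.vector_size)

def similarity_features(prefix, title):
    a, b = set(jieba.cut(prefix)), set(jieba.cut(title))
    va, vb = sent_vec(prefix), sent_vec(title)
    cos = float(np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb) + 1e-8))
    return {
        'leven': Levenshtein.distance(prefix, title),   # edit distance
        'jaccard': len(a & b) / (len(a | b) or 1),      # token-level Jaccard similarity
        'w2v_cosine': cos,                              # cosine similarity of averaged vectors
        'prefix_in_title': int(prefix in title),
        'prefix_pos': title.find(prefix),               # 0-based position, -1 if absent
    }
```
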
99 | 
100 | * 6 **Model selection**
101 | The mainstream model was LightGBM. Since at most two models were allowed, most teams built one LightGBM model plus one NN model for prefix-title semantic matching. So did we, but the NN was only blended in on the last day, so we cannot tell how much it contributed. Below we mainly describe the LightGBM model; the NN was built by a teammate and its details will be filled in once we understand them.
102 | (1) Main LightGBM model
103 | ***** Training setup *****
104 | Offline, the model is trained on the training set with the validation set used for early stopping, and validation performance is what we use to judge features and models. Online, the training and validation sets are both used as training data, and the number of boosting rounds is fixed to the best offline iteration count.
105 | We also tried early stopping on a hold-out drawn from the training set instead of the validation set, then retraining on the full training set with that iteration count. The number of rounds immediately jumped from about 200 to more than 1000, and the online score dropped by about 0.002. The lesson: the sampled training set does not share the validation/test distribution, which is exactly why the organizers provide a validation set for checking robustness on differently distributed data.
106 | ***** Fitting the distribution by up-weighting the validation set *****
107 | Since the validation set matches the test set best, we tried raising the sample weight of validation rows so the online model leans toward that distribution. Several changes went in at once (new features plus a new way of extracting validation features), so the exact gain is unknown; our estimate at the time was about 0.002.
108 | ***** On the F1 evaluation metric *****
109 | In the preliminary round my model used AUC rather than logloss: F1 is computed from the confusion matrix, and AUC is also a confusion-matrix style metric (just averaged over thresholds); both AUC and F1 care about the relative ordering of predicted probabilities rather than their mean, while logloss also cares about the mean, so AUC intuitively suits F1 better. The threshold was chosen by scanning the validation set for the five best thresholds; there are clearly two peaks, one near 0.37 and one near 0.4, and I usually picked the second peak.
110 | In the final round the senior teammate's model, which uses logloss, became the main model. We compared AUC and logloss; in the table below each metric was run three times with different random seeds:
111 | ![image.png](https://upload-images.jianshu.io/upload_images/12207295-4dd28e9107f6ddc6.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
112 | AUC usually stops a few dozen iterations earlier than logloss. Both early-stopping scores fluctuate by about 0.001 across seeds, and both metrics iterate stably. On the best F1 score, logloss looks slightly better offline, and the mean best thresholds are about the same.
113 | After the competition we asked other teams how they defined a custom F1 metric. For the threshold there are two main ideas: treat it as a hyper-parameter, use that single threshold for the F1 score throughout training and tune it like any other parameter; or, at every boosting iteration, search for the best threshold and report the corresponding best F1 as that iteration's score (sketched below).
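
A minimal sketch of the second idea, per-iteration threshold search, written as a custom `eval_metric` for the LightGBM sklearn API (illustrative only; the threshold grid is a placeholder and the default binary objective, i.e. probability outputs, is assumed):

```python
import numpy as np
from sklearn.metrics import f1_score

def best_f1_eval(y_true, y_pred):
    """At every boosting iteration, scan a range of thresholds on the
    predicted probabilities and report the best F1 found."""
    best = 0.0
    for thr in np.arange(0.30, 0.55, 0.005):
        best = max(best, f1_score(y_true, (y_pred > thr).astype(int)))
    return 'best_f1', best, True   # True = higher is better

# lgb_model.fit(X_train, y_train,
#               eval_set=[(X_valid, y_valid)],
#               eval_metric=best_f1_eval,
#               early_stopping_rounds=50)
```
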
114 | ***** Post-processing *****
115 | As noted in the data analysis, prefixes appear in clusters and the data look time-ordered: one user search should produce several records (three recommended titles give three samples, of which at most one is clicked). We therefore treat consecutive records with the same prefix but different title/tag as one search action; if the model predicts 1 for several records in such a group, only the record with the highest predicted probability keeps label 1 and the others are set to 0.
116 | On the 50k offline validation set this rule changed 13 predictions, all of them correctly, so the assumption is very likely true. We applied it to our last submission on the B leaderboard, where it changed 117 records of the 200k test set; that submission changed too many things at once to isolate the effect, but if all 117 changes were correct it would be worth roughly 0.0005.
117 | (2) NN model
118 | To be added!!!
119 | 
120 | * 7 **Post-competition review**
121 | This competition touches both CTR and text, so we were all interested in it. Its start overlapped with the 神策杯 (Sensors Cup), and I only began after returning from the defence in Beijing, about a week before the preliminary round ended, so I spent limited time and energy on it. That week went mostly into studying the distribution, constructing the datasets and doing feature engineering, leaving no time for the NN side. After forming the final-round team we hit a word2vec reproducibility problem: we could not reproduce the word2vec features behind our best online model. Since 150 word2vec-related dimensions go straight into the model, each word2vec run noticeably moves the accuracy, so much of the final round went into making word2vec reproducible and retraining a model close to the best one. My remaining effort went into feature merging, and since a teammate knows NNs better I spent little time there, which is a pity. As the defences have not taken place yet we do not have the top solutions, only a few key points heard from strong players:
122 | (1) Smoothing the click rates: reportedly, in the final round Bayesian smoothing was worth about 0.004 to 0.005. We did not smooth in the final round; I had tried it in the preliminary round with little effect, so we never thought of smoothing the main model's click-rate features later, which is a real pity;
123 | (2) The data distribution: every top team knew it was crucial and had its own scheme. Reportedly the first-place team (植物) trained directly on the 50k validation set, because it matches the test set best, and used the training set only for feature extraction. Bold, but very effective, and apparently the championship recipe. We had thought of it too, but never did it: we found the problem yet did not apply the best fix.
124 | 
125 | Appendix:
126 | Making word2vec reproducible: run word2vec with a single worker, and make sure the environment variable PYTHONHASHSEED has the same value on every run; it can be set as a temporary environment variable, e.g. `export PYTHONHASHSEED=2018`.
127 | 
128 | 
-------------------------------------------------------------------------------- /chusai/ronghe_nn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np \n", 12 | "import pandas as pd\n", 13 | "import time\n", 14 | "import datetime\n", 15 | "import gc\n", 16 | "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n", 17 | "from sklearn.model_selection import StratifiedKFold\n", 18 | "from sklearn.metrics import roc_auc_score, log_loss\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", 21 | "from sklearn.feature_extraction.text import CountVectorizer\n", 22 | "from sklearn.feature_selection import chi2, SelectPercentile\n", 23 | "import math\n", 24 | "from sklearn.metrics import f1_score\n", 25 | "import jieba\n", 26 | "import jieba.posseg as psg\n", 27 | "from collections import Counter\n", 28 | "import functools\n", 29 | "from gensim.models import word2vec\n", 30 | "import Levenshtein\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 14, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "keng_valid_29 = pd.read_csv('../result/valid_29_pred.csv')\n", 42 | "nn_valid_26 = pd.read_csv('../result/submission4.csv')\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 15, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "valid_26_df = keng_valid_29.copy()\n", 54 | "valid_26_df.rename(columns={'predicted_score':'keng_pred'}, inplace=True)\n", 55 | "valid_26_df['nn_pred'] = nn_valid_26['score'][:len(valid_26_df)]\n", 56 | "\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 16, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "100000\n", 69 | "0.3729864797753706\n", 70 | "0.38116897249021026\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "print(len(nn_valid_26))\n", 76 | "print(np.mean(valid_26_df[valid_26_df.is_prefix_in_train == 1]['nn_pred']))\n", 77 | "print(np.mean(valid_26_df[valid_26_df.is_prefix_in_train == 0]['nn_pred']))\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 23, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "0.37251355671061986\n" 90 | ] 91 |
} 92 | ], 93 | "source": [ 94 | "#定义调整函数\n", 95 | "def resultAdjustment(result_df, t):\n", 96 | " result_df_temp = result_df.copy()\n", 97 | " result_df_temp['x'] = result_df_temp.keng_pred.map(lambda x: -(math.log(((1 - x) / x), math.e)))\n", 98 | " result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) \n", 99 | " print(result_df_temp['adjust_result'].mean())\n", 100 | " return result_df_temp['adjust_result']\n", 101 | "\n", 102 | "keng_valid_df_after = resultAdjustment(valid_26_df, -0.1121)\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 24, 108 | "metadata": { 109 | "collapsed": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "valid_26_df['keng_pred'] = keng_valid_df_after\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 28, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "0.3724193617292467\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "#定义调整函数\n", 131 | "def resultAdjustment(result_df, t):\n", 132 | " result_df_temp = result_df.copy()\n", 133 | " result_df_temp['x'] = result_df_temp.nn_pred.map(lambda x: -(math.log(((1 - x) / x), math.e)))\n", 134 | " result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) \n", 135 | " print(result_df_temp['adjust_result'].mean())\n", 136 | " return result_df_temp['adjust_result']\n", 137 | "\n", 138 | "nn_valid_df_after0 = resultAdjustment(valid_26_df[valid_26_df.is_prefix_in_train == 0], -0.0571)\n", 139 | "\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 29, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "valid_26_df.loc[valid_26_df.is_prefix_in_train == 0, 'nn_pred'] = nn_valid_df_after0\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 30, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "0.3729864797753706\n", 163 | "0.3724193617292467\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "print(np.mean(valid_26_df[valid_26_df.is_prefix_in_train == 1]['nn_pred']))\n", 169 | "print(np.mean(valid_26_df[valid_26_df.is_prefix_in_train == 0]['nn_pred']))\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 37, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "valid_26 = pd.read_table('../data/oppo_round1_vali_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 181 | "valid_26_df['label'] = valid_26['label']\n" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 38, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "{'0.35': 0.7388241070332638, '0.352': 0.738685556867375, '0.354': 0.7391947898164595, '0.356': 0.7388273680047459, '0.358': 0.738801168143345, '0.36': 0.7390150105445975, '0.362': 0.7394674085850558, '0.364': 0.7395991534918461, '0.366': 0.7391315192178186, '0.368': 0.7395453409942543, '0.37': 0.73906191369606, '0.372': 0.7389461540390742, '0.374': 0.7391074795725959, '0.376': 0.7397536337758521, '0.378': 0.7395954194622408, '0.38': 0.7394949494949495, '0.382': 0.7395382942678702, '0.384': 0.7392978367698466, '0.386': 0.739692682184695, '0.388': 0.74006908462867, '0.39': 0.7401622872841046, 
'0.392': 0.7400254777070063, '0.394': 0.7400581959262853, '0.396': 0.739895630819605, '0.398': 0.7398861713582525, '0.4': 0.7395670150748607, '0.402': 0.7400490132851799, '0.404': 0.7400459912668268, '0.406': 0.7401366884125505, '0.408': 0.7405659153979821, '0.41': 0.7403271877434433, '0.412': 0.7405036944531169, '0.414': 0.7403064415259537, '0.416': 0.7405183889744459, '0.418': 0.7402505688802865, '0.42': 0.740546218487395, '0.422': 0.7398862199747156, '0.424': 0.7404140699271152, '0.426': 0.7403291527755729, '0.428': 0.740173747218985, '0.43': 0.7400514329648188, '0.432': 0.7401742086254514, '0.434': 0.7398622596856967, '0.436': 0.7398376148010116, '0.438': 0.7392915211770664, '0.44': 0.7387613454351308, '0.442': 0.7381652848355175, '0.444': 0.737862517740942, '0.446': 0.7377880307948175, '0.448': 0.737871380218127, '0.45': 0.7377476678226739, '0.452': 0.7376043068640645, '0.454': 0.7372762562001294, '0.456': 0.7364492048036353, '0.458': 0.7363631439250312, '0.46': 0.7360312466095258, '0.462': 0.7354379482721148, '0.464': 0.735258102261286, '0.466': 0.7351698956374834, '0.468': 0.7344833233255091, '0.47': 0.7330651120781673, '0.472': 0.7326461234784516, '0.474': 0.7324917672886937, '0.476': 0.732377341904291, '0.478': 0.7318589426197942, '0.48': 0.7311709427618707, '0.482': 0.7287765207048214, '0.484': 0.7282346749397525, '0.486': 0.7284403669724772, '0.488': 0.7281437125748503, '0.49': 0.7277952953651253, '0.492': 0.7276740806974404, '0.494': 0.7272116649408077, '0.496': 0.7267122519896871, '0.498': 0.7264209167929081}\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "yuzhi_dict = {}\n", 199 | "# 定义搜索方法获取最佳F1对应的阈值\n", 200 | "for yuzhi in range(350, 500, 2):\n", 201 | " real_yuzhi = yuzhi / 1000\n", 202 | " valid_26_df['predicted_label'] = valid_26_df['keng_pred'].map(lambda x : 1 if x > real_yuzhi else 0)\n", 203 | " f1 = f1_score(valid_26_df['label'], valid_26_df['predicted_label'])\n", 204 | " yuzhi_dict[str(real_yuzhi)] = f1\n", 205 | "print(yuzhi_dict)\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 43, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "valid_26_df['predicted_score'] = valid_26_df['keng_pred'] * 0.9 + valid_26_df['nn_pred'] *0.1\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 44, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "{'0.35': 0.7370967741935482, '0.352': 0.7371515359365048, '0.354': 0.7377725397760754, '0.356': 0.7378339812035626, '0.358': 0.738448641451748, '0.36': 0.7383771225190935, '0.362': 0.738280959809821, '0.364': 0.738770040204497, '0.366': 0.7393996667578523, '0.368': 0.739588785046729, '0.37': 0.7394420718763267, '0.372': 0.7397897897897897, '0.374': 0.7397658619738787, '0.376': 0.7401135621325562, '0.378': 0.7399813606710158, '0.38': 0.7398899379007421, '0.382': 0.7401766060268705, '0.384': 0.7402916930881421, '0.386': 0.740554284261378, '0.388': 0.7403914772148028, '0.39': 0.7402789842113331, '0.392': 0.740427927927928, '0.394': 0.7406438373733487, '0.396': 0.7405693126591066, '0.398': 0.7399943303352833, '0.4': 0.7400196250581005, '0.402': 0.7399265504577666, '0.404': 0.7395371738060069, '0.406': 0.7395154379495701, '0.408': 0.7391700273188501, '0.41': 0.7388946819603756, '0.412': 0.738865815113758, '0.414': 0.7384840871021776, '0.416': 0.7385034869697447, '0.418': 0.7382165939782461, '0.42': 0.7378308688101879, '0.422': 0.7375497614088741, '0.424': 
0.737448472677307, '0.426': 0.7371820323460122, '0.428': 0.7365992096119671, '0.43': 0.7361819727891157, '0.432': 0.7357656458055926, '0.434': 0.7355248412401942, '0.436': 0.7352068237118639, '0.438': 0.7347922749310262, '0.44': 0.7342982197040895, '0.442': 0.73404169468729, '0.444': 0.7338850996144825, '0.446': 0.733581968542241, '0.448': 0.7333712336554861, '0.45': 0.7329445273294453, '0.452': 0.7332916587394341, '0.454': 0.7331135391406229, '0.456': 0.7324543444435346, '0.458': 0.7318527432853783, '0.46': 0.7312982956102374, '0.462': 0.7312548029421452, '0.464': 0.7311319457809793, '0.466': 0.7305880732681448, '0.468': 0.7300676141851801, '0.47': 0.7295907079646018, '0.472': 0.7290460225540994, '0.474': 0.7288775396913512, '0.476': 0.728061877469256, '0.478': 0.7277540896803502, '0.48': 0.7277652118064282, '0.482': 0.7273083379966424, '0.484': 0.7267935742521516, '0.486': 0.7263382575970341, '0.488': 0.725783756402319, '0.49': 0.7253447643757578, '0.492': 0.7247916372368978, '0.494': 0.7240656851642129, '0.496': 0.7235539417321494, '0.498': 0.7229504933602525}\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "yuzhi_dict = {}\n", 234 | "# 定义搜索方法获取最佳F1对应的阈值\n", 235 | "for yuzhi in range(350, 500, 2):\n", 236 | " real_yuzhi = yuzhi / 1000\n", 237 | " valid_26_df['predicted_label'] = valid_26_df['predicted_score'].map(lambda x : 1 if x > real_yuzhi else 0)\n", 238 | " f1 = f1_score(valid_26_df['label'], valid_26_df['predicted_label'])\n", 239 | " yuzhi_dict[str(real_yuzhi)] = f1\n", 240 | "print(yuzhi_dict)\n" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [] 251 | } 252 | ], 253 | "metadata": { 254 | "kernelspec": { 255 | "display_name": "Python 3", 256 | "language": "python", 257 | "name": "python3" 258 | }, 259 | "language_info": { 260 | "codemirror_mode": { 261 | "name": "ipython", 262 | "version": 3 263 | }, 264 | "file_extension": ".py", 265 | "mimetype": "text/x-python", 266 | "name": "python", 267 | "nbconvert_exporter": "python", 268 | "pygments_lexer": "ipython3", 269 | "version": "3.6.1" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 2 274 | } 275 | -------------------------------------------------------------------------------- /chusai/result_ronghe_valid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np \n", 12 | "import pandas as pd\n", 13 | "import time\n", 14 | "import datetime\n", 15 | "import gc\n", 16 | "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n", 17 | "from sklearn.model_selection import StratifiedKFold\n", 18 | "from sklearn.metrics import roc_auc_score, log_loss\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", 21 | "from sklearn.feature_extraction.text import CountVectorizer\n", 22 | "from sklearn.feature_selection import chi2, SelectPercentile\n", 23 | "import math\n", 24 | "from sklearn.metrics import f1_score\n", 25 | "import jieba\n", 26 | "import jieba.posseg as psg\n", 27 | "from collections import Counter\n", 28 | "import functools\n", 29 | "from gensim.models import word2vec\n", 30 | "import Levenshtein\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | 
"collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "keng_valid_26 = pd.read_csv('../result/valid_26_pred.csv')\n", 42 | "keng_valid_29 = pd.read_csv('../result/valid_29_pred.csv')\n", 43 | "keng_testA = pd.read_csv('../result/keng_score.csv')\n", 44 | "\n", 45 | "yuna_valid_26 = pd.read_csv('../result/lgb1_testB_pred2.csv')\n", 46 | "yuna_valid_29 = pd.read_csv('../result/lgb1_testB_valid.csv')\n", 47 | "yuna_testA = pd.read_csv('../result/lgb1_testB_pred.csv')\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 22, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "valid_26_df = keng_valid_26.copy()\n", 57 | "valid_26_df.rename(columns={'predicted_score':'keng_pred'}, inplace=True)\n", 58 | "yuna_valid_26 = yuna_valid_26[yuna_valid_26['is_valid26'] == 1]\n", 59 | "yuna_valid_26.reset_index(inplace=True)\n", 60 | "valid_26_df['yuna_pred'] = yuna_valid_26['pred']\n", 61 | "\n", 62 | "valid_29_df = keng_valid_29.copy()\n", 63 | "valid_29_df.rename(columns={'predicted_score':'keng_pred'}, inplace=True)\n", 64 | "valid_29_df['yuna_pred'] = yuna_valid_29['pred']\n", 65 | "\n", 66 | "testA_df = keng_testA.copy()\n", 67 | "testA_df.rename(columns={'predicted_score':'keng_pred'}, inplace=True)\n", 68 | "testA_df['yuna_pred'] = yuna_testA['pred']\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 73, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "0.44875104378627195\n", 81 | "0.3737914647546385\n", 82 | "original mean : 0.44875104378627195\n", 83 | "0.37078865606571043\n", 84 | "0.3707624986956756\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "yuna_test_prefix0_df = testA_df[testA_df.is_prefix_in_train == 0].copy()\n", 90 | "yuna_test_prefix1_df = testA_df[testA_df.is_prefix_in_train == 1].copy()\n", 91 | "print(np.mean(yuna_test_prefix0_df['yuna_pred']))\n", 92 | "print(np.mean(yuna_test_prefix1_df['yuna_pred']))\n", 93 | "\n", 94 | "#定义调整函数\n", 95 | "def resultAdjustment(result_df, t):\n", 96 | " result_df_temp = result_df.copy()\n", 97 | " result_df_temp['x'] = result_df_temp.yuna_pred.map(lambda x: -(math.log(((1 - x) / x), math.e)))\n", 98 | " result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) \n", 99 | " print(result_df_temp['adjust_result'].mean())\n", 100 | " return result_df_temp['adjust_result']\n", 101 | "\n", 102 | "print('original mean : ', yuna_test_prefix0_df['yuna_pred'].mean())\n", 103 | "yuna_test_df_after0 = resultAdjustment(yuna_test_prefix0_df, -0.3871)\n", 104 | "yuna_test_df_after1 = resultAdjustment(yuna_test_prefix1_df, -0.02235)\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 74, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stderr", 114 | "output_type": "stream", 115 | "text": [ 116 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 117 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 118 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 119 | "\n", 120 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 121 | " \"\"\"Entry point for launching an IPython kernel.\n", 122 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 123 | "A value is trying to be set on a copy of a slice from a 
DataFrame.\n", 124 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 125 | "\n", 126 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 127 | " \n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "testA_df[testA_df.is_prefix_in_train == 0]['yuna_pred'] = yuna_test_df_after0\n", 133 | "testA_df[testA_df.is_prefix_in_train == 1]['yuna_pred'] = yuna_test_df_after1\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 76, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "valid_26 = pd.read_table('../data/oppo_round1_vali_20180926.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 143 | "valid_29 = pd.read_table('../data/oppo_round1_vali_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 144 | "valid_26_df['label'] = valid_26['label']\n", 145 | "valid_29_df['label'] = valid_29['label']\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 77, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "valid_26_df['predicted_score'] = valid_26_df['keng_pred'] *0.5 + valid_26_df['yuna_pred'] * 0.5\n", 157 | "valid_29_df['predicted_score'] = valid_29_df['keng_pred'] *0.5 + valid_29_df['yuna_pred'] * 0.5\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 78, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "def getPredLabel(predArr, threshold=None, tops=None):\n", 169 | " '''\n", 170 | " 根据阈值返回分类预测结果\n", 171 | " '''\n", 172 | " if tops is not None :\n", 173 | " temp = np.sort(np.array(predArr))\n", 174 | " if tops < 1:\n", 175 | " threshold = temp[-1*round(len(temp)*tops)]\n", 176 | " else:\n", 177 | " threshold = temp[-round(tops)]\n", 178 | " if threshold is None:\n", 179 | " print('[Error] could not get threshold value.')\n", 180 | " exit()\n", 181 | " return (predArr>=threshold).astype(int)\n", 182 | "\n", 183 | "def findF1Threshold(predictList, labelList):\n", 184 | " '''\n", 185 | " 寻找F1最佳阈值\n", 186 | " '''\n", 187 | " tempDf = pd.DataFrame({'predict':predictList, 'label':labelList})\n", 188 | " trueNum = len(tempDf[tempDf.label==1])\n", 189 | " thrList = np.unique(tempDf['predict'])\n", 190 | " f1List = []\n", 191 | " for thr in thrList:\n", 192 | " tempDf['temp'] = getPredLabel(tempDf['predict'], thr)\n", 193 | " TP = len(tempDf[(tempDf.label==1)&(tempDf.temp==1)])\n", 194 | " if TP==0:\n", 195 | " break\n", 196 | " positiveNum = len(tempDf[tempDf.temp==1])\n", 197 | " precise = TP / positiveNum\n", 198 | " recall = TP / trueNum\n", 199 | " f1 = 2 * precise * recall / (precise + recall)\n", 200 | " f1List.append(f1)\n", 201 | " f1Df = pd.DataFrame({'thr':thrList[:len(f1List)], 'f1':f1List}).sort_values(by=['f1','thr'], ascending=[False,True])\n", 202 | " bestThs = thrList[f1List.index(max(f1List))]\n", 203 | " averThr = f1Df.head(5).sort_values(by=['thr']).head(4)['thr'].mean() # 取前5,去掉最大阈值后取平均\n", 204 | " # print('tops 5 thr:\\n', f1Df.head(5),'aver thr:',averThr)\n", 205 | " return averThr\n", 206 | "\n", 207 | "\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 79, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "valid_29_prefix1_newthr = findF1Threshold(valid_29_df[valid_29_df.is_prefix_in_train == 1]['predicted_score'], 
valid_29_df[valid_29_df.is_prefix_in_train == 1]['label'])\n", 219 | "\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 81, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "valid_29_prefix0_newthr = findF1Threshold(valid_29_df[valid_29_df.is_prefix_in_train == 0]['predicted_score'], valid_29_df[valid_29_df.is_prefix_in_train == 0]['label'])\n", 231 | "\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 84, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "testA_df['predicted_score'] = testA_df['keng_pred'] *0.5 + testA_df['yuna_pred'] * 0.5\n", 241 | "testA_df['label'] = 0\n", 242 | "testA_df.loc[testA_df.is_prefix_in_train == 0, 'label'] = getPredLabel(testA_df.loc[testA_df.is_prefix_in_train == 0, 'predicted_score'], valid_29_prefix0_newthr)\n", 243 | "testA_df.loc[testA_df.is_prefix_in_train == 1, 'label'] = getPredLabel(testA_df.loc[testA_df.is_prefix_in_train == 1, 'predicted_score'], valid_29_prefix1_newthr)\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 85, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "# 导出预测结果\n", 255 | "def exportResult(df, fileName):\n", 256 | " df.to_csv('../result/%s.csv' % fileName, header=False, index=False)\n", 257 | "\n", 258 | "exportResult(testA_df[['label']], 'ronghe_11_6')\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 86, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | " is_prefix_in_train keng_pred yuna_pred label predicted_score\n", 271 | "0 1 0.060153 0.036359 0 0.048256\n", 272 | "1 1 0.702566 0.572474 1 0.637520\n", 273 | "2 1 0.678097 0.564478 1 0.621288\n", 274 | "3 1 0.286608 0.185611 0 0.236110\n", 275 | "4 1 0.092973 0.061782 0 0.077378\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "print(testA_df.head())\n", 281 | "\n", 282 | "\n" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 88, 288 | "metadata": { 289 | "scrolled": true 290 | }, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "is_prefix_in_train 0 1\n", 297 | "keng_pred count 4399.000000 45601.000000\n", 298 | " mean 0.444638 0.444645\n", 299 | " std 0.204084 0.325480\n", 300 | " min 0.022724 0.009959\n", 301 | " 25% 0.271123 0.142847\n", 302 | " 50% 0.414194 0.354251\n", 303 | " 75% 0.630357 0.783005\n", 304 | " max 0.881357 0.993897\n", 305 | "yuna_pred count 4399.000000 45601.000000\n", 306 | " mean 0.448751 0.373791\n", 307 | " std 0.204494 0.313598\n", 308 | " min 0.039272 0.010500\n", 309 | " 25% 0.274776 0.096333\n", 310 | " 50% 0.437667 0.249850\n", 311 | " 75% 0.641158 0.675860\n", 312 | " max 0.864953 0.982737\n", 313 | "label count 4399.000000 45601.000000\n", 314 | " mean 0.463287 0.460450\n", 315 | " std 0.498707 0.498439\n", 316 | " min 0.000000 0.000000\n", 317 | " 25% 0.000000 0.000000\n", 318 | " 50% 0.000000 0.000000\n", 319 | " 75% 1.000000 1.000000\n", 320 | " max 1.000000 1.000000\n", 321 | "predicted_score count 4399.000000 45601.000000\n", 322 | " mean 0.446695 0.409218\n", 323 | " std 0.201616 0.318626\n", 324 | " min 0.033444 0.010230\n", 325 | " 25% 0.276202 0.119662\n", 326 | " 50% 0.420519 0.303424\n", 327 | " 75% 0.637485 0.729512\n", 328 | " max 0.873155 0.988188\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | 
"print(testA_df.groupby('is_prefix_in_train')[['keng_pred','yuna_pred','label','predicted_score']].describe().T)" 334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "kernelspec": { 339 | "display_name": "Python 3", 340 | "language": "python", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.6.1" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 2 358 | } 359 | -------------------------------------------------------------------------------- /chusai/oppo_online_alldata_model_lgb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np \n", 12 | "import pandas as pd\n", 13 | "import time\n", 14 | "import datetime\n", 15 | "import gc\n", 16 | "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n", 17 | "from sklearn.model_selection import StratifiedKFold\n", 18 | "from sklearn.metrics import roc_auc_score, log_loss\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", 21 | "from sklearn.feature_extraction.text import CountVectorizer\n", 22 | "from sklearn.feature_selection import chi2, SelectPercentile\n", 23 | "import math\n", 24 | "from sklearn.metrics import f1_score\n", 25 | "import jieba\n", 26 | "import jieba.posseg as psg\n", 27 | "from collections import Counter\n", 28 | "import functools\n", 29 | "from gensim.models import word2vec\n", 30 | "import Levenshtein\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "train_dataset = pd.read_csv('../temp/train_online_alldata_df.csv')\n", 42 | "test_dataset = pd.read_csv('../temp/test_online_alldata_df.csv')\n", 43 | "test_dataset_29 = pd.read_csv('../temp/test_online_df.csv')\n", 44 | "test_dataset = test_dataset[(len(test_dataset) - len(test_dataset_29)):]\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "fea = [\n", 56 | " 'query_prediction_number', 'query_prediction_max', 'query_prediction_min', 'query_prediction_mean', 'query_prediction_std',\n", 57 | " 'prefix_count', 'prefix_rate',\n", 58 | " 'title_count', 'title_rate', 'tag_count', 'tag_rate',\n", 59 | " 'query_prediction_count', 'query_prediction_rate', 'prefix_title_count',\n", 60 | " 'prefix_title_rate', 'prefix_tag_count', 'prefix_tag_rate',\n", 61 | " 'title_tag_count', 'title_tag_rate',\n", 62 | " 'prefix_click_number', 'title_click_number', 'query_prediction_click_number', 'prefix_tag_click_number', \n", 63 | " 'prefix_title_click_number', 'title_tag_click_number',\n", 64 | " 'is_title_in_query', 'is_prefix_in_title', \n", 65 | " 'title_tag_types', 'prefix_tag_types', 'tag_title_types', 'tag_prefix_types',\n", 66 | " 'title_prefix_types', 'prefix_title_types', 'tag_query_prediction_types', 'title_query_prediction_types',\n", 67 | " 'prefix_len', 'title_len',\n", 68 | " 'query_prediction_key_len_max', 'query_prediction_key_len_min',\n", 69 | " 
'query_prediction_key_len_mean', 'query_prediction_key_len_std',\n", 70 | " 'len_title-prefix', 'len_prefix/title', 'len_mean-title', 'len_mean/title',\n", 71 | " 'q_t_word_match', 'q_t_jaccard', 'q_t_common_words',\n", 72 | " 'q_t_total_unique_words', 'q_t_wc_diff', 'q_t_wc_ratio',\n", 73 | " 'q_t_wc_diff_unique', 'q_t_wc_ratio_unique', 'q_t_tfidf_word_match_share',\n", 74 | " 'p_t_word_match', 'p_t_jaccard', 'p_t_common_words',\n", 75 | " 'p_t_total_unique_words', 'p_t_wc_diff', 'p_t_wc_ratio',\n", 76 | " 'p_t_wc_diff_unique', 'p_t_wc_ratio_unique', 'p_t_tfidf_word_match_share',\n", 77 | " 'p_q_word_match', 'p_q_jaccard', 'p_q_common_words',\n", 78 | " 'p_q_total_unique_words', 'p_q_wc_diff', 'p_q_wc_ratio',\n", 79 | " 'p_q_wc_diff_unique', 'p_q_wc_ratio_unique', 'p_q_tfidf_word_match_share',\n", 80 | " 'title_prefix_dot_similarity',\n", 81 | " 'title_query_dot_similarity', 'title_prefix_norm_similarity',\n", 82 | " 'title_query_norm_similarity', 'title_prefix_cosine_similarity',\n", 83 | " 'title_query_cosine_similarity',\n", 84 | " 'title_query_dot_similarity_max', 'title_query_dot_similarity_min',\n", 85 | " 'title_query_dot_similarity_mean', 'title_query_dot_similarity_std',\n", 86 | " 'title_query_norm_similarity_min', 'title_query_norm_similarity_mean',\n", 87 | " 'title_query_norm_similarity_std', 'title_prefix_cosine_similarity',\n", 88 | " 'title_query_cosine_similarity_max', 'title_query_cosine_similarity_min',\n", 89 | " 'title_query_cosine_similarity_mean', 'title_query_cosine_similarity_std',\n", 90 | " 'title_prefix_leven', 'title_prefix_leven_rate',\n", 91 | " 'title_query_leven_sum', 'title_query_leven_max', 'title_query_leven_min',\n", 92 | " 'title_query_leven_mean', 'title_query_leven_std',\n", 93 | " ]\n", 94 | "\n", 95 | "\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": { 102 | "scrolled": true 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stderr", 107 | "output_type": "stream", 108 | "text": [ 109 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:99: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n", 110 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n" 111 | ] 112 | }, 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "0.37901486345508767\n", 118 | " importance\n", 119 | "prefix_title_rate 10894\n", 120 | "prefix_tag_rate 8246\n", 121 | "prefix_rate 5820\n", 122 | "query_prediction_rate 5368\n", 123 | "prefix_click_number 4187\n", 124 | "title_tag_rate 3935\n", 125 | "prefix_title_count 2837\n", 126 | "title_rate 2708\n", 127 | "prefix_title_click_number 2300\n", 128 | "prefix_tag_click_number 2220\n", 129 | "tag_rate 1976\n", 130 | "prefix_title_types 1937\n", 131 | "is_title_in_query 1784\n", 132 | "title_tag_count 1681\n", 133 | "prefix_tag_count 1671\n", 134 | "prefix_count 1331\n", 135 | "query_prediction_count 1284\n", 136 | "tag_count 1191\n", 137 | "query_prediction_click_number 1115\n", 138 | "prefix_tag_types 1114\n", 139 | "title_tag_click_number 1102\n", 140 | "title_query_norm_similarity_std 1093\n", 141 | "q_t_word_match 983\n", 142 | "title_query_norm_similarity_mean 921\n", 143 | "title_query_prediction_types 827\n", 144 | "title_tag_types 822\n", 145 | "title_count 779\n", 146 | "title_prefix_types 753\n", 147 | "q_t_tfidf_word_match_share 749\n", 148 | "tag_title_types 701\n", 149 | "... 
...\n", 150 | "q_t_wc_ratio 207\n", 151 | "p_q_word_match 206\n", 152 | "query_prediction_min 201\n", 153 | "q_t_wc_diff 194\n", 154 | "title_query_leven_std 186\n", 155 | "q_t_total_unique_words 182\n", 156 | "title_query_leven_min 180\n", 157 | "title_query_norm_similarity_min 159\n", 158 | "prefix_len 146\n", 159 | "query_prediction_key_len_min 142\n", 160 | "query_prediction_number 127\n", 161 | "p_t_total_unique_words 125\n", 162 | "p_q_total_unique_words 113\n", 163 | "p_q_wc_diff_unique 105\n", 164 | "len_title-prefix 101\n", 165 | "q_t_wc_diff_unique 97\n", 166 | "p_t_wc_ratio 94\n", 167 | "p_t_wc_ratio_unique 85\n", 168 | "p_t_word_match 81\n", 169 | "p_t_wc_diff 62\n", 170 | "tag_prefix_types 60\n", 171 | "title_prefix_leven 54\n", 172 | "p_t_common_words 51\n", 173 | "p_t_wc_diff_unique 51\n", 174 | "p_q_common_words 40\n", 175 | "tag_query_prediction_types 16\n", 176 | "p_t_jaccard 0\n", 177 | "q_t_jaccard 0\n", 178 | "p_q_jaccard 0\n", 179 | "title_prefix_cosine_similarity 0\n", 180 | "\n", 181 | "[97 rows x 1 columns]\n" 182 | ] 183 | }, 184 | { 185 | "name": "stderr", 186 | "output_type": "stream", 187 | "text": [ 188 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:18: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "lgb_model = lgb.LGBMClassifier(\n", 194 | " boosting_type='gbdt', num_leaves=127, max_depth=-1, n_estimators=5000, objective='binary',\n", 195 | " subsample=0.8, colsample_bytree=1, subsample_freq=1,\n", 196 | " learning_rate=0.01, random_state=2018, n_jobs=-1, num_boost_round=710\n", 197 | ")\n", 198 | "\n", 199 | "test_dataset['predicted_score'] = 0\n", 200 | "\n", 201 | "# lgb_model.fit(train_df[fea], train_df['label'], eval_set=[(train_df[fea], train_df['label']),\n", 202 | "# (valid_df[fea], valid_df['label'])], early_stopping_rounds=50, eval_metric='auc')\n", 203 | "lgb_model.fit(train_dataset[fea], train_dataset['label'], eval_metric='auc')\n", 204 | "test_pred = lgb_model.predict_proba(test_dataset[fea], num_iteration=710)[:, 1]\n", 205 | "print(np.mean(test_pred))\n", 206 | "\n", 207 | "fscore = lgb_model.booster_.feature_importance()\n", 208 | "feaNames = lgb_model.booster_.feature_name()\n", 209 | "scoreDf = pd.DataFrame(index=feaNames, columns=['importance'], data=fscore)\n", 210 | "print(scoreDf.sort_index(by=['importance'], ascending=False))\n", 211 | "\n", 212 | "\n" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "0.3742659758197111\n", 225 | "0.44615346651564886\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "test_dataset['predicted_score'] = test_pred\n", 231 | "\n", 232 | "train_prefix_set = set(train_dataset['prefix'])\n", 233 | "\n", 234 | "test_dataset['is_prefix_in_train'] = test_dataset['prefix'].map(lambda x : 1 if x in train_prefix_set else 0)\n", 235 | "print(np.mean(test_dataset[test_dataset.is_prefix_in_train == 1]['predicted_score']))\n", 236 | "print(np.mean(test_dataset[test_dataset.is_prefix_in_train == 0]['predicted_score']))\n", 237 | "\n", 238 | "\n" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 10, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "original mean : 0.3742659758197111\n", 251 | "0.4461603692668604\n" 252 | ] 253 | } 254 | 
], 255 | "source": [ 256 | "test_prefix0_df = test_dataset[test_dataset.is_prefix_in_train == 1].copy()\n", 257 | "\n", 258 | "#定义调整函数\n", 259 | "def resultAdjustment(result_df, t):\n", 260 | " result_df_temp = result_df.copy()\n", 261 | " result_df_temp['x'] = result_df_temp.predicted_score.map(lambda x: -(math.log(((1 - x) / x), math.e)))\n", 262 | " result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) \n", 263 | " print(result_df_temp['adjust_result'].mean())\n", 264 | " return result_df_temp['adjust_result']\n", 265 | "\n", 266 | "print('original mean : ', test_prefix0_df['predicted_score'].mean())\n", 267 | "test_df_after = resultAdjustment(test_prefix0_df, 0.55585)\n" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 11, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "0.44615346651564886\n", 280 | "0.4461603692668604\n" 281 | ] 282 | }, 283 | { 284 | "name": "stderr", 285 | "output_type": "stream", 286 | "text": [ 287 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 288 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 289 | "\n", 290 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 291 | " \"\"\"Entry point for launching an IPython kernel.\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 1] = test_df_after\n", 297 | "print(np.mean(test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 0]))\n", 298 | "print(np.mean(test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 1]))\n", 299 | "\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 14, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "name": "stdout", 309 | "output_type": "stream", 310 | "text": [ 311 | "0.40802\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "test_dataset['predicted_label'] = test_dataset['predicted_score'].map(lambda x : 1 if x > 0.515 else 0)\n", 317 | "print(np.mean(test_dataset['predicted_label']))\n", 318 | "\n" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 15, 324 | "metadata": { 325 | "collapsed": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "# 导出预测结果\n", 330 | "def exportResult(df, fileName):\n", 331 | " df.to_csv('../result/%s.csv' % fileName, header=False, index=False)\n", 332 | "\n", 333 | "exportResult(test_dataset[['predicted_label']], 'lgb_yi_alldata_11_3')\n", 334 | "\n" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "collapsed": true 342 | }, 343 | "outputs": [], 344 | "source": [] 345 | } 346 | ], 347 | "metadata": { 348 | "kernelspec": { 349 | "display_name": "Python 3", 350 | "language": "python", 351 | "name": "python3" 352 | }, 353 | "language_info": { 354 | "codemirror_mode": { 355 | "name": "ipython", 356 | "version": 3 357 | }, 358 | "file_extension": ".py", 359 | "mimetype": "text/x-python", 360 | "name": "python", 361 | "nbconvert_exporter": "python", 362 | "pygments_lexer": "ipython3", 363 | "version": "3.6.1" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /fusai/offline_keng_20181122.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import pandas as pd 5 | import numpy as np 6 | import lightgbm as lgb 7 | from sklearn.metrics import f1_score 8 | import scipy as sp 9 | from sklearn.preprocessing import LabelEncoder,OneHotEncoder 10 | import jieba 11 | from Levenshtein import distance as lev_distance 12 | from sklearn.model_selection import KFold 13 | from gensim.models import KeyedVectors, Word2Vec 14 | from time import time 15 | from multiprocessing import Pool 16 | import gc 17 | import warnings 18 | 19 | warnings.filterwarnings("ignore") 20 | 21 | 22 | def importDf(url, sep='\t', na_values=None, header=None, index_col=None, colNames=None): 23 | df = pd.read_table(url, names=colNames, header=header, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 24 | return df 25 | 26 | def one_zero2(data,thre): 27 | if data0]=1 253 | sample['query_num'] = np.sum(num,axis=1) 254 | 255 | sample['weight_sum'] = np.sum(weights,1) 256 | sample = min_max_mean_std(sample,weights,'weight','') 257 | 258 | print(' cost: %.1f ' %(time()-start)) 259 | return sample 260 | 261 | def get_sentence_vec(sentence): 262 | s_vector = np.zeros((len(w2v_model['我']))) 263 | if sentence: 264 | count=0 265 | # sentence = sentence.replace('%2C', ',') 266 | for word in jieba.cut(sentence) : 267 | if word not in stop_words: 268 | try: 269 | vec = w2v_model[word] 270 | s_vector += vec 271 | count += 1 272 | except (KeyError): 273 | pass 274 | if count: 275 | s_vector /= count 276 | return s_vector 277 | 278 | def cosine(v1,v2): 279 | if len(v1.shape)==1: 280 | multi = np.dot(v1,v2) 281 | axis=None 282 | else: 283 | multi = np.sum(v1*v2,1) 284 | axis=1 285 | s1_norm = np.linalg.norm(v1,axis=axis) 286 | s2_norm = np.linalg.norm(v2,axis=axis) 287 | cos = multi/(s1_norm*s2_norm+0.001) 288 | return cos 289 | 290 | def sentence_simi(s1,s2): 291 | s1_vec = get_sentence_vec(s1) 292 | s2_vec = get_sentence_vec(s2) 293 | cos = cosine(s1_vec,s2_vec) 294 | return cos 295 | 296 | def query_data_cos(query,data): 297 | q_data_cos = [sentence_simi(q,data) for q in query] 298 | return q_data_cos 299 | 300 | def word2vec_features_1(sample,cos_feature=False): 301 | print('------ word2vec features 1',end='') 302 | start = time() 303 | 304 | title_embed = map_to_array(get_sentence_vec,sample['title']) 305 | prefix_embed = map_to_array(get_sentence_vec,sample['prefix']) 306 | 307 | max_w_query = querys[idx,weight_argmax] 308 | mx_w_query_embed = map_to_array(get_sentence_vec,max_w_query) 309 | 310 | if cos_feature: 311 | querys_title_cos = [cosine(map_to_array(get_sentence_vec,querys[:,i],paral=True),title_embed) for i in range(11)] 312 | querys_title_cos = np.array(querys_title_cos).T 313 | sample = min_max_mean_std(sample,querys_title_cos,'querys_title','cos') 314 | sample['mx_w_query_title_cos'] = querys_title_cos[idx,weight_argmax] 315 | 316 | sample['prefix_title_cos'] = cosine(title_embed,prefix_embed) 317 | sample['prefix_mx_query_cos'] = cosine(prefix_embed,mx_w_query_embed) 318 | sample['mx_w_query_title_cos'] = cosine(mx_w_query_embed,title_embed) 319 | 320 | t_p_embed = title_embed - prefix_embed 321 | t_mq_embed = title_embed - mx_w_query_embed 322 | 323 | title_embed = pd.DataFrame(title_embed,columns=['title_w2v_'+str(i) for i in range(50)], index=sample.index) 324 | sample = pd.concat([sample,title_embed],axis=1) 325 | 326 | prefix_embed = pd.DataFrame(prefix_embed,columns=['prefix_w2v_'+str(i) for i in range(50)], 
index=sample.index) 327 | sample = pd.concat([sample,prefix_embed],axis=1) 328 | 329 | mx_w_query_embed = pd.DataFrame(mx_w_query_embed,columns=['mx_w_query_w2v_'+str(i) for i in range(50)], index=sample.index) 330 | sample = pd.concat([sample,mx_w_query_embed],axis=1) 331 | 332 | t_p_embed = pd.DataFrame(t_p_embed,columns=['t_p_w2v_'+str(i) for i in range(50)], index=sample.index) 333 | sample = pd.concat([sample,t_p_embed],axis=1) 334 | 335 | t_mq_embed = pd.DataFrame(t_mq_embed,columns=['t_mq_w2v_'+str(i) for i in range(50)], index=sample.index) 336 | sample = pd.concat([sample,t_mq_embed],axis=1) 337 | 338 | print(' cost: %.1f ' %(time()-start)) 339 | return sample 340 | 341 | def word2vec_features_2(sample): 342 | print('------ word2vec features 2',end='') 343 | start = time() 344 | 345 | def calc_all_cos(s1,s2,s3): 346 | prefix_embed = get_sentence_vec(s1) 347 | title_embed = get_sentence_vec(s2) 348 | mx_w_query_embed = get_sentence_vec(s3) 349 | cos = [0,0,0] 350 | cos[0] = cosine(prefix_embed,title_embed) 351 | cos[1] = cosine(prefix_embed,mx_w_query_embed) 352 | cos[2] = cosine(title_embed,mx_w_query_embed) 353 | return cos 354 | 355 | max_w_query = querys[idx,weight_argmax] 356 | cos = list(map(calc_all_cos,sample['prefix'],sample['title'],max_w_query)) 357 | 358 | cos = pd.DataFrame(cos,columns=['prefix_title_cos_2','prefix_mx_query_cos_2','mx_w_query_title_cos_2'], index=sample.index) 359 | 360 | sample = pd.concat([sample,cos],axis=1) 361 | print(' cost: %.1f ' %(time()-start)) 362 | return sample 363 | 364 | def jaccard_features(sample): 365 | print('------ jaccard features',end='') 366 | start = time() 367 | def jaccard(s1,s2): 368 | inter=len([w for w in s1 if w in s2]) 369 | union = len(s1)+len(s2)-inter 370 | return inter/(union+0.001) 371 | 372 | def jaccard_dist(querys,data): 373 | res = [jaccard(q,data) for q in querys] 374 | return res 375 | 376 | print('jaccard->querys:', querys.shape) 377 | querys_title_jac = map_to_array(jaccard_dist,querys,sample['title']) 378 | sample = min_max_mean_std(sample,querys_title_jac,'query_title','jac') 379 | sample['mx_w_query_title_jac'] = querys_title_jac[idx,weight_argmax] 380 | sample['prefix_title_jac'] = list(map(jaccard,sample['prefix'],sample['title'])) 381 | 382 | jacs = pd.DataFrame(-np.sort(-querys_title_jac,axis=1),columns=['query_title_jac_'+str(i) for i in range(11)], index=sample.index) 383 | 384 | sample = pd.concat([sample,jacs],axis=1) 385 | 386 | sample = sample.fillna(0) 387 | print(' cost: %.1f ' %(time()-start)) 388 | return sample 389 | 390 | 391 | sample = str_lower(sample) 392 | tempDf = sample.drop_duplicates(['prefix','query_prediction','title']) 393 | # tempDf['old_prefix'] = tempDf['prefix'] 394 | # tempDf['old_title'] = tempDf['title'] 395 | querys,weights,norm_weights = get_query_weight(tempDf['query_prediction']) 396 | # tempDf.drop(['query_prediction'],axis=1,inplace=True) 397 | gc.collect() 398 | idx,weight_argmax = get_max_weight_idx() 399 | tempDf = weight_features(tempDf) 400 | gc.collect() 401 | tempDf = len_features(tempDf) 402 | gc.collect() 403 | tempDf = lev_features(tempDf) 404 | gc.collect() 405 | tempDf = jaccard_features(tempDf) 406 | gc.collect() 407 | w2v_model = w2v_model_1 408 | tempDf = word2vec_features_1(tempDf) 409 | gc.collect() 410 | 411 | # w2v_model = w2v_model_2 412 | # tempDf = word2vec_features_2(tempDf) 413 | # gc.collect() 414 | sample = sample.merge( 415 | tempDf[['prefix','query_prediction','title'] + np.setdiff1d(tempDf.columns,sample.columns).tolist()], 416 | 
how='left', 417 | on=['prefix','query_prediction','title']) 418 | return sample 419 | 420 | def runLGBCV(train_X, train_y,vali_X=None,vali_y=None, seed_val=2012, num_rounds = 2000): 421 | def lgb_f1_score(y_hat, data): 422 | y_true = data.get_label() 423 | y_hat = np.round(y_hat) 424 | return 'f1', f1_score(y_true, y_hat), True 425 | 426 | params = { 427 | 'task': 'train', 428 | 'boosting_type': 'gbdt', 429 | 'objective': 'binary', 430 | 'metric': 'binary_logloss', 431 | 'num_leaves': 127, 432 | 'learning_rate': 0.02, 433 | 'feature_fraction': 1, 434 | 'num_threads':-1, 435 | 'seed':2018, 436 | 'is_training_metric':True, 437 | } 438 | 439 | lgb_train = lgb.Dataset(train_X, train_y) 440 | 441 | if vali_y is not None: 442 | lgb_vali = lgb.Dataset(vali_X,vali_y) 443 | model = lgb.train(params,lgb_train,num_boost_round=num_rounds,verbose_eval=10,early_stopping_rounds=200, 444 | valid_sets=[lgb_vali, lgb_train],valid_names=['val', 'train']) 445 | 446 | else: 447 | model = lgb.train(params,lgb_train,num_boost_round=num_rounds,verbose_eval=10, 448 | valid_sets=[lgb_train],valid_names=['train']) 449 | 450 | return model,model.best_iteration 451 | 452 | def get_x_y(data): 453 | drop_list = ['prefix','query_prediction','title'] 454 | if 'label' in data.columns: 455 | y = data['label'] 456 | data.drop(drop_list+['label'],axis=1,inplace=True) 457 | else: 458 | y=None 459 | data.drop(drop_list,axis=1,inplace=True) 460 | print('------ ',data.shape) 461 | return data,y 462 | 463 | def train_and_predict(samples,vali_samples,num_rounds=3000): 464 | print('-- train and predict') 465 | print('---- get x and y') 466 | train_x,train_y = get_x_y(samples) 467 | vali_X,vali_y = get_x_y(vali_samples) 468 | 469 | print('---- training') 470 | model,best_iter = runLGBCV(train_x, train_y,vali_X,vali_y,num_rounds=num_rounds) 471 | print('best_iteration:',best_iter) 472 | 473 | print('---- predict') 474 | vali_pred = model.predict(vali_X) 475 | return model,best_iter,vali_pred,vali_y 476 | 477 | def result_analysis(res): 478 | print('mean : ',np.mean(res)) 479 | 480 | if __name__ == "__main__": 481 | # 路径 482 | train_dir = '../data/data_train.txt' 483 | vali_dir = '../data/data_vali.txt' 484 | test_dir = '../data/data_test.txt' 485 | vec_dir_1 = '../data/keng_2018seed0_drop/w2v_total_final_50wei_1.model' 486 | vec_dir_2 = '../data/merge_sgns_bigram_char300/merge_sgns_bigram_char300.txt' 487 | srop_word_dir = '../data/user_stopwords.dat' 488 | test_result_dir = './lake_20181122.csv' 489 | 490 | # jieba.load_userdict('../data/user_dict.dat') 491 | 492 | # 导入数据 493 | print("-- 导入原始数据", end='') 494 | start = time() 495 | raw_train = importDf(train_dir, colNames=['prefix','query_prediction','title','tag','label']) 496 | raw_vali = importDf(vali_dir, colNames=['prefix','query_prediction','title','tag','label']) 497 | print(' cost: %.1f ' %(time() - start)) 498 | 499 | # temp_df = pd.read_csv('./kmeans_1fea_df.csv') 500 | # raw_train = pd.concat([raw_train, temp_df[:len(raw_train)]], axis=1) 501 | # print(raw_train.shape) 502 | # raw_vali = pd.concat([raw_vali, (temp_df.iloc[len(raw_train):(len(raw_train) + len(raw_vali))]).reset_index(drop=True)], axis=1) 503 | # print(raw_vali.shape) 504 | 505 | # 清洗数据 506 | print("-- 清洗数据", end='') 507 | start = time() 508 | raw_train['query_prediction'].replace({'':'{}', np.nan:'{}'}, inplace=True) 509 | raw_vali['query_prediction'].replace({'':'{}', np.nan:'{}'}, inplace=True) 510 | raw_train = str_lower(raw_train) 511 | raw_vali = str_lower(raw_vali) 512 | gc.collect() 513 | print(' 
cost: %.1f ' %(time() - start)) 514 | 515 | # raw_train['prefix_position'] = raw_train.apply(get_prefix_position, axis=1) 516 | # raw_vali['prefix_position'] = raw_vali.apply(get_prefix_position, axis=1) 517 | 518 | ## 提取统计特征 519 | # 提取训练集统计特征 520 | print("-- 提取训练集统计特征", end='') 521 | start = time() 522 | raw_train = k_fold_stat_features(raw_train) 523 | gc.collect() 524 | print(' cost: %.1f ' %(time() - start)) 525 | 526 | # 提取验证集统计特征 527 | print("-- 提取验证集统计特征", end='') 528 | start = time() 529 | raw_vali = stat_features(raw_train, raw_vali) 530 | gc.collect() 531 | print(' cost: %.1f ' %(time() - start)) 532 | 533 | ## 提取文本特征 534 | ''' 535 | 1、tag进行labelEncoder 536 | ''' 537 | print("-- 对tag进行encoder", end='') 538 | start = time() 539 | encoder = get_tag_dict(raw_train) 540 | # print(raw_train['tag'].unique()) 541 | raw_train['tag'] = encoder.transform(raw_train['tag']) 542 | # print(raw_vali['tag'].unique()) 543 | raw_vali['tag'] = encoder.transform(raw_vali['tag']) 544 | print(' cost: %.1f ' %(time() - start)) 545 | del encoder 546 | gc.collect() 547 | 548 | ''' 549 | 2、tag进行onehot 550 | ''' 551 | raw_train = pd.get_dummies(raw_train, columns=['tag']) 552 | gc.collect() 553 | raw_vali = pd.get_dummies(raw_vali, columns=['tag']) 554 | gc.collect() 555 | 556 | 557 | ''' 558 | #3、其他 559 | ''' 560 | # 导入模型和停用词表 561 | print("-- 导入词模型和停用词表", end='') 562 | start = time() 563 | w2v_model_1 = read_w2v_model(vec_dir_1) 564 | w2v_model_2 = None 565 | #w2v_model_2 = read_w2v_model(vec_dir_2, persist=False) 566 | stop_words = read_stop_word(srop_word_dir) 567 | print(' cost: %.1f ' %(time() - start)) 568 | 569 | # 提取其他文本特征 570 | print("-- 提取训练集其他文本特征", end='') 571 | start = time() 572 | raw_train = text_features(raw_train, w2v_model_1, w2v_model_2, stop_words) 573 | gc.collect() 574 | print("-- 提取验证集其他文本特征", end='') 575 | raw_vali = text_features(raw_vali, w2v_model_1, w2v_model_2, stop_words) 576 | del w2v_model_1, stop_words 577 | gc.collect() 578 | 579 | 580 | #raw_train.to_csv('train_re.csv', index=False) 581 | #raw_vali.to_csv('vali_re.csv', index=False) 582 | 583 | 584 | model,best_iter,vali_pred,vali_y = train_and_predict(raw_train, raw_vali) 585 | 586 | scores = [] 587 | print('-- search best split point') 588 | for thre in range(100): 589 | thre *=0.01 590 | score = f1_score(vali_y,list(map(one_zero2,vali_pred,[thre]*len(vali_pred)))) 591 | scores.append(score) 592 | 593 | scores = np.array(scores) 594 | best_5 = np.argsort(scores)[-5:] 595 | best_5_s = scores[best_5] 596 | for x,y in zip(best_5,best_5_s): 597 | print('%.2f %.4f' %(0.01*x,y)) 598 | max_thre = np.mean(best_5)*0.01 599 | -------------------------------------------------------------------------------- /fusai/lake_20181118.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import pandas as pd 5 | import numpy as np 6 | import lightgbm as lgb 7 | from sklearn.metrics import f1_score 8 | import scipy as sp 9 | from sklearn.preprocessing import LabelEncoder,OneHotEncoder 10 | import jieba 11 | from Levenshtein import distance as lev_distance 12 | from sklearn.model_selection import KFold 13 | from gensim.models import KeyedVectors, Word2Vec 14 | from time import time 15 | from multiprocessing import Pool 16 | import gc 17 | 18 | def importDf(url, sep='\t', na_values=None, header=None, index_col=None, colNames=None): 19 | df = pd.read_table(url, names=colNames, header=header, na_values='', keep_default_na=False, encoding='utf-8', quoting=3) 20 | 
return df 21 | 22 | def importCacheDf(url): 23 | df = df = pd.read_csv(url, na_values='', keep_default_na=False) 24 | return df 25 | 26 | def clean_data(): 27 | # raw_train.drop(1815101, inplace=True) 28 | # raw_train.drop(['aa'],axis=1,inplace=True) 29 | raw_train.reset_index(drop=True,inplace=True) 30 | raw_train['label'] = raw_train['label'].astype(int) 31 | raw_train['query_prediction'].replace({'':'{}',np.nan:'{}'},inplace=True) 32 | raw_vali['query_prediction'].replace({'':'{}',np.nan:'{}'},inplace=True) 33 | raw_testa['query_prediction'].replace({'':'{}',np.nan:'{}'},inplace=True) 34 | 35 | def read_w2v_model(model_dir,persist=True): 36 | if persist: 37 | w2v_model = Word2Vec.load(model_dir) 38 | else: 39 | w2v_model = KeyedVectors.load_word2vec_format(model_dir) 40 | return w2v_model 41 | 42 | 43 | def one_zero2(data,thre): 44 | if data0]=1 208 | sample['query_num'] = np.sum(num,axis=1) 209 | 210 | sample['weight_sum'] = np.sum(weights,1) 211 | sample = min_max_mean_std(sample,weights,'weight','') 212 | 213 | print(' cost: %.1f ' %(time()-start)) 214 | return sample 215 | 216 | def get_sentence_vec(sentence): 217 | s_vector = np.zeros((len(w2v_model['我']))) 218 | if sentence: 219 | count=0 220 | for word in jieba.cut(sentence) : 221 | if word not in stop_words: 222 | try: 223 | vec = w2v_model[word] 224 | s_vector += vec 225 | count += 1 226 | except (KeyError): 227 | pass 228 | if count: 229 | s_vector /= count 230 | return s_vector 231 | 232 | def cosine(v1,v2): 233 | if len(v1.shape)==1: 234 | multi = np.dot(v1,v2) 235 | axis=None 236 | else: 237 | multi = np.sum(v1*v2,1) 238 | axis=1 239 | s1_norm = np.linalg.norm(v1,axis=axis) 240 | s2_norm = np.linalg.norm(v2,axis=axis) 241 | cos = multi/(s1_norm*s2_norm+0.001) 242 | return cos 243 | 244 | 245 | def sentence_simi(s1,s2): 246 | s1_vec = get_sentence_vec(s1) 247 | s2_vec = get_sentence_vec(s2) 248 | cos = cosine(s1_vec,s2_vec) 249 | return cos 250 | 251 | def query_data_cos(query,data): 252 | q_data_cos = [sentence_simi(q,data) for q in query] 253 | return q_data_cos 254 | 255 | def word2vec_features_1(sample,cos_feature=False): 256 | print('------ word2vec features 1',end='') 257 | start = time() 258 | 259 | title_embed = map_to_array(get_sentence_vec,sample['title']) 260 | prefix_embed = map_to_array(get_sentence_vec,sample['prefix']) 261 | 262 | max_w_query = querys[idx,weight_argmax] 263 | mx_w_query_embed = map_to_array(get_sentence_vec,max_w_query) 264 | 265 | if cos_feature: 266 | querys_title_cos = [cosine(map_to_array(get_sentence_vec,querys[:,i],paral=True),title_embed) for i in range(11)] 267 | querys_title_cos = np.array(querys_title_cos).T 268 | sample = min_max_mean_std(sample,querys_title_cos,'querys_title','cos') 269 | sample['mx_w_query_title_cos'] = querys_title_cos[idx,weight_argmax] 270 | 271 | sample['prefix_title_cos'] = cosine(title_embed,prefix_embed) 272 | sample['prefix_mx_query_cos'] = cosine(prefix_embed,mx_w_query_embed) 273 | sample['mx_w_query_title_cos'] = cosine(mx_w_query_embed,title_embed) 274 | 275 | title_embed = pd.DataFrame(title_embed,columns=['title_w2v_'+str(i) for i in range(50)]) 276 | sample = pd.concat([sample,title_embed],axis=1) 277 | 278 | prefix_embed = pd.DataFrame(prefix_embed,columns=['prefix_w2v_'+str(i) for i in range(50)]) 279 | sample = pd.concat([sample,prefix_embed],axis=1) 280 | 281 | mx_w_query_embed = pd.DataFrame(mx_w_query_embed,columns=['mx_w_query_w2v_'+str(i) for i in range(50)]) 282 | sample = pd.concat([sample,mx_w_query_embed],axis=1) 283 | 284 | print(' 
cost: %.1f ' %(time()-start)) 285 | return sample 286 | 287 | 288 | 289 | def word2vec_features_2(sample): 290 | print('------ word2vec features 2',end='') 291 | start = time() 292 | 293 | def calc_all_cos(s1,s2,s3): 294 | prefix_embed = get_sentence_vec(s1) 295 | title_embed = get_sentence_vec(s2) 296 | mx_w_query_embed = get_sentence_vec(s3) 297 | cos = [0,0,0] 298 | cos[0] = cosine(prefix_embed,title_embed) 299 | cos[1] = cosine(prefix_embed,mx_w_query_embed) 300 | cos[2] = cosine(title_embed,mx_w_query_embed) 301 | return cos 302 | 303 | max_w_query = querys[idx,weight_argmax] 304 | cos = list(map(calc_all_cos,sample['prefix'],sample['title'],max_w_query)) 305 | cos = pd.DataFrame(cos,columns=['prefix_title_cos_2','prefix_mx_query_cos_2','mx_w_query_title_cos_2']) 306 | sample = pd.concat([sample,cos],axis=1) 307 | print(' cost: %.1f ' %(time()-start)) 308 | return sample 309 | 310 | def jaccard_features(sample): 311 | print('------ jaccard features',end='') 312 | start = time() 313 | def jaccard(s1,s2): 314 | inter=len([w for w in s1 if w in s2]) 315 | union = len(s1)+len(s2)-inter 316 | return inter/(union+0.001) 317 | 318 | def jaccard_dist(querys,data): 319 | res = [jaccard(q,data) for q in querys] 320 | return res 321 | 322 | querys_title_jac = map_to_array(jaccard_dist,querys,sample['title']) 323 | sample = min_max_mean_std(sample,querys_title_jac,'query_title','jac') 324 | sample['mx_w_query_title_jac'] = querys_title_jac[idx,weight_argmax] 325 | sample['prefix_title_jac'] = list(map(jaccard,sample['prefix'],sample['title'])) 326 | 327 | jacs = pd.DataFrame(-np.sort(-querys_title_jac,axis=1),columns=['query_title_jac_'+str(i) for i in range(11)]) 328 | sample = pd.concat([sample,jacs],axis=1) 329 | 330 | sample = sample.fillna(0) 331 | print(' cost: %.1f ' %(time()-start)) 332 | return sample 333 | 334 | 335 | sample = str_lower(sample) 336 | querys,weights,norm_weights = get_query_weight(sample['query_prediction']) 337 | # sample.drop(['query_prediction'],axis=1,inplace=True) 338 | gc.collect() 339 | idx,weight_argmax = get_max_weight_idx() 340 | sample = tag_one_hot(sample) 341 | gc.collect() 342 | sample = weight_features(sample) 343 | gc.collect() 344 | sample = len_features(sample) 345 | gc.collect() 346 | sample = lev_features(sample) 347 | gc.collect() 348 | sample = jaccard_features(sample) 349 | gc.collect() 350 | w2v_model = w2v_model_1 351 | sample = word2vec_features_1(sample) 352 | gc.collect() 353 | #w2v_model = w2v_model_2 354 | #sample = word2vec_features_2(sample) 355 | #gc.collect() 356 | return sample 357 | 358 | 359 | def stat_features(raw,sample): 360 | def ctr_features(raw,sample): 361 | print('------ ctr features',end='') 362 | start = time() 363 | def ctr(raw,sample,stat_list): 364 | rate_stat = raw[stat_list+['label']].groupby(stat_list).mean().reset_index() 365 | rate_stat = rate_stat.rename(columns={'label':'_'.join(stat_list)+'_ctr'}) 366 | sample = pd.merge(sample,rate_stat,on=stat_list,how='left') 367 | 368 | count_stat = raw[stat_list+['label']].groupby(stat_list).count().reset_index() 369 | count_stat = count_stat.rename(columns={'label':'_'.join(stat_list)+'_count'}) 370 | sample = pd.merge(sample,count_stat,on=stat_list,how='left').fillna(0) 371 | 372 | click_stat = raw[stat_list+['label']].groupby(stat_list).sum().reset_index() 373 | click_stat = click_stat.rename(columns={'label':'_'.join(stat_list)+'_click'}) 374 | sample = pd.merge(sample,click_stat,on=stat_list,how='left').fillna(0) 375 | 376 | return sample 377 | 378 | stat_ls = 
[['prefix'], 379 | ['title'], 380 | ['tag'], 381 | ['prefix','title'], 382 | ['prefix','tag'], 383 | ['title','tag'], 384 | ['prefix','title','tag']] 385 | for l in stat_ls: 386 | sample = ctr(raw,sample,l) 387 | gc.collect() 388 | 389 | print(' cost: %.1f ' %(time()-start)) 390 | return sample 391 | 392 | def lake_features(raw,sample): 393 | print('------ lake features ',end='') 394 | start = time() 395 | def get_nunique(raw,sample,c1,c2): 396 | n_stat = raw[[c1,c2]].drop_duplicates() 397 | n_stat = n_stat.groupby(c1).count().reset_index() 398 | n_stat.columns = [c1, c1+'_'+c2+'_nunique'] 399 | sample = pd.merge(sample,n_stat,on=c1,how='left').fillna(0) 400 | return sample 401 | 402 | c1_list = ['prefix','title','prefix','title'] 403 | c2_list = ['title','prefix','tag','tag'] 404 | for c1,c2 in zip(c1_list,c2_list): 405 | sample = get_nunique(raw,sample,c1,c2) 406 | 407 | print(' cost: %.1f ' %(time()-start)) 408 | return sample 409 | 410 | 411 | sample = str_lower(sample) 412 | sample = lake_features(raw,sample) 413 | sample = ctr_features(raw,sample) 414 | return sample 415 | 416 | def runLGBCV(train_X, train_y,vali_X=None,vali_y=None, seed_val=2012, num_rounds = 2000): 417 | def lgb_f1_score(y_hat, data): 418 | y_true = data.get_label() 419 | y_hat = np.round(y_hat) 420 | return 'f1', f1_score(y_true, y_hat), True 421 | 422 | params = { 423 | 'task': 'train', 424 | 'boosting_type': 'gbdt', 425 | 'objective': 'binary', 426 | 'metric': 'binary_logloss', 427 | 'num_leaves': 127, 428 | 'learning_rate': 0.02, 429 | 'feature_fraction': 1, 430 | 'num_threads':-1, 431 | 'is_training_metric':True, 432 | } 433 | 434 | lgb_train = lgb.Dataset(train_X, train_y) 435 | 436 | if vali_y is not None: 437 | lgb_vali = lgb.Dataset(vali_X,vali_y) 438 | model = lgb.train(params,lgb_train,num_boost_round=num_rounds,verbose_eval=10,early_stopping_rounds=200, 439 | valid_sets=[lgb_vali, lgb_train],valid_names=['val', 'train']) 440 | 441 | else: 442 | model = lgb.train(params,lgb_train,num_boost_round=num_rounds,verbose_eval=10, 443 | valid_sets=[lgb_train],valid_names=['train']) 444 | 445 | return model,model.best_iteration 446 | 447 | def get_x_y(data): 448 | drop_list = ['prefix','query_prediction','title','tag'] 449 | if 'label' in data.columns: 450 | y = data['label'] 451 | data.drop(drop_list+['label'],axis=1,inplace=True) 452 | else: 453 | y=None 454 | data.drop(drop_list,axis=1,inplace=True) 455 | print('------ ',data.shape) 456 | return data,y 457 | 458 | def k_fold_stat_features(data,k=5): 459 | print('-- get 5 fold stat features') 460 | kf = KFold(n_splits=k) 461 | samples = [] 462 | for raw_idx,sample_idx in kf.split(data.index): 463 | gc.collect() 464 | raw = data[data.index.isin(raw_idx)].reset_index(drop=True) 465 | sample = data[data.index.isin(sample_idx)].reset_index(drop=True) 466 | sample = stat_features(raw,sample) 467 | samples.append(sample) 468 | samples = pd.concat(samples,ignore_index=True) 469 | samples = samples.reset_index(drop=True) 470 | gc.collect() 471 | return samples 472 | 473 | def train_and_predict(samples,vali_samples,num_rounds=3000): 474 | print('-- train and predict') 475 | print('---- get x and y') 476 | train_x,train_y = get_x_y(samples) 477 | vali_X,vali_y = get_x_y(vali_samples) 478 | 479 | print('---- training') 480 | model,best_iter = runLGBCV(train_x, train_y,vali_X,vali_y,num_rounds=num_rounds) 481 | print('best_iteration:',best_iter) 482 | 483 | print('---- predict') 484 | vali_pred = model.predict(vali_X) 485 | return model,best_iter,vali_pred,vali_y 486 | 487 
| def result_analysis(res): 488 | print('mean : ',np.mean(res)) 489 | ##----------------------------------------------------------------------------- 490 | if __name__=='__main__': 491 | print('2018-11-15 19:45') 492 | train_dir = '../data/data_train.txt' 493 | vali_dir = '../data/data_vali.txt' 494 | test_dir = '../data/data_test.txt' 495 | vec_dir_1 = '../data/w2v_model/w2v_total_50wei.model' 496 | #vec_dir_2 = '../data/merge_sgns_bigram_char300/merge_sgns_bigram_char300.txt' 497 | srop_word_dir = '../data/stop_words.txt' 498 | test_result_dir = './lake_20181118.csv' 499 | 500 | print('prepare data') 501 | print('read raw data') 502 | raw_train = importDf(train_dir,colNames=['prefix','query_prediction','title','tag','label']) 503 | raw_vali = importDf(vali_dir,colNames=['prefix','query_prediction','title','tag','label']) 504 | raw_testa = importDf(test_dir,colNames=['prefix', 'query_prediction', 'title', 'tag']) 505 | 506 | vali_start = time() 507 | clean_data() 508 | # raw_train = get_index(raw_train) 509 | # raw_vali = get_index(raw_vali) 510 | encoder = get_tag_dict(raw_train) 511 | w2v_model_1 = read_w2v_model(vec_dir_1) 512 | #w2v_model_2 = read_w2v_model(vec_dir_2,persist=False) 513 | stop_words = read_stop_word(srop_word_dir) 514 | 515 | # raw_train = raw_train.head(10000) 516 | # raw_vali = raw_vali.head(1000) 517 | # raw_testa = raw_testa.head(1000) 518 | 519 | 520 | print('validation') 521 | print('-- get train sample') 522 | train = text_features(raw_train) 523 | train = k_fold_stat_features(train) 524 | 525 | print('-- get vali sample') 526 | vali = text_features(raw_vali) 527 | vali = stat_features(raw_train,vali) 528 | 529 | cols = list(train.columns) 530 | 531 | print('-- get final sample') 532 | raw_data = pd.concat([train,vali],ignore_index=True).reset_index(drop=True) 533 | drop_list = [c for c in raw_data.columns if 'ctr' in c or 'count' in c or 'click' in c or 'nunique' in c] 534 | raw_data.drop(drop_list,axis=1,inplace=True) 535 | 536 | del raw_train,raw_vali 537 | gc.collect() 538 | train.to_csv('train_1118.csv', index=False) 539 | vali.to_csv('vali_1118.csv', index=False) 540 | model,best_iter,vali_pred,vali_y = train_and_predict(train,vali) 541 | 542 | scores = [] 543 | print('-- search best split point') 544 | for thre in range(100): 545 | thre *=0.01 546 | score = f1_score(vali_y,list(map(one_zero2,vali_pred,[thre]*len(vali_pred)))) 547 | scores.append(score) 548 | 549 | scores = np.array(scores) 550 | best_5 = np.argsort(scores)[-5:] 551 | best_5_s = scores[best_5] 552 | for x,y in zip(best_5,best_5_s): 553 | print('%.2f %.4f' %(0.01*x,y)) 554 | max_thre = np.mean(best_5)*0.01 555 | ##----------------------------------------------------------------------------- 556 | 557 | ''' 558 | print('predict') 559 | # raw_data = raw_data.reset_index() 560 | # raw_data.rename(columns={'index':'instance_id'},inplace=True) 561 | print('-- get final train sample') 562 | data = k_fold_stat_features(raw_data) 563 | data = data[cols] 564 | #data.to_csv('data.csv', index=False) 565 | train_X,train_y = get_x_y(data) 566 | print('-- final training ') 567 | del train,vali 568 | gc.collect() 569 | model_,best_iter_ = runLGBCV(train_X, train_y,num_rounds=best_iter) 570 | print('best_iteration:',best_iter) 571 | 572 | 573 | print('---- predict') 574 | predict_start = time() 575 | print('-- get test sample') 576 | # raw_testa = get_index(raw_testa) 577 | test = text_features(raw_testa) 578 | test = stat_features(raw_data,test) 579 | #test.to_csv('test.csv', index=False) 580 | test_X,_ = 
get_x_y(test) 581 | test_pred = model_.predict(test_X) 582 | print('-- process to get result') 583 | test_y = pd.Series(list(map(one_zero2,test_pred,[max_thre]*len(test_pred)))) 584 | test_y.to_csv(test_result_dir,header=None,index=None) 585 | 586 | 587 | print('print result') 588 | for x,y in zip(best_5,best_5_s): 589 | print('threshold: %.2f f1 score: %.4f' %(0.01*x,y)) 590 | print('best iteration:', best_iter) 591 | result_analysis(test_pred) 592 | 593 | ''' -------------------------------------------------------------------------------- /chusai/data_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import numpy as np\n", 14 | "import csv\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "import datetime\n", 17 | "import math\n", 18 | "from datetime import *\n", 19 | "import gensim\n", 20 | "from gensim.models.doc2vec import Doc2Vec\n", 21 | "\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "\n", 34 | "RangeIndex: 50000 entries, 0 to 49999\n", 35 | "Data columns (total 5 columns):\n", 36 | "prefix 50000 non-null object\n", 37 | "query_prediction 49042 non-null object\n", 38 | "title 50000 non-null object\n", 39 | "tag 50000 non-null object\n", 40 | "label 50000 non-null int64\n", 41 | "dtypes: int64(1), object(4)\n", 42 | "memory usage: 1.9+ MB\n", 43 | "None\n", 44 | "\n", 45 | "RangeIndex: 50000 entries, 0 to 49999\n", 46 | "Data columns (total 5 columns):\n", 47 | "prefix 50000 non-null object\n", 48 | "query_prediction 49036 non-null object\n", 49 | "title 50000 non-null object\n", 50 | "tag 50000 non-null object\n", 51 | "label 0 non-null float64\n", 52 | "dtypes: float64(1), object(4)\n", 53 | "memory usage: 1.9+ MB\n", 54 | "None\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "train_df = pd.read_table('../data/oppo_round1_train_20180926.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 60 | "valid_df = pd.read_table('../data/oppo_round1_vali_20180926.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 61 | "test_df = pd.read_table('../data/oppo_round1_test_A_20180926.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 62 | "print(valid_df.info())\n", 63 | "print(test_df.info())\n", 64 | "\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "37248\n", 77 | "0.38815506872852235\n", 78 | "0.3747345\n", 79 | "0.3778705636743215\n", 80 | "0.37626\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "print(len(train_df[train_df.query_prediction.isnull()]))\n", 86 | "print(np.mean(train_df[train_df.query_prediction.isnull()]['label']))\n", 87 | "print(np.mean(train_df['label']))\n", 88 | "print(np.mean(valid_df[valid_df.query_prediction.isnull()]['label']))\n", 89 | "print(np.mean(valid_df['label']))\n", 90 | "\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 2, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": 
"stream", 101 | "text": [ 102 | "\n", 103 | "RangeIndex: 250000 entries, 0 to 249999\n", 104 | "Data columns (total 5 columns):\n", 105 | "prefix 250000 non-null object\n", 106 | "query_prediction 250000 non-null object\n", 107 | "title 250000 non-null object\n", 108 | "tag 250000 non-null object\n", 109 | "label 0 non-null float64\n", 110 | "dtypes: float64(1), object(4)\n", 111 | "memory usage: 9.5+ MB\n", 112 | "None\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "train_df = pd.read_table('../data/oppo_round1_train_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 118 | "valid_df = pd.read_table('../data/oppo_round1_vali_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 119 | "train_df = pd.concat([train_df, valid_df])\n", 120 | "train_df.reset_index(inplace=True)\n", 121 | "train_df['index'] = train_df.index\n", 122 | "\n", 123 | "test_df = pd.read_table('../data/oppo_round1_test_B_20181106.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 124 | "# test_df = pd.read_table('../data/oppo_round1_vali_20180926.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 125 | "print(test_df.info())\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "228115\n", 138 | "69080\n", 139 | "162754\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "train_prefix_set = set(train_df['prefix'])\n", 145 | "\n", 146 | "test_df['is_prefix_in_train'] = test_df['prefix'].map(lambda x : 1 if x in train_prefix_set else 0)\n", 147 | "print(len(test_df[test_df.is_prefix_in_train == 1]))\n", 148 | "print(len(set(test_df['prefix'])))\n", 149 | "print(len(train_prefix_set))\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 31, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "\n", 162 | "RangeIndex: 2000000 entries, 0 to 1999999\n", 163 | "Data columns (total 5 columns):\n", 164 | "prefix object\n", 165 | "query_prediction object\n", 166 | "title object\n", 167 | "tag object\n", 168 | "label int64\n", 169 | "dtypes: int64(1), object(4)\n", 170 | "memory usage: 76.3+ MB\n", 171 | "None\n", 172 | " prefix query_prediction title \\\n", 173 | "0 小品 {\"小品大全宋小宝\": \"0.009\", \"小品相亲\": \"0.012\", \"小品剧本\": ... 小品 \n", 174 | "1 1368 {\"13688cc赛马会\": \"0.059\", \"13685367892\": \"0.124\"... HCG大于1368%2C正常吗 \n", 175 | "2 1368 {\"13688cc赛马会\": \"0.059\", \"13685367892\": \"0.124\"... 1368年 \n", 176 | "3 银耳 {\"银耳汤的功效\": \"0.012\", \"银耳为什么不能天天吃\": \"0.009\", \"银耳... 银耳红枣汤的做法 \n", 177 | "4 月经量少 {\"月经量少喝红糖水好吗\": \"0.010\", \"月经量少该怎么调理\": \"0.016\", ... 
月经量少怎么调理 \n", 178 | "\n", 179 | " tag label \n", 180 | "0 阅读 0 \n", 181 | "1 健康 0 \n", 182 | "2 百科 1 \n", 183 | "3 菜谱 1 \n", 184 | "4 百科 0 \n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "train_df = pd.read_table('../data/oppo_round1_train_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 190 | "print(train_df.info())\n", 191 | "print(train_df.head())\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 32, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | " query_prediction prefix\n", 204 | "0 {\"小品大全宋小宝\": \"0.009\", \"小品相亲\": \"0.012\", \"小品剧本\": ... 小品\n", 205 | "1 {\"13688cc赛马会\": \"0.059\", \"13685367892\": \"0.124\"... 1368\n", 206 | "2 {\"13688cc赛马会\": \"0.059\", \"13685367892\": \"0.124\"... 1368\n", 207 | "3 {\"银耳汤的功效\": \"0.012\", \"银耳为什么不能天天吃\": \"0.009\", \"银耳... 银耳\n", 208 | "4 {\"月经量少喝红糖水好吗\": \"0.010\", \"月经量少该怎么调理\": \"0.016\", ... 月经量少\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "train_df['query_prediction_dict'] = train_df['query_prediction'].astype(str).map(lambda x : eval(x))\n", 214 | "print(train_df[['query_prediction', 'prefix']].head())\n", 215 | "\n", 216 | "\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 33, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | " query_prediction_values_list query_prediction_len\n", 229 | "0 [0.009, 0.012, 0.020, 0.066, 0.007, 0.010, 0.1... 10\n", 230 | "1 [0.059, 0.124, 0.029, 0.070, 0.022, 0.042, 0.0... 9\n", 231 | "2 [0.059, 0.124, 0.029, 0.070, 0.022, 0.042, 0.0... 9\n", 232 | "3 [0.012, 0.009, 0.050, 0.045, 0.053, 0.014, 0.0... 10\n", 233 | "4 [0.010, 0.016, 0.009, 0.009, 0.569, 0.016, 0.0... 
10\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "train_df['query_prediction_values_list'] = train_df['query_prediction_dict'].map(lambda x : list(x.values()))\n", 239 | "train_df['query_prediction_keys_list'] = train_df['query_prediction_dict'].map(lambda x : list(x.keys()))\n", 240 | "train_df['query_prediction_len'] = train_df['query_prediction_values_list'].map(lambda x : len(x))\n", 241 | "print(train_df[['query_prediction_values_list', 'query_prediction_len']].head())\n", 242 | "\n" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 23, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | " label\n", 255 | "query_prediction_len \n", 256 | "0 6926\n", 257 | "1 13713\n", 258 | "2 14593\n", 259 | "3 11998\n", 260 | "4 11814\n", 261 | "5 13207\n", 262 | "6 12399\n", 263 | "7 14982\n", 264 | "8 128500\n", 265 | "9 299634\n", 266 | "10 1472234\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "query_prediction_len_pivot_table = pd.pivot_table(train_df, index='query_prediction_len', values='label', aggfunc=len)\n", 272 | "print(query_prediction_len_pivot_table)\n", 273 | "\n", 274 | "\n" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 34, 280 | "metadata": { 281 | "scrolled": true 282 | }, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | " prefix prefix_number\n", 289 | "0 # 41\n", 290 | "1 % 33\n", 291 | "2 & 14\n", 292 | "3 (女人 1\n", 293 | "4 *p++ 1\n", 294 | " prefix\n", 295 | "prefix_number \n", 296 | "1 45173\n", 297 | "2 36675\n", 298 | "3 17138\n", 299 | "4 10095\n", 300 | "5 7165\n", 301 | "6 5136\n", 302 | "7 3884\n", 303 | "8 3043\n", 304 | "9 2536\n", 305 | "10 2123\n", 306 | "11 1762\n", 307 | "12 1555\n", 308 | "13 1354\n", 309 | "14 1215\n", 310 | "15 1041\n", 311 | "16 901\n", 312 | "17 795\n", 313 | "18 753\n", 314 | "19 682\n", 315 | "20 659\n", 316 | "21 540\n", 317 | "22 519\n", 318 | "23 485\n", 319 | "24 460\n", 320 | "25 405\n", 321 | "26 352\n", 322 | "27 364\n", 323 | "28 330\n", 324 | "29 318\n", 325 | "30 276\n", 326 | "... 
...\n", 327 | "2813 1\n", 328 | "2834 1\n", 329 | "3033 1\n", 330 | "3158 1\n", 331 | "3323 1\n", 332 | "3382 1\n", 333 | "3437 1\n", 334 | "3498 1\n", 335 | "3722 1\n", 336 | "3760 1\n", 337 | "3833 1\n", 338 | "3892 1\n", 339 | "4494 1\n", 340 | "4500 1\n", 341 | "4564 1\n", 342 | "4581 1\n", 343 | "4851 1\n", 344 | "4955 1\n", 345 | "4991 1\n", 346 | "5171 1\n", 347 | "5295 1\n", 348 | "5363 1\n", 349 | "5379 1\n", 350 | "7039 1\n", 351 | "8400 1\n", 352 | "8469 1\n", 353 | "8509 1\n", 354 | "9580 1\n", 355 | "11151 1\n", 356 | "20066 1\n", 357 | "\n", 358 | "[823 rows x 1 columns]\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "prefix_pivot_table = pd.pivot_table(train_df, index='prefix', values='label', aggfunc=len)\n", 364 | "prefix_pivot_table.reset_index(inplace=True)\n", 365 | "prefix_pivot_table.rename(columns={'label' : 'prefix_number'}, inplace=True)\n", 366 | "prefix_number_pivot_table = pd.pivot_table(prefix_pivot_table, index='prefix_number', values='prefix', aggfunc=len)\n", 367 | "print(prefix_pivot_table.head())\n", 368 | "print(prefix_number_pivot_table)\n", 369 | "\n" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 35, 375 | "metadata": { 376 | "scrolled": true 377 | }, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | " query_prediction query_prediction_number\n", 384 | "0 {\"#*\": \"0.065\", \"#什么意思\": \"0.065\", \"#n/a是什么意思\":... 41\n", 385 | "1 {\"%2c...8一\": \"0.053\", \"%2c\": \"0.044\"} 33\n", 386 | "2 {\" \": \"0.022\", \"&怎么读\": \"0.104\", \"&g90tvk75... 14\n", 387 | "3 {\"- 千千音乐\": \"0.079\", \"- 百度\": \"0.274\", \"- 魔镜原创摄影... 1\n", 388 | "4 {\". 新闻\": \"0.008\", \". 新闻美女丑事\": \"0.012\", \". 新闻 新... 9\n", 389 | " query_prediction\n", 390 | "query_prediction_number \n", 391 | "1 42779\n", 392 | "2 35024\n", 393 | "3 16782\n", 394 | "4 10001\n", 395 | "5 7121\n", 396 | "6 5142\n", 397 | "7 3873\n", 398 | "8 3047\n", 399 | "9 2540\n", 400 | "10 2121\n", 401 | "11 1771\n", 402 | "12 1558\n", 403 | "13 1344\n", 404 | "14 1216\n", 405 | "15 1043\n", 406 | "16 901\n", 407 | "17 796\n", 408 | "18 753\n", 409 | "19 684\n", 410 | "20 656\n", 411 | "21 539\n", 412 | "22 521\n", 413 | "23 487\n", 414 | "24 460\n", 415 | "25 409\n", 416 | "26 351\n", 417 | "27 363\n", 418 | "28 330\n", 419 | "29 317\n", 420 | "30 276\n", 421 | "... 
...\n", 422 | "2834 1\n", 423 | "3033 1\n", 424 | "3158 1\n", 425 | "3323 1\n", 426 | "3382 1\n", 427 | "3437 1\n", 428 | "3498 1\n", 429 | "3722 1\n", 430 | "3760 1\n", 431 | "3833 1\n", 432 | "3892 1\n", 433 | "4494 1\n", 434 | "4500 1\n", 435 | "4564 1\n", 436 | "4581 1\n", 437 | "4851 1\n", 438 | "4955 1\n", 439 | "4991 1\n", 440 | "5171 1\n", 441 | "5295 1\n", 442 | "5363 1\n", 443 | "5379 1\n", 444 | "6926 1\n", 445 | "7039 1\n", 446 | "8400 1\n", 447 | "8469 1\n", 448 | "8509 1\n", 449 | "9580 1\n", 450 | "11151 1\n", 451 | "20066 1\n", 452 | "\n", 453 | "[824 rows x 1 columns]\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "query_prediction_pivot_table = pd.pivot_table(train_df, index='query_prediction', values='label', aggfunc=len)\n", 459 | "query_prediction_pivot_table.reset_index(inplace=True)\n", 460 | "query_prediction_pivot_table.rename(columns={'label' : 'query_prediction_number'}, inplace=True)\n", 461 | "query_prediction_number_pivot_table = pd.pivot_table(query_prediction_pivot_table, index='query_prediction_number', values='query_prediction', aggfunc=len)\n", 462 | "print(query_prediction_pivot_table.head())\n", 463 | "print(query_prediction_number_pivot_table)\n", 464 | "\n" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 36, 470 | "metadata": { 471 | "scrolled": true 472 | }, 473 | "outputs": [ 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": [ 478 | " prefix\n", 479 | "prefix_query_number \n", 480 | "1 45173\n", 481 | "2 36675\n", 482 | "3 17138\n", 483 | "4 10095\n", 484 | "5 7165\n", 485 | "6 5136\n", 486 | "7 3884\n", 487 | "8 3043\n", 488 | "9 2536\n", 489 | "10 2123\n", 490 | "11 1762\n", 491 | "12 1555\n", 492 | "13 1354\n", 493 | "14 1215\n", 494 | "15 1041\n", 495 | "16 901\n", 496 | "17 795\n", 497 | "18 753\n", 498 | "19 682\n", 499 | "20 659\n", 500 | "21 540\n", 501 | "22 519\n", 502 | "23 485\n", 503 | "24 460\n", 504 | "25 405\n", 505 | "26 352\n", 506 | "27 364\n", 507 | "28 330\n", 508 | "29 318\n", 509 | "30 276\n", 510 | "... 
...\n", 511 | "2813 1\n", 512 | "2834 1\n", 513 | "3033 1\n", 514 | "3158 1\n", 515 | "3323 1\n", 516 | "3382 1\n", 517 | "3437 1\n", 518 | "3498 1\n", 519 | "3722 1\n", 520 | "3760 1\n", 521 | "3833 1\n", 522 | "3892 1\n", 523 | "4494 1\n", 524 | "4500 1\n", 525 | "4564 1\n", 526 | "4581 1\n", 527 | "4851 1\n", 528 | "4955 1\n", 529 | "4991 1\n", 530 | "5171 1\n", 531 | "5295 1\n", 532 | "5363 1\n", 533 | "5379 1\n", 534 | "7039 1\n", 535 | "8400 1\n", 536 | "8469 1\n", 537 | "8509 1\n", 538 | "9580 1\n", 539 | "11151 1\n", 540 | "20066 1\n", 541 | "\n", 542 | "[823 rows x 1 columns]\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "prefix_query_pivot_table = pd.pivot_table(train_df, index=['prefix', 'query_prediction'], values='label', aggfunc=len)\n", 548 | "prefix_query_pivot_table.reset_index(inplace=True)\n", 549 | "prefix_query_pivot_table.rename(columns={'label' : 'prefix_query_number'}, inplace=True)\n", 550 | "prefix_query_number_query_pivot_table = pd.pivot_table(prefix_query_pivot_table, index='prefix_query_number', values='prefix', aggfunc=len)\n", 551 | "print(prefix_query_number_query_pivot_table)\n" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 38, 557 | "metadata": {}, 558 | "outputs": [ 559 | { 560 | "name": "stdout", 561 | "output_type": "stream", 562 | "text": [ 563 | "\n", 564 | "RangeIndex: 50000 entries, 0 to 49999\n", 565 | "Data columns (total 5 columns):\n", 566 | "prefix 50000 non-null object\n", 567 | "query_prediction 50000 non-null object\n", 568 | "title 50000 non-null object\n", 569 | "tag 50000 non-null object\n", 570 | "label 50000 non-null int64\n", 571 | "dtypes: int64(1), object(4)\n", 572 | "memory usage: 1.9+ MB\n", 573 | "None\n", 574 | " prefix query_prediction title tag label\n", 575 | "0 重庆旅游 {\"重庆旅游景区\": \"0.018\", \"重庆旅游攻略\": \"0.373\", \"重庆旅游景点... 皇包车旅行 应用 0\n", 576 | "1 婆婆来了 {\"婆婆来了大结局\": \"0.021\", \"婆婆来了电视剧\": \"0.100\", \"婆婆来了... 搜狐视频 应用 0\n", 577 | "2 张国荣 {\"张国荣遗体很恐怖\": \"0.020\", \"张国荣头像\": \"0.013\", \"张国荣24... 张国荣 歌手 0\n", 578 | "3 陌陌 {\"陌陌晒奶\": \"0.008\", \"陌陌下载安装\": \"0.009\", \"陌陌交友\": \"... 陌陌 百科 0\n", 579 | "4 畅游 {\"畅游阁\": \"0.322\", \"畅游六零年代\": \"0.134\", \"畅游书城82212... 李畅游 百科 0\n" 580 | ] 581 | } 582 | ], 583 | "source": [ 584 | "valid_df = pd.read_table('../data/oppo_round1_vali_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)\n", 585 | "print(valid_df.info())\n", 586 | "print(valid_df.head())\n" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 39, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "name": "stdout", 596 | "output_type": "stream", 597 | "text": [ 598 | " query_prediction title\n", 599 | "0 {\"重庆旅游景区\": \"0.018\", \"重庆旅游攻略\": \"0.373\", \"重庆旅游景点... 皇包车旅行\n", 600 | "1 {\"婆婆来了大结局\": \"0.021\", \"婆婆来了电视剧\": \"0.100\", \"婆婆来了... 搜狐视频\n", 601 | "2 {\"张国荣遗体很恐怖\": \"0.020\", \"张国荣头像\": \"0.013\", \"张国荣24... 张国荣\n", 602 | "3 {\"陌陌晒奶\": \"0.008\", \"陌陌下载安装\": \"0.009\", \"陌陌交友\": \"... 陌陌\n", 603 | "4 {\"畅游阁\": \"0.322\", \"畅游六零年代\": \"0.134\", \"畅游书城82212... 
李畅游\n" 604 | ] 605 | } 606 | ], 607 | "source": [ 608 | "valid_df['query_prediction'] = valid_df['query_prediction'].astype(str)\n", 609 | "print(valid_df[['query_prediction', 'title']].head())\n", 610 | "\n" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 41, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "name": "stdout", 620 | "output_type": "stream", 621 | "text": [ 622 | "111910\n" 623 | ] 624 | } 625 | ], 626 | "source": [ 627 | "prefix_repeat_set = set(prefix_pivot_table['prefix'][prefix_pivot_table.prefix_number > 1])\n", 628 | "print(len(prefix_repeat_set))\n", 629 | "\n" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 42, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "name": "stdout", 639 | "output_type": "stream", 640 | "text": [ 641 | " prefix is_repeat_prefix\n", 642 | "0 小品 1\n", 643 | "1 1368 1\n", 644 | "2 1368 1\n", 645 | "3 银耳 1\n", 646 | "4 月经量少 1\n" 647 | ] 648 | } 649 | ], 650 | "source": [ 651 | "train_df['is_repeat_prefix'] = train_df['prefix'].map(lambda x : 1 if x in prefix_repeat_set else 0)\n", 652 | "print(train_df[['prefix', 'is_repeat_prefix']].head())\n", 653 | "\n" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "metadata": { 660 | "collapsed": true 661 | }, 662 | "outputs": [], 663 | "source": [ 664 | "\n", 665 | "\n" 666 | ] 667 | } 668 | ], 669 | "metadata": { 670 | "kernelspec": { 671 | "display_name": "Python 3", 672 | "language": "python", 673 | "name": "python3" 674 | }, 675 | "language_info": { 676 | "codemirror_mode": { 677 | "name": "ipython", 678 | "version": 3 679 | }, 680 | "file_extension": ".py", 681 | "mimetype": "text/x-python", 682 | "name": "python", 683 | "nbconvert_exporter": "python", 684 | "pygments_lexer": "ipython3", 685 | "version": "3.6.1" 686 | } 687 | }, 688 | "nbformat": 4, 689 | "nbformat_minor": 2 690 | } 691 | -------------------------------------------------------------------------------- /chusai/oppo_model_lgb_online.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np \n", 12 | "import pandas as pd\n", 13 | "import time\n", 14 | "import datetime\n", 15 | "import gc\n", 16 | "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n", 17 | "from sklearn.model_selection import StratifiedKFold\n", 18 | "from sklearn.metrics import roc_auc_score, log_loss\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", 21 | "from sklearn.feature_extraction.text import CountVectorizer\n", 22 | "from sklearn.feature_selection import chi2, SelectPercentile\n", 23 | "import math\n", 24 | "from sklearn.metrics import f1_score\n", 25 | "import jieba\n", 26 | "import jieba.posseg as psg\n", 27 | "from collections import Counter\n", 28 | "import functools\n", 29 | "from gensim.models import word2vec\n", 30 | "import Levenshtein\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "train_dataset = pd.read_csv('../temp/train_online_df.csv')\n", 42 | "test_dataset = pd.read_csv('../temp/test_online_df.csv')\n", 43 | "\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": { 50 | 
"collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "fea = [\n", 55 | " 'query_prediction_number', 'query_prediction_max', 'query_prediction_min', 'query_prediction_mean', 'query_prediction_std',\n", 56 | " 'prefix_count', 'prefix_rate',\n", 57 | " 'title_count', 'title_rate', 'tag_count', 'tag_rate',\n", 58 | " 'query_prediction_count', 'query_prediction_rate', 'prefix_title_count',\n", 59 | " 'prefix_title_rate', 'prefix_tag_count', 'prefix_tag_rate',\n", 60 | " 'title_tag_count', 'title_tag_rate',\n", 61 | " 'prefix_click_number', 'title_click_number', 'query_prediction_click_number', 'prefix_tag_click_number', \n", 62 | " 'prefix_title_click_number', 'title_tag_click_number',\n", 63 | " 'is_title_in_query', 'is_prefix_in_title', \n", 64 | " 'title_tag_types', 'prefix_tag_types', 'tag_title_types', 'tag_prefix_types',\n", 65 | " 'title_prefix_types', 'prefix_title_types', 'tag_query_prediction_types', 'title_query_prediction_types',\n", 66 | " 'prefix_len', 'title_len',\n", 67 | " 'query_prediction_key_len_max', 'query_prediction_key_len_min',\n", 68 | " 'query_prediction_key_len_mean', 'query_prediction_key_len_std',\n", 69 | " 'len_title-prefix', 'len_prefix/title', 'len_mean-title', 'len_mean/title',\n", 70 | " 'q_t_word_match', 'q_t_jaccard', 'q_t_common_words',\n", 71 | " 'q_t_total_unique_words', 'q_t_wc_diff', 'q_t_wc_ratio',\n", 72 | " 'q_t_wc_diff_unique', 'q_t_wc_ratio_unique', 'q_t_tfidf_word_match_share',\n", 73 | " 'p_t_word_match', 'p_t_jaccard', 'p_t_common_words',\n", 74 | " 'p_t_total_unique_words', 'p_t_wc_diff', 'p_t_wc_ratio',\n", 75 | " 'p_t_wc_diff_unique', 'p_t_wc_ratio_unique', 'p_t_tfidf_word_match_share',\n", 76 | " 'p_q_word_match', 'p_q_jaccard', 'p_q_common_words',\n", 77 | " 'p_q_total_unique_words', 'p_q_wc_diff', 'p_q_wc_ratio',\n", 78 | " 'p_q_wc_diff_unique', 'p_q_wc_ratio_unique', 'p_q_tfidf_word_match_share',\n", 79 | " 'title_prefix_dot_similarity',\n", 80 | " 'title_query_dot_similarity', 'title_prefix_norm_similarity',\n", 81 | " 'title_query_norm_similarity', 'title_prefix_cosine_similarity',\n", 82 | " 'title_query_cosine_similarity',\n", 83 | " 'title_query_dot_similarity_max', 'title_query_dot_similarity_min',\n", 84 | " 'title_query_dot_similarity_mean', 'title_query_dot_similarity_std',\n", 85 | " 'title_query_norm_similarity_min', 'title_query_norm_similarity_mean',\n", 86 | " 'title_query_norm_similarity_std', 'title_prefix_cosine_similarity',\n", 87 | " 'title_query_cosine_similarity_max', 'title_query_cosine_similarity_min',\n", 88 | " 'title_query_cosine_similarity_mean', 'title_query_cosine_similarity_std',\n", 89 | " 'title_prefix_leven', 'title_prefix_leven_rate',\n", 90 | " 'title_query_leven_sum', 'title_query_leven_max', 'title_query_leven_min',\n", 91 | " 'title_query_leven_mean', 'title_query_leven_std',\n", 92 | " ]\n", 93 | "\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": { 100 | "scrolled": true 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stderr", 105 | "output_type": "stream", 106 | "text": [ 107 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:99: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n", 108 | " warnings.warn(\"Found `{}` in params. 
Will use it instead of argument\".format(alias))\n" 109 | ] 110 | }, 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "0.3820973024838574\n", 116 | " importance\n", 117 | "prefix_title_rate 8445\n", 118 | "prefix_tag_rate 6665\n", 119 | "prefix_rate 5300\n", 120 | "title_tag_rate 3884\n", 121 | "query_prediction_rate 2738\n", 122 | "title_rate 2670\n", 123 | "tag_rate 2659\n", 124 | "prefix_click_number 2436\n", 125 | "prefix_title_count 1948\n", 126 | "prefix_title_click_number 1867\n", 127 | "title_tag_count 1783\n", 128 | "tag_count 1709\n", 129 | "prefix_title_types 1597\n", 130 | "q_t_word_match 1499\n", 131 | "prefix_tag_count 1430\n", 132 | "p_t_tfidf_word_match_share 1359\n", 133 | "title_query_norm_similarity_std 1337\n", 134 | "prefix_tag_click_number 1293\n", 135 | "query_prediction_click_number 1138\n", 136 | "title_tag_click_number 1074\n", 137 | "title_query_norm_similarity 1047\n", 138 | "title_query_norm_similarity_min 998\n", 139 | "query_prediction_key_len_mean 994\n", 140 | "q_t_tfidf_word_match_share 964\n", 141 | "title_query_leven_min 960\n", 142 | "prefix_count 932\n", 143 | "tag_title_types 925\n", 144 | "prefix_tag_types 922\n", 145 | "title_query_dot_similarity 871\n", 146 | "title_query_leven_sum 847\n", 147 | "... ...\n", 148 | "title_query_cosine_similarity_std 356\n", 149 | "p_q_wc_ratio 355\n", 150 | "q_t_wc_diff 314\n", 151 | "p_q_wc_diff 288\n", 152 | "p_t_total_unique_words 253\n", 153 | "p_q_word_match 217\n", 154 | "p_t_word_match 210\n", 155 | "prefix_len 208\n", 156 | "title_query_prediction_types 201\n", 157 | "is_title_in_query 188\n", 158 | "q_t_wc_diff_unique 182\n", 159 | "q_t_total_unique_words 181\n", 160 | "p_t_wc_ratio 175\n", 161 | "p_q_total_unique_words 170\n", 162 | "p_t_wc_ratio_unique 157\n", 163 | "len_title-prefix 151\n", 164 | "p_q_wc_diff_unique 149\n", 165 | "query_prediction_key_len_min 148\n", 166 | "title_prefix_leven 136\n", 167 | "p_t_wc_diff 122\n", 168 | "tag_prefix_types 120\n", 169 | "p_q_common_words 94\n", 170 | "query_prediction_number 90\n", 171 | "p_t_common_words 83\n", 172 | "p_t_wc_diff_unique 82\n", 173 | "p_t_jaccard 0\n", 174 | "q_t_jaccard 0\n", 175 | "title_prefix_cosine_similarity 0\n", 176 | "tag_query_prediction_types 0\n", 177 | "p_q_jaccard 0\n", 178 | "\n", 179 | "[97 rows x 1 columns]\n" 180 | ] 181 | }, 182 | { 183 | "name": "stderr", 184 | "output_type": "stream", 185 | "text": [ 186 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:18: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "lgb_model = lgb.LGBMClassifier(\n", 192 | " boosting_type='gbdt', num_leaves=127, max_depth=-1, n_estimators=5000, objective='binary',\n", 193 | " subsample=0.8, colsample_bytree=1, subsample_freq=1,\n", 194 | " learning_rate=0.01, random_state=2018, n_jobs=-1, num_boost_round=700\n", 195 | ")\n", 196 | "\n", 197 | "test_dataset['predicted_score'] = 0\n", 198 | "\n", 199 | "# lgb_model.fit(train_df[fea], train_df['label'], eval_set=[(train_df[fea], train_df['label']),\n", 200 | "# (valid_df[fea], valid_df['label'])], early_stopping_rounds=50, eval_metric='auc')\n", 201 | "lgb_model.fit(train_dataset[fea], train_dataset['label'], eval_metric='auc')\n", 202 | "test_pred = lgb_model.predict_proba(test_dataset[fea], num_iteration=700)[:, 1]\n", 203 | "print(np.mean(test_pred))\n", 204 | "\n", 205 | "fscore = lgb_model.booster_.feature_importance()\n", 206 | "feaNames = 
lgb_model.booster_.feature_name()\n", 207 | "scoreDf = pd.DataFrame(index=feaNames, columns=['importance'], data=fscore)\n", 208 | "print(scoreDf.sort_index(by=['importance'], ascending=False))\n", 209 | "\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 5, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "0.37606418001418024\n", 222 | "0.4446379748502132\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "test_dataset['predicted_score'] = test_pred\n", 228 | "\n", 229 | "train_prefix_set = set(train_dataset['prefix'])\n", 230 | "\n", 231 | "test_dataset['is_prefix_in_train'] = test_dataset['prefix'].map(lambda x : 1 if x in train_prefix_set else 0)\n", 232 | "print(np.mean(test_dataset[test_dataset.is_prefix_in_train == 1]['predicted_score']))\n", 233 | "print(np.mean(test_dataset[test_dataset.is_prefix_in_train == 0]['predicted_score']))\n", 234 | "\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 27, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "0.4004078857919782\n", 247 | "0.4010002273243919\n" 248 | ] 249 | }, 250 | { 251 | "name": "stderr", 252 | "output_type": "stream", 253 | "text": [ 254 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", 255 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 256 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 257 | "\n", 258 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 259 | " after removing the cwd from sys.path.\n", 260 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: \n", 261 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 262 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 263 | "\n", 264 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 265 | " \n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "test_df_copy = test_dataset.copy()\n", 271 | "\n", 272 | "test_df_1 = test_df_copy[test_df_copy.is_prefix_in_train == 1]\n", 273 | "test_df_1['predict_label'] = test_df_1['predicted_score'].map(lambda x : 1 if x > 0.395 else 0)\n", 274 | "print(np.mean(test_df_1['predict_label']))\n", 275 | "\n", 276 | "test_df_0 = test_df_copy[test_df_copy.is_prefix_in_train == 0]\n", 277 | "test_df_0['predict_label'] = test_df_0['predicted_score'].map(lambda x : 1 if x > 0.5115 else 0)\n", 278 | "print(np.mean(test_df_0['predict_label']))\n", 279 | "\n" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 28, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "0.4004078857919782\n", 292 | "0.4010002273243919\n", 293 | "0.40046\n" 294 | ] 295 | }, 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n", 301 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 302 | "\n", 303 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 304 | " This is separate 
from the ipykernel package so we can avoid doing imports until\n", 305 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", 306 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 307 | "\n", 308 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 309 | " after removing the cwd from sys.path.\n" 310 | ] 311 | } 312 | ], 313 | "source": [ 314 | "test_df_copy['predict_label'] = 0\n", 315 | "\n", 316 | "test_df_copy['predict_label'][test_df_copy.is_prefix_in_train == 1] = test_df_1['predict_label']\n", 317 | "test_df_copy['predict_label'][test_df_copy.is_prefix_in_train == 0] = test_df_0['predict_label']\n", 318 | "print(np.mean(test_df_copy[test_df_copy.is_prefix_in_train == 1]['predict_label']))\n", 319 | "print(np.mean(test_df_copy[test_df_copy.is_prefix_in_train == 0]['predict_label']))\n", 320 | "print(np.mean(test_df_copy['predict_label']))\n", 321 | "\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 29, 327 | "metadata": { 328 | "collapsed": true 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "# 导出预测结果\n", 333 | "def exportResult(df, fileName):\n", 334 | " df.to_csv('../result/%s.csv' % fileName, header=False, index=False)\n", 335 | "\n", 336 | "exportResult(test_df_copy[['predict_label']], 'lgb_wen_11_1')\n", 337 | "\n" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 35, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "0.41028\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "# test_df_copy['predict_label'] = test_df_copy['predicted_score'].map(lambda x : 1 if x > 0.394 else 0)\n", 355 | "# print(np.mean(test_df_copy['predict_label']))\n", 356 | "\n" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 36, 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "# exportResult(test_df_copy[['predict_label']], 'lgb_wen_10_30')\n" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 6, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "original mean : 0.37606418001418024\n", 380 | "0.4446454488950832\n" 381 | ] 382 | } 383 | ], 384 | "source": [ 385 | "test_prefix0_df = test_dataset[test_dataset.is_prefix_in_train == 1].copy()\n", 386 | "\n", 387 | "#定义调整函数\n", 388 | "def resultAdjustment(result_df, t):\n", 389 | " result_df_temp = result_df.copy()\n", 390 | " result_df_temp['x'] = result_df_temp.predicted_score.map(lambda x: -(math.log(((1 - x) / x), math.e)))\n", 391 | " result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) \n", 392 | " print(result_df_temp['adjust_result'].mean())\n", 393 | " return result_df_temp['adjust_result']\n", 394 | "\n", 395 | "print('original mean : ', test_prefix0_df['predicted_score'].mean())\n", 396 | "test_df_after = resultAdjustment(test_prefix0_df, 0.49985)\n", 397 | "\n", 398 | "\n" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 7, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "name": "stdout", 408 | "output_type": "stream", 409 | "text": [ 410 | "0.4446379748502132\n", 411 | "0.4446454488950832\n" 412 | ] 413 | }, 414 | { 415 | "name": "stderr", 416 | "output_type": "stream", 417 | "text": [ 418 | 
"/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 419 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 420 | "\n", 421 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 422 | " \"\"\"Entry point for launching an IPython kernel.\n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 1] = test_df_after\n", 428 | "print(np.mean(test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 0]))\n", 429 | "print(np.mean(test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 1]))\n" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 9, 435 | "metadata": { 436 | "collapsed": true 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "# # 导出预测结果\n", 441 | "# def exportScore(df, fileName):\n", 442 | "# df.to_csv('../result/%s.csv' % fileName, header=True, index=False)\n", 443 | "\n", 444 | "# exportScore(test_dataset[['is_prefix_in_train', 'predicted_score']], 'keng_score')\n" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 11, 450 | "metadata": {}, 451 | "outputs": [ 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "0.40544\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "test_dataset['predicted_label'] = test_dataset['predicted_score'].map(lambda x : 1 if x > 0.509 else 0)\n", 462 | "print(np.mean(test_dataset['predicted_label']))\n" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 12, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "# 导出预测结果\n", 474 | "def exportResult(df, fileName):\n", 475 | " df.to_csv('../result/%s.csv' % fileName, header=False, index=False)\n", 476 | "\n", 477 | "exportResult(test_dataset[['predicted_label']], 'lgb_keng_11_2')\n", 478 | "\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 23, 484 | "metadata": { 485 | "scrolled": true 486 | }, 487 | "outputs": [ 488 | { 489 | "name": "stderr", 490 | "output_type": "stream", 491 | "text": [ 492 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:99: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n", 493 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n" 494 | ] 495 | }, 496 | { 497 | "name": "stdout", 498 | "output_type": "stream", 499 | "text": [ 500 | "test mean: 0.38187212610870797\n" 501 | ] 502 | }, 503 | { 504 | "name": "stderr", 505 | "output_type": "stream", 506 | "text": [ 507 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:99: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n", 508 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n" 509 | ] 510 | }, 511 | { 512 | "name": "stdout", 513 | "output_type": "stream", 514 | "text": [ 515 | "test mean: 0.3810200871274055\n" 516 | ] 517 | }, 518 | { 519 | "name": "stderr", 520 | "output_type": "stream", 521 | "text": [ 522 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:99: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n", 523 | " warnings.warn(\"Found `{}` in params. 
Will use it instead of argument\".format(alias))\n" 524 | ] 525 | }, 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "test mean: 0.3818233445511438\n" 531 | ] 532 | }, 533 | { 534 | "name": "stderr", 535 | "output_type": "stream", 536 | "text": [ 537 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:99: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n", 538 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n" 539 | ] 540 | }, 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "test mean: 0.38186220531599846\n" 546 | ] 547 | }, 548 | { 549 | "name": "stderr", 550 | "output_type": "stream", 551 | "text": [ 552 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:99: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n", 553 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n" 554 | ] 555 | }, 556 | { 557 | "name": "stdout", 558 | "output_type": "stream", 559 | "text": [ 560 | "test mean: 0.3818072348278654\n", 561 | "mean: 0.3816769995862319\n" 562 | ] 563 | } 564 | ], 565 | "source": [ 566 | "# 五折模型结果\n", 567 | "test_dataset['predicted_score'] = 0\n", 568 | "\n", 569 | "skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)\n", 570 | "early_stopping_dict = {'0' : 670, '1' : 560, '2' : 700, '3' : 680, '4' : 680}\n", 571 | "for index, (train_index, test_index) in enumerate(skf.split(train_dataset, train_dataset['label'])):\n", 572 | " num_boost_round = early_stopping_dict[str(index)]\n", 573 | " lgb_model = lgb.LGBMClassifier(\n", 574 | " boosting_type='gbdt', num_leaves=127, max_depth=-1, n_estimators=5000, objective='binary',\n", 575 | " subsample=0.8, colsample_bytree=1, subsample_freq=1,\n", 576 | " learning_rate=0.01, random_state=2018, n_jobs=-1, num_boost_round=num_boost_round\n", 577 | " )\n", 578 | " lgb_model.fit(train_dataset[fea].iloc[train_index], train_dataset['label'][train_index], eval_metric='auc')\n", 579 | " test_pred = lgb_model.predict_proba(test_dataset[fea], num_iteration=num_boost_round)[:, 1]\n", 580 | " \n", 581 | " print('test mean:', test_pred.mean())\n", 582 | " test_dataset['predicted_score'] = test_dataset['predicted_score'] + test_pred\n", 583 | "\n", 584 | "test_dataset['predicted_score'] = test_dataset['predicted_score'] / 5\n", 585 | "mean = test_dataset['predicted_score'].mean()\n", 586 | "print('mean:', mean)\n", 587 | "\n" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 25, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "name": "stdout", 597 | "output_type": "stream", 598 | "text": [ 599 | "4399\n", 600 | "0.3757038099658089\n", 601 | "0.44359639487622804\n" 602 | ] 603 | } 604 | ], 605 | "source": [ 606 | "print(len(test_dataset[test_dataset.is_prefix_in_train == 0]))\n", 607 | "print(np.mean(test_dataset[test_dataset.is_prefix_in_train == 1]['predicted_score']))\n", 608 | "print(np.mean(test_dataset[test_dataset.is_prefix_in_train == 0]['predicted_score']))\n", 609 | "\n" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 29, 615 | "metadata": {}, 616 | "outputs": [ 617 | { 618 | "name": "stdout", 619 | "output_type": "stream", 620 | "text": [ 621 | "original mean : 0.3757038099658089\n", 622 | "0.44352972613919045\n" 623 | ] 624 | } 625 | ], 626 | "source": [ 627 | "test_prefix0_df = 
test_dataset[test_dataset.is_prefix_in_train == 1].copy()\n", 628 | "\n", 629 | "#定义调整函数\n", 630 | "def resultAdjustment(result_df, t):\n", 631 | " result_df_temp = result_df.copy()\n", 632 | " result_df_temp['x'] = result_df_temp.predicted_score.map(lambda x: -(math.log(((1 - x) / x), math.e)))\n", 633 | " result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) \n", 634 | " print(result_df_temp['adjust_result'].mean())\n", 635 | " return result_df_temp['adjust_result']\n", 636 | "\n", 637 | "print('original mean : ', test_prefix0_df['predicted_score'].mean())\n", 638 | "test_df_after = resultAdjustment(test_prefix0_df, 0.49285)\n", 639 | "\n" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 30, 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "0.44359639487622804\n", 652 | "0.44352972613919045\n" 653 | ] 654 | }, 655 | { 656 | "name": "stderr", 657 | "output_type": "stream", 658 | "text": [ 659 | "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 660 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 661 | "\n", 662 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 663 | " \"\"\"Entry point for launching an IPython kernel.\n" 664 | ] 665 | } 666 | ], 667 | "source": [ 668 | "test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 1] = test_df_after\n", 669 | "print(np.mean(test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 0]))\n", 670 | "print(np.mean(test_dataset['predicted_score'][test_dataset.is_prefix_in_train == 1]))\n" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 31, 676 | "metadata": {}, 677 | "outputs": [ 678 | { 679 | "name": "stdout", 680 | "output_type": "stream", 681 | "text": [ 682 | "0.40992\n" 683 | ] 684 | } 685 | ], 686 | "source": [ 687 | "test_dataset['predicted_label'] = test_dataset['predicted_score'].map(lambda x : 1 if x > 0.5 else 0)\n", 688 | "print(np.mean(test_dataset['predicted_label']))\n", 689 | "\n" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 32, 695 | "metadata": { 696 | "collapsed": true 697 | }, 698 | "outputs": [], 699 | "source": [ 700 | "exportResult(test_dataset[['predicted_label']], 'lgb_yi_10_31')\n" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": { 707 | "collapsed": true 708 | }, 709 | "outputs": [], 710 | "source": [] 711 | } 712 | ], 713 | "metadata": { 714 | "kernelspec": { 715 | "display_name": "Python 3", 716 | "language": "python", 717 | "name": "python3" 718 | }, 719 | "language_info": { 720 | "codemirror_mode": { 721 | "name": "ipython", 722 | "version": 3 723 | }, 724 | "file_extension": ".py", 725 | "mimetype": "text/x-python", 726 | "name": "python", 727 | "nbconvert_exporter": "python", 728 | "pygments_lexer": "ipython3", 729 | "version": "3.6.1" 730 | } 731 | }, 732 | "nbformat": 4, 733 | "nbformat_minor": 2 734 | } 735 | -------------------------------------------------------------------------------- /B_oppo_online_pre.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*-coding:utf-8-*- 3 | 4 | ''' 5 | 6 | ''' 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import time 11 | import datetime 12 | import gc 13 | from 
sklearn.model_selection import KFold, cross_val_score, train_test_split 14 | from sklearn.model_selection import StratifiedKFold 15 | from sklearn.metrics import roc_auc_score, log_loss 16 | import lightgbm as lgb 17 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder 18 | from sklearn.feature_extraction.text import CountVectorizer 19 | from sklearn.feature_selection import chi2, SelectPercentile 20 | import math 21 | from sklearn.metrics import f1_score 22 | import jieba 23 | import jieba.posseg as psg 24 | from collections import Counter 25 | import functools 26 | from gensim.models import word2vec 27 | import Levenshtein 28 | 29 | def get_float_list(x): 30 | return_list = [] 31 | for temp in x: 32 | return_list.append(float(temp)) 33 | return return_list 34 | 35 | # 处理跟query_prediction相关的统计特征 36 | def get_query_prediction_feature(df): 37 | df['query_prediction_dict'] = df['query_prediction'].map(lambda x : eval(x)) 38 | df['query_prediction_keys'] = df['query_prediction_dict'].map(lambda x : list(x.keys())) 39 | df['query_prediction_values'] = df['query_prediction_dict'].map(lambda x : get_float_list(list(x.values()))) 40 | df['query_prediction_number'] = df['query_prediction_keys'].map(lambda x : len(x)) 41 | df['query_prediction_max'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.max(x)) 42 | df['query_prediction_min'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.min(x)) 43 | df['query_prediction_mean'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.mean(x)) 44 | df['query_prediction_std'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.std(x)) 45 | return df 46 | 47 | def getBayesSmoothParam(origion_rate): 48 | origion_rate_mean = origion_rate.mean() 49 | origion_rate_var = origion_rate.var() 50 | alpha = origion_rate_mean / origion_rate_var * (origion_rate_mean * (1 - origion_rate_mean) - origion_rate_var) 51 | beta = (1 - origion_rate_mean) / origion_rate_var * (origion_rate_mean * (1 - origion_rate_mean) - origion_rate_var) 52 | # print('origion_rate_mean : ', origion_rate_mean) 53 | # print('origion_rate_var : ', origion_rate_var) 54 | # print('alpha : ', alpha) 55 | # print('beta : ', beta) 56 | return alpha, beta 57 | 58 | # 统计单维度的转化率特征 59 | def get_single_dimension_rate_feature(train_df, fea_set): 60 | for fea in fea_set: 61 | train_temp_df = pd.DataFrame() 62 | for index, (train_index, test_index) in enumerate(skf.split(train_df, train_df['label'])): 63 | temp_df = train_df[[fea, 'label']].iloc[train_index].copy() 64 | temp_pivot_table = pd.pivot_table(temp_df, index=fea, values='label', aggfunc={len, np.mean, np.sum}) 65 | temp_pivot_table.reset_index(inplace=True) 66 | temp_pivot_table.rename(columns={'len':fea + '_count', 'mean':fea + '_rate', 'sum':fea + '_click_number'}, inplace=True) 67 | alpha, beta = getBayesSmoothParam(temp_pivot_table[fea + '_rate']) 68 | temp_pivot_table[fea + '_rate'] = (temp_pivot_table[fea + '_click_number'] + alpha) / (temp_pivot_table[fea + '_count'] + alpha + beta) 69 | # del temp_pivot_table[fea + '_click_number'] 70 | fea_df = train_df.iloc[test_index].copy() 71 | fea_df = pd.merge(fea_df, temp_pivot_table, on=fea, how='left') 72 | # print(fea_df.head()) 73 | train_temp_df = pd.concat([train_temp_df, fea_df]) 74 | # temp_df = train_df[[fea, 'label']].copy() 75 | # temp_pivot_table = pd.pivot_table(temp_df, index=fea, values='label', aggfunc={len, np.mean, np.sum}) 76 | # 
temp_pivot_table.reset_index(inplace=True) 77 | # temp_pivot_table.rename(columns={'len':fea + '_count', 'mean':fea + '_rate', 'sum':fea + '_click_number'}, inplace=True) 78 | # alpha, beta = getBayesSmoothParam(temp_pivot_table[fea + '_rate']) 79 | # temp_pivot_table[fea + '_rate'] = (temp_pivot_table[fea + '_click_number'] + alpha) / (temp_pivot_table[fea + '_count'] + alpha + beta) 80 | # # del temp_pivot_table[fea + '_click_number'] 81 | # valid_df = pd.merge(valid_df, temp_pivot_table, on=fea, how='left') 82 | print(fea + ' : finish!!!') 83 | train_df = train_temp_df 84 | train_df.sort_index(by='index', ascending=True, inplace=True) 85 | return train_df 86 | 87 | # 统计双维度交叉转化率 88 | def get_jiaocha_dimension_rate_feature(train_df, fea_set): 89 | for i in range(len(fea_set)): 90 | for j in range((i+1), len(fea_set)): 91 | fea1 = fea_set[i] 92 | fea2 = fea_set[j] 93 | train_temp_df = pd.DataFrame() 94 | for index, (train_index, test_index) in enumerate(skf.split(train_df, train_df['label'])): 95 | temp_df = train_df[[fea1, fea2, 'label']].iloc[train_index].copy() 96 | temp_pivot_table = pd.pivot_table(temp_df, index=[fea1, fea2], values='label', aggfunc={len, np.mean, np.sum}) 97 | temp_pivot_table.reset_index(inplace=True) 98 | temp_pivot_table.rename(columns={'len':fea1 + '_' + fea2 + '_count', 'mean':fea1 + '_' + fea2 + '_rate', 'sum':fea1 + '_' + fea2 + '_click_number'}, inplace=True) 99 | alpha, beta = getBayesSmoothParam(temp_pivot_table[fea1 + '_' + fea2 + '_rate']) 100 | temp_pivot_table[fea1 + '_' + fea2 + '_rate'] = (temp_pivot_table[fea1 + '_' + fea2 + '_click_number'] + alpha) / (temp_pivot_table[fea1 + '_' + fea2 + '_count'] + alpha + beta) 101 | # del temp_pivot_table[fea1 + '_' + fea2 + '_click_number'] 102 | fea_df = train_df.iloc[test_index].copy() 103 | fea_df = pd.merge(fea_df, temp_pivot_table, on=[fea1, fea2], how='left') 104 | train_temp_df = pd.concat([train_temp_df, fea_df]) 105 | # temp_df = train_df[[fea1, fea2, 'label']].copy() 106 | # temp_pivot_table = pd.pivot_table(temp_df, index=[fea1, fea2], values='label', aggfunc={len, np.mean, np.sum}) 107 | # temp_pivot_table.reset_index(inplace=True) 108 | # temp_pivot_table.rename(columns={'len':fea1 + '_' + fea2 + '_count', 'mean':fea1 + '_' + fea2 + '_rate', 'sum':fea1 + '_' + fea2 + '_click_number'}, inplace=True) 109 | # alpha, beta = getBayesSmoothParam(temp_pivot_table[fea1 + '_' + fea2 + '_rate']) 110 | # temp_pivot_table[fea1 + '_' + fea2 + '_rate'] = (temp_pivot_table[fea1 + '_' + fea2 + '_click_number'] + alpha) / (temp_pivot_table[fea1 + '_' + fea2 + '_count'] + alpha + beta) 111 | # # del temp_pivot_table[fea1 + '_' + fea2 + '_click_number'] 112 | print(fea1 + '_' + fea2 + ' : finish!!!') 113 | # valid_df = pd.merge(valid_df, temp_pivot_table, on=[fea1, fea2], how='left') 114 | train_df = train_temp_df 115 | train_df.sort_index(by='index', ascending=True, inplace=True) 116 | return train_df 117 | 118 | # 统计一些是否交叉的特征 119 | def get_is_title_in_query_feature(df): 120 | x = df['title'] 121 | y = df['query_prediction_keys'] 122 | is_title_in_query = np.nan 123 | if len(y) > 0: 124 | if x in y: 125 | is_title_in_query = 1 126 | else: 127 | is_title_in_query = 0 128 | return is_title_in_query 129 | 130 | def get_is_prefix_in_title_feature(df): 131 | x = df['prefix'] 132 | y = df['title'] 133 | is_prefix_in_title = np.nan 134 | if x in y: 135 | is_prefix_in_title = 1 136 | else: 137 | is_prefix_in_title = 0 138 | return is_prefix_in_title 139 | 140 | def get_key_len_list(x): 141 | return_list = [] 142 | for temp 
in x: 143 | return_list.append(len(temp)) 144 | return return_list 145 | 146 | # 统计一些跟字符串长度相关的特征 147 | def get_string_len_feature(df): 148 | df['prefix_len'] = df['prefix'].map(lambda x : len(x)) 149 | df['title_len'] = df['title'].map(lambda x : len(x)) 150 | df['query_prediction_key_len_list'] = df['query_prediction_keys'].map(lambda x : get_key_len_list(x)) 151 | df['query_prediction_key_len_max'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.max(x)) 152 | df['query_prediction_key_len_min'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.min(x)) 153 | df['query_prediction_key_len_mean'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.mean(x)) 154 | df['query_prediction_key_len_std'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.std(x)) 155 | df['len_title-prefix'] = df['title_len'] - df['prefix_len'] 156 | df['len_prefix/title'] = df['prefix_len'] / df['title_len'] 157 | df['len_mean-title'] = df['query_prediction_key_len_mean'] - df['title_len'] 158 | df['len_mean/title'] = df['query_prediction_key_len_mean'] / df['title_len'] 159 | del df['query_prediction_key_len_list'] 160 | return df 161 | 162 | # 统计title跟prefix的编辑距离 163 | def get_title_prefix_levenshtein_distance(df): 164 | title = df['title'] 165 | prefix = df['prefix'] 166 | return Levenshtein.distance(title, prefix) 167 | 168 | def get_title_prefix_levenshtein_distance_rate(df): 169 | title_prefix_leven = df['title_prefix_leven'] 170 | title = df['title'] 171 | return (title_prefix_leven / (len(title) + 3)) 172 | 173 | # 统计title跟query_prediction编辑距离相关的特征 174 | def get_title_query_levenshtein_distance_list(df): 175 | query_keys_list = df['query_prediction_keys'] 176 | query_values_list = df['query_prediction_values'] 177 | title = df['title'] 178 | return_list = list() 179 | for i in range(len(query_keys_list)): 180 | distance = Levenshtein.distance(title, query_keys_list[i]) 181 | return_list.append(distance * query_values_list[i]) 182 | return return_list 183 | 184 | def get_title_query_levenshtein_distance_feature(df): 185 | df['title_query_leven_list'] = df[['query_prediction_keys', 'query_prediction_values', 'title']].apply(get_title_query_levenshtein_distance_list, axis=1) 186 | df['title_query_leven_sum'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.sum(x)) 187 | df['title_query_leven_max'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.max(x)) 188 | df['title_query_leven_min'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.min(x)) 189 | df['title_query_leven_mean'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.mean(x)) 190 | df['title_query_leven_std'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.std(x)) 191 | return df 192 | 193 | #分词方法,调用结巴接口 194 | def jieba_seg_to_list(sentence, pos=False): 195 | if not pos: 196 | #不进行词性标注的分词方法 197 | seg_list = jieba.cut(sentence) 198 | else: 199 | #进行词性标注的分词方法 200 | seg_list = psg.cut(sentence) 201 | return seg_list 202 | 203 | #去除干扰词 204 | def jieba_word_filter(seg_list, pos=False): 205 | 206 | filter_list = [] 207 | #根据pos参数选择是否词性过滤 208 | #不进行词性过滤,则将词性都标记为n,表示全部保留 209 | for seg in seg_list: 210 | if not pos: 211 | word = seg 212 | flag = 'n' 213 | else: 214 | word = seg.word 215 | flag = seg.flag 216 | if not flag.startswith('n'): 217 | continue 218 | filter_list.append(word) 219 | return 
filter_list 220 | 221 | def jieba_word_deal(sentence, pos=False): 222 | #调用上面方式对数据集进行处理,处理后的每条数据仅保留非干扰词 223 | seg_list = jieba_seg_to_list(sentence, pos) 224 | filter_list = jieba_word_filter(seg_list, pos) 225 | return filter_list 226 | 227 | def get_prefix_prediction_key_sentences(x): 228 | prefix_prediction_key_sentences = "" 229 | for temp in x: 230 | if len(prefix_prediction_key_sentences) > 0: 231 | prefix_prediction_key_sentences = prefix_prediction_key_sentences + temp 232 | else: 233 | prefix_prediction_key_sentences = temp 234 | return prefix_prediction_key_sentences 235 | 236 | def get_max_query_key_sentences(x): 237 | if len(x) == 0: 238 | return "" 239 | else: 240 | return max(x, key=x.get) 241 | 242 | def get_jieba_word(df): 243 | df['query_prediction_key_sentences'] = df['query_prediction_keys'].map(lambda x : get_prefix_prediction_key_sentences(x)) 244 | # df['query_prediction_key_sentences'] = df['query_prediction_dict'].map(lambda x : get_max_query_key_sentences(x)) 245 | df['query_prediction_key_jieba_words'] = df['query_prediction_key_sentences'].map(lambda x : jieba_word_deal(x, False)) 246 | df['query_prediction_words'] = df['query_prediction_keys'].map(lambda x : [jieba_word_deal(j, False) for j in x] if len(x) > 0 else np.nan) 247 | df['title_jieba_words'] = df['title'].map(lambda x : jieba_word_deal(x, False)) 248 | df['prefix_jieba_words'] = df['prefix'].map(lambda x : jieba_word_deal(x, False)) 249 | # del df['query_prediction_key_sentences'] 250 | return df 251 | 252 | def word_match_share(df): 253 | q1words = {} 254 | q2words = {} 255 | for word in df[0]: 256 | q1words[word] = 1 257 | for word in df[1]: 258 | q2words[word] = 1 259 | if len(q1words) == 0 or len(q2words) == 0: 260 | # The computer-generated chaff includes a few questions that are nothing but stopwords 261 | return 0 262 | shared_words_in_q1 = [w for w in q1words.keys() if w in q2words] 263 | shared_words_in_q2 = [w for w in q2words.keys() if w in q1words] 264 | R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words)) 265 | return R 266 | 267 | def jaccard(df): 268 | wic = set(df[0]).intersection(set(df[1])) 269 | uw = set(df[0]).union(df[1]) 270 | if len(uw) == 0: 271 | uw = [1] 272 | return (len(wic) / len(uw)) 273 | 274 | def common_words(df): 275 | return len(set(df[0]).intersection(set(df[1]))) 276 | 277 | def total_unique_words(df): 278 | return len(set(df[0]).union(df[1])) 279 | 280 | def wc_diff(df): 281 | return abs(len(df[0]) - len(df[1])) 282 | 283 | def wc_ratio(df): 284 | l1 = len(df[0])*1.0 285 | l2 = len(df[1]) 286 | if l2 == 0: 287 | return np.nan 288 | if l1 / l2: 289 | return l2 / l1 290 | else: 291 | return l1 / l2 292 | 293 | def wc_diff_unique(df): 294 | return abs(len(set(df[0])) - len(set(df[1]))) 295 | 296 | def wc_ratio_unique(df): 297 | l1 = len(set(df[0])) * 1.0 298 | l2 = len(set(df[1])) 299 | if l2 == 0: 300 | return np.nan 301 | if l1 / l2: 302 | return l2 / l1 303 | else: 304 | return l1 / l2 305 | 306 | def tfidf_word_match_share(df, weights=None): 307 | q1words = {} 308 | q2words = {} 309 | for word in df[0]: 310 | q1words[word] = 1 311 | for word in df[1]: 312 | q2words[word] = 1 313 | if len(q1words) == 0 or len(q2words) == 0: 314 | # The computer-generated chaff includes a few questions that are nothing but stopwords 315 | return 0 316 | shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words] 317 | total_weights = [weights.get(w, 0) for w in q1words] + 
[weights.get(w, 0) for w in q2words] 318 | R = np.sum(shared_weights) / np.sum(total_weights) 319 | return R 320 | 321 | def deal_word_for_all(train_df, fea1, fea2, func, colName): 322 | train_df[colName] = train_df[[fea1, fea2]].apply(func, axis=1) 323 | # valid_df[colName] = valid_df[[fea1, fea2]].apply(func, axis=1) 324 | print(colName + ' finish!!!') 325 | return train_df 326 | 327 | def get_weight(count, eps=10000, min_count=2): 328 | if count < min_count: 329 | return 0 330 | else: 331 | return 1 / (count + eps) 332 | 333 | def get_word_statistic_feature(train_df, col_list): 334 | # df = pd.concat([train_df[['query_prediction_key_jieba_words', 'title_jieba_words', 'prefix_jieba_words']], valid_df[['query_prediction_key_jieba_words', 'title_jieba_words', 'prefix_jieba_words']]]) 335 | # train_qs = pd.Series(df['query_prediction_key_jieba_words'].tolist() + df['title_jieba_words'].tolist() + df['prefix_jieba_words'].tolist()) 336 | # words = [x for y in train_qs for x in y] 337 | # counts = Counter(words) 338 | # weights = {word: get_weight(count) for word, count in counts.items()} 339 | for col in col_list: 340 | fea1 = col[0] 341 | fea2 = col[1] 342 | train_df = deal_word_for_all(train_df, fea1, fea2, word_match_share, fea1[0] + '_' + fea2[0] + '_word_match') 343 | train_df = deal_word_for_all(train_df, fea1, fea2, jaccard, fea1[0] + '_' + fea2[0] + '_jaccard') 344 | train_df = deal_word_for_all(train_df, fea1, fea2, common_words, fea1[0] + '_' + fea2[0] + '_common_words') 345 | train_df = deal_word_for_all(train_df, fea1, fea2, total_unique_words, fea1[0] + '_' + fea2[0] + '_total_unique_words') 346 | train_df = deal_word_for_all(train_df, fea1, fea2, wc_diff, fea1[0] + '_' + fea2[0] + '_wc_diff') 347 | train_df = deal_word_for_all(train_df, fea1, fea2, wc_ratio, fea1[0] + '_' + fea2[0] + '_wc_ratio') 348 | train_df = deal_word_for_all(train_df, fea1, fea2, wc_diff_unique, fea1[0] + '_' + fea2[0] + '_wc_diff_unique') 349 | train_df = deal_word_for_all(train_df, fea1, fea2, wc_ratio_unique, fea1[0] + '_' + fea2[0] + '_wc_ratio_unique') 350 | # f = functools.partial(tfidf_word_match_share, weights=weights) 351 | # train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, f, fea1[0] + '_' + fea2[0] + '_tfidf_word_match_share') 352 | return train_df 353 | 354 | def get_w2v_array(word_list, word_wv, num_features): 355 | word_vectors = np.zeros((len(word_list), num_features)) 356 | for i in range(len(word_list)): 357 | word_vectors[i][:] = word_wv[str(word_list[i])] 358 | mean_array = np.mean(word_vectors, axis=0) 359 | return mean_array 360 | 361 | def get_title_prefix_similarity(df, f_similarity): 362 | title_array = df['title_jieba_array'] 363 | prefix_array = df['prefix_jieba_array'] 364 | similarity = 0 365 | if f_similarity == 'dot': 366 | similarity = np.dot(title_array, prefix_array) 367 | elif f_similarity == 'norm': 368 | similarity = np.linalg.norm(title_array - prefix_array) 369 | else: 370 | similarity = np.dot(title_array,prefix_array) / (np.linalg.norm(title_array) * np.linalg.norm(prefix_array)) 371 | return similarity 372 | 373 | def get_title_query_similarity_list(df, f_similarity, word_wv, num_features): 374 | title_array = df['title_jieba_array'] 375 | query_prediction_words = df['query_prediction_words'] 376 | query_prediction_keys = df['query_prediction_keys'] 377 | query_prediction_dict = df['query_prediction_dict'] 378 | similarity_list = list() 379 | if len(query_prediction_keys) <= 0: 380 | return similarity_list 381 | if f_similarity == 'dot': 382 | 
i = 0 383 | for key in query_prediction_keys: 384 | key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features) 385 | similarity = np.dot(title_array, key_array) * float(query_prediction_dict[key]) 386 | similarity_list.append(similarity) 387 | i = i + 1 388 | elif f_similarity == 'norm': 389 | i = 0 390 | for key in query_prediction_keys: 391 | key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features) 392 | similarity = np.linalg.norm(title_array - key_array) * float(query_prediction_dict[key]) 393 | similarity_list.append(similarity) 394 | i = i + 1 395 | else: 396 | i = 0 397 | for key in query_prediction_keys: 398 | key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features) 399 | similarity = (np.dot(title_array, key_array) / (np.linalg.norm(title_array) * np.linalg.norm(key_array))) * float(query_prediction_dict[key]) 400 | similarity_list.append(similarity) 401 | i = i + 1 402 | return similarity_list 403 | 404 | def get_similarity_feature(train_df): 405 | f_list = ['dot', 'norm', 'cosine'] 406 | for fun in f_list: 407 | f_prefix_similarity = functools.partial(get_title_prefix_similarity, f_similarity=fun) 408 | train_df['title_prefix_' + fun + '_similarity'] = train_df[['title_jieba_array', 'prefix_jieba_array']].apply(f_prefix_similarity, axis=1) 409 | # f_query_similarity = functools.partial(get_title_query_similarity, f_similarity=fun, word_wv=word_wv, num_features=num_features) 410 | # train_df['title_query_' + fun + '_similarity'] = train_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity, axis=1) 411 | # valid_df['title_query_' + fun + '_similarity'] = valid_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity, axis=1) 412 | f_query_similarity_list = functools.partial(get_title_query_similarity_list, f_similarity=fun, word_wv=word_wv, num_features=num_features) 413 | train_df['title_query_' + fun + '_similarity_list'] = train_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity_list, axis=1) 414 | train_df['title_query_' + fun + '_similarity'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.sum(x)) 415 | train_df['title_query_' + fun + '_similarity_max'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.max(x)) 416 | train_df['title_query_' + fun + '_similarity_min'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.min(x)) 417 | train_df['title_query_' + fun + '_similarity_mean'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.mean(x)) 418 | train_df['title_query_' + fun + '_similarity_std'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.std(x)) 419 | print(fun + ' : finish!!!') 420 | return train_df 421 | 422 | # 导出特征工程文件 423 | def exportDf(df, fileName): 424 | df.to_csv('../temp/%s.csv' % fileName, header=True, index=True) 425 | 426 | def main(): 427 | 428 | # 开始导入数据 429 | print("~~~~~~~~~~~~~~~~~~~~~~开始导入数据~~~~~~~~~~~~~~~~~~~~~~~~~~") 430 | train_df = pd.read_table('../data/oppo_round1_train_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3) 431 | valid_df = pd.read_table('../data/oppo_round1_vali_20180929.txt', 
names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3) 432 | train_df = pd.concat([train_df, valid_df]) 433 | train_df.reset_index(inplace=True) 434 | train_df['index'] = train_df.index 435 | 436 | print("~~~~~~~~~~~~~~~~~~~~~~统计特征~~~~~~~~~~~~~~~~~~~~~~~~~~") 437 | train_df = get_query_prediction_feature(train_df) 438 | skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True) 439 | fea_set = ['prefix', 'title', 'tag', 'query_prediction'] 440 | train_df = get_single_dimension_rate_feature(train_df, fea_set) 441 | jiaocha_fea_set = ['prefix', 'title', 'tag'] 442 | train_df = get_jiaocha_dimension_rate_feature(train_df, jiaocha_fea_set) 443 | train_df['is_title_in_query'] = train_df[['title', 'query_prediction_keys']].apply(get_is_title_in_query_feature, axis = 1) 444 | train_df['is_prefix_in_title'] = train_df[['prefix', 'title']].apply(get_is_prefix_in_title_feature, axis = 1) 445 | train_df = get_string_len_feature(train_df) 446 | 447 | print("~~~~~~~~~~~~~~~~~~~~~~编辑距离特征~~~~~~~~~~~~~~~~~~~~~~~~~~") 448 | train_df['title_prefix_leven'] = train_df[['title', 'prefix']].apply(get_title_prefix_levenshtein_distance, axis=1) 449 | train_df['title_prefix_leven_rate'] = train_df[['title', 'title_prefix_leven']].apply(get_title_prefix_levenshtein_distance_rate, axis=1) 450 | train_df = get_title_query_levenshtein_distance_feature(train_df) 451 | 452 | print("~~~~~~~~~~~~~~~~~~~~~~分词~~~~~~~~~~~~~~~~~~~~~~~~~~") 453 | train_df = get_jieba_word(train_df) 454 | 455 | print("~~~~~~~~~~~~~~~~~~~~~~距离特征~~~~~~~~~~~~~~~~~~~~~~~~~~") 456 | col_list = [['query_prediction_key_jieba_words', 'title_jieba_words'], ['prefix_jieba_words', 'title_jieba_words'], ['prefix_jieba_words', 'query_prediction_key_jieba_words']] 457 | train_df = get_word_statistic_feature(train_df, col_list) 458 | 459 | print("~~~~~~~~~~~~~~~~~~~~~~w2v相关特征~~~~~~~~~~~~~~~~~~~~~~~~~~") 460 | # Set values for various parameters 461 | num_features = 500 # Word vector dimensionality 462 | min_word_count = 1 # Minimum word count 463 | num_workers = 20 # Number of threads to run in parallel 464 | context = 5 # Context window size 465 | downsampling = 1e-3 # Downsample setting for frequent words 466 | 467 | word2vec_df = train_df[['query_prediction_words', 'title_jieba_words', 'prefix_jieba_words', 'query_prediction_number']] 468 | word2vec_df.reset_index(inplace=True) 469 | word2vec_list = word2vec_df['title_jieba_words'].tolist() + word2vec_df['prefix_jieba_words'].tolist() + [y for x in word2vec_df['query_prediction_words'][word2vec_df.query_prediction_number > 0] for y in x] 470 | model = word2vec.Word2Vec(word2vec_list, workers=num_workers, \ 471 | size=num_features, min_count = min_word_count, \ 472 | window = context, sample = downsampling) 473 | 474 | # If you don't plan to train the model any further, calling 475 | # init_sims will make the model much more memory-efficient. 
476 | model.init_sims(replace=True) 477 | 478 | word_wv = model.wv 479 | train_df['title_jieba_array'] = train_df['title_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features)) 480 | train_df['prefix_jieba_array'] = train_df['prefix_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features)) 481 | train_df = get_similarity_feature(train_df) 482 | # 保存训练的词向量模型 483 | model.save("../temp/B_word2vec.model") 484 | fea = [ 485 | 'query_prediction_number', 'query_prediction_max', 'query_prediction_min', 'query_prediction_mean', 'query_prediction_std', 486 | 'prefix_count', 'prefix_rate', 487 | 'title_count', 'title_rate', 'tag_count', 'tag_rate', 488 | 'query_prediction_count', 'query_prediction_rate', 'prefix_title_count', 489 | 'prefix_title_rate', 'prefix_tag_count', 'prefix_tag_rate', 490 | 'title_tag_count', 'title_tag_rate', 491 | 'prefix_click_number', 'title_click_number', 'query_prediction_click_number', 'prefix_tag_click_number', 492 | 'prefix_title_click_number', 'title_tag_click_number', 493 | 'is_title_in_query', 'is_prefix_in_title', 494 | # 'title_tag_types', 'prefix_tag_types', 'tag_title_types', 'tag_prefix_types', 495 | # 'title_prefix_types', 'prefix_title_types', 'tag_query_prediction_types', 'title_query_prediction_types', 496 | 'prefix_len', 'title_len', 497 | 'query_prediction_key_len_max', 'query_prediction_key_len_min', 498 | 'query_prediction_key_len_mean', 'query_prediction_key_len_std', 499 | 'len_title-prefix', 'len_prefix/title', 'len_mean-title', 'len_mean/title', 500 | # 'q_t_jaccard', 'p_t_jaccard', 'p_q_jaccard', 501 | 'q_t_word_match', 'q_t_common_words', 502 | # 'q_t_tfidf_word_match_share', 'p_t_tfidf_word_match_share', 'p_q_tfidf_word_match_share', 503 | 'q_t_total_unique_words', 'q_t_wc_diff', 'q_t_wc_ratio', 504 | 'q_t_wc_diff_unique', 'q_t_wc_ratio_unique', 505 | 'p_t_word_match', 'p_t_common_words', 506 | 'p_t_total_unique_words', 'p_t_wc_diff', 'p_t_wc_ratio', 507 | 'p_t_wc_diff_unique', 'p_t_wc_ratio_unique', 508 | 'p_q_word_match', 'p_q_common_words', 509 | 'p_q_total_unique_words', 'p_q_wc_diff', 'p_q_wc_ratio', 510 | 'p_q_wc_diff_unique', 'p_q_wc_ratio_unique', 511 | 'title_prefix_dot_similarity', 512 | 'title_query_dot_similarity', 'title_prefix_norm_similarity', 513 | 'title_query_norm_similarity', 'title_prefix_cosine_similarity', 514 | 'title_query_cosine_similarity', 515 | 'title_query_dot_similarity_max', 'title_query_dot_similarity_min', 516 | 'title_query_dot_similarity_mean', 'title_query_dot_similarity_std', 517 | 'title_query_norm_similarity_min', 'title_query_norm_similarity_mean', 518 | 'title_query_norm_similarity_std', 519 | 'title_query_cosine_similarity_max', 'title_query_cosine_similarity_min', 520 | 'title_query_cosine_similarity_mean', 'title_query_cosine_similarity_std', 521 | 'title_prefix_leven', 'title_prefix_leven_rate', 522 | 'title_query_leven_sum', 'title_query_leven_max', 'title_query_leven_min', 523 | 'title_query_leven_mean', 'title_query_leven_std', 524 | 'prefix', 'query_prediction', 'title', 'tag', 'index', 'label' 525 | ] 526 | 527 | exportDf(train_df[fea], 'A_final_train_online_df') 528 | 529 | print("~~~~~~~~~~~~~~~~~~~~~~finish!!!!~~~~~~~~~~~~~~~~~~~~~~~~~~") 530 | 531 | 532 | if __name__ == '__main__': 533 | main() 534 | --------------------------------------------------------------------------------
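A note on two formulas used in B_oppo_online_pre.py and in the notebooks above: getBayesSmoothParam fits a Beta(alpha, beta) prior to the observed per-key click-through rates by the method of moments, and the smoothed rate is then (clicks + alpha) / (count + alpha + beta); resultAdjustment shifts every predicted probability by a constant t in log-odds space, which raises or lowers the mean prediction without changing the ranking of samples. The standalone sketch below restates these two formulas for reference only; the function names and example numbers here are illustrative and are not part of the repository.

import math
import pandas as pd

# Method-of-moments fit of a Beta(alpha, beta) prior to a series of
# observed click-through rates (same formula as getBayesSmoothParam).
def beta_smooth_params(rates: pd.Series):
    mean, var = rates.mean(), rates.var()
    common = mean * (1 - mean) - var
    return mean / var * common, (1 - mean) / var * common

# Smoothed rate: (clicks + alpha) / (count + alpha + beta), which pulls
# keys with few impressions toward the global prior mean.
def smoothed_rate(clicks, count, alpha, beta):
    return (clicks + alpha) / (count + alpha + beta)

# Logit-space shift used by resultAdjustment: add a constant t to the
# log-odds of each score, moving the mean prediction up (t > 0) or
# down (t < 0) while preserving the order of the samples.
def adjust_scores(scores: pd.Series, t: float) -> pd.Series:
    logit = scores.map(lambda p: -math.log((1 - p) / p))
    return logit.map(lambda x: 1 / (1 + math.exp(-(x + t))))

if __name__ == '__main__':
    # Made-up example values, only to show the call pattern.
    rates = pd.Series([0.20, 0.35, 0.50, 0.40, 0.30])
    alpha, beta = beta_smooth_params(rates)
    print(alpha, beta, smoothed_rate(3, 10, alpha, beta))
    print(adjust_scores(pd.Series([0.30, 0.55, 0.80]), 0.5).tolist())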