├── 1完整流程 ├── 4模型 │ ├── 1_lgb │ │ ├── 说明 │ │ ├── lgb_50w.sh │ │ └── lgb_50w.py │ ├── 2_xgb │ │ ├── 说明 │ │ ├── xgb_50w.sh │ │ └── xgb_50w.py │ └── 3_XDeepFM │ │ ├── 说明 │ │ ├── __pycache__ │ │ └── ctrNet.cpython-36.pyc │ │ ├── models │ │ ├── __pycache__ │ │ │ ├── ffm.cpython-36.pyc │ │ │ ├── ffm.cpython-37.pyc │ │ │ ├── fm.cpython-36.pyc │ │ │ ├── fm.cpython-37.pyc │ │ │ ├── nffm.cpython-36.pyc │ │ │ ├── nffm.cpython-37.pyc │ │ │ ├── xdeepfm.cpython-36.pyc │ │ │ ├── xdeepfm.cpython-37.pyc │ │ │ ├── base_model.cpython-36.pyc │ │ │ └── base_model.cpython-37.pyc │ │ ├── ffm.py │ │ ├── base_model.py │ │ ├── fm.py │ │ ├── nffm.py │ │ └── xdeepfm.py │ │ ├── src │ │ ├── __pycache__ │ │ │ ├── misc_utils.cpython-36.pyc │ │ │ └── misc_utils.cpython-37.pyc │ │ └── misc_utils.py │ │ ├── test_26_27_28_50w.sh │ │ ├── ctrNet.py │ │ └── test_26_27_28_50w.py ├── 5融合 │ ├── 说明 │ ├── combine_1.ipynb │ └── .ipynb_checkpoints │ │ └── combine-checkpoint.ipynb ├── 2采样_匹配数据 │ ├── 2.separate_by_day.ipynb │ ├── 3.uid1_day_caiyang_1.ipynb │ └── 4.uid1_day_caiyang_2.ipynb ├── 3特征工程 │ └── tezhenggongcheng.py └── 1数据预处理_采样_匹配 │ └── 1.EDA.ipynb └── README.md /1完整流程/4模型/1_lgb/说明: -------------------------------------------------------------------------------- 1 | by_data_50w 存放采样过后的数据 2 | lgb_result_50w 存放结果 3 | 运行 直接运行 lgb_50w.sh文件 4 | -------------------------------------------------------------------------------- /1完整流程/4模型/2_xgb/说明: -------------------------------------------------------------------------------- 1 | by_data_50w 存放采样过后的数据 2 | xgb_result_50w 存放结果 3 | 运行 直接运行 xgb_50w.sh文件 4 | -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/说明: -------------------------------------------------------------------------------- 1 | data 存放数据 2 | result_best 存放结果 3 | 运行:直接运行test_26_27_28_50w.sh 4 | -------------------------------------------------------------------------------- /1完整流程/5融合/说明: -------------------------------------------------------------------------------- 1 | lgb_result 存放lgb结果 2 | xgb_result 存放xgb结果 3 | xdeepfm_result 存放xdeepfm结果 4 | 5 | 运行:combine.inpy文件 6 | -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/__pycache__/ctrNet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/__pycache__/ctrNet.cpython-36.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/ffm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/ffm.cpython-36.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/ffm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/ffm.cpython-37.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/fm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/fm.cpython-36.pyc 
-------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/fm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/fm.cpython-37.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/nffm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/nffm.cpython-36.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/nffm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/nffm.cpython-37.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/xdeepfm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/xdeepfm.cpython-36.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/xdeepfm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/xdeepfm.cpython-37.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/src/__pycache__/misc_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/src/__pycache__/misc_utils.cpython-36.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/src/__pycache__/misc_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/src/__pycache__/misc_utils.cpython-37.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/base_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/base_model.cpython-36.pyc -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/__pycache__/base_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tersaiz/Huawei-Digix-Algorithm-Contest/HEAD/1完整流程/4模型/3_XDeepFM/models/__pycache__/base_model.cpython-37.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Huawei-Digix-Algorithm-Contest 2 | 3 | # 赛题:CTR预测 4 | 
Task description: given a user's historical behavior on ad tasks and the attributes of those ad tasks, choose a suitable algorithm to predict the probability that the user clicks a particular ad task in a given context. Problem addressed: improving the accuracy of ad click conversion rate estimation. 5 | 6 | Challenges: ad tasks are very few relative to the number of users they can be pushed to; few ad tasks have recorded behavior, so the data are extremely sparse; conversion rates differ greatly across stages of an ad task's delivery cycle; the logs contain accidental-click noise; effective features are hard to identify. 7 | 8 | Organizers: Huawei and the Jiangsu Association of Artificial Intelligence. 9 | 10 | # Ranking 11 | Preliminary round: 6/2447 12 | Final round: 7th place, Excellence Award 13 | 14 |
-------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/test_26_27_28_50w.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | echo "test 27-50w." 5 | python test_26_27_28_50w.py 27-50w 27-50w 6 | echo "finish" 7 | 8 | echo "test 28-50w." 9 | python test_26_27_28_50w.py 28-50w 28-50w 10 | echo "finish" 11 | 12 | echo "test 29-50w." 13 | python test_26_27_28_50w.py 29-50w 29-50w 14 | echo "finish" 15 |
-------------------------------------------------------------------------------- /1完整流程/4模型/1_lgb/lgb_50w.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #echo "test 26-50w." 4 | #python lgb.py 26-50w uid1_label1_test_data 26-50w 5 | #echo "finish" 6 | 7 | echo "test 27-50w." 8 | python lgb_50w_add.py 27-50w uid1_label1_test_data 27-50w 9 | echo "finish" 10 | 11 | echo "test 28-50w." 12 | python lgb_50w_add.py 28-50w uid1_label1_test_data 28-50w 13 | echo "finish" 14 | 15 | echo "test 29-50w." 16 | python lgb_50w_add.py 29-50w uid1_label1_test_data 29-50w 17 | echo "finish" 18 | 19 | #echo "test 30-50w." 20 | #python lgb.py 30-50w uid1_label1_test_data 30-50w 21 | #echo "finish" 22 | 23 | 24 | #echo "test 31-50w." 25 | #python lgb.py 31-50w uid1_label1_test_data 31-50w 26 | #echo "finish" 27 | 28 | 29 |
-------------------------------------------------------------------------------- /1完整流程/4模型/2_xgb/xgb_50w.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # echo "test 26-50w." 4 | # python xgb_50w.py 26-50w uid1_label1_test_data 26-50w 5 | # echo "finish" 6 | 7 | echo "test 27-50w." 8 | python xgb_50w.py 27-50w uid1_label1_test_data 27-50w 9 | echo "finish" 10 | 11 | echo "test 28-50w." 12 | python xgb_50w.py 28-50w uid1_label1_test_data 28-50w 13 | echo "finish" 14 | 15 | echo "test 29-50w." 16 | python xgb_50w.py 29-50w uid1_label1_test_data 29-50w 17 | echo "finish" 18 | 19 | # echo "test 30-50w." 20 | # python xgb_50w.py 30-50w uid1_label1_test_data 30-50w 21 | # echo "finish" 22 | 23 | 24 | # echo "test 31-50w."
25 | # python xgb_50w.py 31-50w uid1_label1_test_data 31-50w 26 | # echo "finish" 27 | 28 | 29 | -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/ctrNet.py: -------------------------------------------------------------------------------- 1 | from src import misc_utils as utils 2 | from models import fm 3 | from models import ffm 4 | from models import nffm 5 | from models import xdeepfm 6 | import tensorflow as tf 7 | from imp import reload 8 | def build_model(hparams): 9 | tf.reset_default_graph() 10 | if hparams.model=='fm': 11 | model=fm.Model(hparams) 12 | elif hparams.model=='ffm': 13 | model=ffm.Model(hparams) 14 | elif hparams.model=='nffm': 15 | model=nffm.Model(hparams) 16 | elif hparams.model=='xdeepfm': 17 | model=xdeepfm.Model(hparams) 18 | config_proto = tf.ConfigProto(log_device_placement=0,allow_soft_placement=0) 19 | config_proto.gpu_options.allow_growth = True 20 | sess=tf.Session(config=config_proto) 21 | sess.run(tf.global_variables_initializer()) 22 | # writer = tf.summary.FileWriter("D://DeepLearning//Tensorflow//ctrNet-tool-master//graph-1",sess.graph) 23 | # writer.close() 24 | model.set_Session(sess) 25 | 26 | return model -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/src/misc_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Generally useful utility functions.""" 17 | from __future__ import print_function 18 | 19 | import codecs 20 | import collections 21 | import json 22 | import math 23 | import os 24 | import sys 25 | import time 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | import pandas as pd 30 | 31 | 32 | def hash_batch(batch,hparams): 33 | batch=pd.DataFrame(batch) 34 | batch=list(batch.values) 35 | for b in batch: 36 | for i in range(len(b)): 37 | b[i]=abs(hash('key_'+str(i)+' value_'+str(b[i]))) % hparams.hash_ids 38 | return batch 39 | 40 | def print_time(s, start_time): 41 | """Take a start time, print elapsed duration, and return a new time.""" 42 | print("%s, time %ds, %s." 
% (s, (time.time() - start_time), time.ctime())) 43 | sys.stdout.flush() 44 | return time.time() 45 | 46 | def print_out(s, f=None, new_line=True): 47 | """Similar to print but with support to flush and output to a file.""" 48 | if isinstance(s, bytes): 49 | s = s.decode("utf-8") 50 | 51 | if f: 52 | f.write(s.encode("utf-8")) 53 | if new_line: 54 | f.write(b"\n") 55 | 56 | # stdout 57 | out_s = s.encode("utf-8") 58 | if not isinstance(out_s, str): 59 | out_s = out_s.decode("utf-8") 60 | print(out_s, end="", file=sys.stdout) 61 | 62 | if new_line: 63 | sys.stdout.write("\n") 64 | sys.stdout.flush() 65 | 66 | def print_step_info(prefix,epoch, global_step, info): 67 | print_out("%sepoch %d step %d lr %g logloss %.6f gN %.2f, %s" % 68 | (prefix, epoch,global_step, info["learning_rate"], 69 | info["train_ppl"], info["avg_grad_norm"], time.ctime())) 70 | 71 | def print_hparams(hparams, skip_patterns=None, header=None): 72 | """Print hparams, can skip keys based on pattern.""" 73 | if header: print_out("%s" % header) 74 | values = hparams.values() 75 | for key in sorted(values.keys()): 76 | if not skip_patterns or all( 77 | [skip_pattern not in key for skip_pattern in skip_patterns]): 78 | print_out(" %s=%s" % (key, str(values[key]))) 79 | -------------------------------------------------------------------------------- /1完整流程/2采样_匹配数据/2.separate_by_day.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "将每一天的数据分离出开来,date中存放日期,reader_pandas第一个参数输入需要分离的原数据的存放路径,path是按日期分离后数据的存放路径" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "Reading ...: 16%|██████████▏ | 1598/10000 [05:15<26:56, 5.20it/s]\n" 20 | ] 21 | }, 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "(25583340, 8)\n", 27 | "Day 26 separated!\n" 28 | ] 29 | }, 30 | { 31 | "name": "stderr", 32 | "output_type": "stream", 33 | "text": [ 34 | "Reading ...: 16%|██████████▏ | 1598/10000 [04:53<25:39, 5.46it/s]\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import pandas as pd\n", 40 | "import numpy as np\n", 41 | "import time\n", 42 | "from tqdm import tqdm\n", 43 | "\n", 44 | "#chunksize是一次读入多少条数据,patitions是最大迭代次数,chunksize*patitions应大于总数据的条数\n", 45 | "def reader_pandas(file,date_day, chunkSize=100000, patitions=10 ** 4):\n", 46 | " reader = pd.read_csv(file,iterator=True)\n", 47 | " chunks = []\n", 48 | " with tqdm(range(patitions), 'Reading ...') as t:\n", 49 | " for _ in t:\n", 50 | " try:\n", 51 | " chunk = reader.get_chunk(chunkSize)\n", 52 | " chunk.columns = ['label', 'uId', 'adID', 'operTime', 'siteId', 'slotId', 'contentId', 'netType']\n", 53 | " date = pd.to_datetime(chunk['operTime'])\n", 54 | " x=date.dt.day\n", 55 | " chunk1 = chunk[x == date_day]\n", 56 | " # print(type(chunks))\n", 57 | " chunks.append(chunk1)\n", 58 | " except StopIteration:\n", 59 | " break\n", 60 | " return pd.concat(chunks, ignore_index=True)\n", 61 | "\n", 62 | "\n", 63 | "\n", 64 | "#####开始分离数据#####\n", 65 | "date = [26,27,28,29,30,31]\n", 66 | "for i in date:\n", 67 | " result = reader_pandas('C:/Users/Growing/Desktop/hw_dataset/train/train_20190518.csv',i)\n", 68 | " path = 'C:/Users/Growing/Desktop/hw_dataset/train/train_'+str(i)+'.csv'\n", 69 | "# result.to_csv(path,index = False)\n", 70 | " print(result.shape)\n", 71 | " print('Day '+str(i)+ ' 
separated!')\n", 72 | "\n" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.6.8" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 2 97 | } 98 | -------------------------------------------------------------------------------- /1完整流程/4模型/2_xgb/xgb_50w.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | from sklearn import metrics, preprocessing 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.metrics import mean_squared_error 6 | from sklearn.metrics import roc_auc_score 7 | from sklearn.model_selection import StratifiedKFold 8 | import warnings 9 | import sys 10 | 11 | input_train = sys.argv[1] 12 | input_test = sys.argv[2] 13 | output_name = sys.argv[3] 14 | 15 | INPUT_TRAIN = 'by_data_50w/' + input_train +'.csv' 16 | INPUT_TEST = 'by_data_50w/' + input_test +'.csv' 17 | 18 | 19 | 20 | data= pd.read_csv(INPUT_TRAIN,iterator=True) 21 | train_df = data.get_chunk(10000) 22 | test_df= pd.read_csv(INPUT_TEST) 23 | 24 | 25 | date = pd.to_datetime(train_df['operTime']) 26 | x=date.dt.hour 27 | train_df['operTime'] = x 28 | 29 | date = pd.to_datetime(test_df['operTime']) 30 | x=date.dt.hour 31 | test_df['operTime'] = x 32 | 33 | 34 | # 计数特征 35 | len_train = train_df.shape[0] 36 | data = pd.concat([train_df,test_df]) 37 | 38 | feature=[ 'slotId','phoneType','adId','city','operTime'] 39 | for f in feature: 40 | count1 = data.groupby([f])['uId'].count().reset_index(name=f+'_count') 41 | data = data.merge(count1,on=f,how='left') 42 | 43 | 44 | data.drop(['uId'],axis = 1,inplace = True) 45 | 46 | train_df = data.iloc[0:len_train,:] 47 | test_df = data.iloc[len_train:,:] 48 | 49 | features = [c for c in train_df.columns if c not in ['label']] 50 | target = train_df['label'] 51 | 52 | 53 | 54 | params = {'objective': 'binary:logistic', 55 | 'eval_metric': 'auc', 56 | 'max_depth': 14, 57 | 'eta': 0.1, 58 | 'gamma': 6, 59 | 'subsample': 0.9, 60 | 'colsample_bytree': 0.9, 61 | 'min_child_weight': 51, 62 | 'colsample_bylevel': 0.6, 63 | 'lambda': 0.5, 64 | 'alpha': 0.1, 65 | 'silent':0} 66 | 67 | 68 | folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=2019) 69 | oof = np.zeros(len(train_df)) 70 | predictions =np.zeros(len(test_df)) 71 | 72 | for i, (trn, val) in enumerate(folds.split(train_df.values,target.values)): 73 | print(i+1, "fold. 
AUC") 74 | 75 | trn_x = train_df.iloc[trn][features] 76 | trn_y = target.iloc[trn] 77 | val_x = train_df.iloc[val][features] 78 | val_y = target.iloc[val] 79 | 80 | 81 | 82 | model = xgb.train(params 83 | , xgb.DMatrix(trn_x, trn_y) 84 | , 100000 85 | , [(xgb.DMatrix(trn_x, trn_y), 'train'), (xgb.DMatrix(val_x, val_y), 'valid')] 86 | , verbose_eval=5000 87 | , early_stopping_rounds=3000 88 | ) 89 | 90 | oof[val] = model.predict(xgb.DMatrix(val_x), ntree_limit=model.best_ntree_limit) 91 | predictions += model.predict(xgb.DMatrix(test_df[features]), ntree_limit=model.best_ntree_limit) / folds.n_splits 92 | 93 | print("CV score: {:<8.5f}".format(roc_auc_score(target, oof))) 94 | cv_auc = roc_auc_score(target, oof) 95 | cv_auc = cv_auc.round(6) 96 | 97 | 98 | OUTPUT_FILE = 'xgb_result_50w/' + output_name +'-'+ str(cv_auc) + '.csv' 99 | sub_df = pd.DataFrame({"id":test_df["label"].values}) 100 | sub_df["probability"] = predictions.round(6) 101 | sub_df.to_csv(OUTPUT_FILE, index=False) 102 | 103 | -------------------------------------------------------------------------------- /1完整流程/5融合/combine_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# lgb" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 21, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "lgb_result_27 = pd.read_csv('lgb_result/********.csv')\n", 26 | "lgb_result_28 = pd.read_csv('lgb_result/********.csv')\n", 27 | "lgb_result_29 = pd.read_csv('lgb_result/********.csv')\n", 28 | "sub1 = pd.DataFrame()\n", 29 | "pred = (lgb_result_27['probability'] + lgb_result_28['probability'] + lgb_result_29['probability'])/3\n", 30 | "sub1['probability'] = pred.round(6)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# xgb" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "xgb_result_27 = pd.read_csv('xgb_result/********.csv')\n", 47 | "xgb_result_28 = pd.read_csv('xgb_result/********.csv')\n", 48 | "xgb_result_29 = pd.read_csv('xgb_result/********.csv')\n", 49 | "sub2 = pd.DataFrame()\n", 50 | "pred = (xgb_result_27['probability'] + xgb_result_28['probability'] + xgb_result_29['probability'])/3\n", 51 | "sub2['probability'] = pred.round(6)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# xDeepFM" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "xDeepFM_result_27 = pd.read_csv('xdeepfm_result/********.csv')\n", 68 | "xDeepFM_result_28 = pd.read_csv('xdeepfm_result/********.csv')\n", 69 | "xDeepFM_result_29 = pd.read_csv('v/********.csv')\n", 70 | "sub3 = pd.DataFrame()\n", 71 | "pred = (xDeepFM_result_27['probability'] + xDeepFM_result_28['probability'] + xDeepFM_result_29['probability'])/3\n", 72 | "sub3['probability'] = pred.round(6)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# 融合(权重需要试)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "sub = pd.DataFrame({'id':lgb_result_27['id']})\n", 89 | 
"sub['probability'] = (0.4*sub1['probability'] + \n", 90 | " 0.4*sub2['probability'] + \n", 91 | " 0.2*sub3['probability'] )\n", 92 | "\n", 93 | "sub.to_csv('submission.csv',index = False)" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.6.8" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /1完整流程/5融合/.ipynb_checkpoints/combine-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# lgb" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 21, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "lgb_result_27 = pd.read_csv('lgb_result/********.csv')\n", 26 | "lgb_result_28 = pd.read_csv('lgb_result/********.csv')\n", 27 | "lgb_result_29 = pd.read_csv('lgb_result/********.csv')\n", 28 | "sub1 = pd.DataFrame()\n", 29 | "pred = (lgb_result_27['probability'] + lgb_result_28['probability'] + lgb_result_29['probability'])/3\n", 30 | "sub1['probability'] = pred.round(6)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# xgb" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "xgb_result_27 = pd.read_csv('xgb_result/********.csv')\n", 47 | "xgb_result_28 = pd.read_csv('xgb_result/********.csv')\n", 48 | "xgb_result_29 = pd.read_csv('xgb_result/********.csv')\n", 49 | "sub2 = pd.DataFrame()\n", 50 | "pred = (xgb_result_27['probability'] + xgb_result_28['probability'] + xgb_result_29['probability'])/3\n", 51 | "sub2['probability'] = pred.round(6)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# xDeepFM" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "xDeepFM_result_27 = pd.read_csv('xdeepfm_result/********.csv')\n", 68 | "xDeepFM_result_28 = pd.read_csv('xdeepfm_result/********.csv')\n", 69 | "xDeepFM_result_29 = pd.read_csv('v/********.csv')\n", 70 | "sub3 = pd.DataFrame()\n", 71 | "pred = (xDeepFM_result_27['probability'] + xDeepFM_result_28['probability'] + xDeepFM_result_29['probability'])/3\n", 72 | "sub3['probability'] = pred.round(6)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# 融合(权重需要试)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "sub = pd.DataFrame({'id':lgb_result_27['id']})\n", 89 | "sub['probability'] = (0.4*sub1['probability'] + \n", 90 | " 0.4*sub2['probability'] + \n", 91 | " 0.2*sub3['probability'] )\n", 92 | "\n", 93 | "sub.to_csv('submission.csv',index = False)" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | 
"display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.6.8" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /1完整流程/4模型/1_lgb/lgb_50w.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # -*- coding: utf-8 -*- 4 | 5 | import pandas as pd 6 | import time 7 | import gc 8 | import datetime 9 | from sklearn import preprocessing 10 | import lightgbm as lgb 11 | from sklearn.linear_model import BayesianRidge 12 | from sklearn.model_selection import KFold, StratifiedKFold 13 | import numpy as np 14 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder 15 | from sklearn.metrics import roc_auc_score 16 | from sklearn.cluster import KMeans 17 | import matplotlib.pyplot as mp 18 | import matplotlib.pyplot as plt 19 | import time 20 | import sys 21 | import seaborn as sns 22 | encoder=preprocessing.LabelEncoder() 23 | 24 | 25 | 26 | 27 | input_train = sys.argv[1] 28 | input_test = sys.argv[2] 29 | output_name = sys.argv[3] 30 | 31 | INPUT_TRAIN = 'by_data_50w/' + input_train +'.csv' 32 | INPUT_TEST = 'by_data_50w/' + input_test +'.csv' 33 | OUTPUT_PIC = 'result_50w/' + output_name +'.png' 34 | 35 | 36 | train_df= pd.read_csv(INPUT_TRAIN) 37 | test= pd.read_csv(INPUT_TEST) 38 | 39 | 40 | date = pd.to_datetime(train_df['operTime']) 41 | x=date.dt.hour 42 | train_df['operTime'] = x 43 | 44 | date = pd.to_datetime(test['operTime']) 45 | x=date.dt.hour 46 | test['operTime'] = x 47 | 48 | 49 | # 计数特征 50 | len_train = train_df.shape[0] 51 | data = pd.concat([train_df,test]) 52 | 53 | feature=[ 'slotId','phoneType','adId','city','operTime'] 54 | for f in feature: 55 | count1 = data.groupby([f])['uId'].count().reset_index(name=f+'_count') 56 | data = data.merge(count1,on=f,how='left') 57 | 58 | 59 | data.drop(['uId'],axis = 1,inplace = True) 60 | 61 | train_df = data.iloc[0:len_train,:] 62 | test = data.iloc[len_train:,:] 63 | 64 | #lgb 65 | features = [c for c in train_df.columns if c not in ['label']] 66 | target = train_df['label'] 67 | param = { 68 | 'boosting_type': 'gbdt', 69 | 'objective': 'binary', 70 | 'metric': 'auc', 71 | 'learning_rate': 0.05, 72 | 'num_leaves': 50, 73 | 'max_depth':-1, 74 | 'min_child_samples': 121, 75 | 'max_bin': 15, 76 | # 'subsample': .7, 77 | # 'subsample_freq': 1, 78 | # 'colsample_bytree': 0.7, 79 | # 'min_child_weight': 0, 80 | # 'scale_pos_weight': 0.43, 81 | 'seed': 2019, 82 | 'nthread': 6, 83 | 'verbose': 0, 84 | } 85 | folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=44000) 86 | oof = np.zeros(len(train_df)) 87 | predictions = np.zeros(len(test)) 88 | feature_importance_df = pd.DataFrame() 89 | 90 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)): 91 | print("Fold {}".format(fold_)) 92 | trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx]) 93 | val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx]) 94 | 95 | num_round = 1000000 96 | clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000) 
97 | oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration) 98 | 99 | fold_importance_df = pd.DataFrame() 100 | fold_importance_df["Feature"] = features 101 | fold_importance_df["importance"] = clf.feature_importance() 102 | fold_importance_df["fold"] = fold_ + 1 103 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 104 | 105 | predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits 106 | 107 | print("CV score: {:<8.5f}".format(roc_auc_score(target, oof))) 108 | cv_auc = roc_auc_score(target, oof) 109 | cv_auc = cv_auc.round(6) 110 | 111 | 112 | cols = (feature_importance_df[["Feature", "importance"]] 113 | .groupby("Feature") 114 | .mean() 115 | .sort_values(by="importance", ascending=False)[:150].index) 116 | best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)] 117 | 118 | plt.figure(figsize=(14,28)) 119 | sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False)) 120 | plt.title('Features importance (averaged/folds)') 121 | plt.tight_layout() 122 | plt.savefig(OUTPUT_PIC) 123 | 124 | 125 | 126 | OUTPUT_FILE = 'result_50w/' + output_name +'-'+ str(cv_auc) + '.csv' 127 | sub_df = pd.DataFrame({"id":test["label"].values}) 128 | sub_df["probability"] = predictions.round(6) 129 | sub_df.to_csv(OUTPUT_FILE, index=False) -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/ffm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from src import misc_utils as utils 3 | from tensorflow.python.ops import lookup_ops 4 | from tensorflow.python.layers import core as layers_core 5 | from models.base_model import BaseModel 6 | import numpy as np 7 | import time 8 | import os 9 | class Model(BaseModel): 10 | def __init__(self,hparams): 11 | self.hparams=hparams 12 | if hparams.metric in ['logloss']: 13 | self.best_score=100000 14 | else: 15 | self.best_score=0 16 | self.build_graph(hparams) 17 | self.optimizer(hparams) 18 | params = tf.trainable_variables() 19 | utils.print_out("# Trainable variables") 20 | for param in params: 21 | utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),param.op.device)) 22 | 23 | def set_Session(self,sess): 24 | self.sess=sess 25 | 26 | def build_graph(self, hparams): 27 | initializer = self._get_initializer(hparams) 28 | self.label = tf.placeholder(shape=(None), dtype=tf.float32) 29 | self.features=tf.placeholder(shape=(None,hparams.feature_nums), dtype=tf.int32) 30 | self.emb_v1=tf.get_variable(shape=[hparams.hash_ids,1], 31 | initializer=initializer,name='emb_v1') 32 | self.emb_v2=tf.get_variable(shape=[hparams.hash_ids,hparams.feature_nums,hparams.k], 33 | initializer=initializer,name='emb_v2') 34 | 35 | #lr 36 | emb_inp_v1=tf.gather(self.emb_v1, self.features) 37 | w1=tf.reduce_sum(emb_inp_v1,[-1,-2]) 38 | 39 | 40 | emb_inp_v2=tf.gather(self.emb_v2, self.features) 41 | emb_inp_v2=tf.reduce_sum(emb_inp_v2*tf.transpose(emb_inp_v2,[0,2,1,3]),-1) 42 | temp=[] 43 | for i in range(hparams.feature_nums): 44 | if i!=0: 45 | temp.append(emb_inp_v2[:,i,:i]) 46 | w2=tf.reduce_sum(tf.concat(temp,-1),-1) 47 | 48 | logit=w1+w2 49 | self.prob=tf.sigmoid(logit) 50 | logit_1=tf.log(self.prob+1e-20) 51 | logit_0=tf.log(1-self.prob+1e-20) 52 | self.loss=-tf.reduce_mean(self.label*logit_1+(1-self.label)*logit_0) 53 | 
self.cost=-(self.label*logit_1+(1-self.label)*logit_0) 54 | self.saver= tf.train.Saver() 55 | 56 | def optimizer(self,hparams): 57 | opt=self._build_train_opt(hparams) 58 | params = tf.trainable_variables() 59 | gradients = tf.gradients(self.loss,params,colocate_gradients_with_ops=True) 60 | clipped_grads, gradient_norm = tf.clip_by_global_norm(gradients, 5.0) 61 | self.grad_norm =gradient_norm 62 | self.update = opt.apply_gradients(zip(clipped_grads, params)) 63 | 64 | def train(self,train_data,dev_data=None): 65 | hparams=self.hparams 66 | sess=self.sess 67 | assert len(train_data[0])==len(train_data[1]), "Size of features data must be equal to label" 68 | for epoch in range(hparams.epoch): 69 | info={} 70 | info['loss']=[] 71 | info['norm']=[] 72 | start_time = time.time() 73 | for idx in range(len(train_data[0])//hparams.batch_size+3): 74 | try: 75 | if hparams.steps<=idx: 76 | T=(time.time()-start_time) 77 | self.eval(T,dev_data,hparams,sess) 78 | break 79 | except: 80 | pass 81 | if idx*hparams.batch_size>=len(train_data[0]): 82 | T=(time.time()-start_time) 83 | self.eval(T,dev_data,hparams,sess) 84 | break 85 | 86 | batch=train_data[0][idx*hparams.batch_size:\ 87 | min((idx+1)*hparams.batch_size,len(train_data[0]))] 88 | batch=utils.hash_batch(batch,hparams) 89 | label=train_data[1][idx*hparams.batch_size:\ 90 | min((idx+1)*hparams.batch_size,len(train_data[1]))] 91 | loss,_,norm=sess.run([self.loss,self.update,self.grad_norm],\ 92 | feed_dict={self.features:batch,self.label:label}) 93 | info['loss'].append(loss) 94 | info['norm'].append(norm) 95 | if (idx+1)%hparams.num_display_steps==0: 96 | info['learning_rate']=hparams.learning_rate 97 | info["train_ppl"]= np.mean(info['loss']) 98 | info["avg_grad_norm"]=np.mean(info['norm']) 99 | utils.print_step_info(" ", epoch,idx+1, info) 100 | del info 101 | info={} 102 | info['loss']=[] 103 | info['norm']=[] 104 | if (idx+1)%hparams.num_eval_steps==0 and dev_data: 105 | T=(time.time()-start_time) 106 | self.eval(T,dev_data,hparams,sess) 107 | self.saver.restore(sess,'model_tmp/model') 108 | T=(time.time()-start_time) 109 | self.eval(T,dev_data,hparams,sess) 110 | os.system("rm -r model_tmp") 111 | 112 | 113 | def infer(self,dev_data): 114 | hparams=self.hparams 115 | sess=self.sess 116 | assert len(dev_data[0])==len(dev_data[1]), "Size of features data must be equal to label" 117 | preds=[] 118 | total_loss=[] 119 | for idx in range(len(dev_data[0])//hparams.batch_size+1): 120 | batch=dev_data[0][idx*hparams.batch_size:\ 121 | min((idx+1)*hparams.batch_size,len(dev_data[0]))] 122 | if len(batch)==0: 123 | break 124 | batch=utils.hash_batch(batch,hparams) 125 | label=dev_data[1][idx*hparams.batch_size:\ 126 | min((idx+1)*hparams.batch_size,len(dev_data[1]))] 127 | pred=sess.run(self.prob,\ 128 | feed_dict={self.features:batch,self.label:label}) 129 | preds.append(pred) 130 | preds=np.concatenate(preds) 131 | return preds 132 | 133 | -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/base_model.py: -------------------------------------------------------------------------------- 1 | """define base class model""" 2 | import abc 3 | import math 4 | import tensorflow as tf 5 | from sklearn import metrics 6 | import os 7 | from src import misc_utils as utils 8 | import numpy as np 9 | from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm 10 | __all__ = ["BaseModel"] 11 | 12 | 13 | class BaseModel(object): 14 | def __init__(self, hparams, scope=None): 15 | 
tf.set_random_seed(1234) 16 | 17 | @abc.abstractmethod 18 | def _build_graph(self, hparams): 19 | """Subclass must implement this.""" 20 | pass 21 | 22 | 23 | def _get_initializer(self, hparams): 24 | if hparams.init_method == 'tnormal': 25 | return tf.truncated_normal_initializer(stddev=hparams.init_value) 26 | elif hparams.init_method == 'uniform': 27 | return tf.random_uniform_initializer(-hparams.init_value, hparams.init_value) 28 | elif hparams.init_method == 'normal': 29 | return tf.random_normal_initializer(stddev=hparams.init_value) 30 | elif hparams.init_method == 'xavier_normal': 31 | return tf.contrib.layers.xavier_initializer(uniform=False) 32 | elif hparams.init_method == 'xavier_uniform': 33 | return tf.contrib.layers.xavier_initializer(uniform=True) 34 | elif hparams.init_method == 'he_normal': 35 | return tf.contrib.layers.variance_scaling_initializer( \ 36 | factor=2.0, mode='FAN_AVG', uniform=False) 37 | elif hparams.init_method == 'he_uniform': 38 | return tf.contrib.layers.variance_scaling_initializer( \ 39 | factor=2.0, mode='FAN_AVG', uniform=True) 40 | else: 41 | return tf.truncated_normal_initializer(stddev=hparams.init_value) 42 | 43 | 44 | def _build_train_opt(self, hparams): 45 | def train_opt(hparams): 46 | if hparams.optimizer == 'adadelta': 47 | train_step = tf.train.AdadeltaOptimizer( \ 48 | hparams.learning_rate) 49 | elif hparams.optimizer == 'adagrad': 50 | train_step = tf.train.AdagradOptimizer( \ 51 | hparams.learning_rate) 52 | elif hparams.optimizer == 'sgd': 53 | train_step = tf.train.GradientDescentOptimizer( \ 54 | hparams.learning_rate) 55 | elif hparams.optimizer == 'adam': 56 | train_step = tf.train.AdamOptimizer( \ 57 | hparams.learning_rate) 58 | elif hparams.optimizer == 'ftrl': 59 | train_step = tf.train.FtrlOptimizer( \ 60 | hparams.learning_rate) 61 | elif hparams.optimizer == 'gd': 62 | train_step = tf.train.GradientDescentOptimizer( \ 63 | hparams.learning_rate) 64 | elif hparams.optimizer == 'padagrad': 65 | train_step = tf.train.ProximalAdagradOptimizer( \ 66 | hparams.learning_rate) 67 | elif hparams.optimizer == 'pgd': 68 | train_step = tf.train.ProximalGradientDescentOptimizer( \ 69 | hparams.learning_rate) 70 | elif hparams.optimizer == 'rmsprop': 71 | train_step = tf.train.RMSPropOptimizer( \ 72 | hparams.learning_rate) 73 | else: 74 | train_step = tf.train.GradientDescentOptimizer( \ 75 | hparams.learning_rate) 76 | return train_step 77 | 78 | train_step = train_opt(hparams) 79 | return train_step 80 | 81 | 82 | 83 | def _active_layer(self, logit, scope, activation, layer_idx): 84 | logit = self._activate(logit, activation) 85 | return logit 86 | 87 | def _activate(self, logit, activation): 88 | if activation == 'sigmoid': 89 | return tf.nn.sigmoid(logit) 90 | elif activation == 'softmax': 91 | return tf.nn.softmax(logit) 92 | elif activation == 'relu': 93 | return tf.nn.relu(logit) 94 | elif activation == 'tanh': 95 | return tf.nn.tanh(logit) 96 | elif activation == 'elu': 97 | return tf.nn.elu(logit) 98 | elif activation == 'identity': 99 | return tf.identity(logit) 100 | else: 101 | raise ValueError("this activations not defined {0}".format(activation)) 102 | 103 | def _dropout(self, logit, layer_idx): 104 | logit = tf.nn.dropout(x=logit, keep_prob=self.layer_keeps[layer_idx]) 105 | return logit 106 | 107 | def train(self, sess): 108 | return sess.run([self.update, self.loss, self.data_loss, self.merged], \ 109 | feed_dict={self.layer_keeps: self.keep_prob_train}) 110 | 111 | def eval(self,T,dev_data,hparams,sess): 112 | 
preds=self.infer(dev_data) 113 | if hparams.metric=='logloss': 114 | log_loss=metrics.log_loss(dev_data[1],preds) 115 | if self.best_score>log_loss: 116 | self.best_score=log_loss 117 | try: 118 | os.makedirs('model_tmp/') 119 | except: 120 | pass 121 | self.saver.save(sess,'model_tmp/model') 122 | utils.print_out("# Epcho-time %.2fs Eval logloss %.6f. Best logloss %.6f." \ 123 | %(T,log_loss,self.best_score)) 124 | elif hparams.metric=='auc': 125 | fpr, tpr, thresholds = metrics.roc_curve(dev_data[1]+1, preds, pos_label=2) 126 | auc=metrics.auc(fpr, tpr) 127 | if self.best_score=len(train_data[0]): 92 | T=(time.time()-start_time) 93 | self.eval(T,dev_data,hparams,sess) 94 | break 95 | 96 | batch=train_data[0][idx*hparams.batch_size:\ 97 | min((idx+1)*hparams.batch_size,len(train_data[0]))] 98 | batch=utils.hash_batch(batch,hparams) 99 | label=train_data[1][idx*hparams.batch_size:\ 100 | min((idx+1)*hparams.batch_size,len(train_data[1]))] 101 | loss,_,norm=sess.run([self.loss,self.update,self.grad_norm],\ 102 | feed_dict={self.features:batch,self.label:label}) 103 | info['loss'].append(loss) 104 | info['norm'].append(norm) 105 | if (idx+1)%hparams.num_display_steps==0: 106 | info['learning_rate']=hparams.learning_rate 107 | info["train_ppl"]= np.mean(info['loss']) 108 | info["avg_grad_norm"]=np.mean(info['norm']) 109 | utils.print_step_info(" ", epoch,idx+1, info) 110 | del info 111 | info={} 112 | info['loss']=[] 113 | info['norm']=[] 114 | if (idx+1)%hparams.num_eval_steps==0 and dev_data: 115 | T=(time.time()-start_time) 116 | self.eval(T,dev_data,hparams,sess) 117 | 118 | self.saver.restore(sess,'model_tmp/model') 119 | T=(time.time()-start_time) 120 | self.eval(T,dev_data,hparams,sess) 121 | os.system("rm -r model_tmp") 122 | 123 | 124 | def infer(self,dev_data): 125 | hparams=self.hparams 126 | sess=self.sess 127 | assert len(dev_data[0])==len(dev_data[1]), "Size of features data must be equal to label" 128 | preds=[] 129 | total_loss=[] 130 | for idx in range(len(dev_data[0])//hparams.batch_size+1): 131 | batch=dev_data[0][idx*hparams.batch_size:\ 132 | min((idx+1)*hparams.batch_size,len(dev_data[0]))] 133 | if len(batch)==0: 134 | break 135 | batch=utils.hash_batch(batch,hparams) 136 | label=dev_data[1][idx*hparams.batch_size:\ 137 | min((idx+1)*hparams.batch_size,len(dev_data[1]))] 138 | pred=sess.run(self.prob,\ 139 | feed_dict={self.features:batch,self.label:label}) 140 | preds.append(pred) 141 | preds=np.concatenate(preds) 142 | return preds 143 | 144 | def get_embedding(self,dev_data): 145 | hparams=self.hparams 146 | sess=self.sess 147 | assert len(dev_data[0])==len(dev_data[1]), "Size of features data must be equal to label" 148 | embedding=[] 149 | total_loss=[] 150 | for idx in range(len(dev_data[0])//hparams.batch_size+1): 151 | batch=dev_data[0][idx*hparams.batch_size:\ 152 | min((idx+1)*hparams.batch_size,len(dev_data[0]))] 153 | if len(batch)==0: 154 | break 155 | batch=utils.hash_batch(batch,hparams) 156 | label=dev_data[1][idx*hparams.batch_size:\ 157 | min((idx+1)*hparams.batch_size,len(dev_data[1]))] 158 | temp=sess.run(self.emb_inp_v2,\ 159 | feed_dict={self.features:batch,self.label:label}) 160 | embedding.append(temp) 161 | embedding=np.concatenate(embedding,0) 162 | return embedding 163 | 164 | -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/nffm.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | from src import misc_utils as utils 
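# nffm: raw feature values are hashed to embedding ids, all pairwise field-interaction dot
# products are formed (upper-triangular mask in build_graph), and the flattened interactions
# pass through a small fully-connected network with optional batch norm to produce the
# click probability.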
4 | from tensorflow.python.ops import lookup_ops 5 | from tensorflow.python.layers import core as layers_core 6 | from models.base_model import BaseModel 7 | import numpy as np 8 | import time 9 | import os 10 | class Model(BaseModel): 11 | def __init__(self,hparams): 12 | self.hparams=hparams 13 | if hparams.metric in ['logloss']: 14 | self.best_score=100000 15 | else: 16 | self.best_score=0 17 | self.build_graph(hparams) 18 | self.optimizer(hparams) 19 | params = tf.trainable_variables() 20 | utils.print_out("# Trainable variables") 21 | for param in params: 22 | utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),param.op.device)) 23 | 24 | def set_Session(self,sess): 25 | self.sess=sess 26 | 27 | def build_graph(self, hparams): 28 | initializer = self._get_initializer(hparams) 29 | self.label = tf.placeholder(shape=(None), dtype=tf.float32) 30 | self.use_norm=tf.placeholder(tf.bool) 31 | self.features=tf.placeholder(shape=(None,hparams.feature_nums), dtype=tf.int32) 32 | self.emb_v1=tf.get_variable(shape=[hparams.hash_ids,1], 33 | initializer=initializer,name='emb_v1') 34 | self.emb_v2=tf.get_variable(shape=[hparams.hash_ids,hparams.feature_nums,hparams.k], 35 | initializer=initializer,name='emb_v2') 36 | 37 | #lr 38 | emb_inp_v1=tf.gather(self.emb_v1, self.features) 39 | w1=tf.reduce_sum(emb_inp_v1,[-1,-2]) 40 | 41 | emb_inp_v2=tf.gather(self.emb_v2, self.features) 42 | emb_inp_v2=tf.reduce_sum(emb_inp_v2*tf.transpose(emb_inp_v2,[0,2,1,3]),-1) 43 | 44 | # the for loop creates a copy of tensor emb_inp_v2 in each iteration 45 | # which is memory intensive and not gpu-friendly 46 | 47 | #temp=[] 48 | #for i in range(hparams.feature_nums): 49 | # if i!=0: 50 | # temp.append(emb_inp_v2[:,i,:i]) 51 | #w2=tf.reduce_sum(tf.concat(temp,-1),-1) 52 | #DNN 53 | #dnn_input=tf.concat(temp,-1) 54 | 55 | ones = tf.ones_like(emb_inp_v2) 56 | mask_a = tf.matrix_band_part(ones, 0, -1) # Upper triangular matrix of 0s and 1s 57 | mask_b = tf.matrix_band_part(ones, 0, 0) # Diagonal matrix of 0s and 1s 58 | mask = tf.cast(mask_a - mask_b, dtype=tf.bool) # Make a bool mask 59 | 60 | #DNN 61 | dnn_input = tf.boolean_mask(emb_inp_v2, mask) 62 | dnn_input = tf.reshape(dnn_input,[tf.shape(emb_inp_v2)[0],hparams.feature_nums*(hparams.feature_nums-1)//2]) 63 | 64 | input_size=int(dnn_input.shape[-1]) 65 | for idx in range(len(hparams.hidden_size)): 66 | glorot = np.sqrt(2.0 / (input_size + hparams.hidden_size[idx])) 67 | W = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(input_size, hparams.hidden_size[idx])), dtype=np.float32) 68 | dnn_input=tf.tensordot(dnn_input,W,[[-1],[0]]) 69 | if hparams.norm is True: 70 | dnn_input=self.batch_norm_layer(dnn_input,\ 71 | self.use_norm,'norm_'+str(idx)) 72 | dnn_input=tf.nn.relu(dnn_input) 73 | input_size=hparams.hidden_size[idx] 74 | 75 | glorot = np.sqrt(2.0 / (hparams.hidden_size[-1] + 1)) 76 | W = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(hparams.hidden_size[-1], 1)), dtype=np.float32) 77 | b = tf.Variable(tf.constant(-3.5), dtype=np.float32) 78 | w3=tf.tensordot(dnn_input,W,[[-1],[0]])+b 79 | 80 | 81 | logit=w3[:,0] 82 | self.prob=tf.sigmoid(logit) 83 | logit_1=tf.log(self.prob+1e-20) 84 | logit_0=tf.log(1-self.prob+1e-20) 85 | self.loss=-tf.reduce_mean(self.label*logit_1+(1-self.label)*logit_0) 86 | self.cost=-(self.label*logit_1+(1-self.label)*logit_0) 87 | self.saver= tf.train.Saver() 88 | 89 | def optimizer(self,hparams): 90 | opt=self._build_train_opt(hparams) 91 | params = tf.trainable_variables() 92 | gradients = 
tf.gradients(self.loss,params,colocate_gradients_with_ops=True) 93 | clipped_grads, gradient_norm = tf.clip_by_global_norm(gradients, 5.0) 94 | self.grad_norm =gradient_norm 95 | self.update = opt.apply_gradients(zip(clipped_grads, params)) 96 | 97 | def train(self,train_data,dev_data): 98 | hparams=self.hparams 99 | sess=self.sess 100 | assert len(train_data[0])==len(train_data[1]), "Size of features data must be equal to label" 101 | for epoch in range(hparams.epoch): 102 | info={} 103 | info['loss']=[] 104 | info['norm']=[] 105 | start_time = time.time() 106 | for idx in range(len(train_data[0])//hparams.batch_size+3): 107 | try: 108 | if hparams.steps<=idx: 109 | T=(time.time()-start_time) 110 | self.eval(T,dev_data,hparams,sess) 111 | break 112 | except: 113 | pass 114 | if idx*hparams.batch_size>=len(train_data[0]): 115 | T=(time.time()-start_time) 116 | self.eval(T,dev_data,hparams,sess) 117 | break 118 | 119 | batch=train_data[0][idx*hparams.batch_size:\ 120 | min((idx+1)*hparams.batch_size,len(train_data[0]))] 121 | batch=utils.hash_batch(batch,hparams) 122 | label=train_data[1][idx*hparams.batch_size:\ 123 | min((idx+1)*hparams.batch_size,len(train_data[1]))] 124 | loss,_,norm=sess.run([self.loss,self.update,self.grad_norm],feed_dict=\ 125 | {self.features:batch,self.label:label,self.use_norm:True}) 126 | info['loss'].append(loss) 127 | info['norm'].append(norm) 128 | if (idx+1)%hparams.num_display_steps==0: 129 | info['learning_rate']=hparams.learning_rate 130 | info["train_ppl"]= np.mean(info['loss']) 131 | info["avg_grad_norm"]=np.mean(info['norm']) 132 | utils.print_step_info(" ", epoch,idx+1, info) 133 | del info 134 | info={} 135 | info['loss']=[] 136 | info['norm']=[] 137 | if (idx+1)%hparams.num_eval_steps==0 and dev_data: 138 | T=(time.time()-start_time) 139 | self.eval(T,dev_data,hparams,sess) 140 | self.saver.restore(sess,'model_tmp/model') 141 | T=(time.time()-start_time) 142 | self.eval(T,dev_data,hparams,sess) 143 | os.system("rm -r model_tmp") 144 | 145 | 146 | def infer(self,dev_data): 147 | hparams=self.hparams 148 | sess=self.sess 149 | assert len(dev_data[0])==len(dev_data[1]), "Size of features data must be equal to label" 150 | preds=[] 151 | total_loss=[] 152 | for idx in range(len(dev_data[0])//hparams.batch_size+1): 153 | batch=dev_data[0][idx*hparams.batch_size:\ 154 | min((idx+1)*hparams.batch_size,len(dev_data[0]))] 155 | if len(batch)==0: 156 | break 157 | batch=utils.hash_batch(batch,hparams) 158 | label=dev_data[1][idx*hparams.batch_size:\ 159 | min((idx+1)*hparams.batch_size,len(dev_data[1]))] 160 | pred=sess.run(self.prob,feed_dict=\ 161 | {self.features:batch,self.label:label,self.use_norm:False}) 162 | preds.append(pred) 163 | preds=np.concatenate(preds) 164 | return preds 165 | 166 | -------------------------------------------------------------------------------- /1完整流程/3特征工程/tezhenggongcheng.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # -*- coding: utf-8 -*- 4 | 5 | import pandas as pd 6 | import lightgbm as lgb 7 | import xgboost as xgb 8 | from sklearn.linear_model import BayesianRidge 9 | from sklearn.model_selection import KFold, StratifiedKFold 10 | import numpy as np 11 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder 12 | from sklearn.metrics import roc_auc_score 13 | from sklearn.cluster import KMeans 14 | import matplotlib.pyplot as mp 15 | import matplotlib.pyplot as plt 16 | import time 17 | import seaborn as sns 18 | #from untitled0 import 
MeanEncoder 19 | 20 | 21 | 22 | 23 | train_df= pd.read_csv('F:/train26.csv') 24 | test= pd.read_csv('F:/test.csv') 25 | test['Id']= test['label'] 26 | train_df['Id'] = -1 27 | del test['label'] 28 | data = pd.concat([train_df,test],axis=0,ignore_index=True) 29 | #data = pd.read_csv('F:/tezheng.csv') 30 | def reduce_mem_usage(df): 31 | start_mem = df.memory_usage().sum() / 1024 ** 2 32 | for col in df.columns: 33 | col_type = df[col].dtypes 34 | if col_type != object: 35 | c_min = df[col].min() 36 | c_max = df[col].max() 37 | if str(col_type)[:3] == 'int': 38 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 39 | df[col] = df[col].astype(np.int8) 40 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 41 | df[col] = df[col].astype(np.int16) 42 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 43 | df[col] = df[col].astype(np.int32) 44 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 45 | df[col] = df[col].astype(np.int64) 46 | else: 47 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 48 | df[col] = df[col].astype(np.float16) 49 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 50 | df[col] = df[col].astype(np.float32) 51 | else: 52 | df[col] = df[col].astype(np.float64) 53 | end_mem = df.memory_usage().sum() / 1024 ** 2 54 | print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 55 | 100 * (start_mem - end_mem) / start_mem)) 56 | return df 57 | 58 | data = reduce_mem_usage(data) 59 | 60 | #user_info 相关特征 61 | 62 | #feature=[ 'slotId','adId','siteId','contentId','primId','creativeType','intertype','firstClass','spreadAppId',] 63 | feature=[ 'city','province','phoneType','carrier','age','gender'] 64 | for f in feature: 65 | e = data.groupby([f])['operTime'].count().reset_index(name=f+'_count') 66 | data = data.merge(e,on=f,how='left') 67 | data[f+'_uId_mean'] = data[f+'_count']/data['uId_count'] 68 | data['per_area'] = data['city_count']/data['province_count'] 69 | 70 | #ad_info 相关特征 71 | feature=[ 'adId','billId','primId','creativeType','intertype'] 72 | for f in feature: 73 | e = data.groupby([f])['operTime'].count().reset_index(name=f+'_count') 74 | data = data.merge(e,on=f,how='left') 75 | data[f+'_uId_mean'] = data[f+'_count']/data['uId_count'] 76 | 77 | #content_info 相关特征 78 | feature=['contentId','firstClass','secondClass'] 79 | for f in feature: 80 | e = data.groupby([f])['operTime'].count().reset_index(name=f+'_count') 81 | data = data.merge(e,on=f,how='left') 82 | data[f+'_uId_mean'] = data[f+'_count']/data['uId_count'] 83 | data['per_class'] = data['secondClass_count']/data['firstClass_count'] 84 | import datetime 85 | 86 | feature=[ 'siteId','slotId','netType'] 87 | for f in feature: 88 | e = data.groupby([f])['operTime'].count().reset_index(name=f+'_count') 89 | data = data.merge(e,on=f,how='left') 90 | data[f+'_uId_mean'] = data[f+'_count']/data['uId_count'] 91 | 92 | #del data['netType_count_x'] 93 | data['operTime']=pd.to_datetime(data['operTime']) 94 | data['hour'] = data['operTime'].dt.hour 95 | #data['day'] = data['operTime'].dt.day 96 | #e = data.groupby(['uId'])['day'].nunique().reset_index(name=f+'_count') 97 | #data = data.merge(e,on='uId',how='left') 98 | 99 | #encoder= LabelEncoder().fit(data['day']) 100 | #data['day'] = encoder.transform(data["day"]) 101 | #组合特征 102 | a = data.groupby(['uId'])['hour'].max().reset_index(name='most_time')#各用户最常访问时间段 103 | b = 
data.groupby(['hour'])['uId'].count().reset_index(name='most_time_Id') #最多访问次数的时间段 104 | data = data.merge(a,on='uId',how='left').merge(b,on='hour',how='left') 105 | 106 | feature=[ 'billId','hour'] 107 | for f in feature: 108 | e = data.groupby(['uId'])[f].var().reset_index(name=f+'_var') 109 | data = data.merge(e,on='uId',how='left') 110 | 111 | 112 | #feature = ['adId','contentId'] 113 | #for f in feature: 114 | # e = data.groupby(['uId'])[f].nunique().reset_index(name=f+'_unique') 115 | # data = data.merge(e,on='uId',how='left') 116 | 117 | #相关性可视化 118 | #plt.figure(figsize=(40, 32)) # 指定绘图对象宽度和高度 119 | #colnm = data.columns.tolist() # 列表头 120 | #mcorr = data[colnm].corr(method="spearman") # 相关系数矩阵,即给出了任意两个变量之间的相关系数 121 | #mask = np.zeros_like(mcorr, dtype=np.bool) # 构造与mcorr同维数矩阵 为bool型 122 | #mask[np.triu_indices_from(mask)] = True # 角分线右侧为True 123 | #cmap = sns.diverging_palette(220, 10, as_cmap=True) # 返回matplotlib colormap对象 124 | #g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f') # 热力图(看两两相似度) 125 | #plt.show() 126 | #plt.savefig('F:/2I.fig') 127 | # 0.95041 0.962868 128 | #加上city province 0.95031 0.962869 129 | #按关系表删除most_time——id per_area hour province_count city_ount 0.95021 130 | 131 | #lgb 132 | # 63特征 0.97012 0.95103 0.94994 0.96763 0.96736 0.96781 0.963798 133 | # 50特征 0.97005 0.95101 0.94991 0.96759 0.96736 0.96781 0.963548 134 | #权重 0.963305 135 | #权重 136 | #删掉mean 28 0.94756 0.962897 137 | # 按label相关性筛选 0.97001 0.95089 0.94992 0.96764 0.96735 0.96785 0.963586 138 | # 去掉uid计数 0.96951 0.95032 0.94922 0.96717 0.96693 0.96737 139 | del data['operTime'] 140 | del data['uId'] 141 | 142 | #综合各天相关度特征系数小于10 143 | #del data['billId_count'] 144 | #del data['intertype_count'] 145 | del data['gender_count'] 146 | #del data['netType_count'] 147 | #del data['intertype'] 148 | #del data['creativeType_count'] 149 | #del data['carrier_count'] 150 | #del data['carrier'] 151 | #del data['gender'] 152 | #del data['firstClass_count'] 153 | #del data['netType'] 154 | #del data['secondClass_count'] 155 | #于label相关性太低 156 | #del data['most_time_Id'] 157 | #del data['hour'] 158 | #del data['netType_count'] 159 | #del data['siteId_count'] 160 | #del data['per_area'] 161 | #del data['gender_count'] 162 | #del data['age_count'] 163 | #del data['province_count'] 164 | #del data['carrier_count'] 165 | #del data['siteId'] 166 | #del data['city_count'] 167 | #del data['province'] 168 | #del data['phoneType'] 169 | #del data['slotId'] 170 | #del data['carrier'] 171 | #del data['netType'] 172 | #del data['city'] 173 | 174 | # 分布非常不一致 175 | #del data['hour_var'] 176 | del data['secondClass_count'] 177 | del data['firstClass_count'] 178 | del data['contentId_uId_mean'] #与adid mean相关性高 179 | #del data['contentId_count'] 180 | del data['creativeType_count'] 181 | #del data['primId_uId_mean'] 182 | del data['billId_count'] 183 | #del data['adId_uId_mean'] 184 | #del data['adId_count'] 185 | del data['per_area'] 186 | #del data['spreadAppId'] 187 | del data['secondClass'] 188 | del data['intertype'] 189 | #del data['firstClass'] 190 | #del data['creativeType'] 191 | del data['contentId'] #与contentID count重复率高 192 | #del data['billId'] 193 | #del data['adId'] 194 | del data['uId_count'] 195 | del data['netType_count'] #与nettype重复率高 196 | del data['contentId_count'] #与adid重复率高,分布差 197 | del data['secondClass_uId_mean'] #与firstclassuidmean 重复率高 198 | #lgb 199 | 200 | 201 | data = data.fillna(-1) 202 | train_df = data[data['Id']<0] 203 | test = data[data['Id']>0] 204 | 205 | 
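# Split the merged frame back apart using the Id sentinel set at the top of the script
# (train rows were tagged Id = -1; test rows carry their original ids, assumed positive by
# the `>0` filter) and write the engineered feature tables to disk.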
train_df.to_csv('train.csv',index = False) 206 | test.to_csv('test.csv') 207 | -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/test_26_27_28_50w.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import tensorflow as tf 4 | import ctrNet 5 | from sklearn.model_selection import train_test_split 6 | from src import misc_utils as utils 7 | import os 8 | from sklearn.model_selection import KFold, StratifiedKFold 9 | from sklearn.metrics import roc_auc_score 10 | import sys 11 | 12 | input_data = sys.argv[1] 13 | output_data = sys.argv[2] 14 | 15 | input_path = 'data/' + input_data +'.csv' 16 | output_path = 'result_best/XdeepFM_' + output_data + '_' 17 | 18 | 19 | train_df=pd.read_csv(input_path) 20 | test_df=pd.read_csv('data/uid1_label1_test_data.csv') 21 | 22 | date = pd.to_datetime(train_df['operTime']) 23 | x=date.dt.hour 24 | train_df['operTime'] = x 25 | 26 | date = pd.to_datetime(test_df['operTime']) 27 | x=date.dt.hour 28 | test_df['operTime'] = x 29 | 30 | 31 | 32 | len_train = train_df.shape[0] 33 | data = pd.concat([train_df,test_df]) 34 | 35 | feature=[ 'slotId','phoneType','adId','city','operTime'] 36 | for f in feature: 37 | count1 = data.groupby([f])['uId'].count().reset_index(name=f+'_count') 38 | data = data.merge(count1,on=f,how='left') 39 | 40 | 41 | data.drop(['uId'],axis = 1,inplace = True) 42 | 43 | train_df = data.iloc[0:len_train,:] 44 | test_df = data.iloc[len_train:,:] 45 | 46 | 47 | 48 | sub = pd.DataFrame() 49 | sub['id'] = test_df['label'] 50 | test_df['label'] = -1 51 | 52 | 53 | features=[i for i in train_df.columns if i not in ['label']] 54 | 55 | 56 | folds = StratifiedKFold(n_splits=3, shuffle=False, random_state=44000) 57 | 58 | # oof_FM = np.zeros(len(train_df)) 59 | # preds_FM = np.zeros(len(test_df)) 60 | 61 | # oof_FFM = np.zeros(len(train_df)) 62 | # preds_FFM = np.zeros(len(test_df)) 63 | 64 | # oof_FNFM = np.zeros(len(train_df)) 65 | # preds_NFFM = np.zeros(len(test_df)) 66 | 67 | oof_XDEEPFM = np.zeros(len(train_df)) 68 | preds_XDEEPFM = np.zeros(len(test_df)) 69 | 70 | # train_df, dev_df,_,_ = train_test_split(train_df,train_df,test_size=0.1, random_state=2019) 71 | 72 | 73 | # #FM 74 | # hparam=tf.contrib.training.HParams( 75 | # model='fm', #['fm','ffm','nffm'] 76 | # k=16, 77 | # hash_ids=int(1e5), 78 | # batch_size=1024, 79 | # optimizer="adam", #['adadelta','adagrad','sgd','adam','ftrl','gd','padagrad','pgd','rmsprop'] 80 | # learning_rate=0.0002, 81 | # num_display_steps=1000, 82 | # num_eval_steps=1000, 83 | # # steps=200, 84 | # epoch=10, 85 | # metric='auc', #['auc','logloss'] 86 | # init_method='uniform', #['tnormal','uniform','normal','xavier_normal','xavier_uniform','he_normal','he_uniform'] 87 | # init_value=0.1, 88 | # feature_nums=len(features)) 89 | # utils.print_hparams(hparam) 90 | # os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID' 91 | # os.environ["CUDA_VISIBLE_DEVICES"]='0' 92 | # model=ctrNet.build_model(hparam) 93 | # print("Testing FM....") 94 | # model.train(train_data=(train_df[features],train_df['label']),\ 95 | # dev_data=(dev_df[features],dev_df['label'])) 96 | # # from sklearn import metrics 97 | # preds=model.infer(dev_data=(test_df[features],test_df['label'])) 98 | # # fpr, tpr, thresholds = metrics.roc_curve(test_df['label']+1, preds, pos_label=2) 99 | # # auc=metrics.auc(fpr, tpr) 100 | # # print(preds) 101 | # preds = np.round(preds,6) 102 | # sub['probability'] = preds 103 | # 
sub.to_csv('result_28/submission_'+'FM'+'.csv', index=False) 104 | 105 | 106 | 107 | # print("FM Done....") 108 | 109 | 110 | 111 | 112 | # #FFM 113 | # hparam=tf.contrib.training.HParams( 114 | # model='ffm', #['fm','ffm','nffm'] 115 | # k=16, 116 | # hash_ids=int(1e5), 117 | # batch_size=1024, 118 | # optimizer="adam", #['adadelta','adagrad','sgd','adam','ftrl','gd','padagrad','pgd','rmsprop'] 119 | # learning_rate=0.0002, 120 | # num_display_steps=1000, 121 | # num_eval_steps=1000, 122 | # epoch=10, 123 | # metric='auc', #['auc','logloss'] 124 | # init_method='uniform', #['tnormal','uniform','normal','xavier_normal','xavier_uniform','he_normal','he_uniform'] 125 | # init_value=0.1, 126 | # feature_nums=len(features)) 127 | # utils.print_hparams(hparam) 128 | # os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID' 129 | # os.environ["CUDA_VISIBLE_DEVICES"]='0' 130 | # model=ctrNet.build_model(hparam) 131 | # print("Testing FFM....") 132 | # model.train(train_data=(train_df[features],train_df['label']),\ 133 | # dev_data=(dev_df[features],dev_df['label'])) 134 | # # from sklearn import metrics 135 | # preds=model.infer(dev_data=(test_df[features],test_df['label'])) 136 | # # fpr, tpr, thresholds = metrics.roc_curve(test_df['label']+1, preds, pos_label=2) 137 | # # auc=metrics.auc(fpr, tpr) 138 | # # print(auc) 139 | # preds = np.round(preds,6) 140 | # sub['probability'] = preds 141 | # sub.to_csv('result_28/submission_'+'FFM'+'.csv', index=False) 142 | 143 | 144 | # print("FFM Done....") 145 | 146 | # #NFFM 147 | # hparam=tf.contrib.training.HParams( 148 | # model='nffm', 149 | # norm=True, 150 | # batch_norm_decay=0.9, 151 | # hidden_size=[128,128], 152 | # cross_layer_sizes=[128,128,128], 153 | # k=8, 154 | # hash_ids=int(2e5), 155 | # batch_size=1024, 156 | # optimizer="adam", 157 | # learning_rate=0.001, 158 | # num_display_steps=1000, 159 | # num_eval_steps=1000, 160 | # epoch=10, 161 | # metric='auc', 162 | # activation=['relu','relu','relu'], 163 | # cross_activation='identity', 164 | # init_method='uniform', 165 | # init_value=0.1, 166 | # feature_nums=len(features)) 167 | # utils.print_hparams(hparam) 168 | # os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID' 169 | # os.environ["CUDA_VISIBLE_DEVICES"]='0' 170 | # model=ctrNet.build_model(hparam) 171 | 172 | # print("Testing NFFM....") 173 | 174 | # for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values,train_df['label'].values)): 175 | 176 | # print("Fold {}".format(fold_)) 177 | 178 | # train = train_df.iloc[trn_idx] 179 | # dev = train_df.iloc[val_idx] 180 | 181 | 182 | # model.train(train_data=(train[features],train['label']),dev_data=(dev[features],dev['label'])) 183 | 184 | # oof_FNFM[val_idx] = model.infer(dev_data=(dev[features],dev['label'])) 185 | # preds_NFFM += model.infer(dev_data=(test_df[features],test_df['label']))/ folds.n_splits 186 | 187 | # print("CV score: {:<8.5f}".format(roc_auc_score(train_df['label'], oof_FNFM))) 188 | # cv_auc = roc_auc_score(train_df['label'], oof_FNFM) 189 | # cv_auc = cv_auc.round(6) 190 | 191 | # preds_NFFM = np.round(preds_NFFM,6) 192 | # sub['probability'] = preds_NFFM 193 | # sub.to_csv('result_best/NFFM'+str(cv_auc) + '.csv', index=False) 194 | 195 | 196 | # print("NFFM Done....") 197 | 198 | 199 | 200 | 201 | # #Xdeepfm 202 | hparam=tf.contrib.training.HParams( 203 | model='xdeepfm', 204 | norm=True, 205 | batch_norm_decay=0.9, 206 | hidden_size=[128,128], 207 | cross_layer_sizes=[128,128,128], 208 | k=8, 209 | hash_ids=int(2e5), 210 | batch_size=1024, 211 | optimizer="adam", 
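# Note on the hparams above: hash_ids caps the hashed feature-id space (and therefore the number
# of embedding-table rows, presumably applied in utils.hash_batch), k is the per-field embedding
# size, and hidden_size / cross_layer_sizes configure the DNN and CIN towers in models/xdeepfm.py.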
212 | learning_rate=0.001, 213 | num_display_steps=1000, 214 | num_eval_steps=1000, 215 | epoch=10, 216 | metric='auc', 217 | activation=['relu','relu','relu'], 218 | cross_activation='identity', 219 | init_method='uniform', 220 | init_value=0.1, 221 | feature_nums=len(features)) 222 | utils.print_hparams(hparam) 223 | os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID' 224 | os.environ["CUDA_VISIBLE_DEVICES"]='0' 225 | model=ctrNet.build_model(hparam) 226 | print("Testing XdeepFM....") 227 | 228 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values,train_df['label'].values)): 229 | 230 | print("Fold {}".format(fold_)) 231 | 232 | train = train_df.iloc[trn_idx] 233 | dev = train_df.iloc[val_idx] 234 | 235 | 236 | model.train(train_data=(train[features],train['label']),dev_data=(dev[features],dev['label'])) 237 | 238 | 239 | 240 | oof_XDEEPFM[val_idx] = model.infer(dev_data=(dev[features],dev['label'])) 241 | # from sklearn import metrics 242 | preds_XDEEPFM += model.infer(dev_data=(test_df[features],test_df['label']))/ folds.n_splits 243 | 244 | print("CV score: {:<8.5f}".format(roc_auc_score(train_df['label'], oof_XDEEPFM))) 245 | cv_auc = roc_auc_score(train_df['label'], oof_XDEEPFM) 246 | cv_auc = cv_auc.round(6) 247 | 248 | preds_XDEEPFM = np.round(preds_XDEEPFM,6) 249 | sub['probability'] = preds_XDEEPFM 250 | sub.to_csv(output_path+str(cv_auc) + '.csv', index=False) 251 | 252 | print("XdeepFM Done....") 253 | 254 | -------------------------------------------------------------------------------- /1完整流程/4模型/3_XDeepFM/models/xdeepfm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from src import misc_utils as utils 3 | from tensorflow.python.ops import lookup_ops 4 | from tensorflow.python.layers import core as layers_core 5 | from models.base_model import BaseModel 6 | import numpy as np 7 | import time 8 | import os 9 | class Model(BaseModel): 10 | def __init__(self,hparams): 11 | self.hparams=hparams 12 | if hparams.metric in ['logloss']: 13 | self.best_score=100000 14 | else: 15 | self.best_score=0 16 | self.build_graph(hparams) 17 | self.optimizer(hparams) 18 | params = tf.trainable_variables() 19 | utils.print_out("# Trainable variables") 20 | for param in params: 21 | utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),param.op.device)) 22 | 23 | def set_Session(self,sess): 24 | self.sess=sess 25 | 26 | def build_graph(self, hparams): 27 | self.initializer = self._get_initializer(hparams) 28 | self.label = tf.placeholder(shape=(None), dtype=tf.float32) 29 | self.use_norm=tf.placeholder(tf.bool) 30 | self.features=tf.placeholder(shape=(None,hparams.feature_nums), dtype=tf.int32) 31 | self.emb_v1=tf.get_variable(shape=[hparams.hash_ids,1], 32 | initializer=self.initializer,name='emb_v1') 33 | self.emb_v2=tf.get_variable(shape=[hparams.hash_ids,hparams.k], 34 | initializer=self.initializer,name='emb_v2') 35 | 36 | #lr 37 | emb_inp_v1=tf.gather(self.emb_v1, self.features) 38 | lr_logits=tf.reduce_sum(emb_inp_v1,[-1,-2]) 39 | 40 | emb_inp_v2=tf.gather(self.emb_v2, self.features) 41 | self.emb_inp_v2=emb_inp_v2 42 | #DNN 43 | dnn_input=tf.reshape(emb_inp_v2,[-1,hparams.feature_nums*hparams.k]) 44 | input_size=int(dnn_input.shape[-1]) 45 | for idx in range(len(hparams.hidden_size)): 46 | glorot = np.sqrt(2.0 / (input_size + hparams.hidden_size[idx])) 47 | W = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(input_size, hparams.hidden_size[idx])), dtype=np.float32) 48 | 
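# DNN tower: each iteration applies a linear map (tensordot with the Glorot-style W created
# above), optional batch normalisation when hparams.norm is set, then a ReLU; input_size is
# updated so the next layer's weight matrix gets the matching shape.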
dnn_input=tf.tensordot(dnn_input,W,[[-1],[0]]) 49 | if hparams.norm is True: 50 | dnn_input=self.batch_norm_layer(dnn_input,\ 51 | self.use_norm,'norm_'+str(idx)) 52 | dnn_input=tf.nn.relu(dnn_input) 53 | input_size=hparams.hidden_size[idx] 54 | 55 | glorot = np.sqrt(2.0 / (hparams.hidden_size[-1] + 1)) 56 | W = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(hparams.hidden_size[-1], 1)), dtype=np.float32) 57 | dnn_logits=tf.tensordot(dnn_input,W,[[-1],[0]])[:,0] 58 | 59 | #exFM 60 | exfm_logit=self._build_extreme_FM(hparams, emb_inp_v2, res=False, direct=False, bias=False, reduce_D=False, f_dim=2)[:,0] 61 | 62 | logit=lr_logits+dnn_logits+exfm_logit 63 | self.prob=tf.sigmoid(logit) 64 | logit_1=tf.log(self.prob+1e-20) 65 | logit_0=tf.log(1-self.prob+1e-20) 66 | self.loss=-tf.reduce_mean(self.label*logit_1+(1-self.label)*logit_0) 67 | self.cost=-(self.label*logit_1+(1-self.label)*logit_0) 68 | self.saver= tf.train.Saver() 69 | 70 | def _build_extreme_FM(self, hparams, nn_input, res=False, direct=False, bias=False, reduce_D=False, f_dim=2): 71 | hidden_nn_layers = [] 72 | field_nums = [] 73 | final_len = 0 74 | field_num = hparams.feature_nums 75 | nn_input = tf.reshape(nn_input, shape=[-1, int(field_num), hparams.k]) 76 | field_nums.append(int(field_num)) 77 | hidden_nn_layers.append(nn_input) 78 | final_result = [] 79 | split_tensor0 = tf.split(hidden_nn_layers[0], hparams.k * [1], 2) 80 | with tf.variable_scope("exfm_part", initializer=self.initializer) as scope: 81 | for idx, layer_size in enumerate(hparams.cross_layer_sizes): 82 | split_tensor = tf.split(hidden_nn_layers[-1], hparams.k * [1], 2) 83 | dot_result_m = tf.matmul(split_tensor0, split_tensor, transpose_b=True) 84 | dot_result_o = tf.reshape(dot_result_m, shape=[hparams.k, -1, field_nums[0]*field_nums[-1]]) 85 | dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2]) 86 | 87 | if reduce_D: 88 | filters0 = tf.get_variable("f0_" + str(idx), 89 | shape=[1, layer_size, field_nums[0], f_dim], 90 | dtype=tf.float32) 91 | filters_ = tf.get_variable("f__" + str(idx), 92 | shape=[1, layer_size, f_dim, field_nums[-1]], 93 | dtype=tf.float32) 94 | filters_m = tf.matmul(filters0, filters_) 95 | filters_o = tf.reshape(filters_m, shape=[1, layer_size, field_nums[0] * field_nums[-1]]) 96 | filters = tf.transpose(filters_o, perm=[0, 2, 1]) 97 | else: 98 | filters = tf.get_variable(name="f_"+str(idx), 99 | shape=[1, field_nums[-1]*field_nums[0], layer_size], 100 | dtype=tf.float32) 101 | # dot_result = tf.transpose(dot_result, perm=[0, 2, 1]) 102 | curr_out = tf.nn.conv1d(dot_result, filters=filters, stride=1, padding='VALID') 103 | 104 | # BIAS ADD 105 | if bias: 106 | b = tf.get_variable(name="f_b" + str(idx), 107 | shape=[layer_size], 108 | dtype=tf.float32, 109 | initializer=tf.zeros_initializer()) 110 | curr_out = tf.nn.bias_add(curr_out, b) 111 | 112 | curr_out = self._activate(curr_out, hparams.cross_activation) 113 | 114 | curr_out = tf.transpose(curr_out, perm=[0, 2, 1]) 115 | 116 | if direct: 117 | 118 | direct_connect = curr_out 119 | next_hidden = curr_out 120 | final_len += layer_size 121 | field_nums.append(int(layer_size)) 122 | 123 | else: 124 | if idx != len(hparams.cross_layer_sizes) - 1: 125 | next_hidden, direct_connect = tf.split(curr_out, 2 * [int(layer_size / 2)], 1) 126 | final_len += int(layer_size / 2) 127 | else: 128 | direct_connect = curr_out 129 | next_hidden = 0 130 | final_len += layer_size 131 | field_nums.append(int(layer_size / 2)) 132 | 133 | final_result.append(direct_connect) 134 | 
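# CIN bookkeeping: the direct_connect half of each cross layer is collected in final_result for
# the final concat and sum-pooling, while next_hidden (the remaining half, or 0 at the last
# layer) is appended below and becomes the input of the next cross layer.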
hidden_nn_layers.append(next_hidden) 135 | 136 | 137 | result = tf.concat(final_result, axis=1) 138 | 139 | result = tf.reduce_sum(result, -1) 140 | if res: 141 | w_nn_output1 = tf.get_variable(name='w_nn_output1', 142 | shape=[final_len, 128], 143 | dtype=tf.float32) 144 | b_nn_output1 = tf.get_variable(name='b_nn_output1', 145 | shape=[128], 146 | dtype=tf.float32, 147 | initializer=tf.zeros_initializer()) 148 | self.layer_params.append(w_nn_output1) 149 | self.layer_params.append(b_nn_output1) 150 | exFM_out0 = tf.nn.xw_plus_b(result, w_nn_output1, b_nn_output1) 151 | exFM_out1 = self._active_layer(logit=exFM_out0, 152 | scope=scope, 153 | activation="relu", 154 | layer_idx=0) 155 | w_nn_output2 = tf.get_variable(name='w_nn_output2', 156 | shape=[128 + final_len, 1], 157 | dtype=tf.float32) 158 | b_nn_output2 = tf.get_variable(name='b_nn_output2', 159 | shape=[1], 160 | dtype=tf.float32, 161 | initializer=tf.zeros_initializer()) 162 | self.layer_params.append(w_nn_output2) 163 | self.layer_params.append(b_nn_output2) 164 | exFM_in = tf.concat([exFM_out1, result], axis=1, name="user_emb") 165 | exFM_out = tf.nn.xw_plus_b(exFM_in, w_nn_output2, b_nn_output2) 166 | 167 | else: 168 | w_nn_output = tf.get_variable(name='w_nn_output', 169 | shape=[final_len, 1], 170 | dtype=tf.float32) 171 | b_nn_output = tf.get_variable(name='b_nn_output', 172 | shape=[1], 173 | dtype=tf.float32, 174 | initializer=tf.zeros_initializer()) 175 | exFM_out = tf.nn.xw_plus_b(result, w_nn_output, b_nn_output) 176 | 177 | return exFM_out 178 | 179 | 180 | def optimizer(self,hparams): 181 | opt=self._build_train_opt(hparams) 182 | params = tf.trainable_variables() 183 | gradients = tf.gradients(self.loss,params,colocate_gradients_with_ops=True) 184 | clipped_grads, gradient_norm = tf.clip_by_global_norm(gradients, 5.0) 185 | self.grad_norm =gradient_norm 186 | self.update = opt.apply_gradients(zip(clipped_grads, params)) 187 | 188 | def train(self,train_data,dev_data): 189 | hparams=self.hparams 190 | sess=self.sess 191 | assert len(train_data[0])==len(train_data[1]), "Size of features data must be equal to label" 192 | for epoch in range(hparams.epoch): 193 | info={} 194 | info['loss']=[] 195 | info['norm']=[] 196 | start_time = time.time() 197 | for idx in range(len(train_data[0])//hparams.batch_size+3): 198 | try: 199 | if hparams.steps<=idx: 200 | T=(time.time()-start_time) 201 | self.eval(T,dev_data,hparams,sess) 202 | break 203 | except: 204 | pass 205 | if idx*hparams.batch_size>=len(train_data[0]): 206 | T=(time.time()-start_time) 207 | self.eval(T,dev_data,hparams,sess) 208 | break 209 | 210 | batch=train_data[0][idx*hparams.batch_size:\ 211 | min((idx+1)*hparams.batch_size,len(train_data[0]))] 212 | batch=utils.hash_batch(batch,hparams) 213 | label=train_data[1][idx*hparams.batch_size:\ 214 | min((idx+1)*hparams.batch_size,len(train_data[1]))] 215 | loss,_,norm=sess.run([self.loss,self.update,self.grad_norm],feed_dict=\ 216 | {self.features:batch,self.label:label,self.use_norm:True}) 217 | info['loss'].append(loss) 218 | info['norm'].append(norm) 219 | if (idx+1)%hparams.num_display_steps==0: 220 | info['learning_rate']=hparams.learning_rate 221 | info["train_ppl"]= np.mean(info['loss']) 222 | info["avg_grad_norm"]=np.mean(info['norm']) 223 | utils.print_step_info(" ", epoch,idx+1, info) 224 | del info 225 | info={} 226 | info['loss']=[] 227 | info['norm']=[] 228 | if (idx+1)%hparams.num_eval_steps==0 and dev_data: 229 | T=(time.time()-start_time) 230 | self.eval(T,dev_data,hparams,sess) 231 | 
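# Presumably self.eval() (inherited from BaseModel) checkpoints the best-scoring weights to
# model_tmp/; the restore below reloads that best checkpoint for a final evaluation.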
self.saver.restore(sess,'model_tmp/model') 232 | T=(time.time()-start_time) 233 | self.eval(T,dev_data,hparams,sess) 234 | # os.system("rm -r model_tmp") 235 | 236 | 237 | def infer(self,dev_data): 238 | hparams=self.hparams 239 | sess=self.sess 240 | assert len(dev_data[0])==len(dev_data[1]), "Size of features data must be equal to label" 241 | preds=[] 242 | total_loss=[] 243 | for idx in range(len(dev_data[0])//hparams.batch_size+1): 244 | batch=dev_data[0][idx*hparams.batch_size:\ 245 | min((idx+1)*hparams.batch_size,len(dev_data[0]))] 246 | if len(batch)==0: 247 | break 248 | batch=utils.hash_batch(batch,hparams) 249 | label=dev_data[1][idx*hparams.batch_size:\ 250 | min((idx+1)*hparams.batch_size,len(dev_data[1]))] 251 | pred=sess.run(self.prob,feed_dict=\ 252 | {self.features:batch,self.label:label,self.use_norm:False}) 253 | preds.append(pred) 254 | preds=np.concatenate(preds) 255 | return preds 256 | def get_embedding(self,dev_data): 257 | hparams=self.hparams 258 | sess=self.sess 259 | assert len(dev_data[0])==len(dev_data[1]), "Size of features data must be equal to label" 260 | embedding=[] 261 | total_loss=[] 262 | for idx in range(len(dev_data[0])//hparams.batch_size+1): 263 | batch=dev_data[0][idx*hparams.batch_size:\ 264 | min((idx+1)*hparams.batch_size,len(dev_data[0]))] 265 | if len(batch)==0: 266 | break 267 | batch=utils.hash_batch(batch,hparams) 268 | label=dev_data[1][idx*hparams.batch_size:\ 269 | min((idx+1)*hparams.batch_size,len(dev_data[1]))] 270 | temp=sess.run(self.emb_inp_v2,\ 271 | feed_dict={self.features:batch,self.label:label}) 272 | embedding.append(temp) 273 | embedding=np.concatenate(embedding,0) 274 | return embedding 275 | -------------------------------------------------------------------------------- /1完整流程/1数据预处理_采样_匹配/1.EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import sklearn\n", 12 | "from sklearn.neighbors import KNeighborsClassifier\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.metrics import accuracy_score\n", 15 | "import gc" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "# 读取数据" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "train = pd.read_csv(r'data/train_20190518.csv',header=None, names = ['label', 'uId', 'adId', 'operTime', 'siteId', 'slotId', 'contentId', 'netType'])\n", 32 | "test = pd.read_csv(r'data/test_20190518.csv',header=None, names = ['label', 'uId', 'adId', 'operTime', 'siteId', 'slotId', 'contentId', 'netType'] )\n", 33 | "user_info = pd.read_csv(r'data/clean_user_info.csv',header=None, names =['uId', 'age', 'gender', 'city', 'province', 'phoneType', 'carrier'])\n", 34 | "ad_info = pd.read_csv(r'data/clean_ad_info.csv',header=None, names =['adId', 'billId', 'primId','creativeType', 'intertype', 'spreadAppId'])\n", 35 | "content_info = pd.read_csv(r'data/clean_content_info.csv',header=None, names =['contentId', 'firstClass', 'secondClass'])" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "print('train size:',train.shape)\n", 45 | "print('test size:',test.shape)\n", 46 | "print('user_info 
size:',user_info.shape)\n", 47 | "print('ad_info size:',ad_info.shape)\n", 48 | "print('content_info size:',content_info.shape)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# 处理user_info数据\n", 56 | "\n", 57 | "user_info包含很多缺失值,经过观察,只有phoneType这个特征,只有5个缺失值,并且其总共有512个种类,根据常识,使用同种手机的人群特征也是比较类似的,所以根据每一类手机将用户分类,用此类手机中其他特征的众数,来填充此类用户中的缺失值,填充的原则就是取出现次数最多的元素进行填充\n", 58 | "\n", 59 | "如果某一类手机的某个特征中,全部都是缺失值,则使用所有数据中出现次数最多的元素进行填充" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "phone_type = user_info['phoneType'].unique().tolist()\n", 69 | "\n", 70 | "# 计算每个特征中,出现次数最多的那个元素\n", 71 | "max_total_age = pd.value_counts(user_info['age'])\n", 72 | "max_total_age = max_total_age.index[0]\n", 73 | "\n", 74 | "max_total_gender = pd.value_counts(user_info['gender'])\n", 75 | "max_total_gender = max_total_gender.index[0]\n", 76 | "\n", 77 | "max_total_city = pd.value_counts(user_info['city'])\n", 78 | "max_total_city = max_total_city.index[0]\n", 79 | "\n", 80 | "max_total_province = pd.value_counts(user_info['province'])\n", 81 | "max_total_province = max_total_province.index[0]\n", 82 | "\n", 83 | "max_total_phoneType = pd.value_counts(user_info['phoneType'])\n", 84 | "max_total_phoneType = max_total_phoneType.index[0]\n", 85 | "\n", 86 | "max_total_carrier = pd.value_counts(user_info['carrier'])\n", 87 | "max_total_carrier = max_total_carrier.index[0]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "number of 1\n", 100 | "number of 2\n", 101 | "number of 3\n", 102 | "number of 4\n", 103 | "number of 5\n", 104 | "number of 6\n", 105 | "number of 7\n", 106 | "number of 8\n", 107 | "number of 9\n", 108 | "number of 10\n", 109 | "number of 11\n", 110 | "number of 12\n", 111 | "number of 13\n", 112 | "number of 14\n", 113 | "number of 15\n", 114 | "number of 16\n", 115 | "number of 17\n", 116 | "number of 18\n", 117 | "number of 19\n", 118 | "number of 20\n", 119 | "number of 21\n", 120 | "number of 22\n", 121 | "number of 23\n", 122 | "number of 24\n", 123 | "number of 25\n", 124 | "number of 26\n", 125 | "number of 27\n", 126 | "number of 28\n", 127 | "number of 29\n", 128 | "number of 30\n", 129 | "number of 31\n", 130 | "number of 32\n", 131 | "number of 33\n", 132 | "number of 34\n", 133 | "number of 35\n", 134 | "number of 36\n", 135 | "number of 37\n", 136 | "number of 38\n", 137 | "number of 39\n", 138 | "number of 40\n", 139 | "number of 41\n", 140 | "number of 42\n", 141 | "number of 43\n", 142 | "number of 44\n", 143 | "number of 45\n", 144 | "number of 46\n", 145 | "number of 47\n", 146 | "number of 48\n", 147 | "number of 49\n", 148 | "number of 50\n", 149 | "number of 51\n", 150 | "number of 52\n", 151 | "number of 53\n", 152 | "number of 54\n", 153 | "number of 55\n", 154 | "number of 56\n", 155 | "number of 57\n", 156 | "number of 58\n", 157 | "number of 59\n", 158 | "number of 60\n", 159 | "number of 61\n", 160 | "number of 62\n", 161 | "number of 63\n", 162 | "number of 64\n", 163 | "number of 65\n", 164 | "number of 66\n", 165 | "number of 67\n", 166 | "number of 68\n", 167 | "number of 69\n", 168 | "number of 70\n", 169 | "number of 71\n", 170 | "number of 72\n", 171 | "number of 73\n", 172 | "number of 74\n", 173 | "number of 75\n", 174 | "number of 76\n", 175 | "number of 77\n", 176 | "number of 78\n", 
177 | "number of 79\n", 178 | "number of 80\n", 179 | "number of 81\n", 180 | "number of 82\n", 181 | "number of 83\n", 182 | "number of 84\n", 183 | "number of 85\n", 184 | "number of 86\n", 185 | "number of 87\n", 186 | "number of 88\n", 187 | "number of 89\n", 188 | "number of 90\n", 189 | "number of 91\n", 190 | "number of 92\n", 191 | "number of 93\n", 192 | "number of 94\n", 193 | "number of 95\n", 194 | "number of 96\n", 195 | "number of 97\n", 196 | "number of 98\n", 197 | "number of 99\n", 198 | "number of 100\n", 199 | "number of 101\n", 200 | "number of 102\n", 201 | "number of 103\n", 202 | "number of 104\n", 203 | "number of 105\n", 204 | "number of 106\n", 205 | "number of 107\n", 206 | "number of 108\n", 207 | "number of 109\n", 208 | "number of 110\n", 209 | "number of 111\n", 210 | "number of 112\n", 211 | "number of 113\n", 212 | "number of 114\n", 213 | "number of 115\n", 214 | "number of 116\n", 215 | "number of 117\n", 216 | "number of 118\n", 217 | "number of 119\n", 218 | "number of 120\n", 219 | "number of 121\n", 220 | "number of 122\n", 221 | "number of 123\n", 222 | "number of 124\n", 223 | "number of 125\n", 224 | "number of 126\n", 225 | "number of 127\n", 226 | "number of 128\n", 227 | "number of 129\n", 228 | "number of 130\n", 229 | "number of 131\n", 230 | "number of 132\n", 231 | "number of 133\n", 232 | "number of 134\n", 233 | "number of 135\n", 234 | "number of 136\n", 235 | "number of 137\n", 236 | "number of 138\n", 237 | "number of 139\n", 238 | "number of 140\n", 239 | "number of 141\n", 240 | "number of 142\n", 241 | "number of 143\n", 242 | "number of 144\n", 243 | "number of 145\n", 244 | "number of 146\n", 245 | "number of 147\n", 246 | "number of 148\n", 247 | "number of 149\n", 248 | "number of 150\n", 249 | "number of 151\n", 250 | "number of 152\n", 251 | "number of 153\n", 252 | "number of 154\n", 253 | "number of 155\n", 254 | "number of 156\n", 255 | "number of 157\n", 256 | "number of 158\n", 257 | "number of 159\n", 258 | "number of 160\n", 259 | "number of 161\n", 260 | "number of 162\n", 261 | "number of 163\n", 262 | "number of 164\n", 263 | "number of 165\n", 264 | "number of 166\n", 265 | "number of 167\n", 266 | "number of 168\n", 267 | "number of 169\n", 268 | "number of 170\n", 269 | "number of 171\n", 270 | "number of 172\n", 271 | "number of 173\n", 272 | "number of 174\n", 273 | "number of 175\n", 274 | "number of 176\n", 275 | "number of 177\n", 276 | "number of 178\n", 277 | "number of 179\n", 278 | "number of 180\n", 279 | "number of 181\n", 280 | "number of 182\n", 281 | "number of 183\n", 282 | "number of 184\n", 283 | "number of 185\n", 284 | "number of 186\n", 285 | "number of 187\n", 286 | "number of 188\n", 287 | "number of 189\n", 288 | "number of 190\n", 289 | "number of 191\n", 290 | "number of 192\n", 291 | "number of 193\n", 292 | "number of 194\n", 293 | "number of 195\n", 294 | "number of 196\n", 295 | "number of 197\n", 296 | "number of 198\n", 297 | "number of 199\n", 298 | "number of 200\n", 299 | "number of 201\n", 300 | "number of 202\n", 301 | "number of 203\n", 302 | "number of 204\n", 303 | "number of 205\n", 304 | "number of 206\n", 305 | "number of 207\n", 306 | "number of 208\n", 307 | "number of 209\n", 308 | "number of 210\n", 309 | "number of 211\n", 310 | "number of 212\n", 311 | "number of 213\n", 312 | "number of 214\n", 313 | "number of 215\n", 314 | "number of 216\n", 315 | "number of 217\n", 316 | "number of 218\n", 317 | "number of 219\n", 318 | "number of 220\n", 319 | "number of 221\n", 
320 | "number of 222\n", 321 | "number of 223\n", 322 | "number of 224\n", 323 | "number of 225\n", 324 | "number of 226\n", 325 | "number of 227\n", 326 | "number of 228\n", 327 | "number of 229\n", 328 | "number of 230\n", 329 | "number of 231\n", 330 | "number of 232\n", 331 | "number of 233\n", 332 | "number of 234\n", 333 | "number of 235\n", 334 | "number of 236\n", 335 | "number of 237\n", 336 | "number of 238\n", 337 | "number of 239\n", 338 | "number of 240\n", 339 | "number of 241\n", 340 | "number of 242\n", 341 | "number of 243\n", 342 | "number of 244\n", 343 | "number of 245\n", 344 | "number of 246\n", 345 | "number of 247\n", 346 | "number of 248\n", 347 | "number of 249\n", 348 | "number of 250\n", 349 | "number of 251\n", 350 | "number of 252\n", 351 | "number of 253\n", 352 | "number of 254\n", 353 | "number of 255\n", 354 | "number of 256\n", 355 | "number of 257\n", 356 | "number of 258\n", 357 | "number of 259\n", 358 | "number of 260\n", 359 | "number of 261\n", 360 | "number of 262\n", 361 | "number of 263\n", 362 | "number of 264\n", 363 | "number of 265\n", 364 | "number of 266\n", 365 | "number of 267\n", 366 | "number of 268\n", 367 | "number of 269\n", 368 | "number of 270\n", 369 | "number of 271\n", 370 | "number of 272\n", 371 | "number of 273\n", 372 | "number of 274\n", 373 | "number of 275\n", 374 | "number of 276\n", 375 | "number of 277\n", 376 | "number of 278\n", 377 | "number of 279\n", 378 | "number of 280\n", 379 | "number of 281\n", 380 | "number of 282\n", 381 | "number of 283\n", 382 | "number of 284\n", 383 | "number of 285\n", 384 | "number of 286\n", 385 | "number of 287\n", 386 | "number of 288\n", 387 | "number of 289\n", 388 | "number of 290\n", 389 | "number of 291\n", 390 | "number of 292\n", 391 | "number of 293\n", 392 | "number of 294\n", 393 | "number of 295\n", 394 | "number of 296\n", 395 | "number of 297\n", 396 | "number of 298\n", 397 | "number of 299\n", 398 | "number of 300\n", 399 | "number of 301\n", 400 | "number of 302\n", 401 | "number of 303\n", 402 | "number of 304\n", 403 | "number of 305\n", 404 | "number of 306\n", 405 | "number of 307\n", 406 | "number of 308\n", 407 | "number of 309\n", 408 | "number of 310\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "# 先将phoneType为缺失值的那5个数据用所有数据出现次数最多的元素继续宁填充\n", 414 | "tem_int = user_info[user_info['phoneType'].isnull()]['age'].fillna(max_total_age)\n", 415 | "user_info.loc[user_info['phoneType'].isnull(),'age'] = tem_int\n", 416 | "\n", 417 | "tem_int = user_info[user_info['phoneType'].isnull()]['gender'].fillna(max_total_gender)\n", 418 | "user_info.loc[user_info['phoneType'].isnull(),'gender'] = tem_int\n", 419 | "\n", 420 | "tem_int = user_info[user_info['phoneType'].isnull()]['province'].fillna(max_total_province)\n", 421 | "user_info.loc[user_info['phoneType'].isnull(),'province'] = tem_int\n", 422 | "\n", 423 | "tem_int = user_info[user_info['phoneType'].isnull()]['carrier'].fillna(max_total_carrier)\n", 424 | "user_info.loc[user_info['phoneType'].isnull(),'carrier'] = tem_int\n", 425 | "\n", 426 | "tem_int = user_info[user_info['phoneType'].isnull()]['phoneType'].fillna(max_total_phoneType)\n", 427 | "user_info.loc[user_info['phoneType'].isnull(),'phoneType'] = tem_int\n", 428 | "\n", 429 | "# 用手机类型进行分类,分别处理每个批量的数据\n", 430 | "num = 1\n", 431 | "for i in phone_type:\n", 432 | " data = user_info[user_info['phoneType'] == i]\n", 433 | " if (data['age'].isnull().sum() == len(data) or \n", 434 | " data['gender'].isnull().sum() == len(data) or\n", 435 | " 
data['city'].isnull().sum() == len(data) or\n", 436 | " data['province'].isnull().sum() == len(data) or\n", 437 | " data['carrier'].isnull().sum() == len(data)):\n", 438 | " \n", 439 | " tem_int = user_info[user_info['phoneType'] == i]['age'].fillna(max_total_age)\n", 440 | " user_info.loc[user_info['phoneType'] == i,'age'] = tem_int\n", 441 | " \n", 442 | " tem_int = user_info[user_info['phoneType'] == i]['gender'].fillna(max_total_gender)\n", 443 | " user_info.loc[user_info['phoneType'] == i,'gender'] = tem_int\n", 444 | " \n", 445 | " tem_int = user_info[user_info['phoneType'] == i]['city'].fillna(max_total_city)\n", 446 | " user_info.loc[user_info['phoneType'] == i,'city'] = tem_int\n", 447 | " \n", 448 | " tem_int = user_info[user_info['phoneType'] == i]['province'].fillna(max_total_province)\n", 449 | " user_info.loc[user_info['phoneType'] == i,'province'] = tem_int\n", 450 | " \n", 451 | " tem_int = user_info[user_info['phoneType'] == i]['carrier'].fillna(max_total_carrier)\n", 452 | " user_info.loc[user_info['phoneType'] == i,'carrier'] = tem_int\n", 453 | " \n", 454 | " else:\n", 455 | " max_age = pd.value_counts(data['age'])\n", 456 | " max_age = max_age.index[0]\n", 457 | "\n", 458 | " max_gender = pd.value_counts(data['gender'])\n", 459 | " max_gender = max_gender.index[0]\n", 460 | "\n", 461 | " max_city = pd.value_counts(data['city'])\n", 462 | " max_city = max_city.index[0]\n", 463 | "\n", 464 | " max_province = pd.value_counts(data['province'])\n", 465 | " max_province = max_province.index[0]\n", 466 | "\n", 467 | " max_carrier = pd.value_counts(data['carrier'])\n", 468 | " max_carrier = max_carrier.index[0]\n", 469 | " \n", 470 | " tem_int = user_info[user_info['phoneType'] == i]['age'].fillna(max_age)\n", 471 | " user_info.loc[user_info['phoneType'] == i,'age'] = tem_int\n", 472 | "\n", 473 | " tem_int = user_info[user_info['phoneType'] == i]['gender'].fillna(max_gender)\n", 474 | " user_info.loc[user_info['phoneType'] == i,'gender'] = tem_int\n", 475 | "\n", 476 | " tem_int = user_info[user_info['phoneType'] == i]['city'].fillna(max_city)\n", 477 | " user_info.loc[user_info['phoneType'] == i,'city'] = tem_int\n", 478 | "\n", 479 | " tem_int = user_info[user_info['phoneType'] == i]['province'].fillna(max_province)\n", 480 | " user_info.loc[user_info['phoneType'] == i,'province'] = tem_int\n", 481 | "\n", 482 | " tem_int = user_info[user_info['phoneType'] == i]['carrier'].fillna(max_carrier)\n", 483 | " user_info.loc[user_info['phoneType'] == i,'carrier'] = tem_int\n", 484 | " \n", 485 | "# 监控处理过程\n", 486 | " print('number of',num)\n", 487 | " num += 1\n" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "user_info.isnull().any()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "user_info.to_csv('data/clean_user_info.csv',index = False)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": {}, 511 | "source": [ 512 | "# 处理ad_info数据\n", 513 | "\n", 514 | "1.付费方式"billId"需要数值化\n", 515 | "\n", 516 | "2.spreadAppId(广告对应的appId)这个特征就有缺失值\n", 517 | "\n", 518 | "解决方案:将spreadAppId作为标签,用前五个特征进行训练,使用KNN算法,预测无标签数据的标签,在验证数据集上,得到了79%的准确率" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "# 字符数据离散化 \n", 528 | "labels_3 = ad_info['billId'].unique().tolist()\n", 529 | "q = 
1\n", 530 | "for i in labels_3:\n", 531 | " ad_info.loc[ad_info['billId'] ==i,'billId'] = q\n", 532 | " q += 1" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "ad_info[ad_info['spreadAppId'].isnull()]['primId'].unique()" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "X_data = ad_info[ad_info['spreadAppId'].notnull()]\n", 551 | "y = X_data['spreadAppId']\n", 552 | "X = X_data.drop(['spreadAppId'],axis = 1)\n", 553 | "\n", 554 | "X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.2)\n", 555 | "knn=KNeighborsClassifier(n_neighbors=2,weights='distance')\n", 556 | "knn.fit(X_train,Y_train)\n", 557 | "y_pred = knn.predict(X_test)\n", 558 | "print('accuracy of KNN',accuracy_score(Y_test, y_pred))\n", 559 | "\n", 560 | "null_data = ad_info[ad_info['spreadAppId'].isnull()]\n", 561 | "null_data = null_data.drop(['spreadAppId'],axis = 1)\n", 562 | "y_pred_null = knn.predict(null_data)\n", 563 | "null_data['spreadAppId'] = y_pred_null" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "ad_info = pd.concat([X_data,null_data])\n", 573 | "cad_info = clean_ad_info.sort_index()\n", 574 | "ad_info.to_csv('data/clean_ad_info.csv',index = False)" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "# 处理content_info数据\n", 582 | "\n", 583 | "对于content info文件,特征secondClass是对firstClass的一个更详细的分类。对于每一个firstClass,同一个firstClass下对应的secondClass是固定且唯一的。并且文件中只有secondClass存在缺失值,因此先将数据根据firstClass进行分类,再在每一个小类中,用seccondClass出现次数最多的元素填充缺失值\n", 584 | "\n", 585 | "再将字符串特征转换成数值特征(从1开始)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "first_class = content_info['firstClass'].unique().tolist()\n", 595 | "\n", 596 | "for i in first_class:\n", 597 | " data = content_info[content_info['firstClass'] == i] \n", 598 | " \n", 599 | " if (data['secondClass'].isnull().sum() == len(data)): \n", 600 | " content_info.loc[content_info['firstClass'] == i,'secondClass'] = i\n", 601 | " \n", 602 | " else:\n", 603 | " max_secondclass = pd.value_counts(data['secondClass'])\n", 604 | " max_secondclass = max_secondclass.index[0]\n", 605 | "\n", 606 | " tem_int = content_info[content_info['firstClass'] == i]['secondClass'].fillna(max_secondclass)\n", 607 | " content_info.loc[content_info['firstClass'] == i,'secondClass'] = tem_int\n", 608 | " \n", 609 | " \n", 610 | "content_info[\"firstClass\"] = pd.factorize(content_info[\"firstClass\"])[0].astype(np.uint64)\n", 611 | "content_info[\"secondClass\"] = pd.factorize(content_info[\"secondClass\"])[0].astype(np.uint64)\n", 612 | "\n", 613 | "content_info.loc[content_info['firstClass'] ==0,'firstClass'] = 23\n", 614 | "content_info.loc[content_info['secondClass'] ==0,'secondClass'] = 89" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": {}, 621 | "outputs": [], 622 | "source": [ 623 | "content_info.to_csv('clean_contentId_info.csv',index = False)" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": {}, 629 | "source": [ 630 | "# 查看训练数据与测试数据\n", 631 | "\n", 632 | "1.训练数据中无缺失值,无需处理\n", 633 | "\n", 634 | "2.测试数据中,"contentId"存在缺失值,数目并不多,采用众数填充" 635 | ] 636 | }, 637 | { 638 
| "cell_type": "code", 639 | "execution_count": null, 640 | "metadata": {}, 641 | "outputs": [], 642 | "source": [ 643 | "train.isnull().any()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "test.isnull().any()" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 92, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "max_contendId = pd.value_counts(test['contentId'])\n", 662 | "max_contendId = max_contendId.index[0]\n", 663 | "\n", 664 | "tem_int = test[test['contentId'].isnull()]['contentId'].fillna(max_contendId)\n", 665 | "test.loc[test['contentId'].isnull(),'contentId'] = tem_int" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 93, 671 | "metadata": {}, 672 | "outputs": [ 673 | { 674 | "data": { 675 | "text/plain": [ 676 | "label False\n", 677 | "uId False\n", 678 | "adId False\n", 679 | "operTime False\n", 680 | "siteId False\n", 681 | "slotId False\n", 682 | "contentId False\n", 683 | "netType False\n", 684 | "dtype: bool" 685 | ] 686 | }, 687 | "execution_count": 93, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "test.isnull().any()" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": 94, 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [ 702 | "test.to_csv('data/clean_test.csv',index = False)" 703 | ] 704 | } 705 | ], 706 | "metadata": { 707 | "kernelspec": { 708 | "display_name": "Python 3", 709 | "language": "python", 710 | "name": "python3" 711 | }, 712 | "language_info": { 713 | "codemirror_mode": { 714 | "name": "ipython", 715 | "version": 3 716 | }, 717 | "file_extension": ".py", 718 | "mimetype": "text/x-python", 719 | "name": "python", 720 | "nbconvert_exporter": "python", 721 | "pygments_lexer": "ipython3", 722 | "version": "3.6.8" 723 | } 724 | }, 725 | "nbformat": 4, 726 | "nbformat_minor": 2 727 | } 728 | -------------------------------------------------------------------------------- /1完整流程/2采样_匹配数据/3.uid1_day_caiyang_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "import seaborn as sns\n", 13 | "from sklearn.utils import shuffle\n", 14 | "from sklearn import metrics, preprocessing\n", 15 | "import gc\n", 16 | "#显示所有列\n", 17 | "pd.set_option('display.max_columns', None)\n", 18 | "#显示所有行\n", 19 | "pd.set_option('display.max_rows', None)\n", 20 | "#设置value的显示长度为100,默认为50\n", 21 | "pd.set_option('max_colwidth',100)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "user_info = pd.read_csv('/media/zsy/Data/ZSY/hw_dataset/data/clean_user_info.csv')\n", 31 | "ad_info = pd.read_csv('/media/zsy/Data/ZSY/hw_dataset/data/clean_ad_info.csv')\n", 32 | "content_info = pd.read_csv('/media/zsy/Data/ZSY/hw_dataset/data/clean_contentId_info.csv')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/html": [ 43 | "
\n", 44 | "\n", 57 | "\n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | "
uIdagegendercityprovincephoneTypecarrier
0u1298974134.01.0184.025.0487.03.0
1u1089064584.01.0125.028.0217.02.0
2u1258704613.01.0184.025.0474.03.0
3u1339243614.01.0184.025.0502.02.0
4u1339243604.01.0184.025.0502.03.0
\n", 123 | "
" 124 | ], 125 | "text/plain": [ 126 | " uId age gender city province phoneType carrier\n", 127 | "0 u129897413 4.0 1.0 184.0 25.0 487.0 3.0\n", 128 | "1 u108906458 4.0 1.0 125.0 28.0 217.0 2.0\n", 129 | "2 u125870461 3.0 1.0 184.0 25.0 474.0 3.0\n", 130 | "3 u133924361 4.0 1.0 184.0 25.0 502.0 2.0\n", 131 | "4 u133924360 4.0 1.0 184.0 25.0 502.0 3.0" 132 | ] 133 | }, 134 | "execution_count": 3, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "user_info.head()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 4, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/html": [ 151 | "
\n", 152 | "\n", 165 | "\n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | "
adIdbillIdprimIdcreativeTypeintertypespreadAppId
05223163273.0
15222163273.0
2522112343273.0
3522012348273.0
4521912343273.0
\n", 225 | "
" 226 | ], 227 | "text/plain": [ 228 | " adId billId primId creativeType intertype spreadAppId\n", 229 | "0 5223 1 6 3 2 73.0\n", 230 | "1 5222 1 6 3 2 73.0\n", 231 | "2 5221 1 234 3 2 73.0\n", 232 | "3 5220 1 234 8 2 73.0\n", 233 | "4 5219 1 234 3 2 73.0" 234 | ] 235 | }, 236 | "execution_count": 4, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "ad_info.head()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | "
contentIdfirstClasssecondClass
051982389
1519711
2519611
3519511
4519411
\n", 309 | "
" 310 | ], 311 | "text/plain": [ 312 | " contentId firstClass secondClass\n", 313 | "0 5198 23 89\n", 314 | "1 5197 1 1\n", 315 | "2 5196 1 1\n", 316 | "3 5195 1 1\n", 317 | "4 5194 1 1" 318 | ] 319 | }, 320 | "execution_count": 5, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "content_info.head()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 1, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "del train\n", 336 | "del neg\n", 337 | "del pos\n", 338 | "del train_uid1_label1_counts_data\n", 339 | "del train_uid_label0_counts_data\n", 340 | "gc.collect()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "# 处理每一天的数据,按照uid出现次数采样,label=1\n", 348 | "# train_28.csv代表不同的天数" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 15, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "train = pd.read_csv(r'/media/zsy/Data/ZSY/hw_dataset/data/init/26-31/train_28.csv')" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 16, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "train.columns = ['label', 'uId', 'adId', 'operTime', 'siteId', 'slotId', 'contentId', 'netType']" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "# 分离出train中label = 1 和 lable = 0 的数据\n", 374 | "# 并分别计算他们的uid 出现的次数,作为新的一列" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 17, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "neg = train.loc[train['label'] == 0]\n", 384 | "pos = train.loc[train['label'] == 1]\n", 385 | "neg = neg.reset_index(drop = True)\n", 386 | "pos = pos.reset_index(drop = True)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 18, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "(23331611, 8)" 398 | ] 399 | }, 400 | "execution_count": 18, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "neg.shape" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 19, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "(1475854, 8)" 418 | ] 419 | }, 420 | "execution_count": 19, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "pos.shape" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 20, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "uid_stats = pos['uId'].value_counts()\n", 436 | "pos_uid_counts = np.hstack([pos['uId'].values.reshape(-1,1),pos['uId'].map(uid_stats).values.reshape(-1,1)])\n", 437 | "pos_uid_counts = pd.DataFrame(pos_uid_counts)\n", 438 | "pos_uid_counts.columns = ['uId','counts']\n", 439 | "pos['uId_count'] = pos_uid_counts['counts']" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 21, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "uid_stats = neg['uId'].value_counts()\n", 449 | "neg_uid_counts = np.hstack([neg['uId'].values.reshape(-1,1),neg['uId'].map(uid_stats).values.reshape(-1,1)])\n", 450 | "neg_uid_counts = pd.DataFrame(neg_uid_counts)\n", 451 | "neg_uid_counts.columns = ['uId','counts']\n", 452 | "neg['uId_count'] = neg_uid_counts['counts']" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | 
"source": [ 459 | "# 将label = 1 的数据中,uid仅出现过一次的那些数据提取出来" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 22, 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "data": { 469 | "text/plain": [ 470 | "(761090, 9)" 471 | ] 472 | }, 473 | "execution_count": 22, 474 | "metadata": {}, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "train_uid1_label1_counts_data = pos.loc[pos['uId_count'] == 1,:]\n", 480 | "train_uid1_label1_counts_data = train_uid1_label1_counts_data.reset_index(drop = True)\n", 481 | "train_uid1_label1_counts_data.shape" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "# 在label = 0 的数据中,使用上一步得到的数据的uid进行匹配,得到label = 0 的数据" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 23, 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "data": { 498 | "text/plain": [ 499 | "(2628280, 9)" 500 | ] 501 | }, 502 | "execution_count": 23, 503 | "metadata": {}, 504 | "output_type": "execute_result" 505 | } 506 | ], 507 | "source": [ 508 | "train_uid_label0_counts_data = neg.loc[neg['uId'].isin(train_uid1_label1_counts_data['uId'].unique().tolist()),:]\n", 509 | "train_uid_label0_counts_data = train_uid_label0_counts_data.reset_index(drop = True)\n", 510 | "train_uid_label0_counts_data.shape" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 24, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "rest_num = 1000000-train_uid1_label1_counts_data.shape[0]" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "# 采样剩下的负样本,凑够100万数据" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 69, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "index = np.random.randint(0,train_uid_label0_counts_data.shape[0],rest_num)\n", 536 | "train_uid_label0_counts_sample = train_uid_label0_counts_data.iloc[index,:]" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "# 进行数据融合,并且打乱顺序" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 70, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "data_label1_uid1 = pd.concat([train_uid1_label1_counts_data,train_uid_label0_counts_sample])\n", 553 | "data_label1_uid1 = data_label1_uid1.reset_index(drop = True)\n", 554 | "data_label1_uid1.drop(['uId_count'],axis = 1,inplace = True)\n", 555 | "data_label1_uid1 = shuffle(data_label1_uid1)\n", 556 | "data_label1_uid1 = data_label1_uid1.reset_index(drop = True)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 71, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "uid_stats = data_label1_uid1['uId'].value_counts()\n", 566 | "train_uid1_counts = np.hstack([data_label1_uid1['uId'].values.reshape(-1,1),data_label1_uid1['uId'].map(uid_stats).values.reshape(-1,1)])\n", 567 | "train_uid1_counts = pd.DataFrame(train_uid1_counts)\n", 568 | "train_uid1_counts.columns = ['uId','counts']\n", 569 | "data_label1_uid1['uId_count'] = train_uid1_counts['counts']" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 72, 575 | "metadata": {}, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/html": [ 580 | "
\n", 581 | "\n", 594 | "\n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | "
labeluIdadIdoperTimesiteIdslotIdcontentIdnetTypeuId_count
01u12691983915822019-03-31 12:39:05.245108122211
11u14337306534082019-03-31 18:40:48.623887400511
21u14806032138872019-03-31 08:30:45.8051030213711
31u13458261516142019-03-31 19:41:25.356922167611
41u12086515534862019-03-31 22:15:03.475888410442
\n", 672 | "
" 673 | ], 674 | "text/plain": [ 675 | " label uId adId operTime siteId slotId \\\n", 676 | "0 1 u126919839 1582 2019-03-31 12:39:05.245 10 8 \n", 677 | "1 1 u143373065 3408 2019-03-31 18:40:48.623 8 87 \n", 678 | "2 1 u148060321 3887 2019-03-31 08:30:45.805 10 30 \n", 679 | "3 1 u134582615 1614 2019-03-31 19:41:25.356 9 22 \n", 680 | "4 1 u120865155 3486 2019-03-31 22:15:03.475 8 88 \n", 681 | "\n", 682 | " contentId netType uId_count \n", 683 | "0 1222 1 1 \n", 684 | "1 4005 1 1 \n", 685 | "2 2137 1 1 \n", 686 | "3 1676 1 1 \n", 687 | "4 4104 4 2 " 688 | ] 689 | }, 690 | "execution_count": 72, 691 | "metadata": {}, 692 | "output_type": "execute_result" 693 | } 694 | ], 695 | "source": [ 696 | "data_label1_uid1.head()" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "# 最终数据融合" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 73, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "data_label1_uid1 = pd.merge(data_label1_uid1,user_info, how='left', left_on='uId',right_on='uId')\n", 713 | "data_label1_uid1 = pd.merge(data_label1_uid1,ad_info, how='left', left_on='adId',right_on='adId')\n", 714 | "data_label1_uid1 = pd.merge(data_label1_uid1,content_info, how='left', left_on='contentId',right_on='contentId')\n", 715 | "data_label1_uid1 = data_label1_uid1.fillna(0)" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 5, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "# test只需运行一次,生成一个test样本就可以了\n", 725 | "test_data = pd.merge(test_data,user_info, how='left', left_on='uId',right_on='uId')\n", 726 | "test_data = pd.merge(test_data,ad_info, how='left', left_on='adId',right_on='adId')\n", 727 | "test_data = pd.merge(test_data,content_info, how='left', left_on='contentId',right_on='contentId')\n", 728 | "test_data = data_label1_uid1.fillna(0)\n", 729 | "test_data.to_csv('test.csv',index = False)" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 6, 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [ 738 | "# 存储数据" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 75, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "data_label1_uid1.to_csv('data/by_data/uid1_label1_train_28.csv',index = False)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [] 756 | } 757 | ], 758 | "metadata": { 759 | "kernelspec": { 760 | "display_name": "Python 3", 761 | "language": "python", 762 | "name": "python3" 763 | }, 764 | "language_info": { 765 | "codemirror_mode": { 766 | "name": "ipython", 767 | "version": 3 768 | }, 769 | "file_extension": ".py", 770 | "mimetype": "text/x-python", 771 | "name": "python", 772 | "nbconvert_exporter": "python", 773 | "pygments_lexer": "ipython3", 774 | "version": "3.6.8" 775 | } 776 | }, 777 | "nbformat": 4, 778 | "nbformat_minor": 2 779 | } 780 | -------------------------------------------------------------------------------- /1完整流程/2采样_匹配数据/4.uid1_day_caiyang_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "import seaborn as sns\n", 13 | "from sklearn.utils import shuffle\n", 14 | "from sklearn 
import metrics, preprocessing\n", 15 | "import gc\n", 16 | "import random\n", 17 | "# show all columns\n", 18 | "pd.set_option('display.max_columns', None)\n", 19 | "# show all rows\n", 20 | "pd.set_option('display.max_rows', None)\n", 21 | "# set the displayed column width to 100 characters (default is 50)\n", 22 | "pd.set_option('max_colwidth',100)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "user_info = pd.read_csv('/media/zsy/Data/ZSY/hw_dataset/data/clean_user_info.csv')\n", 32 | "ad_info = pd.read_csv('/media/zsy/Data/ZSY/hw_dataset/data/clean_ad_info.csv')\n", 33 | "content_info = pd.read_csv('/media/zsy/Data/ZSY/hw_dataset/data/clean_contentId_info.csv')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | "
uIdagegendercityprovincephoneTypecarrier
0u1298974134.01.0184.025.0487.03.0
1u1089064584.01.0125.028.0217.02.0
2u1258704613.01.0184.025.0474.03.0
3u1339243614.01.0184.025.0502.02.0
4u1339243604.01.0184.025.0502.03.0
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | " uId age gender city province phoneType carrier\n", 128 | "0 u129897413 4.0 1.0 184.0 25.0 487.0 3.0\n", 129 | "1 u108906458 4.0 1.0 125.0 28.0 217.0 2.0\n", 130 | "2 u125870461 3.0 1.0 184.0 25.0 474.0 3.0\n", 131 | "3 u133924361 4.0 1.0 184.0 25.0 502.0 2.0\n", 132 | "4 u133924360 4.0 1.0 184.0 25.0 502.0 3.0" 133 | ] 134 | }, 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "user_info.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 4, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/html": [ 152 | "
\n", 153 | "\n", 166 | "\n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | "
adIdbillIdprimIdcreativeTypeintertypespreadAppId
05223163273.0
15222163273.0
2522112343273.0
3522012348273.0
4521912343273.0
\n", 226 | "
" 227 | ], 228 | "text/plain": [ 229 | " adId billId primId creativeType intertype spreadAppId\n", 230 | "0 5223 1 6 3 2 73.0\n", 231 | "1 5222 1 6 3 2 73.0\n", 232 | "2 5221 1 234 3 2 73.0\n", 233 | "3 5220 1 234 8 2 73.0\n", 234 | "4 5219 1 234 3 2 73.0" 235 | ] 236 | }, 237 | "execution_count": 4, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "ad_info.head()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 5, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/html": [ 254 | "
\n", 255 | "\n", 268 | "\n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | "
contentIdfirstClasssecondClass
051982389
1519711
2519611
3519511
4519411
\n", 310 | "
" 311 | ], 312 | "text/plain": [ 313 | " contentId firstClass secondClass\n", 314 | "0 5198 23 89\n", 315 | "1 5197 1 1\n", 316 | "2 5196 1 1\n", 317 | "3 5195 1 1\n", 318 | "4 5194 1 1" 319 | ] 320 | }, 321 | "execution_count": 5, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "content_info.head()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "# 处理每一天的数据,按照uid出现次数采样,label=1" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 6, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "train = pd.read_csv(r'/media/zsy/Data/ZSY/hw_dataset/data/init/26-31/train_31.csv')" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 7, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "train.columns = ['label', 'uId', 'adId', 'operTime', 'siteId', 'slotId', 'contentId', 'netType']" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "# 分离出train中label = 1 和 lable = 0 的数据\n", 360 | "# 并分别计算他们的uid 出现的次数,作为新的一列" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 8, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "neg = train.loc[train['label'] == 0]\n", 370 | "pos = train.loc[train['label'] == 1]\n", 371 | "neg = neg.reset_index(drop = True)\n", 372 | "pos = pos.reset_index(drop = True)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 9, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "(26340851, 8)" 384 | ] 385 | }, 386 | "execution_count": 9, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "neg.shape" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 10, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "(1687407, 8)" 404 | ] 405 | }, 406 | "execution_count": 10, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "pos.shape" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 11, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "uid_stats = pos['uId'].value_counts()\n", 422 | "pos_uid_counts = np.hstack([pos['uId'].values.reshape(-1,1),pos['uId'].map(uid_stats).values.reshape(-1,1)])\n", 423 | "pos_uid_counts = pd.DataFrame(pos_uid_counts)\n", 424 | "pos_uid_counts.columns = ['uId','counts']\n", 425 | "pos['uId_count'] = pos_uid_counts['counts']" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 12, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "uid_stats = neg['uId'].value_counts()\n", 435 | "neg_uid_counts = np.hstack([neg['uId'].values.reshape(-1,1),neg['uId'].map(uid_stats).values.reshape(-1,1)])\n", 436 | "neg_uid_counts = pd.DataFrame(neg_uid_counts)\n", 437 | "neg_uid_counts.columns = ['uId','counts']\n", 438 | "neg['uId_count'] = neg_uid_counts['counts']" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "# 将label = 1 的数据中,uid仅出现过一次的那些数据提取出来(50w)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 13, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": [ 456 | "(500000, 9)" 457 | ] 458 | }, 459 | "execution_count": 13, 460 | "metadata": {}, 461 | "output_type": "execute_result" 
462 | } 463 | ], 464 | "source": [ 465 | "uid1_num = 500000\n", 466 | "train_uid1_label1_counts_data = pos.loc[pos['uId_count'] == 1,:]\n", 467 | "train_uid1_label1_counts_data = train_uid1_label1_counts_data.reset_index(drop = True)\n", 468 | "index = random.sample(range(0,train_uid1_label1_counts_data.shape[0]),uid1_num)\n", 469 | "train_uid1_label1_counts_data = train_uid1_label1_counts_data.iloc[index,:]\n", 470 | "train_uid1_label1_counts_data = train_uid1_label1_counts_data.reset_index(drop = True)\n", 471 | "train_uid1_label1_counts_data.shape" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 14, 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "500000" 483 | ] 484 | }, 485 | "execution_count": 14, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "len(set(index))" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "# In the label = 0 data, match on the uids obtained in the previous step to get the corresponding label = 0 rows" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 15, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "(1845486, 9)" 510 | ] 511 | }, 512 | "execution_count": 15, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "train_uid_label0_counts_data = neg.loc[neg['uId'].isin(train_uid1_label1_counts_data['uId'].unique().tolist()),:]\n", 519 | "train_uid_label0_counts_data = train_uid_label0_counts_data.reset_index(drop = True)\n", 520 | "train_uid_label0_counts_data.shape" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 16, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "rest_num = 1000000-train_uid1_label1_counts_data.shape[0]" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": {}, 535 | "source": [ 536 | "# Sample the remaining negatives to bring the total up to 1,000,000 rows" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 17, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "index = random.sample(range(0,train_uid_label0_counts_data.shape[0]),rest_num)\n", 546 | "# index = np.random.randint(0,train_uid_label0_counts_data.shape[0],rest_num)\n", 547 | "train_uid_label0_counts_sample = train_uid_label0_counts_data.iloc[index,:]" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 18, 553 | "metadata": {}, 554 | "outputs": [ 555 | { 556 | "data": { 557 | "text/plain": [ 558 | "(500000, 9)" 559 | ] 560 | }, 561 | "execution_count": 18, 562 | "metadata": {}, 563 | "output_type": "execute_result" 564 | } 565 | ], 566 | "source": [ 567 | "train_uid_label0_counts_sample.shape" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 19, 573 | "metadata": {}, 574 | "outputs": [ 575 | { 576 | "data": { 577 | "text/plain": [ 578 | "500000" 579 | ] 580 | }, 581 | "execution_count": 19, 582 | "metadata": {}, 583 | "output_type": "execute_result" 584 | } 585 | ], 586 | "source": [ 587 | "len(set(index))" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "# Combine the two samples and shuffle the order" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 20, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "data_label1_uid1 = pd.concat([train_uid1_label1_counts_data,train_uid_label0_counts_sample])\n", 604 | "data_label1_uid1 = data_label1_uid1.reset_index(drop = True)\n", 
605 | "data_label1_uid1.drop(['uId_count'],axis = 1,inplace = True)\n", 606 | "data_label1_uid1 = shuffle(data_label1_uid1)\n", 607 | "data_label1_uid1 = data_label1_uid1.reset_index(drop = True)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 21, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "uid_stats = data_label1_uid1['uId'].value_counts()\n", 617 | "train_uid1_counts = np.hstack([data_label1_uid1['uId'].values.reshape(-1,1),data_label1_uid1['uId'].map(uid_stats).values.reshape(-1,1)])\n", 618 | "train_uid1_counts = pd.DataFrame(train_uid1_counts)\n", 619 | "train_uid1_counts.columns = ['uId','counts']\n", 620 | "data_label1_uid1['uId_count'] = train_uid1_counts['counts']" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 22, 626 | "metadata": {}, 627 | "outputs": [ 628 | { 629 | "data": { 630 | "text/html": [ 631 | "
\n", 632 | "\n", 645 | "\n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | "
labeluIdadIdoperTimesiteIdslotIdcontentIdnetTypeuId_count
00u11279071129052019-03-31 18:29:24.8831010213917
11u14820883348562019-03-31 20:03:10.077327563614
20u13811456715502019-03-31 08:26:00.6343511470112
30u14759475416022019-03-31 14:47:30.048920164213
40u12189862115562019-03-31 09:43:39.9098881477412
50u10530032740292019-03-31 21:11:12.5861071481214
60u14032592921592019-03-31 11:44:01.6511082430112
71u14340420544742019-03-31 19:21:18.228888528111
81u10700288514592019-03-31 17:37:33.639332137012
90u11878679640132019-03-31 14:50:14.4261010213615
100u11903663948642019-03-31 17:55:13.7421012564412
110u12330850712272019-03-31 16:00:07.040103094443
121u13350649219022019-03-31 09:53:47.134281209711
131u11202827616142019-03-31 11:07:20.365922167614
140u11472510515562019-03-31 14:34:52.308108147742
151u14043956928142019-03-31 23:46:45.490888316811
161u13824045014832019-03-31 08:33:04.264327138614
171u13595142416002019-03-31 12:30:09.077368132242
180u14154872714622019-03-31 17:58:26.066332137318
190u10858159420842019-03-31 16:50:16.979103095014
\n", 903 | "
" 904 | ], 905 | "text/plain": [ 906 | " label uId adId operTime siteId slotId \\\n", 907 | "0 0 u112790711 2905 2019-03-31 18:29:24.883 10 10 \n", 908 | "1 1 u148208833 4856 2019-03-31 20:03:10.077 3 27 \n", 909 | "2 0 u138114567 1550 2019-03-31 08:26:00.634 3 51 \n", 910 | "3 0 u147594754 1602 2019-03-31 14:47:30.048 9 20 \n", 911 | "4 0 u121898621 1556 2019-03-31 09:43:39.909 8 88 \n", 912 | "5 0 u105300327 4029 2019-03-31 21:11:12.586 10 71 \n", 913 | "6 0 u140325929 2159 2019-03-31 11:44:01.651 10 8 \n", 914 | "7 1 u143404205 4474 2019-03-31 19:21:18.228 8 88 \n", 915 | "8 1 u107002885 1459 2019-03-31 17:37:33.639 3 32 \n", 916 | "9 0 u118786796 4013 2019-03-31 14:50:14.426 10 10 \n", 917 | "10 0 u119036639 4864 2019-03-31 17:55:13.742 10 12 \n", 918 | "11 0 u123308507 1227 2019-03-31 16:00:07.040 10 30 \n", 919 | "12 1 u133506492 1902 2019-03-31 09:53:47.134 2 81 \n", 920 | "13 1 u112028276 1614 2019-03-31 11:07:20.365 9 22 \n", 921 | "14 0 u114725105 1556 2019-03-31 14:34:52.308 10 8 \n", 922 | "15 1 u140439569 2814 2019-03-31 23:46:45.490 8 88 \n", 923 | "16 1 u138240450 1483 2019-03-31 08:33:04.264 3 27 \n", 924 | "17 1 u135951424 1600 2019-03-31 12:30:09.077 3 68 \n", 925 | "18 0 u141548727 1462 2019-03-31 17:58:26.066 3 32 \n", 926 | "19 0 u108581594 2084 2019-03-31 16:50:16.979 10 30 \n", 927 | "\n", 928 | " contentId netType uId_count \n", 929 | "0 2139 1 7 \n", 930 | "1 5636 1 4 \n", 931 | "2 1470 1 12 \n", 932 | "3 1642 1 3 \n", 933 | "4 1477 4 12 \n", 934 | "5 4812 1 4 \n", 935 | "6 2430 1 12 \n", 936 | "7 5281 1 1 \n", 937 | "8 1370 1 2 \n", 938 | "9 2136 1 5 \n", 939 | "10 5644 1 2 \n", 940 | "11 944 4 3 \n", 941 | "12 2097 1 1 \n", 942 | "13 1676 1 4 \n", 943 | "14 1477 4 2 \n", 944 | "15 3168 1 1 \n", 945 | "16 1386 1 4 \n", 946 | "17 1322 4 2 \n", 947 | "18 1373 1 8 \n", 948 | "19 950 1 4 " 949 | ] 950 | }, 951 | "execution_count": 22, 952 | "metadata": {}, 953 | "output_type": "execute_result" 954 | } 955 | ], 956 | "source": [ 957 | "data_label1_uid1.head(20)" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "metadata": {}, 963 | "source": [ 964 | "# 最终数据融合" 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": 23, 970 | "metadata": {}, 971 | "outputs": [], 972 | "source": [ 973 | "data_label1_uid1 = pd.merge(data_label1_uid1,user_info, how='left', left_on='uId',right_on='uId')\n", 974 | "data_label1_uid1 = pd.merge(data_label1_uid1,ad_info, how='left', left_on='adId',right_on='adId')\n", 975 | "data_label1_uid1 = pd.merge(data_label1_uid1,content_info, how='left', left_on='contentId',right_on='contentId')\n", 976 | "data_label1_uid1 = data_label1_uid1.fillna(0)" 977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": 24, 982 | "metadata": {}, 983 | "outputs": [ 984 | { 985 | "data": { 986 | "text/plain": [ 987 | "(1000000, 22)" 988 | ] 989 | }, 990 | "execution_count": 24, 991 | "metadata": {}, 992 | "output_type": "execute_result" 993 | } 994 | ], 995 | "source": [ 996 | "data_label1_uid1.shape" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 25, 1002 | "metadata": {}, 1003 | "outputs": [], 1004 | "source": [ 1005 | "# test_data = pd.merge(test_data,user_info, how='left', left_on='uId',right_on='uId')\n", 1006 | "# test_data = pd.merge(test_data,ad_info, how='left', left_on='adId',right_on='adId')\n", 1007 | "# test_data = pd.merge(test_data,content_info, how='left', left_on='contentId',right_on='contentId')\n", 1008 | "# test_data = data_label1_uid1.fillna(0)" 1009 | ] 1010 | 
}, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 26, 1014 | "metadata": {}, 1015 | "outputs": [], 1016 | "source": [ 1017 | "data_label1_uid1.to_csv('data/by_data_2/by_data_50w/31-50w.csv',index = False)" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "code", 1022 | "execution_count": null, 1023 | "metadata": {}, 1024 | "outputs": [], 1025 | "source": [] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": null, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": null, 1037 | "metadata": {}, 1038 | "outputs": [], 1039 | "source": [] 1040 | } 1041 | ], 1042 | "metadata": { 1043 | "kernelspec": { 1044 | "display_name": "Python 3", 1045 | "language": "python", 1046 | "name": "python3" 1047 | }, 1048 | "language_info": { 1049 | "codemirror_mode": { 1050 | "name": "ipython", 1051 | "version": 3 1052 | }, 1053 | "file_extension": ".py", 1054 | "mimetype": "text/x-python", 1055 | "name": "python", 1056 | "nbconvert_exporter": "python", 1057 | "pygments_lexer": "ipython3", 1058 | "version": "3.6.8" 1059 | } 1060 | }, 1061 | "nbformat": 4, 1062 | "nbformat_minor": 2 1063 | } 1064 | --------------------------------------------------------------------------------
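Note on the sampling notebooks above: the following is a minimal, self-contained sketch (not the authors' original code) that condenses the per-day sampling pipeline of 4.uid1_day_caiyang_2.ipynb into one function. The helper name sample_one_day is hypothetical; DataFrame.sample with a seed parameter replaces the notebook's random.sample over positional indices (the notebook does not fix a seed), and the uId-count construction uses Series.map over value_counts instead of the np.hstack round-trip. The input/output paths and the column list are the ones that appear in the notebook.

import pandas as pd
from sklearn.utils import shuffle

def sample_one_day(train_path, out_path, pos_num=500000, total_num=1000000, seed=2019):
    # Raw impression log for one day; the CSV has no header row.
    train = pd.read_csv(train_path)
    train.columns = ['label', 'uId', 'adId', 'operTime', 'siteId',
                     'slotId', 'contentId', 'netType']

    pos = train[train['label'] == 1].reset_index(drop=True)
    neg = train[train['label'] == 0].reset_index(drop=True)

    # Per-uId impression counts inside each label group.
    pos['uId_count'] = pos['uId'].map(pos['uId'].value_counts())
    neg['uId_count'] = neg['uId'].map(neg['uId'].value_counts())

    # Positives whose uId appears exactly once on this day, down-sampled to pos_num rows.
    pos_uid1 = pos[pos['uId_count'] == 1].sample(n=pos_num, random_state=seed)

    # Negatives restricted to the same uIds, then sampled to fill the remaining rows.
    neg_match = neg[neg['uId'].isin(pos_uid1['uId'])]
    neg_sample = neg_match.sample(n=total_num - pos_num, random_state=seed)

    # Combine, shuffle, and recompute uId_count on the mixed sample.
    data = pd.concat([pos_uid1, neg_sample]).drop(columns=['uId_count'])
    data = shuffle(data).reset_index(drop=True)
    data['uId_count'] = data['uId'].map(data['uId'].value_counts())

    # Attach user / ad / content side information, as in the notebook.
    base = '/media/zsy/Data/ZSY/hw_dataset/data/'
    data = data.merge(pd.read_csv(base + 'clean_user_info.csv'), how='left', on='uId')
    data = data.merge(pd.read_csv(base + 'clean_ad_info.csv'), how='left', on='adId')
    data = data.merge(pd.read_csv(base + 'clean_contentId_info.csv'), how='left', on='contentId')
    data.fillna(0).to_csv(out_path, index=False)

# Example call, mirroring the day-31 run saved by the notebook:
# sample_one_day(r'/media/zsy/Data/ZSY/hw_dataset/data/init/26-31/train_31.csv',
#                'data/by_data_2/by_data_50w/31-50w.csv')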