├── .idea ├── TIANCHI_Project.iml ├── misc.xml └── modules.xml ├── Ad_Convert_prediction ├── README.md ├── data │ ├── round1_ijcai_18_result_demo_20180301.txt │ └── 数据说明.txt ├── doc │ ├── paper │ │ ├── Factorization Machines with libFM.pdf │ │ ├── Factorization Machines--Steffen Rendle.pdf │ │ ├── Field-aware Factorization Machines for CTR Prediction.pdf │ │ ├── Field-aware Factorization Machines in a Real-world Online Advertising System-ind0438-juanA.pdf │ │ ├── Recurrent Neural Networks with Top-k Gains for Session-based Recommendations.pdf │ │ ├── SESSION-BASED RECOMMENDATIONS WITH RECURRENT NEURAL NETWORKS.pdf │ │ ├── Wide & Deep Learning for Recommender Systems.pdf │ │ ├── XGBoost A Scalable Tree Boosting System.pdf │ │ ├── 【ECIR-16-FNN】Deep Learning over Multi-field Categorical Data--A Case Study on User Response Prediction.pdf │ │ ├── 【IJCAI-17】-DeepFM-A Factorization-Machine based Neural Network for CTR Prediction.pdf │ │ ├── 【NIPS-2017】lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf │ │ └── 【SIGIR-17】Neural Factorization Machines for r Sparse Predictive Analytics.pdf │ └── 基于深度学习的搜索广告点击率预测方法研究.pdf └── src │ ├── Data_Preprocess.py │ └── dnn_42.ipynb ├── Click_prediction ├── README.md ├── code │ ├── Logloss.py │ ├── blagging.py │ ├── ctr │ │ ├── Preprocess.py │ │ ├── ctr.ipynb │ │ └── ffm.py │ ├── ctr_nn │ │ ├── Main.py │ │ ├── Models.py │ │ ├── Utils.py │ │ └── __init__.py │ └── cvr │ │ ├── 1.problem_setting.ipynb │ │ ├── 2.Baseline_version.ipynb │ │ ├── 3.feature_engineering_and_machine_learning.ipynb │ │ └── README.md ├── data │ ├── data.pdf │ ├── data_description.pdf │ ├── download.sh │ └── tencent_数据说明 │ │ ├── Tencent_cvr_prediction.png │ │ ├── data_dscr_4.png │ │ ├── data_dscr_5.png │ │ ├── 上下文特征.png │ │ ├── 广告特征.png │ │ └── 用户特征.png ├── doc │ ├── 8课下课件-张伟楠.pdf │ ├── Ad click prediction a view from the trenches.pdf │ ├── ffm.txt │ ├── fm.txt │ └── 资料.txt ├── libffm │ └── libffm │ │ ├── COPYRIGHT │ │ ├── Makefile │ │ ├── Makefile.win │ │ ├── README │ │ ├── ffm-predict │ │ ├── ffm-predict.cpp │ │ ├── ffm-train │ │ ├── ffm-train.cpp │ │ ├── ffm.cpp │ │ ├── ffm.h │ │ ├── ffm.o │ │ ├── timer.cpp │ │ ├── timer.h │ │ └── timer.o ├── libfm │ └── libfm │ │ ├── Makefile │ │ ├── README.md │ │ ├── bin │ │ ├── convert │ │ ├── fm_model │ │ ├── libFM │ │ └── transpose │ │ ├── license.txt │ │ ├── scripts │ │ └── triple_format_to_libfm.pl │ │ └── src │ │ ├── fm_core │ │ ├── fm_data.h │ │ ├── fm_model.h │ │ └── fm_sgd.h │ │ ├── libfm │ │ ├── Makefile │ │ ├── libfm.cpp │ │ ├── libfm.o │ │ ├── src │ │ │ ├── Data.h │ │ │ ├── fm_learn.h │ │ │ ├── fm_learn_mcmc.h │ │ │ ├── fm_learn_mcmc_simultaneous.h │ │ │ ├── fm_learn_sgd.h │ │ │ ├── fm_learn_sgd_element.h │ │ │ ├── fm_learn_sgd_element_adapt_reg.h │ │ │ └── relation.h │ │ └── tools │ │ │ ├── convert.cpp │ │ │ ├── convert.o │ │ │ ├── transpose.cpp │ │ │ └── transpose.o │ │ └── util │ │ ├── cmdline.h │ │ ├── fmatrix.h │ │ ├── matrix.h │ │ ├── memory.h │ │ ├── random.h │ │ ├── rlog.h │ │ ├── smatrix.h │ │ └── util.h └── output │ ├── criteo.jpg │ ├── facebook.png │ ├── ffm_formula.png │ ├── fm_format.png │ ├── fm_formula.png │ ├── fm_formula2.png │ ├── loss.png │ ├── model.png │ ├── tensorboard.png │ └── train_info.png ├── Coupon_Usage_Predict └── readme.md ├── Loan_risk_prediction ├── README.md ├── code │ ├── XGBoost models.ipynb │ ├── Xgboost调优示例.py │ └── data_preparation.ipynb ├── data │ ├── Test_bCtAN1w.csv │ ├── Train_nyOWmfK.csv │ └── train_modified.csv └── doc │ ├── README.md │ ├── 不得直视本王-解决方案.pdf │ ├── 创新应用.docx │ ├── 
最优分箱.docx │ └── 风控算法大赛解决方案.pdf ├── PPD_RiskControl ├── README.md └── doc │ └── 风控算法大赛解决方案.pdf ├── README.md ├── Shangjialiuliang_predict ├── README.md ├── data │ ├── results │ │ ├── result_2017-03-11_model.csv │ │ ├── result_2017-03-11_special_day_weather_huopot.csv │ │ ├── result_2017-03-16_.csv │ │ ├── result_2017-03-16_fuse.csv │ │ ├── result_2017-03-16_special_day.csv │ │ ├── result_2017-03-16_special_day_weather.csv │ │ └── result_2017-03-16_special_day_weather_huopot.csv │ ├── shop_info_name2Id │ │ ├── cate_1_name.csv │ │ ├── cate_2_name.csv │ │ ├── cate_3_name.csv │ │ ├── city_name.csv │ │ ├── shop_info.csv │ │ └── shop_info_num.csv │ ├── statistics │ │ ├── all_mon_week3_mean_med_var_std.csv │ │ ├── city_weather.csv │ │ ├── count_user_pay.csv │ │ ├── count_user_pay_avg.csv │ │ ├── count_user_pay_avg_no_header.csv │ │ ├── count_user_view.csv │ │ ├── result_avg7_common_with_last_week.csv │ │ ├── shop_info.txt │ │ ├── shop_info_num.csv │ │ ├── shopid_day_num.txt │ │ ├── weather-10-11.csv │ │ ├── weather-11-14.csv │ │ └── weather_city.csv │ ├── test_train │ │ ├── 2017-03-16_test_off_x.csv │ │ ├── 2017-03-16_test_off_y.csv │ │ ├── 2017-03-16_test_on_x.csv │ │ ├── 2017-03-16_train_off_x.csv │ │ ├── 2017-03-16_train_off_y.csv │ │ ├── 2017-03-16_train_on_x.csv │ │ └── 2017-03-16_train_on_y.csv │ ├── weekABCD │ │ ├── A.csv │ │ ├── B.csv │ │ ├── C.csv │ │ ├── D.csv │ │ ├── week0.csv │ │ ├── week1.csv │ │ ├── week2.csv │ │ ├── week3.csv │ │ ├── week4.csv │ │ ├── weekA.csv │ │ ├── weekA1.csv │ │ ├── weekA_view.csv │ │ ├── weekB.csv │ │ ├── weekB1.csv │ │ ├── weekB_view.csv │ │ ├── weekC.csv │ │ ├── weekC1.csv │ │ ├── weekC_view.csv │ │ ├── weekD.csv │ │ ├── weekD1.csv │ │ ├── weekD_view.csv │ │ ├── weekP.csv │ │ ├── weekP2.csv │ │ ├── weekZ.csv │ │ └── weekZ1.csv │ └── weekABCD_0123 │ │ ├── A0.csv │ │ ├── A1.csv │ │ ├── A2.csv │ │ ├── A3.csv │ │ ├── B0.csv │ │ ├── B1.csv │ │ ├── B2.csv │ │ ├── B3.csv │ │ ├── C0.csv │ │ ├── C1.csv │ │ ├── C2.csv │ │ ├── C3.csv │ │ ├── D0.csv │ │ ├── D1.csv │ │ ├── D2.csv │ │ └── D3.csv ├── doc │ └── 资料.txt ├── main │ └── __init__.py ├── notebook │ ├── Untitled.ipynb │ └── a.txt ├── pictures │ ├── cate_shop_number │ │ ├── cate_1.csv │ │ ├── cate_1.png │ │ ├── cate_2.csv │ │ ├── cate_2.png │ │ ├── cate_3.csv │ │ └── cate_3.png │ └── city_shop_number │ │ ├── 0-50.png │ │ ├── 101-121.png │ │ ├── 51-100.png │ │ ├── all.png │ │ └── city_shop_number.csv └── run.py ├── Tencent_Social_Ads ├── README.md ├── data │ └── 数据说明.txt ├── doc │ ├── 各代码功能说明.txt │ └── 模型介绍.txt ├── notebook │ └── _1_preprocess_data.ipynb ├── run.sh └── src │ ├── Ad_Utils.py │ ├── Feature_joint.py │ ├── Gen_ID_click_vectors.py │ ├── Gen_app_install_features.py │ ├── Gen_global_sum_counts.py │ ├── Gen_smooth_cvr.py │ ├── Gen_tricks.py │ ├── Gen_tricks_final.py │ ├── Gen_user_click_features.py │ ├── Preprocess_Data.py │ ├── Smooth.py │ ├── __init__.py │ └── ffm.py └── Zhihuijiaotong ├── README.md ├── code ├── Preprocess.py ├── Related_lagging.py ├── Utils.py ├── Xgboost_Model.py └── __init__.py └── doc └── “数聚华夏 创享未来”中国数据创新行——智慧交通预测挑战赛 _ 赛题与数据.html /.idea/TIANCHI_Project.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ApexVCS 5 | 6 | 7 | -------------------------------------------------------------------------------- 
/.idea/modules.xml: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /Ad_Convert_prediction/README.md: -------------------------------------------------------------------------------- 1 | # TIANCHI_Project 2 | Notes and summaries from Tianchi big-data competitions 3 | 4 | Data download link: -------------------------------------------------------------------------------- /Ad_Convert_prediction/data/round1_ijcai_18_result_demo_20180301.txt: -------------------------------------------------------------------------------- 1 | instance_id predicted_score 2 | 2475218615076601065 0.9 3 | 398316874173557226 0.7 4 | 6586402638209028583 0.5 5 | 1040996105851528465 0.3 6 | 6316278569655873454 0.1 7 | -------------------------------------------------------------------------------- /Ad_Convert_prediction/data/数据说明.txt: -------------------------------------------------------------------------------- 1 | Basic data 2 | Field Description Feature importance (scale 1-5; higher means more important) 3 | instance_id Sample ID, Long 4 | is_trade Transaction flag, Int; 0 or 1, where 1 5 | means this sample ended in a transaction and 0 means it did not 6 | item_id Ad item ID, Long 7 | user_id User ID, Long 8 | context_id Context ID, Long 9 | shop_id Shop ID, Long 10 | 11 | 12 | 13 | 14 | Ad item information 15 | Field Description 16 | item_id Ad item ID, Long 17 | item_category_list Category list of the ad item, String; ordered from the root category (the coarsest level) down to the leaf category 18 | (the finest level), joined as "category_0;category_1;category_2", where category_1 is a subcategory of category_0 and 19 | category_2 is a subcategory of category_1 20 | item_property_list Property list of the ad item, String; joined as "property_0;property_1;property_2"; the properties have no hierarchy 21 | item_brand_id Brand ID of the ad item, Long 22 | item_city_id City ID of the ad item, Long 23 | item_price_level Price level of the ad item, Int; starts at 0, higher means more expensive 24 | item_sales_level Sales level of the ad item, Int; starts at 0, higher means more sales 25 | item_collected_level Favorite-count level of the ad item, Int; starts at 0, higher means favorited more often 26 | item_pv_level Impression-count level of the ad item, Int; starts at 0, higher means shown more often 27 | 28 | 29 | User information 30 | Field Description 31 | user_id User ID, Long 32 | user_gender_id Predicted gender ID of the user, Int; 0 = female, 1 = male, 2 = family account 33 | user_age_level Predicted age level of the user, Int; higher means older 34 | user_occupation_id Predicted occupation ID of the user, Int 35 | user_star_level Star level of the user, Int; higher means a higher star rating 36 | 37 | 38 | 39 | Context information 40 | Field Description 41 | context_id Context ID, Long 42 | context_timestamp Display time of the ad item, Long; a Unix timestamp in seconds, shifted by whole days 43 | context_page_id Page number on which the ad item was shown, Int; starts at 1 and increases; the first screen of one search's results is page 1, the second screen is page 2 44 | predict_category_property Category-property list predicted from the query, String; joined as "category_A:property_A_1,property_A_2,property_A_3;category_B:-1;category_C:property_C_1,property_C_2", where category_A, category_B and category_C are the three predicted categories; a property value of -1, as for category_B here, means that predicted category has no associated predicted properties 45 | 46 | 47 | 48 | Shop information 49 | Field Description 50 | shop_id Shop ID, Long 51 | shop_review_num_level Review-count level of the shop, Int; starts at 0, higher means more reviews 52 | shop_review_positive_rate Positive-review rate of the shop, Double; between 0 and 1, higher means a better rating 53 | shop_star_level Star level of the shop, Int; starts at 0, higher means a higher star rating 54 | shop_score_service Service-attitude score of the shop, Double; between 0 and 1, higher is better 55 | shop_score_delivery Delivery/logistics score of the shop, Double; between 0 and 1, higher is better 56 | shop_score_description Description-accuracy score of the shop, Double; between 0 and 1, higher is better -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/Factorization Machines with libFM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/Factorization Machines with libFM.pdf
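The predict_category_property format documented in 数据说明.txt above is easy to mis-split, so a small parser helps. This is a minimal sketch assuming only the ";", ":" and "," delimiters from the spec; the sample string is made up, not taken from the data:

```python
def parse_predict_category_property(s):
    """Parse 'cat:prop1,prop2;cat2:-1' into {category: [properties]}.

    A property list of ['-1'] means the category has no predicted properties.
    """
    result = {}
    for block in s.split(';'):
        category, _, props = block.partition(':')
        result[category] = props.split(',') if props else []
    return result

# Hypothetical sample, following the format in the spec:
print(parse_predict_category_property("category_A:property_A_1,property_A_2;category_B:-1"))
# {'category_A': ['property_A_1', 'property_A_2'], 'category_B': ['-1']}
```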
-------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/Factorization Machines--Steffen Rendle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/Factorization Machines--Steffen Rendle.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/Field-aware Factorization Machines for CTR Prediction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/Field-aware Factorization Machines for CTR Prediction.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/Field-aware Factorization Machines in a Real-world Online Advertising System-ind0438-juanA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/Field-aware Factorization Machines in a Real-world Online Advertising System-ind0438-juanA.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/Recurrent Neural Networks with Top-k Gains for Session-based Recommendations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/Recurrent Neural Networks with Top-k Gains for Session-based Recommendations.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/SESSION-BASED RECOMMENDATIONS WITH RECURRENT NEURAL NETWORKS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/SESSION-BASED RECOMMENDATIONS WITH RECURRENT NEURAL NETWORKS.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/Wide & Deep Learning for Recommender Systems.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/Wide & Deep Learning for Recommender Systems.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/XGBoost A Scalable Tree Boosting System.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/XGBoost A Scalable Tree Boosting System.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/【ECIR-16-FNN】Deep Learning over Multi-field Categorical Data--A Case Study on User Response Prediction.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/【ECIR-16-FNN】Deep Learning over Multi-field Categorical Data--A Case Study on User Response Prediction.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/【IJCAI-17】-DeepFM-A Factorization-Machine based Neural Network for CTR Prediction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/【IJCAI-17】-DeepFM-A Factorization-Machine based Neural Network for CTR Prediction.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/【NIPS-2017】lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/【NIPS-2017】lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/paper/【SIGIR-17】Neural Factorization Machines for r Sparse Predictive Analytics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/paper/【SIGIR-17】Neural Factorization Machines for r Sparse Predictive Analytics.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/doc/基于深度学习的搜索广告点击率预测方法研究.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Ad_Convert_prediction/doc/基于深度学习的搜索广告点击率预测方法研究.pdf -------------------------------------------------------------------------------- /Ad_Convert_prediction/src/Data_Preprocess.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 _*- 2 | 3 | """ 4 | @version: 5 | @author: CharlesXu 6 | @license: Q_S_Y_Q 7 | @file: Data_Preprocess.py 8 | @time: 2018/3/2 13:15 9 | @desc: Data preprocessing for the Alimama ad click-conversion data 10 | """ 11 | 12 | import pandas as pd 13 | import numpy as np 14 | 15 | # Load the raw data; raw strings keep the Windows paths free of accidental escapes 16 | test_set = pd.read_csv(r'E:\dataset\TIANCHI_ad\test.txt', sep=' ') 17 | train_set = pd.read_csv(r'E:\dataset\TIANCHI_ad\train.txt', sep=' ') 18 | # print(test_set.info()) 19 | # print(train_set.info()) 20 | 21 | # Coarse time features from the seconds-granularity (day-shifted) Unix timestamp 22 | train_set['dayofweek'] = (train_set['context_timestamp'] / (60*60*24)).apply(np.floor) % 7 23 | train_set['hourofday'] = (train_set['context_timestamp'] / (60*60)).apply(np.floor) % 24 24 | train_set['minofday'] = (train_set['context_timestamp'] / 60).apply(np.floor) % (24*60) 25 | 26 | # The test set has no label yet; -1 marks it as to-be-predicted 27 | test_set['is_trade'] = -1 28 | test_set['dayofweek'] = (test_set['context_timestamp'] / (60*60*24)).apply(np.floor) % 7 29 | test_set['hourofday'] = (test_set['context_timestamp'] / (60*60)).apply(np.floor) % 24 30 | test_set['minofday'] = (test_set['context_timestamp'] / 60).apply(np.floor) % (24*60) 31 | 32 | # Largest day index present in the training data 33 | print((train_set['context_timestamp'] / (60*60*24)).apply(np.floor).max()) 34 | 35 | # if __name__ == '__main__': 36 | #     pass
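The three derived features above are plain modular arithmetic on the seconds-granularity timestamp; because context_timestamp is shifted by whole days, dayofweek is only a relative index, not a true calendar weekday. A quick sanity check with a made-up timestamp:

```python
import numpy as np

ts = 3 * 86400 + 13 * 3600 + 25 * 60                    # "day 3" at 13:25:00
assert np.floor(ts / (60 * 60 * 24)) % 7 == 3           # dayofweek
assert np.floor(ts / (60 * 60)) % 24 == 13              # hourofday
assert np.floor(ts / 60) % (24 * 60) == 13 * 60 + 25    # minofday
```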
-------------------------------------------------------------------------------- /Click_prediction/README.md: -------------------------------------------------------------------------------- 1 | # kaggle_criteo_ctr_challenge- 2 | This is a kaggle challenge project called Display Advertising Challenge by CriteoLabs at 2014. 3 | 这是2014年由CriteoLabs在kaggle上发起的广告点击率预估挑战项目。 4 | 使用TensorFlow1.0和Python 3.5开发。 5 | 6 | 代码详解请参见jupyter notebook和↓↓↓ 7 | 8 | 博客:http://blog.csdn.net/chengcheng1394/article/details/78940565 9 | 10 | 知乎专栏:https://zhuanlan.zhihu.com/p/32500652 11 | 12 | 欢迎转发扩散 ^_^ 13 | 14 | 本文使用GBDT、FM、FFM和神经网络构建了点击率预估模型。 15 | 16 | ## 网络模型 17 | ![image](https://raw.githubusercontent.com/chengstone/kaggle_criteo_ctr_challenge-/master/model.png) 18 | 19 | ## LogLoss曲线 20 | ![image](https://raw.githubusercontent.com/chengstone/kaggle_criteo_ctr_challenge-/master/tensorboard.png) 21 | 22 | ## 验证集上的训练信息 23 | - 平均准确率 24 | - 平均损失 25 | - 平均Auc 26 | - 预测的平均点击率 27 | - 精确率、召回率、F1 Score等信息 28 | 29 | ![image](https://raw.githubusercontent.com/chengstone/kaggle_criteo_ctr_challenge-/master/train_info.png) 30 | 31 | 更多内容请参考代码,Enjoy! 32 | -------------------------------------------------------------------------------- /Click_prediction/code/Logloss.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 _*- 2 | """ 3 | @author:charlesXu 4 | @file: Logloss.py 5 | @desc: 腾讯算法大赛logloss求法 6 | @time: 2018/03/04 7 | """ 8 | 9 | import scipy as sp 10 | 11 | def logloss(act, pred): 12 | epsilon = 1e-15 13 | pred = sp.maximum(epsilon, pred) 14 | pred = sp.minimum(1-epsilon, pred) 15 | ll = sum(act * sp.log(pred) + sp.subtract(1, act) * sp.log(sp.subtract(1, pred))) 16 | ll = ll * - 1.0 / len(act) 17 | return ll -------------------------------------------------------------------------------- /Click_prediction/code/ctr/Preprocess.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 _*- 2 | 3 | """ 4 | @version: 5 | @author: CharlesXu 6 | @license: Q_S_Y_Q 7 | @file: Preprocess.py 8 | @time: 2018/3/5 11:08 9 | @desc: 数据预处理 10 | """ 11 | 12 | ''' 13 | 生成神经网络的输入 14 | 生成ffm的输入 15 | 生成GBDT的输入 16 | ''' 17 | 18 | continous_features = range(1, 14) 19 | categorial_features = range(14, 40) 20 | 21 | 22 | 23 | 24 | 25 | if __name__ == '__main__': 26 | pass -------------------------------------------------------------------------------- /Click_prediction/code/ctr/ffm.py: -------------------------------------------------------------------------------- 1 | import subprocess,multiprocessing 2 | import os,time 3 | import pandas as pd 4 | import numpy as np 5 | 6 | class FFM: 7 | """libffm-1.21 Python Wrapper with libffm format data 8 | 9 | :Args: 10 | - reg_lambda: float, default: 2e-5 11 | regularization parameter 12 | - factor: int,default: 4 13 | number of latent factors 14 | - iteration: int, default: 15 15 | - learning_rate: float, default: 0.2 16 | - n_jobs: int, default: 1 17 | Number of parallel threads 18 | - verbose: int, default: 1 19 | - norm: bool, default: True 20 | instance-wise normalization 21 | """ 22 | def __init__(self,reg_lambda=0.00002,factor=4,iteration=15,learning_rate=0.2,n_jobs=1, 23 | verbose=1,norm=True,): 24 | if n_jobs <=0 or n_jobs > multiprocessing.cpu_count(): 25 | raise ValueError('n_jobs must be 1~{0}'.format(multiprocessing.cpu_count())) 26 | 27 | self.reg_lambda = reg_lambda 28 | self.factor = factor 29 | self.iteration = iteration 30 | self.learning_rate = learning_rate 31 | self.n_jobs = 
n_jobs 32 | self.verbose = verbose 33 | self.norm = norm 34 | 35 | 36 | self.cmd = '' 37 | 38 | self.output_name = 'ffm_result'+str(int(time.time()))# temp predict result file 39 | 40 | 41 | 42 | 43 | def fit(self,train_ffm_path,valid_ffm_path=None,model_path=None,auto_stop=False,): 44 | """ Train the FFM model with ffm-format data, 45 | 46 | :Args: 47 | - train_ffm_path: str 48 | - valid_ffm_path: str, default: None 49 | - model_path: str, default: None 50 | - auto_stop: bool, default: False 51 | stop at the iteration that achieves the best validation loss 52 | """ 53 | 54 | if not os.path.exists(train_ffm_path): 55 | raise FileNotFoundError("file '{0}' not exists".format(train_ffm_path)) 56 | self.train_ffm_path = train_ffm_path 57 | self.valid_ffm_path = valid_ffm_path 58 | self.model_path = None 59 | self.auto_stop = auto_stop 60 | 61 | 62 | cmd = 'ffm-train -l {l} -k {k} -t {t} -r {r} -s {s}'\ 63 | .format(l=self.reg_lambda,k=self.factor,t=self.iteration,r=self.learning_rate,s=self.n_jobs) 64 | if self.valid_ffm_path is not None: 65 | cmd +=' -p {p}'.format(p=self.valid_ffm_path) 66 | 67 | if self.verbose == 0: 68 | cmd += ' --quiet' 69 | if not self.norm: 70 | cmd += ' --no-norm' 71 | 72 | if self.auto_stop: 73 | if self.valid_ffm_path is None: 74 | raise ValueError('Must specify valid_ffm_path when auto_stop = True') 75 | cmd += ' --auto-stop' 76 | cmd += ' {p}'.format(p=self.train_ffm_path) 77 | if not model_path is None: 78 | cmd +=' {p}'.format(p=model_path) 79 | self.model_path = model_path 80 | self.cmd = cmd 81 | print('Sending command...') 82 | popen = subprocess.Popen(cmd, stdout = subprocess.PIPE,shell=True) 83 | while True: 84 | output = str(popen.stdout.readline(),encoding='utf-8').strip('\n') 85 | if output.strip()=='': 86 | print('FFM training done') 87 | break 88 | print(output) 89 | 90 | def predict(self,test_ffm_path,model_path=None): 91 | """ Predict and return the probability of positive class. 
92 | 93 | :Args: 94 | - test_ffm_path: str 95 | - model_path: str, default: None 96 | :returns: 97 | - pred_prob: np.array 98 | """ 99 | 100 | cmd = "ffm-predict {t}".format(t=test_ffm_path) 101 | if model_path is None and self.model_path is None: 102 | raise ValueError('Must specify model_path') 103 | elif model_path is not None: 104 | self.model_path = model_path 105 | cmd +=" {0} {1}".format(self.model_path,self.output_name) 106 | self.cmd = cmd 107 | print('Sending command...') 108 | popen = subprocess.Popen(cmd, stdout = subprocess.PIPE,shell=True) 109 | while True: 110 | output = str(popen.stdout.readline(),encoding='utf-8').strip('\n') 111 | if output.strip()=='': 112 | print('FFM predicting done') 113 | break 114 | print(output) 115 | 116 | ans = pd.read_csv(self.output_name,names=['prob']) 117 | os.remove(self.output_name) 118 | return ans.prob.values -------------------------------------------------------------------------------- /Click_prediction/code/ctr_nn/Main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import roc_auc_score 3 | 4 | import Utils 5 | from Models import LR, FM, PNN1, PNN2, FNN, CCPM 6 | 7 | train_file = '../data/train.yx.txt' 8 | test_file = '../data/test.yx.txt' 9 | # fm_model_file = '../data/fm.model.txt' 10 | 11 | input_dim = Utils.INPUT_DIM 12 | 13 | train_data = Utils.read_data(train_file) 14 | train_data = Utils.shuffle(train_data) 15 | test_data = Utils.read_data(test_file) 16 | 17 | if train_data[1].ndim > 1: 18 | print ('label must be 1-dim') 19 | exit(0) 20 | print('read finish') 21 | 22 | train_size = train_data[0].shape[0] 23 | test_size = test_data[0].shape[0] 24 | num_feas = len(Utils.FIELD_SIZES) 25 | 26 | min_round = 1 27 | num_round = 1000 28 | early_stop_round = 50 29 | batch_size = 1024 30 | 31 | field_sizes = Utils.FIELD_SIZES 32 | field_offsets = Utils.FIELD_OFFSETS 33 | 34 | 35 | def train(model): 36 | history_score = [] 37 | for i in range(num_round): 38 | fetches = [model.optimizer, model.loss] 39 | if batch_size > 0: 40 | ls = [] 41 | for j in range(train_size / batch_size + 1): 42 | X_i, y_i = Utils.slice(train_data, j * batch_size, batch_size) 43 | _, l = model.run(fetches, X_i, y_i) 44 | ls.append(l) 45 | elif batch_size == -1: 46 | X_i, y_i = Utils.slice(train_data) 47 | _, l = model.run(fetches, X_i, y_i) 48 | ls = [l] 49 | train_preds = model.run(model.y_prob, Utils.slice(train_data)[0]) 50 | test_preds = model.run(model.y_prob, Utils.slice(test_data)[0]) 51 | train_score = roc_auc_score(train_data[1], train_preds) 52 | test_score = roc_auc_score(test_data[1], test_preds) 53 | print('[%d]\tloss (with l2 norm):%f\ttrain-auc: %f\teval-auc: %f' % (i, np.mean(ls), train_score, test_score)) 54 | history_score.append(test_score) 55 | if i > min_round and i > early_stop_round: 56 | if np.argmax(history_score) == i - early_stop_round and history_score[-1] - history_score[ 57 | -1 * early_stop_round] < 1e-5: 58 | print('early stop\nbest iteration:\n[%d]\teval-auc: %f' % ( 59 | np.argmax(history_score), np.max(history_score))) 60 | break 61 | 62 | 63 | algo = 'pnn2' 64 | 65 | if algo == 'lr': 66 | lr_params = { 67 | 'input_dim': input_dim, 68 | 'opt_algo': 'gd', 69 | 'learning_rate': 0.01, 70 | 'l2_weight': 0, 71 | 'random_seed': 0 72 | } 73 | 74 | model = LR(**lr_params) 75 | elif algo == 'fm': 76 | fm_params = { 77 | 'input_dim': input_dim, 78 | 'factor_order': 10, 79 | 'opt_algo': 'gd', 80 | 'learning_rate': 0.1, 81 | 'l2_w': 0, 82 | 'l2_v': 0, 83 | } 
84 | 85 | model = FM(**fm_params) 86 | elif algo == 'fnn': 87 | fnn_params = { 88 | 'layer_sizes': [field_sizes, 10, 1], 89 | 'layer_acts': ['tanh', 'none'], 90 | 'drop_out': [0, 0], 91 | 'opt_algo': 'gd', 92 | 'learning_rate': 0.1, 93 | 'layer_l2': [0, 0], 94 | 'random_seed': 0 95 | } 96 | 97 | model = FNN(**fnn_params) 98 | elif algo == 'ccpm': 99 | ccpm_params = { 100 | 'layer_sizes': [field_sizes, 10, 5, 3], 101 | 'layer_acts': ['tanh', 'tanh', 'none'], 102 | 'drop_out': [0, 0, 0], 103 | 'opt_algo': 'gd', 104 | 'learning_rate': 0.1, 105 | 'random_seed': 0 106 | } 107 | 108 | model = CCPM(**ccpm_params) 109 | elif algo == 'pnn1': 110 | pnn1_params = { 111 | 'layer_sizes': [field_sizes, 10, 1], 112 | 'layer_acts': ['tanh', 'none'], 113 | 'drop_out': [0, 0], 114 | 'opt_algo': 'gd', 115 | 'learning_rate': 0.1, 116 | 'layer_l2': [0, 0], 117 | 'kernel_l2': 0, 118 | 'random_seed': 0 119 | } 120 | 121 | model = PNN1(**pnn1_params) 122 | elif algo == 'pnn2': 123 | pnn2_params = { 124 | 'layer_sizes': [field_sizes, 10, 1], 125 | 'layer_acts': ['tanh', 'none'], 126 | 'drop_out': [0, 0], 127 | 'opt_algo': 'gd', 128 | 'learning_rate': 0.01, 129 | 'layer_l2': [0, 0], 130 | 'kernel_l2': 0, 131 | 'random_seed': 0 132 | } 133 | 134 | model = PNN2(**pnn2_params) 135 | 136 | if algo in {'fnn', 'ccpm', 'pnn1', 'pnn2'}: 137 | train_data = Utils.split_data(train_data) 138 | test_data = Utils.split_data(test_data) 139 | 140 | train(model) 141 | 142 | # X_i, y_i = utils.slice(train_data, 0, 100) 143 | # fetches = [model.tmp1, model.tmp2] 144 | # tmp1, tmp2 = model.run(fetches, X_i, y_i) 145 | # print tmp1.shape 146 | # print tmp2.shape 147 | -------------------------------------------------------------------------------- /Click_prediction/code/ctr_nn/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 _*- 2 | 3 | """ 4 | @version: 5 | @author: CharlesXu 6 | @license: Q_S_Y_Q 7 | @file: __init__.py.py 8 | @time: 2018/3/6 18:51 9 | @desc: 10 | """ 11 | 12 | if __name__ == '__main__': 13 | pass -------------------------------------------------------------------------------- /Click_prediction/code/cvr/1.problem_setting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 腾讯移动App广告转化率预估" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "![](./image/Tencent_cvr_prediction.png)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### 题目描述\n", 22 | "计算广告是互联网最重要的商业模式之一,广告投放效果通常通过曝光、点击和转化各环节来衡量,大多数广告系统受广告效果数据回流的限制只能通过曝光或点击作为投放效果的衡量标准开展优化。\n", 23 | "\n", 24 | "腾讯社交广告(`http://ads.tencent.com`)发挥特有的用户识别和转化跟踪数据能力,帮助广告主跟踪广告投放后的转化效果,基于广告转化数据训练转化率预估模型(pCVR,Predicted Conversion Rate),在广告排序中引入pCVR因子优化广告投放效果,提升ROI。\n", 25 | "\n", 26 | "本题目以移动App广告为研究对象,预测App广告点击后被激活的概率:pCVR=P(conversion=1 | Ad,User,Context),即给定广告、用户和上下文情况下广告被点击后发生激活的概率。" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### 训练数据\n", 34 | "从腾讯社交广告系统中某一连续两周的日志中按照推广中的App和用户维度随机采样。\n", 35 | "\n", 36 | "每一条训练样本即为一条广告点击日志(点击时间用clickTime表示),样本label取值0或1,其中0表示点击后没有发生转化,1表示点击后有发生转化,如果label为1,还会提供转化回流时间(conversionTime,定义详见“FAQ”)。给定特征集如下:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "![](./image/data_dscr_1.png)\n", 44 | "![](./image/data_dscr_2.png)\n", 45 | "![](./image/data_dscr_3.png)" 46 | ] 
47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "特别的,出于数据安全的考虑,对于userID,appID,特征,以及时间字段,我们不提供原始数据,按照如下方式加密处理:" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "![](./image/data_dscr_4.png)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "#### 训练数据文件(train.csv)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "每行代表一个训练样本,各字段之间由逗号分隔,顺序依次为:“label,clickTime,conversionTime,creativeID,userID,positionID,connectionType,telecomsOperator”。\n", 74 | "\n", 75 | "当label=0时,conversionTime字段为空字符串。特别的,训练数据时间范围为第17天0点到第31天0点(定义详见下面的“补充说明”)。为了节省存储空间,用户、App、广告和广告位相关信息以独立文件提供(训练数据和测试数据共用),具体如下:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "![](./image/data_dscr_5.png)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "注:若字段取值为0或空字符串均代表未知。(站点集合ID(sitesetID)为0并不表示未知,而是一个特定的站点集合。)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### 测试数据\n", 97 | "从训练数据时段随后1天(即第31天)的广告日志中按照与训练数据同样的采样方式抽取得到,测试数据文件(test.csv)每行代表一个测试样本,各字段之间由逗号分隔,顺序依次为:“instanceID,-1,clickTime,creativeID,userID,positionID,connectionType,telecomsOperator”。其中,instanceID唯一标识一个样本,-1代表label占位使用,表示待预测。" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### 评估方式\n", 105 | "通过Logarithmic Loss评估(越小越好),公式如下:\n", 106 | "![](http://qzonestyle.gtimg.cn/gdt/canvas/Starry/public/image/formula-1.png)\n", 107 | "其中,N是测试样本总数,yi是二值变量,取值0或1,表示第i个样本的label,pi为模型预测第i个样本 label为1的概率。" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "示例代码如下(Python语言):\n", 115 | "```python\n", 116 | "import scipy as sp\n", 117 | "def logloss(act, pred):\n", 118 | " epsilon = 1e-15\n", 119 | " pred = sp.maximum(epsilon, pred)\n", 120 | " pred = sp.minimum(1-epsilon, pred)\n", 121 | " ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))\n", 122 | " ll = ll * -1.0/len(act)\n", 123 | " return ll\n", 124 | "```" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### 提交格式\n", 132 | "模型预估结果以zip压缩文件方式提交,内部文件名是submission.csv。每行代表一个测试样本,第一行为header,可以记录本文件相关关键信息,评测时会忽略,从第二行开始各字段之间由逗号分隔,顺序依次为:“instanceID, prob”,其中,instanceID唯一标识一个测试样本,必须升序排列,prob为模型预估的广告转化概率。示例如下:\n", 133 | "![](http://qzonestyle.gtimg.cn/gdt/canvas/Starry/public/image/test-data.png?version=1.0.7)" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 2", 140 | "language": "python", 141 | "name": "python2" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 2 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython2", 153 | "version": "2.7.12" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } 159 | -------------------------------------------------------------------------------- /Click_prediction/code/cvr/2.Baseline_version.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## CVR预估基线版本" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 
2.1 基于AD统计的版本" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "# -*- coding: utf-8 -*-\n", 26 | "\"\"\"\n", 27 | "baseline 1: history pCVR of creativeID/adID/camgaignID/advertiserID/appID/appPlatform\n", 28 | "\"\"\"\n", 29 | "\n", 30 | "import zipfile\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "\n", 34 | "# load data\n", 35 | "data_root = \"E:\\dataset\\pre\"\n", 36 | "dfTrain = pd.read_csv(\"%s/train.csv\"%data_root)\n", 37 | "dfTest = pd.read_csv(\"%s/test.csv\"%data_root)\n", 38 | "dfAd = pd.read_csv(\"%s/ad.csv\"%data_root)\n", 39 | "\n", 40 | "# process data\n", 41 | "dfTrain = pd.merge(dfTrain, dfAd, on=\"creativeID\")\n", 42 | "dfTest = pd.merge(dfTest, dfAd, on=\"creativeID\")\n", 43 | "y_train = dfTrain[\"label\"].values\n", 44 | "\n", 45 | "# model building\n", 46 | "key = \"appID\"\n", 47 | "dfCvr = dfTrain.groupby(key).apply(lambda df: np.mean(df[\"label\"])).reset_index()\n", 48 | "dfCvr.columns = [key, \"avg_cvr\"]\n", 49 | "dfTest = pd.merge(dfTest, dfCvr, how=\"left\", on=key)\n", 50 | "dfTest[\"avg_cvr\"].fillna(np.mean(dfTrain[\"label\"]), inplace=True)\n", 51 | "proba_test = dfTest[\"avg_cvr\"].values\n", 52 | "\n", 53 | "# submission\n", 54 | "df = pd.DataFrame({\"instanceID\": dfTest[\"instanceID\"].values, \"proba\": proba_test})\n", 55 | "df.sort_values(\"instanceID\", inplace=True)\n", 56 | "df.to_csv(\"submission.csv\", index=False)\n", 57 | "with zipfile.ZipFile(\"submission.zip\", \"w\") as fout:\n", 58 | " fout.write(\"submission.csv\", compress_type=zipfile.ZIP_DEFLATED)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### 得分\n", 66 | "| Submission | 描述| 初赛A | 初赛B | 决赛A | 决赛B |\n", 67 | "| :------- | :-------: | :-------: | :-------: | :-------: | :-------: |\n", 68 | "| baseline 2.1 | ad 统计 | 0.10988 | - | - | - |" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### 2.2 AD+LR版本" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "# -*- coding: utf-8 -*-\n", 87 | "\"\"\"\n", 88 | "baseline 2: ad.csv (creativeID/adID/camgaignID/advertiserID/appID/appPlatform) + lr\n", 89 | "\"\"\"\n", 90 | "\n", 91 | "import zipfile\n", 92 | "import pandas as pd\n", 93 | "from scipy import sparse\n", 94 | "from sklearn.preprocessing import OneHotEncoder\n", 95 | "from sklearn.linear_model import LogisticRegression\n", 96 | "\n", 97 | "# load data\n", 98 | "data_root = \"./data\"\n", 99 | "dfTrain = pd.read_csv(\"%s/train.csv\"%data_root)\n", 100 | "dfTest = pd.read_csv(\"%s/test.csv\"%data_root)\n", 101 | "dfAd = pd.read_csv(\"%s/ad.csv\"%data_root)\n", 102 | "\n", 103 | "# process data\n", 104 | "dfTrain = pd.merge(dfTrain, dfAd, on=\"creativeID\")\n", 105 | "dfTest = pd.merge(dfTest, dfAd, on=\"creativeID\")\n", 106 | "y_train = dfTrain[\"label\"].values\n", 107 | "\n", 108 | "# feature engineering/encoding\n", 109 | "enc = OneHotEncoder()\n", 110 | "feats = [\"creativeID\", \"adID\", \"camgaignID\", \"advertiserID\", \"appID\", \"appPlatform\"]\n", 111 | "for i,feat in enumerate(feats):\n", 112 | " x_train = enc.fit_transform(dfTrain[feat].values.reshape(-1, 1))\n", 113 | " x_test = enc.transform(dfTest[feat].values.reshape(-1, 1))\n", 114 | " if i == 0:\n", 115 | " X_train, X_test = x_train, x_test\n", 116 | " 
else:\n", 117 | " X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test))\n", 118 | "\n", 119 | "# model training\n", 120 | "lr = LogisticRegression()\n", 121 | "lr.fit(X_train, y_train)\n", 122 | "proba_test = lr.predict_proba(X_test)[:,1]\n", 123 | "\n", 124 | "# submission\n", 125 | "df = pd.DataFrame({\"instanceID\": dfTest[\"instanceID\"].values, \"proba\": proba_test})\n", 126 | "df.sort_values(\"instanceID\", inplace=True)\n", 127 | "df.to_csv(\"submission.csv\", index=False)\n", 128 | "with zipfile.ZipFile(\"submission.zip\", \"w\") as fout:\n", 129 | " fout.write(\"submission.csv\", compress_type=zipfile.ZIP_DEFLATED)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "### 得分\n", 137 | "| Submission | 描述| 初赛A | 初赛B | 决赛A | 决赛B |\n", 138 | "| :------- | :-------: | :-------: | :-------: | :-------: | :-------: |\n", 139 | "| baseline 2.2 | ad + lr | 0.10743 | - | - | - |" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 2", 146 | "language": "python", 147 | "name": "python2" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 2 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython2", 159 | "version": "2.7.12" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 2 164 | } 165 | -------------------------------------------------------------------------------- /Click_prediction/code/cvr/README.md: -------------------------------------------------------------------------------- 1 | # 第一届腾讯社交广告高校算法大赛-移动App广告转化率预估 2 | 赛题详情http://algo.tpai.qq.com/home/information/index.html 3 | 题目描述 4 | 根据从某社交广告系统连续两周的日志记录按照推广中的App和用户维度随机采样构造的数据,预测App广告点击后被激活的概率:pCVR=P(conversion=1 | Ad,User,Context),即给定广告、用户和上下文情况下广告被点击后发生激活的概率。 5 | # 运行环境 6 | - 操作系统 Ubuntu 14.04.4 LTS (GNU/Linux 4.2.0-27-generic x86_64) 7 | - 内存 128GB 8 | - CPU 32 Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz 9 | - 显卡 TITAN X (Pascal) 12GB 10 | - 语言 Python3.6 11 | - Python依赖包 12 | 1. Keras==2.0.6 13 | 2. lightgbm==0.1 14 | 3. matplotlib==2.0.0 15 | 4. numpy==1.11.3 16 | 5. pandas==0.19.2 17 | 6. scikit-learn==0.18.1 18 | 7. scipy==0.18.1 19 | 8. tensorflow-gpu==1.2.1 20 | 9. tqdm==4.11.2 21 | 10. xgboost==0.6a2 22 | - 其他库 23 | LIBFFM v121 24 | # 运行说明 25 | 1. 将复赛数据文件`final.zip`放在根目录下 26 | 2. 在根目录下运行`sh run.sh`命令生成特征文件 27 | 3. 打开`./code/_4_*_model_*.ipynb`分别进行模型训练和预测,生成单模型提交结果,包括`lgb,xgb,ffm,mlp` 28 | 4. 打开`./code/_4_5_model_avg.ipynb`进行最终的加权平均并生成最终提交结果 29 | # 方案说明 30 | 31 | 1. 用户点击日志挖掘`_2_1_gen_user_click_features.py` 32 | 挖掘广告点击日志,从不同时间粒度(天,小时)和不同属性维度(点击的素材,广告,推广计划,广告主类型,广告位等)提取用户点击行为的统计特征。 33 | 2. 用户安装日志挖掘 `_2_2_gen_app_install_features.py` 34 | 根据用户历史APP安装记录日志,分析用户的安装偏好和APP的流行趋势,结合APP安装时间的信息提取APP的时间维度的描述向量。这里最后只用了一种特征。 35 | 3. 广告主转化回流上报机制分析`_2_4_gen_tricks.py` 36 | 不同的广告主具有不同的转化计算方式,如第一次点击算转化,最后一次点击算转化,安装时点击算转化,分析并构造相应描述特征,提升模型预测精度。 37 | 4. 广告转化率特征提取`_2_5_gen_smooth_cvr.py` 38 | 构造转化率特征,使用全局和滑动窗口等方式计算单特征转化率,组合特征转化率,使用均值填充,层级填充,贝叶斯平滑,拉普拉斯平滑等方式对转化率进行修正。 39 | 5. 广告描述向量特征提取`_2_6_gen_ID_click_vectors.py` 40 | 广告投放是有特定受众对象的,而特定的受众对象也可以描述广告的相关特性,使用不同的人口属性对广告ID和APPID进行向量表示,学习隐含的语义特征。 41 | 6. 
建模预测 42 | 使用多种模型进行训练,包括LightGBM,XGBoost,FFM和神经网络,最后进行多模型加权融合提高最终模型性能。 43 | 44 | # 其他 45 | - 最终线上排名20,logloss 0.101763 46 | - 最终特征维度在110左右 47 | - 部分最终没有采用的特征代码依然保留 48 | - 由于我们团队的代码是3个人共同完成的,我这里整理的模型训练的部分可能和当时略有差异,但特征部分基本一致。 49 | - `deprecated`目录下为弃用的代码,包括一些原始代码和打算尝试的方法 -------------------------------------------------------------------------------- /Click_prediction/data/data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/data/data.pdf -------------------------------------------------------------------------------- /Click_prediction/data/data_description.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/data/data_description.pdf -------------------------------------------------------------------------------- /Click_prediction/data/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget --no-check-certificate https://s3-eu-west-1.amazonaws.com/criteo-labs/dac.tar.gz 4 | tar zxf dac.tar.gz 5 | rm -f dac.tar.gz 6 | 7 | mkdir raw 8 | mv ./*.txt raw/ 9 | -------------------------------------------------------------------------------- /Click_prediction/data/tencent_数据说明/Tencent_cvr_prediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/data/tencent_数据说明/Tencent_cvr_prediction.png -------------------------------------------------------------------------------- /Click_prediction/data/tencent_数据说明/data_dscr_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/data/tencent_数据说明/data_dscr_4.png -------------------------------------------------------------------------------- /Click_prediction/data/tencent_数据说明/data_dscr_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/data/tencent_数据说明/data_dscr_5.png -------------------------------------------------------------------------------- /Click_prediction/data/tencent_数据说明/上下文特征.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/data/tencent_数据说明/上下文特征.png -------------------------------------------------------------------------------- /Click_prediction/data/tencent_数据说明/广告特征.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/data/tencent_数据说明/广告特征.png -------------------------------------------------------------------------------- /Click_prediction/data/tencent_数据说明/用户特征.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/data/tencent_数据说明/用户特征.png 
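Step 4 of the cvr README above relies on Bayesian/Laplace smoothing to keep conversion rates of rarely clicked IDs from collapsing to 0 or 1. A minimal sketch of the idea follows; alpha and beta act as a Beta prior, and the values here are illustrative defaults rather than the moment-matched ones the competition code fits:

```python
import pandas as pd

def smoothed_cvr(df, key, alpha=1.0, beta=100.0):
    """Per-key CVR shrunk toward alpha / (alpha + beta) when clicks are scarce."""
    stats = df.groupby(key)["label"].agg(conversions="sum", clicks="count")
    return (stats["conversions"] + alpha) / (stats["clicks"] + alpha + beta)

# Toy data: creative 1 has 1 conversion out of 3 clicks, creative 2 has 0 of 1.
toy = pd.DataFrame({"creativeID": [1, 1, 1, 2], "label": [1, 0, 0, 0]})
print(smoothed_cvr(toy, "creativeID"))
```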
-------------------------------------------------------------------------------- /Click_prediction/doc/8课下课件-张伟楠.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/doc/8课下课件-张伟楠.pdf -------------------------------------------------------------------------------- /Click_prediction/doc/Ad click prediction a view from the trenches.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/doc/Ad click prediction a view from the trenches.pdf -------------------------------------------------------------------------------- /Click_prediction/doc/ffm.txt: -------------------------------------------------------------------------------- 1 | Applying FFM 2 | 3 | In computational advertising, click-through rate (CTR) and conversion rate (CVR) are two key metrics for measuring the value of ad traffic. 4 | Accurate CTR/CVR estimates are an important guide for raising the value of that traffic and increasing ad revenue. 5 | 6 | Common industry approaches to CTR/CVR prediction include 7 | hand-crafted feature engineering + 8 | LR (Logistic Regression), 9 | GBDT (Gradient Boosting Decision Tree) + 10 | LR[1][2][3], 11 | FM (Factorization Machine)[2][7], and 12 | FFM (Field-aware Factorization Machine)[9] models. 13 | Among these, FM and FFM have stood out in recent years, winning the CTR prediction competitions hosted by Criteo and Avazu respectively[4][5]. 14 | 15 | 16 | -------------------------------------------------------------------------------- /Click_prediction/doc/fm.txt: -------------------------------------------------------------------------------- 1 | Notes on FM 2 | 3 | FM addresses feature-interaction modeling on large-scale, sparse data. Considering only the second-order polynomial case, the model is y(x) = w0 + Σ_{i=1..n} wi·xi + Σ_{i=1..n} Σ_{j=i+1..n} wij·xi·xj, where n is the number of features per sample, xi is the value of the i-th feature, and w0, wi, wij are the model parameters; FM then factorizes each wij as an inner product of two latent vectors. 4 | 5 | 6 | -------------------------------------------------------------------------------- /Click_prediction/doc/资料.txt: -------------------------------------------------------------------------------- 1 | Reference links: 2 | CTR-prediction data download: wget --no-check-certificate https://s3-eu-west-1.amazonaws.com/criteo-labs/dac.tar.gz 3 | CTR prediction algorithms, FM and FFM in detail: http://blog.csdn.net/jediael_lu/article/details/77772565 4 | Kaggle in practice: click-through-rate prediction: http://blog.csdn.net/chengcheng1394/article/details/78940565 5 | FFM principles and practice in depth: http://blog.csdn.net/mmc2015/article/details/51760681 6 | Interview questions on CTR prediction: http://blog.csdn.net/wanghai00/article/details/60466617 7 | 8 | -------------------------------------------------------------------------------- /Click_prediction/libffm/libffm/COPYRIGHT: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2017 The LIBFFM Project. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | 3. Neither name of copyright holders nor the names of its contributors 17 | may be used to endorse or promote products derived from this software 18 | without specific prior written permission.
19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR 25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /Click_prediction/libffm/libffm/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -Wall -O3 -std=c++0x -march=native 3 | 4 | # comment the following flags if you do not want to SSE instructions 5 | DFLAG += -DUSESSE 6 | 7 | # comment the following flags if you do not want to use OpenMP 8 | #DFLAG += -DUSEOMP 9 | #CXXFLAGS += -fopenmp 10 | 11 | all: ffm-train ffm-predict 12 | 13 | ffm-train: ffm-train.cpp ffm.o timer.o 14 | $(CXX) $(CXXFLAGS) $(DFLAG) -o $@ $^ 15 | 16 | ffm-predict: ffm-predict.cpp ffm.o timer.o 17 | $(CXX) $(CXXFLAGS) $(DFLAG) -o $@ $^ 18 | 19 | ffm.o: ffm.cpp ffm.h timer.o 20 | $(CXX) $(CXXFLAGS) $(DFLAG) -c -o $@ $< 21 | 22 | timer.o: timer.cpp timer.h 23 | $(CXX) $(CXXFLAGS) $(DFLAG) -c -o $@ $< 24 | 25 | clean: 26 | rm -f ffm-train ffm-predict ffm.o timer.o 27 | -------------------------------------------------------------------------------- /Click_prediction/libffm/libffm/Makefile.win: -------------------------------------------------------------------------------- 1 | CXX = cl.exe 2 | CFLAGS = /nologo /O2 /EHsc /D "_CRT_SECURE_NO_DEPRECATE" /D "USEOMP" /D "USESSE" /openmp 3 | 4 | TARGET = windows 5 | 6 | all: $(TARGET) $(TARGET)\ffm-train.exe $(TARGET)\ffm-predict.exe 7 | 8 | $(TARGET)\ffm-predict.exe: ffm.h ffm-predict.cpp ffm.obj timer.obj 9 | $(CXX) $(CFLAGS) ffm-predict.cpp ffm.obj timer.obj -Fe$(TARGET)\ffm-predict.exe 10 | 11 | $(TARGET)\ffm-train.exe: ffm.h ffm-train.cpp ffm.obj timer.obj 12 | $(CXX) $(CFLAGS) ffm-train.cpp ffm.obj timer.obj -Fe$(TARGET)\ffm-train.exe 13 | 14 | ffm.obj: ffm.cpp ffm.h 15 | $(CXX) $(CFLAGS) -c ffm.cpp 16 | 17 | timer.obj: timer.cpp timer.h 18 | $(CXX) $(CFLAGS) -c timer.cpp 19 | 20 | .PHONY: $(TARGET) 21 | $(TARGET): 22 | -mkdir $(TARGET) 23 | 24 | clean: 25 | -erase /Q *.obj *.exe $(TARGET)\. 
26 | -rd $(TARGET) 27 | -------------------------------------------------------------------------------- /Click_prediction/libffm/libffm/ffm-predict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libffm/libffm/ffm-predict -------------------------------------------------------------------------------- /Click_prediction/libffm/libffm/ffm-predict.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "ffm.h" 13 | 14 | using namespace std; 15 | using namespace ffm; 16 | 17 | struct Option { 18 | string test_path, model_path, output_path, withoutY_flag; 19 | }; 20 | 21 | string predict_help() { 22 | return string( 23 | "usage: ffm-predict test_file model_file output_file\n"); 24 | } 25 | 26 | Option parse_option(int argc, char **argv) { 27 | vector args; 28 | for(int i = 0; i < argc; i++) 29 | args.push_back(string(argv[i])); 30 | 31 | if(argc == 1) 32 | throw invalid_argument(predict_help()); 33 | 34 | Option option; 35 | 36 | if(argc != 4 && argc != 5) 37 | throw invalid_argument("cannot parse argument"); 38 | 39 | option.test_path = string(args[1]); 40 | option.model_path = string(args[2]); 41 | option.output_path = string(args[3]); 42 | if(argc == 5){ 43 | option.withoutY_flag = string(args[4]); 44 | } else { 45 | option.withoutY_flag = ""; 46 | } 47 | 48 | return option; 49 | } 50 | 51 | void predict(string test_path, string model_path, string output_path) { 52 | int const kMaxLineSize = 1000000; 53 | 54 | FILE *f_in = fopen(test_path.c_str(), "r"); 55 | ofstream f_out(output_path); 56 | ofstream f_out_t(output_path + ".logit"); 57 | char line[kMaxLineSize]; 58 | 59 | ffm_model model = ffm_load_model(model_path); 60 | 61 | ffm_double loss = 0; 62 | vector x; 63 | ffm_int i = 0; 64 | 65 | for(; fgets(line, kMaxLineSize, f_in) != nullptr; i++) { 66 | x.clear(); 67 | char *y_char = strtok(line, " \t"); 68 | ffm_float y = (atoi(y_char)>0)? 1.0f : -1.0f; 69 | 70 | while(true) { 71 | char *field_char = strtok(nullptr,":"); 72 | char *idx_char = strtok(nullptr,":"); 73 | char *value_char = strtok(nullptr," \t"); 74 | if(field_char == nullptr || *field_char == '\n') 75 | break; 76 | 77 | ffm_node N; 78 | N.f = atoi(field_char); 79 | N.j = atoi(idx_char); 80 | N.v = atof(value_char); 81 | 82 | x.push_back(N); 83 | } 84 | 85 | ffm_float y_bar = ffm_predict(x.data(), x.data()+x.size(), model); 86 | ffm_float ret_t = ffm_get_wTx(x.data(), x.data()+x.size(), model); 87 | loss -= y==1? 
log(y_bar) : log(1-y_bar); 88 | 89 | f_out_t << ret_t << "\n"; 90 | f_out << y_bar << "\n"; 91 | } 92 | 93 | loss /= i; 94 | 95 | cout << "logloss = " << fixed << setprecision(5) << loss << endl; 96 | 97 | fclose(f_in); 98 | } 99 | 100 | 101 | void predict_withoutY(string test_path, string model_path, string output_path) { 102 | int const kMaxLineSize = 1000000; 103 | 104 | FILE *f_in = fopen(test_path.c_str(), "r"); 105 | ofstream f_out(output_path); 106 | ofstream f_out_t(output_path + ".logit"); 107 | char line[kMaxLineSize]; 108 | 109 | ffm_model model = ffm_load_model(model_path); 110 | 111 | //ffm_double loss = 0; 112 | vector x; 113 | ffm_int i = 0; 114 | 115 | for(; fgets(line, kMaxLineSize, f_in) != nullptr; i++) { 116 | x.clear(); 117 | //char *y_char = strtok(line, " \t"); 118 | //ffm_float y = (atoi(y_char)>0)? 1.0f : -1.0f; 119 | 120 | char *field_char = strtok(line,":"); 121 | char *idx_char = strtok(nullptr,":"); 122 | char *value_char = strtok(nullptr," \t"); 123 | if(field_char == nullptr || *field_char == '\n') 124 | continue; 125 | 126 | ffm_node N; 127 | N.f = atoi(field_char); 128 | N.j = atoi(idx_char); 129 | N.v = atof(value_char); 130 | 131 | x.push_back(N); 132 | 133 | while(true) { 134 | char *field_char = strtok(nullptr,":"); 135 | char *idx_char = strtok(nullptr,":"); 136 | char *value_char = strtok(nullptr," \t"); 137 | if(field_char == nullptr || *field_char == '\n') 138 | break; 139 | 140 | ffm_node N; 141 | N.f = atoi(field_char); 142 | N.j = atoi(idx_char); 143 | N.v = atof(value_char); 144 | 145 | x.push_back(N); 146 | } 147 | 148 | ffm_float y_bar = ffm_predict(x.data(), x.data()+x.size(), model); 149 | ffm_float ret_t = ffm_get_wTx(x.data(), x.data()+x.size(), model); 150 | //loss -= y==1? log(y_bar) : log(1-y_bar); 151 | 152 | f_out_t << ret_t << "\n"; 153 | f_out << y_bar << "\n"; 154 | } 155 | 156 | //loss /= i; 157 | 158 | //cout << "logloss = " << fixed << setprecision(5) << loss << endl; 159 | cout << "done!" 
<< endl; 160 | 161 | fclose(f_in); 162 | } 163 | 164 | int main(int argc, char **argv) { 165 | Option option; 166 | try { 167 | option = parse_option(argc, argv); 168 | } catch(invalid_argument const &e) { 169 | cout << e.what() << endl; 170 | return 1; 171 | } 172 | 173 | if(argc == 5 && option.withoutY_flag.compare("true") == 0){ 174 | predict_withoutY(option.test_path, option.model_path, option.output_path); 175 | } else { 176 | predict(option.test_path, option.model_path, option.output_path); 177 | } 178 | return 0; 179 | } 180 | -------------------------------------------------------------------------------- /Click_prediction/libffm/libffm/ffm-train: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libffm/libffm/ffm-train -------------------------------------------------------------------------------- /Click_prediction/libffm/libffm/ffm-train.cpp: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic ignored "-Wunused-result" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ffm.h" 11 | 12 | #if defined USEOMP 13 | #include 14 | #endif 15 | 16 | using namespace std; 17 | using namespace ffm; 18 | 19 | string train_help() { 20 | return string( 21 | "usage: ffm-train [options] training_set_file [model_file]\n" 22 | "\n" 23 | "options:\n" 24 | "-l : set regularization parameter (default 0.00002)\n" 25 | "-k : set number of latent factors (default 4)\n" 26 | "-t : set number of iterations (default 15)\n" 27 | "-r : set learning rate (default 0.2)\n" 28 | "-s : set number of threads (default 1)\n" 29 | "-p : set path to the validation set\n" 30 | "--quiet: quiet mode (no output)\n" 31 | "--no-norm: disable instance-wise normalization\n" 32 | "--auto-stop: stop at the iteration that achieves the best validation loss (must be used with -p)\n"); 33 | } 34 | 35 | struct Option { 36 | string tr_path; 37 | string va_path; 38 | string model_path; 39 | ffm_parameter param; 40 | bool quiet = false; 41 | ffm_int nr_threads = 1; 42 | }; 43 | 44 | string basename(string path) { 45 | const char *ptr = strrchr(&*path.begin(), '/'); 46 | if(!ptr) 47 | ptr = path.c_str(); 48 | else 49 | ptr++; 50 | return string(ptr); 51 | } 52 | 53 | Option parse_option(int argc, char **argv) { 54 | vector args; 55 | for(int i = 0; i < argc; i++) 56 | args.push_back(string(argv[i])); 57 | 58 | if(argc == 1) 59 | throw invalid_argument(train_help()); 60 | 61 | Option opt; 62 | 63 | ffm_int i = 1; 64 | for(; i < argc; i++) { 65 | if(args[i].compare("-t") == 0) 66 | { 67 | if(i == argc-1) 68 | throw invalid_argument("need to specify number of iterations after -t"); 69 | i++; 70 | opt.param.nr_iters = atoi(args[i].c_str()); 71 | if(opt.param.nr_iters <= 0) 72 | throw invalid_argument("number of iterations should be greater than zero"); 73 | } else if(args[i].compare("-k") == 0) { 74 | if(i == argc-1) 75 | throw invalid_argument("need to specify number of factors after -k"); 76 | i++; 77 | opt.param.k = atoi(args[i].c_str()); 78 | if(opt.param.k <= 0) 79 | throw invalid_argument("number of factors should be greater than zero"); 80 | } else if(args[i].compare("-r") == 0) { 81 | if(i == argc-1) 82 | throw invalid_argument("need to specify eta after -r"); 83 | i++; 84 | opt.param.eta = atof(args[i].c_str()); 85 | if(opt.param.eta <= 0) 86 | throw 
invalid_argument("learning rate should be greater than zero"); 87 | } else if(args[i].compare("-l") == 0) { 88 | if(i == argc-1) 89 | throw invalid_argument("need to specify lambda after -l"); 90 | i++; 91 | opt.param.lambda = atof(args[i].c_str()); 92 | if(opt.param.lambda < 0) 93 | throw invalid_argument("regularization cost should not be smaller than zero"); 94 | } else if(args[i].compare("-s") == 0) { 95 | if(i == argc-1) 96 | throw invalid_argument("need to specify number of threads after -s"); 97 | i++; 98 | opt.nr_threads = atoi(args[i].c_str()); 99 | if(opt.nr_threads <= 0) 100 | throw invalid_argument("number of threads should be greater than zero"); 101 | } else if(args[i].compare("-p") == 0) { 102 | if(i == argc-1) 103 | throw invalid_argument("need to specify path after -p"); 104 | i++; 105 | opt.va_path = args[i]; 106 | } else if(args[i].compare("--no-norm") == 0) { 107 | opt.param.normalization = false; 108 | } else if(args[i].compare("--quiet") == 0) { 109 | opt.quiet = true; 110 | } else if(args[i].compare("--auto-stop") == 0) { 111 | opt.param.auto_stop = true; 112 | } else { 113 | break; 114 | } 115 | } 116 | 117 | if(i != argc-2 && i != argc-1) 118 | throw invalid_argument("cannot parse command\n"); 119 | 120 | opt.tr_path = args[i]; 121 | i++; 122 | 123 | if(i < argc) { 124 | opt.model_path = string(args[i]); 125 | } else if(i == argc) { 126 | opt.model_path = basename(opt.tr_path) + ".model"; 127 | } else { 128 | throw invalid_argument("cannot parse argument"); 129 | } 130 | 131 | return opt; 132 | } 133 | 134 | int train_on_disk(Option opt) { 135 | string tr_bin_path = basename(opt.tr_path) + ".bin"; 136 | string va_bin_path = opt.va_path.empty()? "" : basename(opt.va_path) + ".bin"; 137 | 138 | ffm_read_problem_to_disk(opt.tr_path, tr_bin_path); 139 | if(!opt.va_path.empty()) 140 | ffm_read_problem_to_disk(opt.va_path, va_bin_path); 141 | 142 | ffm_model model = ffm_train_on_disk(tr_bin_path.c_str(), va_bin_path.c_str(), opt.param); 143 | 144 | ffm_save_model(model, opt.model_path); 145 | 146 | return 0; 147 | } 148 | 149 | int main(int argc, char **argv) { 150 | Option opt; 151 | try { 152 | opt = parse_option(argc, argv); 153 | } catch(invalid_argument &e) { 154 | cout << e.what() << endl; 155 | return 1; 156 | } 157 | 158 | if(opt.quiet) 159 | cout.setstate(ios_base::badbit); 160 | 161 | if(opt.param.auto_stop && opt.va_path.empty()) { 162 | cout << "To use auto-stop, you need to assign a validation set" << endl; 163 | return 1; 164 | } 165 | 166 | #if defined USEOMP 167 | omp_set_num_threads(opt.nr_threads); 168 | #endif 169 | 170 | train_on_disk(opt); 171 | 172 | return 0; 173 | } 174 | -------------------------------------------------------------------------------- /Click_prediction/libffm/libffm/ffm.h: -------------------------------------------------------------------------------- 1 | #ifndef _LIBFFM_H 2 | #define _LIBFFM_H 3 | 4 | #include 5 | 6 | namespace ffm { 7 | 8 | using namespace std; 9 | 10 | typedef float ffm_float; 11 | typedef double ffm_double; 12 | typedef int ffm_int; 13 | typedef long long ffm_long; 14 | 15 | struct ffm_node { 16 | ffm_int f; // field index 17 | ffm_int j; // feature index 18 | ffm_float v; // value 19 | }; 20 | 21 | struct ffm_model { 22 | ffm_int n; // number of features 23 | ffm_int m; // number of fields 24 | ffm_int k; // number of latent factors 25 | ffm_float *W = nullptr; 26 | bool normalization; 27 | ~ffm_model(); 28 | }; 29 | 30 | struct ffm_parameter { 31 | ffm_float eta = 0.2; // learning rate 32 | ffm_float lambda 
struct ffm_model {
    ffm_int n; // number of features
    ffm_int m; // number of fields
    ffm_int k; // number of latent factors
    ffm_float *W = nullptr;
    bool normalization;
    ~ffm_model();
};

struct ffm_parameter {
    ffm_float eta = 0.2; // learning rate
    ffm_float lambda = 0.00002; // regularization parameter
    ffm_int nr_iters = 15;
    ffm_int k = 4; // number of latent factors
    bool normalization = true;
    bool auto_stop = false;
};

void ffm_read_problem_to_disk(string txt_path, string bin_path);

void ffm_save_model(ffm_model &model, string path);

ffm_model ffm_load_model(string path);

ffm_model ffm_train_on_disk(string Tr_path, string Va_path, ffm_parameter param);

ffm_float ffm_predict(ffm_node *begin, ffm_node *end, ffm_model &model);

ffm_float ffm_get_wTx(ffm_node *begin, ffm_node *end, ffm_model &model);

} // namespace ffm

#endif // _LIBFFM_H
--------------------------------------------------------------------------------
/Click_prediction/libffm/libffm/ffm.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libffm/libffm/ffm.o
--------------------------------------------------------------------------------
/Click_prediction/libffm/libffm/timer.cpp:
--------------------------------------------------------------------------------
#include <chrono>
#include "timer.h"

Timer::Timer()
{
    reset();
}

void Timer::reset()
{
    begin = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(begin-begin);
}

void Timer::tic()
{
    begin = std::chrono::high_resolution_clock::now();
}

float Timer::toc()
{
    duration += std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()-begin);
    return get();
}

float Timer::get()
{
    return (float)duration.count() / 1000;
}
--------------------------------------------------------------------------------
/Click_prediction/libffm/libffm/timer.h:
--------------------------------------------------------------------------------
#include <chrono>

class Timer
{
public:
    Timer();
    void reset();
    void tic();
    float toc();
    float get();
private:
    std::chrono::high_resolution_clock::time_point begin;
    std::chrono::milliseconds duration;
};
--------------------------------------------------------------------------------
/Click_prediction/libffm/libffm/timer.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libffm/libffm/timer.o
--------------------------------------------------------------------------------
/Click_prediction/libfm/libfm/Makefile:
--------------------------------------------------------------------------------
all:
	cd src/libfm; make all

libFM:
	cd src/libfm; make libFM

clean:
	cd src/libfm; make clean
--------------------------------------------------------------------------------
/Click_prediction/libfm/libfm/README.md:
--------------------------------------------------------------------------------
libFM
=====

Library for factorization machines

web: http://www.libfm.org/

forum: https://groups.google.com/forum/#!forum/libfm

Factorization machines (FM) are a generic approach that can mimic most factorization models through feature engineering.
This way, factorization machines combine the generality of feature engineering with the superiority of factorization models in estimating interactions between categorical variables of large domain. libFM is a software implementation for factorization machines that features stochastic gradient descent (SGD) and alternating least squares (ALS) optimization as well as Bayesian inference using Markov Chain Monte Carlo (MCMC). 11 | 12 | Compile 13 | ======= 14 | libFM has been tested with the GNU compiler collection and GNU make. libFM and the tools can be compiled with 15 | > make all 16 | 17 | Usage 18 | ===== 19 | Please see the [libFM 1.4.2 manual](http://www.libfm.org/libfm-1.42.manual.pdf) for details about how to use libFM. If you have questions, please visit the [forum](https://groups.google.com/forum/#!forum/libfm). 20 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/bin/convert: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libfm/libfm/bin/convert -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/bin/fm_model: -------------------------------------------------------------------------------- 1 | #global bias W0 2 | -1.51913e-05 3 | #unary interactions Wj 4 | 0.0543242 5 | 0.0284947 6 | 0.0266538 7 | 0.016659 8 | 0.0125872 9 | 0.0107259 10 | 0.0118907 11 | 0.0157203 12 | 0.00839382 13 | 0.00971495 14 | 0.00612373 15 | 0.00524808 16 | 0.00285415 17 | 0.0008417 18 | -0.000239964 19 | -0.0086962 20 | 0.00883555 21 | -0.0162203 22 | -0.00296712 23 | 0.0212882 24 | 0.0188632 25 | -0.00632028 26 | -0.0134724 27 | 0.00510968 28 | -0.0100098 29 | 0.010746 30 | -0.0212505 31 | 0.0112133 32 | -0.00330014 33 | 0.0205507 34 | -0.0058263 35 | -0.00871744 36 | #pairwise interactions Vj,f 37 | -0.00700443 0.0115064 0.000250908 -0.0337887 0.00279776 0.0310548 -0.0307156 0.0202006 38 | -0.0181796 0.00783596 -0.00684362 -0.00773165 0.00437475 -0.00467768 0.00468273 -0.0114019 39 | -0.00487966 -0.000374121 0.000759323 -0.0102248 0.00195136 0.00497072 -0.0068161 0.000974602 40 | -0.00833354 0.00259372 -0.00322643 -0.00897765 0.0036128 0.00131875 -0.000500686 0.000626169 41 | -0.00389153 -0.00202808 0.00232841 -0.0078374 0.00150366 0.00242359 -0.00374564 -0.00292291 42 | -0.00459782 -0.0023391 -0.000624162 0.000624403 0.00092783 -0.00574112 0.00559153 -0.00750119 43 | -0.000536693 -0.00366557 0.0022891 -0.00226212 0.000486126 -9.3187e-05 -0.00130247 -0.0022421 44 | 8.09081e-05 4.88191e-05 0.000511957 -0.00204266 9.63924e-05 0.00186076 -0.00199404 0.00120642 45 | 0.000429027 -0.00296412 -0.000256632 0.00518759 0.000159051 -0.00622174 0.00615728 -0.00362862 46 | -0.002437 0.000719526 -0.00051522 -0.00188664 0.00066375 -5.7006e-05 -8.73099e-05 -0.00137168 47 | -0.00307155 5.34744e-05 -0.000672551 -0.00166223 0.00100586 -0.00121773 0.000851445 -0.00220555 48 | 0.000915018 -0.000786345 3.11917e-05 0.00232754 -0.000285893 -0.00181935 0.00185057 -0.000782855 49 | 0.000590278 -0.00137097 0.00103078 3.35035e-05 -0.000245336 -8.46218e-05 -0.0005121 -0.000769284 50 | 0.00137207 0.00063334 -0.000470167 0.00216574 -0.00048717 -0.000448922 0.000950496 0.000930627 51 | 0.000298715 0.000457383 6.04733e-07 -0.000447437 -9.5566e-05 0.000937129 -0.000852014 0.000909023 52 | 0.00154043 0.000518873 -0.000140893 0.00188433 -0.000605516 -2.53168e-05 
0.000338521 0.000910561 53 | 0.000421026 0.000134062 2.61146e-05 0.0002638 -0.000144135 0.00021623 -0.000152026 0.000388725 54 | 0.000450511 0.000203354 -2.3082e-05 0.000364577 -0.000164302 0.000185774 -8.86271e-05 0.000415691 55 | 0.000313412 0.000218558 1.86745e-05 -4.68575e-05 -0.00010331 0.000448149 -0.000387851 0.000521073 56 | 0.000357683 0.000128925 2.93406e-05 0.000138543 -0.000118317 0.000268847 -0.000212042 0.000399502 57 | 0.000336198 0.000135479 1.68862e-06 0.000218085 -0.000116749 0.00017916 -0.000117798 0.000333787 58 | 0.000307244 0.000122211 -3.93336e-06 0.000211933 -0.000106381 0.000151173 -9.33714e-05 0.000302845 59 | 0.000300511 9.93715e-05 4.86275e-06 0.000230786 -0.000104756 0.000116281 -6.59211e-05 0.000259613 60 | 0.000300723 0.000100172 1.29128e-05 0.000185668 -0.000101935 0.000157492 -0.000108992 0.000289874 61 | 0.000278255 0.00010348 -4.64355e-07 0.000201026 -9.63521e-05 0.000124331 -7.38646e-05 0.000260312 62 | 0.000258378 0.000100057 7.075e-06 0.000145423 -8.80599e-05 0.000155841 -0.000111522 0.00026875 63 | 0.000275873 8.85816e-05 9.6919e-08 0.000221333 -9.54046e-05 9.49409e-05 -4.61547e-05 0.000236249 64 | 0.000248546 8.46825e-05 8.2937e-06 0.000155056 -8.3886e-05 0.000129072 -8.80767e-05 0.000242182 65 | 0.000247644 7.74817e-05 5.36621e-06 0.000181076 -8.41681e-05 0.000100233 -5.98669e-05 0.000221104 66 | 0.000245071 8.46181e-05 2.88367e-06 0.000178966 -8.52511e-05 0.000104409 -6.22723e-05 0.000221694 67 | 0.000252015 7.16076e-05 7.94436e-06 0.000184286 -8.51625e-05 9.77973e-05 -5.71402e-05 0.000220597 68 | 0.000222416 8.4576e-05 -6.5907e-06 0.000178845 -7.773e-05 8.3189e-05 -4.16687e-05 0.000201234 69 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/bin/libFM: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libfm/libfm/bin/libFM -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/bin/transpose: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libfm/libfm/bin/transpose -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/scripts/triple_format_to_libfm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 3 | # Contact: srendle@libfm.org, http://www.libfm.org/ 4 | # 5 | # This file is part of libFM. 6 | # 7 | # libFM is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # libFM is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with libFM. If not, see . 
19 | # 20 | # 21 | # triple_format_to_libfm.pl: Converts data in a triple format 22 | # "id1 id2 id3 target" (like often used in recommender systems for rating 23 | # prediction) into the libfm format. 24 | # 25 | # Version history 26 | # - 2013-07-12: write groups 27 | # - 2012-12-27: header is not printed 28 | 29 | use Getopt::Long; 30 | use strict; 31 | 32 | srand(); 33 | 34 | 35 | my $file_in; 36 | my $file_out_meta; 37 | my $has_header = 0; 38 | my $target_column = undef; 39 | my $_delete_column = ""; 40 | my $offset = 0; # where to start counting for indices. For libsvm one should start with 1; libfm can deal with 0. 41 | my $separator = " "; 42 | 43 | # example 44 | # ./triple_format_to_libfm.pl --in train.txt,test.txt --header 0 --target_column 2 --delete_column 3,4,5,6,7 --offset 0 45 | 46 | 47 | GetOptions( 48 | 'in=s' => \$file_in, 49 | 'header=i' => \$has_header, 50 | 'target_column=i' => \$target_column, 51 | 'delete_column=s' => \$_delete_column, 52 | 'offset=i' => \$offset, 53 | 'separator=s' => \$separator, 54 | 'outmeta=s' => \$file_out_meta, 55 | ); 56 | 57 | (defined $target_column) || die "no target column specified"; 58 | 59 | my @files = split(/[,;]/, $file_in); 60 | my %delete_column; 61 | foreach my $c (split(/[,;]/, $_delete_column)) { 62 | $delete_column{int($c)} = 1; 63 | } 64 | 65 | my %id; 66 | my $id_cntr = $offset; 67 | 68 | my $OUT_GROUPS; 69 | if (defined $file_out_meta) { 70 | open $OUT_GROUPS, '>' , $file_out_meta; 71 | } 72 | 73 | foreach my $file_name (@files) { 74 | my $file_out = $file_name . ".libfm"; 75 | print "transforming file $file_name to $file_out..."; 76 | my $num_triples = 0; 77 | 78 | open my $IN, '<' , $file_name; 79 | open my $OUT, '>' , $file_out; 80 | if ($has_header) { 81 | $_ = <$IN>; 82 | # print {$OUT} $_; 83 | } 84 | while (<$IN>) { 85 | chomp; 86 | if ($_ ne "") { 87 | my @data = split /$separator/; 88 | ($#data >= $target_column) || die "not enough values in line $num_triples, expected at least $target_column values\nfound $_\n"; 89 | my $out_str = $data[$target_column]; 90 | my $out_col_id = 0; ## says which column in the input a field corresponds to after "deleting" the "delete_column", i.e. it is a counter over the #$data-field in @data assuming that some of the columns have been deleted; one can see this as the "group" id 91 | for (my $i = 0; $i <= $#data; $i++) { 92 | if (($i != $target_column) && (! exists $delete_column{$i})) { 93 | my $col_id = $out_col_id . " " . $data[$i]; ## this id holds the unique id of $data[$i] (also w.r.t. its group) 94 | if (! exists $id{$col_id}) { 95 | $id{$col_id} = $id_cntr; 96 | if (defined $file_out_meta) { 97 | print {$OUT_GROUPS} $out_col_id, "\n"; 98 | } 99 | $id_cntr++; 100 | } 101 | my $libfm_id = $id{$col_id}; 102 | $out_str .= " " . $libfm_id . ":1"; 103 | $out_col_id++; 104 | } 105 | } 106 | print {$OUT} $out_str, "\n"; 107 | } 108 | } 109 | close $OUT; 110 | close $IN; 111 | print "\n"; 112 | } 113 | 114 | if (defined $file_out_meta) { 115 | close $OUT_GROUPS; 116 | } 117 | 118 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/fm_core/fm_data.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 
5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // fm_data.h: Base data type of libFM 21 | 22 | #ifndef FM_DATA_H_ 23 | #define FM_DATA_H_ 24 | 25 | typedef float FM_FLOAT; 26 | 27 | #endif /*FM_DATA_H_*/ 28 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/fm_core/fm_model.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // fm_model.h: Model for Factorization Machines 21 | // 22 | // Based on the publication(s): 23 | // - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th 24 | // IEEE International Conference on Data Mining (ICDM 2010), Sydney, 25 | // Australia. 
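//
// The model equation implemented by predict() below is
//   y(x) = w0 + sum_j w_j * x_j + sum_{j<j'} <v_j, v_j'> * x_j * x_j',
// where the pairwise term is evaluated in O(k * N_z(x)) via the identity
//   sum_{j<j'} <v_j,v_j'> x_j x_j'
//     = 0.5 * sum_f [ (sum_j v_{f,j} x_j)^2 - sum_j (v_{f,j} x_j)^2 ].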

#ifndef FM_MODEL_H_
#define FM_MODEL_H_

#include "../util/matrix.h"
#include "../util/fmatrix.h"

#include "fm_data.h"


class fm_model {
    private:
        DVector<double> m_sum, m_sum_sqr;
    public:
        double w0;
        DVectorDouble w;
        DMatrixDouble v;

    public:
        // the following values should be set:
        uint num_attribute;

        bool k0, k1;
        int num_factor;

        double reg0;
        double regw, regv;

        double init_stdev;
        double init_mean;

        fm_model();
        void debug();
        void init();
        double predict(sparse_row<FM_FLOAT>& x);
        double predict(sparse_row<FM_FLOAT>& x, DVector<double> &sum, DVector<double> &sum_sqr);
        void saveModel(std::string model_file_path);
        int loadModel(std::string model_file_path);
    private:
        void splitString(const std::string& s, char c, std::vector<std::string>& v);

};



fm_model::fm_model() {
    num_factor = 0;
    init_mean = 0;
    init_stdev = 0.01;
    reg0 = 0.0;
    regw = 0.0;
    regv = 0.0;
    k0 = true;
    k1 = true;
}

void fm_model::debug() {
    std::cout << "num_attributes=" << num_attribute << std::endl;
    std::cout << "use w0=" << k0 << std::endl;
    std::cout << "use w1=" << k1 << std::endl;
    std::cout << "dim v =" << num_factor << std::endl;
    std::cout << "reg_w0=" << reg0 << std::endl;
    std::cout << "reg_w=" << regw << std::endl;
    std::cout << "reg_v=" << regv << std::endl;
    std::cout << "init ~ N(" << init_mean << "," << init_stdev << ")" << std::endl;
}

void fm_model::init() {
    w0 = 0;
    w.setSize(num_attribute);
    v.setSize(num_factor, num_attribute);
    w.init(0);
    v.init(init_mean, init_stdev);
    m_sum.setSize(num_factor);
    m_sum_sqr.setSize(num_factor);
}

double fm_model::predict(sparse_row<FM_FLOAT>& x) {
    return predict(x, m_sum, m_sum_sqr);
}

double fm_model::predict(sparse_row<FM_FLOAT>& x, DVector<double> &sum, DVector<double> &sum_sqr) {
    double result = 0;
    if (k0) {
        result += w0;
    }
    if (k1) {
        for (uint i = 0; i < x.size; i++) {
            assert(x.data[i].id < num_attribute);
            result += w(x.data[i].id) * x.data[i].value;
        }
    }
    for (int f = 0; f < num_factor; f++) {
        sum(f) = 0;
        sum_sqr(f) = 0;
        for (uint i = 0; i < x.size; i++) {
            double d = v(f,x.data[i].id) * x.data[i].value;
            sum(f) += d;
            sum_sqr(f) += d*d;
        }
        result += 0.5 * (sum(f)*sum(f) - sum_sqr(f));
    }
    return result;
}

/*
 * Write the FM model (all the parameters) in a file.
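 * The text format matches bin/fm_model shown earlier in this repo: a
 * "#global bias W0" line followed by w0, a "#unary interactions Wj" section
 * with one weight per attribute, then a "#pairwise interactions Vj,f"
 * section with one row of num_factor values per attribute.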
 */
void fm_model::saveModel(std::string model_file_path){
    std::ofstream out_model;
    out_model.open(model_file_path.c_str());
    if (k0) {
        out_model << "#global bias W0" << std::endl;
        out_model << w0 << std::endl;
    }
    if (k1) {
        out_model << "#unary interactions Wj" << std::endl;
        // NOTE: the interior of saveModel()/loadModel() was garbled in this
        // dump; the following is a reconstruction consistent with the model
        // file format above and with the surviving tail of loadModel().
        for (uint i = 0; i < num_attribute; i++) {
            out_model << w(i) << std::endl;
        }
    }
    out_model << "#pairwise interactions Vj,f" << std::endl;
    for (uint i = 0; i < num_attribute; i++) {
        for (int f = 0; f < num_factor; f++) {
            out_model << v(f,i);
            if (f != num_factor-1) { out_model << ' '; }
        }
        out_model << std::endl;
    }
    out_model.close();
}

/*
 * Read the FM model (all the parameters) from a file.
 * Returns 1 on success, 0 otherwise.
 */
int fm_model::loadModel(std::string model_file_path){
    std::string line;
    std::ifstream model_file(model_file_path.c_str());
    if (model_file.is_open()){
        if (k0) {
            std::getline(model_file, line); // "#global bias W0"
            std::getline(model_file, line);
            w0 = std::atof(line.c_str());
        }
        if (k1) {
            std::getline(model_file, line); // "#unary interactions Wj"
            for (uint i = 0; i < num_attribute; i++) {
                std::getline(model_file, line);
                w(i) = std::atof(line.c_str());
            }
        }
        std::getline(model_file, line); // "#pairwise interactions Vj,f"
        for (uint i = 0; i < num_attribute; i++) {
            std::getline(model_file, line);
            std::vector<std::string> v_str;
            splitString(line, ' ', v_str);
            if ((int)v_str.size() != num_factor){return 0;}
            for (int f = 0; f < num_factor; f++) {
                v(f,i) = std::atof(v_str[f].c_str());
            }
        }
        model_file.close();
    }
    else{ return 0;}
    return 1;
}

/*
 * Splits the string s around matches of the given character c, and stores the substrings in the vector v
 */
void fm_model::splitString(const std::string& s, char c, std::vector<std::string>& v) {
    std::string::size_type i = 0;
    std::string::size_type j = s.find(c);
    while (j != std::string::npos) {
        v.push_back(s.substr(i, j-i));
        i = ++j;
        j = s.find(c, j);
        if (j == std::string::npos)
            v.push_back(s.substr(i, s.length()));
    }
}

#endif /*FM_MODEL_H_*/
--------------------------------------------------------------------------------
/Click_prediction/libfm/libfm/src/fm_core/fm_sgd.h:
--------------------------------------------------------------------------------
// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
// Contact: srendle@libfm.org, http://www.libfm.org/
//
// This file is part of libFM.
//
// libFM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// libFM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with libFM. If not, see <http://www.gnu.org/licenses/>.
//
//
// fm_sgd.h: Generic SGD for elementwise and pairwise losses for Factorization
// Machines
//
// Based on the publication(s):
// - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th
//   IEEE International Conference on Data Mining (ICDM 2010), Sydney,
//   Australia.
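//
// Both updates below are instances of the generic SGD step
//   theta <- theta - learn_rate * (multiplier * d y/d theta + reg * theta),
// where multiplier is the loss derivative w.r.t. the model output; the
// gradients follow the model equation in fm_model.h: 1 for w0, x_j for w_j,
// and sum(f)*x_j - v(f,j)*x_j^2 for the factor weights v(f,j).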
27 | 28 | #ifndef FM_SGD_H_ 29 | #define FM_SGD_H_ 30 | 31 | #include "fm_model.h" 32 | 33 | void fm_SGD(fm_model* fm, const double& learn_rate, sparse_row &x, const double multiplier, DVector &sum) { 34 | if (fm->k0) { 35 | double& w0 = fm->w0; 36 | w0 -= learn_rate * (multiplier + fm->reg0 * w0); 37 | } 38 | if (fm->k1) { 39 | for (uint i = 0; i < x.size; i++) { 40 | double& w = fm->w(x.data[i].id); 41 | w -= learn_rate * (multiplier * x.data[i].value + fm->regw * w); 42 | } 43 | } 44 | for (int f = 0; f < fm->num_factor; f++) { 45 | for (uint i = 0; i < x.size; i++) { 46 | double& v = fm->v(f,x.data[i].id); 47 | double grad = sum(f) * x.data[i].value - v * x.data[i].value * x.data[i].value; 48 | v -= learn_rate * (multiplier * grad + fm->regv * v); 49 | } 50 | } 51 | } 52 | 53 | void fm_pairSGD(fm_model* fm, const double& learn_rate, sparse_row &x_pos, sparse_row &x_neg, const double multiplier, DVector &sum_pos, DVector &sum_neg, DVector &grad_visited, DVector &grad) { 54 | if (fm->k0) { 55 | double& w0 = fm->w0; 56 | w0 -= fm->reg0 * w0; // w0 should always be 0 57 | } 58 | if (fm->k1) { 59 | for (uint i = 0; i < x_pos.size; i++) { 60 | grad(x_pos.data[i].id) = 0; 61 | grad_visited(x_pos.data[i].id) = false; 62 | } 63 | for (uint i = 0; i < x_neg.size; i++) { 64 | grad(x_neg.data[i].id) = 0; 65 | grad_visited(x_neg.data[i].id) = false; 66 | } 67 | for (uint i = 0; i < x_pos.size; i++) { 68 | grad(x_pos.data[i].id) += x_pos.data[i].value; 69 | } 70 | for (uint i = 0; i < x_neg.size; i++) { 71 | grad(x_neg.data[i].id) -= x_neg.data[i].value; 72 | } 73 | for (uint i = 0; i < x_pos.size; i++) { 74 | uint& attr_id = x_pos.data[i].id; 75 | if (! grad_visited(attr_id)) { 76 | double& w = fm->w(attr_id); 77 | w -= learn_rate * (multiplier * grad(attr_id) + fm->regw * w); 78 | grad_visited(attr_id) = true; 79 | } 80 | } 81 | for (uint i = 0; i < x_neg.size; i++) { 82 | uint& attr_id = x_neg.data[i].id; 83 | if (! grad_visited(attr_id)) { 84 | double& w = fm->w(attr_id); 85 | w -= learn_rate * (multiplier * grad(attr_id) + fm->regw * w); 86 | grad_visited(attr_id) = true; 87 | } 88 | } 89 | } 90 | 91 | for (int f = 0; f < fm->num_factor; f++) { 92 | for (uint i = 0; i < x_pos.size; i++) { 93 | grad(x_pos.data[i].id) = 0; 94 | grad_visited(x_pos.data[i].id) = false; 95 | } 96 | for (uint i = 0; i < x_neg.size; i++) { 97 | grad(x_neg.data[i].id) = 0; 98 | grad_visited(x_neg.data[i].id) = false; 99 | } 100 | for (uint i = 0; i < x_pos.size; i++) { 101 | grad(x_pos.data[i].id) += sum_pos(f) * x_pos.data[i].value - fm->v(f, x_pos.data[i].id) * x_pos.data[i].value * x_pos.data[i].value; 102 | } 103 | for (uint i = 0; i < x_neg.size; i++) { 104 | grad(x_neg.data[i].id) -= sum_neg(f) * x_neg.data[i].value - fm->v(f, x_neg.data[i].id) * x_neg.data[i].value * x_neg.data[i].value; 105 | } 106 | for (uint i = 0; i < x_pos.size; i++) { 107 | uint& attr_id = x_pos.data[i].id; 108 | if (! grad_visited(attr_id)) { 109 | double& v = fm->v(f,attr_id); 110 | v -= learn_rate * (multiplier * grad(attr_id) + fm->regv * v); 111 | grad_visited(attr_id) = true; 112 | } 113 | } 114 | for (uint i = 0; i < x_neg.size; i++) { 115 | uint& attr_id = x_neg.data[i].id; 116 | if (! 
grad_visited(attr_id)) { 117 | double& v = fm->v(f,attr_id); 118 | v -= learn_rate * (multiplier * grad(attr_id) + fm->regv * v); 119 | grad_visited(attr_id) = true; 120 | } 121 | } 122 | 123 | 124 | } 125 | 126 | } 127 | #endif /*FM_SGD_H_*/ 128 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/Makefile: -------------------------------------------------------------------------------- 1 | BIN_DIR := ../../bin/ 2 | 3 | OBJECTS := \ 4 | libfm.o \ 5 | tools/transpose.o \ 6 | tools/convert.o \ 7 | 8 | all: libFM transpose convert 9 | 10 | libFM: libfm.o 11 | mkdir -p $(BIN_DIR) 12 | g++ -O3 -Wall libfm.o -o $(BIN_DIR)libFM 13 | 14 | %.o: %.cpp 15 | g++ -O3 -Wall -c $< -o $@ 16 | 17 | clean: clean_lib 18 | mkdir -p $(BIN_DIR) 19 | rm -f $(BIN_DIR)libFM $(BIN_DIR)convert $(BIN_DIR)transpose 20 | 21 | clean_lib: 22 | rm -f $(OBJECTS) 23 | 24 | 25 | transpose: tools/transpose.o 26 | mkdir -p $(BIN_DIR) 27 | g++ -O3 tools/transpose.o -o $(BIN_DIR)transpose 28 | 29 | convert: tools/convert.o 30 | mkdir -p $(BIN_DIR) 31 | g++ -O3 tools/convert.o -o $(BIN_DIR)convert 32 | 33 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/libfm.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libfm/libfm/src/libfm/libfm.o -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/src/fm_learn.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // fm_learn.h: Generic learning method for factorization machines 21 | 22 | #ifndef FM_LEARN_H_ 23 | #define FM_LEARN_H_ 24 | 25 | #include 26 | #include "Data.h" 27 | #include "../../fm_core/fm_model.h" 28 | #include "../../util/rlog.h" 29 | #include "../../util/util.h" 30 | 31 | 32 | class fm_learn { 33 | protected: 34 | DVector sum, sum_sqr; 35 | DMatrix pred_q_term; 36 | 37 | // this function can be overwritten (e.g. 
for MCMC) 38 | virtual double predict_case(Data& data) { 39 | return fm->predict(data.data->getRow()); 40 | } 41 | 42 | public: 43 | DataMetaInfo* meta; 44 | fm_model* fm; 45 | double min_target; 46 | double max_target; 47 | 48 | int task; // 0=regression, 1=classification 49 | 50 | const static int TASK_REGRESSION = 0; 51 | const static int TASK_CLASSIFICATION = 1; 52 | 53 | Data* validation; 54 | 55 | 56 | RLog* log; 57 | 58 | fm_learn() { log = NULL; task = 0; meta = NULL;} 59 | 60 | 61 | virtual void init() { 62 | if (log != NULL) { 63 | if (task == TASK_REGRESSION) { 64 | log->addField("rmse", std::numeric_limits::quiet_NaN()); 65 | log->addField("mae", std::numeric_limits::quiet_NaN()); 66 | } else if (task == TASK_CLASSIFICATION) { 67 | log->addField("accuracy", std::numeric_limits::quiet_NaN()); 68 | } else { 69 | throw "unknown task"; 70 | } 71 | log->addField("time_pred", std::numeric_limits::quiet_NaN()); 72 | log->addField("time_learn", std::numeric_limits::quiet_NaN()); 73 | log->addField("time_learn2", std::numeric_limits::quiet_NaN()); 74 | log->addField("time_learn4", std::numeric_limits::quiet_NaN()); 75 | } 76 | sum.setSize(fm->num_factor); 77 | sum_sqr.setSize(fm->num_factor); 78 | pred_q_term.setSize(fm->num_factor, meta->num_relations + 1); 79 | } 80 | 81 | virtual double evaluate(Data& data) { 82 | assert(data.data != NULL); 83 | if (task == TASK_REGRESSION) { 84 | return evaluate_regression(data); 85 | } else if (task == TASK_CLASSIFICATION) { 86 | return evaluate_classification(data); 87 | } else { 88 | throw "unknown task"; 89 | } 90 | } 91 | 92 | public: 93 | virtual void learn(Data& train, Data& test) { } 94 | 95 | virtual void predict(Data& data, DVector& out) = 0; 96 | // virtual void sgd_logits(Data& data, DVector& out) = 0; 97 | 98 | virtual void debug() { 99 | std::cout << "task=" << task << std::endl; 100 | std::cout << "min_target=" << min_target << std::endl; 101 | std::cout << "max_target=" << max_target << std::endl; 102 | } 103 | 104 | protected: 105 | virtual double evaluate_classification(Data& data) { 106 | int num_correct = 0; 107 | double eval_time = getusertime(); 108 | for (data.data->begin(); !data.data->end(); data.data->next()) { 109 | double p = predict_case(data); 110 | if (((p >= 0) && (data.target(data.data->getRowIndex()) >= 0)) || ((p < 0) && (data.target(data.data->getRowIndex()) < 0))) { 111 | num_correct++; 112 | } 113 | } 114 | eval_time = (getusertime() - eval_time); 115 | // log the values 116 | if (log != NULL) { 117 | log->log("accuracy", (double) num_correct / (double) data.data->getNumRows()); 118 | log->log("time_pred", eval_time); 119 | } 120 | //printf("%lf / %lf = %lf\n", (double) num_correct, (double) data.data->getNumRows(), (double) num_correct / (double) data.data->getNumRows()); 121 | return (double) num_correct / (double) data.data->getNumRows(); 122 | } 123 | virtual double evaluate_regression(Data& data) { 124 | double rmse_sum_sqr = 0; 125 | double mae_sum_abs = 0; 126 | double eval_time = getusertime(); 127 | for (data.data->begin(); !data.data->end(); data.data->next()) { 128 | double p = predict_case(data); 129 | p = std::min(max_target, p); 130 | p = std::max(min_target, p); 131 | double err = p - data.target(data.data->getRowIndex()); 132 | rmse_sum_sqr += err*err; 133 | mae_sum_abs += std::abs((double)err); 134 | } 135 | eval_time = (getusertime() - eval_time); 136 | // log the values 137 | if (log != NULL) { 138 | log->log("rmse", std::sqrt(rmse_sum_sqr/data.data->getNumRows())); 139 | log->log("mae", 
mae_sum_abs/data.data->getNumRows()); 140 | log->log("time_pred", eval_time); 141 | } 142 | 143 | return std::sqrt(rmse_sum_sqr/data.data->getNumRows()); 144 | } 145 | 146 | }; 147 | 148 | #endif /*FM_LEARN_H_*/ 149 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/src/fm_learn_sgd.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // fm_learn_sgd.h: Stochastic Gradient Descent based learning 21 | // 22 | // Based on the publication(s): 23 | // - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th 24 | // IEEE International Conference on Data Mining (ICDM 2010), Sydney, 25 | // Australia. 26 | 27 | #ifndef FM_LEARN_SGD_H_ 28 | #define FM_LEARN_SGD_H_ 29 | 30 | #include "fm_learn.h" 31 | #include "../../fm_core/fm_sgd.h" 32 | 33 | class fm_learn_sgd: public fm_learn { 34 | protected: 35 | //DVector sum, sum_sqr; 36 | public: 37 | int num_iter; 38 | double learn_rate; 39 | DVector learn_rates; 40 | 41 | virtual void init() { 42 | fm_learn::init(); 43 | learn_rates.setSize(3); 44 | // sum.setSize(fm->num_factor); 45 | // sum_sqr.setSize(fm->num_factor); 46 | } 47 | 48 | virtual void learn(Data& train, Data& test) { 49 | fm_learn::learn(train, test); 50 | std::cout << "learnrate=" << learn_rate << std::endl; 51 | std::cout << "learnrates=" << learn_rates(0) << "," << learn_rates(1) << "," << learn_rates(2) << std::endl; 52 | std::cout << "#iterations=" << num_iter << std::endl; 53 | 54 | if (train.relation.dim > 0) { 55 | throw "relations are not supported with SGD"; 56 | } 57 | std::cout.flush(); 58 | } 59 | 60 | void SGD(sparse_row &x, const double multiplier, DVector &sum) { 61 | fm_SGD(fm, learn_rate, x, multiplier, sum); 62 | } 63 | 64 | void debug() { 65 | std::cout << "num_iter=" << num_iter << std::endl; 66 | fm_learn::debug(); 67 | } 68 | 69 | virtual void predict(Data& data, DVector& out) { 70 | assert(data.data->getNumRows() == out.dim); 71 | for (data.data->begin(); !data.data->end(); data.data->next()) { 72 | double p = predict_case(data); 73 | if (task == TASK_REGRESSION ) { 74 | p = std::min(max_target, p); 75 | p = std::max(min_target, p); 76 | } else if (task == TASK_CLASSIFICATION) { 77 | p = 1.0/(1.0 + exp(-p)); 78 | } else { 79 | throw "task not supported"; 80 | } 81 | out(data.data->getRowIndex()) = p; 82 | } 83 | } 84 | 85 | }; 86 | 87 | #endif /*FM_LEARN_SGD_H_*/ 88 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/src/fm_learn_sgd_element.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 
2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // fm_learn_sgd.h: Stochastic Gradient Descent based learning for 21 | // classification and regression 22 | // 23 | // Based on the publication(s): 24 | // - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th 25 | // IEEE International Conference on Data Mining (ICDM 2010), Sydney, 26 | // Australia. 27 | 28 | #ifndef FM_LEARN_SGD_ELEMENT_H_ 29 | #define FM_LEARN_SGD_ELEMENT_H_ 30 | 31 | #include "fm_learn_sgd.h" 32 | 33 | class fm_learn_sgd_element: public fm_learn_sgd { 34 | public: 35 | virtual void init() { 36 | fm_learn_sgd::init(); 37 | 38 | if (log != NULL) { 39 | log->addField("rmse_train", std::numeric_limits::quiet_NaN()); 40 | } 41 | } 42 | virtual void learn(Data& train, Data& test) { 43 | fm_learn_sgd::learn(train, test); 44 | 45 | std::cout << "SGD: DON'T FORGET TO SHUFFLE THE ROWS IN TRAINING DATA TO GET THE BEST RESULTS." << std::endl; 46 | // SGD 47 | for (int i = 0; i < num_iter; i++) { 48 | 49 | double iteration_time = getusertime(); 50 | for (train.data->begin(); !train.data->end(); train.data->next()) { 51 | 52 | double p = fm->predict(train.data->getRow(), sum, sum_sqr); 53 | double mult = 0; 54 | if (task == 0) { 55 | p = std::min(max_target, p); 56 | p = std::max(min_target, p); 57 | mult = -(train.target(train.data->getRowIndex())-p); 58 | } else if (task == 1) { 59 | mult = -train.target(train.data->getRowIndex())*(1.0-1.0/(1.0+exp(-train.target(train.data->getRowIndex())*p))); 60 | } 61 | SGD(train.data->getRow(), mult, sum); 62 | } 63 | iteration_time = (getusertime() - iteration_time); 64 | double rmse_train = evaluate(train); 65 | double rmse_test = evaluate(test); 66 | std::cout << "#Iter=" << std::setw(3) << i << "\tTrain=" << rmse_train << "\tTest=" << rmse_test << std::endl; 67 | if (log != NULL) { 68 | log->log("rmse_train", rmse_train); 69 | log->log("time_learn", iteration_time); 70 | log->newLine(); 71 | } 72 | } 73 | } 74 | 75 | void sgd_logits(Data& data, DVector& out) { 76 | assert(data.data->getNumRows() == out.dim); 77 | for (data.data->begin(); !data.data->end(); data.data->next()) { 78 | double p = predict_case(data); 79 | // std::cout << p << std::endl; 80 | // if (task == TASK_REGRESSION ) { 81 | // p = std::min(max_target, p); 82 | // p = std::max(min_target, p); 83 | // } else if (task == TASK_CLASSIFICATION) { 84 | // p = 1.0/(1.0 + exp(-p)); 85 | // } else { 86 | // throw "task not supported"; 87 | // } 88 | out(data.data->getRowIndex()) = p; 89 | } 90 | } 91 | 92 | }; 93 | 94 | #endif /*FM_LEARN_SGD_ELEMENT_H_*/ 95 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/src/relation.h: 
-------------------------------------------------------------------------------- 1 | // Copyright (C) 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // relation.h: Data and Links for Relations 21 | 22 | #ifndef RELATION_DATA_H_ 23 | #define RELATION_DATA_H_ 24 | 25 | #include 26 | #include "../../util/matrix.h" 27 | #include "../../util/fmatrix.h" 28 | #include "../../fm_core/fm_data.h" 29 | #include "../../fm_core/fm_model.h" 30 | #include "Data.h" 31 | 32 | class RelationData { 33 | protected: 34 | uint cache_size; 35 | bool has_xt; 36 | bool has_x; 37 | public: 38 | RelationData(uint cache_size, bool has_x, bool has_xt) { 39 | this->data_t = NULL; 40 | this->data = NULL; 41 | this->cache_size = cache_size; 42 | this->has_x = has_x; 43 | this->has_xt = has_xt; 44 | this->meta = NULL; 45 | } 46 | DataMetaInfo* meta; 47 | 48 | LargeSparseMatrix* data_t; 49 | LargeSparseMatrix* data; 50 | 51 | int num_feature; 52 | uint num_cases; 53 | uint attr_offset; 54 | 55 | void load(std::string filename); 56 | void debug(); 57 | }; 58 | 59 | 60 | class RelationJoin { 61 | public: 62 | DVector data_row_to_relation_row; 63 | RelationData* data; 64 | 65 | void load(std::string filename, uint expected_row_count) { 66 | bool do_binary = false; 67 | // check if binary or text format should be read 68 | { 69 | std::ifstream in (filename.c_str(), std::ios_base::in | std::ios_base::binary); 70 | if (in.is_open()) { 71 | uint file_version; 72 | uint data_size; 73 | in.read(reinterpret_cast(&file_version), sizeof(file_version)); 74 | in.read(reinterpret_cast(&data_size), sizeof(data_size)); 75 | do_binary = ((file_version == DVECTOR_EXPECTED_FILE_ID) && (data_size == sizeof(uint))); 76 | in.close(); 77 | } 78 | } 79 | if (do_binary) { 80 | //std::cout << "(binary mode) " << std::endl; 81 | data_row_to_relation_row.loadFromBinaryFile(filename); 82 | } else { 83 | //std::cout << "(text mode) " << std::endl; 84 | data_row_to_relation_row.setSize(expected_row_count); 85 | data_row_to_relation_row.load(filename); 86 | } 87 | assert(data_row_to_relation_row.dim == expected_row_count); 88 | } 89 | }; 90 | 91 | void RelationData::load(std::string filename) { 92 | 93 | std::cout << "has x = " << has_x << std::endl; 94 | std::cout << "has xt = " << has_xt << std::endl; 95 | assert(has_x || has_xt); 96 | 97 | //uint num_cases = 0; 98 | uint num_values = 0; 99 | uint this_cs = cache_size; 100 | if (has_xt && has_x) { this_cs /= 2; } 101 | 102 | if (has_x) { 103 | std::cout << "data... 
"; 104 | this->data = new LargeSparseMatrixHD(filename + ".x", this_cs); 105 | this->num_feature = this->data->getNumCols(); 106 | num_values = this->data->getNumValues(); 107 | num_cases = this->data->getNumRows(); 108 | } else { 109 | data = NULL; 110 | } 111 | if (has_xt) { 112 | std::cout << "data transpose... "; 113 | this->data_t = new LargeSparseMatrixHD(filename + ".xt", this_cs); 114 | this->num_feature = this->data_t->getNumRows(); 115 | num_values = this->data_t->getNumValues(); 116 | num_cases = this->data_t->getNumCols(); 117 | } else { 118 | data_t = NULL; 119 | } 120 | 121 | if (has_xt && has_x) { 122 | assert(this->data->getNumCols() == this->data_t->getNumRows()); 123 | assert(this->data->getNumRows() == this->data_t->getNumCols()); 124 | assert(this->data->getNumValues() == this->data_t->getNumValues()); 125 | } 126 | 127 | std::cout << "num_cases=" << this->num_cases << "\tnum_values=" << num_values << "\tnum_features=" << this->num_feature << std::endl; 128 | 129 | meta = new DataMetaInfo(this->num_feature); 130 | 131 | if (fileexists(filename + ".groups")) { 132 | meta->loadGroupsFromFile(filename + ".groups"); 133 | } 134 | } 135 | 136 | 137 | void RelationData::debug() { 138 | if (has_x) { 139 | for (data->begin(); (!data->end()) && (data->getRowIndex() < 4); data->next() ) { 140 | for (uint j = 0; j < data->getRow().size; j++) { 141 | std::cout << " " << data->getRow().data[j].id << ":" << data->getRow().data[j].value; 142 | } 143 | std::cout << std::endl; 144 | } 145 | } 146 | } 147 | 148 | #endif /*RELATION_DATA_H_*/ 149 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/tools/convert.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libfm/libfm/src/libfm/tools/convert.o -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/tools/transpose.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // transpose: Transposes a matrix in binary sparse format. 
21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include "../../util/util.h" 30 | #include "../../util/cmdline.h" 31 | #include "../src/Data.h" 32 | 33 | /** 34 | * 35 | * Version history: 36 | * 1.4.2: 37 | * changed license to GPLv3 38 | * 1.4.0: 39 | * default cache size is 200 MB 40 | * 1.3.6: 41 | * binary mode for file access 42 | * 1.3.4: 43 | * no differences, version numbers are kept in sync over all libfm tools 44 | * 1.3.2: 45 | * no differences, version numbers are kept in sync over all libfm tools 46 | * 1.0: 47 | * first version 48 | */ 49 | 50 | 51 | 52 | using namespace std; 53 | 54 | int main(int argc, char **argv) { 55 | 56 | srand ( time(NULL) ); 57 | try { 58 | CMDLine cmdline(argc, argv); 59 | std::cout << "----------------------------------------------------------------------------" << std::endl; 60 | std::cout << "Transpose" << std::endl; 61 | std::cout << " Version: 1.4.2" << std::endl; 62 | std::cout << " Author: Steffen Rendle, srendle@libfm.org" << std::endl; 63 | std::cout << " WWW: http://www.libfm.org/" << std::endl; 64 | std::cout << "This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt." << std::endl; 65 | std::cout << "This is free software, and you are welcome to redistribute it under certain" << std::endl; 66 | std::cout << "conditions; for details see license.txt." << std::endl; 67 | std::cout << "----------------------------------------------------------------------------" << std::endl; 68 | 69 | const std::string param_ifile = cmdline.registerParameter("ifile", "input file name, file has to be in binary sparse format [MANDATORY]"); 70 | const std::string param_ofile = cmdline.registerParameter("ofile", "output file name [MANDATORY]"); 71 | 72 | const std::string param_cache_size = cmdline.registerParameter("cache_size", "cache size for data storage, default=200000000"); 73 | const std::string param_help = cmdline.registerParameter("help", "this screen"); 74 | 75 | 76 | if (cmdline.hasParameter(param_help) || (argc == 1)) { 77 | cmdline.print_help(); 78 | return 0; 79 | } 80 | cmdline.checkParameters(); 81 | 82 | 83 | // (1) Load the data 84 | long long cache_size = cmdline.getValue(param_cache_size, 200000000); 85 | cache_size /= 2; 86 | LargeSparseMatrixHD d_in(cmdline.getValue(param_ifile), cache_size); 87 | std::cout << "num_rows=" << d_in.getNumRows() << "\tnum_values=" << d_in.getNumValues() << "\tnum_features=" << d_in.getNumCols() << std::endl; 88 | 89 | // (2) transpose the data 90 | // (2.1) count how many entries per col (=transpose-row) there are: 91 | DVector entries_per_col(d_in.getNumCols()); 92 | entries_per_col.init(0); 93 | for (d_in.begin(); !d_in.end(); d_in.next() ) { 94 | sparse_row& row = d_in.getRow(); 95 | for (uint j = 0; j < row.size; j++) { 96 | entries_per_col(row.data[j].id)++; 97 | } 98 | } 99 | // (2.2) build a 100 | std::string ofile = cmdline.getValue(param_ofile); 101 | std::cout << "output to " << ofile << std::endl; std::cout.flush(); 102 | std::ofstream out(ofile.c_str(), ios_base::out | ios_base::binary); 103 | if (out.is_open()) { 104 | file_header fh; 105 | fh.id = FMATRIX_EXPECTED_FILE_ID; 106 | fh.num_values = d_in.getNumValues(); 107 | fh.num_rows = d_in.getNumCols(); 108 | fh.num_cols = d_in.getNumRows(); 109 | fh.float_size = sizeof(DATA_FLOAT); 110 | out.write(reinterpret_cast(&fh), sizeof(fh)); 111 | 112 | DVector< sparse_row > out_row_cache; 113 | DVector< sparse_entry > out_entry_cache; 114 | { 115 | // determine cache 
sizes automatically: 116 | double avg_entries_per_line = (double) d_in.getNumValues() / d_in.getNumCols(); 117 | uint num_rows_in_cache = cache_size / (sizeof(sparse_entry) * avg_entries_per_line + sizeof(uint)); 118 | num_rows_in_cache = std::min(num_rows_in_cache, d_in.getNumCols()); 119 | uint64 num_entries_in_cache = (cache_size - sizeof(uint)*num_rows_in_cache) / sizeof(sparse_entry); 120 | num_entries_in_cache = std::min(num_entries_in_cache, d_in.getNumValues()); 121 | std::cout << "num entries in cache=" << num_entries_in_cache << "\tnum rows in cache=" << num_rows_in_cache << std::endl; 122 | out_entry_cache.setSize(num_entries_in_cache); 123 | out_row_cache.setSize(num_rows_in_cache); 124 | } 125 | 126 | uint out_cache_col_position = 0; // the first column id that is in cache 127 | uint out_cache_col_num = 0; // how many columns are in the cache 128 | 129 | while (out_cache_col_position < d_in.getNumCols()) { 130 | // assign cache sizes 131 | { 132 | uint entry_cache_pos = 0; 133 | // while (there is enough space in the entry cache for the next row) and (there is space for another row) and (there is another row in the data) do 134 | while (((entry_cache_pos + entries_per_col(out_cache_col_position + out_cache_col_num)) < out_entry_cache.dim) && ((out_cache_col_num+1) < out_row_cache.dim) && ((out_cache_col_position+out_cache_col_num) < d_in.getNumCols())) { 135 | out_row_cache(out_cache_col_num).size = 0; 136 | out_row_cache(out_cache_col_num).data = &(out_entry_cache.value[entry_cache_pos]); 137 | entry_cache_pos += entries_per_col(out_cache_col_position + out_cache_col_num); 138 | out_cache_col_num++; 139 | } 140 | } 141 | assert(out_cache_col_num > 0); 142 | // fill the cache 143 | for (d_in.begin(); !d_in.end(); d_in.next() ) { 144 | sparse_row& row = d_in.getRow(); 145 | for (uint j = 0; j < row.size; j++) { 146 | if ((row.data[j].id >= out_cache_col_position) && (row.data[j].id < (out_cache_col_position+out_cache_col_num))) { 147 | uint cache_row_index = row.data[j].id-out_cache_col_position; 148 | out_row_cache(cache_row_index).data[out_row_cache(cache_row_index).size].id = d_in.getRowIndex(); 149 | out_row_cache(cache_row_index).data[out_row_cache(cache_row_index).size].value = row.data[j].value; 150 | out_row_cache(cache_row_index).size++; 151 | } 152 | } 153 | } 154 | 155 | for (uint i = 0; i < out_cache_col_num; i++) { 156 | assert(out_row_cache(i).size == entries_per_col(i + out_cache_col_position)); 157 | out.write(reinterpret_cast(&(out_row_cache(i).size)), sizeof(uint)); 158 | out.write(reinterpret_cast(out_row_cache(i).data), sizeof(sparse_entry)*out_row_cache(i).size); 159 | } 160 | out_cache_col_position += out_cache_col_num; 161 | out_cache_col_num = 0; 162 | } 163 | out.close(); 164 | } else { 165 | throw "could not open " + ofile; 166 | } 167 | 168 | } catch (std::string &e) { 169 | std::cerr << e << std::endl; 170 | } 171 | 172 | } 173 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/libfm/tools/transpose.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/libfm/libfm/src/libfm/tools/transpose.o -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/util/cmdline.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 
2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // cmdline.h: Command line parser 21 | 22 | #ifndef CMDLINE_H_ 23 | #define CMDLINE_H_ 24 | 25 | #include 26 | #include 27 | #include "util.h" 28 | 29 | class CMDLine { 30 | protected: 31 | std::map< std::string, std::string > help; 32 | std::map< std::string, std::string > value; 33 | bool parse_name(std::string& s) { 34 | if ((s.length() > 0) && (s[0] == '-')) { 35 | if ((s.length() > 1) && (s[1] == '-')) { 36 | s = s.substr(2); 37 | } else { 38 | s = s.substr(1); 39 | } 40 | return true; 41 | } else { 42 | return false; 43 | } 44 | } 45 | 46 | public: 47 | std::string delimiter; 48 | 49 | CMDLine(int argc, char **argv) { 50 | delimiter = ";,"; 51 | int i = 1; 52 | while (i < argc) { 53 | std::string s(argv[i]); 54 | if (parse_name(s)) { 55 | if (value.find(s) != value.end()) { 56 | throw "the parameter " + s + " is already specified"; 57 | } 58 | if ((i+1) < argc) { 59 | std::string s_next(argv[i+1]); 60 | if (! parse_name(s_next)) { 61 | value[s] = s_next; 62 | i++; 63 | } else { 64 | value[s] = ""; 65 | } 66 | } else { 67 | value[s] = ""; 68 | } 69 | } else { 70 | throw "cannot parse " + s; 71 | } 72 | i++; 73 | } 74 | } 75 | 76 | void setValue(std::string parameter, std::string value) { 77 | this->value[parameter] = value; 78 | } 79 | 80 | bool hasParameter(std::string parameter) { 81 | return (value.find(parameter) != value.end()); 82 | } 83 | 84 | void removeParameter(const std::string& parameter) { 85 | if (hasParameter(parameter)) { 86 | value.erase(parameter); 87 | } 88 | } 89 | 90 | void print_help() { 91 | for (std::map< std::string, std::string >::const_iterator pv = help.begin(); pv != help.end(); ++pv) { 92 | std::cout << "-" << pv->first; 93 | for (int i=pv->first.size()+1; i < 16; i++) { std::cout << " "; } 94 | std::string s_out = pv->second; 95 | while (s_out.size() > 0) { 96 | if (s_out.size() > (72-16)) { 97 | size_t p = s_out.substr(0, 72-16).find_last_of(" \t"); 98 | if (p == 0) { 99 | p = 72-16; 100 | } 101 | std::cout << s_out.substr(0, p) << std::endl; 102 | s_out = s_out.substr(p+1, s_out.length()-p); 103 | } else { 104 | std::cout << s_out << std::endl; 105 | s_out = ""; 106 | } 107 | if (s_out.size() > 0) { 108 | for (int i=0; i < 16; i++) { std::cout << " "; } 109 | } 110 | } 111 | } 112 | } 113 | const std::string& registerParameter(const std::string& parameter, const std::string& help) { 114 | this->help[parameter] = help; 115 | return parameter; 116 | } 117 | 118 | void checkParameters() { 119 | // make sure there is no parameter specified on the cmdline that is not registered: 120 | for (std::map< std::string, std::string >::const_iterator pv = value.begin(); pv != value.end(); ++pv) { 121 | if (help.find(pv->first) == help.end()) { 122 | throw "the parameter " + 
pv->first + " does not exist"; 123 | } 124 | } 125 | } 126 | 127 | const std::string& getValue(const std::string& parameter) { 128 | return value[parameter]; 129 | } 130 | 131 | const std::string& getValue(const std::string& parameter, const std::string& default_value) { 132 | if (hasParameter(parameter)) { 133 | return value[parameter]; 134 | } else { 135 | return default_value; 136 | } 137 | } 138 | 139 | const double getValue(const std::string& parameter, const double& default_value) { 140 | if (hasParameter(parameter)) { 141 | return atof(value[parameter].c_str()); 142 | } else { 143 | return default_value; 144 | } 145 | } 146 | 147 | const long int getValue(const std::string& parameter, const long int& default_value) { 148 | if (hasParameter(parameter)) { 149 | return atoi(value[parameter].c_str()); 150 | } else { 151 | return default_value; 152 | } 153 | } 154 | 155 | const int getValue(const std::string& parameter, const int& default_value) { 156 | if (hasParameter(parameter)) { 157 | return atoi(value[parameter].c_str()); 158 | } else { 159 | return default_value; 160 | } 161 | } 162 | 163 | const uint getValue(const std::string& parameter, const uint& default_value) { 164 | if (hasParameter(parameter)) { 165 | return atoi(value[parameter].c_str()); 166 | } else { 167 | return default_value; 168 | } 169 | } 170 | 171 | std::vector getStrValues(const std::string& parameter) { 172 | std::vector result = tokenize(value[parameter], delimiter); 173 | return result; 174 | } 175 | std::vector getIntValues(const std::string& parameter) { 176 | std::vector result; 177 | std::vector result_str = getStrValues(parameter); 178 | result.resize(result_str.size()); 179 | for (uint i = 0; i < result.size(); i++) { 180 | result[i] = atoi(result_str[i].c_str()); 181 | } 182 | return result; 183 | } 184 | std::vector getDblValues(const std::string& parameter) { 185 | std::vector result; 186 | std::vector result_str = getStrValues(parameter); 187 | result.resize(result_str.size()); 188 | for (uint i = 0; i < result.size(); i++) { 189 | result[i] = atof(result_str[i].c_str()); 190 | } 191 | return result; 192 | } 193 | std::vector getUIntValues(const std::string& parameter) { 194 | std::vector result; 195 | std::vector result_str = getStrValues(parameter); 196 | result.resize(result_str.size()); 197 | for (uint i = 0; i < result.size(); i++) { 198 | result[i] = atoi(result_str[i].c_str()); 199 | } 200 | return result; 201 | } 202 | }; 203 | 204 | 205 | #endif /*CMDLINE_H_*/ 206 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/util/memory.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 
18 | // 19 | // 20 | // memory.h: Logging memory consumption of large data structures 21 | 22 | #ifndef MEMORY_H_ 23 | #define MEMORY_H_ 24 | 25 | #include 26 | #include 27 | 28 | typedef unsigned long long int uint64; 29 | typedef signed long long int int64; 30 | 31 | class MemoryLog { 32 | private: 33 | uint64 mem_size; 34 | 35 | public: 36 | static MemoryLog& getInstance() { 37 | static MemoryLog instance; 38 | return instance; 39 | } 40 | 41 | MemoryLog() { 42 | mem_size = 0; 43 | } 44 | 45 | void logNew(std::string message, uint64 size, uint64 count = 1) { 46 | mem_size += size*count; 47 | // std::cout << "total memory consumption=" << mem_size << " bytes" << "\t" << "reserving " << count << "*" << size << " for " << message << std::endl; 48 | } 49 | void logFree(std::string message, uint64 size, uint64 count = 1) { 50 | mem_size -= size*count; 51 | // std::cout << "total memory consumption=" << mem_size << " bytes" << std::endl; 52 | } 53 | 54 | }; 55 | 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/util/random.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 
18 | // 19 | // 20 | // random.h: Sampling methods 21 | 22 | #ifndef RANDOM_H_ 23 | #define RANDOM_H_ 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | 30 | double ran_gaussian(); 31 | double ran_gaussian(double mean, double stdev); 32 | double ran_left_tgaussian(double left); 33 | double ran_left_tgaussian(double left, double mean, double stdev); 34 | double ran_left_tgaussian_naive(double left); 35 | double ran_uniform(); 36 | double ran_exp(); 37 | double ran_gamma(double alpha, double beta); 38 | double ran_gamma(double alpha); 39 | bool ran_bernoulli(double p); 40 | 41 | double erf(double x); 42 | double cdf_gaussian(double x, double mean, double stdev); 43 | double cdf_gaussian(double x); 44 | 45 | 46 | 47 | double erf(double x) { 48 | double t; 49 | if (x >= 0) { 50 | t = 1.0 / (1.0 + 0.3275911 * x); 51 | } else { 52 | t = 1.0 / (1.0 - 0.3275911 * x); 53 | } 54 | 55 | double result = 1.0 - (t * (0.254829592 + t * (-0.284496736 + t * (1.421413741 + t * (-1.453152027 + t * 1.061405429)))))*exp(-x*x); 56 | if (x >= 0) { 57 | return result; 58 | } else { 59 | return -result; 60 | } 61 | } 62 | 63 | double cdf_gaussian(double x, double mean, double stdev) { 64 | return 0.5 + 0.5 * erf(0.707106781 * (x-mean) / stdev); 65 | } 66 | 67 | double cdf_gaussian(double x) { 68 | return 0.5 + 0.5 * erf(0.707106781 * x ); 69 | } 70 | 71 | 72 | double ran_left_tgaussian(double left) { 73 | // draw a trunctated normal: acceptance region are values larger than 74 | if (left <= 0.0) { // acceptance probability > 0.5 75 | return ran_left_tgaussian_naive(left); 76 | } else { 77 | // Robert: Simulation of truncated normal variables 78 | double alpha_star = 0.5*(left + sqrt(left*left + 4.0)); 79 | 80 | // draw from translated exponential distr: 81 | // f(alpha,left) = alpha * exp(-alpha*(z-left)) * I(z>=left) 82 | double z,d,u; 83 | do { 84 | z = ran_exp() / alpha_star + left; 85 | d = z-alpha_star; 86 | d = exp(-(d*d)/2); 87 | u = ran_uniform(); 88 | if (u < d) { 89 | return z; 90 | } 91 | } while (true); 92 | } 93 | } 94 | 95 | double ran_left_tgaussian_naive(double left) { 96 | // draw a trunctated normal: acceptance region are values larger than 97 | double result; 98 | do { 99 | result = ran_gaussian(); 100 | } while (result < left); 101 | return result; 102 | } 103 | 104 | double ran_left_tgaussian(double left, double mean, double stdev) { 105 | return mean + stdev * ran_left_tgaussian((left-mean)/stdev); 106 | } 107 | 108 | double ran_right_tgaussian(double right) { 109 | return -ran_left_tgaussian(-right); 110 | } 111 | 112 | double ran_right_tgaussian(double right, double mean, double stdev) { 113 | return mean + stdev * ran_right_tgaussian((right-mean)/stdev); 114 | } 115 | 116 | 117 | 118 | double ran_gamma(double alpha) { 119 | assert(alpha > 0); 120 | if (alpha < 1.0) { 121 | double u; 122 | do { 123 | u = ran_uniform(); 124 | } while (u == 0.0); 125 | return ran_gamma(alpha + 1.0) * pow(u, 1.0 / alpha); 126 | } else { 127 | // Marsaglia and Tsang: A Simple Method for Generating Gamma Variables 128 | double d,c,x,v,u; 129 | d = alpha - 1.0/3.0; 130 | c = 1.0 / std::sqrt(9.0 * d); 131 | do { 132 | do { 133 | x = ran_gaussian(); 134 | v = 1.0 + c*x; 135 | } while (v <= 0.0); 136 | v = v * v * v; 137 | u = ran_uniform(); 138 | } while ( 139 | (u >= (1.0 - 0.0331 * (x*x) * (x*x))) 140 | && (log(u) >= (0.5 * x * x + d * (1.0 - v + std::log(v)))) 141 | ); 142 | return d*v; 143 | } 144 | } 145 | 146 | double ran_gamma(double alpha, double beta) { 147 | return ran_gamma(alpha) / beta; 148 | } 
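// Editor's sketch (not part of the original libFM source): minimal usage of the
// samplers in this header. ran_gamma() above implements Marsaglia & Tsang's
// rejection method; ran_gaussian() below is Leva's ratio-of-uniforms method.
// All of them draw base randomness from rand() via ran_uniform(), so seeding is
// assumed to happen in the host program, e.g.:
//
//   srand(1234);                        // seed the C RNG behind ran_uniform()
//   double g  = ran_gaussian(0.0, 1.0); // one N(0,1) draw
//   double ga = ran_gamma(4.5, 2.0);    // Gamma(alpha=4.5, rate beta=2.0) draw
//   bool   ok = ran_bernoulli(0.3);     // true with probability 0.3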
149 | 150 | double ran_gaussian() { 151 | // Joseph L. Leva: A fast normal Random number generator 152 | double u,v, x, y, Q; 153 | do { 154 | do { 155 | u = ran_uniform(); 156 | } while (u == 0.0); 157 | v = 1.7156 * (ran_uniform() - 0.5); 158 | x = u - 0.449871; 159 | y = std::abs(v) + 0.386595; 160 | Q = x*x + y*(0.19600*y-0.25472*x); 161 | if (Q < 0.27597) { break; } 162 | } while ((Q > 0.27846) || ((v*v) > (-4.0*u*u*std::log(u)))); 163 | return v / u; 164 | } 165 | 166 | double ran_gaussian(double mean, double stdev) { 167 | if ((stdev == 0.0) || (std::isnan(stdev))) { 168 | return mean; 169 | } else { 170 | return mean + stdev*ran_gaussian(); 171 | } 172 | } 173 | 174 | double ran_uniform() { 175 | return rand()/((double)RAND_MAX + 1); 176 | } 177 | 178 | double ran_exp() { 179 | return -std::log(1-ran_uniform()); 180 | } 181 | 182 | bool ran_bernoulli(double p) { 183 | return (ran_uniform() < p); 184 | } 185 | 186 | #endif /*RANDOM_H_*/ 187 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/util/rlog.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 
18 | // 19 | // 20 | // rlog.h: Logging into R compatible files 21 | 22 | #ifndef RLOG_H_ 23 | #define RLOG_H_ 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | class RLog { 30 | private: 31 | std::ostream* out; 32 | std::vector header; 33 | std::map default_value; 34 | std::map value; 35 | public: 36 | RLog(std::ostream* stream) { 37 | out = stream; 38 | header.clear(); 39 | default_value.clear(); 40 | value.clear(); 41 | }; 42 | 43 | void log(const std::string& field, double d) { 44 | value[field] = d; 45 | } 46 | 47 | void init() { 48 | if (out != NULL) { 49 | for (uint i = 0; i < header.size(); i++) { 50 | *out << header[i]; 51 | if (i < (header.size()-1)) { 52 | *out << "\t"; 53 | } else { 54 | *out << "\n"; 55 | } 56 | } 57 | out->flush(); 58 | } 59 | for (uint i = 0; i < header.size(); i++) { 60 | value[header[i]] = default_value[header[i]]; 61 | } 62 | } 63 | 64 | void addField(const std::string& field_name, double def) { 65 | //std::cout << field_name << std::endl; std::cout.flush(); 66 | std::vector::iterator i = std::find(header.begin(), header.end(), field_name); 67 | if (i != header.end()) { 68 | throw "the field " + field_name + " already exists"; 69 | } 70 | header.push_back(field_name); 71 | default_value[field_name] = def; 72 | } 73 | 74 | void newLine() { 75 | if (out != NULL) { 76 | for (uint i = 0; i < header.size(); i++) { 77 | *out << value[header[i]]; 78 | if (i < (header.size()-1)) { 79 | *out << "\t"; 80 | } else { 81 | *out << "\n"; 82 | } 83 | } 84 | out->flush(); 85 | value.clear(); 86 | for (uint i = 0; i < header.size(); i++) { 87 | value[header[i]] = default_value[header[i]]; 88 | } 89 | } 90 | } 91 | }; 92 | 93 | 94 | #endif /*RLOG_H_*/ 95 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/util/smatrix.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 
18 | // 19 | // 20 | // smatrix.h: Sparse Matrices and Tensors 21 | 22 | #ifndef SMATRIX_H_ 23 | #define SMATRIX_H_ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | 33 | template class SparseVector : public std::map { 34 | public: 35 | T get(int x) { 36 | typename SparseVector::iterator iter = this->find(x); 37 | if (iter != this->end()) { 38 | return iter->second; 39 | } else { 40 | return 0; 41 | } 42 | } 43 | void toStream(std::ostream &stream); 44 | }; 45 | 46 | template class SparseMatrix : public std::map > { 47 | public: 48 | T get(int x, int y) { 49 | typename SparseMatrix::iterator iter = this->find(x); 50 | if (iter != this->end()) { 51 | return iter->second.get(y); 52 | } else { 53 | return 0; 54 | } 55 | } 56 | void toStream(std::ostream &stream); 57 | void fromFile(const std::string &filename); 58 | }; 59 | template class SparseTensor : public std::map > { 60 | public: 61 | T get(int x, int y, int z) { 62 | typename SparseTensor::iterator iter = this->find(x); 63 | if (iter != this->end()) { 64 | return iter->second.get(y, z); 65 | } else { 66 | return 0; 67 | } 68 | } 69 | void toStream(std::ostream &stream); 70 | void toFile(const std::string &filename); 71 | void fromFile(const std::string &filename); 72 | }; 73 | 74 | class SparseVectorInt : public SparseVector {}; 75 | class SparseMatrixInt : public SparseMatrix {}; 76 | class SparseTensorInt : public SparseTensor {}; 77 | class SparseVectorDouble : public SparseVector {}; 78 | class SparseMatrixDouble : public SparseMatrix {}; 79 | class SparseTensorDouble : public SparseTensor {}; 80 | 81 | class SparseVectorBoolean : public std::set { 82 | public: 83 | bool get(int x) { 84 | SparseVectorBoolean::iterator iter = this->find(x); 85 | if (iter != this->end()) { 86 | return true; 87 | } else { 88 | return false; 89 | } 90 | } 91 | }; 92 | 93 | class SparseMatrixBoolean : public std::map { 94 | public: 95 | bool get(int x, int y) { 96 | SparseMatrixBoolean::iterator iter = this->find(x); 97 | if (iter != this->end()) { 98 | return iter->second.get(y); 99 | } else { 100 | return 0; 101 | } 102 | } 103 | void fromFile(const std::string &filename); 104 | }; 105 | 106 | class SparseTensorBoolean : public std::map { 107 | public: 108 | bool get(int x, int y, int z) { 109 | SparseTensorBoolean::iterator iter = this->find(x); 110 | if (iter != this->end()) { 111 | return iter->second.get(y, z); 112 | } else { 113 | return 0; 114 | } 115 | } 116 | void toStream(std::ostream &stream); 117 | void toFile(const std::string &filename); 118 | void fromFile(const std::string &filename); 119 | }; 120 | 121 | 122 | template void SparseVector::toStream(std::ostream &stream) { 123 | for(typename SparseVector::const_iter it_cell = this->begin(); it_cell != this->end(); ++it_cell) { 124 | stream << it_cell->first << " " << it_cell->second << std::endl; 125 | } 126 | } 127 | 128 | template void SparseMatrix::toStream(std::ostream &stream) { 129 | for(typename SparseMatrix::const_iter i = this->begin(); i != this->end(); ++i) { 130 | for(typename SparseVector::const_iter j = i->second->begin(); j != i->second->end(); ++j) { 131 | stream << i->first << " " << j->first << " " << j->second << std::endl; 132 | } 133 | } 134 | } 135 | 136 | template void SparseTensor::toStream(std::ostream &stream) { 137 | for(typename SparseTensor::const_iterator t = this->begin(); t != this->end(); ++t) { 138 | for(typename SparseMatrix::const_iterator i = t->second.begin(); i != t->second.end(); ++i) { 139 | for(typename 
SparseVector::const_iterator j = i->second.begin(); j != i->second.end(); ++j) { 140 | stream << t->first << " " << i->first << " " << j->first << " " << j->second << std::endl; 141 | } 142 | } 143 | } 144 | } 145 | 146 | template void SparseTensor::toFile(const std::string &filename) { 147 | std::ofstream out_file (filename.c_str()); 148 | if (out_file.is_open()) { 149 | toStream(out_file); 150 | out_file.close(); 151 | } else { 152 | throw "Unable to open file " + filename; 153 | } 154 | 155 | } 156 | 157 | template void SparseTensor::fromFile(const std::string &filename) { 158 | std::ifstream fData (filename.c_str()); 159 | if (! fData.is_open()) { 160 | throw "Unable to open file " + filename; 161 | } 162 | while (! fData.eof()) { 163 | int t, m, v; 164 | fData >> t; 165 | fData >> m; 166 | fData >> v; 167 | if (! fData.eof()) { 168 | T value; 169 | fData >> value; 170 | (*this)[t][m][v] = value; 171 | } 172 | } 173 | fData.close(); 174 | } 175 | 176 | template void SparseMatrix::fromFile(const std::string &filename) { 177 | std::ifstream fData (filename.c_str()); 178 | if (! fData.is_open()) { 179 | throw "Unable to open file " + filename; 180 | } 181 | while (! fData.eof()) { 182 | int t, m; 183 | fData >> t; 184 | fData >> m; 185 | if (! fData.eof()) { 186 | T value; 187 | fData >> value; 188 | (*this)[t][m] = value; 189 | } 190 | } 191 | fData.close(); 192 | } 193 | 194 | void SparseTensorBoolean::toStream(std::ostream &stream) { 195 | for(SparseTensorBoolean::const_iterator t = this->begin(); t != this->end(); ++t) { 196 | for(SparseMatrixBoolean::const_iterator i = t->second.begin(); i != t->second.end(); ++i) { 197 | for(SparseVectorBoolean::const_iterator j = i->second.begin(); j != i->second.end(); ++j) { 198 | stream << t->first << " " << i->first << " " << (*j) << std::endl; 199 | } 200 | } 201 | } 202 | } 203 | 204 | void SparseTensorBoolean::toFile(const std::string &filename) { 205 | std::ofstream out_file (filename.c_str()); 206 | if (out_file.is_open()) { 207 | toStream(out_file); 208 | out_file.close(); 209 | } else { 210 | throw "Unable to open file " + filename; 211 | } 212 | 213 | } 214 | 215 | void SparseTensorBoolean::fromFile(const std::string &filename) { 216 | std::ifstream fData (filename.c_str()); 217 | if (! fData.is_open()) { 218 | throw "Unable to open file " + filename; 219 | } 220 | while (! fData.eof()) { 221 | int t, m, v; 222 | fData >> t; 223 | fData >> m; 224 | if (! fData.eof()) { 225 | fData >> v; 226 | (*this)[t][m].insert(v); 227 | } 228 | } 229 | fData.close(); 230 | } 231 | 232 | 233 | void SparseMatrixBoolean::fromFile(const std::string &filename) { 234 | std::ifstream fData (filename.c_str()); 235 | if (! fData.is_open()) { 236 | throw "Unable to open file " + filename; 237 | } 238 | while (! fData.eof()) { 239 | int m, v; 240 | fData >> m; 241 | if (! fData.eof()) { 242 | fData >> v; 243 | (*this)[m].insert(v); 244 | } 245 | } 246 | fData.close(); 247 | } 248 | 249 | #endif /*SMATRIX_H_*/ 250 | -------------------------------------------------------------------------------- /Click_prediction/libfm/libfm/src/util/util.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle 2 | // Contact: srendle@libfm.org, http://www.libfm.org/ 3 | // 4 | // This file is part of libFM. 
5 | // 6 | // libFM is free software: you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation, either version 3 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // libFM is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License 17 | // along with libFM. If not, see . 18 | // 19 | // 20 | // util.h: Utility functions 21 | 22 | #ifndef UTIL_H_ 23 | #define UTIL_H_ 24 | 25 | #include 26 | #include 27 | 28 | #ifdef _WIN32 29 | #include 30 | #else 31 | #include 32 | #endif 33 | 34 | #include 35 | #include 36 | 37 | typedef unsigned int uint; 38 | 39 | #ifdef _WIN32 40 | namespace std { 41 | bool isnan(double d) { return _isnan(d); } 42 | bool isnan(float f) { return _isnan(f); } 43 | bool isinf(double d) { return (! _finite(d)) && (! isnan(d)); } 44 | bool isinf(float f) { return (! _finite(f)) && (! isnan(f)); } 45 | } 46 | #endif 47 | 48 | #include 49 | 50 | double sqr(double d) { return d*d; } 51 | 52 | double sigmoid(double d) { return (double)1.0/(1.0+exp(-d)); } 53 | 54 | std::vector tokenize(const std::string& str, const std::string& delimiter) { 55 | std::vector result; 56 | std::string::size_type lastPos = str.find_first_not_of(delimiter, 0); 57 | 58 | std::string::size_type pos = str.find_first_of(delimiter, lastPos); 59 | while (std::string::npos != pos || std::string::npos != lastPos) { 60 | result.push_back(str.substr(lastPos, pos - lastPos)); 61 | lastPos = str.find_first_not_of(delimiter, pos); 62 | pos = str.find_first_of(delimiter, lastPos); 63 | } 64 | return result; 65 | } 66 | 67 | double getusertime2() { 68 | return (double) clock_t() / CLOCKS_PER_SEC; 69 | } 70 | 71 | double getusertime() { 72 | #ifdef _WIN32 73 | return getusertime2(); 74 | #else 75 | struct rusage ru; 76 | getrusage(RUSAGE_SELF, &ru); 77 | 78 | struct timeval tim = ru.ru_utime; 79 | return (double)tim.tv_sec + (double)tim.tv_usec / 1000000.0; 80 | #endif 81 | } 82 | 83 | 84 | double getusertime3() { 85 | return (double) clock() / CLOCKS_PER_SEC; 86 | } 87 | 88 | double getusertime4() { 89 | return (double) time(NULL); 90 | } 91 | 92 | bool fileexists(std::string filename) { 93 | std::ifstream in_file (filename.c_str()); 94 | return in_file.is_open(); 95 | } 96 | 97 | 98 | #endif /*UTIL_H_*/ 99 | -------------------------------------------------------------------------------- /Click_prediction/output/criteo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/criteo.jpg -------------------------------------------------------------------------------- /Click_prediction/output/facebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/facebook.png -------------------------------------------------------------------------------- /Click_prediction/output/ffm_formula.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/ffm_formula.png -------------------------------------------------------------------------------- /Click_prediction/output/fm_format.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/fm_format.png -------------------------------------------------------------------------------- /Click_prediction/output/fm_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/fm_formula.png -------------------------------------------------------------------------------- /Click_prediction/output/fm_formula2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/fm_formula2.png -------------------------------------------------------------------------------- /Click_prediction/output/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/loss.png -------------------------------------------------------------------------------- /Click_prediction/output/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/model.png -------------------------------------------------------------------------------- /Click_prediction/output/tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/tensorboard.png -------------------------------------------------------------------------------- /Click_prediction/output/train_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Click_prediction/output/train_info.png -------------------------------------------------------------------------------- /Coupon_Usage_Predict/readme.md: -------------------------------------------------------------------------------- 1 | ## 赛题回顾 2 | [本赛题](https://tianchi.shuju.aliyun.com/competition/introduction.htm?spm=5176.100068.5678.1.9Igo9O&raceId=231587)提供用户在2016年1月1日至2016年6月30日之间真实线上线下消费行为,预测用户在2016年7月领取优惠券后15天以内是否核销。评测指标采用AUC,先对每个优惠券单独计算核销预测的AUC值,再对所有优惠券的AUC值求平均作为最终的评价标准。 3 | 4 | 第一赛季数据:[下载链接](http://pan.baidu.com/s/1nvFG2ff) 5 | 6 | ## 解决方案概述 7 | 本赛题提供了用户线下消费和优惠券领取核销行为的纪录表,用户线上点击/消费和优惠券领取核销行为的纪录表,记录的时间区间是2016.01.01至2016.06.30,需要预测的是2016年7月份用户领取优惠劵后是否核销。根据这两份数据表,我们首先对数据集进行划分,然后提取了用户相关的特征、商家相关的特征,优惠劵相关的特征,用户与商家之间的交互特征,以及利用本赛题的leakage得到的其它特征(这部分特征在实际业务中是不可能获取到的)。最后训练了XGBoost,GBDT,RandomForest进行模型融合。 8 | 9 | ## 数据集划分 10 | 可以采用滑窗的方法得到多份训练数据集,特征区间越小,得到的训练数据集越多。以下是一种划分方式: 11 | ![split_dataset.png](http://upload-images.jianshu.io/upload_images/99097-d6e3f38a9b9ec379.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 12 | 13 | 
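下面给出滑窗划分的最小示意代码(编者补充,非原仓库代码;`Date_received` 等列名、文件名与具体日期均为假设,实际以赛题 offline 数据表和上图为准):

```python
import pandas as pd

def make_dataset(log, feat_start, feat_end, label_start, label_end):
    # 特征区间:用于统计用户/商家/优惠券等特征
    feat_log = log[(log.Date_received >= feat_start) & (log.Date_received <= feat_end)]
    # 标签区间:用于构造"领券后15天内是否核销"的label
    label_log = log[(log.Date_received >= label_start) & (log.Date_received <= label_end)]
    return feat_log, label_log

off_log = pd.read_csv('ccf_offline_stage1_train.csv')   # 文件名为示意
# 滑动特征区间即可得到多份训练集;特征区间越小,可划出的训练集越多
windows = [(20160101, 20160413, 20160414, 20160514),
           (20160201, 20160513, 20160514, 20160614)]
datasets = [make_dataset(off_log, *w) for w in windows]
```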
划取多份训练集,一方面可以增加训练样本,另一方面可以做交叉验证实验,方便调参。 14 | 15 | 16 | ## 特征工程 17 | 赛题提供了online和offline两份数据集,online数据集可以提取到与用户相关的特征,offline数据集可以提取到更加丰富的特征:用户相关的特征,商家相关的特征,优惠劵相关的特征,用户-商家交互特征。 18 | 19 | 另外需要指出的是,赛题提供的预测集中,包含了同一个用户在整个7月份里的优惠券领取情况,这实际上是一种leakage,比如存在这种情况:某一个用户在7月10日领取了某优惠券,然后在7月12日和7月15日又领取了相同的优惠券,那么7月10日领取的优惠券被核销的可能性就很大了。我们在做特征工程时也注意到了这一点,提取了一些相关的特征。加入这部分特征后,AUC提升了10个百分点,相信大多数队伍都利用了这一leakage,但这些特征在实际业务中是无法获取到的。 20 | 21 | 以下简要地说明各部分特征: 22 | 23 | - **用户线下相关的特征** 24 | - 用户领取优惠券次数 25 | - 用户获得优惠券但没有消费的次数 26 | - 用户获得优惠券并核销次数 27 | - 用户领取优惠券后进行核销率 28 | - 用户满0~50/50~200/200~500 减的优惠券核销率 29 | - 用户核销满0~50/50~200/200~500减的优惠券占所有核销优惠券的比重 30 | - 用户核销优惠券的平均/最低/最高消费折率 31 | - 用户核销过优惠券的不同商家数量,及其占所有不同商家的比重 32 | - 用户核销过的不同优惠券数量,及其占所有不同优惠券的比重 33 | - 用户平均核销每个商家多少张优惠券 34 | - 用户核销优惠券中的平均/最大/最小用户-商家距离 35 | 36 | - **用户线上相关的特征** 37 | - 用户线上操作次数 38 | - 用户线上点击率 39 | - 用户线上购买率 40 | - 用户线上领取率 41 | - 用户线上不消费次数 42 | - 用户线上优惠券核销次数 43 | - 用户线上优惠券核销率 44 | - 用户线下不消费次数占线上线下总的不消费次数的比重 45 | - 用户线下的优惠券核销次数占线上线下总的优惠券核销次数的比重 46 | - 用户线下领取的记录数量占总的记录数量的比重 47 | 48 | - **商家相关的特征** 49 | - 商家优惠券被领取次数 50 | - 商家优惠券被领取后不核销次数 51 | - 商家优惠券被领取后核销次数 52 | - 商家优惠券被领取后核销率 53 | - 商家优惠券核销的平均/最小/最大消费折率 54 | - 核销商家优惠券的不同用户数量,及其占领取不同的用户比重 55 | - 商家优惠券平均每个用户核销多少张 56 | - 商家被核销过的不同优惠券数量 57 | - 商家被核销过的不同优惠券数量占所有领取过的不同优惠券数量的比重 58 | - 商家平均每种优惠券核销多少张 59 | - 商家被核销优惠券的平均时间率 60 | - 商家被核销优惠券中的平均/最小/最大用户-商家距离 61 | 62 | - **用户-商家交互特征** 63 | - 用户领取商家的优惠券次数 64 | - 用户领取商家的优惠券后不核销次数 65 | - 用户领取商家的优惠券后核销次数 66 | - 用户领取商家的优惠券后核销率 67 | - 用户对每个商家的不核销次数占用户总的不核销次数的比重 68 | - 用户对每个商家的优惠券核销次数占用户总的核销次数的比重 69 | - 用户对每个商家的不核销次数占商家总的不核销次数的比重 70 | - 用户对每个商家的优惠券核销次数占商家总的核销次数的比重 71 | 72 | - **优惠券相关的特征** 73 | - 优惠券类型(直接优惠为0, 满减为1) 74 | - 优惠券折率 75 | - 满减优惠券的最低消费 76 | - 历史出现次数 77 | - 历史核销次数 78 | - 历史核销率 79 | - 历史核销时间率 80 | - 领取优惠券是一周的第几天 81 | - 领取优惠券是一月的第几天 82 | - 历史上用户领取该优惠券次数 83 | - 历史上用户消费该优惠券次数 84 | - 历史上用户对该优惠券的核销率 85 | 86 | - **其它特征** 87 | 88 | 这部分特征利用了赛题leakage,都是在预测区间提取的。 89 | - 用户领取的所有优惠券数目 90 | - 用户领取的特定优惠券数目 91 | - 用户此次之后/前领取的所有优惠券数目 92 | - 用户此次之后/前领取的特定优惠券数目 93 | - 用户上/下一次领取的时间间隔 94 | - 用户领取特定商家的优惠券数目 95 | - 用户领取的不同商家数目 96 | - 用户当天领取的优惠券数目 97 | - 用户当天领取的特定优惠券数目 98 | - 用户领取的所有优惠券种类数目 99 | - 商家被领取的优惠券数目 100 | - 商家被领取的特定优惠券数目 101 | - 商家被多少不同用户领取的数目 102 | - 商家发行的所有优惠券种类数目 103 | 104 | 105 | 106 | ## 模型设计与模型融合 107 | 108 | 基于以上提取到的特征,进行模型设计与融合。 109 | 110 | - 单模型 111 | 112 | 第一赛季只训练了XGBoost单模型提交,连续几周位居排行榜第一位。 113 | 114 | 第二赛季训练了XGBoost,GBDT,RandomForest三种单模型,其中GBDT表现最好,XGBoost次之,RandomForest相比之下最差。GBDT和XGBoost单模型在第二赛季仍然名列Top3,融合后效果更佳,尝试了以下两种方法: 115 | 116 | - 加权融合 117 | 118 | 得到了单模型的预测结果后,直接将概率预测值进行加权融合,我们简单地用`0.65 * GBDT + 0.35 * XGBoost`就得到了第一的成绩。 119 | 120 | - Blending模型 121 | 122 | 我们尝试了两层的blending模型,首先将训练集分为两部分(D1和D2),一部分用于第一层(level 1)的训练,另一部分用于第二层(level 2)的训练。level1 在D1上训练了4个XGBoost,4个GBDT,4个RandomForest,将这些模型的预测结果作为level2的feature,在D2上训练第二层模型。Blending模型的结果相比单模型有细微的提升,但这点提升相对于模型复杂度带来的计算代价显得微不足道。 123 | 124 | 125 | ![flag.png](https://github.com/wepe/O2O-Coupon-Usage-Forecast/blob/master/flag.png) 126 | -------------------------------------------------------------------------------- /Loan_risk_prediction/README.md: -------------------------------------------------------------------------------- 1 | # TIANCHI_Project 2 | 天池大数据比赛总结 3 | -------------------------------------------------------------------------------- /Loan_risk_prediction/code/Xgboost调优示例.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 _*- 2 | 3 | """ 4 | @version: 5 | @author: CharlesXu 6 | @license: Q_S_Y_Q 7 | @file: Xgboost调优示例.py 8 | @time: 2018/2/24 17:09 9 
| @desc: 10 | """ 11 | ''' 12 | Xgboost 调参详解 13 | 参考博客: http://blog.csdn.net/han_xiaoyang/article/details/52665396 14 | 15 | GridSearchCV 博客: 16 | http://blog.csdn.net/cherdw/article/details/54970366 17 | ''' 18 | import pandas as pd 19 | import numpy as np 20 | import xgboost as xgb 21 | import matplotlib.pylab as plt 22 | 23 | from matplotlib.pylab import rcParams 24 | from xgboost import XGBClassifier 25 | from sklearn import metrics 26 | 27 | from sklearn.model_selection import GridSearchCV # 网格搜索 28 | 29 | # %matplotlib inline 30 | rcParams['figure.figsize'] = 12, 4 31 | 32 | train = pd.read_csv(r'E:\py_workspace\TIANCHI_Project\Loan_risk_prediction\data\train_modified.csv') 33 | target = 'Disbursed' 34 | IDcol = 'ID' 35 | 36 | # 先定义一个函数,帮助我们建立XGBoost models 并进行交叉验证 37 | def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50): 38 | if useTrainCV: 39 | xgb_param = alg.get_xgb_params() 40 | xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values) 41 | cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], 42 | nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds, 43 | verbose_eval=False) 44 | alg.set_params(n_estimators=cvresult.shape[0]) 45 | # Fit the algorithm on the data 46 | alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc') 47 | 48 | # Predict training set 49 | dtrain_predictions = alg.predict(dtrain[predictors]) 50 | dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1] 51 | 52 | print('\n Model Report') 53 | print('Accuracy: %.4g' % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions)) 54 | print('AUC Score (Train): %f' % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob)) 55 | 56 | feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False) 57 | feat_imp.plot(kind='bar', title='Feature Importance') 58 | plt.ylabel('Feature Importance') 59 | 60 | 61 | # 62 | # 第一步: 确定学习速率和tree_based参数 调优的估计器的数目 63 | # 64 | predictors = [x for x in train.columns if x not in [target, IDcol]] 65 | xgb1 = XGBClassifier( 66 | learning_rate= 0.1, 67 | n_estimators= 1000, 68 | max_depth= 5, # 树的最大深度。这个值也是用来避免过拟合的。max_depth越大,模型会学到更具体更局部的样本。 69 | # 需要使用cv函数来进行调优。 70 | # 典型值 3-10 71 | min_child_weight= 1, # 决定最小叶子节点样本权重和。这个参数用于避免过拟合,当它的值较大时,可以避免 72 | # 模型学习到局部的特殊样本 73 | # 但是如果这个值过高,会导致欠拟合。这个参数需要使用cv来调整 74 | # 默认是 1 75 | gamma= 0, # 在节点分裂时,只有分裂后损失函数的值下降,才会分裂这个节点。 76 | # gamma指定了节点分裂所需的最小损失函数下降值 77 | # 这个参数的值越大,算法越保守。这个参数的值和损失函数息息相关。 78 | subsample= 0.8, # 这个参数控制对于每棵树随机采样的比例 79 | # 减少这个参数的值,算法会更加保守,避免过拟合。 80 | # 但是,如果这个值设置的越小,它可能会导致欠拟合。 81 | # 典型值 0.5-1 82 | colsample_bytree= 0.8, # 用来控制树的每一级的每一次分裂,对列数的采样的占比 83 | # subsample和colsample_bytree可以起到相同的作用。 84 | objective= 'binary:logistic', # 学习目标参数 二分类的逻辑回归,返回预测的概率 (不是类别) 85 | # 'multi:softmax' 使用softmax的多分类器,返回预测的类别 (不是概率) 86 | # 这种情况下需要多加一个参数 : num_class (类别数目) 87 | nthread= 4, 88 | scale_pos_weight= 1, # 默认为1 89 | # 在各类样本十分不平衡时,把这个参数设置为一个正值,可以使算法更快收敛 90 | seed= 27 # 随机数种子 91 | # 设置它可以复现随机数据的结果,也可以用于调整参数 92 | ) 93 | 94 | 95 | # 第二步:max_depth 和 min_child_weight 参数调优 96 | # grid_search 参考: 97 | # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html 98 | # http://blog.csdn.net/abcjennifer/article/details/23884761 99 | 100 | # 网格搜索scoring='roc_auc'只支持二分类,多分类需要修改scoring(默认支持多分类) 101 | param_test1 = { 102 | 'max_depth':range(3, 10, 2), 103 | 'min_child_weight': range(1, 6, 2) 104 | } 105 |
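# 编者补充的示意(非原脚本内容): param_test1 粗搜到最优组合后,可在其附近
# 缩小步长做二次网格,下面的 param_test2 即是这种做法。例如:
#   best = gsearch1.best_params_   # 形如 {'max_depth': 5, 'min_child_weight': 3}
#   param_refine = {'max_depth': [best['max_depth'] - 1, best['max_depth'], best['max_depth'] + 1],
#                   'min_child_weight': [best['min_child_weight'] - 1, best['min_child_weight'], best['min_child_weight'] + 1]}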
106 | param_test2 = { 107 | 'max_depth':[4,5,6], 108 | 'min_child_weight':[4,5,6] 109 | } 110 | # Deprecated since version 0.18: This module will be removed in 0.20. 111 | # Use sklearn.model_selection.GridSearchCV instead. 112 | # GridSearchCV 它存在的意义就是自动调参,只要把参数传进去,就能给出最优化的结果和参数,但是这个方法只适合小数据集。 113 | # 一旦数据量上去了,很难得出结果。 114 | # 数据量大的时候可以使用一个快速调优的方法-坐标下降,它其实是一种贪心算法,拿当前对模型影响最大的参数调优,直到最优化 115 | # 再拿下一个影响最大的参数调优,如此下去,直到所有的参数调整完毕。 116 | # 这个方法的缺点就是可能会调到局部最优而不是全局最优,但是省时省力 117 | # 后续可用bagging优化 118 | gsearch1 = GridSearchCV( 119 | estimator= XGBClassifier( # 确定所使用的分类器,每一个分类器都需要一个score参数,或者score方法 120 | learning_rate=0.1, 121 | n_estimators=140, 122 | max_depth=5, 123 | min_child_weight=1, 124 | gamma=0, 125 | subsample=0.8, 126 | colsample_bytree=0.8, 127 | objective= 'binary:logistic', 128 | nthread=4, 129 | scale_pos_weight=1, 130 | seed=27 131 | ), 132 | param_grid=param_test1, # 值为字典或者列表,即需要优化的参数的值 133 | scoring='roc_auc', # 准确度评价标准,默认为None,这时需要使用score函数 134 | n_jobs=4, # 并行数, int: 个数 -1 跟cpu核数一致, 1 默认值 135 | iid=False, # 默认为True,为True时,默认为各个样本fold概率分布一致,误差估计为所有样本之和,而非各个fold的平均 136 | cv=5, # 交叉验证参数,默认为None 137 | verbose= 2, # 日志冗长度,int, 0:不输出训练过程, 1: 偶尔输出, >1:对每个子模型都输出 138 | refit=True # 默认为True,程序将会以交叉验证训练集得到的最佳参数,重新对所有可用的训练集与开发集进行, 139 | # 作为最终用于性能评估的最佳模型参数。即在搜索参数结束后,用最佳参数结果再次fit一遍全部数据集。 140 | ) 141 | gsearch1.fit(train[predictors],train[target]) # 运行网格搜索 142 | #gsearch1.grid_scores_, # 给出不同参数情况下的评价结果 143 | print(gsearch1.best_params_) # 描述了已取得最佳结果的参数的组合 144 | print(gsearch1.best_score_) # 成员提供优化过程期间观察到的最好的结果的评分 145 | 146 | # 第三步: gamma参数调优 147 | param_test3 = { 148 | 'gamma':[i/10.0 for i in range(0, 5)] 149 | } 150 | gsearch3 = GridSearchCV(estimator=XGBClassifier( 151 | learning_rate=0.1, 152 | n_estimators=140, 153 | max_depth=4, 154 | min_child_weight=6, 155 | gamma=0, 156 | subsample=0.8, 157 | colsample_bytree=0.8, 158 | objective='binary:logistic', 159 | nthread=4, 160 | scale_pos_weight=1, 161 | seed=27), 162 | param_grid=param_test3, 163 | scoring='roc_auc', 164 | n_jobs=4, 165 | iid=False, 166 | cv=5 167 | ) 168 | gsearch3.fit(train[predictors],train[target]) 169 | print(gsearch3.best_params_, gsearch3.best_score_) 170 | 171 | 172 | 173 | if __name__ == '__main__': 174 | pass -------------------------------------------------------------------------------- /Loan_risk_prediction/data/Train_nyOWmfK.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Loan_risk_prediction/data/Train_nyOWmfK.csv -------------------------------------------------------------------------------- /Loan_risk_prediction/doc/README.md: -------------------------------------------------------------------------------- 1 | ## 推荐资料: 2 | 3 | 4 | 5 | - wepon大神的github吐血推荐!!!https://github.com/wepe ,本次比赛主要参考了他们去年微额借款用户人品预测大赛冠军解决方案以及拍拍贷风险控制大赛铜奖解决方案,干货多多!
6 | - 此外还有:金老师的知乎专栏:https://zhuanlan.zhihu.com/jlbookworm ,收录了各种大神解决方案和开源代码。 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /Loan_risk_prediction/doc/不得直视本王-解决方案.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Loan_risk_prediction/doc/不得直视本王-解决方案.pdf -------------------------------------------------------------------------------- /Loan_risk_prediction/doc/创新应用.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Loan_risk_prediction/doc/创新应用.docx -------------------------------------------------------------------------------- /Loan_risk_prediction/doc/最优分箱.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Loan_risk_prediction/doc/最优分箱.docx -------------------------------------------------------------------------------- /Loan_risk_prediction/doc/风控算法大赛解决方案.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Loan_risk_prediction/doc/风控算法大赛解决方案.pdf -------------------------------------------------------------------------------- /PPD_RiskControl/README.md: -------------------------------------------------------------------------------- 1 | ### [拍拍贷风险控制大赛铜奖解决方案](https://github.com/wepe/PPD_RiskControlCompetition) 2 | 3 | - 赛题介绍 4 | 5 | 请参见 [Kesci官网介绍](http://www.kesci.com/apps/home_log/index.html#!/competition/56cd5f02b89b5bd026cb39c9) 6 | 7 | - 解决方案 8 | 9 | 详细解决方案请看 [PDF文件](https://github.com/wepe/PPD_RiskControlCompetition/blob/master/%E9%A3%8E%E6%8E%A7%E7%AE%97%E6%B3%95%E5%A4%A7%E8%B5%9B%E8%A7%A3%E5%86%B3%E6%96%B9%E6%A1%88.pdf) 10 | 11 | - 代码目录说明 12 | 13 | - `proccess 文件夹` 14 | 15 | - Split.java将采样数据分块 16 | - CombineSample.java将采样数据合并 17 | - NThreadRNB.java和NThreadMatrix.java是并行采样代码工具类 18 | 19 | 20 | - `feature engineering 文件夾` 21 | 22 | - NullDiscrete.java和CityFeature.java为计算城市向量特征 23 | - city_feature.sql 为数据库处理代码 24 | - rank.py 对原始数值特征进行排序,得到排序特征 25 | - null.py 分析和处理缺失值 26 | - jw.py 生成经纬度特征 27 | - CategoryFeatureProcess.py 特征处理 28 | - MergeFeature.py 特征合并 29 | - SelectFeature.py 特征筛选 30 | - jingch文件夹 31 | - UserLogInfoFeature 登录信息特征提取类 32 | - UserUpdateInfoFeature 修改信息特征提取类 33 | - MergeTool 模型融合工具类 34 | - FeatureProcess 特征处理类 35 | 36 | 37 | 38 | - `feature_select 文件夹` 39 | 40 | - sort_feature_using_xgb.py 训练xgb模型对特征进行重要性排序,特征选择 41 | - avg_featurescore.py 将多份featurescore文件全加,得到特征重要性排序文件 42 | 43 | 44 | 45 | 46 | - `sample 文件夹` 47 | 48 | 包含了两种采样代码,其中MY0为本次比赛使用的基于粗糙集的并行过采样算法 49 | 50 | 51 | - `util 文件夹` 52 | 53 | 这个文件夹包含一些常用的工具类和代码 54 | - visualize_null.py 缺失值的可视化分析 55 | - visualize_dataset.py 对每日成交量和违约量进行可视化 56 | - plot_feature_importance.py 画特征重要性图 57 | - cal_auc.py 计算auc,线下验证 58 | 59 | 60 | - `lr文件夹` 61 | 62 | - lr.py 逻辑回归模型文件 63 | 64 | 65 | 66 | - `svm 文件夹` 67 | 68 | - svm.py 运用数据集分解的方法训练多个svm进行averaging 69 | - avg_val.py 查看在验证集上的效果 70 | - avg_test.py 查看在测试集上的效果 71 | 72 | 73 | - `xgb文件夹` 74 | 75 | - single_xgb.py 单模型xgboost,线下cv的auc值是0.782左右 76 | - bagging_xgb.py 单模型xgboost的改进版,对参数和特征加入随机扰动,训练多个xgboost子模型 77 | - avg_test.py 对子模型预测结果进行averaging 78 | - graphlab_xgboost.py 
graphlab版本的xgboost 79 | 80 | - `ensemble文件夹` 81 | 82 | - cal_mic.py 计算单模型结果文件之间的相关性 83 | - ensemble.py 多个单模型的加权融合,通过验证集选取最优参数 84 | - blend_ensemble.py 训练多个单模型,并进行blending融合 85 | 86 | 87 | - 使用须知 88 | 89 | - 代码可以自由使用,但请保留出处。 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /PPD_RiskControl/doc/风控算法大赛解决方案.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/PPD_RiskControl/doc/风控算法大赛解决方案.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TIANCHI_Project 2 | 天池大数据比赛总结 3 | 4 | ### Ad_Convert_prediction : 阿里妈妈广告转化率预测 5 | 6 | ### Click_prediction : 点击率预估 7 | 8 | ### Coupon_Usage_Predict : 优惠券使用预测 9 | 10 | ### Loan_risk_prediction : 贷款风险预测 11 | 12 | ### PPD_RiskControl : 拍拍贷贷款风险预测 13 | 14 | ### Zhihuijiaotong : 智慧交通预测 15 | -------------------------------------------------------------------------------- /Shangjialiuliang_predict/README.md: -------------------------------------------------------------------------------- 1 | # 代码说明文档 2 | ## 环境需求 3 | ### 运行环境 4 | - python2.7 Anaconda 4.0 5 | - Jupyter notebook 6 | ### 外部依赖库 7 | - numpy 8 | - pandas 9 | - sklearn 10 | - statsmodels 11 | ## 文件 12 | ### data 13 | 14 | 用于存储所有的数据,包括原始数据,额外数据,处理后的数据,模型中间数据以及最后提交的结果。 15 | 16 | #### results 17 | 存储模型和规则预测出的最终结果。 18 | #### shop\_info\_name2Id 19 | 将商店中的地址、三级分类等名词映射成Id保存在该文件夹下。 20 | #### statistics 21 | 原始数据处理后的数据,包括平滑后的数据,天气数据和天气统计。 22 | #### test_train 23 | 存储线下线上train和test的特征以及标签文件。 24 | #### weekABCD 25 | 线下线上训练集和测试集的划分,按日分。 26 | #### weekABCD_0123 27 | 线下线上训练集和测试集的划分(将一天分为四个时间段,每六小时一个时间段)。 28 | 29 | ### main 30 | 主要的数据预处理代码和模型,以及数据分析代码。 31 | #### analysis 32 | 数据分析的代码和统计结果。 33 | 34 | #### data_processing 35 | 数据预处理,包括数据统计,数据预处理,数据平滑,训练集和测试集划分。 36 | - `avg_smoothing.py` 对数据中的0进行处理,遇到0,用前三星期对应值的平均值替换 37 | - `smoothing.py` 处理数据中的异常值 38 | - `split_test_train.py` 数据集训练集和测试集划分 39 | 40 | #### draw_picture 41 | 用于画图的一些基本的函数,方便数据的显示和分析 42 | - `draw.py` 画图 43 | 44 | #### fuse 45 | 模型融合相关文件。 46 | - `fuse.py` 将两个模型的结果进行融合,需要运行`run.py`文件来调用。 47 | 48 | #### model 49 | 我们在比赛中所使用过的模型,包括 ARIMA,GBDT,LR,RF,Extremely Randomized Trees等。 50 | - `base_model.py` ExtraTreesRegressor模型,是我们在比赛线上线下预测中最主要使用的模型。 51 | - `gbdt.py` 基于GBDT模型的简单预测。 52 | - `RandomForestRegreessor.py` 基于随机森林模型的预测。 53 | - `predict_two_week.py` 与之前复制单周预测不同,直接预测两周结果。 54 | - `multi_mode.py` 不同模型,不同特征,不同参数的结合。 55 | - `use_first_week_predict_second_week.py` 在预测第一周的结果后,将第一周的结果用于第二周的预测。 56 | 57 | ##### old model 58 | 该文件夹下主要是比赛初期使用的一些预测模型,在后来被我们放弃使用。 59 | - `mean_test.py` 60 | 均值预测,取最后三周平滑数据对应周期(7)的均值,预测一周销售量,最后输出结果到文件。 61 | - `lr_test.py` LR预测,使用Ridge回归拟合最后三周的每个商铺的总销售量,预测一周的总销售量,最后输出差值到文件。 62 | - `arima_pred.py` 使用了ARIMA模型进行预测,对数据进行对数变换,一阶差分后计算ADF值,找到最稳定的差分项。之后用grid search找出拟合效果最好的模型进行预测。我们采用了bic和smape对模型效果进行评估。 63 | 64 | #### rule 65 | 主要使用的规则代码,包括节假日(双十一等),天气处理,火锅店单独处理等,执行`run.py`在指定结果上添加规则影响。 66 | - `special_day.py` 对特定的节假日如11月11日进行处理。 67 | - `weather.py` 根据天气值对预测值进行处理 68 | - `hot_pot.py` 对火锅店进行特殊处理 69 | ### notebook 70 | 为了方便统计数据特征,ARIMA模型结果,可视化结果和特征,我们使用了jupyter notebook来处理和保存含有图片的代码。详见文件夹中具体代码。 71 | ### pictures 72 | 一些统计信息和图片。 -------------------------------------------------------------------------------- /Shangjialiuliang_predict/data/shop_info_name2Id/cate_1_name.csv:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/data/shop_info_name2Id/cate_1_name.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/data/shop_info_name2Id/cate_2_name.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/data/shop_info_name2Id/cate_2_name.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/data/shop_info_name2Id/cate_3_name.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/data/shop_info_name2Id/cate_3_name.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/data/shop_info_name2Id/city_name.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/data/shop_info_name2Id/city_name.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/data/shop_info_name2Id/shop_info.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/data/shop_info_name2Id/shop_info.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/doc/资料.txt: -------------------------------------------------------------------------------- 1 | 香瓜资料: 2 | 3 | 参考github链接: https://github.com/RogerMonkey/IJCAI_CUP_2017 -------------------------------------------------------------------------------- /Shangjialiuliang_predict/main/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 _*- 2 | 3 | """ 4 | @version: 5 | @author: CharlesXu 6 | @license: Q_S_Y_Q 7 | @file: __init__.py.py 8 | @time: 2018/3/5 14:48 9 | @desc: 10 | """ 11 | 12 | if __name__ == '__main__': 13 | pass -------------------------------------------------------------------------------- /Shangjialiuliang_predict/notebook/a.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/notebook/a.txt -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/cate_shop_number/cate_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/cate_shop_number/cate_1.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/cate_shop_number/cate_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/cate_shop_number/cate_1.png -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/cate_shop_number/cate_2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/cate_shop_number/cate_2.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/cate_shop_number/cate_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/cate_shop_number/cate_2.png -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/cate_shop_number/cate_3.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/cate_shop_number/cate_3.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/cate_shop_number/cate_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/cate_shop_number/cate_3.png -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/city_shop_number/0-50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/city_shop_number/0-50.png -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/city_shop_number/101-121.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/city_shop_number/101-121.png -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/city_shop_number/51-100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/city_shop_number/51-100.png -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/city_shop_number/all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/city_shop_number/all.png -------------------------------------------------------------------------------- /Shangjialiuliang_predict/pictures/city_shop_number/city_shop_number.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Shangjialiuliang_predict/pictures/city_shop_number/city_shop_number.csv -------------------------------------------------------------------------------- /Shangjialiuliang_predict/run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from main.data_processing import run as dp_run 4 | from main.model import run as model_run 5 | from main.rule import run as rule_run 6 | from main.fuse import run as fuse_run 7 | 8 | def run(): 9 | ''' 10 | 数据预处理 11 | :return: 12 | ''' 13 | dp_run() 14 | 15 | ''' 16 | 跑模型 17 | ''' 18 | model_run() 19 | 20 | ''' 21 | 模型融合 22 | ''' 23 | fuse_run() 24 | 25 | ''' 26 | 跑规则 27 | ''' 28 | rule_run() 29 | -------------------------------------------------------------------------------- /Tencent_Social_Ads/README.md: -------------------------------------------------------------------------------- 1 | # 第一届腾讯社交广告高校算法大赛-移动App广告转化率预估 2 | 赛题详情http://algo.tpai.qq.com/home/information/index.html 3 | 题目描述 4 | 根据从某社交广告系统连续两周的日志记录按照推广中的App和用户维度随机采样构造的数据, 5 | 预测App广告点击后被激活的概率:pCVR=P(conversion=1 | Ad,User,Context), 6 | 即给定广告、用户和上下文情况下广告被点击后发生激活的概率。 7 | # 运行环境 8 | - 操作系统 Ubuntu 14.04.4 LTS (GNU/Linux 4.2.0-27-generic x86_64) 9 | - 内存 128GB 10 | - CPU 32 Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz 11 | - 显卡 TITAN X (Pascal) 12GB 12 | - 语言 Python3.6 13 | - Python依赖包 14 | 1. Keras==2.0.6 15 | 2. lightgbm==0.1 16 | 3. matplotlib==2.0.0 17 | 4. numpy==1.11.3 18 | 5. pandas==0.19.2 19 | 6. scikit-learn==0.18.1 20 | 7. scipy==0.18.1 21 | 8. tensorflow-gpu==1.2.1 22 | 9. tqdm==4.11.2 23 | 10. xgboost==0.6a2 24 | - 其他库 25 | LIBFFM v121 26 | # 运行说明 27 | 1. 将复赛数据文件`final.zip`放在根目录下 28 | 2. 在根目录下运行`sh run.sh`命令生成特征文件 29 | 3. 打开`./code/_4_*_model_*.ipynb`分别进行模型训练和预测,生成单模型提交结果,包括`lgb,xgb,ffm,mlp` 30 | 4. 打开`./code/_4_5_model_avg.ipynb`进行最终的加权平均并生成最终提交结果 31 | # 方案说明 32 | 33 | 1. 用户点击日志挖掘`_2_1_gen_user_click_features.py` 34 | 挖掘广告点击日志,从不同时间粒度(天,小时)和不同属性维度(点击的素材,广告,推广计划,广告主类型,广告位等)提取用户点击行为的统计特征。 35 | 2. 用户安装日志挖掘 `_2_2_gen_app_install_features.py` 36 | 根据用户历史APP安装记录日志,分析用户的安装偏好和APP的流行趋势,结合APP安装时间的信息提取APP的时间维度的描述向量。这里最后只用了一种特征。 37 | 3. 广告主转化回流上报机制分析`_2_4_gen_tricks.py` 38 | 不同的广告主具有不同的转化计算方式,如第一次点击算转化,最后一次点击算转化,安装时点击算转化,分析并构造相应描述特征,提升模型预测精度。 39 | 4. 广告转化率特征提取`_2_5_gen_smooth_cvr.py` 40 | 构造转化率特征,使用全局和滑动窗口等方式计算单特征转化率,组合特征转化率,使用均值填充,层级填充,贝叶斯平滑,拉普拉斯平滑等方式对转化率进行修正。 41 | 5. 广告描述向量特征提取`_2_6_gen_ID_click_vectors.py` 42 | 广告投放是有特定受众对象的,而特定的受众对象也可以描述广告的相关特性,使用不同的人口属性对广告ID和APPID进行向量表示,学习隐含的语义特征。 43 | 6. 
建模预测 44 | 使用多种模型进行训练,包括LightGBM,XGBoost,FFM和神经网络,最后进行多模型加权融合提高最终模型性能。 45 | 46 | # 其他 47 | - 最终线上排名20,logloss 0.101763 48 | - 最终特征维度在110左右 49 | - 部分最终没有采用的特征代码依然保留 50 | - 由于我们团队的代码是3个人共同完成的,我这里整理的模型训练的部分可能和当时略有差异,但特征部分基本一致。 51 | - `deprecated`目录下为弃用的代码,包括一些原始代码和打算尝试的方法 -------------------------------------------------------------------------------- /Tencent_Social_Ads/data/数据说明.txt: -------------------------------------------------------------------------------- 1 | 复赛: http://algo.tpai.qq.com/home/information/index.html 2 | 3 | 描述 4 | 计算广告是互联网最重要的商业模式之一,广告投放效果通常通过曝光、点击和转化各环节来衡量,大多数广告系统受广告效果数据回流的限制 5 | 只能通过曝光或点击作为投放效果的衡量标准开展优化。 6 | 腾讯社交广告(http://ads.tencent.com)发挥特有的用户识别和转化跟踪数据能力,帮助广告主跟踪广告投放后的转化效果, 7 | 基于广告转化数据训练转化率预估模型(pCVR,Predicted Conversion Rate), 8 | 在广告排序中引入pCVR因子优化广告投放效果,提升ROI。 9 | 10 | 本题目以移动App广告为研究对象,预测App广告点击后被激活的概率:pCVR=P(conversion=1 | Ad,User,Context), 11 | 即给定广告、用户和上下文情况下广告被点击后发生激活的概率。 12 | 13 | 测试数据 14 | 从训练数据时段随后1天(即第31天)的广告日志中按照与训练数据同样的采样方式抽取得到, 15 | 测试数据文件(test.csv)每行代表一个测试样本,各字段之间由逗号分隔, 16 | 顺序依次为:“instanceID,-1,clickTime,creativeID,userID,positionID,connectionType,telecomsOperator”。 17 | 其中,instanceID唯一标识一个样本,-1代表label占位使用,表示待预测。 18 | 19 | 训练数据 20 | 从腾讯社交广告系统中某一连续两周的日志中按照推广中的App和用户维度随机采样。 21 | 22 | 每一条训练样本即为一条广告点击日志(点击时间用clickTime表示),样本label取值0或1,其中0表示点击后没有发生转化,1表示点击后有发生转化, 23 | 如果label为1,还会提供转化回流时间(conversionTime,定义详见“FAQ”)。给定特征集如下: 24 | 25 | 特征 分类 描述 26 | 广告特征 账户ID(advertiserID) 腾讯社交广告的账户结构分为四级:账户——推广计划——广告——素材,账户对应一家特定的广告主。 27 | 推广计划ID(campaignID) 推广计划是广告的集合,类似电脑文件夹功能。广告主可以将推广平台、预算限额、是否匀速投放等条件相同的广告放在同一个推广计划中,方便管理。 28 | 广告ID(adID) 腾讯社交广告管理平台中的广告是指广告主创建的广告创意(或称广告素材)及广告展示相关设置,包含广告的基本信息(广告名称,投放时间等),广告的推广目标,投放平台,投放的广告规格,所投放的广告创意,广告的受众(即广告的定向设置),广告出价等信息。单个推广计划下的广告数不设上限。 29 | 素材ID(creativeID) 展示给用户直接看到的广告内容,一条广告下可以有多组素材。 30 | AppID(appID) 广告推广的目标页面链接地址,即点击后想要展示给用户的页面,此处页面特指具体的App。多个推广计划或广告可以同时推广同一个App。 31 | App分类(appCategory) App开发者设定的App类目标签,类目标签有两层,使用3位数字编码,百位数表示一级类目,十位个位数表示二级类目,如“210”表示一级类目编号为2,二级类目编号为10,类目未知或者无法获取时,标记为0。 32 | App平台(appPlatform) App所属操作系统平台,取值为Android,iOS,未知。同一个appID只会属于一个平台。 33 | 用户特征 用户ID(userID) 唯一标识一个用户 34 | 年龄(age) 取值范围[0, 80],其中0表示未知。 35 | 性别(gender) 取值包括男,女,未知。 36 | 学历(education) 用户当前最高学历,不区分在读生和毕业生,取值包括小学,初中,高中,专科,本科,硕士,博士,未知 37 | 婚恋状态(marriageStatus) 用户当前感情状况,取值包括单身,新婚,已婚,未知。 38 | 育儿状态(haveBaby) 用户当前孕育宝宝状态,取值包括孕育中,宝宝0~6个月,宝宝6~12个月,宝宝1~2岁,宝宝2~3岁,育儿但宝宝年龄未知,未知。 39 | 家乡/籍贯(hometown) 用户出生地,取值具体到市级城市,使用二级编码,千位百位数表示省份,十位个位数表示省内城市,如1806表示省份编号为18,城市编号是省内的6号,编号0表示未知。 40 | 常住地(residence) 最近一段时间用户长期居住的地方,取值具体到市级城市,编码方式与家乡相同。 41 | App安装列表(appInstallList) 截止到某一时间点用户全部的App安装列表(appID),已过滤高频和低频App。 42 | App安装流水 最近一段时间内用户安装App行为流水,包括appID,行为发生时间(installTime)和app类别(appCategory),已过滤高频和低频App。 43 | 注:2~8基于用户个人注册资料和算法自动修正得到,9~10基于用户行为日志统计得到。 44 | 上下文特征 广告位ID(positionID) 广告曝光的具体位置,如QQ空间Feeds广告位。 45 | 站点集合ID(sitesetID) 多个广告位的聚合,如QQ空间 46 | 广告位类型(positionType) 对于某些站点,人工定义的一套广告位规格分类,如Banner广告位。 47 | 联网方式(connectionType) 移动设备当前使用的联网方式,取值包括2G,3G,4G,WIFI,未知 48 | 运营商(telecomsOperator) 移动设备当前使用的运营商,取值包括中国移动,中国联通,中国电信,未知 49 | 50 | 特别的,出于数据安全的考虑,对于userID,appID,特征,以及时间字段,我们不提供原始数据,按照如下方式加密处理: 51 | 52 | 字段 解释 53 | userID 对每个用户ID,随机生成一个不重复的userID(假设用户数为100w,将所有用户随机打散排列,将其序号作为userID,取值范围是[1, 100w]),使用加密后的userID作为用户标识。 54 | appID 参考用户ID的加密方式,生成加密后的appID。 55 | 特征 参考用户ID的加密方式,生成加密后的特征ID。特别的,我们使用“0”表示特征值未知。 56 | 时间 包括 clickTime,conversionTime,installTime,格式均为DDHHMM,其中DD代表第几天,HH代表小时,MM代表分钟。 57 | 58 | 59 | 训练数据文件(train.csv) 60 | 每行代表一个训练样本,各字段之间由逗号分隔, 61 | 顺序依次为:“label,clickTime,conversionTime,creativeID,userID,positionID,connectionType,telecomsOperator”。 62 | 
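(编者补充示例,非官方数据说明内容:按上文 DDHHMM 的时间编码,clickTime 可以如下拆成天/时/分;仅为示意,列名以上述字段说明为准)

    import pandas as pd
    train = pd.read_csv('train.csv')        # label,clickTime,conversionTime,...
    t = train['clickTime']                  # 形如 DDHHMM 的整数,如 170005 表示第17天00时05分
    train['click_day'] = t // 10000
    train['click_hour'] = t // 100 % 100
    train['click_minute'] = t % 100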
--------------------------------------------------------------------------------
/Tencent_Social_Ads/doc/各代码功能说明.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/charlesXu86/TIANCHI_Project/ad4598ebc8e5bf63b07185ddbc9418c9504e05ce/Tencent_Social_Ads/doc/各代码功能说明.txt
--------------------------------------------------------------------------------
/Tencent_Social_Ads/doc/模型介绍.txt:
--------------------------------------------------------------------------------
Reference GitHub repositories:
https://github.com/shenweichen/Tencent_Social_Ads2017_Mobile_App_pCVR
https://github.com/BladeCoda/Tencent2017_Final_Coda_Allegro
Blog: http://blog.csdn.net/haphapyear/article/details/75057407/
--------------------------------------------------------------------------------
/Tencent_Social_Ads/notebook/_1_preprocess_data.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from Ad_Utils import raw_data_path,feature_data_path,result_path,cache_pkl_path,dump_pickle,load_pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gen_global_index():\n",
    "    train = pd.read_csv(raw_data_path+'train.csv')\n",
    "    test = pd.read_csv(raw_data_path+'test.csv')\n",
    "    all_data = train.append(test)\n",
    "    all_data['global_index'] = np.arange(0,all_data.shape[0])\n",
    "    train = all_data.iloc[0:train.shape[0],:]\n",
    "    test = all_data.iloc[train.shape[0]:,:]\n",
    "    dump_pickle(train,raw_data_path+'train.pkl')\n",
    "    dump_pickle(test,raw_data_path+'test.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def csv_pkl(csv_name_without_suffix,protocol=None):\n",
    "    pkl_path = raw_data_path+csv_name_without_suffix +'.pkl'\n",
    "    if not os.path.exists(pkl_path):\n",
    "        print('generating '+pkl_path)\n",
    "        data = pd.read_csv(raw_data_path+csv_name_without_suffix+'.csv')\n",
    "        dump_pickle(data,pkl_path,protocol=protocol)\n",
    "    else:\n",
    "        print('found '+pkl_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gen_demo_result():\n",
    "    test = pd.read_csv(raw_data_path+'test.csv')\n",
    "    test = test[['instanceID','label']]\n",
    "    test.rename(columns={'label':'prob'},inplace=True)\n",
    "    if not os.path.exists(result_path):\n",
    "        os.mkdir(result_path)\n",
    "    test.to_csv(result_path+'demo_result.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found ../raw_data/ad.pkl\n",
      "found ../raw_data/position.pkl\n",
      "found ../raw_data/app_categories.pkl\n",
      "found ../raw_data/test.pkl\n",
      "found ../raw_data/user_app_actions.pkl\n",
      "found ../raw_data/user.pkl\n",
      "generating ../raw_data/user_installedapps.pkl\n"
     ]
    }
   ],
   "source": [
    "if __name__ == '__main__':\n",
    "    gen_global_index()\n",
    "    train = load_pickle(raw_data_path+'train.pkl')\n",
    "    train = train[train.clickTime>=17000000]  # drop day-16 data\n",
    "    dump_pickle(train,raw_data_path+'train.pkl')\n",
    "    \n",
    "    csv_pkl('ad')\n",
    "    csv_pkl('position')\n",
    "    csv_pkl('app_categories')\n",
    "    csv_pkl('test')\n",
    "    csv_pkl('user_app_actions')\n",
    "    csv_pkl('user')\n",
    "    csv_pkl('user_installedapps',protocol=4)\n",
    "    \n",
    "    gen_demo_result()\n",
    "    \n",
    "    if not os.path.exists(feature_data_path):\n",
    "        os.mkdir(feature_data_path)\n",
    "    if not os.path.exists(cache_pkl_path):\n",
    "        os.mkdir(cache_pkl_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
/Tencent_Social_Ads/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
unzip final.zip
mv final data
code_path=./
cd code
python $code_path"_1_preprocess_data.py"
python $code_path"_2_1_gen_user_click_features.py"
python $code_path"_2_2_gen_app_install_features.py"
python $code_path"_2_3_gen_global_sum_counts.py"
python $code_path"_2_4_gen_tricks.py"
python $code_path"_2_5_gen_smooth_cvr.py"
python $code_path"_2_6_gen_ID_click_vectors.py"
python $code_path"_2_7_gen_trick_final.py"
python $code_path"_3_0_gen_final_data.py"
echo "finished gen data!!!"
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Ad_Utils.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Ad_Utils.py
@time: 2018/3/7 16:27
@desc: Data-preprocessing utilities
"""

import pickle
import pandas as pd
import numpy as np
import scipy.stats as sps  # statistical inference

from tqdm import tqdm

# file paths (forward slashes avoid backslash-escape surprises on Windows;
# feature_data_path must end with a separator because file names are concatenated onto it)
raw_data_path = 'E:/dataset/final/'
feature_data_path = 'E:/dataset/final/features/'
cache_pkl_path = '../cache_pkl/'
result_path = '../result/'

def load_pickle(path):
    return pickle.load(open(path, 'rb'))

def dump_pickle(obj, path, protocol=None):
    pickle.dump(obj, open(path, 'wb'), protocol=protocol)

def analyse(data, field):
    # per-value click count, conversion count and conversion rate
    clicks = data.groupby(field).size()
    conversions = data.groupby(field)['label'].sum()
    c = pd.DataFrame({'conversion': conversions, 'click': clicks})
    c.reset_index(inplace=True)
    c['prob'] = c['conversion'] / c['click']
    return c.sort_values('prob', ascending=False)

def generate_file(valid_y, pred_prob):
    ans = valid_y.copy()
    ans['prob'] = pred_prob
    return ans

def addCrossFeature(split_train, split_test, feature_1, feature_2):
    '''
    Build a crossed feature from the training set; combinations that only
    appear in the test set get NA.
    '''
    comb_index = split_train[[feature_1, feature_2]].drop_duplicates()
    comb_index[feature_1 + '_' + feature_2] = np.arange(1, comb_index.shape[0] + 1)
    split_train = pd.merge(split_train, comb_index, 'left', on=[feature_1, feature_2])
    split_test = pd.merge(split_test, comb_index, 'left', on=[feature_1, feature_2])
    return split_train, split_test

def get_feature_value(features, values, sort=True):
    '''
    Pair feature names with their values (e.g. model coefficients),
    optionally sorted by absolute value.
    '''
    df = pd.DataFrame({'name': features, 'value': values, 'abs_': np.abs(values)})
    if sort:
        return df.sort_values('abs_', ascending=False)
    else:
        return df

def feature_spearmanr(data, feature_list):
    '''
    Pairwise Spearman correlation between features.
    '''
    cor_feature = []
    spearmanr = []
    for i in range(0, len(feature_list)):
        for j in range(i + 1, len(feature_list)):
            cor_feature.append('_'.join([feature_list[i], feature_list[j]]))
            # fixed: the second argument was data[feature_list[i]], i.e. the
            # correlation of each feature with itself
            spearmanr.append(sps.spearmanr(data[feature_list[i]], data[feature_list[j]])[0])
    sp_df = pd.DataFrame({'feature': cor_feature, 'spearmanr': spearmanr})
    sp_df['abs_spearmanr'] = np.abs(sp_df['spearmanr'])
    sp_df.sort_values('abs_spearmanr', ascending=False, inplace=True)
    return sp_df

def feature_target_spearmanr(data, feature_list, target):
    '''
    Spearman correlation of each feature with the target.
    '''
    cor_feature = []
    spearmanr = []
    for i in range(0, len(feature_list)):
        # fixed: was '_'.join([feature_list, target]), which raises on a list
        cor_feature.append('_'.join([feature_list[i], target]))
        spearmanr.append(sps.spearmanr(data[feature_list[i]], data[target])[0])
    sp_df = pd.DataFrame({'feature': cor_feature, 'spearmanr': spearmanr})
    sp_df['abs_spearmanr'] = np.abs(sp_df['spearmanr'])
    sp_df.sort_values('abs_spearmanr', ascending=False, inplace=True)
    return sp_df

def stratified_sampling(train, frac=0.33, seed=0):
    # subsample while preserving the positive/negative ratio
    np.random.seed(seed)
    label_mean = train['label'].mean()
    pos_size = int(train.shape[0] * frac * label_mean)
    neg_size = int(train.shape[0] * frac * (1 - label_mean))
    pos_index = train[train.label == 1].index
    neg_index = train[train.label == 0].index
    sample_pos_idx = np.random.choice(pos_index, pos_size, replace=False)
    sample_neg_idx = np.random.choice(neg_index, neg_size, replace=False)
    sample_index = np.hstack([sample_pos_idx, sample_neg_idx])
    np.random.shuffle(sample_index)
    return train.loc[sample_index]

# Note: the two names below are swapped relative to the usual convention --
# inverse_logit here computes the logit (log-odds) and logit the sigmoid.
# They are kept as-is because downstream notebooks call them by these names.
def inverse_logit(x):
    return np.log(x / (1 - x))

def logit(x):
    return 1 / (1 + np.exp(-x))

def calibration(pred, avg):
    # shift predictions in log-odds space so their mean matches avg
    intercept = inverse_logit(np.mean(pred)) - inverse_logit(avg)
    return logit(inverse_logit(pred) - intercept)

def simple_avg(pred_list):
    # fixed: the accumulator started from np.ones_like, which added a
    # constant 1 to every averaged log-odds value
    ans = np.zeros_like(pred_list[0])
    for p in pred_list:
        ans += inverse_logit(p)
    return logit(ans / len(pred_list))
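
A sketch of how the sampling and calibration helpers are meant to compose: train on a subsample, then re-centre the predicted probabilities on the known base rate. The classifier `clf` and the column list `feature_cols` are placeholders, not part of this module:

# minimal sketch, assuming a `train`/`test` DataFrame pair with a binary
# 'label' column and a fitted scikit-learn-style classifier `clf` (hypothetical)
sample = stratified_sampling(train, frac=0.33, seed=0)
clf.fit(sample[feature_cols], sample['label'])
pred = clf.predict_proba(test[feature_cols])[:, 1]
# pull the prediction mean back to the full-data conversion rate
pred = calibration(pred, train['label'].mean())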
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Feature_joint.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Feature_joint.py
@time: 2018/3/8 15:41
@desc: Joining the raw feature tables
"""

import os
import pickle
import gc
import pandas as pd
import numpy as np

from tqdm import tqdm
from Ad_Utils import load_pickle, dump_pickle, raw_data_path

def addAd(data):
    '''
    Join the raw ad features.
    '''
    feature_path = raw_data_path + 'ad.pkl'
    # all columns of ad.csv ('camgaignID' is spelled this way in the raw data)
    ad_feature = ['adID', 'camgaignID', 'creativeID', 'advertiserID', 'appID', 'appPlatform']
    if os.path.exists(feature_path):
        ad = load_pickle(feature_path)
    else:
        ad = pd.read_csv(raw_data_path + 'ad.csv')
        dump_pickle(ad, feature_path)
    return pd.merge(data, ad[ad_feature], on='creativeID', how='left')

def addPosition(data):
    '''
    Join the raw position features.
    '''
    feature_path = raw_data_path + 'position.pkl'
    position_feature = ['positionID', 'sitesetID', 'positionType']
    if os.path.exists(feature_path):
        position = load_pickle(feature_path)
    else:
        position = pd.read_csv(raw_data_path + 'position.csv')
        dump_pickle(position, feature_path)
    return pd.merge(data, position[position_feature], on='positionID', how='left')

def addAppCategories(data):
    '''
    Join the raw app_categories features.
    '''
    app = pd.read_csv(raw_data_path + 'app_categories.csv')
    # first-level category: the hundreds digit of the 3-digit code
    app['cate_a'] = app['appCategory'].apply(lambda x: x // 100 if x > 100 else x)
    return pd.merge(data, app, on='appID', how='left')

def addUserInfo(data):
    '''
    Join the user table and split hometown/residence into province codes.
    '''
    user_info = pd.read_csv(raw_data_path + 'user.csv')
    data = pd.merge(data, user_info, on='userID', how='left')
    data['ht_province'] = data['hometown'] // 100
    # fixed: this line previously overwrote ht_province instead of creating
    # a residence-province column
    data['res_province'] = data['residence'] // 100
    return data

def addTime(data):
    '''
    Add fields derived from the DDHHMMSS click time.
    '''
    data['clickDay'] = data['clickTime'] // 1000000
    data['clickDay'] = data['clickDay'].astype(int)
    data['clickHour'] = (data['clickTime'] // 10000 % 100).astype(int)
    # data['clickMin'] = (data['clickTime'] % 10000 // 100).astype(int)
    # data['clickSec'] = (data['clickTime'] % 100).astype(int)
    return data
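
These helpers all take and return a DataFrame, so they chain naturally; this is the pattern the feature scripts below use (a sketch, with `df` standing for the appended train + test frame):

df = load_pickle(raw_data_path + 'train.pkl')
df = addTime(df)            # clickDay, clickHour
df = addAd(df)              # adID, camgaignID, advertiserID, appID, appPlatform
df = addPosition(df)        # sitesetID, positionType
df = addAppCategories(df)   # appCategory, cate_a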
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Gen_ID_click_vectors.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Gen_ID_click_vectors.py
@time: 2018/3/19 17:16
@desc: Ad descriptor-vector features.
       Ads target specific audiences, and the audience in turn describes the ad:
       represent each ad ID and App ID as a vector over demographic attributes
       to learn latent semantic features.
"""

import os
import pickle
import gc
import pandas as pd
import numpy as np

from tqdm import tqdm
from Ad_Utils import load_pickle, dump_pickle, raw_data_path, feature_data_path
from Feature_joint import addTime, addPosition, addAd, addAppCategories, addUserInfo


def gen_CountVector_ID_user_clicks(ID_name, last_day=27,
                                   ID_describe_feature_names=['age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby'],
                                   drop_na=False):
    '''
    Build count vectors over train + test describing who clicks each ID_name;
    join key: [ID_name].
    :param ID_name: ID column to describe (e.g. advertiserID, appID)
    :param last_day: use clicks up to and including this day
    :param ID_describe_feature_names: demographic columns to count over
        (fixed: this parameter was misnamed in the original docstring)
    :param drop_na: drop the "unknown" (value 0) dummy column
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]
    user_info = pd.read_csv(raw_data_path + 'user.csv')

    user_info['age_cut'] = pd.cut(user_info['age'], bins=[-1, 0, 18, 25, 35, 45, 55, np.inf], labels=False)
    user_info.loc[user_info.education == 7, 'education'] = 6

    user_info['hometown_province'] = user_info['hometown'].apply(lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(lambda x: x // 100)

    for feature in tqdm(ID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_' + ID_name + '_user_clicks_' + feature + '_lastday' + str(last_day) + '.pkl'
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]], columns=[feature], prefix=prefix_name)
        if drop_na:
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, 'left', 'userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        ID_describe_feature = data[[ID_name] + dummy_features].groupby([ID_name], as_index=False).sum()
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)


def get_ConcatedTfidfVector_ID_user_clicks(ID_name, last_day, mode='local',
                                           concated_list=['age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby'],
                                           drop_na=False, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):
    """
    The default local mode works slightly better.
    Tested on advertiserID, camgaignID, adID, creativeID, appID, appCategory,
    cate_A, appPlatform, positionType: advertiserID works best, appID next,
    then appCategory; the others did not help.
    """
    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf_vec = TfidfTransformer(norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    concated_tfidf_vec = None

    for feature in tqdm(concated_list):
        feature_path = feature_data_path + 'CountVector_' + ID_name + '_user_clicks_' + feature + '_lastday' + str(last_day) + '.pkl'
        if drop_na:
            feature_path += '.no_na'
        if not os.path.exists(feature_path):
            # fixed: regenerate with the requested last_day instead of the default
            gen_CountVector_ID_user_clicks(ID_name, last_day)
        count_vec = load_pickle(feature_path)
        if mode == 'local':
            count_vec.set_index(ID_name, inplace=True)
            vec_columns = count_vec.columns
            local_tfidf_vec = tfidf_vec.fit_transform(count_vec).todense()
            local_tfidf_vec = pd.DataFrame(local_tfidf_vec, columns=vec_columns, index=count_vec.index).reset_index()
        elif mode == 'global':
            local_tfidf_vec = count_vec
        else:
            # fixed: unknown modes previously fell through to an undefined variable
            raise ValueError("mode must be 'local' or 'global'")

        if concated_tfidf_vec is None:
            concated_tfidf_vec = local_tfidf_vec
        else:
            concated_tfidf_vec = pd.merge(concated_tfidf_vec, local_tfidf_vec, 'left', ID_name)
    if mode == 'global':
        concated_tfidf_vec.set_index(ID_name, inplace=True)
        vec_columns = concated_tfidf_vec.columns
        global_concated_tfidf_vec = tfidf_vec.fit_transform(concated_tfidf_vec).todense()
        global_concated_tfidf_vec = pd.DataFrame(global_concated_tfidf_vec, columns=vec_columns,
                                                 index=concated_tfidf_vec.index)
        concated_tfidf_vec = global_concated_tfidf_vec.reset_index()
    return concated_tfidf_vec

if __name__ == '__main__':
    gen_CountVector_ID_user_clicks('advertiserID', 31)
    gen_CountVector_ID_user_clicks('appID', 31)
    gen_CountVector_ID_user_clicks('advertiserID', 27)
    gen_CountVector_ID_user_clicks('appID', 27)
    print('All done')
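
In miniature, the count-vector construction above is a dummy-encode followed by a groupby-sum; a toy example with made-up rows:

import pandas as pd
clicks = pd.DataFrame({'appID': [1, 1, 2], 'gender': [0, 1, 1]})
dummies = pd.get_dummies(clicks, columns=['gender'], prefix='appID_user_clicks_gender')
vec = dummies.groupby('appID', as_index=False).sum()
# appID 1 -> one click each from gender 0 and gender 1; appID 2 -> one click from gender 1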
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Gen_global_sum_counts.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Gen_global_sum_counts.py
@time: 2018/3/13 17:01
@desc: Global ID occurrence-count features
"""
import os
import pickle
import gc
import pandas as pd
import numpy as np

from tqdm import tqdm
from Ad_Utils import load_pickle, dump_pickle, raw_data_path, feature_data_path
from Feature_joint import addUserInfo, addTime, addAd, addPosition, addAppCategories


def gen_ID_global_sum_count(last_day=27, stats_features=['positionID', 'creativeID', 'appID', 'adID', 'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue  # fixed: the continue was commented out, so existing features were regenerated
        print('generating ' + feature_path)
        feature_count_sum = pd.DataFrame(data.groupby(feature).size()).reset_index().rename(
            columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)

def add_global_count_sum(data, last_day=27, stats_features=['positionID', 'creativeID', 'appID', 'adID', 'userID']):
    """
    Join the ID occurrence counts, keyed on each ID_name.
    """
    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(last_day) + '.pkl'
        if not os.path.exists(feature_path):
            # fixed: the first positional argument is last_day, not the feature list
            gen_ID_global_sum_count(last_day, [feature])
        feature_count_sum = load_pickle(feature_path)
        data = data.merge(feature_count_sum, 'left', [feature])
    return data

if __name__ == '__main__':
    gen_ID_global_sum_count(27)
    gen_ID_global_sum_count(31)
    print('all done')
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Gen_smooth_cvr.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Gen_smooth_cvr.py
@time: 2018/3/15 11:22
@desc: Ad conversion-rate features.
       Build conversion-rate features: compute single-feature and combined
       conversion rates globally and over sliding windows, and correct them
       with mean filling, hierarchical filling, Bayesian smoothing and
       Laplace smoothing.
"""

import numpy as np
import pandas as pd
import gc
import os

from Smooth import BayesianSmoothing
from tqdm import tqdm
from Ad_Utils import raw_data_path, feature_data_path, load_pickle, dump_pickle
from Feature_joint import addAd, addTime, addPosition


def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    '''
    For each day, compute the key's historical conversion rate on all clicks
    before that day, Laplace-smoothed with pseudo-count alpha.
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            # fixed: the denominator summed label_0 twice; total clicks are label_0 + label_1
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (dfCvr['label_0'] + dfCvr['label_1'] + alpha * 2)
            sub_data = pd.merge(data.loc[data.clickDay == day, ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']], 'left', on=[key, ])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']], feature_path)

def add_hist_cvr_smooth(data, key):
    hist_cvr_smooth = None
    for day in tqdm((data.clickTime // 1000000).unique()):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(day) + '.pkl'
        day_cvr_smooth = load_pickle(feature_path)
        if hist_cvr_smooth is None:
            hist_cvr_smooth = day_cvr_smooth
        else:
            hist_cvr_smooth = pd.concat([hist_cvr_smooth, day_cvr_smooth], axis=0)
    data = pd.merge(data, hist_cvr_smooth, 'left', ['clickDay', key])
    return data


def gen_positionID_cvr_smooth(test_day):
    '''
    Bayesian-smoothed positionID conversion rate on all clicks before test_day.
    '''
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(test_day) + '.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        data = load_pickle(raw_data_path + 'train.pkl')
        data = addTime(data)
        positionID_cvr = data[data.clickDay < test_day]
        I = positionID_cvr.groupby('positionID')['label'].size().reset_index()
        I.columns = ['positionID', 'I']
        C = positionID_cvr.groupby('positionID')['label'].sum().reset_index()
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        hyper = BayesianSmoothing(1, 1)
        hyper.update(positionID_cvr['I'].values, positionID_cvr['C'].values, 10000, 0.00000001)
        alpha = hyper.alpha
        beta = hyper.beta
        positionID_cvr['positionID_cvr_smooth'] = (positionID_cvr['C'] + alpha) / (positionID_cvr['I'] + alpha + beta)
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']], feature_path)

def add_smooth_pos_cvr(data, test_day):
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(test_day) + '.pkl'
    smooth_pos_cvr = load_pickle(feature_path)
    data = pd.merge(data, smooth_pos_cvr, 'left', 'positionID')
    return data

if __name__ == '__main__':
    gen_hist_cvr_smooth(23, 31, 'userID')
    gen_hist_cvr_smooth(23, 31, 'creativeID')
    gen_hist_cvr_smooth(23, 31, 'adID')
    gen_hist_cvr_smooth(23, 31, 'appID')

    gen_positionID_cvr_smooth(27)
    gen_positionID_cvr_smooth(31)
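
Both smoothing schemes in this file reduce to simple closed forms: an ID with C conversions out of I clicks gets (C + alpha) / (I + alpha + beta) under the Beta prior, and (C + alpha) / (I + 2*alpha) under the Laplace-style correction. A tiny numeric check with illustrative alpha, beta values (the Bayesian ones are actually fitted by fixed-point iteration in Smooth.py):

alpha, beta = 1.0, 1.0
I, C = 10, 1
print((C + alpha) / (I + alpha + beta))   # 0.1667, pulled toward the prior mean 0.5
print((C + 0.25) / (I + 0.5))             # 0.1190, the Laplace variant with alpha = 0.25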
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Gen_tricks.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Gen_tricks.py
@time: 2018/3/14 9:03
@desc: Advertiser conversion-attribution analysis.
       Different advertisers attribute conversions differently (first click
       converts, last click converts, click at install time converts, ...);
       analyse this and build the corresponding descriptive features to
       improve prediction accuracy.
"""
import pandas as pd
import numpy as np
import gc
import os

from Smooth import BayesianSmoothing
from tqdm import tqdm
from Ad_Utils import raw_data_path, feature_data_path, load_pickle, dump_pickle
from Feature_joint import addAd, addPosition, addTime


def trick(row):
    # 0: single click; 1: first of several clicks; 2: last of several; 3: in between
    if row['ua_cnt'] <= 1:
        return 0
    elif row['ua_first'] > 0:
        return 1
    elif row['ua_last'] > 0:
        return 2
    else:
        return 3

def add_trick(df):
    # clicks per (user, advertiser) pair
    ua_cnt = df.groupby(['userID', 'advertiserID']).size().reset_index()
    ua_cnt.rename(columns={0: 'ua_cnt'}, inplace=True)
    ua_cnt = ua_cnt[['userID', 'advertiserID', 'ua_cnt']]
    df = pd.merge(df, ua_cnt, how='left', on=['userID', 'advertiserID'])

    # mark each pair's first and last click (joined back by row index)
    sorted_df = df.sort_values(by=['userID', 'advertiserID', 'clickTime'], ascending=True)
    first = sorted_df.drop_duplicates(['userID', 'advertiserID'])
    last = sorted_df.drop_duplicates(['userID', 'advertiserID'], keep='last')

    first['ua_first'] = 1
    first = first[['ua_first']]
    df = df.join(first)

    last['ua_last'] = 1
    last = last[['ua_last']]
    df = df.join(last)

    df['trick'] = df.apply(trick, axis=1)
    return df

def add_diff(df):
    # time since the pair's first click and until its last click
    sorted_df = df.sort_values(by=['userID', 'advertiserID', 'clickTime'], ascending=True)
    first = sorted_df.groupby(['userID', 'advertiserID'])['clickTime'].first().reset_index()
    first.rename(columns={'clickTime': 'first_diff'}, inplace=True)
    last = sorted_df.groupby(['userID', 'advertiserID'])['clickTime'].last().reset_index()
    last.rename(columns={'clickTime': 'last_diff'}, inplace=True)
    df = pd.merge(df, first, 'left', on=['userID', 'advertiserID'])
    df = pd.merge(df, last, 'left', on=['userID', 'advertiserID'])
    df['first_diff'] = df['clickTime'] - df['first_diff']
    df['last_diff'] = df['last_diff'] - df['clickTime']
    return df

def add_install2click(df, i, actions):
    # time from a past install of the same app to this click
    install2click = actions[actions.installTime < i * 1000000]
    df = pd.merge(df, install2click, 'left', ['userID', 'appID'])
    df['install2click'] = df['clickTime'] - df['installTime']
    return df

def gen_tricks(start_day, end_day):
    '''
    Generate trick, first_diff, last_diff and install2click per day;
    joined back on global_index.
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)

    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(df[['global_index', 'trick', 'first_diff', 'last_diff', 'install2click']], feature_path)

def add_tricks(data):
    '''
    Join the per-day trick features back onto data via global_index.
    '''
    tricks = None
    for day in tqdm((data.clickTime // 1000000).unique()):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if not os.path.exists(feature_path):
            gen_tricks(day, day)
        day_tricks = load_pickle(feature_path)
        if tricks is None:
            tricks = day_tricks
        else:
            tricks = pd.concat([tricks, day_tricks], axis=0)
    data = pd.merge(data, tricks, 'left', 'global_index')
    return data

if __name__ == '__main__':
    gen_tricks(23, 31)
    print('All done')
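
Concretely, the trick label marks where a click sits within one (userID, advertiserID) sequence; a toy illustration with made-up rows:

import pandas as pd
df = pd.DataFrame({'userID': [1, 1, 1, 2],
                   'advertiserID': [9, 9, 9, 9],
                   'clickTime': [17010000, 17020000, 17030000, 17010000]})
df = add_trick(df)
# user 1 has three clicks -> trick 1 (first), 3 (middle), 2 (last); user 2's single click -> trick 0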
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Gen_tricks_final.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Gen_tricks_final.py
@time: 2018/3/23 9:10
@desc: Global (whole-period) click tricks
"""

import os
import gc  # fixed: gc was used below but never imported
import pandas as pd
import numpy as np
import scipy as sps

from tqdm import tqdm
from Feature_joint import addTime
from Ad_Utils import raw_data_path, feature_data_path


def generate_click_trick():
    feature_path = feature_data_path + 'global_tricks.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        train = pd.read_pickle(raw_data_path + 'train.pkl')
        test = pd.read_pickle(raw_data_path + 'test.pkl')
        df = train.append(test)
        df = df[['global_index', 'creativeID', 'userID', 'label', 'clickTime']]
        del train, test
        df = addTime(df)
        gc.collect()
        # clicks per (user, creative) pair over the whole period
        uct_cnt = df.groupby(['userID', 'creativeID']).size().reset_index()
        uct_cnt.rename(columns={0: 'global_uct_cnt'}, inplace=True)
        df = pd.merge(df, uct_cnt, how='left', on=['userID', 'creativeID'])

        # mark each user's overall first and last click
        df_1 = df.sort_values(by=['userID', 'clickTime'], ascending=True)
        first = df_1.drop_duplicates('userID')
        first['global_first'] = 1
        first = first[['userID', 'clickTime', 'global_first']]
        df = pd.merge(df, first, how='left', on=['userID', 'clickTime'])

        df_2 = df.sort_values(by=['userID', 'clickTime'], ascending=False)
        last = df_2.drop_duplicates('userID')
        last['global_last'] = 1
        last = last[['userID', 'clickTime', 'global_last']]
        df = pd.merge(df, last, how='left', on=['userID', 'clickTime'])
        pd.to_pickle(df[['clickDay', 'global_uct_cnt', 'global_first', 'global_last']], feature_path)

def add_click_trick(data, start_day, end_day):
    feature_path = feature_data_path + 'global_tricks.pkl'
    feature_names = ['global_uct_cnt', 'global_first', 'global_last']
    trick_final = pd.read_pickle(feature_path)
    trick_final = trick_final.loc[(trick_final.clickDay >= start_day) & (trick_final.clickDay <= end_day), feature_names]
    # rows are aligned positionally; relies on data covering exactly the same day range
    trick_final.index = data.index
    data = pd.concat([data, trick_final[feature_names]], axis=1)
    # data = pd.merge(data, trick_final, 'left', 'global_index')
    return data

if __name__ == '__main__':
    generate_click_trick()
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Preprocess_Data.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Preprocess_Data.py
@time: 2018/3/7 16:10
@desc: Data preprocessing (script version of notebook/_1_preprocess_data.ipynb)
"""

import os
import pickle
import pandas as pd
import numpy as np
from Ad_Utils import raw_data_path, feature_data_path, result_path, cache_pkl_path, dump_pickle, load_pickle

def gen_global_index():
    # number all train + test rows consecutively so row-level features can be joined back
    train = pd.read_csv(raw_data_path + 'train.csv')
    test = pd.read_csv(raw_data_path + 'test.csv')
    all_data = train.append(test)
    all_data['global_index'] = np.arange(0, all_data.shape[0])
    train = all_data.iloc[0:train.shape[0], :]
    test = all_data.iloc[train.shape[0]:, :]
    dump_pickle(train, raw_data_path + 'train.pkl')
    dump_pickle(test, raw_data_path + 'test.pkl')

def csv_pkl(csv_name_without_suffix, protocol=None):
    # cache a raw csv as a pickle
    pkl_path = raw_data_path + csv_name_without_suffix + '.pkl'
    if not os.path.exists(pkl_path):
        print('generating ' + pkl_path)
        data = pd.read_csv(raw_data_path + csv_name_without_suffix + '.csv')
        dump_pickle(data, pkl_path, protocol=protocol)
    else:
        print('found ' + pkl_path)

def gen_demo_result():
    test = pd.read_csv(raw_data_path + 'test.csv')
    test = test[['instanceID', 'label']]
    test.rename(columns={'label': 'prob'}, inplace=True)
    if not os.path.exists(result_path):
        os.mkdir(result_path)
    test.to_csv(result_path + 'demo_result.csv', index=False)

if __name__ == '__main__':
    gen_global_index()
    train = load_pickle(raw_data_path + 'train.pkl')
    train = train[train.clickTime >= 17000000]  # drop day-16 data
    dump_pickle(train, raw_data_path + 'train.pkl')

    csv_pkl('ad')
    csv_pkl('position')
    csv_pkl('app_categories')
    csv_pkl('test')
    csv_pkl('user_app_actions')
    csv_pkl('user')
    csv_pkl('user_installedapps', protocol=4)

    gen_demo_result()

    if not os.path.exists(feature_data_path):
        os.mkdir(feature_data_path)
    if not os.path.exists(cache_pkl_path):
        os.mkdir(cache_pkl_path)
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/Smooth.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Smooth.py
@time: 2018/3/14 9:42
@desc: Bayesian smoothing. Reference blog (in Chinese):
       http://blog.csdn.net/mytestmy/article/details/19088519
"""

'''
The core idea is to add fairly large pseudo-counts to both the numerator
and the denominator of the conversion rate.
'''

import numpy as np
import random
import scipy.special as special  # digamma etc.

from tqdm import tqdm

np.random.seed(0)

class BayesianSmoothing(object):
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def sample(self, alpha, beta, num, imp_upperbound):
        # draw synthetic (impressions, clicks) pairs from a Beta click-rate distribution
        sample = np.random.beta(alpha, beta, num)
        I = []
        C = []
        for clk_rt in sample:
            # imp = random.random() * imp_upperbound
            imp = imp_upperbound
            clk = imp * clk_rt
            I.append(imp)
            C.append(clk)
        return I, C

    def update(self, imps, clks, iter_num, epsilon):
        for i in tqdm(range(iter_num)):
            new_alpha, new_beta = self.__fixed_point_iteration(imps, clks, self.alpha, self.beta)
            # fixed: the convergence test compared new_alpha against the old
            # beta and new_beta against the old alpha
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            print(new_alpha, new_beta, i)
            self.alpha = new_alpha
            self.beta = new_beta

    def __fixed_point_iteration(self, imps, clks, alpha, beta):
        '''
        One of several ways to estimate the parameters: fixed-point iteration.
        Build the likelihood, then:
        1) start from an initial parameter value;
        2) at the current value, construct a tight lower bound of the likelihood
           whose maximiser has a closed form, and take that maximiser as the
           new estimate;
        3) repeat step 2 until convergence, which gives a stationary point of
           the likelihood.
        The idea behind fixed-point iteration is similar to EM.
        '''
        numerator_alpha = 0.0
        numerator_beta = 0.0
        denominator = 0.0

        for i in range(len(imps)):
            numerator_alpha += (special.digamma(clks[i] + alpha) - special.digamma(alpha))
            numerator_beta += (special.digamma(imps[i] + beta) - special.digamma(beta))
            denominator += (special.digamma(imps[i] + alpha + beta) - special.digamma(alpha + beta))
        return alpha * (numerator_alpha / denominator), beta * (numerator_beta / denominator)

def test():
    bs = BayesianSmoothing(1, 1)
    I, C = bs.sample(500, 500, 1000, 1000)
    print(I, C)
    bs.update(I, C, 1000, 0.0000000001)
    print(bs.alpha, bs.beta)


if __name__ == '__main__':
    pass
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/__init__.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: __init__.py
@time: 2018/3/7 16:02
@desc:
"""

if __name__ == '__main__':
    pass
--------------------------------------------------------------------------------
/Tencent_Social_Ads/src/ffm.py:
--------------------------------------------------------------------------------
import subprocess, multiprocessing
import os, time
import pandas as pd
import numpy as np

class FFM:
    """libffm-1.21 Python wrapper for libffm-format data

    :Args:
        - reg_lambda: float, default: 2e-5
            regularization parameter
        - factor: int, default: 4
            number of latent factors
        - iteration: int, default: 15
        - learning_rate: float, default: 0.2
        - n_jobs: int, default: 1
            number of parallel threads
        - verbose: int, default: 1
        - norm: bool, default: True
            instance-wise normalization
    """
    def __init__(self, reg_lambda=0.00002, factor=4, iteration=15, learning_rate=0.2,
                 n_jobs=1, verbose=1, norm=True):
        if n_jobs <= 0 or n_jobs > multiprocessing.cpu_count():
            raise ValueError('n_jobs must be 1~{0}'.format(multiprocessing.cpu_count()))

        self.reg_lambda = reg_lambda
        self.factor = factor
        self.iteration = iteration
        self.learning_rate = learning_rate
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.norm = norm

        self.cmd = ''
        self.output_name = 'ffm_result' + str(int(time.time()))  # temp predict-result file

    def fit(self, train_ffm_path, valid_ffm_path=None, model_path=None, auto_stop=False):
        """Train the FFM model on ffm-format data.

        :Args:
            - train_ffm_path: str
            - valid_ffm_path: str, default: None
            - model_path: str, default: None
            - auto_stop: bool, default: False
                stop at the iteration that achieves the best validation loss
        """
        if not os.path.exists(train_ffm_path):
            raise FileNotFoundError("file '{0}' not exists".format(train_ffm_path))
        self.train_ffm_path = train_ffm_path
        self.valid_ffm_path = valid_ffm_path
        self.model_path = None
        self.auto_stop = auto_stop

        cmd = 'ffm-train -l {l} -k {k} -t {t} -r {r} -s {s}'\
            .format(l=self.reg_lambda, k=self.factor, t=self.iteration, r=self.learning_rate, s=self.n_jobs)
        if self.valid_ffm_path is not None:
            cmd += ' -p {p}'.format(p=self.valid_ffm_path)

        if self.verbose == 0:
            cmd += ' --quiet'
        if not self.norm:
            cmd += ' --no-norm'

        if self.auto_stop:
            if self.valid_ffm_path is None:
                raise ValueError('Must specify valid_ffm_path when auto_stop = True')
            cmd += ' --auto-stop'
        cmd += ' {p}'.format(p=self.train_ffm_path)
        if not model_path is None:
            cmd += ' {p}'.format(p=model_path)
            self.model_path = model_path
        self.cmd = cmd
        print('Sending command...')
        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        while True:
            output = str(popen.stdout.readline(), encoding='utf-8').strip('\n')
            if output.strip() == '':
                print('FFM training done')
                break
            print(output)

    def predict(self, test_ffm_path, model_path=None):
        """Predict and return the probability of the positive class.

        :Args:
            - test_ffm_path: str
            - model_path: str, default: None
        :returns:
            - pred_prob: np.array
        """
        cmd = "ffm-predict {t}".format(t=test_ffm_path)
        if model_path is None and self.model_path is None:
            raise ValueError('Must specify model_path')
        elif model_path is not None:
            self.model_path = model_path
        cmd += " {0} {1}".format(self.model_path, self.output_name)
        self.cmd = cmd
        print('Sending command...')
        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        while True:
            output = str(popen.stdout.readline(), encoding='utf-8').strip('\n')
            if output.strip() == '':
                print('FFM predicting done')
                break
            print(output)

        ans = pd.read_csv(self.output_name, names=['prob'])
        os.remove(self.output_name)
        return ans.prob.values
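
A usage sketch for the wrapper, assuming the `ffm-train`/`ffm-predict` binaries are on the PATH and the libffm-format files already exist (the paths are placeholders):

model = FFM(reg_lambda=2e-5, factor=4, iteration=15, learning_rate=0.2, n_jobs=4)
model.fit('train.ffm', valid_ffm_path='valid.ffm', model_path='ffm.model', auto_stop=True)
pred_prob = model.predict('test.ffm', model_path='ffm.model')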
--------------------------------------------------------------------------------
/Zhihuijiaotong/code/Related_lagging.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Related_lagging.py
@time: 2018/3/1 9:43
@desc: Lagging time features from upstream/downstream links; a tuning device
"""

import pandas as pd
import numpy as np
import multiprocessing
from joblib import Parallel, delayed

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

if __name__ == '__main__':
    df = pd.read_csv('data\\training.txt', delimiter=';', parse_dates=['time_interval_begin'], dtype={'link_ID': object})
    link_tops = pd.read_csv('E:\\dataset\\data\\gy_contest_link_top.txt', delimiter=';', dtype={'link_ID': object})
    link_tops.fillna(' ', inplace=True)
    # split the '#'-separated up/downstream link lists into separate columns
    in_links_df = pd.DataFrame(link_tops['in_links'].str.split('#').tolist(),
                               columns=['in_link1', 'in_link2', 'in_link3', 'in_link4'])
    out_links_df = pd.DataFrame(link_tops['out_links'].str.split('#').tolist(),
                                columns=['out_link1', 'out_link2', 'out_link3', 'out_link4'])

    link_tops = pd.merge(link_tops, in_links_df, left_index=True, right_index=True)
    link_tops = pd.merge(link_tops, out_links_df, left_index=True, right_index=True)
    link_tops = link_tops.drop(['in_links', 'out_links'], axis=1)
    link_tops = link_tops.fillna(np.nan)
    link_tops = link_tops.replace(r'\s+', np.nan, regex=True)

    df = pd.merge(df, link_tops, on='link_ID', how='left')

    def applyParallel(dfGrouped, func):
        retLst = Parallel(n_jobs=multiprocessing.cpu_count())(
            delayed(func, check_pickle=False)(group) for name, group in dfGrouped)
        return pd.concat(retLst)

    def related_lagging(group):
        # one group = one time interval; look up each neighbouring link's 1-step lag
        # fixed: tmp was built from a 'lagging' column but indexed with 'lagging1';
        # use 'lagging1' consistently (adjust if the training file names the lag
        # column differently)
        tmp = group[['link_ID', 'lagging1']].copy().set_index('link_ID')
        link_cols = ['in_link1', 'in_link2', 'in_link3', 'in_link4',
                     'out_link1', 'out_link2', 'out_link3', 'out_link4']
        for index, row in group.iterrows():
            for col in link_cols:
                if str(row[col]) != 'nan':
                    group.loc[index, col + '_lagging'] = tmp.loc[row[col]]['lagging1']
        # print(group.index.values[0])
        return group

    df = applyParallel(df.groupby(df['time_interval_begin']), related_lagging)

    df['in_link_mean'] = df[['in_link1_lagging', 'in_link2_lagging', 'in_link3_lagging', 'in_link4_lagging']].mean(axis=1)
    df['out_link_mean'] = df[['out_link1_lagging', 'out_link2_lagging', 'out_link3_lagging', 'out_link4_lagging']].mean(axis=1)
    df['in_link_mean'].fillna(3, inplace=True)
    df['out_link_mean'].fillna(3, inplace=True)
    df = df.drop(['in_link1', 'in_link2', 'in_link3', 'in_link4',
                  'out_link1', 'out_link2', 'out_link3', 'out_link4'], axis=1)
    df = df.drop(['in_link1_lagging', 'in_link2_lagging', 'in_link3_lagging', 'in_link4_lagging',
                  'out_link1_lagging', 'out_link2_lagging', 'out_link3_lagging', 'out_link4_lagging'], axis=1)
    df.to_csv('data/training1.txt', header=True, index=None, sep=';', mode='w')
--------------------------------------------------------------------------------
/Zhihuijiaotong/code/Utils.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: Utils.py
@time: 2018/2/27 14:19
@desc: Data-processing utilities for the traffic-prediction task
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def bucket_data(lines):
    # group samples by their time-series id (second-to-last column)
    bucket = {}
    for line in lines:
        time_series = line[-2]
        bucket[time_series] = []
    for line in lines:
        time_series, y1 = line[-2:]
        line = np.delete(line, -2, axis=0)
        bucket[time_series].append(line)
    return bucket

def cross_valid(regressor, bucket, lagging):
    '''
    Rolling cross-validation: beyond time step 120, feed the model's own
    predictions back in as the lagging features.
    '''
    valid_loss = []
    # fixed: dict views are not indexable in Python 3
    last = [[] for i in range(len(bucket[list(bucket.keys())[0]]))]
    for time_series in sorted(bucket.keys(), key=float):
        if time_series >= 120:
            if int(time_series) in range(120, 120 + lagging, 2):
                last = np.concatenate((last, np.array(bucket[time_series], dtype=float)[:, -1].reshape(-1, 1)), axis=1)
            else:
                batch = np.array(bucket[time_series], dtype=float)
                y = batch[:, -1]
                batch = np.delete(batch, -1, axis=1)
                batch = np.concatenate((batch, last), axis=1)
                y_pre = regressor.predict(batch)
                last = np.delete(last, 0, axis=1)
                last = np.concatenate((last, y_pre.reshape(-1, 1)), axis=1)
                loss = np.mean(abs(np.expm1(y) - np.expm1(y_pre)) / np.expm1(y))
                valid_loss.append(loss)
    return np.mean(valid_loss)

def mape_in(y, d):
    '''
    xgboost-style eval metric: MAPE after inverting the log1p target transform.
    '''
    c = d.get_label()
    result = np.sum(np.abs(np.expm1(y) - np.expm1(c)) / np.expm1(c)) / len(c)
    return 'mape', result

def feature_vis(regressor, train_feature):
    importances = regressor.feature_importances_
    indices = np.argsort(importances)[::-1]  # descending by importance
    selected_features = [train_feature[e] for e in indices]
    plt.figure(figsize=(20, 10))
    plt.title("train_feature importances")
    plt.bar(range(len(train_feature)), importances[indices], color="r", align="center")
    plt.xticks(range(len(selected_features)), selected_features, rotation=70)
    plt.show()

def submission(train_feature, regressor, df, file1, file2, file3, file4):
    # start from the xx:58 intervals of the July test hours and roll forward
    test_df = df.loc[((df['time_interval_begin'].dt.year == 2017) & (df['time_interval_begin'].dt.month == 7)
                      & (df['time_interval_begin'].dt.hour.isin([7, 14, 17]))
                      # fixed typo: .minite -> .minute
                      & (df['time_interval_begin'].dt.minute == 58))].copy()
    test_df['lagging5'] = test_df['lagging4']
    test_df['lagging4'] = test_df['lagging3']
    test_df['lagging3'] = test_df['lagging2']
    test_df['lagging2'] = test_df['lagging1']
    test_df['lagging1'] = test_df['travel_time']

    with open(file1, 'w'):
        pass
    with open(file2, 'w'):
        pass
    with open(file3, 'w'):
        pass
    with open(file4, 'w'):
        pass

    for i in range(30):
        test_X = test_df[train_feature]
        y_prediction = regressor.predict(test_X.values)

        # shift the lag window and append the new prediction
        test_df['lagging5'] = test_df['lagging4']
        test_df['lagging4'] = test_df['lagging3']
        test_df['lagging3'] = test_df['lagging2']
        test_df['lagging2'] = test_df['lagging1']
        test_df['lagging1'] = y_prediction

        test_df['predicted'] = np.expm1(y_prediction)
        test_df['time_interval_begin'] = test_df['time_interval_begin'] + pd.DateOffset(minutes=2)
        test_df['time_interval'] = test_df['time_interval_begin'].map(
            lambda x: '[' + str(x) + ',' + str(x + pd.DateOffset(minutes=2)) + ')')
        test_df.time_interval = test_df.time_interval.astype(object)
        if i < 7:
            test_df[['link_ID', 'date', 'time_interval', 'predicted']].to_csv(file1, mode='a', header=False,
                                                                              index=False, sep=';')
        elif (7 <= i) and (i < 14):
            test_df[['link_ID', 'date', 'time_interval', 'predicted']].to_csv(file2, mode='a', header=False,
                                                                              index=False, sep=';')
        elif (14 <= i) and (i < 22):
            test_df[['link_ID', 'date', 'time_interval', 'predicted']].to_csv(file3, mode='a', header=False,
                                                                              index=False, sep=';')
        else:
            test_df[['link_ID', 'date', 'time_interval', 'predicted']].to_csv(file4, mode='a', header=False,
                                                                              index=False, sep=';')
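
The models behind these helpers train on log1p-transformed travel times and undo the transform with expm1 when scoring, which is why mape_in and cross_valid wrap everything in np.expm1. A sketch of how the metric plugs into xgboost (X_train, y_train, X_valid, y_valid and the params values are placeholders):

import xgboost as xgb
import numpy as np

params = {'objective': 'reg:linear', 'silent': 1}
dtrain = xgb.DMatrix(X_train, label=np.log1p(y_train))
dvalid = xgb.DMatrix(X_valid, label=np.log1p(y_valid))
model = xgb.train(params, dtrain, num_boost_round=200,
                  evals=[(dvalid, 'valid')], feval=mape_in)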
--------------------------------------------------------------------------------
/Zhihuijiaotong/code/__init__.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 _*-

"""
@version:
@author: CharlesXu
@license: Q_S_Y_Q
@file: __init__.py
@time: 2018/2/27 13:14
@desc:
"""

if __name__ == '__main__':
    pass
--------------------------------------------------------------------------------