├── .idea
│   ├── ActiveUserPrediction.iml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── blending
│   └── blending_v1.py
├── catboostpy
│   ├── catboost_test.py
│   ├── catboost_v7.py
│   ├── cb_v8.py
│   └── cb_v9.py
├── dataanalysispy
│   ├── data_analysis.py
│   ├── eda_v1.ipynb
│   └── get_global_file.py
├── dataprocesspy
│   ├── __pycache__
│   │   └── data_process_v7.cpython-36.pyc
│   ├── create_data.py
│   ├── create_feature_v3_nonp.py
│   ├── create_feature_v3_parallel.py
│   ├── data_process.py
│   ├── data_process_v9.py
│   ├── kuaishou_stats2.csv
│   └── ts_feature_calculators.py
├── featureselection
│   ├── feature_importance_mi.csv
│   ├── feature_importance_shap.csv
│   ├── feature_selection.py
│   └── keep_features.py
├── hardcodedpy
│   ├── hard_approach.py
│   ├── hardcode_approach.py
│   ├── hardcode_approach_v2.py
│   ├── hardcode_approach_v3.py
│   ├── merge_approach.py
│   └── new_merge.py
├── lgbpy
│   ├── kuaishou_stats.csv
│   ├── lgb_model.py
│   └── lgb_v16.py
├── lrpy
│   ├── lr_v1.py
│   └── lr_v2.py
├── model
│   └── engines.py
├── nnpy
│   ├── dnn.py
│   ├── f1_keras.py
│   ├── nn_model.py
│   ├── nn_v1.py
│   └── nn_v2.py
├── paper
│   ├── Modeling and Predicting the Active video-viewing time in a large-scale e-learning system.pdf
│   ├── The Prediction of Booking Destination on airbnb dataset.pdf
│   ├── Using Deep Learning to Predict Customer Churn in a mobile telecommunication newwork.pdf
│   ├── field-aware fatorization machine for CTR prediction.pdf
│   └── predicting airbnb user's desired travel destination.pdf
├── photos
│   ├── 16count.JPG
│   ├── 23count.JPG
│   ├── 23count3.JPG
│   ├── 24count.JPG
│   ├── 24count3.JPG
│   ├── count2.JPG
│   ├── describe.JPG
│   ├── outlier1.JPG
│   ├── registerday_count.JPG
│   ├── sample.JPG
│   └── value_count.JPG
├── quick_test.py
├── rfpy
│   └── rf_v1.py
├── rulepy
│   └── hardcode_approach.py
├── statsfile
│   ├── kuaishou_stats.csv
│   └── kuaishou_stats2.csv
├── svmpy
│   └── svm_v1.py
├── test.py
└── utilspy
    ├── calculate.py
    ├── create_data.py
    ├── kpca.py
    ├── kuaishou_stats2.csv
    ├── ts_feature_calculators.py
    ├── util_analysis.py
    ├── utils_feature_engineering.py
    ├── utils_misc.py
    ├── utils_models.py
    └── utils_plot.py

/.idea/ActiveUserPrediction.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
4 | ApexVCS
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/catboostpy/cb_v8.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 | from catboost import CatBoostClassifier
5 | from sklearn.decomposition import PCA
6 | from sklearn.metrics import classification_report
7 | 
8 | def predict(clf2, test_set, param, kpca):
9 | uid = pd.DataFrame()
10 | # 
test_set = processing(trainSpan=(1, 30), label=False) 11 | uid["user_id"] = test_set["user_id"] 12 | test_set = test_set.drop(labels=["user_id"], axis=1) 13 | test_set = kpca.transform(test_set.values) 14 | print("begin to make predictions") 15 | # res = clf2.predict_proba(test_set.values) 16 | res = clf2.predict_proba(test_set) 17 | uid["proba1"] = pd.Series(res[:, 1]) 18 | uid["score"] = uid.groupby(by=["user_id"])["proba1"].transform(lambda x: sum(x) / float(len(x))) 19 | uid.drop_duplicates(subset=["user_id"], inplace=True) 20 | uid.sort_values(by=["score"], axis=0, ascending=False, inplace=True) 21 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 22 | uid_file = "../result/uid/uid_cb_" + param + "_" + str_time + ".csv" 23 | uid.to_csv(uid_file, header=True, index=False) 24 | # active_users = uid.loc[uid["score"]>0.5]["user_id"].unique().tolist() 25 | active_users = uid["user_id"][:24500].unique().tolist() 26 | # print(len(active_users)) 27 | print(uid["score"].tolist()[24500]) 28 | # print(active_users) 29 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 30 | submission_file = "../result/622/submission_cb_" + param + "_" + str_time + ".csv" 31 | with open(submission_file, "a", newline="") as f: 32 | writer = csv.writer(f) 33 | for i in active_users: 34 | writer.writerow([i]) 35 | 36 | 37 | # using this module ,one needs to deconstruct some of the features in data_process 38 | def run(scheme_num=1, file_name="../data/data_v3/training_e"): 39 | train_set_ls = [] 40 | if scheme_num == 1: 41 | for i in [16, 17, 22, 23]: 42 | print("begin to load the dataset") 43 | file_name1 = file_name + "ld1-" + str(i) + ".csv" 44 | train_set_temp = pd.read_csv(file_name1, header=0, index_col=None) 45 | print(train_set_temp.describe()) 46 | train_set_ls.append(train_set_temp) 47 | elif scheme_num == 2: 48 | for i in [16, 23]: 49 | print("begin to load the dataset") 50 | file_name2 = file_name + "ld1-" + str(i) + ".csv" 51 | train_set_temp = pd.read_csv(file_name2, header=0, index_col=None) 52 | print(train_set_temp.describe()) 53 | train_set_ls.append(train_set_temp) 54 | elif scheme_num == 3: 55 | for i in [17,18, 19, 20, 21, 22, 23]: 56 | print("begin to load the dataset") 57 | file_name3 = file_name + "ld1-" + str(i) + ".csv" 58 | train_set_temp = pd.read_csv(file_name3, header=0, index_col=None) 59 | print(train_set_temp.describe()) 60 | train_set_ls.append(train_set_temp) 61 | val_file_name = file_name + "ld1-23.csv" 62 | val_set = pd.read_csv(val_file_name, header=0, index_col=None) 63 | print(val_set.describe()) 64 | train_set = pd.concat(train_set_ls, axis=0) 65 | ds = train_set.describe() 66 | print(ds) 67 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"])) 68 | 69 | print("begin to drop the duplicates") 70 | train_set.drop_duplicates(subset=keep_feature, inplace=True) 71 | val_set.drop_duplicates(subset=keep_feature, inplace=True) 72 | print(train_set.describe()) 73 | print(val_set.describe()) 74 | train_label = train_set["label"] 75 | val_label = val_set["label"] 76 | train_set = train_set.drop(labels=["label", "user_id"], axis=1) 77 | val_set = val_set.drop(labels=["label", "user_id"], axis=1) 78 | 79 | print("begin to standardization the data") 80 | for fea in keep_feature: 81 | if train_set[fea].var() < 0.000001 or val_set[fea].var() < 0.000001: 82 | train_set.drop(labels=[fea], axis=1, inplace=True) 83 | val_set.drop(labels=[fea], axis=1, inplace=True) 84 | else: 85 | train_set[fea] = (train_set[fea] - 
train_set[fea].min()) / (train_set[fea].max() - train_set[fea].min()) 86 | # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std()) 87 | val_set[fea] = (val_set[fea] - val_set[fea].min()) / (val_set[fea].max() - val_set[fea].min()) 88 | # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std()) 89 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"])) 90 | kpca = PCA(n_components=0.99, whiten=True) 91 | # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1) 92 | kpca.fit(train_set.values) 93 | train_set = kpca.transform(train_set.values) 94 | val_set = kpca.transform(val_set.values) 95 | # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_)) 96 | print("number of components {}".format(kpca.n_components_)) 97 | print("noise variance {}".format(kpca.noise_variance_)) 98 | print("the explained variance {}".format(kpca.explained_variance_)) 99 | print("the explained variance ratio {}".format(kpca.explained_variance_ratio_)) 100 | 101 | print("begin to make prediction with plain features and without tuning parameters") 102 | 103 | initial_params = { 104 | "colsample_bytree": 0.9956575704604527, 105 | "learning_rate": 0.03640520807213964, 106 | "max_bin": 210, 107 | # "max_depth":7, 108 | "min_child_samples": 80, 109 | "min_child_weight": 0.23740522733908753, 110 | # "min_split_gain": 0.0004147079426427973, 111 | "n_estimators": 266, 112 | "num_leaves": 12, 113 | "reg_alpha": 271.01549892268713, 114 | "reg_lambda": 0.0001118074055642654, 115 | # "scale_pos_weight": 0.9914246775102074, 116 | "subsample": 0.9090257022233618, 117 | "boosting_type": "dart", 118 | } 119 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns)) 120 | 121 | # best_f1 =0.0 122 | # best_params = {"n_estimators":800,"num_leaves":6} 123 | # for n_estimator in [400,600,800]: 124 | # for num_leave in [4,6,8]: 125 | # print({"n_estimators":n_estimator,"num_leaves":num_leave,"boosting_type":"dart"}) 126 | # clf1 = LGBMClassifier(n_estimators=n_estimator, num_leaves=num_leave, boosting_type="dart") 127 | # clf1.fit(train_set.values, train_label.values) 128 | # print("load the test dataset") 129 | # yhat = clf1.predict(val_set.values) 130 | # print(classification_report(y_pred=yhat, y_true=val_label.values,digits=4)) 131 | # f1 = f1_score(y_pred=yhat, y_true=val_label.values) 132 | # if best_f10.5]["user_id"].unique().tolist() 27 | active_users = uid["user_id"][:24500].unique().tolist() 28 | # print(len(active_users)) 29 | print(uid["score"].tolist()[24500]) 30 | # print(active_users) 31 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 32 | submission_file = "../result/622/submission_cb_" + param + "_" + str_time + ".csv" 33 | with open(submission_file, "a", newline="") as f: 34 | writer = csv.writer(f) 35 | for i in active_users: 36 | writer.writerow([i]) 37 | 38 | 39 | # using this module ,one needs to deconstruct some of the features in data_process 40 | def run(scheme_num=1, file_name="../data/data_v3/training_e"): 41 | train_set_ls = [] 42 | if scheme_num == 1: 43 | for i in [16, 17, 22, 23]: 44 | print("begin to load the dataset") 45 | file_name1 = file_name + "ld1-" + str(i) + ".csv" 46 | train_set_temp = pd.read_csv(file_name1, header=0, index_col=None) 47 | print(train_set_temp.describe()) 48 | train_set_ls.append(train_set_temp) 49 | elif scheme_num == 2: 50 | for i in [16, 23]: 51 | print("begin to load the dataset") 
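# ---------------------------------------------------------------------------
# Illustrative aside (not part of cb_v9.py): cb_v8.run() and cb_v9.run() both
# min-max scale each feature, drop near-constant columns (variance < 1e-6) and
# then apply PCA(n_components=0.99, whiten=True). A minimal sklearn sketch of
# that same preprocessing on made-up data; note the scripts above rescale the
# train and validation frames independently, whereas the pipeline below is
# fitted on the training window only and reused, which keeps both windows on a
# consistent scale.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

X_train = pd.DataFrame(np.random.rand(200, 6), columns=[f"f{i}" for i in range(6)])  # toy features
X_val = pd.DataFrame(np.random.rand(50, 6), columns=[f"f{i}" for i in range(6)])

prep = Pipeline([
    ("drop_constant", VarianceThreshold(threshold=1e-6)),   # mirrors the var() < 0.000001 check
    ("minmax", MinMaxScaler()),                              # mirrors (x - min) / (max - min)
    ("pca", PCA(n_components=0.99, whiten=True)),            # keep 99% of the explained variance
])
Z_train = prep.fit_transform(X_train)
Z_val = prep.transform(X_val)
print(Z_train.shape, Z_val.shape)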
52 | file_name2 = file_name + "ld1-" + str(i) + ".csv" 53 | train_set_temp = pd.read_csv(file_name2, header=0, index_col=None) 54 | print(train_set_temp.describe()) 55 | train_set_ls.append(train_set_temp) 56 | elif scheme_num == 3: 57 | for i in [18, 19, 20, 21, 22, 23]: 58 | print("begin to load the dataset") 59 | file_name3 = file_name + "ld1-" + str(i) + ".csv" 60 | train_set_temp = pd.read_csv(file_name3, header=0, index_col=None) 61 | print(train_set_temp.describe()) 62 | train_set_ls.append(train_set_temp) 63 | val_file_name = file_name + "ld1-23.csv" 64 | val_set = pd.read_csv(val_file_name, header=0, index_col=None) 65 | print(val_set.describe()) 66 | train_set = pd.concat(train_set_ls, axis=0) 67 | ds = train_set.describe() 68 | print(ds) 69 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"])) 70 | 71 | print("begin to drop the duplicates") 72 | train_set.drop_duplicates(subset=keep_feature, inplace=True) 73 | val_set.drop_duplicates(subset=keep_feature, inplace=True) 74 | print(train_set.describe()) 75 | print(val_set.describe()) 76 | train_label = train_set["label"] 77 | val_label = val_set["label"] 78 | train_set = train_set.drop(labels=["label", "user_id"], axis=1) 79 | val_set = val_set.drop(labels=["label", "user_id"], axis=1) 80 | 81 | print("begin to standardization the data") 82 | for fea in keep_feature: 83 | if train_set[fea].var() < 0.000001 or val_set[fea].var() < 0.000001: 84 | train_set.drop(labels=[fea], axis=1, inplace=True) 85 | val_set.drop(labels=[fea], axis=1, inplace=True) 86 | else: 87 | train_set[fea] = (train_set[fea] - train_set[fea].min()) / (train_set[fea].max() - train_set[fea].min()) 88 | # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std()) 89 | val_set[fea] = (val_set[fea] - val_set[fea].min()) / (val_set[fea].max() - val_set[fea].min()) 90 | # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std()) 91 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"])) 92 | kpca = PCA(n_components=0.99, whiten=True) 93 | # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1) 94 | kpca.fit(train_set.values) 95 | train_set = kpca.transform(train_set.values) 96 | val_set = kpca.transform(val_set.values) 97 | # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_)) 98 | print("number of components {}".format(kpca.n_components_)) 99 | print("noise variance {}".format(kpca.noise_variance_)) 100 | print("the explained variance {}".format(kpca.explained_variance_)) 101 | print("the explained variance ratio {}".format(kpca.explained_variance_ratio_)) 102 | 103 | print("begin to make prediction with plain features and without tuning parameters") 104 | 105 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns)) 106 | 107 | # best_f1 =0.0 108 | # best_params = {"n_estimators":800,"num_leaves":6} 109 | # for n_estimator in [400,600,800]: 110 | # for num_leave in [4,6,8]: 111 | # print({"n_estimators":n_estimator,"num_leaves":num_leave,"boosting_type":"dart"}) 112 | # clf1 = LGBMClassifier(n_estimators=n_estimator, num_leaves=num_leave, boosting_type="dart") 113 | # clf1.fit(train_set.values, train_label.values) 114 | # print("load the test dataset") 115 | # yhat = clf1.predict(val_set.values) 116 | # print(classification_report(y_pred=yhat, y_true=val_label.values,digits=4)) 117 | # f1 = f1_score(y_pred=yhat, y_true=val_label.values) 118 | # if 
best_f1=trainSpan[0])&(df_user_register["register_day"]<=trainSpan[1])] 56 | 57 | df_user_register_train["register_day_rate"] = df_user_register_train.groupby(by=["register_day"])["register_day"].transform("count") 58 | df_user_register_train["register_type_rate"] = df_user_register_train.groupby(by=["register_type"])["register_type"].transform("count") 59 | df_user_register_train["register_type_device"] = df_user_register_train.groupby(by=["register_type"])["device_type"].transform(lambda x: x.nunique()) 60 | df_user_register_train["device_type_rate"] = df_user_register_train.groupby(by=["device_type"])["device_type"].transform("count") 61 | df_user_register_train["device_type_register"] = df_user_register_train.groupby(by=["device_type"])["register_type"].transform(lambda x: x.nunique()) 62 | 63 | df_user_register = df_user_register_train.drop(labels=["register_type","device_type"],axis=1) 64 | 65 | print("get users from app launch log") 66 | # app_launch_log = ["user_id","app_launch_day"] 67 | dtype_app_launch = {"user_id": np.uint32, "app_launch_day": np.uint8} 68 | df_app_launch = pd.read_csv("data/app_launch_log.csv", header=0, index_col=None, dtype=dtype_app_launch) 69 | def analysisTrans(): 70 | print("begin to load the trainset1") 71 | # train_set1 = processing(trainSpan=(1,10),label=True) 72 | # train_set1.to_csv("data/training_ld1-10.csv", header=True, index=False) 73 | train_set1 = pd.read_csv("data/training_ld1-10.csv", header=0, index_col=None) 74 | print(train_set1.describe()) 75 | print("begin to load the trainset2") 76 | # train_set2 = processing(trainSpan=(11,20),label=True) 77 | # train_set2.to_csv("data/training_ld11-20.csv", header=True, index=False) 78 | train_set2 = pd.read_csv("data/training_ld11-20.csv", header=0, index_col=None) 79 | print(train_set2.describe()) 80 | print("begin to merge the trainsets") 81 | train_set = pd.concat([train_set1,train_set2],axis=0) 82 | print(train_set.describe()) 83 | analysisTrans() 84 | # user_activity() 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /dataanalysispy/get_global_file.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | user_register_log = ["user_id", "register_day", "register_type", "device_type"] 5 | app_launch_log = ["user_id", "app_launch_day"] 6 | video_create_log = ["user_id", "video_create_day"] 7 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"] 8 | 9 | 10 | def get_global_file(): 11 | print("get users from user register log") 12 | # user_register_log = ["user_id", "register_day", "register_type", "device_type"] 13 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, 14 | "device_type": np.uint16} 15 | df_user_register = pd.read_csv("data/user_register_log.csv", header=0, index_col=None, dtype=dtype_user_register) 16 | # df_user_register.drop_duplicates(inplace=True) 17 | # df_user_register_train = df_user_register.loc[(df_user_register["register_day"]>=trainSpan[0])&(df_user_register["register_day"]<=trainSpan[1])] 18 | # these are global features 19 | df_user_register["register_day_rate"] = df_user_register.groupby(by=["register_day"])["register_day"].transform( 20 | "count") 21 | df_user_register["register_type_rate"] = df_user_register.groupby(by=["register_type"])["register_type"].transform( 22 | "count") 23 | 
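# ---------------------------------------------------------------------------
# Illustrative aside (not part of get_global_file.py): every feature above is
# built with the same groupby().transform() idiom, which attaches a group-level
# aggregate back onto each row so the frame keeps its original length. A tiny
# example on made-up data:
# ---------------------------------------------------------------------------
import pandas as pd

toy = pd.DataFrame({
    "register_type": [1, 1, 2, 2, 2],
    "device_type": [10, 11, 10, 10, 12],
})
# how many registrations share this row's register_type
toy["register_type_rate"] = toy.groupby(by=["register_type"])["register_type"].transform("count")
# how many distinct device_type values occur for this row's register_type
toy["register_type_device"] = toy.groupby(by=["register_type"])["device_type"].transform(lambda x: x.nunique())
print(toy)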
df_user_register["register_type_device"] = df_user_register.groupby(by=["register_type"])["device_type"].transform( 24 | lambda x: x.nunique()) 25 | df_user_register["device_type_rate"] = df_user_register.groupby(by=["device_type"])["device_type"].transform( 26 | "count") 27 | df_user_register["device_type_register"] = df_user_register.groupby(by=["device_type"])["register_type"].transform( 28 | lambda x: x.nunique()) 29 | df_user_register.to_csv("data/user_register_log_global.csv",header=True,index=False) 30 | 31 | user_register_feature = ["user_id", 32 | "register_day_rate", "register_type_rate", 33 | "register_type_device", "device_type_rate", "device_type_register" 34 | ] 35 | df_user_register_base = df_user_register[["user_id", "register_day"]].drop_duplicates() 36 | 37 | print("get users from app launch log") 38 | # app_launch_log = ["user_id","app_launch_day"] 39 | dtype_app_launch = {"user_id": np.uint32, "app_launch_day": np.uint8} 40 | df_app_launch = pd.read_csv("data/app_launch_log.csv", header=0, index_col=None, dtype=dtype_app_launch) 41 | df_app_launch = df_app_launch.merge(df_user_register_base, on=["user_id"], how="left").fillna(-1) 42 | 43 | df_app_launch["user_app_launch_rate_global"] = df_app_launch.groupby(by=["user_id"])[ 44 | "app_launch_day"].transform("count") 45 | # df_app_launch["user_app_launch_register_min_time_global"] = df_app_launch.groupby(by=["user_id"])[ 46 | # "app_launch_day"].transform(lambda x: min(x)) - \ 47 | # df_app_launch["register_day"] 48 | df_app_launch["user_app_launch_register_max_time_global"] = df_app_launch.groupby(by=["user_id"])[ 49 | "app_launch_day"].transform(lambda x: max(x)) - \ 50 | df_app_launch["register_day"] 51 | df_app_launch["user_app_launch_register_mean_time_global"] = df_app_launch.groupby(by=["user_id"])[ 52 | "app_launch_day"].transform( 53 | lambda x: (max(x) + min(x)) / 2) - df_app_launch["register_day"] 54 | df_app_launch["user_app_launch_gap_global"] = df_app_launch.groupby(by=["user_id"])[ 55 | "app_launch_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x)) > 1 else 0) 56 | df_app_launch["user_app_launch_var_global"] = df_app_launch.groupby(by=["user_id"])[ 57 | "app_launch_day"].transform(lambda x: np.var(x)) 58 | df_app_launch.to_csv("data/app_launch_log_global.csv", header=True, index=False) 59 | 60 | print("get users from video create") 61 | # video_create_log = ["user_id", "video_create_day"] 62 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8} 63 | df_video_create = pd.read_csv("data/video_create_log.csv", header=0, index_col=None, dtype=dtype_video_create) 64 | df_video_create = df_video_create.merge(df_user_register_base, on=["user_id"], how="left").fillna(-1) 65 | 66 | df_video_create["user_video_create_rate_global"] = df_video_create.groupby(by=["user_id"])[ 67 | "video_create_day"].transform("count") 68 | df_video_create["user_video_create_day_global"] = df_video_create.groupby(by=["user_id"])[ 69 | "video_create_day"].transform(lambda x: x.nunique()) 70 | df_video_create["user_video_create_frequency_global"] = df_video_create["user_video_create_rate_global"] / \ 71 | df_video_create["user_video_create_day_global"] 72 | 73 | df_video_create["user_video_create_register_min_time_global"] = df_video_create.groupby(by=["user_id"])[ 74 | "video_create_day"].transform( 75 | lambda x: min(x)) - \ 76 | df_video_create["register_day"] 77 | df_video_create["user_video_create_register_max_time_global"] = df_video_create.groupby(by=["user_id"])[ 78 | 
"video_create_day"].transform( 79 | lambda x: max(x)) - \ 80 | df_video_create["register_day"] 81 | df_video_create["user_video_create_register_mean_time_global"] = df_video_create.groupby(by=["user_id"])[ 82 | "video_create_day"].transform( 83 | lambda x: (max(x) + min(x)) / 2) - df_video_create["register_day"] 84 | # df_video_create["user_video_create_register_mean_time"] = df_video_create["video_create_day"]-df_video_create["register_day"] 85 | df_video_create["user_video_create_gap_global"] = df_video_create.groupby(by=["user_id"])[ 86 | "video_create_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x)) > 1 else 0) 87 | df_video_create["user_video_create_var_global"] = df_video_create.groupby(by=["user_id"])[ 88 | "video_create_day"].transform(lambda x: np.var(x)) 89 | df_video_create.to_csv("data/video_create_log_global.csv", header=True, index=False) 90 | 91 | print("get users from user activity log") 92 | # user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"] 93 | # usecols = ["user_id", "user_activity_day", "page","action_type"] 94 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32, 95 | "author_id": np.uint32, "action_type": np.uint8} 96 | df_user_activity = pd.read_csv("data/user_activity_log.csv", header=0, index_col=None, dtype=dtype_user_activity) 97 | df_user_activity = df_user_activity.merge(df_user_register_base, on=["user_id"], how="left").fillna(-1) 98 | # df_user_activity = df_user_activity.sample(n=50000) 99 | print("read , merge and sample over") 100 | # print(df_user_activity.describe()) 101 | # df_user_activity.drop_duplicates(inplace=True) 102 | # print(df_user_activity.describe()) 103 | df_user_activity["user_activity_rate_global"] = (df_user_activity.groupby(by=["user_id"])["user_id"].transform( 104 | "count")).astype(np.uint16) 105 | df_user_activity["user_activity_day_rate_global"] = (df_user_activity.groupby(by=["user_id"])[ 106 | "user_activity_day"].transform(lambda x: x.nunique())).astype(np.uint8) 107 | df_user_activity["user_activity_frequency_global"] = df_user_activity["user_activity_rate_global"]/df_user_activity["user_activity_day_rate_global"] 108 | df_user_activity["user_activity_gap_global"] = df_user_activity.groupby(by=["user_id"])[ 109 | "user_activity_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x)) > 1 else 0) 110 | df_user_activity["user_activity_var_global"] = df_user_activity.groupby(by=["user_id"])[ 111 | "user_activity_day"].transform(lambda x: np.var(x)) 112 | df_user_activity["user_activity_register_min_time_global"] = (df_user_activity.groupby(by=["user_id"])[ 113 | "user_activity_day"].transform(lambda x: min(x)) - \ 114 | df_user_activity["register_day"]).astype(np.uint8) 115 | df_user_activity["user_activity_register_max_time_global"] = (df_user_activity.groupby(by=["user_id"])[ 116 | "user_activity_day"].transform(lambda x: max(x)) - \ 117 | df_user_activity["register_day"]).astype(np.uint8) 118 | df_user_activity["user_activity_register_mean_time_global"] = df_user_activity.groupby(by=["user_id"])[ 119 | "user_activity_day"].transform( 120 | lambda x: (max(x) + min(x)) / 2) - df_user_activity["register_day"] 121 | print("groupby one columns ") 122 | df_user_activity["user_page_num_global"] = (df_user_activity.groupby(by=["user_id"])["page"].transform( 123 | lambda x: x.nunique())).astype(np.uint8) 124 | df_user_activity["user_video_num_global"] = 
(df_user_activity.groupby(by=["user_id"])["video_id"].transform( 125 | lambda x: x.nunique())).astype(np.uint16) 126 | df_user_activity["user_author_num_global"] = (df_user_activity.groupby(by=["user_id"])["author_id"].transform( 127 | lambda x: x.nunique())).astype(np.uint16) 128 | df_user_activity["user_action_type_num_global"] = (df_user_activity.groupby(by=["user_id"])[ 129 | "action_type"].transform(lambda x: x.nunique())).astype(np.uint8) 130 | print("groupby two columns ") 131 | # df_user_activity["user_author_video_num_global"] = (df_user_activity.groupby(by=["user_id", "author_id"])[ 132 | # "video_id"].transform( 133 | # lambda x: x.nunique())).astype(np.uint16) 134 | # print("1") 135 | # df_user_activity["user_video_action_type_num_global"] = (df_user_activity.groupby(by=["user_id", "video_id"])[ 136 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8) 137 | # print("2") 138 | # df_user_activity["user_author_action_type_num_global"] = (df_user_activity.groupby(by=["user_id", "author_id"])[ 139 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8) 140 | # print("3") 141 | # df_user_activity["user_page_action_type_num_global"] = (df_user_activity.groupby(by=["user_id", "page"])[ 142 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8) 143 | print("data process over") 144 | # df_user_activity["page_rate_global"] = (df_user_activity.groupby(by=["page"])["page"].transform("count")).astype(np.uint32) 145 | # df_user_activity["page_video_global"] = (df_user_activity.groupby(by=["page"])["video_id"].transform( 146 | # lambda x: x.nunique())).astype(np.uint32) 147 | # df_user_activity["page_author_global"] = (df_user_activity.groupby(by=["page"])["author_id"].transform( 148 | # lambda x: x.nunique())).astype(np.uint32) 149 | # df_user_activity["video_rate_global"] = (df_user_activity.groupby(by=["video_id"])["video_id"].transform( 150 | # "count")).astype(np.uint32) 151 | # df_user_activity["video_user_global"] = (df_user_activity.groupby(by=["video_id"])["user_id"].transform( 152 | # lambda x: x.nunique())).astype(np.uint16) 153 | # df_user_activity["video_action_type_global"] = (df_user_activity.groupby(by=["video_id"])[ 154 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8) 155 | # df_user_activity["author_rate_global"] = (df_user_activity.groupby(by=["video_id"])["author_id"].transform( 156 | # "count")).astype(np.uint32) 157 | # df_user_activity["author_user_global"] = (df_user_activity.groupby(by=["author_id"])["user_id"].transform( 158 | # lambda x: x.nunique())).astype(np.uint16) 159 | # df_user_activity["author_video_global"] = (df_user_activity.groupby(by=["author_id"])["video_id"].transform( 160 | # lambda x: x.nunique())).astype(np.uint16) 161 | # df_user_activity["author_action_type_global"] = (df_user_activity.groupby(by=["author_id"])[ 162 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8) 163 | # df_user_activity["action_type_rate_global"] = (df_user_activity.groupby(by=["action_type"])[ 164 | # "action_type"].transform("count")).astype(np.uint32) 165 | df_user_activity.to_csv("data/user_activity_log_global.csv", header=True, index=False) 166 | 167 | if __name__ == "__main__": 168 | get_global_file() 169 | -------------------------------------------------------------------------------- /dataprocesspy/__pycache__/data_process_v7.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/dataprocesspy/__pycache__/data_process_v7.cpython-36.pyc -------------------------------------------------------------------------------- /dataprocesspy/create_data.py: -------------------------------------------------------------------------------- 1 | from data_process_v7 import processing 2 | 3 | if __name__=="__main__": 4 | # print("begin to load the trainset2") 5 | # train_set2 = processing(trainSpan=(1, 15), label=True) 6 | # train_set2.to_csv("../data/data_v4/training_rld1-15.csv", header=True, index=False) 7 | # train_set2 = pd.read_csv("data/training_eld1-15.csv", header=0, index_col=None, usecols=use_feature) 8 | # print(train_set2.describe()) 9 | # print("begin to load the testset") 10 | # train_set52 = processing(trainSpan=(1, 30), label=False) 11 | # train_set52.to_csv("data/testing_eld1-30_r.csv", header=True, index=False) 12 | # train_set52 = pd.read_csv("data/training_eld1-23.csv", header=0, index_col=None, usecols=use_feature) 13 | # print(train_set52.describe()) 14 | # print("begin to load the trainset52") 15 | # train_set52 = processing(trainSpan=(1, 23), label=True) 16 | # train_set52.to_csv("data/training_rld1-23_r.csv", header=True, index=False) 17 | # # train_set52 = pd.read_csv("data/training_eld1-23.csv", header=0, index_col=None, usecols=use_feature) 18 | # print(train_set52.describe()) 19 | # print("begin to load the trainset51") 20 | # train_set51 = processing(trainSpan=(1, 22), label=True) 21 | # train_set51.to_csv("data/training_rld1-22.csv", header=True, index=False) 22 | # # train_set5 = pd.read_csv("data/training_eld1-22.csv", header=0, index_col=None, usecols=use_feature) 23 | # print(train_set51.describe()) 24 | print("begin to load the trainset5") 25 | train_set5 = processing(trainSpan=(1, 21), label=True) 26 | train_set5.to_csv("../data/data_v4/training_rld1-21.csv", header=True, index=False) 27 | # train_set5 = pd.read_csv("data/training_eld1-21.csv", header=0, index_col=None, usecols=use_feature) 28 | print(train_set5.describe()) 29 | print("begin to load the trainset41") 30 | train_set41 = processing(trainSpan=(1, 20), label=True) 31 | train_set41.to_csv("../data/data_v4/training_rld1-20.csv", header=True, index=False) 32 | # train_set41 = pd.read_csv("data/training_eld1-20.csv", header=0, index_col=None, usecols=use_feature) 33 | print(train_set41.describe()) 34 | print("begin to load the trainset4") 35 | train_set4 = processing(trainSpan=(1, 19), label=True) 36 | train_set4.to_csv("../data/data_v4/training_rld1-19.csv", header=True, index=False) 37 | # train_set4 = pd.read_csv("data/training_eld1-19.csv", header=0, index_col=None, usecols=use_feature) 38 | print(train_set4.describe()) 39 | 40 | print("begin to load the trainset21") 41 | train_set21 = processing(trainSpan=(1, 16), label=True) 42 | train_set21.to_csv("../data/data_v4/training_rld1-16.csv", header=True, index=False) 43 | # train_set21 = pd.read_csv("data/training_eld1-16.csv", header=0, index_col=None, usecols=use_feature) 44 | print(train_set21.describe()) 45 | print("begin to load the trainset3") 46 | train_set3 = processing(trainSpan=(1, 17), label=True) 47 | train_set3.to_csv("../data/data_v4/training_rld1-17.csv", header=True, index=False) 48 | # train_set3 = pd.read_csv("data/training_eld1-17.csv", header=0, index_col=None, usecols=use_feature) 49 | print(train_set3.describe()) 50 | print("begin to load the trainset31") 51 | train_set31 = processing(trainSpan=(1, 18), 
label=True) 52 | train_set31.to_csv("../data/data_v4/training_rld1-18.csv", header=True, index=False) 53 | # train_set3 = pd.read_csv("data/training_eld1-18.csv", header=0, index_col=None, usecols=use_feature) 54 | print(train_set31.describe()) 55 | # 56 | 57 | -------------------------------------------------------------------------------- /dataprocesspy/data_process.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | user_register_log = ["user_id","register_day","register_type","device_type"] 4 | app_launch_log = ["user_id","app_launch_day"] 5 | video_create_log = ["user_id","video_create_day"] 6 | user_activity_log = ["user_id","user_activity_day","page","video_id","author_id","action_type"] 7 | 8 | user_register_feature = ["user_id", 9 | "register_day_rate","register_type_rate", 10 | "register_type_device","device_type_rate","device_type_register"] 11 | app_launch_feature = ["user_id", 12 | "user_app_launch_rate","user_app_launch_gap"] 13 | video_create_feature = ["user_id", 14 | "user_video_create_rate","user_video_create_day","user_video_create_gap"] 15 | user_activity_feature = ["user_id", 16 | "user_activity_day","user_activity_day_rate","user_activity_gap", 17 | # "page_rate","page_action_type", 18 | # "video_id_rate","video_id_user","video_id_action_type", 19 | # "author_id_rate","author_id_user","author_id_video", 20 | # "action_type_rate","action_type_page" 21 | ] 22 | 23 | def processing(trainSpan=(1,23),label=True): 24 | if label: 25 | assert isinstance(trainSpan,tuple),"input parameter should be a tuple with two items (min,max)" 26 | assert trainSpan[0]>0 and trainSpan[0]<23 and trainSpan[1]>trainSpan[0] and trainSpan[1]<=23 27 | else: 28 | assert isinstance(trainSpan,tuple),"input parameter should be a tuple with two items (min,max)" 29 | assert trainSpan[0]>0 and trainSpan[0]<30 and trainSpan[1]>trainSpan[0] and trainSpan[1]<=30 30 | print("get users from user register log") 31 | # user_register_log = ["user_id", "register_day", "register_type", "device_type"] 32 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type":np.uint16} 33 | df_user_register = pd.read_csv("data/user_register_log.csv",header=0,index_col=None,dtype=dtype_user_register) 34 | # df_user_register.drop_duplicates(inplace=True) 35 | df_user_register_train = df_user_register.loc[(df_user_register["register_day"]>=trainSpan[0])&(df_user_register["register_day"]<=trainSpan[1])] 36 | 37 | df_user_register_train["register_day_rate"] = df_user_register_train.groupby(by=["register_day"])["register_day"].transform("count") 38 | df_user_register_train["register_type_rate"] = df_user_register_train.groupby(by=["register_type"])["register_type"].transform("count") 39 | df_user_register_train["register_type_device"] = df_user_register_train.groupby(by=["register_type"])["device_type"].transform(lambda x: x.nunique()) 40 | df_user_register_train["device_type_rate"] = df_user_register_train.groupby(by=["device_type"])["device_type"].transform("count") 41 | df_user_register_train["device_type_register"] = df_user_register_train.groupby(by=["device_type"])["register_type"].transform(lambda x: x.nunique()) 42 | 43 | df_user_register_train = df_user_register_train[user_register_feature].drop_duplicates() 44 | print(df_user_register_train.describe()) 45 | 46 | print("get users from app launch log") 47 | # app_launch_log = ["user_id","app_launch_day"] 48 | dtype_app_launch = 
{"user_id": np.uint32, "app_launch_day": np.uint8} 49 | df_app_launch = pd.read_csv("data/app_launch_log.csv", header=0, index_col=None, dtype=dtype_app_launch) 50 | # df_app_launch.drop_duplicates(inplace=True) 51 | df_app_launch_train = df_app_launch.loc[ 52 | (df_app_launch["app_launch_day"] >= trainSpan[0]) & (df_app_launch["app_launch_day"] <= trainSpan[1])] 53 | 54 | # print(df_app_launch_train.describe()) 55 | df_app_launch_train["user_app_launch_rate"] = df_app_launch_train.groupby(by=["user_id"])[ 56 | "app_launch_day"].transform("count") 57 | df_app_launch_train["user_app_launch_gap"] = df_app_launch_train.groupby(by=["user_id"])[ 58 | "app_launch_day"].transform(lambda x: (max(x) - min(x)) / (len(x) - 1) if len(set(x)) > 1 else 0) 59 | 60 | df_app_launch_train = df_app_launch_train[app_launch_feature].drop_duplicates() 61 | print(df_app_launch_train.describe()) 62 | 63 | print("get users from video create") 64 | # video_create_log = ["user_id", "video_create_day"] 65 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8} 66 | df_video_create = pd.read_csv("data/video_create_log.csv",header=0,index_col=None,dtype=dtype_video_create) 67 | # df_video_create.drop_duplicates(inplace=True) 68 | df_video_create_train = df_video_create.loc[ 69 | (df_video_create["video_create_day"] >= trainSpan[0]) & (df_video_create["video_create_day"] <= trainSpan[1])] 70 | 71 | df_video_create_train["user_video_create_rate"] = df_video_create_train.groupby(by=["user_id"])[ 72 | "video_create_day"].transform("count") 73 | df_video_create_train["user_video_create_day"] = df_video_create_train.groupby(by=["user_id"])[ 74 | "video_create_day"].transform(lambda x: x.nunique()) 75 | df_video_create_train["user_video_create_gap"] = df_video_create_train.groupby(by=["user_id"])[ 76 | "video_create_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x)) > 1 else 0) 77 | # print(df_video_create_train.describe()) 78 | df_video_create_train = df_video_create_train[video_create_feature].drop_duplicates() 79 | print(df_video_create_train.describe()) 80 | 81 | print("get users from user activity log") 82 | # user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"] 83 | # usecols = ["user_id", "user_activity_day", "page","action_type"] 84 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32, 85 | "author_id": np.uint32, "action_type": np.uint8} 86 | df_user_activity = pd.read_csv("data/user_activity_log.csv", header=0, index_col=None, dtype=dtype_user_activity) 87 | # print(df_user_activity.describe()) 88 | # df_user_activity.drop_duplicates(inplace=True) 89 | # print(df_user_activity.describe()) 90 | df_user_activity_train = df_user_activity.loc[ 91 | (df_user_activity["user_activity_day"] >= trainSpan[0]) & ( 92 | df_user_activity["user_activity_day"] <= trainSpan[1])] 93 | 94 | df_user_activity_train["user_activity_rate"] = df_user_activity_train.groupby(by=["user_id"])["user_id"].transform( 95 | "count") 96 | df_user_activity_train["user_activity_day_rate"] = df_user_activity_train.groupby(by=["user_id"])[ 97 | "user_activity_day"].transform(lambda x: x.nunique()) 98 | df_user_activity_train["user_activity_gap"] = df_user_activity_train.groupby(by=["user_id"])[ 99 | "user_activity_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x))>1 else 0) 100 | # df_user_activity_train["page_rate"] = 
df_user_activity_train.groupby(by=["page"])["page"].transform("count") 101 | # df_user_activity_train["page_action_type"] = df_user_activity_train.groupby(by=["page"])["action_type"].transform( 102 | # lambda x: x.nunique()) 103 | # df_user_activity_train["video_id_rate"] = df_user_activity_train.groupby(by=["video_id"])["video_id"].transform( 104 | # "count") 105 | # df_user_activity_train["video_id_user"] = df_user_activity_train.groupby(by=["video_id"])["user_id"].transform( 106 | # lambda x: x.nunique()) 107 | # df_user_activity_train["video_id_action_type"] = df_user_activity_train.groupby(by=["video_id"])[ 108 | # "action_type"].transform(lambda x: x.nunique()) 109 | # df_user_activity_train["author_id_rate"] = df_user_activity_train.groupby(by=["author_id"])["author_id"].transform( 110 | # "count") 111 | # df_user_activity_train["author_id_user"] = df_user_activity_train.groupby(by=["author_id"])["user_id"].transform( 112 | # lambda x: x.nunique()) 113 | # df_user_activity_train["author_id_video"] = df_user_activity_train.groupby(by=["author_id"])["video_id"].transform( 114 | # lambda x: x.nunique()) 115 | # df_user_activity_train["action_type_rate"] = df_user_activity_train.groupby(by=["action_type"])[ 116 | # "action_type"].transform("count") 117 | # df_user_activity_train["action_type_page"] = df_user_activity_train.groupby(by=["action_type"])["page"].transform( 118 | # lambda x: x.nunique()) 119 | df_user_activity_train = df_user_activity_train[user_activity_feature].drop_duplicates() 120 | print(df_user_activity_train.describe()) 121 | 122 | if label: 123 | active_user_register = (df_user_register.loc[(df_user_register["register_day"]>trainSpan[1])&(df_user_register["register_day"]<=(trainSpan[1]+7))]).user_id.unique().tolist() 124 | active_app_launch = (df_app_launch.loc[(df_app_launch["app_launch_day"] > trainSpan[1]) & (df_app_launch["app_launch_day"] <= (trainSpan[1] + 7))]).user_id.unique().tolist() 125 | active_video_create = (df_video_create.loc[(df_video_create["video_create_day"]>trainSpan[1])&(df_video_create["video_create_day"]<=(trainSpan[1]+7))]).user_id.unique().tolist() 126 | active_user_activity = (df_user_activity.loc[(df_user_activity["user_activity_day"] > trainSpan[1]) & (df_user_activity["user_activity_day"] <= (trainSpan[1] + 7))]).user_id.unique().tolist() 127 | active_user = list(set(active_user_register+active_app_launch+active_video_create+active_user_activity)) 128 | 129 | df_user_register_train["label"] = 0 130 | df_user_register_train.loc[df_user_register_train["user_id"].isin(active_user),"label"] = 1 131 | 132 | df_app_launch_train["label"] = 0 133 | df_app_launch_train.loc[df_app_launch_train["user_id"].isin(active_user),"label"] = 1 134 | 135 | df_video_create_train["label"] = 0 136 | df_video_create_train.loc[df_video_create_train["user_id"].isin(active_user),"label"] = 1 137 | 138 | df_user_activity_train["label"] = 0 139 | df_user_activity_train.loc[df_user_activity_train["user_id"].isin(active_user),"label"] = 1 140 | 141 | df_register_launch = df_user_register_train.merge(df_app_launch_train,how="left") 142 | # print(df_register_launch.describe()) 143 | df_register_launch_create = df_register_launch.merge(df_video_create_train,how="left") 144 | # print(df_register_launch_create.describe()) 145 | df_register_launch_create = df_register_launch_create.fillna(0) 146 | df_activity_register_launch_create = df_user_activity_train.merge(df_register_launch_create,how="left") 147 | df_activity_register_launch_create = 
df_activity_register_launch_create.fillna(0) 148 | print(df_activity_register_launch_create.describe()) 149 | return df_activity_register_launch_create 150 | -------------------------------------------------------------------------------- /hardcodedpy/hard_approach.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import pandas as pd 4 | import numpy as np 5 | user_register_log = ["user_id","register_day","register_type","device_type"] 6 | app_launch_log = ["user_id","app_launch_day"] 7 | video_create_log = ["user_id","video_create_day"] 8 | user_activity_log = ["user_id","user_activity_day","page","video_id","author_id","action_type"] 9 | 10 | 11 | def get_user_from_videoCreate(laterThanDay,videoCount): 12 | print("get users from video create") 13 | video_create_log = ["user_id", "video_create_day"] 14 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8} 15 | df_video_create = pd.read_table("data/A2/video_create_log.txt",header=None,names=video_create_log,index_col=None,dtype=dtype_video_create) 16 | latest_user = (df_video_create.loc[df_video_create["video_create_day"]>laterThanDay]).user_id.unique().tolist() 17 | print("get latest users") 18 | print(latest_user) 19 | print(len(latest_user)) 20 | df_video_create["videoCount"] = df_video_create.groupby(by=["user_id"])["video_create_day"].transform(lambda x: x.nunique()) 21 | frequent_user = (df_video_create.loc[df_video_create["videoCount"]>videoCount]).user_id.unique().tolist() 22 | print("get frequent users") 23 | print(frequent_user) 24 | print(len(frequent_user)) 25 | user_videoCreate = list(set(latest_user+frequent_user)) 26 | print(user_videoCreate) 27 | print(len(user_videoCreate)) 28 | return user_videoCreate 29 | # with open("result/submission.csv","a",newline="") as f: 30 | # writer = csv.writer(f) 31 | # for i in user_videoCreate: 32 | # writer.writerow([i]) 33 | # get_user_from_videoCreate(23,2) 34 | def get_user_from_appLaunch(laterThanDay,launchCount): 35 | print("get users from app launch log") 36 | app_launch_log = ["user_id","app_launch_day"] 37 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8} 38 | df_app_launch = pd.read_table("data/A2/app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch) 39 | latest_user = (df_app_launch.loc[df_app_launch["app_launch_day"]>laterThanDay]).user_id.unique().tolist() 40 | print("get latest users") 41 | print(latest_user) 42 | print(len(latest_user)) 43 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique()) 44 | frequent_user = (df_app_launch.loc[df_app_launch["launchCount"]>launchCount]).user_id.unique().tolist() 45 | print("get frequent users") 46 | print(frequent_user) 47 | print(len(frequent_user)) 48 | user_appLaunch = list(set(latest_user+frequent_user)) 49 | print("get merged users") 50 | print(user_appLaunch) 51 | print(len(user_appLaunch)) 52 | return user_appLaunch 53 | # with open("result/submission.csv","a",newline="") as f: 54 | # writer = csv.writer(f) 55 | # for i in user_appLaunch: 56 | # writer.writerow([i]) 57 | # get_user_from_appLaunch(27,4) 58 | def get_user_from_userRegister(laterThanDay): 59 | print("get users from user register log") 60 | user_register_log = ["user_id", "register_day", "register_type", "device_type"] 61 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": str} 62 | 
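# ---------------------------------------------------------------------------
# Illustrative aside (not part of hard_approach.py): the raw logs are headerless,
# tab-separated .txt files, so every loader passes explicit column names and
# narrow integer dtypes to keep memory down. A small hypothetical helper that
# captures the pattern (path and dtypes below are placeholders):
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd

def load_log(path, names, dtypes):
    # equivalent to the pd.read_table(...) calls used throughout this repo
    return pd.read_csv(path, sep="\t", header=None, names=names, index_col=None, dtype=dtypes)

# df_register = load_log("data/A2/user_register_log.txt",
#                        ["user_id", "register_day", "register_type", "device_type"],
#                        {"user_id": np.uint32, "register_day": np.uint8,
#                         "register_type": np.uint8, "device_type": np.uint16})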
df_user_register = pd.read_table("data/A2/user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register) 63 | latest_user = (df_user_register.loc[df_user_register["register_day"]>laterThanDay]).user_id.unique().tolist() 64 | print("get latest users") 65 | print(latest_user) 66 | print(len(latest_user)) 67 | return latest_user 68 | # get_user_from_userRegister(25) 69 | def get_user_from_userActivity(laterThanDay,dayCount,pageList,typeList): 70 | print("get users from user activity log") 71 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"] 72 | usecols = ["user_id", "user_activity_day", "page","action_type"] 73 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "action_type": np.uint8} 74 | df_user_activity = pd.read_table("data/A2/user_activity_log.txt",header=None,names=user_activity_log,usecols=usecols,index_col=None,dtype=dtype_user_activity) 75 | latest_user = (df_user_activity.loc[df_user_activity["user_activity_day"]>laterThanDay]).user_id.unique().tolist() 76 | print("get latest users") 77 | print(latest_user) 78 | print(len(latest_user)) 79 | 80 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique()) 81 | frequent_user = (df_user_activity.loc[df_user_activity["dayCount"]>dayCount]).user_id.unique().tolist() 82 | print("get frequent users") 83 | print(frequent_user) 84 | print(len(frequent_user)) 85 | 86 | print("get users in certain pages and certain action type") 87 | user_inList = (df_user_activity.loc[((df_user_activity["page"].isin(pageList))|(df_user_activity["action_type"].isin(typeList)))&(df_user_activity["user_activity_day"]>laterThanDay-3)]).user_id.unique().tolist() 88 | 89 | print(user_inList) 90 | print(len(user_inList)) 91 | user_userActivity = list(set(latest_user+frequent_user+user_inList)) 92 | 93 | print("get merged users") 94 | print(user_userActivity) 95 | print(len(user_userActivity)) 96 | return user_userActivity 97 | # get_user_from_userActivity(18, 3, [1,2,3], [1,3,4,5]) 98 | 99 | def get_user(): 100 | 101 | user_videoCreate = get_user_from_videoCreate(24, 1) 102 | user_appLaunch = get_user_from_appLaunch(24, 3) 103 | user_userRegister = get_user_from_userRegister(27) 104 | user_userActivity = get_user_from_userActivity(27, 3, [1], [3,4,5]) 105 | 106 | users = list(set(user_videoCreate+user_appLaunch+user_userRegister+user_userActivity)) 107 | print("get the final merged users") 108 | print(users) 109 | print(len(users)) 110 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 111 | submission_file = "result/submission_h_" + str_time + ".csv" 112 | with open(submission_file,"a",newline="") as f: 113 | writer = csv.writer(f) 114 | for i in users: 115 | writer.writerow([i]) 116 | # get_user() -------------------------------------------------------------------------------- /hardcodedpy/hardcode_approach_v2.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import pandas as pd 4 | import numpy as np 5 | user_register_log = ["user_id","register_day","register_type","device_type"] 6 | app_launch_log = ["user_id","app_launch_day"] 7 | video_create_log = ["user_id","video_create_day"] 8 | user_activity_log = ["user_id","user_activity_day","page","video_id","author_id","action_type"] 9 | 10 | 11 | def get_frequser_from_videoCreate(videoCount): 12 | print("get users from video 
create") 13 | video_create_log = ["user_id", "video_create_day"] 14 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8} 15 | df_video_create = pd.read_table("data/video_create_log.txt",header=None,names=video_create_log,index_col=None,dtype=dtype_video_create) 16 | # latest_user = (df_video_create.loc[df_video_create["video_create_day"]>laterThanDay]).user_id.unique().tolist() 17 | # print("get latest users") 18 | # print(latest_user) 19 | # print(len(latest_user)) 20 | df_video_create["videoCount"] = df_video_create.groupby(by=["user_id"])["video_create_day"].transform(lambda x: x.nunique()) 21 | frequent_user = (df_video_create.loc[df_video_create["videoCount"]>videoCount]).user_id.unique().tolist() 22 | print(df_video_create.describe()) 23 | print("get frequent users") 24 | print(frequent_user) 25 | print(len(frequent_user)) 26 | return frequent_user 27 | # with open("result/submission.csv","a",newline="") as f: 28 | # writer = csv.writer(f) 29 | # for i in user_videoCreate: 30 | # writer.writerow([i]) 31 | # get_frequser_from_videoCreate(3) 32 | def get_frequser_from_appLaunch(launchCount): 33 | print("get users from app launch log") 34 | app_launch_log = ["user_id","app_launch_day"] 35 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8} 36 | df_app_launch = pd.read_table("data/app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch) 37 | # latest_user = (df_app_launch.loc[df_app_launch["app_launch_day"]>laterThanDay]).user_id.unique().tolist() 38 | # print("get latest users") 39 | # print(latest_user) 40 | # print(len(latest_user)) 41 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique()) 42 | frequent_user = (df_app_launch.loc[df_app_launch["launchCount"]>launchCount]).user_id.unique().tolist() 43 | print(df_app_launch.describe()) 44 | print("get frequent users") 45 | print(frequent_user) 46 | print(len(frequent_user)) 47 | return frequent_user 48 | # with open("result/submission.csv","a",newline="") as f: 49 | # writer = csv.writer(f) 50 | # for i in user_appLaunch: 51 | # writer.writerow([i]) 52 | # get_frequser_from_appLaunch(10) 53 | def get_user_from_userRegister(laterThanDay): 54 | print("get users from user regiser log") 55 | user_register_log = ["user_id", "register_day", "register_type", "device_type"] 56 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": str} 57 | df_user_register = pd.read_table("data/user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register) 58 | latest_user = (df_user_register.loc[df_user_register["register_day"]>laterThanDay]).user_id.unique().tolist() 59 | print("get latest users") 60 | print(latest_user) 61 | print(len(latest_user)) 62 | return latest_user 63 | # get_user_from_userRegister(25) 64 | def get_frequser_from_userActivity(dayCount): 65 | print("get users from user activity log") 66 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"] 67 | usecols = ["user_id", "user_activity_day"] 68 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "action_type": np.uint8} 69 | df_user_activity = pd.read_table("data/user_activity_log.txt",header=None,names=user_activity_log,usecols=usecols,index_col=None,dtype=dtype_user_activity).drop_duplicates() 70 | # latest_user = 
(df_user_activity.loc[df_user_activity["user_activity_day"]>laterThanDay]).user_id.unique().tolist() 71 | # print("get latest users") 72 | # print(latest_user) 73 | # print(len(latest_user)) 74 | 75 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique()) 76 | frequent_user = (df_user_activity.loc[df_user_activity["dayCount"]>dayCount]).user_id.unique().tolist() 77 | print(df_user_activity.describe()) 78 | print("get frequent users") 79 | print(frequent_user) 80 | print(len(frequent_user)) 81 | 82 | # print("get users in certain pages and certain action type") 83 | # user_inList = (df_user_activity.loc[((df_user_activity["page"].isin(pageList))|(df_user_activity["action_type"].isin(typeList)))&(df_user_activity["user_activity_day"]>laterThanDay-3)]).user_id.unique().tolist() 84 | # 85 | # print(user_inList) 86 | # print(len(user_inList)) 87 | # user_userActivity = list(set(latest_user+frequent_user+user_inList)) 88 | # 89 | # print("get merged users") 90 | # print(user_userActivity) 91 | # print(len(user_userActivity)) 92 | return frequent_user 93 | # get_frequser_from_userActivity(10) 94 | def get_activeUsers_from_register(): 95 | print("get users from user regiser log") 96 | # user_register_log = ["user_id", "register_day", "register_type", "device_type"] 97 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": np.uint16} 98 | # df_user_register = pd.read_table("data/user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register) 99 | df_user_register = pd.read_csv("data/user_register_log.csv", header=0, index_col=None, dtype=dtype_user_register) 100 | 101 | df_user_register["register_type_rate"] = df_user_register.groupby(by=["register_type"])["register_type"].transform( 102 | "count") 103 | df_user_register["register_type_device"] = df_user_register.groupby(by=["register_type"])["device_type"].transform( 104 | lambda x: x.nunique()) 105 | df_user_register["device_type_rate"] = df_user_register.groupby(by=["device_type"])["device_type"].transform( 106 | "count") 107 | df_user_register["device_type_register"] = df_user_register.groupby(by=["device_type"])["register_type"].transform( 108 | lambda x: x.nunique()) 109 | active_users = pd.read_csv("hCoded/submission_freqUsers1_2018-06-08_11-16.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist() 110 | 111 | df_acuser_info = df_user_register.loc[df_user_register["user_id"].isin(active_users)] 112 | # df_acuser_info.to_csv("data/active_user_info.csv",header=True,index=False) 113 | print(df_acuser_info.describe()) 114 | # get_activeUsers_from_register() 115 | def get_user(): 116 | 117 | user_videoCreate = get_frequser_from_videoCreate(3) 118 | user_appLaunch = get_frequser_from_appLaunch(8) 119 | # user_userRegister = get_user_from_userRegister(27) 120 | user_userActivity = get_frequser_from_userActivity(7) 121 | 122 | users = list(set(user_videoCreate+user_appLaunch+user_userActivity)) 123 | print("get the final merged users") 124 | print(users) 125 | print(len(users)) 126 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 127 | submission_file = "hCoded/submission_freqUsers_v2_" + str_time + ".csv" 128 | with open(submission_file,"a",newline="") as f: 129 | writer = csv.writer(f) 130 | for i in users: 131 | writer.writerow([i]) 132 | get_user() -------------------------------------------------------------------------------- 
/hardcodedpy/hardcode_approach_v3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def processing(laterThanDay,launchCount,videoCount,activityCount): 5 | print("get users from user register log") 6 | user_register_log = ["user_id", "register_day", "register_type", "device_type"] 7 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": np.uint32} 8 | df_user_register = pd.read_table("user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register) 9 | user_outliers = df_user_register[(df_user_register["register_type"] == 3) & ( 10 | (df_user_register["device_type"] == 1) | (df_user_register["device_type"] == 223) | ( 11 | df_user_register["device_type"] == 83))]["user_id"].unique().tolist() 12 | df_user_register = df_user_register[~df_user_register["user_id"].isin(user_outliers)] 13 | df_user_register = df_user_register.loc[df_user_register["register_day"]>laterThanDay] 14 | 15 | 16 | print("get users from app launch log") 17 | app_launch_log = ["user_id","app_launch_day"] 18 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8} 19 | df_app_launch = pd.read_table("app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch) 20 | df_app_launch = df_app_launch[~df_app_launch["user_id"].isin(user_outliers)] 21 | df_app_launch = df_app_launch.loc[df_app_launch["app_launch_day"] >laterThanDay] 22 | 23 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique()) 24 | frequent_user1 = (df_app_launch.loc[df_app_launch["launchCount"]>launchCount]).user_id.unique().tolist() 25 | print("number of frequent launch users after {} is {} ".format(laterThanDay,len(frequent_user1))) 26 | video_create_log = ["user_id", "video_create_day"] 27 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8} 28 | df_video_create = pd.read_table("video_create_log.txt",header=None,names=video_create_log,index_col=None,dtype=dtype_video_create) 29 | 30 | df_video_create = df_video_create[~df_video_create["user_id"].isin(user_outliers)] 31 | 32 | df_video_create = df_video_create.loc[df_video_create["video_create_day"] >laterThanDay] 33 | 34 | df_video_create["videoCount"] = df_video_create.groupby(by=["user_id"])["video_create_day"].transform(lambda x: x.nunique()) 35 | frequent_user2 = (df_video_create.loc[df_video_create["videoCount"]>videoCount]).user_id.unique().tolist() 36 | print("number of frequent video create users after {} is {} ".format(laterThanDay,len(frequent_user2))) 37 | 38 | print("get users from user activity log") 39 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"] 40 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32, 41 | "author_id":np.uint32, "action_type": np.uint8} 42 | df_user_activity = pd.read_table("user_activity_log.txt",header=None,names=user_activity_log,index_col=None,dtype=dtype_user_activity,usecols=["user_id", "user_activity_day"]) 43 | df_user_activity = df_user_activity[~df_user_activity["user_id"].isin(user_outliers)] 44 | df_user_activity= df_user_activity.loc[df_user_activity["user_activity_day"] >laterThanDay] 45 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique()) 46 | 
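# ---------------------------------------------------------------------------
# Hypothetical follow-on (not the author's code): hardcode_approach_v3.py as
# shown only prints the sizes of the frequent_user lists it builds. Turning
# such rule-based lists into a submission would follow the usual pattern in
# this repo: a set union written out one user_id per row, e.g.:
# ---------------------------------------------------------------------------
import csv

def write_rule_submission(user_lists, out_file):
    users = set()
    for lst in user_lists:
        users.update(lst)                 # union of the rule-based user lists
    with open(out_file, "w", newline="") as f:
        writer = csv.writer(f)
        for uid in sorted(users):
            writer.writerow([uid])        # one user_id per row
    return len(users)

# write_rule_submission([frequent_user1, frequent_user2, frequent_user3],
#                       "result/submission_rule_based.csv")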
frequent_user3 = (df_user_activity.loc[df_user_activity["dayCount"]>activityCount]).user_id.unique().tolist() 47 | print("number of frequent activity users after {} is {} ".format(laterThanDay,len(frequent_user3))) 48 | 49 | 50 | processing(24,4,4,4) 51 | -------------------------------------------------------------------------------- /hardcodedpy/merge_approach.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | from hardcode_approach import get_user 4 | import pandas as pd 5 | import numpy as np 6 | def merge1(): 7 | hardcode_user = get_user() 8 | print(len(hardcode_user)) 9 | merged_csv1 = pd.read_csv("result/submission_2018-06-01_17-07.csv",header=None,index_col=None,names=["user_id"]) 10 | mc1 = merged_csv1["user_id"].tolist() 11 | print(len(mc1)) 12 | merged_csv2 = pd.read_csv("result/submission_2018-06-01_17-47.csv",header=None,index_col=None,names=["user_id"]) 13 | mc2 = merged_csv2["user_id"].tolist() 14 | print(len(mc2)) 15 | mc2 = [e for e in mc2 if e in mc1] 16 | print(len(mc2)) 17 | merged_csv3 = pd.read_csv("result/submission_2018-06-01_18-05catboost.csv",header=None,index_col=None,names=["user_id"]) 18 | mc3 = merged_csv3["user_id"].tolist() 19 | print(len(mc3)) 20 | mc3 = [e for e in mc3 if e in mc2] 21 | print(len(mc3)) 22 | users = list(set(hardcode_user+mc3)) 23 | print(len(users)) 24 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 25 | submission_file = "result/submission_" + str_time + ".csv" 26 | with open(submission_file,"a",newline="") as f: 27 | writer = csv.writer(f) 28 | for i in users: 29 | writer.writerow([i]) 30 | def merge2(): 31 | hardcode_user = get_user() 32 | print(len(hardcode_user)) 33 | merged_csv1 = pd.read_csv("result/submission_2018-05-30_23-20.csv",header=None,index_col=None,names=["user_id"]) 34 | mc1 = merged_csv1["user_id"].tolist() 35 | print(len(mc1)) 36 | merged_csv2 = pd.read_csv("merge/submission_2018-06-01_11-57.csv",header=None,index_col=None,names=["user_id"]) 37 | mc2 = merged_csv2["user_id"].tolist() 38 | print(len(mc2)) 39 | mc2 = [e for e in mc2 if e in mc1] 40 | print(len(mc2)) 41 | users = list(set(hardcode_user+mc2)) 42 | print(len(users)) 43 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 44 | submission_file = "result/submission_" + str_time + ".csv" 45 | with open(submission_file,"a",newline="") as f: 46 | writer = csv.writer(f) 47 | for i in users: 48 | writer.writerow([i]) 49 | def merge3(): 50 | hardcode_user = get_user() 51 | print(len(hardcode_user)) 52 | merged_csv1 = pd.read_csv("result/submission_lgb_2018-06-03_00-34.csv",header=None,index_col=None,names=["user_id"]) 53 | mc1 = merged_csv1["user_id"][:23500].tolist() 54 | print(len(mc1)) 55 | users = list(set(hardcode_user+mc1)) 56 | print(len(users)) 57 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 58 | submission_file = "result/submission_" + str_time + ".csv" 59 | # with open(submission_file,"a",newline="") as f: 60 | # writer = csv.writer(f) 61 | # for i in users: 62 | # writer.writerow([i]) 63 | def merge4(): 64 | hardcode_user = get_user() 65 | print(len(hardcode_user)) 66 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 67 | hdf = pd.Series(hardcode_user,name="user_id") 68 | hfile = "hCoded/hcode_"+str_time + ".csv" 69 | hdf.to_csv(hfile,header=True,index=False) 70 | merged_csv1 = pd.read_csv("lgb/uid_2018-06-11_22-04-13.csv",header=0,index_col=None) 71 | mc1 = merged_csv1["user_id"][:23800].tolist() 72 | 
print(len(mc1)) 73 | users = list(set(hardcode_user+mc1)) 74 | print(len(users)) 75 | 76 | submission_file = "merge/submission_" + str_time + ".csv" 77 | # with open(submission_file,"a",newline="") as f: 78 | # writer = csv.writer(f) 79 | # for i in users: 80 | # writer.writerow([i]) 81 | # merge4() 82 | def merge5(): 83 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 84 | # hardcode_user = get_user() 85 | # hardcode_user = pd.read_csv("merge/submission_v5_fre2_2018-06-08_11-48.csv",header=0,index_col=None)["user_id"].tolist() 86 | # print(len(hardcode_user)) 87 | mc2 = pd.read_csv("hCoded/hcode_v12_lastdayofactivityandlaunchcount1_withauthor_2018-06-16_08-54.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist() 88 | print(len(mc2)) 89 | # hdf = pd.Series(hardcode_user,name="user_id") 90 | # hfile = "hCoded/hcode_28ac_"+str_time + ".csv" 91 | # hdf.to_csv(hfile,header=True,index=False) 92 | mc = pd.read_csv("single/submission_18-23slgb_0.81-2018-06-16_08-16.csv",header=None,index_col=None,names=["user_id"])["user_id"].tolist()[:20000] 93 | # mc1 = pd.read_csv("lgb/uid_2018-06-04_16-55-34.csv",header=0,index_col=None) 94 | # mc = mc1.loc[mc1["score"]>0.40]["user_id"].tolist() 95 | print(len(mc)) 96 | users = list(set(mc2+mc)) 97 | print(len(users)) 98 | # 99 | submission_file = "merge/submission_0.81lgb20000_v12_" + str_time + ".csv" 100 | 101 | with open(submission_file,"a",newline="") as f: 102 | writer = csv.writer(f) 103 | for i in users: 104 | writer.writerow([i]) 105 | # merge5() 106 | def register_in_activity_author(laterThanDay,dayCount): 107 | print("get users from user activity log") 108 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"] 109 | # usecols = ["user_id", "user_activity_day", "page","action_type"] 110 | usecols = ["user_id", "user_activity_day","author_id"] 111 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "action_type": np.uint8} 112 | df_user_activity = pd.read_table("data/user_activity_log.txt",header=None,names=user_activity_log,usecols=usecols,index_col=None,dtype=dtype_user_activity).drop_duplicates() 113 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique()) 114 | author_id = df_user_activity["author_id"].unique().tolist() 115 | user_id = df_user_activity["user_id"].unique().tolist() 116 | 117 | def intersection(lst1, lst2): 118 | return list(set(lst1) & set(lst2)) 119 | 120 | intersect_id = intersection(user_id, author_id) 121 | print("number of user is author {}".format(len(intersect_id))) 122 | user_userActivity = (df_user_activity.loc[(df_user_activity["user_activity_day"]>laterThanDay)&(df_user_activity["user_id"].isin(intersect_id))&(df_user_activity["dayCount"]>dayCount)]).user_id.unique().tolist() 123 | print("number of user is author activates more than {} days no later than {} : {}".format(dayCount,laterThanDay,len(user_userActivity))) 124 | return user_userActivity 125 | def single(): 126 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 127 | single_csv1 = pd.read_csv("06-17/uid_2018-06-17_01-01-33.csv",header=0,index_col=None)["user_id"].unique().tolist() 128 | # mc1 = single_csv1.loc[single_csv1["score"]>0.48]["user_id"].tolist() 129 | mc1 = single_csv1[:23727] 130 | print(len(mc1)) 131 | # user_userActivity = register_in_activity_author(23,2) 132 | 133 | users = list(set(mc1)) 134 | print(len(users)) 135 | 
submission_file = "single/submission_slgb_all0.8-" + str_time + ".csv" 136 | with open(submission_file,"a",newline="") as f: 137 | writer = csv.writer(f) 138 | for i in users: 139 | writer.writerow([i]) 140 | single() 141 | -------------------------------------------------------------------------------- /hardcodedpy/new_merge.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import pandas as pd 4 | 5 | def merge5(): 6 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 7 | # hardcode_user = get_user() 8 | hardcode_user = pd.read_csv("hCoded/hcode_20-29_v5_2018-06-06_20-12_nolastdayoflaunch_22-30.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist() 9 | print(len(hardcode_user)) 10 | # hdf = pd.Series(hardcode_user,name="user_id") 11 | # hfile = "hCoded/hcode_28ac_"+str_time + ".csv" 12 | # hdf.to_csv(hfile,header=True,index=False) 13 | # mc1 = pd.read_csv("lr/uid_2018-06-07_22-55-45.csv",header=0,index_col=None) 14 | mc = pd.read_csv("hCoded/submission_freqUsers_v2_2018-06-08_11-38.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist() 15 | # mc = mc1.loc[mc1["score"]>0.20]["user_id"].tolist() 16 | print(len(mc)) 17 | ac_users = list(set(hardcode_user)-set(mc)) 18 | print(len(ac_users)) 19 | 20 | v5_user = pd.read_csv("hCoded/hcode_20-29_v5_2018-06-06_20-12_nolastdayoflaunch_22-30.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist() 21 | print(len(v5_user)) 22 | 23 | users = list(set(v5_user+ac_users)) 24 | print(len(users)) 25 | # # 26 | # submission_file = "merge/submission_0.815-baseline+v5_" + str_time + ".csv" 27 | # with open(submission_file,"a",newline="") as f: 28 | # writer = csv.writer(f) 29 | # for i in users: 30 | # writer.writerow([i]) 31 | def merge6(): 32 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 33 | # hardcode_user = get_user() 34 | hardcode_user = pd.read_csv("merge/submission_rule_consec_2018-06-25_20-23.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist() 35 | mc = pd.read_csv("lgb/submission_lgb_r3_1600_4_2018-06-24_23-42-42.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()[:24000] 36 | print(len(hardcode_user)) 37 | # hdf = pd.Series(hardcode_user,name="user_id") 38 | # hfile = "hCoded/hcode_28ac_"+str_time + ".csv" 39 | # hdf.to_csv(hfile,header=True,index=False) 40 | # mc1 = pd.read_csv("lr/uid_2018-06-07_22-55-45.csv",header=0,index_col=None) 41 | # mc = pd.read_csv("hCoded/submission_freqUsers_v3_2018-06-08_11-41.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist() 42 | # mc = mc1.loc[mc1["score"]>0.20]["user_id"].tolist() 43 | 44 | # mc1 = pd.read_csv("lgb/uid_2018-06-04_16-55-34.csv",header=0,index_col=None) 45 | # mc = mc1.loc[mc1["score"]>0.7]["user_id"].tolist() 46 | # merged_csv1 = pd.read_csv("lgb/uid_2018-06-11_22-04-13.csv",header=0,index_col=None) 47 | # mc = merged_csv1["user_id"][:23800].tolist() 48 | print(len(mc)) 49 | 50 | users = list(set(hardcode_user+mc)) 51 | # users = list(set(mc)) 52 | print(len(users)) 53 | # # 54 | submission_file = "merge/submission_lgbhest_ru_" + str_time + ".csv" 55 | with open(submission_file,"a",newline="") as f: 56 | writer = csv.writer(f) 57 | for i in users: 58 | writer.writerow([i]) 59 | # merge6() 60 | import numpy as np 61 | def get_user_from_activity_new(trainSpan,laterThanDay,activityCount): 62 | print("get users from user activity log") 
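# This function reads the activity log, restricts it to the window trainSpan = (start_day, end_day),
# counts each user's activity records with groupby("user_id")["user_activity_day"].transform("count"),
# and returns the users who have at least one record after laterThanDay and more than
# activityCount records within the window.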
63 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32, 64 | "author_id": np.uint32, "action_type": np.uint8} 65 | use_feature = ["user_id","user_activity_day"] 66 | df_user_activity = pd.read_csv("data/user_activity_log.csv", header=0, index_col=None, dtype=dtype_user_activity,usecols=use_feature) 67 | 68 | df_user_activity = df_user_activity.loc[ 69 | (df_user_activity["user_activity_day"] >= trainSpan[0]) & ( 70 | df_user_activity["user_activity_day"] <= trainSpan[1])] 71 | # print(df_app_launch.groupby(by=["user_id"]).size()) 72 | # print(df_app_launch.groupby(by=["app_launch_day"]).size()) 73 | df_user_activity["activityCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform("count") 74 | # print(df_user_activity.describe()) 75 | user_activity = (df_user_activity.loc[(df_user_activity["user_activity_day"]>laterThanDay)&(df_user_activity["activityCount"]>activityCount)]).user_id.unique().tolist() 76 | print("users active no later than {} and active for more than {} times: {} ".format(laterThanDay,activityCount,len(user_activity))) 77 | return user_activity 78 | def get_user_from_appLaunch_new(trainSpan,laterThanDay,launchCount): 79 | print("get users from app launch log") 80 | app_launch_log = ["user_id","app_launch_day"] 81 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8} 82 | df_app_launch = pd.read_table("data/app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch).drop_duplicates() 83 | df_app_launch = df_app_launch.loc[ 84 | (df_app_launch["app_launch_day"] >= trainSpan[0]) & (df_app_launch["app_launch_day"] <= trainSpan[1])] 85 | # print(df_app_launch.groupby(by=["user_id"]).size()) 86 | # print(df_app_launch.groupby(by=["app_launch_day"]).size()) 87 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique()) 88 | user_appLaunch = (df_app_launch.loc[(df_app_launch["app_launch_day"]>laterThanDay)&(df_app_launch["launchCount"]>launchCount)]).user_id.unique().tolist() 89 | print("users launched no later than {} and launched for more than {} days: {} ".format(laterThanDay,launchCount,len(user_appLaunch))) 90 | return user_appLaunch 91 | if __name__=="__main__": 92 | # av1 = get_user_from_activity_new((30,30),29,216) 93 | # av2 = get_user_from_activity_new((29,30),29,342) 94 | # av3 = get_user_from_activity_new((28,30),28,452) 95 | # av4 = get_user_from_activity_new((27,30),27,569) 96 | # av = list(set(av1+av2+av3+av4)) 97 | # print(len(av)) 98 | # la1 = get_user_from_appLaunch_new((29,30), 29, 1) 99 | # print("number of users between {} and {} is {}".format(29,30,len(la1))) 100 | # la2 = get_user_from_appLaunch_new((28,30), 28, 1) 101 | # print("number of users between {} and {} is {}".format(28,30,len(la2))) 102 | # la3 = get_user_from_appLaunch_new((27,30), 27, 2) 103 | # print("number of users between {} and {} is {}".format(27,30,len(la3))) 104 | # la4 = get_user_from_appLaunch_new((26,30), 26, 3) 105 | # print("number of users between {} and {} is {}".format(26,30,len(la4))) 106 | # la5 = get_user_from_appLaunch_new((25,30), 25, 4) 107 | # print("number of users between {} and {} is {}".format(25,30,len(la5))) 108 | # # # la6 = get_user_from_appLaunch_new((24,30), 24, 5) 109 | # # # print("number of users between {} and {} is {}".format(24,30,len(la6))) 110 | # # # la7 = get_user_from_appLaunch_new((23,30), 23, 6) 111 | # # # print("number of users between {} and {} is 
{}".format(23,30,len(la7))) 112 | # # # la8 = get_user_from_appLaunch_new((22,30), 22, 7) 113 | # # # print("number of users between {} and {} is {}".format(22,30,len(la8))) 114 | # # # la9 = get_user_from_appLaunch_new((21,30), 21, 8) 115 | # # # print("number of users between {} and {} is {}".format(21,30,len(la9))) 116 | # # # la10 = get_user_from_appLaunch_new((20,30), 20, 9) 117 | # # # print("number of users between {} and {} i0s {}".format(20,30,len(la10))) 118 | # # # la11 = get_user_from_appLaunch_new((19,30), 19, 10) 119 | # # # print("number of users between {} and {} is {}".format(19,30,len(la11))) 120 | # # # la12 = get_user_from_appLaunch_new((18,30), 18, 11) 121 | # # # print("number of users between {} and {} is {}".format(18,30,len(la12))) 122 | # # # la13 = get_user_from_appLaunch_new((17,30), 17, 12) 123 | # # # print("number of users between {} and {} is {}".format(17,30,len(la13))) 124 | # # # la = list(set(av1+av2+la1+la2+la3+la4+la5+la6+la7+la8+la9+la10+la11+la12+la13)) 125 | # la = list(set(av+la1+la2+la3+la4+la5)) 126 | # print("number of consecutive users {}".format(len(la))) 127 | # str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 128 | # submission_file = "merge/submission_rule_consec_" + str_time + ".csv" 129 | # with open(submission_file,"a",newline="") as f: 130 | # writer = csv.writer(f) 131 | # for i in la: 132 | # writer.writerow([i]) 133 | merge6() -------------------------------------------------------------------------------- /lrpy/lr_v2.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import pandas as pd 4 | # import joblib 5 | from lightgbm import LGBMClassifier 6 | from sklearn.decomposition import PCA, FactorAnalysis 7 | from sklearn.feature_selection import SelectKBest, mutual_info_classif, SelectFromModel 8 | from sklearn.linear_model import LogisticRegressionCV, LogisticRegression 9 | from sklearn.metrics import classification_report, f1_score 10 | # from sklearn.model_selection import GridSearchCV, train_test_split 11 | # from skopt import BayesSearchCV 12 | # from skopt.callbacks import DeltaXStopper 13 | # from data_process_v7 import processing 14 | # from sklearn.feature_selection import VarianceThreshold 15 | import numpy as np 16 | # def predict(clf2, test_set,param): 17 | from sklearn.pipeline import Pipeline 18 | 19 | 20 | def predict(clf2, test_set,param,sel): 21 | uid = pd.DataFrame() 22 | # test_set = processing(trainSpan=(1, 30), label=False) 23 | uid["user_id"] = test_set["user_id"] 24 | test_set = test_set.drop(labels=["user_id"], axis=1) 25 | test_set = sel.transform(test_set.values) 26 | print("begin to make predictions") 27 | # res = clf2.predict_proba(test_set.values) 28 | res = clf2.predict_proba(test_set) 29 | uid["proba1"] = pd.Series(res[:, 1]) 30 | uid["score"] = uid.groupby(by=["user_id"])["proba1"].transform(lambda x: sum(x) / float(len(x))) 31 | uid.drop_duplicates(subset=["user_id"],inplace=True) 32 | uid.sort_values(by=["score"],axis=0,ascending=False,inplace=True) 33 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 34 | uid_file = "../result/uid/B/uid_lr_" +param+"_"+ str_time + ".csv" 35 | uid.to_csv(uid_file,header=True,index=False) 36 | # active_users = uid.loc[uid["score"]>0.5]["user_id"].unique().tolist() 37 | active_users = uid["user_id"][:24500].unique().tolist() 38 | # print(len(active_users)) 39 | print(uid["score"].tolist()[24500]) 40 | # print(active_users) 41 | str_time = 
str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 42 | submission_file = "../result/628/pm/submission_lr_" +param+"_"+ str_time + ".csv" 43 | with open(submission_file, "a", newline="") as f: 44 | writer = csv.writer(f) 45 | for i in active_users: 46 | writer.writerow([i]) 47 | # using this module ,one needs to deconstruct some of the features in data_process 48 | def run(scheme_num=3,file_name="../data/data_v8/training_r"): 49 | train_set_ls = [] 50 | if scheme_num ==1: 51 | for i in [16,17,22,23]: 52 | print("begin to load the dataset") 53 | file_name1 = file_name+"ld1-"+str(i)+".csv" 54 | train_set_temp = pd.read_csv(file_name1, header=0, index_col=None) 55 | print(train_set_temp.describe()) 56 | train_set_ls.append(train_set_temp) 57 | elif scheme_num ==2: 58 | for i in [16,23]: 59 | print("begin to load the dataset") 60 | file_name2 = file_name+"ld1-" + str(i) + ".csv" 61 | train_set_temp = pd.read_csv(file_name2, header=0, index_col=None) 62 | print(train_set_temp.describe()) 63 | train_set_ls.append(train_set_temp) 64 | elif scheme_num ==3: 65 | for i in [17,18,19,20,21,22,23]: 66 | print("begin to load the dataset"+str(i)) 67 | file_name3 = file_name+ "ld1-" + str(i) + ".csv" 68 | train_set_temp = pd.read_csv(file_name3, header=0, index_col=None) 69 | print(train_set_temp.describe()) 70 | train_set_ls.append(train_set_temp) 71 | val_file_name = file_name+ "ld1-23.csv" 72 | val_set = pd.read_csv(val_file_name, header=0, index_col=None) 73 | val_set2 = pd.read_csv("../data/data_v5/training_eld1-23.csv", header=0, index_col=None) 74 | print(val_set.describe()) 75 | print(val_set2.describe()) 76 | train_set = pd.concat(train_set_ls, axis=0) 77 | ds = train_set.describe() 78 | print(ds) 79 | 80 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"])) 81 | 82 | print("begin to drop the duplicates") 83 | train_set.drop_duplicates(subset=keep_feature, inplace=True) 84 | val_set.drop_duplicates(subset=keep_feature,inplace=True) 85 | val_set2.drop_duplicates(subset=keep_feature,inplace=True) 86 | print(train_set.describe()) 87 | print(val_set.describe()) 88 | print(val_set2.describe()) 89 | train_label = train_set["label"] 90 | val_label = val_set["label"] 91 | val_label2 = val_set2["label"] 92 | train_set = train_set.drop(labels=["label", "user_id"], axis=1) 93 | val_set = val_set.drop(labels=["label","user_id"], axis=1) 94 | val_set2 = val_set2.drop(labels=["label","user_id"], axis=1) 95 | 96 | drop_features = [""] 97 | 98 | 99 | print("begin to standardization the data") 100 | for fea in keep_feature: 101 | train_set[fea] = (train_set[fea]-train_set[fea].min())/(train_set[fea].max()-train_set[fea].min()) 102 | # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std()) 103 | val_set[fea] = (val_set[fea]-val_set[fea].min())/(val_set[fea].max()-val_set[fea].min()) 104 | val_set2[fea] = (val_set2[fea]-val_set2[fea].min())/(val_set2[fea].max()-val_set2[fea].min()) 105 | # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std()) 106 | # print(train_set.describe()) 107 | # keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"])) 108 | # sel = SelectKBest(mutual_info_classif, k=300).fit(train_set.values, train_label.values) 109 | # train_set = sel.transform(train_set.values) 110 | # val_set = sel.transform(val_set.values) 111 | # val_set2 = sel.transform(val_set2.values) 112 | # feature_importances = sel.scores_ 113 | # print(feature_importances) 114 | # print(keep_feature) 115 | # 
feature_score_name = sorted(zip(feature_importances, keep_feature), reverse=True) 116 | # for score, name in feature_score_name: 117 | # print('{}: {}'.format(name, score)) 118 | 119 | # kpca = PCA(n_components=0.98) 120 | # # kpca = FactorAnalysis(n_components=100) 121 | # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1) 122 | # kpca.fit(train_set.values) 123 | # train_set = kpca.transform(train_set.values) 124 | # val_set = kpca.transform(val_set.values) 125 | # print(kpca.components_) 126 | # # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_)) 127 | # print("number of components {}".format(kpca.n_components_)) 128 | # print("noise variance {}".format(kpca.noise_variance_)) 129 | # print("the explained variance {}".format(kpca.explained_variance_)) 130 | # print("the explained variance ratio {}".format(kpca.explained_variance_ratio_)) 131 | 132 | print("begin to make prediction with plain features and without tuning parameters") 133 | 134 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns)) 135 | 136 | # best_f1 =0.0 137 | # best_params = {"n_estimators":800,"num_leaves":6} 138 | # for n_estimator in [400,600,800]: 139 | # for num_leave in [4,6,8]: 140 | # print({"n_estimators":n_estimator,"num_leaves":num_leave,"boosting_type":"dart"}) 141 | # clf1 = LGBMClassifier(n_estimators=n_estimator, num_leaves=num_leave, boosting_type="dart") 142 | # clf1.fit(train_set.values, train_label.values) 143 | # print("load the test dataset") 144 | # yhat = clf1.predict(val_set.values) 145 | # print(classification_report(y_pred=yhat, y_true=val_label.values,digits=4)) 146 | # f1 = f1_score(y_pred=yhat, y_true=val_label.values) 147 | # if best_f10.5]["user_id"].unique().tolist() 66 | active_users = uid["user_id"][:24500].unique().tolist() 67 | # print(len(active_users)) 68 | print(uid["score"].tolist()[24500]) 69 | # print(active_users) 70 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 71 | submission_file = "../result/622/submission_dnn_" + param + "_" + str_time + ".csv" 72 | with open(submission_file, "a", newline="") as f: 73 | writer = csv.writer(f) 74 | for i in active_users: 75 | writer.writerow([i]) 76 | 77 | # using this module ,one needs to deconstruct some of the features in data_process 78 | def run(scheme_num=1, file_name="../data/data_v3/training_e"): 79 | train_set_ls = [] 80 | if scheme_num == 1: 81 | for i in [16, 17, 22, 23]: 82 | print("begin to load the dataset") 83 | file_name1 = file_name + "ld1-" + str(i) + ".csv" 84 | train_set_temp = pd.read_csv(file_name1, header=0, index_col=None) 85 | print(train_set_temp.describe()) 86 | train_set_ls.append(train_set_temp) 87 | elif scheme_num == 2: 88 | for i in [16, 23]: 89 | print("begin to load the dataset") 90 | file_name2 = file_name + "ld1-" + str(i) + ".csv" 91 | train_set_temp = pd.read_csv(file_name2, header=0, index_col=None) 92 | print(train_set_temp.describe()) 93 | train_set_ls.append(train_set_temp) 94 | elif scheme_num == 3: 95 | for i in [18, 19, 20, 21, 22, 23]: 96 | print("begin to load the dataset") 97 | file_name3 = file_name + "ld1-" + str(i) + ".csv" 98 | train_set_temp = pd.read_csv(file_name3, header=0, index_col=None) 99 | print(train_set_temp.describe()) 100 | train_set_ls.append(train_set_temp) 101 | val_file_name = file_name + "ld1-22.csv" 102 | val_set = pd.read_csv(val_file_name, header=0, index_col=None) 103 | print(val_set.describe()) 104 | train_set = 
pd.concat(train_set_ls, axis=0) 105 | ds = train_set.describe() 106 | print(ds) 107 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"])) 108 | 109 | print("begin to drop the duplicates") 110 | train_set.drop_duplicates(subset=keep_feature, inplace=True) 111 | val_set.drop_duplicates(subset=keep_feature, inplace=True) 112 | print(train_set.describe()) 113 | print(val_set.describe()) 114 | train_label = train_set["label"] 115 | val_label = val_set["label"] 116 | train_set = train_set.drop(labels=["label", "user_id"], axis=1) 117 | val_set = val_set.drop(labels=["label", "user_id"], axis=1) 118 | 119 | print("begin to standardization the data") 120 | for fea in keep_feature: 121 | if train_set[fea].var() < 0.000001 or val_set[fea].var() < 0.000001: 122 | train_set.drop(labels=[fea], axis=1, inplace=True) 123 | val_set.drop(labels=[fea], axis=1, inplace=True) 124 | else: 125 | train_set[fea] = (train_set[fea] - train_set[fea].min()) / (train_set[fea].max() - train_set[fea].min()) 126 | # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std()) 127 | val_set[fea] = (val_set[fea] - val_set[fea].min()) / (val_set[fea].max() - val_set[fea].min()) 128 | # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std()) 129 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"])) 130 | kpca = PCA(n_components=0.99, whiten=True) 131 | # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1) 132 | kpca.fit(train_set.values) 133 | train_set = kpca.transform(train_set.values) 134 | val_set = kpca.transform(val_set.values) 135 | pca_std = np.std(train_set) 136 | # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_)) 137 | NCOMPONENTS = kpca.n_components_ 138 | print("number of components {}".format(kpca.n_components_)) 139 | print("noise variance {}".format(kpca.noise_variance_)) 140 | print("the explained variance {}".format(kpca.explained_variance_)) 141 | print("the explained variance ratio {}".format(kpca.explained_variance_ratio_)) 142 | 143 | print("begin to make prediction with plain features and without tuning parameters") 144 | 145 | # scoring = {'f1': "f1"} 146 | # clf1 = GridSearchCV(LGBMClassifier(), 147 | # param_grid={"n_estimators":[200,400,600],"num_leaves": [4,5,6,8],"boosting_type":["dart"]}, 148 | # scoring=scoring, cv=4, refit='f1',n_jobs=-1,verbose=1) 149 | 150 | for layers in [3]: 151 | for units in [128]: 152 | print({"layers": layers, "neurals": units}) 153 | model = Sequential() 154 | # model.add(Dense(units, input_dim=NCOMPONENTS, activation='relu')) 155 | # model.add(Embedding(units,32, input_lenth=NCOMPONENTS)) 156 | # model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')) 157 | # model.add(MaxPooling1D(pool_size=2)) 158 | # model.add(Flatten()) 159 | # model.add(Dense(250, activation='relu')) 160 | # model.add(Dense(1, activation='sigmoid')) 161 | model.add(Dense(units, input_dim=NCOMPONENTS, activation='relu')) 162 | model.add(GaussianNoise(pca_std)) 163 | for i in range(layers): 164 | model.add(Dense(units, activation='relu')) 165 | model.add(GaussianNoise(pca_std)) 166 | model.add(Dropout(0.1)) 167 | model.add(Dense(1, activation='sigmoid')) 168 | print(model.summary()) 169 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1]) 170 | early_stopping = EarlyStopping(monitor="val_loss",patience=16) 171 | model.fit(train_set, train_label, epochs=300, batch_size=256, validation_split=0.15, 
verbose=2,callbacks=[early_stopping]) 172 | 173 | print("begin to make classification report for the validation dataset") 174 | # yhat = clf1.predict(val_set.values) 175 | # yhat = clf1.predict(val_set.values) 176 | yhat = np.reshape(model.predict_classes(val_set),-1) 177 | 178 | print(classification_report(y_pred=yhat, y_true=val_label.values, digits=4)) 179 | 180 | print("begin to make classification report for the training dataset") 181 | # yhat = clf1.predict(train_set.values) 182 | yhat = np.reshape(model.predict_classes(train_set),-1) 183 | print(classification_report(y_pred=yhat, y_true=train_label.values, digits=4)) 184 | 185 | print("load the test dataset") 186 | test_file_name = file_name.replace("training", "testing") + "ld1-30.csv" 187 | test_set = pd.read_csv(test_file_name, header=0, index_col=None, usecols=keep_feature + ["user_id"]) 188 | # test_set = pd.read_csv("data/testing_rld1-30.csv",header=0,index_col=None) 189 | for fea in keep_feature: 190 | test_set[fea] = (test_set[fea] - test_set[fea].min()) / (test_set[fea].max() - test_set[fea].min()) 191 | # test_set[fea] = (test_set[fea]-test_set[fea].mean())/(test_set[fea].std()) 192 | 193 | print("begin to make prediction") 194 | param = list(file_name)[-1] + str(scheme_num) + "_" + str(layers) + "_" + str(units) 195 | print(param) 196 | # predict(clf1,test_set,param) 197 | predict(model, test_set, param, kpca) 198 | 199 | if __name__ == "__main__": 200 | file_name1 = "../data/data_v3/training_e" 201 | file_name2 = "../data/data_v4/training_r" 202 | for scheme in [3]: 203 | for file in ["../data/data_v4/training_r"]: 204 | run(scheme_num=scheme,file_name=file) -------------------------------------------------------------------------------- /nnpy/f1_keras.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | 3 | def f1(y_true, y_pred): 4 | def recall(y_true, y_pred): 5 | """Recall metric. 6 | 7 | Only computes a batch-wise average of recall. 8 | 9 | Computes the recall, a metric for multi-label classification of 10 | how many relevant items are selected. 11 | """ 12 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 13 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 14 | recall = true_positives / (possible_positives + K.epsilon()) 15 | return recall 16 | 17 | def precision(y_true, y_pred): 18 | """Precision metric. 19 | 20 | Only computes a batch-wise average of precision. 21 | 22 | Computes the precision, a metric for multi-label classification of 23 | how many selected items are relevant. 
24 | """ 25 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 26 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 27 | precision = true_positives / (predicted_positives + K.epsilon()) 28 | return precision 29 | precision = precision(y_true, y_pred) 30 | recall = recall(y_true, y_pred) 31 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 32 | -------------------------------------------------------------------------------- /nnpy/nn_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.preprocessing import LabelEncoder 4 | import tensorflow as tf 5 | from sklearn.metrics import r2_score, log_loss, roc_auc_score 6 | from sklearn.model_selection import KFold 7 | 8 | # Training steps 9 | STEPS = 500 10 | LEARNING_RATE = 0.0001 11 | BETA = 0.01 12 | DROPOUT = 0.5 13 | RANDOM_SEED = 12345 14 | MAX_Y = 250 15 | RESTORE = True 16 | START = 0 17 | 18 | # Training variables 19 | IN_DIM = 13 20 | 21 | # Network Parameters - Hidden layers 22 | n_hidden_1 = 100 23 | n_hidden_2 = 50 24 | 25 | def weight_variable(shape): 26 | initial = tf.truncated_normal(shape, stddev=0.01) 27 | return tf.Variable(initial) 28 | 29 | def bias_variable(shape): 30 | initial = tf.constant(0.03, shape=shape) 31 | return tf.Variable(initial) 32 | 33 | def deep_network(inputs, keep_prob): 34 | # Input -> Hidden Layer 35 | w1 = weight_variable([IN_DIM, n_hidden_1]) 36 | b1 = bias_variable([n_hidden_1]) 37 | # Hidden Layer -> Hidden Layer 38 | w2 = weight_variable([n_hidden_1, n_hidden_2]) 39 | b2 = bias_variable([n_hidden_2]) 40 | # Hidden Layer -> Output 41 | w3 = weight_variable([n_hidden_2, 1]) 42 | b3 = bias_variable([1]) 43 | 44 | # 1st Hidden layer with dropout 45 | h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1) 46 | h1_dropout = tf.nn.dropout(h1, keep_prob) 47 | # 2nd Hidden layer with dropout 48 | h2 = tf.nn.relu(tf.matmul(h1_dropout, w2) + b2) 49 | h2_dropout = tf.nn.dropout(h2, keep_prob) 50 | 51 | # Run sigmoid on output to get 0 to 1 52 | out = tf.nn.sigmoid(tf.matmul(h2_dropout, w3) + b3) 53 | 54 | # Loss function with L2 Regularization 55 | regularizers = tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3) 56 | 57 | scaled_out = tf.multiply(out, MAX_Y) # Scale output 58 | return inputs, out, scaled_out, regularizers 59 | def get_stratified_sample(df, sample_target="user_id", reference_target="app_launch_day", 60 | sample_ratio=0.2): 61 | df = df.astype(np.uint32) 62 | reference_target_ls = df[reference_target].unique().tolist() 63 | target_sample = [] 64 | for i in reference_target_ls: 65 | # print("get users in day {}".format(i)) 66 | target_sample.extend(df.loc[df[reference_target] == int(i)][sample_target].drop_duplicates().sample(frac=sample_ratio).tolist()) 67 | del df 68 | return list(set(target_sample)) 69 | def nn_model(train_set,val_set,file,best_params=None,val_ratio=0.4, n_round = 3): 70 | tf.set_random_seed(RANDOM_SEED) 71 | 72 | # Create the model 73 | x = tf.placeholder(tf.float32, [None, IN_DIM]) 74 | 75 | # Define loss and optimizer 76 | y_ = tf.placeholder(tf.float32, [None, 1]) 77 | 78 | # Dropout on hidden layers 79 | keep_prob = tf.placeholder("float") 80 | 81 | # Build the graph for the deep net 82 | inputs, out, scaled_out, regularizers = deep_network(x, keep_prob) 83 | 84 | # Normal loss function (RMSE) 85 | loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(y_, scaled_out)))) 86 | 87 | # Loss function with L2 Regularization 88 | loss = 
tf.reduce_mean(loss + BETA * regularizers) 89 | 90 | # Optimizer 91 | train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss) 92 | 93 | 94 | logloss = log_loss(y_, scaled_out) 95 | auc = roc_auc_score(y_, scaled_out) 96 | 97 | # Save model 98 | use_cols = list(set(train_set.columns)-set(["user_id","label"])) 99 | df_val_user = pd.read_pickle("../work/val_user_17_23.pkl") 100 | val_user_all = df_val_user["user_id"].unique().tolist() 101 | final_rank = 0 102 | train_set.drop_duplicates(inplace=True, subset=use_cols, keep="last") 103 | val_set.drop_duplicates(inplace=True, subset=use_cols, keep="last") 104 | saver = tf.train.Saver(max_to_keep=5) 105 | 106 | with tf.Session() as sess: 107 | #if RESTORE: 108 | # print('Loading Model...') 109 | # ckpt = tf.train.get_checkpoint_state('./models/neural/') 110 | # saver.restore(sess, ckpt.model_checkpoint_path) 111 | #else: 112 | sess.run(tf.global_variables_initializer()) 113 | # val = val_set.iloc[-val_len:, :].sample(frac=val_ratio) 114 | # val = val_set.sample(frac=val_ratio) 115 | val_user = get_stratified_sample(df_val_user, sample_ratio=val_ratio) 116 | val_user_add = list(set(val_user_all) - set(val_user)) 117 | val = val_set[val_set["user_id"].isin(val_user)] 118 | # val_add = val_set[val_set["user_id"].isin(val_user_add)] 119 | val_train = val_set[~val_set["user_id"].isin(val["user_id"])] 120 | train = pd.concat([train_set, val_train], axis=0) 121 | # train = pd.concat([train_set, val_train,val_add], axis=0) 122 | print("shape of val:", val.shape) 123 | print("shape of train:", train.shape) 124 | y_train = train['label'] 125 | train = train.drop(['user_id', "label"], axis=1) 126 | val_y = val['label'] 127 | val_x = val.drop(['user_id', "label"], axis=1) 128 | 129 | # Train until maximum steps reached or interrupted 130 | for i in range(START, STEPS): 131 | k_fold = KFold(n_splits=10, shuffle=True) 132 | #if i % 100 == 0: 133 | # saver.save(sess, './models/neural/step_' + str(i) + '.cptk') 134 | 135 | for k, (ktrain, ktest) in enumerate(k_fold.split(train, y_train)): 136 | train_step.run(feed_dict={x: train[ktrain], y_: y_train[ktrain], keep_prob: DROPOUT}) 137 | # Show test score every 10 iterations 138 | if i % 10 == 0: 139 | # Tensorflow R2 140 | #train_accuracy = accuracy.eval(feed_dict={ 141 | # x: train[ktest], y_: y_train[ktest]}) 142 | # SkLearn metrics R2 143 | train_accuracy = log_loss(y_train[ktest], 144 | sess.run(scaled_out, feed_dict={x: train[ktest], keep_prob: 1.0})) 145 | print('Step: %d, Fold: %d, R2 Score: %g' % (i, k, train_accuracy)) 146 | 147 | CV = [] 148 | for i in range(n_round): 149 | k_fold = KFold(n_splits=10, shuffle=True) 150 | for k, (ktrain, ktest) in enumerate(k_fold.split(train, y_train)): 151 | # Tensorflow R2 152 | #accuracy = accuracy.eval(feed_dict={ 153 | # x: train[ktest], y_: y_train[ktest]}) 154 | # SkLearn metrics R2 155 | auc = roc_auc_score(y_train[ktest], 156 | sess.run(scaled_out, feed_dict={x: train[ktest], keep_prob: 1.0})) 157 | print('Step: %d, Fold: %d, R2 Score: %g' % (i, k, auc)) 158 | CV.append(auc) 159 | print('Mean R2: %g' % (np.mean(CV))) 160 | 161 | if __name__ == '__main__': 162 | tf.app.run() -------------------------------------------------------------------------------- /nnpy/nn_v1.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import pandas as pd 4 | import joblib 5 | from sklearn.model_selection import GridSearchCV 6 | from sklearn.neural_network import MLPClassifier 7 | from skopt import 
BayesSearchCV 8 | from skopt.callbacks import DeltaXStopper 9 | from data_process_v4 import processing 10 | from skopt.space import Categorical 11 | 12 | def predict(clf2, test_set): 13 | uid = pd.DataFrame() 14 | # test_set = processing(trainSpan=(1, 30), label=False) 15 | uid["user_id"] = test_set["user_id"] 16 | test_set = test_set.drop(labels=["user_id"], axis=1) 17 | print("begin to make predictions") 18 | res = clf2.predict_proba(test_set.values) 19 | uid["proba1"] = pd.Series(res[:, 1]) 20 | uid["score"] = uid.groupby(by=["user_id"])["proba1"].transform(lambda x: sum(x) / float(len(x))) 21 | uid.drop_duplicates(subset=["user_id"],inplace=True) 22 | uid.sort_values(by=["score"],axis=0,ascending=False,inplace=True) 23 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 24 | uid_file = "result/uid_" + str_time + ".csv" 25 | uid.to_csv(uid_file,header=True,index=False) 26 | active_users = uid["user_id"][:24000].unique().tolist() 27 | print(len(active_users)) 28 | print(active_users) 29 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 30 | submission_file = "result/submission_nn_" + str_time + ".csv" 31 | with open(submission_file, "a", newline="") as f: 32 | writer = csv.writer(f) 33 | for i in active_users: 34 | writer.writerow([i]) 35 | # using this module ,one needs to deconstruct some of the features in data_process 36 | def run(): 37 | # print("begin to load the trainset1") 38 | # train_set1 = processing(trainSpan=(1,19),label=True) 39 | # train_set1.to_csv("data/training_ld1-19.csv", header=True, index=False) 40 | # train_set1 = pd.read_csv("data/training_ld1-16.csv", header=0, index_col=None) 41 | # print(train_set1.describe()) 42 | # print("begin to load the trainset2") 43 | # train_set2 = processing(trainSpan=(5,23),label=True) 44 | # train_set2.to_csv("data/training_ld5-23.csv", header=True, index=False) 45 | # train_set2 = pd.read_csv("data/training_ld8-23.csv", header=0, index_col=None) 46 | # print(train_set1.describe()) 47 | # print("begin to load the trainset3") 48 | # train_set3 = processing(trainSpan=(1,23),label=True) 49 | # train_set3.to_csv("data/training_ld1-23.csv", header=True, index=False) 50 | # train_set3 = pd.read_csv("data/training_ld1-23.csv", header=0, index_col=None) 51 | # print(train_set1.describe()) 52 | print("begin to merge the trainsets") 53 | # train_set = pd.concat([train_set1,train_set2,train_set3],axis=0) 54 | # train_set = pd.concat([train_set1,train_set2],axis=0) 55 | # train_set.to_csv("data/training_lm5-23.csv", header=True, index=False) 56 | train_set = pd.read_csv("data/training_lm15-23.csv", header=0, index_col=None) 57 | # del train_set1,train_set2 58 | # gc.collect() 59 | print(train_set.describe()) 60 | keep_feature = list(set(train_set.columns.values.tolist())-set(["user_id","label"])) 61 | print("begin to drop the duplicates") 62 | train_set.drop_duplicates(subset=keep_feature,inplace=True) 63 | print(train_set.describe()) 64 | train_label =train_set["label"] 65 | train_set = train_set.drop(labels=["label","user_id"], axis=1) 66 | 67 | # train_x, val_x,train_y,val_y = train_test_split(train_set.values,train_label.values,test_size=0.33,random_state=42,shuffle=True) 68 | print("begin to make prediction with plain features and without tuning parameters") 69 | initial_params = { 70 | "hidden_layer_sizes": (128,128), 71 | "activation": "relu", 72 | "solver": "adam", 73 | "batch_size":"auto", 74 | "learning_rate": "adaptive", 75 | "alpha": 0.0001, 76 | "max_iter": 400, 77 | "verbose": True, 78 | 
"warm_start": True, 79 | "early_stopping": True, 80 | "validation_fraction": 0.1, 81 | } 82 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns)) 83 | 84 | scoring = {'AUC': 'roc_auc', 'f1': "f1"} 85 | clf1 = GridSearchCV(MLPClassifier(**initial_params), 86 | param_grid={ 87 | "max_iter":[400,800,1200], 88 | "solver": ["lbfgs","adam"], 89 | "batch_size":[128,200,156]}, 90 | scoring=scoring, cv=4, refit='f1',n_jobs=-1,verbose=2) 91 | clf1.fit(train_set.values, train_label.values) 92 | # cv_results = cv(initial_params,train_data,num_boost_round=800,nfold=4,early_stopping_rounds=30,verbose_eval=True) 93 | # bst = lgb.cv(initial_params, train_data, num_boost_round=1000, nfold=3, early_stopping_rounds=30) 94 | bs = clf1.best_score_ 95 | print(bs) 96 | bp = clf1.best_params_ 97 | print(bp) 98 | # clf1 = LGBMClassifier(**initial_params) 99 | # clf1.fit(X=train_x,y=train_y,eval_set=(val_x,val_y),early_stopping_rounds=20,eval_metric="auc") 100 | print("load the test dataset") 101 | # test_set = processing(trainSpan=(15, 30), label=False) 102 | # test_set.to_csv("data/testing_ld15-30.csv",header=True,index=False) 103 | test_set = pd.read_csv("data/testing_ld15-30.csv",header=0,index_col=None) 104 | print("begin to make prediction") 105 | predict(clf1,test_set) 106 | 107 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 108 | with open("kuaishou_stats.csv", 'a', newline='') as f: 109 | writer = csv.writer(f) 110 | writer.writerow(["feature importance of nn for kuaishou-crt ", str_time]) 111 | writer.writerow(["best score",bs,"best params"]) 112 | for key, value in bp.items(): 113 | writer.writerow([key, value]) 114 | model_name = "nn_" + str_time + ".pkl" 115 | joblib.dump(clf1, model_name) 116 | print("begin to tune the parameters with the selected feature") 117 | hls = [] 118 | for i in [32, 64]: 119 | hls.append((i * 3,i * 3)) 120 | hls.append((i * 4,i * 4)) 121 | hls.append((i*2, i * 3, i*2)) 122 | hls.append((i*3, i * 4, i*3)) 123 | # hls.append((i,i * 2, i * 4, i * 3)) 124 | paramsSpace = { 125 | "hidden_layer_sizes": Categorical(hls), 126 | "activation": Categorical(["logistic", "tanh", "relu"]), 127 | "solver": Categorical(["lbfgs", "sgd", "adam"]), 128 | "learning_rate": Categorical(["invscaling", "adaptive"]), 129 | "alpha": Categorical([0.0001, 0.001, 0.01,0.1,1.0]), 130 | "batch_size":(128, 256), 131 | "max_iter":(400, 1200), 132 | "momentum":(0.6, 1.0, 'uniform'), 133 | "beta_1":(0.6, 1.0, 'uniform'), 134 | "beta_2":(0.98, 0.99990, 'uniform'), 135 | } 136 | def tune_parameter(X, y, clf, params): 137 | # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 138 | gs = BayesSearchCV( 139 | estimator=clf, search_spaces=params, 140 | scoring="f1", n_iter=60,optimizer_kwargs={"base_estimator":"GP"}, 141 | verbose=0, n_jobs=-1, cv=4, refit=True, random_state=1234 142 | ) 143 | gs.fit(X, y,callback=DeltaXStopper(0.000001)) 144 | best_params = gs.best_params_ 145 | best_score = gs.best_score_ 146 | print(best_params) 147 | print(best_score) 148 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 149 | with open("kuaishou_stats.csv", 'a', newline='') as f: 150 | writer = csv.writer(f) 151 | writer.writerow(["the best params for nn: "]) 152 | for key, value in best_params.items(): 153 | writer.writerow([key, value]) 154 | writer.writerow(["the best score for nn: ", best_score,str_time]) 155 | return gs 156 | 157 | model = MLPClassifier(**bp) 158 | clf2 = 
tune_parameter(train_set.values,train_label.values,model,paramsSpace) 159 | print("parameter tuning over, begin to save the model!") 160 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 161 | 162 | model_name = "nn_" + str_time + ".pkl" 163 | joblib.dump(clf2, model_name) 164 | 165 | print("begin to process the whole dataset and ready to feed into the fitted model") 166 | predict(clf2,test_set) 167 | 168 | if __name__=="__main__": 169 | run() -------------------------------------------------------------------------------- /nnpy/nn_v2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | def KerasClassifier_wrapper(input_dims): 4 | import tensorflow as tf 5 | from keras import backend as K 6 | # AUC for a binary classifier 7 | def auc(y_true, y_pred): 8 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0) 9 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0) 10 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0) 11 | binSizes = -(pfas[1:] - pfas[:-1]) 12 | s = ptas * binSizes 13 | return K.sum(s, axis=0) 14 | 15 | # ----------------------------------------------------------------------------------------------------------------------------------------------------- 16 | # PFA, prob false alert for binary classifier 17 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)): 18 | y_pred = K.cast(y_pred >= threshold, 'float32') 19 | # N = total number of negative labels 20 | N = K.sum(1 - y_true) 21 | # FP = total number of false alerts, alerts from the negative class labels 22 | FP = K.sum(y_pred - y_pred * y_true) 23 | return FP / (N+0.00000001) 24 | 25 | # ----------------------------------------------------------------------------------------------------------------------------------------------------- 26 | # P_TA prob true alerts for binary classifier 27 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)): 28 | y_pred = K.cast(y_pred >= threshold, 'float32') 29 | # P = total number of positive labels 30 | P = K.sum(y_true) 31 | # TP = total number of correct alerts, alerts from the positive class labels 32 | TP = K.sum(y_pred * y_true) 33 | return TP / (P+0.00000001) 34 | # prepare callbacks 35 | def model(): 36 | from keras.models import Sequential 37 | model = Sequential() 38 | # input layer 39 | from keras.layers import Dense 40 | model.add(Dense(input_dims, input_dim=input_dims)) 41 | from keras.layers import BatchNormalization 42 | model.add(BatchNormalization()) 43 | from keras.layers import Activation 44 | model.add(Activation('relu')) 45 | from keras.layers import Dropout 46 | model.add(Dropout(0.4)) 47 | # hidden layers 48 | model.add(Dense(input_dims)) 49 | model.add(BatchNormalization()) 50 | model.add(Activation('relu')) 51 | model.add(Dropout(0.4)) 52 | 53 | model.add(Dense(input_dims // 2)) 54 | model.add(BatchNormalization()) 55 | model.add(Activation('relu')) 56 | model.add(Dropout(0.4)) 57 | 58 | model.add(Dense(input_dims // 4, activation='relu')) 59 | 60 | # output layer (y_pred) 61 | model.add(Dense(1, activation='sigmoid')) 62 | 63 | # compile this model 64 | model.compile(loss='binary_crossentropy', # one may use 'mean_absolute_error' as alternative 65 | optimizer='adam', 66 | metrics=[auc] # you can add several if needed 67 | ) 68 | # Visualize NN architecture 69 | print(model.summary()) 70 | return model 71 | from keras.wrappers.scikit_learn import KerasClassifier 72 
| return KerasClassifier(build_fn=model) 73 | def get_stratified_sample(df, sample_target="user_id", reference_target="app_launch_day", 74 | sample_ratio=0.2): 75 | df = df.astype(np.uint32) 76 | reference_target_ls = df[reference_target].unique().tolist() 77 | target_sample = [] 78 | for i in reference_target_ls: 79 | # print("get users in day {}".format(i)) 80 | target_sample.extend(df.loc[df[reference_target] == int(i)][sample_target].drop_duplicates().sample(frac=sample_ratio).tolist()) 81 | del df 82 | return list(set(target_sample)) 83 | def nn_predict(train_set,val_set,test_set,file,minmax_scale=True,val_ratio=0.4, n_round = 3): 84 | import numpy as np 85 | from scipy.stats import rankdata 86 | import random 87 | import gc 88 | res=test_set[['user_id']] 89 | test_x = test_set.drop(['user_id',"label"], axis=1) 90 | res['prob'] = 0 91 | user_register_log = ["user_id", "register_day", "register_type", "device_type"] 92 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": np.uint8} 93 | testb = \ 94 | pd.read_table('/mnt/datasets/fusai/user_register_log.txt', header=None, names=user_register_log, index_col=None, 95 | dtype=dtype_user_register)[['user_id']] 96 | print("begin to train ") 97 | 98 | # val_len = int(len(val_set)*0.4) 99 | # use_cols = list(set(train_set.columns)-set(["user_id","label"])) 100 | df_val_user = pd.read_pickle("../work/val_user_8_23.pkl") 101 | # val_user_all = df_val_user["user_id"].unique().tolist() 102 | # final_rank = 0 103 | # train_set.drop_duplicates(inplace=True, subset=use_cols, keep="last") 104 | # val_set.drop_duplicates(inplace=True, subset=use_cols, keep="last") 105 | # train_set.reset_index(drop=True,inplace=True) 106 | # val_set.reset_index(drop=True,inplace=True) 107 | if minmax_scale: 108 | for f in test_x.columns: 109 | train_set[f] = (train_set[f]-train_set[f].min())/(train_set[f].max()-train_set[f].min()) 110 | val_set[f] = (val_set[f]-val_set[f].min())/(val_set[f].max()-val_set[f].min()) 111 | test_x[f] = (test_x[f]-test_x[f].min())/(test_x[f].max()-test_x[f].min()) 112 | for i in range(n_round): 113 | random.seed(np.random.randint(1, 1000)) 114 | # val = val_set.iloc[-val_len:, :].sample(frac=val_ratio) 115 | # val = val_set.sample(frac=val_ratio) 116 | print("get stratified sample validation user") 117 | val_user = get_stratified_sample(df_val_user,sample_ratio=val_ratio) 118 | # val_user_add = list(set(val_user_all)-set(val_user)) 119 | val = val_set.loc[val_set["user_id"].isin(val_user)] 120 | # val_add = val_set.loc[val_set["user_id"].isin(val_user_add)] 121 | val_train = val_set.loc[~val_set["user_id"].isin(val["user_id"])] 122 | train = pd.concat([train_set, val_train], axis=0) 123 | # train = pd.concat([train_set, val_train,val_add], axis=0) 124 | print("the {}th round".format(i)) 125 | print("shape of val:", val.shape) 126 | print("shape of train:", train.shape) 127 | train_y = train['label'] 128 | train_x = train.drop(['user_id', "label"], axis=1) 129 | val_y = val['label'] 130 | val_x = val.drop(['user_id', "label"], axis=1) 131 | from keras.callbacks import ModelCheckpoint 132 | from keras.callbacks import EarlyStopping 133 | clf_nn = KerasClassifier_wrapper(train_x.shape[1]) 134 | model_path = "../input/keras_model.h5" 135 | callbacks = [ 136 | EarlyStopping( 137 | monitor='val_auc', 138 | patience=20, 139 | mode='max', 140 | verbose=100), 141 | ModelCheckpoint( 142 | model_path, 143 | monitor='val_auc', 144 | save_best_only=True, 145 | mode='max', 146 | verbose=100) 
147 | ] 148 | # fit estimator 149 | history = clf_nn.fit( 150 | train_x, 151 | train_y, 152 | epochs=500, 153 | batch_size=1024, 154 | validation_data=(val_x, val_y), 155 | verbose=1, 156 | callbacks=callbacks, 157 | shuffle=True, 158 | n_jobs=-1, 159 | ) 160 | print(history.history.keys()) 161 | import matplotlib.pyplot as plt 162 | # summarize history for R^2 163 | fig_acc = plt.figure(figsize=(10, 10)) 164 | plt.plot(history.history['auc']) 165 | plt.plot(history.history['val_auc']) 166 | plt.title('model auc') 167 | plt.ylabel('auc') 168 | plt.xlabel('epoch') 169 | plt.legend(['train', 'test'], loc='upper left') 170 | plt.show() 171 | fig_acc.savefig("model_auc.png") 172 | 173 | # summarize history for loss 174 | fig_loss = plt.figure(figsize=(10, 10)) 175 | plt.plot(history.history['loss']) 176 | plt.plot(history.history['val_loss']) 177 | plt.title('model loss') 178 | plt.ylabel('loss') 179 | plt.xlabel('epoch') 180 | plt.legend(['train', 'test'], loc='upper left') 181 | plt.show() 182 | fig_loss.savefig("model_loss.png") 183 | # weight = 2*(temp_score_train*temp_score_val)/(temp_score_train+temp_score_val) 184 | res_temp = test_set[['user_id']] 185 | res_temp['prob'] = 0 186 | temp_predict = clf_nn.predict_proba(test_x)[:, 1] 187 | res_temp['prob'] = temp_predict 188 | res_temp = pd.merge(testb, res_temp, on='user_id', how='left').fillna(0) 189 | res_temp.to_csv('../input/' + file +str(i)+ '.txt', sep=',', index=False, header=False) 190 | # res_temp = get_normalized_rank(res_temp) 191 | # res['prob']+= res_temp['rank'] 192 | # res['prob']+= res_temp['rank']/n_round 193 | # res['prob']+= res_temp['prob']/n_round 194 | res['prob']+= temp_predict/n_round 195 | # res_temp = res_temp[["user_id","rank"]] 196 | # final_rank = final_rank+rankdata(temp_predict, method='ordinal') 197 | del val, val_train,train,train_y,train_x,val_y,val_x,res_temp,temp_predict,clf_nn,history 198 | gc.collect() 199 | # res["prob"] = (final_rank -min(final_rank))/(max(final_rank)-min(final_rank)) 200 | # res["prob"] = (res["prob"] -min(res["prob"]))/(max(res["prob"])-min(res["prob"])) 201 | res=pd.merge(testb,res,on='user_id',how='left').fillna(0) 202 | res.to_csv('../work/' + file + '.txt', sep=',', index=False,header=False) 203 | del testb,train_set, val_set,test_set 204 | gc.collect() 205 | return res -------------------------------------------------------------------------------- /paper/Modeling and Predicting the Active video-viewing time in a large-scale e-learning system.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/Modeling and Predicting the Active video-viewing time in a large-scale e-learning system.pdf -------------------------------------------------------------------------------- /paper/The Prediction of Booking Destination on airbnb dataset.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/The Prediction of Booking Destination on airbnb dataset.pdf -------------------------------------------------------------------------------- /paper/Using Deep Learning to Predict Customer Churn in a mobile telecommunication newwork.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/Using Deep Learning to Predict Customer Churn in a mobile telecommunication newwork.pdf -------------------------------------------------------------------------------- /paper/field-aware fatorization machine for CTR prediction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/field-aware fatorization machine for CTR prediction.pdf -------------------------------------------------------------------------------- /paper/predicting airbnb user's desired travel destination.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/predicting airbnb user's desired travel destination.pdf -------------------------------------------------------------------------------- /photos/16count.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/16count.JPG -------------------------------------------------------------------------------- /photos/23count.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/23count.JPG -------------------------------------------------------------------------------- /photos/23count3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/23count3.JPG -------------------------------------------------------------------------------- /photos/24count.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/24count.JPG -------------------------------------------------------------------------------- /photos/24count3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/24count3.JPG -------------------------------------------------------------------------------- /photos/count2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/count2.JPG -------------------------------------------------------------------------------- /photos/describe.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/describe.JPG -------------------------------------------------------------------------------- /photos/outlier1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/outlier1.JPG 
-------------------------------------------------------------------------------- /photos/registerday_count.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/registerday_count.JPG -------------------------------------------------------------------------------- /photos/sample.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/sample.JPG -------------------------------------------------------------------------------- /photos/value_count.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/value_count.JPG -------------------------------------------------------------------------------- /quick_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # train_set = pd.read_csv("data/training_m1-23.csv", header=0, index_col=None) 4 | # # del train_set1,train_set2 5 | # # gc.collect() 6 | # print(train_set.describe()) 7 | # keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id"])) 8 | # print("begin to drop the duplicates") 9 | # train_set.drop_duplicates(subset=keep_feature, inplace=True) 10 | # print(train_set.describe()) 11 | # train_label = train_set["label"] 12 | # train_set = train_set.drop(labels=["label", "user_id"], axis=1) 13 | # 14 | # ls = [0,1,2,3,4,5,1,2,5,1,2,4,9] 15 | # print(ls.count(10)/len(ls)) 16 | import numpy as np 17 | print("get users from user activity log") 18 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32, 19 | "author_id": np.uint32, "action_type": np.uint8} 20 | df_user_activity = pd.read_csv("data/user_activity_log.csv", header=0, index_col=None, dtype=dtype_user_activity) 21 | # df_user_activity = df_user_activity.merge(df_user_register_base, on=["user_id"], how="left").fillna(-1) 22 | df_user_activity_train = df_user_activity.loc[ 23 | (df_user_activity["user_activity_day"] >= 1) & ( 24 | df_user_activity["user_activity_day"] <= 9)] 25 | print(df_user_activity_train.describe()) 26 | user_activity_author = df_user_activity_train["author_id"].unique().tolist() 27 | print(user_activity_author) 28 | df_user_activity_train["user_in_author"] = 0 29 | # df_user_activity_train["user_in_author"] = df_user_activity_train["user_id"].apply(lambda x: 1 if x in user_activity_author else 0) 30 | print("begin to get user in author or not mark") 31 | df_user_activity_train.loc[df_user_activity_train["user_id"].isin(user_activity_author),"user_in_author"]=1 32 | print(df_user_activity_train.describe()) -------------------------------------------------------------------------------- /rfpy/rf_v1.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | # import gc 4 | import pandas as pd 5 | import joblib 6 | import lightgbm 7 | from lightgbm import LGBMClassifier,cv 8 | from scipy.stats import stats 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.model_selection import GridSearchCV 11 | from skopt import BayesSearchCV 12 | from skopt.callbacks import DeltaXStopper 13 | from data_process_v2 import 
processing 14 | from skopt.space import Categorical 15 | 16 | def predict(clf2, test_set): 17 | uid = pd.DataFrame() 18 | # test_set = processing(trainSpan=(1, 30), label=False) 19 | uid["user_id"] = test_set["user_id"] 20 | test_set = test_set.drop(labels=["user_id"], axis=1) 21 | print("begin to make predictions") 22 | res = clf2.predict(test_set.values) 23 | uid["y_hat"] = pd.Series(res) 24 | uid["label"] = uid.groupby(by=["user_id"])["y_hat"].transform(lambda x: stats.mode(x)[0][0]) 25 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 26 | uid_file = "result/uid_" + str_time + ".csv" 27 | uid.to_csv(uid_file,header=True,index=False) 28 | active_users = (uid.loc[uid["label"] == 1]).user_id.unique().tolist() 29 | print(len(active_users)) 30 | print(active_users) 31 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 32 | submission_file = "result/submission_" + str_time + ".csv" 33 | with open(submission_file, "a", newline="") as f: 34 | writer = csv.writer(f) 35 | for i in active_users: 36 | writer.writerow([i]) 37 | # using this module ,one needs to deconstruct some of the features in data_process 38 | keep_feature = ["user_id", 39 | "register_day_rate", "register_type_rate", 40 | "register_type_device", "device_type_rate", "device_type_register", 41 | "user_app_launch_register_mean_time", 42 | "user_app_launch_rate", "user_app_launch_gap", 43 | "user_video_create_register_mean_time", 44 | "user_video_create_rate", "user_video_create_day", "user_video_create_gap", 45 | "user_activity_register_mean_time", "user_activity_rate", 46 | "user_activity_frequency", 47 | "user_activity_day_rate", "user_activity_gap", 48 | "user_page_num", "user_video_id_num", 49 | "user_author_id_num", "user_author_id_video_num", 50 | "user_action_type_num" 51 | ] 52 | def run(): 53 | print("begin to load the trainset1") 54 | train_set1 = processing(trainSpan=(1,12),label=True) 55 | # print(train_set1.describe()) 56 | print("begin to load the trainset2") 57 | train_set2 = processing(trainSpan=(13,23),label=True) 58 | # print(train_set2.describe()) 59 | print("begin to merge the trainsets") 60 | train_set = pd.concat([train_set1,train_set2],axis=0) 61 | print(train_set.describe()) 62 | # del train_set1,train_set2 63 | # gc.collect() 64 | print("begin to drop the duplicates") 65 | train_set.drop_duplicates(subset=keep_feature,inplace=True) 66 | print(train_set.describe()) 67 | train_label =train_set["label"] 68 | train_set = train_set.drop(labels=["label","user_id"], axis=1) 69 | 70 | # train_x, val_x,train_y,val_y = train_test_split(train_set.values,train_label.values,test_size=0.33,random_state=42,shuffle=True) 71 | print("begin to make prediction with plain features and without tuning parameters") 72 | initial_params = { 73 | "n_jobs": -1, 74 | "n_estimators": 400, 75 | "criterion": "gini", 76 | "max_features": 'auto', 77 | "max_depth": 6, 78 | "min_samples_split": 2, 79 | "min_samples_leaf": 1, 80 | "min_weight_fraction_leaf": 0.0, 81 | "max_leaf_nodes": 64, 82 | "min_impurity_decrease": 0.0, 83 | } 84 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns)) 85 | 86 | scoring = {'AUC': 'roc_auc', 'f1': "f1"} 87 | clf1 = GridSearchCV(RandomForestClassifier(**initial_params), 88 | param_grid={"n_estimators":[400,600],"max_leaf_nodes": [16,24,32,64]}, 89 | scoring=scoring, cv=3, refit='f1',n_jobs=-1,verbose=0) 90 | clf1.fit(train_set.values, train_label.values) 91 | # cv_results = 
cv(initial_params,train_data,num_boost_round=800,nfold=4,early_stopping_rounds=30,verbose_eval=True) 92 | # bst = lgb.cv(initial_params, train_data, num_boost_round=1000, nfold=3, early_stopping_rounds=30) 93 | print(clf1.best_score_) 94 | print(clf1.best_params_) 95 | # clf1 = LGBMClassifier(**initial_params) 96 | # clf1.fit(X=train_x,y=train_y,eval_set=(val_x,val_y),early_stopping_rounds=20,eval_metric="auc") 97 | print("load the test dataset") 98 | test_set = processing(trainSpan=(20, 30), label=False) 99 | print("begin to make prediction") 100 | predict(clf1,test_set) 101 | 102 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 103 | print("begin to get important features") 104 | feature_names = train_set.columns 105 | feature_importances = clf1.best_estimator_.feature_importances_ 106 | print(feature_importances) 107 | print(feature_names) 108 | 109 | with open("kuaishou_stats.csv", 'a', newline='') as f: 110 | writer = csv.writer(f) 111 | writer.writerow(["feature importance of random forest for kuaishou", str_time]) 112 | # writer.writerow(eval_metrics) 113 | feature_score_name = sorted(zip(feature_importances, feature_names), reverse=True) 114 | for score, name in feature_score_name: 115 | print('{}: {}'.format(name, score)) 116 | writer.writerow([name, score]) 117 | sorted_feature_name = [name for score, name in feature_score_name] 118 | print(sorted_feature_name) 119 | 120 | print("begin to tune the parameters with the selected feature") 121 | paramsSpace = { 122 | "n_estimators": (200, 800), 123 | "criterion": Categorical(["gini", "entropy"]), 124 | "max_features": (0.6, 1.0, 'uniform'), 125 | "max_depth": (3, 8), 126 | "min_samples_split": (2, 128), 127 | "min_samples_leaf": (1, 128), 128 | "min_weight_fraction_leaf": (0.0, 0.5, 'uniform'), 129 | "max_leaf_nodes": (16, 128), 130 | "min_impurity_decrease": (1e-6, 1e-1, 'log-uniform'), 131 | } 132 | 133 | def tune_parameter(X, y, clf, params): 134 | # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 135 | gs = BayesSearchCV( 136 | estimator=clf, search_spaces=params, 137 | scoring="f1", n_iter=60,optimizer_kwargs={"base_estimator":"RF"}, 138 | verbose=0, n_jobs=-1, cv=3, refit=True, random_state=1234 139 | ) 140 | gs.fit(X, y,callback=DeltaXStopper(0.000001)) 141 | best_params = gs.best_params_ 142 | best_score = gs.best_score_ 143 | print(best_params) 144 | print(best_score) 145 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 146 | with open("kuaishou_stats.csv", 'a', newline='') as f: 147 | writer = csv.writer(f) 148 | writer.writerow(["the best params for random forest: "]) 149 | for key, value in best_params.items(): 150 | writer.writerow([key, value]) 151 | writer.writerow(["the best score for random forest: ", best_score,str_time]) 152 | return gs 153 | 154 | model = RandomForestClassifier(**initial_params) 155 | clf2 = tune_parameter(train_set.values,train_label.values,model,paramsSpace) 156 | print("parameter tuning over, begin to save the model!") 157 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 158 | 159 | model_name = "rf_" + str_time + ".pkl" 160 | joblib.dump(clf2, model_name) 161 | 162 | print("begin to process the whole dataset and ready to feed into the fitted model") 163 | predict(clf2,test_set) 164 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 165 | print("begin to get important features") 166 | feature_names = train_set.columns 167 | feature_importances =
clf2.best_estimator_.feature_importances_ 168 | print(feature_importances) 169 | print(feature_names) 170 | 171 | with open("kuaishou_stats.csv", 'a', newline='') as f: 172 | writer = csv.writer(f) 173 | writer.writerow(["feature importance of random forest for kuaishou", str_time]) 174 | # writer.writerow(eval_metrics) 175 | feature_score_name = sorted(zip(feature_importances, feature_names), reverse=True) 176 | for score, name in feature_score_name: 177 | print('{}: {}'.format(name, score)) 178 | writer.writerow([name, score]) 179 | sorted_feature_name = [name for score, name in feature_score_name] 180 | print(sorted_feature_name) 181 | if __name__=="__main__": 182 | run() -------------------------------------------------------------------------------- /rulepy/hardcode_approach.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import pandas as pd 4 | import numpy as np 5 | user_register_log = ["user_id","register_day","register_type","device_type"] 6 | app_launch_log = ["user_id","app_launch_day"] 7 | video_create_log = ["user_id","video_create_day"] 8 | user_activity_log = ["user_id","user_activity_day","page","video_id","author_id","action_type"] 9 | 10 | 11 | def get_user_from_videoCreate(laterThanDay,videoCount): 12 | print("get users from video create") 13 | video_create_log = ["user_id", "video_create_day"] 14 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8} 15 | df_video_create = pd.read_table("data/video_create_log.txt",header=None,names=video_create_log,index_col=None,dtype=dtype_video_create) 16 | latest_user = (df_video_create.loc[df_video_create["video_create_day"]>laterThanDay]).user_id.unique().tolist() 17 | print("get latest users") 18 | print(latest_user) 19 | print(len(latest_user)) 20 | df_video_create["videoCount"] = df_video_create.groupby(by=["user_id"])["video_create_day"].transform(lambda x: x.nunique()) 21 | frequent_user = (df_video_create.loc[df_video_create["videoCount"]>videoCount]).user_id.unique().tolist() 22 | print("get frequent users") 23 | print(frequent_user) 24 | print(len(frequent_user)) 25 | user_videoCreate = list(set(latest_user+frequent_user)) 26 | print(user_videoCreate) 27 | print(len(user_videoCreate)) 28 | return user_videoCreate 29 | # with open("result/submission.csv","a",newline="") as f: 30 | # writer = csv.writer(f) 31 | # for i in user_videoCreate: 32 | # writer.writerow([i]) 33 | # get_user_from_videoCreate(23,2) 34 | def get_user_from_appLaunch(laterThanDay,launchCount): 35 | print("get users from app launch log") 36 | app_launch_log = ["user_id","app_launch_day"] 37 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8} 38 | df_app_launch = pd.read_table("data/app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch) 39 | latest_user = (df_app_launch.loc[df_app_launch["app_launch_day"]>laterThanDay]).user_id.unique().tolist() 40 | print("get latest users") 41 | print(latest_user) 42 | print(len(latest_user)) 43 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique()) 44 | frequent_user = (df_app_launch.loc[df_app_launch["launchCount"]>launchCount]).user_id.unique().tolist() 45 | print("get frequent users") 46 | print(frequent_user) 47 | print(len(frequent_user)) 48 | user_appLaunch = list(set(latest_user+frequent_user)) 49 | print("get merged users") 50 | print(user_appLaunch) 51 | print(len(user_appLaunch)) 52 | return
user_appLaunch 53 | # with open("result/submission.csv","a",newline="") as f: 54 | # writer = csv.writer(f) 55 | # for i in user_appLaunch: 56 | # writer.writerow([i]) 57 | # get_user_from_appLaunch(27,4) 58 | def get_user_from_userRegister(laterThanDay): 59 | print("get users from user register log") 60 | user_register_log = ["user_id", "register_day", "register_type", "device_type"] 61 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": str} 62 | df_user_register = pd.read_table("data/user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register) 63 | latest_user = (df_user_register.loc[df_user_register["register_day"]>laterThanDay]).user_id.unique().tolist() 64 | print("get latest users") 65 | print(latest_user) 66 | print(len(latest_user)) 67 | return latest_user 68 | # get_user_from_userRegister(25) 69 | def get_user_from_userActivity(laterThanDay,dayCount,pageList,typeList): 70 | print("get users from user activity log") 71 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"] 72 | usecols = ["user_id", "user_activity_day", "page","action_type"] 73 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "action_type": np.uint8} 74 | df_user_activity = pd.read_table("data/user_activity_log.txt",header=None,names=user_activity_log,usecols=usecols,index_col=None,dtype=dtype_user_activity) 75 | latest_user = (df_user_activity.loc[df_user_activity["user_activity_day"]>laterThanDay]).user_id.unique().tolist() 76 | print("get latest users") 77 | print(latest_user) 78 | print(len(latest_user)) 79 | 80 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique()) 81 | frequent_user = (df_user_activity.loc[df_user_activity["dayCount"]>dayCount]).user_id.unique().tolist() 82 | print("get frequent users") 83 | print(frequent_user) 84 | print(len(frequent_user)) 85 | 86 | print("get users in certain pages and certain action type") 87 | user_inList = (df_user_activity.loc[((df_user_activity["page"].isin(pageList))|(df_user_activity["action_type"].isin(typeList)))&(df_user_activity["user_activity_day"]>laterThanDay-3)]).user_id.unique().tolist() 88 | 89 | print(user_inList) 90 | print(len(user_inList)) 91 | user_userActivity = list(set(latest_user+frequent_user+user_inList)) 92 | 93 | print("get merged users") 94 | print(user_userActivity) 95 | print(len(user_userActivity)) 96 | return user_userActivity 97 | # get_user_from_userActivity(27, 3, [1,2,3], [1,3,4,5]) 98 | 99 | def get_user(): 100 | 101 | user_videoCreate = get_user_from_videoCreate(23, 31) 102 | user_appLaunch = get_user_from_appLaunch(23,31) 103 | user_userRegister = get_user_from_userRegister(23) 104 | user_userActivity = get_user_from_userActivity(23, 31, [], []) 105 | 106 | users = list(set(user_videoCreate+user_appLaunch+user_userRegister+user_userActivity)) 107 | print("get the final merged users") 108 | print(users) 109 | print(len(users)) 110 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 111 | submission_file = "result/submission_" + str_time + ".csv" 112 | # with open(submission_file,"a",newline="") as f: 113 | # writer = csv.writer(f) 114 | # for i in users: 115 | # writer.writerow([i]) 116 | get_user() -------------------------------------------------------------------------------- /svmpy/svm_v1.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import pandas as pd 4 | import joblib 5 | from sklearn.model_selection import GridSearchCV 6 | from sklearn.svm import SVC 7 | from skopt import BayesSearchCV 8 | from skopt.callbacks import DeltaXStopper 9 | from data_process_v4 import processing 10 | from skopt.space import Categorical, Real, Integer 11 | 12 | 13 | def predict(clf2, test_set): 14 | uid = pd.DataFrame() 15 | # test_set = processing(trainSpan=(1, 30), label=False) 16 | uid["user_id"] = test_set["user_id"] 17 | test_set = test_set.drop(labels=["user_id"], axis=1) 18 | print("begin to make predictions") 19 | res = clf2.predict_proba(test_set.values) 20 | uid["proba1"] = pd.Series(res[:, 1]) 21 | uid["score"] = uid.groupby(by=["user_id"])["proba1"].transform(lambda x: sum(x) / float(len(x))) 22 | uid.drop_duplicates(subset=["user_id"],inplace=True) 23 | uid.sort_values(by=["score"],axis=0,ascending=False,inplace=True) 24 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 25 | uid_file = "result/uid_" + str_time + ".csv" 26 | uid.to_csv(uid_file,header=True,index=False) 27 | active_users = uid["user_id"][:24000].unique().tolist() 28 | print(len(active_users)) 29 | print(active_users) 30 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 31 | submission_file = "result/submission_svc_" + str_time + ".csv" 32 | with open(submission_file, "a", newline="") as f: 33 | writer = csv.writer(f) 34 | for i in active_users: 35 | writer.writerow([i]) 36 | # using this module ,one needs to deconstruct some of the features in data_process 37 | def run(): 38 | # print("begin to load the trainset1") 39 | # train_set1 = processing(trainSpan=(1,19),label=True) 40 | # train_set1.to_csv("data/training_ld1-19.csv", header=True, index=False) 41 | # train_set1 = pd.read_csv("data/training_ld1-16.csv", header=0, index_col=None) 42 | # print(train_set1.describe()) 43 | # print("begin to load the trainset2") 44 | # train_set2 = processing(trainSpan=(5,23),label=True) 45 | # train_set2.to_csv("data/training_ld5-23.csv", header=True, index=False) 46 | # train_set2 = pd.read_csv("data/training_ld8-23.csv", header=0, index_col=None) 47 | # print(train_set1.describe()) 48 | # print("begin to load the trainset3") 49 | # train_set3 = processing(trainSpan=(1,23),label=True) 50 | # train_set3.to_csv("data/training_ld1-23.csv", header=True, index=False) 51 | # train_set3 = pd.read_csv("data/training_ld1-23.csv", header=0, index_col=None) 52 | # print(train_set1.describe()) 53 | print("begin to merge the trainsets") 54 | # train_set = pd.concat([train_set1,train_set2,train_set3],axis=0) 55 | # train_set = pd.concat([train_set1,train_set2],axis=0) 56 | # train_set.to_csv("data/training_lm5-23.csv", header=True, index=False) 57 | train_set = pd.read_csv("data/training_lm15-23.csv", header=0, index_col=None) 58 | # del train_set1,train_set2 59 | # gc.collect() 60 | print(train_set.describe()) 61 | keep_feature = list(set(train_set.columns.values.tolist())-set(["user_id","label"])) 62 | print("begin to drop the duplicates") 63 | train_set.drop_duplicates(subset=keep_feature,inplace=True) 64 | print(train_set.describe()) 65 | train_label =train_set["label"] 66 | train_set = train_set.drop(labels=["label","user_id"], axis=1) 67 | 68 | # train_x, val_x,train_y,val_y = train_test_split(train_set.values,train_label.values,test_size=0.33,random_state=42,shuffle=True) 69 | print("begin to make prediction with plain 
features and without tuning parameters") 70 | initial_params = { 71 | "C": 1.0, 72 | "kernel": "rbf", 73 | "degree": 3, 74 | "gamma":"auto", 75 | "coef0": 0.0, 76 | "tol": 0.0001, 77 | "cache_size": 4000, 78 | "verbose": True, 79 | "max_iter": -1, 80 | "probability": True, 81 | } 82 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns)) 83 | 84 | scoring = {'AUC': 'roc_auc', 'f1': "f1"} 85 | clf1 = GridSearchCV(SVC(**initial_params), 86 | param_grid={ 87 | "C":[0.01,0.1,1.0,10,100], 88 | "kernel": ["rbf"], 89 | "gamma":[0.0001,0.001,0.01,0.1]}, 90 | scoring=scoring, cv=4, refit='f1',n_jobs=-1,verbose=2) 91 | clf1.fit(train_set.values, train_label.values) 92 | # cv_results = cv(initial_params,train_data,num_boost_round=800,nfold=4,early_stopping_rounds=30,verbose_eval=True) 93 | # bst = lgb.cv(initial_params, train_data, num_boost_round=1000, nfold=3, early_stopping_rounds=30) 94 | bs = clf1.best_score_ 95 | print(bs) 96 | bp = clf1.best_params_ 97 | print(bp) 98 | # clf1 = LGBMClassifier(**initial_params) 99 | # clf1.fit(X=train_x,y=train_y,eval_set=(val_x,val_y),early_stopping_rounds=20,eval_metric="auc") 100 | print("load the test dataset") 101 | # test_set = processing(trainSpan=(15, 30), label=False) 102 | # test_set.to_csv("data/testing_ld15-30.csv",header=True,index=False) 103 | test_set = pd.read_csv("data/testing_ld15-30.csv",header=0,index_col=None) 104 | print("begin to make prediction") 105 | predict(clf1,test_set) 106 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 107 | with open("kuaishou_stats.csv", 'a', newline='') as f: 108 | writer = csv.writer(f) 109 | writer.writerow(["best score and params of svm for kuaishou-crt ", str_time]) 110 | writer.writerow(["best score",bs,"best params"]) 111 | for key, value in bp.items(): 112 | writer.writerow([key, value]) 113 | 114 | model_name = "svm_" + str_time + ".pkl" 115 | joblib.dump(clf1, model_name) 116 | print("begin to tune the parameters with the selected feature") 117 | paramsSpace = { 118 | "C": Real(1e-6, 1e+6, prior='log-uniform'), 119 | "gamma": Real(1e-6, 1e+1, prior='log-uniform'), 120 | "degree": Integer(1,3), 121 | "kernel": Categorical(['poly', 'rbf']), 122 | } 123 | def tune_parameter(X, y, clf, params): 124 | # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 125 | gs = BayesSearchCV( 126 | estimator=clf, search_spaces=params, 127 | scoring="f1", n_iter=100,optimizer_kwargs={"base_estimator":"GP"}, 128 | verbose=2, n_jobs=-1, cv=4, refit=True, random_state=1234 129 | ) 130 | gs.fit(X, y,callback=DeltaXStopper(0.000001)) 131 | best_params = gs.best_params_ 132 | best_score = gs.best_score_ 133 | print(best_params) 134 | print(best_score) 135 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 136 | with open("kuaishou_stats.csv", 'a', newline='') as f: 137 | writer = csv.writer(f) 138 | writer.writerow(["the best params for svm: "]) 139 | for key, value in best_params.items(): 140 | writer.writerow([key, value]) 141 | writer.writerow(["the best score for svm: ", best_score,str_time]) 142 | return gs 143 | 144 | model = SVC(probability=True, **bp)  # keep probability enabled so predict_proba works in predict() 145 | clf2 = tune_parameter(train_set.values,train_label.values,model,paramsSpace) 146 | print("parameter tuning over, begin to save the model!") 147 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")) 148 | 149 | model_name = "svm_" + str_time + ".pkl" 150 | joblib.dump(clf2, model_name) 151 | 152 | print("begin to process the whole dataset
and ready to feed into the fitted model") 153 | predict(clf2,test_set) 154 | 155 | if __name__=="__main__": 156 | run() -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | data = ['First Item', 'Second Item', 'Third Item'] 4 | with open('output.csv', 'w', newline='') as csvfile: 5 | writer = csv.writer(csvfile) 6 | for i in data: 7 | writer.writerow([i]) -------------------------------------------------------------------------------- /utilspy/calculate.py: -------------------------------------------------------------------------------- 1 | def calculate(): 2 | M_hat = 51480 3 | F1_hat = 0.63088748 4 | precision_hat = F1_hat/(2-F1_hat) 5 | N_hat = M_hat*precision_hat 6 | print(N_hat) 7 | 8 | f1 = 0.8014 9 | M = 30000 10 | TP = (M+N_hat)/2*f1 11 | 12 | precision = TP/M 13 | recall = TP/N_hat 14 | print("True positive number {} ".format(TP)) 15 | print("precision {}".format(precision)) 16 | print("recall {}".format(recall)) 17 | 18 | 19 | 20 | p = 20200/25600 21 | r = 20200/23722 22 | print("pre {}".format(p)) 23 | print("rec {}".format(r)) 24 | 25 | 26 | 27 | # p = 0.795 28 | # r = 0.845 29 | # print("possible submit number {}".format(N_hat*r/p)) 30 | f1 = 2*p*r/(p+r) 31 | print("f1 score {}".format(f1)) 32 | 33 | print(336/800) 34 | 35 | if __name__ == "__main__": 36 | calculate() 37 | -------------------------------------------------------------------------------- /utilspy/create_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from dataprocesspy.create_feature_v3_nonp import processing 3 | 4 | if __name__=="__main__": 5 | # print("begin to load the testset") 6 | # train_set52 = processing(trainSpan=(1, 30), label=False) 7 | # train_set52.to_csv("data/testing_eld1-30_r.csv", header=True, index=False) 8 | # train_set52 = pd.read_csv("data/training_eld1-23.csv", header=0, index_col=None, usecols=use_feature) 9 | # print(train_set52.describe()) 10 | # print("begin to load the trainset52") 11 | # train_set52 = processing(trainSpan=(1, 23), label=True) 12 | # train_set52.to_csv("data/training_rld1-23_r.csv", header=True, index=False) 13 | # # train_set52 = pd.read_csv("data/training_eld1-23.csv", header=0, index_col=None, usecols=use_feature) 14 | # print(train_set52.describe()) 15 | # print("begin to load the trainset51") 16 | # train_set51 = processing(trainSpan=(1, 22), label=True) 17 | # train_set51.to_csv("data/training_rld1-22.csv", header=True, index=False) 18 | # # train_set5 = pd.read_csv("data/training_eld1-22.csv", header=0, index_col=None, usecols=use_feature) 19 | # print(train_set51.describe()) 20 | # print("begin to load the trainset5") 21 | # train_set5 = processing(trainSpan=(1, 21), label=True) 22 | # train_set5.to_csv("data/training_rld1-21.csv", header=True, index=False) 23 | # # train_set5 = pd.read_csv("data/training_eld1-21.csv", header=0, index_col=None, usecols=use_feature) 24 | # print(train_set5.describe()) 25 | print("begin to load the trainset41") 26 | train_set41 = processing(trainSpan=(1, 20), label=True) 27 | train_set41.to_csv("../data/data_v4/training_rld1-20.csv", header=True, index=False) 28 | # train_set41 = pd.read_csv("data/training_eld1-20.csv", header=0, index_col=None, usecols=use_feature) 29 | print(train_set41.describe()) 30 | print("begin to load the trainset4") 31 | train_set4 = processing(trainSpan=(1, 19), label=True) 32 | 
train_set4.to_csv("../data/data_v4/training_rld1-19.csv", header=True, index=False) 33 | # train_set4 = pd.read_csv("data/training_eld1-19.csv", header=0, index_col=None, usecols=use_feature) 34 | print(train_set4.describe()) 35 | print("begin to load the trainset2") 36 | train_set2 = processing(trainSpan=(1, 15), label=True) 37 | train_set2.to_csv("../data/data_v4/training_rld1-15.csv", header=True, index=False) 38 | # train_set2 = pd.read_csv("data/training_eld1-15.csv", header=0, index_col=None, usecols=use_feature) 39 | print(train_set2.describe()) 40 | print("begin to load the trainset21") 41 | train_set21 = processing(trainSpan=(1, 16), label=True) 42 | train_set21.to_csv("../data/data_v4/training_rld1-16.csv", header=True, index=False) 43 | # train_set21 = pd.read_csv("data/training_eld1-16.csv", header=0, index_col=None, usecols=use_feature) 44 | print(train_set21.describe()) 45 | print("begin to load the trainset3") 46 | train_set3 = processing(trainSpan=(1, 17), label=True) 47 | train_set3.to_csv("../data/data_v4/training_rld1-17.csv", header=True, index=False) 48 | # train_set3 = pd.read_csv("data/training_eld1-17.csv", header=0, index_col=None, usecols=use_feature) 49 | print(train_set3.describe()) 50 | print("begin to load the trainset31") 51 | train_set31 = processing(trainSpan=(1, 18), label=True) 52 | train_set31.to_csv("../data/data_v4/training_rld1-18.csv", header=True, index=False) 53 | # train_set3 = pd.read_csv("data/training_eld1-18.csv", header=0, index_col=None, usecols=use_feature) 54 | print(train_set31.describe()) 55 | 56 | 57 | -------------------------------------------------------------------------------- /utilspy/kpca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | from sklearn.datasets import make_circles 4 | from sklearn.model_selection import train_test_split 5 | import matplotlib.pyplot as plt 6 | from multiprocessing import Pool 7 | from sklearn.metrics.pairwise import rbf_kernel 8 | 9 | """ 10 | Implementation of methods from the paper 11 | Kernel PCA and De-noising in feature spaces. 12 | Each function has a comment above it which contains 13 | "(e)" where e denotes the corresponding equation from 14 | the paper. 
15 | """ 16 | 17 | 18 | def gaussianKernel(x, y, c): 19 | ''' Returns K(x,y) where K denotes gaussian kernel ''' 20 | return math.exp(-(np.sqrt(np.dot(x - y, (x - y).conj())) ** 2) / c) 21 | 22 | 23 | # return math.exp(-(np.linalg.norm(x-y)**2) / c) 24 | 25 | def createK(data, c): 26 | ''' Returns K matrix containing inner products of the data using the kernel function 27 | so that K_ij := (phi(x_i)*phi(x_j)) ''' 28 | return rbf_kernel(data, gamma=1 / c) 29 | 30 | 31 | def createKOld(data, kernelFunction, c): 32 | ''' Returns K matrix containing inner products of the data using the kernel function 33 | so that K_ij := (phi(x_i)*phi(x_j)) ''' 34 | return rbf_kernel(data, gamma=1 / c) 35 | 36 | 37 | # l = len(data) 38 | # K = np.zeros((l,l)) 39 | # for col in range(l): 40 | # for row in range(l): 41 | # K[row][col] = kernelFunction(data[row],data[col], c) 42 | # return K 43 | 44 | def calcBetaKOld(alphaK, data, x, c): 45 | ''' Returns the projection of x onto the eigenvector V_k ''' 46 | BetaK = 0 47 | # print 'data.shape',data.shape 48 | # print 'x.shape', x.shape 49 | kernelVals = rbf_kernel(data, x.reshape(1, -1), 1 / c) 50 | for i, xi in enumerate(data): 51 | # BetaK += alphaK[i]*kernelFunction(xi,x,c) 52 | BetaK += alphaK[i] * kernelVals[i][0] 53 | return BetaK 54 | 55 | 56 | def calcBetaK(alphaK, kernelVals): 57 | ''' Returns the projection of x onto the eigenvector V_k ''' 58 | BetaK = 0 59 | BetaK = np.sum(alphaK * kernelVals) 60 | return BetaK 61 | 62 | 63 | def centerK(K): 64 | ''' Returns centered K matrix, see K. Murphy 14.43 ''' 65 | l = len(K) 66 | l_ones = np.ones((l, l), dtype=int) / l 67 | Kcentered = K - np.dot(l_ones, K) - np.dot(K, l_ones) + np.dot(l_ones, np.dot(K, l_ones)) 68 | return Kcentered 69 | 70 | 71 | def normAlpha(alpha, lambdas): 72 | ''' Returns new alpha corresponding to normalized eigen vectors, 73 | so that lambda_k(a^k * a^k) = 1 ''' 74 | for i, a in enumerate(alpha): 75 | a /= np.sqrt(lambdas[i]) 76 | return alpha 77 | 78 | 79 | # def calcZold(alpha, data, x, kernelFunction, c,z0): 80 | # ''' Equation (10), returns pre-image z for single input datapoint x ''' 81 | # z = z0 82 | # iters=0 83 | # while iters <5: 84 | # numerator = 0 85 | # denom = 0 86 | # for i, xi in enumerate(data): 87 | # gammaI = calcGammaI(alpha, i, data, x, kernelFunction, c) * kernelFunction(z,xi,c) 88 | # numerator += gammaI * xi 89 | # denom += gammaI 90 | # z = numerator/denom 91 | # iters +=1 92 | # return z 93 | 94 | def calcZWrapper(args): 95 | return calcZ(*args) 96 | 97 | 98 | def calcZ(alpha, data, x, K, c, z0, idx): 99 | ''' Equation (10), returns pre-image z for single input datapoint x ''' 100 | z = z0 101 | iters = 0 102 | maxIters = 10 103 | # calculate beta, gamma (do not change with each iteration) 104 | beta = [calcBetaKOld(aK, data, x, c) for aK in alpha] 105 | gamma = [calcGammaIOpt(alpha, i, beta) for i in range(len(data))] 106 | 107 | while iters < maxIters: # iterate until convergence 108 | numerator = 0 109 | denom = 0 110 | k = rbf_kernel(data, z.reshape(1, -1), 1 / c) 111 | for i, xi in enumerate(data): 112 | gammaI = gamma[i] * k[i][0] 113 | numerator += gammaI * xi 114 | denom += gammaI 115 | if denom > 10 ** -12: # handling numerical instability 116 | newZ = numerator / denom 117 | """ 118 | if np.linalg.norm(z - newZ) < 10**-8: # convergence definition 119 | z = newZ 120 | break 121 | """ 122 | z = newZ 123 | iters += 1 124 | else: 125 | # print "restarted point" 126 | iters = 0 127 | z = z0 + np.random.multivariate_normal(np.zeros(z0.size), 
np.identity(z0.size)) 128 | numerator = 0 129 | denom = 0 130 | 131 | # print "iters:", iters 132 | return z 133 | 134 | 135 | # def calcGammaI(alpha, i, data, x, kernelFunction, c): 136 | # ''' returns gamma_i = sum_{k=1}^n Beta_k * alpha_i^k ''' 137 | # gammaI = 0 138 | # alphaI = alpha.T[i] 139 | # for k, alphaKI in enumerate(alphaI): 140 | # gammaI += calcBetaK(alpha[k], kernelFunction, data, x, c) * alphaKI 141 | # return gammaI 142 | 143 | def calcGammaIOpt(alpha, i, beta): 144 | ''' returns gamma_i = sum_{k=1}^n beta_k * alpha_i^k ''' 145 | gammaI = 0 146 | alphaI = alpha.T[i] 147 | for k, alphaKI in enumerate(alphaI): 148 | gammaI += beta[k] * alphaKI 149 | return gammaI 150 | 151 | 152 | def kernelPCADeNoise(kernelFunction, c, components, dataTrain, dataTest): 153 | Data = dataTrain 154 | 155 | l = len(Data) 156 | 157 | # build K 158 | # K = createK(Data, kernelFunction, c) 159 | K = createK(Data, c) 160 | 161 | # center K 162 | K = centerK(K) 163 | 164 | # find eigen vectors 165 | lLambda, alpha = np.linalg.eigh(K) # (3) 166 | lambdas = lLambda / l # /l with the notation from the paper (but not murphys) 167 | # drop negative and 0 eigenvalues and their vectors 168 | for i, l in enumerate(lambdas): 169 | if l > 10 ** (-8): 170 | lambdas = lambdas[i:] 171 | alpha = alpha[i:] 172 | break 173 | 174 | # use only the components largest eigenvalues with corresponding vectors 175 | lambdas = lambdas[-components:] 176 | alpha = alpha[-components:] 177 | 178 | # normalize alpha 179 | alpha = normAlpha(alpha, lambdas) 180 | 181 | # p=Pool() 182 | # Z = p.map(calcZWrapper, [(alpha, Data, x, K, c, x, i) for i, x in enumerate(dataTest)]) 183 | 184 | Z = [] 185 | for i in range(len(dataTest)): 186 | # print i 187 | Z.append(calcZ(alpha, Data, dataTest[i], K, c, dataTest[i], i)) 188 | 189 | Z = np.array(Z) 190 | return Z 191 | 192 | 193 | # if __name__ == '__main__': 194 | # # hyperparameters 195 | # c = 0.5 196 | # 197 | # # For half-circle toy example 198 | # X, y = make_circles(n_samples=600, factor=.3, noise=.05) 199 | # X = np.array([x for i, x in enumerate(X) if x[1] > 0 and not y[i]]) 200 | # Xtrain, Xtest = train_test_split(X, test_size=0.9) 201 | # 202 | # Z = kernelPCADeNoise(gaussianKernel, c, 1, Xtrain, Xtest) 203 | # 204 | # plt.plot(Xtrain.T[0], Xtrain.T[1], 'ro') 205 | # plt.plot(Z.T[0], Z.T[1], 'go') 206 | # plt.show() -------------------------------------------------------------------------------- /utilspy/kuaishou_stats2.csv: -------------------------------------------------------------------------------- 1 | ,user_id,register_day_type_rate,register_day_type_ratio,register_day_device_ratio,register_type_ratio,register_type_device,register_type_device_ratio,register_day_device_rate,device_type_ratio,device_type_register_ratio,register_day_register_type_device_ratio,register_day_device_type_register_ratio 2 | count,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0 3 | mean,686252.6029829077,547.1040718009635,0.3495839834213257,0.010990312322974205,0.344575434923172,1255.2387316043028,0.012078322470188141,16.82571108031413,0.010135618969798088,0.4458794593811035,0.018300948664546013,0.5927771329879761 4 | std,396370.4801494953,239.88014453171334,0.14129890501499176,0.013702892698347569,0.13428840041160583,321.2693523491565,0.017135903239250183,20.896847090204183,0.012916702777147293,0.21795567870140076,0.03660264611244202,0.290801078081131 5 | 
min,16.0,1.0,0.0004636068479157984,0.0004636068479157984,0.00013198706437833607,3.0,7.117944187484682e-05,1.0,3.299676609458402e-05,0.000681198900565505,0.0010857763700187206,0.00917431153357029 6 | 25%,343746.5,355.0,0.26530611515045166,0.001303780940361321,0.32013463973999023,1315.0,0.0011337868636474013,2.0,0.0008249191450886428,0.29323309659957886,0.002816901309415698,0.3636363744735718 7 | 50%,685296.0,598.0,0.3815484344959259,0.004965859930962324,0.32013463973999023,1315.0,0.004911381751298904,7.0,0.0038936184719204903,0.46341463923454285,0.007547169923782349,0.5384615659713745 8 | 75%,1031319.75,754.0,0.4686369001865387,0.01543460600078106,0.46357157826423645,1516.0,0.015517118386924267,25.0,0.01484854519367218,0.5435967445373535,0.021314388141036034,1.0 9 | max,1367532.0,921.0,0.5199321508407593,0.05910735949873924,0.46357157826423645,1516.0,0.5,109.0,0.048439253121614456,1.0,1.0,1.0 10 | ,user_id,register_day_type_rate,register_day_type_ratio,register_day_device_ratio,register_type_ratio,register_type_device,register_type_device_ratio,register_day_device_rate,device_type_ratio,device_type_register_ratio,register_day_register_type_device_ratio,register_day_device_type_register_ratio 11 | count,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0 12 | mean,686252.6029829077,547.1040718009635,0.3495839834213257,0.010990312322974205,0.344575434923172,1255.2387316043028,0.012078322470188141,16.82571108031413,0.010135618969798088,0.4458794593811035,0.018300948664546013,0.5927771329879761 13 | std,396370.4801494953,239.88014453171334,0.14129890501499176,0.013702892698347569,0.13428840041160583,321.2693523491565,0.017135903239250183,20.896847090204183,0.012916702777147293,0.21795567870140076,0.03660264611244202,0.290801078081131 14 | min,16.0,1.0,0.0004636068479157984,0.0004636068479157984,0.00013198706437833607,3.0,7.117944187484682e-05,1.0,3.299676609458402e-05,0.000681198900565505,0.0010857763700187206,0.00917431153357029 15 | 25%,343746.5,355.0,0.26530611515045166,0.001303780940361321,0.32013463973999023,1315.0,0.0011337868636474013,2.0,0.0008249191450886428,0.29323309659957886,0.002816901309415698,0.3636363744735718 16 | 50%,685296.0,598.0,0.3815484344959259,0.004965859930962324,0.32013463973999023,1315.0,0.004911381751298904,7.0,0.0038936184719204903,0.46341463923454285,0.007547169923782349,0.5384615659713745 17 | 75%,1031319.75,754.0,0.4686369001865387,0.01543460600078106,0.46357157826423645,1516.0,0.015517118386924267,25.0,0.01484854519367218,0.5435967445373535,0.021314388141036034,1.0 18 | max,1367532.0,921.0,0.5199321508407593,0.05910735949873924,0.46357157826423645,1516.0,0.5,109.0,0.048439253121614456,1.0,1.0,1.0 19 | -------------------------------------------------------------------------------- /utilspy/util_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | def missing_values_table(df): 4 | """Function to calculate missing values by column""" 5 | # Total missing values 6 | mis_val = df.isnull().sum() 7 | 8 | # Percentage of missing values 9 | mis_val_percent = 100 * df.isnull().sum() / len(df) 10 | 11 | # Make a table with the results 12 | mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1) 13 | 14 | # Rename the columns 15 | mis_val_table_ren_columns = mis_val_table.rename( 16 | columns={0: 'Missing Values', 1: '% of Total Values'}) 17 | 18 | # Sort the table by percentage of missing descending 19 | mis_val_table_ren_columns = 
mis_val_table_ren_columns[ 20 | mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values( 21 | '% of Total Values', ascending=False).round(3) 22 | 23 | # Print some summary information 24 | print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n" 25 | "There are " + str(mis_val_table_ren_columns.shape[0]) + 26 | " columns that have missing values.") 27 | # Return the dataframe with missing information 28 | return mis_val_table_ren_columns 29 | # Function to calculate correlations with the target for a dataframe 30 | def target_corrs(df,target="label",method="pearson"): 31 | # List of correlations 32 | corrs = [] 33 | 34 | # Iterate through the columns 35 | for col in df.columns: 36 | # print(col) 37 | # Skip the target column 38 | if col != target: 39 | # Calculate correlation with the target 40 | corr = df[target].corr(df[col],method=method) 41 | print('The correlation between %s and the TARGET is %0.4f' % (col, corr)) 42 | # Append the list as a tuple 43 | corrs.append((col, corr)) 44 | 45 | # Sort by absolute magnitude of correlations 46 | corrs = sorted(corrs, key=lambda x: abs(x[1]), reverse=True) 47 | 48 | return corrs 49 | # Function to calculate mutual information with the target for a dataframe 50 | def target_mi(df,target="label"): 51 | from sklearn.feature_selection import mutual_info_classif 52 | # List of mutual information scores 53 | mis = [] 54 | 55 | # Iterate through the columns 56 | for col in df.columns: 57 | # print(col) 58 | # Skip the target column 59 | if col != target: 60 | # Calculate mutual information with the target (X must be 2D) 61 | mi = mutual_info_classif(df[[col]].values,df[target].values)[0] 62 | print('The mutual information between %s and the TARGET is %0.4f' % (col, mi)) 63 | # Append the list as a tuple 64 | mis.append((col, mi)) 65 | 66 | # Sort by absolute magnitude of mutual information 67 | corrs = sorted(mis, key=lambda x: abs(x[1]), reverse=True) 68 | 69 | return corrs 70 | 71 | 72 | -------------------------------------------------------------------------------- /utilspy/utils_feature_engineering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import shap 4 | 5 | def collinear_columns_to_remove(df,threshold=0.8,method="pearson"): 6 | from sklearn.feature_selection import mutual_info_classif,mutual_info_regression 7 | if method in ["pearson","kendall","spearman"]: 8 | corrs = df.corr(method=method) 9 | elif method == "mi_classif": 10 | # List of mutual informations 11 | mis = [] 12 | 13 | # Iterate through the columns 14 | for col in df.columns: 15 | # print(col) 16 | # Calculate correlation with the target 17 | mi = np.reshape(mutual_info_classif(df.values, df[col].values), -1).tolist() 18 | # Append the list as a tuple 19 | mis.append(mi) 20 | corrs = pd.DataFrame(np.array(mis), columns=df.columns, index=df.columns) 21 | elif method == "mi_regression": 22 | # List of mutual informations 23 | mis = [] 24 | 25 | # Iterate through the columns 26 | for col in df.columns: 27 | # print(col) 28 | # Calculate correlation with the target 29 | mi = np.reshape(mutual_info_regression(df.values, df[col].values), -1).tolist() 30 | # Append the list as a tuple 31 | mis.append(mi) 32 | corrs = pd.DataFrame(np.array(mis), columns=df.columns, index=df.columns) 33 | # Set the threshold 34 | threshold = threshold 35 | 36 | # Empty dictionary to hold correlated variables 37 | above_threshold_vars = {} 38 | 39 | # For each column, record the variables that are above the threshold 40 | for col in corrs: 41 |
above_threshold_vars[col] = list(corrs.index[corrs[col] > threshold]) 42 | # Track columns to remove and columns already examined 43 | cols_to_remove = [] 44 | cols_seen = [] 45 | cols_to_remove_pair = [] 46 | 47 | # Iterate through columns and correlated columns 48 | for key, value in above_threshold_vars.items(): 49 | # Keep track of columns already examined 50 | cols_seen.append(key) 51 | for x in value: 52 | if x == key: 53 | next 54 | else: 55 | # Only want to remove one in a pair 56 | if x not in cols_seen: 57 | cols_to_remove.append(x) 58 | cols_to_remove_pair.append(key) 59 | 60 | cols_to_remove = list(set(cols_to_remove)) 61 | print('Number of columns to remove: ', len(cols_to_remove)) 62 | return cols_to_remove 63 | 64 | 65 | def remove_missing_columns(train, test, threshold=90): 66 | # Calculate missing stats for train and test (remember to calculate a percent!) 67 | train_miss = pd.DataFrame(train.isnull().sum()) 68 | train_miss['percent'] = 100 * train_miss[0] / len(train) 69 | 70 | test_miss = pd.DataFrame(test.isnull().sum()) 71 | test_miss['percent'] = 100 * test_miss[0] / len(test) 72 | 73 | # list of missing columns for train and test 74 | missing_train_columns = list(train_miss.index[train_miss['percent'] > threshold]) 75 | missing_test_columns = list(test_miss.index[test_miss['percent'] > threshold]) 76 | 77 | # Combine the two lists together 78 | missing_columns = list(set(missing_train_columns + missing_test_columns)) 79 | 80 | # Print information 81 | print('There are %d columns with greater than %d%% missing values.' % (len(missing_columns), threshold)) 82 | 83 | # Drop the missing columns and return 84 | train = train.drop(columns=missing_columns) 85 | test = test.drop(columns=missing_columns) 86 | 87 | return train, test -------------------------------------------------------------------------------- /utilspy/utils_misc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from joblib import Parallel, delayed 4 | def reduce_mem_usage(df): 5 | """ iterate through all the columns of a dataframe and modify the input type 6 | to reduce memory usage. 
7 | """ 8 | start_mem = df.memory_usage().sum() / 1024 ** 2 9 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem)) 10 | 11 | for col in df.columns: 12 | col_type = df[col].dtype 13 | 14 | if col_type != object: 15 | c_min = df[col].min() 16 | c_max = df[col].max() 17 | if str(col_type)[:3] == 'int': 18 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 19 | df[col] = df[col].astype(np.int8) 20 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 21 | df[col] = df[col].astype(np.int16) 22 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 23 | df[col] = df[col].astype(np.int32) 24 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 25 | df[col] = df[col].astype(np.int64) 26 | else: 27 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 28 | df[col] = df[col].astype(np.float16) 29 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 30 | df[col] = df[col].astype(np.float32) 31 | else: 32 | df[col] = df[col].astype(np.float64) 33 | else: 34 | if df[col].nunique() < len(df[col]) >> 1: 35 | df[col] = df[col].astype('category') 36 | 37 | end_mem = df.memory_usage().sum() / 1024 ** 2 38 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem)) 39 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem)) 40 | 41 | return df 42 | 43 | class Reducer: 44 | """ 45 | Class that takes a dict of increasingly bigger numpy datatypes to transform 46 | the input of a pandas dataframe in order to save memory usage. 47 | """ 48 | memory_scale_factor = 1024**2 # memory in MB 49 | 50 | def __init__(self, conv_table=None): 51 | """ 52 | :param conv_table: dict with np.dtypes-strings as keys 53 | """ 54 | if conv_table is None: 55 | self.conversion_table = \ 56 | {'int': [np.int8, np.int16, np.int32, np.int64], 57 | 'uint': [np.uint8, np.uint16, np.uint32, np.uint64], 58 | 'float': [np.float16, np.float32, ]} 59 | else: 60 | self.conversion_table = conv_table 61 | 62 | def _type_candidates(self, k): 63 | for c in self.conversion_table[k]: 64 | i = np.iinfo(c) if 'int' in k else np.finfo(c) 65 | yield c, i 66 | 67 | def reduce(self, df, verbose=False): 68 | """Takes a dataframe and returns it with all input transformed to the 69 | smallest necessary types. 
70 | 71 | :param df: pandas dataframe 72 | :param verbose: If True, outputs more information 73 | :return: pandas dataframe with reduced input types 74 | """ 75 | if verbose: 76 | start_mem = df.memory_usage().sum() / 1024**2 77 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem)) 78 | ret_list = Parallel(n_jobs=-1)(delayed(self._reduce) 79 | (df[c], c, verbose) for c in 80 | df.columns) 81 | df = pd.concat(ret_list, axis=1) 82 | if verbose: 83 | end_mem = df.memory_usage().sum() / 1024**2 84 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem)) 85 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem)) 86 | return df 87 | 88 | def _reduce(self, s, colname, verbose): 89 | 90 | # skip NaNs 91 | if s.isnull().any(): 92 | if verbose: 93 | print(colname, 'has NaNs - Skip..') 94 | return s 95 | 96 | # detect kind of type 97 | coltype = s.dtype 98 | if np.issubdtype(coltype, np.integer): 99 | conv_key = 'int' if s.min() < 0 else 'uint' 100 | elif np.issubdtype(coltype, np.floating): 101 | conv_key = 'float' 102 | else: 103 | if s.nunique()<(len(s)>>1): 104 | return s.astype('category') 105 | if verbose: 106 | print(colname, 'is', coltype, '- Skip..') 107 | print(colname, 'is', coltype, '- Skip..') 108 | return s 109 | 110 | # find right candidate 111 | for cand, cand_info in self._type_candidates(conv_key): 112 | if s.max() <= cand_info.max and s.min() >= cand_info.min: 113 | if verbose: 114 | print('convert', colname, 'to', str(cand)) 115 | return s.astype(cand) 116 | 117 | # reaching this code is bad. Probably there are inf, or other high numbs 118 | print(("WARNING: {} " 119 | "doesn't fit the grid with \nmax: {} " 120 | "and \nmin: {}").format(colname, s.max(), s.min())) 121 | print('Dropping it..') 122 | def import_data(file,header=0,index_col=None): 123 | """readin a file as dataframe and optimize its memory usage""" 124 | df = pd.read_csv(file, header=header,index_col=index_col,parse_dates=True, keep_date_col=True) 125 | reducer = Reducer() 126 | df = reducer.reduce(df,verbose=True) 127 | # df = reduce_mem_usage(df) 128 | return df 129 | def convert_data(df): 130 | """create a dataframe and optimize its memory usage""" 131 | reducer = Reducer() 132 | df = reducer.reduce(df,verbose=True) 133 | # df = reduce_mem_usage(df) 134 | return df -------------------------------------------------------------------------------- /utilspy/utils_models.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.metrics import roc_auc_score 6 | from sklearn.model_selection import KFold 7 | from sklearn.preprocessing import LabelEncoder 8 | import lightgbm as lgb 9 | 10 | 11 | def model(features, test_features,target="label", encoding='ohe', n_folds=5): 12 | """Train and test a light gradient boosting model using 13 | cross validation. 14 | 15 | Parameters 16 | -------- 17 | features (pd.DataFrame): 18 | dataframe of training features to use 19 | for training a model. Must include the TARGET column. 20 | test_features (pd.DataFrame): 21 | dataframe of testing features to use 22 | for making predictions with the model. 23 | encoding (str, default = 'ohe'): 24 | method for encoding categorical variables. 
Either 'ohe' for one-hot encoding or 'le' for integer label encoding 25 | n_folds (int, default = 5): number of folds to use for cross validation 26 | 27 | Return 28 | -------- 29 | submission (pd.DataFrame): 30 | dataframe with `SK_ID_CURR` and `TARGET` probabilities 31 | predicted by the model. 32 | feature_importances (pd.DataFrame): 33 | dataframe with the feature importances from the model. 34 | valid_metrics (pd.DataFrame): 35 | dataframe with training and validation metrics (ROC AUC) for each fold and overall. 36 | 37 | """ 38 | # Extract the ids 39 | train_ids = features['id'] 40 | test_ids = test_features['id'] 41 | 42 | # Extract the labels for training 43 | labels = features[target] 44 | 45 | # Remove the ids and target 46 | features = features.drop(columns=['id', target]) 47 | test_features = test_features.drop(columns=['id']) 48 | 49 | # One Hot Encoding 50 | if encoding == 'ohe': 51 | features = pd.get_dummies(features) 52 | test_features = pd.get_dummies(test_features) 53 | 54 | # Align the dataframes by the columns 55 | features, test_features = features.align(test_features, join='inner', axis=1) 56 | 57 | # No categorical indices to record 58 | cat_indices = 'auto' 59 | 60 | # Integer label encoding 61 | elif encoding == 'le': 62 | # Create a label encoder 63 | label_encoder = LabelEncoder() 64 | 65 | # List for storing categorical indices 66 | cat_indices = [] 67 | 68 | # Iterate through each column 69 | for i, col in enumerate(features): 70 | if features[col].dtype == 'object': 71 | # Map the categorical features to integers 72 | features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,))) 73 | test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,))) 74 | 75 | # Record the categorical indices 76 | cat_indices.append(i) 77 | 78 | # Catch error if label encoding scheme is not valid 79 | else: 80 | raise ValueError("Encoding must be either 'ohe' or 'le'") 81 | 82 | print('Training Data Shape: ', features.shape) 83 | print('Testing Data Shape: ', test_features.shape) 84 | 85 | # Extract feature names 86 | feature_names = list(features.columns) 87 | 88 | # Convert to np arrays 89 | features = np.array(features) 90 | test_features = np.array(test_features) 91 | 92 | # Create the kfold object 93 | k_fold = KFold(n_splits=n_folds, shuffle=False, random_state=50) 94 | 95 | # Empty array for feature importances 96 | feature_importance_values = np.zeros(len(feature_names)) 97 | # Empty array for test predictions 98 | test_predictions = np.zeros(test_features.shape[0]) 99 | 100 | # Empty array for out of fold validation predictions 101 | out_of_fold = np.zeros(features.shape[0]) 102 | 103 | # Lists for recording validation and training scores 104 | valid_scores = [] 105 | train_scores = [] 106 | 107 | # Iterate through each fold 108 | for train_indices, valid_indices in k_fold.split(features): 109 | # Training input for the fold 110 | train_features, train_labels = features[train_indices], labels[train_indices] 111 | # Validation input for the fold 112 | valid_features, valid_labels = features[valid_indices], labels[valid_indices] 113 | 114 | # Create the model 115 | model = lgb.LGBMClassifier(n_estimators=10000, objective='binary', 116 | class_weight='balanced', learning_rate=0.05, 117 | reg_alpha=0.1, reg_lambda=0.1, 118 | subsample=0.8, n_jobs=-1, random_state=50) 119 | 120 | # Train the model 121 | model.fit(train_features, train_labels, eval_metric='auc', 122 | eval_set=[(valid_features, 
valid_labels), (train_features, train_labels)], 123 | eval_names=['valid', 'train'], categorical_feature=cat_indices, 124 | early_stopping_rounds=100, verbose=200) 125 | 126 | # Record the best iteration 127 | best_iteration = model.best_iteration_ 128 | 129 | # Record the feature importances 130 | feature_importance_values += model.feature_importances_ / k_fold.n_splits 131 | # Make predictions 132 | test_predictions += model.predict_proba(test_features, num_iteration=best_iteration)[:, 1] / k_fold.n_splits 133 | 134 | # Record the out of fold predictions 135 | out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration=best_iteration)[:, 1] 136 | 137 | # Record the best score 138 | valid_score = model.best_score_['valid']['auc'] 139 | train_score = model.best_score_['train']['auc'] 140 | 141 | valid_scores.append(valid_score) 142 | train_scores.append(train_score) 143 | 144 | # Clean up memory 145 | gc.enable() 146 | del model, train_features, valid_features 147 | gc.collect() 148 | 149 | # Make the submission dataframe 150 | submission = pd.DataFrame({'id': test_ids, target: test_predictions}) 151 | 152 | # Make the feature importance dataframe 153 | feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values}) 154 | 155 | # Overall validation score 156 | valid_auc = roc_auc_score(labels, out_of_fold) 157 | 158 | # Add the overall scores to the metrics 159 | valid_scores.append(valid_auc) 160 | train_scores.append(np.mean(train_scores)) 161 | 162 | # Needed for creating dataframe of validation scores 163 | fold_names = list(range(n_folds)) 164 | fold_names.append('overall') 165 | # Dataframe of validation scores 166 | metrics = pd.DataFrame({'fold': fold_names, 167 | 'train': train_scores, 168 | 'valid': valid_scores}) 169 | 170 | return submission, feature_importances, metrics -------------------------------------------------------------------------------- /utilspy/utils_plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | # Plots the disribution of a variable colored by value of the target 5 | def corr_coefficient(df,method="pearson"): 6 | import plotly.graph_objs as pgo 7 | import plotly.offline as po 8 | data = [ 9 | pgo.Heatmap( 10 | z=df.corr(method=method).values, 11 | x=df.columns.values, 12 | y=df.columns.values, 13 | colorscale='Viridis', 14 | reversescale=False, 15 | text=True, 16 | opacity=1.0) 17 | ] 18 | 19 | layout = pgo.Layout( 20 | title=method+' Correlation of features', 21 | xaxis=dict(ticks='', nticks=36), 22 | yaxis=dict(ticks=''), 23 | width=900, height=700, 24 | margin=dict( 25 | l=240, 26 | ), ) 27 | 28 | fig = pgo.Figure(data=data, layout=layout) 29 | po.iplot(fig, filename='labelled-heatmap') 30 | 31 | 32 | def kde_target(var_name, df,target="label"): 33 | import matplotlib.pyplot as plt # for plotting 34 | import seaborn as sns # for making plots with seaborn 35 | # Calculate the correlation coefficient between the new variable and the target 36 | corr = df[target].corr(df[var_name]) 37 | 38 | # Calculate medians for repaid vs not repaid 39 | avg_repaid = df.ix[df[target] == 0, var_name].median() 40 | avg_not_repaid = df.ix[df[target] == 1, var_name].median() 41 | 42 | plt.figure(figsize=(12, 6)) 43 | 44 | # Plot the distribution for target == 0 and target == 1 45 | sns.kdeplot(df.ix[df[target] == 0, var_name], label='label == 0') 46 | sns.kdeplot(df.ix[df[target] == 1, 
--------------------------------------------------------------------------------
/utilspy/utils_plot.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cufflinks  # registers the DataFrame.iplot accessor used by the value_count_* helpers


# Plots the correlation matrix of a dataframe as a plotly heatmap
def corr_coefficient(df, method="pearson"):
    import plotly.graph_objs as pgo
    import plotly.offline as po
    data = [
        pgo.Heatmap(
            z=df.corr(method=method).values,
            x=df.columns.values,
            y=df.columns.values,
            colorscale='Viridis',
            reversescale=False,
            opacity=1.0)
    ]

    layout = pgo.Layout(
        title=method + ' Correlation of features',
        xaxis=dict(ticks='', nticks=36),
        yaxis=dict(ticks=''),
        width=900, height=700,
        margin=dict(
            l=240,
        ), )

    fig = pgo.Figure(data=data, layout=layout)
    po.iplot(fig, filename='labelled-heatmap')


# Plots the distribution of a variable colored by value of the target
def kde_target(var_name, df, target="label"):
    import seaborn as sns  # for making plots with seaborn

    # Calculate the correlation coefficient between the variable and the target
    corr = df[target].corr(df[var_name])

    # Calculate the medians for each class of the target
    median_label0 = df.loc[df[target] == 0, var_name].median()
    median_label1 = df.loc[df[target] == 1, var_name].median()

    plt.figure(figsize=(12, 6))

    # Plot the distribution for target == 0 and target == 1
    sns.kdeplot(df.loc[df[target] == 0, var_name], label='label == 0')
    sns.kdeplot(df.loc[df[target] == 1, var_name], label='label == 1')

    # Label the plot
    plt.xlabel(var_name)
    plt.ylabel('Density')
    plt.title('%s Distribution' % var_name)
    plt.legend()

    # Print out the correlation
    print('The correlation between %s and %s is %0.4f' % (var_name, target, corr))
    # Print out the median values
    print('Median value for %s == 1 = %0.4f' % (target, median_label1))
    print('Median value for %s == 0 = %0.4f' % (target, median_label0))


def value_count_bar_default(df, series_name, title="value count bar"):
    temp = df[series_name].value_counts()
    temp = pd.DataFrame({'labels': temp.index,
                         'values': temp.values
                         })
    temp.iplot(kind='bar', xTitle=series_name, yTitle="Count",
               title=title, colors=['#75e575'])


# Distribution of the register_day column of the user register log
def register_day_distribution(df_user_register):
    import seaborn as sns
    plt.figure(figsize=(12, 5))
    plt.title("Distribution of register day")
    ax = sns.distplot(df_user_register["register_day"])
    return ax


def value_count_bar(df, series_name, title="value count plot"):
    import plotly.graph_objs as pgo
    import plotly.offline as po
    temp = df[series_name].value_counts()
    # print("Total number of values : ", len(temp))
    trace = pgo.Bar(
        x=temp.index,
        y=(temp / temp.sum()) * 100,
    )
    data = [trace]
    layout = pgo.Layout(
        title=title,
        xaxis=dict(
            title=series_name,
            tickfont=dict(
                size=14,
                color='rgb(107, 107, 107)'
            )
        ),
        yaxis=dict(
            title='Count of ' + series_name + ' in %',
            titlefont=dict(
                size=16,
                color='rgb(107, 107, 107)'
            ),
            tickfont=dict(
                size=14,
                color='rgb(107, 107, 107)'
            )
        )
    )
    fig = pgo.Figure(data=data, layout=layout)
    po.iplot(fig, filename=series_name)


def value_count_pie(df, series_name, title="value count pie", hole=0.0):
    temp = df[series_name].value_counts()
    counts = pd.DataFrame({'labels': temp.index,
                           'values': temp.values
                           })
    counts.iplot(kind='pie', labels='labels', values='values', title=title, hole=hole)


def value_count_hole_pie(df, series_name, title="value count hole pie"):
    from plotly.offline import iplot
    temp = df[series_name].value_counts()
    fig = {
        "data": [
            {
                "values": temp.values,
                "labels": temp.index,
                "domain": {"x": [0, .48]},
                # "name": "Types of Loans",
                # "hoverinfo": "label+percent+name",
                "hole": .7,
                "type": "pie"
            },
        ],
        "layout": {
            "title": title,
            "annotations": [
                {
                    "font": {
                        "size": 20
                    },
                    "showarrow": False,
                    "text": series_name,
                    "x": 0.17,
                    "y": 0.5
                }
            ]
        }
    }
    iplot(fig, filename='donut')


def value_count_bar_with_target(df, series_name, target, title="value count with regard to target"):
    import plotly.graph_objs as pgo
    from plotly.offline import iplot
    temp = df[series_name].value_counts()
    # print(temp.values)
    temp_y0 = []
    temp_y1 = []
    for val in temp.index:
        temp_y1.append(np.sum(df[target][df[series_name] == val] == 1))
        temp_y0.append(np.sum(df[target][df[series_name] == val] == 0))
    trace1 = pgo.Bar(
        x=temp.index,
        y=(np.array(temp_y1) / temp.sum()) * 100,
        name='YES'
    )
    trace2 = pgo.Bar(
        x=temp.index,
        y=(np.array(temp_y0) / temp.sum()) * 100,
        name='NO'
    )
    data = [trace1, trace2]
    layout = pgo.Layout(
        title=title,
        # barmode='stack',
        width=1000,
        xaxis=dict(
            title=series_name,
            tickfont=dict(
                size=14,
                color='rgb(107, 107, 107)'
            )
        ),
        yaxis=dict(
            title='Count in %',
            titlefont=dict(
                size=16,
                color='rgb(107, 107, 107)'
            ),
            tickfont=dict(
                size=14,
                color='rgb(107, 107, 107)'
            )
        )
    )

    fig = pgo.Figure(data=data, layout=layout)
    iplot(fig)


def plot_feature_importances(df):
    """
    Plot importances returned by a model. This can work with any measure of
    feature importance provided that higher importance is better.

    Args:
        df (dataframe): feature importances. Must have the features in a column
        called `feature` and the importances in a column called `importance`.

    Returns:
        Shows a plot of the 15 most important features.

        df (dataframe): feature importances sorted by importance (highest to lowest)
        with a column for normalized importance.
    """

    # Sort features according to importance
    df = df.sort_values('importance', ascending=False).reset_index()

    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize=(10, 6))
    ax = plt.subplot()

    # Need to reverse the index to plot the most important on top
    ax.barh(list(reversed(list(df.index[:15]))),
            df['importance_normalized'].head(15),
            align='center', edgecolor='k')

    # Set the yticks and labels
    ax.set_yticks(list(reversed(list(df.index[:15]))))
    ax.set_yticklabels(df['feature'].head(15))

    # Plot labeling
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()

    return df
--------------------------------------------------------------------------------
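A minimal usage sketch for the plotting helpers above, assuming a Jupyter notebook with plotly offline mode and cufflinks available; the dataframe, column names, and importance values below are hypothetical placeholders, not data from this repository.

import pandas as pd
import utils_plot  # run from within utilspy/ or with utilspy on PYTHONPATH

# Hypothetical training frame with a binary `label` column
df = pd.DataFrame({'label': [0, 1, 1, 0, 1],
                   'register_type': [1, 2, 1, 3, 2],
                   'day_count': [3.0, 12.0, 9.0, 1.0, 15.0]})

# Distribution of a feature split by the target
utils_plot.kde_target('day_count', df, target='label')

# Share of each register_type value as a pie chart
utils_plot.value_count_pie(df, 'register_type', title='register type share')

# plot_feature_importances expects columns `feature` and `importance`
fi = pd.DataFrame({'feature': ['day_count', 'register_type'],
                   'importance': [120.0, 40.0]})  # toy values
fi_sorted = utils_plot.plot_feature_importances(fi)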