├── .idea
│   ├── ActiveUserPrediction.iml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── blending
│   └── blending_v1.py
├── catboostpy
│   ├── catboost_test.py
│   ├── catboost_v7.py
│   ├── cb_v8.py
│   └── cb_v9.py
├── dataanalysispy
│   ├── data_analysis.py
│   ├── eda_v1.ipynb
│   └── get_global_file.py
├── dataprocesspy
│   ├── __pycache__
│   │   └── data_process_v7.cpython-36.pyc
│   ├── create_data.py
│   ├── create_feature_v3_nonp.py
│   ├── create_feature_v3_parallel.py
│   ├── data_process.py
│   ├── data_process_v9.py
│   ├── kuaishou_stats2.csv
│   └── ts_feature_calculators.py
├── featureselection
│   ├── feature_importance_mi.csv
│   ├── feature_importance_shap.csv
│   ├── feature_selection.py
│   └── keep_features.py
├── hardcodedpy
│   ├── hard_approach.py
│   ├── hardcode_approach.py
│   ├── hardcode_approach_v2.py
│   ├── hardcode_approach_v3.py
│   ├── merge_approach.py
│   └── new_merge.py
├── lgbpy
│   ├── kuaishou_stats.csv
│   ├── lgb_model.py
│   └── lgb_v16.py
├── lrpy
│   ├── lr_v1.py
│   └── lr_v2.py
├── model
│   └── engines.py
├── nnpy
│   ├── dnn.py
│   ├── f1_keras.py
│   ├── nn_model.py
│   ├── nn_v1.py
│   └── nn_v2.py
├── paper
│   ├── Modeling and Predicting the Active video-viewing time in a large-scale e-learning system.pdf
│   ├── The Prediction of Booking Destination on airbnb dataset.pdf
│   ├── Using Deep Learning to Predict Customer Churn in a mobile telecommunication newwork.pdf
│   ├── field-aware fatorization machine for CTR prediction.pdf
│   └── predicting airbnb user's desired travel destination.pdf
├── photos
│   ├── 16count.JPG
│   ├── 23count.JPG
│   ├── 23count3.JPG
│   ├── 24count.JPG
│   ├── 24count3.JPG
│   ├── count2.JPG
│   ├── describe.JPG
│   ├── outlier1.JPG
│   ├── registerday_count.JPG
│   ├── sample.JPG
│   └── value_count.JPG
├── quick_test.py
├── rfpy
│   └── rf_v1.py
├── rulepy
│   └── hardcode_approach.py
├── statsfile
│   ├── kuaishou_stats.csv
│   └── kuaishou_stats2.csv
├── svmpy
│   └── svm_v1.py
├── test.py
└── utilspy
    ├── calculate.py
    ├── create_data.py
    ├── kpca.py
    ├── kuaishou_stats2.csv
    ├── ts_feature_calculators.py
    ├── util_analysis.py
    ├── utils_feature_engineering.py
    ├── utils_misc.py
    ├── utils_models.py
    └── utils_plot.py
/catboostpy/cb_v8.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 | from catboost import CatBoostClassifier
5 | from sklearn.decomposition import PCA
6 | from sklearn.metrics import classification_report
7 |
8 | def predict(clf2, test_set, param, kpca):
9 | uid = pd.DataFrame()
10 | # test_set = processing(trainSpan=(1, 30), label=False)
11 | uid["user_id"] = test_set["user_id"]
12 | test_set = test_set.drop(labels=["user_id"], axis=1)
13 | test_set = kpca.transform(test_set.values)
14 | print("begin to make predictions")
15 | # res = clf2.predict_proba(test_set.values)
16 | res = clf2.predict_proba(test_set)
17 | uid["proba1"] = pd.Series(res[:, 1])
18 | uid["score"] = uid.groupby(by=["user_id"])["proba1"].transform(lambda x: sum(x) / float(len(x)))
19 | uid.drop_duplicates(subset=["user_id"], inplace=True)
20 | uid.sort_values(by=["score"], axis=0, ascending=False, inplace=True)
21 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
22 | uid_file = "../result/uid/uid_cb_" + param + "_" + str_time + ".csv"
23 | uid.to_csv(uid_file, header=True, index=False)
24 | # active_users = uid.loc[uid["score"]>0.5]["user_id"].unique().tolist()
25 | active_users = uid["user_id"][:24500].unique().tolist()
26 | # print(len(active_users))
27 | print(uid["score"].tolist()[24500])
28 | # print(active_users)
29 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
30 | submission_file = "../result/622/submission_cb_" + param + "_" + str_time + ".csv"
31 | with open(submission_file, "a", newline="") as f:
32 | writer = csv.writer(f)
33 | for i in active_users:
34 | writer.writerow([i])
35 |
36 |
 37 | # to use this module, one needs to deconstruct some of the features in data_process
38 | def run(scheme_num=1, file_name="../data/data_v3/training_e"):
39 | train_set_ls = []
40 | if scheme_num == 1:
41 | for i in [16, 17, 22, 23]:
42 | print("begin to load the dataset")
43 | file_name1 = file_name + "ld1-" + str(i) + ".csv"
44 | train_set_temp = pd.read_csv(file_name1, header=0, index_col=None)
45 | print(train_set_temp.describe())
46 | train_set_ls.append(train_set_temp)
47 | elif scheme_num == 2:
48 | for i in [16, 23]:
49 | print("begin to load the dataset")
50 | file_name2 = file_name + "ld1-" + str(i) + ".csv"
51 | train_set_temp = pd.read_csv(file_name2, header=0, index_col=None)
52 | print(train_set_temp.describe())
53 | train_set_ls.append(train_set_temp)
54 | elif scheme_num == 3:
55 | for i in [17,18, 19, 20, 21, 22, 23]:
56 | print("begin to load the dataset")
57 | file_name3 = file_name + "ld1-" + str(i) + ".csv"
58 | train_set_temp = pd.read_csv(file_name3, header=0, index_col=None)
59 | print(train_set_temp.describe())
60 | train_set_ls.append(train_set_temp)
61 | val_file_name = file_name + "ld1-23.csv"
62 | val_set = pd.read_csv(val_file_name, header=0, index_col=None)
63 | print(val_set.describe())
64 | train_set = pd.concat(train_set_ls, axis=0)
65 | ds = train_set.describe()
66 | print(ds)
67 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
68 |
69 | print("begin to drop the duplicates")
70 | train_set.drop_duplicates(subset=keep_feature, inplace=True)
71 | val_set.drop_duplicates(subset=keep_feature, inplace=True)
72 | print(train_set.describe())
73 | print(val_set.describe())
74 | train_label = train_set["label"]
75 | val_label = val_set["label"]
76 | train_set = train_set.drop(labels=["label", "user_id"], axis=1)
77 | val_set = val_set.drop(labels=["label", "user_id"], axis=1)
78 |
79 | print("begin to standardization the data")
80 | for fea in keep_feature:
81 | if train_set[fea].var() < 0.000001 or val_set[fea].var() < 0.000001:
82 | train_set.drop(labels=[fea], axis=1, inplace=True)
83 | val_set.drop(labels=[fea], axis=1, inplace=True)
84 | else:
85 | train_set[fea] = (train_set[fea] - train_set[fea].min()) / (train_set[fea].max() - train_set[fea].min())
86 | # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std())
87 | val_set[fea] = (val_set[fea] - val_set[fea].min()) / (val_set[fea].max() - val_set[fea].min())
88 | # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std())
89 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
90 | kpca = PCA(n_components=0.99, whiten=True)
91 | # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1)
92 | kpca.fit(train_set.values)
93 | train_set = kpca.transform(train_set.values)
94 | val_set = kpca.transform(val_set.values)
95 | # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_))
96 | print("number of components {}".format(kpca.n_components_))
97 | print("noise variance {}".format(kpca.noise_variance_))
98 | print("the explained variance {}".format(kpca.explained_variance_))
99 | print("the explained variance ratio {}".format(kpca.explained_variance_ratio_))
100 |
101 | print("begin to make prediction with plain features and without tuning parameters")
102 |
103 | initial_params = {
104 | "colsample_bytree": 0.9956575704604527,
105 | "learning_rate": 0.03640520807213964,
106 | "max_bin": 210,
107 | # "max_depth":7,
108 | "min_child_samples": 80,
109 | "min_child_weight": 0.23740522733908753,
110 | # "min_split_gain": 0.0004147079426427973,
111 | "n_estimators": 266,
112 | "num_leaves": 12,
113 | "reg_alpha": 271.01549892268713,
114 | "reg_lambda": 0.0001118074055642654,
115 | # "scale_pos_weight": 0.9914246775102074,
116 | "subsample": 0.9090257022233618,
117 | "boosting_type": "dart",
118 | }
119 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns))
120 |
121 | # best_f1 =0.0
122 | # best_params = {"n_estimators":800,"num_leaves":6}
123 | # for n_estimator in [400,600,800]:
124 | # for num_leave in [4,6,8]:
125 | # print({"n_estimators":n_estimator,"num_leaves":num_leave,"boosting_type":"dart"})
126 | # clf1 = LGBMClassifier(n_estimators=n_estimator, num_leaves=num_leave, boosting_type="dart")
127 | # clf1.fit(train_set.values, train_label.values)
128 | # print("load the test dataset")
129 | # yhat = clf1.predict(val_set.values)
130 | # print(classification_report(y_pred=yhat, y_true=val_label.values,digits=4))
131 | # f1 = f1_score(y_pred=yhat, y_true=val_label.values)
132 | #             if best_f1 < f1:
[remainder of cb_v8.py missing from this export]
--------------------------------------------------------------------------------
/catboostpy/cb_v9.py:
--------------------------------------------------------------------------------
[lines 1-25 missing from this export]
 26 | # active_users = uid.loc[uid["score"]>0.5]["user_id"].unique().tolist()
27 | active_users = uid["user_id"][:24500].unique().tolist()
28 | # print(len(active_users))
29 | print(uid["score"].tolist()[24500])
30 | # print(active_users)
31 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
32 | submission_file = "../result/622/submission_cb_" + param + "_" + str_time + ".csv"
33 | with open(submission_file, "a", newline="") as f:
34 | writer = csv.writer(f)
35 | for i in active_users:
36 | writer.writerow([i])
37 |
38 |
 39 | # to use this module, one needs to deconstruct some of the features in data_process
40 | def run(scheme_num=1, file_name="../data/data_v3/training_e"):
41 | train_set_ls = []
42 | if scheme_num == 1:
43 | for i in [16, 17, 22, 23]:
44 | print("begin to load the dataset")
45 | file_name1 = file_name + "ld1-" + str(i) + ".csv"
46 | train_set_temp = pd.read_csv(file_name1, header=0, index_col=None)
47 | print(train_set_temp.describe())
48 | train_set_ls.append(train_set_temp)
49 | elif scheme_num == 2:
50 | for i in [16, 23]:
51 | print("begin to load the dataset")
52 | file_name2 = file_name + "ld1-" + str(i) + ".csv"
53 | train_set_temp = pd.read_csv(file_name2, header=0, index_col=None)
54 | print(train_set_temp.describe())
55 | train_set_ls.append(train_set_temp)
56 | elif scheme_num == 3:
57 | for i in [18, 19, 20, 21, 22, 23]:
58 | print("begin to load the dataset")
59 | file_name3 = file_name + "ld1-" + str(i) + ".csv"
60 | train_set_temp = pd.read_csv(file_name3, header=0, index_col=None)
61 | print(train_set_temp.describe())
62 | train_set_ls.append(train_set_temp)
63 | val_file_name = file_name + "ld1-23.csv"
64 | val_set = pd.read_csv(val_file_name, header=0, index_col=None)
65 | print(val_set.describe())
66 | train_set = pd.concat(train_set_ls, axis=0)
67 | ds = train_set.describe()
68 | print(ds)
69 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
70 |
71 | print("begin to drop the duplicates")
72 | train_set.drop_duplicates(subset=keep_feature, inplace=True)
73 | val_set.drop_duplicates(subset=keep_feature, inplace=True)
74 | print(train_set.describe())
75 | print(val_set.describe())
76 | train_label = train_set["label"]
77 | val_label = val_set["label"]
78 | train_set = train_set.drop(labels=["label", "user_id"], axis=1)
79 | val_set = val_set.drop(labels=["label", "user_id"], axis=1)
80 |
81 | print("begin to standardization the data")
82 | for fea in keep_feature:
83 | if train_set[fea].var() < 0.000001 or val_set[fea].var() < 0.000001:
84 | train_set.drop(labels=[fea], axis=1, inplace=True)
85 | val_set.drop(labels=[fea], axis=1, inplace=True)
86 | else:
87 | train_set[fea] = (train_set[fea] - train_set[fea].min()) / (train_set[fea].max() - train_set[fea].min())
88 | # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std())
89 | val_set[fea] = (val_set[fea] - val_set[fea].min()) / (val_set[fea].max() - val_set[fea].min())
90 | # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std())
91 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
92 | kpca = PCA(n_components=0.99, whiten=True)
93 | # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1)
94 | kpca.fit(train_set.values)
95 | train_set = kpca.transform(train_set.values)
96 | val_set = kpca.transform(val_set.values)
97 | # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_))
98 | print("number of components {}".format(kpca.n_components_))
99 | print("noise variance {}".format(kpca.noise_variance_))
100 | print("the explained variance {}".format(kpca.explained_variance_))
101 | print("the explained variance ratio {}".format(kpca.explained_variance_ratio_))
102 |
103 | print("begin to make prediction with plain features and without tuning parameters")
104 |
105 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns))
106 |
107 | # best_f1 =0.0
108 | # best_params = {"n_estimators":800,"num_leaves":6}
109 | # for n_estimator in [400,600,800]:
110 | # for num_leave in [4,6,8]:
111 | # print({"n_estimators":n_estimator,"num_leaves":num_leave,"boosting_type":"dart"})
112 | # clf1 = LGBMClassifier(n_estimators=n_estimator, num_leaves=num_leave, boosting_type="dart")
113 | # clf1.fit(train_set.values, train_label.values)
114 | # print("load the test dataset")
115 | # yhat = clf1.predict(val_set.values)
116 | # print(classification_report(y_pred=yhat, y_true=val_label.values,digits=4))
117 | # f1 = f1_score(y_pred=yhat, y_true=val_label.values)
118 | #             if best_f1 < f1:
[remainder of cb_v9.py missing from this export]
--------------------------------------------------------------------------------
/dataanalysispy/data_analysis.py:
--------------------------------------------------------------------------------
[lines 1-54 missing from this export]
 55 | df_user_register_train = df_user_register.loc[(df_user_register["register_day"]>=trainSpan[0])&(df_user_register["register_day"]<=trainSpan[1])]
56 |
57 | df_user_register_train["register_day_rate"] = df_user_register_train.groupby(by=["register_day"])["register_day"].transform("count")
58 | df_user_register_train["register_type_rate"] = df_user_register_train.groupby(by=["register_type"])["register_type"].transform("count")
59 | df_user_register_train["register_type_device"] = df_user_register_train.groupby(by=["register_type"])["device_type"].transform(lambda x: x.nunique())
60 | df_user_register_train["device_type_rate"] = df_user_register_train.groupby(by=["device_type"])["device_type"].transform("count")
61 | df_user_register_train["device_type_register"] = df_user_register_train.groupby(by=["device_type"])["register_type"].transform(lambda x: x.nunique())
62 |
63 | df_user_register = df_user_register_train.drop(labels=["register_type","device_type"],axis=1)
64 |
65 | print("get users from app launch log")
66 | # app_launch_log = ["user_id","app_launch_day"]
67 | dtype_app_launch = {"user_id": np.uint32, "app_launch_day": np.uint8}
68 | df_app_launch = pd.read_csv("data/app_launch_log.csv", header=0, index_col=None, dtype=dtype_app_launch)
69 | def analysisTrans():
70 | print("begin to load the trainset1")
71 | # train_set1 = processing(trainSpan=(1,10),label=True)
72 | # train_set1.to_csv("data/training_ld1-10.csv", header=True, index=False)
73 | train_set1 = pd.read_csv("data/training_ld1-10.csv", header=0, index_col=None)
74 | print(train_set1.describe())
75 | print("begin to load the trainset2")
76 | # train_set2 = processing(trainSpan=(11,20),label=True)
77 | # train_set2.to_csv("data/training_ld11-20.csv", header=True, index=False)
78 | train_set2 = pd.read_csv("data/training_ld11-20.csv", header=0, index_col=None)
79 | print(train_set2.describe())
80 | print("begin to merge the trainsets")
81 | train_set = pd.concat([train_set1,train_set2],axis=0)
82 | print(train_set.describe())
83 | analysisTrans()
84 | # user_activity()
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
--------------------------------------------------------------------------------
/dataanalysispy/get_global_file.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | user_register_log = ["user_id", "register_day", "register_type", "device_type"]
5 | app_launch_log = ["user_id", "app_launch_day"]
6 | video_create_log = ["user_id", "video_create_day"]
7 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"]
8 |
9 |
10 | def get_global_file():
11 | print("get users from user register log")
12 | # user_register_log = ["user_id", "register_day", "register_type", "device_type"]
13 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8,
14 | "device_type": np.uint16}
15 | df_user_register = pd.read_csv("data/user_register_log.csv", header=0, index_col=None, dtype=dtype_user_register)
16 | # df_user_register.drop_duplicates(inplace=True)
17 | # df_user_register_train = df_user_register.loc[(df_user_register["register_day"]>=trainSpan[0])&(df_user_register["register_day"]<=trainSpan[1])]
18 | # these are global features
19 | df_user_register["register_day_rate"] = df_user_register.groupby(by=["register_day"])["register_day"].transform(
20 | "count")
21 | df_user_register["register_type_rate"] = df_user_register.groupby(by=["register_type"])["register_type"].transform(
22 | "count")
23 | df_user_register["register_type_device"] = df_user_register.groupby(by=["register_type"])["device_type"].transform(
24 | lambda x: x.nunique())
25 | df_user_register["device_type_rate"] = df_user_register.groupby(by=["device_type"])["device_type"].transform(
26 | "count")
27 | df_user_register["device_type_register"] = df_user_register.groupby(by=["device_type"])["register_type"].transform(
28 | lambda x: x.nunique())
29 | df_user_register.to_csv("data/user_register_log_global.csv",header=True,index=False)
30 |
31 | user_register_feature = ["user_id",
32 | "register_day_rate", "register_type_rate",
33 | "register_type_device", "device_type_rate", "device_type_register"
34 | ]
35 | df_user_register_base = df_user_register[["user_id", "register_day"]].drop_duplicates()
36 |
37 | print("get users from app launch log")
38 | # app_launch_log = ["user_id","app_launch_day"]
39 | dtype_app_launch = {"user_id": np.uint32, "app_launch_day": np.uint8}
40 | df_app_launch = pd.read_csv("data/app_launch_log.csv", header=0, index_col=None, dtype=dtype_app_launch)
41 | df_app_launch = df_app_launch.merge(df_user_register_base, on=["user_id"], how="left").fillna(-1)
42 |
43 | df_app_launch["user_app_launch_rate_global"] = df_app_launch.groupby(by=["user_id"])[
44 | "app_launch_day"].transform("count")
45 | # df_app_launch["user_app_launch_register_min_time_global"] = df_app_launch.groupby(by=["user_id"])[
46 | # "app_launch_day"].transform(lambda x: min(x)) - \
47 | # df_app_launch["register_day"]
48 | df_app_launch["user_app_launch_register_max_time_global"] = df_app_launch.groupby(by=["user_id"])[
49 | "app_launch_day"].transform(lambda x: max(x)) - \
50 | df_app_launch["register_day"]
51 | df_app_launch["user_app_launch_register_mean_time_global"] = df_app_launch.groupby(by=["user_id"])[
52 | "app_launch_day"].transform(
53 | lambda x: (max(x) + min(x)) / 2) - df_app_launch["register_day"]
54 | df_app_launch["user_app_launch_gap_global"] = df_app_launch.groupby(by=["user_id"])[
55 | "app_launch_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x)) > 1 else 0)
56 | df_app_launch["user_app_launch_var_global"] = df_app_launch.groupby(by=["user_id"])[
57 | "app_launch_day"].transform(lambda x: np.var(x))
58 | df_app_launch.to_csv("data/app_launch_log_global.csv", header=True, index=False)
59 |
60 | print("get users from video create")
61 | # video_create_log = ["user_id", "video_create_day"]
62 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8}
63 | df_video_create = pd.read_csv("data/video_create_log.csv", header=0, index_col=None, dtype=dtype_video_create)
64 | df_video_create = df_video_create.merge(df_user_register_base, on=["user_id"], how="left").fillna(-1)
65 |
66 | df_video_create["user_video_create_rate_global"] = df_video_create.groupby(by=["user_id"])[
67 | "video_create_day"].transform("count")
68 | df_video_create["user_video_create_day_global"] = df_video_create.groupby(by=["user_id"])[
69 | "video_create_day"].transform(lambda x: x.nunique())
70 | df_video_create["user_video_create_frequency_global"] = df_video_create["user_video_create_rate_global"] / \
71 | df_video_create["user_video_create_day_global"]
72 |
73 | df_video_create["user_video_create_register_min_time_global"] = df_video_create.groupby(by=["user_id"])[
74 | "video_create_day"].transform(
75 | lambda x: min(x)) - \
76 | df_video_create["register_day"]
77 | df_video_create["user_video_create_register_max_time_global"] = df_video_create.groupby(by=["user_id"])[
78 | "video_create_day"].transform(
79 | lambda x: max(x)) - \
80 | df_video_create["register_day"]
81 | df_video_create["user_video_create_register_mean_time_global"] = df_video_create.groupby(by=["user_id"])[
82 | "video_create_day"].transform(
83 | lambda x: (max(x) + min(x)) / 2) - df_video_create["register_day"]
84 | # df_video_create["user_video_create_register_mean_time"] = df_video_create["video_create_day"]-df_video_create["register_day"]
85 | df_video_create["user_video_create_gap_global"] = df_video_create.groupby(by=["user_id"])[
86 | "video_create_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x)) > 1 else 0)
87 | df_video_create["user_video_create_var_global"] = df_video_create.groupby(by=["user_id"])[
88 | "video_create_day"].transform(lambda x: np.var(x))
89 | df_video_create.to_csv("data/video_create_log_global.csv", header=True, index=False)
90 |
91 | print("get users from user activity log")
92 | # user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"]
93 | # usecols = ["user_id", "user_activity_day", "page","action_type"]
94 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32,
95 | "author_id": np.uint32, "action_type": np.uint8}
96 | df_user_activity = pd.read_csv("data/user_activity_log.csv", header=0, index_col=None, dtype=dtype_user_activity)
97 | df_user_activity = df_user_activity.merge(df_user_register_base, on=["user_id"], how="left").fillna(-1)
98 | # df_user_activity = df_user_activity.sample(n=50000)
99 | print("read , merge and sample over")
100 | # print(df_user_activity.describe())
101 | # df_user_activity.drop_duplicates(inplace=True)
102 | # print(df_user_activity.describe())
103 | df_user_activity["user_activity_rate_global"] = (df_user_activity.groupby(by=["user_id"])["user_id"].transform(
104 | "count")).astype(np.uint16)
105 | df_user_activity["user_activity_day_rate_global"] = (df_user_activity.groupby(by=["user_id"])[
106 | "user_activity_day"].transform(lambda x: x.nunique())).astype(np.uint8)
107 | df_user_activity["user_activity_frequency_global"] = df_user_activity["user_activity_rate_global"]/df_user_activity["user_activity_day_rate_global"]
108 | df_user_activity["user_activity_gap_global"] = df_user_activity.groupby(by=["user_id"])[
109 | "user_activity_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x)) > 1 else 0)
110 | df_user_activity["user_activity_var_global"] = df_user_activity.groupby(by=["user_id"])[
111 | "user_activity_day"].transform(lambda x: np.var(x))
112 | df_user_activity["user_activity_register_min_time_global"] = (df_user_activity.groupby(by=["user_id"])[
113 | "user_activity_day"].transform(lambda x: min(x)) - \
114 | df_user_activity["register_day"]).astype(np.uint8)
115 | df_user_activity["user_activity_register_max_time_global"] = (df_user_activity.groupby(by=["user_id"])[
116 | "user_activity_day"].transform(lambda x: max(x)) - \
117 | df_user_activity["register_day"]).astype(np.uint8)
118 | df_user_activity["user_activity_register_mean_time_global"] = df_user_activity.groupby(by=["user_id"])[
119 | "user_activity_day"].transform(
120 | lambda x: (max(x) + min(x)) / 2) - df_user_activity["register_day"]
121 | print("groupby one columns ")
122 | df_user_activity["user_page_num_global"] = (df_user_activity.groupby(by=["user_id"])["page"].transform(
123 | lambda x: x.nunique())).astype(np.uint8)
124 | df_user_activity["user_video_num_global"] = (df_user_activity.groupby(by=["user_id"])["video_id"].transform(
125 | lambda x: x.nunique())).astype(np.uint16)
126 | df_user_activity["user_author_num_global"] = (df_user_activity.groupby(by=["user_id"])["author_id"].transform(
127 | lambda x: x.nunique())).astype(np.uint16)
128 | df_user_activity["user_action_type_num_global"] = (df_user_activity.groupby(by=["user_id"])[
129 | "action_type"].transform(lambda x: x.nunique())).astype(np.uint8)
130 | print("groupby two columns ")
131 | # df_user_activity["user_author_video_num_global"] = (df_user_activity.groupby(by=["user_id", "author_id"])[
132 | # "video_id"].transform(
133 | # lambda x: x.nunique())).astype(np.uint16)
134 | # print("1")
135 | # df_user_activity["user_video_action_type_num_global"] = (df_user_activity.groupby(by=["user_id", "video_id"])[
136 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8)
137 | # print("2")
138 | # df_user_activity["user_author_action_type_num_global"] = (df_user_activity.groupby(by=["user_id", "author_id"])[
139 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8)
140 | # print("3")
141 | # df_user_activity["user_page_action_type_num_global"] = (df_user_activity.groupby(by=["user_id", "page"])[
142 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8)
143 | print("data process over")
144 | # df_user_activity["page_rate_global"] = (df_user_activity.groupby(by=["page"])["page"].transform("count")).astype(np.uint32)
145 | # df_user_activity["page_video_global"] = (df_user_activity.groupby(by=["page"])["video_id"].transform(
146 | # lambda x: x.nunique())).astype(np.uint32)
147 | # df_user_activity["page_author_global"] = (df_user_activity.groupby(by=["page"])["author_id"].transform(
148 | # lambda x: x.nunique())).astype(np.uint32)
149 | # df_user_activity["video_rate_global"] = (df_user_activity.groupby(by=["video_id"])["video_id"].transform(
150 | # "count")).astype(np.uint32)
151 | # df_user_activity["video_user_global"] = (df_user_activity.groupby(by=["video_id"])["user_id"].transform(
152 | # lambda x: x.nunique())).astype(np.uint16)
153 | # df_user_activity["video_action_type_global"] = (df_user_activity.groupby(by=["video_id"])[
154 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8)
155 | # df_user_activity["author_rate_global"] = (df_user_activity.groupby(by=["video_id"])["author_id"].transform(
156 | # "count")).astype(np.uint32)
157 | # df_user_activity["author_user_global"] = (df_user_activity.groupby(by=["author_id"])["user_id"].transform(
158 | # lambda x: x.nunique())).astype(np.uint16)
159 | # df_user_activity["author_video_global"] = (df_user_activity.groupby(by=["author_id"])["video_id"].transform(
160 | # lambda x: x.nunique())).astype(np.uint16)
161 | # df_user_activity["author_action_type_global"] = (df_user_activity.groupby(by=["author_id"])[
162 | # "action_type"].transform(lambda x: x.nunique())).astype(np.uint8)
163 | # df_user_activity["action_type_rate_global"] = (df_user_activity.groupby(by=["action_type"])[
164 | # "action_type"].transform("count")).astype(np.uint32)
165 | df_user_activity.to_csv("data/user_activity_log_global.csv", header=True, index=False)
166 |
167 | if __name__ == "__main__":
168 | get_global_file()
169 |
--------------------------------------------------------------------------------
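Note: get_global_file() above writes four CSVs (user_register_log_global.csv, app_launch_log_global.csv, video_create_log_global.csv, user_activity_log_global.csv), each carrying user-level aggregates alongside the raw log rows. The following is a minimal sketch of a possible follow-up step, assuming those four output files exist: it collapses each file to one row per user and joins them on user_id. The per-event columns dropped below are taken from the code above; the merge step itself is an illustration and is not part of the repository.

    import pandas as pd
    from functools import reduce

    # Outputs written by get_global_file(), with the raw per-event columns to drop
    # before keeping one row of global features per user.
    files = [
        ("data/user_register_log_global.csv", ["register_day", "register_type", "device_type"]),
        ("data/app_launch_log_global.csv", ["app_launch_day", "register_day"]),
        ("data/video_create_log_global.csv", ["video_create_day", "register_day"]),
        ("data/user_activity_log_global.csv",
         ["user_activity_day", "page", "video_id", "author_id", "action_type", "register_day"]),
    ]
    frames = []
    for path, raw_cols in files:
        df = pd.read_csv(path).drop(columns=raw_cols, errors="ignore")
        frames.append(df.drop_duplicates(subset=["user_id"]))  # one row per user

    # Outer-join the four per-log feature tables on user_id.
    global_features = reduce(lambda a, b: a.merge(b, on="user_id", how="outer"), frames)
    print(global_features.shape)
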
/dataprocesspy/__pycache__/data_process_v7.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/dataprocesspy/__pycache__/data_process_v7.cpython-36.pyc
--------------------------------------------------------------------------------
/dataprocesspy/create_data.py:
--------------------------------------------------------------------------------
1 | from data_process_v7 import processing
2 |
3 | if __name__=="__main__":
4 | # print("begin to load the trainset2")
5 | # train_set2 = processing(trainSpan=(1, 15), label=True)
6 | # train_set2.to_csv("../data/data_v4/training_rld1-15.csv", header=True, index=False)
7 | # train_set2 = pd.read_csv("data/training_eld1-15.csv", header=0, index_col=None, usecols=use_feature)
8 | # print(train_set2.describe())
9 | # print("begin to load the testset")
10 | # train_set52 = processing(trainSpan=(1, 30), label=False)
11 | # train_set52.to_csv("data/testing_eld1-30_r.csv", header=True, index=False)
12 | # train_set52 = pd.read_csv("data/training_eld1-23.csv", header=0, index_col=None, usecols=use_feature)
13 | # print(train_set52.describe())
14 | # print("begin to load the trainset52")
15 | # train_set52 = processing(trainSpan=(1, 23), label=True)
16 | # train_set52.to_csv("data/training_rld1-23_r.csv", header=True, index=False)
17 | # # train_set52 = pd.read_csv("data/training_eld1-23.csv", header=0, index_col=None, usecols=use_feature)
18 | # print(train_set52.describe())
19 | # print("begin to load the trainset51")
20 | # train_set51 = processing(trainSpan=(1, 22), label=True)
21 | # train_set51.to_csv("data/training_rld1-22.csv", header=True, index=False)
22 | # # train_set5 = pd.read_csv("data/training_eld1-22.csv", header=0, index_col=None, usecols=use_feature)
23 | # print(train_set51.describe())
24 | print("begin to load the trainset5")
25 | train_set5 = processing(trainSpan=(1, 21), label=True)
26 | train_set5.to_csv("../data/data_v4/training_rld1-21.csv", header=True, index=False)
27 | # train_set5 = pd.read_csv("data/training_eld1-21.csv", header=0, index_col=None, usecols=use_feature)
28 | print(train_set5.describe())
29 | print("begin to load the trainset41")
30 | train_set41 = processing(trainSpan=(1, 20), label=True)
31 | train_set41.to_csv("../data/data_v4/training_rld1-20.csv", header=True, index=False)
32 | # train_set41 = pd.read_csv("data/training_eld1-20.csv", header=0, index_col=None, usecols=use_feature)
33 | print(train_set41.describe())
34 | print("begin to load the trainset4")
35 | train_set4 = processing(trainSpan=(1, 19), label=True)
36 | train_set4.to_csv("../data/data_v4/training_rld1-19.csv", header=True, index=False)
37 | # train_set4 = pd.read_csv("data/training_eld1-19.csv", header=0, index_col=None, usecols=use_feature)
38 | print(train_set4.describe())
39 |
40 | print("begin to load the trainset21")
41 | train_set21 = processing(trainSpan=(1, 16), label=True)
42 | train_set21.to_csv("../data/data_v4/training_rld1-16.csv", header=True, index=False)
43 | # train_set21 = pd.read_csv("data/training_eld1-16.csv", header=0, index_col=None, usecols=use_feature)
44 | print(train_set21.describe())
45 | print("begin to load the trainset3")
46 | train_set3 = processing(trainSpan=(1, 17), label=True)
47 | train_set3.to_csv("../data/data_v4/training_rld1-17.csv", header=True, index=False)
48 | # train_set3 = pd.read_csv("data/training_eld1-17.csv", header=0, index_col=None, usecols=use_feature)
49 | print(train_set3.describe())
50 | print("begin to load the trainset31")
51 | train_set31 = processing(trainSpan=(1, 18), label=True)
52 | train_set31.to_csv("../data/data_v4/training_rld1-18.csv", header=True, index=False)
53 | # train_set3 = pd.read_csv("data/training_eld1-18.csv", header=0, index_col=None, usecols=use_feature)
54 | print(train_set31.describe())
55 | #
56 |
57 |
--------------------------------------------------------------------------------
/dataprocesspy/data_process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | user_register_log = ["user_id","register_day","register_type","device_type"]
4 | app_launch_log = ["user_id","app_launch_day"]
5 | video_create_log = ["user_id","video_create_day"]
6 | user_activity_log = ["user_id","user_activity_day","page","video_id","author_id","action_type"]
7 |
8 | user_register_feature = ["user_id",
9 | "register_day_rate","register_type_rate",
10 | "register_type_device","device_type_rate","device_type_register"]
11 | app_launch_feature = ["user_id",
12 | "user_app_launch_rate","user_app_launch_gap"]
13 | video_create_feature = ["user_id",
14 | "user_video_create_rate","user_video_create_day","user_video_create_gap"]
15 | user_activity_feature = ["user_id",
16 | "user_activity_day","user_activity_day_rate","user_activity_gap",
17 | # "page_rate","page_action_type",
18 | # "video_id_rate","video_id_user","video_id_action_type",
19 | # "author_id_rate","author_id_user","author_id_video",
20 | # "action_type_rate","action_type_page"
21 | ]
22 |
23 | def processing(trainSpan=(1,23),label=True):
24 | if label:
25 | assert isinstance(trainSpan,tuple),"input parameter should be a tuple with two items (min,max)"
26 | assert trainSpan[0]>0 and trainSpan[0]<23 and trainSpan[1]>trainSpan[0] and trainSpan[1]<=23
27 | else:
28 | assert isinstance(trainSpan,tuple),"input parameter should be a tuple with two items (min,max)"
29 | assert trainSpan[0]>0 and trainSpan[0]<30 and trainSpan[1]>trainSpan[0] and trainSpan[1]<=30
30 | print("get users from user register log")
31 | # user_register_log = ["user_id", "register_day", "register_type", "device_type"]
32 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type":np.uint16}
33 | df_user_register = pd.read_csv("data/user_register_log.csv",header=0,index_col=None,dtype=dtype_user_register)
34 | # df_user_register.drop_duplicates(inplace=True)
35 | df_user_register_train = df_user_register.loc[(df_user_register["register_day"]>=trainSpan[0])&(df_user_register["register_day"]<=trainSpan[1])]
36 |
37 | df_user_register_train["register_day_rate"] = df_user_register_train.groupby(by=["register_day"])["register_day"].transform("count")
38 | df_user_register_train["register_type_rate"] = df_user_register_train.groupby(by=["register_type"])["register_type"].transform("count")
39 | df_user_register_train["register_type_device"] = df_user_register_train.groupby(by=["register_type"])["device_type"].transform(lambda x: x.nunique())
40 | df_user_register_train["device_type_rate"] = df_user_register_train.groupby(by=["device_type"])["device_type"].transform("count")
41 | df_user_register_train["device_type_register"] = df_user_register_train.groupby(by=["device_type"])["register_type"].transform(lambda x: x.nunique())
42 |
43 | df_user_register_train = df_user_register_train[user_register_feature].drop_duplicates()
44 | print(df_user_register_train.describe())
45 |
46 | print("get users from app launch log")
47 | # app_launch_log = ["user_id","app_launch_day"]
48 | dtype_app_launch = {"user_id": np.uint32, "app_launch_day": np.uint8}
49 | df_app_launch = pd.read_csv("data/app_launch_log.csv", header=0, index_col=None, dtype=dtype_app_launch)
50 | # df_app_launch.drop_duplicates(inplace=True)
51 | df_app_launch_train = df_app_launch.loc[
52 | (df_app_launch["app_launch_day"] >= trainSpan[0]) & (df_app_launch["app_launch_day"] <= trainSpan[1])]
53 |
54 | # print(df_app_launch_train.describe())
55 | df_app_launch_train["user_app_launch_rate"] = df_app_launch_train.groupby(by=["user_id"])[
56 | "app_launch_day"].transform("count")
57 | df_app_launch_train["user_app_launch_gap"] = df_app_launch_train.groupby(by=["user_id"])[
58 | "app_launch_day"].transform(lambda x: (max(x) - min(x)) / (len(x) - 1) if len(set(x)) > 1 else 0)
59 |
60 | df_app_launch_train = df_app_launch_train[app_launch_feature].drop_duplicates()
61 | print(df_app_launch_train.describe())
62 |
63 | print("get users from video create")
64 | # video_create_log = ["user_id", "video_create_day"]
65 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8}
66 | df_video_create = pd.read_csv("data/video_create_log.csv",header=0,index_col=None,dtype=dtype_video_create)
67 | # df_video_create.drop_duplicates(inplace=True)
68 | df_video_create_train = df_video_create.loc[
69 | (df_video_create["video_create_day"] >= trainSpan[0]) & (df_video_create["video_create_day"] <= trainSpan[1])]
70 |
71 | df_video_create_train["user_video_create_rate"] = df_video_create_train.groupby(by=["user_id"])[
72 | "video_create_day"].transform("count")
73 | df_video_create_train["user_video_create_day"] = df_video_create_train.groupby(by=["user_id"])[
74 | "video_create_day"].transform(lambda x: x.nunique())
75 | df_video_create_train["user_video_create_gap"] = df_video_create_train.groupby(by=["user_id"])[
76 | "video_create_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x)) > 1 else 0)
77 | # print(df_video_create_train.describe())
78 | df_video_create_train = df_video_create_train[video_create_feature].drop_duplicates()
79 | print(df_video_create_train.describe())
80 |
81 | print("get users from user activity log")
82 | # user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"]
83 | # usecols = ["user_id", "user_activity_day", "page","action_type"]
84 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32,
85 | "author_id": np.uint32, "action_type": np.uint8}
86 | df_user_activity = pd.read_csv("data/user_activity_log.csv", header=0, index_col=None, dtype=dtype_user_activity)
87 | # print(df_user_activity.describe())
88 | # df_user_activity.drop_duplicates(inplace=True)
89 | # print(df_user_activity.describe())
90 | df_user_activity_train = df_user_activity.loc[
91 | (df_user_activity["user_activity_day"] >= trainSpan[0]) & (
92 | df_user_activity["user_activity_day"] <= trainSpan[1])]
93 |
94 | df_user_activity_train["user_activity_rate"] = df_user_activity_train.groupby(by=["user_id"])["user_id"].transform(
95 | "count")
96 | df_user_activity_train["user_activity_day_rate"] = df_user_activity_train.groupby(by=["user_id"])[
97 | "user_activity_day"].transform(lambda x: x.nunique())
98 | df_user_activity_train["user_activity_gap"] = df_user_activity_train.groupby(by=["user_id"])[
99 | "user_activity_day"].transform(lambda x: (max(x) - min(x)) / (len(set(x)) - 1) if len(set(x))>1 else 0)
100 | # df_user_activity_train["page_rate"] = df_user_activity_train.groupby(by=["page"])["page"].transform("count")
101 | # df_user_activity_train["page_action_type"] = df_user_activity_train.groupby(by=["page"])["action_type"].transform(
102 | # lambda x: x.nunique())
103 | # df_user_activity_train["video_id_rate"] = df_user_activity_train.groupby(by=["video_id"])["video_id"].transform(
104 | # "count")
105 | # df_user_activity_train["video_id_user"] = df_user_activity_train.groupby(by=["video_id"])["user_id"].transform(
106 | # lambda x: x.nunique())
107 | # df_user_activity_train["video_id_action_type"] = df_user_activity_train.groupby(by=["video_id"])[
108 | # "action_type"].transform(lambda x: x.nunique())
109 | # df_user_activity_train["author_id_rate"] = df_user_activity_train.groupby(by=["author_id"])["author_id"].transform(
110 | # "count")
111 | # df_user_activity_train["author_id_user"] = df_user_activity_train.groupby(by=["author_id"])["user_id"].transform(
112 | # lambda x: x.nunique())
113 | # df_user_activity_train["author_id_video"] = df_user_activity_train.groupby(by=["author_id"])["video_id"].transform(
114 | # lambda x: x.nunique())
115 | # df_user_activity_train["action_type_rate"] = df_user_activity_train.groupby(by=["action_type"])[
116 | # "action_type"].transform("count")
117 | # df_user_activity_train["action_type_page"] = df_user_activity_train.groupby(by=["action_type"])["page"].transform(
118 | # lambda x: x.nunique())
119 | df_user_activity_train = df_user_activity_train[user_activity_feature].drop_duplicates()
120 | print(df_user_activity_train.describe())
121 |
122 | if label:
123 | active_user_register = (df_user_register.loc[(df_user_register["register_day"]>trainSpan[1])&(df_user_register["register_day"]<=(trainSpan[1]+7))]).user_id.unique().tolist()
124 | active_app_launch = (df_app_launch.loc[(df_app_launch["app_launch_day"] > trainSpan[1]) & (df_app_launch["app_launch_day"] <= (trainSpan[1] + 7))]).user_id.unique().tolist()
125 | active_video_create = (df_video_create.loc[(df_video_create["video_create_day"]>trainSpan[1])&(df_video_create["video_create_day"]<=(trainSpan[1]+7))]).user_id.unique().tolist()
126 | active_user_activity = (df_user_activity.loc[(df_user_activity["user_activity_day"] > trainSpan[1]) & (df_user_activity["user_activity_day"] <= (trainSpan[1] + 7))]).user_id.unique().tolist()
127 | active_user = list(set(active_user_register+active_app_launch+active_video_create+active_user_activity))
128 |
129 | df_user_register_train["label"] = 0
130 | df_user_register_train.loc[df_user_register_train["user_id"].isin(active_user),"label"] = 1
131 |
132 | df_app_launch_train["label"] = 0
133 | df_app_launch_train.loc[df_app_launch_train["user_id"].isin(active_user),"label"] = 1
134 |
135 | df_video_create_train["label"] = 0
136 | df_video_create_train.loc[df_video_create_train["user_id"].isin(active_user),"label"] = 1
137 |
138 | df_user_activity_train["label"] = 0
139 | df_user_activity_train.loc[df_user_activity_train["user_id"].isin(active_user),"label"] = 1
140 |
141 | df_register_launch = df_user_register_train.merge(df_app_launch_train,how="left")
142 | # print(df_register_launch.describe())
143 | df_register_launch_create = df_register_launch.merge(df_video_create_train,how="left")
144 | # print(df_register_launch_create.describe())
145 | df_register_launch_create = df_register_launch_create.fillna(0)
146 | df_activity_register_launch_create = df_user_activity_train.merge(df_register_launch_create,how="left")
147 | df_activity_register_launch_create = df_activity_register_launch_create.fillna(0)
148 | print(df_activity_register_launch_create.describe())
149 | return df_activity_register_launch_create
150 |
--------------------------------------------------------------------------------
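Note: the labeling block near the end of processing() (the "if label:" branch) is how training targets are built: a user is marked active (label = 1) if they appear in any of the four logs during the seven days immediately after trainSpan[1]. Below is a small self-contained sketch of that rule using the same column names as above; the function name and the horizon parameter are illustrative and not part of the repository.

    import pandas as pd

    def active_users_after(window_end, df_register, df_launch, df_create, df_activity, horizon=7):
        """Union of user_ids appearing in any log in days (window_end, window_end + horizon]."""
        pieces = [
            df_register.loc[df_register["register_day"].between(window_end + 1, window_end + horizon), "user_id"],
            df_launch.loc[df_launch["app_launch_day"].between(window_end + 1, window_end + horizon), "user_id"],
            df_create.loc[df_create["video_create_day"].between(window_end + 1, window_end + horizon), "user_id"],
            df_activity.loc[df_activity["user_activity_day"].between(window_end + 1, window_end + horizon), "user_id"],
        ]
        return set(pd.concat(pieces).unique())

    # Usage mirroring processing(trainSpan=(1, 16), label=True):
    # active = active_users_after(16, df_user_register, df_app_launch, df_video_create, df_user_activity)
    # df_user_register_train["label"] = df_user_register_train["user_id"].isin(active).astype(int)
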
/hardcodedpy/hard_approach.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 | import numpy as np
5 | user_register_log = ["user_id","register_day","register_type","device_type"]
6 | app_launch_log = ["user_id","app_launch_day"]
7 | video_create_log = ["user_id","video_create_day"]
8 | user_activity_log = ["user_id","user_activity_day","page","video_id","author_id","action_type"]
9 |
10 |
11 | def get_user_from_videoCreate(laterThanDay,videoCount):
12 | print("get users from video create")
13 | video_create_log = ["user_id", "video_create_day"]
14 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8}
15 | df_video_create = pd.read_table("data/A2/video_create_log.txt",header=None,names=video_create_log,index_col=None,dtype=dtype_video_create)
16 | latest_user = (df_video_create.loc[df_video_create["video_create_day"]>laterThanDay]).user_id.unique().tolist()
17 | print("get latest users")
18 | print(latest_user)
19 | print(len(latest_user))
20 | df_video_create["videoCount"] = df_video_create.groupby(by=["user_id"])["video_create_day"].transform(lambda x: x.nunique())
21 | frequent_user = (df_video_create.loc[df_video_create["videoCount"]>videoCount]).user_id.unique().tolist()
22 | print("get frequent users")
23 | print(frequent_user)
24 | print(len(frequent_user))
25 | user_videoCreate = list(set(latest_user+frequent_user))
26 | print(user_videoCreate)
27 | print(len(user_videoCreate))
28 | return user_videoCreate
29 | # with open("result/submission.csv","a",newline="") as f:
30 | # writer = csv.writer(f)
31 | # for i in user_videoCreate:
32 | # writer.writerow([i])
33 | # get_user_from_videoCreate(23,2)
34 | def get_user_from_appLaunch(laterThanDay,launchCount):
35 | print("get users from app launch log")
36 | app_launch_log = ["user_id","app_launch_day"]
37 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8}
38 | df_app_launch = pd.read_table("data/A2/app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch)
39 | latest_user = (df_app_launch.loc[df_app_launch["app_launch_day"]>laterThanDay]).user_id.unique().tolist()
40 | print("get latest users")
41 | print(latest_user)
42 | print(len(latest_user))
43 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique())
44 | frequent_user = (df_app_launch.loc[df_app_launch["launchCount"]>launchCount]).user_id.unique().tolist()
45 | print("get frequent users")
46 | print(frequent_user)
47 | print(len(frequent_user))
48 | user_appLaunch = list(set(latest_user+frequent_user))
49 | print("get merged users")
50 | print(user_appLaunch)
51 | print(len(user_appLaunch))
52 | return user_appLaunch
53 | # with open("result/submission.csv","a",newline="") as f:
54 | # writer = csv.writer(f)
55 | # for i in user_appLaunch:
56 | # writer.writerow([i])
57 | # get_user_from_appLaunch(27,4)
58 | def get_user_from_userRegister(laterThanDay):
59 | print("get users from user register log")
60 | user_register_log = ["user_id", "register_day", "register_type", "device_type"]
61 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": str}
62 | df_user_register = pd.read_table("data/A2/user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register)
63 | latest_user = (df_user_register.loc[df_user_register["register_day"]>laterThanDay]).user_id.unique().tolist()
64 | print("get latest users")
65 | print(latest_user)
66 | print(len(latest_user))
67 | return latest_user
68 | # get_user_from_userRegister(25)
69 | def get_user_from_userActivity(laterThanDay,dayCount,pageList,typeList):
70 | print("get users from user activity log")
71 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"]
72 | usecols = ["user_id", "user_activity_day", "page","action_type"]
73 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "action_type": np.uint8}
74 | df_user_activity = pd.read_table("data/A2/user_activity_log.txt",header=None,names=user_activity_log,usecols=usecols,index_col=None,dtype=dtype_user_activity)
75 | latest_user = (df_user_activity.loc[df_user_activity["user_activity_day"]>laterThanDay]).user_id.unique().tolist()
76 | print("get latest users")
77 | print(latest_user)
78 | print(len(latest_user))
79 |
80 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique())
81 | frequent_user = (df_user_activity.loc[df_user_activity["dayCount"]>dayCount]).user_id.unique().tolist()
82 | print("get frequent users")
83 | print(frequent_user)
84 | print(len(frequent_user))
85 |
86 | print("get users in certain pages and certain action type")
87 | user_inList = (df_user_activity.loc[((df_user_activity["page"].isin(pageList))|(df_user_activity["action_type"].isin(typeList)))&(df_user_activity["user_activity_day"]>laterThanDay-3)]).user_id.unique().tolist()
88 |
89 | print(user_inList)
90 | print(len(user_inList))
91 | user_userActivity = list(set(latest_user+frequent_user+user_inList))
92 |
93 | print("get merged users")
94 | print(user_userActivity)
95 | print(len(user_userActivity))
96 | return user_userActivity
97 | # get_user_from_userActivity(18, 3, [1,2,3], [1,3,4,5])
98 |
99 | def get_user():
100 |
101 | user_videoCreate = get_user_from_videoCreate(24, 1)
102 | user_appLaunch = get_user_from_appLaunch(24, 3)
103 | user_userRegister = get_user_from_userRegister(27)
104 | user_userActivity = get_user_from_userActivity(27, 3, [1], [3,4,5])
105 |
106 | users = list(set(user_videoCreate+user_appLaunch+user_userRegister+user_userActivity))
107 | print("get the final merged users")
108 | print(users)
109 | print(len(users))
110 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
111 | submission_file = "result/submission_h_" + str_time + ".csv"
112 | with open(submission_file,"a",newline="") as f:
113 | writer = csv.writer(f)
114 | for i in users:
115 | writer.writerow([i])
116 | # get_user()
--------------------------------------------------------------------------------
/hardcodedpy/hardcode_approach_v2.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 | import numpy as np
5 | user_register_log = ["user_id","register_day","register_type","device_type"]
6 | app_launch_log = ["user_id","app_launch_day"]
7 | video_create_log = ["user_id","video_create_day"]
8 | user_activity_log = ["user_id","user_activity_day","page","video_id","author_id","action_type"]
9 |
10 |
11 | def get_frequser_from_videoCreate(videoCount):
12 | print("get users from video create")
13 | video_create_log = ["user_id", "video_create_day"]
14 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8}
15 | df_video_create = pd.read_table("data/video_create_log.txt",header=None,names=video_create_log,index_col=None,dtype=dtype_video_create)
16 | # latest_user = (df_video_create.loc[df_video_create["video_create_day"]>laterThanDay]).user_id.unique().tolist()
17 | # print("get latest users")
18 | # print(latest_user)
19 | # print(len(latest_user))
20 | df_video_create["videoCount"] = df_video_create.groupby(by=["user_id"])["video_create_day"].transform(lambda x: x.nunique())
21 | frequent_user = (df_video_create.loc[df_video_create["videoCount"]>videoCount]).user_id.unique().tolist()
22 | print(df_video_create.describe())
23 | print("get frequent users")
24 | print(frequent_user)
25 | print(len(frequent_user))
26 | return frequent_user
27 | # with open("result/submission.csv","a",newline="") as f:
28 | # writer = csv.writer(f)
29 | # for i in user_videoCreate:
30 | # writer.writerow([i])
31 | # get_frequser_from_videoCreate(3)
32 | def get_frequser_from_appLaunch(launchCount):
33 | print("get users from app launch log")
34 | app_launch_log = ["user_id","app_launch_day"]
35 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8}
36 | df_app_launch = pd.read_table("data/app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch)
37 | # latest_user = (df_app_launch.loc[df_app_launch["app_launch_day"]>laterThanDay]).user_id.unique().tolist()
38 | # print("get latest users")
39 | # print(latest_user)
40 | # print(len(latest_user))
41 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique())
42 | frequent_user = (df_app_launch.loc[df_app_launch["launchCount"]>launchCount]).user_id.unique().tolist()
43 | print(df_app_launch.describe())
44 | print("get frequent users")
45 | print(frequent_user)
46 | print(len(frequent_user))
47 | return frequent_user
48 | # with open("result/submission.csv","a",newline="") as f:
49 | # writer = csv.writer(f)
50 | # for i in user_appLaunch:
51 | # writer.writerow([i])
52 | # get_frequser_from_appLaunch(10)
53 | def get_user_from_userRegister(laterThanDay):
 54 |     print("get users from user register log")
55 | user_register_log = ["user_id", "register_day", "register_type", "device_type"]
56 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": str}
57 | df_user_register = pd.read_table("data/user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register)
58 | latest_user = (df_user_register.loc[df_user_register["register_day"]>laterThanDay]).user_id.unique().tolist()
59 | print("get latest users")
60 | print(latest_user)
61 | print(len(latest_user))
62 | return latest_user
63 | # get_user_from_userRegister(25)
64 | def get_frequser_from_userActivity(dayCount):
65 | print("get users from user activity log")
66 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"]
67 | usecols = ["user_id", "user_activity_day"]
68 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "action_type": np.uint8}
69 | df_user_activity = pd.read_table("data/user_activity_log.txt",header=None,names=user_activity_log,usecols=usecols,index_col=None,dtype=dtype_user_activity).drop_duplicates()
70 | # latest_user = (df_user_activity.loc[df_user_activity["user_activity_day"]>laterThanDay]).user_id.unique().tolist()
71 | # print("get latest users")
72 | # print(latest_user)
73 | # print(len(latest_user))
74 |
75 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique())
76 | frequent_user = (df_user_activity.loc[df_user_activity["dayCount"]>dayCount]).user_id.unique().tolist()
77 | print(df_user_activity.describe())
78 | print("get frequent users")
79 | print(frequent_user)
80 | print(len(frequent_user))
81 |
82 | # print("get users in certain pages and certain action type")
83 | # user_inList = (df_user_activity.loc[((df_user_activity["page"].isin(pageList))|(df_user_activity["action_type"].isin(typeList)))&(df_user_activity["user_activity_day"]>laterThanDay-3)]).user_id.unique().tolist()
84 | #
85 | # print(user_inList)
86 | # print(len(user_inList))
87 | # user_userActivity = list(set(latest_user+frequent_user+user_inList))
88 | #
89 | # print("get merged users")
90 | # print(user_userActivity)
91 | # print(len(user_userActivity))
92 | return frequent_user
93 | # get_frequser_from_userActivity(10)
94 | def get_activeUsers_from_register():
 95 |     print("get users from user register log")
96 | # user_register_log = ["user_id", "register_day", "register_type", "device_type"]
97 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": np.uint16}
98 | # df_user_register = pd.read_table("data/user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register)
99 | df_user_register = pd.read_csv("data/user_register_log.csv", header=0, index_col=None, dtype=dtype_user_register)
100 |
101 | df_user_register["register_type_rate"] = df_user_register.groupby(by=["register_type"])["register_type"].transform(
102 | "count")
103 | df_user_register["register_type_device"] = df_user_register.groupby(by=["register_type"])["device_type"].transform(
104 | lambda x: x.nunique())
105 | df_user_register["device_type_rate"] = df_user_register.groupby(by=["device_type"])["device_type"].transform(
106 | "count")
107 | df_user_register["device_type_register"] = df_user_register.groupby(by=["device_type"])["register_type"].transform(
108 | lambda x: x.nunique())
109 | active_users = pd.read_csv("hCoded/submission_freqUsers1_2018-06-08_11-16.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()
110 |
111 | df_acuser_info = df_user_register.loc[df_user_register["user_id"].isin(active_users)]
112 | # df_acuser_info.to_csv("data/active_user_info.csv",header=True,index=False)
113 | print(df_acuser_info.describe())
114 | # get_activeUsers_from_register()
115 | def get_user():
116 |
117 | user_videoCreate = get_frequser_from_videoCreate(3)
118 | user_appLaunch = get_frequser_from_appLaunch(8)
119 | # user_userRegister = get_user_from_userRegister(27)
120 | user_userActivity = get_frequser_from_userActivity(7)
121 |
122 | users = list(set(user_videoCreate+user_appLaunch+user_userActivity))
123 | print("get the final merged users")
124 | print(users)
125 | print(len(users))
126 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
127 | submission_file = "hCoded/submission_freqUsers_v2_" + str_time + ".csv"
128 | with open(submission_file,"a",newline="") as f:
129 | writer = csv.writer(f)
130 | for i in users:
131 | writer.writerow([i])
132 | get_user()
--------------------------------------------------------------------------------
/hardcodedpy/hardcode_approach_v3.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | def processing(laterThanDay,launchCount,videoCount,activityCount):
5 | print("get users from user register log")
6 | user_register_log = ["user_id", "register_day", "register_type", "device_type"]
7 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": np.uint32}
8 | df_user_register = pd.read_table("user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register)
9 | user_outliers = df_user_register[(df_user_register["register_type"] == 3) & (
10 | (df_user_register["device_type"] == 1) | (df_user_register["device_type"] == 223) | (
11 | df_user_register["device_type"] == 83))]["user_id"].unique().tolist()
12 | df_user_register = df_user_register[~df_user_register["user_id"].isin(user_outliers)]
13 | df_user_register = df_user_register.loc[df_user_register["register_day"]>laterThanDay]
14 |
15 |
16 | print("get users from app launch log")
17 | app_launch_log = ["user_id","app_launch_day"]
18 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8}
19 | df_app_launch = pd.read_table("app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch)
20 | df_app_launch = df_app_launch[~df_app_launch["user_id"].isin(user_outliers)]
21 | df_app_launch = df_app_launch.loc[df_app_launch["app_launch_day"] >laterThanDay]
22 |
23 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique())
24 | frequent_user1 = (df_app_launch.loc[df_app_launch["launchCount"]>launchCount]).user_id.unique().tolist()
25 | print("number of frequent launch users after {} is {} ".format(laterThanDay,len(frequent_user1)))
26 | video_create_log = ["user_id", "video_create_day"]
27 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8}
28 | df_video_create = pd.read_table("video_create_log.txt",header=None,names=video_create_log,index_col=None,dtype=dtype_video_create)
29 |
30 | df_video_create = df_video_create[~df_video_create["user_id"].isin(user_outliers)]
31 |
32 | df_video_create = df_video_create.loc[df_video_create["video_create_day"] >laterThanDay]
33 |
34 | df_video_create["videoCount"] = df_video_create.groupby(by=["user_id"])["video_create_day"].transform(lambda x: x.nunique())
35 | frequent_user2 = (df_video_create.loc[df_video_create["videoCount"]>videoCount]).user_id.unique().tolist()
36 | print("number of frequent video create users after {} is {} ".format(laterThanDay,len(frequent_user2)))
37 |
38 | print("get users from user activity log")
39 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"]
40 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32,
41 | "author_id":np.uint32, "action_type": np.uint8}
42 | df_user_activity = pd.read_table("user_activity_log.txt",header=None,names=user_activity_log,index_col=None,dtype=dtype_user_activity,usecols=["user_id", "user_activity_day"])
43 | df_user_activity = df_user_activity[~df_user_activity["user_id"].isin(user_outliers)]
44 | df_user_activity= df_user_activity.loc[df_user_activity["user_activity_day"] >laterThanDay]
45 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique())
46 | frequent_user3 = (df_user_activity.loc[df_user_activity["dayCount"]>activityCount]).user_id.unique().tolist()
47 | print("number of frequent activity users after {} is {} ".format(laterThanDay,len(frequent_user3)))
48 |
49 |
50 | processing(24,4,4,4)
51 |
--------------------------------------------------------------------------------
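hardcode_approach_v3.py stops after printing the three per-log counts; it never merges frequent_user1/2/3 or writes a submission. The following is a hedged sketch of that missing last step, mirroring the union-and-write pattern of the earlier versions; the function name and output path are assumptions, not code from the repository.

import csv
import datetime

def merge_and_write(frequent_user1, frequent_user2, frequent_user3):
    # union of the three rule-based lists, de-duplicated
    users = list(set(frequent_user1 + frequent_user2 + frequent_user3))
    print("number of merged frequent users: {}".format(len(users)))
    str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
    submission_file = "submission_freqUsers_v3_" + str_time + ".csv"  # hypothetical path
    with open(submission_file, "a", newline="") as f:
        writer = csv.writer(f)
        for i in users:
            writer.writerow([i])
    return users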
/hardcodedpy/merge_approach.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | from hardcode_approach import get_user
4 | import pandas as pd
5 | import numpy as np
6 | def merge1():
7 | hardcode_user = get_user()
8 | print(len(hardcode_user))
9 | merged_csv1 = pd.read_csv("result/submission_2018-06-01_17-07.csv",header=None,index_col=None,names=["user_id"])
10 | mc1 = merged_csv1["user_id"].tolist()
11 | print(len(mc1))
12 | merged_csv2 = pd.read_csv("result/submission_2018-06-01_17-47.csv",header=None,index_col=None,names=["user_id"])
13 | mc2 = merged_csv2["user_id"].tolist()
14 | print(len(mc2))
15 | mc2 = [e for e in mc2 if e in mc1]
16 | print(len(mc2))
17 | merged_csv3 = pd.read_csv("result/submission_2018-06-01_18-05catboost.csv",header=None,index_col=None,names=["user_id"])
18 | mc3 = merged_csv3["user_id"].tolist()
19 | print(len(mc3))
20 | mc3 = [e for e in mc3 if e in mc2]
21 | print(len(mc3))
22 | users = list(set(hardcode_user+mc3))
23 | print(len(users))
24 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
25 | submission_file = "result/submission_" + str_time + ".csv"
26 | with open(submission_file,"a",newline="") as f:
27 | writer = csv.writer(f)
28 | for i in users:
29 | writer.writerow([i])
30 | def merge2():
31 | hardcode_user = get_user()
32 | print(len(hardcode_user))
33 | merged_csv1 = pd.read_csv("result/submission_2018-05-30_23-20.csv",header=None,index_col=None,names=["user_id"])
34 | mc1 = merged_csv1["user_id"].tolist()
35 | print(len(mc1))
36 | merged_csv2 = pd.read_csv("merge/submission_2018-06-01_11-57.csv",header=None,index_col=None,names=["user_id"])
37 | mc2 = merged_csv2["user_id"].tolist()
38 | print(len(mc2))
39 | mc2 = [e for e in mc2 if e in mc1]
40 | print(len(mc2))
41 | users = list(set(hardcode_user+mc2))
42 | print(len(users))
43 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
44 | submission_file = "result/submission_" + str_time + ".csv"
45 | with open(submission_file,"a",newline="") as f:
46 | writer = csv.writer(f)
47 | for i in users:
48 | writer.writerow([i])
49 | def merge3():
50 | hardcode_user = get_user()
51 | print(len(hardcode_user))
52 | merged_csv1 = pd.read_csv("result/submission_lgb_2018-06-03_00-34.csv",header=None,index_col=None,names=["user_id"])
53 | mc1 = merged_csv1["user_id"][:23500].tolist()
54 | print(len(mc1))
55 | users = list(set(hardcode_user+mc1))
56 | print(len(users))
57 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
58 | submission_file = "result/submission_" + str_time + ".csv"
59 | # with open(submission_file,"a",newline="") as f:
60 | # writer = csv.writer(f)
61 | # for i in users:
62 | # writer.writerow([i])
63 | def merge4():
64 | hardcode_user = get_user()
65 | print(len(hardcode_user))
66 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
67 | hdf = pd.Series(hardcode_user,name="user_id")
68 | hfile = "hCoded/hcode_"+str_time + ".csv"
69 | hdf.to_csv(hfile,header=True,index=False)
70 | merged_csv1 = pd.read_csv("lgb/uid_2018-06-11_22-04-13.csv",header=0,index_col=None)
71 | mc1 = merged_csv1["user_id"][:23800].tolist()
72 | print(len(mc1))
73 | users = list(set(hardcode_user+mc1))
74 | print(len(users))
75 |
76 | submission_file = "merge/submission_" + str_time + ".csv"
77 | # with open(submission_file,"a",newline="") as f:
78 | # writer = csv.writer(f)
79 | # for i in users:
80 | # writer.writerow([i])
81 | # merge4()
82 | def merge5():
83 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
84 | # hardcode_user = get_user()
85 | # hardcode_user = pd.read_csv("merge/submission_v5_fre2_2018-06-08_11-48.csv",header=0,index_col=None)["user_id"].tolist()
86 | # print(len(hardcode_user))
87 | mc2 = pd.read_csv("hCoded/hcode_v12_lastdayofactivityandlaunchcount1_withauthor_2018-06-16_08-54.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()
88 | print(len(mc2))
89 | # hdf = pd.Series(hardcode_user,name="user_id")
90 | # hfile = "hCoded/hcode_28ac_"+str_time + ".csv"
91 | # hdf.to_csv(hfile,header=True,index=False)
92 | mc = pd.read_csv("single/submission_18-23slgb_0.81-2018-06-16_08-16.csv",header=None,index_col=None,names=["user_id"])["user_id"].tolist()[:20000]
93 | # mc1 = pd.read_csv("lgb/uid_2018-06-04_16-55-34.csv",header=0,index_col=None)
94 | # mc = mc1.loc[mc1["score"]>0.40]["user_id"].tolist()
95 | print(len(mc))
96 | users = list(set(mc2+mc))
97 | print(len(users))
98 | #
99 | submission_file = "merge/submission_0.81lgb20000_v12_" + str_time + ".csv"
100 |
101 | with open(submission_file,"a",newline="") as f:
102 | writer = csv.writer(f)
103 | for i in users:
104 | writer.writerow([i])
105 | # merge5()
106 | def register_in_activity_author(laterThanDay,dayCount):
107 | print("get users from user activity log")
108 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"]
109 | # usecols = ["user_id", "user_activity_day", "page","action_type"]
110 | usecols = ["user_id", "user_activity_day","author_id"]
111 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "action_type": np.uint8}
112 | df_user_activity = pd.read_table("data/user_activity_log.txt",header=None,names=user_activity_log,usecols=usecols,index_col=None,dtype=dtype_user_activity).drop_duplicates()
113 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique())
114 | author_id = df_user_activity["author_id"].unique().tolist()
115 | user_id = df_user_activity["user_id"].unique().tolist()
116 |
117 | def intersection(lst1, lst2):
118 | return list(set(lst1) & set(lst2))
119 |
120 | intersect_id = intersection(user_id, author_id)
121 | print("number of user is author {}".format(len(intersect_id)))
122 | user_userActivity = (df_user_activity.loc[(df_user_activity["user_activity_day"]>laterThanDay)&(df_user_activity["user_id"].isin(intersect_id))&(df_user_activity["dayCount"]>dayCount)]).user_id.unique().tolist()
123 | print("number of user is author activates more than {} days no later than {} : {}".format(dayCount,laterThanDay,len(user_userActivity)))
124 | return user_userActivity
125 | def single():
126 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
127 | single_csv1 = pd.read_csv("06-17/uid_2018-06-17_01-01-33.csv",header=0,index_col=None)["user_id"].unique().tolist()
128 | # mc1 = single_csv1.loc[single_csv1["score"]>0.48]["user_id"].tolist()
129 | mc1 = single_csv1[:23727]
130 | print(len(mc1))
131 | # user_userActivity = register_in_activity_author(23,2)
132 |
133 | users = list(set(mc1))
134 | print(len(users))
135 | submission_file = "single/submission_slgb_all0.8-" + str_time + ".csv"
136 | with open(submission_file,"a",newline="") as f:
137 | writer = csv.writer(f)
138 | for i in users:
139 | writer.writerow([i])
140 | single()
141 |
--------------------------------------------------------------------------------
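merge1() above keeps only the users that appear in every model submission before unioning them with the rule-based list. A compact sketch of that intersect-then-union idea with plain sets is given below; the file paths are placeholders and the one-column, headerless format matches the submissions written by these scripts.

import pandas as pd

def read_ids(path):
    # one-column submission file with no header, as produced by the scripts above
    return set(pd.read_csv(path, header=None, names=["user_id"])["user_id"])

# hypothetical inputs: two model submissions and one rule-based list
model_a = read_ids("result/submission_model_a.csv")
model_b = read_ids("result/submission_model_b.csv")
rule_based = read_ids("result/submission_rule.csv")

# users both models agree on, then union with the hand-crafted rules
agreed = model_a & model_b
final_users = sorted(agreed | rule_based)
print(len(final_users))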
/hardcodedpy/new_merge.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 |
5 | def merge5():
6 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
7 | # hardcode_user = get_user()
8 | hardcode_user = pd.read_csv("hCoded/hcode_20-29_v5_2018-06-06_20-12_nolastdayoflaunch_22-30.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()
9 | print(len(hardcode_user))
10 | # hdf = pd.Series(hardcode_user,name="user_id")
11 | # hfile = "hCoded/hcode_28ac_"+str_time + ".csv"
12 | # hdf.to_csv(hfile,header=True,index=False)
13 | # mc1 = pd.read_csv("lr/uid_2018-06-07_22-55-45.csv",header=0,index_col=None)
14 | mc = pd.read_csv("hCoded/submission_freqUsers_v2_2018-06-08_11-38.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()
15 | # mc = mc1.loc[mc1["score"]>0.20]["user_id"].tolist()
16 | print(len(mc))
17 | ac_users = list(set(hardcode_user)-set(mc))
18 | print(len(ac_users))
19 |
20 | v5_user = pd.read_csv("hCoded/hcode_20-29_v5_2018-06-06_20-12_nolastdayoflaunch_22-30.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()
21 | print(len(v5_user))
22 |
23 | users = list(set(v5_user+ac_users))
24 | print(len(users))
25 | # #
26 | # submission_file = "merge/submission_0.815-baseline+v5_" + str_time + ".csv"
27 | # with open(submission_file,"a",newline="") as f:
28 | # writer = csv.writer(f)
29 | # for i in users:
30 | # writer.writerow([i])
31 | def merge6():
32 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
33 | # hardcode_user = get_user()
34 | hardcode_user = pd.read_csv("merge/submission_rule_consec_2018-06-25_20-23.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()
35 | mc = pd.read_csv("lgb/submission_lgb_r3_1600_4_2018-06-24_23-42-42.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()[:24000]
36 | print(len(hardcode_user))
37 | # hdf = pd.Series(hardcode_user,name="user_id")
38 | # hfile = "hCoded/hcode_28ac_"+str_time + ".csv"
39 | # hdf.to_csv(hfile,header=True,index=False)
40 | # mc1 = pd.read_csv("lr/uid_2018-06-07_22-55-45.csv",header=0,index_col=None)
41 | # mc = pd.read_csv("hCoded/submission_freqUsers_v3_2018-06-08_11-41.csv",header=None,index_col=None,names=["user_id"])["user_id"].unique().tolist()
42 | # mc = mc1.loc[mc1["score"]>0.20]["user_id"].tolist()
43 |
44 | # mc1 = pd.read_csv("lgb/uid_2018-06-04_16-55-34.csv",header=0,index_col=None)
45 | # mc = mc1.loc[mc1["score"]>0.7]["user_id"].tolist()
46 | # merged_csv1 = pd.read_csv("lgb/uid_2018-06-11_22-04-13.csv",header=0,index_col=None)
47 | # mc = merged_csv1["user_id"][:23800].tolist()
48 | print(len(mc))
49 |
50 | users = list(set(hardcode_user+mc))
51 | # users = list(set(mc))
52 | print(len(users))
53 | # #
54 | submission_file = "merge/submission_lgbhest_ru_" + str_time + ".csv"
55 | with open(submission_file,"a",newline="") as f:
56 | writer = csv.writer(f)
57 | for i in users:
58 | writer.writerow([i])
59 | # merge6()
60 | import numpy as np
61 | def get_user_from_activity_new(trainSpan,laterThanDay,activityCount):
62 | print("get users from user activity log")
63 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32,
64 | "author_id": np.uint32, "action_type": np.uint8}
65 | use_feature = ["user_id","user_activity_day"]
66 | df_user_activity = pd.read_csv("data/user_activity_log.csv", header=0, index_col=None, dtype=dtype_user_activity,usecols=use_feature)
67 |
68 | df_user_activity = df_user_activity.loc[
69 | (df_user_activity["user_activity_day"] >= trainSpan[0]) & (
70 | df_user_activity["user_activity_day"] <= trainSpan[1])]
71 | # print(df_app_launch.groupby(by=["user_id"]).size())
72 | # print(df_app_launch.groupby(by=["app_launch_day"]).size())
73 | df_user_activity["activityCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform("count")
74 | # print(df_user_activity.describe())
75 | user_activity = (df_user_activity.loc[(df_user_activity["user_activity_day"]>laterThanDay)&(df_user_activity["activityCount"]>activityCount)]).user_id.unique().tolist()
76 | print("users active no later than {} and active for more than {} times: {} ".format(laterThanDay,activityCount,len(user_activity)))
77 | return user_activity
78 | def get_user_from_appLaunch_new(trainSpan,laterThanDay,launchCount):
79 | print("get users from app launch log")
80 | app_launch_log = ["user_id","app_launch_day"]
81 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8}
82 | df_app_launch = pd.read_table("data/app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch).drop_duplicates()
83 | df_app_launch = df_app_launch.loc[
84 | (df_app_launch["app_launch_day"] >= trainSpan[0]) & (df_app_launch["app_launch_day"] <= trainSpan[1])]
85 | # print(df_app_launch.groupby(by=["user_id"]).size())
86 | # print(df_app_launch.groupby(by=["app_launch_day"]).size())
87 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique())
88 | user_appLaunch = (df_app_launch.loc[(df_app_launch["app_launch_day"]>laterThanDay)&(df_app_launch["launchCount"]>launchCount)]).user_id.unique().tolist()
89 | print("users launched no later than {} and launched for more than {} days: {} ".format(laterThanDay,launchCount,len(user_appLaunch)))
90 | return user_appLaunch
91 | if __name__=="__main__":
92 | # av1 = get_user_from_activity_new((30,30),29,216)
93 | # av2 = get_user_from_activity_new((29,30),29,342)
94 | # av3 = get_user_from_activity_new((28,30),28,452)
95 | # av4 = get_user_from_activity_new((27,30),27,569)
96 | # av = list(set(av1+av2+av3+av4))
97 | # print(len(av))
98 | # la1 = get_user_from_appLaunch_new((29,30), 29, 1)
99 | # print("number of users between {} and {} is {}".format(29,30,len(la1)))
100 | # la2 = get_user_from_appLaunch_new((28,30), 28, 1)
101 | # print("number of users between {} and {} is {}".format(28,30,len(la2)))
102 | # la3 = get_user_from_appLaunch_new((27,30), 27, 2)
103 | # print("number of users between {} and {} is {}".format(27,30,len(la3)))
104 | # la4 = get_user_from_appLaunch_new((26,30), 26, 3)
105 | # print("number of users between {} and {} is {}".format(26,30,len(la4)))
106 | # la5 = get_user_from_appLaunch_new((25,30), 25, 4)
107 | # print("number of users between {} and {} is {}".format(25,30,len(la5)))
108 | # # # la6 = get_user_from_appLaunch_new((24,30), 24, 5)
109 | # # # print("number of users between {} and {} is {}".format(24,30,len(la6)))
110 | # # # la7 = get_user_from_appLaunch_new((23,30), 23, 6)
111 | # # # print("number of users between {} and {} is {}".format(23,30,len(la7)))
112 | # # # la8 = get_user_from_appLaunch_new((22,30), 22, 7)
113 | # # # print("number of users between {} and {} is {}".format(22,30,len(la8)))
114 | # # # la9 = get_user_from_appLaunch_new((21,30), 21, 8)
115 | # # # print("number of users between {} and {} is {}".format(21,30,len(la9)))
116 | # # # la10 = get_user_from_appLaunch_new((20,30), 20, 9)
117 | # # # print("number of users between {} and {} is {}".format(20,30,len(la10)))
118 | # # # la11 = get_user_from_appLaunch_new((19,30), 19, 10)
119 | # # # print("number of users between {} and {} is {}".format(19,30,len(la11)))
120 | # # # la12 = get_user_from_appLaunch_new((18,30), 18, 11)
121 | # # # print("number of users between {} and {} is {}".format(18,30,len(la12)))
122 | # # # la13 = get_user_from_appLaunch_new((17,30), 17, 12)
123 | # # # print("number of users between {} and {} is {}".format(17,30,len(la13)))
124 | # # # la = list(set(av1+av2+la1+la2+la3+la4+la5+la6+la7+la8+la9+la10+la11+la12+la13))
125 | # la = list(set(av+la1+la2+la3+la4+la5))
126 | # print("number of consecutive users {}".format(len(la)))
127 | # str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
128 | # submission_file = "merge/submission_rule_consec_" + str_time + ".csv"
129 | # with open(submission_file,"a",newline="") as f:
130 | # writer = csv.writer(f)
131 | # for i in la:
132 | # writer.writerow([i])
133 | merge6()
--------------------------------------------------------------------------------
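The commented-out __main__ block above scans a sequence of trailing windows ((29,30), (28,30), ...) and raises the required launch-day count roughly one per extra day, so it keeps users who launched on almost every day of the window. Below is a sketch of that loop written as data rather than copy-pasted calls; it covers only the launch part, assumes get_user_from_appLaunch_new from new_merge.py is in scope, and takes the window list and thresholds from the comments above.

# each tuple: (window over which to count, minimum distinct launch days required)
windows = [((29, 30), 1), ((28, 30), 1), ((27, 30), 2), ((26, 30), 3), ((25, 30), 4)]

consecutive_users = set()
for (start, end), min_days in windows:
    # laterThanDay is the window start, matching the commented calls above
    users = get_user_from_appLaunch_new((start, end), start, min_days)
    consecutive_users.update(users)
print("number of consecutive users {}".format(len(consecutive_users)))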
/lrpy/lr_v2.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 | # import joblib
5 | from lightgbm import LGBMClassifier
6 | from sklearn.decomposition import PCA, FactorAnalysis
7 | from sklearn.feature_selection import SelectKBest, mutual_info_classif, SelectFromModel
8 | from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
9 | from sklearn.metrics import classification_report, f1_score
10 | # from sklearn.model_selection import GridSearchCV, train_test_split
11 | # from skopt import BayesSearchCV
12 | # from skopt.callbacks import DeltaXStopper
13 | # from data_process_v7 import processing
14 | # from sklearn.feature_selection import VarianceThreshold
15 | import numpy as np
16 | # def predict(clf2, test_set,param):
17 | from sklearn.pipeline import Pipeline
18 |
19 |
20 | def predict(clf2, test_set,param,sel):
21 | uid = pd.DataFrame()
22 | # test_set = processing(trainSpan=(1, 30), label=False)
23 | uid["user_id"] = test_set["user_id"]
24 | test_set = test_set.drop(labels=["user_id"], axis=1)
25 | test_set = sel.transform(test_set.values)
26 | print("begin to make predictions")
27 | # res = clf2.predict_proba(test_set.values)
28 | res = clf2.predict_proba(test_set)
29 | uid["proba1"] = pd.Series(res[:, 1])
30 | uid["score"] = uid.groupby(by=["user_id"])["proba1"].transform(lambda x: sum(x) / float(len(x)))
31 | uid.drop_duplicates(subset=["user_id"],inplace=True)
32 | uid.sort_values(by=["score"],axis=0,ascending=False,inplace=True)
33 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
34 | uid_file = "../result/uid/B/uid_lr_" +param+"_"+ str_time + ".csv"
35 | uid.to_csv(uid_file,header=True,index=False)
36 | # active_users = uid.loc[uid["score"]>0.5]["user_id"].unique().tolist()
37 | active_users = uid["user_id"][:24500].unique().tolist()
38 | # print(len(active_users))
39 | print(uid["score"].tolist()[24500])
40 | # print(active_users)
41 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
42 | submission_file = "../result/628/pm/submission_lr_" +param+"_"+ str_time + ".csv"
43 | with open(submission_file, "a", newline="") as f:
44 | writer = csv.writer(f)
45 | for i in active_users:
46 | writer.writerow([i])
47 | # to use this module, one needs to deconstruct some of the features in data_process
48 | def run(scheme_num=3,file_name="../data/data_v8/training_r"):
49 | train_set_ls = []
50 | if scheme_num ==1:
51 | for i in [16,17,22,23]:
52 | print("begin to load the dataset")
53 | file_name1 = file_name+"ld1-"+str(i)+".csv"
54 | train_set_temp = pd.read_csv(file_name1, header=0, index_col=None)
55 | print(train_set_temp.describe())
56 | train_set_ls.append(train_set_temp)
57 | elif scheme_num ==2:
58 | for i in [16,23]:
59 | print("begin to load the dataset")
60 | file_name2 = file_name+"ld1-" + str(i) + ".csv"
61 | train_set_temp = pd.read_csv(file_name2, header=0, index_col=None)
62 | print(train_set_temp.describe())
63 | train_set_ls.append(train_set_temp)
64 | elif scheme_num ==3:
65 | for i in [17,18,19,20,21,22,23]:
66 | print("begin to load the dataset"+str(i))
67 | file_name3 = file_name+ "ld1-" + str(i) + ".csv"
68 | train_set_temp = pd.read_csv(file_name3, header=0, index_col=None)
69 | print(train_set_temp.describe())
70 | train_set_ls.append(train_set_temp)
71 | val_file_name = file_name+ "ld1-23.csv"
72 | val_set = pd.read_csv(val_file_name, header=0, index_col=None)
73 | val_set2 = pd.read_csv("../data/data_v5/training_eld1-23.csv", header=0, index_col=None)
74 | print(val_set.describe())
75 | print(val_set2.describe())
76 | train_set = pd.concat(train_set_ls, axis=0)
77 | ds = train_set.describe()
78 | print(ds)
79 |
80 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
81 |
82 | print("begin to drop the duplicates")
83 | train_set.drop_duplicates(subset=keep_feature, inplace=True)
84 | val_set.drop_duplicates(subset=keep_feature,inplace=True)
85 | val_set2.drop_duplicates(subset=keep_feature,inplace=True)
86 | print(train_set.describe())
87 | print(val_set.describe())
88 | print(val_set2.describe())
89 | train_label = train_set["label"]
90 | val_label = val_set["label"]
91 | val_label2 = val_set2["label"]
92 | train_set = train_set.drop(labels=["label", "user_id"], axis=1)
93 | val_set = val_set.drop(labels=["label","user_id"], axis=1)
94 | val_set2 = val_set2.drop(labels=["label","user_id"], axis=1)
95 |
96 | drop_features = [""]
97 |
98 |
99 | print("begin to standardization the data")
100 | for fea in keep_feature:
101 | train_set[fea] = (train_set[fea]-train_set[fea].min())/(train_set[fea].max()-train_set[fea].min())
102 | # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std())
103 | val_set[fea] = (val_set[fea]-val_set[fea].min())/(val_set[fea].max()-val_set[fea].min())
104 | val_set2[fea] = (val_set2[fea]-val_set2[fea].min())/(val_set2[fea].max()-val_set2[fea].min())
105 | # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std())
106 | # print(train_set.describe())
107 | # keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
108 | # sel = SelectKBest(mutual_info_classif, k=300).fit(train_set.values, train_label.values)
109 | # train_set = sel.transform(train_set.values)
110 | # val_set = sel.transform(val_set.values)
111 | # val_set2 = sel.transform(val_set2.values)
112 | # feature_importances = sel.scores_
113 | # print(feature_importances)
114 | # print(keep_feature)
115 | # feature_score_name = sorted(zip(feature_importances, keep_feature), reverse=True)
116 | # for score, name in feature_score_name:
117 | # print('{}: {}'.format(name, score))
118 |
119 | # kpca = PCA(n_components=0.98)
120 | # # kpca = FactorAnalysis(n_components=100)
121 | # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1)
122 | # kpca.fit(train_set.values)
123 | # train_set = kpca.transform(train_set.values)
124 | # val_set = kpca.transform(val_set.values)
125 | # print(kpca.components_)
126 | # # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_))
127 | # print("number of components {}".format(kpca.n_components_))
128 | # print("noise variance {}".format(kpca.noise_variance_))
129 | # print("the explained variance {}".format(kpca.explained_variance_))
130 | # print("the explained variance ratio {}".format(kpca.explained_variance_ratio_))
131 |
132 | print("begin to make prediction with plain features and without tuning parameters")
133 |
134 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns))
135 |
136 | # best_f1 =0.0
137 | # best_params = {"n_estimators":800,"num_leaves":6}
138 | # for n_estimator in [400,600,800]:
139 | # for num_leave in [4,6,8]:
140 | # print({"n_estimators":n_estimator,"num_leaves":num_leave,"boosting_type":"dart"})
141 | # clf1 = LGBMClassifier(n_estimators=n_estimator, num_leaves=num_leave, boosting_type="dart")
142 | # clf1.fit(train_set.values, train_label.values)
143 | # print("load the test dataset")
144 | # yhat = clf1.predict(val_set.values)
145 | # print(classification_report(y_pred=yhat, y_true=val_label.values,digits=4))
146 | # f1 = f1_score(y_pred=yhat, y_true=val_label.values)
147 | # if best_f1 < f1:
[lr_v2.py is truncated here in this dump]
--------------------------------------------------------------------------------
/nnpy/dnn.py:
--------------------------------------------------------------------------------
[lines 1-64 of dnn.py are missing from this dump]
65 | # active_users = uid.loc[uid["score"]>0.5]["user_id"].unique().tolist()
66 | active_users = uid["user_id"][:24500].unique().tolist()
67 | # print(len(active_users))
68 | print(uid["score"].tolist()[24500])
69 | # print(active_users)
70 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
71 | submission_file = "../result/622/submission_dnn_" + param + "_" + str_time + ".csv"
72 | with open(submission_file, "a", newline="") as f:
73 | writer = csv.writer(f)
74 | for i in active_users:
75 | writer.writerow([i])
76 |
77 | # to use this module, one needs to deconstruct some of the features in data_process
78 | def run(scheme_num=1, file_name="../data/data_v3/training_e"):
79 | train_set_ls = []
80 | if scheme_num == 1:
81 | for i in [16, 17, 22, 23]:
82 | print("begin to load the dataset")
83 | file_name1 = file_name + "ld1-" + str(i) + ".csv"
84 | train_set_temp = pd.read_csv(file_name1, header=0, index_col=None)
85 | print(train_set_temp.describe())
86 | train_set_ls.append(train_set_temp)
87 | elif scheme_num == 2:
88 | for i in [16, 23]:
89 | print("begin to load the dataset")
90 | file_name2 = file_name + "ld1-" + str(i) + ".csv"
91 | train_set_temp = pd.read_csv(file_name2, header=0, index_col=None)
92 | print(train_set_temp.describe())
93 | train_set_ls.append(train_set_temp)
94 | elif scheme_num == 3:
95 | for i in [18, 19, 20, 21, 22, 23]:
96 | print("begin to load the dataset")
97 | file_name3 = file_name + "ld1-" + str(i) + ".csv"
98 | train_set_temp = pd.read_csv(file_name3, header=0, index_col=None)
99 | print(train_set_temp.describe())
100 | train_set_ls.append(train_set_temp)
101 | val_file_name = file_name + "ld1-22.csv"
102 | val_set = pd.read_csv(val_file_name, header=0, index_col=None)
103 | print(val_set.describe())
104 | train_set = pd.concat(train_set_ls, axis=0)
105 | ds = train_set.describe()
106 | print(ds)
107 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
108 |
109 | print("begin to drop the duplicates")
110 | train_set.drop_duplicates(subset=keep_feature, inplace=True)
111 | val_set.drop_duplicates(subset=keep_feature, inplace=True)
112 | print(train_set.describe())
113 | print(val_set.describe())
114 | train_label = train_set["label"]
115 | val_label = val_set["label"]
116 | train_set = train_set.drop(labels=["label", "user_id"], axis=1)
117 | val_set = val_set.drop(labels=["label", "user_id"], axis=1)
118 |
119 | print("begin to standardization the data")
120 | for fea in keep_feature:
121 | if train_set[fea].var() < 0.000001 or val_set[fea].var() < 0.000001:
122 | train_set.drop(labels=[fea], axis=1, inplace=True)
123 | val_set.drop(labels=[fea], axis=1, inplace=True)
124 | else:
125 | train_set[fea] = (train_set[fea] - train_set[fea].min()) / (train_set[fea].max() - train_set[fea].min())
126 | # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std())
127 | val_set[fea] = (val_set[fea] - val_set[fea].min()) / (val_set[fea].max() - val_set[fea].min())
128 | # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std())
129 | keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
130 | kpca = PCA(n_components=0.99, whiten=True)
131 | # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1)
132 | kpca.fit(train_set.values)
133 | train_set = kpca.transform(train_set.values)
134 | val_set = kpca.transform(val_set.values)
135 | pca_std = np.std(train_set)
136 | # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_))
137 | NCOMPONENTS = kpca.n_components_
138 | print("number of components {}".format(kpca.n_components_))
139 | print("noise variance {}".format(kpca.noise_variance_))
140 | print("the explained variance {}".format(kpca.explained_variance_))
141 | print("the explained variance ratio {}".format(kpca.explained_variance_ratio_))
142 |
143 | print("begin to make prediction with plain features and without tuning parameters")
144 |
145 | # scoring = {'f1': "f1"}
146 | # clf1 = GridSearchCV(LGBMClassifier(),
147 | # param_grid={"n_estimators":[200,400,600],"num_leaves": [4,5,6,8],"boosting_type":["dart"]},
148 | # scoring=scoring, cv=4, refit='f1',n_jobs=-1,verbose=1)
149 |
150 | for layers in [3]:
151 | for units in [128]:
152 | print({"layers": layers, "neurals": units})
153 | model = Sequential()
154 | # model.add(Dense(units, input_dim=NCOMPONENTS, activation='relu'))
155 | # model.add(Embedding(units,32, input_lenth=NCOMPONENTS))
156 | # model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
157 | # model.add(MaxPooling1D(pool_size=2))
158 | # model.add(Flatten())
159 | # model.add(Dense(250, activation='relu'))
160 | # model.add(Dense(1, activation='sigmoid'))
161 | model.add(Dense(units, input_dim=NCOMPONENTS, activation='relu'))
162 | model.add(GaussianNoise(pca_std))
163 | for i in range(layers):
164 | model.add(Dense(units, activation='relu'))
165 | model.add(GaussianNoise(pca_std))
166 | model.add(Dropout(0.1))
167 | model.add(Dense(1, activation='sigmoid'))
168 | print(model.summary())
169 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
170 | early_stopping = EarlyStopping(monitor="val_loss",patience=16)
171 | model.fit(train_set, train_label, epochs=300, batch_size=256, validation_split=0.15, verbose=2,callbacks=[early_stopping])
172 |
173 | print("begin to make classification report for the validation dataset")
174 | # yhat = clf1.predict(val_set.values)
175 | # yhat = clf1.predict(val_set.values)
176 | yhat = np.reshape(model.predict_classes(val_set),-1)
177 |
178 | print(classification_report(y_pred=yhat, y_true=val_label.values, digits=4))
179 |
180 | print("begin to make classification report for the training dataset")
181 | # yhat = clf1.predict(train_set.values)
182 | yhat = np.reshape(model.predict_classes(train_set),-1)
183 | print(classification_report(y_pred=yhat, y_true=train_label.values, digits=4))
184 |
185 | print("load the test dataset")
186 | test_file_name = file_name.replace("training", "testing") + "ld1-30.csv"
187 | test_set = pd.read_csv(test_file_name, header=0, index_col=None, usecols=keep_feature + ["user_id"])
188 | # test_set = pd.read_csv("data/testing_rld1-30.csv",header=0,index_col=None)
189 | for fea in keep_feature:
190 | test_set[fea] = (test_set[fea] - test_set[fea].min()) / (test_set[fea].max() - test_set[fea].min())
191 | # test_set[fea] = (test_set[fea]-test_set[fea].mean())/(test_set[fea].std())
192 |
193 | print("begin to make prediction")
194 | param = list(file_name)[-1] + str(scheme_num) + "_" + str(layers) + "_" + str(units)
195 | print(param)
196 | # predict(clf1,test_set,param)
197 | predict(model, test_set, param, kpca)
198 |
199 | if __name__ == "__main__":
200 | file_name1 = "../data/data_v3/training_e"
201 | file_name2 = "../data/data_v4/training_r"
202 | for scheme in [3]:
203 | for file in ["../data/data_v4/training_r"]:
204 | run(scheme_num=scheme,file_name=file)
--------------------------------------------------------------------------------
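One pitfall in the run()/predict() pair above is that the test set is min-max scaled with its own column minima and maxima while the training set uses different ones, so identical raw values can map to different scaled values. The sketch below is a hedged alternative, not the repository's code: it fits the scaler and the PCA on the training features only and reuses them for the test features via an sklearn Pipeline; the random arrays stand in for train_set.values and the test feature matrix.

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# fit the preprocessing once, on training features only
preproc = Pipeline([
    ("scale", MinMaxScaler()),          # replaces the per-frame (x - min) / (max - min) loops
    ("pca", PCA(n_components=0.99, whiten=True)),
])

X_train = np.random.rand(100, 20)  # stand-in for train_set.values
X_test = np.random.rand(30, 20)    # stand-in for the test feature values

Z_train = preproc.fit_transform(X_train)
Z_test = preproc.transform(X_test)  # same scaler and components as the training data
print(Z_train.shape, Z_test.shape)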
/nnpy/f1_keras.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 |
3 | def f1(y_true, y_pred):
4 | def recall(y_true, y_pred):
5 | """Recall metric.
6 |
7 | Only computes a batch-wise average of recall.
8 |
9 | Computes the recall, a metric for multi-label classification of
10 | how many relevant items are selected.
11 | """
12 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
13 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
14 | recall = true_positives / (possible_positives + K.epsilon())
15 | return recall
16 |
17 | def precision(y_true, y_pred):
18 | """Precision metric.
19 |
20 | Only computes a batch-wise average of precision.
21 |
22 | Computes the precision, a metric for multi-label classification of
23 | how many selected items are relevant.
24 | """
25 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
26 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
27 | precision = true_positives / (predicted_positives + K.epsilon())
28 | return precision
29 | precision = precision(y_true, y_pred)
30 | recall = recall(y_true, y_pred)
31 | return 2*((precision*recall)/(precision+recall+K.epsilon()))
32 |
--------------------------------------------------------------------------------
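As the docstrings note, this f1 is computed per batch, so the value Keras reports during training only approximates the epoch-level F1. If an exact validation F1 is needed, a common workaround (a sketch under that assumption, not part of this repository) is to compute it in a callback at the end of each epoch:

import numpy as np
from keras.callbacks import Callback
from sklearn.metrics import f1_score

class EpochF1(Callback):
    """Compute exact F1 on held-out data at the end of every epoch."""
    def __init__(self, x_val, y_val):
        super(EpochF1, self).__init__()
        self.x_val, self.y_val = x_val, y_val

    def on_epoch_end(self, epoch, logs=None):
        # threshold the sigmoid outputs at 0.5 and score against the true labels
        y_pred = (self.model.predict(self.x_val) > 0.5).astype(int).ravel()
        print("epoch {}: val f1 = {:.4f}".format(epoch, f1_score(self.y_val, y_pred)))

# usage (hypothetical): model.fit(x_train, y_train, callbacks=[EpochF1(x_val, y_val)])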
/nnpy/nn_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.preprocessing import LabelEncoder
4 | import tensorflow as tf
5 | from sklearn.metrics import r2_score, log_loss, roc_auc_score
6 | from sklearn.model_selection import KFold
7 |
8 | # Training steps
9 | STEPS = 500
10 | LEARNING_RATE = 0.0001
11 | BETA = 0.01
12 | DROPOUT = 0.5
13 | RANDOM_SEED = 12345
14 | MAX_Y = 250
15 | RESTORE = True
16 | START = 0
17 |
18 | # Training variables
19 | IN_DIM = 13
20 |
21 | # Network Parameters - Hidden layers
22 | n_hidden_1 = 100
23 | n_hidden_2 = 50
24 |
25 | def weight_variable(shape):
26 | initial = tf.truncated_normal(shape, stddev=0.01)
27 | return tf.Variable(initial)
28 |
29 | def bias_variable(shape):
30 | initial = tf.constant(0.03, shape=shape)
31 | return tf.Variable(initial)
32 |
33 | def deep_network(inputs, keep_prob):
34 | # Input -> Hidden Layer
35 | w1 = weight_variable([IN_DIM, n_hidden_1])
36 | b1 = bias_variable([n_hidden_1])
37 | # Hidden Layer -> Hidden Layer
38 | w2 = weight_variable([n_hidden_1, n_hidden_2])
39 | b2 = bias_variable([n_hidden_2])
40 | # Hidden Layer -> Output
41 | w3 = weight_variable([n_hidden_2, 1])
42 | b3 = bias_variable([1])
43 |
44 | # 1st Hidden layer with dropout
45 | h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
46 | h1_dropout = tf.nn.dropout(h1, keep_prob)
47 | # 2nd Hidden layer with dropout
48 | h2 = tf.nn.relu(tf.matmul(h1_dropout, w2) + b2)
49 | h2_dropout = tf.nn.dropout(h2, keep_prob)
50 |
51 | # Run sigmoid on output to get 0 to 1
52 | out = tf.nn.sigmoid(tf.matmul(h2_dropout, w3) + b3)
53 |
54 | # Loss function with L2 Regularization
55 | regularizers = tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3)
56 |
57 | scaled_out = tf.multiply(out, MAX_Y) # Scale output
58 | return inputs, out, scaled_out, regularizers
59 | def get_stratified_sample(df, sample_target="user_id", reference_target="app_launch_day",
60 | sample_ratio=0.2):
61 | df = df.astype(np.uint32)
62 | reference_target_ls = df[reference_target].unique().tolist()
63 | target_sample = []
64 | for i in reference_target_ls:
65 | # print("get users in day {}".format(i))
66 | target_sample.extend(df.loc[df[reference_target] == int(i)][sample_target].drop_duplicates().sample(frac=sample_ratio).tolist())
67 | del df
68 | return list(set(target_sample))
69 | def nn_model(train_set,val_set,file,best_params=None,val_ratio=0.4, n_round = 3):
70 | tf.set_random_seed(RANDOM_SEED)
71 |
72 | # Create the model
73 | x = tf.placeholder(tf.float32, [None, IN_DIM])
74 |
75 | # Define loss and optimizer
76 | y_ = tf.placeholder(tf.float32, [None, 1])
77 |
78 | # Dropout on hidden layers
79 | keep_prob = tf.placeholder("float")
80 |
81 | # Build the graph for the deep net
82 | inputs, out, scaled_out, regularizers = deep_network(x, keep_prob)
83 |
84 | # Normal loss function (RMSE)
85 | loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(y_, scaled_out))))
86 |
87 | # Loss function with L2 Regularization
88 | loss = tf.reduce_mean(loss + BETA * regularizers)
89 |
90 | # Optimizer
91 | train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)
92 |
93 |
94 | # sklearn's log_loss / roc_auc_score cannot be applied to TF tensors directly;
95 | # they are computed on evaluated predictions inside the session below
96 |
97 | # Save model
98 | use_cols = list(set(train_set.columns)-set(["user_id","label"]))
99 | df_val_user = pd.read_pickle("../work/val_user_17_23.pkl")
100 | val_user_all = df_val_user["user_id"].unique().tolist()
101 | final_rank = 0
102 | train_set.drop_duplicates(inplace=True, subset=use_cols, keep="last")
103 | val_set.drop_duplicates(inplace=True, subset=use_cols, keep="last")
104 | saver = tf.train.Saver(max_to_keep=5)
105 |
106 | with tf.Session() as sess:
107 | #if RESTORE:
108 | # print('Loading Model...')
109 | # ckpt = tf.train.get_checkpoint_state('./models/neural/')
110 | # saver.restore(sess, ckpt.model_checkpoint_path)
111 | #else:
112 | sess.run(tf.global_variables_initializer())
113 | # val = val_set.iloc[-val_len:, :].sample(frac=val_ratio)
114 | # val = val_set.sample(frac=val_ratio)
115 | val_user = get_stratified_sample(df_val_user, sample_ratio=val_ratio)
116 | val_user_add = list(set(val_user_all) - set(val_user))
117 | val = val_set[val_set["user_id"].isin(val_user)]
118 | # val_add = val_set[val_set["user_id"].isin(val_user_add)]
119 | val_train = val_set[~val_set["user_id"].isin(val["user_id"])]
120 | train = pd.concat([train_set, val_train], axis=0)
121 | # train = pd.concat([train_set, val_train,val_add], axis=0)
122 | print("shape of val:", val.shape)
123 | print("shape of train:", train.shape)
124 | y_train = train['label']
125 | train = train.drop(['user_id', "label"], axis=1)
126 | val_y = val['label']
127 | val_x = val.drop(['user_id', "label"], axis=1)
128 |
129 | # Train until maximum steps reached or interrupted
130 | for i in range(START, STEPS):
131 | k_fold = KFold(n_splits=10, shuffle=True)
132 | #if i % 100 == 0:
133 | # saver.save(sess, './models/neural/step_' + str(i) + '.cptk')
134 |
135 | for k, (ktrain, ktest) in enumerate(k_fold.split(train, y_train)):
136 | train_step.run(feed_dict={x: train.values[ktrain], y_: y_train.values[ktrain].reshape(-1, 1), keep_prob: DROPOUT})
137 | # Show test score every 10 iterations
138 | if i % 10 == 0:
139 | # Tensorflow R2
140 | #train_accuracy = accuracy.eval(feed_dict={
141 | # x: train[ktest], y_: y_train[ktest]})
142 | # sklearn log loss on the held-out fold
143 | train_accuracy = log_loss(y_train.values[ktest],
144 | sess.run(scaled_out, feed_dict={x: train.values[ktest], keep_prob: 1.0}))
145 | print('Step: %d, Fold: %d, Log Loss: %g' % (i, k, train_accuracy))
146 |
147 | CV = []
148 | for i in range(n_round):
149 | k_fold = KFold(n_splits=10, shuffle=True)
150 | for k, (ktrain, ktest) in enumerate(k_fold.split(train, y_train)):
151 | # Tensorflow R2
152 | #accuracy = accuracy.eval(feed_dict={
153 | # x: train[ktest], y_: y_train[ktest]})
154 | # sklearn AUC on the held-out fold
155 | auc = roc_auc_score(y_train.values[ktest],
156 | sess.run(scaled_out, feed_dict={x: train.values[ktest], keep_prob: 1.0}))
157 | print('Step: %d, Fold: %d, AUC: %g' % (i, k, auc))
158 | CV.append(auc)
159 | print('Mean AUC: %g' % (np.mean(CV)))
160 |
161 | if __name__ == '__main__':
162 | tf.app.run()
--------------------------------------------------------------------------------
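get_stratified_sample() above draws, for every value of the reference column, a fraction of the distinct user_ids seen on that day and returns the de-duplicated union; it is used to pick the validation users. A tiny usage sketch on toy data, assuming the helper from nn_model.py is in scope:

import pandas as pd

df_val_user = pd.DataFrame({
    "user_id":        [1, 2, 3, 4, 1, 2, 5, 6],
    "app_launch_day": [17, 17, 17, 17, 18, 18, 18, 18],
})

# roughly half of the distinct users of each day end up in the validation split
val_user = get_stratified_sample(df_val_user, sample_target="user_id",
                                 reference_target="app_launch_day", sample_ratio=0.5)
print(sorted(val_user))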
/nnpy/nn_v1.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 | import joblib
5 | from sklearn.model_selection import GridSearchCV
6 | from sklearn.neural_network import MLPClassifier
7 | from skopt import BayesSearchCV
8 | from skopt.callbacks import DeltaXStopper
9 | from data_process_v4 import processing
10 | from skopt.space import Categorical
11 |
12 | def predict(clf2, test_set):
13 | uid = pd.DataFrame()
14 | # test_set = processing(trainSpan=(1, 30), label=False)
15 | uid["user_id"] = test_set["user_id"]
16 | test_set = test_set.drop(labels=["user_id"], axis=1)
17 | print("begin to make predictions")
18 | res = clf2.predict_proba(test_set.values)
19 | uid["proba1"] = pd.Series(res[:, 1])
20 | uid["score"] = uid.groupby(by=["user_id"])["proba1"].transform(lambda x: sum(x) / float(len(x)))
21 | uid.drop_duplicates(subset=["user_id"],inplace=True)
22 | uid.sort_values(by=["score"],axis=0,ascending=False,inplace=True)
23 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
24 | uid_file = "result/uid_" + str_time + ".csv"
25 | uid.to_csv(uid_file,header=True,index=False)
26 | active_users = uid["user_id"][:24000].unique().tolist()
27 | print(len(active_users))
28 | print(active_users)
29 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
30 | submission_file = "result/submission_nn_" + str_time + ".csv"
31 | with open(submission_file, "a", newline="") as f:
32 | writer = csv.writer(f)
33 | for i in active_users:
34 | writer.writerow([i])
35 | # to use this module, one needs to deconstruct some of the features in data_process
36 | def run():
37 | # print("begin to load the trainset1")
38 | # train_set1 = processing(trainSpan=(1,19),label=True)
39 | # train_set1.to_csv("data/training_ld1-19.csv", header=True, index=False)
40 | # train_set1 = pd.read_csv("data/training_ld1-16.csv", header=0, index_col=None)
41 | # print(train_set1.describe())
42 | # print("begin to load the trainset2")
43 | # train_set2 = processing(trainSpan=(5,23),label=True)
44 | # train_set2.to_csv("data/training_ld5-23.csv", header=True, index=False)
45 | # train_set2 = pd.read_csv("data/training_ld8-23.csv", header=0, index_col=None)
46 | # print(train_set1.describe())
47 | # print("begin to load the trainset3")
48 | # train_set3 = processing(trainSpan=(1,23),label=True)
49 | # train_set3.to_csv("data/training_ld1-23.csv", header=True, index=False)
50 | # train_set3 = pd.read_csv("data/training_ld1-23.csv", header=0, index_col=None)
51 | # print(train_set1.describe())
52 | print("begin to merge the trainsets")
53 | # train_set = pd.concat([train_set1,train_set2,train_set3],axis=0)
54 | # train_set = pd.concat([train_set1,train_set2],axis=0)
55 | # train_set.to_csv("data/training_lm5-23.csv", header=True, index=False)
56 | train_set = pd.read_csv("data/training_lm15-23.csv", header=0, index_col=None)
57 | # del train_set1,train_set2
58 | # gc.collect()
59 | print(train_set.describe())
60 | keep_feature = list(set(train_set.columns.values.tolist())-set(["user_id","label"]))
61 | print("begin to drop the duplicates")
62 | train_set.drop_duplicates(subset=keep_feature,inplace=True)
63 | print(train_set.describe())
64 | train_label =train_set["label"]
65 | train_set = train_set.drop(labels=["label","user_id"], axis=1)
66 |
67 | # train_x, val_x,train_y,val_y = train_test_split(train_set.values,train_label.values,test_size=0.33,random_state=42,shuffle=True)
68 | print("begin to make prediction with plain features and without tuning parameters")
69 | initial_params = {
70 | "hidden_layer_sizes": (128,128),
71 | "activation": "relu",
72 | "solver": "adam",
73 | "batch_size":"auto",
74 | "learning_rate": "adaptive",
75 | "alpha": 0.0001,
76 | "max_iter": 400,
77 | "verbose": True,
78 | "warm_start": True,
79 | "early_stopping": True,
80 | "validation_fraction": 0.1,
81 | }
82 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns))
83 |
84 | scoring = {'AUC': 'roc_auc', 'f1': "f1"}
85 | clf1 = GridSearchCV(MLPClassifier(**initial_params),
86 | param_grid={
87 | "max_iter":[400,800,1200],
88 | "solver": ["lbfgs","adam"],
89 | "batch_size":[128,200,156]},
90 | scoring=scoring, cv=4, refit='f1',n_jobs=-1,verbose=2)
91 | clf1.fit(train_set.values, train_label.values)
92 | # cv_results = cv(initial_params,train_data,num_boost_round=800,nfold=4,early_stopping_rounds=30,verbose_eval=True)
93 | # bst = lgb.cv(initial_params, train_data, num_boost_round=1000, nfold=3, early_stopping_rounds=30)
94 | bs = clf1.best_score_
95 | print(bs)
96 | bp = clf1.best_params_
97 | print(bp)
98 | # clf1 = LGBMClassifier(**initial_params)
99 | # clf1.fit(X=train_x,y=train_y,eval_set=(val_x,val_y),early_stopping_rounds=20,eval_metric="auc")
100 | print("load the test dataset")
101 | # test_set = processing(trainSpan=(15, 30), label=False)
102 | # test_set.to_csv("data/testing_ld15-30.csv",header=True,index=False)
103 | test_set = pd.read_csv("data/testing_ld15-30.csv",header=0,index_col=None)
104 | print("begin to make prediction")
105 | predict(clf1,test_set)
106 |
107 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
108 | with open("kuaishou_stats.csv", 'a', newline='') as f:
109 | writer = csv.writer(f)
110 | writer.writerow(["feature importance of nn for kuaishou-crt ", str_time])
111 | writer.writerow(["best score",bs,"best params"])
112 | for key, value in bp.items():
113 | writer.writerow([key, value])
114 | model_name = "nn_" + str_time + ".pkl"
115 | joblib.dump(clf1, model_name)
116 | print("begin to tune the parameters with the selected feature")
117 | hls = []
118 | for i in [32, 64]:
119 | hls.append((i * 3,i * 3))
120 | hls.append((i * 4,i * 4))
121 | hls.append((i*2, i * 3, i*2))
122 | hls.append((i*3, i * 4, i*3))
123 | # hls.append((i,i * 2, i * 4, i * 3))
124 | paramsSpace = {
125 | "hidden_layer_sizes": Categorical(hls),
126 | "activation": Categorical(["logistic", "tanh", "relu"]),
127 | "solver": Categorical(["lbfgs", "sgd", "adam"]),
128 | "learning_rate": Categorical(["invscaling", "adaptive"]),
129 | "alpha": Categorical([0.0001, 0.001, 0.01,0.1,1.0]),
130 | "batch_size":(128, 256),
131 | "max_iter":(400, 1200),
132 | "momentum":(0.6, 1.0, 'uniform'),
133 | "beta_1":(0.6, 1.0, 'uniform'),
134 | "beta_2":(0.98, 0.99990, 'uniform'),
135 | }
136 | def tune_parameter(X, y, clf, params):
137 | # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
138 | gs = BayesSearchCV(
139 | estimator=clf, search_spaces=params,
140 | scoring="f1", n_iter=60,optimizer_kwargs={"base_estimator":"GP"},
141 | verbose=0, n_jobs=-1, cv=4, refit=True, random_state=1234
142 | )
143 | gs.fit(X, y,callback=DeltaXStopper(0.000001))
144 | best_params = gs.best_params_
145 | best_score = gs.best_score_
146 | print(best_params)
147 | print(best_score)
148 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
149 | with open("kuaishou_stats.csv", 'a', newline='') as f:
150 | writer = csv.writer(f)
151 | writer.writerow(["the best params for nn: "])
152 | for key, value in best_params.items():
153 | writer.writerow([key, value])
154 | writer.writerow(["the best score for nn: ", best_score,str_time])
155 | return gs
156 |
157 | model = MLPClassifier(**bp)
158 | clf2 = tune_parameter(train_set.values,train_label.values,model,paramsSpace)
159 | print("parameter tuning over, begin to save the model!")
160 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
161 |
162 | model_name = "nn_" + str_time + ".pkl"
163 | joblib.dump(clf2, model_name)
164 |
165 | print("begin to process the whole dataset and ready to feed into the fitted model")
166 | predict(clf2,test_set)
167 |
168 | if __name__=="__main__":
169 | run()
--------------------------------------------------------------------------------
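predict() above averages the predicted probability per user (the test set may contain several feature rows per user), sorts users by that score, and submits a fixed number of top users rather than thresholding at 0.5. The following is a distilled sketch of just that ranking step; the function name and the top_n default are hypothetical.

import pandas as pd

def rank_and_cut(user_ids, probas, top_n=24000):
    """Average per-user probability, rank descending, return the top_n user ids."""
    uid = pd.DataFrame({"user_id": user_ids, "proba1": probas})
    uid["score"] = uid.groupby("user_id")["proba1"].transform("mean")
    uid = uid.drop_duplicates(subset=["user_id"]).sort_values("score", ascending=False)
    return uid["user_id"].head(top_n).tolist()

# usage with dummy scores
print(rank_and_cut([1, 1, 2, 3], [0.9, 0.7, 0.4, 0.6], top_n=2))  # [1, 3]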
/nnpy/nn_v2.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | def KerasClassifier_wrapper(input_dims):
4 | import tensorflow as tf
5 | from keras import backend as K
6 | # AUC for a binary classifier
7 | def auc(y_true, y_pred):
8 | ptas = tf.stack([binary_PTA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0)
9 | pfas = tf.stack([binary_PFA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0)
10 | pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
11 | binSizes = -(pfas[1:] - pfas[:-1])
12 | s = ptas * binSizes
13 | return K.sum(s, axis=0)
14 |
15 | # -----------------------------------------------------------------------------------------------------------------------------------------------------
16 | # PFA, prob false alert for binary classifier
17 | def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
18 | y_pred = K.cast(y_pred >= threshold, 'float32')
19 | # N = total number of negative labels
20 | N = K.sum(1 - y_true)
21 | # FP = total number of false alerts, alerts from the negative class labels
22 | FP = K.sum(y_pred - y_pred * y_true)
23 | return FP / (N+0.00000001)
24 |
25 | # -----------------------------------------------------------------------------------------------------------------------------------------------------
26 | # P_TA prob true alerts for binary classifier
27 | def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
28 | y_pred = K.cast(y_pred >= threshold, 'float32')
29 | # P = total number of positive labels
30 | P = K.sum(y_true)
31 | # TP = total number of correct alerts, alerts from the positive class labels
32 | TP = K.sum(y_pred * y_true)
33 | return TP / (P+0.00000001)
34 | # prepare callbacks
35 | def model():
36 | from keras.models import Sequential
37 | model = Sequential()
38 | # input layer
39 | from keras.layers import Dense
40 | model.add(Dense(input_dims, input_dim=input_dims))
41 | from keras.layers import BatchNormalization
42 | model.add(BatchNormalization())
43 | from keras.layers import Activation
44 | model.add(Activation('relu'))
45 | from keras.layers import Dropout
46 | model.add(Dropout(0.4))
47 | # hidden layers
48 | model.add(Dense(input_dims))
49 | model.add(BatchNormalization())
50 | model.add(Activation('relu'))
51 | model.add(Dropout(0.4))
52 |
53 | model.add(Dense(input_dims // 2))
54 | model.add(BatchNormalization())
55 | model.add(Activation('relu'))
56 | model.add(Dropout(0.4))
57 |
58 | model.add(Dense(input_dims // 4, activation='relu'))
59 |
60 | # output layer (y_pred)
61 | model.add(Dense(1, activation='sigmoid'))
62 |
63 | # compile this model
64 | model.compile(loss='binary_crossentropy', # one may use 'mean_absolute_error' as alternative
65 | optimizer='adam',
66 | metrics=[auc] # you can add several if needed
67 | )
68 | # Visualize NN architecture
69 | print(model.summary())
70 | return model
71 | from keras.wrappers.scikit_learn import KerasClassifier
72 | return KerasClassifier(build_fn=model)
73 | def get_stratified_sample(df, sample_target="user_id", reference_target="app_launch_day",
74 | sample_ratio=0.2):
75 | df = df.astype(np.uint32)
76 | reference_target_ls = df[reference_target].unique().tolist()
77 | target_sample = []
78 | for i in reference_target_ls:
79 | # print("get users in day {}".format(i))
80 | target_sample.extend(df.loc[df[reference_target] == int(i)][sample_target].drop_duplicates().sample(frac=sample_ratio).tolist())
81 | del df
82 | return list(set(target_sample))
83 | def nn_predict(train_set,val_set,test_set,file,minmax_scale=True,val_ratio=0.4, n_round = 3):
84 | import numpy as np
85 | from scipy.stats import rankdata
86 | import random
87 | import gc
88 | res = test_set[['user_id']].copy()  # copy to avoid SettingWithCopyWarning when adding 'prob'
89 | test_x = test_set.drop(['user_id',"label"], axis=1)
90 | res['prob'] = 0
91 | user_register_log = ["user_id", "register_day", "register_type", "device_type"]
92 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": np.uint8}
93 | testb = \
94 | pd.read_table('/mnt/datasets/fusai/user_register_log.txt', header=None, names=user_register_log, index_col=None,
95 | dtype=dtype_user_register)[['user_id']]
96 | print("begin to train ")
97 |
98 | # val_len = int(len(val_set)*0.4)
99 | # use_cols = list(set(train_set.columns)-set(["user_id","label"]))
100 | df_val_user = pd.read_pickle("../work/val_user_8_23.pkl")
101 | # val_user_all = df_val_user["user_id"].unique().tolist()
102 | # final_rank = 0
103 | # train_set.drop_duplicates(inplace=True, subset=use_cols, keep="last")
104 | # val_set.drop_duplicates(inplace=True, subset=use_cols, keep="last")
105 | # train_set.reset_index(drop=True,inplace=True)
106 | # val_set.reset_index(drop=True,inplace=True)
107 | if minmax_scale:
108 | for f in test_x.columns:
109 | train_set[f] = (train_set[f]-train_set[f].min())/(train_set[f].max()-train_set[f].min())
110 | val_set[f] = (val_set[f]-val_set[f].min())/(val_set[f].max()-val_set[f].min())
111 | test_x[f] = (test_x[f]-test_x[f].min())/(test_x[f].max()-test_x[f].min())
112 | for i in range(n_round):
113 | random.seed(np.random.randint(1, 1000))
114 | # val = val_set.iloc[-val_len:, :].sample(frac=val_ratio)
115 | # val = val_set.sample(frac=val_ratio)
116 | print("get stratified sample validation user")
117 | val_user = get_stratified_sample(df_val_user,sample_ratio=val_ratio)
118 | # val_user_add = list(set(val_user_all)-set(val_user))
119 | val = val_set.loc[val_set["user_id"].isin(val_user)]
120 | # val_add = val_set.loc[val_set["user_id"].isin(val_user_add)]
121 | val_train = val_set.loc[~val_set["user_id"].isin(val["user_id"])]
122 | train = pd.concat([train_set, val_train], axis=0)
123 | # train = pd.concat([train_set, val_train,val_add], axis=0)
124 | print("the {}th round".format(i))
125 | print("shape of val:", val.shape)
126 | print("shape of train:", train.shape)
127 | train_y = train['label']
128 | train_x = train.drop(['user_id', "label"], axis=1)
129 | val_y = val['label']
130 | val_x = val.drop(['user_id', "label"], axis=1)
131 | from keras.callbacks import ModelCheckpoint
132 | from keras.callbacks import EarlyStopping
133 | clf_nn = KerasClassifier_wrapper(train_x.shape[1])
134 | model_path = "../input/keras_model.h5"
135 | callbacks = [
136 | EarlyStopping(
137 | monitor='val_auc',
138 | patience=20,
139 | mode='max',
140 | verbose=100),
141 | ModelCheckpoint(
142 | model_path,
143 | monitor='val_auc',
144 | save_best_only=True,
145 | mode='max',
146 | verbose=100)
147 | ]
148 | # fit estimator
149 | history = clf_nn.fit(
150 | train_x,
151 | train_y,
152 | epochs=500,
153 | batch_size=1024,
154 | validation_data=(val_x, val_y),
155 | verbose=1,
156 | callbacks=callbacks,
157 | shuffle=True,
158 | # (Keras fit() does not accept an n_jobs argument)
159 | )
160 | print(history.history.keys())
161 | import matplotlib.pyplot as plt
162 | # summarize history for R^2
163 | fig_acc = plt.figure(figsize=(10, 10))
164 | plt.plot(history.history['auc'])
165 | plt.plot(history.history['val_auc'])
166 | plt.title('model auc')
167 | plt.ylabel('auc')
168 | plt.xlabel('epoch')
169 | plt.legend(['train', 'test'], loc='upper left')
170 | plt.show()
171 | fig_acc.savefig("model_auc.png")
172 |
173 | # summarize history for loss
174 | fig_loss = plt.figure(figsize=(10, 10))
175 | plt.plot(history.history['loss'])
176 | plt.plot(history.history['val_loss'])
177 | plt.title('model loss')
178 | plt.ylabel('loss')
179 | plt.xlabel('epoch')
180 | plt.legend(['train', 'test'], loc='upper left')
181 | plt.show()
182 | fig_loss.savefig("model_loss.png")
183 | # weight = 2*(temp_score_train*temp_score_val)/(temp_score_train+temp_score_val)
184 | res_temp = test_set[['user_id']].copy()
185 | res_temp['prob'] = 0
186 | temp_predict = clf_nn.predict_proba(test_x)[:, 1]
187 | res_temp['prob'] = temp_predict
188 | res_temp = pd.merge(testb, res_temp, on='user_id', how='left').fillna(0)
189 | res_temp.to_csv('../input/' + file +str(i)+ '.txt', sep=',', index=False, header=False)
190 | # res_temp = get_normalized_rank(res_temp)
191 | # res['prob']+= res_temp['rank']
192 | # res['prob']+= res_temp['rank']/n_round
193 | # res['prob']+= res_temp['prob']/n_round
194 | res['prob']+= temp_predict/n_round
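        # Accumulate an equal-weight average of the n_round bagged models'
        # test probabilities (each round contributes temp_predict/n_round).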
195 | # res_temp = res_temp[["user_id","rank"]]
196 | # final_rank = final_rank+rankdata(temp_predict, method='ordinal')
197 | del val, val_train,train,train_y,train_x,val_y,val_x,res_temp,temp_predict,clf_nn,history
198 | gc.collect()
199 | # res["prob"] = (final_rank -min(final_rank))/(max(final_rank)-min(final_rank))
200 | # res["prob"] = (res["prob"] -min(res["prob"]))/(max(res["prob"])-min(res["prob"]))
201 | res=pd.merge(testb,res,on='user_id',how='left').fillna(0)
202 | res.to_csv('../work/' + file + '.txt', sep=',', index=False,header=False)
203 | del testb,train_set, val_set,test_set
204 | gc.collect()
205 | return res
--------------------------------------------------------------------------------
/paper/Modeling and Predicting the Active video-viewing time in a large-scale e-learning system.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/Modeling and Predicting the Active video-viewing time in a large-scale e-learning system.pdf
--------------------------------------------------------------------------------
/paper/The Prediction of Booking Destination on airbnb dataset.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/The Prediction of Booking Destination on airbnb dataset.pdf
--------------------------------------------------------------------------------
/paper/Using Deep Learning to Predict Customer Churn in a mobile telecommunication newwork.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/Using Deep Learning to Predict Customer Churn in a mobile telecommunication newwork.pdf
--------------------------------------------------------------------------------
/paper/field-aware fatorization machine for CTR prediction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/field-aware fatorization machine for CTR prediction.pdf
--------------------------------------------------------------------------------
/paper/predicting airbnb user's desired travel destination.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/paper/predicting airbnb user's desired travel destination.pdf
--------------------------------------------------------------------------------
/photos/16count.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/16count.JPG
--------------------------------------------------------------------------------
/photos/23count.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/23count.JPG
--------------------------------------------------------------------------------
/photos/23count3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/23count3.JPG
--------------------------------------------------------------------------------
/photos/24count.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/24count.JPG
--------------------------------------------------------------------------------
/photos/24count3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/24count3.JPG
--------------------------------------------------------------------------------
/photos/count2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/count2.JPG
--------------------------------------------------------------------------------
/photos/describe.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/describe.JPG
--------------------------------------------------------------------------------
/photos/outlier1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/outlier1.JPG
--------------------------------------------------------------------------------
/photos/registerday_count.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/registerday_count.JPG
--------------------------------------------------------------------------------
/photos/sample.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/sample.JPG
--------------------------------------------------------------------------------
/photos/value_count.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellobilllee/ActiveUserPrediction/9f4e5f7b55b61b9cb1b1b063e01e9fa399225036/photos/value_count.JPG
--------------------------------------------------------------------------------
/quick_test.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | # train_set = pd.read_csv("data/training_m1-23.csv", header=0, index_col=None)
4 | # # del train_set1,train_set2
5 | # # gc.collect()
6 | # print(train_set.describe())
7 | # keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id"]))
8 | # print("begin to drop the duplicates")
9 | # train_set.drop_duplicates(subset=keep_feature, inplace=True)
10 | # print(train_set.describe())
11 | # train_label = train_set["label"]
12 | # train_set = train_set.drop(labels=["label", "user_id"], axis=1)
13 | #
14 | # ls = [0,1,2,3,4,5,1,2,5,1,2,4,9]
15 | # print(ls.count(10)/len(ls))
16 | import numpy as np
17 | print("get users from user activity log")
18 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "video_id": np.uint32,
19 | "author_id": np.uint32, "action_type": np.uint8}
20 | df_user_activity = pd.read_csv("data/user_activity_log.csv", header=0, index_col=None, dtype=dtype_user_activity)
21 | # df_user_activity = df_user_activity.merge(df_user_register_base, on=["user_id"], how="left").fillna(-1)
22 | df_user_activity_train = df_user_activity.loc[
23 |     (df_user_activity["user_activity_day"] >= 1) & (
24 |             df_user_activity["user_activity_day"] <= 9)].copy()  # copy so later column assignments do not hit SettingWithCopyWarning
25 | print(df_user_activity_train.describe())
26 | user_activity_author = df_user_activity_train["author_id"].unique().tolist()
27 | print(user_activity_author)
28 | df_user_activity_train["user_in_author"] = 0
29 | # df_user_activity_train["user_in_author"] = df_user_activity_train["user_id"].apply(lambda x: 1 if x in user_activity_author else 0)
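# The vectorized .isin()/.loc assignment below replaces the commented row-wise apply:
# it flags users whose user_id also appears as an author_id in the selected window.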
30 | print("begin to get user in author or not mark")
31 | df_user_activity_train.loc[df_user_activity_train["user_id"].isin(user_activity_author),"user_in_author"]=1
32 | print(df_user_activity_train.describe())
--------------------------------------------------------------------------------
/rfpy/rf_v1.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | # import gc
4 | import pandas as pd
5 | import joblib
6 | import lightgbm
7 | from lightgbm import LGBMClassifier, cv
8 | from scipy import stats
9 | from sklearn.ensemble import RandomForestClassifier
10 | from sklearn.model_selection import GridSearchCV
11 | from skopt import BayesSearchCV
12 | from skopt.callbacks import DeltaXStopper
13 | from data_process_v2 import processing
14 | from skopt.space import Categorical
15 |
16 | def predict(clf2, test_set):
17 | uid = pd.DataFrame()
18 | # test_set = processing(trainSpan=(1, 30), label=False)
19 | uid["user_id"] = test_set["user_id"]
20 | test_set = test_set.drop(labels=["user_id"], axis=1)
21 | print("begin to make predictions")
22 | res = clf2.predict(test_set.values)
23 | uid["y_hat"] = pd.Series(res)
24 | uid["label"] = uid.groupby(by=["user_id"])["y_hat"].transform(lambda x: stats.mode(x)[0][0])
25 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
26 | uid_file = "result/uid_" + str_time + ".csv"
27 | uid.to_csv(uid_file,header=True,index=False)
28 | active_users = (uid.loc[uid["label"] == 1]).user_id.unique().tolist()
29 | print(len(active_users))
30 | print(active_users)
31 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
32 | submission_file = "result/submission_" + str_time + ".csv"
33 | with open(submission_file, "a", newline="") as f:
34 | writer = csv.writer(f)
35 | for i in active_users:
36 | writer.writerow([i])
37 | # To use this module, one needs to deconstruct some of the features in data_process
38 | keep_feature = ["user_id",
39 | "register_day_rate", "register_type_rate",
40 | "register_type_device", "device_type_rate", "device_type_register",
41 | "user_app_launch_register_mean_time",
42 | "user_app_launch_rate", "user_app_launch_gap",
43 | "user_video_create_register_mean_time",
44 | "user_video_create_rate", "user_video_create_day", "user_video_create_gap",
45 | "user_activity_register_mean_time", "user_activity_rate",
46 | "user_activity_frequency",
47 | "user_activity_day_rate", "user_activity_gap",
48 | "user_page_num", "user_video_id_num",
49 | "user_author_id_num", "user_author_id_video_num",
50 | "user_action_type_num"
51 | ]
52 | def run():
53 | print("begin to load the trainset1")
54 | train_set1 = processing(trainSpan=(1,12),label=True)
55 | # print(train_set1.describe())
56 | print("begin to load the trainset2")
57 | train_set2 = processing(trainSpan=(13,23),label=True)
58 | # print(train_set2.describe())
59 | print("begin to merge the trainsets")
60 | train_set = pd.concat([train_set1,train_set2],axis=0)
61 | print(train_set.describe())
62 | # del train_set1,train_set2
63 | # gc.collect()
64 | print("begin to drop the duplicates")
65 | train_set.drop_duplicates(subset=keep_feature,inplace=True)
66 | print(train_set.describe())
67 | train_label =train_set["label"]
68 | train_set = train_set.drop(labels=["label","user_id"], axis=1)
69 |
70 | # train_x, val_x,train_y,val_y = train_test_split(train_set.values,train_label.values,test_size=0.33,random_state=42,shuffle=True)
71 | print("begin to make prediction with plain features and without tuning parameters")
72 | initial_params = {
73 | "n_jobs": -1,
74 | "n_estimators": 400,
75 | "criterion": "gini",
76 | "max_features": 'auto',
77 | "max_depth": 6,
78 | "min_samples_split": 2,
79 | "min_samples_leaf": 1,
80 | "min_weight_fraction_leaf": 0.0,
81 | "max_leaf_nodes": 64,
82 | "min_impurity_decrease": 0.0,
83 | }
84 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns))
85 |
86 | scoring = {'AUC': 'roc_auc', 'f1': "f1"}
87 | clf1 = GridSearchCV(RandomForestClassifier(**initial_params),
88 | param_grid={"n_estimators":[400,600],"max_leaf_nodes": [16,24,32,64]},
89 | scoring=scoring, cv=3, refit='f1',n_jobs=-1,verbose=0)
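    # With multi-metric scoring and refit='f1', best_score_/best_params_ refer to the
    # f1 scorer, and best_estimator_ is refit on the full training data with it.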
90 | clf1.fit(train_set.values, train_label.values)
91 | # cv_results = cv(initial_params,train_data,num_boost_round=800,nfold=4,early_stopping_rounds=30,verbose_eval=True)
92 | # bst = lgb.cv(initial_params, train_data, num_boost_round=1000, nfold=3, early_stopping_rounds=30)
93 | print(clf1.best_score_)
94 | print(clf1.best_params_)
95 | # clf1 = LGBMClassifier(**initial_params)
96 | # clf1.fit(X=train_x,y=train_y,eval_set=(val_x,val_y),early_stopping_rounds=20,eval_metric="auc")
97 | print("load the test dataset")
98 | test_set = processing(trainSpan=(20, 30), label=False)
99 | print("begin to make prediction")
100 | predict(clf1,test_set)
101 |
102 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
103 | print("begin to get important features")
104 | feature_names = train_set.columns
105 | feature_importances = clf1.best_estimator_.feature_importances_
106 | print(feature_importances)
107 | print(feature_names)
108 |
109 | with open("kuaishou_stats.csv", 'a', newline='') as f:
110 | writer = csv.writer(f)
111 |         writer.writerow(["feature importance of random forest for kuaishou-crt", str_time])
112 | # writer.writerow(eval_metrics)
113 | feature_score_name = sorted(zip(feature_importances, feature_names), reverse=True)
114 | for score, name in feature_score_name:
115 | print('{}: {}'.format(name, score))
116 | writer.writerow([name, score])
117 | sorted_feature_name = [name for score, name in feature_score_name]
118 | print(sorted_feature_name)
119 |
120 | print("begin to tune the parameters with the selected feature")
121 | paramsSpace = {
122 | "n_estimators": (200, 800),
123 | "criterion": Categorical(["gini", "entropy"]),
124 | "max_features": (0.6, 1.0, 'uniform'),
125 | "max_depth": (3, 8),
126 | "min_samples_split": (2, 128),
127 | "min_samples_leaf": (1, 128),
128 | "min_weight_fraction_leaf": (0.0, 0.5, 'uniform'),
129 | "max_leaf_nodes": (16, 128),
130 | "min_impurity_decrease": (1e-6, 1e-1, 'log-uniform'),
131 | }
132 |
133 | def tune_parameter(X, y, clf, params):
134 | # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
135 | gs = BayesSearchCV(
136 | estimator=clf, search_spaces=params,
137 | scoring="f1", n_iter=60,optimizer_kwargs={"base_estimator":"RF"},
138 | verbose=0, n_jobs=-1, cv=3, refit=True, random_state=1234
139 | )
140 | gs.fit(X, y,callback=DeltaXStopper(0.000001))
141 | best_params = gs.best_params_
142 | best_score = gs.best_score_
143 | print(best_params)
144 | print(best_score)
145 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
146 | with open("kuaishou_stats.csv", 'a', newline='') as f:
147 | writer = csv.writer(f)
148 |             writer.writerow(["the best params for random forest: "])
149 | for key, value in best_params.items():
150 | writer.writerow([key, value])
151 |             writer.writerow(["the best score for random forest: ", best_score, str_time])
152 | return gs
153 |
154 | model = RandomForestClassifier(**initial_params)
155 | clf2 = tune_parameter(train_set.values,train_label.values,model,paramsSpace)
156 | print("parameter tuning over, begin to save the model!")
157 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
158 |
159 |     model_name = "rf_" + str_time + ".pkl"
160 | joblib.dump(clf2, model_name)
161 |
162 | print("begin to process the whole dataset and ready to feed into the fitted model")
163 | predict(clf2,test_set)
164 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
165 | print("begin to get important features")
166 | feature_names = train_set.columns
167 | feature_importances = clf2.best_estimator_.feature_importances_
168 | print(feature_importances)
169 | print(feature_names)
170 |
171 | with open("kuaishou_stats.csv", 'a', newline='') as f:
172 | writer = csv.writer(f)
173 |         writer.writerow(["feature importance of random forest for kuaishou-crt", str_time])
174 | # writer.writerow(eval_metrics)
175 | feature_score_name = sorted(zip(feature_importances, feature_names), reverse=True)
176 | for score, name in feature_score_name:
177 | print('{}: {}'.format(name, score))
178 | writer.writerow([name, score])
179 | sorted_feature_name = [name for score, name in feature_score_name]
180 | print(sorted_feature_name)
181 | if __name__=="__main__":
182 | run()
--------------------------------------------------------------------------------
/rulepy/hardcode_approach.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 | import numpy as np
5 | user_register_log = ["user_id","register_day","register_type","device_type"]
6 | app_launch_log = ["user_id","app_launch_day"]
7 | video_create_log = ["user_id","video_create_day"]
8 | user_activity_log = ["user_id","user_activity_day","page","video_id","author_id","action_type"]
9 |
10 |
11 | def get_user_from_videoCreate(laterThanDay,videoCount):
12 | print("get users from video create")
13 | video_create_log = ["user_id", "video_create_day"]
14 | dtype_video_create = {"user_id": np.uint32, "video_create_day": np.uint8}
15 | df_video_create = pd.read_table("data/video_create_log.txt",header=None,names=video_create_log,index_col=None,dtype=dtype_video_create)
16 | latest_user = (df_video_create.loc[df_video_create["video_create_day"]>laterThanDay]).user_id.unique().tolist()
17 | print("get latest users")
18 | print(latest_user)
19 | print(len(latest_user))
20 | df_video_create["videoCount"] = df_video_create.groupby(by=["user_id"])["video_create_day"].transform(lambda x: x.nunique())
21 | frequent_user = (df_video_create.loc[df_video_create["videoCount"]>videoCount]).user_id.unique().tolist()
22 | print("get frequent users")
23 | print(frequent_user)
24 | print(len(frequent_user))
25 | user_videoCreate = list(set(latest_user+frequent_user))
26 | print(user_videoCreate)
27 | print(len(user_videoCreate))
28 | return user_videoCreate
29 | # with open("result/submission.csv","a",newline="") as f:
30 | # writer = csv.writer(f)
31 | # for i in user_videoCreate:
32 | # writer.writerow([i])
33 | # get_user_from_videoCreate(23,2)
34 | def get_user_from_appLaunch(laterThanDay,launchCount):
35 | print("get users from app launch log")
36 | app_launch_log = ["user_id","app_launch_day"]
37 | dtype_app_launch = {"user_id":np.uint32,"app_launch_day":np.uint8}
38 | df_app_launch = pd.read_table("data/app_launch_log.txt",header=None,names=app_launch_log,index_col=None,dtype=dtype_app_launch)
39 | latest_user = (df_app_launch.loc[df_app_launch["app_launch_day"]>laterThanDay]).user_id.unique().tolist()
40 | print("get latest users")
41 | print(latest_user)
42 | print(len(latest_user))
43 | df_app_launch["launchCount"] = df_app_launch.groupby(by=["user_id"])["app_launch_day"].transform(lambda x: x.nunique())
44 | frequent_user = (df_app_launch.loc[df_app_launch["launchCount"]>launchCount]).user_id.unique().tolist()
45 | print("get frequent users")
46 | print(frequent_user)
47 | print(len(frequent_user))
48 | user_appLaunch = list(set(latest_user+frequent_user))
49 | print("get merged users")
50 | print(user_appLaunch)
51 | print(len(user_appLaunch))
52 | return user_appLaunch
53 | # with open("result/submission.csv","a",newline="") as f:
54 | # writer = csv.writer(f)
55 | # for i in user_appLaunch:
56 | # writer.writerow([i])
57 | # get_user_from_appLaunch(27,4)
58 | def get_user_from_userRegister(laterThanDay):
59 | print("get users from user register log")
60 | user_register_log = ["user_id", "register_day", "register_type", "device_type"]
61 | dtype_user_register = {"user_id": np.uint32, "register_day": np.uint8, "register_type": np.uint8, "device_type": str}
62 | df_user_register = pd.read_table("data/user_register_log.txt",header=None,names=user_register_log,index_col=None,dtype=dtype_user_register)
63 | latest_user = (df_user_register.loc[df_user_register["register_day"]>laterThanDay]).user_id.unique().tolist()
64 | print("get latest users")
65 | print(latest_user)
66 | print(len(latest_user))
67 | return latest_user
68 | # get_user_from_userRegister(25)
69 | def get_user_from_userActivity(laterThanDay,dayCount,pageList,typeList):
70 | print("get users from user activity log")
71 | user_activity_log = ["user_id", "user_activity_day", "page", "video_id", "author_id", "action_type"]
72 | usecols = ["user_id", "user_activity_day", "page","action_type"]
73 | dtype_user_activity = {"user_id": np.uint32, "user_activity_day": np.uint8, "page": np.uint8, "action_type": np.uint8}
74 | df_user_activity = pd.read_table("data/user_activity_log.txt",header=None,names=user_activity_log,usecols=usecols,index_col=None,dtype=dtype_user_activity)
75 | latest_user = (df_user_activity.loc[df_user_activity["user_activity_day"]>laterThanDay]).user_id.unique().tolist()
76 | print("get latest users")
77 | print(latest_user)
78 | print(len(latest_user))
79 |
80 | df_user_activity["dayCount"] = df_user_activity.groupby(by=["user_id"])["user_activity_day"].transform(lambda x: x.nunique())
81 | frequent_user = (df_user_activity.loc[df_user_activity["dayCount"]>dayCount]).user_id.unique().tolist()
82 | print("get frequent users")
83 | print(frequent_user)
84 | print(len(frequent_user))
85 |
86 | print("get users in certain pages and certain action type")
87 | user_inList = (df_user_activity.loc[((df_user_activity["page"].isin(pageList))|(df_user_activity["action_type"].isin(typeList)))&(df_user_activity["user_activity_day"]>laterThanDay-3)]).user_id.unique().tolist()
88 |
89 | print(user_inList)
90 | print(len(user_inList))
91 | user_userActivity = list(set(latest_user+frequent_user+user_inList))
92 |
93 | print("get merged users")
94 | print(user_userActivity)
95 | print(len(user_userActivity))
96 | return user_userActivity
97 | # get_user_from_userActivity(27, 3, [1,2,3], [1,3,4,5])
98 |
99 | def get_user():
100 |
101 | user_videoCreate = get_user_from_videoCreate(23, 31)
102 | user_appLaunch = get_user_from_appLaunch(23,31)
103 | user_userRegister = get_user_from_userRegister(23)
104 | user_userActivity = get_user_from_userActivity(23, 31, [], [])
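    # Note: the logs appear to span days 1-30, so a count threshold of 31 effectively
    # disables the "frequent user" branches, and the empty page/action-type lists make
    # user_inList empty; only the "active after day 23" criterion remains here.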
105 |
106 | users = list(set(user_videoCreate+user_appLaunch+user_userRegister+user_userActivity))
107 | print("get the final merged users")
108 | print(users)
109 | print(len(users))
110 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
111 | submission_file = "result/submission_" + str_time + ".csv"
112 | # with open(submission_file,"a",newline="") as f:
113 | # writer = csv.writer(f)
114 | # for i in users:
115 | # writer.writerow([i])
116 | get_user()
--------------------------------------------------------------------------------
/svmpy/svm_v1.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import datetime
3 | import pandas as pd
4 | import joblib
5 | from sklearn.model_selection import GridSearchCV
6 | from sklearn.svm import SVC
7 | from skopt import BayesSearchCV
8 | from skopt.callbacks import DeltaXStopper
9 | from data_process_v4 import processing
10 | from skopt.space import Categorical, Real, Integer
11 |
12 |
13 | def predict(clf2, test_set):
14 | uid = pd.DataFrame()
15 | # test_set = processing(trainSpan=(1, 30), label=False)
16 | uid["user_id"] = test_set["user_id"]
17 | test_set = test_set.drop(labels=["user_id"], axis=1)
18 | print("begin to make predictions")
19 | res = clf2.predict_proba(test_set.values)
20 | uid["proba1"] = pd.Series(res[:, 1])
21 | uid["score"] = uid.groupby(by=["user_id"])["proba1"].transform(lambda x: sum(x) / float(len(x)))
22 | uid.drop_duplicates(subset=["user_id"],inplace=True)
23 | uid.sort_values(by=["score"],axis=0,ascending=False,inplace=True)
24 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
25 | uid_file = "result/uid_" + str_time + ".csv"
26 | uid.to_csv(uid_file,header=True,index=False)
27 | active_users = uid["user_id"][:24000].unique().tolist()
28 | print(len(active_users))
29 | print(active_users)
30 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
31 | submission_file = "result/submission_svc_" + str_time + ".csv"
32 | with open(submission_file, "a", newline="") as f:
33 | writer = csv.writer(f)
34 | for i in active_users:
35 | writer.writerow([i])
36 | # To use this module, one needs to deconstruct some of the features in data_process
37 | def run():
38 | # print("begin to load the trainset1")
39 | # train_set1 = processing(trainSpan=(1,19),label=True)
40 | # train_set1.to_csv("data/training_ld1-19.csv", header=True, index=False)
41 | # train_set1 = pd.read_csv("data/training_ld1-16.csv", header=0, index_col=None)
42 | # print(train_set1.describe())
43 | # print("begin to load the trainset2")
44 | # train_set2 = processing(trainSpan=(5,23),label=True)
45 | # train_set2.to_csv("data/training_ld5-23.csv", header=True, index=False)
46 | # train_set2 = pd.read_csv("data/training_ld8-23.csv", header=0, index_col=None)
47 | # print(train_set1.describe())
48 | # print("begin to load the trainset3")
49 | # train_set3 = processing(trainSpan=(1,23),label=True)
50 | # train_set3.to_csv("data/training_ld1-23.csv", header=True, index=False)
51 | # train_set3 = pd.read_csv("data/training_ld1-23.csv", header=0, index_col=None)
52 | # print(train_set1.describe())
53 | print("begin to merge the trainsets")
54 | # train_set = pd.concat([train_set1,train_set2,train_set3],axis=0)
55 | # train_set = pd.concat([train_set1,train_set2],axis=0)
56 | # train_set.to_csv("data/training_lm5-23.csv", header=True, index=False)
57 | train_set = pd.read_csv("data/training_lm15-23.csv", header=0, index_col=None)
58 | # del train_set1,train_set2
59 | # gc.collect()
60 | print(train_set.describe())
61 | keep_feature = list(set(train_set.columns.values.tolist())-set(["user_id","label"]))
62 | print("begin to drop the duplicates")
63 | train_set.drop_duplicates(subset=keep_feature,inplace=True)
64 | print(train_set.describe())
65 | train_label =train_set["label"]
66 | train_set = train_set.drop(labels=["label","user_id"], axis=1)
67 |
68 | # train_x, val_x,train_y,val_y = train_test_split(train_set.values,train_label.values,test_size=0.33,random_state=42,shuffle=True)
69 | print("begin to make prediction with plain features and without tuning parameters")
70 | initial_params = {
71 | "C": 1.0,
72 | "kernel": "rbf",
73 | "degree": 3,
74 | "gamma":"auto",
75 | "coef0": 0.0,
76 | "tol": 0.0001,
77 | "cache_size": 4000,
78 | "verbose": True,
79 | "max_iter": -1,
80 | "probability": True,
81 | }
82 | # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns))
83 |
84 | scoring = {'AUC': 'roc_auc', 'f1': "f1"}
85 | clf1 = GridSearchCV(SVC(**initial_params),
86 | param_grid={
87 | "C":[0.01,0.1,1.0,10,100],
88 | "kernel": ["rbf"],
89 | "gamma":[0.0001,0.001,0.01,0.1]},
90 | scoring=scoring, cv=4, refit='f1',n_jobs=-1,verbose=2)
91 | clf1.fit(train_set.values, train_label.values)
92 | # cv_results = cv(initial_params,train_data,num_boost_round=800,nfold=4,early_stopping_rounds=30,verbose_eval=True)
93 | # bst = lgb.cv(initial_params, train_data, num_boost_round=1000, nfold=3, early_stopping_rounds=30)
94 | bs = clf1.best_score_
95 | print(bs)
96 | bp = clf1.best_params_
97 | print(bp)
98 | # clf1 = LGBMClassifier(**initial_params)
99 | # clf1.fit(X=train_x,y=train_y,eval_set=(val_x,val_y),early_stopping_rounds=20,eval_metric="auc")
100 | print("load the test dataset")
101 | # test_set = processing(trainSpan=(15, 30), label=False)
102 | # test_set.to_csv("data/testing_ld15-30.csv",header=True,index=False)
103 | test_set = pd.read_csv("data/testing_ld15-30.csv",header=0,index_col=None)
104 | print("begin to make prediction")
105 | predict(clf1,test_set)
106 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
107 | with open("kuaishou_stats.csv", 'a', newline='') as f:
108 | writer = csv.writer(f)
109 | writer.writerow(["feature importance of svm for kuaishou-crt ", str_time])
110 | writer.writerow(["best score",bs,"best params"])
111 | for key, value in bp.items():
112 | writer.writerow([key, value])
113 |
114 | model_name = "svm_" + str_time + ".pkl"
115 | joblib.dump(clf1, model_name)
116 | print("begin to tune the parameters with the selected feature")
117 | paramsSpace = {
118 | "C": Real(1e-6, 1e+6, prior='log-uniform'),
119 | "gamma": Real(1e-6, 1e+1, prior='log-uniform'),
120 | "degree": Integer(1,3),
121 | "kernel": Categorical(['poly', 'rbf']),
122 | }
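    # C and gamma are searched on log-uniform priors, the usual choice for SVM
    # hyperparameters whose useful values span several orders of magnitude;
    # BayesSearchCV below models this space with a Gaussian-process surrogate.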
123 | def tune_parameter(X, y, clf, params):
124 | # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
125 | gs = BayesSearchCV(
126 | estimator=clf, search_spaces=params,
127 | scoring="f1", n_iter=100,optimizer_kwargs={"base_estimator":"GP"},
128 | verbose=2, n_jobs=-1, cv=4, refit=True, random_state=1234
129 | )
130 | gs.fit(X, y,callback=DeltaXStopper(0.000001))
131 | best_params = gs.best_params_
132 | best_score = gs.best_score_
133 | print(best_params)
134 | print(best_score)
135 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
136 | with open("kuaishou_stats.csv", 'a', newline='') as f:
137 | writer = csv.writer(f)
138 | writer.writerow(["the best params for svm: "])
139 | for key, value in best_params.items():
140 | writer.writerow([key, value])
141 | writer.writerow(["the best score for svm: ", best_score,str_time])
142 | return gs
143 |
144 |     model = SVC(probability=True, **bp)  # predict() below relies on predict_proba, so probability must stay enabled
145 | clf2 = tune_parameter(train_set.values,train_label.values,model,paramsSpace)
146 | print("parameter tuning over, begin to save the model!")
147 | str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
148 |
149 | model_name = "svm_" + str_time + ".pkl"
150 | joblib.dump(clf2, model_name)
151 |
152 | print("begin to process the whole dataset and ready to feed into the fitted model")
153 | predict(clf2,test_set)
154 |
155 | if __name__=="__main__":
156 | run()
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | data = ['First Item', 'Second Item', 'Third Item']
4 | with open('output.csv', 'w', newline='') as csvfile:
5 | writer = csv.writer(csvfile)
6 | for i in data:
7 | writer.writerow([i])
--------------------------------------------------------------------------------
/utilspy/calculate.py:
--------------------------------------------------------------------------------
1 | def calculate():
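    # Back-of-the-envelope F1 bookkeeping: if the reference submission of M_hat users
    # is assumed to cover (almost) all true active users, recall ~= 1 and F1 = 2P/(P+1),
    # so precision_hat = F1_hat/(2-F1_hat) and the number of true active users is
    # roughly N_hat = M_hat * precision_hat. For a later submission of M users scoring
    # f1, F1 = 2*TP/(M + N_hat) gives TP = f1*(M + N_hat)/2, from which
    # precision = TP/M and recall = TP/N_hat.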
2 | M_hat = 51480
3 | F1_hat = 0.63088748
4 | precision_hat = F1_hat/(2-F1_hat)
5 | N_hat = M_hat*precision_hat
6 | print(N_hat)
7 |
8 | f1 = 0.8014
9 | M = 30000
10 | TP = (M+N_hat)/2*f1
11 |
12 | precision = TP/M
13 | recall = TP/N_hat
14 | print("True positive number {} ".format(TP))
15 | print("precision {}".format(precision))
16 | print("recall {}".format(recall))
17 |
18 |
19 |
20 | p = 20200/25600
21 | r = 20200/23722
22 | print("pre {}".format(p))
23 | print("rec {}".format(r))
24 |
25 |
26 |
27 | # p = 0.795
28 | # r = 0.845
29 | # print("possible submit number {}".format(N_hat*r/p))
30 | f1 = 2*p*r/(p+r)
31 | print("f1 score {}".format(f1))
32 |
33 | print(336/800)
34 |
35 | if __name__ == "__main__":
36 | calculate()
37 |
--------------------------------------------------------------------------------
/utilspy/create_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from dataprocesspy.create_feature_v3_nonp import processing
3 |
4 | if __name__=="__main__":
5 | # print("begin to load the testset")
6 | # train_set52 = processing(trainSpan=(1, 30), label=False)
7 | # train_set52.to_csv("data/testing_eld1-30_r.csv", header=True, index=False)
8 | # train_set52 = pd.read_csv("data/training_eld1-23.csv", header=0, index_col=None, usecols=use_feature)
9 | # print(train_set52.describe())
10 | # print("begin to load the trainset52")
11 | # train_set52 = processing(trainSpan=(1, 23), label=True)
12 | # train_set52.to_csv("data/training_rld1-23_r.csv", header=True, index=False)
13 | # # train_set52 = pd.read_csv("data/training_eld1-23.csv", header=0, index_col=None, usecols=use_feature)
14 | # print(train_set52.describe())
15 | # print("begin to load the trainset51")
16 | # train_set51 = processing(trainSpan=(1, 22), label=True)
17 | # train_set51.to_csv("data/training_rld1-22.csv", header=True, index=False)
18 | # # train_set5 = pd.read_csv("data/training_eld1-22.csv", header=0, index_col=None, usecols=use_feature)
19 | # print(train_set51.describe())
20 | # print("begin to load the trainset5")
21 | # train_set5 = processing(trainSpan=(1, 21), label=True)
22 | # train_set5.to_csv("data/training_rld1-21.csv", header=True, index=False)
23 | # # train_set5 = pd.read_csv("data/training_eld1-21.csv", header=0, index_col=None, usecols=use_feature)
24 | # print(train_set5.describe())
25 | print("begin to load the trainset41")
26 | train_set41 = processing(trainSpan=(1, 20), label=True)
27 | train_set41.to_csv("../data/data_v4/training_rld1-20.csv", header=True, index=False)
28 | # train_set41 = pd.read_csv("data/training_eld1-20.csv", header=0, index_col=None, usecols=use_feature)
29 | print(train_set41.describe())
30 | print("begin to load the trainset4")
31 | train_set4 = processing(trainSpan=(1, 19), label=True)
32 | train_set4.to_csv("../data/data_v4/training_rld1-19.csv", header=True, index=False)
33 | # train_set4 = pd.read_csv("data/training_eld1-19.csv", header=0, index_col=None, usecols=use_feature)
34 | print(train_set4.describe())
35 | print("begin to load the trainset2")
36 | train_set2 = processing(trainSpan=(1, 15), label=True)
37 | train_set2.to_csv("../data/data_v4/training_rld1-15.csv", header=True, index=False)
38 | # train_set2 = pd.read_csv("data/training_eld1-15.csv", header=0, index_col=None, usecols=use_feature)
39 | print(train_set2.describe())
40 | print("begin to load the trainset21")
41 | train_set21 = processing(trainSpan=(1, 16), label=True)
42 | train_set21.to_csv("../data/data_v4/training_rld1-16.csv", header=True, index=False)
43 | # train_set21 = pd.read_csv("data/training_eld1-16.csv", header=0, index_col=None, usecols=use_feature)
44 | print(train_set21.describe())
45 | print("begin to load the trainset3")
46 | train_set3 = processing(trainSpan=(1, 17), label=True)
47 | train_set3.to_csv("../data/data_v4/training_rld1-17.csv", header=True, index=False)
48 | # train_set3 = pd.read_csv("data/training_eld1-17.csv", header=0, index_col=None, usecols=use_feature)
49 | print(train_set3.describe())
50 | print("begin to load the trainset31")
51 | train_set31 = processing(trainSpan=(1, 18), label=True)
52 | train_set31.to_csv("../data/data_v4/training_rld1-18.csv", header=True, index=False)
53 | # train_set3 = pd.read_csv("data/training_eld1-18.csv", header=0, index_col=None, usecols=use_feature)
54 | print(train_set31.describe())
55 |
56 |
57 |
--------------------------------------------------------------------------------
/utilspy/kpca.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import math
3 | from sklearn.datasets import make_circles
4 | from sklearn.model_selection import train_test_split
5 | import matplotlib.pyplot as plt
6 | from multiprocessing import Pool
7 | from sklearn.metrics.pairwise import rbf_kernel
8 |
9 | """
10 | Implementation of methods from the paper
11 | Kernel PCA and De-noising in feature spaces.
12 | Each function has a comment above it which contains
13 | "(e)" where e denotes the corresponding equation from
14 | the paper.
15 | """
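# Rough pipeline, as implemented below: build the RBF kernel matrix K over the
# training data, center it, take its leading eigenvectors as the kernel-PCA basis,
# then recover a de-noised pre-image z for each test point via the fixed-point
# iteration of equation (10). The commented-out __main__ block at the bottom
# sketches a toy usage on a half-circle dataset.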
16 |
17 |
18 | def gaussianKernel(x, y, c):
19 | ''' Returns K(x,y) where K denotes gaussian kernel '''
20 | return math.exp(-(np.sqrt(np.dot(x - y, (x - y).conj())) ** 2) / c)
21 |
22 |
23 | # return math.exp(-(np.linalg.norm(x-y)**2) / c)
24 |
25 | def createK(data, c):
26 | ''' Returns K matrix containing inner products of the data using the kernel function
27 | so that K_ij := (phi(x_i)*phi(x_j)) '''
28 | return rbf_kernel(data, gamma=1 / c)
29 |
30 |
31 | def createKOld(data, kernelFunction, c):
32 | ''' Returns K matrix containing inner products of the data using the kernel function
33 | so that K_ij := (phi(x_i)*phi(x_j)) '''
34 | return rbf_kernel(data, gamma=1 / c)
35 |
36 |
37 | # l = len(data)
38 | # K = np.zeros((l,l))
39 | # for col in range(l):
40 | # for row in range(l):
41 | # K[row][col] = kernelFunction(data[row],data[col], c)
42 | # return K
43 |
44 | def calcBetaKOld(alphaK, data, x, c):
45 | ''' Returns the projection of x onto the eigenvector V_k '''
46 | BetaK = 0
47 | # print 'data.shape',data.shape
48 | # print 'x.shape', x.shape
49 | kernelVals = rbf_kernel(data, x.reshape(1, -1), 1 / c)
50 | for i, xi in enumerate(data):
51 | # BetaK += alphaK[i]*kernelFunction(xi,x,c)
52 | BetaK += alphaK[i] * kernelVals[i][0]
53 | return BetaK
54 |
55 |
56 | def calcBetaK(alphaK, kernelVals):
57 | ''' Returns the projection of x onto the eigenvector V_k '''
58 | BetaK = 0
59 | BetaK = np.sum(alphaK * kernelVals)
60 | return BetaK
61 |
62 |
63 | def centerK(K):
64 | ''' Returns centered K matrix, see K. Murphy 14.43 '''
65 | l = len(K)
66 | l_ones = np.ones((l, l), dtype=int) / l
67 | Kcentered = K - np.dot(l_ones, K) - np.dot(K, l_ones) + np.dot(l_ones, np.dot(K, l_ones))
68 | return Kcentered
69 |
70 |
71 | def normAlpha(alpha, lambdas):
72 | ''' Returns new alpha corresponding to normalized eigen vectors,
73 | so that lambda_k(a^k * a^k) = 1 '''
74 | for i, a in enumerate(alpha):
75 | a /= np.sqrt(lambdas[i])
76 | return alpha
77 |
78 |
79 | # def calcZold(alpha, data, x, kernelFunction, c,z0):
80 | # ''' Equation (10), returns pre-image z for single input datapoint x '''
81 | # z = z0
82 | # iters=0
83 | # while iters <5:
84 | # numerator = 0
85 | # denom = 0
86 | # for i, xi in enumerate(data):
87 | # gammaI = calcGammaI(alpha, i, data, x, kernelFunction, c) * kernelFunction(z,xi,c)
88 | # numerator += gammaI * xi
89 | # denom += gammaI
90 | # z = numerator/denom
91 | # iters +=1
92 | # return z
93 |
94 | def calcZWrapper(args):
95 | return calcZ(*args)
96 |
97 |
98 | def calcZ(alpha, data, x, K, c, z0, idx):
99 | ''' Equation (10), returns pre-image z for single input datapoint x '''
100 | z = z0
101 | iters = 0
102 | maxIters = 10
103 | # calculate beta, gamma (do not change with each iteration)
104 | beta = [calcBetaKOld(aK, data, x, c) for aK in alpha]
105 | gamma = [calcGammaIOpt(alpha, i, beta) for i in range(len(data))]
106 |
107 | while iters < maxIters: # iterate until convergence
108 | numerator = 0
109 | denom = 0
110 | k = rbf_kernel(data, z.reshape(1, -1), 1 / c)
111 | for i, xi in enumerate(data):
112 | gammaI = gamma[i] * k[i][0]
113 | numerator += gammaI * xi
114 | denom += gammaI
115 | if denom > 10 ** -12: # handling numerical instability
116 | newZ = numerator / denom
117 | """
118 | if np.linalg.norm(z - newZ) < 10**-8: # convergence definition
119 | z = newZ
120 | break
121 | """
122 | z = newZ
123 | iters += 1
124 | else:
125 | # print "restarted point"
126 | iters = 0
127 | z = z0 + np.random.multivariate_normal(np.zeros(z0.size), np.identity(z0.size))
128 | numerator = 0
129 | denom = 0
130 |
131 | # print "iters:", iters
132 | return z
133 |
134 |
135 | # def calcGammaI(alpha, i, data, x, kernelFunction, c):
136 | # ''' returns gamma_i = sum_{k=1}^n Beta_k * alpha_i^k '''
137 | # gammaI = 0
138 | # alphaI = alpha.T[i]
139 | # for k, alphaKI in enumerate(alphaI):
140 | # gammaI += calcBetaK(alpha[k], kernelFunction, data, x, c) * alphaKI
141 | # return gammaI
142 |
143 | def calcGammaIOpt(alpha, i, beta):
144 | ''' returns gamma_i = sum_{k=1}^n beta_k * alpha_i^k '''
145 | gammaI = 0
146 | alphaI = alpha.T[i]
147 | for k, alphaKI in enumerate(alphaI):
148 | gammaI += beta[k] * alphaKI
149 | return gammaI
150 |
151 |
152 | def kernelPCADeNoise(kernelFunction, c, components, dataTrain, dataTest):
153 | Data = dataTrain
154 |
155 | l = len(Data)
156 |
157 | # build K
158 | # K = createK(Data, kernelFunction, c)
159 | K = createK(Data, c)
160 |
161 | # center K
162 | K = centerK(K)
163 |
164 | # find eigen vectors
165 |     lLambda, alpha = np.linalg.eigh(K)  # (3); eigh returns eigenvectors as columns
166 |     alpha, lambdas = alpha.T, lLambda / l  # rows of alpha are the eigenvectors a^k; /l with the notation from the paper (but not murphys)
167 | # drop negative and 0 eigenvalues and their vectors
168 | for i, l in enumerate(lambdas):
169 | if l > 10 ** (-8):
170 | lambdas = lambdas[i:]
171 | alpha = alpha[i:]
172 | break
173 |
174 | # use only the components largest eigenvalues with corresponding vectors
175 | lambdas = lambdas[-components:]
176 | alpha = alpha[-components:]
177 |
178 | # normalize alpha
179 | alpha = normAlpha(alpha, lambdas)
180 |
181 | # p=Pool()
182 | # Z = p.map(calcZWrapper, [(alpha, Data, x, K, c, x, i) for i, x in enumerate(dataTest)])
183 |
184 | Z = []
185 | for i in range(len(dataTest)):
186 | # print i
187 | Z.append(calcZ(alpha, Data, dataTest[i], K, c, dataTest[i], i))
188 |
189 | Z = np.array(Z)
190 | return Z
191 |
192 |
193 | # if __name__ == '__main__':
194 | # # hyperparameters
195 | # c = 0.5
196 | #
197 | # # For half-circle toy example
198 | # X, y = make_circles(n_samples=600, factor=.3, noise=.05)
199 | # X = np.array([x for i, x in enumerate(X) if x[1] > 0 and not y[i]])
200 | # Xtrain, Xtest = train_test_split(X, test_size=0.9)
201 | #
202 | # Z = kernelPCADeNoise(gaussianKernel, c, 1, Xtrain, Xtest)
203 | #
204 | # plt.plot(Xtrain.T[0], Xtrain.T[1], 'ro')
205 | # plt.plot(Z.T[0], Z.T[1], 'go')
206 | # plt.show()
--------------------------------------------------------------------------------
/utilspy/kuaishou_stats2.csv:
--------------------------------------------------------------------------------
1 | ,user_id,register_day_type_rate,register_day_type_ratio,register_day_device_ratio,register_type_ratio,register_type_device,register_type_device_ratio,register_day_device_rate,device_type_ratio,device_type_register_ratio,register_day_register_type_device_ratio,register_day_device_type_register_ratio
2 | count,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0
3 | mean,686252.6029829077,547.1040718009635,0.3495839834213257,0.010990312322974205,0.344575434923172,1255.2387316043028,0.012078322470188141,16.82571108031413,0.010135618969798088,0.4458794593811035,0.018300948664546013,0.5927771329879761
4 | std,396370.4801494953,239.88014453171334,0.14129890501499176,0.013702892698347569,0.13428840041160583,321.2693523491565,0.017135903239250183,20.896847090204183,0.012916702777147293,0.21795567870140076,0.03660264611244202,0.290801078081131
5 | min,16.0,1.0,0.0004636068479157984,0.0004636068479157984,0.00013198706437833607,3.0,7.117944187484682e-05,1.0,3.299676609458402e-05,0.000681198900565505,0.0010857763700187206,0.00917431153357029
6 | 25%,343746.5,355.0,0.26530611515045166,0.001303780940361321,0.32013463973999023,1315.0,0.0011337868636474013,2.0,0.0008249191450886428,0.29323309659957886,0.002816901309415698,0.3636363744735718
7 | 50%,685296.0,598.0,0.3815484344959259,0.004965859930962324,0.32013463973999023,1315.0,0.004911381751298904,7.0,0.0038936184719204903,0.46341463923454285,0.007547169923782349,0.5384615659713745
8 | 75%,1031319.75,754.0,0.4686369001865387,0.01543460600078106,0.46357157826423645,1516.0,0.015517118386924267,25.0,0.01484854519367218,0.5435967445373535,0.021314388141036034,1.0
9 | max,1367532.0,921.0,0.5199321508407593,0.05910735949873924,0.46357157826423645,1516.0,0.5,109.0,0.048439253121614456,1.0,1.0,1.0
10 | ,user_id,register_day_type_rate,register_day_type_ratio,register_day_device_ratio,register_type_ratio,register_type_device,register_type_device_ratio,register_day_device_rate,device_type_ratio,device_type_register_ratio,register_day_register_type_device_ratio,register_day_device_type_register_ratio
11 | count,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0,30306.0
12 | mean,686252.6029829077,547.1040718009635,0.3495839834213257,0.010990312322974205,0.344575434923172,1255.2387316043028,0.012078322470188141,16.82571108031413,0.010135618969798088,0.4458794593811035,0.018300948664546013,0.5927771329879761
13 | std,396370.4801494953,239.88014453171334,0.14129890501499176,0.013702892698347569,0.13428840041160583,321.2693523491565,0.017135903239250183,20.896847090204183,0.012916702777147293,0.21795567870140076,0.03660264611244202,0.290801078081131
14 | min,16.0,1.0,0.0004636068479157984,0.0004636068479157984,0.00013198706437833607,3.0,7.117944187484682e-05,1.0,3.299676609458402e-05,0.000681198900565505,0.0010857763700187206,0.00917431153357029
15 | 25%,343746.5,355.0,0.26530611515045166,0.001303780940361321,0.32013463973999023,1315.0,0.0011337868636474013,2.0,0.0008249191450886428,0.29323309659957886,0.002816901309415698,0.3636363744735718
16 | 50%,685296.0,598.0,0.3815484344959259,0.004965859930962324,0.32013463973999023,1315.0,0.004911381751298904,7.0,0.0038936184719204903,0.46341463923454285,0.007547169923782349,0.5384615659713745
17 | 75%,1031319.75,754.0,0.4686369001865387,0.01543460600078106,0.46357157826423645,1516.0,0.015517118386924267,25.0,0.01484854519367218,0.5435967445373535,0.021314388141036034,1.0
18 | max,1367532.0,921.0,0.5199321508407593,0.05910735949873924,0.46357157826423645,1516.0,0.5,109.0,0.048439253121614456,1.0,1.0,1.0
19 |
--------------------------------------------------------------------------------
/utilspy/util_analysis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | def missing_values_table(df):
4 | """Function to calculate missing values by column"""
5 | # Total missing values
6 | mis_val = df.isnull().sum()
7 |
8 | # Percentage of missing values
9 | mis_val_percent = 100 * df.isnull().sum() / len(df)
10 |
11 | # Make a table with the results
12 | mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
13 |
14 | # Rename the columns
15 | mis_val_table_ren_columns = mis_val_table.rename(
16 | columns={0: 'Missing Values', 1: '% of Total Values'})
17 |
18 | # Sort the table by percentage of missing descending
19 | mis_val_table_ren_columns = mis_val_table_ren_columns[
20 | mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
21 | '% of Total Values', ascending=False).round(3)
22 |
23 | # Print some summary information
24 | print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
25 | "There are " + str(mis_val_table_ren_columns.shape[0]) +
26 | " columns that have missing values.")
27 | # Return the dataframe with missing information
28 | return mis_val_table_ren_columns
29 | # Function to calculate correlations with the target for a dataframe
30 | def target_corrs(df,target="label",method="pearson"):
31 | # List of correlations
32 | corrs = []
33 |
34 | # Iterate through the columns
35 | for col in df.columns:
36 | # print(col)
37 | # Skip the target column
38 | if col != target:
39 | # Calculate correlation with the target
40 | corr = df[target].corr(df[col],method=method)
41 | print('The correlation between %s and the TARGET is %0.4f' % (col, corr))
42 | # Append the list as a tuple
43 | corrs.append((col, corr))
44 |
45 | # Sort by absolute magnitude of correlations
46 | corrs = sorted(corrs, key=lambda x: abs(x[1]), reverse=True)
47 |
48 | return corrs
49 | # Function to calculate mutual information with the target for a dataframe
50 | def target_mi(df,target="label"):
51 | from sklearn.feature_selection import mutual_info_classif
52 |     # List of (column, mutual information) tuples
53 | mis = []
54 |
55 | # Iterate through the columns
56 | for col in df.columns:
57 | # print(col)
58 | # Skip the target column
59 | if col != target:
60 |             # Calculate mutual information with the target (X must be 2-D)
61 |             mi = mutual_info_classif(df[col].values.reshape(-1, 1), df[target].values)[0]
62 | print('The mutual information between %s and the TARGET is %0.4f' % (col, mi))
63 | # Append the list as a tuple
64 | mis.append((col, mi))
65 |
66 |     # Sort by magnitude of mutual information
67 |     corrs = sorted(mis, key=lambda x: abs(x[1]), reverse=True)
68 |
69 | return corrs
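# A minimal usage sketch (assuming `df` is a feature dataframe with a binary "label" column):
#   corr_list = target_corrs(df, target="label", method="spearman")
#   mi_list = target_mi(df, target="label")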
70 |
71 |
72 |
--------------------------------------------------------------------------------
/utilspy/utils_feature_engineering.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import shap
4 |
5 | def collinear_columns_to_remove(df,threshold=0.8,method="pearson"):
6 | from sklearn.feature_selection import mutual_info_classif,mutual_info_regression
7 | if method in ["pearson","kendall","spearman"]:
8 | corrs = df.corr(method=method)
9 | elif method == "mi_classif":
10 | # List of mutual informations
11 | mis = []
12 |
13 | # Iterate through the columns
14 | for col in df.columns:
15 | # print(col)
16 | # Calculate correlation with the target
17 | mi = np.reshape(mutual_info_classif(df.values, df[col].values), -1).tolist()
18 | # Append the list as a tuple
19 | mis.append(mi)
20 | corrs = pd.DataFrame(np.array(mis), columns=df.columns, index=df.columns)
21 | elif method == "mi_regression":
22 | # List of mutual informations
23 | mis = []
24 |
25 | # Iterate through the columns
26 | for col in df.columns:
27 | # print(col)
28 | # Calculate correlation with the target
29 | mi = np.reshape(mutual_info_regression(df.values, df[col].values), -1).tolist()
30 | # Append the list as a tuple
31 | mis.append(mi)
32 | corrs = pd.DataFrame(np.array(mis), columns=df.columns, index=df.columns)
33 | # Set the threshold
34 | threshold = threshold
35 |
36 | # Empty dictionary to hold correlated variables
37 | above_threshold_vars = {}
38 |
39 | # For each column, record the variables that are above the threshold
40 | for col in corrs:
41 | above_threshold_vars[col] = list(corrs.index[corrs[col] > threshold])
42 | # Track columns to remove and columns already examined
43 | cols_to_remove = []
44 | cols_seen = []
45 | cols_to_remove_pair = []
46 |
47 | # Iterate through columns and correlated columns
48 | for key, value in above_threshold_vars.items():
49 | # Keep track of columns already examined
50 | cols_seen.append(key)
51 | for x in value:
52 | if x == key:
53 |                 continue  # skip self-correlation
54 | else:
55 | # Only want to remove one in a pair
56 | if x not in cols_seen:
57 | cols_to_remove.append(x)
58 | cols_to_remove_pair.append(key)
59 |
60 | cols_to_remove = list(set(cols_to_remove))
61 | print('Number of columns to remove: ', len(cols_to_remove))
62 | return cols_to_remove
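# A minimal usage sketch (assuming `train` is a numeric feature dataframe without id/label columns):
#   to_drop = collinear_columns_to_remove(train, threshold=0.95, method="pearson")
#   train = train.drop(columns=to_drop)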
63 |
64 |
65 | def remove_missing_columns(train, test, threshold=90):
66 | # Calculate missing stats for train and test (remember to calculate a percent!)
67 | train_miss = pd.DataFrame(train.isnull().sum())
68 | train_miss['percent'] = 100 * train_miss[0] / len(train)
69 |
70 | test_miss = pd.DataFrame(test.isnull().sum())
71 | test_miss['percent'] = 100 * test_miss[0] / len(test)
72 |
73 | # list of missing columns for train and test
74 | missing_train_columns = list(train_miss.index[train_miss['percent'] > threshold])
75 | missing_test_columns = list(test_miss.index[test_miss['percent'] > threshold])
76 |
77 | # Combine the two lists together
78 | missing_columns = list(set(missing_train_columns + missing_test_columns))
79 |
80 | # Print information
81 | print('There are %d columns with greater than %d%% missing values.' % (len(missing_columns), threshold))
82 |
83 | # Drop the missing columns and return
84 | train = train.drop(columns=missing_columns)
85 | test = test.drop(columns=missing_columns)
86 |
87 | return train, test
--------------------------------------------------------------------------------
/utilspy/utils_misc.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from joblib import Parallel, delayed
4 | def reduce_mem_usage(df):
5 |     """ Iterate through all the columns of a dataframe and downcast their data types
6 |         to reduce memory usage.
7 | """
8 | start_mem = df.memory_usage().sum() / 1024 ** 2
9 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
10 |
11 | for col in df.columns:
12 | col_type = df[col].dtype
13 |
14 | if col_type != object:
15 | c_min = df[col].min()
16 | c_max = df[col].max()
17 | if str(col_type)[:3] == 'int':
18 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
19 | df[col] = df[col].astype(np.int8)
20 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
21 | df[col] = df[col].astype(np.int16)
22 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
23 | df[col] = df[col].astype(np.int32)
24 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
25 | df[col] = df[col].astype(np.int64)
26 | else:
27 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
28 | df[col] = df[col].astype(np.float16)
29 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
30 | df[col] = df[col].astype(np.float32)
31 | else:
32 | df[col] = df[col].astype(np.float64)
33 | else:
34 | if df[col].nunique() < len(df[col]) >> 1:
35 | df[col] = df[col].astype('category')
36 |
37 | end_mem = df.memory_usage().sum() / 1024 ** 2
38 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
39 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
40 |
41 | return df
42 |
43 | class Reducer:
44 | """
45 |     Class that takes a dict of increasingly bigger numpy datatypes and uses it to
46 |     downcast the columns of a pandas dataframe in order to reduce memory usage.
47 | """
48 | memory_scale_factor = 1024**2 # memory in MB
49 |
50 | def __init__(self, conv_table=None):
51 | """
52 | :param conv_table: dict with np.dtypes-strings as keys
53 | """
54 | if conv_table is None:
55 | self.conversion_table = \
56 | {'int': [np.int8, np.int16, np.int32, np.int64],
57 | 'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
58 | 'float': [np.float16, np.float32, ]}
59 | else:
60 | self.conversion_table = conv_table
61 |
62 | def _type_candidates(self, k):
63 | for c in self.conversion_table[k]:
64 | i = np.iinfo(c) if 'int' in k else np.finfo(c)
65 | yield c, i
66 |
67 | def reduce(self, df, verbose=False):
68 | """Takes a dataframe and returns it with all input transformed to the
69 | smallest necessary types.
70 |
71 | :param df: pandas dataframe
72 | :param verbose: If True, outputs more information
73 | :return: pandas dataframe with reduced input types
74 | """
75 | if verbose:
76 | start_mem = df.memory_usage().sum() / 1024**2
77 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
78 | ret_list = Parallel(n_jobs=-1)(delayed(self._reduce)
79 | (df[c], c, verbose) for c in
80 | df.columns)
81 | df = pd.concat(ret_list, axis=1)
82 | if verbose:
83 | end_mem = df.memory_usage().sum() / 1024**2
84 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
85 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
86 | return df
87 |
88 | def _reduce(self, s, colname, verbose):
89 |
90 | # skip NaNs
91 | if s.isnull().any():
92 | if verbose:
93 | print(colname, 'has NaNs - Skip..')
94 | return s
95 |
96 | # detect kind of type
97 | coltype = s.dtype
98 | if np.issubdtype(coltype, np.integer):
99 | conv_key = 'int' if s.min() < 0 else 'uint'
100 | elif np.issubdtype(coltype, np.floating):
101 | conv_key = 'float'
102 | else:
103 | if s.nunique()<(len(s)>>1):
104 | return s.astype('category')
105 | if verbose:
106 | print(colname, 'is', coltype, '- Skip..')
108 | return s
109 |
110 | # find right candidate
111 | for cand, cand_info in self._type_candidates(conv_key):
112 | if s.max() <= cand_info.max and s.min() >= cand_info.min:
113 | if verbose:
114 | print('convert', colname, 'to', str(cand))
115 | return s.astype(cand)
116 |
117 |         # Reaching this point is bad: there are probably inf values or other extreme numbers
118 | print(("WARNING: {} "
119 | "doesn't fit the grid with \nmax: {} "
120 | "and \nmin: {}").format(colname, s.max(), s.min()))
121 | print('Dropping it..')
122 | def import_data(file,header=0,index_col=None):
123 | """readin a file as dataframe and optimize its memory usage"""
124 | df = pd.read_csv(file, header=header,index_col=index_col,parse_dates=True, keep_date_col=True)
125 | reducer = Reducer()
126 | df = reducer.reduce(df,verbose=True)
127 | # df = reduce_mem_usage(df)
128 | return df
129 | def convert_data(df):
130 | """create a dataframe and optimize its memory usage"""
131 | reducer = Reducer()
132 | df = reducer.reduce(df,verbose=True)
133 | # df = reduce_mem_usage(df)
134 | return df
--------------------------------------------------------------------------------
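Usage sketch for the memory-reduction helpers above: a minimal example, assuming this module (most likely utilspy/utils_misc.py) is importable and that numpy, pandas and joblib are installed. The column names, sizes and the CSV path are hypothetical and only serve to illustrate the downcasting behaviour.

import numpy as np
import pandas as pd

# Build a small frame whose columns clearly fit into narrower dtypes.
df = pd.DataFrame({
    'user_id': np.arange(100000, dtype=np.int64),                    # non-negative, fits np.uint32
    'day': np.random.randint(1, 31, size=100000).astype(np.int64),   # 1..30, fits np.uint8
    'score': np.random.rand(100000),                                 # float64, fits float16/float32
})

reducer = Reducer()                          # class defined in the file above
small_df = reducer.reduce(df, verbose=True)  # prints memory usage before and after downcasting
print(small_df.dtypes)

# Or read and shrink a CSV in one step (hypothetical path):
# train_df = import_data('data/train_features.csv')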
/utilspy/utils_models.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.metrics import roc_auc_score
6 | from sklearn.model_selection import KFold
7 | from sklearn.preprocessing import LabelEncoder
8 | import lightgbm as lgb
9 |
10 |
11 | def model(features, test_features,target="label", encoding='ohe', n_folds=5):
12 | """Train and test a light gradient boosting model using
13 | cross validation.
14 |
15 | Parameters
16 | --------
17 | features (pd.DataFrame):
18 | dataframe of training features to use
19 | for training a model. Must include the target column (default 'label').
20 | test_features (pd.DataFrame):
21 | dataframe of testing features to use
22 | for making predictions with the model.
23 | encoding (str, default = 'ohe'):
24 | method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
25 | n_folds (int, default = 5): number of folds to use for cross validation
26 |
27 | Return
28 | --------
29 | submission (pd.DataFrame):
30 | dataframe with the test `id` column and the predicted target
31 | probabilities produced by the model.
32 | feature_importances (pd.DataFrame):
33 | dataframe with the feature importances from the model.
34 | valid_metrics (pd.DataFrame):
35 | dataframe with training and validation metrics (ROC AUC) for each fold and overall.
36 |
37 | """
38 | # Extract the ids
39 | train_ids = features['id']
40 | test_ids = test_features['id']
41 |
42 | # Extract the labels for training
43 | labels = features[target]
44 |
45 | # Remove the ids and target
46 | features = features.drop(columns=['id', target])
47 | test_features = test_features.drop(columns=['id'])
48 |
49 | # One Hot Encoding
50 | if encoding == 'ohe':
51 | features = pd.get_dummies(features)
52 | test_features = pd.get_dummies(test_features)
53 |
54 | # Align the dataframes by the columns
55 | features, test_features = features.align(test_features, join='inner', axis=1)
56 |
57 | # No categorical indices to record
58 | cat_indices = 'auto'
59 |
60 | # Integer label encoding
61 | elif encoding == 'le':
62 | # Create a label encoder
63 | label_encoder = LabelEncoder()
64 |
65 | # List for storing categorical indices
66 | cat_indices = []
67 |
68 | # Iterate through each column
69 | for i, col in enumerate(features):
70 | if features[col].dtype == 'object':
71 | # Map the categorical features to integers
72 | features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
73 | test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))
74 |
75 | # Record the categorical indices
76 | cat_indices.append(i)
77 |
78 | # Catch error if label encoding scheme is not valid
79 | else:
80 | raise ValueError("Encoding must be either 'ohe' or 'le'")
81 |
82 | print('Training Data Shape: ', features.shape)
83 | print('Testing Data Shape: ', test_features.shape)
84 |
85 | # Extract feature names
86 | feature_names = list(features.columns)
87 |
88 | # Convert to np arrays
89 | features = np.array(features)
90 | test_features = np.array(test_features)
91 |
92 | # Create the kfold object
93 | k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)  # shuffle so that random_state takes effect
94 |
95 | # Empty array for feature importances
96 | feature_importance_values = np.zeros(len(feature_names))
97 | # Empty array for test predictions
98 | test_predictions = np.zeros(test_features.shape[0])
99 |
100 | # Empty array for out of fold validation predictions
101 | out_of_fold = np.zeros(features.shape[0])
102 |
103 | # Lists for recording validation and training scores
104 | valid_scores = []
105 | train_scores = []
106 |
107 | # Iterate through each fold
108 | for train_indices, valid_indices in k_fold.split(features):
109 | # Training input for the fold
110 | train_features, train_labels = features[train_indices], labels[train_indices]
111 | # Validation input for the fold
112 | valid_features, valid_labels = features[valid_indices], labels[valid_indices]
113 |
114 | # Create the model
115 | model = lgb.LGBMClassifier(n_estimators=10000, objective='binary',
116 | class_weight='balanced', learning_rate=0.05,
117 | reg_alpha=0.1, reg_lambda=0.1,
118 | subsample=0.8, n_jobs=-1, random_state=50)
119 |
120 | # Train the model
121 | model.fit(train_features, train_labels, eval_metric='auc',
122 | eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
123 | eval_names=['valid', 'train'], categorical_feature=cat_indices,
124 | early_stopping_rounds=100, verbose=200)
125 |
126 | # Record the best iteration
127 | best_iteration = model.best_iteration_
128 |
129 | # Record the feature importances
130 | feature_importance_values += model.feature_importances_ / k_fold.n_splits
131 | # Make predictions
132 | test_predictions += model.predict_proba(test_features, num_iteration=best_iteration)[:, 1] / k_fold.n_splits
133 |
134 | # Record the out of fold predictions
135 | out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration=best_iteration)[:, 1]
136 |
137 | # Record the best score
138 | valid_score = model.best_score_['valid']['auc']
139 | train_score = model.best_score_['train']['auc']
140 |
141 | valid_scores.append(valid_score)
142 | train_scores.append(train_score)
143 |
144 | # Clean up memory
145 | gc.enable()
146 | del model, train_features, valid_features
147 | gc.collect()
148 |
149 | # Make the submission dataframe
150 | submission = pd.DataFrame({'id': test_ids, target: test_predictions})
151 |
152 | # Make the feature importance dataframe
153 | feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
154 |
155 | # Overall validation score
156 | valid_auc = roc_auc_score(labels, out_of_fold)
157 |
158 | # Add the overall scores to the metrics
159 | valid_scores.append(valid_auc)
160 | train_scores.append(np.mean(train_scores))
161 |
162 | # Needed for creating dataframe of validation scores
163 | fold_names = list(range(n_folds))
164 | fold_names.append('overall')
165 | # Dataframe of validation scores
166 | metrics = pd.DataFrame({'fold': fold_names,
167 | 'train': train_scores,
168 | 'valid': valid_scores})
169 |
170 | return submission, feature_importances, metrics
--------------------------------------------------------------------------------
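A minimal usage sketch for the cross-validation helper above. The file names are hypothetical; the frames are assumed to contain an 'id' column, plus a 'label' target column in the training frame, as the docstring requires.

import pandas as pd

# Hypothetical feature files produced by the feature-engineering scripts.
train_df = pd.read_csv('train_features.csv')   # must contain 'id' and 'label'
test_df = pd.read_csv('test_features.csv')     # must contain 'id'

submission, feature_importances, metrics = model(
    train_df, test_df, target='label', encoding='ohe', n_folds=5)

print(metrics)                                  # per-fold and overall train/valid AUC
submission.to_csv('submission.csv', index=False)
print(feature_importances.sort_values('importance', ascending=False).head(20))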
/utilspy/utils_plot.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | # Plots the distribution of a variable colored by the value of the target (see kde_target below)
5 | def corr_coefficient(df,method="pearson"):
6 | import plotly.graph_objs as pgo
7 | import plotly.offline as po
8 | data = [
9 | pgo.Heatmap(
10 | z=df.corr(method=method).values,
11 | x=df.columns.values,
12 | y=df.columns.values,
13 | colorscale='Viridis',
14 | reversescale=False,
15 | text=True,
16 | opacity=1.0)
17 | ]
18 |
19 | layout = pgo.Layout(
20 | title=method+' Correlation of features',
21 | xaxis=dict(ticks='', nticks=36),
22 | yaxis=dict(ticks=''),
23 | width=900, height=700,
24 | margin=dict(
25 | l=240,
26 | ), )
27 |
28 | fig = pgo.Figure(data=data, layout=layout)
29 | po.iplot(fig, filename='labelled-heatmap')
30 |
31 |
32 | def kde_target(var_name, df,target="label"):
33 | import matplotlib.pyplot as plt # for plotting
34 | import seaborn as sns # for making plots with seaborn
35 | # Calculate the correlation coefficient between the new variable and the target
36 | corr = df[target].corr(df[var_name])
37 |
38 | # Calculate the median of the variable for each class of the target
39 | avg_target0 = df.loc[df[target] == 0, var_name].median()
40 | avg_target1 = df.loc[df[target] == 1, var_name].median()
41 |
42 | plt.figure(figsize=(12, 6))
43 |
44 | # Plot the distribution for target == 0 and target == 1
45 | sns.kdeplot(df.loc[df[target] == 0, var_name], label='label == 0')
46 | sns.kdeplot(df.loc[df[target] == 1, var_name], label='label == 1')
47 |
48 | # label the plot
49 | plt.xlabel(var_name);
50 | plt.ylabel('Density');
51 | plt.title('%s Distribution' % var_name)
52 | plt.legend();
53 |
54 | # print out the correlation
55 | print('The correlation between %s and the target is %0.4f' % (var_name, corr))
56 | # Print out the median value of the variable for each class
57 | print('Median value of %s when %s == 1: %0.4f' % (var_name, target, avg_target1))
58 | print('Median value of %s when %s == 0: %0.4f' % (var_name, target, avg_target0))
59 | def value_count_bar_default(df,series_name,title="value count bar"):
60 | temp = df[series_name].value_counts()
61 | temp = pd.DataFrame({'labels': temp.index,
62 | 'values': temp.values
63 | })
64 | temp.iplot(kind='bar', xTitle=series_name, yTitle="Count",
65 | title=title, colors=['#75e575'])
66 | import seaborn as sns
67 | plt.figure(figsize=(12, 5))
68 | plt.title("Distribution of register day")
69 | ax = sns.distplot(df_user_register["register_day"])
70 |
71 | def value_count_bar(df,series_name,title="value count plot"):
72 | import plotly.graph_objs as pgo
73 | import plotly.offline as po
74 | temp = df[series_name].value_counts()
75 | # print("Total number of states : ",len(temp))
76 | trace = pgo.Bar(
77 | x=temp.index,
78 | y=(temp / temp.sum()) * 100,
79 | )
80 | data = [trace]
81 | layout = pgo.Layout(
82 | title=title,
83 | xaxis=dict(
84 | title=series_name,
85 | tickfont=dict(
86 | size=14,
87 | color='rgb(107, 107, 107)'
88 | )
89 | ),
90 | yaxis=dict(
91 | title='Count of ' + series_name + ' in %',
92 | titlefont=dict(
93 | size=16,
94 | color='rgb(107, 107, 107)'
95 | ),
96 | tickfont=dict(
97 | size=14,
98 | color='rgb(107, 107, 107)'
99 | )
100 | )
101 | )
102 | fig = pgo.Figure(data=data, layout=layout)
103 | po.iplot(fig, filename=series_name)
104 |
105 | def value_count_pie(df,series_name,title="value count pie",hole=0.0):
106 | temp = df[series_name].value_counts()
107 | df = pd.DataFrame({'labels': temp.index,
108 | 'values': temp.values
109 | })
110 | df.iplot(kind='pie', labels='labels', values='values', title=title,hole=hole)
111 | def value_count_hole_pie(df, series_name,title="value count hole pie"):
112 | from plotly.offline import iplot
113 | temp = df[series_name].value_counts()
114 | fig = {
115 | "input": [
116 | {
117 | "values": temp.values,
118 | "labels": temp.index,
119 | "domain": {"x": [0, .48]},
120 | # "name": "Types of Loans",
121 | # "hoverinfo":"label+percent+name",
122 | "hole": .7,
123 | "type": "pie"
124 | },
125 |
126 | ],
127 | "layout": {
128 | "title": title,
129 | "annotations": [
130 | {
131 | "font": {
132 | "size": 20
133 | },
134 | "showarrow": False,
135 | "text": series_name,
136 | "x": 0.17,
137 | "y": 0.5
138 | }
139 |
140 | ]
141 | }
142 | }
143 | iplot(fig, filename='donut')
144 | def value_count_bar_with_target(df,series_name,target,title="value count with regard to target"):
145 | import plotly.graph_objs as pgo
146 | from plotly.offline import iplot
147 | temp = df["NAME_FAMILY_STATUS"].value_counts()
148 | # print(temp.values)
149 | temp_y0 = []
150 | temp_y1 = []
151 | for val in temp.index:
152 | temp_y1.append(np.sum(df[target][df[series_name] == val] == 1))
153 | temp_y0.append(np.sum(df[target][df[series_name] == val] == 0))
154 | trace1 = pgo.Bar(
155 | x=temp.index,
156 | y=(temp_y1 / temp.sum()) * 100,
157 | name='YES'
158 | )
159 | trace2 = pgo.Bar(
160 | x=temp.index,
161 | y=(temp_y0 / temp.sum()) * 100,
162 | name='NO'
163 | )
164 | data = [trace1, trace2]
165 | layout = pgo.Layout(
166 | title=title,
167 | # barmode='stack',
168 | width=1000,
169 | xaxis=dict(
170 | title=series_name,
171 | tickfont=dict(
172 | size=14,
173 | color='rgb(107, 107, 107)'
174 | )
175 | ),
176 | yaxis=dict(
177 | title='Count in %',
178 | titlefont=dict(
179 | size=16,
180 | color='rgb(107, 107, 107)'
181 | ),
182 | tickfont=dict(
183 | size=14,
184 | color='rgb(107, 107, 107)'
185 | )
186 | )
187 | )
188 |
189 | fig = pgo.Figure(data=data, layout=layout)
190 | iplot(fig)
191 | def plot_feature_importances(df):
192 | """
193 | Plot importances returned by a model. This can work with any measure of
194 | feature importance provided that higher importance is better.
195 |
196 | Args:
197 | df (dataframe): feature importances. Must have the features in a column
198 | called `feature` and the importances in a column called `importance`.
199 |
200 | Returns:
201 | shows a plot of the 15 most important features
202 |
203 | df (dataframe): feature importances sorted by importance (highest to lowest)
204 | with a column for normalized importance
205 | """
206 |
207 | # Sort features according to importance
208 | df = df.sort_values('importance', ascending=False).reset_index()
209 |
210 | # Normalize the feature importances to add up to one
211 | df['importance_normalized'] = df['importance'] / df['importance'].sum()
212 |
213 | # Make a horizontal bar chart of feature importances
214 | plt.figure(figsize=(10, 6))
215 | ax = plt.subplot()
216 |
217 | # Need to reverse the index to plot most important on top
218 | ax.barh(list(reversed(list(df.index[:15]))),
219 | df['importance_normalized'].head(15),
220 | align='center', edgecolor='k')
221 |
222 | # Set the yticks and labels
223 | ax.set_yticks(list(reversed(list(df.index[:15]))))
224 | ax.set_yticklabels(df['feature'].head(15))
225 |
226 | # Plot labeling
227 | plt.xlabel('Normalized Importance');
228 | plt.title('Feature Importances')
229 | plt.show()
230 |
231 | return df
--------------------------------------------------------------------------------
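A minimal usage sketch for the plotting helpers above. The file and column names are hypothetical; it assumes a training frame with a binary 'label' column, and a feature-importance frame with 'feature' and 'importance' columns such as the one returned by utils_models.model.

import pandas as pd

df = pd.read_csv('train_features.csv')           # hypothetical training frame with a 'label' column

# Distribution of one engineered feature split by the binary label.
kde_target('register_day', df, target='label')   # 'register_day' is a hypothetical column name

# Plot the top-15 feature importances produced by utils_models.model(...).
fi = pd.read_csv('feature_importance_shap.csv')  # assumed to hold 'feature' and 'importance' columns
fi_sorted = plot_feature_importances(fi)
print(fi_sorted.head())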