├── DataCon2019 ├── code │ ├── stage1 │ │ ├── call_pid_tfidf_stacking.ipynb │ │ ├── deep_learning_model.ipynb │ │ ├── exinfos.ipynb │ │ ├── explore.ipynb │ │ ├── feature_engineering.ipynb │ │ ├── new_feature_engineering.ipynb │ │ ├── out_of_fold.ipynb │ │ ├── ret_value_stacking.ipynb │ │ ├── stacking.ipynb │ │ └── test.ipynb │ └── stage2 │ │ ├── DBSCAN.py │ │ ├── feature_engineering.ipynb │ │ ├── for_cluster_kmeans.py │ │ ├── get_call_name_tfidf_features.py │ │ ├── plot_comparison.py │ │ └── yield_call_name_api_name_exinfos_tsne.py ├── loom_大数据安全分析比赛决赛.pdf └── useful │ ├── K-means_and_DBSCAN_cluster_comparison.jpg │ ├── K-means_and_DBSCAN_cluster_comparison.pdf │ ├── K-means_cluster_comparison.jpg │ ├── api_name_barh.pdf │ ├── call_pid_barh.pdf │ ├── draw_origin_data.jpg │ ├── exinfos_barh.pdf │ ├── rank.png │ ├── ret_value_barh.pdf │ └── table.md ├── DataCon2020 ├── PPT │ ├── loom_2020DataCon大数据安全分析比赛分享.pptx │ └── picture │ │ ├── 2020rank.png │ │ ├── ROC_curve.png │ │ ├── black.png │ │ ├── black_white_pdf.png │ │ ├── decode.png │ │ ├── features_tsne.png │ │ ├── result1.png │ │ ├── result2.png │ │ ├── tfidf.png │ │ ├── time.png │ │ ├── train_flow.png │ │ ├── vb.png │ │ ├── white.png │ │ ├── xgb1.png │ │ ├── xgb2.png │ │ ├── xgb3.png │ │ └── 方差偏差均衡.png ├── codes │ ├── bagging.py │ ├── get_id.py │ ├── get_raw_test_data.py │ ├── get_raw_train_data.py │ ├── lgb_cv.py │ ├── plot.py │ ├── t_sne.py │ ├── test_train_model.py │ ├── xgb_bagging.py │ ├── yield_end_result.py │ ├── yield_features.py │ └── yield_train_model.py ├── readme.md ├── run.sh ├── scripts │ └── yield_raw_data.sh └── setup_run.sh └── README.md /DataCon2019/code/stage1/call_pid_tfidf_stacking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import time\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import pickle\n", 13 | "import dask.array as da\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.model_selection import StratifiedKFold\n", 18 | "from sklearn.model_selection import cross_validate\n", 19 | "from sklearn.model_selection import GridSearchCV\n", 20 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 21 | "\n", 22 | "from sklearn import svm\n", 23 | "from sklearn import neighbors\n", 24 | "from sklearn import naive_bayes\n", 25 | "from sklearn.svm import LinearSVC\n", 26 | "from xgboost import XGBClassifier\n", 27 | "from sklearn.tree import DecisionTreeClassifier\n", 28 | "from sklearn.linear_model import LogisticRegression\n", 29 | "from sklearn.linear_model import LogisticRegressionCV\n", 30 | "from sklearn.tree import DecisionTreeClassifier\n", 31 | "from sklearn.gaussian_process import GaussianProcessClassifier\n", 32 | "\n", 33 | "from sklearn.ensemble import RandomForestClassifier\n", 34 | "from sklearn.ensemble import AdaBoostClassifier\n", 35 | "from sklearn.ensemble import BaggingClassifier\n", 36 | "from sklearn.ensemble import ExtraTreesClassifier\n", 37 | "from sklearn.ensemble import GradientBoostingClassifier\n", 38 | "from sklearn.ensemble import VotingClassifier\n", 39 | "\n", 40 | "from sklearn import metrics\n", 41 | "from sklearn.metrics import accuracy_score\n", 42 | "from sklearn.metrics import classification_report\n", 43 | "\n", 44 | "from sklearn.externals import joblib\n", 45 | "\n", 46 | "%config 
InlineBackend.figure_format = 'svg'\n", 47 | "%matplotlib inline\n", 48 | "\n", 49 | "import warnings\n", 50 | "warnings.filterwarnings(\"ignore\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "safe_type = pd.read_csv(\"origin_data.csv\")[\"safe_type\"]\n", 60 | "train_call_pid = pd.read_csv(\"origin_data.csv\")[\"call_pid\"]\n", 61 | "test_call_pid = pd.read_csv(\"origin_test.csv\")[\"call_pid\"]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 8, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "vectorizes = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)\n", 71 | "train_call_pid_tfidf = vectorizes.fit_transform(train_call_pid.tolist())\n", 72 | "test_call_pid_tfidf = vectorizes.transform(test_call_pid.tolist())" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 12, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "with open(\"train_call_pid_tfidf.pkl\", \"wb\") as fp:\n", 82 | " pickle.dump(train_call_pid_tfidf, fp)\n", 83 | "with open(\"test_call_pid_tfidf.pkl\", \"wb\") as fp:\n", 84 | " pickle.dump(test_call_pid_tfidf, fp)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "train_call_pid_tfidf = pd.read_pickle(\"train_call_pid_tfidf.pkl\")\n", 94 | "test_call_pid_tfidf = pd.read_pickle(\"test_call_pid_tfidf.pkl\")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 4, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "bc_model = BaggingClassifier()\n", 104 | "gbc_model = GradientBoostingClassifier()\n", 105 | "lr_model = LogisticRegression()\n", 106 | "svm_model = svm.LinearSVC()\n", 107 | "dt_model = DecisionTreeClassifier()\n", 108 | "xgb_model = XGBClassifier(max_depth=7,\n", 109 | " learning_rate=0.05,\n", 110 | " n_estimators=1000)\n", 111 | "\n", 112 | "rfc_model = RandomForestClassifier(200)\n", 113 | "etc_model = ExtraTreesClassifier()\n", 114 | "mnb_model = naive_bayes.MultinomialNB(alpha=0.01)\n", 115 | "ada_model = AdaBoostClassifier()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 5, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "def get_oof(model, x_train, y_train, x_test, n_splits):\n", 125 | " \"\"\"\n", 126 | " :@param x_train: feature matrix.\n", 127 | " :type x: np.array(M X N) or list(M X N).\n", 128 | " :@param y_train: class label.\n", 129 | " :type y: int.\n", 130 | " :@param x_test: test set feature matrix.\n", 131 | " :type x_test: np.array(M X N) or list(M X N).\n", 132 | " :@param n_splits: K-fold parameter.\n", 133 | " :type n_splits: int.\n", 134 | " \"\"\"\n", 135 | " n_train, n_test = x_train.shape[0], x_test.shape[0]\n", 136 | " kf = StratifiedKFold(n_splits=n_splits, random_state=0)\n", 137 | " oof_train = np.empty((n_train, ))\n", 138 | " oof_test = np.empty((n_test, ))\n", 139 | " oof_test_skf = np.empty((n_splits, n_test))\n", 140 | " for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):\n", 141 | " kf_x_train = x_train[train_index]\n", 142 | " kf_y_train = y_train[train_index]\n", 143 | " kf_x_test = x_train[test_index]\n", 144 | " model.fit(kf_x_train, kf_y_train)\n", 145 | " oof_train[test_index] = model.predict(kf_x_test)\n", 146 | " oof_test_skf[i, :] = model.predict(x_test)\n", 147 | " oof_test[:] = oof_test_skf.mean(axis=0)\n", 148 | " return oof_train.reshape(-1, 1), 
oof_test.reshape(-1, 1)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "train_tfidf_features = train_call_pid_tfidf\n", 158 | "test_tfidf_features = test_call_pid_tfidf\n", 159 | "try:\n", 160 | " lr_model_oof_train, lr_model_oof_test = get_oof(lr_model, \n", 161 | " train_tfidf_features.tolil(), \n", 162 | " safe_type.values,\n", 163 | " test_tfidf_features.tolil(),\n", 164 | " 10)\n", 165 | " with open(\"call_pid_lr_model_oof_train.csv\", \"wb\") as fp:\n", 166 | " pickle.dump(lr_model_oof_train, fp)\n", 167 | " with open(\"call_pid_lr_model_oof_test.csv\", \"wb\") as fp:\n", 168 | " pickle.dump(lr_model_oof_test, fp)\n", 169 | " print(\"lr success!\")\n", 170 | "except:\n", 171 | " print(\"lr error!\")\n", 172 | "try:\n", 173 | " gbc_model_oof_train, gbc_model_oof_test = get_oof(gbc_model, \n", 174 | " train_tfidf_features.tolil(), \n", 175 | " safe_type.values,\n", 176 | " test_tfidf_features.tolil(),\n", 177 | " 10)\n", 178 | " with open(\"call_pid_gbc_model_oof_train.csv\", \"wb\") as fp:\n", 179 | " pickle.dump(gbc_model_oof_train, fp)\n", 180 | " with open(\"call_pid_gbc_model_oof_test.csv\", \"wb\") as fp:\n", 181 | " pickle.dump(gbc_model_oof_test, fp)\n", 182 | " print(\"gbc success!\")\n", 183 | "except:\n", 184 | " print(\"gbc error!\")\n", 185 | "try:\n", 186 | " bc_model_oof_train, bc_model_oof_test = get_oof(bc_model, \n", 187 | " train_tfidf_features.tolil(), \n", 188 | " safe_type.values,\n", 189 | " test_tfidf_features.tolil(),\n", 190 | " 10)\n", 191 | " with open(\"call_pid_bc_model_oof_train.csv\", \"wb\") as fp:\n", 192 | " pickle.dump(bc_model_oof_train, fp)\n", 193 | " with open(\"call_pid_bc_model_oof_test.csv\", \"wb\") as fp:\n", 194 | " pickle.dump(bc_model_oof_test, fp)\n", 195 | " print(\"bc success!\")\n", 196 | "except:\n", 197 | " print(\"bc error!\")\n", 198 | "try:\n", 199 | " svm_model_oof_train, svm_model_oof_test = get_oof(svm_model, \n", 200 | " train_tfidf_features.tolil(), \n", 201 | " safe_type.values,\n", 202 | " test_tfidf_features.tolil(),\n", 203 | " 10)\n", 204 | " with open(\"call_pid_svm_model_oof_train.csv\", \"wb\") as fp:\n", 205 | " pickle.dump(svm_model_oof_train, fp)\n", 206 | " with open(\"call_pid_svm_model_oof_test.csv\", \"wb\") as fp:\n", 207 | " pickle.dump(svm_model_oof_test, fp)\n", 208 | " print(\"svm success!\")\n", 209 | "except:\n", 210 | " print(\"svm error!\")\n", 211 | "try:\n", 212 | " dt_model_oof_train, dt_model_oof_test = get_oof(dt_model, \n", 213 | " train_tfidf_features.tolil(), \n", 214 | " safe_type.values,\n", 215 | " test_tfidf_features.tolil(),\n", 216 | " 10)\n", 217 | " with open(\"call_pid_dt_model_oof_train.csv\", \"wb\") as fp:\n", 218 | " pickle.dump(dt_model_oof_train, fp)\n", 219 | " with open(\"call_pid_dt_model_oof_test.csv\", \"wb\") as fp:\n", 220 | " pickle.dump(dt_model_oof_test, fp)\n", 221 | " print(\"dt success!\")\n", 222 | "except:\n", 223 | " print(\"dt error!\")\n", 224 | "\n", 225 | " \n", 226 | "try:\n", 227 | " rfc_model_oof_train, rfc_model_oof_test = get_oof(rfc_model, \n", 228 | " train_tfidf_features.tolil(), \n", 229 | " safe_type.values,\n", 230 | " test_tfidf_features.tolil(),\n", 231 | " 10)\n", 232 | " with open(\"call_pid_rfc_model_oof_train.csv\", \"wb\") as fp:\n", 233 | " pickle.dump(rfc_model_oof_train, fp)\n", 234 | " with open(\"call_pid_rfc_model_oof_test.csv\", \"wb\") as fp:\n", 235 | " pickle.dump(rfc_model_oof_test, fp)\n", 236 | " print(\"rfc success!\")\n", 
237 | "except:\n", 238 | " print(\"rfc error!\")\n", 239 | " \n", 240 | "try:\n", 241 | " etc_model_oof_train, etc_model_oof_test = get_oof(etc_model, \n", 242 | " train_tfidf_features.tolil(), \n", 243 | " safe_type.values,\n", 244 | " test_tfidf_features.tolil(),\n", 245 | " 10)\n", 246 | " with open(\"call_pid_etc_model_oof_train.csv\", \"wb\") as fp:\n", 247 | " pickle.dump(etc_model_oof_train, fp)\n", 248 | " with open(\"call_pid_etc_model_oof_test.csv\", \"wb\") as fp:\n", 249 | " pickle.dump(etc_model_oof_test, fp)\n", 250 | " print(\"etc success!\")\n", 251 | "except:\n", 252 | " print(\"etc error!\")\n", 253 | "try:\n", 254 | " mnb_model_oof_train, mnb_model_oof_test = get_oof(mnb_model, \n", 255 | " train_tfidf_features.tolil(), \n", 256 | " safe_type.values,\n", 257 | " test_tfidf_features.tolil(),\n", 258 | " 10)\n", 259 | " with open(\"call_pid_mnb_model_oof_train.csv\", \"wb\") as fp:\n", 260 | " pickle.dump(mnb_model_oof_train, fp)\n", 261 | " with open(\"call_pid_mnb_model_oof_test.csv\", \"wb\") as fp:\n", 262 | " pickle.dump(mnb_model_oof_test, fp)\n", 263 | " print(\"mnb success!\")\n", 264 | "except:\n", 265 | " print(\"mnb error!\")\n", 266 | " \n", 267 | "try:\n", 268 | " ada_model_oof_train, ada_model_oof_test = get_oof(ada_model, \n", 269 | " train_tfidf_features.tolil(), \n", 270 | " safe_type.values,\n", 271 | " test_tfidf_features.tolil(),\n", 272 | " 10)\n", 273 | " with open(\"call_pid_ada_model_oof_train.csv\", \"wb\") as fp:\n", 274 | " pickle.dump(ada_model_oof_train, fp)\n", 275 | " with open(\"call_pid_ada_model_oof_test.csv\", \"wb\") as fp:\n", 276 | " pickle.dump(ada_model_oof_test, fp)\n", 277 | " print(\"ada success!\")\n", 278 | "except:\n", 279 | " print(\"ada error!\")\n", 280 | "\n", 281 | "try:\n", 282 | " xgb_model_oof_train, xgb_model_oof_test = get_oof(xgb_model, \n", 283 | " train_tfidf_features.tolil(), \n", 284 | " safe_type.values,\n", 285 | " test_tfidf_features.tolil(),\n", 286 | " 10)\n", 287 | " with open(\"call_pid_xgb_model_oof_train.csv\", \"wb\") as fp:\n", 288 | " pickle.dump(xgb_model_oof_train, fp)\n", 289 | " with open(\"call_pid_xgb_model_oof_test.csv\", \"wb\") as fp:\n", 290 | " pickle.dump(xgb_model_oof_test, fp)\n", 291 | " print(\"xgb success!\")\n", 292 | "except:\n", 293 | " print(\"xgb error!\")\n", 294 | "\n", 295 | "\n", 296 | "call_pid_stacking_train_10 = np.hstack([lr_model_oof_train, gbc_model_oof_train, bc_model_oof_train,\n", 297 | " svm_model_oof_train, xgb_model_oof_train, dt_model_oof_train,\n", 298 | " rfc_model_oof_train, etc_model_oof_train, mnb_model_oof_train,\n", 299 | " ada_model_oof_train])\n", 300 | "call_pid_stacking_test_10 = np.hstack([lr_model_oof_test, gbc_model_oof_test, bc_model_oof_test,\n", 301 | " svm_model_oof_test, xgb_model_oof_test, dt_model_oof_test,\n", 302 | " rfc_model_oof_test, etc_model_oof_test, mnb_model_oof_test,\n", 303 | " ada_model_oof_test])\n", 304 | "with open(\"call_pid_stacking_train_10.pkl\", \"wb\") as fp:\n", 305 | " pickle.dump(call_pid_stacking_train_10, fp)\n", 306 | " \n", 307 | "with open(\"call_pid_stacking_test_10.pkl\", \"wb\") as fp:\n", 308 | " pickle.dump(call_pid_stacking_test_10, fp)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 3", 322 | "language": "python", 323 | "name": "python3" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 
328 | "version": 3 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython3", 335 | "version": "3.6.7" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 2 340 | } 341 | -------------------------------------------------------------------------------- /DataCon2019/code/stage1/deep_learning_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 27, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from sklearn.preprocessing import MinMaxScaler\n", 10 | "from keras import models \n", 11 | "from keras import layers \n", 12 | "from keras.layers import Dropout\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "\n", 17 | "from sklearn import metrics\n", 18 | "from sklearn.metrics import accuracy_score\n", 19 | "from sklearn.metrics import classification_report\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "\n", 22 | "%config InlineBackend.figure_format = 'svg'\n", 23 | "%matplotlib inline\n", 24 | "\n", 25 | "import warnings\n", 26 | "warnings.filterwarnings(\"ignore\")" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 11, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "data = pd.read_csv(\"fliter_train_data_2.csv\")\n", 36 | "safe_type = data[\"safe_type\"]\n", 37 | "features = data.iloc[:, 2:]\n", 38 | "\n", 39 | "test = pd.read_csv(\"fliter_test_data_2.csv\")\n", 40 | "id_ = test[\"id\"]\n", 41 | "test_features = test.iloc[:, 1:]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 32, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "min_max_scaler = MinMaxScaler()\n", 51 | "train_data = min_max_scaler.fit_transform(features)\n", 52 | "test_data = min_max_scaler.fit_transform(test_features)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 40, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "train_data, test_data, train_label, test_label = train_test_split(train_data, \n", 62 | " safe_type, \n", 63 | " test_size=0.2, \n", 64 | " random_state=0)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 30, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "def build_model(dimension): \n", 74 | " model = models.Sequential() \n", 75 | " model.add(layers.Dense(64, activation='relu', input_shape=(dimension,))) \n", 76 | " model.add(Dropout(0.2))\n", 77 | " model.add(layers.Dense(128, activation='relu')) \n", 78 | " model.add(Dropout(0.2))\n", 79 | " model.add(layers.Dense(32, activation='relu'))\n", 80 | " model.add(Dropout(0.2))\n", 81 | " model.add(layers.Dense(16, activation='relu'))\n", 82 | " model.add(Dropout(0.2))\n", 83 | " model.add(layers.Dense(8, activation='relu'))\n", 84 | " model.add(Dropout(0.2))\n", 85 | " model.add(layers.Dense(1, activation='sigmoid')) \n", 86 | " model.compile(optimizer='rmsprop',\n", 87 | " loss='binary_crossentropy',\n", 88 | " metrics=['accuracy']) \n", 89 | " return model" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 41, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Epoch 1/100\n", 102 | "23983/23983 [==============================] - 1s 35us/step - loss: 0.4526 - acc: 0.8262\n", 103 | "Epoch 
2/100\n", 104 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.3507 - acc: 0.8834\n", 105 | "Epoch 3/100\n", 106 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.3146 - acc: 0.8981\n", 107 | "Epoch 4/100\n", 108 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.2854 - acc: 0.9068\n", 109 | "Epoch 5/100\n", 110 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.2603 - acc: 0.9161\n", 111 | "Epoch 6/100\n", 112 | "23983/23983 [==============================] - 0s 17us/step - loss: 0.2448 - acc: 0.9219\n", 113 | "Epoch 7/100\n", 114 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.2351 - acc: 0.9266\n", 115 | "Epoch 8/100\n", 116 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.2227 - acc: 0.9324\n", 117 | "Epoch 9/100\n", 118 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.2119 - acc: 0.9345\n", 119 | "Epoch 10/100\n", 120 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.2051 - acc: 0.9364\n", 121 | "Epoch 11/100\n", 122 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.2036 - acc: 0.9383\n", 123 | "Epoch 12/100\n", 124 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1993 - acc: 0.9409\n", 125 | "Epoch 13/100\n", 126 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1933 - acc: 0.9416\n", 127 | "Epoch 14/100\n", 128 | "23983/23983 [==============================] - 0s 16us/step - loss: 0.1849 - acc: 0.9438\n", 129 | "Epoch 15/100\n", 130 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.1840 - acc: 0.9458\n", 131 | "Epoch 16/100\n", 132 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.1816 - acc: 0.9455\n", 133 | "Epoch 17/100\n", 134 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1812 - acc: 0.9457\n", 135 | "Epoch 18/100\n", 136 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1746 - acc: 0.9489\n", 137 | "Epoch 19/100\n", 138 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1757 - acc: 0.9477\n", 139 | "Epoch 20/100\n", 140 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1749 - acc: 0.9481\n", 141 | "Epoch 21/100\n", 142 | "23983/23983 [==============================] - 0s 15us/step - loss: 0.1709 - acc: 0.9487\n", 143 | "Epoch 22/100\n", 144 | "23983/23983 [==============================] - 0s 16us/step - loss: 0.1680 - acc: 0.9513\n", 145 | "Epoch 23/100\n", 146 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1630 - acc: 0.9506\n", 147 | "Epoch 24/100\n", 148 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1665 - acc: 0.9511\n", 149 | "Epoch 25/100\n", 150 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1627 - acc: 0.9523\n", 151 | "Epoch 26/100\n", 152 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1633 - acc: 0.9508\n", 153 | "Epoch 27/100\n", 154 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1581 - acc: 0.9535\n", 155 | "Epoch 28/100\n", 156 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1586 - acc: 0.9513\n", 157 | "Epoch 29/100\n", 158 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1553 - acc: 0.9529\n", 159 | "Epoch 30/100\n", 160 | "23983/23983 [==============================] - 0s 13us/step - loss: 
0.1602 - acc: 0.9519\n", 161 | "Epoch 31/100\n", 162 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1549 - acc: 0.9533\n", 163 | "Epoch 32/100\n", 164 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1552 - acc: 0.9517\n", 165 | "Epoch 33/100\n", 166 | "23983/23983 [==============================] - 0s 16us/step - loss: 0.1520 - acc: 0.9531\n", 167 | "Epoch 34/100\n", 168 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1494 - acc: 0.9551\n", 169 | "Epoch 35/100\n", 170 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1523 - acc: 0.9541\n", 171 | "Epoch 36/100\n", 172 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1495 - acc: 0.9556\n", 173 | "Epoch 37/100\n", 174 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1477 - acc: 0.9548\n", 175 | "Epoch 38/100\n", 176 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1454 - acc: 0.9563\n", 177 | "Epoch 39/100\n", 178 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1481 - acc: 0.9557\n", 179 | "Epoch 40/100\n", 180 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1437 - acc: 0.9563\n", 181 | "Epoch 41/100\n", 182 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1451 - acc: 0.9553\n", 183 | "Epoch 42/100\n", 184 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1430 - acc: 0.9556\n", 185 | "Epoch 43/100\n", 186 | "23983/23983 [==============================] - 0s 15us/step - loss: 0.1384 - acc: 0.9569\n", 187 | "Epoch 44/100\n", 188 | "23983/23983 [==============================] - 0s 16us/step - loss: 0.1419 - acc: 0.9560\n", 189 | "Epoch 45/100\n", 190 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1380 - acc: 0.9577\n", 191 | "Epoch 46/100\n", 192 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1371 - acc: 0.9568\n", 193 | "Epoch 47/100\n", 194 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1370 - acc: 0.9575\n", 195 | "Epoch 48/100\n", 196 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.1349 - acc: 0.9580\n", 197 | "Epoch 49/100\n", 198 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1379 - acc: 0.9572\n", 199 | "Epoch 50/100\n", 200 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1343 - acc: 0.9579\n", 201 | "Epoch 51/100\n", 202 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1329 - acc: 0.9581\n", 203 | "Epoch 52/100\n", 204 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1361 - acc: 0.9584\n", 205 | "Epoch 53/100\n", 206 | "23983/23983 [==============================] - 0s 15us/step - loss: 0.1342 - acc: 0.9573\n", 207 | "Epoch 54/100\n", 208 | "23983/23983 [==============================] - 0s 15us/step - loss: 0.1298 - acc: 0.9597\n", 209 | "Epoch 55/100\n", 210 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1330 - acc: 0.9589\n", 211 | "Epoch 56/100\n", 212 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1281 - acc: 0.9593\n", 213 | "Epoch 57/100\n", 214 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1311 - acc: 0.9575\n", 215 | "Epoch 58/100\n", 216 | "23983/23983 [==============================] - 0s 12us/step - loss: 0.1286 - acc: 0.9605\n", 217 | "Epoch 59/100\n", 218 | "23983/23983 
[==============================] - 0s 13us/step - loss: 0.1305 - acc: 0.9581\n", 219 | "Epoch 60/100\n", 220 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1300 - acc: 0.9589\n", 221 | "Epoch 61/100\n", 222 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1272 - acc: 0.9579\n", 223 | "Epoch 62/100\n", 224 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1252 - acc: 0.9591\n", 225 | "Epoch 63/100\n", 226 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1258 - acc: 0.9605\n", 227 | "Epoch 64/100\n", 228 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1293 - acc: 0.9589\n", 229 | "Epoch 65/100\n", 230 | "23983/23983 [==============================] - 0s 17us/step - loss: 0.1209 - acc: 0.9611\n", 231 | "Epoch 66/100\n", 232 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1239 - acc: 0.9606\n", 233 | "Epoch 67/100\n", 234 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1256 - acc: 0.9605\n", 235 | "Epoch 68/100\n", 236 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1236 - acc: 0.9613\n", 237 | "Epoch 69/100\n", 238 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1216 - acc: 0.9613\n", 239 | "Epoch 70/100\n", 240 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1224 - acc: 0.9600\n", 241 | "Epoch 71/100\n", 242 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1239 - acc: 0.9607\n", 243 | "Epoch 72/100\n", 244 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1183 - acc: 0.9626\n", 245 | "Epoch 73/100\n", 246 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1230 - acc: 0.9617\n", 247 | "Epoch 74/100\n", 248 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1211 - acc: 0.9606\n", 249 | "Epoch 75/100\n", 250 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1213 - acc: 0.9621\n", 251 | "Epoch 76/100\n", 252 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1190 - acc: 0.9613\n", 253 | "Epoch 77/100\n", 254 | "23983/23983 [==============================] - 0s 16us/step - loss: 0.1160 - acc: 0.9635\n", 255 | "Epoch 78/100\n", 256 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1168 - acc: 0.9631\n", 257 | "Epoch 79/100\n", 258 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1169 - acc: 0.9621\n", 259 | "Epoch 80/100\n", 260 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1180 - acc: 0.9626\n", 261 | "Epoch 81/100\n", 262 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1174 - acc: 0.9631\n", 263 | "Epoch 82/100\n", 264 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1157 - acc: 0.9636\n", 265 | "Epoch 83/100\n", 266 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1170 - acc: 0.9631\n", 267 | "Epoch 84/100\n", 268 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1151 - acc: 0.9657\n", 269 | "Epoch 85/100\n", 270 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1143 - acc: 0.9632\n", 271 | "Epoch 86/100\n", 272 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1124 - acc: 0.9647\n", 273 | "Epoch 87/100\n", 274 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1115 - acc: 0.9649\n", 
275 | "Epoch 88/100\n", 276 | "23983/23983 [==============================] - 0s 17us/step - loss: 0.1179 - acc: 0.9628\n", 277 | "Epoch 89/100\n", 278 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1121 - acc: 0.9641\n", 279 | "Epoch 90/100\n", 280 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1119 - acc: 0.9650\n", 281 | "Epoch 91/100\n", 282 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1131 - acc: 0.9633\n", 283 | "Epoch 92/100\n", 284 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1113 - acc: 0.9647\n", 285 | "Epoch 93/100\n", 286 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1119 - acc: 0.9649\n", 287 | "Epoch 94/100\n", 288 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1150 - acc: 0.9630\n", 289 | "Epoch 95/100\n", 290 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1098 - acc: 0.9662\n", 291 | "Epoch 96/100\n", 292 | "23983/23983 [==============================] - 0s 13us/step - loss: 0.1105 - acc: 0.9642\n", 293 | "Epoch 97/100\n", 294 | "23983/23983 [==============================] - 0s 15us/step - loss: 0.1090 - acc: 0.9656\n", 295 | "Epoch 98/100\n", 296 | "23983/23983 [==============================] - 0s 15us/step - loss: 0.1091 - acc: 0.9654\n", 297 | "Epoch 99/100\n", 298 | "23983/23983 [==============================] - 0s 14us/step - loss: 0.1062 - acc: 0.9649\n", 299 | "Epoch 100/100\n", 300 | "23983/23983 [==============================] - 0s 16us/step - loss: 0.1105 - acc: 0.9648\n" 301 | ] 302 | }, 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "" 307 | ] 308 | }, 309 | "execution_count": 41, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "model = build_model(train_data.shape[1])\n", 316 | "model.fit(train_data, \n", 317 | " train_label, \n", 318 | " epochs=100, \n", 319 | " batch_size=524)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 43, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "predict = model.predict_classes(test_data)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 42, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "def plot(test_label, y_pred, model):\n", 338 | " font = {\"color\": \"darkred\",\n", 339 | " \"size\": 13, \n", 340 | " \"family\" : \"serif\"}\n", 341 | "\n", 342 | " accs = accuracy_score(test_label, y_pred)\n", 343 | " fpr, tpr, _ = metrics.roc_curve(test_label, y_pred)\n", 344 | " auc = metrics.roc_auc_score(test_label, y_pred)\n", 345 | " plt.style.use(\"fivethirtyeight\")\n", 346 | " fig, ax = plt.subplots()\n", 347 | " ax.plot(fpr, tpr, label=\"{}, auc=\".format(model)+str(auc), color='green', linewidth=2)\n", 348 | " ax.set_title(\"ROC curve\", fontdict=font)\n", 349 | " leg = ax.legend(loc=\"best\")\n", 350 | " text = leg.get_texts()\n", 351 | " _ = plt.setp(text, color=\"blue\") " 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 44, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "image/svg+xml": [ 362 | "\r\n", 363 | "\r\n", 365 | "\r\n", 366 | "\r\n", 367 | " \r\n", 368 | " \r\n", 371 | " \r\n", 372 | " \r\n", 373 | " \r\n", 374 | " \r\n", 380 | " \r\n", 381 | " \r\n", 382 | " \r\n", 383 | " \r\n", 389 | " \r\n", 390 | " \r\n", 391 | " \r\n", 392 | " \r\n", 393 | " \r\n", 396 | " \r\n", 397 | " \r\n", 398 | " \r\n", 399 | " \r\n", 400 | " 
\r\n", 401 | " \r\n", 422 | " \r\n", 428 | " \r\n", 429 | " \r\n", 430 | " \r\n", 431 | " \r\n", 432 | " \r\n", 433 | " \r\n", 434 | " \r\n", 435 | " \r\n", 436 | " \r\n", 437 | " \r\n", 438 | " \r\n", 441 | " \r\n", 442 | " \r\n", 443 | " \r\n", 444 | " \r\n", 445 | " \r\n", 446 | " \r\n", 470 | " \r\n", 471 | " \r\n", 472 | " \r\n", 473 | " \r\n", 474 | " \r\n", 475 | " \r\n", 476 | " \r\n", 477 | " \r\n", 478 | " \r\n", 479 | " \r\n", 480 | " \r\n", 483 | " \r\n", 484 | " \r\n", 485 | " \r\n", 486 | " \r\n", 487 | " \r\n", 488 | " \r\n", 505 | " \r\n", 506 | " \r\n", 507 | " \r\n", 508 | " \r\n", 509 | " \r\n", 510 | " \r\n", 511 | " \r\n", 512 | " \r\n", 513 | " \r\n", 514 | " \r\n", 515 | " \r\n", 518 | " \r\n", 519 | " \r\n", 520 | " \r\n", 521 | " \r\n", 522 | " \r\n", 523 | " \r\n", 553 | " \r\n", 554 | " \r\n", 555 | " \r\n", 556 | " \r\n", 557 | " \r\n", 558 | " \r\n", 559 | " \r\n", 560 | " \r\n", 561 | " \r\n", 562 | " \r\n", 563 | " \r\n", 566 | " \r\n", 567 | " \r\n", 568 | " \r\n", 569 | " \r\n", 570 | " \r\n", 571 | " \r\n", 610 | " \r\n", 611 | " \r\n", 612 | " \r\n", 613 | " \r\n", 614 | " \r\n", 615 | " \r\n", 616 | " \r\n", 617 | " \r\n", 618 | " \r\n", 619 | " \r\n", 620 | " \r\n", 623 | " \r\n", 624 | " \r\n", 625 | " \r\n", 626 | " \r\n", 627 | " \r\n", 628 | " \r\n", 641 | " \r\n", 642 | " \r\n", 643 | " \r\n", 644 | " \r\n", 645 | " \r\n", 646 | " \r\n", 647 | " \r\n", 648 | " \r\n", 649 | " \r\n", 650 | " \r\n", 651 | " \r\n", 652 | " \r\n", 653 | " \r\n", 656 | " \r\n", 657 | " \r\n", 658 | " \r\n", 659 | " \r\n", 660 | " \r\n", 661 | " \r\n", 662 | " \r\n", 663 | " \r\n", 664 | " \r\n", 665 | " \r\n", 666 | " \r\n", 667 | " \r\n", 668 | " \r\n", 669 | " \r\n", 672 | " \r\n", 673 | " \r\n", 674 | " \r\n", 675 | " \r\n", 676 | " \r\n", 677 | " \r\n", 678 | " \r\n", 679 | " \r\n", 680 | " \r\n", 681 | " \r\n", 682 | " \r\n", 683 | " \r\n", 684 | " \r\n", 685 | " \r\n", 688 | " \r\n", 689 | " \r\n", 690 | " \r\n", 691 | " \r\n", 692 | " \r\n", 693 | " \r\n", 694 | " \r\n", 695 | " \r\n", 696 | " \r\n", 697 | " \r\n", 698 | " \r\n", 699 | " \r\n", 700 | " \r\n", 701 | " \r\n", 704 | " \r\n", 705 | " \r\n", 706 | " \r\n", 707 | " \r\n", 708 | " \r\n", 709 | " \r\n", 710 | " \r\n", 711 | " \r\n", 712 | " \r\n", 713 | " \r\n", 714 | " \r\n", 715 | " \r\n", 716 | " \r\n", 717 | " \r\n", 720 | " \r\n", 721 | " \r\n", 722 | " \r\n", 723 | " \r\n", 724 | " \r\n", 725 | " \r\n", 726 | " \r\n", 727 | " \r\n", 728 | " \r\n", 729 | " \r\n", 730 | " \r\n", 731 | " \r\n", 732 | " \r\n", 733 | " \r\n", 736 | " \r\n", 737 | " \r\n", 738 | " \r\n", 739 | " \r\n", 740 | " \r\n", 741 | " \r\n", 742 | " \r\n", 743 | " \r\n", 744 | " \r\n", 745 | " \r\n", 746 | " \r\n", 747 | " \r\n", 748 | " \r\n", 749 | " \r\n", 753 | " \r\n", 754 | " \r\n", 755 | " \r\n", 758 | " \r\n", 759 | " \r\n", 760 | " \r\n", 763 | " \r\n", 764 | " \r\n", 765 | " \r\n", 768 | " \r\n", 769 | " \r\n", 770 | " \r\n", 773 | " \r\n", 774 | " \r\n", 775 | " \r\n", 776 | " \r\n", 777 | " \r\n", 812 | " \r\n", 839 | " \r\n", 862 | " \r\n", 863 | " \r\n", 884 | " \r\n", 908 | " \r\n", 932 | " \r\n", 949 | " \r\n", 973 | " \r\n", 974 | " \r\n", 975 | " \r\n", 976 | " \r\n", 977 | " \r\n", 978 | " \r\n", 979 | " \r\n", 980 | " \r\n", 981 | " \r\n", 982 | " \r\n", 983 | " \r\n", 984 | " \r\n", 985 | " \r\n", 986 | " \r\n", 987 | " \r\n", 988 | " \r\n", 999 | " \r\n", 1000 | " \r\n", 1001 | " \r\n", 1004 | " \r\n", 1005 | " \r\n", 1006 | " \r\n", 1007 | " \r\n", 1008 | " \r\n", 1009 | " \r\n", 1028 | " \r\n", 1036 | " 
\r\n", 1037 | " \r\n", 1069 | " \r\n", 1090 | " \r\n", 1111 | " \r\n", 1122 | " \r\n", 1152 | " \r\n", 1176 | " \r\n", 1208 | " \r\n", 1209 | " \r\n", 1210 | " \r\n", 1211 | " \r\n", 1212 | " \r\n", 1213 | " \r\n", 1214 | " \r\n", 1215 | " \r\n", 1216 | " \r\n", 1217 | " \r\n", 1218 | " \r\n", 1219 | " \r\n", 1220 | " \r\n", 1221 | " \r\n", 1222 | " \r\n", 1223 | " \r\n", 1224 | " \r\n", 1225 | " \r\n", 1226 | " \r\n", 1227 | " \r\n", 1228 | " \r\n", 1229 | " \r\n", 1230 | " \r\n", 1231 | " \r\n", 1232 | " \r\n", 1233 | " \r\n", 1234 | " \r\n", 1235 | " \r\n", 1236 | " \r\n", 1237 | " \r\n", 1238 | " \r\n", 1239 | " \r\n", 1240 | " \r\n", 1241 | " \r\n", 1242 | " \r\n", 1243 | " \r\n", 1244 | " \r\n", 1245 | "\r\n" 1246 | ], 1247 | "text/plain": [ 1248 | "
" 1249 | ] 1250 | }, 1251 | "metadata": {}, 1252 | "output_type": "display_data" 1253 | } 1254 | ], 1255 | "source": [ 1256 | "plot(test_label, predict, \"nn\")" 1257 | ] 1258 | }, 1259 | { 1260 | "cell_type": "code", 1261 | "execution_count": 38, 1262 | "metadata": {}, 1263 | "outputs": [], 1264 | "source": [ 1265 | "result = pd.DataFrame()\n", 1266 | "result[\"id\"] = id_\n", 1267 | "result[\"safe_type\"] = predict\n", 1268 | "result.to_csv(\"result.csv\", encoding=\"utf-8\", index=False)" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": null, 1274 | "metadata": {}, 1275 | "outputs": [], 1276 | "source": [] 1277 | } 1278 | ], 1279 | "metadata": { 1280 | "kernelspec": { 1281 | "display_name": "Python 3", 1282 | "language": "python", 1283 | "name": "python3" 1284 | }, 1285 | "language_info": { 1286 | "codemirror_mode": { 1287 | "name": "ipython", 1288 | "version": 3 1289 | }, 1290 | "file_extension": ".py", 1291 | "mimetype": "text/x-python", 1292 | "name": "python", 1293 | "nbconvert_exporter": "python", 1294 | "pygments_lexer": "ipython3", 1295 | "version": "3.6.7" 1296 | } 1297 | }, 1298 | "nbformat": 4, 1299 | "nbformat_minor": 2 1300 | } 1301 | -------------------------------------------------------------------------------- /DataCon2019/code/stage1/feature_engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "import glob\n", 11 | "import pandas as pd\n", 12 | "import pickle\n", 13 | "from collections import Counter\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "from xgboost import XGBClassifier\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "\n", 18 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 19 | "from sklearn import metrics\n", 20 | "from sklearn.metrics import accuracy_score\n", 21 | "from sklearn.metrics import classification_report\n", 22 | "\n", 23 | "from sklearn.preprocessing import MinMaxScaler\n", 24 | "from sklearn.externals import joblib\n", 25 | "\n", 26 | "%config InlineBackend.figure_format = 'svg'\n", 27 | "%matplotlib inline\n", 28 | "\n", 29 | "import warnings\n", 30 | "warnings.filterwarnings(\"ignore\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 16, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "def yield_origin_csv(file_type):\n", 40 | " flag = 1\n", 41 | " id_, api_name_list, call_pid_list, ret_value_list = [], [], [], []\n", 42 | " api_name_regex = re.compile('Get n-gram features" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 3, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "origin_train_data = pd.read_csv(\"origin_data.csv\")\n", 193 | "origin_test_data = pd.read_csv(\"origin_test.csv\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 45, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "train_data_api_name = origin_train_data[\"api_name\"]\n", 203 | "test_data_api_name = origin_test_data[\"api_name\"]" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 51, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)\n", 213 | "train_tfidf_features = vectorizer.fit_transform(train_data_api_name.tolist())\n", 214 | 
"test_tfidf_features = vectorizer.transform(test_data_api_name.tolist())" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 82, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "with open(\"train_tfidf_features.pkl\", \"wb\") as fp:\n", 224 | " pickle.dump(train_tfidf_features, fp)\n", 225 | "with open(\"test_tfidf_features.pkl\", \"wb\") as fp:\n", 226 | " pickle.dump(test_tfidf_features, fp)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 4, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "train_data_ret_value = origin_train_data[\"ret_value\"]\n", 236 | "test_data_ret_value = origin_test_data[\"ret_value\"]" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 9, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)\n", 246 | "train_tfidf_features = vectorizer.fit_transform(train_data_ret_value.tolist())\n", 247 | "test_tfidf_features = vectorizer.transform(test_data_ret_value.tolist())" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 15, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "with open(\"train_ret_value_tfidf_features.pkl\", \"wb\") as fp:\n", 257 | " pickle.dump(train_tfidf_features, fp)\n", 258 | " \n", 259 | "with open(\"test_ret_value_tfidf_features.pkl\", \"wb\") as fp:\n", 260 | " pickle.dump(test_tfidf_features, fp)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [] 269 | } 270 | ], 271 | "metadata": { 272 | "kernelspec": { 273 | "display_name": "Python 3", 274 | "language": "python", 275 | "name": "python3" 276 | }, 277 | "language_info": { 278 | "codemirror_mode": { 279 | "name": "ipython", 280 | "version": 3 281 | }, 282 | "file_extension": ".py", 283 | "mimetype": "text/x-python", 284 | "name": "python", 285 | "nbconvert_exporter": "python", 286 | "pygments_lexer": "ipython3", 287 | "version": "3.6.7" 288 | } 289 | }, 290 | "nbformat": 4, 291 | "nbformat_minor": 2 292 | } 293 | -------------------------------------------------------------------------------- /DataCon2019/code/stage1/out_of_fold.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import time\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn.model_selection import StratifiedKFold\n", 16 | "from sklearn.model_selection import cross_validate\n", 17 | "from sklearn.model_selection import GridSearchCV\n", 18 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 19 | "\n", 20 | "from sklearn import svm\n", 21 | "from sklearn import neighbors\n", 22 | "from sklearn import naive_bayes\n", 23 | "from sklearn.svm import LinearSVC\n", 24 | "from xgboost import XGBClassifier\n", 25 | "from sklearn.tree import DecisionTreeClassifier\n", 26 | "from sklearn.linear_model import LogisticRegression\n", 27 | "from sklearn.linear_model import LogisticRegressionCV\n", 28 | "from sklearn.gaussian_process import GaussianProcessClassifier\n", 29 | "\n", 30 | "from sklearn.ensemble import RandomForestClassifier\n", 31 | "from sklearn.ensemble import 
AdaBoostClassifier\n", 32 | "from sklearn.ensemble import BaggingClassifier\n", 33 | "from sklearn.ensemble import ExtraTreesClassifier\n", 34 | "from sklearn.ensemble import GradientBoostingClassifier\n", 35 | "from sklearn.ensemble import VotingClassifier\n", 36 | "\n", 37 | "from sklearn import metrics\n", 38 | "from sklearn.metrics import accuracy_score\n", 39 | "from sklearn.metrics import classification_report\n", 40 | "\n", 41 | "from sklearn.externals import joblib\n", 42 | "\n", 43 | "%config InlineBackend.figure_format = 'svg'\n", 44 | "%matplotlib inline\n", 45 | "\n", 46 | "import warnings\n", 47 | "warnings.filterwarnings(\"ignore\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "data = pd.read_csv(\"fliter_train_data.csv\")\n", 57 | "safe_type = data[\"safe_type\"]\n", 58 | "features = data.iloc[:, 2:]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 7, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "train_data, test_data, train_label, test_label = train_test_split(features, \n", 68 | " safe_type, \n", 69 | " test_size=0.2, \n", 70 | " random_state=0)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 9, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "def plot(test_label, y_pred, model):\n", 80 | " font = {\"color\": \"darkred\",\n", 81 | " \"size\": 13, \n", 82 | " \"family\" : \"serif\"}\n", 83 | "\n", 84 | " accs = accuracy_score(test_label, y_pred)\n", 85 | " fpr, tpr, _ = metrics.roc_curve(test_label, y_pred)\n", 86 | " auc = metrics.roc_auc_score(test_label, y_pred)\n", 87 | " plt.style.use(\"fivethirtyeight\")\n", 88 | " fig, ax = plt.subplots()\n", 89 | " ax.plot(fpr, tpr, label=\"{}, auc=\".format(model)+str(auc), color='green', linewidth=2)\n", 90 | " ax.set_title(\"ROC curve\", fontdict=font)\n", 91 | " leg = ax.legend(loc=\"best\")\n", 92 | " text = leg.get_texts()\n", 93 | " _ = plt.setp(text, color=\"blue\") " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 8, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "model = XGBClassifier() \n", 103 | "model.fit(train_data, train_label) \n", 104 | "y_pred = model.predict(test_data)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "The best parameter for BaggingClassifier is {'max_samples': 0.5, 'n_estimators': 300, 'random_state': 0} with a runtime of 1259.84 seconds.\n", 117 | "The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 300, 'random_state': 0} with a runtime of 2480.75 seconds.\n", 118 | "The best parameter for LogisticRegression is {'fit_intercept': False, 'random_state': 0, 'solver': 'newton-cg'} with a runtime of 300.54 seconds.\n", 119 | "The best parameter for BernoulliNB is {'alpha': 0.1} with a runtime of 1.86 seconds.\n", 120 | "The best parameter for KNeighborsClassifier is {} with a runtime of 67.43 seconds.\n", 121 | "The best parameter for XGBClassifier is {'algorithm': 'auto', 'n_neighbors': 1, 'weights': 'uniform'} with a runtime of 771.57 seconds.\n", 122 | "Total optimization time was 81.37 minutes.\n", 123 | "----------\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "grid_n_estimator = [10, 50, 100, 300]\n", 129 | "grid_ratio = [0.1, 0.25, 0.5, 0.75, 1.0]\n", 130 | 
"grid_learn = [0.01, 0.03, 0.05, 0.1, 0.25]\n", 131 | "grid_max_depth = [2, 4, 6, 8, 10, None]\n", 132 | "grid_min_samples = [5, 10, 0.03, 0.05, 0.10]\n", 133 | "grid_criterion = ['gini', 'entropy']\n", 134 | "grid_bool = [True, False]\n", 135 | "grid_seed = [0]\n", 136 | "\n", 137 | "layer_1 = [\n", 138 | " #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html\n", 139 | "# ('ada', AdaBoostClassifier()),\n", 140 | " ('bc', BaggingClassifier()),\n", 141 | "# ('etc', ExtraTreesClassifier()),\n", 142 | " ('gbc', GradientBoostingClassifier()),\n", 143 | "# ('rfc', RandomForestClassifier()),\n", 144 | "\n", 145 | " #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc\n", 146 | "# ('gpc', GaussianProcessClassifier()),\n", 147 | "\n", 148 | " #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 149 | " ('lr', LogisticRegression()),\n", 150 | "\n", 151 | " #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html\n", 152 | " ('bnb', naive_bayes.BernoulliNB()),\n", 153 | "# ('gnb', naive_bayes.GaussianNB()),\n", 154 | "\n", 155 | " #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html\n", 156 | " ('knn', neighbors.KNeighborsClassifier()),\n", 157 | "\n", 158 | " #SVM: http://scikit-learn.org/stable/modules/svm.html\n", 159 | "# ('svc', svm.SVC(probability=True)),\n", 160 | "\n", 161 | " #xgboost: http://xgboost.readthedocs.io/en/latest/model.html\n", 162 | " ('xgb', XGBClassifier())\n", 163 | "\n", 164 | " ]\n", 165 | "\n", 166 | "grid_param = [\n", 167 | "# [{\n", 168 | "# #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html\n", 169 | "# 'n_estimators': grid_n_estimator, #default=50\n", 170 | "# 'learning_rate': grid_learn, #default=1\n", 171 | "# #'algorithm': ['SAMME', 'SAMME.R'], #default=’SAMME.R\n", 172 | "# 'random_state': grid_seed\n", 173 | "# }],\n", 174 | "\n", 175 | "\n", 176 | " [{\n", 177 | " #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier\n", 178 | " 'n_estimators': grid_n_estimator, #default=10\n", 179 | " 'max_samples': grid_ratio, #default=1.0\n", 180 | " 'random_state': grid_seed\n", 181 | " }],\n", 182 | "\n", 183 | "\n", 184 | "# [{\n", 185 | "# #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier\n", 186 | "# 'n_estimators': grid_n_estimator, #default=10\n", 187 | "# 'criterion': grid_criterion, #default=”gini”\n", 188 | "# 'max_depth': grid_max_depth, #default=None\n", 189 | "# 'random_state': grid_seed\n", 190 | "# }],\n", 191 | "\n", 192 | "\n", 193 | " [{\n", 194 | " #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier\n", 195 | " #'loss': ['deviance', 'exponential'], #default=’deviance’\n", 196 | " 'learning_rate': [.05], #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.\n", 197 | " 'n_estimators': [300], #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a 
runtime of 264.45 seconds.\n", 198 | " #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”\n", 199 | " 'max_depth': grid_max_depth, #default=3 \n", 200 | " 'random_state': grid_seed\n", 201 | " }],\n", 202 | "\n", 203 | "\n", 204 | "# [{\n", 205 | "# #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier\n", 206 | "# 'n_estimators': grid_n_estimator, #default=10\n", 207 | "# 'criterion': grid_criterion, #default=”gini”\n", 208 | "# 'max_depth': grid_max_depth, #default=None\n", 209 | "# 'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.\n", 210 | "# 'random_state': grid_seed\n", 211 | "# }],\n", 212 | "\n", 213 | "# [{ \n", 214 | "# #GaussianProcessClassifier\n", 215 | "# 'max_iter_predict': grid_n_estimator, #default: 100\n", 216 | "# 'random_state': grid_seed\n", 217 | "# }],\n", 218 | "\n", 219 | "\n", 220 | " [{\n", 221 | " #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV\n", 222 | " 'fit_intercept': grid_bool, #default: True\n", 223 | " #'penalty': ['l1','l2'],\n", 224 | " 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs\n", 225 | " 'random_state': grid_seed\n", 226 | " }],\n", 227 | "\n", 228 | "\n", 229 | " [{\n", 230 | " #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB\n", 231 | " 'alpha': grid_ratio, #default: 1.0\n", 232 | " }],\n", 233 | "\n", 234 | "\n", 235 | " #GaussianNB - \n", 236 | " [{}],\n", 237 | "\n", 238 | " [{\n", 239 | " #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier\n", 240 | " 'n_neighbors': [1,2,3,4,5,6,7], #default: 5\n", 241 | " 'weights': ['uniform', 'distance'], #default = ‘uniform’\n", 242 | " 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']\n", 243 | " }],\n", 244 | "\n", 245 | "\n", 246 | "# [{\n", 247 | "# #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC\n", 248 | "# #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r\n", 249 | "# #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],\n", 250 | "# 'C': [1,2,3,4,5], #default=1.0\n", 251 | "# 'gamma': grid_ratio, #edfault: auto\n", 252 | "# 'decision_function_shape': ['ovo', 'ovr'], #default:ovr\n", 253 | "# 'probability': [True],\n", 254 | "# 'random_state': grid_seed\n", 255 | "# }],\n", 256 | "\n", 257 | "\n", 258 | " [{\n", 259 | " #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html\n", 260 | " 'learning_rate': grid_learn, #default: .3\n", 261 | " 'max_depth': [1,2,4,6,8,10], #default 2\n", 262 | " 'n_estimators': grid_n_estimator, \n", 263 | " 'seed': grid_seed \n", 264 | " }] \n", 265 | " ]\n", 266 | "\n", 267 | "\n", 268 | "\n", 269 | "start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter\n", 270 | "for clf, param in zip (layer_1, grid_param): #https://docs.python.org/3/library/functions.html#zip\n", 271 | "\n", 272 | " #print(clf[1]) #vote_est is a list of tuples, index 0 is the name and index 1 is the algorithm\n", 273 | 
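# clf and param are paired positionally by zip(layer_1, grid_param), so the two
# lists must stay index-aligned. grid_param above still carries the [{}]
# placeholder for the commented-out GaussianNB, which shifts every pairing from
# knn onward by one — hence the printed results above showing KNeighborsClassifier
# with {} and XGBClassifier with knn's parameters, while the xgb grid is never
# searched at all (zip stops at the shorter list). A name-keyed mapping avoids
# the hazard (hypothetical sketch, not the original code):
#
#     grids = dict(bc=bc_grid, gbc=gbc_grid, lr=lr_grid,
#                  bnb=bnb_grid, knn=knn_grid, xgb=xgb_grid)
#     for name, clf in layer_1:
#         best_search = GridSearchCV(clf, grids[name], cv=5, scoring='roc_auc')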
" #print(param)\n", 274 | " \n", 275 | " \n", 276 | " start = time.perf_counter() \n", 277 | " best_search = GridSearchCV(estimator = clf[1], param_grid = param, cv = 5, scoring = 'roc_auc')\n", 278 | " best_search.fit(features, safe_type)\n", 279 | " run = time.perf_counter() - start\n", 280 | "\n", 281 | " best_param = best_search.best_params_\n", 282 | " print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf[1].__class__.__name__, best_param, run))\n", 283 | " clf[1].set_params(**best_param) \n", 284 | "\n", 285 | "\n", 286 | "run_total = time.perf_counter() - start_total\n", 287 | "print('Total optimization time was {:.2f} minutes.'.format(run_total/60))\n", 288 | "print('-'*10)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 4, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "layer_1 = [\n", 298 | " #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html\n", 299 | "# ('ada', AdaBoostClassifier()),\n", 300 | " ('bc', BaggingClassifier()),\n", 301 | "# ('etc', ExtraTreesClassifier()),\n", 302 | " ('gbc', GradientBoostingClassifier()),\n", 303 | "# ('rfc', RandomForestClassifier()),\n", 304 | "\n", 305 | " #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc\n", 306 | "# ('gpc', GaussianProcessClassifier()),\n", 307 | "\n", 308 | " #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 309 | " ('lr', LogisticRegression()),\n", 310 | "\n", 311 | " #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html\n", 312 | " ('bnb', naive_bayes.BernoulliNB()),\n", 313 | "# ('gnb', naive_bayes.GaussianNB()),\n", 314 | "\n", 315 | " #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html\n", 316 | " ('knn', neighbors.KNeighborsClassifier()),\n", 317 | "\n", 318 | " #SVM: http://scikit-learn.org/stable/modules/svm.html\n", 319 | " ('svc', svm.SVC(probability=True)),\n", 320 | "\n", 321 | " #xgboost: http://xgboost.readthedocs.io/en/latest/model.html\n", 322 | " ('xgb', XGBClassifier())\n", 323 | "\n", 324 | " ]" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 5, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "Soft Voting Training w/bin score mean: 98.43\n", 337 | "Soft Voting Test w/bin score mean: 97.14\n", 338 | "Soft Voting Test w/bin score 3*std: +/- 0.68\n", 339 | "----------\n" 340 | ] 341 | } 342 | ], 343 | "source": [ 344 | "vote_soft = VotingClassifier(estimators=layer_1 , voting = 'soft')\n", 345 | "vote_soft_cv = cross_validate(vote_soft, features, safe_type, cv=5)\n", 346 | "vote_soft.fit(features, safe_type)\n", 347 | "\n", 348 | "print(\"Soft Voting Training w/bin score mean: {:.2f}\".format(vote_soft_cv['train_score'].mean()*100)) \n", 349 | "print(\"Soft Voting Test w/bin score mean: {:.2f}\".format(vote_soft_cv['test_score'].mean()*100))\n", 350 | "print(\"Soft Voting Test w/bin score 3*std: +/- {:.2f}\".format(vote_soft_cv['test_score'].std()*100*3))\n", 351 | "print('-'*10)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 9, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "Soft Voting Training w/bin score mean: 98.79\n", 364 | "Soft Voting Test w/bin score mean: 97.85\n", 365 | "Soft Voting Test w/bin score 3*std: +/- 0.46\n", 366 | "----------\n" 367 | 
] 368 | } 369 | ], 370 | "source": [ 371 | "gv_vote_soft = VotingClassifier(estimators=layer_1 , voting = 'soft')\n", 372 | "gv_vote_soft_cv = cross_validate(gv_vote_soft, features, safe_type, cv=5)\n", 373 | "gv_vote_soft.fit(features, safe_type)\n", 374 | "\n", 375 | "print(\"Soft Voting Training w/bin score mean: {:.2f}\".format(gv_vote_soft_cv['train_score'].mean()*100)) \n", 376 | "print(\"Soft Voting Test w/bin score mean: {:.2f}\".format(gv_vote_soft_cv['test_score'].mean()*100))\n", 377 | "print(\"Soft Voting Test w/bin score 3*std: +/- {:.2f}\".format(gv_vote_soft_cv['test_score'].std()*100*3))\n", 378 | "print('-'*10)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 10, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "data": { 388 | "text/plain": [ 389 | "['gv_vote_soft.m']" 390 | ] 391 | }, 392 | "execution_count": 10, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "joblib.dump(gv_vote_soft, \"gv_vote_soft.m\")" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 11, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "test = pd.read_csv(\"fliter_test_data.csv\")\n", 408 | "id_ = test[\"id\"]\n", 409 | "test_features = test.iloc[:, 1:]" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 12, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "predict = gv_vote_soft.predict(test_features)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 13, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "result = pd.DataFrame()\n", 428 | "result[\"id\"] = id_\n", 429 | "result[\"safe_type\"] = predict\n", 430 | "result.to_csv(\"result.csv\", encoding=\"utf-8\", index=False)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 21, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "bc = joblib.load(\"./models/bc_gr_model.m\")" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 449 | "import glob\n", 450 | "def read_data(file_type):\n", 451 | " data = []\n", 452 | " for path in glob.glob(\"./stage1_dataset/train/{}/*\".format(file_type)):\n", 453 | " with open(path, \"r\") as fp:\n", 454 | " data.append(fp.read())\n", 455 | " return data\n", 456 | "\n", 457 | "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=3000)\n", 458 | "white_data = read_data(\"white\")\n", 459 | "black_data = read_data(\"black\")\n", 460 | "data = white_data + black_data\n", 461 | "white = [0 for _ in range(len(white_data))]\n", 462 | "black = [1 for _ in range(len(black_data))]\n", 463 | "safe_type = white + black\n", 464 | "features = vectorizer.fit_transform(data)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 45, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "def get_oof(model, x_train, y_train, x_test, n_splits):\n", 474 | " \"\"\"\n", 475 | " :@param x_train: feature matrix.\n", 476 | " :type x: np.array(M X N) or list(M X N).\n", 477 | " :@param y_train: class label.\n", 478 | " :type y: int.\n", 479 | " :@param x_test: test set feature matrix.\n", 480 | " :type x_test: np.array(M X N) or list(M X N).\n", 481 | " :@param n_splits: K-fold parameter.\n", 482 | " :type n_splits: int.\n", 483 | " \"\"\"\n", 484 | " 
n_train, n_test = x_train.shape[0], x_test.shape[0]\n",
485 | "    kf = StratifiedKFold(n_splits=n_splits, random_state=0)\n",
486 | "    oof_train = np.empty((n_train, ))\n",
487 | "    oof_test = np.empty((n_test, ))\n",
488 | "    oof_test_skf = np.empty((n_splits, n_test))\n",
489 | "    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):\n",
490 | "        kf_x_train = x_train[train_index]\n",
491 | "        kf_y_train = y_train[train_index]\n",
492 | "        kf_x_test = x_train[test_index]\n",
493 | "        model.fit(kf_x_train, kf_y_train)\n",
494 | "        oof_train[test_index] = model.predict(kf_x_test)\n",
495 | "        oof_test_skf[i, :] = model.predict(x_test)\n",
496 | "    oof_test[:] = oof_test_skf.mean(axis=0)\n",
497 | "    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)"
498 |    ]
499 |   },
500 |   {
501 |    "cell_type": "code",
502 |    "execution_count": null,
503 |    "metadata": {},
504 |    "outputs": [],
505 |    "source": []
506 |   },
507 |   {
508 |    "cell_type": "code",
509 |    "execution_count": 67,
510 |    "metadata": {},
511 |    "outputs": [],
512 |    "source": [
513 | "import numpy as np\n",
514 | "from PIL import Image\n",
515 | "import binascii\n",
516 | "\n",
517 | "def getMatrixfrom_bin(filename, width):\n",
518 | "    with open(filename, 'rb') as f:\n",
519 | "        content = f.read()\n",
520 | "    hexst = binascii.hexlify(content)  # convert the binary file to a hex string\n",
521 | "    fh = np.array([int(hexst[i: i+2], 16) for i in range(0, len(hexst), 2)])  # split into one value per byte\n",
522 | "    rn = len(fh) // width\n",
523 | "    fh = np.reshape(fh[:rn * width], (-1, width))  # reshape into a matrix of the given width\n",
524 | "    fh = np.uint8(fh)\n",
525 | "    return fh\n"
526 |    ]
527 |   },
528 |   {
529 |    "cell_type": "code",
530 |    "execution_count": 68,
531 |    "metadata": {},
532 |    "outputs": [],
533 |    "source": [
534 | "filename = \"./pandalearning.exe\"\n",
535 | "im = Image.fromarray(getMatrixfrom_bin(filename, 512))  # convert the byte matrix to an image\n",
536 | "# im.save(\"your_img_filename.png\")"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": 69,
542 |    "metadata": {},
543 |    "outputs": [],
544 |    "source": [
545 | "im.show()"
546 |    ]
547 |   },
548 |   {
549 |    "cell_type": "code",
550 |    "execution_count": 70,
551 |    "metadata": {},
552 |    "outputs": [],
553 |    "source": [
554 | "import pefile\n",
555 | "PEfile_Path = \"pandalearning.exe\"\n",
556 | "pe = pefile.PE(PEfile_Path)"
557 |    ]
558 |   },
559 |   {
560 |    "cell_type": "code",
561 |    "execution_count": 79,
562 |    "metadata": {},
563 |    "outputs": [],
564 |    "source": [
565 | "with open(\"test.txt\", \"w\") as fp:\n",
566 | "    fp.write(str(pe))"
567 |    ]
568 |   },
569 |   {
570 |    "cell_type": "code",
571 |    "execution_count": 80,
572 |    "metadata": {},
573 |    "outputs": [],
574 |    "source": [
575 | "import re\n",
576 | "from collections import Counter\n",
577 | "# extract the opcode sequence from a .asm file\n",
578 | "def getOpcodeSequence(filename):\n",
579 | "    opcode_seq = []\n",
580 | "    p = re.compile(r'\\s([a-fA-F0-9]{2}\\s)+\\s*([a-z]+)')\n",
581 | "    with open(filename) as f:\n",
582 | "        for line in f:\n",
583 | "            if line.startswith(\".text\"):\n",
584 | "                m = re.findall(p, line)\n",
585 | "                if m:\n",
586 | "                    opc = m[0][1]  # second capture group holds the mnemonic (m[0][10] was an IndexError bug)\n",
587 | "                    if opc != \"align\":\n",
588 | "                        opcode_seq.append(opc)\n",
589 | "    return opcode_seq\n",
590 | "# count the n-grams over the opcode sequence\n",
591 | "def getOpcodeNgram(ops, n=3):\n",
592 | "    opngramlist = [tuple(ops[i:i+n]) for i in range(len(ops)-n+1)]  # +1 keeps the final n-gram\n",
593 | "    opngram = Counter(opngramlist)\n",
594 | "    return opngram\n",
595 | "file = \"test.txt\"\n",
596 | "ops = getOpcodeSequence(file)\n",
597 | "opngram = getOpcodeNgram(ops)"
598 |    ]
599 |   },
600 |   {
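A quick sanity check for the opcode n-gram cell above. This is a minimal sketch over a made-up opcode list; the local `opcode_ngrams` mirrors the (fixed) `getOpcodeNgram` so the snippet runs standalone:

```python
from collections import Counter

def opcode_ngrams(ops, n=3):
    # len(ops) - n + 1 windows, so the final n-gram is counted too
    return Counter(tuple(ops[i:i + n]) for i in range(len(ops) - n + 1))

# Toy sequence standing in for getOpcodeSequence() output on a real .asm dump
ops = ["push", "mov", "call", "push", "mov", "call", "retn"]
print(opcode_ngrams(ops))
# roughly: Counter({('push', 'mov', 'call'): 2, ('mov', 'call', 'push'): 1,
#                   ('call', 'push', 'mov'): 1, ('mov', 'call', 'retn'): 1})
```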
"cell_type": "code", 602 | "execution_count": 82, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "data = str(pe)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [] 615 | } 616 | ], 617 | "metadata": { 618 | "kernelspec": { 619 | "display_name": "Python 3", 620 | "language": "python", 621 | "name": "python3" 622 | }, 623 | "language_info": { 624 | "codemirror_mode": { 625 | "name": "ipython", 626 | "version": 3 627 | }, 628 | "file_extension": ".py", 629 | "mimetype": "text/x-python", 630 | "name": "python", 631 | "nbconvert_exporter": "python", 632 | "pygments_lexer": "ipython3", 633 | "version": "3.6.7" 634 | } 635 | }, 636 | "nbformat": 4, 637 | "nbformat_minor": 2 638 | } 639 | -------------------------------------------------------------------------------- /DataCon2019/code/stage1/ret_value_stacking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import time\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import pickle\n", 13 | "import dask.array as da\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.model_selection import StratifiedKFold\n", 18 | "from sklearn.model_selection import cross_validate\n", 19 | "from sklearn.model_selection import GridSearchCV\n", 20 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 21 | "\n", 22 | "from sklearn import svm\n", 23 | "from sklearn import neighbors\n", 24 | "from sklearn import naive_bayes\n", 25 | "from sklearn.svm import LinearSVC\n", 26 | "from xgboost import XGBClassifier\n", 27 | "from sklearn.tree import DecisionTreeClassifier\n", 28 | "from sklearn.linear_model import LogisticRegression\n", 29 | "from sklearn.linear_model import LogisticRegressionCV\n", 30 | "from sklearn.tree import DecisionTreeClassifier\n", 31 | "from sklearn.gaussian_process import GaussianProcessClassifier\n", 32 | "\n", 33 | "from sklearn.ensemble import RandomForestClassifier\n", 34 | "from sklearn.ensemble import AdaBoostClassifier\n", 35 | "from sklearn.ensemble import BaggingClassifier\n", 36 | "from sklearn.ensemble import ExtraTreesClassifier\n", 37 | "from sklearn.ensemble import GradientBoostingClassifier\n", 38 | "from sklearn.ensemble import VotingClassifier\n", 39 | "\n", 40 | "from sklearn import metrics\n", 41 | "from sklearn.metrics import accuracy_score\n", 42 | "from sklearn.metrics import classification_report\n", 43 | "\n", 44 | "from sklearn.externals import joblib\n", 45 | "\n", 46 | "%config InlineBackend.figure_format = 'svg'\n", 47 | "%matplotlib inline\n", 48 | "\n", 49 | "import warnings\n", 50 | "warnings.filterwarnings(\"ignore\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "with open(\"train_ret_value_tfidf_features.pkl\", \"rb\") as fp:\n", 60 | " train_tfidf_features = pickle.load(fp)\n", 61 | "with open(\"test_ret_value_tfidf_features.pkl\", \"rb\") as fp:\n", 62 | " test_tfidf_features = pickle.load(fp)\n", 63 | "safe_type = pd.read_csv(\"safe_type.csv\", header=None)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "bc_model = 
BaggingClassifier()\n", 73 | "gbc_model = GradientBoostingClassifier()\n", 74 | "lr_model = LogisticRegression()\n", 75 | "svm_model = svm.LinearSVC()\n", 76 | "dt_model = DecisionTreeClassifier()\n", 77 | "xgb_model = XGBClassifier(max_depth=7,\n", 78 | " learning_rate=0.05,\n", 79 | " n_estimators=1000)\n", 80 | "\n", 81 | "rfc_model = RandomForestClassifier(200)\n", 82 | "etc_model = ExtraTreesClassifier()\n", 83 | "mnb_model = naive_bayes.MultinomialNB(alpha=0.01)\n", 84 | "ada_model = AdaBoostClassifier()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "def get_oof(model, x_train, y_train, x_test, n_splits):\n", 94 | " \"\"\"\n", 95 | " :@param x_train: feature matrix.\n", 96 | " :type x: np.array(M X N) or list(M X N).\n", 97 | " :@param y_train: class label.\n", 98 | " :type y: int.\n", 99 | " :@param x_test: test set feature matrix.\n", 100 | " :type x_test: np.array(M X N) or list(M X N).\n", 101 | " :@param n_splits: K-fold parameter.\n", 102 | " :type n_splits: int.\n", 103 | " \"\"\"\n", 104 | " n_train, n_test = x_train.shape[0], x_test.shape[0]\n", 105 | " kf = StratifiedKFold(n_splits=n_splits, random_state=0)\n", 106 | " oof_train = np.empty((n_train, ))\n", 107 | " oof_test = np.empty((n_test, ))\n", 108 | " oof_test_skf = np.empty((n_splits, n_test))\n", 109 | " for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):\n", 110 | " kf_x_train = x_train[train_index]\n", 111 | " kf_y_train = y_train[train_index]\n", 112 | " kf_x_test = x_train[test_index]\n", 113 | " model.fit(kf_x_train, kf_y_train)\n", 114 | " oof_train[test_index] = model.predict(kf_x_test)\n", 115 | " oof_test_skf[i, :] = model.predict(x_test)\n", 116 | " oof_test[:] = oof_test_skf.mean(axis=0)\n", 117 | " return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "rfc success!\n", 130 | "etc success!\n", 131 | "mnb success!\n", 132 | "ada success!\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "try:\n", 138 | " lr_model_oof_train, lr_model_oof_test = get_oof(lr_model, \n", 139 | " train_tfidf_features.tolil(), \n", 140 | " safe_type.values,\n", 141 | " test_tfidf_features.tolil(),\n", 142 | " 10)\n", 143 | " with open(\"ret_value_lr_model_oof_train.csv\", \"wb\") as fp:\n", 144 | " pickle.dump(lr_model_oof_train, fp)\n", 145 | " with open(\"ret_value_lr_model_oof_test.csv\", \"wb\") as fp:\n", 146 | " pickle.dump(lr_model_oof_test, fp)\n", 147 | " print(\"lr success!\")\n", 148 | "except:\n", 149 | " print(\"lr error!\")\n", 150 | "try:\n", 151 | " gbc_model_oof_train, gbc_model_oof_test = get_oof(gbc_model, \n", 152 | " train_tfidf_features.tolil(), \n", 153 | " safe_type.values,\n", 154 | " test_tfidf_features.tolil(),\n", 155 | " 10)\n", 156 | " with open(\"ret_value_gbc_model_oof_train.csv\", \"wb\") as fp:\n", 157 | " pickle.dump(gbc_model_oof_train, fp)\n", 158 | " with open(\"ret_value_gbc_model_oof_test.csv\", \"wb\") as fp:\n", 159 | " pickle.dump(gbc_model_oof_test, fp)\n", 160 | " print(\"gbc success!\")\n", 161 | "except:\n", 162 | " print(\"gbc error!\")\n", 163 | "try:\n", 164 | " bc_model_oof_train, bc_model_oof_test = get_oof(bc_model, \n", 165 | " train_tfidf_features.tolil(), \n", 166 | " safe_type.values,\n", 167 | " test_tfidf_features.tolil(),\n", 168 | " 10)\n", 169 | " 
with open(\"ret_value_bc_model_oof_train.csv\", \"wb\") as fp:\n", 170 | " pickle.dump(bc_model_oof_train, fp)\n", 171 | " with open(\"ret_value_bc_model_oof_test.csv\", \"wb\") as fp:\n", 172 | " pickle.dump(bc_model_oof_test, fp)\n", 173 | " print(\"bc success!\")\n", 174 | "except:\n", 175 | " print(\"bc error!\")\n", 176 | "try:\n", 177 | " svm_model_oof_train, svm_model_oof_test = get_oof(svm_model, \n", 178 | " train_tfidf_features.tolil(), \n", 179 | " safe_type.values,\n", 180 | " test_tfidf_features.tolil(),\n", 181 | " 10)\n", 182 | " with open(\"ret_value_svm_model_oof_train.csv\", \"wb\") as fp:\n", 183 | " pickle.dump(svm_model_oof_train, fp)\n", 184 | " with open(\"ret_value_svm_model_oof_test.csv\", \"wb\") as fp:\n", 185 | " pickle.dump(svm_model_oof_test, fp)\n", 186 | " print(\"svm success!\")\n", 187 | "except:\n", 188 | " print(\"svm error!\")\n", 189 | "try:\n", 190 | " dt_model_oof_train, dt_model_oof_test = get_oof(dt_model, \n", 191 | " train_tfidf_features.tolil(), \n", 192 | " safe_type.values,\n", 193 | " test_tfidf_features.tolil(),\n", 194 | " 10)\n", 195 | " with open(\"ret_value_dt_model_oof_train.csv\", \"wb\") as fp:\n", 196 | " pickle.dump(dt_model_oof_train, fp)\n", 197 | " with open(\"ret_value_dt_model_oof_test.csv\", \"wb\") as fp:\n", 198 | " pickle.dump(dt_model_oof_test, fp)\n", 199 | " print(\"dt success!\")\n", 200 | "except:\n", 201 | " print(\"dt error!\")\n", 202 | "try:\n", 203 | " xgb_model_oof_train, xgb_model_oof_test = get_oof(xgb_model, \n", 204 | " train_tfidf_features.tolil(), \n", 205 | " safe_type.values,\n", 206 | " test_tfidf_features.tolil(),\n", 207 | " 10)\n", 208 | " with open(\"ret_value_xgb_model_oof_train.csv\", \"wb\") as fp:\n", 209 | " pickle.dump(xgb_model_oof_train, fp)\n", 210 | " with open(\"ret_value_xgb_model_oof_test.csv\", \"wb\") as fp:\n", 211 | " pickle.dump(xgb_model_oof_test, fp)\n", 212 | " print(\"xgb success!\")\n", 213 | "except:\n", 214 | " print(\"xgb error!\")\n", 215 | "try:\n", 216 | " rfc_model_oof_train, rfc_model_oof_test = get_oof(rfc_model, \n", 217 | " train_tfidf_features.tolil(), \n", 218 | " safe_type.values,\n", 219 | " test_tfidf_features.tolil(),\n", 220 | " 10)\n", 221 | " with open(\"ret_value_rfc_model_oof_train.csv\", \"wb\") as fp:\n", 222 | " pickle.dump(rfc_model_oof_train, fp)\n", 223 | " with open(\"ret_value_rfc_model_oof_test.csv\", \"wb\") as fp:\n", 224 | " pickle.dump(rfc_model_oof_test, fp)\n", 225 | " print(\"rfc success!\")\n", 226 | "except:\n", 227 | " print(\"rfc error!\")\n", 228 | " \n", 229 | "try:\n", 230 | " etc_model_oof_train, etc_model_oof_test = get_oof(etc_model, \n", 231 | " train_tfidf_features.tolil(), \n", 232 | " safe_type.values,\n", 233 | " test_tfidf_features.tolil(),\n", 234 | " 10)\n", 235 | " with open(\"ret_value_etc_model_oof_train.csv\", \"wb\") as fp:\n", 236 | " pickle.dump(etc_model_oof_train, fp)\n", 237 | " with open(\"ret_value_etc_model_oof_test.csv\", \"wb\") as fp:\n", 238 | " pickle.dump(etc_model_oof_test, fp)\n", 239 | " print(\"etc success!\")\n", 240 | "except:\n", 241 | " print(\"etc error!\")\n", 242 | "try:\n", 243 | " mnb_model_oof_train, mnb_model_oof_test = get_oof(mnb_model, \n", 244 | " train_tfidf_features.tolil(), \n", 245 | " safe_type.values,\n", 246 | " test_tfidf_features.tolil(),\n", 247 | " 10)\n", 248 | " with open(\"ret_value_mnb_model_oof_train.csv\", \"wb\") as fp:\n", 249 | " pickle.dump(mnb_model_oof_train, fp)\n", 250 | " with open(\"ret_value_mnb_model_oof_test.csv\", \"wb\") as fp:\n", 251 | " 
pickle.dump(mnb_model_oof_test, fp)\n", 252 | " print(\"mnb success!\")\n", 253 | "except:\n", 254 | " print(\"mnb error!\")\n", 255 | " \n", 256 | "try:\n", 257 | " ada_model_oof_train, ada_model_oof_test = get_oof(ada_model, \n", 258 | " train_tfidf_features.tolil(), \n", 259 | " safe_type.values,\n", 260 | " test_tfidf_features.tolil(),\n", 261 | " 10)\n", 262 | " with open(\"ret_value_ada_model_oof_train.csv\", \"wb\") as fp:\n", 263 | " pickle.dump(ada_model_oof_train, fp)\n", 264 | " with open(\"ret_value_ada_model_oof_test.csv\", \"wb\") as fp:\n", 265 | " pickle.dump(ada_model_oof_test, fp)\n", 266 | " print(\"ada success!\")\n", 267 | "except:\n", 268 | " print(\"ada error!\")\n", 269 | "\n", 270 | "\n", 271 | "ret_value_stacking_train_10 = np.hstack([lr_model_oof_train, gbc_model_oof_train, bc_model_oof_train,\n", 272 | " svm_model_oof_train, xgb_model_oof_train, dt_model_oof_train,\n", 273 | " rfc_model_oof_train, etc_model_oof_train, mnb_model_oof_train,\n", 274 | " ada_model_oof_train])\n", 275 | "ret_value_stacking_test_10 = np.hstack([lr_model_oof_test, gbc_model_oof_test, bc_model_oof_test,\n", 276 | " svm_model_oof_test, xgb_model_oof_test, dt_model_oof_test,\n", 277 | " rfc_model_oof_test, etc_model_oof_test, mnb_model_oof_test,\n", 278 | " ada_model_oof_test])\n", 279 | "with open(\"ret_value_stacking_train_10.pkl\", \"wb\") as fp:\n", 280 | " pickle.dump(ret_value_stacking_train_10, fp)\n", 281 | " \n", 282 | "with open(\"ret_value_stacking_test_10.pkl\", \"wb\") as fp:\n", 283 | " pickle.dump(ret_value_stacking_test_10, fp)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 2, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "import pandas as pd" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 11, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "lr = pd.read_pickle(\"exinfos_lr_model_oof_train.csv\")" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 10, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "9251.0" 313 | ] 314 | }, 315 | "execution_count": 10, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "lr.sum()" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 12, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "9464.0" 333 | ] 334 | }, 335 | "execution_count": 12, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "lr.sum()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 6, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "t = pd.read_pickle(\"api_name_and_ret_value_stacked_mix_train.pkl\")" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 8, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "array([9669., 9726., 9800., 9817., 9992., 9650., 9872., 9582., 9836.,\n", 362 | " 9844., 9128., 9116., 9575., 9559., 9885., 9487., 9613., 9308.,\n", 363 | " 9653., 9425.])" 364 | ] 365 | }, 366 | "execution_count": 8, 367 | "metadata": {}, 368 | "output_type": "execute_result" 369 | } 370 | ], 371 | "source": [ 372 | "t.sum(axis=0)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [] 381 | } 382 | ], 383 | "metadata": { 384 | 
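The `np.hstack` above turns the ten base models' out-of-fold predictions into a 10-column feature matrix; a second-level model is then trained on those columns. A minimal sketch of that meta-learning step, assuming the pickles written above — the choice of LogisticRegression as the meta-model is illustrative, not taken from this notebook:

```python
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression

# One OOF prediction column per base model (10 columns in total)
with open("ret_value_stacking_train_10.pkl", "rb") as fp:
    stack_train = pickle.load(fp)
with open("ret_value_stacking_test_10.pkl", "rb") as fp:
    stack_test = pickle.load(fp)
safe_type = pd.read_csv("safe_type.csv", header=None)

# Second-level (meta) model fit on the base models' OOF predictions
meta_model = LogisticRegression()
meta_model.fit(stack_train, safe_type.values.ravel())
predict = meta_model.predict(stack_test)
```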
"kernelspec": { 385 | "display_name": "Python 3", 386 | "language": "python", 387 | "name": "python3" 388 | }, 389 | "language_info": { 390 | "codemirror_mode": { 391 | "name": "ipython", 392 | "version": 3 393 | }, 394 | "file_extension": ".py", 395 | "mimetype": "text/x-python", 396 | "name": "python", 397 | "nbconvert_exporter": "python", 398 | "pygments_lexer": "ipython3", 399 | "version": "3.6.7" 400 | } 401 | }, 402 | "nbformat": 4, 403 | "nbformat_minor": 2 404 | } 405 | -------------------------------------------------------------------------------- /DataCon2019/code/stage2/DBSCAN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import sklearn.cluster as skc 4 | 5 | api_name_svded_train = pd.read_pickle("api_name_svded_features.pkl") 6 | exinfos_svded_train = pd.read_pickle("exinfos_svded_features.pkl") 7 | call_name_svded_train = pd.read_pickle("call_name_svded_features.pkl") 8 | 9 | merge_data = np.hstack([api_name_svded_train, exinfos_svded_train, call_name_svded_train]) 10 | dbscan = skc.DBSCAN() 11 | y_pred = dbscan.fit_predict(merge_data) 12 | 13 | result = pd.DataFrame() 14 | result["id"] = pd.read_csv("id.csv", names=["id"])["id"] 15 | result["family_id"] = y_pred 16 | 17 | result.to_csv("result.csv", encoding="utf-8", index=False) 18 | -------------------------------------------------------------------------------- /DataCon2019/code/stage2/feature_engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "import glob\n", 11 | "import time\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "import pickle\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "\n", 17 | "from sklearn.model_selection import train_test_split\n", 18 | "from sklearn.model_selection import StratifiedKFold\n", 19 | "from sklearn.model_selection import cross_validate\n", 20 | "from sklearn.model_selection import GridSearchCV\n", 21 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 22 | "\n", 23 | "from sklearn import svm\n", 24 | "from sklearn import neighbors\n", 25 | "from sklearn import naive_bayes\n", 26 | "from sklearn.svm import LinearSVC\n", 27 | "# from xgboost import XGBClassifier\n", 28 | "from sklearn.cluster import KMeans\n", 29 | "from sklearn.decomposition import TruncatedSVD \n", 30 | "from sklearn.tree import DecisionTreeClassifier\n", 31 | "from sklearn.linear_model import LogisticRegression\n", 32 | "from sklearn.linear_model import LogisticRegressionCV\n", 33 | "from sklearn.tree import DecisionTreeClassifier\n", 34 | "from sklearn.gaussian_process import GaussianProcessClassifier\n", 35 | "\n", 36 | "from sklearn.ensemble import RandomForestClassifier\n", 37 | "from sklearn.ensemble import AdaBoostClassifier\n", 38 | "from sklearn.ensemble import BaggingClassifier\n", 39 | "from sklearn.ensemble import ExtraTreesClassifier\n", 40 | "from sklearn.ensemble import GradientBoostingClassifier\n", 41 | "from sklearn.ensemble import VotingClassifier\n", 42 | "\n", 43 | "from sklearn import metrics\n", 44 | "from sklearn.metrics import accuracy_score\n", 45 | "from sklearn.metrics import classification_report\n", 46 | "\n", 47 | "from sklearn.externals import joblib\n", 48 | "\n", 49 | "%config InlineBackend.figure_format = 'svg'\n", 50 | "%matplotlib inline\n", 51 | "\n", 52 
| "import warnings\n", 53 | "warnings.filterwarnings(\"ignore\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "def yield_origin_csv():\n", 63 | " flag = 1\n", 64 | " id_, api_name_list, exinfos_list = [], [], []\n", 65 | " api_name_regex = re.compile('\n", 108 | "\n", 121 | "\n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
idapi_nameexinfos
03ec88410420dd913bf5676b2ba0ae4baa41dad0d55df9b...AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...user32 gdi32 mfc42 msvcrt imm32 advapi32 rpcrt...
12dfd653c6b862500ff7c47615ad0725a8ce88ddb8ee083...AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak...mpr advapi32 rpcrt4 secur32 user32 gdi32 imm32...
2fb7ae8ad837ee4c2afc58bc321e6bfddb6564a6bce3743...AnalyzeStart Fake_BeCreatedEx TryToAnalyze Unp...user32 gdi32 advapi32 rpcrt4 secur32 oleaut32 ...
3c97a29518ee63fecae29dd973941b8395bd3aaceb11c52...AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...user32 gdi32 advapi32 rpcrt4 secur32 iphlpapi ...
4fb146a3d534cfc36b325bc1c4d7995122b722eb5ae04d5...AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ...mfc42 msvcrt gdi32 user32 imm32 advapi32 rpcrt...
\n", 163 | "" 164 | ], 165 | "text/plain": [ 166 | " id \\\n", 167 | "0 3ec88410420dd913bf5676b2ba0ae4baa41dad0d55df9b... \n", 168 | "1 2dfd653c6b862500ff7c47615ad0725a8ce88ddb8ee083... \n", 169 | "2 fb7ae8ad837ee4c2afc58bc321e6bfddb6564a6bce3743... \n", 170 | "3 c97a29518ee63fecae29dd973941b8395bd3aaceb11c52... \n", 171 | "4 fb146a3d534cfc36b325bc1c4d7995122b722eb5ae04d5... \n", 172 | "\n", 173 | " api_name \\\n", 174 | "0 AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa... \n", 175 | "1 AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak... \n", 176 | "2 AnalyzeStart Fake_BeCreatedEx TryToAnalyze Unp... \n", 177 | "3 AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa... \n", 178 | "4 AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ... \n", 179 | "\n", 180 | " exinfos \n", 181 | "0 user32 gdi32 mfc42 msvcrt imm32 advapi32 rpcrt... \n", 182 | "1 mpr advapi32 rpcrt4 secur32 user32 gdi32 imm32... \n", 183 | "2 user32 gdi32 advapi32 rpcrt4 secur32 oleaut32 ... \n", 184 | "3 user32 gdi32 advapi32 rpcrt4 secur32 iphlpapi ... \n", 185 | "4 mfc42 msvcrt gdi32 user32 imm32 advapi32 rpcrt... " 186 | ] 187 | }, 188 | "execution_count": 3, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "data.head()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 4, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "data.fillna(method=\"ffill\", inplace=True)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 5, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "\n", 216 | "RangeIndex: 60000 entries, 0 to 59999\n", 217 | "Data columns (total 3 columns):\n", 218 | "id 60000 non-null object\n", 219 | "api_name 60000 non-null object\n", 220 | "exinfos 60000 non-null object\n", 221 | "dtypes: object(3)\n", 222 | "memory usage: 1.4+ MB\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "data.info()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 13, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "api_name_vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=100000)\n", 237 | "api_name_train_tfidf_features = api_name_vectorizer.fit_transform(data[\"api_name\"].tolist())\n", 238 | "\n", 239 | "exinfos_vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=100000)\n", 240 | "exinfos_train_tfidf_features = exinfos_vectorizer.fit_transform(data[\"exinfos\"].tolist())" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 10, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "with open(\"api_name_train_tfidf_features.pkl\", \"wb\") as fp:\n", 250 | " pickle.dump(api_name_train_tfidf_features, fp)\n", 251 | "with open(\"exinfos_train_tfidf_features.pkl\", \"wb\") as fp:\n", 252 | " pickle.dump(exinfos_train_tfidf_features, fp)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 2, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "api_name_train_tfidf_features = pd.read_pickle(\"api_name_train_tfidf_features.pkl\")\n", 262 | "exinfos_train_tfidf_features = pd.read_pickle(\"exinfos_svded_features.pkl\")" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "svd = TruncatedSVD(n_components=1000, algorithm=\"arpack\", random_state=0)\n", 272 | "svded_train = 
svd.fit_transform(api_name_train_tfidf_features.tolil())\n", 273 | "svd = TruncatedSVD(n_components=10000, algorithm=\"arpack\", random_state=0)\n", 274 | "exinfos_svded_train = svd.fit_transform(exinfos_train_tfidf_features.tolil())\n", 275 | "with open(\"api_name_svded_10000_features.pkl\", \"wb\") as fp:\n", 276 | " pickle.dump(svded_train, fp)\n", 277 | "with open(\"exinfos_svded_10000_features.pkl\", \"wb\") as fp:\n", 278 | " pickle.dump(exinfos_svded_train, fp)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 3, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "api_name_svded_train = pd.read_pickle(\"api_name_svded_features.pkl\")\n", 288 | "exinfos_svded_train = pd.read_pickle(\"exinfos_svded_features.pkl\")" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 4, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "merge_data = np.hstack([api_name_svded_train, exinfos_svded_train])" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 5, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/plain": [ 308 | "(60000, 2000)" 309 | ] 310 | }, 311 | "execution_count": 5, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "merge_data.shape" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 7, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "kmeans = KMeans(n_clusters=50, random_state=0)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 8, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "y_pred = kmeans.fit_predict(merge_data)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 11, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "result = pd.DataFrame()\n", 345 | "result[\"id\"] = data[\"id\"]\n", 346 | "result[\"family_id\"] = y_pred" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 14, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "result.to_csv(\"result.csv\", encoding=\"utf-8\", index=False)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 25, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "y_pred = pd.read_csv(\"result.csv\")[\"family_id\"]" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 44, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "exinfos = pd.read_pickle(\"exinfos_svded_features.pkl\")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 6, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "from sklearn.manifold import TSNE\n", 383 | "\n", 384 | "X_tsne = TSNE(n_components=2, random_state=33).fit_transform(merge_data)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "with open(\"api_name_exinfos_stne_data.pkl\", \"wb\") as fp:\n", 394 | " pickle.dump(X_tsne, fp)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 2, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "X_tsne = pd.read_pickle(\"call_name_tsne_data.pkl\")" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 9, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "font = {\"color\": \"darkred\",\n", 413 | " \"size\": 13, \n", 414 
| " \"family\" : \"serif\"}\n", 415 | "\n", 416 | "plt.style.use(\"dark_background\")\n", 417 | "plt.figure()\n", 418 | "plt.scatter(X_tsne[:, 0], X_tsne[:, 1])\n", 419 | "plt.title(\"origin_data_t-SNE\", fontdict=font)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 10, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "y_pred = pd.read_csv(\"34.78_k=100.csv\")[\"family_id\"]" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 19, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "font = {\"color\": \"darkred\",\n", 438 | " \"size\": 13, \n", 439 | " \"family\" : \"serif\"}\n", 440 | "\n", 441 | "plt.style.use(\"dark_background\")\n", 442 | "plt.figure()\n", 443 | "plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred.values, alpha=0.6, \n", 444 | " cmap=plt.cm.get_cmap('rainbow', 100))\n", 445 | "plt.title(\"api_name_and_exinfos_t-SNE\", fontdict=font)\n", 446 | "cbar = plt.colorbar() \n", 447 | "cbar.set_label(label='family id', fontdict=font)\n", 448 | "plt.clim(-5, 100)\n", 449 | "plt.tight_layout()\n", 450 | "plt.savefig(\"api_name_and_exinfos_TSNE.pdf\")" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 21, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "call_name_svded_features = pd.read_pickle(\"call_name_svded_features.pkl\")\n", 460 | "api_name_svded_features = pd.read_pickle(\"api_name_svded_features.pkl\")\n", 461 | "exinfos_svded_features = pd.read_pickle(\"exinfos_svded_features.pkl\")\n", 462 | "merge_data = np.hstack([api_name_svded_features, exinfos_svded_features, call_name_svded_features])" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 26, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "kmeans = KMeans(n_clusters=100, random_state=0)\n", 472 | "y_pred = kmeans.fit_predict(merge_data)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 27, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "cluster = 100\n", 482 | "result = pd.DataFrame()\n", 483 | "result[\"id\"] = pd.read_csv(\"id.csv\", names=[\"id\"])[\"id\"]\n", 484 | "result[\"family_id\"] = y_pred\n", 485 | "\n", 486 | "result.to_csv(f\"k-means_cluster={cluster}_result.csv\", encoding=\"utf-8\", index=False)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 48, 492 | "metadata": {}, 493 | "outputs": [ 494 | { 495 | "data": { 496 | "text/plain": [ 497 | "2.6399999999999997" 498 | ] 499 | }, 500 | "execution_count": 48, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "10.53 - 7.89" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 49, 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "data": { 516 | "text/plain": [ 517 | "7.89" 518 | ] 519 | }, 520 | "execution_count": 49, 521 | "metadata": {}, 522 | "output_type": "execute_result" 523 | } 524 | ], 525 | "source": [ 526 | "2.63 * 3" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | "display_name": "Python 3", 540 | "language": "python", 541 | "name": "python3" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": { 545 | "name": "ipython", 546 | "version": 3 547 | }, 548 | "file_extension": ".py", 549 | "mimetype": "text/x-python", 550 | "name": "python", 551 | 
"nbconvert_exporter": "python", 552 | "pygments_lexer": "ipython3", 553 | "version": "3.6.7" 554 | } 555 | }, 556 | "nbformat": 4, 557 | "nbformat_minor": 2 558 | } 559 | -------------------------------------------------------------------------------- /DataCon2019/code/stage2/for_cluster_kmeans.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.cluster import KMeans 5 | 6 | call_name_svded_features = pd.read_pickle("call_name_svded_features.pkl") 7 | api_name_svded_features = pd.read_pickle("api_name_svded_features.pkl") 8 | exinfos_svded_features = pd.read_pickle("exinfos_svded_features.pkl") 9 | merge_data = np.hstack([api_name_svded_features, exinfos_svded_features, call_name_svded_features]) 10 | 11 | for cluster in [50, 250, 300, 400, 500]: 12 | kmeans = KMeans(n_clusters=cluster, random_state=0) 13 | y_pred = kmeans.fit_predict(merge_data) 14 | result = pd.DataFrame() 15 | result["id"] = pd.read_csv("id.csv", names=["id"])["id"] 16 | result["family_id"] = y_pred 17 | 18 | result.to_csv(f"k-means_cluster={cluster}_result.csv", encoding="utf-8", index=False) -------------------------------------------------------------------------------- /DataCon2019/code/stage2/get_call_name_tfidf_features.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.manifold import TSNE 5 | from sklearn.decomposition import TruncatedSVD 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | 8 | 9 | data = pd.read_csv("call_name.csv") 10 | call_name_vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9) 11 | call_name_train_tfidf_features = call_name_vectorizer.fit_transform(data["call_name"].tolist()) 12 | with open("call_name_tfidf_features.pkl", "wb") as fp: 13 | pickle.dump(call_name_train_tfidf_features, fp) 14 | 15 | svd = TruncatedSVD(n_components=1000, algorithm="arpack", random_state=0) 16 | call_name_svded_train = svd.fit_transform(call_name_train_tfidf_features.tolil()) 17 | 18 | with open("call_name_svded_features.pkl", "wb") as fp: 19 | pickle.dump(call_name_svded_train, fp) 20 | 21 | X_tsne = TSNE(n_components=2, random_state=33).fit_transform(call_name_svded_train) 22 | 23 | with open("call_name_tsne_data.pkl", "wb") as fp: 24 | pickle.dump(X_tsne, fp) -------------------------------------------------------------------------------- /DataCon2019/code/stage2/plot_comparison.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | X_tsne = pd.read_pickle("api_name_exinfos_call_name_tsne_data.pkl") 7 | dbscan_y_pred = pd.read_csv("result.csv")["family_id"] 8 | kmeans_50 = pd.read_csv("k-means_cluster=50_result.csv")["family_id"] 9 | kmeans_100 = pd.read_csv("k-means_cluster=100_result.csv")["family_id"] 10 | kmeans_200 = pd.read_csv("k-means_cluster=200_result.csv")["family_id"] 11 | kmeans_250 = pd.read_csv("k-means_cluster=250_result.csv")["family_id"] 12 | kmeans_300 = pd.read_csv("k-means_cluster=300_result.csv")["family_id"] 13 | kmeans_400 = pd.read_csv("k-means_cluster=400_result.csv")["family_id"] 14 | kmeans_500 = pd.read_csv("k-means_cluster=500_result.csv")["family_id"] 15 | 16 | font = {"color": "darkred", 17 | "size": 25, 18 | "family" : "serif"} 19 | 20 | plt.style.use("dark_background") 21 | 
plt.figure(figsize=(30, 25))
22 | 
23 | plt.subplot(3, 3, 1)
24 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_50.values, alpha=0.6,
25 |             cmap=plt.cm.get_cmap('rainbow', 50))
26 | plt.title("K-means_cluster=50_t-SNE", fontdict=font)
27 | cbar = plt.colorbar()
28 | cbar.set_label(label='family id', fontdict=font)
29 | plt.clim(0, 50)
30 | 
31 | plt.subplot(3, 3, 2)
32 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=dbscan_y_pred.values, alpha=0.6,
33 |             cmap=plt.cm.get_cmap('rainbow', dbscan_y_pred.max()-dbscan_y_pred.min()))
34 | plt.title("DBSCAN_t-SNE", fontdict=font)
35 | cbar = plt.colorbar()
36 | cbar.set_label(label='family id', fontdict=font)
37 | plt.clim(dbscan_y_pred.min(), 1000)
38 | 
39 | plt.subplot(3, 3, 3)
40 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_100.values, alpha=0.6,
41 |             cmap=plt.cm.get_cmap('rainbow', 100))
42 | plt.title("K-means_cluster=100_t-SNE", fontdict=font)
43 | cbar = plt.colorbar()
44 | cbar.set_label(label='family id', fontdict=font)
45 | plt.clim(0, 100)
46 | 
47 | plt.subplot(3, 3, 4)
48 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_200.values, alpha=0.6,
49 |             cmap=plt.cm.get_cmap('rainbow', 200))
50 | plt.title("K-means_cluster=200_t-SNE", fontdict=font)
51 | cbar = plt.colorbar()
52 | cbar.set_label(label='family id', fontdict=font)
53 | plt.clim(0, 200)
54 | 
55 | plt.subplot(3, 3, 5)
56 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.6, c=np.ones(60000), cmap=plt.cm.get_cmap('rainbow', 1))
57 | plt.title("origin_data_t-SNE", fontdict=font)
58 | cbar = plt.colorbar(ticks=[0])
59 | cbar.set_label(label='color bar', fontdict=font)
60 | 
61 | plt.subplot(3, 3, 6)
62 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_250.values, alpha=0.6,
63 |             cmap=plt.cm.get_cmap('rainbow', 250))
64 | plt.title("K-means_cluster=250_t-SNE", fontdict=font)
65 | cbar = plt.colorbar()
66 | cbar.set_label(label='family id', fontdict=font)
67 | plt.clim(0, 250)
68 | 
69 | plt.subplot(3, 3, 7)
70 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_300.values, alpha=0.6,
71 |             cmap=plt.cm.get_cmap('rainbow', 300))
72 | plt.title("K-means_cluster=300_t-SNE", fontdict=font)
73 | cbar = plt.colorbar()
74 | cbar.set_label(label='family id', fontdict=font)
75 | plt.clim(0, 300)
76 | 
77 | plt.subplot(3, 3, 8)
78 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_400.values, alpha=0.6,
79 |             cmap=plt.cm.get_cmap('rainbow', 400))
80 | plt.title("K-means_cluster=400_t-SNE", fontdict=font)
81 | cbar = plt.colorbar()
82 | cbar.set_label(label='family id', fontdict=font)
83 | plt.clim(0, 400)
84 | 
85 | plt.subplot(3, 3, 9)
86 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_500.values, alpha=0.6,
87 |             cmap=plt.cm.get_cmap('rainbow', 500))
88 | plt.title("K-means_cluster=500_t-SNE", fontdict=font)
89 | cbar = plt.colorbar()
90 | cbar.set_label(label='family id', fontdict=font)
91 | plt.clim(0, 500)
92 | 
93 | plt.tight_layout()
94 | plt.savefig("K-means_and_DBSCAN_cluster_comparison.jpg")
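The grid above compares cluster counts visually on the t-SNE embedding; a complementary quantitative check is the silhouette score over candidate values of k. A minimal sketch, assuming the same merged SVD feature matrix used by for_cluster_kmeans.py (`sample_size` bounds the otherwise O(n²) silhouette computation on 60000 samples):

```python
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

merge_data = np.hstack([pd.read_pickle("api_name_svded_features.pkl"),
                        pd.read_pickle("exinfos_svded_features.pkl"),
                        pd.read_pickle("call_name_svded_features.pkl")])

for cluster in [50, 100, 200, 300, 400, 500]:
    y_pred = KMeans(n_clusters=cluster, random_state=0).fit_predict(merge_data)
    # Higher is better; scored on a subsample to keep the pairwise distances cheap
    score = silhouette_score(merge_data, y_pred, sample_size=5000, random_state=0)
    print("k={}: silhouette={:.4f}".format(cluster, score))
```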
pd.read_pickle("exinfos_svded_features.pkl") 10 | merge_data = np.hstack([api_name_svded_features, exinfos_svded_features, call_name_svded_features]) 11 | X_tsne = TSNE(n_components=2, random_state=33).fit_transform(merge_data) 12 | 13 | with open("api_name_exinfos_tsne_call_name_data.pkl", "wb") as fp: 14 | pickle.dump(X_tsne, fp) -------------------------------------------------------------------------------- /DataCon2019/loom_大数据安全分析比赛决赛.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/loom_大数据安全分析比赛决赛.pdf -------------------------------------------------------------------------------- /DataCon2019/useful/K-means_and_DBSCAN_cluster_comparison.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/K-means_and_DBSCAN_cluster_comparison.jpg -------------------------------------------------------------------------------- /DataCon2019/useful/K-means_and_DBSCAN_cluster_comparison.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/K-means_and_DBSCAN_cluster_comparison.pdf -------------------------------------------------------------------------------- /DataCon2019/useful/K-means_cluster_comparison.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/K-means_cluster_comparison.jpg -------------------------------------------------------------------------------- /DataCon2019/useful/api_name_barh.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/api_name_barh.pdf -------------------------------------------------------------------------------- /DataCon2019/useful/call_pid_barh.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/call_pid_barh.pdf -------------------------------------------------------------------------------- /DataCon2019/useful/draw_origin_data.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/draw_origin_data.jpg -------------------------------------------------------------------------------- /DataCon2019/useful/exinfos_barh.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/exinfos_barh.pdf -------------------------------------------------------------------------------- /DataCon2019/useful/rank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/rank.png -------------------------------------------------------------------------------- /DataCon2019/useful/ret_value_barh.pdf: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/ret_value_barh.pdf
--------------------------------------------------------------------------------
/DataCon2019/useful/table.md:
--------------------------------------------------------------------------------
 1 | |               Algorithm              |             Accuracy             |              Recall              |     $\boldsymbol{F1\ Score}$     |
 2 | | :----------------------------------: | :------------------------------: | :------------------------------: | :------------------------------: |
 3 | | $\boldsymbol{Random\ Forest}$        | $0.9800$                         | $0.9800$                         | $0.9800$                         |
 4 | | $\boldsymbol{XGBoost}$               | $0.9800$                         | $0.9700$                         | $0.9800$                         |
 5 | | $\boldsymbol{BPNN}$                  | $0.9635$                         | $0.9635$                         | $0.9635$                         |
 6 | | $\boldsymbol{Decision\ Tree}$        | $0.9500$                         | $0.9600$                         | $0.9500$                         |
 7 | | $\boldsymbol{Logistic\ Regression}$  | $0.9600$                         | $0.9700$                         | $0.9400$                         |
 8 | | $\boldsymbol{Naive\ Bayes^{[1]}}$    | $0.9400$                         | $0.9500$                         | $0.9400$                         |
 9 | | $\boldsymbol{Naive\ Bayes^{[2]}}$    | $0.9484$                         | $0.9484$                         | $0.9484$                         |
10 | | $\boldsymbol{GBDT}$                  | $0.9700$                         | $0.9500$                         | $0.9600$                         |
11 | | $\boldsymbol{Bagging}$               | $0.9700$                         | $0.9600$                         | $0.9700$                         |
12 | | $\boldsymbol{AdaBoost}$              | $0.9521$                         | $0.9521$                         | $0.9521$                         |
13 | | $\boldsymbol{SVM}$                   | $0.9700$                         | $0.9500$                         | $0.9600$                         |
14 | | $\boldsymbol{Ensemble\ model^{[1]}}$ | $\boldsymbol{\color{red}0.9839}$ | $\boldsymbol{\color{red}0.9839}$ | $\boldsymbol{\color{red}0.9839}$ |
15 | | $\boldsymbol{Ensemble\ model^{[2]}}$ | $\boldsymbol{\color{red}0.9967}$ | $\boldsymbol{\color{red}0.9967}$ | $\boldsymbol{\color{red}0.9967}$ |
16 | 
17 | 
18 | 
19 | 
--------------------------------------------------------------------------------
/DataCon2020/PPT/loom_2020DataCon大数据安全分析比赛分享.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/loom_2020DataCon大数据安全分析比赛分享.pptx
--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/2020rank.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/2020rank.png
--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/ROC_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/ROC_curve.png
--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/black.png
--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/black_white_pdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/black_white_pdf.png
--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/decode.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/decode.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/features_tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/features_tsne.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/result1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/result1.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/result2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/result2.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/tfidf.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/time.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/train_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/train_flow.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/vb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/vb.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/white.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/xgb1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/xgb1.png -------------------------------------------------------------------------------- /DataCon2020/PPT/picture/xgb2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/xgb2.png -------------------------------------------------------------------------------- 
/DataCon2020/PPT/picture/xgb3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/xgb3.png
--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/方差偏差均衡.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/方差偏差均衡.png
--------------------------------------------------------------------------------
/DataCon2020/codes/bagging.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.model_selection import StratifiedKFold
 3 | 
 4 | def bagging(model, x_train, y_train, x_test, n_splits):
 5 |     """
 6 |     :@param x_train: feature matrix.
 7 |     :type x_train: np.array(M X N) or list(M X N).
 8 |     :@param y_train: class label.
 9 |     :type y_train: np.array(M X 1).
10 |     :@param x_test: test set feature matrix.
11 |     :type x_test: np.array(M X N) or list(M X N).
12 |     :@param n_splits: K-fold parameter.
13 |     :type n_splits: int.
14 |     """
15 |     n_test = x_test.shape[0]
16 |     # split the training data with stratified K-fold
17 |     kf = StratifiedKFold(n_splits=n_splits, random_state=0)
18 |     oof_test = np.empty((n_test, ))
19 |     oof_test_skf = np.empty((n_splits, n_test))
20 | 
21 |     # train the i-th model on each fold's training split
22 |     for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
23 |         kf_x_train = x_train[train_index]
24 |         kf_y_train = y_train[train_index]
25 |         model.fit(kf_x_train, kf_y_train)
26 |         oof_test_skf[i, :] = model.predict(x_test)
27 |     # ensemble: average the test predictions of all the models
28 |     oof_test[:] = oof_test_skf.mean(axis=0)
29 |     return oof_test.reshape(-1, 1)
30 | 
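A hedged usage sketch for `bagging()` above: the toy arrays and the XGBClassifier choice are illustrative (any estimator with sklearn-style `fit`/`predict` works), and it assumes this file is importable as `bagging` under the older scikit-learn this repo targets:

```python
import numpy as np
from xgboost import XGBClassifier
from bagging import bagging

# Toy data standing in for the TF-IDF features used elsewhere in this repo
rng = np.random.RandomState(0)
x_train = rng.rand(200, 20)
y_train = rng.randint(0, 2, 200)
x_test = rng.rand(50, 20)

# Train one model per stratified training split and average the test predictions
pred = bagging(XGBClassifier(), x_train, y_train, x_test, n_splits=5)
print(pred.shape)  # (50, 1)
```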
--------------------------------------------------------------------------------
/DataCon2020/codes/get_id.py:
--------------------------------------------------------------------------------
import glob
import pandas as pd

# collect the sample file names of the test set and save them as test_id.csv
names = []
df = pd.DataFrame()
for path in glob.glob("/home/datacon/malware/YYY_step1/*"):
    names.append(path.split("/")[-1])

df["id"] = names
df.to_csv("/home/jovyan/media_directory/test_id.csv", index=False, header=None, encoding="utf-8")
--------------------------------------------------------------------------------
/DataCon2020/codes/get_raw_test_data.py:
--------------------------------------------------------------------------------
import re
import glob
import pandas as pd

def get_string(directory, file_name):
    list_ = []
    df = pd.DataFrame()
    for path in glob.glob(f"{directory}/*"):
        with open(path, "rb") as fp:
            string = fp.read().decode("utf-8", errors="ignore")
            # keep only ASCII letter runs of 5-19 characters as "words"
            raw_words = re.findall("[a-zA-Z]+", string)
            words_space = " ".join(w for w in raw_words if 4 < len(w) < 20)
            list_.append(words_space)
    df["words"] = list_
    df.to_csv(f"{file_name}.csv", index=False)
    print(len(list_))


get_string("/home/datacon/malware/YYY_step1", "/home/jovyan/media_directory/end_raw_test")
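
# A quick worked example of the filter above (hypothetical byte string):
# >>> raw = b"MZ\x90\x00KERNEL32.dll GetProcAddress ab".decode("utf-8", errors="ignore")
# >>> [w for w in re.findall("[a-zA-Z]+", raw) if 4 < len(w) < 20]
# ['KERNEL', 'GetProcAddress']
# i.e. only letter runs of 5-19 characters survive.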
"bagging_fraction":0.7, 64 | "bagging_freq":1, 65 | 'verbose': -1} 66 | 67 | params2 = {'boosting_type': 'gbdt', 68 | 'objective': 'binary', 69 | 'metric': 'binary_logloss', 70 | 'learning_rate': 0.001, 71 | 'num_leaves': 82, 72 | 'max_depth': 8, 73 | 'min_data_in_leaf': 64, 74 | 'min_child_weight': 1.435, 75 | 'bagging_fraction': 0.785, 76 | 'feature_fraction': 0.373, 77 | 'bagging_freq': 22, 78 | 'reg_lambda': 0.065, 79 | 'reg_alpha': 0.797, 80 | 'min_split_gain': 0.350, 81 | 'nthread': 8, 82 | 'seed': 42, 83 | 'scale_pos_weight':1.15, 84 | 'verbose': -1} 85 | 86 | def get_lgb_oof(params1, params2, x_train, y_train, x_test, n_splits): 87 | n_train, n_test = x_train.shape[0], x_test.shape[0] 88 | kf = StratifiedKFold(n_splits=n_splits) 89 | oof_train = np.empty((n_train, )) 90 | oof_test = np.empty((n_test, )) 91 | oof_test_skf = np.empty((n_splits, n_test)) 92 | for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)): 93 | kf_x_train = x_train[train_index] 94 | kf_y_train = y_train[train_index] 95 | kf_x_test = x_train[test_index] 96 | kf_y_test = y_train[test_index] 97 | train_matrix = lgb.Dataset(kf_x_train, label=kf_y_train) 98 | valid_matrix = lgb.Dataset(kf_x_test, label=kf_y_test) 99 | model1 = lgb.train(params1, 100 | train_set=train_matrix, 101 | num_boost_round=20000, 102 | valid_sets=valid_matrix, 103 | verbose_eval=-1, 104 | early_stopping_rounds=200) 105 | 106 | model2 = lgb.train(params2, 107 | train_set=train_matrix, 108 | num_boost_round=20000, 109 | valid_sets=valid_matrix, 110 | init_model=model1, 111 | verbose_eval=-1, 112 | early_stopping_rounds=200) 113 | oof_test_skf[i, :] = model2.predict(x_test) 114 | oof_test[:] = oof_test_skf.mean(axis=0) 115 | return oof_test.reshape(-1, 1) 116 | -------------------------------------------------------------------------------- /DataCon2020/codes/plot.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | 5 | __author__ = "yhangf" 6 | 7 | def plot_roc_curve(test_label, y_pred, *, model_name, save=True): 8 | """Calculate the AUC value of the model 9 | and drawing. 10 | :@param test_label: the actual label of the test set. 11 | :type test_label: the K dimension np.array. 12 | :@param y_pred: the predictive label of the model. 13 | :type y_pred: the K dimension np.array. 14 | :@param model_name: name of the model. 15 | :type model_name: str. 16 | :@param save: control the saving of images. 17 | :type save: bool. 
18 | """ 19 | 20 | font = {"color": "darkred", "size": 13, "family": "serif"} 21 | 22 | # calculate auc value 23 | fpr, tpr, _ = metrics.roc_curve(test_label, y_pred) 24 | auc = metrics.roc_auc_score(test_label, y_pred) 25 | 26 | # draw a roc curve 27 | with plt.style.context("bmh"): 28 | fig, ax = plt.subplots() 29 | ax.plot( 30 | fpr, 31 | tpr, 32 | label=f"{model_name} AUC = {auc:.5f}", 33 | color="steelblue", 34 | rasterized=True, 35 | linewidth=2, 36 | ) 37 | 38 | ax.set_xlim([0.0, 1.0]) 39 | ax.set_ylim([0.0, 1.05]) 40 | ax.set_xlabel("False Positive Rate", fontdict=font) 41 | ax.set_ylabel("True Positive Rate", fontdict=font) 42 | ax.set_title("ROC curve", fontdict=font) 43 | ax.legend(loc="lower right") 44 | ax.tick_params(axis="both") 45 | plt.tight_layout() 46 | if save: 47 | fig.savefig(f"{model_name}_auc_curve.pdf") 48 | 49 | 50 | def plot_multiple_roc_curve( 51 | test_label_array, 52 | y_pred_array, 53 | model_name_list, 54 | data_volume_list, 55 | *, 56 | col, 57 | width, 58 | height, 59 | save=True, 60 | ): 61 | """Calculate the AUC value of the multiple model 62 | and drawing. 63 | :@param test_label_array: the actual label array of the test set. 64 | :type test_label_array: the MxK dimension np.array or list. 65 | :@param y_pred_array: the predictive label array of the model. 66 | :type y_pred: the MxK dimension np.array or list. 67 | :@param model_name_list: name list of the multiple model. 68 | :type model_name: list[str]. 69 | :@param data_volume_list: the sample number of each training is listed. 70 | :type data_volume_list: list. 71 | :@param col: control the number of subgraphs. 72 | :type col: int. 73 | :@param width: the total width of the canvas. 74 | :type width: float. 75 | :@param height: the total height of the canvas. 76 | :type height: float. 77 | :@param save: control the saving of images. 78 | :type save: bool. 
79 | """ 80 | 81 | font = {"color": "#392f41", "size": 11, "family": "serif"} 82 | 83 | # calculate {tpr fpr auc} value and save as a list 84 | fpr_list, tpr_list, auc_list = [], [], [] 85 | for test_label, y_pred in zip(test_label_array, y_pred_array): 86 | fpr, tpr, _ = metrics.roc_curve(test_label, y_pred) 87 | auc = metrics.roc_auc_score(test_label, y_pred) 88 | fpr_list.append(fpr) 89 | tpr_list.append(tpr) 90 | auc_list.append(auc) 91 | # calculate the number of rows in a subgraph 92 | if len(auc_list) % col: 93 | row = len(auc_list) // col + 1 94 | else: 95 | row = len(auc_list) // col 96 | 97 | with plt.style.context("bmh"): 98 | fig, axs = plt.subplots(row, col, figsize=(width, height)) 99 | # while row or col is 1, add new dimension 100 | if row == 1 or col == 1: 101 | axs = axs[:, np.newaxis] 102 | axs = [i for ax in axs for i in ax] # modify the dimensions of axs 103 | for ax, fpr, tpr, model_name, auc, volume in zip( 104 | axs, fpr_list, tpr_list, model_name_list, auc_list, data_volume_list 105 | ): 106 | 107 | ax.plot( 108 | fpr, 109 | tpr, 110 | label=f"{model_name} AUC = {auc:.5f}", 111 | color="steelblue", 112 | rasterized=True, 113 | linewidth=2, 114 | ) 115 | 116 | ax.set_xlim([0.0, 1.0]) 117 | ax.set_ylim([0.0, 1.05]) 118 | ax.set_xlabel("False Positive Rate", fontdict=font) 119 | ax.set_ylabel("True Positive Rate", fontdict=font) 120 | ax.set_title(f"ROC curve (Data volume {volume})", fontdict=font) 121 | ax.legend(loc="lower right") 122 | ax.tick_params(axis="both") 123 | plt.tight_layout() 124 | 125 | if save: 126 | fig.savefig("multiple_auc_curve.pdf") 127 | 128 | def plot_train_test_data_pdf(train, 129 | test, 130 | rows, 131 | cols, 132 | *, 133 | width=16, 134 | height=8, 135 | save=False 136 | ): 137 | """Draw the distribution of corresponding features of training set 138 | and test set. 139 | :@param train: training set. 140 | :type train: pd.DataFrame. 141 | :@param test: testing set. 142 | :type test: pd.DataFrame. 143 | :@param rows: controls the number of subgraphs in the row direction. 144 | :type rows: int. 145 | :@param cols: controls the number of subgraphs in the col direction. 146 | :type cols: int. 147 | :@param width: the total width of the canvas. 148 | :type width: float. 149 | :@param height: the total height of the canvas. 150 | :type height: float. 151 | :@param save: control the saving of images. 152 | :type save: bool. 
153 | """ 154 | 155 | font = {"size": 10, 156 | "family" : "serif"} 157 | legend_font = {"family" : "serif", 158 | "size": 6} 159 | with plt.style.context("bmh"): 160 | plt.figure(figsize=(width, height), dpi=400) 161 | for i, col in enumerate(train.columns): 162 | ax = plt.subplot(rows, cols, i + 1) 163 | sns.kdeplot(train[col], n_levels=2, color="darkred", shade=True, ax=ax) 164 | sns.kdeplot(test[col], n_levels=2, color="steelblue", shade=True, ax=ax) 165 | ax.set_xlabel(col, fontdict=font) 166 | ax.set_ylabel("Density", fontdict=font) 167 | ax.legend(["train","test"], loc="best", prop=legend_font) 168 | plt.tight_layout() 169 | 170 | if save: 171 | plt.savefig("pdf_curve.pdf") 172 | -------------------------------------------------------------------------------- /DataCon2020/codes/t_sne.py: -------------------------------------------------------------------------------- 1 | from sklearn.manifold import TSNE 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_t_sne(train_tfidf_features): 5 | X_tsne = TSNE(n_components=2, perplexity=300, random_state=42).fit_transform(train_tfidf_features) 6 | font = {"size": 13, 7 | "family" : "serif"} 8 | with plt.style.context("bmh"): 9 | fig, ax = plt.subplots(figsize=(8, 6)) 10 | ax.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.6, 11 | cmap=plt.cm.get_cmap('rainbow', 2)) 12 | ax.set_title("Features Visualization", fontdict=font) 13 | ax.set_ylim([-80, 81]) 14 | ax.set_xlim([-82, 81]) 15 | -------------------------------------------------------------------------------- /DataCon2020/codes/test_train_model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from xgboost import XGBClassifier 4 | import joblib 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | 8 | 9 | def calc_score(y_true, y_pred, alpha=1.2): 10 | y_true = np.array(y_true) 11 | y_pred = np.array(y_pred) 12 | y_true_black_index = {i for i in range(len(y_true)) if y_true[i] == 1} 13 | y_pred_black_index = {i for i in range(len(y_pred)) if y_pred[i] == 1} 14 | y_true_white_index = {i for i in range(len(y_true)) if y_true[i] == 0} 15 | y_pred_white_index = {i for i in range(len(y_pred)) if y_pred[i] == 0} 16 | 17 | black_is_black = len(y_true_black_index & y_pred_black_index) 18 | black_is_white = len(y_true_black_index & y_pred_white_index) 19 | white_is_black = len(y_true_white_index & y_pred_black_index) 20 | white_is_white = len(y_true_white_index & y_pred_white_index) 21 | 22 | recall = black_is_black / (black_is_black + black_is_white) 23 | error_ratio = white_is_black / (white_is_black + white_is_white) 24 | score = recall - alpha * error_ratio 25 | return score 26 | 27 | train_data_ = pd.read_pickle("/home/jovyan/media_directory/train_tfidf_features") 28 | train_labels = pd.read_pickle("/home/jovyan/media_directory/train_labels") 29 | 30 | result = [] 31 | 32 | for i in range(20, 50): 33 | train_data, test_data, train_label, test_label = train_test_split(train_data_, 34 | train_labels, 35 | test_size=0.25, 36 | random_state=i) 37 | 38 | _ = [] 39 | model = XGBClassifier(max_depth=5, n_estimators=90) 40 | model.fit(train_data, train_label) 41 | y_pred = model.predict(test_data) 42 | score = calc_score(test_label, y_pred) 43 | _.append(score) 44 | 45 | model = XGBClassifier(max_depth=5, n_estimators=80) 46 | model.fit(train_data, train_label) 47 | y_pred = model.predict(test_data) 48 | 49 | score = calc_score(test_label, y_pred) 
--------------------------------------------------------------------------------
/DataCon2020/codes/test_train_model.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split


def calc_score(y_true, y_pred, alpha=1.2):
    # competition metric: recall on the black samples minus alpha times the
    # false-positive ratio on the white samples
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_true_black_index = {i for i in range(len(y_true)) if y_true[i] == 1}
    y_pred_black_index = {i for i in range(len(y_pred)) if y_pred[i] == 1}
    y_true_white_index = {i for i in range(len(y_true)) if y_true[i] == 0}
    y_pred_white_index = {i for i in range(len(y_pred)) if y_pred[i] == 0}

    black_is_black = len(y_true_black_index & y_pred_black_index)
    black_is_white = len(y_true_black_index & y_pred_white_index)
    white_is_black = len(y_true_white_index & y_pred_black_index)
    white_is_white = len(y_true_white_index & y_pred_white_index)

    recall = black_is_black / (black_is_black + black_is_white)
    error_ratio = white_is_black / (white_is_black + white_is_white)
    score = recall - alpha * error_ratio
    return score

train_data_ = pd.read_pickle("/home/jovyan/media_directory/train_tfidf_features")
train_labels = pd.read_pickle("/home/jovyan/media_directory/train_labels")

result = []

# evaluate a grid of n_estimators values over 30 random train/test splits
for i in range(20, 50):
    train_data, test_data, train_label, test_label = train_test_split(train_data_,
                                                                      train_labels,
                                                                      test_size=0.25,
                                                                      random_state=i)

    scores = []
    for n_estimators in range(90, 20, -10):
        model = XGBClassifier(max_depth=5, n_estimators=n_estimators)
        model.fit(train_data, train_label)
        y_pred = model.predict(test_data)
        scores.append(calc_score(test_label, y_pred))

    result.append(scores)

# mean score per n_estimators setting, averaged over all splits
print(np.vstack(result).mean(axis=0))
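
# Worked example of the metric above (hypothetical numbers): with 100 black and
# 100 white test samples, 90 black samples detected and 5 white samples flagged
# as black, score = 90/100 - 1.2 * 5/100 = 0.9 - 0.06 = 0.84.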
--------------------------------------------------------------------------------
/DataCon2020/codes/xgb_bagging.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Suitable when the training data is scarce and the predictions fluctuate
# noticeably: train on several random subsamples with different seeds and
# average the votes. train_tfidf_features, labels and test_tfidf_features are
# assumed to be already loaded (e.g. the pickles written by yield_features.py).
result = []
for i in np.random.randint(0xFFFFF, size=10):
    train_data, test_data, train_label, test_label = train_test_split(train_tfidf_features,
                                                                      labels,
                                                                      test_size=0.2,
                                                                      random_state=i)

    model = XGBClassifier(n_estimators=100)
    model.fit(train_data, train_label)
    y_pred = model.predict(test_tfidf_features)
    result.append(y_pred)
y_pred = np.array(result).mean(axis=0)
y_pred_end = [1 if i >= 0.5 else 0 for i in y_pred]
--------------------------------------------------------------------------------
/DataCon2020/codes/yield_end_result.py:
--------------------------------------------------------------------------------
import pandas as pd
import joblib

# load the fitted TF-IDF vectorizer and the trained classifier
with open("/home/jovyan/models/tfidf_model", "rb") as fp:
    vectorizer = joblib.load(fp)
with open("/home/jovyan/models/train_model", "rb") as fp:
    model = joblib.load(fp)

test_data_ = pd.read_csv("/home/jovyan/media_directory/end_raw_test.csv")
id_ = pd.read_csv("/home/jovyan/media_directory/test_id.csv", header=None)

test_tfidf_features = vectorizer.transform(test_data_.words.tolist())
y_pred = model.predict(test_tfidf_features)

result = pd.DataFrame()
result["id_"] = id_.values.flatten()
result["y_pred"] = y_pred

result.to_csv("/home/jovyan/malware_final.txt", index=False, header=None)
--------------------------------------------------------------------------------
/DataCon2020/codes/yield_features.py:
--------------------------------------------------------------------------------
import joblib
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

train_data_ = pd.read_csv("media_directory/raw_train_data.csv")

# fit the TF-IDF vectorizer on the raw word strings, keeping the 3000 strongest terms
vectorizer = TfidfVectorizer(min_df=3, max_df=0.9, max_features=3000)
train_tfidf_features = vectorizer.fit_transform(train_data_.words.tolist())

with open("/home/jovyan/models/tfidf_model", "wb") as fp:
    joblib.dump(vectorizer, fp)

with open("/home/jovyan/media_directory/train_tfidf_features", "wb") as fp:
    pickle.dump(train_tfidf_features, fp)

with open("/home/jovyan/media_directory/train_labels", "wb") as fp:
    pickle.dump(train_data_.labels, fp)
--------------------------------------------------------------------------------
/DataCon2020/codes/yield_train_model.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import XGBClassifier
import joblib


train_tfidf_features = pd.read_pickle("/home/jovyan/media_directory/train_tfidf_features")
labels = pd.read_pickle("/home/jovyan/media_directory/train_labels")

model = XGBClassifier(n_estimators=400, learning_rate=0.05)
model.fit(train_tfidf_features, labels)

with open("/home/jovyan/models/train_model", "wb") as fp:
    joblib.dump(model, fp)
--------------------------------------------------------------------------------
/DataCon2020/readme.md:
--------------------------------------------------------------------------------
Running setup_run.sh completes all of the environment configuration required before prediction.
--------------------------------------------------------------------------------
/DataCon2020/run.sh:
--------------------------------------------------------------------------------
#! /bin/bash

python3 /home/jovyan/codes/yield_end_result.py
--------------------------------------------------------------------------------
/DataCon2020/scripts/yield_raw_data.sh:
--------------------------------------------------------------------------------
#! /bin/bash

python3 /home/jovyan/codes/get_raw_test_data.py
python3 /home/jovyan/codes/get_id.py
--------------------------------------------------------------------------------
/DataCon2020/setup_run.sh:
--------------------------------------------------------------------------------
#! /bin/bash

# the PyPI package name for sklearn is scikit-learn
pip3 install scikit-learn
pip3 install xgboost
pip3 install pandas

source /home/jovyan/scripts/yield_raw_data.sh
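
# Overall run order (an illustrative sketch; the absolute paths above belong to
# the competition container and may differ elsewhere):
#   bash setup_run.sh   # install dependencies and generate the raw test data
#   bash run.sh         # write the final predictions to malware_final.txt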

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DataCon:beers:

```shell
                                      _ .-')
                                     ( '.( OO )_
 ,--.      .-'),-----.  .-'),-----. ,--.  ,--.)
 |  |.-') ( OO'  .-.  '( OO'  .-.  '|   `.'   |
 |  | OO )/   |  | |  |/   |  | |  ||         |
 |  |`-' |\_) |  |\|  |\_) |  |\|  ||  |'.'|  |
(|  '---.'  \ |  | |  |  \ |  | |  ||  |   |  |
 |      |   `'  '-'  '   `'  '-'  ' |  |   |  |
 `------'     `-----'      `-----'  `--'   `--'
```
> Champion solution :rose::rose: for track 2 (malicious code detection) of the [DataCon2019 Big Data Security Analysis Competition](https://www.butian.net/datacon), with a detailed write-up on [Zhihu](https://zhuanlan.zhihu.com/p/64252076), and third-place solution for track 5 (malicious code analysis) of the [DataCon2020 Big Data Security Analysis Competition](https://datacon.qianxin.com/#integral), with a detailed write-up on [Zhihu](https://zhuanlan.zhihu.com/p/185715807). Since the competitions were on tight schedules, the code is rather messy; we ask for the readers' understanding!

### DataCon2019 overall leaderboard ranking (partial)

![](https://github.com/yhangf/DataCon/blob/master/DataCon2019/useful/rank.png)

### Source code

#### stage1

- [[deep_learning_model.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/deep_learning_model.ipynb)]
- [[call_pid_tfidf_stacking.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/call_pid_tfidf_stacking.ipynb)]
- [[exinfos.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/exinfos.ipynb)]
- [[explore.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/explore.ipynb)]
- [[feature_engineering.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/feature_engineering.ipynb)]
- [[new_feature_engineering.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/new_feature_engineering.ipynb)]
- [[out_of_fold.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/out_of_fold.ipynb)]
- [[ret_value_stacking.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/ret_value_stacking.ipynb)]
- [[stacking.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/stacking.ipynb)]
- [[test.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/test.ipynb)]

#### stage2

- [[feature_engineering.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage2/feature_engineering.ipynb)]
- [[for_cluster_kmeans.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/for_cluster_kmeans.py)]
- [[get_call_name_tfidf_features.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/get_call_name_tfidf_features.py)]
- [[plot_comparison.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/plot_comparison.py)]
- [[yield_call_name_api_name_exinfos_tsne.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/yield_call_name_api_name_exinfos_tsne.py)]
- [[DBSCAN.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/DBSCAN.py)]

### DataCon2020 overall leaderboard ranking (partial)

![](https://github.com/yhangf/DataCon/blob/master/DataCon2020/PPT/picture/2020rank.png)

### Source code

- [[get_id.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/get_id.py)]: collect the file names of the test set
- [[get_raw_test_data.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/get_raw_test_data.py)]: extract the raw strings of the test set
- [[get_raw_train_data.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/get_raw_train_data.py)]: extract the raw strings of the training set
- [[test_train_model.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/test_train_model.py)]: evaluate the trained model
- [[yield_end_result.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/yield_end_result.py)]: generate the final submission result
- [[yield_features.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/yield_features.py)]: build the feature matrix from the raw strings
- [[yield_train_model.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/yield_train_model.py)]: train and save the model
- [[plot.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/plot.py)]: plotting module
- [[t_sne.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/t_sne.py)]: dimensionality-reduction visualization module
- [[lgb_cv.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/lgb_cv.py)]: LightGBM model + cross-validation
- [[xgb_bagging.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/xgb_bagging.py)]: XGBoost model + bagging
- [[bagging.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/bagging.py)]: classic bagging framework code
--------------------------------------------------------------------------------