├── DataCon2019
    ├── code
    │   ├── stage1
    │   │   ├── call_pid_tfidf_stacking.ipynb
    │   │   ├── deep_learning_model.ipynb
    │   │   ├── exinfos.ipynb
    │   │   ├── explore.ipynb
    │   │   ├── feature_engineering.ipynb
    │   │   ├── new_feature_engineering.ipynb
    │   │   ├── out_of_fold.ipynb
    │   │   ├── ret_value_stacking.ipynb
    │   │   ├── stacking.ipynb
    │   │   └── test.ipynb
    │   └── stage2
    │   │   ├── DBSCAN.py
    │   │   ├── feature_engineering.ipynb
    │   │   ├── for_cluster_kmeans.py
    │   │   ├── get_call_name_tfidf_features.py
    │   │   ├── plot_comparison.py
    │   │   └── yield_call_name_api_name_exinfos_tsne.py
    ├── loom_大数据安全分析比赛决赛.pdf
    └── useful
    │   ├── K-means_and_DBSCAN_cluster_comparison.jpg
    │   ├── K-means_and_DBSCAN_cluster_comparison.pdf
    │   ├── K-means_cluster_comparison.jpg
    │   ├── api_name_barh.pdf
    │   ├── call_pid_barh.pdf
    │   ├── draw_origin_data.jpg
    │   ├── exinfos_barh.pdf
    │   ├── rank.png
    │   ├── ret_value_barh.pdf
    │   └── table.md
├── DataCon2020
    ├── PPT
    │   ├── loom_2020DataCon大数据安全分析比赛分享.pptx
    │   └── picture
    │   │   ├── 2020rank.png
    │   │   ├── ROC_curve.png
    │   │   ├── black.png
    │   │   ├── black_white_pdf.png
    │   │   ├── decode.png
    │   │   ├── features_tsne.png
    │   │   ├── result1.png
    │   │   ├── result2.png
    │   │   ├── tfidf.png
    │   │   ├── time.png
    │   │   ├── train_flow.png
    │   │   ├── vb.png
    │   │   ├── white.png
    │   │   ├── xgb1.png
    │   │   ├── xgb2.png
    │   │   ├── xgb3.png
    │   │   └── 方差偏差均衡.png
    ├── codes
    │   ├── bagging.py
    │   ├── get_id.py
    │   ├── get_raw_test_data.py
    │   ├── get_raw_train_data.py
    │   ├── lgb_cv.py
    │   ├── plot.py
    │   ├── t_sne.py
    │   ├── test_train_model.py
    │   ├── xgb_bagging.py
    │   ├── yield_end_result.py
    │   ├── yield_features.py
    │   └── yield_train_model.py
    ├── readme.md
    ├── run.sh
    ├── scripts
    │   └── yield_raw_data.sh
    └── setup_run.sh
└── README.md


/DataCon2019/code/stage1/call_pid_tfidf_stacking.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import time\n",
 10 |     "import numpy as np\n",
 11 |     "import pandas as pd\n",
 12 |     "import pickle\n",
 13 |     "import dask.array as da\n",
 14 |     "import matplotlib.pyplot as plt\n",
 15 |     "\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from sklearn.model_selection import StratifiedKFold\n",
 18 |     "from sklearn.model_selection import cross_validate\n",
 19 |     "from sklearn.model_selection import GridSearchCV\n",
 20 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 21 |     "\n",
 22 |     "from sklearn import svm\n",
 23 |     "from sklearn import neighbors\n",
 24 |     "from sklearn import naive_bayes\n",
 25 |     "from sklearn.svm import LinearSVC\n",
 26 |     "from xgboost import XGBClassifier\n",
 27 |     "from sklearn.tree import DecisionTreeClassifier\n",
 28 |     "from sklearn.linear_model import LogisticRegression\n",
 29 |     "from sklearn.linear_model import LogisticRegressionCV\n",
 30 |     "from sklearn.tree import DecisionTreeClassifier\n",
 31 |     "from sklearn.gaussian_process import GaussianProcessClassifier\n",
 32 |     "\n",
 33 |     "from sklearn.ensemble import RandomForestClassifier\n",
 34 |     "from sklearn.ensemble import AdaBoostClassifier\n",
 35 |     "from sklearn.ensemble import BaggingClassifier\n",
 36 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
 37 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
 38 |     "from sklearn.ensemble import VotingClassifier\n",
 39 |     "\n",
 40 |     "from sklearn import metrics\n",
 41 |     "from sklearn.metrics import accuracy_score\n",
 42 |     "from sklearn.metrics import classification_report\n",
 43 |     "\n",
 44 |     "from sklearn.externals import joblib\n",
 45 |     "\n",
 46 |     "%config InlineBackend.figure_format = 'svg'\n",
 47 |     "%matplotlib inline\n",
 48 |     "\n",
 49 |     "import warnings\n",
 50 |     "warnings.filterwarnings(\"ignore\")"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 2,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "safe_type = pd.read_csv(\"origin_data.csv\")[\"safe_type\"]\n",
 60 |     "train_call_pid = pd.read_csv(\"origin_data.csv\")[\"call_pid\"]\n",
 61 |     "test_call_pid = pd.read_csv(\"origin_test.csv\")[\"call_pid\"]"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 8,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "vectorizes = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)\n",
 71 |     "train_call_pid_tfidf = vectorizes.fit_transform(train_call_pid.tolist())\n",
 72 |     "test_call_pid_tfidf = vectorizes.transform(test_call_pid.tolist())"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 12,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "with open(\"train_call_pid_tfidf.pkl\", \"wb\") as fp:\n",
 82 |     "    pickle.dump(train_call_pid_tfidf, fp)\n",
 83 |     "with open(\"test_call_pid_tfidf.pkl\", \"wb\") as fp:\n",
 84 |     "    pickle.dump(test_call_pid_tfidf, fp)"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 3,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "train_call_pid_tfidf = pd.read_pickle(\"train_call_pid_tfidf.pkl\")\n",
 94 |     "test_call_pid_tfidf = pd.read_pickle(\"test_call_pid_tfidf.pkl\")"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 4,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "bc_model = BaggingClassifier()\n",
104 |     "gbc_model = GradientBoostingClassifier()\n",
105 |     "lr_model = LogisticRegression()\n",
106 |     "svm_model = svm.LinearSVC()\n",
107 |     "dt_model = DecisionTreeClassifier()\n",
108 |     "xgb_model = XGBClassifier(max_depth=7,\n",
109 |     "                          learning_rate=0.05,\n",
110 |     "                          n_estimators=1000)\n",
111 |     "\n",
112 |     "rfc_model = RandomForestClassifier(200)\n",
113 |     "etc_model = ExtraTreesClassifier()\n",
114 |     "mnb_model = naive_bayes.MultinomialNB(alpha=0.01)\n",
115 |     "ada_model = AdaBoostClassifier()"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 5,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "def get_oof(model, x_train, y_train, x_test, n_splits):\n",
125 |     "    \"\"\"\n",
126 |     "    :@param x_train: feature matrix.\n",
127 |     "    :type x: np.array(M X N) or list(M X N).\n",
128 |     "    :@param y_train: class label.\n",
129 |     "    :type y: int.\n",
130 |     "    :@param x_test: test set feature matrix.\n",
131 |     "    :type x_test: np.array(M X N) or list(M X N).\n",
132 |     "    :@param n_splits: K-fold parameter.\n",
133 |     "    :type n_splits: int.\n",
134 |     "    \"\"\"\n",
135 |     "    n_train, n_test = x_train.shape[0], x_test.shape[0]\n",
136 |     "    kf = StratifiedKFold(n_splits=n_splits, random_state=0)\n",
137 |     "    oof_train = np.empty((n_train, ))\n",
138 |     "    oof_test = np.empty((n_test, ))\n",
139 |     "    oof_test_skf = np.empty((n_splits, n_test))\n",
140 |     "    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):\n",
141 |     "        kf_x_train = x_train[train_index]\n",
142 |     "        kf_y_train = y_train[train_index]\n",
143 |     "        kf_x_test = x_train[test_index]\n",
144 |     "        model.fit(kf_x_train, kf_y_train)\n",
145 |     "        oof_train[test_index] = model.predict(kf_x_test)\n",
146 |     "        oof_test_skf[i, :] = model.predict(x_test)\n",
147 |     "    oof_test[:] = oof_test_skf.mean(axis=0)\n",
148 |     "    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "train_tfidf_features = train_call_pid_tfidf\n",
158 |     "test_tfidf_features = test_call_pid_tfidf\n",
159 |     "try:\n",
160 |     "    lr_model_oof_train, lr_model_oof_test = get_oof(lr_model, \n",
161 |     "                                                    train_tfidf_features.tolil(), \n",
162 |     "                                                    safe_type.values,\n",
163 |     "                                                    test_tfidf_features.tolil(),\n",
164 |     "                                                    10)\n",
165 |     "    with open(\"call_pid_lr_model_oof_train.csv\", \"wb\") as fp:\n",
166 |     "        pickle.dump(lr_model_oof_train, fp)\n",
167 |     "    with open(\"call_pid_lr_model_oof_test.csv\", \"wb\") as fp:\n",
168 |     "        pickle.dump(lr_model_oof_test, fp)\n",
169 |     "    print(\"lr success!\")\n",
170 |     "except:\n",
171 |     "    print(\"lr error!\")\n",
172 |     "try:\n",
173 |     "    gbc_model_oof_train, gbc_model_oof_test = get_oof(gbc_model, \n",
174 |     "                                                      train_tfidf_features.tolil(), \n",
175 |     "                                                      safe_type.values,\n",
176 |     "                                                      test_tfidf_features.tolil(),\n",
177 |     "                                                      10)\n",
178 |     "    with open(\"call_pid_gbc_model_oof_train.csv\", \"wb\") as fp:\n",
179 |     "        pickle.dump(gbc_model_oof_train, fp)\n",
180 |     "    with open(\"call_pid_gbc_model_oof_test.csv\", \"wb\") as fp:\n",
181 |     "        pickle.dump(gbc_model_oof_test, fp)\n",
182 |     "    print(\"gbc success!\")\n",
183 |     "except:\n",
184 |     "    print(\"gbc error!\")\n",
185 |     "try:\n",
186 |     "    bc_model_oof_train, bc_model_oof_test = get_oof(bc_model, \n",
187 |     "                                                    train_tfidf_features.tolil(), \n",
188 |     "                                                    safe_type.values,\n",
189 |     "                                                    test_tfidf_features.tolil(),\n",
190 |     "                                                    10)\n",
191 |     "    with open(\"call_pid_bc_model_oof_train.csv\", \"wb\") as fp:\n",
192 |     "        pickle.dump(bc_model_oof_train, fp)\n",
193 |     "    with open(\"call_pid_bc_model_oof_test.csv\", \"wb\") as fp:\n",
194 |     "        pickle.dump(bc_model_oof_test, fp)\n",
195 |     "    print(\"bc success!\")\n",
196 |     "except:\n",
197 |     "    print(\"bc error!\")\n",
198 |     "try:\n",
199 |     "    svm_model_oof_train, svm_model_oof_test = get_oof(svm_model, \n",
200 |     "                                                      train_tfidf_features.tolil(), \n",
201 |     "                                                      safe_type.values,\n",
202 |     "                                                      test_tfidf_features.tolil(),\n",
203 |     "                                                      10)\n",
204 |     "    with open(\"call_pid_svm_model_oof_train.csv\", \"wb\") as fp:\n",
205 |     "        pickle.dump(svm_model_oof_train, fp)\n",
206 |     "    with open(\"call_pid_svm_model_oof_test.csv\", \"wb\") as fp:\n",
207 |     "        pickle.dump(svm_model_oof_test, fp)\n",
208 |     "    print(\"svm success!\")\n",
209 |     "except:\n",
210 |     "    print(\"svm error!\")\n",
211 |     "try:\n",
212 |     "    dt_model_oof_train, dt_model_oof_test = get_oof(dt_model, \n",
213 |     "                                                      train_tfidf_features.tolil(), \n",
214 |     "                                                      safe_type.values,\n",
215 |     "                                                      test_tfidf_features.tolil(),\n",
216 |     "                                                      10)\n",
217 |     "    with open(\"call_pid_dt_model_oof_train.csv\", \"wb\") as fp:\n",
218 |     "        pickle.dump(dt_model_oof_train, fp)\n",
219 |     "    with open(\"call_pid_dt_model_oof_test.csv\", \"wb\") as fp:\n",
220 |     "        pickle.dump(dt_model_oof_test, fp)\n",
221 |     "    print(\"dt success!\")\n",
222 |     "except:\n",
223 |     "    print(\"dt error!\")\n",
224 |     "\n",
225 |     "    \n",
226 |     "try:\n",
227 |     "    rfc_model_oof_train, rfc_model_oof_test = get_oof(rfc_model, \n",
228 |     "                                                      train_tfidf_features.tolil(), \n",
229 |     "                                                      safe_type.values,\n",
230 |     "                                                      test_tfidf_features.tolil(),\n",
231 |     "                                                      10)\n",
232 |     "    with open(\"call_pid_rfc_model_oof_train.csv\", \"wb\") as fp:\n",
233 |     "        pickle.dump(rfc_model_oof_train, fp)\n",
234 |     "    with open(\"call_pid_rfc_model_oof_test.csv\", \"wb\") as fp:\n",
235 |     "        pickle.dump(rfc_model_oof_test, fp)\n",
236 |     "    print(\"rfc success!\")\n",
237 |     "except:\n",
238 |     "    print(\"rfc error!\")\n",
239 |     "    \n",
240 |     "try:\n",
241 |     "    etc_model_oof_train, etc_model_oof_test = get_oof(etc_model, \n",
242 |     "                                                      train_tfidf_features.tolil(), \n",
243 |     "                                                      safe_type.values,\n",
244 |     "                                                      test_tfidf_features.tolil(),\n",
245 |     "                                                      10)\n",
246 |     "    with open(\"call_pid_etc_model_oof_train.csv\", \"wb\") as fp:\n",
247 |     "        pickle.dump(etc_model_oof_train, fp)\n",
248 |     "    with open(\"call_pid_etc_model_oof_test.csv\", \"wb\") as fp:\n",
249 |     "        pickle.dump(etc_model_oof_test, fp)\n",
250 |     "    print(\"etc success!\")\n",
251 |     "except:\n",
252 |     "    print(\"etc error!\")\n",
253 |     "try:\n",
254 |     "    mnb_model_oof_train, mnb_model_oof_test = get_oof(mnb_model, \n",
255 |     "                                                      train_tfidf_features.tolil(), \n",
256 |     "                                                      safe_type.values,\n",
257 |     "                                                      test_tfidf_features.tolil(),\n",
258 |     "                                                      10)\n",
259 |     "    with open(\"call_pid_mnb_model_oof_train.csv\", \"wb\") as fp:\n",
260 |     "        pickle.dump(mnb_model_oof_train, fp)\n",
261 |     "    with open(\"call_pid_mnb_model_oof_test.csv\", \"wb\") as fp:\n",
262 |     "        pickle.dump(mnb_model_oof_test, fp)\n",
263 |     "    print(\"mnb success!\")\n",
264 |     "except:\n",
265 |     "    print(\"mnb error!\")\n",
266 |     "    \n",
267 |     "try:\n",
268 |     "    ada_model_oof_train, ada_model_oof_test = get_oof(ada_model, \n",
269 |     "                                                      train_tfidf_features.tolil(), \n",
270 |     "                                                      safe_type.values,\n",
271 |     "                                                      test_tfidf_features.tolil(),\n",
272 |     "                                                      10)\n",
273 |     "    with open(\"call_pid_ada_model_oof_train.csv\", \"wb\") as fp:\n",
274 |     "        pickle.dump(ada_model_oof_train, fp)\n",
275 |     "    with open(\"call_pid_ada_model_oof_test.csv\", \"wb\") as fp:\n",
276 |     "        pickle.dump(ada_model_oof_test, fp)\n",
277 |     "    print(\"ada success!\")\n",
278 |     "except:\n",
279 |     "    print(\"ada error!\")\n",
280 |     "\n",
281 |     "try:\n",
282 |     "    xgb_model_oof_train, xgb_model_oof_test = get_oof(xgb_model, \n",
283 |     "                                                      train_tfidf_features.tolil(), \n",
284 |     "                                                      safe_type.values,\n",
285 |     "                                                      test_tfidf_features.tolil(),\n",
286 |     "                                                      10)\n",
287 |     "    with open(\"call_pid_xgb_model_oof_train.csv\", \"wb\") as fp:\n",
288 |     "        pickle.dump(xgb_model_oof_train, fp)\n",
289 |     "    with open(\"call_pid_xgb_model_oof_test.csv\", \"wb\") as fp:\n",
290 |     "        pickle.dump(xgb_model_oof_test, fp)\n",
291 |     "    print(\"xgb success!\")\n",
292 |     "except:\n",
293 |     "    print(\"xgb error!\")\n",
294 |     "\n",
295 |     "\n",
296 |     "call_pid_stacking_train_10 = np.hstack([lr_model_oof_train, gbc_model_oof_train, bc_model_oof_train,\n",
297 |     "                            svm_model_oof_train, xgb_model_oof_train, dt_model_oof_train,\n",
298 |     "                            rfc_model_oof_train, etc_model_oof_train, mnb_model_oof_train,\n",
299 |     "                            ada_model_oof_train])\n",
300 |     "call_pid_stacking_test_10 = np.hstack([lr_model_oof_test, gbc_model_oof_test, bc_model_oof_test,\n",
301 |     "                           svm_model_oof_test, xgb_model_oof_test, dt_model_oof_test,\n",
302 |     "                           rfc_model_oof_test, etc_model_oof_test, mnb_model_oof_test,\n",
303 |     "                           ada_model_oof_test])\n",
304 |     "with open(\"call_pid_stacking_train_10.pkl\", \"wb\") as fp:\n",
305 |     "    pickle.dump(call_pid_stacking_train_10, fp)\n",
306 |     "    \n",
307 |     "with open(\"call_pid_stacking_test_10.pkl\", \"wb\") as fp:\n",
308 |     "    pickle.dump(call_pid_stacking_test_10, fp)"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": null,
314 |    "metadata": {},
315 |    "outputs": [],
316 |    "source": []
317 |   }
318 |  ],
319 |  "metadata": {
320 |   "kernelspec": {
321 |    "display_name": "Python 3",
322 |    "language": "python",
323 |    "name": "python3"
324 |   },
325 |   "language_info": {
326 |    "codemirror_mode": {
327 |     "name": "ipython",
328 |     "version": 3
329 |    },
330 |    "file_extension": ".py",
331 |    "mimetype": "text/x-python",
332 |    "name": "python",
333 |    "nbconvert_exporter": "python",
334 |    "pygments_lexer": "ipython3",
335 |    "version": "3.6.7"
336 |   }
337 |  },
338 |  "nbformat": 4,
339 |  "nbformat_minor": 2
340 | }
341 | 


--------------------------------------------------------------------------------
/DataCon2019/code/stage1/deep_learning_model.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 27,
   6 |    "metadata": {},
   7 |    "outputs": [],
   8 |    "source": [
   9 |     "from sklearn.preprocessing import MinMaxScaler\n",
  10 |     "from keras import models \n",
  11 |     "from keras import layers \n",
  12 |     "from keras.layers import Dropout\n",
  13 |     "import pandas as pd\n",
  14 |     "import numpy as np\n",
  15 |     "import matplotlib.pyplot as plt\n",
  16 |     "\n",
  17 |     "from sklearn import metrics\n",
  18 |     "from sklearn.metrics import accuracy_score\n",
  19 |     "from sklearn.metrics import classification_report\n",
  20 |     "from sklearn.model_selection import train_test_split\n",
  21 |     "\n",
  22 |     "%config InlineBackend.figure_format = 'svg'\n",
  23 |     "%matplotlib inline\n",
  24 |     "\n",
  25 |     "import warnings\n",
  26 |     "warnings.filterwarnings(\"ignore\")"
  27 |    ]
  28 |   },
  29 |   {
  30 |    "cell_type": "code",
  31 |    "execution_count": 11,
  32 |    "metadata": {},
  33 |    "outputs": [],
  34 |    "source": [
  35 |     "data = pd.read_csv(\"fliter_train_data_2.csv\")\n",
  36 |     "safe_type = data[\"safe_type\"]\n",
  37 |     "features = data.iloc[:, 2:]\n",
  38 |     "\n",
  39 |     "test = pd.read_csv(\"fliter_test_data_2.csv\")\n",
  40 |     "id_ = test[\"id\"]\n",
  41 |     "test_features = test.iloc[:, 1:]"
  42 |    ]
  43 |   },
  44 |   {
  45 |    "cell_type": "code",
  46 |    "execution_count": 32,
  47 |    "metadata": {},
  48 |    "outputs": [],
  49 |    "source": [
  50 |     "min_max_scaler = MinMaxScaler()\n",
  51 |     "train_data = min_max_scaler.fit_transform(features)\n",
  52 |     "test_data = min_max_scaler.fit_transform(test_features)"
  53 |    ]
  54 |   },
  55 |   {
  56 |    "cell_type": "code",
  57 |    "execution_count": 40,
  58 |    "metadata": {},
  59 |    "outputs": [],
  60 |    "source": [
  61 |     "train_data, test_data, train_label, test_label = train_test_split(train_data, \n",
  62 |     "                                                                  safe_type, \n",
  63 |     "                                                                  test_size=0.2, \n",
  64 |     "                                                                  random_state=0)"
  65 |    ]
  66 |   },
  67 |   {
  68 |    "cell_type": "code",
  69 |    "execution_count": 30,
  70 |    "metadata": {},
  71 |    "outputs": [],
  72 |    "source": [
  73 |     "def build_model(dimension): \n",
  74 |     "    model = models.Sequential() \n",
  75 |     "    model.add(layers.Dense(64, activation='relu', input_shape=(dimension,))) \n",
  76 |     "    model.add(Dropout(0.2))\n",
  77 |     "    model.add(layers.Dense(128, activation='relu')) \n",
  78 |     "    model.add(Dropout(0.2))\n",
  79 |     "    model.add(layers.Dense(32, activation='relu'))\n",
  80 |     "    model.add(Dropout(0.2))\n",
  81 |     "    model.add(layers.Dense(16, activation='relu'))\n",
  82 |     "    model.add(Dropout(0.2))\n",
  83 |     "    model.add(layers.Dense(8, activation='relu'))\n",
  84 |     "    model.add(Dropout(0.2))\n",
  85 |     "    model.add(layers.Dense(1, activation='sigmoid')) \n",
  86 |     "    model.compile(optimizer='rmsprop',\n",
  87 |     "                  loss='binary_crossentropy',\n",
  88 |     "                  metrics=['accuracy']) \n",
  89 |     "    return model"
  90 |    ]
  91 |   },
  92 |   {
  93 |    "cell_type": "code",
  94 |    "execution_count": 41,
  95 |    "metadata": {},
  96 |    "outputs": [
  97 |     {
  98 |      "name": "stdout",
  99 |      "output_type": "stream",
 100 |      "text": [
 101 |       "Epoch 1/100\n",
 102 |       "23983/23983 [==============================] - 1s 35us/step - loss: 0.4526 - acc: 0.8262\n",
 103 |       "Epoch 2/100\n",
 104 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.3507 - acc: 0.8834\n",
 105 |       "Epoch 3/100\n",
 106 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.3146 - acc: 0.8981\n",
 107 |       "Epoch 4/100\n",
 108 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.2854 - acc: 0.9068\n",
 109 |       "Epoch 5/100\n",
 110 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.2603 - acc: 0.9161\n",
 111 |       "Epoch 6/100\n",
 112 |       "23983/23983 [==============================] - 0s 17us/step - loss: 0.2448 - acc: 0.9219\n",
 113 |       "Epoch 7/100\n",
 114 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.2351 - acc: 0.9266\n",
 115 |       "Epoch 8/100\n",
 116 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.2227 - acc: 0.9324\n",
 117 |       "Epoch 9/100\n",
 118 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.2119 - acc: 0.9345\n",
 119 |       "Epoch 10/100\n",
 120 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.2051 - acc: 0.9364\n",
 121 |       "Epoch 11/100\n",
 122 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.2036 - acc: 0.9383\n",
 123 |       "Epoch 12/100\n",
 124 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1993 - acc: 0.9409\n",
 125 |       "Epoch 13/100\n",
 126 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1933 - acc: 0.9416\n",
 127 |       "Epoch 14/100\n",
 128 |       "23983/23983 [==============================] - 0s 16us/step - loss: 0.1849 - acc: 0.9438\n",
 129 |       "Epoch 15/100\n",
 130 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.1840 - acc: 0.9458\n",
 131 |       "Epoch 16/100\n",
 132 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.1816 - acc: 0.9455\n",
 133 |       "Epoch 17/100\n",
 134 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1812 - acc: 0.9457\n",
 135 |       "Epoch 18/100\n",
 136 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1746 - acc: 0.9489\n",
 137 |       "Epoch 19/100\n",
 138 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1757 - acc: 0.9477\n",
 139 |       "Epoch 20/100\n",
 140 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1749 - acc: 0.9481\n",
 141 |       "Epoch 21/100\n",
 142 |       "23983/23983 [==============================] - 0s 15us/step - loss: 0.1709 - acc: 0.9487\n",
 143 |       "Epoch 22/100\n",
 144 |       "23983/23983 [==============================] - 0s 16us/step - loss: 0.1680 - acc: 0.9513\n",
 145 |       "Epoch 23/100\n",
 146 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1630 - acc: 0.9506\n",
 147 |       "Epoch 24/100\n",
 148 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1665 - acc: 0.9511\n",
 149 |       "Epoch 25/100\n",
 150 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1627 - acc: 0.9523\n",
 151 |       "Epoch 26/100\n",
 152 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1633 - acc: 0.9508\n",
 153 |       "Epoch 27/100\n",
 154 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1581 - acc: 0.9535\n",
 155 |       "Epoch 28/100\n",
 156 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1586 - acc: 0.9513\n",
 157 |       "Epoch 29/100\n",
 158 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1553 - acc: 0.9529\n",
 159 |       "Epoch 30/100\n",
 160 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1602 - acc: 0.9519\n",
 161 |       "Epoch 31/100\n",
 162 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1549 - acc: 0.9533\n",
 163 |       "Epoch 32/100\n",
 164 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1552 - acc: 0.9517\n",
 165 |       "Epoch 33/100\n",
 166 |       "23983/23983 [==============================] - 0s 16us/step - loss: 0.1520 - acc: 0.9531\n",
 167 |       "Epoch 34/100\n",
 168 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1494 - acc: 0.9551\n",
 169 |       "Epoch 35/100\n",
 170 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1523 - acc: 0.9541\n",
 171 |       "Epoch 36/100\n",
 172 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1495 - acc: 0.9556\n",
 173 |       "Epoch 37/100\n",
 174 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1477 - acc: 0.9548\n",
 175 |       "Epoch 38/100\n",
 176 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1454 - acc: 0.9563\n",
 177 |       "Epoch 39/100\n",
 178 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1481 - acc: 0.9557\n",
 179 |       "Epoch 40/100\n",
 180 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1437 - acc: 0.9563\n",
 181 |       "Epoch 41/100\n",
 182 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1451 - acc: 0.9553\n",
 183 |       "Epoch 42/100\n",
 184 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1430 - acc: 0.9556\n",
 185 |       "Epoch 43/100\n",
 186 |       "23983/23983 [==============================] - 0s 15us/step - loss: 0.1384 - acc: 0.9569\n",
 187 |       "Epoch 44/100\n",
 188 |       "23983/23983 [==============================] - 0s 16us/step - loss: 0.1419 - acc: 0.9560\n",
 189 |       "Epoch 45/100\n",
 190 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1380 - acc: 0.9577\n",
 191 |       "Epoch 46/100\n",
 192 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1371 - acc: 0.9568\n",
 193 |       "Epoch 47/100\n",
 194 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1370 - acc: 0.9575\n",
 195 |       "Epoch 48/100\n",
 196 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.1349 - acc: 0.9580\n",
 197 |       "Epoch 49/100\n",
 198 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1379 - acc: 0.9572\n",
 199 |       "Epoch 50/100\n",
 200 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1343 - acc: 0.9579\n",
 201 |       "Epoch 51/100\n",
 202 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1329 - acc: 0.9581\n",
 203 |       "Epoch 52/100\n",
 204 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1361 - acc: 0.9584\n",
 205 |       "Epoch 53/100\n",
 206 |       "23983/23983 [==============================] - 0s 15us/step - loss: 0.1342 - acc: 0.9573\n",
 207 |       "Epoch 54/100\n",
 208 |       "23983/23983 [==============================] - 0s 15us/step - loss: 0.1298 - acc: 0.9597\n",
 209 |       "Epoch 55/100\n",
 210 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1330 - acc: 0.9589\n",
 211 |       "Epoch 56/100\n",
 212 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1281 - acc: 0.9593\n",
 213 |       "Epoch 57/100\n",
 214 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1311 - acc: 0.9575\n",
 215 |       "Epoch 58/100\n",
 216 |       "23983/23983 [==============================] - 0s 12us/step - loss: 0.1286 - acc: 0.9605\n",
 217 |       "Epoch 59/100\n",
 218 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1305 - acc: 0.9581\n",
 219 |       "Epoch 60/100\n",
 220 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1300 - acc: 0.9589\n",
 221 |       "Epoch 61/100\n",
 222 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1272 - acc: 0.9579\n",
 223 |       "Epoch 62/100\n",
 224 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1252 - acc: 0.9591\n",
 225 |       "Epoch 63/100\n",
 226 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1258 - acc: 0.9605\n",
 227 |       "Epoch 64/100\n",
 228 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1293 - acc: 0.9589\n",
 229 |       "Epoch 65/100\n",
 230 |       "23983/23983 [==============================] - 0s 17us/step - loss: 0.1209 - acc: 0.9611\n",
 231 |       "Epoch 66/100\n",
 232 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1239 - acc: 0.9606\n",
 233 |       "Epoch 67/100\n",
 234 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1256 - acc: 0.9605\n",
 235 |       "Epoch 68/100\n",
 236 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1236 - acc: 0.9613\n",
 237 |       "Epoch 69/100\n",
 238 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1216 - acc: 0.9613\n",
 239 |       "Epoch 70/100\n",
 240 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1224 - acc: 0.9600\n",
 241 |       "Epoch 71/100\n",
 242 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1239 - acc: 0.9607\n",
 243 |       "Epoch 72/100\n",
 244 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1183 - acc: 0.9626\n",
 245 |       "Epoch 73/100\n",
 246 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1230 - acc: 0.9617\n",
 247 |       "Epoch 74/100\n",
 248 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1211 - acc: 0.9606\n",
 249 |       "Epoch 75/100\n",
 250 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1213 - acc: 0.9621\n",
 251 |       "Epoch 76/100\n",
 252 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1190 - acc: 0.9613\n",
 253 |       "Epoch 77/100\n",
 254 |       "23983/23983 [==============================] - 0s 16us/step - loss: 0.1160 - acc: 0.9635\n",
 255 |       "Epoch 78/100\n",
 256 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1168 - acc: 0.9631\n",
 257 |       "Epoch 79/100\n",
 258 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1169 - acc: 0.9621\n",
 259 |       "Epoch 80/100\n",
 260 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1180 - acc: 0.9626\n",
 261 |       "Epoch 81/100\n",
 262 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1174 - acc: 0.9631\n",
 263 |       "Epoch 82/100\n",
 264 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1157 - acc: 0.9636\n",
 265 |       "Epoch 83/100\n",
 266 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1170 - acc: 0.9631\n",
 267 |       "Epoch 84/100\n",
 268 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1151 - acc: 0.9657\n",
 269 |       "Epoch 85/100\n",
 270 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1143 - acc: 0.9632\n",
 271 |       "Epoch 86/100\n",
 272 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1124 - acc: 0.9647\n",
 273 |       "Epoch 87/100\n",
 274 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1115 - acc: 0.9649\n",
 275 |       "Epoch 88/100\n",
 276 |       "23983/23983 [==============================] - 0s 17us/step - loss: 0.1179 - acc: 0.9628\n",
 277 |       "Epoch 89/100\n",
 278 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1121 - acc: 0.9641\n",
 279 |       "Epoch 90/100\n",
 280 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1119 - acc: 0.9650\n",
 281 |       "Epoch 91/100\n",
 282 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1131 - acc: 0.9633\n",
 283 |       "Epoch 92/100\n",
 284 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1113 - acc: 0.9647\n",
 285 |       "Epoch 93/100\n",
 286 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1119 - acc: 0.9649\n",
 287 |       "Epoch 94/100\n",
 288 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1150 - acc: 0.9630\n",
 289 |       "Epoch 95/100\n",
 290 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1098 - acc: 0.9662\n",
 291 |       "Epoch 96/100\n",
 292 |       "23983/23983 [==============================] - 0s 13us/step - loss: 0.1105 - acc: 0.9642\n",
 293 |       "Epoch 97/100\n",
 294 |       "23983/23983 [==============================] - 0s 15us/step - loss: 0.1090 - acc: 0.9656\n",
 295 |       "Epoch 98/100\n",
 296 |       "23983/23983 [==============================] - 0s 15us/step - loss: 0.1091 - acc: 0.9654\n",
 297 |       "Epoch 99/100\n",
 298 |       "23983/23983 [==============================] - 0s 14us/step - loss: 0.1062 - acc: 0.9649\n",
 299 |       "Epoch 100/100\n",
 300 |       "23983/23983 [==============================] - 0s 16us/step - loss: 0.1105 - acc: 0.9648\n"
 301 |      ]
 302 |     },
 303 |     {
 304 |      "data": {
 305 |       "text/plain": [
 306 |        "<keras.callbacks.History at 0x23bc98c6ba8>"
 307 |       ]
 308 |      },
 309 |      "execution_count": 41,
 310 |      "metadata": {},
 311 |      "output_type": "execute_result"
 312 |     }
 313 |    ],
 314 |    "source": [
 315 |     "model = build_model(train_data.shape[1])\n",
 316 |     "model.fit(train_data, \n",
 317 |     "          train_label, \n",
 318 |     "          epochs=100, \n",
 319 |     "          batch_size=524)"
 320 |    ]
 321 |   },
 322 |   {
 323 |    "cell_type": "code",
 324 |    "execution_count": 43,
 325 |    "metadata": {},
 326 |    "outputs": [],
 327 |    "source": [
 328 |     "predict = model.predict_classes(test_data)"
 329 |    ]
 330 |   },
 331 |   {
 332 |    "cell_type": "code",
 333 |    "execution_count": 42,
 334 |    "metadata": {},
 335 |    "outputs": [],
 336 |    "source": [
 337 |     "def plot(test_label, y_pred, model):\n",
 338 |     "    font = {\"color\": \"darkred\",\n",
 339 |     "            \"size\": 13, \n",
 340 |     "            \"family\" : \"serif\"}\n",
 341 |     "\n",
 342 |     "    accs = accuracy_score(test_label, y_pred)\n",
 343 |     "    fpr, tpr, _ = metrics.roc_curve(test_label,  y_pred)\n",
 344 |     "    auc = metrics.roc_auc_score(test_label, y_pred)\n",
 345 |     "    plt.style.use(\"fivethirtyeight\")\n",
 346 |     "    fig, ax = plt.subplots()\n",
 347 |     "    ax.plot(fpr, tpr, label=\"{}, auc=\".format(model)+str(auc), color='green', linewidth=2)\n",
 348 |     "    ax.set_title(\"ROC curve\", fontdict=font)\n",
 349 |     "    leg = ax.legend(loc=\"best\")\n",
 350 |     "    text = leg.get_texts()\n",
 351 |     "    _ = plt.setp(text, color=\"blue\") "
 352 |    ]
 353 |   },
 354 |   {
 355 |    "cell_type": "code",
 356 |    "execution_count": 44,
 357 |    "metadata": {},
 358 |    "outputs": [
 359 |     {
 360 |      "data": {
 361 |       "image/svg+xml": [
 362 |        "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"no\"?>\r\n",
 363 |        "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\r\n",
 364 |        "  \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\r\n",
 365 |        "<!-- Created with matplotlib (https://matplotlib.org/) -->\r\n",
 366 |        "<svg height=\"280.607344pt\" version=\"1.1\" viewBox=\"0 0 416.004375 280.607344\" width=\"416.004375pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\r\n",
 367 |        " <defs>\r\n",
 368 |        "  <style type=\"text/css\">\r\n",
 369 |        "*{stroke-linecap:butt;stroke-linejoin:round;}\r\n",
 370 |        "  </style>\r\n",
 371 |        " </defs>\r\n",
 372 |        " <g id=\"figure_1\">\r\n",
 373 |        "  <g id=\"patch_1\">\r\n",
 374 |        "   <path d=\"M 0 280.607344 \r\n",
 375 |        "L 416.004375 280.607344 \r\n",
 376 |        "L 416.004375 0 \r\n",
 377 |        "L 0 0 \r\n",
 378 |        "z\r\n",
 379 |        "\" style=\"fill:#f0f0f0;\"/>\r\n",
 380 |        "  </g>\r\n",
 381 |        "  <g id=\"axes_1\">\r\n",
 382 |        "   <g id=\"patch_2\">\r\n",
 383 |        "    <path d=\"M 32.964375 256.357969 \r\n",
 384 |        "L 408.804375 256.357969 \r\n",
 385 |        "L 408.804375 23.077969 \r\n",
 386 |        "L 32.964375 23.077969 \r\n",
 387 |        "z\r\n",
 388 |        "\" style=\"fill:#f0f0f0;\"/>\r\n",
 389 |        "   </g>\r\n",
 390 |        "   <g id=\"matplotlib.axis_1\">\r\n",
 391 |        "    <g id=\"xtick_1\">\r\n",
 392 |        "     <g id=\"line2d_1\">\r\n",
 393 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 50.048011 256.357969 \r\n",
 394 |        "L 50.048011 23.077969 \r\n",
 395 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 396 |        "     </g>\r\n",
 397 |        "     <g id=\"line2d_2\"/>\r\n",
 398 |        "     <g id=\"text_1\">\r\n",
 399 |        "      <!-- 0.0 -->\r\n",
 400 |        "      <defs>\r\n",
 401 |        "       <path d=\"M 31.78125 66.40625 \r\n",
 402 |        "Q 24.171875 66.40625 20.328125 58.90625 \r\n",
 403 |        "Q 16.5 51.421875 16.5 36.375 \r\n",
 404 |        "Q 16.5 21.390625 20.328125 13.890625 \r\n",
 405 |        "Q 24.171875 6.390625 31.78125 6.390625 \r\n",
 406 |        "Q 39.453125 6.390625 43.28125 13.890625 \r\n",
 407 |        "Q 47.125 21.390625 47.125 36.375 \r\n",
 408 |        "Q 47.125 51.421875 43.28125 58.90625 \r\n",
 409 |        "Q 39.453125 66.40625 31.78125 66.40625 \r\n",
 410 |        "z\r\n",
 411 |        "M 31.78125 74.21875 \r\n",
 412 |        "Q 44.046875 74.21875 50.515625 64.515625 \r\n",
 413 |        "Q 56.984375 54.828125 56.984375 36.375 \r\n",
 414 |        "Q 56.984375 17.96875 50.515625 8.265625 \r\n",
 415 |        "Q 44.046875 -1.421875 31.78125 -1.421875 \r\n",
 416 |        "Q 19.53125 -1.421875 13.0625 8.265625 \r\n",
 417 |        "Q 6.59375 17.96875 6.59375 36.375 \r\n",
 418 |        "Q 6.59375 54.828125 13.0625 64.515625 \r\n",
 419 |        "Q 19.53125 74.21875 31.78125 74.21875 \r\n",
 420 |        "z\r\n",
 421 |        "\" id=\"DejaVuSans-48\"/>\r\n",
 422 |        "       <path d=\"M 10.6875 12.40625 \r\n",
 423 |        "L 21 12.40625 \r\n",
 424 |        "L 21 0 \r\n",
 425 |        "L 10.6875 0 \r\n",
 426 |        "z\r\n",
 427 |        "\" id=\"DejaVuSans-46\"/>\r\n",
 428 |        "      </defs>\r\n",
 429 |        "      <g transform=\"translate(38.915824 270.495781)scale(0.14 -0.14)\">\r\n",
 430 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 431 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 432 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-48\"/>\r\n",
 433 |        "      </g>\r\n",
 434 |        "     </g>\r\n",
 435 |        "    </g>\r\n",
 436 |        "    <g id=\"xtick_2\">\r\n",
 437 |        "     <g id=\"line2d_3\">\r\n",
 438 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 118.382557 256.357969 \r\n",
 439 |        "L 118.382557 23.077969 \r\n",
 440 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 441 |        "     </g>\r\n",
 442 |        "     <g id=\"line2d_4\"/>\r\n",
 443 |        "     <g id=\"text_2\">\r\n",
 444 |        "      <!-- 0.2 -->\r\n",
 445 |        "      <defs>\r\n",
 446 |        "       <path d=\"M 19.1875 8.296875 \r\n",
 447 |        "L 53.609375 8.296875 \r\n",
 448 |        "L 53.609375 0 \r\n",
 449 |        "L 7.328125 0 \r\n",
 450 |        "L 7.328125 8.296875 \r\n",
 451 |        "Q 12.9375 14.109375 22.625 23.890625 \r\n",
 452 |        "Q 32.328125 33.6875 34.8125 36.53125 \r\n",
 453 |        "Q 39.546875 41.84375 41.421875 45.53125 \r\n",
 454 |        "Q 43.3125 49.21875 43.3125 52.78125 \r\n",
 455 |        "Q 43.3125 58.59375 39.234375 62.25 \r\n",
 456 |        "Q 35.15625 65.921875 28.609375 65.921875 \r\n",
 457 |        "Q 23.96875 65.921875 18.8125 64.3125 \r\n",
 458 |        "Q 13.671875 62.703125 7.8125 59.421875 \r\n",
 459 |        "L 7.8125 69.390625 \r\n",
 460 |        "Q 13.765625 71.78125 18.9375 73 \r\n",
 461 |        "Q 24.125 74.21875 28.421875 74.21875 \r\n",
 462 |        "Q 39.75 74.21875 46.484375 68.546875 \r\n",
 463 |        "Q 53.21875 62.890625 53.21875 53.421875 \r\n",
 464 |        "Q 53.21875 48.921875 51.53125 44.890625 \r\n",
 465 |        "Q 49.859375 40.875 45.40625 35.40625 \r\n",
 466 |        "Q 44.1875 33.984375 37.640625 27.21875 \r\n",
 467 |        "Q 31.109375 20.453125 19.1875 8.296875 \r\n",
 468 |        "z\r\n",
 469 |        "\" id=\"DejaVuSans-50\"/>\r\n",
 470 |        "      </defs>\r\n",
 471 |        "      <g transform=\"translate(107.250369 270.495781)scale(0.14 -0.14)\">\r\n",
 472 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 473 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 474 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-50\"/>\r\n",
 475 |        "      </g>\r\n",
 476 |        "     </g>\r\n",
 477 |        "    </g>\r\n",
 478 |        "    <g id=\"xtick_3\">\r\n",
 479 |        "     <g id=\"line2d_5\">\r\n",
 480 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 186.717102 256.357969 \r\n",
 481 |        "L 186.717102 23.077969 \r\n",
 482 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 483 |        "     </g>\r\n",
 484 |        "     <g id=\"line2d_6\"/>\r\n",
 485 |        "     <g id=\"text_3\">\r\n",
 486 |        "      <!-- 0.4 -->\r\n",
 487 |        "      <defs>\r\n",
 488 |        "       <path d=\"M 37.796875 64.3125 \r\n",
 489 |        "L 12.890625 25.390625 \r\n",
 490 |        "L 37.796875 25.390625 \r\n",
 491 |        "z\r\n",
 492 |        "M 35.203125 72.90625 \r\n",
 493 |        "L 47.609375 72.90625 \r\n",
 494 |        "L 47.609375 25.390625 \r\n",
 495 |        "L 58.015625 25.390625 \r\n",
 496 |        "L 58.015625 17.1875 \r\n",
 497 |        "L 47.609375 17.1875 \r\n",
 498 |        "L 47.609375 0 \r\n",
 499 |        "L 37.796875 0 \r\n",
 500 |        "L 37.796875 17.1875 \r\n",
 501 |        "L 4.890625 17.1875 \r\n",
 502 |        "L 4.890625 26.703125 \r\n",
 503 |        "z\r\n",
 504 |        "\" id=\"DejaVuSans-52\"/>\r\n",
 505 |        "      </defs>\r\n",
 506 |        "      <g transform=\"translate(175.584915 270.495781)scale(0.14 -0.14)\">\r\n",
 507 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 508 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 509 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-52\"/>\r\n",
 510 |        "      </g>\r\n",
 511 |        "     </g>\r\n",
 512 |        "    </g>\r\n",
 513 |        "    <g id=\"xtick_4\">\r\n",
 514 |        "     <g id=\"line2d_7\">\r\n",
 515 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 255.051648 256.357969 \r\n",
 516 |        "L 255.051648 23.077969 \r\n",
 517 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 518 |        "     </g>\r\n",
 519 |        "     <g id=\"line2d_8\"/>\r\n",
 520 |        "     <g id=\"text_4\">\r\n",
 521 |        "      <!-- 0.6 -->\r\n",
 522 |        "      <defs>\r\n",
 523 |        "       <path d=\"M 33.015625 40.375 \r\n",
 524 |        "Q 26.375 40.375 22.484375 35.828125 \r\n",
 525 |        "Q 18.609375 31.296875 18.609375 23.390625 \r\n",
 526 |        "Q 18.609375 15.53125 22.484375 10.953125 \r\n",
 527 |        "Q 26.375 6.390625 33.015625 6.390625 \r\n",
 528 |        "Q 39.65625 6.390625 43.53125 10.953125 \r\n",
 529 |        "Q 47.40625 15.53125 47.40625 23.390625 \r\n",
 530 |        "Q 47.40625 31.296875 43.53125 35.828125 \r\n",
 531 |        "Q 39.65625 40.375 33.015625 40.375 \r\n",
 532 |        "z\r\n",
 533 |        "M 52.59375 71.296875 \r\n",
 534 |        "L 52.59375 62.3125 \r\n",
 535 |        "Q 48.875 64.0625 45.09375 64.984375 \r\n",
 536 |        "Q 41.3125 65.921875 37.59375 65.921875 \r\n",
 537 |        "Q 27.828125 65.921875 22.671875 59.328125 \r\n",
 538 |        "Q 17.53125 52.734375 16.796875 39.40625 \r\n",
 539 |        "Q 19.671875 43.65625 24.015625 45.921875 \r\n",
 540 |        "Q 28.375 48.1875 33.59375 48.1875 \r\n",
 541 |        "Q 44.578125 48.1875 50.953125 41.515625 \r\n",
 542 |        "Q 57.328125 34.859375 57.328125 23.390625 \r\n",
 543 |        "Q 57.328125 12.15625 50.6875 5.359375 \r\n",
 544 |        "Q 44.046875 -1.421875 33.015625 -1.421875 \r\n",
 545 |        "Q 20.359375 -1.421875 13.671875 8.265625 \r\n",
 546 |        "Q 6.984375 17.96875 6.984375 36.375 \r\n",
 547 |        "Q 6.984375 53.65625 15.1875 63.9375 \r\n",
 548 |        "Q 23.390625 74.21875 37.203125 74.21875 \r\n",
 549 |        "Q 40.921875 74.21875 44.703125 73.484375 \r\n",
 550 |        "Q 48.484375 72.75 52.59375 71.296875 \r\n",
 551 |        "z\r\n",
 552 |        "\" id=\"DejaVuSans-54\"/>\r\n",
 553 |        "      </defs>\r\n",
 554 |        "      <g transform=\"translate(243.91946 270.495781)scale(0.14 -0.14)\">\r\n",
 555 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 556 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 557 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-54\"/>\r\n",
 558 |        "      </g>\r\n",
 559 |        "     </g>\r\n",
 560 |        "    </g>\r\n",
 561 |        "    <g id=\"xtick_5\">\r\n",
 562 |        "     <g id=\"line2d_9\">\r\n",
 563 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 323.386193 256.357969 \r\n",
 564 |        "L 323.386193 23.077969 \r\n",
 565 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 566 |        "     </g>\r\n",
 567 |        "     <g id=\"line2d_10\"/>\r\n",
 568 |        "     <g id=\"text_5\">\r\n",
 569 |        "      <!-- 0.8 -->\r\n",
 570 |        "      <defs>\r\n",
 571 |        "       <path d=\"M 31.78125 34.625 \r\n",
 572 |        "Q 24.75 34.625 20.71875 30.859375 \r\n",
 573 |        "Q 16.703125 27.09375 16.703125 20.515625 \r\n",
 574 |        "Q 16.703125 13.921875 20.71875 10.15625 \r\n",
 575 |        "Q 24.75 6.390625 31.78125 6.390625 \r\n",
 576 |        "Q 38.8125 6.390625 42.859375 10.171875 \r\n",
 577 |        "Q 46.921875 13.96875 46.921875 20.515625 \r\n",
 578 |        "Q 46.921875 27.09375 42.890625 30.859375 \r\n",
 579 |        "Q 38.875 34.625 31.78125 34.625 \r\n",
 580 |        "z\r\n",
 581 |        "M 21.921875 38.8125 \r\n",
 582 |        "Q 15.578125 40.375 12.03125 44.71875 \r\n",
 583 |        "Q 8.5 49.078125 8.5 55.328125 \r\n",
 584 |        "Q 8.5 64.0625 14.71875 69.140625 \r\n",
 585 |        "Q 20.953125 74.21875 31.78125 74.21875 \r\n",
 586 |        "Q 42.671875 74.21875 48.875 69.140625 \r\n",
 587 |        "Q 55.078125 64.0625 55.078125 55.328125 \r\n",
 588 |        "Q 55.078125 49.078125 51.53125 44.71875 \r\n",
 589 |        "Q 48 40.375 41.703125 38.8125 \r\n",
 590 |        "Q 48.828125 37.15625 52.796875 32.3125 \r\n",
 591 |        "Q 56.78125 27.484375 56.78125 20.515625 \r\n",
 592 |        "Q 56.78125 9.90625 50.3125 4.234375 \r\n",
 593 |        "Q 43.84375 -1.421875 31.78125 -1.421875 \r\n",
 594 |        "Q 19.734375 -1.421875 13.25 4.234375 \r\n",
 595 |        "Q 6.78125 9.90625 6.78125 20.515625 \r\n",
 596 |        "Q 6.78125 27.484375 10.78125 32.3125 \r\n",
 597 |        "Q 14.796875 37.15625 21.921875 38.8125 \r\n",
 598 |        "z\r\n",
 599 |        "M 18.3125 54.390625 \r\n",
 600 |        "Q 18.3125 48.734375 21.84375 45.5625 \r\n",
 601 |        "Q 25.390625 42.390625 31.78125 42.390625 \r\n",
 602 |        "Q 38.140625 42.390625 41.71875 45.5625 \r\n",
 603 |        "Q 45.3125 48.734375 45.3125 54.390625 \r\n",
 604 |        "Q 45.3125 60.0625 41.71875 63.234375 \r\n",
 605 |        "Q 38.140625 66.40625 31.78125 66.40625 \r\n",
 606 |        "Q 25.390625 66.40625 21.84375 63.234375 \r\n",
 607 |        "Q 18.3125 60.0625 18.3125 54.390625 \r\n",
 608 |        "z\r\n",
 609 |        "\" id=\"DejaVuSans-56\"/>\r\n",
 610 |        "      </defs>\r\n",
 611 |        "      <g transform=\"translate(312.254006 270.495781)scale(0.14 -0.14)\">\r\n",
 612 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 613 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 614 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-56\"/>\r\n",
 615 |        "      </g>\r\n",
 616 |        "     </g>\r\n",
 617 |        "    </g>\r\n",
 618 |        "    <g id=\"xtick_6\">\r\n",
 619 |        "     <g id=\"line2d_11\">\r\n",
 620 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 391.720739 256.357969 \r\n",
 621 |        "L 391.720739 23.077969 \r\n",
 622 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 623 |        "     </g>\r\n",
 624 |        "     <g id=\"line2d_12\"/>\r\n",
 625 |        "     <g id=\"text_6\">\r\n",
 626 |        "      <!-- 1.0 -->\r\n",
 627 |        "      <defs>\r\n",
 628 |        "       <path d=\"M 12.40625 8.296875 \r\n",
 629 |        "L 28.515625 8.296875 \r\n",
 630 |        "L 28.515625 63.921875 \r\n",
 631 |        "L 10.984375 60.40625 \r\n",
 632 |        "L 10.984375 69.390625 \r\n",
 633 |        "L 28.421875 72.90625 \r\n",
 634 |        "L 38.28125 72.90625 \r\n",
 635 |        "L 38.28125 8.296875 \r\n",
 636 |        "L 54.390625 8.296875 \r\n",
 637 |        "L 54.390625 0 \r\n",
 638 |        "L 12.40625 0 \r\n",
 639 |        "z\r\n",
 640 |        "\" id=\"DejaVuSans-49\"/>\r\n",
 641 |        "      </defs>\r\n",
 642 |        "      <g transform=\"translate(380.588551 270.495781)scale(0.14 -0.14)\">\r\n",
 643 |        "       <use xlink:href=\"#DejaVuSans-49\"/>\r\n",
 644 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 645 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-48\"/>\r\n",
 646 |        "      </g>\r\n",
 647 |        "     </g>\r\n",
 648 |        "    </g>\r\n",
 649 |        "   </g>\r\n",
 650 |        "   <g id=\"matplotlib.axis_2\">\r\n",
 651 |        "    <g id=\"ytick_1\">\r\n",
 652 |        "     <g id=\"line2d_13\">\r\n",
 653 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 32.964375 245.754332 \r\n",
 654 |        "L 408.804375 245.754332 \r\n",
 655 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 656 |        "     </g>\r\n",
 657 |        "     <g id=\"line2d_14\"/>\r\n",
 658 |        "     <g id=\"text_7\">\r\n",
 659 |        "      <!-- 0.0 -->\r\n",
 660 |        "      <g transform=\"translate(7.2 251.073239)scale(0.14 -0.14)\">\r\n",
 661 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 662 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 663 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-48\"/>\r\n",
 664 |        "      </g>\r\n",
 665 |        "     </g>\r\n",
 666 |        "    </g>\r\n",
 667 |        "    <g id=\"ytick_2\">\r\n",
 668 |        "     <g id=\"line2d_15\">\r\n",
 669 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 32.964375 203.339787 \r\n",
 670 |        "L 408.804375 203.339787 \r\n",
 671 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 672 |        "     </g>\r\n",
 673 |        "     <g id=\"line2d_16\"/>\r\n",
 674 |        "     <g id=\"text_8\">\r\n",
 675 |        "      <!-- 0.2 -->\r\n",
 676 |        "      <g transform=\"translate(7.2 208.658693)scale(0.14 -0.14)\">\r\n",
 677 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 678 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 679 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-50\"/>\r\n",
 680 |        "      </g>\r\n",
 681 |        "     </g>\r\n",
 682 |        "    </g>\r\n",
 683 |        "    <g id=\"ytick_3\">\r\n",
 684 |        "     <g id=\"line2d_17\">\r\n",
 685 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 32.964375 160.925241 \r\n",
 686 |        "L 408.804375 160.925241 \r\n",
 687 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 688 |        "     </g>\r\n",
 689 |        "     <g id=\"line2d_18\"/>\r\n",
 690 |        "     <g id=\"text_9\">\r\n",
 691 |        "      <!-- 0.4 -->\r\n",
 692 |        "      <g transform=\"translate(7.2 166.244148)scale(0.14 -0.14)\">\r\n",
 693 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 694 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 695 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-52\"/>\r\n",
 696 |        "      </g>\r\n",
 697 |        "     </g>\r\n",
 698 |        "    </g>\r\n",
 699 |        "    <g id=\"ytick_4\">\r\n",
 700 |        "     <g id=\"line2d_19\">\r\n",
 701 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 32.964375 118.510696 \r\n",
 702 |        "L 408.804375 118.510696 \r\n",
 703 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 704 |        "     </g>\r\n",
 705 |        "     <g id=\"line2d_20\"/>\r\n",
 706 |        "     <g id=\"text_10\">\r\n",
 707 |        "      <!-- 0.6 -->\r\n",
 708 |        "      <g transform=\"translate(7.2 123.829602)scale(0.14 -0.14)\">\r\n",
 709 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 710 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 711 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-54\"/>\r\n",
 712 |        "      </g>\r\n",
 713 |        "     </g>\r\n",
 714 |        "    </g>\r\n",
 715 |        "    <g id=\"ytick_5\">\r\n",
 716 |        "     <g id=\"line2d_21\">\r\n",
 717 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 32.964375 76.096151 \r\n",
 718 |        "L 408.804375 76.096151 \r\n",
 719 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 720 |        "     </g>\r\n",
 721 |        "     <g id=\"line2d_22\"/>\r\n",
 722 |        "     <g id=\"text_11\">\r\n",
 723 |        "      <!-- 0.8 -->\r\n",
 724 |        "      <g transform=\"translate(7.2 81.415057)scale(0.14 -0.14)\">\r\n",
 725 |        "       <use xlink:href=\"#DejaVuSans-48\"/>\r\n",
 726 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 727 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-56\"/>\r\n",
 728 |        "      </g>\r\n",
 729 |        "     </g>\r\n",
 730 |        "    </g>\r\n",
 731 |        "    <g id=\"ytick_6\">\r\n",
 732 |        "     <g id=\"line2d_23\">\r\n",
 733 |        "      <path clip-path=\"url(#p888b19183b)\" d=\"M 32.964375 33.681605 \r\n",
 734 |        "L 408.804375 33.681605 \r\n",
 735 |        "\" style=\"fill:none;stroke:#cbcbcb;\"/>\r\n",
 736 |        "     </g>\r\n",
 737 |        "     <g id=\"line2d_24\"/>\r\n",
 738 |        "     <g id=\"text_12\">\r\n",
 739 |        "      <!-- 1.0 -->\r\n",
 740 |        "      <g transform=\"translate(7.2 39.000511)scale(0.14 -0.14)\">\r\n",
 741 |        "       <use xlink:href=\"#DejaVuSans-49\"/>\r\n",
 742 |        "       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
 743 |        "       <use x=\"95.410156\" xlink:href=\"#DejaVuSans-48\"/>\r\n",
 744 |        "      </g>\r\n",
 745 |        "     </g>\r\n",
 746 |        "    </g>\r\n",
 747 |        "   </g>\r\n",
 748 |        "   <g id=\"line2d_25\">\r\n",
 749 |        "    <path clip-path=\"url(#p888b19183b)\" d=\"M 50.048011 245.754332 \r\n",
 750 |        "L 54.919283 49.390696 \r\n",
 751 |        "L 391.720739 33.681605 \r\n",
 752 |        "\" style=\"fill:none;stroke:#008000;stroke-width:2;\"/>\r\n",
 753 |        "   </g>\r\n",
 754 |        "   <g id=\"patch_3\">\r\n",
 755 |        "    <path d=\"M 32.964375 256.357969 \r\n",
 756 |        "L 32.964375 23.077969 \r\n",
 757 |        "\" style=\"fill:none;stroke:#f0f0f0;stroke-linecap:square;stroke-linejoin:miter;stroke-width:3;\"/>\r\n",
 758 |        "   </g>\r\n",
 759 |        "   <g id=\"patch_4\">\r\n",
 760 |        "    <path d=\"M 408.804375 256.357969 \r\n",
 761 |        "L 408.804375 23.077969 \r\n",
 762 |        "\" style=\"fill:none;stroke:#f0f0f0;stroke-linecap:square;stroke-linejoin:miter;stroke-width:3;\"/>\r\n",
 763 |        "   </g>\r\n",
 764 |        "   <g id=\"patch_5\">\r\n",
 765 |        "    <path d=\"M 32.964375 256.357969 \r\n",
 766 |        "L 408.804375 256.357969 \r\n",
 767 |        "\" style=\"fill:none;stroke:#f0f0f0;stroke-linecap:square;stroke-linejoin:miter;stroke-width:3;\"/>\r\n",
 768 |        "   </g>\r\n",
 769 |        "   <g id=\"patch_6\">\r\n",
 770 |        "    <path d=\"M 32.964375 23.077969 \r\n",
 771 |        "L 408.804375 23.077969 \r\n",
 772 |        "\" style=\"fill:none;stroke:#f0f0f0;stroke-linecap:square;stroke-linejoin:miter;stroke-width:3;\"/>\r\n",
 773 |        "   </g>\r\n",
 774 |        "   <g id=\"text_13\">\r\n",
 775 |        "    <!-- ROC curve -->\r\n",
 776 |        "    <defs>\r\n",
 777 |        "     <path d=\"M 47.90625 36.1875 \r\n",
 778 |        "Q 51.3125 35.25 53.78125 33.03125 \r\n",
 779 |        "Q 56.25 30.8125 58.203125 26.8125 \r\n",
 780 |        "L 68.796875 5.171875 \r\n",
 781 |        "L 77.6875 5.171875 \r\n",
 782 |        "L 77.6875 0 \r\n",
 783 |        "L 60.5 0 \r\n",
 784 |        "L 49.125 23.1875 \r\n",
 785 |        "Q 45.84375 29.9375 43.109375 31.90625 \r\n",
 786 |        "Q 40.375 33.890625 35.59375 33.890625 \r\n",
 787 |        "L 24.703125 33.890625 \r\n",
 788 |        "L 24.703125 5.171875 \r\n",
 789 |        "L 35.015625 5.171875 \r\n",
 790 |        "L 35.015625 0 \r\n",
 791 |        "L 5.515625 0 \r\n",
 792 |        "L 5.515625 5.171875 \r\n",
 793 |        "L 14.796875 5.171875 \r\n",
 794 |        "L 14.796875 67.671875 \r\n",
 795 |        "L 5.515625 67.671875 \r\n",
 796 |        "L 5.515625 72.90625 \r\n",
 797 |        "L 42.484375 72.90625 \r\n",
 798 |        "Q 53.125 72.90625 58.90625 67.828125 \r\n",
 799 |        "Q 64.703125 62.75 64.703125 53.421875 \r\n",
 800 |        "Q 64.703125 45.90625 60.46875 41.578125 \r\n",
 801 |        "Q 56.25 37.25 47.90625 36.1875 \r\n",
 802 |        "z\r\n",
 803 |        "M 24.703125 39.109375 \r\n",
 804 |        "L 39.109375 39.109375 \r\n",
 805 |        "Q 46.484375 39.109375 50 42.59375 \r\n",
 806 |        "Q 53.515625 46.09375 53.515625 53.421875 \r\n",
 807 |        "Q 53.515625 60.75 50 64.203125 \r\n",
 808 |        "Q 46.484375 67.671875 39.109375 67.671875 \r\n",
 809 |        "L 24.703125 67.671875 \r\n",
 810 |        "z\r\n",
 811 |        "\" id=\"DejaVuSerif-82\"/>\r\n",
 812 |        "     <path d=\"M 41.015625 3.8125 \r\n",
 813 |        "Q 52.984375 3.8125 59.078125 12.03125 \r\n",
 814 |        "Q 65.1875 20.265625 65.1875 36.375 \r\n",
 815 |        "Q 65.1875 52.546875 59.078125 60.765625 \r\n",
 816 |        "Q 52.984375 69 41.015625 69 \r\n",
 817 |        "Q 29 69 22.890625 60.765625 \r\n",
 818 |        "Q 16.796875 52.546875 16.796875 36.375 \r\n",
 819 |        "Q 16.796875 20.265625 22.890625 12.03125 \r\n",
 820 |        "Q 29 3.8125 41.015625 3.8125 \r\n",
 821 |        "z\r\n",
 822 |        "M 41.015625 -1.421875 \r\n",
 823 |        "Q 33.59375 -1.421875 27.359375 1.015625 \r\n",
 824 |        "Q 21.140625 3.46875 16.40625 8.203125 \r\n",
 825 |        "Q 10.9375 13.671875 8.265625 20.609375 \r\n",
 826 |        "Q 5.609375 27.546875 5.609375 36.375 \r\n",
 827 |        "Q 5.609375 45.21875 8.265625 52.171875 \r\n",
 828 |        "Q 10.9375 59.125 16.40625 64.59375 \r\n",
 829 |        "Q 21.1875 69.390625 27.34375 71.796875 \r\n",
 830 |        "Q 33.5 74.21875 41.015625 74.21875 \r\n",
 831 |        "Q 56.890625 74.21875 66.65625 63.8125 \r\n",
 832 |        "Q 76.421875 53.421875 76.421875 36.375 \r\n",
 833 |        "Q 76.421875 27.640625 73.734375 20.625 \r\n",
 834 |        "Q 71.046875 13.625 65.578125 8.203125 \r\n",
 835 |        "Q 60.796875 3.421875 54.640625 1 \r\n",
 836 |        "Q 48.484375 -1.421875 41.015625 -1.421875 \r\n",
 837 |        "z\r\n",
 838 |        "\" id=\"DejaVuSerif-79\"/>\r\n",
 839 |        "     <path d=\"M 70.515625 19.28125 \r\n",
 840 |        "Q 67.28125 9.078125 59.6875 3.828125 \r\n",
 841 |        "Q 52.09375 -1.421875 40.484375 -1.421875 \r\n",
 842 |        "Q 33.34375 -1.421875 27.234375 1.015625 \r\n",
 843 |        "Q 21.140625 3.46875 16.40625 8.203125 \r\n",
 844 |        "Q 10.9375 13.671875 8.265625 20.625 \r\n",
 845 |        "Q 5.609375 27.59375 5.609375 36.375 \r\n",
 846 |        "Q 5.609375 53.375 15.421875 63.796875 \r\n",
 847 |        "Q 25.25 74.21875 41.3125 74.21875 \r\n",
 848 |        "Q 47.265625 74.21875 54 72.65625 \r\n",
 849 |        "Q 60.75 71.09375 68.5 67.921875 \r\n",
 850 |        "L 68.5 51.125 \r\n",
 851 |        "L 62.984375 51.125 \r\n",
 852 |        "Q 61.1875 60.296875 55.734375 64.640625 \r\n",
 853 |        "Q 50.296875 69 40.484375 69 \r\n",
 854 |        "Q 28.8125 69 22.796875 60.71875 \r\n",
 855 |        "Q 16.796875 52.4375 16.796875 36.375 \r\n",
 856 |        "Q 16.796875 20.359375 22.796875 12.078125 \r\n",
 857 |        "Q 28.8125 3.8125 40.484375 3.8125 \r\n",
 858 |        "Q 48.640625 3.8125 53.90625 7.6875 \r\n",
 859 |        "Q 59.1875 11.578125 61.53125 19.28125 \r\n",
 860 |        "z\r\n",
 861 |        "\" id=\"DejaVuSerif-67\"/>\r\n",
 862 |        "     <path id=\"DejaVuSerif-32\"/>\r\n",
 863 |        "     <path d=\"M 51.421875 15.578125 \r\n",
 864 |        "Q 49.515625 7.28125 44.09375 2.921875 \r\n",
 865 |        "Q 38.671875 -1.421875 30.078125 -1.421875 \r\n",
 866 |        "Q 18.75 -1.421875 11.859375 6.078125 \r\n",
 867 |        "Q 4.984375 13.578125 4.984375 25.984375 \r\n",
 868 |        "Q 4.984375 38.421875 11.859375 45.875 \r\n",
 869 |        "Q 18.75 53.328125 30.078125 53.328125 \r\n",
 870 |        "Q 35.015625 53.328125 39.890625 52.171875 \r\n",
 871 |        "Q 44.78125 51.03125 49.703125 48.6875 \r\n",
 872 |        "L 49.703125 35.40625 \r\n",
 873 |        "L 44.484375 35.40625 \r\n",
 874 |        "Q 43.453125 42.234375 40.015625 45.359375 \r\n",
 875 |        "Q 36.578125 48.484375 30.171875 48.484375 \r\n",
 876 |        "Q 22.90625 48.484375 19.1875 42.84375 \r\n",
 877 |        "Q 15.484375 37.203125 15.484375 25.984375 \r\n",
 878 |        "Q 15.484375 14.75 19.171875 9.078125 \r\n",
 879 |        "Q 22.859375 3.421875 30.171875 3.421875 \r\n",
 880 |        "Q 35.984375 3.421875 39.453125 6.4375 \r\n",
 881 |        "Q 42.921875 9.46875 44.1875 15.578125 \r\n",
 882 |        "z\r\n",
 883 |        "\" id=\"DejaVuSerif-99\"/>\r\n",
 884 |        "     <path d=\"M 35.40625 51.90625 \r\n",
 885 |        "L 52.203125 51.90625 \r\n",
 886 |        "L 52.203125 5.171875 \r\n",
 887 |        "L 60.6875 5.171875 \r\n",
 888 |        "L 60.6875 0 \r\n",
 889 |        "L 43.21875 0 \r\n",
 890 |        "L 43.21875 9.1875 \r\n",
 891 |        "Q 40.71875 4 36.765625 1.28125 \r\n",
 892 |        "Q 32.8125 -1.421875 27.59375 -1.421875 \r\n",
 893 |        "Q 18.953125 -1.421875 14.875 3.484375 \r\n",
 894 |        "Q 10.796875 8.40625 10.796875 18.890625 \r\n",
 895 |        "L 10.796875 46.6875 \r\n",
 896 |        "L 2.6875 46.6875 \r\n",
 897 |        "L 2.6875 51.90625 \r\n",
 898 |        "L 19.828125 51.90625 \r\n",
 899 |        "L 19.828125 21.6875 \r\n",
 900 |        "Q 19.828125 12.203125 22.140625 8.6875 \r\n",
 901 |        "Q 24.46875 5.171875 30.421875 5.171875 \r\n",
 902 |        "Q 36.671875 5.171875 39.9375 9.765625 \r\n",
 903 |        "Q 43.21875 14.359375 43.21875 23.09375 \r\n",
 904 |        "L 43.21875 46.6875 \r\n",
 905 |        "L 35.40625 46.6875 \r\n",
 906 |        "z\r\n",
 907 |        "\" id=\"DejaVuSerif-117\"/>\r\n",
 908 |        "     <path d=\"M 47.796875 52 \r\n",
 909 |        "L 47.796875 39.015625 \r\n",
 910 |        "L 42.625 39.015625 \r\n",
 911 |        "Q 42.390625 42.875 40.484375 44.78125 \r\n",
 912 |        "Q 38.578125 46.6875 34.90625 46.6875 \r\n",
 913 |        "Q 28.265625 46.6875 24.71875 42.09375 \r\n",
 914 |        "Q 21.1875 37.5 21.1875 28.90625 \r\n",
 915 |        "L 21.1875 5.171875 \r\n",
 916 |        "L 31.59375 5.171875 \r\n",
 917 |        "L 31.59375 0 \r\n",
 918 |        "L 4.109375 0 \r\n",
 919 |        "L 4.109375 5.171875 \r\n",
 920 |        "L 12.203125 5.171875 \r\n",
 921 |        "L 12.203125 46.78125 \r\n",
 922 |        "L 3.609375 46.78125 \r\n",
 923 |        "L 3.609375 51.90625 \r\n",
 924 |        "L 21.1875 51.90625 \r\n",
 925 |        "L 21.1875 42.671875 \r\n",
 926 |        "Q 23.828125 48.09375 27.96875 50.703125 \r\n",
 927 |        "Q 32.125 53.328125 38.09375 53.328125 \r\n",
 928 |        "Q 40.28125 53.328125 42.703125 52.984375 \r\n",
 929 |        "Q 45.125 52.640625 47.796875 52 \r\n",
 930 |        "z\r\n",
 931 |        "\" id=\"DejaVuSerif-114\"/>\r\n",
 932 |        "     <path d=\"M 24.703125 0 \r\n",
 933 |        "L 5.609375 46.6875 \r\n",
 934 |        "L -0.296875 46.6875 \r\n",
 935 |        "L -0.296875 51.90625 \r\n",
 936 |        "L 23.578125 51.90625 \r\n",
 937 |        "L 23.578125 46.6875 \r\n",
 938 |        "L 15.28125 46.6875 \r\n",
 939 |        "L 29.890625 10.984375 \r\n",
 940 |        "L 44.484375 46.6875 \r\n",
 941 |        "L 36.71875 46.6875 \r\n",
 942 |        "L 36.71875 51.90625 \r\n",
 943 |        "L 56.203125 51.90625 \r\n",
 944 |        "L 56.203125 46.6875 \r\n",
 945 |        "L 50.390625 46.6875 \r\n",
 946 |        "L 31.296875 0 \r\n",
 947 |        "z\r\n",
 948 |        "\" id=\"DejaVuSerif-118\"/>\r\n",
 949 |        "     <path d=\"M 54.203125 25 \r\n",
 950 |        "L 15.484375 25 \r\n",
 951 |        "L 15.484375 24.609375 \r\n",
 952 |        "Q 15.484375 14.109375 19.4375 8.765625 \r\n",
 953 |        "Q 23.390625 3.421875 31.109375 3.421875 \r\n",
 954 |        "Q 37.015625 3.421875 40.796875 6.515625 \r\n",
 955 |        "Q 44.578125 9.625 46.09375 15.71875 \r\n",
 956 |        "L 53.328125 15.71875 \r\n",
 957 |        "Q 51.171875 7.171875 45.375 2.875 \r\n",
 958 |        "Q 39.59375 -1.421875 30.171875 -1.421875 \r\n",
 959 |        "Q 18.796875 -1.421875 11.890625 6.078125 \r\n",
 960 |        "Q 4.984375 13.578125 4.984375 25.984375 \r\n",
 961 |        "Q 4.984375 38.28125 11.765625 45.796875 \r\n",
 962 |        "Q 18.5625 53.328125 29.59375 53.328125 \r\n",
 963 |        "Q 41.359375 53.328125 47.65625 46.0625 \r\n",
 964 |        "Q 53.953125 38.8125 54.203125 25 \r\n",
 965 |        "z\r\n",
 966 |        "M 43.609375 30.171875 \r\n",
 967 |        "Q 43.3125 39.265625 39.765625 43.875 \r\n",
 968 |        "Q 36.234375 48.484375 29.59375 48.484375 \r\n",
 969 |        "Q 23.390625 48.484375 19.828125 43.84375 \r\n",
 970 |        "Q 16.265625 39.203125 15.484375 30.171875 \r\n",
 971 |        "z\r\n",
 972 |        "\" id=\"DejaVuSerif-101\"/>\r\n",
 973 |        "    </defs>\r\n",
 974 |        "    <g style=\"fill:#8b0000;\" transform=\"translate(185.168906 17.077969)scale(0.13 -0.13)\">\r\n",
 975 |        "     <use xlink:href=\"#DejaVuSerif-82\"/>\r\n",
 976 |        "     <use x=\"75.292969\" xlink:href=\"#DejaVuSerif-79\"/>\r\n",
 977 |        "     <use x=\"157.275391\" xlink:href=\"#DejaVuSerif-67\"/>\r\n",
 978 |        "     <use x=\"233.789062\" xlink:href=\"#DejaVuSerif-32\"/>\r\n",
 979 |        "     <use x=\"265.576172\" xlink:href=\"#DejaVuSerif-99\"/>\r\n",
 980 |        "     <use x=\"321.582031\" xlink:href=\"#DejaVuSerif-117\"/>\r\n",
 981 |        "     <use x=\"385.986328\" xlink:href=\"#DejaVuSerif-114\"/>\r\n",
 982 |        "     <use x=\"433.789062\" xlink:href=\"#DejaVuSerif-118\"/>\r\n",
 983 |        "     <use x=\"490.283203\" xlink:href=\"#DejaVuSerif-101\"/>\r\n",
 984 |        "    </g>\r\n",
 985 |        "   </g>\r\n",
 986 |        "   <g id=\"legend_1\">\r\n",
 987 |        "    <g id=\"patch_7\">\r\n",
 988 |        "     <path d=\"M 143.71 249.357969 \r\n",
 989 |        "L 399.004375 249.357969 \r\n",
 990 |        "Q 401.804375 249.357969 401.804375 246.557969 \r\n",
 991 |        "L 401.804375 227.408594 \r\n",
 992 |        "Q 401.804375 224.608594 399.004375 224.608594 \r\n",
 993 |        "L 143.71 224.608594 \r\n",
 994 |        "Q 140.91 224.608594 140.91 227.408594 \r\n",
 995 |        "L 140.91 246.557969 \r\n",
 996 |        "Q 140.91 249.357969 143.71 249.357969 \r\n",
 997 |        "z\r\n",
 998 |        "\" style=\"fill:#f0f0f0;opacity:0.8;stroke:#cccccc;stroke-linejoin:miter;stroke-width:0.5;\"/>\r\n",
 999 |        "    </g>\r\n",
1000 |        "    <g id=\"line2d_26\">\r\n",
1001 |        "     <path d=\"M 146.51 235.946406 \r\n",
1002 |        "L 174.51 235.946406 \r\n",
1003 |        "\" style=\"fill:none;stroke:#008000;stroke-width:2;\"/>\r\n",
1004 |        "    </g>\r\n",
1005 |        "    <g id=\"line2d_27\"/>\r\n",
1006 |        "    <g id=\"text_14\">\r\n",
1007 |        "     <!-- nn, auc=0.955834398680822 -->\r\n",
1008 |        "     <defs>\r\n",
1009 |        "      <path d=\"M 54.890625 33.015625 \r\n",
1010 |        "L 54.890625 0 \r\n",
1011 |        "L 45.90625 0 \r\n",
1012 |        "L 45.90625 32.71875 \r\n",
1013 |        "Q 45.90625 40.484375 42.875 44.328125 \r\n",
1014 |        "Q 39.84375 48.1875 33.796875 48.1875 \r\n",
1015 |        "Q 26.515625 48.1875 22.3125 43.546875 \r\n",
1016 |        "Q 18.109375 38.921875 18.109375 30.90625 \r\n",
1017 |        "L 18.109375 0 \r\n",
1018 |        "L 9.078125 0 \r\n",
1019 |        "L 9.078125 54.6875 \r\n",
1020 |        "L 18.109375 54.6875 \r\n",
1021 |        "L 18.109375 46.1875 \r\n",
1022 |        "Q 21.34375 51.125 25.703125 53.5625 \r\n",
1023 |        "Q 30.078125 56 35.796875 56 \r\n",
1024 |        "Q 45.21875 56 50.046875 50.171875 \r\n",
1025 |        "Q 54.890625 44.34375 54.890625 33.015625 \r\n",
1026 |        "z\r\n",
1027 |        "\" id=\"DejaVuSans-110\"/>\r\n",
1028 |        "      <path d=\"M 11.71875 12.40625 \r\n",
1029 |        "L 22.015625 12.40625 \r\n",
1030 |        "L 22.015625 4 \r\n",
1031 |        "L 14.015625 -11.625 \r\n",
1032 |        "L 7.71875 -11.625 \r\n",
1033 |        "L 11.71875 4 \r\n",
1034 |        "z\r\n",
1035 |        "\" id=\"DejaVuSans-44\"/>\r\n",
1036 |        "      <path id=\"DejaVuSans-32\"/>\r\n",
1037 |        "      <path d=\"M 34.28125 27.484375 \r\n",
1038 |        "Q 23.390625 27.484375 19.1875 25 \r\n",
1039 |        "Q 14.984375 22.515625 14.984375 16.5 \r\n",
1040 |        "Q 14.984375 11.71875 18.140625 8.90625 \r\n",
1041 |        "Q 21.296875 6.109375 26.703125 6.109375 \r\n",
1042 |        "Q 34.1875 6.109375 38.703125 11.40625 \r\n",
1043 |        "Q 43.21875 16.703125 43.21875 25.484375 \r\n",
1044 |        "L 43.21875 27.484375 \r\n",
1045 |        "z\r\n",
1046 |        "M 52.203125 31.203125 \r\n",
1047 |        "L 52.203125 0 \r\n",
1048 |        "L 43.21875 0 \r\n",
1049 |        "L 43.21875 8.296875 \r\n",
1050 |        "Q 40.140625 3.328125 35.546875 0.953125 \r\n",
1051 |        "Q 30.953125 -1.421875 24.3125 -1.421875 \r\n",
1052 |        "Q 15.921875 -1.421875 10.953125 3.296875 \r\n",
1053 |        "Q 6 8.015625 6 15.921875 \r\n",
1054 |        "Q 6 25.140625 12.171875 29.828125 \r\n",
1055 |        "Q 18.359375 34.515625 30.609375 34.515625 \r\n",
1056 |        "L 43.21875 34.515625 \r\n",
1057 |        "L 43.21875 35.40625 \r\n",
1058 |        "Q 43.21875 41.609375 39.140625 45 \r\n",
1059 |        "Q 35.0625 48.390625 27.6875 48.390625 \r\n",
1060 |        "Q 23 48.390625 18.546875 47.265625 \r\n",
1061 |        "Q 14.109375 46.140625 10.015625 43.890625 \r\n",
1062 |        "L 10.015625 52.203125 \r\n",
1063 |        "Q 14.9375 54.109375 19.578125 55.046875 \r\n",
1064 |        "Q 24.21875 56 28.609375 56 \r\n",
1065 |        "Q 40.484375 56 46.34375 49.84375 \r\n",
1066 |        "Q 52.203125 43.703125 52.203125 31.203125 \r\n",
1067 |        "z\r\n",
1068 |        "\" id=\"DejaVuSans-97\"/>\r\n",
1069 |        "      <path d=\"M 8.5 21.578125 \r\n",
1070 |        "L 8.5 54.6875 \r\n",
1071 |        "L 17.484375 54.6875 \r\n",
1072 |        "L 17.484375 21.921875 \r\n",
1073 |        "Q 17.484375 14.15625 20.5 10.265625 \r\n",
1074 |        "Q 23.53125 6.390625 29.59375 6.390625 \r\n",
1075 |        "Q 36.859375 6.390625 41.078125 11.03125 \r\n",
1076 |        "Q 45.3125 15.671875 45.3125 23.6875 \r\n",
1077 |        "L 45.3125 54.6875 \r\n",
1078 |        "L 54.296875 54.6875 \r\n",
1079 |        "L 54.296875 0 \r\n",
1080 |        "L 45.3125 0 \r\n",
1081 |        "L 45.3125 8.40625 \r\n",
1082 |        "Q 42.046875 3.421875 37.71875 1 \r\n",
1083 |        "Q 33.40625 -1.421875 27.6875 -1.421875 \r\n",
1084 |        "Q 18.265625 -1.421875 13.375 4.4375 \r\n",
1085 |        "Q 8.5 10.296875 8.5 21.578125 \r\n",
1086 |        "z\r\n",
1087 |        "M 31.109375 56 \r\n",
1088 |        "z\r\n",
1089 |        "\" id=\"DejaVuSans-117\"/>\r\n",
1090 |        "      <path d=\"M 48.78125 52.59375 \r\n",
1091 |        "L 48.78125 44.1875 \r\n",
1092 |        "Q 44.96875 46.296875 41.140625 47.34375 \r\n",
1093 |        "Q 37.3125 48.390625 33.40625 48.390625 \r\n",
1094 |        "Q 24.65625 48.390625 19.8125 42.84375 \r\n",
1095 |        "Q 14.984375 37.3125 14.984375 27.296875 \r\n",
1096 |        "Q 14.984375 17.28125 19.8125 11.734375 \r\n",
1097 |        "Q 24.65625 6.203125 33.40625 6.203125 \r\n",
1098 |        "Q 37.3125 6.203125 41.140625 7.25 \r\n",
1099 |        "Q 44.96875 8.296875 48.78125 10.40625 \r\n",
1100 |        "L 48.78125 2.09375 \r\n",
1101 |        "Q 45.015625 0.34375 40.984375 -0.53125 \r\n",
1102 |        "Q 36.96875 -1.421875 32.421875 -1.421875 \r\n",
1103 |        "Q 20.0625 -1.421875 12.78125 6.34375 \r\n",
1104 |        "Q 5.515625 14.109375 5.515625 27.296875 \r\n",
1105 |        "Q 5.515625 40.671875 12.859375 48.328125 \r\n",
1106 |        "Q 20.21875 56 33.015625 56 \r\n",
1107 |        "Q 37.15625 56 41.109375 55.140625 \r\n",
1108 |        "Q 45.0625 54.296875 48.78125 52.59375 \r\n",
1109 |        "z\r\n",
1110 |        "\" id=\"DejaVuSans-99\"/>\r\n",
1111 |        "      <path d=\"M 10.59375 45.40625 \r\n",
1112 |        "L 73.1875 45.40625 \r\n",
1113 |        "L 73.1875 37.203125 \r\n",
1114 |        "L 10.59375 37.203125 \r\n",
1115 |        "z\r\n",
1116 |        "M 10.59375 25.484375 \r\n",
1117 |        "L 73.1875 25.484375 \r\n",
1118 |        "L 73.1875 17.1875 \r\n",
1119 |        "L 10.59375 17.1875 \r\n",
1120 |        "z\r\n",
1121 |        "\" id=\"DejaVuSans-61\"/>\r\n",
1122 |        "      <path d=\"M 10.984375 1.515625 \r\n",
1123 |        "L 10.984375 10.5 \r\n",
1124 |        "Q 14.703125 8.734375 18.5 7.8125 \r\n",
1125 |        "Q 22.3125 6.890625 25.984375 6.890625 \r\n",
1126 |        "Q 35.75 6.890625 40.890625 13.453125 \r\n",
1127 |        "Q 46.046875 20.015625 46.78125 33.40625 \r\n",
1128 |        "Q 43.953125 29.203125 39.59375 26.953125 \r\n",
1129 |        "Q 35.25 24.703125 29.984375 24.703125 \r\n",
1130 |        "Q 19.046875 24.703125 12.671875 31.3125 \r\n",
1131 |        "Q 6.296875 37.9375 6.296875 49.421875 \r\n",
1132 |        "Q 6.296875 60.640625 12.9375 67.421875 \r\n",
1133 |        "Q 19.578125 74.21875 30.609375 74.21875 \r\n",
1134 |        "Q 43.265625 74.21875 49.921875 64.515625 \r\n",
1135 |        "Q 56.59375 54.828125 56.59375 36.375 \r\n",
1136 |        "Q 56.59375 19.140625 48.40625 8.859375 \r\n",
1137 |        "Q 40.234375 -1.421875 26.421875 -1.421875 \r\n",
1138 |        "Q 22.703125 -1.421875 18.890625 -0.6875 \r\n",
1139 |        "Q 15.09375 0.046875 10.984375 1.515625 \r\n",
1140 |        "z\r\n",
1141 |        "M 30.609375 32.421875 \r\n",
1142 |        "Q 37.25 32.421875 41.125 36.953125 \r\n",
1143 |        "Q 45.015625 41.5 45.015625 49.421875 \r\n",
1144 |        "Q 45.015625 57.28125 41.125 61.84375 \r\n",
1145 |        "Q 37.25 66.40625 30.609375 66.40625 \r\n",
1146 |        "Q 23.96875 66.40625 20.09375 61.84375 \r\n",
1147 |        "Q 16.21875 57.28125 16.21875 49.421875 \r\n",
1148 |        "Q 16.21875 41.5 20.09375 36.953125 \r\n",
1149 |        "Q 23.96875 32.421875 30.609375 32.421875 \r\n",
1150 |        "z\r\n",
1151 |        "\" id=\"DejaVuSans-57\"/>\r\n",
1152 |        "      <path d=\"M 10.796875 72.90625 \r\n",
1153 |        "L 49.515625 72.90625 \r\n",
1154 |        "L 49.515625 64.59375 \r\n",
1155 |        "L 19.828125 64.59375 \r\n",
1156 |        "L 19.828125 46.734375 \r\n",
1157 |        "Q 21.96875 47.46875 24.109375 47.828125 \r\n",
1158 |        "Q 26.265625 48.1875 28.421875 48.1875 \r\n",
1159 |        "Q 40.625 48.1875 47.75 41.5 \r\n",
1160 |        "Q 54.890625 34.8125 54.890625 23.390625 \r\n",
1161 |        "Q 54.890625 11.625 47.5625 5.09375 \r\n",
1162 |        "Q 40.234375 -1.421875 26.90625 -1.421875 \r\n",
1163 |        "Q 22.3125 -1.421875 17.546875 -0.640625 \r\n",
1164 |        "Q 12.796875 0.140625 7.71875 1.703125 \r\n",
1165 |        "L 7.71875 11.625 \r\n",
1166 |        "Q 12.109375 9.234375 16.796875 8.0625 \r\n",
1167 |        "Q 21.484375 6.890625 26.703125 6.890625 \r\n",
1168 |        "Q 35.15625 6.890625 40.078125 11.328125 \r\n",
1169 |        "Q 45.015625 15.765625 45.015625 23.390625 \r\n",
1170 |        "Q 45.015625 31 40.078125 35.4375 \r\n",
1171 |        "Q 35.15625 39.890625 26.703125 39.890625 \r\n",
1172 |        "Q 22.75 39.890625 18.8125 39.015625 \r\n",
1173 |        "Q 14.890625 38.140625 10.796875 36.28125 \r\n",
1174 |        "z\r\n",
1175 |        "\" id=\"DejaVuSans-53\"/>\r\n",
1176 |        "      <path d=\"M 40.578125 39.3125 \r\n",
1177 |        "Q 47.65625 37.796875 51.625 33 \r\n",
1178 |        "Q 55.609375 28.21875 55.609375 21.1875 \r\n",
1179 |        "Q 55.609375 10.40625 48.1875 4.484375 \r\n",
1180 |        "Q 40.765625 -1.421875 27.09375 -1.421875 \r\n",
1181 |        "Q 22.515625 -1.421875 17.65625 -0.515625 \r\n",
1182 |        "Q 12.796875 0.390625 7.625 2.203125 \r\n",
1183 |        "L 7.625 11.71875 \r\n",
1184 |        "Q 11.71875 9.328125 16.59375 8.109375 \r\n",
1185 |        "Q 21.484375 6.890625 26.8125 6.890625 \r\n",
1186 |        "Q 36.078125 6.890625 40.9375 10.546875 \r\n",
1187 |        "Q 45.796875 14.203125 45.796875 21.1875 \r\n",
1188 |        "Q 45.796875 27.640625 41.28125 31.265625 \r\n",
1189 |        "Q 36.765625 34.90625 28.71875 34.90625 \r\n",
1190 |        "L 20.21875 34.90625 \r\n",
1191 |        "L 20.21875 43.015625 \r\n",
1192 |        "L 29.109375 43.015625 \r\n",
1193 |        "Q 36.375 43.015625 40.234375 45.921875 \r\n",
1194 |        "Q 44.09375 48.828125 44.09375 54.296875 \r\n",
1195 |        "Q 44.09375 59.90625 40.109375 62.90625 \r\n",
1196 |        "Q 36.140625 65.921875 28.71875 65.921875 \r\n",
1197 |        "Q 24.65625 65.921875 20.015625 65.03125 \r\n",
1198 |        "Q 15.375 64.15625 9.8125 62.3125 \r\n",
1199 |        "L 9.8125 71.09375 \r\n",
1200 |        "Q 15.4375 72.65625 20.34375 73.4375 \r\n",
1201 |        "Q 25.25 74.21875 29.59375 74.21875 \r\n",
1202 |        "Q 40.828125 74.21875 47.359375 69.109375 \r\n",
1203 |        "Q 53.90625 64.015625 53.90625 55.328125 \r\n",
1204 |        "Q 53.90625 49.265625 50.4375 45.09375 \r\n",
1205 |        "Q 46.96875 40.921875 40.578125 39.3125 \r\n",
1206 |        "z\r\n",
1207 |        "\" id=\"DejaVuSans-51\"/>\r\n",
1208 |        "     </defs>\r\n",
1209 |        "     <g style=\"fill:#0000ff;\" transform=\"translate(185.71 240.846406)scale(0.14 -0.14)\">\r\n",
1210 |        "      <use xlink:href=\"#DejaVuSans-110\"/>\r\n",
1211 |        "      <use x=\"63.378906\" xlink:href=\"#DejaVuSans-110\"/>\r\n",
1212 |        "      <use x=\"126.757812\" xlink:href=\"#DejaVuSans-44\"/>\r\n",
1213 |        "      <use x=\"158.544922\" xlink:href=\"#DejaVuSans-32\"/>\r\n",
1214 |        "      <use x=\"190.332031\" xlink:href=\"#DejaVuSans-97\"/>\r\n",
1215 |        "      <use x=\"251.611328\" xlink:href=\"#DejaVuSans-117\"/>\r\n",
1216 |        "      <use x=\"314.990234\" xlink:href=\"#DejaVuSans-99\"/>\r\n",
1217 |        "      <use x=\"369.970703\" xlink:href=\"#DejaVuSans-61\"/>\r\n",
1218 |        "      <use x=\"453.759766\" xlink:href=\"#DejaVuSans-48\"/>\r\n",
1219 |        "      <use x=\"517.382812\" xlink:href=\"#DejaVuSans-46\"/>\r\n",
1220 |        "      <use x=\"549.169922\" xlink:href=\"#DejaVuSans-57\"/>\r\n",
1221 |        "      <use x=\"612.792969\" xlink:href=\"#DejaVuSans-53\"/>\r\n",
1222 |        "      <use x=\"676.416016\" xlink:href=\"#DejaVuSans-53\"/>\r\n",
1223 |        "      <use x=\"740.039062\" xlink:href=\"#DejaVuSans-56\"/>\r\n",
1224 |        "      <use x=\"803.662109\" xlink:href=\"#DejaVuSans-51\"/>\r\n",
1225 |        "      <use x=\"867.285156\" xlink:href=\"#DejaVuSans-52\"/>\r\n",
1226 |        "      <use x=\"930.908203\" xlink:href=\"#DejaVuSans-51\"/>\r\n",
1227 |        "      <use x=\"994.53125\" xlink:href=\"#DejaVuSans-57\"/>\r\n",
1228 |        "      <use x=\"1058.154297\" xlink:href=\"#DejaVuSans-56\"/>\r\n",
1229 |        "      <use x=\"1121.777344\" xlink:href=\"#DejaVuSans-54\"/>\r\n",
1230 |        "      <use x=\"1185.400391\" xlink:href=\"#DejaVuSans-56\"/>\r\n",
1231 |        "      <use x=\"1249.023438\" xlink:href=\"#DejaVuSans-48\"/>\r\n",
1232 |        "      <use x=\"1312.646484\" xlink:href=\"#DejaVuSans-56\"/>\r\n",
1233 |        "      <use x=\"1376.269531\" xlink:href=\"#DejaVuSans-50\"/>\r\n",
1234 |        "      <use x=\"1439.892578\" xlink:href=\"#DejaVuSans-50\"/>\r\n",
1235 |        "     </g>\r\n",
1236 |        "    </g>\r\n",
1237 |        "   </g>\r\n",
1238 |        "  </g>\r\n",
1239 |        " </g>\r\n",
1240 |        " <defs>\r\n",
1241 |        "  <clipPath id=\"p888b19183b\">\r\n",
1242 |        "   <rect height=\"233.28\" width=\"375.84\" x=\"32.964375\" y=\"23.077969\"/>\r\n",
1243 |        "  </clipPath>\r\n",
1244 |        " </defs>\r\n",
1245 |        "</svg>\r\n"
1246 |       ],
1247 |       "text/plain": [
1248 |        "<Figure size 432x288 with 1 Axes>"
1249 |       ]
1250 |      },
1251 |      "metadata": {},
1252 |      "output_type": "display_data"
1253 |     }
1254 |    ],
1255 |    "source": [
1256 |     "plot(test_label, predict, \"nn\")"
1257 |    ]
1258 |   },
1259 |   {
1260 |    "cell_type": "code",
1261 |    "execution_count": 38,
1262 |    "metadata": {},
1263 |    "outputs": [],
1264 |    "source": [
1265 |     "result = pd.DataFrame()\n",
1266 |     "result[\"id\"] = id_\n",
1267 |     "result[\"safe_type\"] = predict\n",
1268 |     "result.to_csv(\"result.csv\", encoding=\"utf-8\", index=False)"
1269 |    ]
1270 |   },
1271 |   {
1272 |    "cell_type": "code",
1273 |    "execution_count": null,
1274 |    "metadata": {},
1275 |    "outputs": [],
1276 |    "source": []
1277 |   }
1278 |  ],
1279 |  "metadata": {
1280 |   "kernelspec": {
1281 |    "display_name": "Python 3",
1282 |    "language": "python",
1283 |    "name": "python3"
1284 |   },
1285 |   "language_info": {
1286 |    "codemirror_mode": {
1287 |     "name": "ipython",
1288 |     "version": 3
1289 |    },
1290 |    "file_extension": ".py",
1291 |    "mimetype": "text/x-python",
1292 |    "name": "python",
1293 |    "nbconvert_exporter": "python",
1294 |    "pygments_lexer": "ipython3",
1295 |    "version": "3.6.7"
1296 |   }
1297 |  },
1298 |  "nbformat": 4,
1299 |  "nbformat_minor": 2
1300 | }
1301 | 


--------------------------------------------------------------------------------
/DataCon2019/code/stage1/feature_engineering.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import re\n",
 10 |     "import glob\n",
 11 |     "import pandas as pd\n",
 12 |     "import pickle\n",
 13 |     "from collections import Counter\n",
 14 |     "import matplotlib.pyplot as plt\n",
 15 |     "from xgboost import XGBClassifier\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "\n",
 18 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 19 |     "from sklearn import metrics\n",
 20 |     "from sklearn.metrics import accuracy_score\n",
 21 |     "from sklearn.metrics import classification_report\n",
 22 |     "\n",
 23 |     "from sklearn.preprocessing import MinMaxScaler\n",
 24 |     "from sklearn.externals import joblib\n",
 25 |     "\n",
 26 |     "%config InlineBackend.figure_format = 'svg'\n",
 27 |     "%matplotlib inline\n",
 28 |     "\n",
 29 |     "import warnings\n",
 30 |     "warnings.filterwarnings(\"ignore\")"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 16,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "def yield_origin_csv(file_type):\n",
 40 |     "    flag = 1\n",
 41 |     "    id_, api_name_list, call_pid_list, ret_value_list = [], [], [], []\n",
 42 |     "    api_name_regex = re.compile('<action api_name=\"(.*?)\" call_name')\n",
 43 |     "    call_pid_regex = re.compile('call_pid=\"(.*?)\" call_time=')\n",
 44 |     "    ret_value_regex = re.compile('ret_value=\"(.*?)\"')\n",
 45 |     "    for path in glob.glob(\"./stage1_dataset/train/{}/*\".format(file_type)):\n",
 46 |     "        with open(path, \"r\") as fp:\n",
 47 |     "            xml = fp.read()\n",
 48 |     "        api_names = re.findall(api_name_regex, xml)\n",
 49 |     "        call_pids = re.findall(call_pid_regex, xml)\n",
 50 |     "        ret_values = re.findall(ret_value_regex, xml)\n",
 51 |     "        \n",
 52 |     "        id_.append(path.split(\".\")[1].split(\"/\")[-1]) \n",
 53 |     "        api_name_list.append(\" \".join(api_names))\n",
 54 |     "        call_pid_list.append(\" \".join(call_pids))\n",
 55 |     "        ret_value_list.append(\" \".join(ret_values))\n",
 56 |     "        \n",
 57 |     "        \n",
 58 |     "        if flag % 300 == 0:\n",
 59 |     "            print(flag)\n",
 60 |     "        flag += 1\n",
 61 |     "    df = pd.DataFrame()\n",
 62 |     "    df[\"id\"] = id_\n",
 63 |     "    df[\"api_name\"] = api_name_list\n",
 64 |     "    df[\"call_pid\"] = call_pid_list\n",
 65 |     "    df[\"ret_value\"] = ret_value_list\n",
 66 |     "    return df"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 24,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "# black = yield_origin_csv(\"black\")\n",
 76 |     "# white = yield_origin_csv(\"white\")\n",
 77 |     "# origin_test = yield_origin_csv(\"test\")"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 21,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "origin_data = pd.concat([white, black])\n",
 87 |     "origin_data[\"safe_type\"] = [0 for _ in range(20000)] + [1 for _ in range(10000)]"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 27,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "# origin_data.to_csv(\"origin_data.csv\", encoding=\"utf-8\", index=False)\n",
 97 |     "# origin_test.to_csv(\"origin_test.csv\", encoding=\"utf-8\", index=False)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "'''\n",
107 |     "api_distinct_cnt: file调用了多少不同的API ;\n",
108 |     "api_cnt: file调用api的总数；\n",
109 |     "api_cnt_mean: file调用API的平均值；\n",
110 |     "call_pid_distinct_cnt: file调用了多少不同的进程；\n",
111 |     "call_pid_cnt_max,tid_api_cnt_min,tid_api_cnt_mean: \",\"file中的线程调用的 最多/最少/平均 api数目;\n",
112 |     "\n",
113 |     "value_equals0_cnt: file返回值为0的样本数;\n",
114 |     "value_equals0_rate： file返回值为0的样本比率;\n",
115 |     "value_distinct_cnt: file有多少不同的返回值;\n",
116 |     "\n",
117 |     "'''"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 28,
123 |    "metadata": {},
124 |    "outputs": [],
125 |    "source": [
126 |     "def get_value(x, kind=\"mean\"):\n",
127 |     "    dict_ = Counter(x.split())\n",
128 |     "    tmp = sorted(dict_.values())\n",
129 |     "    if kind == \"mean\":\n",
130 |     "        return sum(dict_.values()) / len(dict_)   \n",
131 |     "    if kind == \"max\":\n",
132 |     "        return tmp[-1]\n",
133 |     "    if kind == \"min\":\n",
134 |     "        return tmp[0]\n",
135 |     "    "
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 31,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "def make_features(data):\n",
145 |     "    data[\"api_cnt\"] = data[\"api_name\"].apply(lambda x: len(x.split()))\n",
146 |     "    data[\"api_distinct_cnt\"] = data[\"api_name\"].apply(lambda x: len(set(x.split())))\n",
147 |     "    data[\"api_cnt_mean\"] = data[\"api_name\"].apply(lambda x: get_value(x))\n",
148 |     "    data[\"call_pid_distinct_cnt\"] = data[\"call_pid\"].apply(lambda x: len(set(x.split())))\n",
149 |     "    data[\"call_pid_cnt_mean\"] = data[\"call_pid\"].apply(lambda x: get_value(x))\n",
150 |     "    data[\"call_pid_cnt_max\"] = data[\"call_pid\"].apply(lambda x: get_value(x, kind=\"max\"))\n",
151 |     "    data[\"call_pid_cnt_min\"] = data[\"call_pid\"].apply(lambda x: get_value(x, kind=\"min\"))\n",
152 |     "    data[\"ret_value_equals0_cnt\"] = data[\"ret_value\"].apply(lambda x: x.split().count('0'))\n",
153 |     "    data[\"ret_value_equals0_rate\"] = data[\"ret_value\"].apply(lambda x: x.split().count('0') / (len(x.split())))\n",
154 |     "    data[\"ret_value_distinct_cnt\"] = data[\"ret_value\"].apply(lambda x: len(set(x.split())))\n",
155 |     "    data.drop([\"call_pid\", \"api_name\", \"ret_value\"], axis=1, inplace=True)\n",
156 |     "    return data"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 32,
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "train_features = make_features(origin_data)\n",
166 |     "test_features = make_features(origin_test)"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 41,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "train_features.to_csv(\"train_features.csv\", encoding=\"utf-8\", index=False)\n",
176 |     "test_features.to_csv(\"test_features.csv\", encoding=\"utf-8\", index=False)"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "## <center>Get n-gram features</center>"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 3,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "origin_train_data = pd.read_csv(\"origin_data.csv\")\n",
193 |     "origin_test_data = pd.read_csv(\"origin_test.csv\")"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": 45,
199 |    "metadata": {},
200 |    "outputs": [],
201 |    "source": [
202 |     "train_data_api_name = origin_train_data[\"api_name\"]\n",
203 |     "test_data_api_name = origin_test_data[\"api_name\"]"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 51,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)\n",
213 |     "train_tfidf_features = vectorizer.fit_transform(train_data_api_name.tolist())\n",
214 |     "test_tfidf_features = vectorizer.transform(test_data_api_name.tolist())"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 82,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "with open(\"train_tfidf_features.pkl\", \"wb\") as fp:\n",
224 |     "    pickle.dump(train_tfidf_features, fp)\n",
225 |     "with open(\"test_tfidf_features.pkl\", \"wb\") as fp:\n",
226 |     "    pickle.dump(test_tfidf_features, fp)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 4,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": [
235 |     "train_data_ret_value = origin_train_data[\"ret_value\"]\n",
236 |     "test_data_ret_value = origin_test_data[\"ret_value\"]"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": 9,
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": [
245 |     "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)\n",
246 |     "train_tfidf_features = vectorizer.fit_transform(train_data_ret_value.tolist())\n",
247 |     "test_tfidf_features = vectorizer.transform(test_data_ret_value.tolist())"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": 15,
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "with open(\"train_ret_value_tfidf_features.pkl\", \"wb\") as fp:\n",
257 |     "    pickle.dump(train_tfidf_features, fp)\n",
258 |     "    \n",
259 |     "with open(\"test_ret_value_tfidf_features.pkl\", \"wb\") as fp:\n",
260 |     "    pickle.dump(test_tfidf_features, fp)"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": null,
266 |    "metadata": {},
267 |    "outputs": [],
268 |    "source": []
269 |   }
270 |  ],
271 |  "metadata": {
272 |   "kernelspec": {
273 |    "display_name": "Python 3",
274 |    "language": "python",
275 |    "name": "python3"
276 |   },
277 |   "language_info": {
278 |    "codemirror_mode": {
279 |     "name": "ipython",
280 |     "version": 3
281 |    },
282 |    "file_extension": ".py",
283 |    "mimetype": "text/x-python",
284 |    "name": "python",
285 |    "nbconvert_exporter": "python",
286 |    "pygments_lexer": "ipython3",
287 |    "version": "3.6.7"
288 |   }
289 |  },
290 |  "nbformat": 4,
291 |  "nbformat_minor": 2
292 | }
293 | 


--------------------------------------------------------------------------------
/DataCon2019/code/stage1/out_of_fold.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import time\n",
 10 |     "import numpy as np\n",
 11 |     "import pandas as pd\n",
 12 |     "import matplotlib.pyplot as plt\n",
 13 |     "\n",
 14 |     "from sklearn.model_selection import train_test_split\n",
 15 |     "from sklearn.model_selection import StratifiedKFold\n",
 16 |     "from sklearn.model_selection import cross_validate\n",
 17 |     "from sklearn.model_selection import GridSearchCV\n",
 18 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 19 |     "\n",
 20 |     "from sklearn import svm\n",
 21 |     "from sklearn import neighbors\n",
 22 |     "from sklearn import naive_bayes\n",
 23 |     "from sklearn.svm import LinearSVC\n",
 24 |     "from xgboost import XGBClassifier\n",
 25 |     "from sklearn.tree import DecisionTreeClassifier\n",
 26 |     "from sklearn.linear_model import LogisticRegression\n",
 27 |     "from sklearn.linear_model import LogisticRegressionCV\n",
 28 |     "from sklearn.gaussian_process import GaussianProcessClassifier\n",
 29 |     "\n",
 30 |     "from sklearn.ensemble import RandomForestClassifier\n",
 31 |     "from sklearn.ensemble import AdaBoostClassifier\n",
 32 |     "from sklearn.ensemble import BaggingClassifier\n",
 33 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
 34 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
 35 |     "from sklearn.ensemble import VotingClassifier\n",
 36 |     "\n",
 37 |     "from sklearn import metrics\n",
 38 |     "from sklearn.metrics import accuracy_score\n",
 39 |     "from sklearn.metrics import classification_report\n",
 40 |     "\n",
 41 |     "from sklearn.externals import joblib\n",
 42 |     "\n",
 43 |     "%config InlineBackend.figure_format = 'svg'\n",
 44 |     "%matplotlib inline\n",
 45 |     "\n",
 46 |     "import warnings\n",
 47 |     "warnings.filterwarnings(\"ignore\")"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 2,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "data = pd.read_csv(\"fliter_train_data.csv\")\n",
 57 |     "safe_type = data[\"safe_type\"]\n",
 58 |     "features = data.iloc[:, 2:]"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 7,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "train_data, test_data, train_label, test_label = train_test_split(features, \n",
 68 |     "                                                                  safe_type, \n",
 69 |     "                                                                  test_size=0.2, \n",
 70 |     "                                                                  random_state=0)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 9,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "def plot(test_label, y_pred, model):\n",
 80 |     "    font = {\"color\": \"darkred\",\n",
 81 |     "            \"size\": 13, \n",
 82 |     "            \"family\" : \"serif\"}\n",
 83 |     "\n",
 84 |     "    accs = accuracy_score(test_label, y_pred)\n",
 85 |     "    fpr, tpr, _ = metrics.roc_curve(test_label,  y_pred)\n",
 86 |     "    auc = metrics.roc_auc_score(test_label, y_pred)\n",
 87 |     "    plt.style.use(\"fivethirtyeight\")\n",
 88 |     "    fig, ax = plt.subplots()\n",
 89 |     "    ax.plot(fpr, tpr, label=\"{}, auc=\".format(model)+str(auc), color='green', linewidth=2)\n",
 90 |     "    ax.set_title(\"ROC curve\", fontdict=font)\n",
 91 |     "    leg = ax.legend(loc=\"best\")\n",
 92 |     "    text = leg.get_texts()\n",
 93 |     "    _ = plt.setp(text, color=\"blue\") "
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 8,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "model = XGBClassifier()               \n",
103 |     "model.fit(train_data, train_label)            \n",
104 |     "y_pred = model.predict(test_data)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 3,
110 |    "metadata": {},
111 |    "outputs": [
112 |     {
113 |      "name": "stdout",
114 |      "output_type": "stream",
115 |      "text": [
116 |       "The best parameter for BaggingClassifier is {'max_samples': 0.5, 'n_estimators': 300, 'random_state': 0} with a runtime of 1259.84 seconds.\n",
117 |       "The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 300, 'random_state': 0} with a runtime of 2480.75 seconds.\n",
118 |       "The best parameter for LogisticRegression is {'fit_intercept': False, 'random_state': 0, 'solver': 'newton-cg'} with a runtime of 300.54 seconds.\n",
119 |       "The best parameter for BernoulliNB is {'alpha': 0.1} with a runtime of 1.86 seconds.\n",
120 |       "The best parameter for KNeighborsClassifier is {} with a runtime of 67.43 seconds.\n",
121 |       "The best parameter for XGBClassifier is {'algorithm': 'auto', 'n_neighbors': 1, 'weights': 'uniform'} with a runtime of 771.57 seconds.\n",
122 |       "Total optimization time was 81.37 minutes.\n",
123 |       "----------\n"
124 |      ]
125 |     }
126 |    ],
127 |    "source": [
128 |     "grid_n_estimator = [10, 50, 100, 300]\n",
129 |     "grid_ratio = [0.1, 0.25, 0.5, 0.75, 1.0]\n",
130 |     "grid_learn = [0.01, 0.03, 0.05, 0.1, 0.25]\n",
131 |     "grid_max_depth = [2, 4, 6, 8, 10, None]\n",
132 |     "grid_min_samples = [5, 10, 0.03, 0.05, 0.10]\n",
133 |     "grid_criterion = ['gini', 'entropy']\n",
134 |     "grid_bool = [True, False]\n",
135 |     "grid_seed = [0]\n",
136 |     "\n",
137 |     "layer_1 = [\n",
138 |     "            #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html\n",
139 |     "#             ('ada', AdaBoostClassifier()),\n",
140 |     "            ('bc', BaggingClassifier()),\n",
141 |     "#             ('etc', ExtraTreesClassifier()),\n",
142 |     "            ('gbc', GradientBoostingClassifier()),\n",
143 |     "#             ('rfc', RandomForestClassifier()),\n",
144 |     "\n",
145 |     "            #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc\n",
146 |     "#             ('gpc', GaussianProcessClassifier()),\n",
147 |     "\n",
148 |     "            #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
149 |     "            ('lr', LogisticRegression()),\n",
150 |     "\n",
151 |     "            #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html\n",
152 |     "            ('bnb', naive_bayes.BernoulliNB()),\n",
153 |     "#             ('gnb', naive_bayes.GaussianNB()),\n",
154 |     "\n",
155 |     "            #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html\n",
156 |     "            ('knn', neighbors.KNeighborsClassifier()),\n",
157 |     "\n",
158 |     "            #SVM: http://scikit-learn.org/stable/modules/svm.html\n",
159 |     "#             ('svc', svm.SVC(probability=True)),\n",
160 |     "\n",
161 |     "            #xgboost: http://xgboost.readthedocs.io/en/latest/model.html\n",
162 |     "           ('xgb', XGBClassifier())\n",
163 |     "\n",
164 |     "          ]\n",
165 |     "\n",
166 |     "grid_param = [\n",
167 |     "#                 [{\n",
168 |     "#                 #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html\n",
169 |     "#                 'n_estimators': grid_n_estimator, #default=50\n",
170 |     "#                 'learning_rate': grid_learn, #default=1\n",
171 |     "#                 #'algorithm': ['SAMME', 'SAMME.R'], #default=’SAMME.R\n",
172 |     "#                 'random_state': grid_seed\n",
173 |     "#                 }],\n",
174 |     "\n",
175 |     "\n",
176 |     "                [{\n",
177 |     "                #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier\n",
178 |     "                'n_estimators': grid_n_estimator, #default=10\n",
179 |     "                'max_samples': grid_ratio, #default=1.0\n",
180 |     "                'random_state': grid_seed\n",
181 |     "                 }],\n",
182 |     "\n",
183 |     "\n",
184 |     "#                 [{\n",
185 |     "#                 #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier\n",
186 |     "#                 'n_estimators': grid_n_estimator, #default=10\n",
187 |     "#                 'criterion': grid_criterion, #default=”gini”\n",
188 |     "#                 'max_depth': grid_max_depth, #default=None\n",
189 |     "#                 'random_state': grid_seed\n",
190 |     "#                  }],\n",
191 |     "\n",
192 |     "\n",
193 |     "                [{\n",
194 |     "                #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier\n",
195 |     "                #'loss': ['deviance', 'exponential'], #default=’deviance’\n",
196 |     "                'learning_rate': [.05], #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.\n",
197 |     "                'n_estimators': [300], #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.\n",
198 |     "                #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”\n",
199 |     "                'max_depth': grid_max_depth, #default=3   \n",
200 |     "                'random_state': grid_seed\n",
201 |     "                 }],\n",
202 |     "\n",
203 |     "\n",
204 |     "#                 [{\n",
205 |     "#                 #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier\n",
206 |     "#                 'n_estimators': grid_n_estimator, #default=10\n",
207 |     "#                 'criterion': grid_criterion, #default=”gini”\n",
208 |     "#                 'max_depth': grid_max_depth, #default=None\n",
209 |     "#                 'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.\n",
210 |     "#                 'random_state': grid_seed\n",
211 |     "#                  }],\n",
212 |     "\n",
213 |     "#                 [{    \n",
214 |     "#                 #GaussianProcessClassifier\n",
215 |     "#                 'max_iter_predict': grid_n_estimator, #default: 100\n",
216 |     "#                 'random_state': grid_seed\n",
217 |     "#                 }],\n",
218 |     "\n",
219 |     "\n",
220 |     "                [{\n",
221 |     "                #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV\n",
222 |     "                'fit_intercept': grid_bool, #default: True\n",
223 |     "                #'penalty': ['l1','l2'],\n",
224 |     "                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs\n",
225 |     "                'random_state': grid_seed\n",
226 |     "                 }],\n",
227 |     "\n",
228 |     "\n",
229 |     "                [{\n",
230 |     "                #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB\n",
231 |     "                'alpha': grid_ratio, #default: 1.0\n",
232 |     "                 }],\n",
233 |     "\n",
234 |     "\n",
235 |     "                #GaussianNB - \n",
236 |     "                [{}],\n",
237 |     "\n",
238 |     "                [{\n",
239 |     "                #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier\n",
240 |     "                'n_neighbors': [1,2,3,4,5,6,7], #default: 5\n",
241 |     "                'weights': ['uniform', 'distance'], #default = ‘uniform’\n",
242 |     "                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']\n",
243 |     "                }],\n",
244 |     "\n",
245 |     "\n",
246 |     "#                 [{\n",
247 |     "#                 #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC\n",
248 |     "#                 #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r\n",
249 |     "#                 #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],\n",
250 |     "#                 'C': [1,2,3,4,5], #default=1.0\n",
251 |     "#                 'gamma': grid_ratio, #edfault: auto\n",
252 |     "#                 'decision_function_shape': ['ovo', 'ovr'], #default:ovr\n",
253 |     "#                 'probability': [True],\n",
254 |     "#                 'random_state': grid_seed\n",
255 |     "#                  }],\n",
256 |     "\n",
257 |     "\n",
258 |     "                [{\n",
259 |     "                #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html\n",
260 |     "                'learning_rate': grid_learn, #default: .3\n",
261 |     "                'max_depth': [1,2,4,6,8,10], #default 2\n",
262 |     "                'n_estimators': grid_n_estimator, \n",
263 |     "                'seed': grid_seed  \n",
264 |     "                 }]   \n",
265 |     "            ]\n",
266 |     "\n",
267 |     "\n",
268 |     "\n",
269 |     "start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter\n",
270 |     "for clf, param in zip (layer_1, grid_param): #https://docs.python.org/3/library/functions.html#zip\n",
271 |     "\n",
272 |     "    #print(clf[1]) #vote_est is a list of tuples, index 0 is the name and index 1 is the algorithm\n",
273 |     "    #print(param)\n",
274 |     "    \n",
275 |     "    \n",
276 |     "    start = time.perf_counter()        \n",
277 |     "    best_search = GridSearchCV(estimator = clf[1], param_grid = param, cv = 5, scoring = 'roc_auc')\n",
278 |     "    best_search.fit(features, safe_type)\n",
279 |     "    run = time.perf_counter() - start\n",
280 |     "\n",
281 |     "    best_param = best_search.best_params_\n",
282 |     "    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf[1].__class__.__name__, best_param, run))\n",
283 |     "    clf[1].set_params(**best_param) \n",
284 |     "\n",
285 |     "\n",
286 |     "run_total = time.perf_counter() - start_total\n",
287 |     "print('Total optimization time was {:.2f} minutes.'.format(run_total/60))\n",
288 |     "print('-'*10)"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 4,
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "layer_1 = [\n",
298 |     "            #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html\n",
299 |     "#             ('ada', AdaBoostClassifier()),\n",
300 |     "            ('bc', BaggingClassifier()),\n",
301 |     "#             ('etc', ExtraTreesClassifier()),\n",
302 |     "            ('gbc', GradientBoostingClassifier()),\n",
303 |     "#             ('rfc', RandomForestClassifier()),\n",
304 |     "\n",
305 |     "            #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc\n",
306 |     "#             ('gpc', GaussianProcessClassifier()),\n",
307 |     "\n",
308 |     "            #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
309 |     "            ('lr', LogisticRegression()),\n",
310 |     "\n",
311 |     "            #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html\n",
312 |     "            ('bnb', naive_bayes.BernoulliNB()),\n",
313 |     "#             ('gnb', naive_bayes.GaussianNB()),\n",
314 |     "\n",
315 |     "            #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html\n",
316 |     "            ('knn', neighbors.KNeighborsClassifier()),\n",
317 |     "\n",
318 |     "            #SVM: http://scikit-learn.org/stable/modules/svm.html\n",
319 |     "            ('svc', svm.SVC(probability=True)),\n",
320 |     "\n",
321 |     "            #xgboost: http://xgboost.readthedocs.io/en/latest/model.html\n",
322 |     "           ('xgb', XGBClassifier())\n",
323 |     "\n",
324 |     "          ]"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": 5,
330 |    "metadata": {},
331 |    "outputs": [
332 |     {
333 |      "name": "stdout",
334 |      "output_type": "stream",
335 |      "text": [
336 |       "Soft Voting Training w/bin score mean: 98.43\n",
337 |       "Soft Voting Test w/bin score mean: 97.14\n",
338 |       "Soft Voting Test w/bin score 3*std: +/- 0.68\n",
339 |       "----------\n"
340 |      ]
341 |     }
342 |    ],
343 |    "source": [
344 |     "vote_soft = VotingClassifier(estimators=layer_1 , voting = 'soft')\n",
345 |     "vote_soft_cv = cross_validate(vote_soft, features, safe_type, cv=5)\n",
346 |     "vote_soft.fit(features, safe_type)\n",
347 |     "\n",
348 |     "print(\"Soft Voting Training w/bin score mean: {:.2f}\".format(vote_soft_cv['train_score'].mean()*100)) \n",
349 |     "print(\"Soft Voting Test w/bin score mean: {:.2f}\".format(vote_soft_cv['test_score'].mean()*100))\n",
350 |     "print(\"Soft Voting Test w/bin score 3*std: +/- {:.2f}\".format(vote_soft_cv['test_score'].std()*100*3))\n",
351 |     "print('-'*10)"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 9,
357 |    "metadata": {},
358 |    "outputs": [
359 |     {
360 |      "name": "stdout",
361 |      "output_type": "stream",
362 |      "text": [
363 |       "Soft Voting Training w/bin score mean: 98.79\n",
364 |       "Soft Voting Test w/bin score mean: 97.85\n",
365 |       "Soft Voting Test w/bin score 3*std: +/- 0.46\n",
366 |       "----------\n"
367 |      ]
368 |     }
369 |    ],
370 |    "source": [
371 |     "gv_vote_soft = VotingClassifier(estimators=layer_1 , voting = 'soft')\n",
372 |     "gv_vote_soft_cv = cross_validate(gv_vote_soft, features, safe_type, cv=5)\n",
373 |     "gv_vote_soft.fit(features, safe_type)\n",
374 |     "\n",
375 |     "print(\"Soft Voting Training w/bin score mean: {:.2f}\".format(gv_vote_soft_cv['train_score'].mean()*100)) \n",
376 |     "print(\"Soft Voting Test w/bin score mean: {:.2f}\".format(gv_vote_soft_cv['test_score'].mean()*100))\n",
377 |     "print(\"Soft Voting Test w/bin score 3*std: +/- {:.2f}\".format(gv_vote_soft_cv['test_score'].std()*100*3))\n",
378 |     "print('-'*10)"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": 10,
384 |    "metadata": {},
385 |    "outputs": [
386 |     {
387 |      "data": {
388 |       "text/plain": [
389 |        "['gv_vote_soft.m']"
390 |       ]
391 |      },
392 |      "execution_count": 10,
393 |      "metadata": {},
394 |      "output_type": "execute_result"
395 |     }
396 |    ],
397 |    "source": [
398 |     "joblib.dump(gv_vote_soft, \"gv_vote_soft.m\")"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 11,
404 |    "metadata": {},
405 |    "outputs": [],
406 |    "source": [
407 |     "test = pd.read_csv(\"fliter_test_data.csv\")\n",
408 |     "id_ = test[\"id\"]\n",
409 |     "test_features = test.iloc[:, 1:]"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": 12,
415 |    "metadata": {},
416 |    "outputs": [],
417 |    "source": [
418 |     "predict = gv_vote_soft.predict(test_features)"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": 13,
424 |    "metadata": {},
425 |    "outputs": [],
426 |    "source": [
427 |     "result = pd.DataFrame()\n",
428 |     "result[\"id\"] = id_\n",
429 |     "result[\"safe_type\"] = predict\n",
430 |     "result.to_csv(\"result.csv\", encoding=\"utf-8\", index=False)"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": 21,
436 |    "metadata": {},
437 |    "outputs": [],
438 |    "source": [
439 |     "bc = joblib.load(\"./models/bc_gr_model.m\")"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": null,
445 |    "metadata": {},
446 |    "outputs": [],
447 |    "source": [
448 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
449 |     "import glob\n",
450 |     "def read_data(file_type):\n",
451 |     "    data = []\n",
452 |     "    for path in glob.glob(\"./stage1_dataset/train/{}/*\".format(file_type)):\n",
453 |     "        with open(path, \"r\") as fp:\n",
454 |     "            data.append(fp.read())\n",
455 |     "    return data\n",
456 |     "\n",
457 |     "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=3000)\n",
458 |     "white_data = read_data(\"white\")\n",
459 |     "black_data = read_data(\"black\")\n",
460 |     "data = white_data + black_data\n",
461 |     "white = [0 for _ in range(len(white_data))]\n",
462 |     "black = [1 for _ in range(len(black_data))]\n",
463 |     "safe_type = white + black\n",
464 |     "features = vectorizer.fit_transform(data)"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "code",
469 |    "execution_count": 45,
470 |    "metadata": {},
471 |    "outputs": [],
472 |    "source": [
473 |     "def get_oof(model, x_train, y_train, x_test, n_splits):\n",
474 |     "    \"\"\"\n",
475 |     "    :@param x_train: feature matrix.\n",
476 |     "    :type x: np.array(M X N) or list(M X N).\n",
477 |     "    :@param y_train: class label.\n",
478 |     "    :type y: int.\n",
479 |     "    :@param x_test: test set feature matrix.\n",
480 |     "    :type x_test: np.array(M X N) or list(M X N).\n",
481 |     "    :@param n_splits: K-fold parameter.\n",
482 |     "    :type n_splits: int.\n",
483 |     "    \"\"\"\n",
484 |     "    n_train, n_test = x_train.shape[0], x_test.shape[0]\n",
485 |     "    kf = StratifiedKFold(n_splits=n_splits, random_state=0)\n",
486 |     "    oof_train = np.empty((n_train, ))\n",
487 |     "    oof_test = np.empty((n_test, ))\n",
488 |     "    oof_test_skf = np.empty((n_splits, n_test))\n",
489 |     "    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):\n",
490 |     "        kf_x_train = x_train[train_index]\n",
491 |     "        kf_y_train = y_train[train_index]\n",
492 |     "        kf_x_test = x_train[test_index]\n",
493 |     "        model.fit(kf_x_train, kf_y_train)\n",
494 |     "        oof_train[test_index] = model.predict(kf_x_test)\n",
495 |     "        oof_test_skf[i, :] = model.predict(x_test)\n",
496 |     "    oof_test[:] = oof_test_skf.mean(axis=0)\n",
497 |     "    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)"
498 |    ]
499 |   },
500 |   {
501 |    "cell_type": "code",
502 |    "execution_count": null,
503 |    "metadata": {},
504 |    "outputs": [],
505 |    "source": []
506 |   },
507 |   {
508 |    "cell_type": "code",
509 |    "execution_count": 67,
510 |    "metadata": {},
511 |    "outputs": [],
512 |    "source": [
513 |     "import numpy as np\n",
514 |     "from PIL import Image\n",
515 |     "import binascii\n",
516 |     "\n",
517 |     "def getMatrixfrom_bin(filename, width):\n",
518 |     "    with open(filename, 'rb') as f:\n",
519 |     "        content = f.read()\n",
520 |     "    hexst = binascii.hexlify(content)  #将二进制文件转换为十六进制字符串\n",
521 |     "    fh = np.array([int(hexst[i: i+2], 16) for i in range(0, len(hexst), 2)])  #按字节分割\n",
522 |     "    rn = len(fh) // width\n",
523 |     "    fh = np.reshape(fh[:rn * width], (-1, width))  #根据设定的宽度生成矩阵\n",
524 |     "    fh = np.uint8(fh)\n",
525 |     "    return fh\n"
526 |    ]
527 |   },
528 |   {
529 |    "cell_type": "code",
530 |    "execution_count": 68,
531 |    "metadata": {},
532 |    "outputs": [],
533 |    "source": [
534 |     "filename = \"./pandalearning.exe\"\n",
535 |     "im = Image.fromarray(getMatrixfrom_bin(filename, 512)) #转换为图像\n",
536 |     "# im.save(\"your_img_filename.png\")"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": 69,
542 |    "metadata": {},
543 |    "outputs": [],
544 |    "source": [
545 |     "im.show()"
546 |    ]
547 |   },
548 |   {
549 |    "cell_type": "code",
550 |    "execution_count": 70,
551 |    "metadata": {},
552 |    "outputs": [],
553 |    "source": [
554 |     "import pefile\n",
555 |     "PEfile_Path = \"pandalearning.exe\"\n",
556 |     "pe = pefile.PE(PEfile_Path)"
557 |    ]
558 |   },
559 |   {
560 |    "cell_type": "code",
561 |    "execution_count": 79,
562 |    "metadata": {},
563 |    "outputs": [],
564 |    "source": [
565 |     "with open(\"test.txt\", \"w\") as fp:\n",
566 |     "    fp.write(str(pe))"
567 |    ]
568 |   },
569 |   {
570 |    "cell_type": "code",
571 |    "execution_count": 80,
572 |    "metadata": {},
573 |    "outputs": [],
574 |    "source": [
575 |     "import re\n",
576 |     "from collections import *\n",
577 |     "# 从.asm文件获取Opcode序列\n",
578 |     "def getOpcodeSequence(filename):\n",
579 |     "    opcode_seq = []\n",
580 |     "    p = re.compile(r'\\s([a-fA-F0-9]{2}\\s)+\\s*([a-z]+)')\n",
581 |     "    with open(filename) as f:\n",
582 |     "        for line in f:\n",
583 |     "            if line.startswith(\".text\"):\n",
584 |     "                m = re.findall(p, line)\n",
585 |     "                if m:\n",
586 |     "                    opc = m[0][10]\n",
587 |     "                    if opc != \"align\":\n",
588 |     "                        opcode_seq.append(opc)\n",
589 |     "    return opcode_seq\n",
590 |     "# 根据Opcode序列，统计对应的n-gram\n",
591 |     "def getOpcodeNgram(ops ,n = 3):\n",
592 |     "    opngramlist = [tuple(ops[i:i+n]) for i in range(len(ops)-n)]\n",
593 |     "    opngram = Counter(opngramlist)\n",
594 |     "    return opngram\n",
595 |     "file = \"test.txt\"\n",
596 |     "ops = getOpcodeSequence(file)\n",
597 |     "opngram = getOpcodeNgram(ops)"
598 |    ]
599 |   },
600 |   {
601 |    "cell_type": "code",
602 |    "execution_count": 82,
603 |    "metadata": {},
604 |    "outputs": [],
605 |    "source": [
606 |     "data = str(pe)"
607 |    ]
608 |   },
609 |   {
610 |    "cell_type": "code",
611 |    "execution_count": null,
612 |    "metadata": {},
613 |    "outputs": [],
614 |    "source": []
615 |   }
616 |  ],
617 |  "metadata": {
618 |   "kernelspec": {
619 |    "display_name": "Python 3",
620 |    "language": "python",
621 |    "name": "python3"
622 |   },
623 |   "language_info": {
624 |    "codemirror_mode": {
625 |     "name": "ipython",
626 |     "version": 3
627 |    },
628 |    "file_extension": ".py",
629 |    "mimetype": "text/x-python",
630 |    "name": "python",
631 |    "nbconvert_exporter": "python",
632 |    "pygments_lexer": "ipython3",
633 |    "version": "3.6.7"
634 |   }
635 |  },
636 |  "nbformat": 4,
637 |  "nbformat_minor": 2
638 | }
639 | 


--------------------------------------------------------------------------------
/DataCon2019/code/stage1/ret_value_stacking.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import time\n",
 10 |     "import numpy as np\n",
 11 |     "import pandas as pd\n",
 12 |     "import pickle\n",
 13 |     "import dask.array as da\n",
 14 |     "import matplotlib.pyplot as plt\n",
 15 |     "\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from sklearn.model_selection import StratifiedKFold\n",
 18 |     "from sklearn.model_selection import cross_validate\n",
 19 |     "from sklearn.model_selection import GridSearchCV\n",
 20 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 21 |     "\n",
 22 |     "from sklearn import svm\n",
 23 |     "from sklearn import neighbors\n",
 24 |     "from sklearn import naive_bayes\n",
 25 |     "from sklearn.svm import LinearSVC\n",
 26 |     "from xgboost import XGBClassifier\n",
 27 |     "from sklearn.tree import DecisionTreeClassifier\n",
 28 |     "from sklearn.linear_model import LogisticRegression\n",
 29 |     "from sklearn.linear_model import LogisticRegressionCV\n",
 30 |     "from sklearn.tree import DecisionTreeClassifier\n",
 31 |     "from sklearn.gaussian_process import GaussianProcessClassifier\n",
 32 |     "\n",
 33 |     "from sklearn.ensemble import RandomForestClassifier\n",
 34 |     "from sklearn.ensemble import AdaBoostClassifier\n",
 35 |     "from sklearn.ensemble import BaggingClassifier\n",
 36 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
 37 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
 38 |     "from sklearn.ensemble import VotingClassifier\n",
 39 |     "\n",
 40 |     "from sklearn import metrics\n",
 41 |     "from sklearn.metrics import accuracy_score\n",
 42 |     "from sklearn.metrics import classification_report\n",
 43 |     "\n",
 44 |     "from sklearn.externals import joblib\n",
 45 |     "\n",
 46 |     "%config InlineBackend.figure_format = 'svg'\n",
 47 |     "%matplotlib inline\n",
 48 |     "\n",
 49 |     "import warnings\n",
 50 |     "warnings.filterwarnings(\"ignore\")"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 2,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "with open(\"train_ret_value_tfidf_features.pkl\", \"rb\") as fp:\n",
 60 |     "    train_tfidf_features = pickle.load(fp)\n",
 61 |     "with open(\"test_ret_value_tfidf_features.pkl\", \"rb\") as fp:\n",
 62 |     "    test_tfidf_features = pickle.load(fp)\n",
 63 |     "safe_type = pd.read_csv(\"safe_type.csv\", header=None)"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 3,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "bc_model = BaggingClassifier()\n",
 73 |     "gbc_model = GradientBoostingClassifier()\n",
 74 |     "lr_model = LogisticRegression()\n",
 75 |     "svm_model = svm.LinearSVC()\n",
 76 |     "dt_model = DecisionTreeClassifier()\n",
 77 |     "xgb_model = XGBClassifier(max_depth=7,\n",
 78 |     "                          learning_rate=0.05,\n",
 79 |     "                          n_estimators=1000)\n",
 80 |     "\n",
 81 |     "rfc_model = RandomForestClassifier(200)\n",
 82 |     "etc_model = ExtraTreesClassifier()\n",
 83 |     "mnb_model = naive_bayes.MultinomialNB(alpha=0.01)\n",
 84 |     "ada_model = AdaBoostClassifier()"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 4,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "def get_oof(model, x_train, y_train, x_test, n_splits):\n",
 94 |     "    \"\"\"\n",
 95 |     "    :@param x_train: feature matrix.\n",
 96 |     "    :type x: np.array(M X N) or list(M X N).\n",
 97 |     "    :@param y_train: class label.\n",
 98 |     "    :type y: int.\n",
 99 |     "    :@param x_test: test set feature matrix.\n",
100 |     "    :type x_test: np.array(M X N) or list(M X N).\n",
101 |     "    :@param n_splits: K-fold parameter.\n",
102 |     "    :type n_splits: int.\n",
103 |     "    \"\"\"\n",
104 |     "    n_train, n_test = x_train.shape[0], x_test.shape[0]\n",
105 |     "    kf = StratifiedKFold(n_splits=n_splits, random_state=0)\n",
106 |     "    oof_train = np.empty((n_train, ))\n",
107 |     "    oof_test = np.empty((n_test, ))\n",
108 |     "    oof_test_skf = np.empty((n_splits, n_test))\n",
109 |     "    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):\n",
110 |     "        kf_x_train = x_train[train_index]\n",
111 |     "        kf_y_train = y_train[train_index]\n",
112 |     "        kf_x_test = x_train[test_index]\n",
113 |     "        model.fit(kf_x_train, kf_y_train)\n",
114 |     "        oof_train[test_index] = model.predict(kf_x_test)\n",
115 |     "        oof_test_skf[i, :] = model.predict(x_test)\n",
116 |     "    oof_test[:] = oof_test_skf.mean(axis=0)\n",
117 |     "    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 5,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "rfc success!\n",
130 |       "etc success!\n",
131 |       "mnb success!\n",
132 |       "ada success!\n"
133 |      ]
134 |     }
135 |    ],
136 |    "source": [
137 |     "try:\n",
138 |     "    lr_model_oof_train, lr_model_oof_test = get_oof(lr_model, \n",
139 |     "                                                    train_tfidf_features.tolil(), \n",
140 |     "                                                    safe_type.values,\n",
141 |     "                                                    test_tfidf_features.tolil(),\n",
142 |     "                                                    10)\n",
143 |     "    with open(\"ret_value_lr_model_oof_train.csv\", \"wb\") as fp:\n",
144 |     "        pickle.dump(lr_model_oof_train, fp)\n",
145 |     "    with open(\"ret_value_lr_model_oof_test.csv\", \"wb\") as fp:\n",
146 |     "        pickle.dump(lr_model_oof_test, fp)\n",
147 |     "    print(\"lr success!\")\n",
148 |     "except:\n",
149 |     "    print(\"lr error!\")\n",
150 |     "try:\n",
151 |     "    gbc_model_oof_train, gbc_model_oof_test = get_oof(gbc_model, \n",
152 |     "                                                      train_tfidf_features.tolil(), \n",
153 |     "                                                      safe_type.values,\n",
154 |     "                                                      test_tfidf_features.tolil(),\n",
155 |     "                                                      10)\n",
156 |     "    with open(\"ret_value_gbc_model_oof_train.csv\", \"wb\") as fp:\n",
157 |     "        pickle.dump(gbc_model_oof_train, fp)\n",
158 |     "    with open(\"ret_value_gbc_model_oof_test.csv\", \"wb\") as fp:\n",
159 |     "        pickle.dump(gbc_model_oof_test, fp)\n",
160 |     "    print(\"gbc success!\")\n",
161 |     "except:\n",
162 |     "    print(\"gbc error!\")\n",
163 |     "try:\n",
164 |     "    bc_model_oof_train, bc_model_oof_test = get_oof(bc_model, \n",
165 |     "                                                    train_tfidf_features.tolil(), \n",
166 |     "                                                    safe_type.values,\n",
167 |     "                                                    test_tfidf_features.tolil(),\n",
168 |     "                                                    10)\n",
169 |     "    with open(\"ret_value_bc_model_oof_train.csv\", \"wb\") as fp:\n",
170 |     "        pickle.dump(bc_model_oof_train, fp)\n",
171 |     "    with open(\"ret_value_bc_model_oof_test.csv\", \"wb\") as fp:\n",
172 |     "        pickle.dump(bc_model_oof_test, fp)\n",
173 |     "    print(\"bc success!\")\n",
174 |     "except:\n",
175 |     "    print(\"bc error!\")\n",
176 |     "try:\n",
177 |     "    svm_model_oof_train, svm_model_oof_test = get_oof(svm_model, \n",
178 |     "                                                      train_tfidf_features.tolil(), \n",
179 |     "                                                      safe_type.values,\n",
180 |     "                                                      test_tfidf_features.tolil(),\n",
181 |     "                                                      10)\n",
182 |     "    with open(\"ret_value_svm_model_oof_train.csv\", \"wb\") as fp:\n",
183 |     "        pickle.dump(svm_model_oof_train, fp)\n",
184 |     "    with open(\"ret_value_svm_model_oof_test.csv\", \"wb\") as fp:\n",
185 |     "        pickle.dump(svm_model_oof_test, fp)\n",
186 |     "    print(\"svm success!\")\n",
187 |     "except:\n",
188 |     "    print(\"svm error!\")\n",
189 |     "try:\n",
190 |     "    dt_model_oof_train, dt_model_oof_test = get_oof(dt_model, \n",
191 |     "                                                      train_tfidf_features.tolil(), \n",
192 |     "                                                      safe_type.values,\n",
193 |     "                                                      test_tfidf_features.tolil(),\n",
194 |     "                                                      10)\n",
195 |     "    with open(\"ret_value_dt_model_oof_train.csv\", \"wb\") as fp:\n",
196 |     "        pickle.dump(dt_model_oof_train, fp)\n",
197 |     "    with open(\"ret_value_dt_model_oof_test.csv\", \"wb\") as fp:\n",
198 |     "        pickle.dump(dt_model_oof_test, fp)\n",
199 |     "    print(\"dt success!\")\n",
200 |     "except:\n",
201 |     "    print(\"dt error!\")\n",
202 |     "try:\n",
203 |     "    xgb_model_oof_train, xgb_model_oof_test = get_oof(xgb_model, \n",
204 |     "                                                      train_tfidf_features.tolil(), \n",
205 |     "                                                      safe_type.values,\n",
206 |     "                                                      test_tfidf_features.tolil(),\n",
207 |     "                                                      10)\n",
208 |     "    with open(\"ret_value_xgb_model_oof_train.csv\", \"wb\") as fp:\n",
209 |     "        pickle.dump(xgb_model_oof_train, fp)\n",
210 |     "    with open(\"ret_value_xgb_model_oof_test.csv\", \"wb\") as fp:\n",
211 |     "        pickle.dump(xgb_model_oof_test, fp)\n",
212 |     "    print(\"xgb success!\")\n",
213 |     "except:\n",
214 |     "    print(\"xgb error!\")\n",
215 |     "try:\n",
216 |     "    rfc_model_oof_train, rfc_model_oof_test = get_oof(rfc_model, \n",
217 |     "                                                      train_tfidf_features.tolil(), \n",
218 |     "                                                      safe_type.values,\n",
219 |     "                                                      test_tfidf_features.tolil(),\n",
220 |     "                                                      10)\n",
221 |     "    with open(\"ret_value_rfc_model_oof_train.csv\", \"wb\") as fp:\n",
222 |     "        pickle.dump(rfc_model_oof_train, fp)\n",
223 |     "    with open(\"ret_value_rfc_model_oof_test.csv\", \"wb\") as fp:\n",
224 |     "        pickle.dump(rfc_model_oof_test, fp)\n",
225 |     "    print(\"rfc success!\")\n",
226 |     "except:\n",
227 |     "    print(\"rfc error!\")\n",
228 |     "    \n",
229 |     "try:\n",
230 |     "    etc_model_oof_train, etc_model_oof_test = get_oof(etc_model, \n",
231 |     "                                                      train_tfidf_features.tolil(), \n",
232 |     "                                                      safe_type.values,\n",
233 |     "                                                      test_tfidf_features.tolil(),\n",
234 |     "                                                      10)\n",
235 |     "    with open(\"ret_value_etc_model_oof_train.csv\", \"wb\") as fp:\n",
236 |     "        pickle.dump(etc_model_oof_train, fp)\n",
237 |     "    with open(\"ret_value_etc_model_oof_test.csv\", \"wb\") as fp:\n",
238 |     "        pickle.dump(etc_model_oof_test, fp)\n",
239 |     "    print(\"etc success!\")\n",
240 |     "except:\n",
241 |     "    print(\"etc error!\")\n",
242 |     "try:\n",
243 |     "    mnb_model_oof_train, mnb_model_oof_test = get_oof(mnb_model, \n",
244 |     "                                                      train_tfidf_features.tolil(), \n",
245 |     "                                                      safe_type.values,\n",
246 |     "                                                      test_tfidf_features.tolil(),\n",
247 |     "                                                      10)\n",
248 |     "    with open(\"ret_value_mnb_model_oof_train.csv\", \"wb\") as fp:\n",
249 |     "        pickle.dump(mnb_model_oof_train, fp)\n",
250 |     "    with open(\"ret_value_mnb_model_oof_test.csv\", \"wb\") as fp:\n",
251 |     "        pickle.dump(mnb_model_oof_test, fp)\n",
252 |     "    print(\"mnb success!\")\n",
253 |     "except:\n",
254 |     "    print(\"mnb error!\")\n",
255 |     "    \n",
256 |     "try:\n",
257 |     "    ada_model_oof_train, ada_model_oof_test = get_oof(ada_model, \n",
258 |     "                                                      train_tfidf_features.tolil(), \n",
259 |     "                                                      safe_type.values,\n",
260 |     "                                                      test_tfidf_features.tolil(),\n",
261 |     "                                                      10)\n",
262 |     "    with open(\"ret_value_ada_model_oof_train.csv\", \"wb\") as fp:\n",
263 |     "        pickle.dump(ada_model_oof_train, fp)\n",
264 |     "    with open(\"ret_value_ada_model_oof_test.csv\", \"wb\") as fp:\n",
265 |     "        pickle.dump(ada_model_oof_test, fp)\n",
266 |     "    print(\"ada success!\")\n",
267 |     "except:\n",
268 |     "    print(\"ada error!\")\n",
269 |     "\n",
270 |     "\n",
271 |     "ret_value_stacking_train_10 = np.hstack([lr_model_oof_train, gbc_model_oof_train, bc_model_oof_train,\n",
272 |     "                            svm_model_oof_train, xgb_model_oof_train, dt_model_oof_train,\n",
273 |     "                            rfc_model_oof_train, etc_model_oof_train, mnb_model_oof_train,\n",
274 |     "                            ada_model_oof_train])\n",
275 |     "ret_value_stacking_test_10 = np.hstack([lr_model_oof_test, gbc_model_oof_test, bc_model_oof_test,\n",
276 |     "                           svm_model_oof_test, xgb_model_oof_test, dt_model_oof_test,\n",
277 |     "                           rfc_model_oof_test, etc_model_oof_test, mnb_model_oof_test,\n",
278 |     "                           ada_model_oof_test])\n",
279 |     "with open(\"ret_value_stacking_train_10.pkl\", \"wb\") as fp:\n",
280 |     "    pickle.dump(ret_value_stacking_train_10, fp)\n",
281 |     "    \n",
282 |     "with open(\"ret_value_stacking_test_10.pkl\", \"wb\") as fp:\n",
283 |     "    pickle.dump(ret_value_stacking_test_10, fp)"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": 2,
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": [
292 |     "import pandas as pd"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 11,
298 |    "metadata": {},
299 |    "outputs": [],
300 |    "source": [
301 |     "lr = pd.read_pickle(\"exinfos_lr_model_oof_train.csv\")"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": 10,
307 |    "metadata": {},
308 |    "outputs": [
309 |     {
310 |      "data": {
311 |       "text/plain": [
312 |        "9251.0"
313 |       ]
314 |      },
315 |      "execution_count": 10,
316 |      "metadata": {},
317 |      "output_type": "execute_result"
318 |     }
319 |    ],
320 |    "source": [
321 |     "lr.sum()"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "code",
326 |    "execution_count": 12,
327 |    "metadata": {},
328 |    "outputs": [
329 |     {
330 |      "data": {
331 |       "text/plain": [
332 |        "9464.0"
333 |       ]
334 |      },
335 |      "execution_count": 12,
336 |      "metadata": {},
337 |      "output_type": "execute_result"
338 |     }
339 |    ],
340 |    "source": [
341 |     "lr.sum()"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 6,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": [
350 |     "t = pd.read_pickle(\"api_name_and_ret_value_stacked_mix_train.pkl\")"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": 8,
356 |    "metadata": {},
357 |    "outputs": [
358 |     {
359 |      "data": {
360 |       "text/plain": [
361 |        "array([9669., 9726., 9800., 9817., 9992., 9650., 9872., 9582., 9836.,\n",
362 |        "       9844., 9128., 9116., 9575., 9559., 9885., 9487., 9613., 9308.,\n",
363 |        "       9653., 9425.])"
364 |       ]
365 |      },
366 |      "execution_count": 8,
367 |      "metadata": {},
368 |      "output_type": "execute_result"
369 |     }
370 |    ],
371 |    "source": [
372 |     "t.sum(axis=0)"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "metadata": {},
379 |    "outputs": [],
380 |    "source": []
381 |   }
382 |  ],
383 |  "metadata": {
384 |   "kernelspec": {
385 |    "display_name": "Python 3",
386 |    "language": "python",
387 |    "name": "python3"
388 |   },
389 |   "language_info": {
390 |    "codemirror_mode": {
391 |     "name": "ipython",
392 |     "version": 3
393 |    },
394 |    "file_extension": ".py",
395 |    "mimetype": "text/x-python",
396 |    "name": "python",
397 |    "nbconvert_exporter": "python",
398 |    "pygments_lexer": "ipython3",
399 |    "version": "3.6.7"
400 |   }
401 |  },
402 |  "nbformat": 4,
403 |  "nbformat_minor": 2
404 | }
405 | 


--------------------------------------------------------------------------------
/DataCon2019/code/stage2/DBSCAN.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import sklearn.cluster as skc
 4 | 
 5 | api_name_svded_train = pd.read_pickle("api_name_svded_features.pkl")
 6 | exinfos_svded_train = pd.read_pickle("exinfos_svded_features.pkl")
 7 | call_name_svded_train = pd.read_pickle("call_name_svded_features.pkl")
 8 | 
 9 | merge_data = np.hstack([api_name_svded_train, exinfos_svded_train, call_name_svded_train])
10 | dbscan = skc.DBSCAN()
11 | y_pred = dbscan.fit_predict(merge_data)
12 | 
13 | result = pd.DataFrame()
14 | result["id"] = pd.read_csv("id.csv", names=["id"])["id"]
15 | result["family_id"] = y_pred
16 | 
17 | result.to_csv("result.csv", encoding="utf-8", index=False)
18 | 


--------------------------------------------------------------------------------
/DataCon2019/code/stage2/feature_engineering.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import re\n",
 10 |     "import glob\n",
 11 |     "import time\n",
 12 |     "import numpy as np\n",
 13 |     "import pandas as pd\n",
 14 |     "import pickle\n",
 15 |     "import matplotlib.pyplot as plt\n",
 16 |     "\n",
 17 |     "from sklearn.model_selection import train_test_split\n",
 18 |     "from sklearn.model_selection import StratifiedKFold\n",
 19 |     "from sklearn.model_selection import cross_validate\n",
 20 |     "from sklearn.model_selection import GridSearchCV\n",
 21 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 22 |     "\n",
 23 |     "from sklearn import svm\n",
 24 |     "from sklearn import neighbors\n",
 25 |     "from sklearn import naive_bayes\n",
 26 |     "from sklearn.svm import LinearSVC\n",
 27 |     "# from xgboost import XGBClassifier\n",
 28 |     "from sklearn.cluster import KMeans\n",
 29 |     "from sklearn.decomposition import TruncatedSVD \n",
 30 |     "from sklearn.tree import DecisionTreeClassifier\n",
 31 |     "from sklearn.linear_model import LogisticRegression\n",
 32 |     "from sklearn.linear_model import LogisticRegressionCV\n",
 33 |     "from sklearn.tree import DecisionTreeClassifier\n",
 34 |     "from sklearn.gaussian_process import GaussianProcessClassifier\n",
 35 |     "\n",
 36 |     "from sklearn.ensemble import RandomForestClassifier\n",
 37 |     "from sklearn.ensemble import AdaBoostClassifier\n",
 38 |     "from sklearn.ensemble import BaggingClassifier\n",
 39 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
 40 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
 41 |     "from sklearn.ensemble import VotingClassifier\n",
 42 |     "\n",
 43 |     "from sklearn import metrics\n",
 44 |     "from sklearn.metrics import accuracy_score\n",
 45 |     "from sklearn.metrics import classification_report\n",
 46 |     "\n",
 47 |     "from sklearn.externals import joblib\n",
 48 |     "\n",
 49 |     "%config InlineBackend.figure_format = 'svg'\n",
 50 |     "%matplotlib inline\n",
 51 |     "\n",
 52 |     "import warnings\n",
 53 |     "warnings.filterwarnings(\"ignore\")"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 4,
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "def yield_origin_csv():\n",
 63 |     "    flag = 1\n",
 64 |     "    id_, api_name_list, exinfos_list = [], [], []\n",
 65 |     "    api_name_regex = re.compile('<action api_name=\"(.*?)\" call_name')\n",
 66 |     "    exinfos_regex = re.compile('<exInfo value=\"(.*?)\"')\n",
 67 |     "    for path in glob.glob(\"./stage2_dataset/*\"):\n",
 68 |     "        with open(path, \"r\") as fp:\n",
 69 |     "            xml = fp.read()\n",
 70 |     "        api_names = re.findall(api_name_regex, xml)\n",
 71 |     "        exinfos = re.findall(exinfos_regex, xml)\n",
 72 |     "        api_name_list.append(\" \".join(api_names))\n",
 73 |     "        dll_exinfos = [ef.split(\"\\\\\")[-1].split('.')[0] for ef in exinfos \n",
 74 |     "                       if ef.endswith(\".dll\") and ef.startswith(\"C:\")]\n",
 75 |     "        id_.append(path.split(\".\")[1].split(\"/\")[-1]) \n",
 76 |     "        exinfos_list.append(\" \".join(dll_exinfos))\n",
 77 |     "        \n",
 78 |     "        \n",
 79 |     "        if flag % 300 == 0:\n",
 80 |     "            print(flag)\n",
 81 |     "        flag += 1\n",
 82 |     "    df = pd.DataFrame()\n",
 83 |     "    df[\"id\"] = id_\n",
 84 |     "    df[\"api_name\"] = api_name_list\n",
 85 |     "    df[\"exinfos\"] = exinfos_list\n",
 86 |     "    return df"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 10,
 92 |    "metadata": {},
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "# data = yield_origin_csv()\n",
 96 |     "data = pd.read_csv(\"stage2_api_name_exinfos.csv\")"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 3,
102 |    "metadata": {},
103 |    "outputs": [
104 |     {
105 |      "data": {
106 |       "text/html": [
107 |        "<div>\n",
108 |        "<style scoped>\n",
109 |        "    .dataframe tbody tr th:only-of-type {\n",
110 |        "        vertical-align: middle;\n",
111 |        "    }\n",
112 |        "\n",
113 |        "    .dataframe tbody tr th {\n",
114 |        "        vertical-align: top;\n",
115 |        "    }\n",
116 |        "\n",
117 |        "    .dataframe thead th {\n",
118 |        "        text-align: right;\n",
119 |        "    }\n",
120 |        "</style>\n",
121 |        "<table border=\"1\" class=\"dataframe\">\n",
122 |        "  <thead>\n",
123 |        "    <tr style=\"text-align: right;\">\n",
124 |        "      <th></th>\n",
125 |        "      <th>id</th>\n",
126 |        "      <th>api_name</th>\n",
127 |        "      <th>exinfos</th>\n",
128 |        "    </tr>\n",
129 |        "  </thead>\n",
130 |        "  <tbody>\n",
131 |        "    <tr>\n",
132 |        "      <th>0</th>\n",
133 |        "      <td>3ec88410420dd913bf5676b2ba0ae4baa41dad0d55df9b...</td>\n",
134 |        "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...</td>\n",
135 |        "      <td>user32 gdi32 mfc42 msvcrt imm32 advapi32 rpcrt...</td>\n",
136 |        "    </tr>\n",
137 |        "    <tr>\n",
138 |        "      <th>1</th>\n",
139 |        "      <td>2dfd653c6b862500ff7c47615ad0725a8ce88ddb8ee083...</td>\n",
140 |        "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak...</td>\n",
141 |        "      <td>mpr advapi32 rpcrt4 secur32 user32 gdi32 imm32...</td>\n",
142 |        "    </tr>\n",
143 |        "    <tr>\n",
144 |        "      <th>2</th>\n",
145 |        "      <td>fb7ae8ad837ee4c2afc58bc321e6bfddb6564a6bce3743...</td>\n",
146 |        "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze Unp...</td>\n",
147 |        "      <td>user32 gdi32 advapi32 rpcrt4 secur32 oleaut32 ...</td>\n",
148 |        "    </tr>\n",
149 |        "    <tr>\n",
150 |        "      <th>3</th>\n",
151 |        "      <td>c97a29518ee63fecae29dd973941b8395bd3aaceb11c52...</td>\n",
152 |        "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...</td>\n",
153 |        "      <td>user32 gdi32 advapi32 rpcrt4 secur32 iphlpapi ...</td>\n",
154 |        "    </tr>\n",
155 |        "    <tr>\n",
156 |        "      <th>4</th>\n",
157 |        "      <td>fb146a3d534cfc36b325bc1c4d7995122b722eb5ae04d5...</td>\n",
158 |        "      <td>AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ...</td>\n",
159 |        "      <td>mfc42 msvcrt gdi32 user32 imm32 advapi32 rpcrt...</td>\n",
160 |        "    </tr>\n",
161 |        "  </tbody>\n",
162 |        "</table>\n",
163 |        "</div>"
164 |       ],
165 |       "text/plain": [
166 |        "                                                  id  \\\n",
167 |        "0  3ec88410420dd913bf5676b2ba0ae4baa41dad0d55df9b...   \n",
168 |        "1  2dfd653c6b862500ff7c47615ad0725a8ce88ddb8ee083...   \n",
169 |        "2  fb7ae8ad837ee4c2afc58bc321e6bfddb6564a6bce3743...   \n",
170 |        "3  c97a29518ee63fecae29dd973941b8395bd3aaceb11c52...   \n",
171 |        "4  fb146a3d534cfc36b325bc1c4d7995122b722eb5ae04d5...   \n",
172 |        "\n",
173 |        "                                            api_name  \\\n",
174 |        "0  AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...   \n",
175 |        "1  AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak...   \n",
176 |        "2  AnalyzeStart Fake_BeCreatedEx TryToAnalyze Unp...   \n",
177 |        "3  AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...   \n",
178 |        "4  AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ...   \n",
179 |        "\n",
180 |        "                                             exinfos  \n",
181 |        "0  user32 gdi32 mfc42 msvcrt imm32 advapi32 rpcrt...  \n",
182 |        "1  mpr advapi32 rpcrt4 secur32 user32 gdi32 imm32...  \n",
183 |        "2  user32 gdi32 advapi32 rpcrt4 secur32 oleaut32 ...  \n",
184 |        "3  user32 gdi32 advapi32 rpcrt4 secur32 iphlpapi ...  \n",
185 |        "4  mfc42 msvcrt gdi32 user32 imm32 advapi32 rpcrt...  "
186 |       ]
187 |      },
188 |      "execution_count": 3,
189 |      "metadata": {},
190 |      "output_type": "execute_result"
191 |     }
192 |    ],
193 |    "source": [
194 |     "data.head()"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 4,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "data.fillna(method=\"ffill\", inplace=True)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 5,
209 |    "metadata": {},
210 |    "outputs": [
211 |     {
212 |      "name": "stdout",
213 |      "output_type": "stream",
214 |      "text": [
215 |       "<class 'pandas.core.frame.DataFrame'>\n",
216 |       "RangeIndex: 60000 entries, 0 to 59999\n",
217 |       "Data columns (total 3 columns):\n",
218 |       "id          60000 non-null object\n",
219 |       "api_name    60000 non-null object\n",
220 |       "exinfos     60000 non-null object\n",
221 |       "dtypes: object(3)\n",
222 |       "memory usage: 1.4+ MB\n"
223 |      ]
224 |     }
225 |    ],
226 |    "source": [
227 |     "data.info()"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": 13,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": [
236 |     "api_name_vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=100000)\n",
237 |     "api_name_train_tfidf_features = api_name_vectorizer.fit_transform(data[\"api_name\"].tolist())\n",
238 |     "\n",
239 |     "exinfos_vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=100000)\n",
240 |     "exinfos_train_tfidf_features = exinfos_vectorizer.fit_transform(data[\"exinfos\"].tolist())"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 10,
246 |    "metadata": {},
247 |    "outputs": [],
248 |    "source": [
249 |     "with open(\"api_name_train_tfidf_features.pkl\", \"wb\") as fp:\n",
250 |     "    pickle.dump(api_name_train_tfidf_features, fp)\n",
251 |     "with open(\"exinfos_train_tfidf_features.pkl\", \"wb\") as fp:\n",
252 |     "    pickle.dump(exinfos_train_tfidf_features, fp)"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": 2,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "api_name_train_tfidf_features = pd.read_pickle(\"api_name_train_tfidf_features.pkl\")\n",
262 |     "exinfos_train_tfidf_features = pd.read_pickle(\"exinfos_svded_features.pkl\")"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": null,
268 |    "metadata": {},
269 |    "outputs": [],
270 |    "source": [
271 |     "svd = TruncatedSVD(n_components=1000, algorithm=\"arpack\", random_state=0)\n",
272 |     "svded_train = svd.fit_transform(api_name_train_tfidf_features.tolil())\n",
273 |     "svd = TruncatedSVD(n_components=10000, algorithm=\"arpack\", random_state=0)\n",
274 |     "exinfos_svded_train = svd.fit_transform(exinfos_train_tfidf_features.tolil())\n",
275 |     "with open(\"api_name_svded_10000_features.pkl\", \"wb\") as fp:\n",
276 |     "    pickle.dump(svded_train, fp)\n",
277 |     "with open(\"exinfos_svded_10000_features.pkl\", \"wb\") as fp:\n",
278 |     "    pickle.dump(exinfos_svded_train, fp)"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 3,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "api_name_svded_train = pd.read_pickle(\"api_name_svded_features.pkl\")\n",
288 |     "exinfos_svded_train = pd.read_pickle(\"exinfos_svded_features.pkl\")"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 4,
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "merge_data = np.hstack([api_name_svded_train, exinfos_svded_train])"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 5,
303 |    "metadata": {},
304 |    "outputs": [
305 |     {
306 |      "data": {
307 |       "text/plain": [
308 |        "(60000, 2000)"
309 |       ]
310 |      },
311 |      "execution_count": 5,
312 |      "metadata": {},
313 |      "output_type": "execute_result"
314 |     }
315 |    ],
316 |    "source": [
317 |     "merge_data.shape"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": 7,
323 |    "metadata": {},
324 |    "outputs": [],
325 |    "source": [
326 |     "kmeans = KMeans(n_clusters=50, random_state=0)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 8,
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "y_pred = kmeans.fit_predict(merge_data)"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "code",
340 |    "execution_count": 11,
341 |    "metadata": {},
342 |    "outputs": [],
343 |    "source": [
344 |     "result = pd.DataFrame()\n",
345 |     "result[\"id\"] = data[\"id\"]\n",
346 |     "result[\"family_id\"] = y_pred"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": 14,
352 |    "metadata": {},
353 |    "outputs": [],
354 |    "source": [
355 |     "result.to_csv(\"result.csv\", encoding=\"utf-8\", index=False)"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": 25,
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": [
364 |     "y_pred = pd.read_csv(\"result.csv\")[\"family_id\"]"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "code",
369 |    "execution_count": 44,
370 |    "metadata": {},
371 |    "outputs": [],
372 |    "source": [
373 |     "exinfos = pd.read_pickle(\"exinfos_svded_features.pkl\")"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": 6,
379 |    "metadata": {},
380 |    "outputs": [],
381 |    "source": [
382 |     "from sklearn.manifold import TSNE\n",
383 |     "\n",
384 |     "X_tsne = TSNE(n_components=2, random_state=33).fit_transform(merge_data)"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": null,
390 |    "metadata": {},
391 |    "outputs": [],
392 |    "source": [
393 |     "with open(\"api_name_exinfos_stne_data.pkl\", \"wb\") as fp:\n",
394 |     "    pickle.dump(X_tsne, fp)"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "code",
399 |    "execution_count": 2,
400 |    "metadata": {},
401 |    "outputs": [],
402 |    "source": [
403 |     "X_tsne =  pd.read_pickle(\"call_name_tsne_data.pkl\")"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "code",
408 |    "execution_count": 9,
409 |    "metadata": {},
410 |    "outputs": [],
411 |    "source": [
412 |     "font = {\"color\": \"darkred\",\n",
413 |     "        \"size\": 13, \n",
414 |     "        \"family\" : \"serif\"}\n",
415 |     "\n",
416 |     "plt.style.use(\"dark_background\")\n",
417 |     "plt.figure()\n",
418 |     "plt.scatter(X_tsne[:, 0], X_tsne[:, 1])\n",
419 |     "plt.title(\"origin_data_t-SNE\", fontdict=font)"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 10,
425 |    "metadata": {},
426 |    "outputs": [],
427 |    "source": [
428 |     "y_pred = pd.read_csv(\"34.78_k=100.csv\")[\"family_id\"]"
429 |    ]
430 |   },
431 |   {
432 |    "cell_type": "code",
433 |    "execution_count": 19,
434 |    "metadata": {},
435 |    "outputs": [],
436 |    "source": [
437 |     "font = {\"color\": \"darkred\",\n",
438 |     "        \"size\": 13, \n",
439 |     "        \"family\" : \"serif\"}\n",
440 |     "\n",
441 |     "plt.style.use(\"dark_background\")\n",
442 |     "plt.figure()\n",
443 |     "plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred.values, alpha=0.6, \n",
444 |     "            cmap=plt.cm.get_cmap('rainbow', 100))\n",
445 |     "plt.title(\"api_name_and_exinfos_t-SNE\", fontdict=font)\n",
446 |     "cbar = plt.colorbar() \n",
447 |     "cbar.set_label(label='family id', fontdict=font)\n",
448 |     "plt.clim(-5, 100)\n",
449 |     "plt.tight_layout()\n",
450 |     "plt.savefig(\"api_name_and_exinfos_TSNE.pdf\")"
451 |    ]
452 |   },
453 |   {
454 |    "cell_type": "code",
455 |    "execution_count": 21,
456 |    "metadata": {},
457 |    "outputs": [],
458 |    "source": [
459 |     "call_name_svded_features = pd.read_pickle(\"call_name_svded_features.pkl\")\n",
460 |     "api_name_svded_features = pd.read_pickle(\"api_name_svded_features.pkl\")\n",
461 |     "exinfos_svded_features = pd.read_pickle(\"exinfos_svded_features.pkl\")\n",
462 |     "merge_data = np.hstack([api_name_svded_features, exinfos_svded_features, call_name_svded_features])"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": 26,
468 |    "metadata": {},
469 |    "outputs": [],
470 |    "source": [
471 |     "kmeans = KMeans(n_clusters=100, random_state=0)\n",
472 |     "y_pred = kmeans.fit_predict(merge_data)"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": 27,
478 |    "metadata": {},
479 |    "outputs": [],
480 |    "source": [
481 |     "cluster = 100\n",
482 |     "result = pd.DataFrame()\n",
483 |     "result[\"id\"] = pd.read_csv(\"id.csv\", names=[\"id\"])[\"id\"]\n",
484 |     "result[\"family_id\"] = y_pred\n",
485 |     "\n",
486 |     "result.to_csv(f\"k-means_cluster={cluster}_result.csv\", encoding=\"utf-8\", index=False)"
487 |    ]
488 |   },
489 |   {
490 |    "cell_type": "code",
491 |    "execution_count": 48,
492 |    "metadata": {},
493 |    "outputs": [
494 |     {
495 |      "data": {
496 |       "text/plain": [
497 |        "2.6399999999999997"
498 |       ]
499 |      },
500 |      "execution_count": 48,
501 |      "metadata": {},
502 |      "output_type": "execute_result"
503 |     }
504 |    ],
505 |    "source": [
506 |     "10.53 - 7.89"
507 |    ]
508 |   },
509 |   {
510 |    "cell_type": "code",
511 |    "execution_count": 49,
512 |    "metadata": {},
513 |    "outputs": [
514 |     {
515 |      "data": {
516 |       "text/plain": [
517 |        "7.89"
518 |       ]
519 |      },
520 |      "execution_count": 49,
521 |      "metadata": {},
522 |      "output_type": "execute_result"
523 |     }
524 |    ],
525 |    "source": [
526 |     "2.63 * 3"
527 |    ]
528 |   },
529 |   {
530 |    "cell_type": "code",
531 |    "execution_count": null,
532 |    "metadata": {},
533 |    "outputs": [],
534 |    "source": []
535 |   }
536 |  ],
537 |  "metadata": {
538 |   "kernelspec": {
539 |    "display_name": "Python 3",
540 |    "language": "python",
541 |    "name": "python3"
542 |   },
543 |   "language_info": {
544 |    "codemirror_mode": {
545 |     "name": "ipython",
546 |     "version": 3
547 |    },
548 |    "file_extension": ".py",
549 |    "mimetype": "text/x-python",
550 |    "name": "python",
551 |    "nbconvert_exporter": "python",
552 |    "pygments_lexer": "ipython3",
553 |    "version": "3.6.7"
554 |   }
555 |  },
556 |  "nbformat": 4,
557 |  "nbformat_minor": 2
558 | }
559 | 


--------------------------------------------------------------------------------
/DataCon2019/code/stage2/for_cluster_kmeans.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import numpy as np
 3 | import pandas as pd
 4 | from sklearn.cluster import KMeans
 5 | 
 6 | call_name_svded_features = pd.read_pickle("call_name_svded_features.pkl")
 7 | api_name_svded_features = pd.read_pickle("api_name_svded_features.pkl")
 8 | exinfos_svded_features = pd.read_pickle("exinfos_svded_features.pkl")
 9 | merge_data = np.hstack([api_name_svded_features, exinfos_svded_features, call_name_svded_features])
10 | 
11 | for cluster in [50, 250, 300, 400, 500]:
12 |     kmeans = KMeans(n_clusters=cluster, random_state=0)
13 |     y_pred = kmeans.fit_predict(merge_data)
14 |     result = pd.DataFrame()
15 |     result["id"] = pd.read_csv("id.csv", names=["id"])["id"]
16 |     result["family_id"] = y_pred
17 | 
18 |     result.to_csv(f"k-means_cluster={cluster}_result.csv", encoding="utf-8", index=False)


--------------------------------------------------------------------------------
/DataCon2019/code/stage2/get_call_name_tfidf_features.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import numpy as np
 3 | import pandas as pd
 4 | from sklearn.manifold import TSNE
 5 | from sklearn.decomposition import TruncatedSVD 
 6 | from sklearn.feature_extraction.text import TfidfVectorizer
 7 | 
 8 | 
 9 | data = pd.read_csv("call_name.csv")
10 | call_name_vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)
11 | call_name_train_tfidf_features = call_name_vectorizer.fit_transform(data["call_name"].tolist())
12 | with open("call_name_tfidf_features.pkl", "wb") as fp:
13 |     pickle.dump(call_name_train_tfidf_features, fp)
14 |     
15 | svd = TruncatedSVD(n_components=1000, algorithm="arpack", random_state=0)
16 | call_name_svded_train = svd.fit_transform(call_name_train_tfidf_features.tolil())
17 | 
18 | with open("call_name_svded_features.pkl", "wb") as fp:
19 |     pickle.dump(call_name_svded_train, fp)
20 |     
21 | X_tsne = TSNE(n_components=2, random_state=33).fit_transform(call_name_svded_train)
22 | 
23 | with open("call_name_tsne_data.pkl", "wb") as fp:
24 |     pickle.dump(X_tsne, fp)


--------------------------------------------------------------------------------
/DataCon2019/code/stage2/plot_comparison.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | 
 6 | X_tsne =  pd.read_pickle("api_name_exinfos_call_name_tsne_data.pkl")
 7 | dbscan_y_pred = pd.read_csv("result.csv")["family_id"]
 8 | kmeans_50 = pd.read_csv("k-means_cluster=50_result.csv")["family_id"]
 9 | kmeans_100 = pd.read_csv("k-means_cluster=100_result.csv")["family_id"]
10 | kmeans_200 = pd.read_csv("k-means_cluster=200_result.csv")["family_id"]
11 | kmeans_250 = pd.read_csv("k-means_cluster=250_result.csv")["family_id"]
12 | kmeans_300 = pd.read_csv("k-means_cluster=300_result.csv")["family_id"]
13 | kmeans_400 = pd.read_csv("k-means_cluster=400_result.csv")["family_id"]
14 | kmeans_500 = pd.read_csv("k-means_cluster=500_result.csv")["family_id"]
15 | 
16 | font = {"color": "darkred",
17 |         "size": 25, 
18 |         "family" : "serif"}
19 | 
20 | plt.style.use("dark_background")
21 | plt.figure(figsize=(30, 25))
22 | 
23 | plt.subplot(3, 3, 1) 
24 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_50.values, alpha=0.6, 
25 |             cmap=plt.cm.get_cmap('rainbow', 50))
26 | plt.title("K-means_cluster=50_t-SNE", fontdict=font)
27 | cbar = plt.colorbar() 
28 | cbar.set_label(label='family id', fontdict=font)
29 | plt.clim(0, 50) 
30 | 
31 | plt.subplot(3, 3, 2)
32 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=dbscan_y_pred.values, alpha=0.6, 
33 |             cmap=plt.cm.get_cmap('rainbow', dbscan_y_pred.max()-dbscan_y_pred.min()))
34 | plt.title("DBSCAN_t-SNE", fontdict=font)
35 | cbar = plt.colorbar() 
36 | cbar.set_label(label='family id', fontdict=font)
37 | plt.clim(dbscan_y_pred.min(), 1000)
38 | 
39 | plt.subplot(3, 3, 3) 
40 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_100.values, alpha=0.6, 
41 |             cmap=plt.cm.get_cmap('rainbow', 100))
42 | plt.title("K-means_cluster=100_t-SNE", fontdict=font)
43 | cbar = plt.colorbar() 
44 | cbar.set_label(label='family id', fontdict=font)
45 | plt.clim(0, 100) 
46 | 
47 | plt.subplot(3, 3, 4) 
48 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_200.values, alpha=0.6, 
49 |             cmap=plt.cm.get_cmap('rainbow', 200))
50 | plt.title("K-means_cluster=200_t-SNE", fontdict=font)
51 | cbar = plt.colorbar() 
52 | cbar.set_label(label='family id', fontdict=font)
53 | plt.clim(0, 200) 
54 | 
55 | plt.subplot(3, 3, 5) 
56 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.6, c=np.ones(60000), cmap=plt.cm.get_cmap('rainbow', 1))
57 | plt.title("origin_data_t-SNE", fontdict=font)
58 | cbar = plt.colorbar(ticks=[0]) 
59 | cbar.set_label(label='color bar', fontdict=font)
60 | 
61 | plt.subplot(3, 3, 6) 
62 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_250.values, alpha=0.6, 
63 |             cmap=plt.cm.get_cmap('rainbow', 250))
64 | plt.title("K-means_cluster=250_t-SNE", fontdict=font)
65 | cbar = plt.colorbar() 
66 | cbar.set_label(label='family id', fontdict=font)
67 | plt.clim(0, 250) 
68 | 
69 | plt.subplot(3, 3, 7) 
70 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_300.values, alpha=0.6, 
71 |             cmap=plt.cm.get_cmap('rainbow', 300))
72 | plt.title("K-means_cluster=300_t-SNE", fontdict=font)
73 | cbar = plt.colorbar() 
74 | cbar.set_label(label='family id', fontdict=font)
75 | plt.clim(0, 300) 
76 | 
77 | plt.subplot(3, 3, 8) 
78 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_400.values, alpha=0.6, 
79 |             cmap=plt.cm.get_cmap('rainbow', 400))
80 | plt.title("K-means_cluster=400_t-SNE", fontdict=font)
81 | cbar = plt.colorbar() 
82 | cbar.set_label(label='family id', fontdict=font)
83 | plt.clim(0, 400) 
84 | 
85 | plt.subplot(3, 3, 9) 
86 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_500.values, alpha=0.6, 
87 |             cmap=plt.cm.get_cmap('rainbow', 500))
88 | plt.title("K-means_cluster=500_t-SNE", fontdict=font)
89 | cbar = plt.colorbar() 
90 | cbar.set_label(label='family id', fontdict=font)
91 | plt.clim(0, 500) 
92 | 
93 | plt.tight_layout()
94 | plt.savefig("K-means_and_DBSCAN_cluster_comparison.jpg")


--------------------------------------------------------------------------------
/DataCon2019/code/stage2/yield_call_name_api_name_exinfos_tsne.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import numpy as np
 3 | import pandas as pd
 4 | from sklearn.manifold import TSNE
 5 | from sklearn.decomposition import TruncatedSVD 
 6 | 
 7 | call_name_svded_features = pd.read_pickle("call_name_svded_features.pkl")
 8 | api_name_svded_features = pd.read_pickle("api_name_svded_features.pkl")
 9 | exinfos_svded_features = pd.read_pickle("exinfos_svded_features.pkl")
10 | merge_data = np.hstack([api_name_svded_features, exinfos_svded_features, call_name_svded_features])
11 | X_tsne = TSNE(n_components=2, random_state=33).fit_transform(merge_data)
12 | 
13 | with open("api_name_exinfos_tsne_call_name_data.pkl", "wb") as fp:
14 |     pickle.dump(X_tsne, fp)


--------------------------------------------------------------------------------
/DataCon2019/loom_大数据安全分析比赛决赛.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/loom_大数据安全分析比赛决赛.pdf


--------------------------------------------------------------------------------
/DataCon2019/useful/K-means_and_DBSCAN_cluster_comparison.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/K-means_and_DBSCAN_cluster_comparison.jpg


--------------------------------------------------------------------------------
/DataCon2019/useful/K-means_and_DBSCAN_cluster_comparison.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/K-means_and_DBSCAN_cluster_comparison.pdf


--------------------------------------------------------------------------------
/DataCon2019/useful/K-means_cluster_comparison.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/K-means_cluster_comparison.jpg


--------------------------------------------------------------------------------
/DataCon2019/useful/api_name_barh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/api_name_barh.pdf


--------------------------------------------------------------------------------
/DataCon2019/useful/call_pid_barh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/call_pid_barh.pdf


--------------------------------------------------------------------------------
/DataCon2019/useful/draw_origin_data.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/draw_origin_data.jpg


--------------------------------------------------------------------------------
/DataCon2019/useful/exinfos_barh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/exinfos_barh.pdf


--------------------------------------------------------------------------------
/DataCon2019/useful/rank.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/rank.png


--------------------------------------------------------------------------------
/DataCon2019/useful/ret_value_barh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2019/useful/ret_value_barh.pdf


--------------------------------------------------------------------------------
/DataCon2019/useful/table.md:
--------------------------------------------------------------------------------
 1 | |               算法名称               |              准确率              |              召回率               |     $\boldsymbol{F1\ Score}$      |
 2 | | :----------------------------------: | :------------------------------: | :-------------------------------: | :-------------------------------: |
 3 | |    $\boldsymbol{Random\ Forest}$     |            $0.9800 $             |            $0.98 00 $             |            $ 0.98 00 $            |
 4 | |        $\boldsymbol{XGBoost}$        |             $0.9800$             |             $0.9700$              |             $0.9 800$             |
 5 | |         $\boldsymbol{BPNN}$          |             $0.9635$             |             $0.9635$              |             $0.9635$              |
 6 | |    $\boldsymbol{Decission\ Tree}$    |            $0.95 00 $            |            $0.96 00 $             |             $0.950 0$             |
 7 | | $\boldsymbol{Logistic\ Regression}$  |             $0.9600$             |             $0.9700$              |             $0.9400$              |
 8 | |   $\boldsymbol{Naive\ Byes^{[1]}}$   |            $0.940 0$             |             $ 0.9500$             |             $0.9400 $             |
 9 | |   $\boldsymbol{Naive\ Byes^{[2]}}$   |             $0.9484$             |             $0.9484$              |             $0.9484$              |
10 | |         $\boldsymbol{GBDT}$          |            $0.97 00$             |             $0.95 00$             |             $0.9 600$             |
11 | |        $\boldsymbol{Bagging}$        |             $0.9700$             |             $0.9600$              |             $0.9700$              |
12 | |       $\boldsymbol{AdaBoost}$        |             $0.9521$             |             $0.9521$              |             $0.9521$              |
13 | |          $\boldsymbol{SVM}$          |            $0.9 700$             |             $0.9500 $             |             $0.9600 $             |
14 | | $\boldsymbol{Ensemble\ model^{[1]}}$ | $\boldsymbol{\color{red}0.9839}$ | $\boldsymbol{\color{red}0.9839}$  | $\boldsymbol{\color{red}0.9839}$  |
15 | | $\boldsymbol{Ensemble\ model^{[2]}}$ | $\boldsymbol{\color{red}0.9967}$ | $\boldsymbol{\color{red}0.9 967}$ | $\boldsymbol{\color{red}0.99 67}$ |
16 | 
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/DataCon2020/PPT/loom_2020DataCon大数据安全分析比赛分享.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/loom_2020DataCon大数据安全分析比赛分享.pptx


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/2020rank.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/2020rank.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/ROC_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/ROC_curve.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/black.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/black_white_pdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/black_white_pdf.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/decode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/decode.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/features_tsne.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/features_tsne.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/result1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/result1.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/result2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/result2.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/tfidf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/tfidf.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/time.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/train_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/train_flow.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/vb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/vb.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/white.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/xgb1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/xgb1.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/xgb2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/xgb2.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/xgb3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/xgb3.png


--------------------------------------------------------------------------------
/DataCon2020/PPT/picture/方差偏差均衡.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yhangf/DataCon/45a5a23757a65f286098ca92ea4945f36c32e435/DataCon2020/PPT/picture/方差偏差均衡.png


--------------------------------------------------------------------------------
/DataCon2020/codes/bagging.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.model_selection import StratifiedKFold
 3 | 
 4 | def bagging(model, x_train, y_train, x_test, n_splits):
 5 |     """
 6 |     :@param x_train: feature matrix.
 7 |     :type x_train: np.array(M X N) or list(M X N).
 8 |     :@param y_train: class label.
 9 |     :type y_train: np.array(M X 1).
10 |     :@param x_test: test set feature matrix.
11 |     :type x_test: np.array(M X N) or list(M X N).
12 |     :@param n_splits: K-fold parameter.
13 |     :type n_splits: int.
14 |     """
15 |     n_train, n_test = x_train.shape[0], x_test.shape[0]
16 |     # 随机划分数据
17 |     kf = StratifiedKFold(n_splits=n_splits, random_state=0)
18 |     oof_train = np.empty((n_train, ))
19 |     oof_test = np.empty((n_test, ))
20 |     oof_test_skf = np.empty((n_splits, n_test))
21 | 
22 |     # 训练第i个模型
23 |     for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
24 |         kf_x_train = x_train[train_index]
25 |         kf_y_train = y_train[train_index]
26 |         model.fit(kf_x_train, kf_y_train)
27 |         oof_test_skf[i, :] = model.predict(x_test)
28 |     # 对所有的模型结果进行集成
29 |     oof_test[:] = oof_test_skf.mean(axis=0)
30 |     return oof_test.reshape(-1, 1)
31 | 


--------------------------------------------------------------------------------
/DataCon2020/codes/get_id.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import pandas as pd
 3 | 
 4 | names = []
 5 | df = pd.DataFrame()
 6 | for path in glob.glob("/home/datacon/malware/YYY_step1/*"):
 7 |     names.append(path.split("/")[-1])
 8 | 
 9 | df["id"] = names
10 | df.to_csv("/home/jovyan/media_directory/test_id.csv", index=False, header=None, encoding="utf-8")


--------------------------------------------------------------------------------
/DataCon2020/codes/get_raw_test_data.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import glob
 3 | import pandas as pd
 4 | 
 5 | def get_string(directory, file_name):
 6 |     list_ = []
 7 |     df = pd.DataFrame()
 8 |     for path in glob.glob(f"{directory}/*"):
 9 |         with open(path, "rb") as fp:
10 |             string = fp.read().decode("utf-8", errors="ignore")
11 |         raw_words = re.findall("[a-zA-Z]+", string) 
12 |         words_space = " ".join(w for w in raw_words if 4 < len(w) < 20)
13 |         list_.append(words_space)
14 |     df["words"] = list_
15 |     df.to_csv(f"{file_name}.csv", index=False)
16 |     print(len(list_))
17 | 
18 | 
19 | get_string("/home/datacon/malware/YYY_step1", "/home/jovyan/media_directory/end_raw_test")


--------------------------------------------------------------------------------
/DataCon2020/codes/get_raw_train_data.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import pandas as pd
 3 | 
 4 | def get_train_data(label_file_path, data_db_path, file_name, media_directory):
 5 |     with open(label_file_path, "r") as fp:
 6 |         id_ = fp.read().split()
 7 |     list_ = []
 8 |     df = pd.DataFrame()
 9 |     for path in id_:
10 |         with open(f"{data_db_path}/{path}", "rb") as fp:
11 |             string = fp.read().decode("utf-8", errors="ignore")
12 |         raw_words = re.findall("[a-zA-Z]+", string) 
13 |         words_space = " ".join(w for w in raw_words if 4 < len(w) < 20)
14 |         list_.append(words_space)
15 |     df["words"] = list_
16 |     df.to_csv(f"{media_directory}/{file_name}.csv", index=False)
17 |     return df
18 | 
19 | def merge(black, white, file_name, media_directory):
20 |     train_raw_data = black.append(white)
21 |     train_raw_data["labels"] = [1 for _ in range(black.shape[0])] + [0 for _ in range(white.shape[0])]
22 |     train_raw_data.to_csv(f"{media_directory}/{file_name}.csv", index=False)
23 |     
24 | black = get_train_data("/home/datacon/malware/XXX/black.txt",
25 |                        "/home/datacon/malware/XXX/data",
26 |                        "black",
27 |                        "/home/jovyan/media_directory")
28 | print("black is over!")
29 | white = get_train_data("/home/datacon/malware/XXX/white.txt",
30 |                        "/home/datacon/malware/XXX/data",
31 |                        "white",
32 |                        "/home/jovyan/media_directory")
33 | print("white is over!")
34 | # black = pd.read_csv("/home/jovyan/media_directory/black.csv")
35 | # white = pd.read_csv("/home/jovyan/media_directory/white.csv")
36 | merge(black, white, "raw_train_data", "/home/jovyan/media_directory")


--------------------------------------------------------------------------------
/DataCon2020/codes/lgb_cv.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import lightgbm as lgb
  3 | from sklearn.model_selection import StratifiedKFold
  4 | 
  5 | 
  6 | # LightGBM模型+交叉验证
  7 | params =  {'boosting_type': 'gbdt',
  8 |            'objective': 'binary',
  9 |            'metric': 'binary_logloss',
 10 |            'learning_rate': 0.001,
 11 |            'num_leaves': 82,
 12 |            'max_depth': 8,
 13 |            'min_data_in_leaf': 64,
 14 |            'min_child_weight':1.435,
 15 |            'bagging_fraction': 0.785,
 16 |            'feature_fraction': 0.373,
 17 |            'bagging_freq': 22,
 18 |            'reg_lambda': 0.065,
 19 |            'reg_alpha': 0.797,
 20 |            'min_split_gain': 0.350,
 21 |            'nthread': 8,
 22 |            'seed': 42,
 23 |            'scale_pos_weight':1.15,
 24 |            'verbose': -1}
 25 | 
 26 | def get_lgb_oof(params, x_train, y_train, x_test, n_splits):
 27 |     n_train, n_test = x_train.shape[0], x_test.shape[0]
 28 |     kf = StratifiedKFold(n_splits=n_splits, random_state=0)
 29 |     oof_train = np.empty((n_train, ))
 30 |     oof_test = np.empty((n_test, ))
 31 |     oof_test_skf = np.empty((n_splits, n_test))
 32 |     for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
 33 |         kf_x_train = x_train[train_index]
 34 |         kf_y_train = y_train[train_index]
 35 |         kf_x_test = x_train[test_index]
 36 |         kf_y_test = y_train[test_index]
 37 |         train_matrix = lgb.Dataset(kf_x_train, label=kf_y_train)
 38 |         valid_matrix = lgb.Dataset(kf_x_test, label=kf_y_test)
 39 |         model = lgb.train(params, 
 40 |                           train_set=train_matrix, 
 41 |                           num_boost_round=80000, 
 42 |                           valid_sets=valid_matrix, 
 43 |                           verbose_eval=-1, 
 44 |                           early_stopping_rounds=600)
 45 |         oof_test_skf[i, :] = model.predict(x_test)
 46 |     oof_test[:] = oof_test_skf.mean(axis=0)
 47 |     return oof_test.reshape(-1, 1)
 48 | 
 49 | 
 50 | # 技巧提升
 51 | # 用一个较大的learning rate学习得到初始版本模型1；
 52 | # 用一个较小的learning rate在模型1上继续训练得到模型2；
 53 | params1 =  {'boosting_type': 'gbdt',
 54 |            'objective': 'binary',
 55 |            'metric': 'binary_logloss',
 56 |            'learning_rate': 0.025,
 57 |            "feature_fraction":0.5,
 58 |            "num_leaves": 200,
 59 |            "lambda_l1":2,
 60 |            "lambda_l2":2,
 61 |            "learning_rate":0.01,
 62 |            'min_child_samples': 50,
 63 |            "bagging_fraction":0.7,
 64 |            "bagging_freq":1,
 65 |            'verbose': -1}
 66 | 
 67 | params2 =  {'boosting_type': 'gbdt',
 68 |            'objective': 'binary',
 69 |            'metric': 'binary_logloss',
 70 |            'learning_rate': 0.001,
 71 |            'num_leaves': 82,
 72 |            'max_depth': 8,
 73 |            'min_data_in_leaf': 64,
 74 |            'min_child_weight': 1.435,
 75 |            'bagging_fraction': 0.785,
 76 |            'feature_fraction': 0.373,
 77 |            'bagging_freq': 22,
 78 |            'reg_lambda': 0.065,
 79 |            'reg_alpha': 0.797,
 80 |            'min_split_gain': 0.350,
 81 |            'nthread': 8,
 82 |            'seed': 42,
 83 |            'scale_pos_weight':1.15,
 84 |            'verbose': -1}
 85 | 
 86 | def get_lgb_oof(params1, params2, x_train, y_train, x_test, n_splits):
 87 |     n_train, n_test = x_train.shape[0], x_test.shape[0]
 88 |     kf = StratifiedKFold(n_splits=n_splits)
 89 |     oof_train = np.empty((n_train, ))
 90 |     oof_test = np.empty((n_test, ))
 91 |     oof_test_skf = np.empty((n_splits, n_test))
 92 |     for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
 93 |         kf_x_train = x_train[train_index]
 94 |         kf_y_train = y_train[train_index]
 95 |         kf_x_test = x_train[test_index]
 96 |         kf_y_test = y_train[test_index]
 97 |         train_matrix = lgb.Dataset(kf_x_train, label=kf_y_train)
 98 |         valid_matrix = lgb.Dataset(kf_x_test, label=kf_y_test)
 99 |         model1 = lgb.train(params1, 
100 |                            train_set=train_matrix, 
101 |                            num_boost_round=20000, 
102 |                            valid_sets=valid_matrix, 
103 |                            verbose_eval=-1, 
104 |                            early_stopping_rounds=200)
105 |         
106 |         model2 = lgb.train(params2, 
107 |                            train_set=train_matrix, 
108 |                            num_boost_round=20000, 
109 |                            valid_sets=valid_matrix, 
110 |                            init_model=model1,
111 |                            verbose_eval=-1, 
112 |                            early_stopping_rounds=200)
113 |         oof_test_skf[i, :] = model2.predict(x_test)
114 |     oof_test[:] = oof_test_skf.mean(axis=0)
115 |     return oof_test.reshape(-1, 1)
116 | 


--------------------------------------------------------------------------------
/DataCon2020/codes/plot.py:
--------------------------------------------------------------------------------
  1 | from sklearn import metrics
  2 | import seaborn as sns
  3 | import matplotlib.pyplot as plt
  4 | 
  5 | __author__ = "yhangf"
  6 | 
  7 | def plot_roc_curve(test_label, y_pred, *, model_name, save=True):
  8 |     """Calculate the AUC value of the model
  9 |        and drawing.
 10 |     :@param test_label: the actual label of the test set.
 11 |     :type test_label: the K dimension np.array.
 12 |     :@param y_pred: the predictive label of the model.
 13 |     :type y_pred: the K dimension np.array.
 14 |     :@param model_name: name of the model.
 15 |     :type model_name: str.
 16 |     :@param save: control the saving of images.
 17 |     :type save: bool.
 18 |     """
 19 | 
 20 |     font = {"color": "darkred", "size": 13, "family": "serif"}
 21 | 
 22 |     # calculate auc value
 23 |     fpr, tpr, _ = metrics.roc_curve(test_label, y_pred)
 24 |     auc = metrics.roc_auc_score(test_label, y_pred)
 25 | 
 26 |     # draw a roc curve
 27 |     with plt.style.context("bmh"):
 28 |         fig, ax = plt.subplots()
 29 |         ax.plot(
 30 |             fpr,
 31 |             tpr,
 32 |             label=f"{model_name} AUC = {auc:.5f}",
 33 |             color="steelblue",
 34 |             rasterized=True,
 35 |             linewidth=2,
 36 |         )
 37 | 
 38 |         ax.set_xlim([0.0, 1.0])
 39 |         ax.set_ylim([0.0, 1.05])
 40 |         ax.set_xlabel("False Positive Rate", fontdict=font)
 41 |         ax.set_ylabel("True Positive Rate", fontdict=font)
 42 |         ax.set_title("ROC curve", fontdict=font)
 43 |         ax.legend(loc="lower right")
 44 |         ax.tick_params(axis="both")
 45 |         plt.tight_layout()
 46 |         if save:
 47 |             fig.savefig(f"{model_name}_auc_curve.pdf")
 48 | 
 49 | 
 50 | def plot_multiple_roc_curve(
 51 |     test_label_array,
 52 |     y_pred_array,
 53 |     model_name_list,
 54 |     data_volume_list,
 55 |     *,
 56 |     col,
 57 |     width,
 58 |     height,
 59 |     save=True,
 60 | ):
 61 |     """Calculate the AUC value of the multiple model
 62 |        and drawing.
 63 |     :@param test_label_array: the actual label array of the test set.
 64 |     :type test_label_array: the MxK dimension np.array or list.
 65 |     :@param y_pred_array: the predictive label array of the model.
 66 |     :type y_pred: the MxK dimension np.array or list.
 67 |     :@param model_name_list: name list of the multiple model.
 68 |     :type model_name: list[str].
 69 |     :@param data_volume_list: the sample number of each training is listed.
 70 |     :type data_volume_list: list.
 71 |     :@param col: control the number of subgraphs.
 72 |     :type col: int.
 73 |     :@param width: the total width of the canvas.
 74 |     :type width: float.
 75 |     :@param height: the total height of the canvas.
 76 |     :type height: float.
 77 |     :@param save: control the saving of images.
 78 |     :type save: bool.
 79 |     """
 80 | 
 81 |     font = {"color": "#392f41", "size": 11, "family": "serif"}
 82 | 
 83 |     # calculate {tpr fpr auc} value and save as a list
 84 |     fpr_list, tpr_list, auc_list = [], [], []
 85 |     for test_label, y_pred in zip(test_label_array, y_pred_array):
 86 |         fpr, tpr, _ = metrics.roc_curve(test_label, y_pred)
 87 |         auc = metrics.roc_auc_score(test_label, y_pred)
 88 |         fpr_list.append(fpr)
 89 |         tpr_list.append(tpr)
 90 |         auc_list.append(auc)
 91 |     # calculate the number of rows in a subgraph
 92 |     if len(auc_list) % col:
 93 |         row = len(auc_list) // col + 1
 94 |     else:
 95 |         row = len(auc_list) // col
 96 | 
 97 |     with plt.style.context("bmh"):
 98 |         fig, axs = plt.subplots(row, col, figsize=(width, height))
 99 |         # while row or col is 1, add new dimension
100 |         if row == 1 or col == 1:
101 |             axs = axs[:, np.newaxis]
102 |         axs = [i for ax in axs for i in ax]  # modify the dimensions of axs
103 |         for ax, fpr, tpr, model_name, auc, volume in zip(
104 |             axs, fpr_list, tpr_list, model_name_list, auc_list, data_volume_list
105 |         ):
106 | 
107 |             ax.plot(
108 |                 fpr,
109 |                 tpr,
110 |                 label=f"{model_name} AUC = {auc:.5f}",
111 |                 color="steelblue",
112 |                 rasterized=True,
113 |                 linewidth=2,
114 |             )
115 | 
116 |             ax.set_xlim([0.0, 1.0])
117 |             ax.set_ylim([0.0, 1.05])
118 |             ax.set_xlabel("False Positive Rate", fontdict=font)
119 |             ax.set_ylabel("True Positive Rate", fontdict=font)
120 |             ax.set_title(f"ROC curve (Data volume {volume})", fontdict=font)
121 |             ax.legend(loc="lower right")
122 |             ax.tick_params(axis="both")
123 |             plt.tight_layout()
124 | 
125 |         if save:
126 |             fig.savefig("multiple_auc_curve.pdf")
127 | 
128 | def plot_train_test_data_pdf(train, 
129 |                              test, 
130 |                              rows, 
131 |                              cols,
132 |                              *,
133 |                              width=16, 
134 |                              height=8, 
135 |                              save=False
136 | ):
137 |     """Draw the distribution of corresponding features of training set 
138 |        and test set.
139 |        :@param train: training set.
140 |        :type train: pd.DataFrame.
141 |        :@param test: testing set.
142 |        :type test: pd.DataFrame.
143 |        :@param rows: controls the number of subgraphs in the row direction.
144 |        :type rows: int.
145 |        :@param cols: controls the number of subgraphs in the col direction.
146 |        :type cols: int.
147 |        :@param width: the total width of the canvas.
148 |        :type width: float.
149 |        :@param height: the total height of the canvas.
150 |        :type height: float.
151 |        :@param save: control the saving of images.
152 |        :type save: bool.
153 |     """
154 | 
155 |     font = {"size": 10, 
156 |             "family" : "serif"}
157 |     legend_font = {"family" : "serif",
158 |                    "size": 6}
159 |     with plt.style.context("bmh"):
160 |         plt.figure(figsize=(width, height), dpi=400)
161 |         for i, col in enumerate(train.columns):
162 |             ax = plt.subplot(rows, cols, i + 1)
163 |             sns.kdeplot(train[col], n_levels=2, color="darkred", shade=True, ax=ax)
164 |             sns.kdeplot(test[col], n_levels=2, color="steelblue", shade=True, ax=ax)
165 |             ax.set_xlabel(col, fontdict=font)
166 |             ax.set_ylabel("Density", fontdict=font)
167 |             ax.legend(["train","test"], loc="best", prop=legend_font)
168 |         plt.tight_layout()
169 |         
170 |     if save:
171 |         plt.savefig("pdf_curve.pdf")
172 | 


--------------------------------------------------------------------------------
/DataCon2020/codes/t_sne.py:
--------------------------------------------------------------------------------
 1 | from sklearn.manifold import TSNE
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | def plot_t_sne(train_tfidf_features):
 5 |     X_tsne = TSNE(n_components=2, perplexity=300, random_state=42).fit_transform(train_tfidf_features)
 6 |     font = {"size": 13, 
 7 |             "family" : "serif"}
 8 |     with plt.style.context("bmh"):
 9 |         fig, ax = plt.subplots(figsize=(8, 6))
10 |         ax.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.6, 
11 |                     cmap=plt.cm.get_cmap('rainbow', 2))
12 |         ax.set_title("Features Visualization", fontdict=font)
13 |         ax.set_ylim([-80, 81])
14 |         ax.set_xlim([-82, 81])
15 | 


--------------------------------------------------------------------------------
/DataCon2020/codes/test_train_model.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from xgboost import XGBClassifier
 4 | import joblib
 5 | from sklearn.model_selection import train_test_split
 6 | from sklearn.feature_extraction.text import TfidfVectorizer
 7 | 
 8 | 
 9 | def calc_score(y_true, y_pred, alpha=1.2):
10 |     y_true = np.array(y_true)
11 |     y_pred = np.array(y_pred)
12 |     y_true_black_index = {i for i in range(len(y_true)) if y_true[i] == 1}
13 |     y_pred_black_index = {i for i in range(len(y_pred)) if y_pred[i] == 1}
14 |     y_true_white_index = {i for i in range(len(y_true)) if y_true[i] == 0}
15 |     y_pred_white_index = {i for i in range(len(y_pred)) if y_pred[i] == 0}
16 |     
17 |     black_is_black = len(y_true_black_index & y_pred_black_index)
18 |     black_is_white = len(y_true_black_index & y_pred_white_index)
19 |     white_is_black = len(y_true_white_index & y_pred_black_index)
20 |     white_is_white = len(y_true_white_index & y_pred_white_index)
21 |     
22 |     recall = black_is_black / (black_is_black + black_is_white) 
23 |     error_ratio = white_is_black / (white_is_black + white_is_white)
24 |     score = recall - alpha * error_ratio
25 |     return score
26 | 
27 | train_data_ = pd.read_pickle("/home/jovyan/media_directory/train_tfidf_features")
28 | train_labels = pd.read_pickle("/home/jovyan/media_directory/train_labels")
29 | 
30 | result = []
31 | 
32 | for i in range(20, 50):
33 |     train_data, test_data, train_label, test_label = train_test_split(train_data_, 
34 |                                                                       train_labels, 
35 |                                                                       test_size=0.25, 
36 |                                                                       random_state=i)
37 | 
38 |     _ = []
39 |     model = XGBClassifier(max_depth=5, n_estimators=90) 
40 |     model.fit(train_data, train_label)
41 |     y_pred = model.predict(test_data)
42 |     score = calc_score(test_label, y_pred)
43 |     _.append(score)
44 | 
45 |     model = XGBClassifier(max_depth=5, n_estimators=80) 
46 |     model.fit(train_data, train_label)
47 |     y_pred = model.predict(test_data)
48 | 
49 |     score = calc_score(test_label, y_pred)
50 |     _.append(score)
51 |     
52 |     model = XGBClassifier(max_depth=5, n_estimators=70) 
53 |     model.fit(train_data, train_label)
54 |     y_pred = model.predict(test_data)
55 |     score = calc_score(test_label, y_pred)
56 |     _.append(score)
57 | 
58 |     model = XGBClassifier(max_depth=5, n_estimators=60) 
59 |     model.fit(train_data, train_label)
60 |     y_pred = model.predict(test_data)
61 |     score = calc_score(test_label, y_pred)
62 |     _.append(score)
63 | 
64 |     model = XGBClassifier(max_depth=5, n_estimators=50) 
65 |     model.fit(train_data, train_label)
66 |     y_pred = model.predict(test_data)
67 |     score = calc_score(test_label, y_pred)
68 |     _.append(score)
69 | 
70 |     model = XGBClassifier(max_depth=5, n_estimators=40) 
71 |     model.fit(train_data, train_label)
72 |     y_pred = model.predict(test_data)
73 |     score = calc_score(test_label, y_pred)
74 |     _.append(score)
75 | 
76 |     model = XGBClassifier(max_depth=5, n_estimators=30) 
77 |     model.fit(train_data, train_label)
78 |     y_pred = model.predict(test_data)
79 |     score = calc_score(test_label, y_pred)
80 |     _.append(score)
81 |     
82 |     result.append(_)
83 |     
84 | print(np.vstack(result).mean(axis=0))
85 | 
86 | 


--------------------------------------------------------------------------------
/DataCon2020/codes/xgb_bagging.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from xgboost import XGBClassifier
 3 | 
 4 | # 适用于训练数据较少，且预测值抖动现象明显的场合
 5 | result = []
 6 | for i in np.random.randint(0xFFFFF, size=10):
 7 |     train_data, test_data, train_label, test_label = train_test_split(train_tfidf_features, 
 8 |                                                                       labels, 
 9 |                                                                       test_size=0.2, 
10 |                                                                       random_state=i)
11 | 
12 |     model = XGBClassifier(n_estimators=100) 
13 |     model.fit(train_data, train_label)
14 |     y_pred = model.predict(test_tfidf_features)
15 |     result.append(y_pred)
16 | y_pred = np.array(result).mean(axis=0)
17 | y_pred_end = [1 if i >= 0.5 else 0 for i in y_pred]
18 | 


--------------------------------------------------------------------------------
/DataCon2020/codes/yield_end_result.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from xgboost import XGBClassifier
 4 | import joblib
 5 | from sklearn.model_selection import train_test_split
 6 | from sklearn.feature_extraction.text import TfidfVectorizer
 7 | 
 8 | 
 9 | with open("/home/jovyan/models/tfidf_model", "rb") as fp:
10 |     vectorizer = joblib.load(fp)
11 | with open("/home/jovyan/models/train_model", "rb") as fp:
12 |     model = joblib.load(fp)
13 | 
14 | test_data_ = pd.read_csv("/home/jovyan/media_directory/end_raw_test.csv")
15 | id_ = pd.read_csv("/home/jovyan/media_directory/test_id.csv", header=None)
16 | 
17 | test_tfidf_features = vectorizer.transform(test_data_.words.tolist())
18 | y_pred = model.predict(test_tfidf_features)
19 | 
20 | result = pd.DataFrame()
21 | result["id_"] = id_.values.flatten()
22 | result["y_pred"] = y_pred
23 | 
24 | result.to_csv("/home/jovyan/malware_final.txt", index=False, header=None)


--------------------------------------------------------------------------------
/DataCon2020/codes/yield_features.py:
--------------------------------------------------------------------------------
 1 | import joblib
 2 | import pickle
 3 | import pandas as pd
 4 | from sklearn.feature_extraction.text import TfidfVectorizer
 5 | 
 6 | train_data_ = pd.read_csv("media_directory/raw_train_data.csv")
 7 | 
 8 | vectorizer = TfidfVectorizer(min_df=3, max_df=0.9, max_features=3000)
 9 | train_tfidf_features = vectorizer.fit_transform(train_data_.words.tolist())
10 | 
11 | with open("/home/jovyan/models/tfidf_model", "wb") as fp:
12 |     joblib.dump(vectorizer, fp)
13 |     
14 | with open("/home/jovyan/media_directory/train_tfidf_features", "wb") as fp:
15 |     pickle.dump(train_tfidf_features, fp)
16 |     
17 | with open("/home/jovyan/media_directory/train_labels", "wb") as fp:
18 |     pickle.dump(train_data_.labels, fp)


--------------------------------------------------------------------------------
/DataCon2020/codes/yield_train_model.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from xgboost import XGBClassifier
 3 | import joblib
 4 | 
 5 | 
 6 | 
 7 | train_tfidf_features = pd.read_pickle("/home/jovyan/media_directory/train_tfidf_features")
 8 | labels = pd.read_pickle("/home/jovyan/media_directory/train_labels")
 9 | 
10 | model = XGBClassifier(n_estimators=400, learning_rate=0.05) 
11 | model.fit(train_tfidf_features, labels)
12 | 
13 | with open("/home/jovyan/models/train_model", "wb") as fp:
14 |     joblib.dump(model, fp)


--------------------------------------------------------------------------------
/DataCon2020/readme.md:
--------------------------------------------------------------------------------
1 | 执行setup_run.sh即可完成预测前的所有环境配置


--------------------------------------------------------------------------------
/DataCon2020/run.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | python3 /home/jovyan/codes/yield_end_result.py


--------------------------------------------------------------------------------
/DataCon2020/scripts/yield_raw_data.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | python3 /home/jovyan/codes/get_raw_test_data.py
4 | python3 /home/jovyan/codes/get_id.py


--------------------------------------------------------------------------------
/DataCon2020/setup_run.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | pip3 install sklearn
4 | pip3 install xgboost 
5 | pip3 install pandas
6 | 
7 | source /home/jovyan/scripts/yield_raw_data.sh
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # <p align="center">DataCon:beers:</p>
 2 | 
 3 | ```shell
 4 |                         _   .-')
 5 |                         ( '.( OO )_
 6 |                         ,--.      .-'),-----.  .-'),-----. ,--.   ,--.)
 7 |                         |  |.-') ( OO'  .-.  '( OO'  .-.  '|   `.'   |
 8 |                         |  | OO )/   |  | |  |/   |  | |  ||         |
 9 |                         |  |`-' |\_) |  |\|  |\_) |  |\|  ||  |'.'|  |
10 |                         (|  '---.'  \ |  | |  |  \ |  | |  ||  |   |  |
11 |                         |      |    `'  '-'  '   `'  '-'  '|  |   |  |
12 |                         `------'      `-----'      `-----' `--'   `--'
13 | ```
14 | > [DataCon2019大数据安全分析大赛](https://www.butian.net/datacon)方向二（恶意代码检测）冠军方案:rose::rose:，详细思路分享见[知乎](https://zhuanlan.zhihu.com/p/64252076)，[DataCon2020大数据安全分析大赛](https://datacon.qianxin.com/#integral)方向五（恶意代码分析）季军方案，详细思路分享见[知乎](https://zhuanlan.zhihu.com/p/185715807)，由于比赛时间仓促代码写得比较混乱，还请各位读者多多见谅！
15 | 
16 | ### DataCon2019综合积分榜排名（部分）
17 | 
18 | ![](https://github.com/yhangf/DataCon/blob/master/DataCon2019/useful/rank.png)
19 | 
20 | ### 源码
21 | 
22 | #### stage1
23 | 
24 | - [[deep_learning_model.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/deep_learning_model.ipynb)]
25 | - [[call_pid_tfidf_stacking.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/call_pid_tfidf_stacking.ipynb)]
26 | - [[exinfos.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/exinfos.ipynb)]
27 | - [[explore.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/explore.ipynb)]
28 | - [[feature_engineering.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/feature_engineering.ipynb)]
29 | - [[new_feature_engineering.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/new_feature_engineering.ipynb)]
30 | - [[out_of_fold.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/out_of_fold.ipynb)]
31 | - [[ret_value_stacking.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/ret_value_stacking.ipynb)]
32 | - [[stacking.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/stacking.ipynb)]
33 | - [[test.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage1/test.ipynb)]
34 | 
35 | #### stage2
36 | 
37 | - [[feature_engineering.ipynb](https://nbviewer.jupyter.org/github/yhangf/DataCon/blob/master/DataCon2019/code/stage2/feature_engineering.ipynb)]
38 | - [[for_cluster_kmeans.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/for_cluster_kmeans.py)]
39 | - [[get_call_name_tfidf_features.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/get_call_name_tfidf_features.py)]
40 | - [[plot_comparison.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/plot_comparison.py)]
41 | - [[yield_call_name_api_name_exinfos_tsne.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/yield_call_name_api_name_exinfos_tsne.py)]
42 | - [[DBSCAN.py](https://github.com/yhangf/DataCon/blob/master/DataCon2019/code/stage2/DBSCAN.py)]
43 | 
44 | ### DataCon2020综合积分榜排名（部分）
45 | 
46 | ![](https://github.com/yhangf/DataCon/blob/master/DataCon2020/PPT/picture/2020rank.png)
47 | 
48 | ### 源码
49 | 
50 | - [[get_id.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/get_id.py)]: 获取测试集的文件名
51 | - [[get_raw_test_data.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/get_raw_test_data.py)]: 获取测试集的原始字符串
52 | - [[get_raw_train_data.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/get_raw_train_data.py)]: 获取训练集的原始字符串
53 | - [[test_train_model.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/test_train_model.py)]: 测试训练的模型
54 | - [[yield_end_result.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/yield_end_result.py)]: 生成最终提交的结果
55 | - [[yield_features.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/yield_features.py)]: 由原始字符串生成特征矩阵
56 | - [[yield_train_model.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/yield_train_model.py)]: 生成训练模型
57 | - [[plot.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/plot.py)]: 绘图模块
58 | - [[t_sne.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/t_sne.py)]: 降维可视化模块
59 | - [[lgb_cv.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/lgb_cv.py)]: LightGBM模型+交叉验证
60 | - [[xgb_bagging.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/xgb_bagging.py)]: XGBoost模型+Bagging
61 | - [[bagging.py](https://github.com/yhangf/DataCon/blob/master/DataCon2020/codes/bagging.py)]: 经典Bagging框架代码
62 | 


--------------------------------------------------------------------------------