├── README.md
├── 用TF-IDF进行特征提取后进行机器学习.ipynb
└── 自定义特征后进行机器学习.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning-on-CSIC-2010
2 | 机器学习实战之CSIC2010网络攻击数据
3 | 


--------------------------------------------------------------------------------
/用TF-IDF进行特征提取后进行机器学习.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "def load_data(file):\n",
 10 |     "    with open(file, 'r', encoding='utf-8') as f:\n",
 11 |     "        data = f.readlines()\n",
 12 |     "    result = []\n",
 13 |     "    for d in data:\n",
 14 |     "        d = d.strip()\n",
 15 |     "        if len(d) > 0:\n",
 16 |     "            result.append(d)\n",
 17 |     "    return result"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 2,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "normal_requests = load_data('normal.txt')\n",
 27 |     "anomalous_requests = load_data('anomalous.txt')\n",
 28 |     "\n",
 29 |     "all_requests = normal_requests + anomalous_requests\n",
 30 |     "y_normal = [0] * len(normal_requests)\n",
 31 |     "y_anomalous = [1] * len(anomalous_requests)\n",
 32 |     "y = y_normal + y_anomalous"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 3,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 42 |     "from sklearn.model_selection import train_test_split"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 4,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "vectorizer = TfidfVectorizer(min_df=0.0, analyzer=\"word\", sublinear_tf=True)\n",
 52 |     "X = vectorizer.fit_transform(all_requests)"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 5,
 58 |    "metadata": {},
 59 |    "outputs": [
 60 |     {
 61 |      "data": {
 62 |       "text/plain": [
 63 |        "(61065, 33550)"
 64 |       ]
 65 |      },
 66 |      "execution_count": 5,
 67 |      "metadata": {},
 68 |      "output_type": "execute_result"
 69 |     }
 70 |    ],
 71 |    "source": [
 72 |     "#vectorizer.vocabulary_\n",
 73 |     "X.shape"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 19,
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "from sklearn.model_selection import train_test_split\n",
 83 |     "\n",
 84 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "# 1 k近邻"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 20,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "name": "stdout",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "Wall time: 49.6 ms\n",
104 |       "Parser   : 280 ms\n"
105 |      ]
106 |     },
107 |     {
108 |      "data": {
109 |       "text/plain": [
110 |        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
111 |        "                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
112 |        "                     weights='uniform')"
113 |       ]
114 |      },
115 |      "execution_count": 20,
116 |      "metadata": {},
117 |      "output_type": "execute_result"
118 |     }
119 |    ],
120 |    "source": [
121 |     "%%time\n",
122 |     "# Grid search stays commented out: its complexity was too high to produce a result\n",
123 |     "# from sklearn.model_selection import GridSearchCV\n",
124 |     "from sklearn.neighbors import KNeighborsClassifier\n",
125 |     "from sklearn.preprocessing import StandardScaler  # used by the scaling step below\n",
126 |     "\n",
127 |     "# Feature scaling (with_mean=False: scale without centering)\n",
128 |     "standardScalar = StandardScaler(with_mean=False)\n",
129 |     "standardScalar.fit(X_train)\n",
130 |     "X_train = standardScalar.transform(X_train)\n",
131 |     "X_test = standardScalar.transform(X_test)\n",
132 |     "\n",
133 |     "# # Grid-search parameters\n",
134 |     "# param_grid = [\n",
135 |     "#     {\n",
136 |     "#         'weights': ['uniform'],\n",
137 |     "#         'n_neighbors': [i for i in range(2, 11)] # starting from 1 tends to overfit\n",
138 |     "#     },\n",
139 |     "#     {\n",
140 |     "#         'weights': ['distance'],\n",
141 |     "#         'n_neighbors': [i for i in range(2, 11)],\n",
142 |     "#         'p': [i for i in range(1, 6)]\n",
143 |     "#     }\n",
144 |     "# ]\n",
145 |     "\n",
146 |     "# cv is itself a hyperparameter: larger is usually better but takes longer to train\n",
147 |     "#grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=-1, cv=5)\n",
148 |     "knn_clf = KNeighborsClassifier()\n",
149 |     "knn_clf.fit(X_train, y_train)"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 21,
155 |    "metadata": {},
156 |    "outputs": [
157 |     {
158 |      "data": {
159 |       "text/plain": [
160 |        "0.9233603537214443"
161 |       ]
162 |      },
163 |      "execution_count": 21,
164 |      "metadata": {},
165 |      "output_type": "execute_result"
166 |     }
167 |    ],
168 |    "source": [
169 |     "knn_clf.score(X_test, y_test)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 22,
175 |    "metadata": {
176 |     "scrolled": true
177 |    },
178 |    "outputs": [
179 |     {
180 |      "name": "stdout",
181 |      "output_type": "stream",
182 |      "text": [
183 |       "0.9194262813752373\n",
184 |       "0.8872379401587625\n",
185 |       "0.9030453697949038\n"
186 |      ]
187 |     }
188 |    ],
189 |    "source": [
190 |     "y_predict = knn_clf.predict(X_test)\n",
191 |     "\n",
192 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
193 |     "\n",
194 |     "print(precision_score(y_test, y_predict))\n",
195 |     "print(recall_score(y_test, y_predict))\n",
196 |     "print(f1_score(y_test, y_predict))"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "# 2 逻辑回归"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 14,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "from sklearn.linear_model import LogisticRegression\n",
213 |     "from sklearn.model_selection import GridSearchCV\n",
214 |     "\n",
215 |     "param_grid = [\n",
216 |     "    {\n",
217 |     "        'C': [0.1, 1, 3, 5, 7],\n",
218 |     "        'penalty': ['l1', 'l2']\n",
219 |     "    }\n",
220 |     "]\n",
221 |     "# liblinear accepts both 'l1' and 'l2'; the lbfgs default of scikit-learn >= 0.22 rejects 'l1'\n",
222 |     "grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, n_jobs=-1, cv=5)\n"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 15,
228 |    "metadata": {
229 |     "scrolled": true
230 |    },
231 |    "outputs": [
232 |     {
233 |      "name": "stderr",
234 |      "output_type": "stream",
235 |      "text": [
236 |       "C:\\Software\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
237 |       "  FutureWarning)\n"
238 |      ]
239 |     },
240 |     {
241 |      "name": "stdout",
242 |      "output_type": "stream",
243 |      "text": [
244 |       "Wall time: 50.9 s\n"
245 |      ]
246 |     },
247 |     {
248 |      "data": {
249 |       "text/plain": [
250 |        "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
251 |        "             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,\n",
252 |        "                                          fit_intercept=True,\n",
253 |        "                                          intercept_scaling=1, l1_ratio=None,\n",
254 |        "                                          max_iter=100, multi_class='warn',\n",
255 |        "                                          n_jobs=None, penalty='l2',\n",
256 |        "                                          random_state=None, solver='warn',\n",
257 |        "                                          tol=0.0001, verbose=0,\n",
258 |        "                                          warm_start=False),\n",
259 |        "             iid='warn', n_jobs=-1,\n",
260 |        "             param_grid=[{'C': [0.1, 1, 3, 5, 7], 'penalty': ['l1', 'l2']}],\n",
261 |        "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
262 |        "             scoring=None, verbose=0)"
263 |       ]
264 |      },
265 |      "execution_count": 15,
266 |      "metadata": {},
267 |      "output_type": "execute_result"
268 |     }
269 |    ],
270 |    "source": [
271 |     "%%time\n",
272 |     "grid_search.fit(X_train, y_train)"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 12,
278 |    "metadata": {},
279 |    "outputs": [
280 |     {
281 |      "data": {
282 |       "text/plain": [
283 |        "0.9680463440596087"
284 |       ]
285 |      },
286 |      "execution_count": 12,
287 |      "metadata": {},
288 |      "output_type": "execute_result"
289 |     }
290 |    ],
291 |    "source": [
292 |     "grid_search.best_score_"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 13,
298 |    "metadata": {},
299 |    "outputs": [
300 |     {
301 |      "data": {
302 |       "text/plain": [
303 |        "{'C': 7, 'penalty': 'l2'}"
304 |       ]
305 |      },
306 |      "execution_count": 13,
307 |      "metadata": {},
308 |      "output_type": "execute_result"
309 |     }
310 |    ],
311 |    "source": [
312 |     "grid_search.best_params_"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": 14,
318 |    "metadata": {},
319 |    "outputs": [
320 |     {
321 |      "data": {
322 |       "text/plain": [
323 |        "0.9737165315647262"
324 |       ]
325 |      },
326 |      "execution_count": 14,
327 |      "metadata": {},
328 |      "output_type": "execute_result"
329 |     }
330 |    ],
331 |    "source": [
332 |     "best_lr_clf = grid_search.best_estimator_\n",
333 |     "best_lr_clf.score(X_test, y_test)"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 16,
339 |    "metadata": {
340 |     "scrolled": false
341 |    },
342 |    "outputs": [
343 |     {
344 |      "name": "stdout",
345 |      "output_type": "stream",
346 |      "text": [
347 |       "0.9922813036020584\n",
348 |       "0.941990637085284\n",
349 |       "0.9664821969301451\n"
350 |      ]
351 |     }
352 |    ],
353 |    "source": [
354 |     "y_predict = best_lr_clf.predict(X_test)\n",
355 |     "\n",
356 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
357 |     "\n",
358 |     "print(precision_score(y_test, y_predict))\n",
359 |     "print(recall_score(y_test, y_predict))\n",
360 |     "print(f1_score(y_test, y_predict))"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "markdown",
365 |    "metadata": {},
366 |    "source": [
367 |     "# 3 决策树"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": 17,
373 |    "metadata": {},
374 |    "outputs": [],
375 |    "source": [
376 |     "from sklearn.tree import DecisionTreeClassifier\n",
377 |     "\n",
378 |     "param_grid = [\n",
379 |     "    {\n",
380 |     "        'max_depth':[i for i in range(1, 10)],\n",
381 |     "        'min_samples_leaf':[i for i in range(1, 20)],\n",
382 |     "        'min_samples_split':[i for i in range(10, 30)],\n",
383 |     "    }\n",
384 |     "]\n",
385 |     "\n",
386 |     "grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, n_jobs=-1, cv=5)"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": 18,
392 |    "metadata": {},
393 |    "outputs": [
394 |     {
395 |      "name": "stdout",
396 |      "output_type": "stream",
397 |      "text": [
398 |       "Wall time: 1h 7min 6s\n"
399 |      ]
400 |     },
401 |     {
402 |      "data": {
403 |       "text/plain": [
404 |        "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
405 |        "             estimator=DecisionTreeClassifier(class_weight=None,\n",
406 |        "                                              criterion='gini', max_depth=None,\n",
407 |        "                                              max_features=None,\n",
408 |        "                                              max_leaf_nodes=None,\n",
409 |        "                                              min_impurity_decrease=0.0,\n",
410 |        "                                              min_impurity_split=None,\n",
411 |        "                                              min_samples_leaf=1,\n",
412 |        "                                              min_samples_split=2,\n",
413 |        "                                              min_weight_fraction_leaf=0.0,\n",
414 |        "                                              presort=False, random_state=None,\n",
415 |        "                                              splitter='best'),\n",
416 |        "             iid='warn', n_jobs=-1,\n",
417 |        "             param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],\n",
418 |        "                          'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,\n",
419 |        "                                               11, 12, 13, 14, 15, 16, 17, 18,\n",
420 |        "                                               19],\n",
421 |        "                          'min_samples_split': [10, 11, 12, 13, 14, 15, 16, 17,\n",
422 |        "                                                18, 19, 20, 21, 22, 23, 24, 25,\n",
423 |        "                                                26, 27, 28, 29]}],\n",
424 |        "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
425 |        "             scoring=None, verbose=0)"
426 |       ]
427 |      },
428 |      "execution_count": 18,
429 |      "metadata": {},
430 |      "output_type": "execute_result"
431 |     }
432 |    ],
433 |    "source": [
434 |     "%%time\n",
435 |     "grid_search.fit(X_train, y_train)"
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": 19,
441 |    "metadata": {},
442 |    "outputs": [
443 |     {
444 |      "data": {
445 |       "text/plain": [
446 |        "0.8979775648898715"
447 |       ]
448 |      },
449 |      "execution_count": 19,
450 |      "metadata": {},
451 |      "output_type": "execute_result"
452 |     }
453 |    ],
454 |    "source": [
455 |     "grid_search.best_score_"
456 |    ]
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": 20,
461 |    "metadata": {},
462 |    "outputs": [
463 |     {
464 |      "data": {
465 |       "text/plain": [
466 |        "{'max_depth': 9, 'min_samples_leaf': 19, 'min_samples_split': 10}"
467 |       ]
468 |      },
469 |      "execution_count": 20,
470 |      "metadata": {},
471 |      "output_type": "execute_result"
472 |     }
473 |    ],
474 |    "source": [
475 |     "grid_search.best_params_"
476 |    ]
477 |   },
478 |   {
479 |    "cell_type": "code",
480 |    "execution_count": 21,
481 |    "metadata": {},
482 |    "outputs": [
483 |     {
484 |      "data": {
485 |       "text/plain": [
486 |        "0.90084336362892"
487 |       ]
488 |      },
489 |      "execution_count": 21,
490 |      "metadata": {},
491 |      "output_type": "execute_result"
492 |     }
493 |    ],
494 |    "source": [
495 |     "best_tree_clf = grid_search.best_estimator_\n",
496 |     "best_tree_clf.score(X_test, y_test)"
497 |    ]
498 |   },
499 |   {
500 |    "cell_type": "code",
501 |    "execution_count": 23,
502 |    "metadata": {
503 |     "scrolled": true
504 |    },
505 |    "outputs": [
506 |     {
507 |      "name": "stdout",
508 |      "output_type": "stream",
509 |      "text": [
510 |       "0.951904296875\n",
511 |       "0.7936087929981681\n",
512 |       "0.8655788655788657\n"
513 |      ]
514 |     }
515 |    ],
516 |    "source": [
517 |     "y_predict = best_tree_clf.predict(X_test)\n",
518 |     "\n",
519 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
520 |     "\n",
521 |     "print(precision_score(y_test, y_predict))\n",
522 |     "print(recall_score(y_test, y_predict))\n",
523 |     "print(f1_score(y_test, y_predict))"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "markdown",
528 |    "metadata": {},
529 |    "source": [
530 |     "# 4 SVM"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": 25,
536 |    "metadata": {},
537 |    "outputs": [],
538 |    "source": [
539 |     "from sklearn.preprocessing import StandardScaler\n",
540 |     "\n",
541 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)\n",
542 |     "# 数据归一化\n",
543 |     "standardScalar = StandardScaler(with_mean=False)\n",
544 |     "standardScalar.fit(X_train)\n",
545 |     "X_train = standardScalar.transform(X_train)\n",
546 |     "X_test = standardScalar.transform(X_test)"
547 |    ]
548 |   },
549 |   {
550 |    "cell_type": "code",
551 |    "execution_count": 27,
552 |    "metadata": {},
553 |    "outputs": [
554 |     {
555 |      "name": "stderr",
556 |      "output_type": "stream",
557 |      "text": [
558 |       "C:\\Software\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
559 |       "  \"avoid this warning.\", FutureWarning)\n"
560 |      ]
561 |     },
562 |     {
563 |      "name": "stdout",
564 |      "output_type": "stream",
565 |      "text": [
566 |       "Wall time: 10min 22s\n"
567 |      ]
568 |     },
569 |     {
570 |      "data": {
571 |       "text/plain": [
572 |        "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
573 |        "    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',\n",
574 |        "    kernel='rbf', max_iter=-1, probability=False, random_state=None,\n",
575 |        "    shrinking=True, tol=0.001, verbose=False)"
576 |       ]
577 |      },
578 |      "execution_count": 27,
579 |      "metadata": {},
580 |      "output_type": "execute_result"
581 |     }
582 |    ],
583 |    "source": [
584 |     "%%time\n",
585 |     "from sklearn.svm import SVC\n",
586 |     "# Pin gamma to the value this run used; the library default becomes 'scale' in scikit-learn 0.22\n",
587 |     "svm_clf = SVC(gamma='auto')\n",
588 |     "svm_clf.fit(X_train, y_train)"
589 |    ]
590 |   },
591 |   {
592 |    "cell_type": "code",
593 |    "execution_count": 28,
594 |    "metadata": {},
595 |    "outputs": [
596 |     {
597 |      "data": {
598 |       "text/plain": [
599 |        "0.9745558011954475"
600 |       ]
601 |      },
602 |      "execution_count": 28,
603 |      "metadata": {},
604 |      "output_type": "execute_result"
605 |     }
606 |    ],
607 |    "source": [
608 |     "svm_clf.score(X_train, y_train)"
609 |    ]
610 |   },
611 |   {
612 |    "cell_type": "code",
613 |    "execution_count": 29,
614 |    "metadata": {},
615 |    "outputs": [
616 |     {
617 |      "data": {
618 |       "text/plain": [
619 |        "0.9619258167526407"
620 |       ]
621 |      },
622 |      "execution_count": 29,
623 |      "metadata": {},
624 |      "output_type": "execute_result"
625 |     }
626 |    ],
627 |    "source": [
628 |     "svm_clf.score(X_test, y_test)"
629 |    ]
630 |   },
631 |   {
632 |    "cell_type": "code",
633 |    "execution_count": 30,
634 |    "metadata": {},
635 |    "outputs": [
636 |     {
637 |      "name": "stdout",
638 |      "output_type": "stream",
639 |      "text": [
640 |       "0.9623700623700624\n",
641 |       "0.9421941787095461\n",
642 |       "0.9521752545510646\n"
643 |      ]
644 |     }
645 |    ],
646 |    "source": [
647 |     "y_predict = svm_clf.predict(X_test)\n",
648 |     "\n",
649 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
650 |     "\n",
651 |     "print(precision_score(y_test, y_predict))\n",
652 |     "print(recall_score(y_test, y_predict))\n",
653 |     "print(f1_score(y_test, y_predict))"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "markdown",
658 |    "metadata": {},
659 |    "source": [
660 |     "# 5 随机森林"
661 |    ]
662 |   },
663 |   {
664 |    "cell_type": "code",
665 |    "execution_count": 7,
666 |    "metadata": {},
667 |    "outputs": [],
668 |    "source": [
669 |     "from sklearn.ensemble  import RandomForestClassifier\n",
670 |     "\n",
671 |     "rf_clf = RandomForestClassifier(n_estimators=500,\n",
672 |     "                               random_state=666,\n",
673 |     "                               oob_score=True,\n",
674 |     "                               n_jobs=-1)"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "code",
679 |    "execution_count": 8,
680 |    "metadata": {},
681 |    "outputs": [
682 |     {
683 |      "name": "stdout",
684 |      "output_type": "stream",
685 |      "text": [
686 |       "Wall time: 2min 50s\n"
687 |      ]
688 |     },
689 |     {
690 |      "data": {
691 |       "text/plain": [
692 |        "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
693 |        "                       max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
694 |        "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
695 |        "                       min_samples_leaf=1, min_samples_split=2,\n",
696 |        "                       min_weight_fraction_leaf=0.0, n_estimators=500,\n",
697 |        "                       n_jobs=-1, oob_score=True, random_state=666, verbose=0,\n",
698 |        "                       warm_start=False)"
699 |       ]
700 |      },
701 |      "execution_count": 8,
702 |      "metadata": {},
703 |      "output_type": "execute_result"
704 |     }
705 |    ],
706 |    "source": [
707 |     "%%time\n",
708 |     "rf_clf.fit(X_train, y_train)"
709 |    ]
710 |   },
711 |   {
712 |    "cell_type": "code",
713 |    "execution_count": 9,
714 |    "metadata": {},
715 |    "outputs": [
716 |     {
717 |      "data": {
718 |       "text/plain": [
719 |        "0.9647916154916892"
720 |       ]
721 |      },
722 |      "execution_count": 9,
723 |      "metadata": {},
724 |      "output_type": "execute_result"
725 |     }
726 |    ],
727 |    "source": [
728 |     "rf_clf.score(X_test, y_test)"
729 |    ]
730 |   },
731 |   {
732 |    "cell_type": "code",
733 |    "execution_count": 10,
734 |    "metadata": {},
735 |    "outputs": [
736 |     {
737 |      "name": "stdout",
738 |      "output_type": "stream",
739 |      "text": [
740 |       "0.9618792499484855\n",
741 |       "0.9501323020557704\n",
742 |       "0.9559696907638747\n"
743 |      ]
744 |     }
745 |    ],
746 |    "source": [
747 |     "y_predict = rf_clf.predict(X_test)\n",
748 |     "\n",
749 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
750 |     "\n",
751 |     "print(precision_score(y_test, y_predict))\n",
752 |     "print(recall_score(y_test, y_predict))\n",
753 |     "print(f1_score(y_test, y_predict))"
754 |    ]
755 |   },
756 |   {
757 |    "cell_type": "code",
758 |    "execution_count": null,
759 |    "metadata": {},
760 |    "outputs": [],
761 |    "source": []
762 |   }
763 |  ],
764 |  "metadata": {
765 |   "kernelspec": {
766 |    "display_name": "Python 3",
767 |    "language": "python",
768 |    "name": "python3"
769 |   },
770 |   "language_info": {
771 |    "codemirror_mode": {
772 |     "name": "ipython",
773 |     "version": 3
774 |    },
775 |    "file_extension": ".py",
776 |    "mimetype": "text/x-python",
777 |    "name": "python",
778 |    "nbconvert_exporter": "python",
779 |    "pygments_lexer": "ipython3",
780 |    "version": "3.7.4"
781 |   }
782 |  },
783 |  "nbformat": 4,
784 |  "nbformat_minor": 2
785 | }
786 | 


--------------------------------------------------------------------------------
/自定义特征后进行机器学习.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "\n",
 11 |     "normal = np.loadtxt(\"vector_normal\")\n",
 12 |     "anomalous = np.loadtxt(\"vector_anomalous\")\n",
 13 |     "\n",
 14 |     "all_requests = np.concatenate([normal, anomalous])\n",
 15 |     "X = all_requests\n",
 16 |     "\n",
 17 |     "y_normal = np.zeros(shape=(normal.shape[0]), dtype='int')\n",
 18 |     "y_anomalous = np.ones(shape=(anomalous.shape[0]), dtype='int')\n",
 19 |     "y = np.concatenate([y_normal, y_anomalous])"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "# 1 K近邻算法"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "划分测试集和训练集"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 2,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "from sklearn.model_selection import train_test_split\n",
 43 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "markdown",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "### 网格搜索\n",
 51 |     "网格搜索将使用交叉验证的方式来评估超参数的所有可能的组合"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 3,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "from sklearn.model_selection import GridSearchCV\n",
 61 |     "from sklearn.neighbors import KNeighborsClassifier\n",
 62 |     "#from sklearn.pipeline import Pipeline\n",
 63 |     "from sklearn.preprocessing import StandardScaler\n",
 64 |     "\n",
 65 |     "# 数据归一化\n",
 66 |     "standardScalar = StandardScaler()\n",
 67 |     "standardScalar.fit(X_train)\n",
 68 |     "X_train = standardScalar.transform(X_train)\n",
 69 |     "X_test_std = standardScalar.transform(X_test)\n",
 70 |     "\n",
 71 |     "# 网格搜索的参数\n",
 72 |     "param_grid = [\n",
 73 |     "    {\n",
 74 |     "        'weights': ['uniform'],\n",
 75 |     "        'n_neighbors': [i for i in range(2, 11)] #从1开始容易过拟合\n",
 76 |     "    },\n",
 77 |     "    {\n",
 78 |     "        'weights': ['distance'],\n",
 79 |     "        'n_neighbors': [i for i in range(2, 11)],\n",
 80 |     "        'p': [i for i in range(1, 6)]\n",
 81 |     "    }\n",
 82 |     "]\n",
 83 |     "\n",
 84 |     "# cv其实也是一个超参数，一般越大越好，但是越大训练时间越长\n",
 85 |     "grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=-1, cv=5)\n",
 86 |     "\n",
 87 |     "# pipe_grid_knn = Pipeline([\n",
 88 |     "#     (\"sta_scaler\", StandardScaler()),\n",
 89 |     "#     (\"grid_sea\", grid_search)\n",
 90 |     "# ])"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 4,
 96 |    "metadata": {},
 97 |    "outputs": [
 98 |     {
 99 |      "name": "stdout",
100 |      "output_type": "stream",
101 |      "text": [
102 |       "Wall time: 52.1 s\n"
103 |      ]
104 |     },
105 |     {
106 |      "data": {
107 |       "text/plain": [
108 |        "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
109 |        "             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,\n",
110 |        "                                            metric='minkowski',\n",
111 |        "                                            metric_params=None, n_jobs=None,\n",
112 |        "                                            n_neighbors=5, p=2,\n",
113 |        "                                            weights='uniform'),\n",
114 |        "             iid='warn', n_jobs=-1,\n",
115 |        "             param_grid=[{'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
116 |        "                          'weights': ['uniform']},\n",
117 |        "                         {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
118 |        "                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],\n",
119 |        "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
120 |        "             scoring=None, verbose=0)"
121 |       ]
122 |      },
123 |      "execution_count": 4,
124 |      "metadata": {},
125 |      "output_type": "execute_result"
126 |     }
127 |    ],
128 |    "source": [
129 |     "%%time\n",
130 |     "grid_search.fit(X_train, y_train)"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 5,
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "data": {
140 |       "text/plain": [
141 |        "0.8759619101163076"
142 |       ]
143 |      },
144 |      "execution_count": 5,
145 |      "metadata": {},
146 |      "output_type": "execute_result"
147 |     }
148 |    ],
149 |    "source": [
150 |     "grid_search.best_score_"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 6,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/plain": [
161 |        "{'n_neighbors': 10, 'p': 3, 'weights': 'distance'}"
162 |       ]
163 |      },
164 |      "execution_count": 6,
165 |      "metadata": {},
166 |      "output_type": "execute_result"
167 |     }
168 |    ],
169 |    "source": [
170 |     "grid_search.best_params_"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 7,
176 |    "metadata": {},
177 |    "outputs": [
178 |     {
179 |      "data": {
180 |       "text/plain": [
181 |        "0.8733661278988053"
182 |       ]
183 |      },
184 |      "execution_count": 7,
185 |      "metadata": {},
186 |      "output_type": "execute_result"
187 |     }
188 |    ],
189 |    "source": [
190 |     "best_knn_clf = grid_search.best_estimator_\n",
191 |     "best_knn_clf.score(X_test_std, y_test)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "markdown",
196 |    "metadata": {},
197 |    "source": [
198 |     "### 分类结果评价"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": 8,
204 |    "metadata": {},
205 |    "outputs": [
206 |     {
207 |      "data": {
208 |       "text/plain": [
209 |        "array([[2735,  481],\n",
210 |        "       [ 420, 3479]], dtype=int64)"
211 |       ]
212 |      },
213 |      "execution_count": 8,
214 |      "metadata": {},
215 |      "output_type": "execute_result"
216 |     }
217 |    ],
218 |    "source": [
219 |     "from sklearn.metrics import confusion_matrix\n",
220 |     "\n",
221 |     "y_predict = best_knn_clf.predict(X_test_std)\n",
222 |     "confusion_matrix(y_test, y_predict)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 9,
228 |    "metadata": {},
229 |    "outputs": [
230 |     {
231 |      "name": "stdout",
232 |      "output_type": "stream",
233 |      "text": [
234 |       "0.8785353535353535\n",
235 |       "0.8922800718132855\n",
236 |       "0.8853543707850872\n"
237 |      ]
238 |     }
239 |    ],
240 |    "source": [
241 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
242 |     "\n",
243 |     "print(precision_score(y_test, y_predict))\n",
244 |     "print(recall_score(y_test, y_predict))\n",
245 |     "print(f1_score(y_test, y_predict))"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 10,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "X_test = X_test_std"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "markdown",
259 |    "metadata": {},
260 |    "source": [
261 |     "# 2 逻辑回归"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 10,
267 |    "metadata": {},
268 |    "outputs": [],
269 |    "source": [
270 |     "from sklearn.linear_model import LogisticRegression\n",
271 |     "\n",
272 |     "param_grid = [\n",
273 |     "    {\n",
274 |     "        'C': [0.1, 1, 3, 5, 7],\n",
275 |     "        'penalty': ['l1', 'l2']\n",
276 |     "    }\n",
277 |     "]\n",
278 |     "# Pin liblinear: it handles both penalties in the grid, whereas the lbfgs default (sklearn >= 0.22) rejects 'l1'.\n",
279 |     "grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, n_jobs=-1, cv=5)\n"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": 11,
285 |    "metadata": {},
286 |    "outputs": [
287 |     {
288 |      "name": "stdout",
289 |      "output_type": "stream",
290 |      "text": [
291 |       "Wall time: 2min 48s\n"
292 |      ]
293 |     },
294 |     {
295 |      "name": "stderr",
296 |      "output_type": "stream",
297 |      "text": [
298 |       "C:\\Software\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
299 |       "  FutureWarning)\n"
300 |      ]
301 |     },
302 |     {
303 |      "data": {
304 |       "text/plain": [
305 |        "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
306 |        "             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,\n",
307 |        "                                          fit_intercept=True,\n",
308 |        "                                          intercept_scaling=1, l1_ratio=None,\n",
309 |        "                                          max_iter=100, multi_class='warn',\n",
310 |        "                                          n_jobs=None, penalty='l2',\n",
311 |        "                                          random_state=None, solver='warn',\n",
312 |        "                                          tol=0.0001, verbose=0,\n",
313 |        "                                          warm_start=False),\n",
314 |        "             iid='warn', n_jobs=-1,\n",
315 |        "             param_grid=[{'C': [0.1, 1, 3, 5, 7], 'penalty': ['l1', 'l2']}],\n",
316 |        "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
317 |        "             scoring=None, verbose=0)"
318 |       ]
319 |      },
320 |      "execution_count": 11,
321 |      "metadata": {},
322 |      "output_type": "execute_result"
323 |     }
324 |    ],
325 |    "source": [
326 |     "%%time\n",
327 |     "grid_search.fit(X_train, y_train)"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": 13,
333 |    "metadata": {},
334 |    "outputs": [
335 |     {
336 |      "data": {
337 |       "text/plain": [
338 |        "0.6869882989563935"
339 |       ]
340 |      },
341 |      "execution_count": 13,
342 |      "metadata": {},
343 |      "output_type": "execute_result"
344 |     }
345 |    ],
346 |    "source": [
347 |     "grid_search.best_score_"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": 14,
353 |    "metadata": {},
354 |    "outputs": [
355 |     {
356 |      "data": {
357 |       "text/plain": [
358 |        "{'C': 7, 'penalty': 'l1'}"
359 |       ]
360 |      },
361 |      "execution_count": 14,
362 |      "metadata": {},
363 |      "output_type": "execute_result"
364 |     }
365 |    ],
366 |    "source": [
367 |     "grid_search.best_params_"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": 16,
373 |    "metadata": {},
374 |    "outputs": [
375 |     {
376 |      "data": {
377 |       "text/plain": [
378 |        "0.6929023190442727"
379 |       ]
380 |      },
381 |      "execution_count": 16,
382 |      "metadata": {},
383 |      "output_type": "execute_result"
384 |     }
385 |    ],
386 |    "source": [
387 |     "best_log_clf = grid_search.best_estimator_\n",
388 |     "best_log_clf.score(X_test, y_test)"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 17,
394 |    "metadata": {},
395 |    "outputs": [
396 |     {
397 |      "name": "stdout",
398 |      "output_type": "stream",
399 |      "text": [
400 |       "0.7455587392550144\n",
401 |       "0.6673506027186458\n",
402 |       "0.7042901610502098\n"
403 |      ]
404 |     }
405 |    ],
406 |    "source": [
407 |     "y_predict = best_log_clf.predict(X_test_std)\n",
408 |     "\n",
409 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
410 |     "\n",
411 |     "print(precision_score(y_test, y_predict))\n",
412 |     "print(recall_score(y_test, y_predict))\n",
413 |     "print(f1_score(y_test, y_predict))"
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "markdown",
418 |    "metadata": {},
419 |    "source": [
420 |     "# 3 决策树"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "code",
425 |    "execution_count": 18,
426 |    "metadata": {},
427 |    "outputs": [],
428 |    "source": [
429 |     "from sklearn.tree import DecisionTreeClassifier\n",
430 |     "\n",
431 |     "param_grid = [\n",
432 |     "    {\n",
433 |     "        'max_depth': list(range(1, 10)),\n",
434 |     "        'min_samples_leaf': list(range(1, 20)),\n",
435 |     "        'min_samples_split': list(range(10, 30)),\n",
436 |     "    }\n",
437 |     "]\n",
438 |     "# 9 x 19 x 20 = 3420 candidate settings, each scored with 5-fold cross-validation\n",
439 |     "grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, n_jobs=-1, cv=5)"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": 19,
445 |    "metadata": {},
446 |    "outputs": [
447 |     {
448 |      "name": "stdout",
449 |      "output_type": "stream",
450 |      "text": [
451 |       "Wall time: 3min 46s\n"
452 |      ]
453 |     },
454 |     {
455 |      "data": {
456 |       "text/plain": [
457 |        "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
458 |        "             estimator=DecisionTreeClassifier(class_weight=None,\n",
459 |        "                                              criterion='gini', max_depth=None,\n",
460 |        "                                              max_features=None,\n",
461 |        "                                              max_leaf_nodes=None,\n",
462 |        "                                              min_impurity_decrease=0.0,\n",
463 |        "                                              min_impurity_split=None,\n",
464 |        "                                              min_samples_leaf=1,\n",
465 |        "                                              min_samples_split=2,\n",
466 |        "                                              min_weight_fraction_leaf=0.0,\n",
467 |        "                                              presort=False, random_state=None,\n",
468 |        "                                              splitter='best'),\n",
469 |        "             iid='warn', n_jobs=-1,\n",
470 |        "             param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],\n",
471 |        "                          'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,\n",
472 |        "                                               11, 12, 13, 14, 15, 16, 17, 18,\n",
473 |        "                                               19],\n",
474 |        "                          'min_samples_split': [10, 11, 12, 13, 14, 15, 16, 17,\n",
475 |        "                                                18, 19, 20, 21, 22, 23, 24, 25,\n",
476 |        "                                                26, 27, 28, 29]}],\n",
477 |        "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
478 |        "             scoring=None, verbose=0)"
479 |       ]
480 |      },
481 |      "execution_count": 19,
482 |      "metadata": {},
483 |      "output_type": "execute_result"
484 |     }
485 |    ],
486 |    "source": [
487 |     "%%time\n",
488 |     "grid_search.fit(X_train, y_train)"
489 |    ]
490 |   },
491 |   {
492 |    "cell_type": "code",
493 |    "execution_count": 20,
494 |    "metadata": {},
495 |    "outputs": [
496 |     {
497 |      "data": {
498 |       "text/plain": [
499 |        "0.7973224638954285"
500 |       ]
501 |      },
502 |      "execution_count": 20,
503 |      "metadata": {},
504 |      "output_type": "execute_result"
505 |     }
506 |    ],
507 |    "source": [
508 |     "grid_search.best_score_"
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": 22,
514 |    "metadata": {},
515 |    "outputs": [
516 |     {
517 |      "data": {
518 |       "text/plain": [
519 |        "{'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 27}"
520 |       ]
521 |      },
522 |      "execution_count": 22,
523 |      "metadata": {},
524 |      "output_type": "execute_result"
525 |     }
526 |    ],
527 |    "source": [
528 |     "grid_search.best_params_"
529 |    ]
530 |   },
531 |   {
532 |    "cell_type": "code",
533 |    "execution_count": 23,
534 |    "metadata": {},
535 |    "outputs": [
536 |     {
537 |      "data": {
538 |       "text/plain": [
539 |        "0.8042164441321152"
540 |       ]
541 |      },
542 |      "execution_count": 23,
543 |      "metadata": {},
544 |      "output_type": "execute_result"
545 |     }
546 |    ],
547 |    "source": [
548 |     "best_tree_clf = grid_search.best_estimator_\n",
549 |     "best_tree_clf.score(X_test, y_test)"
550 |    ]
551 |   },
552 |   {
553 |    "cell_type": "code",
554 |    "execution_count": 24,
555 |    "metadata": {},
556 |    "outputs": [
557 |     {
558 |      "name": "stdout",
559 |      "output_type": "stream",
560 |      "text": [
561 |       "0.7658039881204921\n",
562 |       "0.9258784303667608\n",
563 |       "0.8382677348194589\n"
564 |      ]
565 |     }
566 |    ],
567 |    "source": [
568 |     "y_predict = best_tree_clf.predict(X_test_std)\n",
569 |     "\n",
570 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
571 |     "\n",
572 |     "print(precision_score(y_test, y_predict))\n",
573 |     "print(recall_score(y_test, y_predict))\n",
574 |     "print(f1_score(y_test, y_predict))"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "markdown",
579 |    "metadata": {},
580 |    "source": [
581 |     "决策树可以不用对原始数据进行缩放，但是上面的步骤进行了归一化操作，下面采用原始数据进行一次训练"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "code",
586 |    "execution_count": 25,
587 |    "metadata": {},
588 |    "outputs": [],
589 |    "source": [
590 |     "X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)"
591 |    ]
592 |   },
593 |   {
594 |    "cell_type": "code",
595 |    "execution_count": 27,
596 |    "metadata": {},
597 |    "outputs": [],
598 |    "source": [
599 |     "tree_clf = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1, min_samples_split=27)"
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "code",
604 |    "execution_count": 29,
605 |    "metadata": {},
606 |    "outputs": [
607 |     {
608 |      "data": {
609 |       "text/plain": [
610 |        "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,\n",
611 |        "                       max_features=None, max_leaf_nodes=None,\n",
612 |        "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
613 |        "                       min_samples_leaf=1, min_samples_split=27,\n",
614 |        "                       min_weight_fraction_leaf=0.0, presort=False,\n",
615 |        "                       random_state=None, splitter='best')"
616 |       ]
617 |      },
618 |      "execution_count": 29,
619 |      "metadata": {},
620 |      "output_type": "execute_result"
621 |     }
622 |    ],
623 |    "source": [
624 |     "tree_clf.fit(X_train_raw, y_train)"
625 |    ]
626 |   },
627 |   {
628 |    "cell_type": "code",
629 |    "execution_count": 33,
630 |    "metadata": {},
631 |    "outputs": [
632 |     {
633 |      "data": {
634 |       "text/plain": [
635 |        "0.80965599634562"
636 |       ]
637 |      },
638 |      "execution_count": 33,
639 |      "metadata": {},
640 |      "output_type": "execute_result"
641 |     }
642 |    ],
643 |    "source": [
644 |     "tree_clf.score(X_train_raw, y_train)"
645 |    ]
646 |   },
647 |   {
648 |    "cell_type": "code",
649 |    "execution_count": 35,
650 |    "metadata": {},
651 |    "outputs": [
652 |     {
653 |      "data": {
654 |       "text/plain": [
655 |        "0.8042164441321152"
656 |       ]
657 |      },
658 |      "execution_count": 35,
659 |      "metadata": {},
660 |      "output_type": "execute_result"
661 |     }
662 |    ],
663 |    "source": [
664 |     "tree_clf.score(X_test_raw, y_test)"
665 |    ]
666 |   },
667 |   {
668 |    "cell_type": "code",
669 |    "execution_count": null,
670 |    "metadata": {},
671 |    "outputs": [],
682 |    "source": [
683 |     "y_predict = tree_clf.predict(X_test_raw)  # tree_clf was fitted on X_train_raw; best_tree_clf came from the earlier grid search\n",
684 |     "\n",
685 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
686 |     "\n",
687 |     "print(precision_score(y_test, y_predict))\n",
688 |     "print(recall_score(y_test, y_predict))\n",
689 |     "print(f1_score(y_test, y_predict))"
690 |    ]
691 |   },
692 |   {
693 |    "cell_type": "markdown",
694 |    "metadata": {},
695 |    "source": [
696 |     "# 4 SVM"
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "code",
701 |    "execution_count": 37,
702 |    "metadata": {},
703 |    "outputs": [],
704 |    "source": [
705 |     "from sklearn.preprocessing import StandardScaler\n",
706 |     "\n",
707 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)\n",
708 |     "# 数据归一化\n",
709 |     "standardScalar = StandardScaler()\n",
710 |     "standardScalar.fit(X_train)\n",
711 |     "X_train = standardScalar.transform(X_train)\n",
712 |     "X_test = standardScalar.transform(X_test)"
713 |    ]
714 |   },
715 |   {
716 |    "cell_type": "code",
717 |    "execution_count": 48,
718 |    "metadata": {},
719 |    "outputs": [],
720 |    "source": [
721 |     "from sklearn.svm import SVC\n",
722 |     "\n",
723 |     "param_grid = [\n",
724 |     "    {\n",
725 |     "        'kernel': [\"poly\"],\n",
726 |     "        'degree': [1, 2, 3],\n",
727 |     "        'C': [0.1, 1, 3, 5]\n",
728 |     "    }\n",
729 |     "]\n",
730 |     "\n",
731 |     "grid_search = GridSearchCV(SVC(), param_grid, n_jobs=-1, cv=5)"
732 |    ]
733 |   },
734 |   {
735 |    "cell_type": "code",
736 |    "execution_count": 49,
737 |    "metadata": {},
738 |    "outputs": [
739 |     {
740 |      "name": "stdout",
741 |      "output_type": "stream",
742 |      "text": [
743 |       "Wall time: 8min 26s\n"
744 |      ]
745 |     },
746 |     {
747 |      "data": {
748 |       "text/plain": [
749 |        "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
750 |        "             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
751 |        "                           decision_function_shape='ovr', degree=3,\n",
752 |        "                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,\n",
753 |        "                           probability=False, random_state=None, shrinking=True,\n",
754 |        "                           tol=0.001, verbose=False),\n",
755 |        "             iid='warn', n_jobs=-1,\n",
756 |        "             param_grid=[{'C': [0.1, 1, 3, 5], 'degree': [1, 2, 3],\n",
757 |        "                          'kernel': ['poly']}],\n",
758 |        "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
759 |        "             scoring=None, verbose=0)"
760 |       ]
761 |      },
762 |      "execution_count": 49,
763 |      "metadata": {},
764 |      "output_type": "execute_result"
765 |     }
766 |    ],
767 |    "source": [
768 |     "%%time\n",
769 |     "grid_search.fit(X_train, y_train)"
770 |    ]
771 |   },
772 |   {
773 |    "cell_type": "code",
774 |    "execution_count": 50,
775 |    "metadata": {},
776 |    "outputs": [
777 |     {
778 |      "data": {
779 |       "text/plain": [
780 |        "{'C': 5, 'degree': 3, 'kernel': 'poly'}"
781 |       ]
782 |      },
783 |      "execution_count": 50,
784 |      "metadata": {},
785 |      "output_type": "execute_result"
786 |     }
787 |    ],
788 |    "source": [
789 |     "grid_search.best_params_"
790 |    ]
791 |   },
792 |   {
793 |    "cell_type": "code",
794 |    "execution_count": 51,
795 |    "metadata": {},
796 |    "outputs": [
797 |     {
798 |      "data": {
799 |       "text/plain": [
800 |        "0.7022734460100496"
801 |       ]
802 |      },
803 |      "execution_count": 51,
804 |      "metadata": {},
805 |      "output_type": "execute_result"
806 |     }
807 |    ],
808 |    "source": [
809 |     "grid_search.best_score_"
810 |    ]
811 |   },
812 |   {
813 |    "cell_type": "code",
814 |    "execution_count": 53,
815 |    "metadata": {},
816 |    "outputs": [
817 |     {
818 |      "data": {
819 |       "text/plain": [
820 |        "0.7114546732255798"
821 |       ]
822 |      },
823 |      "execution_count": 53,
824 |      "metadata": {},
825 |      "output_type": "execute_result"
826 |     }
827 |    ],
828 |    "source": [
829 |     "best_svm_clf = grid_search.best_estimator_\n",
830 |     "best_svm_clf.score(X_test, y_test)"
831 |    ]
832 |   },
833 |   {
834 |    "cell_type": "code",
835 |    "execution_count": 54,
836 |    "metadata": {},
837 |    "outputs": [
838 |     {
839 |      "name": "stdout",
840 |      "output_type": "stream",
841 |      "text": [
842 |       "0.7404898384575299\n",
843 |       "0.7289048473967684\n",
844 |       "0.7346516737753651\n"
845 |      ]
846 |     }
847 |    ],
848 |    "source": [
849 |     "y_predict = best_svm_clf.predict(X_test)\n",
850 |     "\n",
851 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
852 |     "\n",
853 |     "print(precision_score(y_test, y_predict))\n",
854 |     "print(recall_score(y_test, y_predict))\n",
855 |     "print(f1_score(y_test, y_predict))"
856 |    ]
857 |   },
858 |   {
859 |    "cell_type": "markdown",
860 |    "metadata": {},
861 |    "source": [
862 |     "# 5 随机森林"
863 |    ]
864 |   },
865 |   {
866 |    "cell_type": "code",
867 |    "execution_count": 3,
868 |    "metadata": {},
869 |    "outputs": [],
870 |    "source": [
871 |     "from sklearn.ensemble  import RandomForestClassifier"
872 |    ]
873 |   },
874 |   {
875 |    "cell_type": "code",
876 |    "execution_count": 4,
877 |    "metadata": {},
878 |    "outputs": [],
879 |    "source": [
880 |     "rf_clf = RandomForestClassifier(n_estimators=500,\n",
881 |     "                               random_state=666,\n",
882 |     "                               oob_score=True,\n",
883 |     "                               n_jobs=-1)"
884 |    ]
885 |   },
886 |   {
887 |    "cell_type": "code",
888 |    "execution_count": 5,
889 |    "metadata": {
890 |     "scrolled": false
891 |    },
892 |    "outputs": [
893 |     {
894 |      "name": "stdout",
895 |      "output_type": "stream",
896 |      "text": [
897 |       "Wall time: 3.77 s\n"
898 |      ]
899 |     },
900 |     {
901 |      "data": {
902 |       "text/plain": [
903 |        "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
904 |        "                       max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
905 |        "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
906 |        "                       min_samples_leaf=1, min_samples_split=2,\n",
907 |        "                       min_weight_fraction_leaf=0.0, n_estimators=500,\n",
908 |        "                       n_jobs=-1, oob_score=True, random_state=666, verbose=0,\n",
909 |        "                       warm_start=False)"
910 |       ]
911 |      },
912 |      "execution_count": 5,
913 |      "metadata": {},
914 |      "output_type": "execute_result"
915 |     }
916 |    ],
917 |    "source": [
918 |     "%%time\n",
919 |     "rf_clf.fit(X_train, y_train)"
920 |    ]
921 |   },
922 |   {
923 |    "cell_type": "code",
924 |    "execution_count": 8,
925 |    "metadata": {},
926 |    "outputs": [
927 |     {
928 |      "data": {
929 |       "text/plain": [
930 |        "0.947013352073085"
931 |       ]
932 |      },
933 |      "execution_count": 8,
934 |      "metadata": {},
935 |      "output_type": "execute_result"
936 |     }
937 |    ],
938 |    "source": [
939 |     "rf_clf.score(X_test, y_test)"
940 |    ]
941 |   },
942 |   {
943 |    "cell_type": "code",
944 |    "execution_count": 9,
945 |    "metadata": {},
946 |    "outputs": [
947 |     {
948 |      "name": "stdout",
949 |      "output_type": "stream",
950 |      "text": [
951 |       "0.9471813103098019\n",
952 |       "0.956655552705822\n",
953 |       "0.9518948577261708\n"
954 |      ]
955 |     }
956 |    ],
957 |    "source": [
958 |     "y_predict = rf_clf.predict(X_test)\n",
959 |     "\n",
960 |     "from sklearn.metrics import precision_score, recall_score, f1_score\n",
961 |     "\n",
962 |     "print(precision_score(y_test, y_predict))\n",
963 |     "print(recall_score(y_test, y_predict))\n",
964 |     "print(f1_score(y_test, y_predict))"
965 |    ]
966 |   },
967 |   {
968 |    "cell_type": "code",
969 |    "execution_count": null,
970 |    "metadata": {},
971 |    "outputs": [],
972 |    "source": []
973 |   }
974 |  ],
975 |  "metadata": {
976 |   "kernelspec": {
977 |    "display_name": "Python 3",
978 |    "language": "python",
979 |    "name": "python3"
980 |   },
981 |   "language_info": {
982 |    "codemirror_mode": {
983 |     "name": "ipython",
984 |     "version": 3
985 |    },
986 |    "file_extension": ".py",
987 |    "mimetype": "text/x-python",
988 |    "name": "python",
989 |    "nbconvert_exporter": "python",
990 |    "pygments_lexer": "ipython3",
991 |    "version": "3.7.4"
992 |   }
993 |  },
994 |  "nbformat": 4,
995 |  "nbformat_minor": 2
996 | }
997 | 


--------------------------------------------------------------------------------