├── README.md ├── 用TF-IDF进行特征提取后进行机器学习.ipynb └── 自定义特征后进行机器学习.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning-on-CSIC-2010 2 | 机器学习实战之CSIC2010网络攻击数据 3 | -------------------------------------------------------------------------------- /用TF-IDF进行特征提取后进行机器学习.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "def load_data(file):\n", 10 | " with open(file, 'r', encoding='utf-8') as f:\n", 11 | " data = f.readlines()\n", 12 | " result = []\n", 13 | " for d in data:\n", 14 | " d = d.strip()\n", 15 | " if len(d) > 0:\n", 16 | " result.append(d)\n", 17 | " return result" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "normal_requests = load_data('normal.txt')\n", 27 | "anomalous_requests = load_data('anomalous.txt')\n", 28 | "\n", 29 | "all_requests = normal_requests + anomalous_requests\n", 30 | "y_normal = [0] * len(normal_requests)\n", 31 | "y_anomalous = [1] * len(anomalous_requests)\n", 32 | "y = y_normal + y_anomalous" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 42 | "from sklearn.model_selection import train_test_split" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "vectorizer = TfidfVectorizer(min_df=0.0, analyzer=\"word\", sublinear_tf=True)\n", 52 | "X = vectorizer.fit_transform(all_requests)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "(61065, 33550)" 64 | ] 65 | }, 66 | "execution_count": 5, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "#vectorizer.vocabulary_\n", 73 | "X.shape" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 19, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from sklearn.model_selection import train_test_split\n", 83 | "\n", 84 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "# 1 k近邻" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 20, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "Wall time: 49.6 ms\n", 104 | "Parser : 280 ms\n" 105 | ] 106 | }, 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 111 | " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", 112 | " weights='uniform')" 113 | ] 114 | }, 115 | "execution_count": 20, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "%%time\n", 122 | "#复杂性太高,无法得出结果\n", 123 | "# from sklearn.model_selection import GridSearchCV\n", 124 | "from sklearn.neighbors import KNeighborsClassifier\n", 125 | "# from sklearn.preprocessing import StandardScaler\n", 126 | "\n", 127 | "# 数据归一化\n", 128 | "standardScalar = StandardScaler(with_mean=False)\n", 129 | "standardScalar.fit(X_train)\n", 130 | "X_train = standardScalar.transform(X_train)\n", 131 | "X_test = standardScalar.transform(X_test)\n", 132 | "\n", 133 | "# # 网格搜索的参数\n", 134 | "# param_grid = [\n", 135 | "# {\n", 136 | "# 'weights': ['uniform'],\n", 137 | "# 'n_neighbors': [i for i in range(2, 11)] #从1开始容易过拟合\n", 138 | "# },\n", 139 | "# {\n", 140 | "# 'weights': ['distance'],\n", 141 | "# 'n_neighbors': [i for i in range(2, 11)],\n", 142 | "# 'p': [i for i in range(1, 6)]\n", 143 | "# }\n", 144 | "# ]\n", 145 | "\n", 146 | "# cv其实也是一个超参数,一般越大越好,但是越大训练时间越长\n", 147 | "#grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=-1, cv=5)\n", 148 | "knn_clf = KNeighborsClassifier()\n", 149 | "knn_clf.fit(X_train, y_train)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 21, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "0.9233603537214443" 161 | ] 162 | }, 163 | "execution_count": 21, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "knn_clf.score(X_test, y_test)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 22, 175 | "metadata": { 176 | "scrolled": true 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "0.9194262813752373\n", 184 | "0.8872379401587625\n", 185 | "0.9030453697949038\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "y_predict = knn_clf.predict(X_test)\n", 191 | "\n", 192 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 193 | "\n", 194 | "print(precision_score(y_test, y_predict))\n", 195 | "print(recall_score(y_test, y_predict))\n", 196 | "print(f1_score(y_test, y_predict))" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "# 2 逻辑回归" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 14, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "from sklearn.linear_model import LogisticRegression\n", 213 | "from sklearn.model_selection import GridSearchCV\n", 214 | "\n", 215 | "param_grid = [\n", 216 | " {\n", 217 | " 'C': [0.1, 1, 3, 5, 7],\n", 218 | " 'penalty': ['l1', 'l2']\n", 219 | " }\n", 220 | "]\n", 221 | "\n", 222 | "grid_search = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1, cv=5)\n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 15, 228 | "metadata": { 229 | "scrolled": true 230 | }, 231 | "outputs": [ 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "C:\\Software\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 237 | " FutureWarning)\n" 238 | ] 239 | }, 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "Wall time: 50.9 s\n" 245 | ] 246 | }, 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n", 251 | " estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,\n", 252 | " fit_intercept=True,\n", 253 | " intercept_scaling=1, l1_ratio=None,\n", 254 | " max_iter=100, multi_class='warn',\n", 255 | " n_jobs=None, penalty='l2',\n", 256 | " random_state=None, solver='warn',\n", 257 | " tol=0.0001, verbose=0,\n", 258 | " warm_start=False),\n", 259 | " iid='warn', n_jobs=-1,\n", 260 | " param_grid=[{'C': [0.1, 1, 3, 5, 7], 'penalty': ['l1', 'l2']}],\n", 261 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 262 | " scoring=None, verbose=0)" 263 | ] 264 | }, 265 | "execution_count": 15, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "%%time\n", 272 | "grid_search.fit(X_train, y_train)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 12, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "0.9680463440596087" 284 | ] 285 | }, 286 | "execution_count": 12, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "grid_search.best_score_" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 13, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "{'C': 7, 'penalty': 'l2'}" 304 | ] 305 | }, 306 | "execution_count": 13, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "grid_search.best_params_" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 14, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "0.9737165315647262" 324 | ] 325 | }, 326 | "execution_count": 14, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "best_knn_clf = grid_search.best_estimator_\n", 333 | "best_knn_clf.score(X_test, y_test)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 16, 339 | "metadata": { 340 | "scrolled": false 341 | }, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "0.9922813036020584\n", 348 | "0.941990637085284\n", 349 | "0.9664821969301451\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "y_predict = best_knn_clf.predict(X_test)\n", 355 | "\n", 356 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 357 | "\n", 358 | "print(precision_score(y_test, y_predict))\n", 359 | "print(recall_score(y_test, y_predict))\n", 360 | "print(f1_score(y_test, y_predict))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "# 3 决策树" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 17, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "from sklearn.tree import DecisionTreeClassifier\n", 377 | "\n", 378 | "param_grid = [\n", 379 | " {\n", 380 | " 'max_depth':[i for i in range(1, 10)],\n", 381 | " 'min_samples_leaf':[i for i in range(1, 20)],\n", 382 | " 'min_samples_split':[i for i in range(10, 30)],\n", 383 | " }\n", 384 | "]\n", 385 | "\n", 386 | "grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, n_jobs=-1, cv=5)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 18, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "Wall time: 1h 7min 6s\n" 399 | ] 400 | }, 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n", 405 | " estimator=DecisionTreeClassifier(class_weight=None,\n", 406 | " criterion='gini', max_depth=None,\n", 407 | " max_features=None,\n", 408 | " max_leaf_nodes=None,\n", 409 | " min_impurity_decrease=0.0,\n", 410 | " min_impurity_split=None,\n", 411 | " min_samples_leaf=1,\n", 412 | " min_samples_split=2,\n", 413 | " min_weight_fraction_leaf=0.0,\n", 414 | " presort=False, random_state=None,\n", 415 | " splitter='best'),\n", 416 | " iid='warn', n_jobs=-1,\n", 417 | " param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],\n", 418 | " 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,\n", 419 | " 11, 12, 13, 14, 15, 16, 17, 18,\n", 420 | " 19],\n", 421 | " 'min_samples_split': [10, 11, 12, 13, 14, 15, 16, 17,\n", 422 | " 18, 19, 20, 21, 22, 23, 24, 25,\n", 423 | " 26, 27, 28, 29]}],\n", 424 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 425 | " scoring=None, verbose=0)" 426 | ] 427 | }, 428 | "execution_count": 18, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "%%time\n", 435 | "grid_search.fit(X_train, y_train)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 19, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "0.8979775648898715" 447 | ] 448 | }, 449 | "execution_count": 19, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "grid_search.best_score_" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 20, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "data": { 465 | "text/plain": [ 466 | "{'max_depth': 9, 'min_samples_leaf': 19, 'min_samples_split': 10}" 467 | ] 468 | }, 469 | "execution_count": 20, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "grid_search.best_params_" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 21, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "0.90084336362892" 487 | ] 488 | }, 489 | "execution_count": 21, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "best_tree_clf = grid_search.best_estimator_\n", 496 | "best_tree_clf.score(X_test, y_test)" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 23, 502 | "metadata": { 503 | "scrolled": true 504 | }, 505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "0.951904296875\n", 511 | "0.7936087929981681\n", 512 | "0.8655788655788657\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "y_predict = best_tree_clf.predict(X_test)\n", 518 | "\n", 519 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 520 | "\n", 521 | "print(precision_score(y_test, y_predict))\n", 522 | "print(recall_score(y_test, y_predict))\n", 523 | "print(f1_score(y_test, y_predict))" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "# 4 SVM" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 25, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "from sklearn.preprocessing import StandardScaler\n", 540 | "\n", 541 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)\n", 542 | "# 数据归一化\n", 543 | "standardScalar = StandardScaler(with_mean=False)\n", 544 | "standardScalar.fit(X_train)\n", 545 | "X_train = standardScalar.transform(X_train)\n", 546 | "X_test = standardScalar.transform(X_test)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 27, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "name": "stderr", 556 | "output_type": "stream", 557 | "text": [ 558 | "C:\\Software\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 559 | " \"avoid this warning.\", FutureWarning)\n" 560 | ] 561 | }, 562 | { 563 | "name": "stdout", 564 | "output_type": "stream", 565 | "text": [ 566 | "Wall time: 10min 22s\n" 567 | ] 568 | }, 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 573 | " decision_function_shape='ovr', degree=3, gamma='auto_deprecated',\n", 574 | " kernel='rbf', max_iter=-1, probability=False, random_state=None,\n", 575 | " shrinking=True, tol=0.001, verbose=False)" 576 | ] 577 | }, 578 | "execution_count": 27, 579 | "metadata": {}, 580 | "output_type": "execute_result" 581 | } 582 | ], 583 | "source": [ 584 | "%%time\n", 585 | "from sklearn.svm import SVC\n", 586 | "\n", 587 | "svm_clf = SVC()\n", 588 | "svm_clf.fit(X_train, y_train)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 28, 594 | "metadata": {}, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "0.9745558011954475" 600 | ] 601 | }, 602 | "execution_count": 28, 603 | "metadata": {}, 604 | "output_type": "execute_result" 605 | } 606 | ], 607 | "source": [ 608 | "svm_clf.score(X_train, y_train)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 29, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "data": { 618 | "text/plain": [ 619 | "0.9619258167526407" 620 | ] 621 | }, 622 | "execution_count": 29, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "svm_clf.score(X_test, y_test)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 30, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "name": "stdout", 638 | "output_type": "stream", 639 | "text": [ 640 | "0.9623700623700624\n", 641 | "0.9421941787095461\n", 642 | "0.9521752545510646\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "y_predict = svm_clf.predict(X_test)\n", 648 | "\n", 649 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 650 | "\n", 651 | "print(precision_score(y_test, y_predict))\n", 652 | "print(recall_score(y_test, y_predict))\n", 653 | "print(f1_score(y_test, y_predict))" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "# 5 随机森林" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 7, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "from sklearn.ensemble import RandomForestClassifier\n", 670 | "\n", 671 | "rf_clf = RandomForestClassifier(n_estimators=500,\n", 672 | " random_state=666,\n", 673 | " oob_score=True,\n", 674 | " n_jobs=-1)" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 8, 680 | "metadata": {}, 681 | "outputs": [ 682 | { 683 | "name": "stdout", 684 | "output_type": "stream", 685 | "text": [ 686 | "Wall time: 2min 50s\n" 687 | ] 688 | }, 689 | { 690 | "data": { 691 | "text/plain": [ 692 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 693 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 694 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 695 | " min_samples_leaf=1, min_samples_split=2,\n", 696 | " min_weight_fraction_leaf=0.0, n_estimators=500,\n", 697 | " n_jobs=-1, oob_score=True, random_state=666, verbose=0,\n", 698 | " warm_start=False)" 699 | ] 700 | }, 701 | "execution_count": 8, 702 | "metadata": {}, 703 | "output_type": "execute_result" 704 | } 705 | ], 706 | "source": [ 707 | "%%time\n", 708 | "rf_clf.fit(X_train, y_train)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 9, 714 | "metadata": {}, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/plain": [ 719 | "0.9647916154916892" 720 | ] 721 | }, 722 | "execution_count": 9, 723 | "metadata": {}, 724 | "output_type": "execute_result" 725 | } 726 | ], 727 | "source": [ 728 | "rf_clf.score(X_test, y_test)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 10, 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "name": "stdout", 738 | "output_type": "stream", 739 | "text": [ 740 | "0.9618792499484855\n", 741 | "0.9501323020557704\n", 742 | "0.9559696907638747\n" 743 | ] 744 | } 745 | ], 746 | "source": [ 747 | "y_predict = rf_clf.predict(X_test)\n", 748 | "\n", 749 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 750 | "\n", 751 | "print(precision_score(y_test, y_predict))\n", 752 | "print(recall_score(y_test, y_predict))\n", 753 | "print(f1_score(y_test, y_predict))" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [] 762 | } 763 | ], 764 | "metadata": { 765 | "kernelspec": { 766 | "display_name": "Python 3", 767 | "language": "python", 768 | "name": "python3" 769 | }, 770 | "language_info": { 771 | "codemirror_mode": { 772 | "name": "ipython", 773 | "version": 3 774 | }, 775 | "file_extension": ".py", 776 | "mimetype": "text/x-python", 777 | "name": "python", 778 | "nbconvert_exporter": "python", 779 | "pygments_lexer": "ipython3", 780 | "version": "3.7.4" 781 | } 782 | }, 783 | "nbformat": 4, 784 | "nbformat_minor": 2 785 | } 786 | -------------------------------------------------------------------------------- /自定义特征后进行机器学习.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "\n", 11 | "normal = np.loadtxt(\"vector_normal\")\n", 12 | "anomalous = np.loadtxt(\"vector_anomalous\")\n", 13 | "\n", 14 | "all_requests = np.concatenate([normal, anomalous])\n", 15 | "X = all_requests\n", 16 | "\n", 17 | "y_normal = np.zeros(shape=(normal.shape[0]), dtype='int')\n", 18 | "y_anomalous = np.ones(shape=(anomalous.shape[0]), dtype='int')\n", 19 | "y = np.concatenate([y_normal, y_anomalous])" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# 1 K近邻算法" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "划分测试集和训练集" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from sklearn.model_selection import train_test_split\n", 43 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### 网格搜索\n", 51 | "网格搜索将使用交叉验证的方式来评估超参数的所有可能的组合" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from sklearn.model_selection import GridSearchCV\n", 61 | "from sklearn.neighbors import KNeighborsClassifier\n", 62 | "#from sklearn.pipeline import Pipeline\n", 63 | "from sklearn.preprocessing import StandardScaler\n", 64 | "\n", 65 | "# 数据归一化\n", 66 | "standardScalar = StandardScaler()\n", 67 | "standardScalar.fit(X_train)\n", 68 | "X_train = standardScalar.transform(X_train)\n", 69 | "X_test_std = standardScalar.transform(X_test)\n", 70 | "\n", 71 | "# 网格搜索的参数\n", 72 | "param_grid = [\n", 73 | " {\n", 74 | " 'weights': ['uniform'],\n", 75 | " 'n_neighbors': [i for i in range(2, 11)] #从1开始容易过拟合\n", 76 | " },\n", 77 | " {\n", 78 | " 'weights': ['distance'],\n", 79 | " 'n_neighbors': [i for i in range(2, 11)],\n", 80 | " 'p': [i for i in range(1, 6)]\n", 81 | " }\n", 82 | "]\n", 83 | "\n", 84 | "# cv其实也是一个超参数,一般越大越好,但是越大训练时间越长\n", 85 | "grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=-1, cv=5)\n", 86 | "\n", 87 | "# pipe_grid_knn = Pipeline([\n", 88 | "# (\"sta_scaler\", StandardScaler()),\n", 89 | "# (\"grid_sea\", grid_search)\n", 90 | "# ])" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "Wall time: 52.1 s\n" 103 | ] 104 | }, 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n", 109 | " estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,\n", 110 | " metric='minkowski',\n", 111 | " metric_params=None, n_jobs=None,\n", 112 | " n_neighbors=5, p=2,\n", 113 | " weights='uniform'),\n", 114 | " iid='warn', n_jobs=-1,\n", 115 | " param_grid=[{'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],\n", 116 | " 'weights': ['uniform']},\n", 117 | " {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],\n", 118 | " 'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],\n", 119 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 120 | " scoring=None, verbose=0)" 121 | ] 122 | }, 123 | "execution_count": 4, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "%%time\n", 130 | "grid_search.fit(X_train, y_train)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "0.8759619101163076" 142 | ] 143 | }, 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "grid_search.best_score_" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "{'n_neighbors': 10, 'p': 3, 'weights': 'distance'}" 162 | ] 163 | }, 164 | "execution_count": 6, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "grid_search.best_params_" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 7, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "0.8733661278988053" 182 | ] 183 | }, 184 | "execution_count": 7, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "best_knn_clf = grid_search.best_estimator_\n", 191 | "best_knn_clf.score(X_test_std, y_test)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "### 分类结果评价" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 8, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "array([[2735, 481],\n", 210 | " [ 420, 3479]], dtype=int64)" 211 | ] 212 | }, 213 | "execution_count": 8, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "from sklearn.metrics import confusion_matrix\n", 220 | "\n", 221 | "y_predict = best_knn_clf.predict(X_test_std)\n", 222 | "confusion_matrix(y_test, y_predict)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 9, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "0.8785353535353535\n", 235 | "0.8922800718132855\n", 236 | "0.8853543707850872\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 242 | "\n", 243 | "print(precision_score(y_test, y_predict))\n", 244 | "print(recall_score(y_test, y_predict))\n", 245 | "print(f1_score(y_test, y_predict))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 10, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "X_test = X_test_std" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "# 2 逻辑回归" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 10, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "from sklearn.linear_model import LogisticRegression\n", 271 | "\n", 272 | "param_grid = [\n", 273 | " {\n", 274 | " 'C': [0.1, 1, 3, 5, 7],\n", 275 | " 'penalty': ['l1', 'l2']\n", 276 | " }\n", 277 | "]\n", 278 | "\n", 279 | "grid_search = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1, cv=5)\n" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 11, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "Wall time: 2min 48s\n" 292 | ] 293 | }, 294 | { 295 | "name": "stderr", 296 | "output_type": "stream", 297 | "text": [ 298 | "C:\\Software\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 299 | " FutureWarning)\n" 300 | ] 301 | }, 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n", 306 | " estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,\n", 307 | " fit_intercept=True,\n", 308 | " intercept_scaling=1, l1_ratio=None,\n", 309 | " max_iter=100, multi_class='warn',\n", 310 | " n_jobs=None, penalty='l2',\n", 311 | " random_state=None, solver='warn',\n", 312 | " tol=0.0001, verbose=0,\n", 313 | " warm_start=False),\n", 314 | " iid='warn', n_jobs=-1,\n", 315 | " param_grid=[{'C': [0.1, 1, 3, 5, 7], 'penalty': ['l1', 'l2']}],\n", 316 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 317 | " scoring=None, verbose=0)" 318 | ] 319 | }, 320 | "execution_count": 11, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "%%time\n", 327 | "grid_search.fit(X_train, y_train)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 13, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "0.6869882989563935" 339 | ] 340 | }, 341 | "execution_count": 13, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "grid_search.best_score_" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 14, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "{'C': 7, 'penalty': 'l1'}" 359 | ] 360 | }, 361 | "execution_count": 14, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "grid_search.best_params_" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 16, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "0.6929023190442727" 379 | ] 380 | }, 381 | "execution_count": 16, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "best_log_clf = grid_search.best_estimator_\n", 388 | "best_log_clf.score(X_test, y_test)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 17, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": "stream", 399 | "text": [ 400 | "0.7455587392550144\n", 401 | "0.6673506027186458\n", 402 | "0.7042901610502098\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "y_predict = best_log_clf.predict(X_test_std)\n", 408 | "\n", 409 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 410 | "\n", 411 | "print(precision_score(y_test, y_predict))\n", 412 | "print(recall_score(y_test, y_predict))\n", 413 | "print(f1_score(y_test, y_predict))" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "# 3 决策树" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 18, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "from sklearn.tree import DecisionTreeClassifier\n", 430 | "\n", 431 | "param_grid = [\n", 432 | " {\n", 433 | " 'max_depth':[i for i in range(1, 10)],\n", 434 | " 'min_samples_leaf':[i for i in range(1, 20)],\n", 435 | " 'min_samples_split':[i for i in range(10, 30)],\n", 436 | " }\n", 437 | "]\n", 438 | "\n", 439 | "grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, n_jobs=-1, cv=5)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 19, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "Wall time: 3min 46s\n" 452 | ] 453 | }, 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n", 458 | " estimator=DecisionTreeClassifier(class_weight=None,\n", 459 | " criterion='gini', max_depth=None,\n", 460 | " max_features=None,\n", 461 | " max_leaf_nodes=None,\n", 462 | " min_impurity_decrease=0.0,\n", 463 | " min_impurity_split=None,\n", 464 | " min_samples_leaf=1,\n", 465 | " min_samples_split=2,\n", 466 | " min_weight_fraction_leaf=0.0,\n", 467 | " presort=False, random_state=None,\n", 468 | " splitter='best'),\n", 469 | " iid='warn', n_jobs=-1,\n", 470 | " param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],\n", 471 | " 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,\n", 472 | " 11, 12, 13, 14, 15, 16, 17, 18,\n", 473 | " 19],\n", 474 | " 'min_samples_split': [10, 11, 12, 13, 14, 15, 16, 17,\n", 475 | " 18, 19, 20, 21, 22, 23, 24, 25,\n", 476 | " 26, 27, 28, 29]}],\n", 477 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 478 | " scoring=None, verbose=0)" 479 | ] 480 | }, 481 | "execution_count": 19, 482 | "metadata": {}, 483 | "output_type": "execute_result" 484 | } 485 | ], 486 | "source": [ 487 | "%%time\n", 488 | "grid_search.fit(X_train, y_train)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 20, 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "data": { 498 | "text/plain": [ 499 | "0.7973224638954285" 500 | ] 501 | }, 502 | "execution_count": 20, 503 | "metadata": {}, 504 | "output_type": "execute_result" 505 | } 506 | ], 507 | "source": [ 508 | "grid_search.best_score_" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 22, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "{'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 27}" 520 | ] 521 | }, 522 | "execution_count": 22, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "grid_search.best_params_" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 23, 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "data": { 538 | "text/plain": [ 539 | "0.8042164441321152" 540 | ] 541 | }, 542 | "execution_count": 23, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "best_tree_clf = grid_search.best_estimator_\n", 549 | "best_tree_clf.score(X_test, y_test)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 24, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "name": "stdout", 559 | "output_type": "stream", 560 | "text": [ 561 | "0.7658039881204921\n", 562 | "0.9258784303667608\n", 563 | "0.8382677348194589\n" 564 | ] 565 | } 566 | ], 567 | "source": [ 568 | "y_predict = best_tree_clf.predict(X_test_std)\n", 569 | "\n", 570 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 571 | "\n", 572 | "print(precision_score(y_test, y_predict))\n", 573 | "print(recall_score(y_test, y_predict))\n", 574 | "print(f1_score(y_test, y_predict))" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "决策树可以不用对原始数据进行缩放,但是上面的步骤进行了归一化操作,下面采用原始数据进行一次训练" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 25, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 27, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "tree_clf = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1, min_samples_split=27)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 29, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/plain": [ 610 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,\n", 611 | " max_features=None, max_leaf_nodes=None,\n", 612 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 613 | " min_samples_leaf=1, min_samples_split=27,\n", 614 | " min_weight_fraction_leaf=0.0, presort=False,\n", 615 | " random_state=None, splitter='best')" 616 | ] 617 | }, 618 | "execution_count": 29, 619 | "metadata": {}, 620 | "output_type": "execute_result" 621 | } 622 | ], 623 | "source": [ 624 | "tree_clf.fit(X_train_raw, y_train)" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 33, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/plain": [ 635 | "0.80965599634562" 636 | ] 637 | }, 638 | "execution_count": 33, 639 | "metadata": {}, 640 | "output_type": "execute_result" 641 | } 642 | ], 643 | "source": [ 644 | "tree_clf.score(X_train_raw, y_train)" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 35, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "data": { 654 | "text/plain": [ 655 | "0.8042164441321152" 656 | ] 657 | }, 658 | "execution_count": 35, 659 | "metadata": {}, 660 | "output_type": "execute_result" 661 | } 662 | ], 663 | "source": [ 664 | "tree_clf.score(X_test_raw, y_test)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 36, 670 | "metadata": {}, 671 | "outputs": [ 672 | { 673 | "name": "stdout", 674 | "output_type": "stream", 675 | "text": [ 676 | "0.5479971890372453\n", 677 | "1.0\n", 678 | "0.708007989831124\n" 679 | ] 680 | } 681 | ], 682 | "source": [ 683 | "y_predict = best_tree_clf.predict(X_test_raw)\n", 684 | "\n", 685 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 686 | "\n", 687 | "print(precision_score(y_test, y_predict))\n", 688 | "print(recall_score(y_test, y_predict))\n", 689 | "print(f1_score(y_test, y_predict))" 690 | ] 691 | }, 692 | { 693 | "cell_type": "markdown", 694 | "metadata": {}, 695 | "source": [ 696 | "# 4 SVM" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 37, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "from sklearn.preprocessing import StandardScaler\n", 706 | "\n", 707 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)\n", 708 | "# 数据归一化\n", 709 | "standardScalar = StandardScaler()\n", 710 | "standardScalar.fit(X_train)\n", 711 | "X_train = standardScalar.transform(X_train)\n", 712 | "X_test = standardScalar.transform(X_test)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 48, 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [ 721 | "from sklearn.svm import SVC\n", 722 | "\n", 723 | "param_grid = [\n", 724 | " {\n", 725 | " 'kernel': [\"poly\"],\n", 726 | " 'degree': [1, 2, 3],\n", 727 | " 'C': [0.1, 1, 3, 5]\n", 728 | " }\n", 729 | "]\n", 730 | "\n", 731 | "grid_search = GridSearchCV(SVC(), param_grid, n_jobs=-1, cv=5)" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 49, 737 | "metadata": {}, 738 | "outputs": [ 739 | { 740 | "name": "stdout", 741 | "output_type": "stream", 742 | "text": [ 743 | "Wall time: 8min 26s\n" 744 | ] 745 | }, 746 | { 747 | "data": { 748 | "text/plain": [ 749 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n", 750 | " estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 751 | " decision_function_shape='ovr', degree=3,\n", 752 | " gamma='auto_deprecated', kernel='rbf', max_iter=-1,\n", 753 | " probability=False, random_state=None, shrinking=True,\n", 754 | " tol=0.001, verbose=False),\n", 755 | " iid='warn', n_jobs=-1,\n", 756 | " param_grid=[{'C': [0.1, 1, 3, 5], 'degree': [1, 2, 3],\n", 757 | " 'kernel': ['poly']}],\n", 758 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 759 | " scoring=None, verbose=0)" 760 | ] 761 | }, 762 | "execution_count": 49, 763 | "metadata": {}, 764 | "output_type": "execute_result" 765 | } 766 | ], 767 | "source": [ 768 | "%%time\n", 769 | "grid_search.fit(X_train, y_train)" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": 50, 775 | "metadata": {}, 776 | "outputs": [ 777 | { 778 | "data": { 779 | "text/plain": [ 780 | "{'C': 5, 'degree': 3, 'kernel': 'poly'}" 781 | ] 782 | }, 783 | "execution_count": 50, 784 | "metadata": {}, 785 | "output_type": "execute_result" 786 | } 787 | ], 788 | "source": [ 789 | "grid_search.best_params_" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 51, 795 | "metadata": {}, 796 | "outputs": [ 797 | { 798 | "data": { 799 | "text/plain": [ 800 | "0.7022734460100496" 801 | ] 802 | }, 803 | "execution_count": 51, 804 | "metadata": {}, 805 | "output_type": "execute_result" 806 | } 807 | ], 808 | "source": [ 809 | "grid_search.best_score_" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 53, 815 | "metadata": {}, 816 | "outputs": [ 817 | { 818 | "data": { 819 | "text/plain": [ 820 | "0.7114546732255798" 821 | ] 822 | }, 823 | "execution_count": 53, 824 | "metadata": {}, 825 | "output_type": "execute_result" 826 | } 827 | ], 828 | "source": [ 829 | "best_svm_clf = grid_search.best_estimator_\n", 830 | "best_svm_clf.score(X_test, y_test)" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 54, 836 | "metadata": {}, 837 | "outputs": [ 838 | { 839 | "name": "stdout", 840 | "output_type": "stream", 841 | "text": [ 842 | "0.7404898384575299\n", 843 | "0.7289048473967684\n", 844 | "0.7346516737753651\n" 845 | ] 846 | } 847 | ], 848 | "source": [ 849 | "y_predict = best_svm_clf.predict(X_test)\n", 850 | "\n", 851 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 852 | "\n", 853 | "print(precision_score(y_test, y_predict))\n", 854 | "print(recall_score(y_test, y_predict))\n", 855 | "print(f1_score(y_test, y_predict))" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "# 5 随机森林" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 3, 868 | "metadata": {}, 869 | "outputs": [], 870 | "source": [ 871 | "from sklearn.ensemble import RandomForestClassifier" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 4, 877 | "metadata": {}, 878 | "outputs": [], 879 | "source": [ 880 | "rf_clf = RandomForestClassifier(n_estimators=500,\n", 881 | " random_state=666,\n", 882 | " oob_score=True,\n", 883 | " n_jobs=-1)" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 5, 889 | "metadata": { 890 | "scrolled": false 891 | }, 892 | "outputs": [ 893 | { 894 | "name": "stdout", 895 | "output_type": "stream", 896 | "text": [ 897 | "Wall time: 3.77 s\n" 898 | ] 899 | }, 900 | { 901 | "data": { 902 | "text/plain": [ 903 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 904 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 905 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 906 | " min_samples_leaf=1, min_samples_split=2,\n", 907 | " min_weight_fraction_leaf=0.0, n_estimators=500,\n", 908 | " n_jobs=-1, oob_score=True, random_state=666, verbose=0,\n", 909 | " warm_start=False)" 910 | ] 911 | }, 912 | "execution_count": 5, 913 | "metadata": {}, 914 | "output_type": "execute_result" 915 | } 916 | ], 917 | "source": [ 918 | "%%time\n", 919 | "rf_clf.fit(X_train, y_train)" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": 8, 925 | "metadata": {}, 926 | "outputs": [ 927 | { 928 | "data": { 929 | "text/plain": [ 930 | "0.947013352073085" 931 | ] 932 | }, 933 | "execution_count": 8, 934 | "metadata": {}, 935 | "output_type": "execute_result" 936 | } 937 | ], 938 | "source": [ 939 | "rf_clf.score(X_test, y_test)" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": 9, 945 | "metadata": {}, 946 | "outputs": [ 947 | { 948 | "name": "stdout", 949 | "output_type": "stream", 950 | "text": [ 951 | "0.9471813103098019\n", 952 | "0.956655552705822\n", 953 | "0.9518948577261708\n" 954 | ] 955 | } 956 | ], 957 | "source": [ 958 | "y_predict = rf_clf.predict(X_test)\n", 959 | "\n", 960 | "from sklearn.metrics import precision_score, recall_score, f1_score\n", 961 | "\n", 962 | "print(precision_score(y_test, y_predict))\n", 963 | "print(recall_score(y_test, y_predict))\n", 964 | "print(f1_score(y_test, y_predict))" 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": null, 970 | "metadata": {}, 971 | "outputs": [], 972 | "source": [] 973 | } 974 | ], 975 | "metadata": { 976 | "kernelspec": { 977 | "display_name": "Python 3", 978 | "language": "python", 979 | "name": "python3" 980 | }, 981 | "language_info": { 982 | "codemirror_mode": { 983 | "name": "ipython", 984 | "version": 3 985 | }, 986 | "file_extension": ".py", 987 | "mimetype": "text/x-python", 988 | "name": "python", 989 | "nbconvert_exporter": "python", 990 | "pygments_lexer": "ipython3", 991 | "version": "3.7.4" 992 | } 993 | }, 994 | "nbformat": 4, 995 | "nbformat_minor": 2 996 | } 997 | --------------------------------------------------------------------------------