├── .gitignore ├── 007. machine learning ensemble - random forest.ipynb ├── 008. machine learning - ensemble boosting basic.ipynb ├── 009. XGboost, LightGBM.ipynb ├── 010. credit_card_fraud_basic.ipynb ├── 011. outlier, oversampling with credit card fraud_kaggle.ipynb ├── 012. stacking ensemble.ipynb ├── 013. Time series cointegration.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,pycharm,jupyternotebooks 3 | # Edit at https://www.gitignore.io/?templates=python,pycharm,jupyternotebooks 4 | 5 | ### JupyterNotebooks ### 6 | # gitignore template for Jupyter Notebooks 7 | # website: http://jupyter.org/ 8 | 9 | .ipynb_checkpoints 10 | */.ipynb_checkpoints/* 11 | 12 | # IPython 13 | profile_default/ 14 | ipython_config.py 15 | 16 | # Remove previous ipynb_checkpoints 17 | # git rm -r .ipynb_checkpoints/ 18 | 19 | ### PyCharm ### 20 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 21 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 22 | 23 | # User-specific stuff 24 | .idea/**/workspace.xml 25 | .idea/**/tasks.xml 26 | .idea/**/usage.statistics.xml 27 | .idea/**/dictionaries 28 | .idea/**/shelf 29 | 30 | # Generated files 31 | .idea/**/contentModel.xml 32 | 33 | # Sensitive or high-churn files 34 | .idea/**/dataSources/ 35 | .idea/**/dataSources.ids 36 | .idea/**/dataSources.local.xml 37 | .idea/**/sqlDataSources.xml 38 | .idea/**/dynamic.xml 39 | .idea/**/uiDesigner.xml 40 | .idea/**/dbnavigator.xml 41 | 42 | # Gradle 43 | .idea/**/gradle.xml 44 | .idea/**/libraries 45 | 46 | # Gradle and Maven with auto-import 47 | # When using Gradle or Maven with auto-import, you should exclude module files, 48 | # since they will be recreated, and may cause churn. Uncomment if using 49 | # auto-import. 
50 | # .idea/modules.xml 51 | # .idea/*.iml 52 | # .idea/modules 53 | # *.iml 54 | # *.ipr 55 | 56 | # CMake 57 | cmake-build-*/ 58 | 59 | # Mongo Explorer plugin 60 | .idea/**/mongoSettings.xml 61 | 62 | # File-based project format 63 | *.iws 64 | 65 | # IntelliJ 66 | out/ 67 | 68 | # mpeltonen/sbt-idea plugin 69 | .idea_modules/ 70 | 71 | # JIRA plugin 72 | atlassian-ide-plugin.xml 73 | 74 | # Cursive Clojure plugin 75 | .idea/replstate.xml 76 | 77 | # Crashlytics plugin (for Android Studio and IntelliJ) 78 | com_crashlytics_export_strings.xml 79 | crashlytics.properties 80 | crashlytics-build.properties 81 | fabric.properties 82 | 83 | # Editor-based Rest Client 84 | .idea/httpRequests 85 | 86 | # Android studio 3.1+ serialized cache file 87 | .idea/caches/build_file_checksums.ser 88 | 89 | ### PyCharm Patch ### 90 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 91 | 92 | # *.iml 93 | # modules.xml 94 | # .idea/misc.xml 95 | # *.ipr 96 | 97 | # Sonarlint plugin 98 | .idea/**/sonarlint/ 99 | 100 | # SonarQube Plugin 101 | .idea/**/sonarIssues.xml 102 | 103 | # Markdown Navigator plugin 104 | .idea/**/markdown-navigator.xml 105 | .idea/**/markdown-navigator/ 106 | 107 | ### Python ### 108 | # Byte-compiled / optimized / DLL files 109 | __pycache__/ 110 | *.py[cod] 111 | *$py.class 112 | 113 | # C extensions 114 | *.so 115 | 116 | # Distribution / packaging 117 | .Python 118 | build/ 119 | develop-eggs/ 120 | dist/ 121 | downloads/ 122 | eggs/ 123 | .eggs/ 124 | lib/ 125 | lib64/ 126 | parts/ 127 | sdist/ 128 | var/ 129 | wheels/ 130 | pip-wheel-metadata/ 131 | share/python-wheels/ 132 | *.egg-info/ 133 | .installed.cfg 134 | *.egg 135 | MANIFEST 136 | 137 | # PyInstaller 138 | # Usually these files are written by a python script from a template 139 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
140 | *.manifest 141 | *.spec 142 | 143 | # Installer logs 144 | pip-log.txt 145 | pip-delete-this-directory.txt 146 | 147 | # Unit test / coverage reports 148 | htmlcov/ 149 | .tox/ 150 | .nox/ 151 | .coverage 152 | .coverage.* 153 | .cache 154 | nosetests.xml 155 | coverage.xml 156 | *.cover 157 | .hypothesis/ 158 | .pytest_cache/ 159 | 160 | # Translations 161 | *.mo 162 | *.pot 163 | 164 | # Scrapy stuff: 165 | .scrapy 166 | 167 | # Sphinx documentation 168 | docs/_build/ 169 | 170 | # PyBuilder 171 | target/ 172 | 173 | # pyenv 174 | .python-version 175 | 176 | # pipenv 177 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 178 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 179 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 180 | # install all needed dependencies. 181 | #Pipfile.lock 182 | 183 | # celery beat schedule file 184 | celerybeat-schedule 185 | 186 | # SageMath parsed files 187 | *.sage.py 188 | 189 | # Spyder project settings 190 | .spyderproject 191 | .spyproject 192 | 193 | # Rope project settings 194 | .ropeproject 195 | 196 | # Mr Developer 197 | .mr.developer.cfg 198 | .project 199 | .pydevproject 200 | 201 | # mkdocs documentation 202 | /site 203 | 204 | # mypy 205 | .mypy_cache/ 206 | .dmypy.json 207 | dmypy.json 208 | 209 | # Pyre type checker 210 | .pyre/ 211 | 212 | # End of https://www.gitignore.io/api/python,pycharm,jupyternotebooks 213 | -------------------------------------------------------------------------------- /007. 
machine learning ensemble - random forest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 출처\n", 8 | "\n", 9 | "- https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python\n", 10 | "- https://www.kaggle.com/lsjsj92/simple-titanic-kernel-82-for-beginner-like-me\n", 11 | "- https://www.kaggle.com/startupsci/titanic-data-science-solutions\n", 12 | "- https://www.kaggle.com/ash316/eda-to-prediction-dietanic\n", 13 | "- https://www.kaggle.com/mjbahmani/a-comprehensive-ml-workflow-with-python" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns\n", 26 | "\n", 27 | "from sklearn.tree import DecisionTreeClassifier\n", 28 | "from sklearn.ensemble import RandomForestClassifier\n", 29 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 30 | "from sklearn.metrics import accuracy_score" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", 151 | "
" 152 | ], 153 | "text/plain": [ 154 | " PassengerId Survived Pclass \\\n", 155 | "0 1 0 3 \n", 156 | "1 2 1 1 \n", 157 | "2 3 1 3 \n", 158 | "3 4 1 1 \n", 159 | "4 5 0 3 \n", 160 | "\n", 161 | " Name Sex Age SibSp \\\n", 162 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 163 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 164 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 165 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 166 | "4 Allen, Mr. William Henry male 35.0 0 \n", 167 | "\n", 168 | " Parch Ticket Fare Cabin Embarked \n", 169 | "0 0 A/5 21171 7.2500 NaN S \n", 170 | "1 0 PC 17599 71.2833 C85 C \n", 171 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 172 | "3 0 113803 53.1000 C123 S \n", 173 | "4 0 373450 8.0500 NaN S " 174 | ] 175 | }, 176 | "execution_count": 2, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "data = pd.read_csv('../datas/titanic/train.csv')\n", 183 | "data.head()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 3, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "PassengerId 0\n", 195 | "Survived 0\n", 196 | "Pclass 0\n", 197 | "Name 0\n", 198 | "Sex 0\n", 199 | "Age 177\n", 200 | "SibSp 0\n", 201 | "Parch 0\n", 202 | "Ticket 0\n", 203 | "Fare 0\n", 204 | "Cabin 687\n", 205 | "Embarked 2\n", 206 | "dtype: int64" 207 | ] 208 | }, 209 | "execution_count": 3, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "data.isna().sum()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 4, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "age null값 비율 : 19.865\n", 228 | "cabin null값 비율 : 77.104\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "print(\"age null값 비율 : {0:.3f}\".format((data['Age'].isna().sum() / len(data)) * 100))\n", 
234 | "print(\"cabin null값 비율 : {0:.3f}\".format((data['Cabin'].isna().sum() / len(data)) * 100 ))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 13, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "data['Embarked'].fillna('S', inplace = True)\n", 251 | "data['Fare'].fillna(0, inplace=True)\n", 252 | "data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 14, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\\.')\n", 262 | "data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)\n", 263 | "mapping = {\n", 264 | " \"Mr\":0,\n", 265 | " \"Miss\":1,\n", 266 | " \"Mrs\" : 1,\n", 267 | " \"Master\":2,\n", 268 | " \"Other\":3\n", 269 | "}\n", 270 | "\n", 271 | "data['Initial'] = data['Initial'].map(mapping)\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 15, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "mapping_sex = {\n", 281 | " 'male' : 0,\n", 282 | " 'female': 1\n", 283 | "}\n", 284 | "\n", 285 | "mapping_em = {\n", 286 | " 'S' :0,\n", 287 | " 'C' :1,\n", 288 | " 'Q' :2\n", 289 | "}\n", 290 | "\n", 291 | "\n", 292 | "data['Sex'] = data['Sex'].map(mapping_sex)\n", 293 | "data['Embarked'] = data['Embarked'].map(mapping_em)\n", 294 | "\n", 295 | "\n", 296 | "data.drop(['PassengerId', \"Ticket\", \"Cabin\", \"Name\"], axis = 1, inplace = True)\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 16, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | 
"text/plain": [ 307 | "Initial\n", 308 | "0 32.739609\n", 309 | "1 27.834615\n", 310 | "2 4.574167\n", 311 | "3 45.888889\n", 312 | "Name: Age, dtype: float64" 313 | ] 314 | }, 315 | "execution_count": 16, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "data.groupby('Initial')['Age'].mean()" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 17, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32\n", 331 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28\n", 332 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5\n", 333 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 18, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/html": [ 344 | "
\n", 345 | "\n", 358 | "\n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | "
SurvivedPclassSexAgeSibSpParchFareEmbarkedInitial
003022.0101.98100100
111138.0104.26666211
213126.0002.07002201
311135.0103.97217701
403035.0002.08567200
\n", 436 | "
" 437 | ], 438 | "text/plain": [ 439 | " Survived Pclass Sex Age SibSp Parch Fare Embarked Initial\n", 440 | "0 0 3 0 22.0 1 0 1.981001 0 0\n", 441 | "1 1 1 1 38.0 1 0 4.266662 1 1\n", 442 | "2 1 3 1 26.0 0 0 2.070022 0 1\n", 443 | "3 1 1 1 35.0 1 0 3.972177 0 1\n", 444 | "4 0 3 0 35.0 0 0 2.085672 0 0" 445 | ] 446 | }, 447 | "execution_count": 18, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | } 451 | ], 452 | "source": [ 453 | "data.head()" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 19, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "y = data['Survived']\n", 463 | "X = data.drop('Survived', axis = 1)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 20, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 21, 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "name": "stderr", 482 | "output_type": "stream", 483 | "text": [ 484 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", 485 | " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" 486 | ] 487 | }, 488 | { 489 | "data": { 490 | "text/plain": [ 491 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 492 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 493 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 494 | " min_samples_leaf=1, min_samples_split=2,\n", 495 | " min_weight_fraction_leaf=0.0, n_estimators=10,\n", 496 | " n_jobs=None, oob_score=False, random_state=0, verbose=0,\n", 497 | " warm_start=False)" 498 | ] 499 | }, 500 | "execution_count": 21, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | 
], 505 | "source": [ 506 | "rf = RandomForestClassifier(random_state=0)\n", 507 | "rf.fit(X_train, y_train)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 22, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "정확도 :0.810\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "pred = rf.predict(X_test)\n", 525 | "print(\"정확도 :{0:.3f}\".format(accuracy_score(y_test, pred)))" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 23, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [ 534 | "rf_param_grid = {\n", 535 | " 'n_estimators' : [100, 200, 300],\n", 536 | " 'max_depth' : [4, 6, 8, 10, 12],\n", 537 | " 'min_samples_leaf' : [3, 5, 6, 7, 10],\n", 538 | " 'min_samples_split' : [2, 3, 5, 7, 10]\n", 539 | "}" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 24, 545 | "metadata": {}, 546 | "outputs": [ 547 | { 548 | "name": "stderr", 549 | "output_type": "stream", 550 | "text": [ 551 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. 
Specify it explicitly to silence this warning.\n", 552 | " warnings.warn(CV_WARNING, FutureWarning)\n", 553 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n" 554 | ] 555 | }, 556 | { 557 | "name": "stdout", 558 | "output_type": "stream", 559 | "text": [ 560 | "Fitting 3 folds for each of 128 candidates, totalling 384 fits\n" 561 | ] 562 | }, 563 | { 564 | "name": "stderr", 565 | "output_type": "stream", 566 | "text": [ 567 | "[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 4.4s\n", 568 | "[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 9.7s\n", 569 | "[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed: 16.7s finished\n" 570 | ] 571 | }, 572 | { 573 | "data": { 574 | "text/plain": [ 575 | "GridSearchCV(cv='warn', error_score='raise-deprecating',\n", 576 | " estimator=RandomForestClassifier(bootstrap=True, class_weight=None,\n", 577 | " criterion='gini', max_depth=None,\n", 578 | " max_features='auto',\n", 579 | " max_leaf_nodes=None,\n", 580 | " min_impurity_decrease=0.0,\n", 581 | " min_impurity_split=None,\n", 582 | " min_samples_leaf=1,\n", 583 | " min_samples_split=2,\n", 584 | " min_weight_fraction_leaf=0.0,\n", 585 | " n_estimators=10, n_jobs=None,\n", 586 | " oob_score=False, random_state=0,\n", 587 | " verbose=0, warm_start=False),\n", 588 | " iid='warn', n_jobs=-1,\n", 589 | " param_grid={'max_depth': [6, 8, 10, 12],\n", 590 | " 'min_samples_leaf': [3, 5, 7, 10],\n", 591 | " 'min_samples_split': [2, 3, 5, 10],\n", 592 | " 'n_estimators': [100, 200]},\n", 593 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 594 | " scoring='accuracy', verbose=1)" 595 | ] 596 | }, 597 | "execution_count": 24, 598 | "metadata": {}, 599 | "output_type": "execute_result" 600 | } 601 | ], 602 | "source": [ 603 | "rf_grid = GridSearchCV(rf, param_grid = rf_param_grid, scoring=\"accuracy\", n_jobs= -1, verbose = 1)\n", 604 | "rf_grid.fit(X_train, y_train)" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | 
"execution_count": 25, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "최고 평균 정확도 : 0.8174\n", 617 | "최고의 파라미터 : {'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100}\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "print(\"최고 평균 정확도 : {0:.4f}\".format(rf_grid.best_score_))\n", 623 | "print(\"최고의 파라미터 : \", rf_grid.best_params_)" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 26, 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "data": { 633 | "text/plain": [ 634 | "Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',\n", 635 | " 'param_max_depth', 'param_min_samples_leaf', 'param_min_samples_split',\n", 636 | " 'param_n_estimators', 'params', 'split0_test_score',\n", 637 | " 'split1_test_score', 'split2_test_score', 'mean_test_score',\n", 638 | " 'std_test_score', 'rank_test_score'],\n", 639 | " dtype='object')" 640 | ] 641 | }, 642 | "execution_count": 26, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | } 646 | ], 647 | "source": [ 648 | "result = pd.DataFrame(rf_grid.cv_results_)\n", 649 | "result.columns" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 27, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "result.sort_values(by=['rank_test_score'], inplace=True)" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 28, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "text/html": [ 669 | "
\n", 670 | "\n", 683 | "\n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | "
paramsmean_test_scorerank_test_score
32{'max_depth': 8, 'min_samples_leaf': 3, 'min_s...0.8174161
34{'max_depth': 8, 'min_samples_leaf': 3, 'min_s...0.8174161
36{'max_depth': 8, 'min_samples_leaf': 3, 'min_s...0.8174161
56{'max_depth': 8, 'min_samples_leaf': 10, 'min_...0.8146074
105{'max_depth': 12, 'min_samples_leaf': 5, 'min_...0.8146074
107{'max_depth': 12, 'min_samples_leaf': 5, 'min_...0.8146074
68{'max_depth': 10, 'min_samples_leaf': 3, 'min_...0.8146074
66{'max_depth': 10, 'min_samples_leaf': 3, 'min_...0.8146074
111{'max_depth': 12, 'min_samples_leaf': 5, 'min_...0.8146074
109{'max_depth': 12, 'min_samples_leaf': 5, 'min_...0.8146074
\n", 755 | "
" 756 | ], 757 | "text/plain": [ 758 | " params mean_test_score \\\n", 759 | "32 {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... 0.817416 \n", 760 | "34 {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... 0.817416 \n", 761 | "36 {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... 0.817416 \n", 762 | "56 {'max_depth': 8, 'min_samples_leaf': 10, 'min_... 0.814607 \n", 763 | "105 {'max_depth': 12, 'min_samples_leaf': 5, 'min_... 0.814607 \n", 764 | "107 {'max_depth': 12, 'min_samples_leaf': 5, 'min_... 0.814607 \n", 765 | "68 {'max_depth': 10, 'min_samples_leaf': 3, 'min_... 0.814607 \n", 766 | "66 {'max_depth': 10, 'min_samples_leaf': 3, 'min_... 0.814607 \n", 767 | "111 {'max_depth': 12, 'min_samples_leaf': 5, 'min_... 0.814607 \n", 768 | "109 {'max_depth': 12, 'min_samples_leaf': 5, 'min_... 0.814607 \n", 769 | "\n", 770 | " rank_test_score \n", 771 | "32 1 \n", 772 | "34 1 \n", 773 | "36 1 \n", 774 | "56 4 \n", 775 | "105 4 \n", 776 | "107 4 \n", 777 | "68 4 \n", 778 | "66 4 \n", 779 | "111 4 \n", 780 | "109 4 " 781 | ] 782 | }, 783 | "execution_count": 28, 784 | "metadata": {}, 785 | "output_type": "execute_result" 786 | } 787 | ], 788 | "source": [ 789 | "result[['params', 'mean_test_score', 'rank_test_score']].head(10)" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 29, 795 | "metadata": {}, 796 | "outputs": [ 797 | { 798 | "name": "stdout", 799 | "output_type": "stream", 800 | "text": [ 801 | "정확도 : 0.8603\n" 802 | ] 803 | } 804 | ], 805 | "source": [ 806 | "model = rf_grid.best_estimator_\n", 807 | "pred = model.predict(X_test)\n", 808 | "acc = accuracy_score(y_test, pred)\n", 809 | "print(\"정확도 : {0:.4f}\".format(acc))\n" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 30, 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [ 818 | "feature_importances = model.feature_importances_" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 31, 824 | "metadata": {}, 825 | 
"outputs": [ 826 | { 827 | "data": { 828 | "image/png": "\n", 829 | "text/plain": [ 830 | "
" 831 | ] 832 | }, 833 | "metadata": { 834 | "needs_background": "light" 835 | }, 836 | "output_type": "display_data" 837 | } 838 | ], 839 | "source": [ 840 | "ft_importances = pd.Series(feature_importances, index = X_train.columns)\n", 841 | "ft_importances = ft_importances.sort_values(ascending=False)\n", 842 | "\n", 843 | "plt.figure(figsize=(12, 10))\n", 844 | "plt.title(\"feature importances\")\n", 845 | "sns.barplot(x=ft_importances, y = ft_importances.index)\n", 846 | "plt.show()\n" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": {}, 853 | "outputs": [], 854 | "source": [] 855 | } 856 | ], 857 | "metadata": { 858 | "kernelspec": { 859 | "display_name": "Python 3", 860 | "language": "python", 861 | "name": "python3" 862 | }, 863 | "language_info": { 864 | "codemirror_mode": { 865 | "name": "ipython", 866 | "version": 3 867 | }, 868 | "file_extension": ".py", 869 | "mimetype": "text/x-python", 870 | "name": "python", 871 | "nbconvert_exporter": "python", 872 | "pygments_lexer": "ipython3", 873 | "version": "3.6.9" 874 | } 875 | }, 876 | "nbformat": 4, 877 | "nbformat_minor": 2 878 | } 879 | -------------------------------------------------------------------------------- /008. 
machine learning - ensemble boosting basic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 출처\n", 8 | "\n", 9 | "- https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python\n", 10 | "- https://www.kaggle.com/lsjsj92/simple-titanic-kernel-82-for-beginner-like-me\n", 11 | "- https://www.kaggle.com/startupsci/titanic-data-science-solutions\n", 12 | "- https://www.kaggle.com/ash316/eda-to-prediction-dietanic\n", 13 | "- https://www.kaggle.com/mjbahmani/a-comprehensive-ml-workflow-with-python" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns\n", 26 | "\n", 27 | "from sklearn.tree import DecisionTreeClassifier\n", 28 | "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", 29 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 30 | "from sklearn.metrics import accuracy_score" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", 151 | "
" 152 | ], 153 | "text/plain": [ 154 | " PassengerId Survived Pclass \\\n", 155 | "0 1 0 3 \n", 156 | "1 2 1 1 \n", 157 | "2 3 1 3 \n", 158 | "3 4 1 1 \n", 159 | "4 5 0 3 \n", 160 | "\n", 161 | " Name Sex Age SibSp \\\n", 162 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 163 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 164 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 165 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 166 | "4 Allen, Mr. William Henry male 35.0 0 \n", 167 | "\n", 168 | " Parch Ticket Fare Cabin Embarked \n", 169 | "0 0 A/5 21171 7.2500 NaN S \n", 170 | "1 0 PC 17599 71.2833 C85 C \n", 171 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 172 | "3 0 113803 53.1000 C123 S \n", 173 | "4 0 373450 8.0500 NaN S " 174 | ] 175 | }, 176 | "execution_count": 4, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "data = pd.read_csv('../datas/titanic/train.csv')\n", 183 | "data.head()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 5, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "data['Embarked'].fillna('S', inplace = True)\n", 193 | "data['Fare'].fillna(0, inplace=True)\n", 194 | "data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 6, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\\.')\n", 204 | "data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)\n", 205 | "mapping = {\n", 206 | " \"Mr\":0,\n", 207 | " \"Miss\":1,\n", 208 | " \"Mrs\" : 1,\n", 209 | " \"Master\":2,\n", 210 | " \"Other\":3\n", 211 | "}\n", 212 | "\n", 213 | "data['Initial'] = 
data['Initial'].map(mapping)\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 7, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "mapping_sex = {\n", 223 | " 'male' : 0,\n", 224 | " 'female': 1\n", 225 | "}\n", 226 | "\n", 227 | "mapping_em = {\n", 228 | " 'S' :0,\n", 229 | " 'C' :1,\n", 230 | " 'Q' :2\n", 231 | "}\n", 232 | "\n", 233 | "\n", 234 | "data['Sex'] = data['Sex'].map(mapping_sex)\n", 235 | "data['Embarked'] = data['Embarked'].map(mapping_em)\n", 236 | "\n", 237 | "\n", 238 | "data.drop(['PassengerId', \"Ticket\", \"Cabin\", \"Name\"], axis = 1, inplace = True)\n" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 8, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "Initial\n", 250 | "0 32.739609\n", 251 | "1 27.834615\n", 252 | "2 4.574167\n", 253 | "3 45.888889\n", 254 | "Name: Age, dtype: float64" 255 | ] 256 | }, 257 | "execution_count": 8, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "data.groupby('Initial')['Age'].mean()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 9, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32\n", 273 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28\n", 274 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5\n", 275 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 10, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/html": [ 286 | "
\n", 287 | "\n", 300 | "\n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "
SurvivedPclassSexAgeSibSpParchFareEmbarkedInitial
003022.0101.98100100
111138.0104.26666211
213126.0002.07002201
311135.0103.97217701
403035.0002.08567200
\n", 378 | "
" 379 | ], 380 | "text/plain": [ 381 | " Survived Pclass Sex Age SibSp Parch Fare Embarked Initial\n", 382 | "0 0 3 0 22.0 1 0 1.981001 0 0\n", 383 | "1 1 1 1 38.0 1 0 4.266662 1 1\n", 384 | "2 1 3 1 26.0 0 0 2.070022 0 1\n", 385 | "3 1 1 1 35.0 1 0 3.972177 0 1\n", 386 | "4 0 3 0 35.0 0 0 2.085672 0 0" 387 | ] 388 | }, 389 | "execution_count": 10, 390 | "metadata": {}, 391 | "output_type": "execute_result" 392 | } 393 | ], 394 | "source": [ 395 | "data.head()" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 11, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "y = data['Survived']\n", 405 | "X = data.drop('Survived', axis = 1)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 12, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 13, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "name": "stderr", 424 | "output_type": "stream", 425 | "text": [ 426 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", 427 | " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" 428 | ] 429 | }, 430 | { 431 | "data": { 432 | "text/plain": [ 433 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 434 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 435 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 436 | " min_samples_leaf=1, min_samples_split=2,\n", 437 | " min_weight_fraction_leaf=0.0, n_estimators=10,\n", 438 | " n_jobs=None, oob_score=False, random_state=0, verbose=0,\n", 439 | " warm_start=False)" 440 | ] 441 | }, 442 | "execution_count": 13, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | 
], 447 | "source": [ 448 | "rf = RandomForestClassifier(random_state=0)\n", 449 | "rf.fit(X_train, y_train)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 14, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "name": "stdout", 459 | "output_type": "stream", 460 | "text": [ 461 | "정확도 :0.810\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "pred = rf.predict(X_test)\n", 467 | "print(\"정확도 :{0:.3f}\".format(accuracy_score(y_test, pred)))" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 11, 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "data": { 477 | "text/plain": [ 478 | "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", 479 | " learning_rate=0.1, loss='deviance', max_depth=3,\n", 480 | " max_features=None, max_leaf_nodes=None,\n", 481 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 482 | " min_samples_leaf=1, min_samples_split=2,\n", 483 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 484 | " n_iter_no_change=None, presort='auto',\n", 485 | " random_state=0, subsample=1.0, tol=0.0001,\n", 486 | " validation_fraction=0.1, verbose=0,\n", 487 | " warm_start=False)" 488 | ] 489 | }, 490 | "execution_count": 11, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "gb = GradientBoostingClassifier(random_state=0)\n", 497 | "gb.fit(X_train, y_train)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 12, 503 | "metadata": {}, 504 | "outputs": [ 505 | { 506 | "name": "stdout", 507 | "output_type": "stream", 508 | "text": [ 509 | "정확도 :0.832\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "pred = gb.predict(X_test)\n", 515 | "print(\"정확도 :{0:.3f}\".format(accuracy_score(y_test, pred)))" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 13, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "gb_param_grid = {\n", 525 | " 'n_estimators' : [100, 
200],\n", 526 | " 'max_depth' : [6, 8, 10, 12],\n", 527 | " 'min_samples_leaf' : [3, 5, 7, 10],\n", 528 | " 'min_samples_split' : [2, 3, 5, 10],\n", 529 | " 'learning_rate' : [0.05, 0.1, 0.2]\n", 530 | "}" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 14, 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "name": "stderr", 540 | "output_type": "stream", 541 | "text": [ 542 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n", 543 | " warnings.warn(CV_WARNING, FutureWarning)\n", 544 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n" 545 | ] 546 | }, 547 | { 548 | "name": "stdout", 549 | "output_type": "stream", 550 | "text": [ 551 | "Fitting 3 folds for each of 384 candidates, totalling 1152 fits\n" 552 | ] 553 | }, 554 | { 555 | "name": "stderr", 556 | "output_type": "stream", 557 | "text": [ 558 | "[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 4.6s\n", 559 | "[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 9.4s\n", 560 | "[Parallel(n_jobs=-1)]: Done 426 tasks | elapsed: 20.0s\n", 561 | "[Parallel(n_jobs=-1)]: Done 776 tasks | elapsed: 34.5s\n", 562 | "[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed: 49.2s finished\n" 563 | ] 564 | }, 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "GridSearchCV(cv='warn', error_score='raise-deprecating',\n", 569 | " estimator=GradientBoostingClassifier(criterion='friedman_mse',\n", 570 | " init=None, learning_rate=0.1,\n", 571 | " loss='deviance', max_depth=3,\n", 572 | " max_features=None,\n", 573 | " max_leaf_nodes=None,\n", 574 | " min_impurity_decrease=0.0,\n", 575 | " min_impurity_split=None,\n", 576 | " min_samples_leaf=1,\n", 577 | " min_samples_split=2,\n", 578 | " min_weight_fraction_leaf=0.0,\n", 579 | " n_estimators=100,\n", 580 | " n_it...\n", 581 | " random_state=0, 
subsample=1.0,\n", 582 | " tol=0.0001,\n", 583 | " validation_fraction=0.1,\n", 584 | " verbose=0, warm_start=False),\n", 585 | " iid='warn', n_jobs=-1,\n", 586 | " param_grid={'learning_rate': [0.05, 0.1, 0.2],\n", 587 | " 'max_depth': [6, 8, 10, 12],\n", 588 | " 'min_samples_leaf': [3, 5, 7, 10],\n", 589 | " 'min_samples_split': [2, 3, 5, 10],\n", 590 | " 'n_estimators': [100, 200]},\n", 591 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 592 | " scoring='accuracy', verbose=1)" 593 | ] 594 | }, 595 | "execution_count": 14, 596 | "metadata": {}, 597 | "output_type": "execute_result" 598 | } 599 | ], 600 | "source": [ 601 | "gb_grid = GridSearchCV(gb, param_grid = gb_param_grid, scoring=\"accuracy\", n_jobs= -1, verbose = 1)\n", 602 | "gb_grid.fit(X_train, y_train)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 15, 608 | "metadata": {}, 609 | "outputs": [ 610 | { 611 | "name": "stdout", 612 | "output_type": "stream", 613 | "text": [ 614 | "최고 평균 정확도 : 0.8202\n", 615 | "최고의 파라미터 : {'learning_rate': 0.05, 'max_depth': 6, 'min_samples_leaf': 7, 'min_samples_split': 2, 'n_estimators': 100}\n" 616 | ] 617 | } 618 | ], 619 | "source": [ 620 | "print(\"최고 평균 정확도 : {0:.4f}\".format(gb_grid.best_score_))\n", 621 | "print(\"최고의 파라미터 : \", gb_grid.best_params_)" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 16, 627 | "metadata": {}, 628 | "outputs": [ 629 | { 630 | "data": { 631 | "text/plain": [ 632 | "Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',\n", 633 | " 'param_learning_rate', 'param_max_depth', 'param_min_samples_leaf',\n", 634 | " 'param_min_samples_split', 'param_n_estimators', 'params',\n", 635 | " 'split0_test_score', 'split1_test_score', 'split2_test_score',\n", 636 | " 'mean_test_score', 'std_test_score', 'rank_test_score'],\n", 637 | " dtype='object')" 638 | ] 639 | }, 640 | "execution_count": 16, 641 | "metadata": {}, 642 | "output_type": 
"execute_result" 643 | } 644 | ], 645 | "source": [ 646 | "result = pd.DataFrame(gb_grid.cv_results_)\n", 647 | "result.columns" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 17, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "result.sort_values(by=['rank_test_score'], inplace=True)" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 18, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/html": [ 667 | "
\n", 668 | "\n", 681 | "\n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | "
paramsmean_test_scorerank_test_score
22{'learning_rate': 0.05, 'max_depth': 6, 'min_s...0.8202251
16{'learning_rate': 0.05, 'max_depth': 6, 'min_s...0.8202251
18{'learning_rate': 0.05, 'max_depth': 6, 'min_s...0.8202251
20{'learning_rate': 0.05, 'max_depth': 6, 'min_s...0.8202251
58{'learning_rate': 0.05, 'max_depth': 8, 'min_s...0.8188205
60{'learning_rate': 0.05, 'max_depth': 8, 'min_s...0.8188205
62{'learning_rate': 0.05, 'max_depth': 8, 'min_s...0.8188205
135{'learning_rate': 0.1, 'max_depth': 6, 'min_sa...0.8188205
56{'learning_rate': 0.05, 'max_depth': 8, 'min_s...0.8188205
120{'learning_rate': 0.05, 'max_depth': 12, 'min_...0.81741610
\n", 753 | "
" 754 | ], 755 | "text/plain": [ 756 | " params mean_test_score \\\n", 757 | "22 {'learning_rate': 0.05, 'max_depth': 6, 'min_s... 0.820225 \n", 758 | "16 {'learning_rate': 0.05, 'max_depth': 6, 'min_s... 0.820225 \n", 759 | "18 {'learning_rate': 0.05, 'max_depth': 6, 'min_s... 0.820225 \n", 760 | "20 {'learning_rate': 0.05, 'max_depth': 6, 'min_s... 0.820225 \n", 761 | "58 {'learning_rate': 0.05, 'max_depth': 8, 'min_s... 0.818820 \n", 762 | "60 {'learning_rate': 0.05, 'max_depth': 8, 'min_s... 0.818820 \n", 763 | "62 {'learning_rate': 0.05, 'max_depth': 8, 'min_s... 0.818820 \n", 764 | "135 {'learning_rate': 0.1, 'max_depth': 6, 'min_sa... 0.818820 \n", 765 | "56 {'learning_rate': 0.05, 'max_depth': 8, 'min_s... 0.818820 \n", 766 | "120 {'learning_rate': 0.05, 'max_depth': 12, 'min_... 0.817416 \n", 767 | "\n", 768 | " rank_test_score \n", 769 | "22 1 \n", 770 | "16 1 \n", 771 | "18 1 \n", 772 | "20 1 \n", 773 | "58 5 \n", 774 | "60 5 \n", 775 | "62 5 \n", 776 | "135 5 \n", 777 | "56 5 \n", 778 | "120 10 " 779 | ] 780 | }, 781 | "execution_count": 18, 782 | "metadata": {}, 783 | "output_type": "execute_result" 784 | } 785 | ], 786 | "source": [ 787 | "result[['params', 'mean_test_score', 'rank_test_score']].head(10)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 20, 793 | "metadata": {}, 794 | "outputs": [ 795 | { 796 | "name": "stdout", 797 | "output_type": "stream", 798 | "text": [ 799 | "정확도 : 0.8492\n" 800 | ] 801 | } 802 | ], 803 | "source": [ 804 | "model = gb_grid.best_estimator_\n", 805 | "pred = model.predict(X_test)\n", 806 | "acc = accuracy_score(y_test, pred)\n", 807 | "print(\"정확도 : {0:.4f}\".format(acc))\n" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 21, 813 | "metadata": {}, 814 | "outputs": [], 815 | "source": [ 816 | "feature_importances = model.feature_importances_" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 22, 822 | "metadata": {}, 823 | 
"outputs": [ 824 | { 825 | "data": { 826 | "image/png": "\n", 827 | "text/plain": [ 828 | "
" 829 | ] 830 | }, 831 | "metadata": { 832 | "needs_background": "light" 833 | }, 834 | "output_type": "display_data" 835 | } 836 | ], 837 | "source": [ 838 | "ft_importances = pd.Series(feature_importances, index = X_train.columns)\n", 839 | "ft_importances = ft_importances.sort_values(ascending=False)\n", 840 | "\n", 841 | "plt.figure(figsize=(12, 10))\n", 842 | "plt.title(\"feature importances\")\n", 843 | "sns.barplot(x=ft_importances, y = ft_importances.index)  # label bars with the sorted index so each name matches its value\n", 844 | "plt.show()\n" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [] 853 | } 854 | ], 855 | "metadata": { 856 | "kernelspec": { 857 | "display_name": "Python 3", 858 | "language": "python", 859 | "name": "python3" 860 | }, 861 | "language_info": { 862 | "codemirror_mode": { 863 | "name": "ipython", 864 | "version": 3 865 | }, 866 | "file_extension": ".py", 867 | "mimetype": "text/x-python", 868 | "name": "python", 869 | "nbconvert_exporter": "python", 870 | "pygments_lexer": "ipython3", 871 | "version": "3.6.9" 872 | } 873 | }, 874 | "nbformat": 4, 875 | "nbformat_minor": 2 876 | } 877 | -------------------------------------------------------------------------------- /009. 
XGboost, LightGBM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 출처 \n", 8 | "\n", 9 | "## xgboost\n", 10 | "\n", 11 | "- https://apple-rbox.tistory.com/6\n", 12 | "- https://brunch.co.kr/@snobberys/137\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## lightgbm\n", 20 | "\n", 21 | "- https://ko.raw3h.net/page/what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-5295f7/\n", 22 | "\n", 23 | "\n", 24 | "## 그 외\n", 25 | "\n", 26 | "- https://www.kaggle.com/shep312/applying-lightgbm-to-titanic-dataset\n", 27 | "- https://www.kaggle.com/suniliitb96/titanic-survival-prediction-using-xgboost\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "import numpy as np\n", 38 | "import matplotlib.pyplot as plt\n", 39 | "import seaborn as sns\n", 40 | "from xgboost import plot_importance\n", 41 | "from xgboost import XGBClassifier\n", 42 | "from sklearn.datasets import load_breast_cancer\n", 43 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 44 | "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 10, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "def metrics(y_test, pred, pred_proba=None):\n", " \"\"\"Print accuracy, precision, recall, F1 and ROC AUC for binary labels; AUC uses pred_proba (positive-class scores) when given, otherwise the hard labels in pred.\"\"\"\n", 54 | " accuracy = accuracy_score(y_test, pred)\n", 55 | " precision = precision_score(y_test, pred)\n", 56 | " recall = recall_score(y_test, pred)\n", 57 | " f1 = f1_score(y_test, pred)\n", 58 | " roc_score = roc_auc_score(y_test, pred if pred_proba is None else pred_proba)\n", 59 | " print('정확도 : {0:.2f}, 정밀도 : {1:.2f}, 재현율 : {2:.2f}'.format(accuracy, precision, recall))\n", 60 | " print('f1-score : {0:.2f}, auc : {1:.2f}'.format(f1, 
roc_score))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "
\n", 72 | "\n", 85 | "\n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimensiontarget
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...17.33184.602019.00.16220.66560.71190.26540.46010.118900
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...23.41158.801956.00.12380.18660.24160.18600.27500.089020
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...25.53152.501709.00.14440.42450.45040.24300.36130.087580
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...26.5098.87567.70.20980.86630.68690.25750.66380.173000
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...16.67152.201575.00.13740.20500.40000.16250.23640.076780
\n", 235 | "

5 rows × 31 columns

\n", 236 | "
" 237 | ], 238 | "text/plain": [ 239 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n", 240 | "0 17.99 10.38 122.80 1001.0 0.11840 \n", 241 | "1 20.57 17.77 132.90 1326.0 0.08474 \n", 242 | "2 19.69 21.25 130.00 1203.0 0.10960 \n", 243 | "3 11.42 20.38 77.58 386.1 0.14250 \n", 244 | "4 20.29 14.34 135.10 1297.0 0.10030 \n", 245 | "\n", 246 | " mean compactness mean concavity mean concave points mean symmetry \\\n", 247 | "0 0.27760 0.3001 0.14710 0.2419 \n", 248 | "1 0.07864 0.0869 0.07017 0.1812 \n", 249 | "2 0.15990 0.1974 0.12790 0.2069 \n", 250 | "3 0.28390 0.2414 0.10520 0.2597 \n", 251 | "4 0.13280 0.1980 0.10430 0.1809 \n", 252 | "\n", 253 | " mean fractal dimension ... worst texture worst perimeter worst area \\\n", 254 | "0 0.07871 ... 17.33 184.60 2019.0 \n", 255 | "1 0.05667 ... 23.41 158.80 1956.0 \n", 256 | "2 0.05999 ... 25.53 152.50 1709.0 \n", 257 | "3 0.09744 ... 26.50 98.87 567.7 \n", 258 | "4 0.05883 ... 16.67 152.20 1575.0 \n", 259 | "\n", 260 | " worst smoothness worst compactness worst concavity worst concave points \\\n", 261 | "0 0.1622 0.6656 0.7119 0.2654 \n", 262 | "1 0.1238 0.1866 0.2416 0.1860 \n", 263 | "2 0.1444 0.4245 0.4504 0.2430 \n", 264 | "3 0.2098 0.8663 0.6869 0.2575 \n", 265 | "4 0.1374 0.2050 0.4000 0.1625 \n", 266 | "\n", 267 | " worst symmetry worst fractal dimension target \n", 268 | "0 0.4601 0.11890 0 \n", 269 | "1 0.2750 0.08902 0 \n", 270 | "2 0.3613 0.08758 0 \n", 271 | "3 0.6638 0.17300 0 \n", 272 | "4 0.2364 0.07678 0 \n", 273 | "\n", 274 | "[5 rows x 31 columns]" 275 | ] 276 | }, 277 | "execution_count": 3, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "data = load_breast_cancer()\n", 284 | "\n", 285 | "cancer = pd.DataFrame(data.data, columns = data.feature_names)\n", 286 | "cancer['target'] = data.target\n", 287 | "cancer.head()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 5, 293 | "metadata": {}, 294 
| "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "1 357\n", 299 | "0 212\n", 300 | "Name: target, dtype: int64" 301 | ] 302 | }, 303 | "execution_count": 5, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "cancer['target'].value_counts()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 7, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "(455, 30) (114, 30)\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "y = cancer['target']\n", 327 | "X = cancer.drop('target', axis = 1)\n", 328 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)\n", 329 | "print(X_train.shape, X_test.shape)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 8, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "xgb = XGBClassifier(n_estimators=500, learning_rate = 0.1, max_depth = 4)\n", 339 | "xgb.fit(X_train, y_train)\n", 340 | "xgb_pred = xgb.predict(X_test)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 12, 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "정확도 : 0.99, 정밀도 : 0.99, 재현율 : 1.00\n", 353 | "f1-score : 0.99, auc : 0.99\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "metrics(y_test, xgb_pred)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 2, 385 | "metadata": {}, 386 | 
"outputs": [ 387 | { 388 | "data": { 389 | "text/html": [ 390 | "
\n", 391 | "\n", 404 | "\n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", 500 | "
" 501 | ], 502 | "text/plain": [ 503 | " PassengerId Survived Pclass \\\n", 504 | "0 1 0 3 \n", 505 | "1 2 1 1 \n", 506 | "2 3 1 3 \n", 507 | "3 4 1 1 \n", 508 | "4 5 0 3 \n", 509 | "\n", 510 | " Name Sex Age SibSp \\\n", 511 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 512 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 513 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 514 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 515 | "4 Allen, Mr. William Henry male 35.0 0 \n", 516 | "\n", 517 | " Parch Ticket Fare Cabin Embarked \n", 518 | "0 0 A/5 21171 7.2500 NaN S \n", 519 | "1 0 PC 17599 71.2833 C85 C \n", 520 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 521 | "3 0 113803 53.1000 C123 S \n", 522 | "4 0 373450 8.0500 NaN S " 523 | ] 524 | }, 525 | "execution_count": 2, 526 | "metadata": {}, 527 | "output_type": "execute_result" 528 | } 529 | ], 530 | "source": [ 531 | "data = pd.read_csv('./datas/titanic/train.csv')\n", 532 | "data.head()" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 3, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "data['Embarked'].fillna('S', inplace = True)\n", 542 | "data['Fare'].fillna(0, inplace=True)\n", 543 | "data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\\.')\n", 544 | "data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)\n" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 3, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "\n", 554 | "mapping = {\n", 555 | " \"Mr\":0,\n", 556 | " \"Miss\":1,\n", 557 | " \"Mrs\" : 1,\n", 558 | " \"Master\":2,\n", 559 | " \"Other\":3\n", 560 | "}\n", 561 | "\n", 562 | "\n", 563 | "\n", 564 | "mapping_sex = {\n", 565 | " 'male' : 0,\n", 566 | " 'female': 1\n", 567 | "}\n", 568 | 
"\n", 569 | "mapping_em = {\n", 570 | " 'S' :0,\n", 571 | " 'C' :1,\n", 572 | " 'Q' :2\n", 573 | "}\n" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 3, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "\n", 583 | "data['Initial'] = data['Initial'].map(mapping)\n", 584 | "data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)\n", 585 | "data['Sex'] = data['Sex'].map(mapping_sex)\n", 586 | "data['Embarked'] = data['Embarked'].map(mapping_em)\n" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 3, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "data.drop(['PassengerId', \"Ticket\", \"Cabin\", \"Name\"], axis = 1, inplace = True)\n", 596 | "\n", 597 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32\n", 598 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28\n", 599 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5\n", 600 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 4, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "y = data['Survived']\n", 610 | "X = data.drop('Survived', axis = 1)\n", 611 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=10)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 13, 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "name": "stderr", 621 | "output_type": "stream", 622 | "text": [ 623 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. 
Specify it explicitly to silence this warning.\n", 624 | " warnings.warn(CV_WARNING, FutureWarning)\n", 625 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n" 626 | ] 627 | }, 628 | { 629 | "name": "stdout", 630 | "output_type": "stream", 631 | "text": [ 632 | "Fitting 3 folds for each of 100 candidates, totalling 300 fits\n" 633 | ] 634 | }, 635 | { 636 | "name": "stderr", 637 | "output_type": "stream", 638 | "text": [ 639 | "[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 3.5s\n", 640 | "[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 8.8s\n", 641 | "[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 12.8s finished\n" 642 | ] 643 | }, 644 | { 645 | "data": { 646 | "text/plain": [ 647 | "GridSearchCV(cv='warn', error_score='raise-deprecating',\n", 648 | " estimator=XGBClassifier(base_score=0.5, booster='gbtree',\n", 649 | " colsample_bylevel=1, colsample_bynode=1,\n", 650 | " colsample_bytree=1, gamma=0,\n", 651 | " learning_rate=0.1, max_delta_step=0,\n", 652 | " max_depth=3, min_child_weight=1,\n", 653 | " missing=None, n_estimators=100, n_jobs=1,\n", 654 | " nthread=None, objective='binary:logistic',\n", 655 | " random_state=0, reg_alpha=0, reg_lambda=1,\n", 656 | " scale_pos_weight=1, seed=None, silent=None,\n", 657 | " subsample=1, verbosity=1),\n", 658 | " iid='warn', n_jobs=-1,\n", 659 | " param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],\n", 660 | " 'max_depth': [4, 6, 8, 10, 12],\n", 661 | " 'n_estimators': [100, 200, 400, 600]},\n", 662 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", 663 | " scoring='accuracy', verbose=1)" 664 | ] 665 | }, 666 | "execution_count": 13, 667 | "metadata": {}, 668 | "output_type": "execute_result" 669 | } 670 | ], 671 | "source": [ 672 | "xgb = XGBClassifier()\n", 673 | "\n", 674 | "xgb_param_grid = {\n", 675 | " 'n_estimators' : [100, 200, 400, 600],\n", 676 | " 'learning_rate' : [0.01, 0.05, 0.1, 0.15, 0.2],\n", 677 | " 'max_depth' : [4, 6, 8, 10, 12],\n", 
678 | "}\n", 679 | "\n", 680 | "xgb_grid = GridSearchCV(xgb, param_grid = xgb_param_grid, scoring=\"accuracy\", n_jobs= -1, verbose = 1)\n", 681 | "xgb_grid.fit(X_train, y_train)\n" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 14, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "name": "stdout", 691 | "output_type": "stream", 692 | "text": [ 693 | "최고 평균 정확도 : 0.8244\n", 694 | "최고의 파라미터 : {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "print(\"최고 평균 정확도 : {0:.4f}\".format(xgb_grid.best_score_))\n", 700 | "print(\"최고의 파라미터 : \", xgb_grid.best_params_)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 15, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "result = pd.DataFrame(xgb_grid.cv_results_)\n", 710 | "result.sort_values(by=['rank_test_score'], inplace=True)" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 16, 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "data": { 720 | "text/html": [ 721 | "
\n", 722 | "\n", 735 | "\n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | "
paramsmean_test_scorerank_test_score
16{'learning_rate': 0.01, 'max_depth': 12, 'n_es...0.8244381
12{'learning_rate': 0.01, 'max_depth': 10, 'n_es...0.8244381
10{'learning_rate': 0.01, 'max_depth': 8, 'n_est...0.8230343
6{'learning_rate': 0.01, 'max_depth': 6, 'n_est...0.8230343
11{'learning_rate': 0.01, 'max_depth': 8, 'n_est...0.8216295
14{'learning_rate': 0.01, 'max_depth': 10, 'n_es...0.8202256
7{'learning_rate': 0.01, 'max_depth': 6, 'n_est...0.8202256
8{'learning_rate': 0.01, 'max_depth': 8, 'n_est...0.8202256
15{'learning_rate': 0.01, 'max_depth': 10, 'n_es...0.8188209
20{'learning_rate': 0.05, 'max_depth': 4, 'n_est...0.8188209
\n", 807 | "
" 808 | ], 809 | "text/plain": [ 810 | " params mean_test_score \\\n", 811 | "16 {'learning_rate': 0.01, 'max_depth': 12, 'n_es... 0.824438 \n", 812 | "12 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... 0.824438 \n", 813 | "10 {'learning_rate': 0.01, 'max_depth': 8, 'n_est... 0.823034 \n", 814 | "6 {'learning_rate': 0.01, 'max_depth': 6, 'n_est... 0.823034 \n", 815 | "11 {'learning_rate': 0.01, 'max_depth': 8, 'n_est... 0.821629 \n", 816 | "14 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... 0.820225 \n", 817 | "7 {'learning_rate': 0.01, 'max_depth': 6, 'n_est... 0.820225 \n", 818 | "8 {'learning_rate': 0.01, 'max_depth': 8, 'n_est... 0.820225 \n", 819 | "15 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... 0.818820 \n", 820 | "20 {'learning_rate': 0.05, 'max_depth': 4, 'n_est... 0.818820 \n", 821 | "\n", 822 | " rank_test_score \n", 823 | "16 1 \n", 824 | "12 1 \n", 825 | "10 3 \n", 826 | "6 3 \n", 827 | "11 5 \n", 828 | "14 6 \n", 829 | "7 6 \n", 830 | "8 6 \n", 831 | "15 9 \n", 832 | "20 9 " 833 | ] 834 | }, 835 | "execution_count": 16, 836 | "metadata": {}, 837 | "output_type": "execute_result" 838 | } 839 | ], 840 | "source": [ 841 | "result[['params', 'mean_test_score', 'rank_test_score']].head(10)" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": 5, 847 | "metadata": {}, 848 | "outputs": [], 849 | "source": [] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": 5, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 7, 861 | "metadata": {}, 862 | "outputs": [ 863 | { 864 | "name": "stdout", 865 | "output_type": "stream", 866 | "text": [ 867 | "[0]\tvalidation_0-logloss:0.643237\n", 868 | "Will train until validation_0-logloss hasn't improved in 100 rounds.\n", 869 | "[1]\tvalidation_0-logloss:0.600544\n", 870 | "[2]\tvalidation_0-logloss:0.567278\n", 871 | "[3]\tvalidation_0-logloss:0.539616\n", 872 | 
"[4]\tvalidation_0-logloss:0.515783\n", 873 | "[5]\tvalidation_0-logloss:0.493083\n", 874 | "[6]\tvalidation_0-logloss:0.475519\n", 875 | "[7]\tvalidation_0-logloss:0.460413\n", 876 | "[8]\tvalidation_0-logloss:0.447876\n", 877 | "[9]\tvalidation_0-logloss:0.434248\n", 878 | "[10]\tvalidation_0-logloss:0.424888\n", 879 | "[11]\tvalidation_0-logloss:0.417043\n", 880 | "[12]\tvalidation_0-logloss:0.410522\n", 881 | "[13]\tvalidation_0-logloss:0.40404\n", 882 | "[14]\tvalidation_0-logloss:0.399429\n", 883 | "[15]\tvalidation_0-logloss:0.393991\n", 884 | "[16]\tvalidation_0-logloss:0.390694\n", 885 | "[17]\tvalidation_0-logloss:0.386906\n", 886 | "[18]\tvalidation_0-logloss:0.383186\n", 887 | "[19]\tvalidation_0-logloss:0.379069\n", 888 | "[20]\tvalidation_0-logloss:0.375775\n", 889 | "[21]\tvalidation_0-logloss:0.374222\n", 890 | "[22]\tvalidation_0-logloss:0.372794\n", 891 | "[23]\tvalidation_0-logloss:0.373341\n", 892 | "[24]\tvalidation_0-logloss:0.370527\n", 893 | "[25]\tvalidation_0-logloss:0.369523\n", 894 | "[26]\tvalidation_0-logloss:0.369086\n", 895 | "[27]\tvalidation_0-logloss:0.369067\n", 896 | "[28]\tvalidation_0-logloss:0.367292\n", 897 | "[29]\tvalidation_0-logloss:0.366029\n", 898 | "[30]\tvalidation_0-logloss:0.365949\n", 899 | "[31]\tvalidation_0-logloss:0.364792\n", 900 | "[32]\tvalidation_0-logloss:0.365043\n", 901 | "[33]\tvalidation_0-logloss:0.365255\n", 902 | "[34]\tvalidation_0-logloss:0.364502\n", 903 | "[35]\tvalidation_0-logloss:0.36495\n", 904 | "[36]\tvalidation_0-logloss:0.3653\n", 905 | "[37]\tvalidation_0-logloss:0.365692\n", 906 | "[38]\tvalidation_0-logloss:0.364633\n", 907 | "[39]\tvalidation_0-logloss:0.365394\n", 908 | "[40]\tvalidation_0-logloss:0.366006\n", 909 | "[41]\tvalidation_0-logloss:0.364511\n", 910 | "[42]\tvalidation_0-logloss:0.362128\n", 911 | "[43]\tvalidation_0-logloss:0.363265\n", 912 | "[44]\tvalidation_0-logloss:0.362809\n", 913 | "[45]\tvalidation_0-logloss:0.361418\n", 914 | 
"[46]\tvalidation_0-logloss:0.361081\n", 915 | "[47]\tvalidation_0-logloss:0.362271\n", 916 | "[48]\tvalidation_0-logloss:0.360343\n", 917 | "[49]\tvalidation_0-logloss:0.360147\n", 918 | "[50]\tvalidation_0-logloss:0.359536\n", 919 | "[51]\tvalidation_0-logloss:0.360105\n", 920 | "[52]\tvalidation_0-logloss:0.359863\n", 921 | "[53]\tvalidation_0-logloss:0.360054\n", 922 | "[54]\tvalidation_0-logloss:0.360457\n", 923 | "[55]\tvalidation_0-logloss:0.359963\n", 924 | "[56]\tvalidation_0-logloss:0.359591\n", 925 | "[57]\tvalidation_0-logloss:0.360042\n", 926 | "[58]\tvalidation_0-logloss:0.358606\n", 927 | "[59]\tvalidation_0-logloss:0.35847\n", 928 | "[60]\tvalidation_0-logloss:0.358429\n", 929 | "[61]\tvalidation_0-logloss:0.358046\n", 930 | "[62]\tvalidation_0-logloss:0.357865\n", 931 | "[63]\tvalidation_0-logloss:0.356589\n", 932 | "[64]\tvalidation_0-logloss:0.356376\n", 933 | "[65]\tvalidation_0-logloss:0.357027\n", 934 | "[66]\tvalidation_0-logloss:0.356924\n", 935 | "[67]\tvalidation_0-logloss:0.357237\n", 936 | "[68]\tvalidation_0-logloss:0.358427\n", 937 | "[69]\tvalidation_0-logloss:0.358904\n", 938 | "[70]\tvalidation_0-logloss:0.356838\n", 939 | "[71]\tvalidation_0-logloss:0.355709\n", 940 | "[72]\tvalidation_0-logloss:0.356185\n", 941 | "[73]\tvalidation_0-logloss:0.357439\n", 942 | "[74]\tvalidation_0-logloss:0.356952\n", 943 | "[75]\tvalidation_0-logloss:0.356894\n", 944 | "[76]\tvalidation_0-logloss:0.357164\n", 945 | "[77]\tvalidation_0-logloss:0.35748\n", 946 | "[78]\tvalidation_0-logloss:0.357296\n", 947 | "[79]\tvalidation_0-logloss:0.357984\n", 948 | "[80]\tvalidation_0-logloss:0.357816\n", 949 | "[81]\tvalidation_0-logloss:0.358238\n", 950 | "[82]\tvalidation_0-logloss:0.358398\n", 951 | "[83]\tvalidation_0-logloss:0.358424\n", 952 | "[84]\tvalidation_0-logloss:0.358912\n", 953 | "[85]\tvalidation_0-logloss:0.360025\n", 954 | "[86]\tvalidation_0-logloss:0.359234\n", 955 | "[87]\tvalidation_0-logloss:0.359403\n", 956 | 
"[88]\tvalidation_0-logloss:0.358514\n", 957 | "[89]\tvalidation_0-logloss:0.359621\n", 958 | "[90]\tvalidation_0-logloss:0.359716\n", 959 | "[91]\tvalidation_0-logloss:0.360305\n", 960 | "[92]\tvalidation_0-logloss:0.359297\n", 961 | "[93]\tvalidation_0-logloss:0.35923\n", 962 | "[94]\tvalidation_0-logloss:0.35925\n", 963 | "[95]\tvalidation_0-logloss:0.359636\n", 964 | "[96]\tvalidation_0-logloss:0.358746\n", 965 | "[97]\tvalidation_0-logloss:0.359995\n", 966 | "[98]\tvalidation_0-logloss:0.358856\n", 967 | "[99]\tvalidation_0-logloss:0.359269\n", 968 | "[100]\tvalidation_0-logloss:0.359495\n", 969 | "[101]\tvalidation_0-logloss:0.359534\n", 970 | "[102]\tvalidation_0-logloss:0.359903\n", 971 | "[103]\tvalidation_0-logloss:0.360073\n", 972 | "[104]\tvalidation_0-logloss:0.360139\n", 973 | "[105]\tvalidation_0-logloss:0.360796\n", 974 | "[106]\tvalidation_0-logloss:0.359293\n", 975 | "[107]\tvalidation_0-logloss:0.359956\n", 976 | "[108]\tvalidation_0-logloss:0.360043\n", 977 | "[109]\tvalidation_0-logloss:0.359125\n", 978 | "[110]\tvalidation_0-logloss:0.359315\n", 979 | "[111]\tvalidation_0-logloss:0.3594\n", 980 | "[112]\tvalidation_0-logloss:0.359811\n", 981 | "[113]\tvalidation_0-logloss:0.359921\n", 982 | "[114]\tvalidation_0-logloss:0.360095\n", 983 | "[115]\tvalidation_0-logloss:0.35926\n", 984 | "[116]\tvalidation_0-logloss:0.359522\n", 985 | "[117]\tvalidation_0-logloss:0.35992\n", 986 | "[118]\tvalidation_0-logloss:0.359175\n", 987 | "[119]\tvalidation_0-logloss:0.358587\n", 988 | "[120]\tvalidation_0-logloss:0.358692\n", 989 | "[121]\tvalidation_0-logloss:0.359066\n", 990 | "[122]\tvalidation_0-logloss:0.359215\n", 991 | "[123]\tvalidation_0-logloss:0.358593\n", 992 | "[124]\tvalidation_0-logloss:0.35855\n", 993 | "[125]\tvalidation_0-logloss:0.35841\n", 994 | "[126]\tvalidation_0-logloss:0.358248\n", 995 | "[127]\tvalidation_0-logloss:0.358388\n", 996 | "[128]\tvalidation_0-logloss:0.358489\n", 997 | "[129]\tvalidation_0-logloss:0.358913\n", 998 | 
"[130]\tvalidation_0-logloss:0.359169\n", 999 | "[131]\tvalidation_0-logloss:0.358706\n", 1000 | "[132]\tvalidation_0-logloss:0.358846\n", 1001 | "[133]\tvalidation_0-logloss:0.35899\n", 1002 | "[134]\tvalidation_0-logloss:0.358574\n", 1003 | "[135]\tvalidation_0-logloss:0.358431\n", 1004 | "[136]\tvalidation_0-logloss:0.358572\n", 1005 | "[137]\tvalidation_0-logloss:0.357526\n", 1006 | "[138]\tvalidation_0-logloss:0.3576\n", 1007 | "[139]\tvalidation_0-logloss:0.358176\n", 1008 | "[140]\tvalidation_0-logloss:0.357707\n", 1009 | "[141]\tvalidation_0-logloss:0.357483\n", 1010 | "[142]\tvalidation_0-logloss:0.357542\n", 1011 | "[143]\tvalidation_0-logloss:0.357489\n", 1012 | "[144]\tvalidation_0-logloss:0.357366\n", 1013 | "[145]\tvalidation_0-logloss:0.358119\n", 1014 | "[146]\tvalidation_0-logloss:0.358145\n", 1015 | "[147]\tvalidation_0-logloss:0.35822\n", 1016 | "[148]\tvalidation_0-logloss:0.35805\n", 1017 | "[149]\tvalidation_0-logloss:0.35899\n", 1018 | "[150]\tvalidation_0-logloss:0.35882\n", 1019 | "[151]\tvalidation_0-logloss:0.357895\n", 1020 | "[152]\tvalidation_0-logloss:0.358154\n", 1021 | "[153]\tvalidation_0-logloss:0.357417\n", 1022 | "[154]\tvalidation_0-logloss:0.359365\n", 1023 | "[155]\tvalidation_0-logloss:0.358782\n", 1024 | "[156]\tvalidation_0-logloss:0.358195\n", 1025 | "[157]\tvalidation_0-logloss:0.357697\n", 1026 | "[158]\tvalidation_0-logloss:0.358491\n", 1027 | "[159]\tvalidation_0-logloss:0.358627\n", 1028 | "[160]\tvalidation_0-logloss:0.358216\n", 1029 | "[161]\tvalidation_0-logloss:0.358591\n", 1030 | "[162]\tvalidation_0-logloss:0.358682\n", 1031 | "[163]\tvalidation_0-logloss:0.358732\n", 1032 | "[164]\tvalidation_0-logloss:0.358995\n", 1033 | "[165]\tvalidation_0-logloss:0.359204\n", 1034 | "[166]\tvalidation_0-logloss:0.358358\n", 1035 | "[167]\tvalidation_0-logloss:0.359008\n", 1036 | "[168]\tvalidation_0-logloss:0.358891\n", 1037 | "[169]\tvalidation_0-logloss:0.357869\n", 1038 | "[170]\tvalidation_0-logloss:0.357907\n", 1039 
| "[171]\tvalidation_0-logloss:0.3576\n", 1040 | "Stopping. Best iteration:\n", 1041 | "[71]\tvalidation_0-logloss:0.355709\n", 1042 | "\n" 1043 | ] 1044 | }, 1045 | { 1046 | "data": { 1047 | "text/plain": [ 1048 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", 1049 | " colsample_bynode=1, colsample_bytree=1, gamma=0,\n", 1050 | " learning_rate=0.1, max_delta_step=0, max_depth=3,\n", 1051 | " min_child_weight=1, missing=None, n_estimators=400, n_jobs=1,\n", 1052 | " nthread=None, objective='binary:logistic', random_state=0,\n", 1053 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n", 1054 | " silent=None, subsample=1, verbosity=1)" 1055 | ] 1056 | }, 1057 | "execution_count": 7, 1058 | "metadata": {}, 1059 | "output_type": "execute_result" 1060 | } 1061 | ], 1062 | "source": [ 1063 | "xgb = XGBClassifier(n_estimators=400, learning_rate = 0.1, max_depth = 3)\n", 1064 | "evals = [(X_test, y_test)]\n", 1065 | "xgb.fit(X_train, y_train, early_stopping_rounds = 100, eval_metric = \"logloss\", eval_set = evals, verbose = 1)" 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "code", 1070 | "execution_count": 17, 1071 | "metadata": {}, 1072 | "outputs": [ 1073 | { 1074 | "name": "stdout", 1075 | "output_type": "stream", 1076 | "text": [ 1077 | "[0]\tvalidation_0-logloss:0.643237\n", 1078 | "Will train until validation_0-logloss hasn't improved in 3 rounds.\n", 1079 | "[1]\tvalidation_0-logloss:0.600544\n", 1080 | "[2]\tvalidation_0-logloss:0.567278\n", 1081 | "[3]\tvalidation_0-logloss:0.539616\n", 1082 | "[4]\tvalidation_0-logloss:0.515783\n", 1083 | "[5]\tvalidation_0-logloss:0.493083\n", 1084 | "[6]\tvalidation_0-logloss:0.475519\n", 1085 | "[7]\tvalidation_0-logloss:0.460413\n", 1086 | "[8]\tvalidation_0-logloss:0.447876\n", 1087 | "[9]\tvalidation_0-logloss:0.434248\n", 1088 | "[10]\tvalidation_0-logloss:0.424888\n", 1089 | "[11]\tvalidation_0-logloss:0.417043\n", 1090 | "[12]\tvalidation_0-logloss:0.410522\n", 1091 | 
"[13]\tvalidation_0-logloss:0.40404\n", 1092 | "[14]\tvalidation_0-logloss:0.399429\n", 1093 | "[15]\tvalidation_0-logloss:0.393991\n", 1094 | "[16]\tvalidation_0-logloss:0.390694\n", 1095 | "[17]\tvalidation_0-logloss:0.386906\n", 1096 | "[18]\tvalidation_0-logloss:0.383186\n", 1097 | "[19]\tvalidation_0-logloss:0.379069\n", 1098 | "[20]\tvalidation_0-logloss:0.375775\n", 1099 | "[21]\tvalidation_0-logloss:0.374222\n", 1100 | "[22]\tvalidation_0-logloss:0.372794\n", 1101 | "[23]\tvalidation_0-logloss:0.373341\n", 1102 | "[24]\tvalidation_0-logloss:0.370527\n", 1103 | "[25]\tvalidation_0-logloss:0.369523\n", 1104 | "[26]\tvalidation_0-logloss:0.369086\n", 1105 | "[27]\tvalidation_0-logloss:0.369067\n", 1106 | "[28]\tvalidation_0-logloss:0.367292\n", 1107 | "[29]\tvalidation_0-logloss:0.366029\n", 1108 | "[30]\tvalidation_0-logloss:0.365949\n", 1109 | "[31]\tvalidation_0-logloss:0.364792\n", 1110 | "[32]\tvalidation_0-logloss:0.365043\n", 1111 | "[33]\tvalidation_0-logloss:0.365255\n", 1112 | "[34]\tvalidation_0-logloss:0.364502\n", 1113 | "[35]\tvalidation_0-logloss:0.36495\n", 1114 | "[36]\tvalidation_0-logloss:0.3653\n", 1115 | "[37]\tvalidation_0-logloss:0.365692\n", 1116 | "Stopping. 
Best iteration:\n", 1117 | "[34]\tvalidation_0-logloss:0.364502\n", 1118 | "\n" 1119 | ] 1120 | }, 1121 | { 1122 | "data": { 1123 | "text/plain": [ 1124 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", 1125 | " colsample_bynode=1, colsample_bytree=1, gamma=0,\n", 1126 | " learning_rate=0.1, max_delta_step=0, max_depth=3,\n", 1127 | " min_child_weight=1, missing=None, n_estimators=400, n_jobs=1,\n", 1128 | " nthread=None, objective='binary:logistic', random_state=0,\n", 1129 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n", 1130 | " silent=None, subsample=1, verbosity=1)" 1131 | ] 1132 | }, 1133 | "execution_count": 17, 1134 | "metadata": {}, 1135 | "output_type": "execute_result" 1136 | } 1137 | ], 1138 | "source": [ 1139 | "xgb = XGBClassifier(n_estimators=400, learning_rate = 0.1, max_depth = 3)\n", 1140 | "evals = [(X_test, y_test)]\n", 1141 | "xgb.fit(X_train, y_train, early_stopping_rounds = 3, eval_metric = \"logloss\", eval_set = evals, verbose = 1)" 1142 | ] 1143 | }, 1144 | { 1145 | "cell_type": "code", 1146 | "execution_count": 9, 1147 | "metadata": {}, 1148 | "outputs": [ 1149 | { 1150 | "data": { 1151 | "text/plain": [ 1152 | "" 1153 | ] 1154 | }, 1155 | "execution_count": 9, 1156 | "metadata": {}, 1157 | "output_type": "execute_result" 1158 | }, 1159 | { 1160 | "data": { 1161 | "image/png": "\n", 1162 | "text/plain": [ 1163 | "
" 1164 | ] 1165 | }, 1166 | "metadata": { 1167 | "needs_background": "light" 1168 | }, 1169 | "output_type": "display_data" 1170 | } 1171 | ], 1172 | "source": [ 1173 | "fig, ax = plt.subplots()\n", 1174 | "plot_importance(xgb, ax=ax)" 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "code", 1179 | "execution_count": null, 1180 | "metadata": {}, 1181 | "outputs": [], 1182 | "source": [] 1183 | }, 1184 | { 1185 | "cell_type": "code", 1186 | "execution_count": 6, 1187 | "metadata": {}, 1188 | "outputs": [], 1189 | "source": [ 1190 | "from lightgbm import LGBMClassifier, plot_importance" 1191 | ] 1192 | }, 1193 | { 1194 | "cell_type": "code", 1195 | "execution_count": 11, 1196 | "metadata": {}, 1197 | "outputs": [ 1198 | { 1199 | "data": { 1200 | "text/plain": [ 1201 | "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n", 1202 | " importance_type='split', learning_rate=0.1, max_depth=-1,\n", 1203 | " min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n", 1204 | " n_estimaotrs=400, n_estimators=100, n_jobs=-1, num_leaves=31,\n", 1205 | " objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n", 1206 | " silent=True, subsample=1.0, subsample_for_bin=200000,\n", 1207 | " subsample_freq=0)" 1208 | ] 1209 | }, 1210 | "execution_count": 11, 1211 | "metadata": {}, 1212 | "output_type": "execute_result" 1213 | } 1214 | ], 1215 | "source": [ 1216 | "lgb = LGBMClassifier(n_estimaotrs = 400)\n", 1217 | "lgb.fit(X_train, y_train)" 1218 | ] 1219 | }, 1220 | { 1221 | "cell_type": "code", 1222 | "execution_count": 12, 1223 | "metadata": {}, 1224 | "outputs": [ 1225 | { 1226 | "name": "stdout", 1227 | "output_type": "stream", 1228 | "text": [ 1229 | "정확도 : 0.84, 정밀도 : 0.79, 재현율 : 0.73\n", 1230 | "f1-score : 0.76, auc : 0.81\n" 1231 | ] 1232 | } 1233 | ], 1234 | "source": [ 1235 | "lgb_pred = lgb.predict(X_test)\n", 1236 | "metrics(y_test, lgb_pred)" 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "code", 1241 | "execution_count": 
13, 1242 | "metadata": {}, 1243 | "outputs": [ 1244 | { 1245 | "name": "stdout", 1246 | "output_type": "stream", 1247 | "text": [ 1248 | "[1]\tvalid_0's binary_logloss: 0.605701\n", 1249 | "Training until validation scores don't improve for 100 rounds\n", 1250 | "[2]\tvalid_0's binary_logloss: 0.569461\n", 1251 | "[3]\tvalid_0's binary_logloss: 0.540251\n", 1252 | "[4]\tvalid_0's binary_logloss: 0.5147\n", 1253 | "[5]\tvalid_0's binary_logloss: 0.493662\n", 1254 | "[6]\tvalid_0's binary_logloss: 0.47569\n", 1255 | "[7]\tvalid_0's binary_logloss: 0.45573\n", 1256 | "[8]\tvalid_0's binary_logloss: 0.442288\n", 1257 | "[9]\tvalid_0's binary_logloss: 0.427343\n", 1258 | "[10]\tvalid_0's binary_logloss: 0.41478\n", 1259 | "[11]\tvalid_0's binary_logloss: 0.404568\n", 1260 | "[12]\tvalid_0's binary_logloss: 0.394087\n", 1261 | "[13]\tvalid_0's binary_logloss: 0.384579\n", 1262 | "[14]\tvalid_0's binary_logloss: 0.377022\n", 1263 | "[15]\tvalid_0's binary_logloss: 0.372698\n", 1264 | "[16]\tvalid_0's binary_logloss: 0.367266\n", 1265 | "[17]\tvalid_0's binary_logloss: 0.364566\n", 1266 | "[18]\tvalid_0's binary_logloss: 0.362322\n", 1267 | "[19]\tvalid_0's binary_logloss: 0.35638\n", 1268 | "[20]\tvalid_0's binary_logloss: 0.352956\n", 1269 | "[21]\tvalid_0's binary_logloss: 0.351149\n", 1270 | "[22]\tvalid_0's binary_logloss: 0.350341\n", 1271 | "[23]\tvalid_0's binary_logloss: 0.348923\n", 1272 | "[24]\tvalid_0's binary_logloss: 0.348176\n", 1273 | "[25]\tvalid_0's binary_logloss: 0.34714\n", 1274 | "[26]\tvalid_0's binary_logloss: 0.346754\n", 1275 | "[27]\tvalid_0's binary_logloss: 0.347015\n", 1276 | "[28]\tvalid_0's binary_logloss: 0.347799\n", 1277 | "[29]\tvalid_0's binary_logloss: 0.348623\n", 1278 | "[30]\tvalid_0's binary_logloss: 0.349346\n", 1279 | "[31]\tvalid_0's binary_logloss: 0.350961\n", 1280 | "[32]\tvalid_0's binary_logloss: 0.352158\n", 1281 | "[33]\tvalid_0's binary_logloss: 0.352746\n", 1282 | "[34]\tvalid_0's binary_logloss: 0.353988\n", 1283 | 
"[35]\tvalid_0's binary_logloss: 0.35563\n", 1284 | "[36]\tvalid_0's binary_logloss: 0.357587\n", 1285 | "[37]\tvalid_0's binary_logloss: 0.357775\n", 1286 | "[38]\tvalid_0's binary_logloss: 0.359317\n", 1287 | "[39]\tvalid_0's binary_logloss: 0.360177\n", 1288 | "[40]\tvalid_0's binary_logloss: 0.359158\n", 1289 | "[41]\tvalid_0's binary_logloss: 0.360159\n", 1290 | "[42]\tvalid_0's binary_logloss: 0.359884\n", 1291 | "[43]\tvalid_0's binary_logloss: 0.360693\n", 1292 | "[44]\tvalid_0's binary_logloss: 0.361518\n", 1293 | "[45]\tvalid_0's binary_logloss: 0.361417\n", 1294 | "[46]\tvalid_0's binary_logloss: 0.36477\n", 1295 | "[47]\tvalid_0's binary_logloss: 0.366563\n", 1296 | "[48]\tvalid_0's binary_logloss: 0.367413\n", 1297 | "[49]\tvalid_0's binary_logloss: 0.370403\n", 1298 | "[50]\tvalid_0's binary_logloss: 0.370454\n", 1299 | "[51]\tvalid_0's binary_logloss: 0.3713\n", 1300 | "[52]\tvalid_0's binary_logloss: 0.373395\n", 1301 | "[53]\tvalid_0's binary_logloss: 0.371452\n", 1302 | "[54]\tvalid_0's binary_logloss: 0.370792\n", 1303 | "[55]\tvalid_0's binary_logloss: 0.369311\n", 1304 | "[56]\tvalid_0's binary_logloss: 0.368987\n", 1305 | "[57]\tvalid_0's binary_logloss: 0.372909\n", 1306 | "[58]\tvalid_0's binary_logloss: 0.371587\n", 1307 | "[59]\tvalid_0's binary_logloss: 0.371235\n", 1308 | "[60]\tvalid_0's binary_logloss: 0.371714\n", 1309 | "[61]\tvalid_0's binary_logloss: 0.372394\n", 1310 | "[62]\tvalid_0's binary_logloss: 0.371164\n", 1311 | "[63]\tvalid_0's binary_logloss: 0.371928\n", 1312 | "[64]\tvalid_0's binary_logloss: 0.372314\n", 1313 | "[65]\tvalid_0's binary_logloss: 0.372502\n", 1314 | "[66]\tvalid_0's binary_logloss: 0.376302\n", 1315 | "[67]\tvalid_0's binary_logloss: 0.378364\n", 1316 | "[68]\tvalid_0's binary_logloss: 0.378404\n", 1317 | "[69]\tvalid_0's binary_logloss: 0.381327\n", 1318 | "[70]\tvalid_0's binary_logloss: 0.380973\n", 1319 | "[71]\tvalid_0's binary_logloss: 0.382481\n", 1320 | "[72]\tvalid_0's binary_logloss: 
0.38136\n", 1321 | "[73]\tvalid_0's binary_logloss: 0.383008\n", 1322 | "[74]\tvalid_0's binary_logloss: 0.381861\n", 1323 | "[75]\tvalid_0's binary_logloss: 0.382796\n", 1324 | "[76]\tvalid_0's binary_logloss: 0.38258\n", 1325 | "[77]\tvalid_0's binary_logloss: 0.384473\n", 1326 | "[78]\tvalid_0's binary_logloss: 0.383581\n", 1327 | "[79]\tvalid_0's binary_logloss: 0.385198\n", 1328 | "[80]\tvalid_0's binary_logloss: 0.383797\n", 1329 | "[81]\tvalid_0's binary_logloss: 0.383937\n", 1330 | "[82]\tvalid_0's binary_logloss: 0.383372\n", 1331 | "[83]\tvalid_0's binary_logloss: 0.384661\n", 1332 | "[84]\tvalid_0's binary_logloss: 0.383799\n", 1333 | "[85]\tvalid_0's binary_logloss: 0.384108\n", 1334 | "[86]\tvalid_0's binary_logloss: 0.383364\n", 1335 | "[87]\tvalid_0's binary_logloss: 0.384795\n", 1336 | "[88]\tvalid_0's binary_logloss: 0.384702\n", 1337 | "[89]\tvalid_0's binary_logloss: 0.386003\n", 1338 | "[90]\tvalid_0's binary_logloss: 0.386621\n", 1339 | "[91]\tvalid_0's binary_logloss: 0.387986\n", 1340 | "[92]\tvalid_0's binary_logloss: 0.390496\n", 1341 | "[93]\tvalid_0's binary_logloss: 0.389984\n", 1342 | "[94]\tvalid_0's binary_logloss: 0.391477\n", 1343 | "[95]\tvalid_0's binary_logloss: 0.391917\n", 1344 | "[96]\tvalid_0's binary_logloss: 0.392326\n", 1345 | "[97]\tvalid_0's binary_logloss: 0.392731\n", 1346 | "[98]\tvalid_0's binary_logloss: 0.392586\n", 1347 | "[99]\tvalid_0's binary_logloss: 0.394479\n", 1348 | "[100]\tvalid_0's binary_logloss: 0.397251\n", 1349 | "Did not meet early stopping. 
Best iteration is:\n", 1350 | "[26]\tvalid_0's binary_logloss: 0.346754\n" 1351 | ] 1352 | }, 1353 | { 1354 | "data": { 1355 | "text/plain": [ 1356 | "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n", 1357 | " importance_type='split', learning_rate=0.1, max_depth=-1,\n", 1358 | " min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n", 1359 | " n_estimaotrs=400, n_estimators=100, n_jobs=-1, num_leaves=31,\n", 1360 | " objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n", 1361 | " silent=True, subsample=1.0, subsample_for_bin=200000,\n", 1362 | " subsample_freq=0)" 1363 | ] 1364 | }, 1365 | "execution_count": 13, 1366 | "metadata": {}, 1367 | "output_type": "execute_result" 1368 | } 1369 | ], 1370 | "source": [ 1371 | "lgb = LGBMClassifier(n_estimators = 400)\n", 1372 | "evals = [(X_test, y_test)]\n", 1373 | "lgb.fit(X_train, y_train, early_stopping_rounds = 100, eval_metric = \"logloss\", eval_set = evals, verbose = True)" 1374 | ] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "execution_count": 15, 1379 | "metadata": {}, 1380 | "outputs": [ 1381 | { 1382 | "data": { 1383 | "text/plain": [ 1384 | "" 1385 | ] 1386 | }, 1387 | "execution_count": 15, 1388 | "metadata": {}, 1389 | "output_type": "execute_result" 1390 | }, 1391 | { 1392 | "data": { 1393 | "image/png": "\n", 1394 | "text/plain": [ 1395 | "
" 1396 | ] 1397 | }, 1398 | "metadata": { 1399 | "needs_background": "light" 1400 | }, 1401 | "output_type": "display_data" 1402 | } 1403 | ], 1404 | "source": [ 1405 | "fig, ax = plt.subplots(figsize=(10, 6))\n", 1406 | "plot_importance(lgb, ax = ax)" 1407 | ] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": null, 1412 | "metadata": {}, 1413 | "outputs": [], 1414 | "source": [] 1415 | }, 1416 | { 1417 | "cell_type": "code", 1418 | "execution_count": null, 1419 | "metadata": {}, 1420 | "outputs": [], 1421 | "source": [] 1422 | } 1423 | ], 1424 | "metadata": { 1425 | "kernelspec": { 1426 | "display_name": "Python 3", 1427 | "language": "python", 1428 | "name": "python3" 1429 | }, 1430 | "language_info": { 1431 | "codemirror_mode": { 1432 | "name": "ipython", 1433 | "version": 3 1434 | }, 1435 | "file_extension": ".py", 1436 | "mimetype": "text/x-python", 1437 | "name": "python", 1438 | "nbconvert_exporter": "python", 1439 | "pygments_lexer": "ipython3", 1440 | "version": "3.6.9" 1441 | } 1442 | }, 1443 | "nbformat": 4, 1444 | "nbformat_minor": 2 1445 | } 1446 | -------------------------------------------------------------------------------- /010. 
credit_card_fraud_basic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "import seaborn as sns\n", 13 | "from xgboost import XGBClassifier, plot_importance as xg_importance\n", 14 | "from lightgbm import LGBMClassifier, plot_importance as lgb_importance\n", 15 | "from sklearn.datasets import load_breast_cancer\n", 16 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 17 | "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix\n", 18 | "\n", 19 | "import warnings\n", 20 | "warnings.filterwarnings('ignore')\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "def metrics(y_test, pred):\n", 30 | " accuracy = accuracy_score(y_test, pred)\n", 31 | " precision = precision_score(y_test, pred)\n", 32 | " recall = recall_score(y_test, pred)\n", 33 | " f1 = f1_score(y_test, pred)\n", 34 | " roc_score = roc_auc_score(y_test, pred, average='macro')\n", 35 | " print('정확도 : {0:.2f}, 정밀도 : {1:.2f}, 재현율 : {2:.2f}'.format(accuracy, precision, recall))\n", 36 | " print('f1-score : {0:.2f}, auc : {1:.2f}'.format(f1, roc_score))" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "data = pd.read_csv('../datas/credit card fraud/creditcard.csv')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 5, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "(284807, 31)" 57 | ] 58 | }, 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | 
"data.shape" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "\n", 78 | "RangeIndex: 284807 entries, 0 to 284806\n", 79 | "Data columns (total 31 columns):\n", 80 | "Time 284807 non-null float64\n", 81 | "V1 284807 non-null float64\n", 82 | "V2 284807 non-null float64\n", 83 | "V3 284807 non-null float64\n", 84 | "V4 284807 non-null float64\n", 85 | "V5 284807 non-null float64\n", 86 | "V6 284807 non-null float64\n", 87 | "V7 284807 non-null float64\n", 88 | "V8 284807 non-null float64\n", 89 | "V9 284807 non-null float64\n", 90 | "V10 284807 non-null float64\n", 91 | "V11 284807 non-null float64\n", 92 | "V12 284807 non-null float64\n", 93 | "V13 284807 non-null float64\n", 94 | "V14 284807 non-null float64\n", 95 | "V15 284807 non-null float64\n", 96 | "V16 284807 non-null float64\n", 97 | "V17 284807 non-null float64\n", 98 | "V18 284807 non-null float64\n", 99 | "V19 284807 non-null float64\n", 100 | "V20 284807 non-null float64\n", 101 | "V21 284807 non-null float64\n", 102 | "V22 284807 non-null float64\n", 103 | "V23 284807 non-null float64\n", 104 | "V24 284807 non-null float64\n", 105 | "V25 284807 non-null float64\n", 106 | "V26 284807 non-null float64\n", 107 | "V27 284807 non-null float64\n", 108 | "V28 284807 non-null float64\n", 109 | "Amount 284807 non-null float64\n", 110 | "Class 284807 non-null int64\n", 111 | "dtypes: float64(30), int64(1)\n", 112 | "memory usage: 67.4 MB\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "data.info()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "Time 0\n", 129 | "V1 0\n", 130 | "V2 0\n", 131 | "V3 0\n", 132 | "V4 0\n", 133 | "V5 0\n", 134 | "V6 0\n", 135 | "V7 0\n", 136 | "V8 0\n", 137 | "V9 0\n", 138 | "V10 0\n", 139 | "V11 0\n", 140 | "V12 
0\n", 141 | "V13 0\n", 142 | "V14 0\n", 143 | "V15 0\n", 144 | "V16 0\n", 145 | "V17 0\n", 146 | "V18 0\n", 147 | "V19 0\n", 148 | "V20 0\n", 149 | "V21 0\n", 150 | "V22 0\n", 151 | "V23 0\n", 152 | "V24 0\n", 153 | "V25 0\n", 154 | "V26 0\n", 155 | "V27 0\n", 156 | "V28 0\n", 157 | "Amount 0\n", 158 | "Class 0\n", 159 | "dtype: int64" 160 | ] 161 | }, 162 | "execution_count": 7, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "data.isna().sum()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 8, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "0 284315\n", 180 | "1 492\n", 181 | "Name: Class, dtype: int64" 182 | ] 183 | }, 184 | "execution_count": 8, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "data.iloc[:, -1].value_counts()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 9, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "" 202 | ] 203 | }, 204 | "execution_count": 9, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | }, 208 | { 209 | "data": { 210 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYkAAAD1CAYAAAClSgmzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAPZElEQVR4nO3cUaxdVZ3H8e9vWjFmHKXKhTBtmRLtZKwmU7WBJr44kpTCPJRJICkP0hCSGgOJJj5YfalRSfRBSUi0SQ0NxTgiQQ3NTLXTVCbGjGIvSoDaYXqDCNc2UGxFJkYd8D8PZzUeLmfde3sL5xb6/SQ7Z5//XmvtdZLb++tee5+bqkKSpFH+arEnIEk6exkSkqQuQ0KS1GVISJK6DAlJUpchIUnqWrrYE3ilXXDBBbVq1arFnoYkvaY8+OCDz1bVxMz66y4kVq1axeTk5GJPQ5JeU5L8alTd5SZJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSul53X6Z7rVi17d8XewqvK0984Z8XewrS65JXEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqmjMkkqxMcn+Sw0kOJflYq38mya+TPNS2q4f6fCrJVJLHklw5VN/YalNJtg3VL03yQJIjSb6V5LxWf2N7P9WOr3olP7wkaXbzuZJ4AfhEVb0LWA/cnGRNO3ZbVa1t216Admwz8G5gI/DVJEuSLAG+AlwFrAGuHxrni22s1cBJ4KZWvwk4WVXvBG5r7SRJYzJnSFTVsar6Wdt/HjgMLJ+lyybg7qr6Y1X9EpgCLmvbVFU9XlV/Au4GNiUJ8CHg3tZ/N3DN0Fi72/69wBWtvSRpDE7rnkRb7nkv8EAr3ZLk4SS7kixrteXAU0PdplutV3878NuqemFG/SVjtePPtfYz57U1yWSSyePHj5/OR5IkzWLeIZHkzcC3gY9X1e+AHcA7gLXAMeBLp5qO6F4LqM821ksLVTural1VrZuYmJj1c0iS5m9eIZHkDQwC4htV9R2Aqnq6ql6sqj8DX2OwnASDK4GVQ91XAEdnqT8LnJ9k6Yz6S8Zqx98KnDidDyhJWrj5PN0U4A7gcFV9eah+8VCzfwEebft7gM3tyaRLgdXAT4GDwOr2JNN5DG5u76mqAu4Hrm39twD3DY21pe1fC/ygtZckjcHSuZvwAeDDwCNJHmq1TzN4Omktg+WfJ4CPAFTVoST3AL9g8GTUzVX1IkCSW4B9wBJgV1UdauN9Erg7yeeBnzMIJdrr15NMMbiC2HwGn1WSdJrmDImq+hGj7w3snaXPrcCtI+p7R/Wrqsf5y3LVcP0PwHVzzVGS9OrwG9eSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkrrmDIkkK5Pcn+RwkkNJPtbqb0uyP8mR9rqs1ZPk9iRTSR5O8r6hsba09keSbBmqvz/JI63P7Uky2zkkSeMxnyuJF4BPVNW7gPXAzUnWANuAA1W1GjjQ3gNcBaxu21ZgBwx+4QPbgcuBy4DtQ7/0d7S2p/ptbPXeOSRJYzBnSFTVsar6Wdt/HjgMLAc2Abtbs93ANW1/E3BXDfwEOD/JxcCVwP6qOlFVJ4H9wMZ27C1V9eOqKuCuGWONOockaQxO655EklXAe4EHgIuq6hgMggS4sDVbDjw11G261WarT4+oM8s5JEljMO+QSPJm4NvAx6vqd7M1HVGrBdTnLcnWJJNJJo8fP346XSVJs5hXSCR5A4OA+EZVfaeVn25
LRbTXZ1p9Glg51H0FcHSO+ooR9dnO8RJVtbOq1lXVuomJifl8JEnSPMzn6aYAdwCHq+rLQ4f2AKeeUNoC3DdUv6E95bQeeK4tFe0DNiRZ1m5YbwD2tWPPJ1nfznXDjLFGnUOSNAZL59HmA8CHgUeSPNRqnwa+ANyT5CbgSeC6dmwvcDUwBfweuBGgqk4k+RxwsLX7bFWdaPsfBe4E3gR8r23Mcg5J0hjMGRJV9SNG3zcAuGJE+wJu7oy1C9g1oj4JvGdE/TejziFJGg+/cS1J6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqWvOkEiyK8kzSR4dqn0mya+TPNS2q4eOfSrJVJLHklw5VN/YalNJtg3VL03yQJIjSb6V5LxWf2N7P9WOr3qlPrQkaX7mcyVxJ7BxRP22qlrbtr0ASdYAm4F3tz5fTbIkyRLgK8BVwBrg+tYW4IttrNXASeCmVr8JOFlV7wRua+0kSWM0Z0hU1Q+BE/McbxNwd1X9sap+CUwBl7Vtqqoer6o/AXcDm5IE+BBwb+u/G7hmaKzdbf9e4IrWXpI0JmdyT+KWJA+35ahlrbYceGqozXSr9epvB35bVS/MqL9krHb8udZekjQmCw2JHcA7gLXAMeBLrT7qf/q1gPpsY71Mkq1JJpNMHj9+fLZ5S5JOw4JCoqqerqoXq+rPwNcYLCfB4Epg5VDTFcDRWerPAucnWTqj/pKx2vG30ln2qqqdVbWuqtZNTEws5CNJkkZYUEgkuXjo7b8Ap5582gNsbk8mXQqsBn4KHARWtyeZzmNwc3tPVRVwP3Bt678FuG9orC1t/1rgB629JGlMls7VIMk3gQ8CFySZBrYDH0yylsHyzxPARwCq6lCSe4BfAC8AN1fVi22cW4B9wBJgV1Udaqf4JHB3ks8DPwfuaPU7gK8nmWJwBbH5jD+tJOm0zBkSVXX9iPIdI2qn2t8K3DqivhfYO6L+OH9Zrhqu/wG4bq75SZJePX7jWpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeqaMySS7EryTJJHh2pvS7I/yZH2uqzVk+T2JFNJHk7yvqE+W1r7I0m2DNXfn+SR1uf2JJntHJKk8ZnPlcSdwMYZtW3AgapaDRxo7wGuAla3bSuwAwa/8IHtwOXAZcD2oV/6O1rbU/02znEOSdKYzBkSVfVD4MSM8iZgd9vfDVwzVL+rBn4CnJ/kYuBKYH9Vnaiqk8B+YGM79paq+nFVFXDXjLFGnUOSNCYLvSdxUVUdA2ivF7b6cuCpoXbTrTZbfXpEfbZzSJLG5JW+cZ0RtVpA/fROmmxNMplk8vjx46fbXZLUsdCQeLotFdFen2n1aWDlULsVwNE56itG1Gc7x8tU1c6qWldV6yYmJhb4kSRJMy00JPYAp55Q2gLcN1S/oT3ltB54ri0V7QM2JFnWblhvAPa1Y88nWd+earphxlijziFJGpOlczVI8k3gg8AFSaYZPKX0BeCeJDcBTwLXteZ7gauBKeD3wI0AVXUiyeeAg63dZ6vq1M3wjzJ4gupNwPfaxiznkCSNyZwhUVXXdw5dMaJtATd3xtkF7BpRnwTeM6L+m1HnkCSNj9+4liR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jI
kJEldhoQkqcuQkCR1GRKSpC5DQpLUdUYhkeSJJI8keSjJZKu9Lcn+JEfa67JWT5Lbk0wleTjJ+4bG2dLaH0myZaj+/jb+VOubM5mvJOn0vBJXEv9UVWural17vw04UFWrgQPtPcBVwOq2bQV2wCBUgO3A5cBlwPZTwdLabB3qt/EVmK8kaZ5ejeWmTcDutr8buGaoflcN/AQ4P8nFwJXA/qo6UVUngf3AxnbsLVX146oq4K6hsSRJY3CmIVHAfyR5MMnWVruoqo4BtNcLW3058NRQ3+lWm60+PaIuSRqTpWfY/wNVdTTJhcD+JP89S9tR9xNqAfWXDzwIqK0Al1xyyewzliTN2xldSVTV0fb6DPBdBvcUnm5LRbTXZ1rzaWDlUPcVwNE56itG1EfNY2dVrauqdRMTE2fykSRJQxYcEkn+OsnfnNoHNgCPAnuAU08obQHua/t7gBvaU07rgefactQ+YEOSZe2G9QZgXzv2fJL17ammG4bGkiSNwZksN10EfLc9lboU+Neq+n6Sg8A9SW4CngSua+33AlcDU8DvgRsBqupEks8BB1u7z1bVibb/UeBO4E3A99omSRqTBYdEVT0O/OOI+m+AK0bUC7i5M9YuYNeI+iTwnoXOUZJ0ZvzGtSSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktR11odEko1JHksylWTbYs9Hks4lZ3VIJFkCfAW4ClgDXJ9kzeLOSpLOHWd1SACXAVNV9XhV/Qm4G9i0yHOSpHPG0sWewByWA08NvZ8GLp/ZKMlWYGt7+79JHhvD3M4VFwDPLvYk5pIvLvYMtAheEz+bryF/N6p4todERtTqZYWqncDOV386554kk1W1brHnIc3kz+Z4nO3LTdPAyqH3K4CjizQXSTrnnO0hcRBYneTSJOcBm4E9izwnSTpnnNXLTVX1QpJbgH3AEmBXVR1a5Gmda1zG09nKn80xSNXLlvglSQLO/uUmSdIiMiQkSV2GhCSp66y+ca3xSvIPDL7RvpzB91GOAnuq6vCiTkzSovFKQgAk+SSDP3sS4KcMHj8O8E3/sKLOZkluXOw5vJ75dJMASPI/wLur6v9m1M8DDlXV6sWZmTS7JE9W1SWLPY/XK5ebdMqfgb8FfjWjfnE7Ji2aJA/3DgEXjXMu5xpDQqd8HDiQ5Ah/+aOKlwDvBG5ZtFlJAxcBVwInZ9QD/Nf4p3PuMCQEQFV9P8nfM/jz7MsZ/OObBg5W1YuLOjkJ/g14c1U9NPNAkv8c/3TOHd6TkCR1+XSTJKnLkJAkdRkSkqQuQ0KS1GVISJK6/h96EFvhHkMj3AAAAABJRU5ErkJggg==\n", 211 | "text/plain": [ 212 | "
" 213 | ] 214 | }, 215 | "metadata": { 216 | "needs_background": "light" 217 | }, 218 | "output_type": "display_data" 219 | } 220 | ], 221 | "source": [ 222 | "data.iloc[:, -1].value_counts().plot(kind='bar')" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 10, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "0 99.827251\n", 234 | "1 0.172749\n", 235 | "Name: Class, dtype: float64" 236 | ] 237 | }, 238 | "execution_count": 10, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "data.iloc[:, -1].value_counts() / data.iloc[:, -1].count() * 100" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 11, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "X = data.iloc[:, :-1]\n", 254 | "y = data.iloc[:, -1]" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 13, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 14, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "0 99.826315\n", 276 | "1 0.173685\n", 277 | "Name: Class, dtype: float64\n", 278 | "0 99.830061\n", 279 | "1 0.169939\n", 280 | "Name: Class, dtype: float64\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "print(y_train.value_counts() / y_train.count() * 100)\n", 286 | "print(y_test.value_counts() / y_test.count() * 100)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 16, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "def modeling(model, x_train, x_test, y_train, y_test):\n", 296 | " model.fit(x_train, y_train)\n", 297 | " pred = model.predict(x_test)\n", 298 | " metrics(y_test, pred)" 299 | ] 300 | }, 301 
| { 302 | "cell_type": "code", 303 | "execution_count": 20, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "정확도 : 1.00, 정밀도 : 0.77, 재현율 : 0.53\n", 311 | "f1-score : 0.63, auc : 0.76\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "from sklearn.linear_model import LogisticRegression\n", 317 | "lr = LogisticRegression()\n", 318 | "modeling(lr, X_train, X_test, y_train, y_test)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 21, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "정확도 : 1.00, 정밀도 : 0.95, 재현율 : 0.83\n", 331 | "f1-score : 0.88, auc : 0.91\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "lgb = LGBMClassifier(n_estimators = 1000, num_leaves = 64, n_jobs = -1, boost_from_average = False)\n", 337 | "modeling(lgb, x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 3", 358 | "language": "python", 359 | "name": "python3" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 3 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython3", 371 | "version": "3.6.9" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 2 376 | } 377 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
machine_learning_basic 2 | Repo for everyone who wants to learn the basics of machine learning. 3 | --------------------------------------------------------------------------------