├── .gitignore
├── 007. machine learning ensemble - random forest.ipynb
├── 008. machine learning - ensemble boosting basic.ipynb
├── 009. XGboost, LightGBM.ipynb
├── 010. credit_card_fraud_basic.ipynb
├── 011. outlier, oversampling with credit card fraud_kaggle.ipynb
├── 012. stacking ensemble.ipynb
├── 013. Time series cointegration.ipynb
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.gitignore.io/api/python,pycharm,jupyternotebooks
3 | # Edit at https://www.gitignore.io/?templates=python,pycharm,jupyternotebooks
4 |
5 | ### JupyterNotebooks ###
6 | # gitignore template for Jupyter Notebooks
7 | # website: http://jupyter.org/
8 |
9 | .ipynb_checkpoints
10 | */.ipynb_checkpoints/*
11 |
12 | # IPython
13 | profile_default/
14 | ipython_config.py
15 |
16 | # Remove previous ipynb_checkpoints
17 | # git rm -r .ipynb_checkpoints/
18 |
19 | ### PyCharm ###
20 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
21 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
22 |
23 | # User-specific stuff
24 | .idea/**/workspace.xml
25 | .idea/**/tasks.xml
26 | .idea/**/usage.statistics.xml
27 | .idea/**/dictionaries
28 | .idea/**/shelf
29 |
30 | # Generated files
31 | .idea/**/contentModel.xml
32 |
33 | # Sensitive or high-churn files
34 | .idea/**/dataSources/
35 | .idea/**/dataSources.ids
36 | .idea/**/dataSources.local.xml
37 | .idea/**/sqlDataSources.xml
38 | .idea/**/dynamic.xml
39 | .idea/**/uiDesigner.xml
40 | .idea/**/dbnavigator.xml
41 |
42 | # Gradle
43 | .idea/**/gradle.xml
44 | .idea/**/libraries
45 |
46 | # Gradle and Maven with auto-import
47 | # When using Gradle or Maven with auto-import, you should exclude module files,
48 | # since they will be recreated, and may cause churn. Uncomment if using
49 | # auto-import.
50 | # .idea/modules.xml
51 | # .idea/*.iml
52 | # .idea/modules
53 | # *.iml
54 | # *.ipr
55 |
56 | # CMake
57 | cmake-build-*/
58 |
59 | # Mongo Explorer plugin
60 | .idea/**/mongoSettings.xml
61 |
62 | # File-based project format
63 | *.iws
64 |
65 | # IntelliJ
66 | out/
67 |
68 | # mpeltonen/sbt-idea plugin
69 | .idea_modules/
70 |
71 | # JIRA plugin
72 | atlassian-ide-plugin.xml
73 |
74 | # Cursive Clojure plugin
75 | .idea/replstate.xml
76 |
77 | # Crashlytics plugin (for Android Studio and IntelliJ)
78 | com_crashlytics_export_strings.xml
79 | crashlytics.properties
80 | crashlytics-build.properties
81 | fabric.properties
82 |
83 | # Editor-based Rest Client
84 | .idea/httpRequests
85 |
86 | # Android studio 3.1+ serialized cache file
87 | .idea/caches/build_file_checksums.ser
88 |
89 | ### PyCharm Patch ###
90 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
91 |
92 | # *.iml
93 | # modules.xml
94 | # .idea/misc.xml
95 | # *.ipr
96 |
97 | # Sonarlint plugin
98 | .idea/**/sonarlint/
99 |
100 | # SonarQube Plugin
101 | .idea/**/sonarIssues.xml
102 |
103 | # Markdown Navigator plugin
104 | .idea/**/markdown-navigator.xml
105 | .idea/**/markdown-navigator/
106 |
107 | ### Python ###
108 | # Byte-compiled / optimized / DLL files
109 | __pycache__/
110 | *.py[cod]
111 | *$py.class
112 |
113 | # C extensions
114 | *.so
115 |
116 | # Distribution / packaging
117 | .Python
118 | build/
119 | develop-eggs/
120 | dist/
121 | downloads/
122 | eggs/
123 | .eggs/
124 | lib/
125 | lib64/
126 | parts/
127 | sdist/
128 | var/
129 | wheels/
130 | pip-wheel-metadata/
131 | share/python-wheels/
132 | *.egg-info/
133 | .installed.cfg
134 | *.egg
135 | MANIFEST
136 |
137 | # PyInstaller
138 | # Usually these files are written by a python script from a template
139 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
140 | *.manifest
141 | *.spec
142 |
143 | # Installer logs
144 | pip-log.txt
145 | pip-delete-this-directory.txt
146 |
147 | # Unit test / coverage reports
148 | htmlcov/
149 | .tox/
150 | .nox/
151 | .coverage
152 | .coverage.*
153 | .cache
154 | nosetests.xml
155 | coverage.xml
156 | *.cover
157 | .hypothesis/
158 | .pytest_cache/
159 |
160 | # Translations
161 | *.mo
162 | *.pot
163 |
164 | # Scrapy stuff:
165 | .scrapy
166 |
167 | # Sphinx documentation
168 | docs/_build/
169 |
170 | # PyBuilder
171 | target/
172 |
173 | # pyenv
174 | .python-version
175 |
176 | # pipenv
177 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
178 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
179 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
180 | # install all needed dependencies.
181 | #Pipfile.lock
182 |
183 | # celery beat schedule file
184 | celerybeat-schedule
185 |
186 | # SageMath parsed files
187 | *.sage.py
188 |
189 | # Spyder project settings
190 | .spyderproject
191 | .spyproject
192 |
193 | # Rope project settings
194 | .ropeproject
195 |
196 | # Mr Developer
197 | .mr.developer.cfg
198 | .project
199 | .pydevproject
200 |
201 | # mkdocs documentation
202 | /site
203 |
204 | # mypy
205 | .mypy_cache/
206 | .dmypy.json
207 | dmypy.json
208 |
209 | # Pyre type checker
210 | .pyre/
211 |
212 | # End of https://www.gitignore.io/api/python,pycharm,jupyternotebooks
213 |
--------------------------------------------------------------------------------
/007. machine learning ensemble - random forest.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 출처\n",
8 | "\n",
9 | "- https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python\n",
10 | "- https://www.kaggle.com/lsjsj92/simple-titanic-kernel-82-for-beginner-like-me\n",
11 | "- https://www.kaggle.com/startupsci/titanic-data-science-solutions\n",
12 | "- https://www.kaggle.com/ash316/eda-to-prediction-dietanic\n",
13 | "- https://www.kaggle.com/mjbahmani/a-comprehensive-ml-workflow-with-python"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import pandas as pd\n",
23 | "import numpy as np\n",
24 | "import matplotlib.pyplot as plt\n",
25 | "import seaborn as sns\n",
26 | "\n",
27 | "from sklearn.tree import DecisionTreeClassifier\n",
28 | "from sklearn.ensemble import RandomForestClassifier\n",
29 | "from sklearn.model_selection import train_test_split, GridSearchCV\n",
30 | "from sklearn.metrics import accuracy_score"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/html": [
41 | "
\n",
42 | "\n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " | \n",
59 | " PassengerId | \n",
60 | " Survived | \n",
61 | " Pclass | \n",
62 | " Name | \n",
63 | " Sex | \n",
64 | " Age | \n",
65 | " SibSp | \n",
66 | " Parch | \n",
67 | " Ticket | \n",
68 | " Fare | \n",
69 | " Cabin | \n",
70 | " Embarked | \n",
71 | "
\n",
72 | " \n",
73 | " \n",
74 | " \n",
75 | " 0 | \n",
76 | " 1 | \n",
77 | " 0 | \n",
78 | " 3 | \n",
79 | " Braund, Mr. Owen Harris | \n",
80 | " male | \n",
81 | " 22.0 | \n",
82 | " 1 | \n",
83 | " 0 | \n",
84 | " A/5 21171 | \n",
85 | " 7.2500 | \n",
86 | " NaN | \n",
87 | " S | \n",
88 | "
\n",
89 | " \n",
90 | " 1 | \n",
91 | " 2 | \n",
92 | " 1 | \n",
93 | " 1 | \n",
94 | " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
95 | " female | \n",
96 | " 38.0 | \n",
97 | " 1 | \n",
98 | " 0 | \n",
99 | " PC 17599 | \n",
100 | " 71.2833 | \n",
101 | " C85 | \n",
102 | " C | \n",
103 | "
\n",
104 | " \n",
105 | " 2 | \n",
106 | " 3 | \n",
107 | " 1 | \n",
108 | " 3 | \n",
109 | " Heikkinen, Miss. Laina | \n",
110 | " female | \n",
111 | " 26.0 | \n",
112 | " 0 | \n",
113 | " 0 | \n",
114 | " STON/O2. 3101282 | \n",
115 | " 7.9250 | \n",
116 | " NaN | \n",
117 | " S | \n",
118 | "
\n",
119 | " \n",
120 | " 3 | \n",
121 | " 4 | \n",
122 | " 1 | \n",
123 | " 1 | \n",
124 | " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
125 | " female | \n",
126 | " 35.0 | \n",
127 | " 1 | \n",
128 | " 0 | \n",
129 | " 113803 | \n",
130 | " 53.1000 | \n",
131 | " C123 | \n",
132 | " S | \n",
133 | "
\n",
134 | " \n",
135 | " 4 | \n",
136 | " 5 | \n",
137 | " 0 | \n",
138 | " 3 | \n",
139 | " Allen, Mr. William Henry | \n",
140 | " male | \n",
141 | " 35.0 | \n",
142 | " 0 | \n",
143 | " 0 | \n",
144 | " 373450 | \n",
145 | " 8.0500 | \n",
146 | " NaN | \n",
147 | " S | \n",
148 | "
\n",
149 | " \n",
150 | "
\n",
151 | "
"
152 | ],
153 | "text/plain": [
154 | " PassengerId Survived Pclass \\\n",
155 | "0 1 0 3 \n",
156 | "1 2 1 1 \n",
157 | "2 3 1 3 \n",
158 | "3 4 1 1 \n",
159 | "4 5 0 3 \n",
160 | "\n",
161 | " Name Sex Age SibSp \\\n",
162 | "0 Braund, Mr. Owen Harris male 22.0 1 \n",
163 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
164 | "2 Heikkinen, Miss. Laina female 26.0 0 \n",
165 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
166 | "4 Allen, Mr. William Henry male 35.0 0 \n",
167 | "\n",
168 | " Parch Ticket Fare Cabin Embarked \n",
169 | "0 0 A/5 21171 7.2500 NaN S \n",
170 | "1 0 PC 17599 71.2833 C85 C \n",
171 | "2 0 STON/O2. 3101282 7.9250 NaN S \n",
172 | "3 0 113803 53.1000 C123 S \n",
173 | "4 0 373450 8.0500 NaN S "
174 | ]
175 | },
176 | "execution_count": 2,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "data = pd.read_csv('../datas/titanic/train.csv')\n",
183 | "data.head()"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 3,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "PassengerId 0\n",
195 | "Survived 0\n",
196 | "Pclass 0\n",
197 | "Name 0\n",
198 | "Sex 0\n",
199 | "Age 177\n",
200 | "SibSp 0\n",
201 | "Parch 0\n",
202 | "Ticket 0\n",
203 | "Fare 0\n",
204 | "Cabin 687\n",
205 | "Embarked 2\n",
206 | "dtype: int64"
207 | ]
208 | },
209 | "execution_count": 3,
210 | "metadata": {},
211 | "output_type": "execute_result"
212 | }
213 | ],
214 | "source": [
215 | "data.isna().sum()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 4,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | "age null값 비율 : 19.865\n",
228 | "cabin null값 비율 : 77.104\n"
229 | ]
230 | }
231 | ],
232 | "source": [
233 | "print(\"age null값 비율 : {0:.3f}\".format((data['Age'].isna().sum() / len(data)) * 100))\n",
234 | "print(\"cabin null값 비율 : {0:.3f}\".format((data['Cabin'].isna().sum() / len(data)) * 100 ))"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": []
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 13,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "data['Embarked'].fillna('S', inplace = True)\n",
251 | "data['Fare'].fillna(0, inplace=True)\n",
252 | "data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 14,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\\.')\n",
262 | "data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)\n",
263 | "mapping = {\n",
264 | " \"Mr\":0,\n",
265 | " \"Miss\":1,\n",
266 | " \"Mrs\" : 1,\n",
267 | " \"Master\":2,\n",
268 | " \"Other\":3\n",
269 | "}\n",
270 | "\n",
271 | "data['Initial'] = data['Initial'].map(mapping)\n"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 15,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "mapping_sex = {\n",
281 | " 'male' : 0,\n",
282 | " 'female': 1\n",
283 | "}\n",
284 | "\n",
285 | "mapping_em = {\n",
286 | " 'S' :0,\n",
287 | " 'C' :1,\n",
288 | " 'Q' :2\n",
289 | "}\n",
290 | "\n",
291 | "\n",
292 | "data['Sex'] = data['Sex'].map(mapping_sex)\n",
293 | "data['Embarked'] = data['Embarked'].map(mapping_em)\n",
294 | "\n",
295 | "\n",
296 | "data.drop(['PassengerId', \"Ticket\", \"Cabin\", \"Name\"], axis = 1, inplace = True)\n"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 16,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "data": {
306 | "text/plain": [
307 | "Initial\n",
308 | "0 32.739609\n",
309 | "1 27.834615\n",
310 | "2 4.574167\n",
311 | "3 45.888889\n",
312 | "Name: Age, dtype: float64"
313 | ]
314 | },
315 | "execution_count": 16,
316 | "metadata": {},
317 | "output_type": "execute_result"
318 | }
319 | ],
320 | "source": [
321 | "data.groupby('Initial')['Age'].mean()"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 17,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32\n",
331 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28\n",
332 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5\n",
333 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 18,
339 | "metadata": {},
340 | "outputs": [
341 | {
342 | "data": {
343 | "text/html": [
344 | "\n",
345 | "\n",
358 | "
\n",
359 | " \n",
360 | " \n",
361 | " | \n",
362 | " Survived | \n",
363 | " Pclass | \n",
364 | " Sex | \n",
365 | " Age | \n",
366 | " SibSp | \n",
367 | " Parch | \n",
368 | " Fare | \n",
369 | " Embarked | \n",
370 | " Initial | \n",
371 | "
\n",
372 | " \n",
373 | " \n",
374 | " \n",
375 | " 0 | \n",
376 | " 0 | \n",
377 | " 3 | \n",
378 | " 0 | \n",
379 | " 22.0 | \n",
380 | " 1 | \n",
381 | " 0 | \n",
382 | " 1.981001 | \n",
383 | " 0 | \n",
384 | " 0 | \n",
385 | "
\n",
386 | " \n",
387 | " 1 | \n",
388 | " 1 | \n",
389 | " 1 | \n",
390 | " 1 | \n",
391 | " 38.0 | \n",
392 | " 1 | \n",
393 | " 0 | \n",
394 | " 4.266662 | \n",
395 | " 1 | \n",
396 | " 1 | \n",
397 | "
\n",
398 | " \n",
399 | " 2 | \n",
400 | " 1 | \n",
401 | " 3 | \n",
402 | " 1 | \n",
403 | " 26.0 | \n",
404 | " 0 | \n",
405 | " 0 | \n",
406 | " 2.070022 | \n",
407 | " 0 | \n",
408 | " 1 | \n",
409 | "
\n",
410 | " \n",
411 | " 3 | \n",
412 | " 1 | \n",
413 | " 1 | \n",
414 | " 1 | \n",
415 | " 35.0 | \n",
416 | " 1 | \n",
417 | " 0 | \n",
418 | " 3.972177 | \n",
419 | " 0 | \n",
420 | " 1 | \n",
421 | "
\n",
422 | " \n",
423 | " 4 | \n",
424 | " 0 | \n",
425 | " 3 | \n",
426 | " 0 | \n",
427 | " 35.0 | \n",
428 | " 0 | \n",
429 | " 0 | \n",
430 | " 2.085672 | \n",
431 | " 0 | \n",
432 | " 0 | \n",
433 | "
\n",
434 | " \n",
435 | "
\n",
436 | "
"
437 | ],
438 | "text/plain": [
439 | " Survived Pclass Sex Age SibSp Parch Fare Embarked Initial\n",
440 | "0 0 3 0 22.0 1 0 1.981001 0 0\n",
441 | "1 1 1 1 38.0 1 0 4.266662 1 1\n",
442 | "2 1 3 1 26.0 0 0 2.070022 0 1\n",
443 | "3 1 1 1 35.0 1 0 3.972177 0 1\n",
444 | "4 0 3 0 35.0 0 0 2.085672 0 0"
445 | ]
446 | },
447 | "execution_count": 18,
448 | "metadata": {},
449 | "output_type": "execute_result"
450 | }
451 | ],
452 | "source": [
453 | "data.head()"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 19,
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "y = data['Survived']\n",
463 | "X = data.drop('Survived', axis = 1)"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 20,
469 | "metadata": {},
470 | "outputs": [],
471 | "source": [
472 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 21,
478 | "metadata": {},
479 | "outputs": [
480 | {
481 | "name": "stderr",
482 | "output_type": "stream",
483 | "text": [
484 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
485 | " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
486 | ]
487 | },
488 | {
489 | "data": {
490 | "text/plain": [
491 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
492 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
493 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
494 | " min_samples_leaf=1, min_samples_split=2,\n",
495 | " min_weight_fraction_leaf=0.0, n_estimators=10,\n",
496 | " n_jobs=None, oob_score=False, random_state=0, verbose=0,\n",
497 | " warm_start=False)"
498 | ]
499 | },
500 | "execution_count": 21,
501 | "metadata": {},
502 | "output_type": "execute_result"
503 | }
504 | ],
505 | "source": [
506 | "rf = RandomForestClassifier(random_state=0)\n",
507 | "rf.fit(X_train, y_train)"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 22,
513 | "metadata": {},
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "정확도 :0.810\n"
520 | ]
521 | }
522 | ],
523 | "source": [
524 | "pred = rf.predict(X_test)\n",
525 | "print(\"정확도 :{0:.3f}\".format(accuracy_score(y_test, pred)))"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": 23,
531 | "metadata": {},
532 | "outputs": [],
533 | "source": [
534 | "rf_param_grid = {\n",
535 | " 'n_estimators' : [100, 200, 300],\n",
536 | " 'max_depth' : [4, 6, 8, 10, 12],\n",
537 | " 'min_samples_leaf' : [3, 5, 6, 7, 10],\n",
538 | " 'min_samples_split' : [2, 3, 5, 7, 10]\n",
539 | "}"
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": 24,
545 | "metadata": {},
546 | "outputs": [
547 | {
548 | "name": "stderr",
549 | "output_type": "stream",
550 | "text": [
551 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n",
552 | " warnings.warn(CV_WARNING, FutureWarning)\n",
553 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n"
554 | ]
555 | },
556 | {
557 | "name": "stdout",
558 | "output_type": "stream",
559 | "text": [
560 | "Fitting 3 folds for each of 128 candidates, totalling 384 fits\n"
561 | ]
562 | },
563 | {
564 | "name": "stderr",
565 | "output_type": "stream",
566 | "text": [
567 | "[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 4.4s\n",
568 | "[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 9.7s\n",
569 | "[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed: 16.7s finished\n"
570 | ]
571 | },
572 | {
573 | "data": {
574 | "text/plain": [
575 | "GridSearchCV(cv='warn', error_score='raise-deprecating',\n",
576 | " estimator=RandomForestClassifier(bootstrap=True, class_weight=None,\n",
577 | " criterion='gini', max_depth=None,\n",
578 | " max_features='auto',\n",
579 | " max_leaf_nodes=None,\n",
580 | " min_impurity_decrease=0.0,\n",
581 | " min_impurity_split=None,\n",
582 | " min_samples_leaf=1,\n",
583 | " min_samples_split=2,\n",
584 | " min_weight_fraction_leaf=0.0,\n",
585 | " n_estimators=10, n_jobs=None,\n",
586 | " oob_score=False, random_state=0,\n",
587 | " verbose=0, warm_start=False),\n",
588 | " iid='warn', n_jobs=-1,\n",
589 | " param_grid={'max_depth': [6, 8, 10, 12],\n",
590 | " 'min_samples_leaf': [3, 5, 7, 10],\n",
591 | " 'min_samples_split': [2, 3, 5, 10],\n",
592 | " 'n_estimators': [100, 200]},\n",
593 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
594 | " scoring='accuracy', verbose=1)"
595 | ]
596 | },
597 | "execution_count": 24,
598 | "metadata": {},
599 | "output_type": "execute_result"
600 | }
601 | ],
602 | "source": [
603 | "rf_grid = GridSearchCV(rf, param_grid = rf_param_grid, scoring=\"accuracy\", n_jobs= -1, verbose = 1)\n",
604 | "rf_grid.fit(X_train, y_train)"
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": 25,
610 | "metadata": {},
611 | "outputs": [
612 | {
613 | "name": "stdout",
614 | "output_type": "stream",
615 | "text": [
616 | "최고 평균 정확도 : 0.8174\n",
617 | "최고의 파라미터 : {'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100}\n"
618 | ]
619 | }
620 | ],
621 | "source": [
622 | "print(\"최고 평균 정확도 : {0:.4f}\".format(rf_grid.best_score_))\n",
623 | "print(\"최고의 파라미터 : \", rf_grid.best_params_)"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": 26,
629 | "metadata": {},
630 | "outputs": [
631 | {
632 | "data": {
633 | "text/plain": [
634 | "Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',\n",
635 | " 'param_max_depth', 'param_min_samples_leaf', 'param_min_samples_split',\n",
636 | " 'param_n_estimators', 'params', 'split0_test_score',\n",
637 | " 'split1_test_score', 'split2_test_score', 'mean_test_score',\n",
638 | " 'std_test_score', 'rank_test_score'],\n",
639 | " dtype='object')"
640 | ]
641 | },
642 | "execution_count": 26,
643 | "metadata": {},
644 | "output_type": "execute_result"
645 | }
646 | ],
647 | "source": [
648 | "result = pd.DataFrame(rf_grid.cv_results_)\n",
649 | "result.columns"
650 | ]
651 | },
652 | {
653 | "cell_type": "code",
654 | "execution_count": 27,
655 | "metadata": {},
656 | "outputs": [],
657 | "source": [
658 | "result.sort_values(by=['rank_test_score'], inplace=True)"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 28,
664 | "metadata": {},
665 | "outputs": [
666 | {
667 | "data": {
668 | "text/html": [
669 | "\n",
670 | "\n",
683 | "
\n",
684 | " \n",
685 | " \n",
686 | " | \n",
687 | " params | \n",
688 | " mean_test_score | \n",
689 | " rank_test_score | \n",
690 | "
\n",
691 | " \n",
692 | " \n",
693 | " \n",
694 | " 32 | \n",
695 | " {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... | \n",
696 | " 0.817416 | \n",
697 | " 1 | \n",
698 | "
\n",
699 | " \n",
700 | " 34 | \n",
701 | " {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... | \n",
702 | " 0.817416 | \n",
703 | " 1 | \n",
704 | "
\n",
705 | " \n",
706 | " 36 | \n",
707 | " {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... | \n",
708 | " 0.817416 | \n",
709 | " 1 | \n",
710 | "
\n",
711 | " \n",
712 | " 56 | \n",
713 | " {'max_depth': 8, 'min_samples_leaf': 10, 'min_... | \n",
714 | " 0.814607 | \n",
715 | " 4 | \n",
716 | "
\n",
717 | " \n",
718 | " 105 | \n",
719 | " {'max_depth': 12, 'min_samples_leaf': 5, 'min_... | \n",
720 | " 0.814607 | \n",
721 | " 4 | \n",
722 | "
\n",
723 | " \n",
724 | " 107 | \n",
725 | " {'max_depth': 12, 'min_samples_leaf': 5, 'min_... | \n",
726 | " 0.814607 | \n",
727 | " 4 | \n",
728 | "
\n",
729 | " \n",
730 | " 68 | \n",
731 | " {'max_depth': 10, 'min_samples_leaf': 3, 'min_... | \n",
732 | " 0.814607 | \n",
733 | " 4 | \n",
734 | "
\n",
735 | " \n",
736 | " 66 | \n",
737 | " {'max_depth': 10, 'min_samples_leaf': 3, 'min_... | \n",
738 | " 0.814607 | \n",
739 | " 4 | \n",
740 | "
\n",
741 | " \n",
742 | " 111 | \n",
743 | " {'max_depth': 12, 'min_samples_leaf': 5, 'min_... | \n",
744 | " 0.814607 | \n",
745 | " 4 | \n",
746 | "
\n",
747 | " \n",
748 | " 109 | \n",
749 | " {'max_depth': 12, 'min_samples_leaf': 5, 'min_... | \n",
750 | " 0.814607 | \n",
751 | " 4 | \n",
752 | "
\n",
753 | " \n",
754 | "
\n",
755 | "
"
756 | ],
757 | "text/plain": [
758 | " params mean_test_score \\\n",
759 | "32 {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... 0.817416 \n",
760 | "34 {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... 0.817416 \n",
761 | "36 {'max_depth': 8, 'min_samples_leaf': 3, 'min_s... 0.817416 \n",
762 | "56 {'max_depth': 8, 'min_samples_leaf': 10, 'min_... 0.814607 \n",
763 | "105 {'max_depth': 12, 'min_samples_leaf': 5, 'min_... 0.814607 \n",
764 | "107 {'max_depth': 12, 'min_samples_leaf': 5, 'min_... 0.814607 \n",
765 | "68 {'max_depth': 10, 'min_samples_leaf': 3, 'min_... 0.814607 \n",
766 | "66 {'max_depth': 10, 'min_samples_leaf': 3, 'min_... 0.814607 \n",
767 | "111 {'max_depth': 12, 'min_samples_leaf': 5, 'min_... 0.814607 \n",
768 | "109 {'max_depth': 12, 'min_samples_leaf': 5, 'min_... 0.814607 \n",
769 | "\n",
770 | " rank_test_score \n",
771 | "32 1 \n",
772 | "34 1 \n",
773 | "36 1 \n",
774 | "56 4 \n",
775 | "105 4 \n",
776 | "107 4 \n",
777 | "68 4 \n",
778 | "66 4 \n",
779 | "111 4 \n",
780 | "109 4 "
781 | ]
782 | },
783 | "execution_count": 28,
784 | "metadata": {},
785 | "output_type": "execute_result"
786 | }
787 | ],
788 | "source": [
789 | "result[['params', 'mean_test_score', 'rank_test_score']].head(10)"
790 | ]
791 | },
792 | {
793 | "cell_type": "code",
794 | "execution_count": 29,
795 | "metadata": {},
796 | "outputs": [
797 | {
798 | "name": "stdout",
799 | "output_type": "stream",
800 | "text": [
801 | "정확도 : 0.8603\n"
802 | ]
803 | }
804 | ],
805 | "source": [
806 | "model = rf_grid.best_estimator_\n",
807 | "pred = model.predict(X_test)\n",
808 | "acc = accuracy_score(y_test, pred)\n",
809 | "print(\"정확도 : {0:.4f}\".format(acc))\n"
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": 30,
815 | "metadata": {},
816 | "outputs": [],
817 | "source": [
818 | "feature_importances = model.feature_importances_"
819 | ]
820 | },
821 | {
822 | "cell_type": "code",
823 | "execution_count": 31,
824 | "metadata": {},
825 | "outputs": [
826 | {
827 | "data": {
828 | "image/png": "\n",
829 | "text/plain": [
830 | ""
831 | ]
832 | },
833 | "metadata": {
834 | "needs_background": "light"
835 | },
836 | "output_type": "display_data"
837 | }
838 | ],
839 | "source": [
840 | "ft_importances = pd.Series(feature_importances, index = X_train.columns)\n",
841 | "ft_importances = ft_importances.sort_values(ascending=False)\n",
842 | "\n",
843 | "plt.figure(figsize=(12, 10))\n",
844 | "plt.title(\"feature importances\")\n",
845 | "sns.barplot(x=ft_importances, y = ft_importances.index)\n",
846 | "plt.show()\n"
847 | ]
848 | },
849 | {
850 | "cell_type": "code",
851 | "execution_count": null,
852 | "metadata": {},
853 | "outputs": [],
854 | "source": []
855 | }
856 | ],
857 | "metadata": {
858 | "kernelspec": {
859 | "display_name": "Python 3",
860 | "language": "python",
861 | "name": "python3"
862 | },
863 | "language_info": {
864 | "codemirror_mode": {
865 | "name": "ipython",
866 | "version": 3
867 | },
868 | "file_extension": ".py",
869 | "mimetype": "text/x-python",
870 | "name": "python",
871 | "nbconvert_exporter": "python",
872 | "pygments_lexer": "ipython3",
873 | "version": "3.6.9"
874 | }
875 | },
876 | "nbformat": 4,
877 | "nbformat_minor": 2
878 | }
879 |
--------------------------------------------------------------------------------
/008. machine learning - ensemble boosting basic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 출처\n",
8 | "\n",
9 | "- https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python\n",
10 | "- https://www.kaggle.com/lsjsj92/simple-titanic-kernel-82-for-beginner-like-me\n",
11 | "- https://www.kaggle.com/startupsci/titanic-data-science-solutions\n",
12 | "- https://www.kaggle.com/ash316/eda-to-prediction-dietanic\n",
13 | "- https://www.kaggle.com/mjbahmani/a-comprehensive-ml-workflow-with-python"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 3,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import pandas as pd\n",
23 | "import numpy as np\n",
24 | "import matplotlib.pyplot as plt\n",
25 | "import seaborn as sns\n",
26 | "\n",
27 | "from sklearn.tree import DecisionTreeClassifier\n",
28 | "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
29 | "from sklearn.model_selection import train_test_split, GridSearchCV\n",
30 | "from sklearn.metrics import accuracy_score"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 4,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/html": [
41 | "\n",
42 | "\n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " | \n",
59 | " PassengerId | \n",
60 | " Survived | \n",
61 | " Pclass | \n",
62 | " Name | \n",
63 | " Sex | \n",
64 | " Age | \n",
65 | " SibSp | \n",
66 | " Parch | \n",
67 | " Ticket | \n",
68 | " Fare | \n",
69 | " Cabin | \n",
70 | " Embarked | \n",
71 | "
\n",
72 | " \n",
73 | " \n",
74 | " \n",
75 | " 0 | \n",
76 | " 1 | \n",
77 | " 0 | \n",
78 | " 3 | \n",
79 | " Braund, Mr. Owen Harris | \n",
80 | " male | \n",
81 | " 22.0 | \n",
82 | " 1 | \n",
83 | " 0 | \n",
84 | " A/5 21171 | \n",
85 | " 7.2500 | \n",
86 | " NaN | \n",
87 | " S | \n",
88 | "
\n",
89 | " \n",
90 | " 1 | \n",
91 | " 2 | \n",
92 | " 1 | \n",
93 | " 1 | \n",
94 | " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
95 | " female | \n",
96 | " 38.0 | \n",
97 | " 1 | \n",
98 | " 0 | \n",
99 | " PC 17599 | \n",
100 | " 71.2833 | \n",
101 | " C85 | \n",
102 | " C | \n",
103 | "
\n",
104 | " \n",
105 | " 2 | \n",
106 | " 3 | \n",
107 | " 1 | \n",
108 | " 3 | \n",
109 | " Heikkinen, Miss. Laina | \n",
110 | " female | \n",
111 | " 26.0 | \n",
112 | " 0 | \n",
113 | " 0 | \n",
114 | " STON/O2. 3101282 | \n",
115 | " 7.9250 | \n",
116 | " NaN | \n",
117 | " S | \n",
118 | "
\n",
119 | " \n",
120 | " 3 | \n",
121 | " 4 | \n",
122 | " 1 | \n",
123 | " 1 | \n",
124 | " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
125 | " female | \n",
126 | " 35.0 | \n",
127 | " 1 | \n",
128 | " 0 | \n",
129 | " 113803 | \n",
130 | " 53.1000 | \n",
131 | " C123 | \n",
132 | " S | \n",
133 | "
\n",
134 | " \n",
135 | " 4 | \n",
136 | " 5 | \n",
137 | " 0 | \n",
138 | " 3 | \n",
139 | " Allen, Mr. William Henry | \n",
140 | " male | \n",
141 | " 35.0 | \n",
142 | " 0 | \n",
143 | " 0 | \n",
144 | " 373450 | \n",
145 | " 8.0500 | \n",
146 | " NaN | \n",
147 | " S | \n",
148 | "
\n",
149 | " \n",
150 | "
\n",
151 | "
"
152 | ],
153 | "text/plain": [
154 | " PassengerId Survived Pclass \\\n",
155 | "0 1 0 3 \n",
156 | "1 2 1 1 \n",
157 | "2 3 1 3 \n",
158 | "3 4 1 1 \n",
159 | "4 5 0 3 \n",
160 | "\n",
161 | " Name Sex Age SibSp \\\n",
162 | "0 Braund, Mr. Owen Harris male 22.0 1 \n",
163 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
164 | "2 Heikkinen, Miss. Laina female 26.0 0 \n",
165 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
166 | "4 Allen, Mr. William Henry male 35.0 0 \n",
167 | "\n",
168 | " Parch Ticket Fare Cabin Embarked \n",
169 | "0 0 A/5 21171 7.2500 NaN S \n",
170 | "1 0 PC 17599 71.2833 C85 C \n",
171 | "2 0 STON/O2. 3101282 7.9250 NaN S \n",
172 | "3 0 113803 53.1000 C123 S \n",
173 | "4 0 373450 8.0500 NaN S "
174 | ]
175 | },
176 | "execution_count": 4,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "data = pd.read_csv('../datas/titanic/train.csv')\n",
183 | "data.head()"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 5,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "data['Embarked'].fillna('S', inplace = True)\n",
193 | "data['Fare'].fillna(0, inplace=True)\n",
194 | "data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 6,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "data['Initial'] = data['Name'].str.extract(r'([A-Za-z]+)\\.', expand=False)  # title = run of letters right before a '.'; raw string avoids the invalid '\\.' escape, expand=False yields a Series\n",
204 | "data['Initial'] = data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'])\n",
205 | "mapping = {\n",
206 | " \"Mr\":0,\n",
207 | " \"Miss\":1,\n",
208 | " \"Mrs\" : 1,  # shares code 1 with \"Miss\"\n",
209 | " \"Master\":2,\n",
210 | " \"Other\":3\n",
211 | "}\n",
212 | "# Titles that are not keys of `mapping` become NaN after .map()\n",
213 | "data['Initial'] = data['Initial'].map(mapping)\n"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 7,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "mapping_sex = {\n",
223 | " 'male' : 0,\n",
224 | " 'female': 1\n",
225 | "}\n",
226 | "\n",
227 | "mapping_em = {\n",
228 | " 'S' :0,\n",
229 | " 'C' :1,\n",
230 | " 'Q' :2\n",
231 | "}\n",
232 | "\n",
233 | "\n",
234 | "data['Sex'] = data['Sex'].map(mapping_sex)\n",
235 | "data['Embarked'] = data['Embarked'].map(mapping_em)\n",
236 | "\n",
237 | "\n",
238 | "data.drop(['PassengerId', \"Ticket\", \"Cabin\", \"Name\"], axis = 1, inplace = True)\n"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 8,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "data": {
248 | "text/plain": [
249 | "Initial\n",
250 | "0 32.739609\n",
251 | "1 27.834615\n",
252 | "2 4.574167\n",
253 | "3 45.888889\n",
254 | "Name: Age, dtype: float64"
255 | ]
256 | },
257 | "execution_count": 8,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "data.groupby('Initial')['Age'].mean()"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 9,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32\n",
273 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28\n",
274 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5\n",
275 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 10,
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "data": {
285 | "text/html": [
286 | "\n",
287 | "\n",
300 | "
\n",
301 | " \n",
302 | " \n",
303 | " | \n",
304 | " Survived | \n",
305 | " Pclass | \n",
306 | " Sex | \n",
307 | " Age | \n",
308 | " SibSp | \n",
309 | " Parch | \n",
310 | " Fare | \n",
311 | " Embarked | \n",
312 | " Initial | \n",
313 | "
\n",
314 | " \n",
315 | " \n",
316 | " \n",
317 | " 0 | \n",
318 | " 0 | \n",
319 | " 3 | \n",
320 | " 0 | \n",
321 | " 22.0 | \n",
322 | " 1 | \n",
323 | " 0 | \n",
324 | " 1.981001 | \n",
325 | " 0 | \n",
326 | " 0 | \n",
327 | "
\n",
328 | " \n",
329 | " 1 | \n",
330 | " 1 | \n",
331 | " 1 | \n",
332 | " 1 | \n",
333 | " 38.0 | \n",
334 | " 1 | \n",
335 | " 0 | \n",
336 | " 4.266662 | \n",
337 | " 1 | \n",
338 | " 1 | \n",
339 | "
\n",
340 | " \n",
341 | " 2 | \n",
342 | " 1 | \n",
343 | " 3 | \n",
344 | " 1 | \n",
345 | " 26.0 | \n",
346 | " 0 | \n",
347 | " 0 | \n",
348 | " 2.070022 | \n",
349 | " 0 | \n",
350 | " 1 | \n",
351 | "
\n",
352 | " \n",
353 | " 3 | \n",
354 | " 1 | \n",
355 | " 1 | \n",
356 | " 1 | \n",
357 | " 35.0 | \n",
358 | " 1 | \n",
359 | " 0 | \n",
360 | " 3.972177 | \n",
361 | " 0 | \n",
362 | " 1 | \n",
363 | "
\n",
364 | " \n",
365 | " 4 | \n",
366 | " 0 | \n",
367 | " 3 | \n",
368 | " 0 | \n",
369 | " 35.0 | \n",
370 | " 0 | \n",
371 | " 0 | \n",
372 | " 2.085672 | \n",
373 | " 0 | \n",
374 | " 0 | \n",
375 | "
\n",
376 | " \n",
377 | "
\n",
378 | "
"
379 | ],
380 | "text/plain": [
381 | " Survived Pclass Sex Age SibSp Parch Fare Embarked Initial\n",
382 | "0 0 3 0 22.0 1 0 1.981001 0 0\n",
383 | "1 1 1 1 38.0 1 0 4.266662 1 1\n",
384 | "2 1 3 1 26.0 0 0 2.070022 0 1\n",
385 | "3 1 1 1 35.0 1 0 3.972177 0 1\n",
386 | "4 0 3 0 35.0 0 0 2.085672 0 0"
387 | ]
388 | },
389 | "execution_count": 10,
390 | "metadata": {},
391 | "output_type": "execute_result"
392 | }
393 | ],
394 | "source": [
395 | "data.head()"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 11,
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "y = data['Survived']\n",
405 | "X = data.drop('Survived', axis = 1)"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": 12,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 13,
420 | "metadata": {},
421 | "outputs": [
422 | {
423 | "name": "stderr",
424 | "output_type": "stream",
425 | "text": [
426 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
427 | " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
428 | ]
429 | },
430 | {
431 | "data": {
432 | "text/plain": [
433 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
434 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
435 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
436 | " min_samples_leaf=1, min_samples_split=2,\n",
437 | " min_weight_fraction_leaf=0.0, n_estimators=10,\n",
438 | " n_jobs=None, oob_score=False, random_state=0, verbose=0,\n",
439 | " warm_start=False)"
440 | ]
441 | },
442 | "execution_count": 13,
443 | "metadata": {},
444 | "output_type": "execute_result"
445 | }
446 | ],
447 | "source": [
448 | "rf = RandomForestClassifier(random_state=0)\n",
449 | "rf.fit(X_train, y_train)"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 14,
455 | "metadata": {},
456 | "outputs": [
457 | {
458 | "name": "stdout",
459 | "output_type": "stream",
460 | "text": [
461 | "정확도 :0.810\n"
462 | ]
463 | }
464 | ],
465 | "source": [
466 | "pred = rf.predict(X_test)\n",
467 | "print(\"정확도 :{0:.3f}\".format(accuracy_score(y_test, pred)))"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": 11,
473 | "metadata": {},
474 | "outputs": [
475 | {
476 | "data": {
477 | "text/plain": [
478 | "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
479 | " learning_rate=0.1, loss='deviance', max_depth=3,\n",
480 | " max_features=None, max_leaf_nodes=None,\n",
481 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
482 | " min_samples_leaf=1, min_samples_split=2,\n",
483 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n",
484 | " n_iter_no_change=None, presort='auto',\n",
485 | " random_state=0, subsample=1.0, tol=0.0001,\n",
486 | " validation_fraction=0.1, verbose=0,\n",
487 | " warm_start=False)"
488 | ]
489 | },
490 | "execution_count": 11,
491 | "metadata": {},
492 | "output_type": "execute_result"
493 | }
494 | ],
495 | "source": [
496 | "gb = GradientBoostingClassifier(random_state=0)\n",
497 | "gb.fit(X_train, y_train)"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 12,
503 | "metadata": {},
504 | "outputs": [
505 | {
506 | "name": "stdout",
507 | "output_type": "stream",
508 | "text": [
509 | "정확도 :0.832\n"
510 | ]
511 | }
512 | ],
513 | "source": [
514 | "pred = gb.predict(X_test)\n",
515 | "print(\"정확도 :{0:.3f}\".format(accuracy_score(y_test, pred)))"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 13,
521 | "metadata": {},
522 | "outputs": [],
523 | "source": [
524 | "gb_param_grid = {\n",
525 | " 'n_estimators' : [100, 200],\n",
526 | " 'max_depth' : [6, 8, 10, 12],\n",
527 | " 'min_samples_leaf' : [3, 5, 7, 10],\n",
528 | " 'min_samples_split' : [2, 3, 5, 10],\n",
529 | " 'learning_rate' : [0.05, 0.1, 0.2]\n",
530 | "}"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 14,
536 | "metadata": {},
537 | "outputs": [
538 | {
539 | "name": "stderr",
540 | "output_type": "stream",
541 | "text": [
542 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n",
543 | " warnings.warn(CV_WARNING, FutureWarning)\n",
544 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n"
545 | ]
546 | },
547 | {
548 | "name": "stdout",
549 | "output_type": "stream",
550 | "text": [
551 | "Fitting 3 folds for each of 384 candidates, totalling 1152 fits\n"
552 | ]
553 | },
554 | {
555 | "name": "stderr",
556 | "output_type": "stream",
557 | "text": [
558 | "[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 4.6s\n",
559 | "[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 9.4s\n",
560 | "[Parallel(n_jobs=-1)]: Done 426 tasks | elapsed: 20.0s\n",
561 | "[Parallel(n_jobs=-1)]: Done 776 tasks | elapsed: 34.5s\n",
562 | "[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed: 49.2s finished\n"
563 | ]
564 | },
565 | {
566 | "data": {
567 | "text/plain": [
568 | "GridSearchCV(cv='warn', error_score='raise-deprecating',\n",
569 | " estimator=GradientBoostingClassifier(criterion='friedman_mse',\n",
570 | " init=None, learning_rate=0.1,\n",
571 | " loss='deviance', max_depth=3,\n",
572 | " max_features=None,\n",
573 | " max_leaf_nodes=None,\n",
574 | " min_impurity_decrease=0.0,\n",
575 | " min_impurity_split=None,\n",
576 | " min_samples_leaf=1,\n",
577 | " min_samples_split=2,\n",
578 | " min_weight_fraction_leaf=0.0,\n",
579 | " n_estimators=100,\n",
580 | " n_it...\n",
581 | " random_state=0, subsample=1.0,\n",
582 | " tol=0.0001,\n",
583 | " validation_fraction=0.1,\n",
584 | " verbose=0, warm_start=False),\n",
585 | " iid='warn', n_jobs=-1,\n",
586 | " param_grid={'learning_rate': [0.05, 0.1, 0.2],\n",
587 | " 'max_depth': [6, 8, 10, 12],\n",
588 | " 'min_samples_leaf': [3, 5, 7, 10],\n",
589 | " 'min_samples_split': [2, 3, 5, 10],\n",
590 | " 'n_estimators': [100, 200]},\n",
591 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
592 | " scoring='accuracy', verbose=1)"
593 | ]
594 | },
595 | "execution_count": 14,
596 | "metadata": {},
597 | "output_type": "execute_result"
598 | }
599 | ],
600 | "source": [
601 | "gb_grid = GridSearchCV(gb, param_grid = gb_param_grid, scoring=\"accuracy\", n_jobs= -1, verbose = 1)\n",
602 | "gb_grid.fit(X_train, y_train)"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": 15,
608 | "metadata": {},
609 | "outputs": [
610 | {
611 | "name": "stdout",
612 | "output_type": "stream",
613 | "text": [
614 | "최고 평균 정확도 : 0.8202\n",
615 | "최고의 파라미터 : {'learning_rate': 0.05, 'max_depth': 6, 'min_samples_leaf': 7, 'min_samples_split': 2, 'n_estimators': 100}\n"
616 | ]
617 | }
618 | ],
619 | "source": [
620 | "print(\"최고 평균 정확도 : {0:.4f}\".format(gb_grid.best_score_))\n",
621 | "print(\"최고의 파라미터 : \", gb_grid.best_params_)"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": 16,
627 | "metadata": {},
628 | "outputs": [
629 | {
630 | "data": {
631 | "text/plain": [
632 | "Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',\n",
633 | " 'param_learning_rate', 'param_max_depth', 'param_min_samples_leaf',\n",
634 | " 'param_min_samples_split', 'param_n_estimators', 'params',\n",
635 | " 'split0_test_score', 'split1_test_score', 'split2_test_score',\n",
636 | " 'mean_test_score', 'std_test_score', 'rank_test_score'],\n",
637 | " dtype='object')"
638 | ]
639 | },
640 | "execution_count": 16,
641 | "metadata": {},
642 | "output_type": "execute_result"
643 | }
644 | ],
645 | "source": [
646 | "result = pd.DataFrame(gb_grid.cv_results_)\n",
647 | "result.columns"
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": 17,
653 | "metadata": {},
654 | "outputs": [],
655 | "source": [
656 | "result.sort_values(by=['rank_test_score'], inplace=True)"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 18,
662 | "metadata": {},
663 | "outputs": [
664 | {
665 | "data": {
666 | "text/html": [
667 | "\n",
668 | "\n",
681 | "
\n",
682 | " \n",
683 | " \n",
684 | " | \n",
685 | " params | \n",
686 | " mean_test_score | \n",
687 | " rank_test_score | \n",
688 | "
\n",
689 | " \n",
690 | " \n",
691 | " \n",
692 | " 22 | \n",
693 | " {'learning_rate': 0.05, 'max_depth': 6, 'min_s... | \n",
694 | " 0.820225 | \n",
695 | " 1 | \n",
696 | "
\n",
697 | " \n",
698 | " 16 | \n",
699 | " {'learning_rate': 0.05, 'max_depth': 6, 'min_s... | \n",
700 | " 0.820225 | \n",
701 | " 1 | \n",
702 | "
\n",
703 | " \n",
704 | " 18 | \n",
705 | " {'learning_rate': 0.05, 'max_depth': 6, 'min_s... | \n",
706 | " 0.820225 | \n",
707 | " 1 | \n",
708 | "
\n",
709 | " \n",
710 | " 20 | \n",
711 | " {'learning_rate': 0.05, 'max_depth': 6, 'min_s... | \n",
712 | " 0.820225 | \n",
713 | " 1 | \n",
714 | "
\n",
715 | " \n",
716 | " 58 | \n",
717 | " {'learning_rate': 0.05, 'max_depth': 8, 'min_s... | \n",
718 | " 0.818820 | \n",
719 | " 5 | \n",
720 | "
\n",
721 | " \n",
722 | " 60 | \n",
723 | " {'learning_rate': 0.05, 'max_depth': 8, 'min_s... | \n",
724 | " 0.818820 | \n",
725 | " 5 | \n",
726 | "
\n",
727 | " \n",
728 | " 62 | \n",
729 | " {'learning_rate': 0.05, 'max_depth': 8, 'min_s... | \n",
730 | " 0.818820 | \n",
731 | " 5 | \n",
732 | "
\n",
733 | " \n",
734 | " 135 | \n",
735 | " {'learning_rate': 0.1, 'max_depth': 6, 'min_sa... | \n",
736 | " 0.818820 | \n",
737 | " 5 | \n",
738 | "
\n",
739 | " \n",
740 | " 56 | \n",
741 | " {'learning_rate': 0.05, 'max_depth': 8, 'min_s... | \n",
742 | " 0.818820 | \n",
743 | " 5 | \n",
744 | "
\n",
745 | " \n",
746 | " 120 | \n",
747 | " {'learning_rate': 0.05, 'max_depth': 12, 'min_... | \n",
748 | " 0.817416 | \n",
749 | " 10 | \n",
750 | "
\n",
751 | " \n",
752 | "
\n",
753 | "
"
754 | ],
755 | "text/plain": [
756 | " params mean_test_score \\\n",
757 | "22 {'learning_rate': 0.05, 'max_depth': 6, 'min_s... 0.820225 \n",
758 | "16 {'learning_rate': 0.05, 'max_depth': 6, 'min_s... 0.820225 \n",
759 | "18 {'learning_rate': 0.05, 'max_depth': 6, 'min_s... 0.820225 \n",
760 | "20 {'learning_rate': 0.05, 'max_depth': 6, 'min_s... 0.820225 \n",
761 | "58 {'learning_rate': 0.05, 'max_depth': 8, 'min_s... 0.818820 \n",
762 | "60 {'learning_rate': 0.05, 'max_depth': 8, 'min_s... 0.818820 \n",
763 | "62 {'learning_rate': 0.05, 'max_depth': 8, 'min_s... 0.818820 \n",
764 | "135 {'learning_rate': 0.1, 'max_depth': 6, 'min_sa... 0.818820 \n",
765 | "56 {'learning_rate': 0.05, 'max_depth': 8, 'min_s... 0.818820 \n",
766 | "120 {'learning_rate': 0.05, 'max_depth': 12, 'min_... 0.817416 \n",
767 | "\n",
768 | " rank_test_score \n",
769 | "22 1 \n",
770 | "16 1 \n",
771 | "18 1 \n",
772 | "20 1 \n",
773 | "58 5 \n",
774 | "60 5 \n",
775 | "62 5 \n",
776 | "135 5 \n",
777 | "56 5 \n",
778 | "120 10 "
779 | ]
780 | },
781 | "execution_count": 18,
782 | "metadata": {},
783 | "output_type": "execute_result"
784 | }
785 | ],
786 | "source": [
787 | "result[['params', 'mean_test_score', 'rank_test_score']].head(10)"
788 | ]
789 | },
790 | {
791 | "cell_type": "code",
792 | "execution_count": 20,
793 | "metadata": {},
794 | "outputs": [
795 | {
796 | "name": "stdout",
797 | "output_type": "stream",
798 | "text": [
799 | "정확도 : 0.8492\n"
800 | ]
801 | }
802 | ],
803 | "source": [
804 | "model = gb_grid.best_estimator_\n",
805 | "pred = model.predict(X_test)\n",
806 | "acc = accuracy_score(y_test, pred)\n",
807 | "print(\"정확도 : {0:.4f}\".format(acc))\n"
808 | ]
809 | },
810 | {
811 | "cell_type": "code",
812 | "execution_count": 21,
813 | "metadata": {},
814 | "outputs": [],
815 | "source": [
816 | "feature_importances = model.feature_importances_"
817 | ]
818 | },
819 | {
820 | "cell_type": "code",
821 | "execution_count": 22,
822 | "metadata": {},
823 | "outputs": [
824 | {
825 | "data": {
826 | "image/png": "\n",
827 | "text/plain": [
828 | ""
829 | ]
830 | },
831 | "metadata": {
832 | "needs_background": "light"
833 | },
834 | "output_type": "display_data"
835 | }
836 | ],
837 | "source": [
838 | "ft_importances = pd.Series(feature_importances, index = X_train.columns)  # label each importance with its feature name\n",
839 | "ft_importances = ft_importances.sort_values(ascending=False)  # largest importance first\n",
840 | "\n",
841 | "plt.figure(figsize=(12, 10))\n",
842 | "plt.title(\"feature importances\")\n",
843 | "sns.barplot(x=ft_importances, y = ft_importances.index)  # labels must come from the sorted Series; X_train.columns is still in the original order\n",
844 | "plt.show()\n"
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": null,
850 | "metadata": {},
851 | "outputs": [],
852 | "source": []
853 | }
854 | ],
855 | "metadata": {
856 | "kernelspec": {
857 | "display_name": "Python 3",
858 | "language": "python",
859 | "name": "python3"
860 | },
861 | "language_info": {
862 | "codemirror_mode": {
863 | "name": "ipython",
864 | "version": 3
865 | },
866 | "file_extension": ".py",
867 | "mimetype": "text/x-python",
868 | "name": "python",
869 | "nbconvert_exporter": "python",
870 | "pygments_lexer": "ipython3",
871 | "version": "3.6.9"
872 | }
873 | },
874 | "nbformat": 4,
875 | "nbformat_minor": 2
876 | }
877 |
--------------------------------------------------------------------------------
/009. XGboost, LightGBM.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 출처 \n",
8 | "\n",
9 | "## xgboost\n",
10 | "\n",
11 | "- https://apple-rbox.tistory.com/6\n",
12 | "- https://brunch.co.kr/@snobberys/137\n"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## lightgbm\n",
20 | "\n",
21 | "- https://ko.raw3h.net/page/what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-5295f7/\n",
22 | "\n",
23 | "\n",
24 | "## 그 외\n",
25 | "\n",
26 | "- https://www.kaggle.com/shep312/applying-lightgbm-to-titanic-dataset\n",
27 | "- https://www.kaggle.com/suniliitb96/titanic-survival-prediction-using-xgboost\n"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 1,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import pandas as pd\n",
37 | "import numpy as np\n",
38 | "import matplotlib.pyplot as plt\n",
39 | "import seaborn as sns\n",
40 | "from xgboost import plot_importance\n",
41 | "from xgboost import XGBClassifier\n",
42 | "from sklearn.datasets import load_breast_cancer\n",
43 | "from sklearn.model_selection import train_test_split, GridSearchCV\n",
44 | "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 10,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "def metrics(y_test, pred, pred_proba=None):  # prints accuracy/precision/recall/f1/AUC; pred_proba = optional positive-class scores used for AUC\n",
54 | " accuracy = accuracy_score(y_test, pred)\n",
55 | " precision = precision_score(y_test, pred)\n",
56 | " recall = recall_score(y_test, pred)\n",
57 | " f1 = f1_score(y_test, pred)\n",
58 | " roc_score = roc_auc_score(y_test, pred if pred_proba is None else pred_proba)  # falls back to hard labels (original behaviour) when no scores are given\n",
59 | " print('정확도 : {0:.2f}, 정밀도 : {1:.2f}, 재현율 : {2:.2f}'.format(accuracy, precision, recall))\n",
60 | " print('f1-score : {0:.2f}, auc : {1:.2f}'.format(f1, roc_score))"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/html": [
71 | "\n",
72 | "\n",
85 | "
\n",
86 | " \n",
87 | " \n",
88 | " | \n",
89 | " mean radius | \n",
90 | " mean texture | \n",
91 | " mean perimeter | \n",
92 | " mean area | \n",
93 | " mean smoothness | \n",
94 | " mean compactness | \n",
95 | " mean concavity | \n",
96 | " mean concave points | \n",
97 | " mean symmetry | \n",
98 | " mean fractal dimension | \n",
99 | " ... | \n",
100 | " worst texture | \n",
101 | " worst perimeter | \n",
102 | " worst area | \n",
103 | " worst smoothness | \n",
104 | " worst compactness | \n",
105 | " worst concavity | \n",
106 | " worst concave points | \n",
107 | " worst symmetry | \n",
108 | " worst fractal dimension | \n",
109 | " target | \n",
110 | "
\n",
111 | " \n",
112 | " \n",
113 | " \n",
114 | " 0 | \n",
115 | " 17.99 | \n",
116 | " 10.38 | \n",
117 | " 122.80 | \n",
118 | " 1001.0 | \n",
119 | " 0.11840 | \n",
120 | " 0.27760 | \n",
121 | " 0.3001 | \n",
122 | " 0.14710 | \n",
123 | " 0.2419 | \n",
124 | " 0.07871 | \n",
125 | " ... | \n",
126 | " 17.33 | \n",
127 | " 184.60 | \n",
128 | " 2019.0 | \n",
129 | " 0.1622 | \n",
130 | " 0.6656 | \n",
131 | " 0.7119 | \n",
132 | " 0.2654 | \n",
133 | " 0.4601 | \n",
134 | " 0.11890 | \n",
135 | " 0 | \n",
136 | "
\n",
137 | " \n",
138 | " 1 | \n",
139 | " 20.57 | \n",
140 | " 17.77 | \n",
141 | " 132.90 | \n",
142 | " 1326.0 | \n",
143 | " 0.08474 | \n",
144 | " 0.07864 | \n",
145 | " 0.0869 | \n",
146 | " 0.07017 | \n",
147 | " 0.1812 | \n",
148 | " 0.05667 | \n",
149 | " ... | \n",
150 | " 23.41 | \n",
151 | " 158.80 | \n",
152 | " 1956.0 | \n",
153 | " 0.1238 | \n",
154 | " 0.1866 | \n",
155 | " 0.2416 | \n",
156 | " 0.1860 | \n",
157 | " 0.2750 | \n",
158 | " 0.08902 | \n",
159 | " 0 | \n",
160 | "
\n",
161 | " \n",
162 | " 2 | \n",
163 | " 19.69 | \n",
164 | " 21.25 | \n",
165 | " 130.00 | \n",
166 | " 1203.0 | \n",
167 | " 0.10960 | \n",
168 | " 0.15990 | \n",
169 | " 0.1974 | \n",
170 | " 0.12790 | \n",
171 | " 0.2069 | \n",
172 | " 0.05999 | \n",
173 | " ... | \n",
174 | " 25.53 | \n",
175 | " 152.50 | \n",
176 | " 1709.0 | \n",
177 | " 0.1444 | \n",
178 | " 0.4245 | \n",
179 | " 0.4504 | \n",
180 | " 0.2430 | \n",
181 | " 0.3613 | \n",
182 | " 0.08758 | \n",
183 | " 0 | \n",
184 | "
\n",
185 | " \n",
186 | " 3 | \n",
187 | " 11.42 | \n",
188 | " 20.38 | \n",
189 | " 77.58 | \n",
190 | " 386.1 | \n",
191 | " 0.14250 | \n",
192 | " 0.28390 | \n",
193 | " 0.2414 | \n",
194 | " 0.10520 | \n",
195 | " 0.2597 | \n",
196 | " 0.09744 | \n",
197 | " ... | \n",
198 | " 26.50 | \n",
199 | " 98.87 | \n",
200 | " 567.7 | \n",
201 | " 0.2098 | \n",
202 | " 0.8663 | \n",
203 | " 0.6869 | \n",
204 | " 0.2575 | \n",
205 | " 0.6638 | \n",
206 | " 0.17300 | \n",
207 | " 0 | \n",
208 | "
\n",
209 | " \n",
210 | " 4 | \n",
211 | " 20.29 | \n",
212 | " 14.34 | \n",
213 | " 135.10 | \n",
214 | " 1297.0 | \n",
215 | " 0.10030 | \n",
216 | " 0.13280 | \n",
217 | " 0.1980 | \n",
218 | " 0.10430 | \n",
219 | " 0.1809 | \n",
220 | " 0.05883 | \n",
221 | " ... | \n",
222 | " 16.67 | \n",
223 | " 152.20 | \n",
224 | " 1575.0 | \n",
225 | " 0.1374 | \n",
226 | " 0.2050 | \n",
227 | " 0.4000 | \n",
228 | " 0.1625 | \n",
229 | " 0.2364 | \n",
230 | " 0.07678 | \n",
231 | " 0 | \n",
232 | "
\n",
233 | " \n",
234 | "
\n",
235 | "
5 rows × 31 columns
\n",
236 | "
"
237 | ],
238 | "text/plain": [
239 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n",
240 | "0 17.99 10.38 122.80 1001.0 0.11840 \n",
241 | "1 20.57 17.77 132.90 1326.0 0.08474 \n",
242 | "2 19.69 21.25 130.00 1203.0 0.10960 \n",
243 | "3 11.42 20.38 77.58 386.1 0.14250 \n",
244 | "4 20.29 14.34 135.10 1297.0 0.10030 \n",
245 | "\n",
246 | " mean compactness mean concavity mean concave points mean symmetry \\\n",
247 | "0 0.27760 0.3001 0.14710 0.2419 \n",
248 | "1 0.07864 0.0869 0.07017 0.1812 \n",
249 | "2 0.15990 0.1974 0.12790 0.2069 \n",
250 | "3 0.28390 0.2414 0.10520 0.2597 \n",
251 | "4 0.13280 0.1980 0.10430 0.1809 \n",
252 | "\n",
253 | " mean fractal dimension ... worst texture worst perimeter worst area \\\n",
254 | "0 0.07871 ... 17.33 184.60 2019.0 \n",
255 | "1 0.05667 ... 23.41 158.80 1956.0 \n",
256 | "2 0.05999 ... 25.53 152.50 1709.0 \n",
257 | "3 0.09744 ... 26.50 98.87 567.7 \n",
258 | "4 0.05883 ... 16.67 152.20 1575.0 \n",
259 | "\n",
260 | " worst smoothness worst compactness worst concavity worst concave points \\\n",
261 | "0 0.1622 0.6656 0.7119 0.2654 \n",
262 | "1 0.1238 0.1866 0.2416 0.1860 \n",
263 | "2 0.1444 0.4245 0.4504 0.2430 \n",
264 | "3 0.2098 0.8663 0.6869 0.2575 \n",
265 | "4 0.1374 0.2050 0.4000 0.1625 \n",
266 | "\n",
267 | " worst symmetry worst fractal dimension target \n",
268 | "0 0.4601 0.11890 0 \n",
269 | "1 0.2750 0.08902 0 \n",
270 | "2 0.3613 0.08758 0 \n",
271 | "3 0.6638 0.17300 0 \n",
272 | "4 0.2364 0.07678 0 \n",
273 | "\n",
274 | "[5 rows x 31 columns]"
275 | ]
276 | },
277 | "execution_count": 3,
278 | "metadata": {},
279 | "output_type": "execute_result"
280 | }
281 | ],
282 | "source": [
283 | "data = load_breast_cancer()\n",
284 | "\n",
285 | "cancer = pd.DataFrame(data.data, columns = data.feature_names)\n",
286 | "cancer['target'] = data.target\n",
287 | "cancer.head()"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 5,
293 | "metadata": {},
294 | "outputs": [
295 | {
296 | "data": {
297 | "text/plain": [
298 | "1 357\n",
299 | "0 212\n",
300 | "Name: target, dtype: int64"
301 | ]
302 | },
303 | "execution_count": 5,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "cancer['target'].value_counts()"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 7,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "name": "stdout",
319 | "output_type": "stream",
320 | "text": [
321 | "(455, 30) (114, 30)\n"
322 | ]
323 | }
324 | ],
325 | "source": [
326 | "y = cancer['target']\n",
327 | "X = cancer.drop('target', axis = 1)\n",
328 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)\n",
329 | "print(X_train.shape, X_test.shape)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 8,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "xgb = XGBClassifier(n_estimators=500, learning_rate = 0.1, max_depth = 4)\n",
339 | "xgb.fit(X_train, y_train)\n",
340 | "xgb_pred = xgb.predict(X_test)"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 12,
346 | "metadata": {},
347 | "outputs": [
348 | {
349 | "name": "stdout",
350 | "output_type": "stream",
351 | "text": [
352 | "정확도 : 0.99, 정밀도 : 0.99, 재현율 : 1.00\n",
353 | "f1-score : 0.99, auc : 0.99\n"
354 | ]
355 | }
356 | ],
357 | "source": [
358 | "metrics(y_test, xgb_pred)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": []
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": []
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": []
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 2,
385 | "metadata": {},
386 | "outputs": [
387 | {
388 | "data": {
389 | "text/html": [
390 | "\n",
391 | "\n",
404 | "
\n",
405 | " \n",
406 | " \n",
407 | " | \n",
408 | " PassengerId | \n",
409 | " Survived | \n",
410 | " Pclass | \n",
411 | " Name | \n",
412 | " Sex | \n",
413 | " Age | \n",
414 | " SibSp | \n",
415 | " Parch | \n",
416 | " Ticket | \n",
417 | " Fare | \n",
418 | " Cabin | \n",
419 | " Embarked | \n",
420 | "
\n",
421 | " \n",
422 | " \n",
423 | " \n",
424 | " 0 | \n",
425 | " 1 | \n",
426 | " 0 | \n",
427 | " 3 | \n",
428 | " Braund, Mr. Owen Harris | \n",
429 | " male | \n",
430 | " 22.0 | \n",
431 | " 1 | \n",
432 | " 0 | \n",
433 | " A/5 21171 | \n",
434 | " 7.2500 | \n",
435 | " NaN | \n",
436 | " S | \n",
437 | "
\n",
438 | " \n",
439 | " 1 | \n",
440 | " 2 | \n",
441 | " 1 | \n",
442 | " 1 | \n",
443 | " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
444 | " female | \n",
445 | " 38.0 | \n",
446 | " 1 | \n",
447 | " 0 | \n",
448 | " PC 17599 | \n",
449 | " 71.2833 | \n",
450 | " C85 | \n",
451 | " C | \n",
452 | "
\n",
453 | " \n",
454 | " 2 | \n",
455 | " 3 | \n",
456 | " 1 | \n",
457 | " 3 | \n",
458 | " Heikkinen, Miss. Laina | \n",
459 | " female | \n",
460 | " 26.0 | \n",
461 | " 0 | \n",
462 | " 0 | \n",
463 | " STON/O2. 3101282 | \n",
464 | " 7.9250 | \n",
465 | " NaN | \n",
466 | " S | \n",
467 | "
\n",
468 | " \n",
469 | " 3 | \n",
470 | " 4 | \n",
471 | " 1 | \n",
472 | " 1 | \n",
473 | " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
474 | " female | \n",
475 | " 35.0 | \n",
476 | " 1 | \n",
477 | " 0 | \n",
478 | " 113803 | \n",
479 | " 53.1000 | \n",
480 | " C123 | \n",
481 | " S | \n",
482 | "
\n",
483 | " \n",
484 | " 4 | \n",
485 | " 5 | \n",
486 | " 0 | \n",
487 | " 3 | \n",
488 | " Allen, Mr. William Henry | \n",
489 | " male | \n",
490 | " 35.0 | \n",
491 | " 0 | \n",
492 | " 0 | \n",
493 | " 373450 | \n",
494 | " 8.0500 | \n",
495 | " NaN | \n",
496 | " S | \n",
497 | "
\n",
498 | " \n",
499 | "
\n",
500 | "
"
501 | ],
502 | "text/plain": [
503 | " PassengerId Survived Pclass \\\n",
504 | "0 1 0 3 \n",
505 | "1 2 1 1 \n",
506 | "2 3 1 3 \n",
507 | "3 4 1 1 \n",
508 | "4 5 0 3 \n",
509 | "\n",
510 | " Name Sex Age SibSp \\\n",
511 | "0 Braund, Mr. Owen Harris male 22.0 1 \n",
512 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
513 | "2 Heikkinen, Miss. Laina female 26.0 0 \n",
514 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
515 | "4 Allen, Mr. William Henry male 35.0 0 \n",
516 | "\n",
517 | " Parch Ticket Fare Cabin Embarked \n",
518 | "0 0 A/5 21171 7.2500 NaN S \n",
519 | "1 0 PC 17599 71.2833 C85 C \n",
520 | "2 0 STON/O2. 3101282 7.9250 NaN S \n",
521 | "3 0 113803 53.1000 C123 S \n",
522 | "4 0 373450 8.0500 NaN S "
523 | ]
524 | },
525 | "execution_count": 2,
526 | "metadata": {},
527 | "output_type": "execute_result"
528 | }
529 | ],
530 | "source": [
531 | "data = pd.read_csv('./datas/titanic/train.csv')\n",
532 | "data.head()"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": 3,
538 | "metadata": {},
539 | "outputs": [],
540 | "source": [
541 | "data['Embarked'] = data['Embarked'].fillna('S')  # assign back: in-place fillna on a column selection does not reliably update `data` (pandas copy-on-write)\n",
542 | "data['Fare'] = data['Fare'].fillna(0)\n",
543 | "data['Initial'] = data['Name'].str.extract(r'([A-Za-z]+)\\.', expand=False)  # title = run of letters right before a '.'; raw string avoids the invalid '\\.' escape, expand=False yields a Series\n",
544 | "data['Initial'] = data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'])\n"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": 3,
550 | "metadata": {},
551 | "outputs": [],
552 | "source": [
553 | "\n",
554 | "mapping = {\n",
555 | " \"Mr\":0,\n",
556 | " \"Miss\":1,\n",
557 | " \"Mrs\" : 1,\n",
558 | " \"Master\":2,\n",
559 | " \"Other\":3\n",
560 | "}\n",
561 | "\n",
562 | "\n",
563 | "\n",
564 | "mapping_sex = {\n",
565 | " 'male' : 0,\n",
566 | " 'female': 1\n",
567 | "}\n",
568 | "\n",
569 | "mapping_em = {\n",
570 | " 'S' :0,\n",
571 | " 'C' :1,\n",
572 | " 'Q' :2\n",
573 | "}\n"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 3,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "\n",
583 | "data['Initial'] = data['Initial'].map(mapping)\n",
584 | "data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)\n",
585 | "data['Sex'] = data['Sex'].map(mapping_sex)\n",
586 | "data['Embarked'] = data['Embarked'].map(mapping_em)\n"
587 | ]
588 | },
589 | {
590 | "cell_type": "code",
591 | "execution_count": 3,
592 | "metadata": {},
593 | "outputs": [],
594 | "source": [
595 | "data.drop(['PassengerId', \"Ticket\", \"Cabin\", \"Name\"], axis = 1, inplace = True)\n",
596 | "\n",
597 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32\n",
598 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28\n",
599 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5\n",
600 | "data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45"
601 | ]
602 | },
603 | {
604 | "cell_type": "code",
605 | "execution_count": 4,
606 | "metadata": {},
607 | "outputs": [],
608 | "source": [
609 | "y = data['Survived']\n",
610 | "X = data.drop('Survived', axis = 1)\n",
611 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=10)"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": 13,
617 | "metadata": {},
618 | "outputs": [
619 | {
620 | "name": "stderr",
621 | "output_type": "stream",
622 | "text": [
623 | "d:\\anaconda3\\envs\\soojin\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n",
624 | " warnings.warn(CV_WARNING, FutureWarning)\n",
625 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n"
626 | ]
627 | },
628 | {
629 | "name": "stdout",
630 | "output_type": "stream",
631 | "text": [
632 | "Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"
633 | ]
634 | },
635 | {
636 | "name": "stderr",
637 | "output_type": "stream",
638 | "text": [
639 | "[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 3.5s\n",
640 | "[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 8.8s\n",
641 | "[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 12.8s finished\n"
642 | ]
643 | },
644 | {
645 | "data": {
646 | "text/plain": [
647 | "GridSearchCV(cv='warn', error_score='raise-deprecating',\n",
648 | " estimator=XGBClassifier(base_score=0.5, booster='gbtree',\n",
649 | " colsample_bylevel=1, colsample_bynode=1,\n",
650 | " colsample_bytree=1, gamma=0,\n",
651 | " learning_rate=0.1, max_delta_step=0,\n",
652 | " max_depth=3, min_child_weight=1,\n",
653 | " missing=None, n_estimators=100, n_jobs=1,\n",
654 | " nthread=None, objective='binary:logistic',\n",
655 | " random_state=0, reg_alpha=0, reg_lambda=1,\n",
656 | " scale_pos_weight=1, seed=None, silent=None,\n",
657 | " subsample=1, verbosity=1),\n",
658 | " iid='warn', n_jobs=-1,\n",
659 | " param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],\n",
660 | " 'max_depth': [4, 6, 8, 10, 12],\n",
661 | " 'n_estimators': [100, 200, 400, 600]},\n",
662 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
663 | " scoring='accuracy', verbose=1)"
664 | ]
665 | },
666 | "execution_count": 13,
667 | "metadata": {},
668 | "output_type": "execute_result"
669 | }
670 | ],
671 | "source": [
672 | "xgb = XGBClassifier()\n",
673 | "\n",
674 | "xgb_param_grid = {\n",
675 | " 'n_estimators' : [100, 200, 400, 600],\n",
676 | " 'learning_rate' : [0.01, 0.05, 0.1, 0.15, 0.2],\n",
677 | " 'max_depth' : [4, 6, 8, 10, 12],\n",
678 | "}\n",
679 | "\n",
680 | "xgb_grid = GridSearchCV(xgb, param_grid = xgb_param_grid, scoring=\"accuracy\", n_jobs= -1, verbose = 1)\n",
681 | "xgb_grid.fit(X_train, y_train)\n"
682 | ]
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": 14,
687 | "metadata": {},
688 | "outputs": [
689 | {
690 | "name": "stdout",
691 | "output_type": "stream",
692 | "text": [
693 | "최고 평균 정확도 : 0.8244\n",
694 | "최고의 파라미터 : {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}\n"
695 | ]
696 | }
697 | ],
698 | "source": [
699 | "print(\"최고 평균 정확도 : {0:.4f}\".format(xgb_grid.best_score_))\n",
700 | "print(\"최고의 파라미터 : \", xgb_grid.best_params_)"
701 | ]
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": 15,
706 | "metadata": {},
707 | "outputs": [],
708 | "source": [
709 | "result = pd.DataFrame(xgb_grid.cv_results_)\n",
710 | "result.sort_values(by=['rank_test_score'], inplace=True)"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 16,
716 | "metadata": {},
717 | "outputs": [
718 | {
719 | "data": {
720 | "text/html": [
721 | "\n",
722 | "\n",
735 | "
\n",
736 | " \n",
737 | " \n",
738 | " | \n",
739 | " params | \n",
740 | " mean_test_score | \n",
741 | " rank_test_score | \n",
742 | "
\n",
743 | " \n",
744 | " \n",
745 | " \n",
746 | " 16 | \n",
747 | " {'learning_rate': 0.01, 'max_depth': 12, 'n_es... | \n",
748 | " 0.824438 | \n",
749 | " 1 | \n",
750 | "
\n",
751 | " \n",
752 | " 12 | \n",
753 | " {'learning_rate': 0.01, 'max_depth': 10, 'n_es... | \n",
754 | " 0.824438 | \n",
755 | " 1 | \n",
756 | "
\n",
757 | " \n",
758 | " 10 | \n",
759 | " {'learning_rate': 0.01, 'max_depth': 8, 'n_est... | \n",
760 | " 0.823034 | \n",
761 | " 3 | \n",
762 | "
\n",
763 | " \n",
764 | " 6 | \n",
765 | " {'learning_rate': 0.01, 'max_depth': 6, 'n_est... | \n",
766 | " 0.823034 | \n",
767 | " 3 | \n",
768 | "
\n",
769 | " \n",
770 | " 11 | \n",
771 | " {'learning_rate': 0.01, 'max_depth': 8, 'n_est... | \n",
772 | " 0.821629 | \n",
773 | " 5 | \n",
774 | "
\n",
775 | " \n",
776 | " 14 | \n",
777 | " {'learning_rate': 0.01, 'max_depth': 10, 'n_es... | \n",
778 | " 0.820225 | \n",
779 | " 6 | \n",
780 | "
\n",
781 | " \n",
782 | " 7 | \n",
783 | " {'learning_rate': 0.01, 'max_depth': 6, 'n_est... | \n",
784 | " 0.820225 | \n",
785 | " 6 | \n",
786 | "
\n",
787 | " \n",
788 | " 8 | \n",
789 | " {'learning_rate': 0.01, 'max_depth': 8, 'n_est... | \n",
790 | " 0.820225 | \n",
791 | " 6 | \n",
792 | "
\n",
793 | " \n",
794 | " 15 | \n",
795 | " {'learning_rate': 0.01, 'max_depth': 10, 'n_es... | \n",
796 | " 0.818820 | \n",
797 | " 9 | \n",
798 | "
\n",
799 | " \n",
800 | " 20 | \n",
801 | " {'learning_rate': 0.05, 'max_depth': 4, 'n_est... | \n",
802 | " 0.818820 | \n",
803 | " 9 | \n",
804 | "
\n",
805 | " \n",
806 | "
\n",
807 | "
"
808 | ],
809 | "text/plain": [
810 | " params mean_test_score \\\n",
811 | "16 {'learning_rate': 0.01, 'max_depth': 12, 'n_es... 0.824438 \n",
812 | "12 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... 0.824438 \n",
813 | "10 {'learning_rate': 0.01, 'max_depth': 8, 'n_est... 0.823034 \n",
814 | "6 {'learning_rate': 0.01, 'max_depth': 6, 'n_est... 0.823034 \n",
815 | "11 {'learning_rate': 0.01, 'max_depth': 8, 'n_est... 0.821629 \n",
816 | "14 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... 0.820225 \n",
817 | "7 {'learning_rate': 0.01, 'max_depth': 6, 'n_est... 0.820225 \n",
818 | "8 {'learning_rate': 0.01, 'max_depth': 8, 'n_est... 0.820225 \n",
819 | "15 {'learning_rate': 0.01, 'max_depth': 10, 'n_es... 0.818820 \n",
820 | "20 {'learning_rate': 0.05, 'max_depth': 4, 'n_est... 0.818820 \n",
821 | "\n",
822 | " rank_test_score \n",
823 | "16 1 \n",
824 | "12 1 \n",
825 | "10 3 \n",
826 | "6 3 \n",
827 | "11 5 \n",
828 | "14 6 \n",
829 | "7 6 \n",
830 | "8 6 \n",
831 | "15 9 \n",
832 | "20 9 "
833 | ]
834 | },
835 | "execution_count": 16,
836 | "metadata": {},
837 | "output_type": "execute_result"
838 | }
839 | ],
840 | "source": [
841 | "result[['params', 'mean_test_score', 'rank_test_score']].head(10)"
842 | ]
843 | },
844 | {
845 | "cell_type": "code",
846 | "execution_count": 5,
847 | "metadata": {},
848 | "outputs": [],
849 | "source": []
850 | },
851 | {
852 | "cell_type": "code",
853 | "execution_count": 5,
854 | "metadata": {},
855 | "outputs": [],
856 | "source": []
857 | },
858 | {
859 | "cell_type": "code",
860 | "execution_count": 7,
861 | "metadata": {},
862 | "outputs": [
863 | {
864 | "name": "stdout",
865 | "output_type": "stream",
866 | "text": [
867 | "[0]\tvalidation_0-logloss:0.643237\n",
868 | "Will train until validation_0-logloss hasn't improved in 100 rounds.\n",
869 | "[1]\tvalidation_0-logloss:0.600544\n",
870 | "[2]\tvalidation_0-logloss:0.567278\n",
871 | "[3]\tvalidation_0-logloss:0.539616\n",
872 | "[4]\tvalidation_0-logloss:0.515783\n",
873 | "[5]\tvalidation_0-logloss:0.493083\n",
874 | "[6]\tvalidation_0-logloss:0.475519\n",
875 | "[7]\tvalidation_0-logloss:0.460413\n",
876 | "[8]\tvalidation_0-logloss:0.447876\n",
877 | "[9]\tvalidation_0-logloss:0.434248\n",
878 | "[10]\tvalidation_0-logloss:0.424888\n",
879 | "[11]\tvalidation_0-logloss:0.417043\n",
880 | "[12]\tvalidation_0-logloss:0.410522\n",
881 | "[13]\tvalidation_0-logloss:0.40404\n",
882 | "[14]\tvalidation_0-logloss:0.399429\n",
883 | "[15]\tvalidation_0-logloss:0.393991\n",
884 | "[16]\tvalidation_0-logloss:0.390694\n",
885 | "[17]\tvalidation_0-logloss:0.386906\n",
886 | "[18]\tvalidation_0-logloss:0.383186\n",
887 | "[19]\tvalidation_0-logloss:0.379069\n",
888 | "[20]\tvalidation_0-logloss:0.375775\n",
889 | "[21]\tvalidation_0-logloss:0.374222\n",
890 | "[22]\tvalidation_0-logloss:0.372794\n",
891 | "[23]\tvalidation_0-logloss:0.373341\n",
892 | "[24]\tvalidation_0-logloss:0.370527\n",
893 | "[25]\tvalidation_0-logloss:0.369523\n",
894 | "[26]\tvalidation_0-logloss:0.369086\n",
895 | "[27]\tvalidation_0-logloss:0.369067\n",
896 | "[28]\tvalidation_0-logloss:0.367292\n",
897 | "[29]\tvalidation_0-logloss:0.366029\n",
898 | "[30]\tvalidation_0-logloss:0.365949\n",
899 | "[31]\tvalidation_0-logloss:0.364792\n",
900 | "[32]\tvalidation_0-logloss:0.365043\n",
901 | "[33]\tvalidation_0-logloss:0.365255\n",
902 | "[34]\tvalidation_0-logloss:0.364502\n",
903 | "[35]\tvalidation_0-logloss:0.36495\n",
904 | "[36]\tvalidation_0-logloss:0.3653\n",
905 | "[37]\tvalidation_0-logloss:0.365692\n",
906 | "[38]\tvalidation_0-logloss:0.364633\n",
907 | "[39]\tvalidation_0-logloss:0.365394\n",
908 | "[40]\tvalidation_0-logloss:0.366006\n",
909 | "[41]\tvalidation_0-logloss:0.364511\n",
910 | "[42]\tvalidation_0-logloss:0.362128\n",
911 | "[43]\tvalidation_0-logloss:0.363265\n",
912 | "[44]\tvalidation_0-logloss:0.362809\n",
913 | "[45]\tvalidation_0-logloss:0.361418\n",
914 | "[46]\tvalidation_0-logloss:0.361081\n",
915 | "[47]\tvalidation_0-logloss:0.362271\n",
916 | "[48]\tvalidation_0-logloss:0.360343\n",
917 | "[49]\tvalidation_0-logloss:0.360147\n",
918 | "[50]\tvalidation_0-logloss:0.359536\n",
919 | "[51]\tvalidation_0-logloss:0.360105\n",
920 | "[52]\tvalidation_0-logloss:0.359863\n",
921 | "[53]\tvalidation_0-logloss:0.360054\n",
922 | "[54]\tvalidation_0-logloss:0.360457\n",
923 | "[55]\tvalidation_0-logloss:0.359963\n",
924 | "[56]\tvalidation_0-logloss:0.359591\n",
925 | "[57]\tvalidation_0-logloss:0.360042\n",
926 | "[58]\tvalidation_0-logloss:0.358606\n",
927 | "[59]\tvalidation_0-logloss:0.35847\n",
928 | "[60]\tvalidation_0-logloss:0.358429\n",
929 | "[61]\tvalidation_0-logloss:0.358046\n",
930 | "[62]\tvalidation_0-logloss:0.357865\n",
931 | "[63]\tvalidation_0-logloss:0.356589\n",
932 | "[64]\tvalidation_0-logloss:0.356376\n",
933 | "[65]\tvalidation_0-logloss:0.357027\n",
934 | "[66]\tvalidation_0-logloss:0.356924\n",
935 | "[67]\tvalidation_0-logloss:0.357237\n",
936 | "[68]\tvalidation_0-logloss:0.358427\n",
937 | "[69]\tvalidation_0-logloss:0.358904\n",
938 | "[70]\tvalidation_0-logloss:0.356838\n",
939 | "[71]\tvalidation_0-logloss:0.355709\n",
940 | "[72]\tvalidation_0-logloss:0.356185\n",
941 | "[73]\tvalidation_0-logloss:0.357439\n",
942 | "[74]\tvalidation_0-logloss:0.356952\n",
943 | "[75]\tvalidation_0-logloss:0.356894\n",
944 | "[76]\tvalidation_0-logloss:0.357164\n",
945 | "[77]\tvalidation_0-logloss:0.35748\n",
946 | "[78]\tvalidation_0-logloss:0.357296\n",
947 | "[79]\tvalidation_0-logloss:0.357984\n",
948 | "[80]\tvalidation_0-logloss:0.357816\n",
949 | "[81]\tvalidation_0-logloss:0.358238\n",
950 | "[82]\tvalidation_0-logloss:0.358398\n",
951 | "[83]\tvalidation_0-logloss:0.358424\n",
952 | "[84]\tvalidation_0-logloss:0.358912\n",
953 | "[85]\tvalidation_0-logloss:0.360025\n",
954 | "[86]\tvalidation_0-logloss:0.359234\n",
955 | "[87]\tvalidation_0-logloss:0.359403\n",
956 | "[88]\tvalidation_0-logloss:0.358514\n",
957 | "[89]\tvalidation_0-logloss:0.359621\n",
958 | "[90]\tvalidation_0-logloss:0.359716\n",
959 | "[91]\tvalidation_0-logloss:0.360305\n",
960 | "[92]\tvalidation_0-logloss:0.359297\n",
961 | "[93]\tvalidation_0-logloss:0.35923\n",
962 | "[94]\tvalidation_0-logloss:0.35925\n",
963 | "[95]\tvalidation_0-logloss:0.359636\n",
964 | "[96]\tvalidation_0-logloss:0.358746\n",
965 | "[97]\tvalidation_0-logloss:0.359995\n",
966 | "[98]\tvalidation_0-logloss:0.358856\n",
967 | "[99]\tvalidation_0-logloss:0.359269\n",
968 | "[100]\tvalidation_0-logloss:0.359495\n",
969 | "[101]\tvalidation_0-logloss:0.359534\n",
970 | "[102]\tvalidation_0-logloss:0.359903\n",
971 | "[103]\tvalidation_0-logloss:0.360073\n",
972 | "[104]\tvalidation_0-logloss:0.360139\n",
973 | "[105]\tvalidation_0-logloss:0.360796\n",
974 | "[106]\tvalidation_0-logloss:0.359293\n",
975 | "[107]\tvalidation_0-logloss:0.359956\n",
976 | "[108]\tvalidation_0-logloss:0.360043\n",
977 | "[109]\tvalidation_0-logloss:0.359125\n",
978 | "[110]\tvalidation_0-logloss:0.359315\n",
979 | "[111]\tvalidation_0-logloss:0.3594\n",
980 | "[112]\tvalidation_0-logloss:0.359811\n",
981 | "[113]\tvalidation_0-logloss:0.359921\n",
982 | "[114]\tvalidation_0-logloss:0.360095\n",
983 | "[115]\tvalidation_0-logloss:0.35926\n",
984 | "[116]\tvalidation_0-logloss:0.359522\n",
985 | "[117]\tvalidation_0-logloss:0.35992\n",
986 | "[118]\tvalidation_0-logloss:0.359175\n",
987 | "[119]\tvalidation_0-logloss:0.358587\n",
988 | "[120]\tvalidation_0-logloss:0.358692\n",
989 | "[121]\tvalidation_0-logloss:0.359066\n",
990 | "[122]\tvalidation_0-logloss:0.359215\n",
991 | "[123]\tvalidation_0-logloss:0.358593\n",
992 | "[124]\tvalidation_0-logloss:0.35855\n",
993 | "[125]\tvalidation_0-logloss:0.35841\n",
994 | "[126]\tvalidation_0-logloss:0.358248\n",
995 | "[127]\tvalidation_0-logloss:0.358388\n",
996 | "[128]\tvalidation_0-logloss:0.358489\n",
997 | "[129]\tvalidation_0-logloss:0.358913\n",
998 | "[130]\tvalidation_0-logloss:0.359169\n",
999 | "[131]\tvalidation_0-logloss:0.358706\n",
1000 | "[132]\tvalidation_0-logloss:0.358846\n",
1001 | "[133]\tvalidation_0-logloss:0.35899\n",
1002 | "[134]\tvalidation_0-logloss:0.358574\n",
1003 | "[135]\tvalidation_0-logloss:0.358431\n",
1004 | "[136]\tvalidation_0-logloss:0.358572\n",
1005 | "[137]\tvalidation_0-logloss:0.357526\n",
1006 | "[138]\tvalidation_0-logloss:0.3576\n",
1007 | "[139]\tvalidation_0-logloss:0.358176\n",
1008 | "[140]\tvalidation_0-logloss:0.357707\n",
1009 | "[141]\tvalidation_0-logloss:0.357483\n",
1010 | "[142]\tvalidation_0-logloss:0.357542\n",
1011 | "[143]\tvalidation_0-logloss:0.357489\n",
1012 | "[144]\tvalidation_0-logloss:0.357366\n",
1013 | "[145]\tvalidation_0-logloss:0.358119\n",
1014 | "[146]\tvalidation_0-logloss:0.358145\n",
1015 | "[147]\tvalidation_0-logloss:0.35822\n",
1016 | "[148]\tvalidation_0-logloss:0.35805\n",
1017 | "[149]\tvalidation_0-logloss:0.35899\n",
1018 | "[150]\tvalidation_0-logloss:0.35882\n",
1019 | "[151]\tvalidation_0-logloss:0.357895\n",
1020 | "[152]\tvalidation_0-logloss:0.358154\n",
1021 | "[153]\tvalidation_0-logloss:0.357417\n",
1022 | "[154]\tvalidation_0-logloss:0.359365\n",
1023 | "[155]\tvalidation_0-logloss:0.358782\n",
1024 | "[156]\tvalidation_0-logloss:0.358195\n",
1025 | "[157]\tvalidation_0-logloss:0.357697\n",
1026 | "[158]\tvalidation_0-logloss:0.358491\n",
1027 | "[159]\tvalidation_0-logloss:0.358627\n",
1028 | "[160]\tvalidation_0-logloss:0.358216\n",
1029 | "[161]\tvalidation_0-logloss:0.358591\n",
1030 | "[162]\tvalidation_0-logloss:0.358682\n",
1031 | "[163]\tvalidation_0-logloss:0.358732\n",
1032 | "[164]\tvalidation_0-logloss:0.358995\n",
1033 | "[165]\tvalidation_0-logloss:0.359204\n",
1034 | "[166]\tvalidation_0-logloss:0.358358\n",
1035 | "[167]\tvalidation_0-logloss:0.359008\n",
1036 | "[168]\tvalidation_0-logloss:0.358891\n",
1037 | "[169]\tvalidation_0-logloss:0.357869\n",
1038 | "[170]\tvalidation_0-logloss:0.357907\n",
1039 | "[171]\tvalidation_0-logloss:0.3576\n",
1040 | "Stopping. Best iteration:\n",
1041 | "[71]\tvalidation_0-logloss:0.355709\n",
1042 | "\n"
1043 | ]
1044 | },
1045 | {
1046 | "data": {
1047 | "text/plain": [
1048 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
1049 | " colsample_bynode=1, colsample_bytree=1, gamma=0,\n",
1050 | " learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
1051 | " min_child_weight=1, missing=None, n_estimators=400, n_jobs=1,\n",
1052 | " nthread=None, objective='binary:logistic', random_state=0,\n",
1053 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
1054 | " silent=None, subsample=1, verbosity=1)"
1055 | ]
1056 | },
1057 | "execution_count": 7,
1058 | "metadata": {},
1059 | "output_type": "execute_result"
1060 | }
1061 | ],
1062 | "source": [
1063 | "xgb = XGBClassifier(n_estimators=400, learning_rate = 0.1, max_depth = 3)\n",
1064 | "evals = [(X_test, y_test)]\n",
1065 | "xgb.fit(X_train, y_train, early_stopping_rounds = 100, eval_metric = \"logloss\", eval_set = evals, verbose = 1)"
1066 | ]
1067 | },
1068 | {
1069 | "cell_type": "code",
1070 | "execution_count": 17,
1071 | "metadata": {},
1072 | "outputs": [
1073 | {
1074 | "name": "stdout",
1075 | "output_type": "stream",
1076 | "text": [
1077 | "[0]\tvalidation_0-logloss:0.643237\n",
1078 | "Will train until validation_0-logloss hasn't improved in 3 rounds.\n",
1079 | "[1]\tvalidation_0-logloss:0.600544\n",
1080 | "[2]\tvalidation_0-logloss:0.567278\n",
1081 | "[3]\tvalidation_0-logloss:0.539616\n",
1082 | "[4]\tvalidation_0-logloss:0.515783\n",
1083 | "[5]\tvalidation_0-logloss:0.493083\n",
1084 | "[6]\tvalidation_0-logloss:0.475519\n",
1085 | "[7]\tvalidation_0-logloss:0.460413\n",
1086 | "[8]\tvalidation_0-logloss:0.447876\n",
1087 | "[9]\tvalidation_0-logloss:0.434248\n",
1088 | "[10]\tvalidation_0-logloss:0.424888\n",
1089 | "[11]\tvalidation_0-logloss:0.417043\n",
1090 | "[12]\tvalidation_0-logloss:0.410522\n",
1091 | "[13]\tvalidation_0-logloss:0.40404\n",
1092 | "[14]\tvalidation_0-logloss:0.399429\n",
1093 | "[15]\tvalidation_0-logloss:0.393991\n",
1094 | "[16]\tvalidation_0-logloss:0.390694\n",
1095 | "[17]\tvalidation_0-logloss:0.386906\n",
1096 | "[18]\tvalidation_0-logloss:0.383186\n",
1097 | "[19]\tvalidation_0-logloss:0.379069\n",
1098 | "[20]\tvalidation_0-logloss:0.375775\n",
1099 | "[21]\tvalidation_0-logloss:0.374222\n",
1100 | "[22]\tvalidation_0-logloss:0.372794\n",
1101 | "[23]\tvalidation_0-logloss:0.373341\n",
1102 | "[24]\tvalidation_0-logloss:0.370527\n",
1103 | "[25]\tvalidation_0-logloss:0.369523\n",
1104 | "[26]\tvalidation_0-logloss:0.369086\n",
1105 | "[27]\tvalidation_0-logloss:0.369067\n",
1106 | "[28]\tvalidation_0-logloss:0.367292\n",
1107 | "[29]\tvalidation_0-logloss:0.366029\n",
1108 | "[30]\tvalidation_0-logloss:0.365949\n",
1109 | "[31]\tvalidation_0-logloss:0.364792\n",
1110 | "[32]\tvalidation_0-logloss:0.365043\n",
1111 | "[33]\tvalidation_0-logloss:0.365255\n",
1112 | "[34]\tvalidation_0-logloss:0.364502\n",
1113 | "[35]\tvalidation_0-logloss:0.36495\n",
1114 | "[36]\tvalidation_0-logloss:0.3653\n",
1115 | "[37]\tvalidation_0-logloss:0.365692\n",
1116 | "Stopping. Best iteration:\n",
1117 | "[34]\tvalidation_0-logloss:0.364502\n",
1118 | "\n"
1119 | ]
1120 | },
1121 | {
1122 | "data": {
1123 | "text/plain": [
1124 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
1125 | " colsample_bynode=1, colsample_bytree=1, gamma=0,\n",
1126 | " learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
1127 | " min_child_weight=1, missing=None, n_estimators=400, n_jobs=1,\n",
1128 | " nthread=None, objective='binary:logistic', random_state=0,\n",
1129 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
1130 | " silent=None, subsample=1, verbosity=1)"
1131 | ]
1132 | },
1133 | "execution_count": 17,
1134 | "metadata": {},
1135 | "output_type": "execute_result"
1136 | }
1137 | ],
1138 | "source": [
1139 | "xgb = XGBClassifier(n_estimators=400, learning_rate = 0.1, max_depth = 3)\n",
1140 | "evals = [(X_test, y_test)]\n",
1141 | "xgb.fit(X_train, y_train, early_stopping_rounds = 3, eval_metric = \"logloss\", eval_set = evals, verbose = 1)"
1142 | ]
1143 | },
1144 | {
1145 | "cell_type": "code",
1146 | "execution_count": 9,
1147 | "metadata": {},
1148 | "outputs": [
1149 | {
1150 | "data": {
1151 | "text/plain": [
1152 | ""
1153 | ]
1154 | },
1155 | "execution_count": 9,
1156 | "metadata": {},
1157 | "output_type": "execute_result"
1158 | },
1159 | {
1160 | "data": {
1161 | "image/png": "\n",
1162 | "text/plain": [
1163 | ""
1164 | ]
1165 | },
1166 | "metadata": {
1167 | "needs_background": "light"
1168 | },
1169 | "output_type": "display_data"
1170 | }
1171 | ],
1172 | "source": [
1173 | "fig, ax = plt.subplots()\n",
1174 | "plot_importance(xgb, ax=ax)"
1175 | ]
1176 | },
1177 | {
1178 | "cell_type": "code",
1179 | "execution_count": null,
1180 | "metadata": {},
1181 | "outputs": [],
1182 | "source": []
1183 | },
1184 | {
1185 | "cell_type": "code",
1186 | "execution_count": 6,
1187 | "metadata": {},
1188 | "outputs": [],
1189 | "source": [
1190 | "from lightgbm import LGBMClassifier, plot_importance"
1191 | ]
1192 | },
1193 | {
1194 | "cell_type": "code",
1195 | "execution_count": 11,
1196 | "metadata": {},
1197 | "outputs": [
1198 | {
1199 | "data": {
1200 | "text/plain": [
1201 | "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n",
1202 | " importance_type='split', learning_rate=0.1, max_depth=-1,\n",
1203 | " min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n",
1204 | " n_estimaotrs=400, n_estimators=100, n_jobs=-1, num_leaves=31,\n",
1205 | " objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n",
1206 | " silent=True, subsample=1.0, subsample_for_bin=200000,\n",
1207 | " subsample_freq=0)"
1208 | ]
1209 | },
1210 | "execution_count": 11,
1211 | "metadata": {},
1212 | "output_type": "execute_result"
1213 | }
1214 | ],
1215 | "source": [
1216 | "lgb = LGBMClassifier(n_estimators = 400)\n",
1217 | "lgb.fit(X_train, y_train)"
1218 | ]
1219 | },
1220 | {
1221 | "cell_type": "code",
1222 | "execution_count": 12,
1223 | "metadata": {},
1224 | "outputs": [
1225 | {
1226 | "name": "stdout",
1227 | "output_type": "stream",
1228 | "text": [
1229 | "정확도 : 0.84, 정밀도 : 0.79, 재현율 : 0.73\n",
1230 | "f1-score : 0.76, auc : 0.81\n"
1231 | ]
1232 | }
1233 | ],
1234 | "source": [
1235 | "lgb_pred = lgb.predict(X_test)\n",
1236 | "metrics(y_test, lgb_pred)"
1237 | ]
1238 | },
1239 | {
1240 | "cell_type": "code",
1241 | "execution_count": 13,
1242 | "metadata": {},
1243 | "outputs": [
1244 | {
1245 | "name": "stdout",
1246 | "output_type": "stream",
1247 | "text": [
1248 | "[1]\tvalid_0's binary_logloss: 0.605701\n",
1249 | "Training until validation scores don't improve for 100 rounds\n",
1250 | "[2]\tvalid_0's binary_logloss: 0.569461\n",
1251 | "[3]\tvalid_0's binary_logloss: 0.540251\n",
1252 | "[4]\tvalid_0's binary_logloss: 0.5147\n",
1253 | "[5]\tvalid_0's binary_logloss: 0.493662\n",
1254 | "[6]\tvalid_0's binary_logloss: 0.47569\n",
1255 | "[7]\tvalid_0's binary_logloss: 0.45573\n",
1256 | "[8]\tvalid_0's binary_logloss: 0.442288\n",
1257 | "[9]\tvalid_0's binary_logloss: 0.427343\n",
1258 | "[10]\tvalid_0's binary_logloss: 0.41478\n",
1259 | "[11]\tvalid_0's binary_logloss: 0.404568\n",
1260 | "[12]\tvalid_0's binary_logloss: 0.394087\n",
1261 | "[13]\tvalid_0's binary_logloss: 0.384579\n",
1262 | "[14]\tvalid_0's binary_logloss: 0.377022\n",
1263 | "[15]\tvalid_0's binary_logloss: 0.372698\n",
1264 | "[16]\tvalid_0's binary_logloss: 0.367266\n",
1265 | "[17]\tvalid_0's binary_logloss: 0.364566\n",
1266 | "[18]\tvalid_0's binary_logloss: 0.362322\n",
1267 | "[19]\tvalid_0's binary_logloss: 0.35638\n",
1268 | "[20]\tvalid_0's binary_logloss: 0.352956\n",
1269 | "[21]\tvalid_0's binary_logloss: 0.351149\n",
1270 | "[22]\tvalid_0's binary_logloss: 0.350341\n",
1271 | "[23]\tvalid_0's binary_logloss: 0.348923\n",
1272 | "[24]\tvalid_0's binary_logloss: 0.348176\n",
1273 | "[25]\tvalid_0's binary_logloss: 0.34714\n",
1274 | "[26]\tvalid_0's binary_logloss: 0.346754\n",
1275 | "[27]\tvalid_0's binary_logloss: 0.347015\n",
1276 | "[28]\tvalid_0's binary_logloss: 0.347799\n",
1277 | "[29]\tvalid_0's binary_logloss: 0.348623\n",
1278 | "[30]\tvalid_0's binary_logloss: 0.349346\n",
1279 | "[31]\tvalid_0's binary_logloss: 0.350961\n",
1280 | "[32]\tvalid_0's binary_logloss: 0.352158\n",
1281 | "[33]\tvalid_0's binary_logloss: 0.352746\n",
1282 | "[34]\tvalid_0's binary_logloss: 0.353988\n",
1283 | "[35]\tvalid_0's binary_logloss: 0.35563\n",
1284 | "[36]\tvalid_0's binary_logloss: 0.357587\n",
1285 | "[37]\tvalid_0's binary_logloss: 0.357775\n",
1286 | "[38]\tvalid_0's binary_logloss: 0.359317\n",
1287 | "[39]\tvalid_0's binary_logloss: 0.360177\n",
1288 | "[40]\tvalid_0's binary_logloss: 0.359158\n",
1289 | "[41]\tvalid_0's binary_logloss: 0.360159\n",
1290 | "[42]\tvalid_0's binary_logloss: 0.359884\n",
1291 | "[43]\tvalid_0's binary_logloss: 0.360693\n",
1292 | "[44]\tvalid_0's binary_logloss: 0.361518\n",
1293 | "[45]\tvalid_0's binary_logloss: 0.361417\n",
1294 | "[46]\tvalid_0's binary_logloss: 0.36477\n",
1295 | "[47]\tvalid_0's binary_logloss: 0.366563\n",
1296 | "[48]\tvalid_0's binary_logloss: 0.367413\n",
1297 | "[49]\tvalid_0's binary_logloss: 0.370403\n",
1298 | "[50]\tvalid_0's binary_logloss: 0.370454\n",
1299 | "[51]\tvalid_0's binary_logloss: 0.3713\n",
1300 | "[52]\tvalid_0's binary_logloss: 0.373395\n",
1301 | "[53]\tvalid_0's binary_logloss: 0.371452\n",
1302 | "[54]\tvalid_0's binary_logloss: 0.370792\n",
1303 | "[55]\tvalid_0's binary_logloss: 0.369311\n",
1304 | "[56]\tvalid_0's binary_logloss: 0.368987\n",
1305 | "[57]\tvalid_0's binary_logloss: 0.372909\n",
1306 | "[58]\tvalid_0's binary_logloss: 0.371587\n",
1307 | "[59]\tvalid_0's binary_logloss: 0.371235\n",
1308 | "[60]\tvalid_0's binary_logloss: 0.371714\n",
1309 | "[61]\tvalid_0's binary_logloss: 0.372394\n",
1310 | "[62]\tvalid_0's binary_logloss: 0.371164\n",
1311 | "[63]\tvalid_0's binary_logloss: 0.371928\n",
1312 | "[64]\tvalid_0's binary_logloss: 0.372314\n",
1313 | "[65]\tvalid_0's binary_logloss: 0.372502\n",
1314 | "[66]\tvalid_0's binary_logloss: 0.376302\n",
1315 | "[67]\tvalid_0's binary_logloss: 0.378364\n",
1316 | "[68]\tvalid_0's binary_logloss: 0.378404\n",
1317 | "[69]\tvalid_0's binary_logloss: 0.381327\n",
1318 | "[70]\tvalid_0's binary_logloss: 0.380973\n",
1319 | "[71]\tvalid_0's binary_logloss: 0.382481\n",
1320 | "[72]\tvalid_0's binary_logloss: 0.38136\n",
1321 | "[73]\tvalid_0's binary_logloss: 0.383008\n",
1322 | "[74]\tvalid_0's binary_logloss: 0.381861\n",
1323 | "[75]\tvalid_0's binary_logloss: 0.382796\n",
1324 | "[76]\tvalid_0's binary_logloss: 0.38258\n",
1325 | "[77]\tvalid_0's binary_logloss: 0.384473\n",
1326 | "[78]\tvalid_0's binary_logloss: 0.383581\n",
1327 | "[79]\tvalid_0's binary_logloss: 0.385198\n",
1328 | "[80]\tvalid_0's binary_logloss: 0.383797\n",
1329 | "[81]\tvalid_0's binary_logloss: 0.383937\n",
1330 | "[82]\tvalid_0's binary_logloss: 0.383372\n",
1331 | "[83]\tvalid_0's binary_logloss: 0.384661\n",
1332 | "[84]\tvalid_0's binary_logloss: 0.383799\n",
1333 | "[85]\tvalid_0's binary_logloss: 0.384108\n",
1334 | "[86]\tvalid_0's binary_logloss: 0.383364\n",
1335 | "[87]\tvalid_0's binary_logloss: 0.384795\n",
1336 | "[88]\tvalid_0's binary_logloss: 0.384702\n",
1337 | "[89]\tvalid_0's binary_logloss: 0.386003\n",
1338 | "[90]\tvalid_0's binary_logloss: 0.386621\n",
1339 | "[91]\tvalid_0's binary_logloss: 0.387986\n",
1340 | "[92]\tvalid_0's binary_logloss: 0.390496\n",
1341 | "[93]\tvalid_0's binary_logloss: 0.389984\n",
1342 | "[94]\tvalid_0's binary_logloss: 0.391477\n",
1343 | "[95]\tvalid_0's binary_logloss: 0.391917\n",
1344 | "[96]\tvalid_0's binary_logloss: 0.392326\n",
1345 | "[97]\tvalid_0's binary_logloss: 0.392731\n",
1346 | "[98]\tvalid_0's binary_logloss: 0.392586\n",
1347 | "[99]\tvalid_0's binary_logloss: 0.394479\n",
1348 | "[100]\tvalid_0's binary_logloss: 0.397251\n",
1349 | "Did not meet early stopping. Best iteration is:\n",
1350 | "[26]\tvalid_0's binary_logloss: 0.346754\n"
1351 | ]
1352 | },
1353 | {
1354 | "data": {
1355 | "text/plain": [
1356 | "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n",
1357 | " importance_type='split', learning_rate=0.1, max_depth=-1,\n",
1358 | " min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n",
1359 | " n_estimaotrs=400, n_estimators=100, n_jobs=-1, num_leaves=31,\n",
1360 | " objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n",
1361 | " silent=True, subsample=1.0, subsample_for_bin=200000,\n",
1362 | " subsample_freq=0)"
1363 | ]
1364 | },
1365 | "execution_count": 13,
1366 | "metadata": {},
1367 | "output_type": "execute_result"
1368 | }
1369 | ],
1370 | "source": [
1371 | "lgb = LGBMClassifier(n_estimators = 400)\n",
1372 | "evals = [(X_test, y_test)]\n",
1373 | "lgb.fit(X_train, y_train, early_stopping_rounds = 100, eval_metric = \"logloss\", eval_set = evals, verbose = True)"
1374 | ]
1375 | },
1376 | {
1377 | "cell_type": "code",
1378 | "execution_count": 15,
1379 | "metadata": {},
1380 | "outputs": [
1381 | {
1382 | "data": {
1383 | "text/plain": [
1384 | ""
1385 | ]
1386 | },
1387 | "execution_count": 15,
1388 | "metadata": {},
1389 | "output_type": "execute_result"
1390 | },
1391 | {
1392 | "data": {
1393 | "image/png": "\n",
1394 | "text/plain": [
1395 | ""
1396 | ]
1397 | },
1398 | "metadata": {
1399 | "needs_background": "light"
1400 | },
1401 | "output_type": "display_data"
1402 | }
1403 | ],
1404 | "source": [
1405 | "fig, ax = plt.subplots(figsize=(10, 6))\n",
1406 | "plot_importance(lgb, ax = ax)"
1407 | ]
1408 | },
1409 | {
1410 | "cell_type": "code",
1411 | "execution_count": null,
1412 | "metadata": {},
1413 | "outputs": [],
1414 | "source": []
1415 | },
1416 | {
1417 | "cell_type": "code",
1418 | "execution_count": null,
1419 | "metadata": {},
1420 | "outputs": [],
1421 | "source": []
1422 | }
1423 | ],
1424 | "metadata": {
1425 | "kernelspec": {
1426 | "display_name": "Python 3",
1427 | "language": "python",
1428 | "name": "python3"
1429 | },
1430 | "language_info": {
1431 | "codemirror_mode": {
1432 | "name": "ipython",
1433 | "version": 3
1434 | },
1435 | "file_extension": ".py",
1436 | "mimetype": "text/x-python",
1437 | "name": "python",
1438 | "nbconvert_exporter": "python",
1439 | "pygments_lexer": "ipython3",
1440 | "version": "3.6.9"
1441 | }
1442 | },
1443 | "nbformat": 4,
1444 | "nbformat_minor": 2
1445 | }
1446 |
--------------------------------------------------------------------------------
/010. credit_card_fraud_basic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "import seaborn as sns\n",
13 | "from xgboost import XGBClassifier, plot_importance as xg_importance\n",
14 | "from lightgbm import LGBMClassifier, plot_importance as lgb_importance\n",
15 | "from sklearn.datasets import load_breast_cancer\n",
16 | "from sklearn.model_selection import train_test_split, GridSearchCV\n",
17 | "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix\n",
18 | "\n",
19 | "import warnings\n",
20 | "warnings.filterwarnings('ignore')\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "def metrics(y_test, pred):\n",
30 | " accuracy = accuracy_score(y_test, pred)\n",
31 | " precision = precision_score(y_test, pred)\n",
32 | " recall = recall_score(y_test, pred)\n",
33 | " f1 = f1_score(y_test, pred)\n",
34 | " roc_score = roc_auc_score(y_test, pred, average='macro')\n",
35 | " print('정확도 : {0:.2f}, 정밀도 : {1:.2f}, 재현율 : {2:.2f}'.format(accuracy, precision, recall))\n",
36 | " print('f1-score : {0:.2f}, auc : {1:.2f}'.format(f1, roc_score))"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "data = pd.read_csv('../datas/credit card fraud/creditcard.csv')"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 5,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "(284807, 31)"
57 | ]
58 | },
59 | "execution_count": 5,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "data.shape"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 6,
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "name": "stdout",
75 | "output_type": "stream",
76 | "text": [
77 | "\n",
78 | "RangeIndex: 284807 entries, 0 to 284806\n",
79 | "Data columns (total 31 columns):\n",
80 | "Time 284807 non-null float64\n",
81 | "V1 284807 non-null float64\n",
82 | "V2 284807 non-null float64\n",
83 | "V3 284807 non-null float64\n",
84 | "V4 284807 non-null float64\n",
85 | "V5 284807 non-null float64\n",
86 | "V6 284807 non-null float64\n",
87 | "V7 284807 non-null float64\n",
88 | "V8 284807 non-null float64\n",
89 | "V9 284807 non-null float64\n",
90 | "V10 284807 non-null float64\n",
91 | "V11 284807 non-null float64\n",
92 | "V12 284807 non-null float64\n",
93 | "V13 284807 non-null float64\n",
94 | "V14 284807 non-null float64\n",
95 | "V15 284807 non-null float64\n",
96 | "V16 284807 non-null float64\n",
97 | "V17 284807 non-null float64\n",
98 | "V18 284807 non-null float64\n",
99 | "V19 284807 non-null float64\n",
100 | "V20 284807 non-null float64\n",
101 | "V21 284807 non-null float64\n",
102 | "V22 284807 non-null float64\n",
103 | "V23 284807 non-null float64\n",
104 | "V24 284807 non-null float64\n",
105 | "V25 284807 non-null float64\n",
106 | "V26 284807 non-null float64\n",
107 | "V27 284807 non-null float64\n",
108 | "V28 284807 non-null float64\n",
109 | "Amount 284807 non-null float64\n",
110 | "Class 284807 non-null int64\n",
111 | "dtypes: float64(30), int64(1)\n",
112 | "memory usage: 67.4 MB\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "data.info()"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 7,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/plain": [
128 | "Time 0\n",
129 | "V1 0\n",
130 | "V2 0\n",
131 | "V3 0\n",
132 | "V4 0\n",
133 | "V5 0\n",
134 | "V6 0\n",
135 | "V7 0\n",
136 | "V8 0\n",
137 | "V9 0\n",
138 | "V10 0\n",
139 | "V11 0\n",
140 | "V12 0\n",
141 | "V13 0\n",
142 | "V14 0\n",
143 | "V15 0\n",
144 | "V16 0\n",
145 | "V17 0\n",
146 | "V18 0\n",
147 | "V19 0\n",
148 | "V20 0\n",
149 | "V21 0\n",
150 | "V22 0\n",
151 | "V23 0\n",
152 | "V24 0\n",
153 | "V25 0\n",
154 | "V26 0\n",
155 | "V27 0\n",
156 | "V28 0\n",
157 | "Amount 0\n",
158 | "Class 0\n",
159 | "dtype: int64"
160 | ]
161 | },
162 | "execution_count": 7,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": [
168 | "data.isna().sum()"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 8,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/plain": [
179 | "0 284315\n",
180 | "1 492\n",
181 | "Name: Class, dtype: int64"
182 | ]
183 | },
184 | "execution_count": 8,
185 | "metadata": {},
186 | "output_type": "execute_result"
187 | }
188 | ],
189 | "source": [
190 | "data.iloc[:, -1].value_counts()"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 9,
196 | "metadata": {},
197 | "outputs": [
198 | {
199 | "data": {
200 | "text/plain": [
201 | ""
202 | ]
203 | },
204 | "execution_count": 9,
205 | "metadata": {},
206 | "output_type": "execute_result"
207 | },
208 | {
209 | "data": {
210 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD1CAYAAAClSgmzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAPZElEQVR4nO3cUaxdVZ3H8e9vWjFmHKXKhTBtmRLtZKwmU7WBJr44kpTCPJRJICkP0hCSGgOJJj5YfalRSfRBSUi0SQ0NxTgiQQ3NTLXTVCbGjGIvSoDaYXqDCNc2UGxFJkYd8D8PZzUeLmfde3sL5xb6/SQ7Z5//XmvtdZLb++tee5+bqkKSpFH+arEnIEk6exkSkqQuQ0KS1GVISJK6DAlJUpchIUnqWrrYE3ilXXDBBbVq1arFnoYkvaY8+OCDz1bVxMz66y4kVq1axeTk5GJPQ5JeU5L8alTd5SZJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSul53X6Z7rVi17d8XewqvK0984Z8XewrS65JXEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqmjMkkqxMcn+Sw0kOJflYq38mya+TPNS2q4f6fCrJVJLHklw5VN/YalNJtg3VL03yQJIjSb6V5LxWf2N7P9WOr3olP7wkaXbzuZJ4AfhEVb0LWA/cnGRNO3ZbVa1t216Admwz8G5gI/DVJEuSLAG+AlwFrAGuHxrni22s1cBJ4KZWvwk4WVXvBG5r7SRJYzJnSFTVsar6Wdt/HjgMLJ+lyybg7qr6Y1X9EpgCLmvbVFU9XlV/Au4GNiUJ8CHg3tZ/N3DN0Fi72/69wBWtvSRpDE7rnkRb7nkv8EAr3ZLk4SS7kixrteXAU0PdplutV3878NuqemFG/SVjtePPtfYz57U1yWSSyePHj5/OR5IkzWLeIZHkzcC3gY9X1e+AHcA7gLXAMeBLp5qO6F4LqM821ksLVTural1VrZuYmJj1c0iS5m9eIZHkDQwC4htV9R2Aqnq6ql6sqj8DX2OwnASDK4GVQ91XAEdnqT8LnJ9k6Yz6S8Zqx98KnDidDyhJWrj5PN0U4A7gcFV9eah+8VCzfwEebft7gM3tyaRLgdXAT4GDwOr2JNN5DG5u76mqAu4Hrm39twD3DY21pe1fC/ygtZckjcHSuZvwAeDDwCNJHmq1TzN4Omktg+WfJ4CPAFTVoST3AL9g8GTUzVX1IkCSW4B9wBJgV1UdauN9Erg7yeeBnzMIJdrr15NMMbiC2HwGn1WSdJrmDImq+hGj7w3snaXPrcCtI+p7R/Wrqsf5y3LVcP0PwHVzzVGS9OrwG9eSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkrrmDIkkK5Pcn+RwkkNJPtbqb0uyP8mR9rqs1ZPk9iRTSR5O8r6hsba09keSbBmqvz/JI63P7Uky2zkkSeMxnyuJF4BPVNW7gPXAzUnWANuAA1W1GjjQ3gNcBaxu21ZgBwx+4QPbgcuBy4DtQ7/0d7S2p/ptbPXeOSRJYzBnSFTVsar6Wdt/HjgMLAc2Abtbs93ANW1/E3BXDfwEOD/JxcCVwP6qOlFVJ4H9wMZ27C1V9eOqKuCuGWONOockaQxO655EklXAe4EHgIuq6hgMggS4sDVbDjw11G261WarT4+oM8s5JEljMO+QSPJm4NvAx6vqd7M1HVGrBdTnLcnWJJNJJo8fP346XSVJs5hX
SCR5A4OA+EZVfaeVn25LRbTXZ1p9Glg51H0FcHSO+ooR9dnO8RJVtbOq1lXVuomJifl8JEnSPMzn6aYAdwCHq+rLQ4f2AKeeUNoC3DdUv6E95bQeeK4tFe0DNiRZ1m5YbwD2tWPPJ1nfznXDjLFGnUOSNAZL59HmA8CHgUeSPNRqnwa+ANyT5CbgSeC6dmwvcDUwBfweuBGgqk4k+RxwsLX7bFWdaPsfBe4E3gR8r23Mcg5J0hjMGRJV9SNG3zcAuGJE+wJu7oy1C9g1oj4JvGdE/TejziFJGg+/cS1J6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqWvOkEiyK8kzSR4dqn0mya+TPNS2q4eOfSrJVJLHklw5VN/YalNJtg3VL03yQJIjSb6V5LxWf2N7P9WOr3qlPrQkaX7mcyVxJ7BxRP22qlrbtr0ASdYAm4F3tz5fTbIkyRLgK8BVwBrg+tYW4IttrNXASeCmVr8JOFlV7wRua+0kSWM0Z0hU1Q+BE/McbxNwd1X9sap+CUwBl7Vtqqoer6o/AXcDm5IE+BBwb+u/G7hmaKzdbf9e4IrWXpI0JmdyT+KWJA+35ahlrbYceGqozXSr9epvB35bVS/MqL9krHb8udZekjQmCw2JHcA7gLXAMeBLrT7qf/q1gPpsY71Mkq1JJpNMHj9+fLZ5S5JOw4JCoqqerqoXq+rPwNcYLCfB4Epg5VDTFcDRWerPAucnWTqj/pKx2vG30ln2qqqdVbWuqtZNTEws5CNJkkZYUEgkuXjo7b8Ap5582gNsbk8mXQqsBn4KHARWtyeZzmNwc3tPVRVwP3Bt678FuG9orC1t/1rgB629JGlMls7VIMk3gQ8CFySZBrYDH0yylsHyzxPARwCq6lCSe4BfAC8AN1fVi22cW4B9wBJgV1Udaqf4JHB3ks8DPwfuaPU7gK8nmWJwBbH5jD+tJOm0zBkSVXX9iPIdI2qn2t8K3DqivhfYO6L+OH9Zrhqu/wG4bq75SZJePX7jWpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeqaMySS7EryTJJHh2pvS7I/yZH2uqzVk+T2JFNJHk7yvqE+W1r7I0m2DNXfn+SR1uf2JJntHJKk8ZnPlcSdwMYZtW3AgapaDRxo7wGuAla3bSuwAwa/8IHtwOXAZcD2oV/6O1rbU/02znEOSdKYzBkSVfVD4MSM8iZgd9vfDVwzVL+rBn4CnJ/kYuBKYH9Vnaiqk8B+YGM79paq+nFVFXDXjLFGnUOSNCYLvSdxUVUdA2ivF7b6cuCpoXbTrTZbfXpEfbZzSJLG5JW+cZ0RtVpA/fROmmxNMplk8vjx46fbXZLUsdCQeLotFdFen2n1aWDlULsVwNE56itG1Gc7x8tU1c6qWldV6yYmJhb4kSRJMy00JPYAp55Q2gLcN1S/oT3ltB54ri0V7QM2JFnWblhvAPa1Y88nWd+earphxlijziFJGpOlczVI8k3gg8AFSaYZPKX0BeCeJDcBTwLXteZ7gauBKeD3wI0AVXUiyeeAg63dZ6vq1M3wjzJ4gupNwPfaxiznkCSNyZwhUVXXdw5dMaJtATd3xtkF7BpRnwTeM6L+m1HnkCSNj9+4liR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRl
SEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUdUYhkeSJJI8keSjJZKu9Lcn+JEfa67JWT5Lbk0wleTjJ+4bG2dLaH0myZaj+/jb+VOubM5mvJOn0vBJXEv9UVWural17vw04UFWrgQPtPcBVwOq2bQV2wCBUgO3A5cBlwPZTwdLabB3qt/EVmK8kaZ5ejeWmTcDutr8buGaoflcN/AQ4P8nFwJXA/qo6UVUngf3AxnbsLVX146oq4K6hsSRJY3CmIVHAfyR5MMnWVruoqo4BtNcLW3058NRQ3+lWm60+PaIuSRqTpWfY/wNVdTTJhcD+JP89S9tR9xNqAfWXDzwIqK0Al1xyyewzliTN2xldSVTV0fb6DPBdBvcUnm5LRbTXZ1rzaWDlUPcVwNE56itG1EfNY2dVrauqdRMTE2fykSRJQxYcEkn+OsnfnNoHNgCPAnuAU08obQHua/t7gBvaU07rgefactQ+YEOSZe2G9QZgXzv2fJL17ammG4bGkiSNwZksN10EfLc9lboU+Neq+n6Sg8A9SW4CngSua+33AlcDU8DvgRsBqupEks8BB1u7z1bVibb/UeBO4E3A99omSRqTBYdEVT0O/OOI+m+AK0bUC7i5M9YuYNeI+iTwnoXOUZJ0ZvzGtSSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktR11odEko1JHksylWTbYs9Hks4lZ3VIJFkCfAW4ClgDXJ9kzeLOSpLOHWd1SACXAVNV9XhV/Qm4G9i0yHOSpHPG0sWewByWA08NvZ8GLp/ZKMlWYGt7+79JHhvD3M4VFwDPLvYk5pIvLvYMtAheEz+bryF/N6p4todERtTqZYWqncDOV386554kk1W1brHnIc3kz+Z4nO3LTdPAyqH3K4CjizQXSTrnnO0hcRBYneTSJOcBm4E9izwnSTpnnNXLTVX1QpJbgH3AEmBXVR1a5Gmda1zG09nKn80xSNXLlvglSQLO/uUmSdIiMiQkSV2GhCSp66y+ca3xSvIPDL7RvpzB91GOAnuq6vCiTkzSovFKQgAk+SSDP3sS4KcMHj8O8E3/sKLOZkluXOw5vJ75dJMASPI/wLur6v9m1M8DDlXV6sWZmTS7JE9W1SWLPY/XK5ebdMqfgb8FfjWjfnE7Ji2aJA/3DgEXjXMu5xpDQqd8HDiQ5Ah/+aOKlwDvBG5ZtFlJAxcBVwInZ9QD/Nf4p3PuMCQEQFV9P8nfM/jz7MsZ/OObBg5W1YuLOjkJ/g14c1U9NPNAkv8c/3TOHd6TkCR1+XSTJKnLkJAkdRkSkqQuQ0KS1GVISJK6/h96EFvhHkMj3AAAAABJRU5ErkJggg==\n",
211 | "text/plain": [
212 | ""
213 | ]
214 | },
215 | "metadata": {
216 | "needs_background": "light"
217 | },
218 | "output_type": "display_data"
219 | }
220 | ],
221 | "source": [
222 | "data.iloc[:, -1].value_counts().plot(kind='bar')"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 10,
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "data": {
232 | "text/plain": [
233 | "0 99.827251\n",
234 | "1 0.172749\n",
235 | "Name: Class, dtype: float64"
236 | ]
237 | },
238 | "execution_count": 10,
239 | "metadata": {},
240 | "output_type": "execute_result"
241 | }
242 | ],
243 | "source": [
244 | "data.iloc[:, -1].value_counts() / data.iloc[:, -1].count() * 100"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 11,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "X = data.iloc[:, :-1]\n",
254 | "y = data.iloc[:, -1]"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 13,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 14,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "name": "stdout",
273 | "output_type": "stream",
274 | "text": [
275 | "0 99.826315\n",
276 | "1 0.173685\n",
277 | "Name: Class, dtype: float64\n",
278 | "0 99.830061\n",
279 | "1 0.169939\n",
280 | "Name: Class, dtype: float64\n"
281 | ]
282 | }
283 | ],
284 | "source": [
285 | "print(y_train.value_counts() / y_train.count() * 100)\n",
286 | "print(y_test.value_counts() / y_test.count() * 100)"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 16,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "def modeling(model, x_train, x_test, y_train, y_test):\n",
296 | " model.fit(x_train, y_train)\n",
297 | " pred = model.predict(x_test)\n",
298 | " metrics(y_test, pred)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 20,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "name": "stdout",
308 | "output_type": "stream",
309 | "text": [
310 | "정확도 : 1.00, 정밀도 : 0.77, 재현율 : 0.53\n",
311 | "f1-score : 0.63, auc : 0.76\n"
312 | ]
313 | }
314 | ],
315 | "source": [
316 | "from sklearn.linear_model import LogisticRegression\n",
317 | "lr = LogisticRegression()\n",
318 | "modeling(lr, X_train, X_test, y_train, y_test)"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 21,
324 | "metadata": {},
325 | "outputs": [
326 | {
327 | "name": "stdout",
328 | "output_type": "stream",
329 | "text": [
330 | "정확도 : 1.00, 정밀도 : 0.95, 재현율 : 0.83\n",
331 | "f1-score : 0.88, auc : 0.91\n"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "lgb = LGBMClassifier(n_estimators = 1000, num_leaves = 64, n_jobs = -1, boost_from_average = False)\n",
337 | "modeling(lgb, x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test)"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": []
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": []
353 | }
354 | ],
355 | "metadata": {
356 | "kernelspec": {
357 | "display_name": "Python 3",
358 | "language": "python",
359 | "name": "python3"
360 | },
361 | "language_info": {
362 | "codemirror_mode": {
363 | "name": "ipython",
364 | "version": 3
365 | },
366 | "file_extension": ".py",
367 | "mimetype": "text/x-python",
368 | "name": "python",
369 | "nbconvert_exporter": "python",
370 | "pygments_lexer": "ipython3",
371 | "version": "3.6.9"
372 | }
373 | },
374 | "nbformat": 4,
375 | "nbformat_minor": 2
376 | }
377 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # machine_learning_basic
2 | Repo for everyone who wants to learn the basics of machine learning.
3 |
--------------------------------------------------------------------------------