├── README ├── titanic_video1.ipynb ├── titanic_video2.ipynb ├── titanic_video3_4_validacao.ipynb ├── titanic_video5.ipynb └── titanic_video6_final.ipynb /README: -------------------------------------------------------------------------------- 1 | Material para o tutorial da playlist de vídeos sobre Machine Learning usando os dados do Titanic 2 | 3 | https://www.youtube.com/playlist?list=PLwnip85KhroW8Q1JSNbgl06iNPeC0SDkx -------------------------------------------------------------------------------- /titanic_video1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Para quem tiver curiosidade de saber como gerar uma sub igual à gender_submission, esse é o código." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 12, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "data = pd.read_csv(\"test.csv\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 13, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/html": [ 36 | "
\n", 37 | "\n", 50 | "\n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", 140 | "
" 141 | ], 142 | "text/plain": [ 143 | " PassengerId Pclass Name Sex \\\n", 144 | "0 892 3 Kelly, Mr. James male \n", 145 | "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", 146 | "2 894 2 Myles, Mr. Thomas Francis male \n", 147 | "3 895 3 Wirz, Mr. Albert male \n", 148 | "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", 149 | "\n", 150 | " Age SibSp Parch Ticket Fare Cabin Embarked \n", 151 | "0 34.5 0 0 330911 7.8292 NaN Q \n", 152 | "1 47.0 1 0 363272 7.0000 NaN S \n", 153 | "2 62.0 0 0 240276 9.6875 NaN Q \n", 154 | "3 27.0 0 0 315154 8.6625 NaN S \n", 155 | "4 22.0 1 1 3101298 12.2875 NaN S " 156 | ] 157 | }, 158 | "execution_count": 13, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "data.head()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 14, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "0 0\n", 176 | "1 1\n", 177 | "2 0\n", 178 | "3 0\n", 179 | "4 1\n", 180 | "Name: Sex, dtype: int64" 181 | ] 182 | }, 183 | "execution_count": 14, 184 | "metadata": {}, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "e_feminino = (data['Sex'] == 'female').astype(int)\n", 190 | "e_feminino.head()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 15, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "PassengerId\n", 202 | "892 0\n", 203 | "893 1\n", 204 | "894 0\n", 205 | "895 0\n", 206 | "896 1\n", 207 | "Name: Sex, dtype: int64" 208 | ] 209 | }, 210 | "execution_count": 15, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "e_feminino.index = data['PassengerId']\n", 217 | "e_feminino.head()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 16, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | 
"PassengerId\n", 229 | "892 0\n", 230 | "893 1\n", 231 | "894 0\n", 232 | "895 0\n", 233 | "896 1\n", 234 | "Name: Survived, dtype: int64" 235 | ] 236 | }, 237 | "execution_count": 16, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "e_feminino.name = 'Survived'\n", 244 | "e_feminino.head()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 17, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "e_feminino.to_csv('gender_submission.csv', header=True)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 18, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "PassengerId,Survived\n", 266 | "892,0\n", 267 | "893,1\n", 268 | "894,0\n", 269 | "895,0\n", 270 | "896,1\n", 271 | "897,0\n", 272 | "898,1\n", 273 | "899,0\n", 274 | "900,1\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "!head -n10 gender_submission.csv" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [] 288 | } 289 | ], 290 | "metadata": { 291 | "kernelspec": { 292 | "display_name": "Python 3", 293 | "language": "python", 294 | "name": "python3" 295 | }, 296 | "language_info": { 297 | "codemirror_mode": { 298 | "name": "ipython", 299 | "version": 3 300 | }, 301 | "file_extension": ".py", 302 | "mimetype": "text/x-python", 303 | "name": "python", 304 | "nbconvert_exporter": "python", 305 | "pygments_lexer": "ipython3", 306 | "version": "3.7.3" 307 | } 308 | }, 309 | "nbformat": 4, 310 | "nbformat_minor": 4 311 | } 312 | -------------------------------------------------------------------------------- /titanic_video2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 72, 6 | "metadata": {}, 7 | "outputs": [], 8 | 
"source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 73, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "train = pd.read_csv(\"train.csv\")\n", 20 | "test = pd.read_csv(\"test.csv\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 74, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", 141 | "
" 142 | ], 143 | "text/plain": [ 144 | " PassengerId Survived Pclass \\\n", 145 | "0 1 0 3 \n", 146 | "1 2 1 1 \n", 147 | "2 3 1 3 \n", 148 | "3 4 1 1 \n", 149 | "4 5 0 3 \n", 150 | "\n", 151 | " Name Sex Age SibSp \\\n", 152 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 153 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 154 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 155 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 156 | "4 Allen, Mr. William Henry male 35.0 0 \n", 157 | "\n", 158 | " Parch Ticket Fare Cabin Embarked \n", 159 | "0 0 A/5 21171 7.2500 NaN S \n", 160 | "1 0 PC 17599 71.2833 C85 C \n", 161 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 162 | "3 0 113803 53.1000 C123 S \n", 163 | "4 0 373450 8.0500 NaN S " 164 | ] 165 | }, 166 | "execution_count": 74, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "train.head()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 75, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "from sklearn.ensemble import RandomForestClassifier" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 76, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "modelo = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 77, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "male 577\n", 202 | "female 314\n", 203 | "Name: Sex, dtype: int64" 204 | ] 205 | }, 206 | "execution_count": 77, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "train['Sex'].value_counts()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 78, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "def transformar_sexo(valor):\n", 222 | " if valor == 
'female':\n", 223 | " return 1\n", 224 | " else:\n", 225 | " return 0\n", 226 | " \n", 227 | "train['Sex_binario'] = train['Sex'].map(transformar_sexo)\n", 228 | " " 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 79, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/html": [ 239 | "
\n", 240 | "\n", 253 | "\n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSex_binario
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS0
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C1
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS1
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S1
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS0
\n", 355 | "
" 356 | ], 357 | "text/plain": [ 358 | " PassengerId Survived Pclass \\\n", 359 | "0 1 0 3 \n", 360 | "1 2 1 1 \n", 361 | "2 3 1 3 \n", 362 | "3 4 1 1 \n", 363 | "4 5 0 3 \n", 364 | "\n", 365 | " Name Sex Age SibSp \\\n", 366 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 367 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 368 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 369 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 370 | "4 Allen, Mr. William Henry male 35.0 0 \n", 371 | "\n", 372 | " Parch Ticket Fare Cabin Embarked Sex_binario \n", 373 | "0 0 A/5 21171 7.2500 NaN S 0 \n", 374 | "1 0 PC 17599 71.2833 C85 C 1 \n", 375 | "2 0 STON/O2. 3101282 7.9250 NaN S 1 \n", 376 | "3 0 113803 53.1000 C123 S 1 \n", 377 | "4 0 373450 8.0500 NaN S 0 " 378 | ] 379 | }, 380 | "execution_count": 79, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "train.head()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 80, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "variaveis = ['Sex_binario', 'Age']" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 81, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "X = train[variaveis]\n", 405 | "y = train['Survived']" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 82, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/html": [ 416 | "
\n", 417 | "\n", 430 | "\n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | "
Sex_binarioAge
0022.0
1138.0
2126.0
3135.0
4035.0
\n", 466 | "
" 467 | ], 468 | "text/plain": [ 469 | " Sex_binario Age\n", 470 | "0 0 22.0\n", 471 | "1 1 38.0\n", 472 | "2 1 26.0\n", 473 | "3 1 35.0\n", 474 | "4 0 35.0" 475 | ] 476 | }, 477 | "execution_count": 82, 478 | "metadata": {}, 479 | "output_type": "execute_result" 480 | } 481 | ], 482 | "source": [ 483 | "X.head()" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 83, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/plain": [ 494 | "0 0\n", 495 | "1 1\n", 496 | "2 1\n", 497 | "3 1\n", 498 | "4 0\n", 499 | "Name: Survived, dtype: int64" 500 | ] 501 | }, 502 | "execution_count": 83, 503 | "metadata": {}, 504 | "output_type": "execute_result" 505 | } 506 | ], 507 | "source": [ 508 | "y.head()" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 84, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "ename": "ValueError", 518 | "evalue": "Input contains NaN, infinity or a value too large for dtype('float32').", 519 | "output_type": "error", 520 | "traceback": [ 521 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 522 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 523 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodelo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 524 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0;31m# Validate or convert input data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 250\u001b[0;31m \u001b[0mX\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDTYPE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 251\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 525 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 571\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 572\u001b[0m _assert_all_finite(array,\n\u001b[0;32m--> 573\u001b[0;31m allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m 574\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0mshape_repr\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0m_shape_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 526 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan)\u001b[0m\n\u001b[1;32m 54\u001b[0m not allow_nan and not np.isfinite(X).all()):\n\u001b[1;32m 55\u001b[0m \u001b[0mtype_err\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'infinity'\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mallow_nan\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m'NaN, infinity'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_err\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 527 | "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float32')." 
528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "modelo.fit(X, y)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 85, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "X = X.fillna(-1)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 86, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "data": { 551 | "text/plain": [ 552 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 553 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 554 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 555 | " min_samples_leaf=1, min_samples_split=2,\n", 556 | " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n", 557 | " oob_score=False, random_state=0, verbose=0, warm_start=False)" 558 | ] 559 | }, 560 | "execution_count": 86, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "modelo.fit(X, y)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 87, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "ename": "KeyError", 576 | "evalue": "\"['Sex_binario'] not in index\"", 577 | "output_type": "error", 578 | "traceback": [ 579 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 580 | "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", 581 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mX_prev\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mvariaveis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 582 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2932\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2933\u001b[0m indexer = self.loc._convert_to_indexer(key, axis=1,\n\u001b[0;32m-> 2934\u001b[0;31m raise_missing=True)\n\u001b[0m\u001b[1;32m 2935\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2936\u001b[0m \u001b[0;31m# take() does not accept boolean indexers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 583 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_convert_to_indexer\u001b[0;34m(self, obj, axis, is_setter, raise_missing)\u001b[0m\n\u001b[1;32m 1352\u001b[0m kwargs = {'raise_missing': True if is_setter else\n\u001b[1;32m 1353\u001b[0m raise_missing}\n\u001b[0;32m-> 1354\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1355\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1356\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 584 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[0;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1159\u001b[0m self._validate_read_indexer(keyarr, indexer,\n\u001b[1;32m 1160\u001b[0m 
\u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1161\u001b[0;31m raise_missing=raise_missing)\n\u001b[0m\u001b[1;32m 1162\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 585 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[1;32m 1250\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'loc'\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1251\u001b[0m \u001b[0mnot_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1252\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{} not in index\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnot_found\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1253\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0;31m# we skip the warning on 
Categorical/Interval\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 586 | "\u001b[0;31mKeyError\u001b[0m: \"['Sex_binario'] not in index\"" 587 | ] 588 | } 589 | ], 590 | "source": [ 591 | "X_prev = test[variaveis]" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 88, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "test['Sex_binario'] = test['Sex'].map(transformar_sexo)" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 89, 606 | "metadata": {}, 607 | "outputs": [ 608 | { 609 | "data": { 610 | "text/html": [ 611 | "
\n", 612 | "\n", 625 | "\n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | "
Sex_binarioAge
0034.5
1147.0
2062.0
3027.0
4122.0
\n", 661 | "
" 662 | ], 663 | "text/plain": [ 664 | " Sex_binario Age\n", 665 | "0 0 34.5\n", 666 | "1 1 47.0\n", 667 | "2 0 62.0\n", 668 | "3 0 27.0\n", 669 | "4 1 22.0" 670 | ] 671 | }, 672 | "execution_count": 89, 673 | "metadata": {}, 674 | "output_type": "execute_result" 675 | } 676 | ], 677 | "source": [ 678 | "X_prev = test[variaveis]\n", 679 | "X_prev = X_prev.fillna(-1)\n", 680 | "X_prev.head()" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 90, 686 | "metadata": {}, 687 | "outputs": [ 688 | { 689 | "data": { 690 | "text/plain": [ 691 | "array([0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,\n", 692 | " 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,\n", 693 | " 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,\n", 694 | " 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,\n", 695 | " 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,\n", 696 | " 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,\n", 697 | " 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,\n", 698 | " 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,\n", 699 | " 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,\n", 700 | " 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,\n", 701 | " 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,\n", 702 | " 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,\n", 703 | " 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,\n", 704 | " 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,\n", 705 | " 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n", 706 | " 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,\n", 707 | " 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,\n", 708 | " 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,\n", 709 | " 0, 1, 0, 0, 1, 0, 
1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0])" 710 | ] 711 | }, 712 | "execution_count": 90, 713 | "metadata": {}, 714 | "output_type": "execute_result" 715 | } 716 | ], 717 | "source": [ 718 | "p = modelo.predict(X_prev)\n", 719 | "p" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 91, 725 | "metadata": {}, 726 | "outputs": [ 727 | { 728 | "data": { 729 | "text/html": [ 730 | "
\n", 731 | "\n", 744 | "\n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSex_binario
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ0
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS1
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ0
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS0
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS1
\n", 840 | "
" 841 | ], 842 | "text/plain": [ 843 | " PassengerId Pclass Name Sex \\\n", 844 | "0 892 3 Kelly, Mr. James male \n", 845 | "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", 846 | "2 894 2 Myles, Mr. Thomas Francis male \n", 847 | "3 895 3 Wirz, Mr. Albert male \n", 848 | "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", 849 | "\n", 850 | " Age SibSp Parch Ticket Fare Cabin Embarked Sex_binario \n", 851 | "0 34.5 0 0 330911 7.8292 NaN Q 0 \n", 852 | "1 47.0 1 0 363272 7.0000 NaN S 1 \n", 853 | "2 62.0 0 0 240276 9.6875 NaN Q 0 \n", 854 | "3 27.0 0 0 315154 8.6625 NaN S 0 \n", 855 | "4 22.0 1 1 3101298 12.2875 NaN S 1 " 856 | ] 857 | }, 858 | "execution_count": 91, 859 | "metadata": {}, 860 | "output_type": "execute_result" 861 | } 862 | ], 863 | "source": [ 864 | "test.head()" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": 92, 870 | "metadata": {}, 871 | "outputs": [ 872 | { 873 | "data": { 874 | "text/plain": [ 875 | "(418,)" 876 | ] 877 | }, 878 | "execution_count": 92, 879 | "metadata": {}, 880 | "output_type": "execute_result" 881 | } 882 | ], 883 | "source": [ 884 | "sub = pd.Series(p, index=test['PassengerId'], name='Survived')\n", 885 | "sub.shape" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 93, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "sub.to_csv(\"primeiro_modelo.csv\", header=True)" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": 94, 900 | "metadata": {}, 901 | "outputs": [ 902 | { 903 | "name": "stdout", 904 | "output_type": "stream", 905 | "text": [ 906 | "PassengerId,Survived\n", 907 | "892,0\n", 908 | "893,1\n", 909 | "894,0\n", 910 | "895,1\n", 911 | "896,1\n", 912 | "897,0\n", 913 | "898,1\n", 914 | "899,0\n", 915 | "900,1\n" 916 | ] 917 | } 918 | ], 919 | "source": [ 920 | "!head -n10 primeiro_modelo.csv" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": null, 926 | "metadata": {}, 927 
| "outputs": [], 928 | "source": [] 929 | } 930 | ], 931 | "metadata": { 932 | "kernelspec": { 933 | "display_name": "Python 3", 934 | "language": "python", 935 | "name": "python3" 936 | }, 937 | "language_info": { 938 | "codemirror_mode": { 939 | "name": "ipython", 940 | "version": 3 941 | }, 942 | "file_extension": ".py", 943 | "mimetype": "text/x-python", 944 | "name": "python", 945 | "nbconvert_exporter": "python", 946 | "pygments_lexer": "ipython3", 947 | "version": "3.7.3" 948 | } 949 | }, 950 | "nbformat": 4, 951 | "nbformat_minor": 4 952 | } 953 | -------------------------------------------------------------------------------- /titanic_video3_4_validacao.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 27, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "def transformar_sexo(valor):\n", 20 | " if valor == 'female':\n", 21 | " return 1\n", 22 | " else:\n", 23 | " return 0" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 34, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "train = pd.read_csv(\"train.csv\")\n", 33 | "test = pd.read_csv(\"test.csv\")\n", 34 | "\n", 35 | "train['Sex_binario'] = train['Sex'].map(transformar_sexo)\n", 36 | "test['Sex_binario'] = test['Sex'].map(transformar_sexo)\n", 37 | "\n", 38 | "variaveis = ['Sex_binario', 'Age']\n", 39 | "\n", 40 | "X = train[variaveis].fillna(-1)\n", 41 | "y = train['Survived']" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 35, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
\n", 53 | "\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSex_binario
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS0
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C1
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS1
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S1
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS0
\n", 168 | "
" 169 | ], 170 | "text/plain": [ 171 | " PassengerId Survived Pclass \\\n", 172 | "0 1 0 3 \n", 173 | "1 2 1 1 \n", 174 | "2 3 1 3 \n", 175 | "3 4 1 1 \n", 176 | "4 5 0 3 \n", 177 | "\n", 178 | " Name Sex Age SibSp \\\n", 179 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 180 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 181 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 182 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 183 | "4 Allen, Mr. William Henry male 35.0 0 \n", 184 | "\n", 185 | " Parch Ticket Fare Cabin Embarked Sex_binario \n", 186 | "0 0 A/5 21171 7.2500 NaN S 0 \n", 187 | "1 0 PC 17599 71.2833 C85 C 1 \n", 188 | "2 0 STON/O2. 3101282 7.9250 NaN S 1 \n", 189 | "3 0 113803 53.1000 C123 S 1 \n", 190 | "4 0 373450 8.0500 NaN S 0 " 191 | ] 192 | }, 193 | "execution_count": 35, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "train.head()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 30, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "from sklearn.ensemble import RandomForestClassifier\n", 209 | "from sklearn.model_selection import train_test_split" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 31, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" 221 | ] 222 | }, 223 | "execution_count": 31, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "X_falso = np.arange(10)\n", 230 | "X_falso" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 32, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "[array([6, 7, 3, 0, 5]), array([2, 8, 4, 9, 1])]" 242 | ] 243 | }, 244 | "execution_count": 32, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 
250 | "np.random.seed(0)\n", 251 | "train_test_split(X_falso, test_size=0.5)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 48, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "np.random.seed(1)\n", 261 | "X_treino, X_valid, y_treino, y_valid = train_test_split(X, y, test_size=0.5)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 49, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 286 | "\n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | "
Sex_binarioAge
394124.00
851074.00
373022.00
523144.00
7800.83
\n", 322 | "
" 323 | ], 324 | "text/plain": [ 325 | " Sex_binario Age\n", 326 | "394 1 24.00\n", 327 | "851 0 74.00\n", 328 | "373 0 22.00\n", 329 | "523 1 44.00\n", 330 | "78 0 0.83" 331 | ] 332 | }, 333 | "execution_count": 49, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "X_treino.head()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 50, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "((445, 2), (446, 2), (445,), (446,))" 351 | ] 352 | }, 353 | "execution_count": 50, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "X_treino.shape, X_valid.shape, y_treino.shape, y_valid.shape" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 51, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 371 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 372 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 373 | " min_samples_leaf=1, min_samples_split=2,\n", 374 | " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n", 375 | " oob_score=False, random_state=0, verbose=0, warm_start=False)" 376 | ] 377 | }, 378 | "execution_count": 51, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "modelo = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)\n", 385 | "modelo.fit(X_treino, y_treino)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 52, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "p = modelo.predict(X_valid)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 53, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "0.7466367713004485" 406 | ] 407 | 
}, 408 | "execution_count": 53, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "np.mean(y_valid == p)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 54, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/plain": [ 425 | "0.7623318385650224" 426 | ] 427 | }, 428 | "execution_count": 54, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "p = (X_valid['Sex_binario'] == 1).astype(np.int64)\n", 435 | "np.mean(y_valid == p)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "## Validação cruzada" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 55, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" 454 | ] 455 | }, 456 | "execution_count": 55, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "X_falso" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 56, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "from sklearn.model_selection import KFold" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 57, 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "\u001b[0;31mInit signature:\u001b[0m \u001b[0mKFold\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_splits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'warn'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 483 | "\u001b[0;31mDocstring:\u001b[0m \n", 484 | "K-Folds cross-validator\n", 485 | "\n", 486 | "Provides train/test 
indices to split data in train/test sets. Split\n", 487 | "dataset into k consecutive folds (without shuffling by default).\n", 488 | "\n", 489 | "Each fold is then used once as a validation while the k - 1 remaining\n", 490 | "folds form the training set.\n", 491 | "\n", 492 | "Read more in the :ref:`User Guide `.\n", 493 | "\n", 494 | "Parameters\n", 495 | "----------\n", 496 | "n_splits : int, default=3\n", 497 | " Number of folds. Must be at least 2.\n", 498 | "\n", 499 | " .. versionchanged:: 0.20\n", 500 | " ``n_splits`` default value will change from 3 to 5 in v0.22.\n", 501 | "\n", 502 | "shuffle : boolean, optional\n", 503 | " Whether to shuffle the data before splitting into batches.\n", 504 | "\n", 505 | "random_state : int, RandomState instance or None, optional, default=None\n", 506 | " If int, random_state is the seed used by the random number generator;\n", 507 | " If RandomState instance, random_state is the random number generator;\n", 508 | " If None, the random number generator is the RandomState instance used\n", 509 | " by `np.random`. Used when ``shuffle`` == True.\n", 510 | "\n", 511 | "Examples\n", 512 | "--------\n", 513 | ">>> from sklearn.model_selection import KFold\n", 514 | ">>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n", 515 | ">>> y = np.array([1, 2, 3, 4])\n", 516 | ">>> kf = KFold(n_splits=2)\n", 517 | ">>> kf.get_n_splits(X)\n", 518 | "2\n", 519 | ">>> print(kf) # doctest: +NORMALIZE_WHITESPACE\n", 520 | "KFold(n_splits=2, random_state=None, shuffle=False)\n", 521 | ">>> for train_index, test_index in kf.split(X):\n", 522 | "... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n", 523 | "... X_train, X_test = X[train_index], X[test_index]\n", 524 | "... 
y_train, y_test = y[train_index], y[test_index]\n", 525 | "TRAIN: [2 3] TEST: [0 1]\n", 526 | "TRAIN: [0 1] TEST: [2 3]\n", 527 | "\n", 528 | "Notes\n", 529 | "-----\n", 530 | "The first ``n_samples % n_splits`` folds have size\n", 531 | "``n_samples // n_splits + 1``, other folds have size\n", 532 | "``n_samples // n_splits``, where ``n_samples`` is the number of samples.\n", 533 | "\n", 534 | "Randomized CV splitters may return different results for each call of\n", 535 | "split. You can make the results identical by setting ``random_state``\n", 536 | "to an integer.\n", 537 | "\n", 538 | "See also\n", 539 | "--------\n", 540 | "StratifiedKFold\n", 541 | " Takes group information into account to avoid building folds with\n", 542 | " imbalanced class distributions (for binary or multiclass\n", 543 | " classification tasks).\n", 544 | "\n", 545 | "GroupKFold: K-fold iterator variant with non-overlapping groups.\n", 546 | "\n", 547 | "RepeatedKFold: Repeats K-Fold n times.\n", 548 | "\u001b[0;31mFile:\u001b[0m ~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py\n", 549 | "\u001b[0;31mType:\u001b[0m ABCMeta\n", 550 | "\u001b[0;31mSubclasses:\u001b[0m \n" 551 | ] 552 | }, 553 | "metadata": {}, 554 | "output_type": "display_data" 555 | } 556 | ], 557 | "source": [ 558 | "?KFold" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 59, 564 | "metadata": {}, 565 | "outputs": [ 566 | { 567 | "name": "stdout", 568 | "output_type": "stream", 569 | "text": [ 570 | "Treino: [0 1 3 5 6 7]\n", 571 | "Valid: [2 4 8 9]\n", 572 | "\n", 573 | "Treino: [0 2 3 4 5 8 9]\n", 574 | "Valid: [1 6 7]\n", 575 | "\n", 576 | "Treino: [1 2 4 6 7 8 9]\n", 577 | "Valid: [0 3 5]\n", 578 | "\n" 579 | ] 580 | } 581 | ], 582 | "source": [ 583 | "kf = KFold(3, shuffle=True, random_state=0)\n", 584 | "for linhas_treino, linhas_valid in kf.split(X_falso):\n", 585 | " print(\"Treino:\", linhas_treino)\n", 586 | " print(\"Valid:\", linhas_valid)\n", 587 | " 
print()" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 72, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "name": "stdout", 597 | "output_type": "stream", 598 | "text": [ 599 | "Rep: 0\n", 600 | "Treino: 712\n", 601 | "Valid: 179\n", 602 | "Acc: 0.7988826815642458\n", 603 | "\n", 604 | "Treino: 713\n", 605 | "Valid: 178\n", 606 | "Acc: 0.7359550561797753\n", 607 | "\n", 608 | "Treino: 713\n", 609 | "Valid: 178\n", 610 | "Acc: 0.7808988764044944\n", 611 | "\n", 612 | "Treino: 713\n", 613 | "Valid: 178\n", 614 | "Acc: 0.797752808988764\n", 615 | "\n", 616 | "Treino: 713\n", 617 | "Valid: 178\n", 618 | "Acc: 0.7808988764044944\n", 619 | "\n", 620 | "Rep: 1\n", 621 | "Treino: 712\n", 622 | "Valid: 179\n", 623 | "Acc: 0.7374301675977654\n", 624 | "\n", 625 | "Treino: 713\n", 626 | "Valid: 178\n", 627 | "Acc: 0.7247191011235955\n", 628 | "\n", 629 | "Treino: 713\n", 630 | "Valid: 178\n", 631 | "Acc: 0.7808988764044944\n", 632 | "\n", 633 | "Treino: 713\n", 634 | "Valid: 178\n", 635 | "Acc: 0.7921348314606742\n", 636 | "\n", 637 | "Treino: 713\n", 638 | "Valid: 178\n", 639 | "Acc: 0.7921348314606742\n", 640 | "\n", 641 | "Rep: 2\n", 642 | "Treino: 712\n", 643 | "Valid: 179\n", 644 | "Acc: 0.7653631284916201\n", 645 | "\n", 646 | "Treino: 713\n", 647 | "Valid: 178\n", 648 | "Acc: 0.7865168539325843\n", 649 | "\n", 650 | "Treino: 713\n", 651 | "Valid: 178\n", 652 | "Acc: 0.7865168539325843\n", 653 | "\n", 654 | "Treino: 713\n", 655 | "Valid: 178\n", 656 | "Acc: 0.7808988764044944\n", 657 | "\n", 658 | "Treino: 713\n", 659 | "Valid: 178\n", 660 | "Acc: 0.7640449438202247\n", 661 | "\n", 662 | "Rep: 3\n", 663 | "Treino: 712\n", 664 | "Valid: 179\n", 665 | "Acc: 0.7653631284916201\n", 666 | "\n", 667 | "Treino: 713\n", 668 | "Valid: 178\n", 669 | "Acc: 0.7471910112359551\n", 670 | "\n", 671 | "Treino: 713\n", 672 | "Valid: 178\n", 673 | "Acc: 0.7808988764044944\n", 674 | "\n", 675 | "Treino: 713\n", 676 | "Valid: 178\n", 677 | "Acc: 
0.7415730337078652\n", 678 | "\n", 679 | "Treino: 713\n", 680 | "Valid: 178\n", 681 | "Acc: 0.8202247191011236\n", 682 | "\n", 683 | "Rep: 4\n", 684 | "Treino: 712\n", 685 | "Valid: 179\n", 686 | "Acc: 0.7988826815642458\n", 687 | "\n", 688 | "Treino: 713\n", 689 | "Valid: 178\n", 690 | "Acc: 0.797752808988764\n", 691 | "\n", 692 | "Treino: 713\n", 693 | "Valid: 178\n", 694 | "Acc: 0.7752808988764045\n", 695 | "\n", 696 | "Treino: 713\n", 697 | "Valid: 178\n", 698 | "Acc: 0.7415730337078652\n", 699 | "\n", 700 | "Treino: 713\n", 701 | "Valid: 178\n", 702 | "Acc: 0.7471910112359551\n", 703 | "\n", 704 | "Rep: 5\n", 705 | "Treino: 712\n", 706 | "Valid: 179\n", 707 | "Acc: 0.7653631284916201\n", 708 | "\n", 709 | "Treino: 713\n", 710 | "Valid: 178\n", 711 | "Acc: 0.7415730337078652\n", 712 | "\n", 713 | "Treino: 713\n", 714 | "Valid: 178\n", 715 | "Acc: 0.8370786516853933\n", 716 | "\n", 717 | "Treino: 713\n", 718 | "Valid: 178\n", 719 | "Acc: 0.7471910112359551\n", 720 | "\n", 721 | "Treino: 713\n", 722 | "Valid: 178\n", 723 | "Acc: 0.702247191011236\n", 724 | "\n", 725 | "Rep: 6\n", 726 | "Treino: 712\n", 727 | "Valid: 179\n", 728 | "Acc: 0.7821229050279329\n", 729 | "\n", 730 | "Treino: 713\n", 731 | "Valid: 178\n", 732 | "Acc: 0.8146067415730337\n", 733 | "\n", 734 | "Treino: 713\n", 735 | "Valid: 178\n", 736 | "Acc: 0.7752808988764045\n", 737 | "\n", 738 | "Treino: 713\n", 739 | "Valid: 178\n", 740 | "Acc: 0.7134831460674157\n", 741 | "\n", 742 | "Treino: 713\n", 743 | "Valid: 178\n", 744 | "Acc: 0.7247191011235955\n", 745 | "\n", 746 | "Rep: 7\n", 747 | "Treino: 712\n", 748 | "Valid: 179\n", 749 | "Acc: 0.7150837988826816\n", 750 | "\n", 751 | "Treino: 713\n", 752 | "Valid: 178\n", 753 | "Acc: 0.7247191011235955\n", 754 | "\n", 755 | "Treino: 713\n", 756 | "Valid: 178\n", 757 | "Acc: 0.8370786516853933\n", 758 | "\n", 759 | "Treino: 713\n", 760 | "Valid: 178\n", 761 | "Acc: 0.8033707865168539\n", 762 | "\n", 763 | "Treino: 713\n", 764 | "Valid: 178\n", 765 | 
"Acc: 0.7808988764044944\n", 766 | "\n", 767 | "Rep: 8\n", 768 | "Treino: 712\n", 769 | "Valid: 179\n", 770 | "Acc: 0.7597765363128491\n", 771 | "\n", 772 | "Treino: 713\n", 773 | "Valid: 178\n", 774 | "Acc: 0.7921348314606742\n", 775 | "\n", 776 | "Treino: 713\n", 777 | "Valid: 178\n", 778 | "Acc: 0.7921348314606742\n", 779 | "\n", 780 | "Treino: 713\n", 781 | "Valid: 178\n", 782 | "Acc: 0.7696629213483146\n", 783 | "\n", 784 | "Treino: 713\n", 785 | "Valid: 178\n", 786 | "Acc: 0.7640449438202247\n", 787 | "\n", 788 | "Rep: 9\n", 789 | "Treino: 712\n", 790 | "Valid: 179\n", 791 | "Acc: 0.7039106145251397\n", 792 | "\n", 793 | "Treino: 713\n", 794 | "Valid: 178\n", 795 | "Acc: 0.7584269662921348\n", 796 | "\n", 797 | "Treino: 713\n", 798 | "Valid: 178\n", 799 | "Acc: 0.7415730337078652\n", 800 | "\n", 801 | "Treino: 713\n", 802 | "Valid: 178\n", 803 | "Acc: 0.8033707865168539\n", 804 | "\n", 805 | "Treino: 713\n", 806 | "Valid: 178\n", 807 | "Acc: 0.7921348314606742\n", 808 | "\n" 809 | ] 810 | } 811 | ], 812 | "source": [ 813 | "resultados = []\n", 814 | "for rep in range(10):\n", 815 | " print(\"Rep:\", rep)\n", 816 | " kf = KFold(5, shuffle=True, random_state=rep)\n", 817 | " \n", 818 | " for linhas_treino, linhas_valid in kf.split(X):\n", 819 | " print(\"Treino:\", linhas_treino.shape[0])\n", 820 | " print(\"Valid:\", linhas_valid.shape[0])\n", 821 | "\n", 822 | " X_treino, X_valid = X.iloc[linhas_treino], X.iloc[linhas_valid]\n", 823 | " y_treino, y_valid = y.iloc[linhas_treino], y.iloc[linhas_valid]\n", 824 | "\n", 825 | " modelo = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)\n", 826 | " modelo.fit(X_treino, y_treino)\n", 827 | "\n", 828 | " p = modelo.predict(X_valid)\n", 829 | "\n", 830 | " acc = np.mean(y_valid == p)\n", 831 | " resultados.append(acc)\n", 832 | " print(\"Acc:\", acc)\n", 833 | " print()\n", 834 | " #print(X_treino.head())\n", 835 | " #print()" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | 
"execution_count": 73, 841 | "metadata": {}, 842 | "outputs": [ 843 | { 844 | "data": { 845 | "text/plain": [ 846 | "50" 847 | ] 848 | }, 849 | "execution_count": 73, 850 | "metadata": {}, 851 | "output_type": "execute_result" 852 | } 853 | ], 854 | "source": [ 855 | "len(resultados)" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 74, 861 | "metadata": {}, 862 | "outputs": [ 863 | { 864 | "data": { 865 | "text/plain": [ 866 | "0.7692593057560732" 867 | ] 868 | }, 869 | "execution_count": 74, 870 | "metadata": {}, 871 | "output_type": "execute_result" 872 | } 873 | ], 874 | "source": [ 875 | "np.mean(resultados)" 876 | ] 877 | }, 878 | { 879 | "cell_type": "markdown", 880 | "metadata": {}, 881 | "source": [ 882 | "## Criar submission" 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "execution_count": 92, 888 | "metadata": {}, 889 | "outputs": [ 890 | { 891 | "data": { 892 | "text/plain": [ 893 | "(418,)" 894 | ] 895 | }, 896 | "execution_count": 92, 897 | "metadata": {}, 898 | "output_type": "execute_result" 899 | } 900 | ], 901 | "source": [ 902 | "# After the CV loop `p` only holds the last fold's validation predictions (178 rows),\n", "# so refit on every labelled row and score the 418-row Kaggle test set before building the submission.\n", "modelo = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)\n", "modelo.fit(X, y)\n", "p = modelo.predict(test[variaveis].fillna(-1))\n", "sub = pd.Series(p, index=test['PassengerId'], name='Survived')\n", 903 | "sub.shape" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": 93, 909 | "metadata": {}, 910 | "outputs": [], 911 | "source": [ 912 | "sub.to_csv(\"primeiro_modelo.csv\", header=True)" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 94, 918 | "metadata": {}, 919 | "outputs": [ 920 | { 921 | "name": "stdout", 922 | "output_type": "stream", 923 | "text": [ 924 | "PassengerId,Survived\n", 925 | "892,0\n", 926 | "893,1\n", 927 | "894,0\n", 928 | "895,1\n", 929 | "896,1\n", 930 | "897,0\n", 931 | "898,1\n", 932 | "899,0\n", 933 | "900,1\n" 934 | ] 935 | } 936 | ], 937 | "source": [ 938 | "!head -n10 primeiro_modelo.csv" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": null, 944 | "metadata": {}, 945 | "outputs": [], 946 | "source":
[] 947 | } 948 | ], 949 | "metadata": { 950 | "kernelspec": { 951 | "display_name": "Python 3", 952 | "language": "python", 953 | "name": "python3" 954 | }, 955 | "language_info": { 956 | "codemirror_mode": { 957 | "name": "ipython", 958 | "version": 3 959 | }, 960 | "file_extension": ".py", 961 | "mimetype": "text/x-python", 962 | "name": "python", 963 | "nbconvert_exporter": "python", 964 | "pygments_lexer": "ipython3", 965 | "version": "3.7.3" 966 | } 967 | }, 968 | "nbformat": 4, 969 | "nbformat_minor": 4 970 | } 971 | -------------------------------------------------------------------------------- /titanic_video5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 3, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "def transformar_sexo(valor):\n", 20 | " if valor == 'female':\n", 21 | " return 1\n", 22 | " else:\n", 23 | " return 0" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 28, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "train = pd.read_csv(\"train.csv\")\n", 33 | "test = pd.read_csv(\"test.csv\")\n", 34 | "\n", 35 | "train['Sex_binario'] = train['Sex'].map(transformar_sexo)\n", 36 | "test['Sex_binario'] = test['Sex'].map(transformar_sexo)\n", 37 | "\n", 38 | "variaveis = ['Sex_binario', 'Age']\n", 39 | "\n", 40 | "X = train[variaveis].fillna(-1)\n", 41 | "y = train['Survived']" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
\n", 53 | "\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSex_binario
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS0
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C1
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS1
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S1
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS0
\n", 168 | "
" 169 | ], 170 | "text/plain": [ 171 | " PassengerId Survived Pclass \\\n", 172 | "0 1 0 3 \n", 173 | "1 2 1 1 \n", 174 | "2 3 1 3 \n", 175 | "3 4 1 1 \n", 176 | "4 5 0 3 \n", 177 | "\n", 178 | " Name Sex Age SibSp \\\n", 179 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 180 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 181 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 182 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 183 | "4 Allen, Mr. William Henry male 35.0 0 \n", 184 | "\n", 185 | " Parch Ticket Fare Cabin Embarked Sex_binario \n", 186 | "0 0 A/5 21171 7.2500 NaN S 0 \n", 187 | "1 0 PC 17599 71.2833 C85 C 1 \n", 188 | "2 0 STON/O2. 3101282 7.9250 NaN S 1 \n", 189 | "3 0 113803 53.1000 C123 S 1 \n", 190 | "4 0 373450 8.0500 NaN S 0 " 191 | ] 192 | }, 193 | "execution_count": 5, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "train.head()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 6, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "from sklearn.ensemble import RandomForestClassifier\n", 209 | "from sklearn.model_selection import RepeatedKFold" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "resultados = []\n", 219 | "kf = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10)\n", 220 | "\n", 221 | "for linhas_treino, linhas_valid in kf.split(X):\n", 222 | " print(\"Treino:\", linhas_treino.shape[0])\n", 223 | " print(\"Valid:\", linhas_valid.shape[0])\n", 224 | "\n", 225 | " X_treino, X_valid = X.iloc[linhas_treino], X.iloc[linhas_valid]\n", 226 | " y_treino, y_valid = y.iloc[linhas_treino], y.iloc[linhas_valid]\n", 227 | "\n", 228 | " modelo = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)\n", 229 | " modelo.fit(X_treino, y_treino)\n", 230 | "\n", 231 | " p = 
modelo.predict(X_valid)\n", 232 | "\n", 233 | " acc = np.mean(y_valid == p)\n", 234 | " resultados.append(acc)\n", 235 | " print(\"Acc:\", acc)\n", 236 | " print()\n", 237 | " #print(X_treino.head())\n", 238 | " #print()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 12, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "Populating the interactive namespace from numpy and matplotlib\n" 251 | ] 252 | }, 253 | { 254 | "name": "stderr", 255 | "output_type": "stream", 256 | "text": [ 257 | "/Users/mario/anaconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['test']\n", 258 | "`%matplotlib` prevents importing * from pylab and numpy\n", 259 | " \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "# --no-import-all still binds np, plt and pylab but skips the star-imports that overwrote the `test` DataFrame.\n", "%matplotlib inline\n", 265 | "%pylab --no-import-all inline" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 14, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "0.759601451100922" 277 | ] 278 | }, 279 | "execution_count": 14, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "np.mean(resultados)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 13, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "(array([2., 1., 3., 6., 4., 2., 1., 0., 0., 1.]),\n", 297 | " array([0.73542601, 0.74165869, 0.74789137, 0.75412405, 0.76035673,\n", 298 | " 0.76658941, 0.77282209, 0.77905477, 0.78528745, 0.79152013,\n", 299 | " 0.79775281]),\n", 300 | " )" 301 | ] 302 | }, 303 | "execution_count": 13, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | }, 307 | { 308 | "data": { 309 | "image/png":
"iVBORw0KGgoAAAANSUhEUgAAAXEAAAD4CAYAAAAaT9YAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAM/UlEQVR4nO3cbYxc91WA8efUm6iJ6zalHiKIu91GVIFQqY21SlUsWZAWcOo2VSU+JFKRGoEWpBC5gCiGT0V8SQRCRWoVZCV9kZoXFSeRSixCitoABWrwJg6x61Sk7pa6abGjUNoEidTp4cNcR2tnd+fOztyZPcvzk1aemb0zc/4768fXd14iM5Ek1fSqaQ8gSVo/Iy5JhRlxSSrMiEtSYUZckgqb6eJGt2/fnnNzc13ctCRtSouLi89mZm/Y63US8bm5OY4cOdLFTUvSphQR31zP9TycIkmFGXFJKsyIS1JhRlySCjPiklSYEZekwlpFPCIui4iDEfFURJyIiHd2PZgkabC2rxP/c+DhzPyViLgYuLTDmSRJLQ2MeES8FtgNfAggM18EXux2LElSG232xK8EzgCfioi3AYvAvsx8YflGEbEALADMzs6Oe051YG7/oand99Jte6d239Jm0uaY+AywE7gjM68BXgD2X7hRZh7IzPnMnO/1hn77vyRpHdpE/BRwKjMPN+cP0o+6JGnKBkY8M78LfCsirmouehfw1U6nkiS10vbVKbcCdzevTDkJ3NzdSJKktlpFPDOPAvMdzyJJGpLv2JSkwoy4JBVmxCWpMCMuSYUZcUkqzIhLUmFGXJIKM+KSVJgRl6TCjLgkFWbEJakwIy5JhRlxSSrMiEtSYUZckgoz4pJUmBGXpMKMuCQVZsQlqTAjLkmFGXFJKsyIS1JhRlySCjPiklSYEZekwmbabBQRS8APgJeAs5k53+VQkqR2WkW88QuZ+Wxnk0iShubhFEkqrG3EE3gkIhYjYmGlDSJiISKORMSRM2fOjG9CSdKq2kZ8V2buBK4HbomI3RdukJkHMnM+M+d7vd5Yh5QkraxVxDPzmebP08CDwLVdDiVJamdgxCNia0RsO3ca+CXgWNeDSZIGa/PqlMuBByPi3Pb3ZObDnU4lSWplYMQz8yTwtgnMIkkaki8xlKTCjLgkFWbEJakwIy5JhRlxSSrMiEtSYUZckgoz4pJUmBGXpMKMuCQVZsQlqTAjLkmFGXFJKsyIS1JhRlySCjPiklSYEZekwoy4JBVmxCWpMCMuSYUZcUkqzIhLUmFGXJIKM+KSVJgRl6TCjLgkFdY64hGxJSIej4iHuhxIktTeMHvi+4ATXQ0iSRpeq4hHxA5gL3Bnt+NIkoYx03K7jwEfAbattkFELAALALOzs6NPpk1tbv+hqdzv0m17p3K/UlcG7olHxHuB05m5uNZ2mXkgM+czc77X641tQEnS6tocTtkF3BARS8B9wHUR8dlOp5IktTIw4pn5B5m5IzPngBuBL2bmBzufTJI0kK8Tl6TC2j6xCUBmPgo82skkkqShuScuSYUZcUkqzIhLUmFGXJIKM+KSVJgRl6TCjLgkFWbEJakwIy5JhRlxSSrMiEtSYUZckgoz4pJUmBGXpMKMuCQVZsQlqTAjLkmFGXFJKsyIS1JhRlySCjPiklSYEZekwoy4JBVmxCWpMCMuSYUNjHhEvDoi/iUinoiI4xHxR5MYTJI02EyLbf4XuC4zn4+Ii4AvR8RfZ+ZXOp5NkjTAwIhnZgLPN2cvar6yy6EkSe202RMnIrYAi8BPAZ/IzMMrbLMALADMzs6Oc8ZNb27/oWmPIKmoVk9sZuZLmfl2YAdwbUS8dYVtDmTmfGbO93q9cc8pSVrBUK9OyczvAY8CezqZRpI0lDavTulFxGXN6UuAdwNPdT2YJGmwNsfEfwL4THNc/FXA5zLzoW7HkiS10ebVKf8GXDOBWSRJQ/Idm5JUmBGXpMKMuCQVZsQlqTA
jLkmFGXFJKsyIS1JhRlySCjPiklSYEZekwoy4JBVmxCWpMCMuSYUZcUkqzIhLUmFGXJIKM+KSVJgRl6TCjLgkFWbEJakwIy5JhRlxSSrMiEtSYUZckgoz4pJUmBGXpMIGRjwi3hgRX4qIExFxPCL2TWIwSdJgMy22OQv8bmY+FhHbgMWI+EJmfrXj2SRJAwzcE8/M72TmY83pHwAngCu6HkySNFibPfGXRcQccA1weIXvLQALALOzs+seaG7/oXVfdxRLt+2dyv1qsvz90mbT+onNiHgNcD/w4cz8/oXfz8wDmTmfmfO9Xm+cM0qSVtEq4hFxEf2A352ZD3Q7kiSprTavTgngLuBEZv5Z9yNJktpqsye+C/hV4LqIONp8vafjuSRJLQx8YjMzvwzEBGaRJA3Jd2xKUmFGXJIKM+KSVJgRl6TCjLgkFWbEJakwIy5JhRlxSSrMiEtSYUZckgoz4pJUmBGXpMKMuCQVZsQlqTAjLkmFGXFJKsyIS1JhRlySCjPiklSYEZekwoy4JBVmxCWpMCMuSYUZcUkqzIhLUmEDIx4Rn4yI0xFxbBIDSZLaa7Mn/mlgT8dzSJLWYWDEM/PvgecmMIskaUgz47qhiFgAFgBmZ2fHdbMTM7f/0LRH0CY2zd+vpdv2Tu2+p2VaP+9p/KzH9sRmZh7IzPnMnO/1euO6WUnSGnx1iiQVZsQlqbA2LzG8F/hn4KqIOBURv9b9WJKkNgY+sZmZN01iEEnS8DycIkmFGXFJKsyIS1JhRlySCjPiklSYEZekwoy4JBVmxCWpMCMuSYUZcUkqzIhLUmFGXJIKM+KSVJgRl6TCjLgkFWbEJakwIy5JhRlxSSrMiEtSYUZckgoz4pJUmBGXpMKMuCQVZsQlqTAjLkmFGXFJKqxVxCNiT0R8LSKejoj9XQ8lSWpnYMQjYgvwCeB64Grgpoi4uuvBJEmDtdkTvxZ4OjNPZuaLwH3A+7sdS5LUxkyLba4AvrXs/CngHRduFBELwEJz9vmI+Nro443VduDZaQ8xItewMZRaQ9z+iotKzb+KDbmGFX7Wa7lwDW9az322iXiscFm+4oLMA8CB9QwxCRFxJDPnpz3HKFzDxlB9DdXnB9ewXJvDKaeANy47vwN4ZtQ7liSNrk3E/xV4S0S8OSIuBm4EPt/tWJKkNgYeTsnMsxHxW8DfAFuAT2bm8c4nG78Ne6hnCK5hY6i+hurzg2t4WWS+4vC2JKkI37EpSYUZcUkqbFNEfNDHAkTE70XE0ebrWES8FBE/tuz7WyLi8Yh4aLKTv3z/654/IpYi4snme0cmP/3LM46yhssi4mBEPBURJyLinZNfwfrXEBFXLbv8aER8PyI+XGkNzfd+OyKON5ffGxGvnvwKRl7Dvuay49N6DJo5Bq3hdRHxVxHxRDPrzW2v+wqZWfqL/pOtXweuBC4GngCuXmP79wFfvOCy3wHuAR6qNj+wBGyv/BgAnwF+vTl9MXBZtTVccDvfBd5UaQ3039T3DeCS5vzngA8VW8NbgWPApfRftPG3wFs24hqAPwRub073gOeabYdaf2Zuij3xYT8W4Cbg3nNnImIHsBe4s9MpVzfS/BvEutcQEa8FdgN3AWTmi5n5vY7nXcm4Hod3AV/PzG92MOMgo65hBrgkImboh3Aa7wcZZQ0/A3wlM/8nM88Cfwd8oNNpV9ZmDQlsi4gAXkM/4mdbXvc8myHiK30swBUrbRgRlwJ7gPuXXfwx4CPAj7oacIBR50/gkYhYbD76YBpGWcOVwBngU80hrTsjYmuXw65i1MfhnBuZ3j+y615DZn4b+FPgP4DvAP+dmY90Ou3KRnkcjgG7I+INzffew/lvVJyUNmv4OP1/dJ4BngT2ZeaPWl73PJsh4q0+FqDxPuAfM/M5gIh4L3A6Mxe7Gq6Fdc/f2JWZO+l/yuQtEbF73AO2MMoaZoCdwB2ZeQ3wAjCNjzse9XGgeTPcDcBfjnm2tkb5u/B6+nt8bwZ+Etg
aER/sZMq1rXsNmXkCuB34AvAw/UMRZ7sYcoA2a/hl4Cj9n/XbgY83/ysdZv3A5oj4MB8LcOFe0i7ghohYov/flusi4rNdDLmGUeYnM59p/jwNPEj/v2OTNsoaTgGnMvNwc/4g/ahP2kiPQ+N64LHM/M8xz9bWKGt4N/CNzDyTmT8EHgB+rpMp1zbq34e7MnNnZu6mf4ji3zuZcm1t1nAz8ED2PU3/+Yifbnnd8036oH8HTyLMACfp70GceyLgZ1fY7nX0H9Stq9zOzzOdJzbXPT+wFdi27PQ/AXsqraG5/B+Aq5rTHwX+pNoamu/dB9w86dnH9Lv0DuA4/WPhQf/J5lsrraG5/MebP2eBp4DXb8Q1AHcAH21OXw58m/6nGrZa//KvNp9iuKHlKh8LEBG/2Xz/L5pNPwA8kpkvTGnUFY04/+XAg/3nRpgB7snMhyc3fd8YHoNbgbubwxEn6e+lTNSoa2iOwf4i8BsTHPs8o6whMw9HxEHgMfqHIB5nCm9tH8Pv0v0R8Qbgh8Atmflfk5r9nJZr+GPg0xHxJP1/NH8/M58FWOm6a92fb7uXpMI2wzFxSfp/y4hLUmFGXJIKM+KSVJgRl6TCjLgkFWbEJamw/wPRiHJduTn4JQAAAABJRU5ErkJggg==\n", 310 | "text/plain": [ 311 | "
" 312 | ] 313 | }, 314 | "metadata": { 315 | "needs_background": "light" 316 | }, 317 | "output_type": "display_data" 318 | } 319 | ], 320 | "source": [ 321 | "pylab.hist(resultados)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "# Novas variáveis" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "modelo anterior = 0.759601451100922" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 15, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/html": [ 346 | "
\n", 347 | "\n", 360 | "\n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSex_binario
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS0
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C1
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS1
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S1
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS0
\n", 462 | "
" 463 | ], 464 | "text/plain": [ 465 | " PassengerId Survived Pclass \\\n", 466 | "0 1 0 3 \n", 467 | "1 2 1 1 \n", 468 | "2 3 1 3 \n", 469 | "3 4 1 1 \n", 470 | "4 5 0 3 \n", 471 | "\n", 472 | " Name Sex Age SibSp \\\n", 473 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 474 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 475 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 476 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 477 | "4 Allen, Mr. William Henry male 35.0 0 \n", 478 | "\n", 479 | " Parch Ticket Fare Cabin Embarked Sex_binario \n", 480 | "0 0 A/5 21171 7.2500 NaN S 0 \n", 481 | "1 0 PC 17599 71.2833 C85 C 1 \n", 482 | "2 0 STON/O2. 3101282 7.9250 NaN S 1 \n", 483 | "3 0 113803 53.1000 C123 S 1 \n", 484 | "4 0 373450 8.0500 NaN S 0 " 485 | ] 486 | }, 487 | "execution_count": 15, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "train.head()" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 29, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "variaveis = ['Sex_binario', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 30, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "X = train[variaveis].fillna(-1)\n", 512 | "y = train['Survived']" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 19, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "Treino: 445\n", 525 | "Valid: 446\n", 526 | "Acc: 0.7869955156950673\n", 527 | "\n", 528 | "Treino: 446\n", 529 | "Valid: 445\n", 530 | "Acc: 0.7797752808988764\n", 531 | "\n", 532 | "Treino: 445\n", 533 | "Valid: 446\n", 534 | "Acc: 0.827354260089686\n", 535 | "\n", 536 | "Treino: 446\n", 537 | "Valid: 445\n", 538 | "Acc: 0.8179775280898877\n", 539 | "\n", 540 | "Treino: 445\n", 541 | 
"Valid: 446\n", 542 | "Acc: 0.7847533632286996\n", 543 | "\n", 544 | "Treino: 446\n", 545 | "Valid: 445\n", 546 | "Acc: 0.7842696629213484\n", 547 | "\n", 548 | "Treino: 445\n", 549 | "Valid: 446\n", 550 | "Acc: 0.8161434977578476\n", 551 | "\n", 552 | "Treino: 446\n", 553 | "Valid: 445\n", 554 | "Acc: 0.7842696629213484\n", 555 | "\n", 556 | "Treino: 445\n", 557 | "Valid: 446\n", 558 | "Acc: 0.8004484304932735\n", 559 | "\n", 560 | "Treino: 446\n", 561 | "Valid: 445\n", 562 | "Acc: 0.8\n", 563 | "\n", 564 | "Treino: 445\n", 565 | "Valid: 446\n", 566 | "Acc: 0.8183856502242153\n", 567 | "\n", 568 | "Treino: 446\n", 569 | "Valid: 445\n", 570 | "Acc: 0.802247191011236\n", 571 | "\n", 572 | "Treino: 445\n", 573 | "Valid: 446\n", 574 | "Acc: 0.8116591928251121\n", 575 | "\n", 576 | "Treino: 446\n", 577 | "Valid: 445\n", 578 | "Acc: 0.8067415730337079\n", 579 | "\n", 580 | "Treino: 445\n", 581 | "Valid: 446\n", 582 | "Acc: 0.820627802690583\n", 583 | "\n", 584 | "Treino: 446\n", 585 | "Valid: 445\n", 586 | "Acc: 0.7887640449438202\n", 587 | "\n", 588 | "Treino: 445\n", 589 | "Valid: 446\n", 590 | "Acc: 0.8385650224215246\n", 591 | "\n", 592 | "Treino: 446\n", 593 | "Valid: 445\n", 594 | "Acc: 0.8044943820224719\n", 595 | "\n", 596 | "Treino: 445\n", 597 | "Valid: 446\n", 598 | "Acc: 0.7982062780269058\n", 599 | "\n", 600 | "Treino: 446\n", 601 | "Valid: 445\n", 602 | "Acc: 0.8112359550561797\n", 603 | "\n" 604 | ] 605 | } 606 | ], 607 | "source": [ 608 | "resultados = []\n", 609 | "kf = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10)\n", 610 | "\n", 611 | "for linhas_treino, linhas_valid in kf.split(X):\n", 612 | " print(\"Treino:\", linhas_treino.shape[0])\n", 613 | " print(\"Valid:\", linhas_valid.shape[0])\n", 614 | "\n", 615 | " X_treino, X_valid = X.iloc[linhas_treino], X.iloc[linhas_valid]\n", 616 | " y_treino, y_valid = y.iloc[linhas_treino], y.iloc[linhas_valid]\n", 617 | "\n", 618 | " modelo = RandomForestClassifier(n_estimators=100, n_jobs=-1, 
random_state=0)\n", 619 | " modelo.fit(X_treino, y_treino)\n", 620 | "\n", 621 | " p = modelo.predict(X_valid)\n", 622 | "\n", 623 | " acc = np.mean(y_valid == p)\n", 624 | " resultados.append(acc)\n", 625 | " print(\"Acc:\", acc)\n", 626 | " print()\n", 627 | " #print(X_treino.head())\n", 628 | " #print()" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 20, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "(array([4., 2., 0., 4., 2., 2., 4., 0., 1., 1.]),\n", 640 | " array([0.77977528, 0.78565426, 0.79153323, 0.7974122 , 0.80329118,\n", 641 | " 0.80917015, 0.81504913, 0.8209281 , 0.82680707, 0.83268605,\n", 642 | " 0.83856502]),\n", 643 | "
)" 644 | ] 645 | }, 646 | "execution_count": 20, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | }, 650 | { 651 | "data": { 652 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAARNElEQVR4nO3df6zddX3H8edrpQ4UJm69G6w/qEtINiHjR24KjMUQ5jJ+KVnCH5AokSxpILjAZmaYS3D8p8liDNTQNEqU6TAmEtJhmbIoE5YULbXUlupWlY1KEyrEYgNRy97743xnjre3Pd97z7k/+tnzkZz0++Nzvt/3+56bV7/3e77nfFNVSJLa8mtLXYAkafIMd0lqkOEuSQ0y3CWpQYa7JDXolKXa8apVq2r9+vVLtXtJOik988wzP66qqVHjlizc169fz44dO5Zq95J0UkryX33GeVpGkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNah3uCdZkeTbSR6dZV2S3Jtkf5LdSS6ebJmSpLmYy5H7HcC+46y7Gji3e2wE7h+zLknSGHqFe5I1wLXAp44z5HrgwRrYDpyZ5OwJ1ShJmqO+n1D9BPAh4IzjrF8NvDA0f6BbdnB4UJKNDI7sWbdu3ZwKHbb+ri/P+7njev6j1y7ZvpfKUv28/Vkvnv+PP+vWjTxyT3Id8FJVPXOiYbMsO+YWT1W1paqmq2p6amrkVyNIkuapz2mZy4H3JHke+AJwZZLPzRhzAFg7NL8GeHEiFUqS5mxkuFfV31bVmqpaD9wIfK2q3jtj2Fbg5u6qmUuBw1V1cOa2JEmLY97fCpnkVoCq2gxsA64B9gOvAbdMpDpJ0rzMKdyr6gngiW5689DyAm6fZGGSpPnzE6qS1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAb1uUH2qUm+meTZJHuT3DPLmCuSHE6yq3vcvTDlSpL66HMnpp8BV1bVkSQrgaeSPFZV22eMe7Kqrpt8iZKkuRoZ7t0t9I50syu7Ry1kUZKk8fQ6555kRZJdwEvA41X19CzDLutO3TyW5LyJVilJmpNe4V5Vb1TVhcAaYEOS82cM2QmcU1UXAPcBj8y2nSQbk+xIsuPQoUPj1C1JOoE5XS1TVT8BngCumrH81ao60k1vA1YmWTXL87dU1XRVTU9NTc2/aknSCfW5WmYqyZnd9GnAu4DvzhhzVpJ00xu67b48+XIlSX30uVrmbOCzSVYwCO0vVtWjSW4FqKrNwA3AbUmOAq8DN3ZvxEqSlkCfq2V2AxfNsnzz0PQmYNNkS5MkzZefUJWkBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QG9bmH6qlJvpnk2SR7k9wzy5gkuTfJ/iS7k1y8MOVKkvrocw/VnwFXVtWRJCuBp5I8VlXbh8ZcDZzbPS4B7u/+lSQtgZFH7jVwpJtd2T1m3vz6euDBbux24MwkZ0+2VElSX73OuSdZkWQX8BLweFU9PWPIauCFofkD3bKZ29mYZEeSHYcOHZpvzZKkEXqFe1W9UVUXAmuADUnOnzEksz1tlu1sqarpqpqempqae7WSpF7mdLVMVf0EeAK4asaqA8Daofk1wItjVSZJmrc+V8tMJTmzmz4NeBfw3RnDtgI3d1fNXAocrqqDE69WktRLn6tlzgY+m2QFg/8MvlhVjya5FaCqNg
PbgGuA/cBrwC0LVK8kqYeR4V5Vu4GLZlm+eWi6gNsnW5okab78hKokNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1qM89VNcm+XqSfUn2JrljljFXJDmcZFf3uHthypUk9dHnHqpHgQ9W1c4kZwDPJHm8qp6bMe7Jqrpu8iVKkuZq5JF7VR2sqp3d9E+BfcDqhS5MkjR/czrnnmQ9g5tlPz3L6suSPJvksSTnHef5G5PsSLLj0KFDcy5WktRP73BPcjrwJeDOqnp1xuqdwDlVdQFwH/DIbNuoqi1VNV1V01NTU/OtWZI0Qq9wT7KSQbB/vqoenrm+ql6tqiPd9DZgZZJVE61UktRbn6tlAnwa2FdVHz/OmLO6cSTZ0G335UkWKknqr8/VMpcD7wO+k2RXt+zDwDqAqtoM3ADcluQo8DpwY1XVAtQrSephZLhX1VNARozZBGyaVFGSpPH4CVVJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqUJ97qK5N8vUk+5LsTXLHLGOS5N4k+5PsTnLxwpQrSeqjzz1UjwIfrKqdSc4AnknyeFU9NzTmauDc7nEJcH/3ryRpCYw8cq+qg1W1s5v+KbAPWD1j2PXAgzWwHTgzydkTr1aS1EufI/dfSrIeuAh4esaq1cALQ/MHumUHZzx/I7ARYN26dXOrdJlYf9eXl2S/z3/02iXZ71Jaqp+11ILeb6gmOR34EnBnVb06c/UsT6ljFlRtqarpqpqempqaW6WSpN56hXuSlQyC/fNV9fAsQw4Aa4fm1wAvjl+eJGk++lwtE+DTwL6q+vhxhm0Fbu6umrkUOFxVB48zVpK0wPqcc78ceB/wnSS7umUfBtYBVNVmYBtwDbAfeA24ZfKlSpL6GhnuVfUUs59THx5TwO2TKkqSNB4/oSpJDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkN6nMP1QeSvJRkz3HWX5HkcJJd3ePuyZcpSZqLPvdQ/QywCXjwBGOerKrrJlKRJGlsI4/cq+obwCuLUIskaUImdc79siTPJnksyXnHG5RkY5IdSXYcOnRoQruWJM00iXDfCZxTVRcA9wGPHG9gVW2pqumqmp6amprAriVJsxk73Kvq1ao60k1vA1YmWTV2ZZKkeRs73JOclSTd9IZumy+Pu11J0vyNvFomyUPAFcCqJAeAjwArAapqM3ADcFuSo8DrwI1VVQtWsSRppJHhXlU3jVi/icGlkpKkZcJPqEpSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDRoZ7kgeSvJRkz3HWJ8m9SfYn2Z3k4smXKUmaiz5H7p8BrjrB+quBc7vHRuD+8cuSJI1jZLhX1TeAV04w5HrgwRrYDpyZ5OxJFShJmruRN8juYTXwwtD8gW7ZwZkDk2xkcHTPunXrJrBrSSe79Xd9ealLWHTPf/TaBd/HJN5QzSzLaraBVbWlqqaranpqamoCu5YkzWYS4X4AWDs0vwZ4cQLblSTN0yTCfStwc3fVzKXA4ao65pSMJGnxjDznnuQh4ApgVZIDwEeAlQBVtRnYBlwD7AdeA25ZqGIlSf2MDPequmnE+gJun1hFkqSx+QlVSWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJalCvcE9yVZLvJdmf5K5Z1l+R5HCSXd3j7s
mXKknqq889VFcAnwT+FDgAfCvJ1qp6bsbQJ6vqugWoUZI0R32O3DcA+6vqB1X1c+ALwPULW5YkaRx9wn018MLQ/IFu2UyXJXk2yWNJzpttQ0k2JtmRZMehQ4fmUa4kqY8+4Z5ZltWM+Z3AOVV1AXAf8MhsG6qqLVU1XVXTU1NTc6tUktRbn3A/AKwdml8DvDg8oKperaoj3fQ2YGWSVROrUpI0J33C/VvAuUnenuRNwI3A1uEBSc5Kkm56Q7fdlyddrCSpn5FXy1TV0SQfAL4CrAAeqKq9SW7t1m8GbgBuS3IUeB24sapmnrqRJC2SkeEOvzzVsm3Gss1D05uATZMtTZI0X35CVZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhrUK9yTXJXke0n2J7lrlvVJcm+3fneSiydfqiSpr5HhnmQF8EngauAdwE1J3jFj2NXAud1jI3D/hOuUJM1BnyP3DcD+qvpBVf0c+AJw/Ywx1wMP1sB24MwkZ0+4VklST31ukL0aeGFo/gBwSY8xq4GDw4OSbGRwZA9wJMn35lTtsVYBPx5zG8vFCXvJxxaxkslo6bWBxvs5CX+/hp10r82In/eofs7ps48+4Z5ZltU8xlBVW4AtPfbZS5IdVTU9qe0tpZZ6AftZ7lrqp6VeYHL99DktcwBYOzS/BnhxHmMkSYukT7h/Czg3yduTvAm4Edg6Y8xW4ObuqplLgcNVdXDmhiRJi2PkaZmqOprkA8BXgBXAA1W1N8mt3frNwDbgGmA/8Bpwy8KV/CsmdopnGWipF7Cf5a6lflrqBSbUT6qOOTUuSTrJ+QlVSWqQ4S5JDVqW4d7j6w7+Jsmu7rEnyRtJfrNb91dJ9nbLH0py6uJ3cEy94/RzR7dsb5I7F7/6Y/Xo561J/jnJs13dt/R97mIbs5cHkryUZM/iVn188+0nydokX0+yr1t+x+JXf6wx+jk1yTeHlt+z+NUfa5zft279iiTfTvLoyJ1V1bJ6MHjT9vvA7wFvAp4F3nGC8e8GvtZNrwZ+CJzWzX8ReP9J3M/5wB7gzQze/P5X4Nzl3g/wYeBj3fQU8Eo3dk4/i+XcSzf/TuBiYM9SviYTem3OBi7ulp8B/MdSvjYT6CfA6d3ylcDTwKUnaz9D6/8a+Cfg0VH7W45H7n2+7mDYTcBDQ/OnAKclOYVBKC719fbj9PMHwPaqeq2qjgL/Bvz5glY7Wp9+CjgjSYDTGfyCHu353MU0Ti9U1Te6+eVi3v1U1cGq2glQVT8F9jE4WFpK4/RTVXWkG7Oyeyz11SNj/b4lWQNcC3yqz86WY7gf76sMjpHkzcBVwJcAqupHwD8A/83gqw8OV9VXF7Ta0ebdD4Oj9ncm+a1u3TX86ofFlkKffjYx+I/pReA7wB1V9T89n7uYxullOZpIP0nWAxcxONpdSmP1053C2AW8BDxeVSd1P8AngA8BvX7/lmO49/oqg867gX+vqlcAkryNwf+Ebwd+F3hLkvcuSJX9zbufqtoHfAx4HPgXBn/GHV2IIuegTz9/Buxi8BpcCGxK8hs9n7uYxullORq7nySnMzi4uLOqXl2oQnsaq5+qeqOqLmTwifkNSc5fyGJ7mHc/Sa4DXqqqZ/rubDmG+1y+yuBGfvWUzLuAH1bVoar6BfAw8EcLUmV/4/RDVX26qi6uqncy+BPtPxekyv769HML8HD3p/F+Bu+D/H7P5y6mcXpZjsbqJ8lKBsH++ap6eBHqHWUir09V/QR4gsFfxUtpnH4uB96T5HkGp3OuTPK5E+5tKd9gOM6bDqcAP2Bw9P1/bzqcN8u4tzIIu7cMLbsE2MvgXHuAzwJ/ebL20y3/7e7fdcB3gbct934YfJ//33fTvwP8iME33fX6WZwMvQytX8/yeUN1nNcmwIPAJ5a6jwn1MwWc2S0/DXgSuO5k7WfGmC
vo8Ybqkr+Ax/khXMPg3frvA3/XLbsVuHVozPuBL8zy3Hu6ENwD/CPw6yd5P08Cz3W/CH+y1L306YfBn5RfZXDOcA/w3hM99yTu5SEG7+38gsFR2V+crP0Af8zgFMFuBqcFdgHXnMT9/CHw7a6fPcDdS93LuL9vQ9u4gh7h7tcPSFKDluM5d0nSmAx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1KD/BdGTMajEA4nCAAAAAElFTkSuQmCC\n", 653 | "text/plain": [ 654 | "
" 655 | ] 656 | }, 657 | "metadata": { 658 | "needs_background": "light" 659 | }, 660 | "output_type": "display_data" 661 | } 662 | ], 663 | "source": [ 664 | "pylab.hist(resultados)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 21, 670 | "metadata": {}, 671 | "outputs": [ 672 | { 673 | "data": { 674 | "text/plain": [ 675 | "0.8041457147175896" 676 | ] 677 | }, 678 | "execution_count": 21, 679 | "metadata": {}, 680 | "output_type": "execute_result" 681 | } 682 | ], 683 | "source": [ 684 | "np.mean(resultados)" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "# Retreinar o modelo" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": 25, 697 | "metadata": {}, 698 | "outputs": [ 699 | { 700 | "data": { 701 | "text/html": [ 702 | "
\n", 703 | "\n", 716 | "\n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | "
Sex_binarioAgePclassSibSpParchFare
0022.03107.2500
1138.011071.2833
2126.03007.9250
3135.011053.1000
4035.03008.0500
\n", 776 | "
" 777 | ], 778 | "text/plain": [ 779 | " Sex_binario Age Pclass SibSp Parch Fare\n", 780 | "0 0 22.0 3 1 0 7.2500\n", 781 | "1 1 38.0 1 1 0 71.2833\n", 782 | "2 1 26.0 3 0 0 7.9250\n", 783 | "3 1 35.0 1 1 0 53.1000\n", 784 | "4 0 35.0 3 0 0 8.0500" 785 | ] 786 | }, 787 | "execution_count": 25, 788 | "metadata": {}, 789 | "output_type": "execute_result" 790 | } 791 | ], 792 | "source": [ 793 | "X.head()" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 26, 799 | "metadata": {}, 800 | "outputs": [ 801 | { 802 | "data": { 803 | "text/plain": [ 804 | "0 0\n", 805 | "1 1\n", 806 | "2 1\n", 807 | "3 1\n", 808 | "4 0\n", 809 | "Name: Survived, dtype: int64" 810 | ] 811 | }, 812 | "execution_count": 26, 813 | "metadata": {}, 814 | "output_type": "execute_result" 815 | } 816 | ], 817 | "source": [ 818 | "y.head()" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 31, 824 | "metadata": {}, 825 | "outputs": [ 826 | { 827 | "data": { 828 | "text/html": [ 829 | "
\n", 830 | "\n", 843 | "\n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | "
Sex_binarioAgePclassSibSpParchFare
0034.53007.8292
1147.03107.0000
2062.02009.6875
3027.03008.6625
4122.031112.2875
\n", 903 | "
" 904 | ], 905 | "text/plain": [ 906 | " Sex_binario Age Pclass SibSp Parch Fare\n", 907 | "0 0 34.5 3 0 0 7.8292\n", 908 | "1 1 47.0 3 1 0 7.0000\n", 909 | "2 0 62.0 2 0 0 9.6875\n", 910 | "3 0 27.0 3 0 0 8.6625\n", 911 | "4 1 22.0 3 1 1 12.2875" 912 | ] 913 | }, 914 | "execution_count": 31, 915 | "metadata": {}, 916 | "output_type": "execute_result" 917 | } 918 | ], 919 | "source": [ 920 | "test[variaveis].head()" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": 32, 926 | "metadata": {}, 927 | "outputs": [], 928 | "source": [ 929 | "modelo = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)\n", 930 | "modelo.fit(X, y)\n", 931 | "\n", 932 | "p = modelo.predict(test[variaveis].fillna(-1))" 933 | ] 934 | }, 935 | { 936 | "cell_type": "markdown", 937 | "metadata": {}, 938 | "source": [ 939 | "## Criar submission" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": 47, 945 | "metadata": {}, 946 | "outputs": [ 947 | { 948 | "data": { 949 | "text/plain": [ 950 | "(418,)" 951 | ] 952 | }, 953 | "execution_count": 47, 954 | "metadata": {}, 955 | "output_type": "execute_result" 956 | } 957 | ], 958 | "source": [ 959 | "sub = pd.Series(p, index=test['PassengerId'], name='Survived')\n", 960 | "sub.shape" 961 | ] 962 | }, 963 | { 964 | "cell_type": "code", 965 | "execution_count": 48, 966 | "metadata": {}, 967 | "outputs": [], 968 | "source": [ 969 | "sub.to_csv(\"modelo_video5.csv\", header=True)" 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": 49, 975 | "metadata": {}, 976 | "outputs": [ 977 | { 978 | "name": "stdout", 979 | "output_type": "stream", 980 | "text": [ 981 | "PassengerId,Survived\n", 982 | "892,0\n", 983 | "893,0\n", 984 | "894,1\n", 985 | "895,1\n", 986 | "896,1\n", 987 | "897,0\n", 988 | "898,0\n", 989 | "899,0\n", 990 | "900,1\n" 991 | ] 992 | } 993 | ], 994 | "source": [ 995 | "!head -n10 modelo_video5.csv" 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 
1000 | "execution_count": null, 1001 | "metadata": {}, 1002 | "outputs": [], 1003 | "source": [] 1004 | } 1005 | ], 1006 | "metadata": { 1007 | "kernelspec": { 1008 | "display_name": "Python 3", 1009 | "language": "python", 1010 | "name": "python3" 1011 | }, 1012 | "language_info": { 1013 | "codemirror_mode": { 1014 | "name": "ipython", 1015 | "version": 3 1016 | }, 1017 | "file_extension": ".py", 1018 | "mimetype": "text/x-python", 1019 | "name": "python", 1020 | "nbconvert_exporter": "python", 1021 | "pygments_lexer": "ipython3", 1022 | "version": "3.7.3" 1023 | } 1024 | }, 1025 | "nbformat": 4, 1026 | "nbformat_minor": 4 1027 | } 1028 | --------------------------------------------------------------------------------