├── README.md └── notebookd8108b9e6a.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # ds_standard_battery 2 | This repo is an attempt at building a standard battery for data science and machine learning. 3 | Using various tools and a user-friendly frontend, go from ground truth statement to end analysis. 4 | To accomplish this the five overarching steps are: 5 | Collect 6 | Clean 7 | Explore 8 | Model building 9 | Model Deployment 10 | 11 | 12 | 13 | For collecting data consider starting with Kafka or similar for a data stream and a DB for data storage as you will likely outgrow CSV's and spreadsheets for data collection. 14 | 15 | Cleaning is time and labor intensive but garbage in equals garbage out. 16 | 17 | **Cleaning step 1)** Remove duplicates and irrelevant data. You don't need vehicle data for 1980s Cadillacs when researching new model electric vehicles. 18 | 19 | **Cleaning step 2)** Fix data structures. N/A's, typos, bad naming conventions. The goal is uniformity. 20 | 21 | **Cleaning step 3)** Missing data. You can drop it or impute missing values based on other observations (e.g. the average). 22 | 23 | **Cleaning step 4)** Outliers. Exercise caution when considering removing outliers, as some of them provide more insight into your data than the average data point. 24 | 25 | **Cleaning step 5)** Data validation. Does the data make sense? Does the data follow the appropriate rules for its field? 26 | Can you say this about the data? 27 | 28 | Validity. The degree to which your data conforms to defined business rules or constraints. 29 | Accuracy. Ensure your data is close to the true values. 30 | Completeness. The degree to which all required data is known. 31 | Consistency. Ensure your data is consistent within the same dataset and/or across multiple data sets. 32 | Uniformity. The degree to which the data is specified using the same unit of measure. 
33 | 34 | Consider a cleaning tool such as https://openrefine.org/ it's free and open source! 35 | -------------------------------------------------------------------------------- /notebookd8108b9e6a.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "371708ce", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2022-07-04T20:23:40.901366Z", 12 | "iopub.status.busy": "2022-07-04T20:23:40.900878Z", 13 | "iopub.status.idle": "2022-07-04T20:23:40.925785Z", 14 | "shell.execute_reply": "2022-07-04T20:23:40.923945Z" 15 | }, 16 | "papermill": { 17 | "duration": 0.034989, 18 | "end_time": "2022-07-04T20:23:40.928815", 19 | "exception": false, 20 | "start_time": "2022-07-04T20:23:40.893826", 21 | "status": "completed" 22 | }, 23 | "tags": [] 24 | }, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "/kaggle/input/titanic/train.csv\n", 31 | "/kaggle/input/titanic/test.csv\n", 32 | "/kaggle/input/titanic/gender_submission.csv\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 38 | "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", 39 | "# For example, here's several helpful packages to load\n", 40 | "\n", 41 | "import numpy as np # linear algebra\n", 42 | "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", 43 | "\n", 44 | "# Input data files are available in the read-only \"../input/\" directory\n", 45 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", 46 | "\n", 47 | "import os\n", 48 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 49 | " for filename in filenames:\n", 50 | " print(os.path.join(dirname, filename))\n", 51 | "\n", 52 | "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", 53 | "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "63a2f665", 59 | "metadata": { 60 | "papermill": { 61 | "duration": 0.004672, 62 | "end_time": "2022-07-04T20:23:40.937928", 63 | "exception": false, 64 | "start_time": "2022-07-04T20:23:40.933256", 65 | "status": "completed" 66 | }, 67 | "tags": [] 68 | }, 69 | "source": [ 70 | "This notebook is based on:\n", 71 | "https://www.kaggle.com/code/cocorin/yh-cur-titanic-top-4-with-ensemble-modeling\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "ca431dbc", 77 | "metadata": { 78 | "papermill": { 79 | "duration": 0.004514, 80 | "end_time": "2022-07-04T20:23:40.946790", 81 | "exception": false, 82 | "start_time": "2022-07-04T20:23:40.942276", 83 | "status": "completed" 84 | }, 85 | "tags": [] 86 | }, 87 | "source": [ 88 | "Workflow:\n", 89 | "##### 2 Load and check data\n", 90 | "\n", 91 | " 2.1 load data\n", 92 | " 2.2 Outlier detection\n", 93 | " 2.3 joining train and test set\n", 94 | " 2.4 null and missing values\n", 95 | "\n", 96 | "##### 3 Feature analysis\n", 97 | "\n", 98 | " 3.1 Numerical values\n", 99 | " 3.2 Categorical values\n", 100 | "\n", 101 | "##### 4 Filling missing Values\n", 102 | "\n", 103 | "##### 5 Feature engineering\n", 104 | "\n", 105 | " 5.1 tool 1\n", 106 | " 5.2 
tool 2\n", 107 | "    5.3 tool 3\n", 108 | "    5.4 tool 4\n", 109 | "\n", 110 | "##### 6 Modeling\n", 111 | "\n", 112 | "    6.1 Simple modeling\n", 113 | "        6.1.1 Cross validate models.\n", 114 | "        6.1.2 Hyperparameter tuning for best models\n", 115 | "        6.1.3 Plot learning curves\n", 116 | "        6.1.4 Feature importance of the tree based classifiers\n", 117 | "    6.2 Ensemble modeling\n", 118 | "        6.2.1 Combining models\n", 119 | "    6.3 Prediction\n", 120 | "        6.3.1 Predict and Submit results\n", 121 | "\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 2, 127 | "id": "3873d49d", 128 | "metadata": { 129 | "execution": { 130 | "iopub.execute_input": "2022-07-04T20:23:40.957802Z", 131 | "iopub.status.busy": "2022-07-04T20:23:40.957333Z", 132 | "iopub.status.idle": "2022-07-04T20:23:42.791252Z", 133 | "shell.execute_reply": "2022-07-04T20:23:42.790070Z" 134 | }, 135 | "papermill": { 136 | "duration": 1.843355, 137 | "end_time": "2022-07-04T20:23:42.794576", 138 | "exception": false, 139 | "start_time": "2022-07-04T20:23:40.951221", 140 | "status": "completed" 141 | }, 142 | "tags": [] 143 | }, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "all imports loaded\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "import pandas as pd\n", 155 | "import numpy as np\n", 156 | "import matplotlib.pyplot as plt\n", 157 | "import seaborn as sns\n", 158 | "%matplotlib inline\n", 159 | "\n", 160 | "import warnings\n", 161 | "warnings.filterwarnings('ignore')\n", 162 | "\n", 163 | "from collections import Counter\n", 164 | "\n", 165 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier\n", 166 | "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", 167 | "from sklearn.linear_model import LogisticRegression\n", 168 | "from sklearn.neighbors import KNeighborsClassifier\n", 169 | "from sklearn.tree import 
DecisionTreeClassifier\n", 170 | "from sklearn.neural_network import MLPClassifier\n", 171 | "from sklearn.svm import SVC\n", 172 | "from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve\n", 173 | "\n", 174 | "print('all imports loaded')" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 3, 180 | "id": "4c1a23e3", 181 | "metadata": { 182 | "execution": { 183 | "iopub.execute_input": "2022-07-04T20:23:42.806629Z", 184 | "iopub.status.busy": "2022-07-04T20:23:42.805636Z", 185 | "iopub.status.idle": "2022-07-04T20:23:42.857792Z", 186 | "shell.execute_reply": "2022-07-04T20:23:42.856659Z" 187 | }, 188 | "papermill": { 189 | "duration": 0.06098, 190 | "end_time": "2022-07-04T20:23:42.860891", 191 | "exception": false, 192 | "start_time": "2022-07-04T20:23:42.799911", 193 | "status": "completed" 194 | }, 195 | "tags": [] 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "# Load data\n", 200 | "# make sure to add your data to the notebook look --> and click + add data \n", 201 | "train = pd.read_csv('../input/titanic/train.csv')\n", 202 | "test = pd.read_csv('../input/titanic/test.csv')\n", 203 | "\n", 204 | "## join train and test datasets in order to obtain the same number of features during categorical conversion\n", 205 | "train_len= len(train)\n", 206 | "dataset= pd.concat(objs= [train, test], axis= 0).reset_index(drop= True)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 4, 212 | "id": "64ff25cc", 213 | "metadata": { 214 | "execution": { 215 | "iopub.execute_input": "2022-07-04T20:23:42.872251Z", 216 | "iopub.status.busy": "2022-07-04T20:23:42.871877Z", 217 | "iopub.status.idle": "2022-07-04T20:23:42.890166Z", 218 | "shell.execute_reply": "2022-07-04T20:23:42.889203Z" 219 | }, 220 | "papermill": { 221 | "duration": 0.027638, 222 | "end_time": "2022-07-04T20:23:42.893118", 223 | "exception": false, 224 | "start_time": "2022-07-04T20:23:42.865480", 225 | 
"status": "completed" 226 | }, 227 | "tags": [] 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "# Outlier detection\n", 232 | "\n", 233 | "def detect_outliers(df, n, features):\n", 234 | "    '''\n", 235 | "    Takes a dataframe df of features and returns a list of the indices corresponding\n", 236 | "    to the observations containing more than n outliers according to the Tukey method\n", 237 | "    '''\n", 238 | "    outlier_indices= []\n", 239 | "    \n", 240 | "    # iterate over features(columns)\n", 241 | "    for col in features:\n", 242 | "        # 1st quartile (25%)\n", 243 | "        Q1 = np.percentile(df[col], 25)\n", 244 | "        # 3rd quartile (75%)\n", 245 | "        Q3 = np.percentile(df[col], 75)\n", 246 | "        # Interquartile range (IQR)\n", 247 | "        IQR = Q3 - Q1\n", 248 | "        \n", 249 | "        # outlier step\n", 250 | "        outlier_step= 1.5* IQR\n", 251 | "        \n", 252 | "        # Determine a list of indices of outliers for feature col\n", 253 | "        outlier_list_col= df[(df[col]< Q1 - outlier_step) | (df[col]> Q3 + outlier_step)].index\n", 254 | "        \n", 255 | "        # append the found outlier indices for col to the list of outlier indices\n", 256 | "        outlier_indices.extend(outlier_list_col)\n", 257 | "    \n", 258 | "    # select observations containing more than n outliers\n", 259 | "    outlier_indices= Counter(outlier_indices)\n", 260 | "    multiple_outliers= list(k for k, v in outlier_indices.items() if v> n)\n", 261 | "    \n", 262 | "    return multiple_outliers\n", 263 | "\n", 264 | "# detect outliers from selected columns\n", 265 | "Outliers_to_drop= detect_outliers(train, 2, ['Age', 'SibSp', 'Parch', 'Fare'])" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 5, 271 | "id": "a20e2be0", 272 | "metadata": { 273 | "execution": { 274 | "iopub.execute_input": "2022-07-04T20:23:42.905573Z", 275 | "iopub.status.busy": "2022-07-04T20:23:42.904258Z", 276 | "iopub.status.idle": "2022-07-04T20:23:42.936527Z", 277 | "shell.execute_reply": "2022-07-04T20:23:42.935059Z" 278 | }, 279 | "papermill": { 280 | 
"duration": 0.040725, 281 | "end_time": "2022-07-04T20:23:42.938918", 282 | "exception": false, 283 | "start_time": "2022-07-04T20:23:42.898193", 284 | "status": "completed" 285 | }, 286 | "tags": [] 287 | }, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/html": [ 292 | "
\n", 293 | "\n", 306 | "\n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " 
\n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
272801Fortune, Mr. Charles Alexandermale19.03219950263.00C23 C25 C27S
888911Fortune, Miss. Mabel Helenfemale23.03219950263.00C23 C25 C27S
15916003Sage, Master. Thomas HenrymaleNaN82CA. 234369.55NaNS
18018103Sage, Miss. Constance GladysfemaleNaN82CA. 234369.55NaNS
20120203Sage, Mr. FrederickmaleNaN82CA. 234369.55NaNS
32432503Sage, Mr. George John JrmaleNaN82CA. 234369.55NaNS
34134211Fortune, Miss. Alice Elizabethfemale24.03219950263.00C23 C25 C27S
79279303Sage, Miss. Stella AnnafemaleNaN82CA. 234369.55NaNS
84684703Sage, Mr. Douglas BullenmaleNaN82CA. 234369.55NaNS
86386403Sage, Miss. Dorothy Edith \"Dolly\"femaleNaN82CA. 234369.55NaNS
\n", 477 | "
" 478 | ], 479 | "text/plain": [ 480 | " PassengerId Survived Pclass Name Sex \\\n", 481 | "27 28 0 1 Fortune, Mr. Charles Alexander male \n", 482 | "88 89 1 1 Fortune, Miss. Mabel Helen female \n", 483 | "159 160 0 3 Sage, Master. Thomas Henry male \n", 484 | "180 181 0 3 Sage, Miss. Constance Gladys female \n", 485 | "201 202 0 3 Sage, Mr. Frederick male \n", 486 | "324 325 0 3 Sage, Mr. George John Jr male \n", 487 | "341 342 1 1 Fortune, Miss. Alice Elizabeth female \n", 488 | "792 793 0 3 Sage, Miss. Stella Anna female \n", 489 | "846 847 0 3 Sage, Mr. Douglas Bullen male \n", 490 | "863 864 0 3 Sage, Miss. Dorothy Edith \"Dolly\" female \n", 491 | "\n", 492 | " Age SibSp Parch Ticket Fare Cabin Embarked \n", 493 | "27 19.0 3 2 19950 263.00 C23 C25 C27 S \n", 494 | "88 23.0 3 2 19950 263.00 C23 C25 C27 S \n", 495 | "159 NaN 8 2 CA. 2343 69.55 NaN S \n", 496 | "180 NaN 8 2 CA. 2343 69.55 NaN S \n", 497 | "201 NaN 8 2 CA. 2343 69.55 NaN S \n", 498 | "324 NaN 8 2 CA. 2343 69.55 NaN S \n", 499 | "341 24.0 3 2 19950 263.00 C23 C25 C27 S \n", 500 | "792 NaN 8 2 CA. 2343 69.55 NaN S \n", 501 | "846 NaN 8 2 CA. 2343 69.55 NaN S \n", 502 | "863 NaN 8 2 CA. 
2343 69.55 NaN S " 503 | ] 504 | }, 505 | "execution_count": 5, 506 | "metadata": {}, 507 | "output_type": "execute_result" 508 | } 509 | ], 510 | "source": [ 511 | "# Show the outliers row\n", 512 | "train.loc[Outliers_to_drop] " 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 6, 518 | "id": "2da4c8d6", 519 | "metadata": { 520 | "execution": { 521 | "iopub.execute_input": "2022-07-04T20:23:42.950999Z", 522 | "iopub.status.busy": "2022-07-04T20:23:42.950607Z", 523 | "iopub.status.idle": "2022-07-04T20:23:42.957729Z", 524 | "shell.execute_reply": "2022-07-04T20:23:42.956527Z" 525 | }, 526 | "papermill": { 527 | "duration": 0.01633, 528 | "end_time": "2022-07-04T20:23:42.960413", 529 | "exception": false, 530 | "start_time": "2022-07-04T20:23:42.944083", 531 | "status": "completed" 532 | }, 533 | "tags": [] 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "# uncomment the line below to drop outliers\n", 538 | "train= train.drop(Outliers_to_drop, axis= 0).reset_index(drop= True)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 7, 544 | "id": "6d41acfd", 545 | "metadata": { 546 | "execution": { 547 | "iopub.execute_input": "2022-07-04T20:23:42.972387Z", 548 | "iopub.status.busy": "2022-07-04T20:23:42.972018Z", 549 | "iopub.status.idle": "2022-07-04T20:23:42.987707Z", 550 | "shell.execute_reply": "2022-07-04T20:23:42.986540Z" 551 | }, 552 | "papermill": { 553 | "duration": 0.024504, 554 | "end_time": "2022-07-04T20:23:42.990126", 555 | "exception": false, 556 | "start_time": "2022-07-04T20:23:42.965622", 557 | "status": "completed" 558 | }, 559 | "tags": [] 560 | }, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "text/plain": [ 565 | "Cabin 1014\n", 566 | "Survived 418\n", 567 | "Age 263\n", 568 | "Embarked 2\n", 569 | "Fare 1\n", 570 | "PassengerId 0\n", 571 | "Pclass 0\n", 572 | "Name 0\n", 573 | "Sex 0\n", 574 | "SibSp 0\n", 575 | "Parch 0\n", 576 | "Ticket 0\n", 577 | "dtype: int64" 578 | ] 579 | }, 
580 | "execution_count": 7, 581 | "metadata": {}, 582 | "output_type": "execute_result" 583 | } 584 | ], 585 | "source": [ 586 | "# Fill empty and NaNs values with NaN\n", 587 | "dataset= dataset.fillna(np.nan)\n", 588 | "\n", 589 | "# Check for Null values\n", 590 | "dataset.isnull().sum().sort_values(ascending= False)" 591 | ] 592 | } 593 | ], 594 | "metadata": { 595 | "kernelspec": { 596 | "display_name": "Python 3", 597 | "language": "python", 598 | "name": "python3" 599 | }, 600 | "language_info": { 601 | "codemirror_mode": { 602 | "name": "ipython", 603 | "version": 3 604 | }, 605 | "file_extension": ".py", 606 | "mimetype": "text/x-python", 607 | "name": "python", 608 | "nbconvert_exporter": "python", 609 | "pygments_lexer": "ipython3", 610 | "version": "3.7.12" 611 | }, 612 | "papermill": { 613 | "default_parameters": {}, 614 | "duration": 15.776307, 615 | "end_time": "2022-07-04T20:23:44.019113", 616 | "environment_variables": {}, 617 | "exception": null, 618 | "input_path": "__notebook__.ipynb", 619 | "output_path": "__notebook__.ipynb", 620 | "parameters": {}, 621 | "start_time": "2022-07-04T20:23:28.242806", 622 | "version": "2.3.4" 623 | } 624 | }, 625 | "nbformat": 4, 626 | "nbformat_minor": 5 627 | } 628 | --------------------------------------------------------------------------------