├── README.md └── notebookd8108b9e6a.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # ds_standard_battery 2 | This repo is an attempt at building a standard battery for data science and machine learning. 3 | Using various tools and a user-friendly frontend, go from ground truth statement to end analysis. 4 | To accomplish this the five overarching steps are: 5 | Collect 6 | Clean 7 | Explore 8 | Model building 9 | Model Deployment 10 | 11 | 12 | 13 | For collecting data consider starting with Kafka or similar for a data stream and a DB for data storage, as you will likely outgrow CSVs and spreadsheets for data collection. 14 | 15 | Cleaning is time and labor intensive, but garbage in equals garbage out. 16 | 17 | **Cleaning step 1)** Remove duplicates and irrelevant data. You don't need vehicle data for 1980's Cadillacs when researching new model electric vehicles. 18 | 19 | **Cleaning step 2)** Fix data structures. N/A's, typos, bad naming conventions. The goal is uniformity. 20 | 21 | **Cleaning step 3)** Missing data. You can drop it or impute missing values based on other observations (e.g. the average). 22 | 23 | **Cleaning step 4)** Outliers. Exercise caution when considering removing outliers, as some of them provide more insight into your data than the average data point. 25 | 25 | **Cleaning step 5)** Data validation. Does the data make sense? Does the data follow the appropriate rules for its field? 26 | Can you say this about the data? 27 | 28 | Validity. The degree to which your data conforms to defined business rules or constraints. 29 | Accuracy. Ensure your data is close to the true values. 30 | Completeness. The degree to which all required data is known. 31 | Consistency. Ensure your data is consistent within the same dataset and/or across multiple data sets. 32 | Uniformity. The degree to which the data is specified using the same unit of measure. 
33 | 34 | Consider a cleaning tool such as https://openrefine.org/ it's free and open source! 35 | -------------------------------------------------------------------------------- /notebookd8108b9e6a.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "371708ce", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2022-07-04T20:23:40.901366Z", 12 | "iopub.status.busy": "2022-07-04T20:23:40.900878Z", 13 | "iopub.status.idle": "2022-07-04T20:23:40.925785Z", 14 | "shell.execute_reply": "2022-07-04T20:23:40.923945Z" 15 | }, 16 | "papermill": { 17 | "duration": 0.034989, 18 | "end_time": "2022-07-04T20:23:40.928815", 19 | "exception": false, 20 | "start_time": "2022-07-04T20:23:40.893826", 21 | "status": "completed" 22 | }, 23 | "tags": [] 24 | }, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "/kaggle/input/titanic/train.csv\n", 31 | "/kaggle/input/titanic/test.csv\n", 32 | "/kaggle/input/titanic/gender_submission.csv\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 38 | "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", 39 | "# For example, here's several helpful packages to load\n", 40 | "\n", 41 | "import numpy as np # linear algebra\n", 42 | "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", 43 | "\n", 44 | "# Input data files are available in the read-only \"../input/\" directory\n", 45 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", 46 | "\n", 47 | "import os\n", 48 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 49 | " for filename in filenames:\n", 50 | " print(os.path.join(dirname, filename))\n", 51 | "\n", 52 | "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", 53 | "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "63a2f665", 59 | "metadata": { 60 | "papermill": { 61 | "duration": 0.004672, 62 | "end_time": "2022-07-04T20:23:40.937928", 63 | "exception": false, 64 | "start_time": "2022-07-04T20:23:40.933256", 65 | "status": "completed" 66 | }, 67 | "tags": [] 68 | }, 69 | "source": [ 70 | "This notebook is based on:\n", 71 | "https://www.kaggle.com/code/cocorin/yh-cur-titanic-top-4-with-ensemble-modeling\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "ca431dbc", 77 | "metadata": { 78 | "papermill": { 79 | "duration": 0.004514, 80 | "end_time": "2022-07-04T20:23:40.946790", 81 | "exception": false, 82 | "start_time": "2022-07-04T20:23:40.942276", 83 | "status": "completed" 84 | }, 85 | "tags": [] 86 | }, 87 | "source": [ 88 | "Workflow:\n", 89 | "##### 2 Load and check data\n", 90 | "\n", 91 | " 2.1 load data\n", 92 | " 2.2 Outlier detection\n", 93 | " 2.3 joining train and test set\n", 94 | " 2.4 null and missing values\n", 95 | "\n", 96 | "##### 3 Feature analysis\n", 97 | "\n", 98 | " 3.1 Numerical values\n", 99 | " 3.2 Categorical values\n", 100 | "\n", 101 | "##### 4 Filling missing Values\n", 102 | "\n", 103 | "##### 5 Feature engineering\n", 104 | "\n", 105 | " 5.1 tool 1\n", 106 | " 5.2 
tool 2\n", 107 | "    5.3 tool 3\n", 108 | "    5.4 tool 4\n", 109 | "\n", 110 | "##### 6 Modeling\n", 111 | "\n", 112 | "    6.1 Simple modeling\n", 113 | "        6.1.1 Cross validate models.\n", 114 | "        6.1.2 Hyperparameter tuning for best models\n", 115 | "        6.1.3 Plot learning curves\n", 116 | "        6.1.4 Feature importance of the tree based classifiers\n", 117 | "    6.2 Ensemble modeling\n", 118 | "        6.2.1 Combining models\n", 119 | "    6.3 Prediction\n", 120 | "        6.3.1 Predict and Submit results\n", 121 | "\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 2, 127 | "id": "3873d49d", 128 | "metadata": { 129 | "execution": { 130 | "iopub.execute_input": "2022-07-04T20:23:40.957802Z", 131 | "iopub.status.busy": "2022-07-04T20:23:40.957333Z", 132 | "iopub.status.idle": "2022-07-04T20:23:42.791252Z", 133 | "shell.execute_reply": "2022-07-04T20:23:42.790070Z" 134 | }, 135 | "papermill": { 136 | "duration": 1.843355, 137 | "end_time": "2022-07-04T20:23:42.794576", 138 | "exception": false, 139 | "start_time": "2022-07-04T20:23:40.951221", 140 | "status": "completed" 141 | }, 142 | "tags": [] 143 | }, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "all imports loaded\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "import pandas as pd\n", 155 | "import numpy as np\n", 156 | "import matplotlib.pyplot as plt\n", 157 | "import seaborn as sns\n", 158 | "%matplotlib inline\n", 159 | "\n", 160 | "import warnings\n", 161 | "warnings.filterwarnings('ignore')\n", 162 | "\n", 163 | "from collections import Counter\n", 164 | "\n", 165 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier\n", 166 | "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", 167 | "from sklearn.linear_model import LogisticRegression\n", 168 | "from sklearn.neighbors import KNeighborsClassifier\n", 169 | "from sklearn.tree import 
DecisionTreeClassifier\n", 170 | "from sklearn.neural_network import MLPClassifier\n", 171 | "from sklearn.svm import SVC\n", 172 | "from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve\n", 173 | "\n", 174 | "print('all imports loaded')" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 3, 180 | "id": "4c1a23e3", 181 | "metadata": { 182 | "execution": { 183 | "iopub.execute_input": "2022-07-04T20:23:42.806629Z", 184 | "iopub.status.busy": "2022-07-04T20:23:42.805636Z", 185 | "iopub.status.idle": "2022-07-04T20:23:42.857792Z", 186 | "shell.execute_reply": "2022-07-04T20:23:42.856659Z" 187 | }, 188 | "papermill": { 189 | "duration": 0.06098, 190 | "end_time": "2022-07-04T20:23:42.860891", 191 | "exception": false, 192 | "start_time": "2022-07-04T20:23:42.799911", 193 | "status": "completed" 194 | }, 195 | "tags": [] 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "# Load data\n", 200 | "# make sure to add your data to the notebook look --> and click + add data \n", 201 | "train = pd.read_csv('../input/titanic/train.csv')\n", 202 | "test = pd.read_csv('../input/titanic/test.csv')\n", 203 | "\n", 204 | "## join train and test datasets in order to obtain the same number of features during categorical conversion\n", 205 | "train_len= len(train)\n", 206 | "dataset= pd.concat(objs= [train, test], axis= 0).reset_index(drop= True)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 4, 212 | "id": "64ff25cc", 213 | "metadata": { 214 | "execution": { 215 | "iopub.execute_input": "2022-07-04T20:23:42.872251Z", 216 | "iopub.status.busy": "2022-07-04T20:23:42.871877Z", 217 | "iopub.status.idle": "2022-07-04T20:23:42.890166Z", 218 | "shell.execute_reply": "2022-07-04T20:23:42.889203Z" 219 | }, 220 | "papermill": { 221 | "duration": 0.027638, 222 | "end_time": "2022-07-04T20:23:42.893118", 223 | "exception": false, 224 | "start_time": "2022-07-04T20:23:42.865480", 225 | 
"status": "completed" 226 | }, 227 | "tags": [] 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "# Outlier detection\n", 232 | "\n", 233 | "def detect_outliers(df, n, features):\n", 234 | "    '''\n", 235 | "    Takes a dataframe df of features and returns a list of the indices corresponding\n", 236 | "    to the observations containing more than n outliers according to the Tukey method\n", 237 | "    '''\n", 238 | "    outlier_indices= []\n", 239 | "    \n", 240 | "    # iterate over features(columns)\n", 241 | "    for col in features:\n", 242 | "        # 1st quartile (25%)\n", 243 | "        Q1 = np.percentile(df[col], 25)\n", 244 | "        # 3rd quartile (75%)\n", 245 | "        Q3 = np.percentile(df[col], 75)\n", 246 | "        # Interquartile range (IQR)\n", 247 | "        IQR = Q3 - Q1\n", 248 | "        \n", 249 | "        # outlier step\n", 250 | "        outlier_step= 1.5* IQR\n", 251 | "        \n", 252 | "        # Determine a list of indices of outliers for feature col\n", 253 | "        outlier_list_col= df[(df[col]< Q1 - outlier_step) | (df[col]> Q3 + outlier_step)].index\n", 254 | "        \n", 255 | "        # append the found outlier indices for col to the list of outlier indices\n", 256 | "        outlier_indices.extend(outlier_list_col)\n", 257 | "        \n", 258 | "    # select observations containing more than n outliers\n", 259 | "    outlier_indices= Counter(outlier_indices)\n", 260 | "    multiple_outliers= list(k for k, v in outlier_indices.items() if v> n)\n", 261 | "    \n", 262 | "    return multiple_outliers\n", 263 | "\n", 264 | "# detect outliers from selected columns\n", 265 | "Outliers_to_drop= detect_outliers(train, 2, ['Age', 'SibSp', 'Parch', 'Fare'])" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 5, 271 | "id": "a20e2be0", 272 | "metadata": { 273 | "execution": { 274 | "iopub.execute_input": "2022-07-04T20:23:42.905573Z", 275 | "iopub.status.busy": "2022-07-04T20:23:42.904258Z", 276 | "iopub.status.idle": "2022-07-04T20:23:42.936527Z", 277 | "shell.execute_reply": "2022-07-04T20:23:42.935059Z" 278 | }, 279 | "papermill": { 280 | 
"duration": 0.040725, 281 | "end_time": "2022-07-04T20:23:42.938918", 282 | "exception": false, 283 | "start_time": "2022-07-04T20:23:42.898193", 284 | "status": "completed" 285 | }, 286 | "tags": [] 287 | }, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/html": [ 292 | "
\n", 310 | " | PassengerId | \n", 311 | "Survived | \n", 312 | "Pclass | \n", 313 | "Name | \n", 314 | "Sex | \n", 315 | "Age | \n", 316 | "SibSp | \n", 317 | "Parch | \n", 318 | "Ticket | \n", 319 | "Fare | \n", 320 | "Cabin | \n", 321 | "Embarked | \n", 322 | "
---|---|---|---|---|---|---|---|---|---|---|---|---|
27 | \n", 327 | "28 | \n", 328 | "0 | \n", 329 | "1 | \n", 330 | "Fortune, Mr. Charles Alexander | \n", 331 | "male | \n", 332 | "19.0 | \n", 333 | "3 | \n", 334 | "2 | \n", 335 | "19950 | \n", 336 | "263.00 | \n", 337 | "C23 C25 C27 | \n", 338 | "S | \n", 339 | "
88 | \n", 342 | "89 | \n", 343 | "1 | \n", 344 | "1 | \n", 345 | "Fortune, Miss. Mabel Helen | \n", 346 | "female | \n", 347 | "23.0 | \n", 348 | "3 | \n", 349 | "2 | \n", 350 | "19950 | \n", 351 | "263.00 | \n", 352 | "C23 C25 C27 | \n", 353 | "S | \n", 354 | "
159 | \n", 357 | "160 | \n", 358 | "0 | \n", 359 | "3 | \n", 360 | "Sage, Master. Thomas Henry | \n", 361 | "male | \n", 362 | "NaN | \n", 363 | "8 | \n", 364 | "2 | \n", 365 | "CA. 2343 | \n", 366 | "69.55 | \n", 367 | "NaN | \n", 368 | "S | \n", 369 | "
180 | \n", 372 | "181 | \n", 373 | "0 | \n", 374 | "3 | \n", 375 | "Sage, Miss. Constance Gladys | \n", 376 | "female | \n", 377 | "NaN | \n", 378 | "8 | \n", 379 | "2 | \n", 380 | "CA. 2343 | \n", 381 | "69.55 | \n", 382 | "NaN | \n", 383 | "S | \n", 384 | "
201 | \n", 387 | "202 | \n", 388 | "0 | \n", 389 | "3 | \n", 390 | "Sage, Mr. Frederick | \n", 391 | "male | \n", 392 | "NaN | \n", 393 | "8 | \n", 394 | "2 | \n", 395 | "CA. 2343 | \n", 396 | "69.55 | \n", 397 | "NaN | \n", 398 | "S | \n", 399 | "
324 | \n", 402 | "325 | \n", 403 | "0 | \n", 404 | "3 | \n", 405 | "Sage, Mr. George John Jr | \n", 406 | "male | \n", 407 | "NaN | \n", 408 | "8 | \n", 409 | "2 | \n", 410 | "CA. 2343 | \n", 411 | "69.55 | \n", 412 | "NaN | \n", 413 | "S | \n", 414 | "
341 | \n", 417 | "342 | \n", 418 | "1 | \n", 419 | "1 | \n", 420 | "Fortune, Miss. Alice Elizabeth | \n", 421 | "female | \n", 422 | "24.0 | \n", 423 | "3 | \n", 424 | "2 | \n", 425 | "19950 | \n", 426 | "263.00 | \n", 427 | "C23 C25 C27 | \n", 428 | "S | \n", 429 | "
792 | \n", 432 | "793 | \n", 433 | "0 | \n", 434 | "3 | \n", 435 | "Sage, Miss. Stella Anna | \n", 436 | "female | \n", 437 | "NaN | \n", 438 | "8 | \n", 439 | "2 | \n", 440 | "CA. 2343 | \n", 441 | "69.55 | \n", 442 | "NaN | \n", 443 | "S | \n", 444 | "
846 | \n", 447 | "847 | \n", 448 | "0 | \n", 449 | "3 | \n", 450 | "Sage, Mr. Douglas Bullen | \n", 451 | "male | \n", 452 | "NaN | \n", 453 | "8 | \n", 454 | "2 | \n", 455 | "CA. 2343 | \n", 456 | "69.55 | \n", 457 | "NaN | \n", 458 | "S | \n", 459 | "
863 | \n", 462 | "864 | \n", 463 | "0 | \n", 464 | "3 | \n", 465 | "Sage, Miss. Dorothy Edith \"Dolly\" | \n", 466 | "female | \n", 467 | "NaN | \n", 468 | "8 | \n", 469 | "2 | \n", 470 | "CA. 2343 | \n", 471 | "69.55 | \n", 472 | "NaN | \n", 473 | "S | \n", 474 | "