├── README.md
├── Step 1. EDA.ipynb
├── Step 2. Data Cleaning.ipynb
├── Step 3. Model Training.ipynb
├── Step 4. Scripting the Process.ipynb
├── Step 5. Model Deployment.ipynb
├── Step 6. Containerization.ipynb
├── Step 7. Cloud Deployment.ipynb
├── assets
├── allstate_banner-660x120.png
├── docker-process.drawio
├── docker-process.png
├── docker.png
├── ml-iceberg.jpg
└── tweet_eda.png
├── requirements.txt
├── scripts
├── Dockerfile
├── clean_data.py
├── data
│ ├── test.csv.gz
│ ├── test_cleaned.csv.gz
│ ├── train.csv.gz
│ └── train_cleaned.csv.gz
├── inference.py
├── model
│ └── model.bin
├── requirements.txt
└── train.py
└── virtual-env
├── Pipfile
├── Pipfile.lock
├── README.md
└── requirements.txt
/README.md:
--------------------------------------------------------------------------------
1 | # Machine Learning Workflow - From EDA to Production
2 |
3 | ## Introduction
4 |
5 | This repo studies and applies the minimal set of steps involved in a machine learning workflow, done the right way. It was compiled during the first cohort of the ["Machine Learning Zoomcamp"](https://datatalks.club/courses/2021-winter-ml-zoomcamp.html) course, taught by the amazing [@alexeygrigorev](https://github.com/alexeygrigorev).
6 |
7 |
8 |
9 | ## Problem Description
10 |
11 | The problem we will study was held as a Kaggle competition titled ["Allstate Claims Severity"](https://www.kaggle.com/c/allstate-claims-severity/). The data was provided by ["Allstate"](https://www.allstate.com/), a personal insurer in the United States. They were looking for ML-based methods to reduce the cost of insurance claims.
12 | The objective is to predict the _'loss'_ value for a claim, which makes it a __regression__ problem. Submissions for the test data are evaluated on the __Mean Absolute Error (MAE)__ between the predicted loss and the actual loss.
13 | All column names and values in the provided dataset are obfuscated for privacy reasons. Thus, we have no __"Domain Knowledge"__ of this problem.
14 |
15 |
16 |
17 |
52 |
53 |
\n", 87 | " | id | \n", 88 | "cat1 | \n", 89 | "cat2 | \n", 90 | "cat4 | \n", 91 | "cat5 | \n", 92 | "cat6 | \n", 93 | "cat8 | \n", 94 | "cat9 | \n", 95 | "cat10 | \n", 96 | "cat11 | \n", 97 | "... | \n", 98 | "cont5 | \n", 99 | "cont6 | \n", 100 | "cont7 | \n", 101 | "cont8 | \n", 102 | "cont9 | \n", 103 | "cont10 | \n", 104 | "cont11 | \n", 105 | "cont13 | \n", 106 | "cont14 | \n", 107 | "loss | \n", 108 | "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", 113 | "1 | \n", 114 | "A | \n", 115 | "B | \n", 116 | "B | \n", 117 | "A | \n", 118 | "A | \n", 119 | "A | \n", 120 | "B | \n", 121 | "A | \n", 122 | "B | \n", 123 | "... | \n", 124 | "0.310061 | \n", 125 | "0.718367 | \n", 126 | "0.335060 | \n", 127 | "0.30260 | \n", 128 | "0.67135 | \n", 129 | "0.83510 | \n", 130 | "0.569745 | \n", 131 | "0.822493 | \n", 132 | "0.714843 | \n", 133 | "2213.18 | \n", 134 | "
1 | \n", 137 | "2 | \n", 138 | "A | \n", 139 | "B | \n", 140 | "A | \n", 141 | "A | \n", 142 | "A | \n", 143 | "A | \n", 144 | "B | \n", 145 | "B | \n", 146 | "A | \n", 147 | "... | \n", 148 | "0.885834 | \n", 149 | "0.438917 | \n", 150 | "0.436585 | \n", 151 | "0.60087 | \n", 152 | "0.35127 | \n", 153 | "0.43919 | \n", 154 | "0.338312 | \n", 155 | "0.611431 | \n", 156 | "0.304496 | \n", 157 | "1283.60 | \n", 158 | "
2 | \n", 161 | "5 | \n", 162 | "A | \n", 163 | "B | \n", 164 | "A | \n", 165 | "B | \n", 166 | "A | \n", 167 | "A | \n", 168 | "B | \n", 169 | "B | \n", 170 | "B | \n", 171 | "... | \n", 172 | "0.397069 | \n", 173 | "0.289648 | \n", 174 | "0.315545 | \n", 175 | "0.27320 | \n", 176 | "0.26076 | \n", 177 | "0.32446 | \n", 178 | "0.381398 | \n", 179 | "0.195709 | \n", 180 | "0.774425 | \n", 181 | "3005.09 | \n", 182 | "
3 | \n", 185 | "10 | \n", 186 | "B | \n", 187 | "B | \n", 188 | "B | \n", 189 | "A | \n", 190 | "A | \n", 191 | "A | \n", 192 | "B | \n", 193 | "A | \n", 194 | "A | \n", 195 | "... | \n", 196 | "0.422268 | \n", 197 | "0.440945 | \n", 198 | "0.391128 | \n", 199 | "0.31796 | \n", 200 | "0.32128 | \n", 201 | "0.44467 | \n", 202 | "0.327915 | \n", 203 | "0.605077 | \n", 204 | "0.602642 | \n", 205 | "939.85 | \n", 206 | "
4 | \n", 209 | "11 | \n", 210 | "A | \n", 211 | "B | \n", 212 | "B | \n", 213 | "A | \n", 214 | "A | \n", 215 | "A | \n", 216 | "B | \n", 217 | "B | \n", 218 | "A | \n", 219 | "... | \n", 220 | "0.704268 | \n", 221 | "0.178193 | \n", 222 | "0.247408 | \n", 223 | "0.24564 | \n", 224 | "0.22089 | \n", 225 | "0.21230 | \n", 226 | "0.204687 | \n", 227 | "0.246011 | \n", 228 | "0.432606 | \n", 229 | "2763.85 | \n", 230 | "
5 rows × 120 columns
\n", 234 | "ColumnTransformer(transformers=[('num',\n", 629 | " Pipeline(steps=[('imputer', SimpleImputer()),\n", 630 | " ('normalizer', MinMaxScaler()),\n", 631 | " ('standardizer',\n", 632 | " StandardScaler())]),\n", 633 | " ['cont1', 'cont2', 'cont3', 'cont4', 'cont5',\n", 634 | " 'cont6', 'cont7', 'cont8', 'cont9', 'cont10',\n", 635 | " 'cont11', 'cont13', 'cont14']),\n", 636 | " ('cat1',\n", 637 | " Pipeline(steps=[('imputer',\n", 638 | " SimpleImputer(strategy='most_frequent')),\n", 639 | " ('ordinal',\n", 640 | " Ord...\n", 641 | " SimpleImputer(fill_value='missing',\n", 642 | " strategy='constant')),\n", 643 | " ('onehot',\n", 644 | " OneHotEncoder(handle_unknown='ignore',\n", 645 | " sparse=False))]),\n", 646 | " ['cat1', 'cat2', 'cat4', 'cat5', 'cat6',\n", 647 | " 'cat8', 'cat9', 'cat10', 'cat11', 'cat12',\n", 648 | " 'cat13', 'cat14', 'cat16', 'cat17', 'cat18',\n", 649 | " 'cat19', 'cat20', 'cat21', 'cat23', 'cat24',\n", 650 | " 'cat25', 'cat26', 'cat27', 'cat28', 'cat29',\n", 651 | " 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', ...])])
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont13', 'cont14']
SimpleImputer()
MinMaxScaler()
StandardScaler()
['cat99', 'cat100', 'cat101', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113', 'cat114', 'cat115', 'cat116']
SimpleImputer(strategy='most_frequent')
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
MinMaxScaler()
['cat1', 'cat2', 'cat4', 'cat5', 'cat6', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat65', 'cat66', 'cat67', 'cat69', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat102']
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
\n",
14 | " \n",
15 | "
\n",
25 | " \n",
26 | "