"""Train a RandomForest income classifier on the UCI Adult dataset.

Reads ./train.csv (adult.data) and ./test.csv (adult.test), builds a
FeatureUnion that one-hot encodes each categorical column and passes the
selected numeric columns through, fits a RandomForestClassifier, saves
the fitted end-to-end pipeline to model.joblib, and prints a
classification report on the held-out test split.
"""
import pandas as pd
import json
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

COLUMNS = (
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income-level')

CATEGORICAL_COLUMNS = (
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country')

# adult.data carries no header row; adult.test has one leading junk line
# (hence skiprows=1) and its labels end with a period (' >50K.').
with open('./train.csv', 'r') as f:
    train_data = pd.read_csv(f, header=None, names=COLUMNS)

with open('./test.csv', 'r') as f:
    test_data = pd.read_csv(f, names=COLUMNS, skiprows=1)

# Raw CSV string values keep their leading space, so the label match
# strings below start with ' ' on purpose.
x_train = train_data.drop('income-level', axis=1).values
y_train = (train_data['income-level'] == ' >50K').values

x_test = test_data.drop('income-level', axis=1).values
y_test = (test_data['income-level'] == ' >50K.').values

feature_columns = COLUMNS[:-1]

categorical_pipelines = []
for idx, column_name in enumerate(feature_columns):
    if column_name not in CATEGORICAL_COLUMNS:
        continue
    # Repurpose SelectKBest as a single-column extractor: hand-write a
    # score vector with a lone 1 at this column so k=1 selects exactly it.
    # Example: scores = [0, 1, 0, ...] pulls out 'workclass', returning
    # e.g. [['State-gov']] for each row.
    selector = SelectKBest(k=1)
    selector.scores_ = [1 if j == idx else 0 for j in range(len(feature_columns))]
    # One-hot encode the extracted column's string categories.
    binarizer = LabelBinarizer()
    binarizer.fit(selector.transform(x_train))
    # Pipeline: extract the column, then binarize it.
    categorical_pipelines.append(
        ('categorical-{}'.format(idx), Pipeline([
            ('SKB-{}'.format(idx), selector),
            ('LBN-{}'.format(idx), binarizer)])))

# Pass the six numeric columns (age, fnlwgt, education-num, capital-gain,
# capital-loss, hours-per-week) through unchanged via the same trick.
numeric_selector = SelectKBest(k=6)
numeric_selector.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(('numerical', numeric_selector))
preprocess = FeatureUnion(categorical_pipelines)

# Fit the forest on the preprocessed training matrix, then bundle the
# preprocessing and the classifier into one deployable pipeline.
clf = RandomForestClassifier()
clf.fit(preprocess.transform(x_train), y_train)

pipeline = Pipeline([
    ('union', preprocess),
    ('classifier', clf)
])
joblib.dump(pipeline, 'model.joblib')

# Evaluate the saved pipeline end-to-end on the raw test features.
y_pred = pipeline.predict(x_test)
print(classification_report(y_test, y_pred))
Income Classification with Random Forest/income classification with random forest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import json\n", 11 | "import joblib\n", 12 | "\n", 13 | "from sklearn.ensemble import RandomForestClassifier\n", 14 | "from sklearn.feature_selection import SelectKBest\n", 15 | "from sklearn.pipeline import FeatureUnion\n", 16 | "from sklearn.pipeline import Pipeline\n", 17 | "from sklearn.preprocessing import LabelBinarizer\n", 18 | "from sklearn.metrics import classification_report" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Data" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# ! curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data --output train.csv\n", 35 | "# ! 
curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test --output test.csv" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "hide_input": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "x_train: (32561, 14)\n", 50 | "y_train: (32561,)\n", 51 | "x_test: (16281, 14)\n", 52 | "y_test: (16281,)\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "COLUMNS = (\n", 58 | " 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',\n", 59 | " 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income-level')\n", 60 | "\n", 61 | "CATEGORICAL_COLUMNS = (\n", 62 | " 'workclass', 'education', 'marital-status', 'occupation',\n", 63 | " 'relationship', 'race', 'sex', 'native-country')\n", 64 | "\n", 65 | "with open('./train.csv', 'r') as f:\n", 66 | " train_data = pd.read_csv(f, header=None, names=COLUMNS)\n", 67 | "\n", 68 | "with open('./test.csv', 'r') as f:\n", 69 | " test_data = pd.read_csv(f, names=COLUMNS, skiprows=1)\n", 70 | "\n", 71 | "x_train = train_data.drop('income-level', axis=1).values\n", 72 | "y_train = (train_data['income-level'] == ' >50K').values\n", 73 | "\n", 74 | "x_test = test_data.drop('income-level', axis=1).values\n", 75 | "y_test = (test_data['income-level'] == ' >50K.').values\n", 76 | "\n", 77 | "print('x_train:', x_train.shape)\n", 78 | "print('y_train:', y_train.shape)\n", 79 | "print('x_test:', x_test.shape)\n", 80 | "print('y_test:', y_test.shape)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Preprocessing" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "categorical_pipelines = []\n", 97 | "\n", 98 | "for i, col in enumerate(COLUMNS[:-1]):\n", 99 | " if col in CATEGORICAL_COLUMNS:\n", 100 | " 
# Create a scores array to get the individual categorical column.\n", 101 | " # Example:\n", 102 | " # data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',\n", 103 | " # 'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']\n", 104 | " # scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", 105 | " #\n", 106 | " # Returns: [['State-gov']]\n", 107 | " # Build the scores array\n", 108 | " scores = [0] * len(COLUMNS[:-1])\n", 109 | " # This column is the categorical column you want to extract.\n", 110 | " scores[i] = 1\n", 111 | " skb = SelectKBest(k=1)\n", 112 | " skb.scores_ = scores\n", 113 | " # Convert the categorical column to a numerical value\n", 114 | " lbn = LabelBinarizer()\n", 115 | " r = skb.transform(x_train)\n", 116 | " lbn.fit(r)\n", 117 | " # Create the pipeline to extract the categorical feature\n", 118 | " categorical_pipelines.append(\n", 119 | " ('categorical-{}'.format(i), Pipeline([\n", 120 | " ('SKB-{}'.format(i), skb),\n", 121 | " ('LBN-{}'.format(i), lbn)])))\n", 122 | "\n", 123 | "# Create pipeline to extract the numerical features\n", 124 | "skb = SelectKBest(k=6)\n", 125 | "skb.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]\n", 126 | "categorical_pipelines.append(('numerical', skb))\n", 127 | "preprocess = FeatureUnion(categorical_pipelines)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "# Model & Train" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "['model.joblib']" 146 | ] 147 | }, 148 | "execution_count": 5, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "clf = RandomForestClassifier()\n", 155 | "clf.fit(preprocess.transform(x_train), y_train)\n", 156 | "\n", 157 | "# Create the overall model as a single pipeline\n", 158 | "pipeline = Pipeline([\n", 159 | 
" ('union', preprocess),\n", 160 | " ('classifier', clf)\n", 161 | "])\n", 162 | "joblib.dump(pipeline, 'model.joblib')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Prediction" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 6, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | " precision recall f1-score support\n", 182 | "\n", 183 | " False 0.88 0.93 0.91 12435\n", 184 | " True 0.72 0.61 0.66 3846\n", 185 | "\n", 186 | " accuracy 0.85 16281\n", 187 | " macro avg 0.80 0.77 0.78 16281\n", 188 | "weighted avg 0.85 0.85 0.85 16281\n", 189 | "\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "y_pred = pipeline.predict(x_test)\n", 195 | "print(classification_report(y_test, y_pred))" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.6.9" 216 | }, 217 | "toc": { 218 | "base_numbering": 1, 219 | "nav_menu": {}, 220 | "number_sections": true, 221 | "sideBar": true, 222 | "skip_h1_title": false, 223 | "title_cell": "Table of Contents", 224 | "title_sidebar": "Contents", 225 | "toc_cell": false, 226 | "toc_position": { 227 | "height": "calc(100% - 180px)", 228 | "left": "10px", 229 | "top": "150px", 230 | "width": "345.188px" 231 | }, 232 | "toc_section_display": true, 233 | "toc_window_display": false 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 4 238 | } 239 | --------------------------------------------------------------------------------