"""Train a RandomForest income classifier on the UCI Adult dataset.

Reads ./train.csv (adult.data) and ./test.csv (adult.test), builds a
FeatureUnion that one-hot encodes each categorical column and passes the
selected numeric columns through, fits a RandomForestClassifier, saves
the fitted end-to-end pipeline to model.joblib, and prints a
classification report on the held-out test split.
"""
import pandas as pd
import json
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

COLUMNS = (
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income-level')

CATEGORICAL_COLUMNS = (
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country')

# adult.data carries no header row; adult.test has one leading junk line
# (hence skiprows=1) and its labels end with a period (' >50K.').
with open('./train.csv', 'r') as f:
    train_data = pd.read_csv(f, header=None, names=COLUMNS)

with open('./test.csv', 'r') as f:
    test_data = pd.read_csv(f, names=COLUMNS, skiprows=1)

# Raw CSV string values keep their leading space, so the label match
# strings below start with ' ' on purpose.
x_train = train_data.drop('income-level', axis=1).values
y_train = (train_data['income-level'] == ' >50K').values

x_test = test_data.drop('income-level', axis=1).values
y_test = (test_data['income-level'] == ' >50K.').values

feature_columns = COLUMNS[:-1]

categorical_pipelines = []
for idx, column_name in enumerate(feature_columns):
    if column_name not in CATEGORICAL_COLUMNS:
        continue
    # Repurpose SelectKBest as a single-column extractor: hand-write a
    # score vector with a lone 1 at this column so k=1 selects exactly it.
    # Example: scores = [0, 1, 0, ...] pulls out 'workclass', returning
    # e.g. [['State-gov']] for each row.
    selector = SelectKBest(k=1)
    selector.scores_ = [1 if j == idx else 0 for j in range(len(feature_columns))]
    # One-hot encode the extracted column's string categories.
    binarizer = LabelBinarizer()
    binarizer.fit(selector.transform(x_train))
    # Pipeline: extract the column, then binarize it.
    categorical_pipelines.append(
        ('categorical-{}'.format(idx), Pipeline([
            ('SKB-{}'.format(idx), selector),
            ('LBN-{}'.format(idx), binarizer)])))

# Pass the six numeric columns (age, fnlwgt, education-num, capital-gain,
# capital-loss, hours-per-week) through unchanged via the same trick.
numeric_selector = SelectKBest(k=6)
numeric_selector.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(('numerical', numeric_selector))
preprocess = FeatureUnion(categorical_pipelines)

# Fit the forest on the preprocessed training matrix, then bundle the
# preprocessing and the classifier into one deployable pipeline.
clf = RandomForestClassifier()
clf.fit(preprocess.transform(x_train), y_train)

pipeline = Pipeline([
    ('union', preprocess),
    ('classifier', clf)
])
joblib.dump(pipeline, 'model.joblib')

# Evaluate the saved pipeline end-to-end on the raw test features.
y_pred = pipeline.predict(x_test)
print(classification_report(y_test, y_pred))
Income Classification with Random Forest/income classification with random forest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import json\n", 11 | "import joblib\n", 12 | "\n", 13 | "from sklearn.ensemble import RandomForestClassifier\n", 14 | "from sklearn.feature_selection import SelectKBest\n", 15 | "from sklearn.pipeline import FeatureUnion\n", 16 | "from sklearn.pipeline import Pipeline\n", 17 | "from sklearn.preprocessing import LabelBinarizer\n", 18 | "from sklearn.metrics import classification_report" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Data" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# ! curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data --output train.csv\n", 35 | "# ! 
curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test --output test.csv" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "hide_input": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "x_train: (32561, 14)\n", 50 | "y_train: (32561,)\n", 51 | "x_test: (16281, 14)\n", 52 | "y_test: (16281,)\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "COLUMNS = (\n", 58 | " 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',\n", 59 | " 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income-level')\n", 60 | "\n", 61 | "CATEGORICAL_COLUMNS = (\n", 62 | " 'workclass', 'education', 'marital-status', 'occupation',\n", 63 | " 'relationship', 'race', 'sex', 'native-country')\n", 64 | "\n", 65 | "with open('./train.csv', 'r') as f:\n", 66 | " train_data = pd.read_csv(f, header=None, names=COLUMNS)\n", 67 | "\n", 68 | "with open('./test.csv', 'r') as f:\n", 69 | " test_data = pd.read_csv(f, names=COLUMNS, skiprows=1)\n", 70 | "\n", 71 | "x_train = train_data.drop('income-level', axis=1).values\n", 72 | "y_train = (train_data['income-level'] == ' >50K').values\n", 73 | "\n", 74 | "x_test = test_data.drop('income-level', axis=1).values\n", 75 | "y_test = (test_data['income-level'] == ' >50K.').values\n", 76 | "\n", 77 | "print('x_train:', x_train.shape)\n", 78 | "print('y_train:', y_train.shape)\n", 79 | "print('x_test:', x_test.shape)\n", 80 | "print('y_test:', y_test.shape)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Preprocessing" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "categorical_pipelines = []\n", 97 | "\n", 98 | "for i, col in enumerate(COLUMNS[:-1]):\n", 99 | " if col in CATEGORICAL_COLUMNS:\n", 100 | " 
# Create a scores array to get the individual categorical column.\n", 101 | " # Example:\n", 102 | " # data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',\n", 103 | " # 'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']\n", 104 | " # scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", 105 | " #\n", 106 | " # Returns: [['State-gov']]\n", 107 | " # Build the scores array\n", 108 | " scores = [0] * len(COLUMNS[:-1])\n", 109 | " # This column is the categorical column you want to extract.\n", 110 | " scores[i] = 1\n", 111 | " skb = SelectKBest(k=1)\n", 112 | " skb.scores_ = scores\n", 113 | " # Convert the categorical column to a numerical value\n", 114 | " lbn = LabelBinarizer()\n", 115 | " r = skb.transform(x_train)\n", 116 | " lbn.fit(r)\n", 117 | " # Create the pipeline to extract the categorical feature\n", 118 | " categorical_pipelines.append(\n", 119 | " ('categorical-{}'.format(i), Pipeline([\n", 120 | " ('SKB-{}'.format(i), skb),\n", 121 | " ('LBN-{}'.format(i), lbn)])))\n", 122 | "\n", 123 | "# Create pipeline to extract the numerical features\n", 124 | "skb = SelectKBest(k=6)\n", 125 | "skb.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]\n", 126 | "categorical_pipelines.append(('numerical', skb))\n", 127 | "preprocess = FeatureUnion(categorical_pipelines)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "# Model & Train" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "['model.joblib']" 146 | ] 147 | }, 148 | "execution_count": 5, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "clf = RandomForestClassifier()\n", 155 | "clf.fit(preprocess.transform(x_train), y_train)\n", 156 | "\n", 157 | "# Create the overall model as a single pipeline\n", 158 | "pipeline = Pipeline([\n", 159 | 
" ('union', preprocess),\n", 160 | " ('classifier', clf)\n", 161 | "])\n", 162 | "joblib.dump(pipeline, 'model.joblib')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Prediction" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 6, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | " precision recall f1-score support\n", 182 | "\n", 183 | " False 0.88 0.93 0.91 12435\n", 184 | " True 0.72 0.61 0.66 3846\n", 185 | "\n", 186 | " accuracy 0.85 16281\n", 187 | " macro avg 0.80 0.77 0.78 16281\n", 188 | "weighted avg 0.85 0.85 0.85 16281\n", 189 | "\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "y_pred = pipeline.predict(x_test)\n", 195 | "print(classification_report(y_test, y_pred))" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.6.9" 216 | }, 217 | "toc": { 218 | "base_numbering": 1, 219 | "nav_menu": {}, 220 | "number_sections": true, 221 | "sideBar": true, 222 | "skip_h1_title": false, 223 | "title_cell": "Table of Contents", 224 | "title_sidebar": "Contents", 225 | "toc_cell": false, 226 | "toc_position": { 227 | "height": "calc(100% - 180px)", 228 | "left": "10px", 229 | "top": "150px", 230 | "width": "345.188px" 231 | }, 232 | "toc_section_display": true, 233 | "toc_window_display": false 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 4 238 | } 239 | --------------------------------------------------------------------------------