├── .github
│   └── workflows
│       └── python-package.yml
├── .gitignore
├── 1_Demo_Data_Explore.ipynb
├── 2.1_Demo_Missing_Data.ipynb
├── 2.2_Demo_Outlier.ipynb
├── 2.3_Demo_Rare_Values.ipynb
├── 3.1_Demo_Feature_Scaling.ipynb
├── 3.2_Demo_Discretisation.ipynb
├── 3.3_Demo_Feature_Encoding.ipynb
├── 3.4_Demo_Feature_Transformation.ipynb
├── 3.5_Demo_Feature_Generation.ipynb
├── 4.1_Demo_Feature_Selection_Filter.ipynb
├── 4.2_Demo_Feature_Selection_Wrapper.ipynb
├── 4.3_Demo_Feature_Selection_Embedded.ipynb
├── 4.4_Demo_Feature_Selection_Feature_Shuffling.ipynb
├── 4.5_Demo_Feature_Selection_Hybrid_method.ipynb
├── A Short Guide for Feature Engineering and Feature Selection.md
├── A Short Guide for Feature Engineering and Feature Selection.pdf
├── README.md
├── data
│   ├── housing.data.txt
│   ├── pima-indians-diabetes.data.csv
│   └── titanic.csv
├── data_exploration
│   └── explore.py
├── feature_cleaning
│   ├── missing_data.py
│   ├── outlier.py
│   └── rare_values.py
├── feature_engineering
│   ├── discretization.py
│   ├── encoding.py
│   └── transformation.py
├── feature_selection
│   ├── embedded_method.py
│   ├── feature_shuffle.py
│   ├── filter_method.py
│   └── hybrid.py
├── images
│   ├── 001.png
│   ├── IV.png
│   ├── box-cox.png
│   ├── embedded.png
│   ├── featuretools.png
│   ├── filter.png
│   ├── scaling.png
│   ├── sphx_glr_plot_map_data_to_normal_001.png
│   ├── workflow2.png
│   └── wrapper.png
└── output
    ├── Barplot_Pclass_Survived.png
    ├── Boxplot_Pclass_Fare.png
    ├── Corr_plot.png
    ├── Countplot_Pclass.png
    ├── Distplot_Fare.png
    ├── Heatmap.png
    ├── Scatter_plot_Fare_Pclass.png
    ├── describe.csv
    └── missing.csv
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ "master" ]
9 | pull_request:
10 | branches: [ "master" ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | fail-fast: false
18 | matrix:
19 | python-version: ["3.9", "3.10", "3.11"]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | python -m pip install flake8 pytest
31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 | - name: Lint with flake8
33 | run: |
34 | # stop the build if there are Python syntax errors or undefined names
35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 | - name: Test with pytest
39 | run: |
40 | pytest
41 |
--------------------------------------------------------------------------------
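The same lint-and-test sequence can be reproduced locally before pushing. A minimal sketch in Python (an illustration, not part of the repo; assumes flake8 and pytest are installed in the active environment):

```python
# Local mirror of the CI steps above: a strict flake8 gate, a report-only
# flake8 pass, then pytest. A sketch; assumes flake8 and pytest are installed.
import subprocess
import sys

# stop on Python syntax errors or undefined names, like the CI gate
gate = subprocess.run(["flake8", ".", "--count", "--select=E9,F63,F7,F82",
                       "--show-source", "--statistics"])
if gate.returncode != 0:
    sys.exit(gate.returncode)

# style pass: report everything but never fail (mirrors --exit-zero)
subprocess.run(["flake8", ".", "--count", "--exit-zero", "--max-complexity=10",
                "--max-line-length=127", "--statistics"])

sys.exit(subprocess.run(["pytest"]).returncode)
```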
/.gitignore:
--------------------------------------------------------------------------------
1 | rule_extraction 20181014.py
2 | __pycache__
3 | .ipynb_checkpoints
4 | .gitignore.bak
5 | history
6 | README_bk.md
7 | A Short Guide for Feature Engineering and Feature Selection.docx
8 | A Short Guide for Feature Engineering and Feature Selection.html
9 |
--------------------------------------------------------------------------------
/2.3_Demo_Rare_Values.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "# plt.style.use('seaborn-colorblind')\n",
17 | "# %matplotlib inline\n",
18 | "from feature_cleaning import rare_values as ra"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Load Dataset"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "Variable Pclass label proportion:\n",
38 | "3 0.551066\n",
39 | "1 0.242424\n",
40 | "2 0.206510\n",
41 | "Name: Pclass, dtype: float64\n",
42 | "Variable SibSp label proportion:\n",
43 | "0 0.682379\n",
44 | "1 0.234568\n",
45 | "2 0.031425\n",
46 | "4 0.020202\n",
47 | "3 0.017957\n",
48 | "8 0.007856\n",
49 | "5 0.005612\n",
50 | "Name: SibSp, dtype: float64\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "use_cols = [\n",
56 | " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
57 | " 'Survived'\n",
58 | "]\n",
59 | "\n",
60 | "# see column Pclass & SibSp's distributions\n",
61 | "# SibSp has values 3/8/5 that occur rarely, under 2%\n",
62 | "# Pclass has 3 values, but no one is under 20%\n",
63 | "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
64 | "for i in ['Pclass','SibSp']:\n",
65 | " print('Variable',i,'label proportion:')\n",
66 | " print(data[i].value_counts()/len(data))"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Grouping into one new category\n",
74 | "Grouping the observations that show rare labels into a unique category ('rare')"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 3,
80 | "metadata": {
81 | "collapsed": true
82 | },
83 | "outputs": [],
84 | "source": [
85 | "# create the encoder and fit with our data\n",
86 | "enc = ra.GroupingRareValues(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 4,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "name": "stdout",
96 | "output_type": "stream",
97 | "text": [
98 | "[{'col': 'Pclass', 'mapping': 3 3\n",
99 | "1 1\n",
100 | "2 2\n",
101 | "dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n",
102 | "1 1\n",
103 | "2 2\n",
104 | "4 4\n",
105 | "3 3\n",
106 | "8 rare\n",
107 | "5 rare\n",
108 | "dtype: object, 'data_type': dtype('int64')}]\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "# let's see the mapping\n",
114 | "# for SibSp, values 5 & 8 are encoded as 'rare' as they appear less than 10%\n",
115 | "# for Pclass, nothing changed\n",
116 | "print(enc.mapping)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 5,
122 | "metadata": {
123 | "collapsed": true
124 | },
125 | "outputs": [],
126 | "source": [
127 | "# perform transformation\n",
128 | "data2 = enc.transform(data)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 6,
134 | "metadata": {},
135 | "outputs": [
136 | {
137 | "name": "stdout",
138 | "output_type": "stream",
139 | "text": [
140 | "0 608\n",
141 | "1 209\n",
142 | "2 28\n",
143 | "4 18\n",
144 | "3 16\n",
145 | "rare 12\n",
146 | "Name: SibSp, dtype: int64\n"
147 | ]
148 | }
149 | ],
150 | "source": [
151 | "# check the result\n",
152 | "print(data2.SibSp.value_counts())"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "## Mode Imputation\n",
160 | "Replacing the rare label by most frequent label"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 7,
166 | "metadata": {
167 | "collapsed": true
168 | },
169 | "outputs": [],
170 | "source": [
171 | "# create the encoder and fit with our data\n",
172 | "enc = ra.ModeImputation(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 8,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "[{'col': 'Pclass', 'mapping': 3 3\n",
185 | "1 1\n",
186 | "2 2\n",
187 | "dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n",
188 | "1 1\n",
189 | "2 2\n",
190 | "4 4\n",
191 | "3 3\n",
192 | "8 0\n",
193 | "5 0\n",
194 | "dtype: int64, 'data_type': dtype('int64')}]\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "# let's see the mapping\n",
200 | "# for SibSp, values 5 & 8 are encoded as 0, as label 0 is the most frequent label\n",
201 | "# for Pclass, nothing changed\n",
202 | "print(enc.mapping)"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 9,
208 | "metadata": {
209 | "collapsed": true
210 | },
211 | "outputs": [],
212 | "source": [
213 | "# perform transformation\n",
214 | "data3 = enc.transform(data)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 10,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "name": "stdout",
224 | "output_type": "stream",
225 | "text": [
226 | "0 620\n",
227 | "1 209\n",
228 | "2 28\n",
229 | "4 18\n",
230 | "3 16\n",
231 | "Name: SibSp, dtype: int64\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "# check the result\n",
237 | "print(data3.SibSp.value_counts())"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {
244 | "collapsed": true
245 | },
246 | "outputs": [],
247 | "source": []
248 | }
249 | ],
250 | "metadata": {
251 | "kernelspec": {
252 | "display_name": "Python 3",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.6.1"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 2
271 | }
272 |
--------------------------------------------------------------------------------
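The two encoders above live in the repo's feature_cleaning/rare_values.py. For readers without the module, a minimal stand-alone sketch of the same two strategies (pandas only; the helper names group_rare and impute_with_mode are illustrative, not the repo's API):

```python
# Stand-alone sketch of the two rare-label strategies demonstrated above.
# Illustrative only -- not the repo's feature_cleaning/rare_values.py API.
import pandas as pd

def group_rare(series: pd.Series, threshold: float = 0.01) -> pd.Series:
    """Replace labels rarer than `threshold` with the category 'rare'."""
    freq = series.value_counts(normalize=True)
    rare_labels = freq[freq < threshold].index
    return series.where(~series.isin(rare_labels), 'rare')

def impute_with_mode(series: pd.Series, threshold: float = 0.01) -> pd.Series:
    """Replace labels rarer than `threshold` with the most frequent label."""
    freq = series.value_counts(normalize=True)
    rare_labels = freq[freq < threshold].index
    return series.where(~series.isin(rare_labels), series.mode()[0])

data = pd.read_csv('./data/titanic.csv')
print(group_rare(data['SibSp']).value_counts())        # 8 and 5 become 'rare'
print(impute_with_mode(data['SibSp']).value_counts())  # 8 and 5 become 0
```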
/3.1_Demo_Feature_Scaling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "\n",
18 | "# plt.style.use('seaborn-colorblind')\n",
19 | "# %matplotlib inline\n",
20 | "#from feature_cleaning import rare_values as ra"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Load Dataset"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "use_cols = [\n",
39 | " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
40 | " 'Survived'\n",
41 | "]\n",
42 | "\n",
43 | "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/html": [
54 | "
\n",
55 | "\n",
68 | "
\n",
69 | " \n",
70 | " \n",
71 | " | \n",
72 | " Survived | \n",
73 | " Pclass | \n",
74 | " Sex | \n",
75 | " Age | \n",
76 | " SibSp | \n",
77 | " Fare | \n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " 0 | \n",
83 | " 0 | \n",
84 | " 3 | \n",
85 | " male | \n",
86 | " 22.0 | \n",
87 | " 1 | \n",
88 | " 7.2500 | \n",
89 | "
\n",
90 | " \n",
91 | " 1 | \n",
92 | " 1 | \n",
93 | " 1 | \n",
94 | " female | \n",
95 | " 38.0 | \n",
96 | " 1 | \n",
97 | " 71.2833 | \n",
98 | "
\n",
99 | " \n",
100 | " 2 | \n",
101 | " 1 | \n",
102 | " 3 | \n",
103 | " female | \n",
104 | " 26.0 | \n",
105 | " 0 | \n",
106 | " 7.9250 | \n",
107 | "
\n",
108 | " \n",
109 | "
\n",
110 | "
"
111 | ],
112 | "text/plain": [
113 | " Survived Pclass Sex Age SibSp Fare\n",
114 | "0 0 3 male 22.0 1 7.2500\n",
115 | "1 1 1 female 38.0 1 71.2833\n",
116 | "2 1 3 female 26.0 0 7.9250"
117 | ]
118 | },
119 | "execution_count": 3,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "data.head(3)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 4,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "data": {
135 | "text/plain": [
136 | "((623, 6), (268, 6))"
137 | ]
138 | },
139 | "execution_count": 4,
140 | "metadata": {},
141 | "output_type": "execute_result"
142 | }
143 | ],
144 | "source": [
145 | "# Note that we include target variable in the X_train \n",
146 | "# because we need it to supervise our discretization\n",
147 | "# this is not the standard way of using train-test-split\n",
148 | "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
149 | " random_state=0)\n",
150 | "X_train.shape, X_test.shape"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Normalization - Standardization (Z-score scaling)\n",
158 | "\n",
159 | "removes the mean and scales the data to unit variance.
z = (X - X.mean) / std"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 5,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "name": "stdout",
169 | "output_type": "stream",
170 | "text": [
171 | " Survived Pclass Sex Age SibSp Fare Fare_zscore\n",
172 | "857 1 1 male 51.0 0 26.5500 -0.122530\n",
173 | "52 1 1 female 49.0 1 76.7292 0.918124\n",
174 | "386 0 3 male 1.0 5 46.9000 0.299503\n",
175 | "124 0 1 male 54.0 0 77.2875 0.929702\n",
176 | "578 0 3 female NaN 1 14.4583 -0.373297\n",
177 | "549 1 2 male 8.0 1 36.7500 0.089005\n"
178 | ]
179 | }
180 | ],
181 | "source": [
182 | "# add the new created feature\n",
183 | "from sklearn.preprocessing import StandardScaler\n",
184 | "ss = StandardScaler().fit(X_train[['Fare']])\n",
185 | "X_train_copy = X_train.copy(deep=True)\n",
186 | "X_train_copy['Fare_zscore'] = ss.transform(X_train_copy[['Fare']])\n",
187 | "print(X_train_copy.head(6))"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 6,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "5.916437306188636e-17\n",
200 | "1.0008035356861\n"
201 | ]
202 | }
203 | ],
204 | "source": [
205 | "# check if it is with mean=0 std=1\n",
206 | "print(X_train_copy['Fare_zscore'].mean())\n",
207 | "print(X_train_copy['Fare_zscore'].std())\n"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "## Min-Max scaling\n",
215 | "transforms features by scaling each feature to a given range. Default to [0,1].
X_scaled = (X - X.min / (X.max - X.min)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 7,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | " Survived Pclass Sex Age SibSp Fare Fare_minmax\n",
228 | "857 1 1 male 51.0 0 26.5500 0.051822\n",
229 | "52 1 1 female 49.0 1 76.7292 0.149765\n",
230 | "386 0 3 male 1.0 5 46.9000 0.091543\n",
231 | "124 0 1 male 54.0 0 77.2875 0.150855\n",
232 | "578 0 3 female NaN 1 14.4583 0.028221\n",
233 | "549 1 2 male 8.0 1 36.7500 0.071731\n"
234 | ]
235 | }
236 | ],
237 | "source": [
238 | "# add the new created feature\n",
239 | "from sklearn.preprocessing import MinMaxScaler\n",
240 | "mms = MinMaxScaler().fit(X_train[['Fare']])\n",
241 | "X_train_copy = X_train.copy(deep=True)\n",
242 | "X_train_copy['Fare_minmax'] = mms.transform(X_train_copy[['Fare']])\n",
243 | "print(X_train_copy.head(6))"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 8,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "1.0\n",
256 | "0.0\n"
257 | ]
258 | }
259 | ],
260 | "source": [
261 | "# check the range of Fare_minmax\n",
262 | "print(X_train_copy['Fare_minmax'].max())\n",
263 | "print(X_train_copy['Fare_minmax'].min())"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "source": [
272 | "## Robust scaling\n",
273 | "removes the median and scales the data according to the quantile range (defaults to IQR)
X_scaled = (X - X.median) / IQR"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 9,
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | " Survived Pclass Sex Age SibSp Fare Fare_robust\n",
286 | "857 1 1 male 51.0 0 26.5500 0.492275\n",
287 | "52 1 1 female 49.0 1 76.7292 2.630973\n",
288 | "386 0 3 male 1.0 5 46.9000 1.359616\n",
289 | "124 0 1 male 54.0 0 77.2875 2.654768\n",
290 | "578 0 3 female NaN 1 14.4583 -0.023088\n",
291 | "549 1 2 male 8.0 1 36.7500 0.927011\n"
292 | ]
293 | }
294 | ],
295 | "source": [
296 | "# add the new created feature\n",
297 | "from sklearn.preprocessing import RobustScaler\n",
298 | "rs = RobustScaler().fit(X_train[['Fare']])\n",
299 | "X_train_copy = X_train.copy(deep=True)\n",
300 | "X_train_copy['Fare_robust'] = rs.transform(X_train_copy[['Fare']])\n",
301 | "print(X_train_copy.head(6))"
302 | ]
303 | }
304 | ],
305 | "metadata": {
306 | "kernelspec": {
307 | "display_name": "Python 3",
308 | "language": "python",
309 | "name": "python3"
310 | },
311 | "language_info": {
312 | "codemirror_mode": {
313 | "name": "ipython",
314 | "version": 3
315 | },
316 | "file_extension": ".py",
317 | "mimetype": "text/x-python",
318 | "name": "python",
319 | "nbconvert_exporter": "python",
320 | "pygments_lexer": "ipython3",
321 | "version": "3.6.1"
322 | }
323 | },
324 | "nbformat": 4,
325 | "nbformat_minor": 2
326 | }
327 |
--------------------------------------------------------------------------------
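The three scalers above can be cross-checked with plain pandas arithmetic. A minimal sketch of all three formulas (note that StandardScaler uses the population standard deviation, hence ddof=0):

```python
# Hand-rolled versions of the three scalings above, for cross-checking
# against StandardScaler, MinMaxScaler and RobustScaler.
import pandas as pd

fare = pd.read_csv('./data/titanic.csv')['Fare']

z_score = (fare - fare.mean()) / fare.std(ddof=0)          # StandardScaler
min_max = (fare - fare.min()) / (fare.max() - fare.min())  # MinMaxScaler
iqr = fare.quantile(0.75) - fare.quantile(0.25)
robust = (fare - fare.median()) / iqr                      # RobustScaler

print(z_score.mean(), z_score.std())   # ~0 and ~1
print(min_max.min(), min_max.max())    # 0.0 and 1.0
```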
/3.3_Demo_Feature_Encoding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "\n",
18 | "import category_encoders as ce\n",
19 | "from feature_engineering import encoding\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Load Dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/html": [
37 | "\n",
38 | "\n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " | \n",
55 | " Survived | \n",
56 | " Pclass | \n",
57 | " Sex | \n",
58 | " Age | \n",
59 | " SibSp | \n",
60 | " Fare | \n",
61 | "
\n",
62 | " \n",
63 | " \n",
64 | " \n",
65 | " 0 | \n",
66 | " 0 | \n",
67 | " 3 | \n",
68 | " male | \n",
69 | " 22.0 | \n",
70 | " 1 | \n",
71 | " 7.2500 | \n",
72 | "
\n",
73 | " \n",
74 | " 1 | \n",
75 | " 1 | \n",
76 | " 1 | \n",
77 | " female | \n",
78 | " 38.0 | \n",
79 | " 1 | \n",
80 | " 71.2833 | \n",
81 | "
\n",
82 | " \n",
83 | " 2 | \n",
84 | " 1 | \n",
85 | " 3 | \n",
86 | " female | \n",
87 | " 26.0 | \n",
88 | " 0 | \n",
89 | " 7.9250 | \n",
90 | "
\n",
91 | " \n",
92 | " 3 | \n",
93 | " 1 | \n",
94 | " 1 | \n",
95 | " female | \n",
96 | " 35.0 | \n",
97 | " 1 | \n",
98 | " 53.1000 | \n",
99 | "
\n",
100 | " \n",
101 | " 4 | \n",
102 | " 0 | \n",
103 | " 3 | \n",
104 | " male | \n",
105 | " 35.0 | \n",
106 | " 0 | \n",
107 | " 8.0500 | \n",
108 | "
\n",
109 | " \n",
110 | "
\n",
111 | "
"
112 | ],
113 | "text/plain": [
114 | " Survived Pclass Sex Age SibSp Fare\n",
115 | "0 0 3 male 22.0 1 7.2500\n",
116 | "1 1 1 female 38.0 1 71.2833\n",
117 | "2 1 3 female 26.0 0 7.9250\n",
118 | "3 1 1 female 35.0 1 53.1000\n",
119 | "4 0 3 male 35.0 0 8.0500"
120 | ]
121 | },
122 | "execution_count": 2,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "use_cols = [\n",
129 | " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
130 | " 'Survived'\n",
131 | "]\n",
132 | "\n",
133 | "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
134 | "data.head()"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 3,
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "data": {
144 | "text/plain": [
145 | "((623, 6), (268, 6))"
146 | ]
147 | },
148 | "execution_count": 3,
149 | "metadata": {},
150 | "output_type": "execute_result"
151 | }
152 | ],
153 | "source": [
154 | "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
155 | " random_state=0)\n",
156 | "X_train.shape, X_test.shape"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "## One-hot encoding\n",
164 | "replace the categorical variable by different boolean variables (0/1) to indicate whether or not certain label is true for that observation"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 4,
170 | "metadata": {
171 | "collapsed": true
172 | },
173 | "outputs": [],
174 | "source": [
175 | "data1 = pd.get_dummies(data,drop_first=True)"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 5,
181 | "metadata": {
182 | "scrolled": true
183 | },
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/html": [
188 | "\n",
189 | "\n",
202 | "
\n",
203 | " \n",
204 | " \n",
205 | " | \n",
206 | " Survived | \n",
207 | " Pclass | \n",
208 | " Age | \n",
209 | " SibSp | \n",
210 | " Fare | \n",
211 | " Sex_male | \n",
212 | "
\n",
213 | " \n",
214 | " \n",
215 | " \n",
216 | " 0 | \n",
217 | " 0 | \n",
218 | " 3 | \n",
219 | " 22.0 | \n",
220 | " 1 | \n",
221 | " 7.2500 | \n",
222 | " 1 | \n",
223 | "
\n",
224 | " \n",
225 | " 1 | \n",
226 | " 1 | \n",
227 | " 1 | \n",
228 | " 38.0 | \n",
229 | " 1 | \n",
230 | " 71.2833 | \n",
231 | " 0 | \n",
232 | "
\n",
233 | " \n",
234 | " 2 | \n",
235 | " 1 | \n",
236 | " 3 | \n",
237 | " 26.0 | \n",
238 | " 0 | \n",
239 | " 7.9250 | \n",
240 | " 0 | \n",
241 | "
\n",
242 | " \n",
243 | " 3 | \n",
244 | " 1 | \n",
245 | " 1 | \n",
246 | " 35.0 | \n",
247 | " 1 | \n",
248 | " 53.1000 | \n",
249 | " 0 | \n",
250 | "
\n",
251 | " \n",
252 | " 4 | \n",
253 | " 0 | \n",
254 | " 3 | \n",
255 | " 35.0 | \n",
256 | " 0 | \n",
257 | " 8.0500 | \n",
258 | " 1 | \n",
259 | "
\n",
260 | " \n",
261 | "
\n",
262 | "
"
263 | ],
264 | "text/plain": [
265 | " Survived Pclass Age SibSp Fare Sex_male\n",
266 | "0 0 3 22.0 1 7.2500 1\n",
267 | "1 1 1 38.0 1 71.2833 0\n",
268 | "2 1 3 26.0 0 7.9250 0\n",
269 | "3 1 1 35.0 1 53.1000 0\n",
270 | "4 0 3 35.0 0 8.0500 1"
271 | ]
272 | },
273 | "execution_count": 5,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "data1.head()"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "## Ordinal-encoding\n",
287 | "replace the labels by some ordinal number if ordinal is meaningful"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 6,
293 | "metadata": {
294 | "collapsed": true
295 | },
296 | "outputs": [],
297 | "source": [
298 | "ord_enc = ce.OrdinalEncoder(cols=['Sex']).fit(X_train,y_train)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 7,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "name": "stdout",
308 | "output_type": "stream",
309 | "text": [
310 | " Survived Pclass Sex Age SibSp Fare\n",
311 | "0 0 3 1 22.0 1 7.2500\n",
312 | "1 1 1 2 38.0 1 71.2833\n",
313 | "2 1 3 2 26.0 0 7.9250\n",
314 | "3 1 1 2 35.0 1 53.1000\n",
315 | "4 0 3 1 35.0 0 8.0500\n"
316 | ]
317 | }
318 | ],
319 | "source": [
320 | "data4 = ord_enc.transform(data)\n",
321 | "print(data4.head(5))"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "## Mean encoding\n",
329 | "replace the label by the mean of the target for that label. \n",
330 | "(the target must be 0/1 valued or continuous)\n"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 8,
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "data": {
340 | "text/plain": [
341 | "Sex\n",
342 | "female 0.753488\n",
343 | "male 0.196078\n",
344 | "Name: Survived, dtype: float64"
345 | ]
346 | },
347 | "execution_count": 8,
348 | "metadata": {},
349 | "output_type": "execute_result"
350 | }
351 | ],
352 | "source": [
353 | "# cross check-- the mean of target group by Sex\n",
354 | "X_train['Survived'].groupby(data['Sex']).mean()\n"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": 9,
360 | "metadata": {},
361 | "outputs": [],
362 | "source": [
363 | "mean_enc = encoding.MeanEncoding(cols=['Sex']).fit(X_train,y_train)"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 10,
369 | "metadata": {},
370 | "outputs": [
371 | {
372 | "name": "stdout",
373 | "output_type": "stream",
374 | "text": [
375 | " Survived Pclass Sex Age SibSp Fare\n",
376 | "0 0 3 0.196078 22.0 1 7.2500\n",
377 | "1 1 1 0.753488 38.0 1 71.2833\n",
378 | "2 1 3 0.753488 26.0 0 7.9250\n",
379 | "3 1 1 0.753488 35.0 1 53.1000\n",
380 | "4 0 3 0.196078 35.0 0 8.0500\n"
381 | ]
382 | }
383 | ],
384 | "source": [
385 | "data6 = mean_enc.transform(data)\n",
386 | "print(data6.head(5))"
387 | ]
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "metadata": {},
392 | "source": [
393 | "## Target-encoding\n",
394 | "Similar to mean encoding, but use both posterior probability and prior probability of the target"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 11,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "# create the encoder and fit with our data\n",
404 | "target_enc = ce.TargetEncoder(cols=['Sex']).fit(X_train,y_train)"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 12,
410 | "metadata": {
411 | "collapsed": true
412 | },
413 | "outputs": [],
414 | "source": [
415 | "# perform transformation\n",
416 | "# data.Survived.groupby(data['Sex']).agg(['mean'])\n",
417 | "data2 = target_enc.transform(data)"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 13,
423 | "metadata": {},
424 | "outputs": [
425 | {
426 | "data": {
427 | "text/html": [
428 | "\n",
429 | "\n",
442 | "
\n",
443 | " \n",
444 | " \n",
445 | " | \n",
446 | " Survived | \n",
447 | " Pclass | \n",
448 | " Sex | \n",
449 | " Age | \n",
450 | " SibSp | \n",
451 | " Fare | \n",
452 | "
\n",
453 | " \n",
454 | " \n",
455 | " \n",
456 | " 0 | \n",
457 | " 0 | \n",
458 | " 3 | \n",
459 | " 0.196078 | \n",
460 | " 22.0 | \n",
461 | " 1 | \n",
462 | " 7.2500 | \n",
463 | "
\n",
464 | " \n",
465 | " 1 | \n",
466 | " 1 | \n",
467 | " 1 | \n",
468 | " 0.753488 | \n",
469 | " 38.0 | \n",
470 | " 1 | \n",
471 | " 71.2833 | \n",
472 | "
\n",
473 | " \n",
474 | " 2 | \n",
475 | " 1 | \n",
476 | " 3 | \n",
477 | " 0.753488 | \n",
478 | " 26.0 | \n",
479 | " 0 | \n",
480 | " 7.9250 | \n",
481 | "
\n",
482 | " \n",
483 | " 3 | \n",
484 | " 1 | \n",
485 | " 1 | \n",
486 | " 0.753488 | \n",
487 | " 35.0 | \n",
488 | " 1 | \n",
489 | " 53.1000 | \n",
490 | "
\n",
491 | " \n",
492 | " 4 | \n",
493 | " 0 | \n",
494 | " 3 | \n",
495 | " 0.196078 | \n",
496 | " 35.0 | \n",
497 | " 0 | \n",
498 | " 8.0500 | \n",
499 | "
\n",
500 | " \n",
501 | "
\n",
502 | "
"
503 | ],
504 | "text/plain": [
505 | " Survived Pclass Sex Age SibSp Fare\n",
506 | "0 0 3 0.196078 22.0 1 7.2500\n",
507 | "1 1 1 0.753488 38.0 1 71.2833\n",
508 | "2 1 3 0.753488 26.0 0 7.9250\n",
509 | "3 1 1 0.753488 35.0 1 53.1000\n",
510 | "4 0 3 0.196078 35.0 0 8.0500"
511 | ]
512 | },
513 | "execution_count": 13,
514 | "metadata": {},
515 | "output_type": "execute_result"
516 | }
517 | ],
518 | "source": [
519 | "# check the result\n",
520 | "data2.head()"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "## WOE-encoding\n",
528 | "replace the label with Weight of Evidence of each label. WOE is computed from the basic odds ratio: \n",
529 | "\n",
530 | "ln( (Proportion of Good Outcomes) / (Proportion of Bad Outcomes))"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 14,
536 | "metadata": {
537 | "collapsed": true
538 | },
539 | "outputs": [],
540 | "source": [
541 | "woe_enc = ce.WOEEncoder(cols=['Sex']).fit(X_train,y_train)"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": 15,
547 | "metadata": {
548 | "collapsed": true
549 | },
550 | "outputs": [],
551 | "source": [
552 | "data3 = woe_enc.transform(data)"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": 16,
558 | "metadata": {},
559 | "outputs": [
560 | {
561 | "data": {
562 | "text/html": [
563 | "\n",
564 | "\n",
577 | "
\n",
578 | " \n",
579 | " \n",
580 | " | \n",
581 | " Survived | \n",
582 | " Pclass | \n",
583 | " Sex | \n",
584 | " Age | \n",
585 | " SibSp | \n",
586 | " Fare | \n",
587 | "
\n",
588 | " \n",
589 | " \n",
590 | " \n",
591 | " 0 | \n",
592 | " 0 | \n",
593 | " 3 | \n",
594 | " -0.950742 | \n",
595 | " 22.0 | \n",
596 | " 1 | \n",
597 | " 7.2500 | \n",
598 | "
\n",
599 | " \n",
600 | " 1 | \n",
601 | " 1 | \n",
602 | " 1 | \n",
603 | " 1.555633 | \n",
604 | " 38.0 | \n",
605 | " 1 | \n",
606 | " 71.2833 | \n",
607 | "
\n",
608 | " \n",
609 | " 2 | \n",
610 | " 1 | \n",
611 | " 3 | \n",
612 | " 1.555633 | \n",
613 | " 26.0 | \n",
614 | " 0 | \n",
615 | " 7.9250 | \n",
616 | "
\n",
617 | " \n",
618 | " 3 | \n",
619 | " 1 | \n",
620 | " 1 | \n",
621 | " 1.555633 | \n",
622 | " 35.0 | \n",
623 | " 1 | \n",
624 | " 53.1000 | \n",
625 | "
\n",
626 | " \n",
627 | " 4 | \n",
628 | " 0 | \n",
629 | " 3 | \n",
630 | " -0.950742 | \n",
631 | " 35.0 | \n",
632 | " 0 | \n",
633 | " 8.0500 | \n",
634 | "
\n",
635 | " \n",
636 | "
\n",
637 | "
"
638 | ],
639 | "text/plain": [
640 | " Survived Pclass Sex Age SibSp Fare\n",
641 | "0 0 3 -0.950742 22.0 1 7.2500\n",
642 | "1 1 1 1.555633 38.0 1 71.2833\n",
643 | "2 1 3 1.555633 26.0 0 7.9250\n",
644 | "3 1 1 1.555633 35.0 1 53.1000\n",
645 | "4 0 3 -0.950742 35.0 0 8.0500"
646 | ]
647 | },
648 | "execution_count": 16,
649 | "metadata": {},
650 | "output_type": "execute_result"
651 | }
652 | ],
653 | "source": [
654 | "data3.head(5)"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": null,
660 | "metadata": {
661 | "collapsed": true
662 | },
663 | "outputs": [],
664 | "source": []
665 | }
666 | ],
667 | "metadata": {
668 | "kernelspec": {
669 | "display_name": "Python 3",
670 | "language": "python",
671 | "name": "python3"
672 | },
673 | "language_info": {
674 | "codemirror_mode": {
675 | "name": "ipython",
676 | "version": 3
677 | },
678 | "file_extension": ".py",
679 | "mimetype": "text/x-python",
680 | "name": "python",
681 | "nbconvert_exporter": "python",
682 | "pygments_lexer": "ipython3",
683 | "version": "3.6.1"
684 | }
685 | },
686 | "nbformat": 4,
687 | "nbformat_minor": 2
688 | }
689 |
--------------------------------------------------------------------------------
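Mean encoding and WOE can be reproduced with plain pandas, which makes the mappings shown above easy to verify. A minimal sketch (category_encoders applies smoothing/regularisation, so its values can differ slightly from these raw estimates):

```python
# Plain-pandas versions of mean encoding and WOE encoding (a sketch;
# category_encoders adds regularisation, so values may differ slightly).
import numpy as np
import pandas as pd

data = pd.read_csv('./data/titanic.csv', usecols=['Sex', 'Survived'])
y = data['Survived']

# mean encoding: each label -> mean of the target within that label
mean_map = y.groupby(data['Sex']).mean()
data['Sex_mean'] = data['Sex'].map(mean_map)

# WOE: ln(share of positives in the label / share of negatives in the label)
pos = data.loc[y == 1, 'Sex'].value_counts(normalize=True)
neg = data.loc[y == 0, 'Sex'].value_counts(normalize=True)
data['Sex_woe'] = data['Sex'].map(np.log(pos / neg))

print(data.head())
```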
/3.5_Demo_Feature_Generation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import roc_curve, roc_auc_score\n",
18 | "\n",
19 | "# plt.style.use('seaborn-colorblind')\n",
20 | "# %matplotlib inline\n",
21 | "#from feature_cleaning import rare_values as ra"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Load Dataset"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "collapsed": true
36 | },
37 | "outputs": [],
38 | "source": [
39 | "use_cols = [\n",
40 | " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
41 | " 'Survived'\n",
42 | "]\n",
43 | "\n",
44 | "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " Survived | \n",
74 | " Pclass | \n",
75 | " Sex | \n",
76 | " Age | \n",
77 | " SibSp | \n",
78 | " Fare | \n",
79 | "
\n",
80 | " \n",
81 | " \n",
82 | " \n",
83 | " 0 | \n",
84 | " 0 | \n",
85 | " 3 | \n",
86 | " male | \n",
87 | " 22.0 | \n",
88 | " 1 | \n",
89 | " 7.2500 | \n",
90 | "
\n",
91 | " \n",
92 | " 1 | \n",
93 | " 1 | \n",
94 | " 1 | \n",
95 | " female | \n",
96 | " 38.0 | \n",
97 | " 1 | \n",
98 | " 71.2833 | \n",
99 | "
\n",
100 | " \n",
101 | " 2 | \n",
102 | " 1 | \n",
103 | " 3 | \n",
104 | " female | \n",
105 | " 26.0 | \n",
106 | " 0 | \n",
107 | " 7.9250 | \n",
108 | "
\n",
109 | " \n",
110 | "
\n",
111 | "
"
112 | ],
113 | "text/plain": [
114 | " Survived Pclass Sex Age SibSp Fare\n",
115 | "0 0 3 male 22.0 1 7.2500\n",
116 | "1 1 1 female 38.0 1 71.2833\n",
117 | "2 1 3 female 26.0 0 7.9250"
118 | ]
119 | },
120 | "execution_count": 3,
121 | "metadata": {},
122 | "output_type": "execute_result"
123 | }
124 | ],
125 | "source": [
126 | "data.head(3)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 4,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "((623, 6), (268, 6))"
138 | ]
139 | },
140 | "execution_count": 4,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "# Note that we include target variable in the X_train \n",
147 | "# because we need it to supervise our discretization\n",
148 | "# this is not the standard way of using train-test-split\n",
149 | "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
150 | " random_state=0)\n",
151 | "X_train.shape, X_test.shape"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "## Polynomial Expansion\n",
159 | "\n",
160 | "generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 5,
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
172 | " Pclass SibSp Pclass^2 Pclass SibSp SibSp^2\n",
173 | "0 1.0 0.0 1.0 0.0 0.0\n",
174 | "1 1.0 1.0 1.0 1.0 1.0\n",
175 | "2 3.0 5.0 9.0 15.0 25.0\n",
176 | "3 1.0 0.0 1.0 0.0 0.0\n",
177 | "4 3.0 1.0 9.0 3.0 1.0\n",
178 | "5 2.0 1.0 4.0 2.0 1.0\n"
179 | ]
180 | }
181 | ],
182 | "source": [
183 | "# create polynomial combinations of feature 'Pclass','SibSp' with degree 2\n",
184 | "from sklearn.preprocessing import PolynomialFeatures\n",
185 | "pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
186 | "tmp = pf.transform(X_train[['Pclass','SibSp']])\n",
187 | "X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))\n",
188 | "print(X_train_copy.head(6))"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "## Feature Learning by Trees\n",
196 | "GBDT derived feature + LR"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 6,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "name": "stdout",
206 | "output_type": "stream",
207 | "text": [
208 | "sample's belonging node of each base tree \n",
209 | "' [[ 7. 7. 6. ... 4. 7. 4.]\n",
210 | " [ 7. 7. 6. ... 14. 7. 7.]\n",
211 | " [11. 11. 11. ... 4. 6. 11.]\n",
212 | " ...\n",
213 | " [10. 10. 10. ... 4. 6. 10.]\n",
214 | " [13. 14. 13. ... 4. 7. 13.]\n",
215 | " [ 7. 7. 6. ... 6. 7. 7.]]\n",
216 | "AUC for GBDT derived feature + LR: 0.7746130952380953\n"
217 | ]
218 | },
219 | {
220 | "name": "stderr",
221 | "output_type": "stream",
222 | "text": [
223 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
224 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
225 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
226 | " warnings.warn(msg, FutureWarning)\n"
227 | ]
228 | }
229 | ],
230 | "source": [
231 | "from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
232 | "from sklearn.preprocessing import OneHotEncoder\n",
233 | "\n",
234 | "gbdt = GradientBoostingClassifier(n_estimators=20)\n",
235 | "one_hot = OneHotEncoder()\n",
236 | "\n",
237 | "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
238 | "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
239 | "\n",
240 | "gbdt.fit(X_train, y_train)\n",
241 | "\n",
242 | "X_leaf_index = gbdt.apply(X_train)[:, :, 0] # apply return the node index on each tree \n",
243 | "print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
244 | "# fit one-hot encoder\n",
245 | "one_hot.fit(X_leaf_index) \n",
246 | "X_one_hot = one_hot.transform(X_leaf_index) \n",
247 | "\n",
248 | "\n",
249 | "from sklearn.linear_model import LogisticRegression\n",
250 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
251 | "lr.fit(X_one_hot,y_train)\n",
252 | "y_pred = lr.predict_proba(\n",
253 | " one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]\n",
254 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
255 | "print(\"AUC for GBDT derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "## Feature Learning by Trees\n",
263 | "RandomForest derived feature + LR"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 7,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "name": "stdout",
273 | "output_type": "stream",
274 | "text": [
275 | "sample's belonging node of each base tree \n",
276 | "' [[212 35 79 ... 146 60 46]\n",
277 | " [307 165 266 ... 136 132 44]\n",
278 | " [285 285 320 ... 301 294 300]\n",
279 | " ...\n",
280 | " [ 13 177 133 ... 186 169 117]\n",
281 | " [190 296 311 ... 282 289 297]\n",
282 | " [264 165 243 ... 152 110 314]]\n",
283 | "AUC for RandomForest derived feature + LR: 0.759672619047619\n"
284 | ]
285 | },
286 | {
287 | "name": "stderr",
288 | "output_type": "stream",
289 | "text": [
290 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
291 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
292 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
293 | " warnings.warn(msg, FutureWarning)\n"
294 | ]
295 | }
296 | ],
297 | "source": [
298 | "rf = RandomForestClassifier(n_estimators=20)\n",
299 | "one_hot = OneHotEncoder()\n",
300 | "\n",
301 | "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
302 | "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
303 | "\n",
304 | "rf.fit(X_train, y_train)\n",
305 | "\n",
306 | "X_leaf_index = rf.apply(X_train) # apply return the node index on each tree \n",
307 | "print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
308 | "# fit one-hot encoder\n",
309 | "one_hot.fit(X_leaf_index) \n",
310 | "X_one_hot = one_hot.transform(X_leaf_index) \n",
311 | "\n",
312 | "\n",
313 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
314 | "lr.fit(X_one_hot,y_train)\n",
315 | "y_pred = lr.predict_proba(\n",
316 | " one_hot.transform(rf.apply(X_test)))[:,1]\n",
317 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
318 | "print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {
324 | "collapsed": true
325 | },
326 | "source": [
327 | "## Feature Learning by Trees\n",
328 | "GBDT derived feature + Raw feature +LR"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 8,
334 | "metadata": {},
335 | "outputs": [
336 | {
337 | "name": "stdout",
338 | "output_type": "stream",
339 | "text": [
340 | "AUC for GBDT derived feature + Raw feature +LR: 0.7603571428571428\n"
341 | ]
342 | }
343 | ],
344 | "source": [
345 | "from scipy.sparse import hstack\n",
346 | "\n",
347 | "X_train_ext = hstack([one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])\n",
348 | "X_test_ext = hstack([one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])\n",
349 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
350 | "lr.fit(X_train_ext,y_train)\n",
351 | "y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
352 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
353 | "print(\"AUC for GBDT derived feature + Raw feature +LR:\", roc_auc_score(y_test, y_pred))\n"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "## Feature Learning by Trees\n",
361 | "RandomForest derived feature + Raw feature +LR"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": 9,
367 | "metadata": {},
368 | "outputs": [
369 | {
370 | "name": "stdout",
371 | "output_type": "stream",
372 | "text": [
373 | "AUC for RandomForest derived feature + Raw feature + LR: 0.76\n"
374 | ]
375 | }
376 | ],
377 | "source": [
378 | "X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])\n",
379 | "X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])\n",
380 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
381 | "lr.fit(X_train_ext,y_train)\n",
382 | "y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
383 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
384 | "print(\"AUC for RandomForest derived feature + Raw feature + LR:\", roc_auc_score(y_test, y_pred))\n"
385 | ]
386 | },
387 | {
388 | "cell_type": "markdown",
389 | "metadata": {},
390 | "source": [
391 | "## Feature Learning by Trees\n",
392 | "Use only Raw Feature + LR"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 10,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "AUC for RandomForest derived feature + LR: 0.6988690476190476\n"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
410 | "lr.fit(X_train,y_train)\n",
411 | "y_pred = lr.predict_proba(X_test)[:,1]\n",
412 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
413 | "print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "## Feature Learning by Trees\n",
421 | "\n",
422 | "Use only Raw Feature + GBDT"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": 13,
428 | "metadata": {},
429 | "outputs": [
430 | {
431 | "name": "stdout",
432 | "output_type": "stream",
433 | "text": [
434 | "AUC for Raw feature + GBDT: 0.7613988095238096\n"
435 | ]
436 | }
437 | ],
438 | "source": [
439 | "gbdt = GradientBoostingClassifier(n_estimators=20)\n",
440 | "\n",
441 | "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
442 | "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
443 | "\n",
444 | "gbdt.fit(X_train, y_train)\n",
445 | "y_pred = gbdt.predict_proba(X_test)[:,1]\n",
446 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
447 | "print(\"AUC for Raw feature + GBDT:\", roc_auc_score(y_test, y_pred))\n"
448 | ]
449 | },
450 | {
451 | "cell_type": "markdown",
452 | "metadata": {},
453 | "source": [
454 | "## Feature Learning by Trees\n",
455 | "\n",
456 | "Use only Raw Feature + RF\n"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 16,
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "name": "stdout",
466 | "output_type": "stream",
467 | "text": [
468 | "AUC for Raw feature + RF: 0.7235119047619047\n"
469 | ]
470 | }
471 | ],
472 | "source": [
473 | "rf = RandomForestClassifier(n_estimators=20)\n",
474 | "\n",
475 | "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
476 | "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
477 | "\n",
478 | "rf.fit(X_train, y_train)\n",
479 | "y_pred = rf.predict_proba(X_test)[:,1]\n",
480 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
481 | "print(\"AUC for Raw feature + RF:\", roc_auc_score(y_test, y_pred))"
482 | ]
483 | },
484 | {
485 | "cell_type": "markdown",
486 | "metadata": {},
487 | "source": [
488 | "#### Without tuning, we can see GBDT derived feature + LR get the best result"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {
495 | "collapsed": true
496 | },
497 | "outputs": [],
498 | "source": []
499 | }
500 | ],
501 | "metadata": {
502 | "kernelspec": {
503 | "display_name": "Python 3",
504 | "language": "python",
505 | "name": "python3"
506 | },
507 | "language_info": {
508 | "codemirror_mode": {
509 | "name": "ipython",
510 | "version": 3
511 | },
512 | "file_extension": ".py",
513 | "mimetype": "text/x-python",
514 | "name": "python",
515 | "nbconvert_exporter": "python",
516 | "pygments_lexer": "ipython3",
517 | "version": "3.6.1"
518 | }
519 | },
520 | "nbformat": 4,
521 | "nbformat_minor": 2
522 | }
523 |
--------------------------------------------------------------------------------
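The leaf-index trick in the notebook above condenses to a short pipeline with current scikit-learn. A minimal sketch (handle_unknown='ignore' covers test-set leaves unseen during fit, and the modern OneHotEncoder defaults avoid the FutureWarning captured in the outputs):

```python
# Condensed GBDT-leaf-index -> one-hot -> logistic regression pipeline
# (a sketch with current sklearn defaults; exact AUC varies run to run).
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv('./data/titanic.csv',
                   usecols=['Pclass', 'Age', 'Fare', 'SibSp', 'Survived'])
X = data[['Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

gbdt = GradientBoostingClassifier(n_estimators=20).fit(X_train, y_train)
enc = OneHotEncoder(handle_unknown='ignore')       # leaf index -> one-hot
enc.fit(gbdt.apply(X_train)[:, :, 0])

lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(enc.transform(gbdt.apply(X_train)[:, :, 0]), y_train)
y_pred = lr.predict_proba(enc.transform(gbdt.apply(X_test)[:, :, 0]))[:, 1]
print('AUC (GBDT leaves + LR):', roc_auc_score(y_test, y_pred))
```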
/4.1_Demo_Feature_Selection_Filter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "# plt.style.use('seaborn-colorblind')\n",
18 | "# %matplotlib inline\n",
19 | "from feature_selection import filter_method as ft"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Load Dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "from sklearn.datasets import load_breast_cancer\n",
38 | "data = load_breast_cancer()\n",
39 | "data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
40 | " columns= np.append(data['feature_names'], ['target']))"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "\n",
52 | "\n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " | \n",
69 | " mean radius | \n",
70 | " mean texture | \n",
71 | " mean perimeter | \n",
72 | " mean area | \n",
73 | " mean smoothness | \n",
74 | " mean compactness | \n",
75 | " mean concavity | \n",
76 | " mean concave points | \n",
77 | " mean symmetry | \n",
78 | " mean fractal dimension | \n",
79 | " ... | \n",
80 | " worst texture | \n",
81 | " worst perimeter | \n",
82 | " worst area | \n",
83 | " worst smoothness | \n",
84 | " worst compactness | \n",
85 | " worst concavity | \n",
86 | " worst concave points | \n",
87 | " worst symmetry | \n",
88 | " worst fractal dimension | \n",
89 | " target | \n",
90 | "
\n",
91 | " \n",
92 | " \n",
93 | " \n",
94 | " 0 | \n",
95 | " 17.99 | \n",
96 | " 10.38 | \n",
97 | " 122.80 | \n",
98 | " 1001.0 | \n",
99 | " 0.11840 | \n",
100 | " 0.27760 | \n",
101 | " 0.3001 | \n",
102 | " 0.14710 | \n",
103 | " 0.2419 | \n",
104 | " 0.07871 | \n",
105 | " ... | \n",
106 | " 17.33 | \n",
107 | " 184.60 | \n",
108 | " 2019.0 | \n",
109 | " 0.1622 | \n",
110 | " 0.6656 | \n",
111 | " 0.7119 | \n",
112 | " 0.2654 | \n",
113 | " 0.4601 | \n",
114 | " 0.11890 | \n",
115 | " 0.0 | \n",
116 | "
\n",
117 | " \n",
118 | " 1 | \n",
119 | " 20.57 | \n",
120 | " 17.77 | \n",
121 | " 132.90 | \n",
122 | " 1326.0 | \n",
123 | " 0.08474 | \n",
124 | " 0.07864 | \n",
125 | " 0.0869 | \n",
126 | " 0.07017 | \n",
127 | " 0.1812 | \n",
128 | " 0.05667 | \n",
129 | " ... | \n",
130 | " 23.41 | \n",
131 | " 158.80 | \n",
132 | " 1956.0 | \n",
133 | " 0.1238 | \n",
134 | " 0.1866 | \n",
135 | " 0.2416 | \n",
136 | " 0.1860 | \n",
137 | " 0.2750 | \n",
138 | " 0.08902 | \n",
139 | " 0.0 | \n",
140 | "
\n",
141 | " \n",
142 | " 2 | \n",
143 | " 19.69 | \n",
144 | " 21.25 | \n",
145 | " 130.00 | \n",
146 | " 1203.0 | \n",
147 | " 0.10960 | \n",
148 | " 0.15990 | \n",
149 | " 0.1974 | \n",
150 | " 0.12790 | \n",
151 | " 0.2069 | \n",
152 | " 0.05999 | \n",
153 | " ... | \n",
154 | " 25.53 | \n",
155 | " 152.50 | \n",
156 | " 1709.0 | \n",
157 | " 0.1444 | \n",
158 | " 0.4245 | \n",
159 | " 0.4504 | \n",
160 | " 0.2430 | \n",
161 | " 0.3613 | \n",
162 | " 0.08758 | \n",
163 | " 0.0 | \n",
164 | "
\n",
165 | " \n",
166 | " 3 | \n",
167 | " 11.42 | \n",
168 | " 20.38 | \n",
169 | " 77.58 | \n",
170 | " 386.1 | \n",
171 | " 0.14250 | \n",
172 | " 0.28390 | \n",
173 | " 0.2414 | \n",
174 | " 0.10520 | \n",
175 | " 0.2597 | \n",
176 | " 0.09744 | \n",
177 | " ... | \n",
178 | " 26.50 | \n",
179 | " 98.87 | \n",
180 | " 567.7 | \n",
181 | " 0.2098 | \n",
182 | " 0.8663 | \n",
183 | " 0.6869 | \n",
184 | " 0.2575 | \n",
185 | " 0.6638 | \n",
186 | " 0.17300 | \n",
187 | " 0.0 | \n",
188 | "
\n",
189 | " \n",
190 | " 4 | \n",
191 | " 20.29 | \n",
192 | " 14.34 | \n",
193 | " 135.10 | \n",
194 | " 1297.0 | \n",
195 | " 0.10030 | \n",
196 | " 0.13280 | \n",
197 | " 0.1980 | \n",
198 | " 0.10430 | \n",
199 | " 0.1809 | \n",
200 | " 0.05883 | \n",
201 | " ... | \n",
202 | " 16.67 | \n",
203 | " 152.20 | \n",
204 | " 1575.0 | \n",
205 | " 0.1374 | \n",
206 | " 0.2050 | \n",
207 | " 0.4000 | \n",
208 | " 0.1625 | \n",
209 | " 0.2364 | \n",
210 | " 0.07678 | \n",
211 | " 0.0 | \n",
212 | "
\n",
213 | " \n",
214 | "
\n",
215 | "
5 rows × 31 columns
\n",
216 | "
"
217 | ],
218 | "text/plain": [
219 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n",
220 | "0 17.99 10.38 122.80 1001.0 0.11840 \n",
221 | "1 20.57 17.77 132.90 1326.0 0.08474 \n",
222 | "2 19.69 21.25 130.00 1203.0 0.10960 \n",
223 | "3 11.42 20.38 77.58 386.1 0.14250 \n",
224 | "4 20.29 14.34 135.10 1297.0 0.10030 \n",
225 | "\n",
226 | " mean compactness mean concavity mean concave points mean symmetry \\\n",
227 | "0 0.27760 0.3001 0.14710 0.2419 \n",
228 | "1 0.07864 0.0869 0.07017 0.1812 \n",
229 | "2 0.15990 0.1974 0.12790 0.2069 \n",
230 | "3 0.28390 0.2414 0.10520 0.2597 \n",
231 | "4 0.13280 0.1980 0.10430 0.1809 \n",
232 | "\n",
233 | " mean fractal dimension ... worst texture worst perimeter worst area \\\n",
234 | "0 0.07871 ... 17.33 184.60 2019.0 \n",
235 | "1 0.05667 ... 23.41 158.80 1956.0 \n",
236 | "2 0.05999 ... 25.53 152.50 1709.0 \n",
237 | "3 0.09744 ... 26.50 98.87 567.7 \n",
238 | "4 0.05883 ... 16.67 152.20 1575.0 \n",
239 | "\n",
240 | " worst smoothness worst compactness worst concavity worst concave points \\\n",
241 | "0 0.1622 0.6656 0.7119 0.2654 \n",
242 | "1 0.1238 0.1866 0.2416 0.1860 \n",
243 | "2 0.1444 0.4245 0.4504 0.2430 \n",
244 | "3 0.2098 0.8663 0.6869 0.2575 \n",
245 | "4 0.1374 0.2050 0.4000 0.1625 \n",
246 | "\n",
247 | " worst symmetry worst fractal dimension target \n",
248 | "0 0.4601 0.11890 0.0 \n",
249 | "1 0.2750 0.08902 0.0 \n",
250 | "2 0.3613 0.08758 0.0 \n",
251 | "3 0.6638 0.17300 0.0 \n",
252 | "4 0.2364 0.07678 0.0 \n",
253 | "\n",
254 | "[5 rows x 31 columns]"
255 | ]
256 | },
257 | "execution_count": 3,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "data.head(5)"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 4,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "data": {
273 | "text/plain": [
274 | "((455, 30), (114, 30))"
275 | ]
276 | },
277 | "execution_count": 4,
278 | "metadata": {},
279 | "output_type": "execute_result"
280 | }
281 | ],
282 | "source": [
283 | "X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
284 | " data.target, test_size=0.2,\n",
285 | " random_state=0)\n",
286 | "X_train.shape, X_test.shape"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "## Variance method\n",
294 | "removing features that show the same value for the majority/all of the observations (constant/quasi-constant features)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 5,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "0 variables are found to be almost constant\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "# the original dataset has no constant variable\n",
312 | "quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 6,
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/plain": [
323 | "1.0 0.923077\n",
324 | "0.0 0.068132\n",
325 | "2.0 0.008791\n",
326 | "Name: dummy, dtype: float64"
327 | ]
328 | },
329 | "execution_count": 6,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "# lets create a duumy variable that help us do the demonstration\n",
336 | "X_train['dummy'] = np.floor(X_train['worst smoothness']*10)\n",
337 | "# variable dummy has> 92% of the observations show one value, 1.0\n",
338 | "X_train.dummy.value_counts() / np.float(len(X_train))"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 7,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "name": "stdout",
348 | "output_type": "stream",
349 | "text": [
350 | "1 variables are found to be almost constant\n"
351 | ]
352 | },
353 | {
354 | "data": {
355 | "text/plain": [
356 | "['dummy']"
357 | ]
358 | },
359 | "execution_count": 7,
360 | "metadata": {},
361 | "output_type": "execute_result"
362 | }
363 | ],
364 | "source": [
365 | "quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)\n",
366 | "quasi_constant_feature"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 8,
372 | "metadata": {},
373 | "outputs": [
374 | {
375 | "name": "stdout",
376 | "output_type": "stream",
377 | "text": [
378 | "(455, 30)\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "# drop that variable\n",
384 | "X_train.drop(labels=quasi_constant_feature,axis=1,inplace=True)\n",
385 | "print(X_train.shape)"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "## Correlation method\n",
393 | "remove features that are highly correlated with each other"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 9,
399 | "metadata": {},
400 | "outputs": [
401 | {
402 | "name": "stdout",
403 | "output_type": "stream",
404 | "text": [
405 | " feature1 feature2 corr\n",
406 | "0 mean perimeter mean radius 0.998185\n",
407 | "6 mean perimeter mean area 0.986692\n",
408 | "14 mean perimeter worst perimeter 0.970507\n",
409 | "19 mean perimeter worst radius 0.969520\n",
410 | "33 mean perimeter worst area 0.941920 \n",
411 | "\n",
412 | " feature1 feature2 corr\n",
413 | "12 perimeter error radius error 0.978323\n",
414 | "30 perimeter error area error 0.944995 \n",
415 | "\n",
416 | " feature1 feature2 corr\n",
417 | "36 mean concavity mean concave points 0.914627 \n",
418 | "\n",
419 | " feature1 feature2 corr\n",
420 | "38 mean texture worst texture 0.908182 \n",
421 | "\n",
422 | " feature1 feature2 corr\n",
423 | "40 worst concave points mean concave points 0.906312 \n",
424 | "\n"
425 | ]
426 | }
427 | ],
428 | "source": [
429 | "corr = ft.corr_feature_detect(data=X_train,threshold=0.9)\n",
430 | "# print all the correlated feature groups!\n",
431 | "for i in corr:\n",
432 | " print(i,'\\n')"
433 | ]
434 | },
435 | {
436 | "cell_type": "markdown",
437 | "metadata": {},
438 | "source": [
439 | "then we can decide which ones to remove."
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "## Mutual Information Filter\n",
447 | "Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y."
448 | ]
449 | },
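ft.mutual_info wraps this kind of selection; the same result can be sketched directly with scikit-learn's SelectKBest and mutual_info_classif, assuming the X_train/y_train split created above:

```python
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# keep the 3 features with the highest estimated mutual information
selector = SelectKBest(score_func=mutual_info_classif, k=3)
selector.fit(X_train, y_train)
print(X_train.columns[selector.get_support()])
```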
450 | {
451 | "cell_type": "code",
452 | "execution_count": 10,
453 | "metadata": {},
454 | "outputs": [
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "Index(['mean concave points', 'worst perimeter', 'worst area'], dtype='object')\n"
460 | ]
461 | }
462 | ],
463 | "source": [
464 | "# select the top 3 features\n",
465 | "mi = ft.mutual_info(X=X_train,y=y_train,select_k=3)\n",
466 | "print(mi)"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 11,
472 | "metadata": {},
473 | "outputs": [
474 | {
475 | "name": "stdout",
476 | "output_type": "stream",
477 | "text": [
478 | "Index(['mean perimeter', 'mean concave points', 'worst radius',\n",
479 | " 'worst perimeter', 'worst area', 'worst concave points'],\n",
480 | " dtype='object')\n"
481 | ]
482 | }
483 | ],
484 | "source": [
485 | "# select the top 20% features\n",
486 | "mi = ft.mutual_info(X=X_train,y=y_train,select_k=0.2)\n",
487 | "print(mi)"
488 | ]
489 | },
490 | {
491 | "cell_type": "markdown",
492 | "metadata": {},
493 | "source": [
494 | "## Chi-Square Filter\n",
495 | "Compute chi-squared stats between each non-negative feature and class"
496 | ]
497 | },
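As with mutual information, this can be sketched with plain scikit-learn; chi2 requires non-negative inputs, which holds for this dataset (again assuming the X_train/y_train split from above):

```python
from sklearn.feature_selection import SelectKBest, chi2

# rank features by the chi-squared statistic and keep the top 3
selector = SelectKBest(score_func=chi2, k=3)
selector.fit(X_train, y_train)
print(X_train.columns[selector.get_support()])
```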
498 | {
499 | "cell_type": "code",
500 | "execution_count": 12,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stdout",
505 | "output_type": "stream",
506 | "text": [
507 | "Index(['mean area', 'area error', 'worst area'], dtype='object')\n"
508 | ]
509 | }
510 | ],
511 | "source": [
512 | "# select the top 3 features\n",
513 | "chi = ft.chi_square_test(X=X_train,y=y_train,select_k=3)\n",
514 | "print(chi)"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 13,
520 | "metadata": {},
521 | "outputs": [
522 | {
523 | "name": "stdout",
524 | "output_type": "stream",
525 | "text": [
526 | "Index(['mean perimeter', 'mean area', 'area error', 'worst radius',\n",
527 | " 'worst perimeter', 'worst area'],\n",
528 | " dtype='object')\n"
529 | ]
530 | }
531 | ],
532 | "source": [
533 | "# select the top 20% features\n",
534 | "chi = ft.chi_square_test(X=X_train,y=y_train,select_k=0.2)\n",
535 | "print(chi)"
536 | ]
537 | },
538 | {
539 | "cell_type": "markdown",
540 | "metadata": {},
541 | "source": [
542 | "## Univariate ROC-AUC or MSE\n",
543 | "builds one decision tree per feature, to predict the target, then make predictions and ranks the features according to the machine learning metric (roc-auc or mse)"
544 | ]
545 | },
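A minimal sketch of the idea behind ft.univariate_roc_auc, assuming the train/test split from above; the repo's helper additionally prints the kept/dropped summary shown in the output below:

```python
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# fit one decision tree per feature and score it on the test set
scores = {}
for col in X_train.columns:
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X_train[[col]], y_train)
    proba = tree.predict_proba(X_test[[col]])[:, 1]
    scores[col] = roc_auc_score(y_test, proba)

ranking = pd.Series(scores).sort_values(ascending=False)
print(ranking[ranking > 0.8])  # features kept under a 0.8 threshold
```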
546 | {
547 | "cell_type": "code",
548 | "execution_count": 18,
549 | "metadata": {},
550 | "outputs": [
551 | {
552 | "name": "stdout",
553 | "output_type": "stream",
554 | "text": [
555 | "worst perimeter 0.917275\n",
556 | "worst area 0.895840\n",
557 | "worst radius 0.893458\n",
558 | "worst concave points 0.863131\n",
559 | "mean concavity 0.856939\n",
560 | "mean radius 0.849000\n",
561 | "mean area 0.839314\n",
562 | "worst concavity 0.831375\n",
563 | "mean perimeter 0.829628\n",
564 | "mean concave points 0.826453\n",
565 | "area error 0.812321\n",
566 | "worst compactness 0.742299\n",
567 | "radius error 0.740235\n",
568 | "mean compactness 0.734360\n",
569 | "perimeter error 0.680534\n",
570 | "worst texture 0.647666\n",
571 | "worst fractal dimension 0.640997\n",
572 | "concavity error 0.640203\n",
573 | "worst symmetry 0.620991\n",
574 | "concave points error 0.618133\n",
575 | "compactness error 0.607336\n",
576 | "mean symmetry 0.591775\n",
577 | "mean texture 0.573357\n",
578 | "texture error 0.568593\n",
579 | "worst smoothness 0.565100\n",
580 | "mean smoothness 0.557637\n",
581 | "fractal dimension error 0.542077\n",
582 | "smoothness error 0.522706\n",
583 | "symmetry error 0.493649\n",
584 | "mean fractal dimension 0.475548\n",
585 | "dtype: float64\n",
586 | "11 out of the 30 featues are kept\n",
587 | "mean radius 0.849000\n",
588 | "mean perimeter 0.829628\n",
589 | "mean area 0.839314\n",
590 | "mean concavity 0.856939\n",
591 | "mean concave points 0.826453\n",
592 | "area error 0.812321\n",
593 | "worst radius 0.893458\n",
594 | "worst perimeter 0.917275\n",
595 | "worst area 0.895840\n",
596 | "worst concavity 0.831375\n",
597 | "worst concave points 0.863131\n",
598 | "dtype: float64\n"
599 | ]
600 | }
601 | ],
602 | "source": [
603 | "uni_roc_auc = ft.univariate_roc_auc(X_train=X_train,y_train=y_train,\n",
604 | " X_test=X_test,y_test=y_test,threshold=0.8)\n",
605 | "print(uni_roc_auc)"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": 17,
611 | "metadata": {},
612 | "outputs": [
613 | {
614 | "name": "stdout",
615 | "output_type": "stream",
616 | "text": [
617 | "mean fractal dimension 0.491228\n",
618 | "symmetry error 0.480750\n",
619 | "fractal dimension error 0.456140\n",
620 | "smoothness error 0.449561\n",
621 | "texture error 0.412281\n",
622 | "worst smoothness 0.403265\n",
623 | "mean smoothness 0.399123\n",
624 | "mean texture 0.396930\n",
625 | "mean symmetry 0.363060\n",
626 | "compactness error 0.361842\n",
627 | "concave points error 0.357456\n",
628 | "worst fractal dimension 0.355263\n",
629 | "worst symmetry 0.350877\n",
630 | "worst texture 0.333333\n",
631 | "concavity error 0.333333\n",
632 | "perimeter error 0.300439\n",
633 | "mean compactness 0.258772\n",
634 | "worst compactness 0.254386\n",
635 | "radius error 0.245614\n",
636 | "area error 0.179825\n",
637 | "mean perimeter 0.166667\n",
638 | "mean concave points 0.166667\n",
639 | "worst concavity 0.162281\n",
640 | "mean radius 0.146930\n",
641 | "mean concavity 0.142544\n",
642 | "mean area 0.140351\n",
643 | "worst concave points 0.123782\n",
644 | "worst area 0.103070\n",
645 | "worst radius 0.100877\n",
646 | "worst perimeter 0.098684\n",
647 | "dtype: float64\n",
648 | "6 out of the 30 featues are kept\n",
649 | "mean fractal dimension 0.491228\n",
650 | "texture error 0.412281\n",
651 | "smoothness error 0.449561\n",
652 | "symmetry error 0.480750\n",
653 | "fractal dimension error 0.456140\n",
654 | "worst smoothness 0.403265\n",
655 | "dtype: float64\n"
656 | ]
657 | }
658 | ],
659 | "source": [
660 | "uni_mse = ft.univariate_mse(X_train=X_train,y_train=y_train,\n",
661 | " X_test=X_test,y_test=y_test,threshold=0.4)\n",
662 | "print(uni_mse)"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": null,
668 | "metadata": {
669 | "collapsed": true
670 | },
671 | "outputs": [],
672 | "source": []
673 | }
674 | ],
675 | "metadata": {
676 | "kernelspec": {
677 | "display_name": "Python 3",
678 | "language": "python",
679 | "name": "python3"
680 | },
681 | "language_info": {
682 | "codemirror_mode": {
683 | "name": "ipython",
684 | "version": 3
685 | },
686 | "file_extension": ".py",
687 | "mimetype": "text/x-python",
688 | "name": "python",
689 | "nbconvert_exporter": "python",
690 | "pygments_lexer": "ipython3",
691 | "version": "3.6.1"
692 | }
693 | },
694 | "nbformat": 4,
695 | "nbformat_minor": 2
696 | }
697 |
--------------------------------------------------------------------------------
/4.2_Demo_Feature_Selection_Wrapper.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 45,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n",
18 | "from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS\n",
19 | "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
20 | "\n",
21 | "# plt.style.use('seaborn-colorblind')\n",
22 | "# %matplotlib inline\n",
23 | "# from feature_selection import filter_method as ft"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "## Load Dataset"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {
37 | "collapsed": true
38 | },
39 | "outputs": [],
40 | "source": [
41 | "from sklearn.datasets import load_breast_cancer\n",
42 | "data = load_breast_cancer()\n",
43 | "data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
44 | " columns= np.append(data['feature_names'], ['target']))"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " mean radius | \n",
74 | " mean texture | \n",
75 | " mean perimeter | \n",
76 | " mean area | \n",
77 | " mean smoothness | \n",
78 | " mean compactness | \n",
79 | " mean concavity | \n",
80 | " mean concave points | \n",
81 | " mean symmetry | \n",
82 | " mean fractal dimension | \n",
83 | " ... | \n",
84 | " worst texture | \n",
85 | " worst perimeter | \n",
86 | " worst area | \n",
87 | " worst smoothness | \n",
88 | " worst compactness | \n",
89 | " worst concavity | \n",
90 | " worst concave points | \n",
91 | " worst symmetry | \n",
92 | " worst fractal dimension | \n",
93 | " target | \n",
94 | "
\n",
95 | " \n",
96 | " \n",
97 | " \n",
98 | " 0 | \n",
99 | " 17.99 | \n",
100 | " 10.38 | \n",
101 | " 122.80 | \n",
102 | " 1001.0 | \n",
103 | " 0.11840 | \n",
104 | " 0.27760 | \n",
105 | " 0.3001 | \n",
106 | " 0.14710 | \n",
107 | " 0.2419 | \n",
108 | " 0.07871 | \n",
109 | " ... | \n",
110 | " 17.33 | \n",
111 | " 184.60 | \n",
112 | " 2019.0 | \n",
113 | " 0.1622 | \n",
114 | " 0.6656 | \n",
115 | " 0.7119 | \n",
116 | " 0.2654 | \n",
117 | " 0.4601 | \n",
118 | " 0.11890 | \n",
119 | " 0.0 | \n",
120 | "
\n",
121 | " \n",
122 | " 1 | \n",
123 | " 20.57 | \n",
124 | " 17.77 | \n",
125 | " 132.90 | \n",
126 | " 1326.0 | \n",
127 | " 0.08474 | \n",
128 | " 0.07864 | \n",
129 | " 0.0869 | \n",
130 | " 0.07017 | \n",
131 | " 0.1812 | \n",
132 | " 0.05667 | \n",
133 | " ... | \n",
134 | " 23.41 | \n",
135 | " 158.80 | \n",
136 | " 1956.0 | \n",
137 | " 0.1238 | \n",
138 | " 0.1866 | \n",
139 | " 0.2416 | \n",
140 | " 0.1860 | \n",
141 | " 0.2750 | \n",
142 | " 0.08902 | \n",
143 | " 0.0 | \n",
144 | "
\n",
145 | " \n",
146 | " 2 | \n",
147 | " 19.69 | \n",
148 | " 21.25 | \n",
149 | " 130.00 | \n",
150 | " 1203.0 | \n",
151 | " 0.10960 | \n",
152 | " 0.15990 | \n",
153 | " 0.1974 | \n",
154 | " 0.12790 | \n",
155 | " 0.2069 | \n",
156 | " 0.05999 | \n",
157 | " ... | \n",
158 | " 25.53 | \n",
159 | " 152.50 | \n",
160 | " 1709.0 | \n",
161 | " 0.1444 | \n",
162 | " 0.4245 | \n",
163 | " 0.4504 | \n",
164 | " 0.2430 | \n",
165 | " 0.3613 | \n",
166 | " 0.08758 | \n",
167 | " 0.0 | \n",
168 | "
\n",
169 | " \n",
170 | " 3 | \n",
171 | " 11.42 | \n",
172 | " 20.38 | \n",
173 | " 77.58 | \n",
174 | " 386.1 | \n",
175 | " 0.14250 | \n",
176 | " 0.28390 | \n",
177 | " 0.2414 | \n",
178 | " 0.10520 | \n",
179 | " 0.2597 | \n",
180 | " 0.09744 | \n",
181 | " ... | \n",
182 | " 26.50 | \n",
183 | " 98.87 | \n",
184 | " 567.7 | \n",
185 | " 0.2098 | \n",
186 | " 0.8663 | \n",
187 | " 0.6869 | \n",
188 | " 0.2575 | \n",
189 | " 0.6638 | \n",
190 | " 0.17300 | \n",
191 | " 0.0 | \n",
192 | "
\n",
193 | " \n",
194 | " 4 | \n",
195 | " 20.29 | \n",
196 | " 14.34 | \n",
197 | " 135.10 | \n",
198 | " 1297.0 | \n",
199 | " 0.10030 | \n",
200 | " 0.13280 | \n",
201 | " 0.1980 | \n",
202 | " 0.10430 | \n",
203 | " 0.1809 | \n",
204 | " 0.05883 | \n",
205 | " ... | \n",
206 | " 16.67 | \n",
207 | " 152.20 | \n",
208 | " 1575.0 | \n",
209 | " 0.1374 | \n",
210 | " 0.2050 | \n",
211 | " 0.4000 | \n",
212 | " 0.1625 | \n",
213 | " 0.2364 | \n",
214 | " 0.07678 | \n",
215 | " 0.0 | \n",
216 | "
\n",
217 | " \n",
218 | "
\n",
219 | "
5 rows × 31 columns
\n",
220 | "
"
221 | ],
222 | "text/plain": [
223 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n",
224 | "0 17.99 10.38 122.80 1001.0 0.11840 \n",
225 | "1 20.57 17.77 132.90 1326.0 0.08474 \n",
226 | "2 19.69 21.25 130.00 1203.0 0.10960 \n",
227 | "3 11.42 20.38 77.58 386.1 0.14250 \n",
228 | "4 20.29 14.34 135.10 1297.0 0.10030 \n",
229 | "\n",
230 | " mean compactness mean concavity mean concave points mean symmetry \\\n",
231 | "0 0.27760 0.3001 0.14710 0.2419 \n",
232 | "1 0.07864 0.0869 0.07017 0.1812 \n",
233 | "2 0.15990 0.1974 0.12790 0.2069 \n",
234 | "3 0.28390 0.2414 0.10520 0.2597 \n",
235 | "4 0.13280 0.1980 0.10430 0.1809 \n",
236 | "\n",
237 | " mean fractal dimension ... worst texture worst perimeter worst area \\\n",
238 | "0 0.07871 ... 17.33 184.60 2019.0 \n",
239 | "1 0.05667 ... 23.41 158.80 1956.0 \n",
240 | "2 0.05999 ... 25.53 152.50 1709.0 \n",
241 | "3 0.09744 ... 26.50 98.87 567.7 \n",
242 | "4 0.05883 ... 16.67 152.20 1575.0 \n",
243 | "\n",
244 | " worst smoothness worst compactness worst concavity worst concave points \\\n",
245 | "0 0.1622 0.6656 0.7119 0.2654 \n",
246 | "1 0.1238 0.1866 0.2416 0.1860 \n",
247 | "2 0.1444 0.4245 0.4504 0.2430 \n",
248 | "3 0.2098 0.8663 0.6869 0.2575 \n",
249 | "4 0.1374 0.2050 0.4000 0.1625 \n",
250 | "\n",
251 | " worst symmetry worst fractal dimension target \n",
252 | "0 0.4601 0.11890 0.0 \n",
253 | "1 0.2750 0.08902 0.0 \n",
254 | "2 0.3613 0.08758 0.0 \n",
255 | "3 0.6638 0.17300 0.0 \n",
256 | "4 0.2364 0.07678 0.0 \n",
257 | "\n",
258 | "[5 rows x 31 columns]"
259 | ]
260 | },
261 | "execution_count": 3,
262 | "metadata": {},
263 | "output_type": "execute_result"
264 | }
265 | ],
266 | "source": [
267 | "data.head(5)"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 4,
273 | "metadata": {},
274 | "outputs": [
275 | {
276 | "data": {
277 | "text/plain": [
278 | "((455, 30), (114, 30))"
279 | ]
280 | },
281 | "execution_count": 4,
282 | "metadata": {},
283 | "output_type": "execute_result"
284 | }
285 | ],
286 | "source": [
287 | "X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
288 | " data.target, test_size=0.2,\n",
289 | " random_state=0)\n",
290 | "X_train.shape, X_test.shape"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "## Forward Selection\n",
298 | " "
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 16,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "name": "stderr",
308 | "output_type": "stream",
309 | "text": [
310 | "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
311 | "[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.4s finished\n",
312 | "Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
313 | "[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
314 | "Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
315 | "[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
316 | "Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
317 | "[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.3s finished\n",
318 | "Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
319 | "[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.0s finished\n",
320 | "Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
321 | "[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
322 | "Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
323 | "[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
324 | "Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
325 | "[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
326 | "Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
327 | "[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.4s finished\n",
328 | "Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
329 | "[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.1s finished\n",
330 | "Features: 10/10"
331 | ]
332 | }
333 | ],
334 | "source": [
335 | "# step forward feature selection\n",
336 | "# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
337 | "\n",
338 | "sfs1 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
339 | " k_features=10, \n",
340 | " forward=True, \n",
341 | " floating=False, \n",
342 | " verbose=1,\n",
343 | " scoring='roc_auc',\n",
344 | " cv=3)\n",
345 | "\n",
346 | "sfs1 = sfs1.fit(np.array(X_train), y_train)"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 17,
352 | "metadata": {},
353 | "outputs": [
354 | {
355 | "data": {
356 | "text/plain": [
357 | "Index(['mean texture', 'mean perimeter', 'mean concavity',\n",
358 | " 'mean fractal dimension', 'area error', 'compactness error',\n",
359 | " 'worst perimeter', 'worst area', 'worst smoothness', 'worst symmetry'],\n",
360 | " dtype='object')"
361 | ]
362 | },
363 | "execution_count": 17,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "selected_feat1= X_train.columns[list(sfs1.k_feature_idx_)]\n",
370 | "selected_feat1"
371 | ]
372 | },
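As a quick sanity check, the selected subset can be refit and scored on the held-out set; a minimal sketch assuming the split above (the n_estimators value here is an illustrative choice, not the notebook's):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# evaluate the forward-selected subset on the held-out test set
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(X_train[selected_feat1], y_train)
proba = rf.predict_proba(X_test[selected_feat1])[:, 1]
print(f"roc_auc with {len(selected_feat1)} features: {roc_auc_score(y_test, proba):.3f}")
```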
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "## Backward Elimination"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 18,
383 | "metadata": {},
384 | "outputs": [
385 | {
386 | "name": "stderr",
387 | "output_type": "stream",
388 | "text": [
389 | "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
390 | "[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.5s finished\n",
391 | "Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
392 | "[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
393 | "Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
394 | "[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
395 | "Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
396 | "[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.2s finished\n",
397 | "Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
398 | "[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.1s finished\n",
399 | "Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
400 | "[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
401 | "Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
402 | "[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
403 | "Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
404 | "[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
405 | "Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
406 | "[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.5s finished\n",
407 | "Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
408 | "[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.2s finished\n",
409 | "Features: 10/10"
410 | ]
411 | }
412 | ],
413 | "source": [
414 | "# step backward feature selection\n",
415 | "# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
416 | "\n",
417 | "sfs2 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
418 | " k_features=10, \n",
419 | " forward=False, \n",
420 | " floating=False, \n",
421 | " verbose=1,\n",
422 | " scoring='roc_auc',\n",
423 | " cv=3)\n",
424 | "\n",
425 | "sfs2 = sfs1.fit(np.array(X_train.fillna(0)), y_train)"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 44,
431 | "metadata": {},
432 | "outputs": [
433 | {
434 | "data": {
435 | "text/plain": [
436 | "Index(['mean area', 'mean compactness', 'texture error', 'area error',\n",
437 | " 'compactness error', 'concavity error', 'worst texture',\n",
438 | " 'worst perimeter', 'worst smoothness', 'worst concavity'],\n",
439 | " dtype='object')"
440 | ]
441 | },
442 | "execution_count": 44,
443 | "metadata": {},
444 | "output_type": "execute_result"
445 | }
446 | ],
447 | "source": [
448 | "selected_feat2= X_train.columns[list(sfs2.k_feature_idx_)]\n",
449 | "selected_feat2\n"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "Note that SFS and SBE return different results"
457 | ]
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "## Exhaustive Feature Selection"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 51,
469 | "metadata": {},
470 | "outputs": [
471 | {
472 | "name": "stderr",
473 | "output_type": "stream",
474 | "text": [
475 | "Features: 847/847"
476 | ]
477 | }
478 | ],
479 | "source": [
480 | "efs1 = EFS(RandomForestClassifier(n_jobs=-1,n_estimators=5, random_state=0), \n",
481 | " min_features=1,\n",
482 | " max_features=6, \n",
483 | " scoring='roc_auc',\n",
484 | " print_progress=True,\n",
485 | " cv=2)\n",
486 | "\n",
487 | "# in order to shorter search time for the demonstration\n",
488 | "# we only try all possible 1,2,3,4,5,6\n",
489 | "# feature combinations from a dataset of 10 features\n",
490 | "\n",
491 | "efs1 = efs1.fit(np.array(X_train[X_train.columns[0:10]].fillna(0)), y_train)"
492 | ]
493 | },
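The `Features: 847/847` counter above is simply the number of candidate subsets: all combinations of 1 to 6 features drawn from the 10 supplied. A one-liner to verify (math.comb requires Python 3.8+, newer than the 3.6 kernel recorded in this notebook):

```python
from math import comb

# C(10,1) + ... + C(10,6) = 10 + 45 + 120 + 210 + 252 + 210 = 847
print(sum(comb(10, k) for k in range(1, 7)))  # 847
```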
494 | {
495 | "cell_type": "code",
496 | "execution_count": 52,
497 | "metadata": {},
498 | "outputs": [
499 | {
500 | "data": {
501 | "text/plain": [
502 | "Index(['mean radius', 'mean texture', 'mean area', 'mean smoothness',\n",
503 | " 'mean concavity'],\n",
504 | " dtype='object')"
505 | ]
506 | },
507 | "execution_count": 52,
508 | "metadata": {},
509 | "output_type": "execute_result"
510 | }
511 | ],
512 | "source": [
513 | "selected_feat3= X_train.columns[list(efs1.best_idx_)]\n",
514 | "selected_feat3"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {
521 | "collapsed": true
522 | },
523 | "outputs": [],
524 | "source": []
525 | }
526 | ],
527 | "metadata": {
528 | "kernelspec": {
529 | "display_name": "Python 3",
530 | "language": "python",
531 | "name": "python3"
532 | },
533 | "language_info": {
534 | "codemirror_mode": {
535 | "name": "ipython",
536 | "version": 3
537 | },
538 | "file_extension": ".py",
539 | "mimetype": "text/x-python",
540 | "name": "python",
541 | "nbconvert_exporter": "python",
542 | "pygments_lexer": "ipython3",
543 | "version": "3.6.1"
544 | }
545 | },
546 | "nbformat": 4,
547 | "nbformat_minor": 2
548 | }
549 |
--------------------------------------------------------------------------------
/4.4_Demo_Feature_Selection_Feature_Shuffling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.feature_selection import SelectFromModel\n",
18 | "from sklearn.ensemble import RandomForestClassifier\n",
19 | "# plt.style.use('seaborn-colorblind')\n",
20 | "# %matplotlib inline\n",
21 | "from feature_selection import feature_shuffle\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Load Dataset"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "collapsed": true
36 | },
37 | "outputs": [],
38 | "source": [
39 | "from sklearn.datasets import load_breast_cancer\n",
40 | "data = load_breast_cancer()\n",
41 | "data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
42 | " columns= np.append(data['feature_names'], ['target']))"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/html": [
53 | "\n",
54 | "\n",
67 | "
\n",
68 | " \n",
69 | " \n",
70 | " | \n",
71 | " mean radius | \n",
72 | " mean texture | \n",
73 | " mean perimeter | \n",
74 | " mean area | \n",
75 | " mean smoothness | \n",
76 | " mean compactness | \n",
77 | " mean concavity | \n",
78 | " mean concave points | \n",
79 | " mean symmetry | \n",
80 | " mean fractal dimension | \n",
81 | " ... | \n",
82 | " worst texture | \n",
83 | " worst perimeter | \n",
84 | " worst area | \n",
85 | " worst smoothness | \n",
86 | " worst compactness | \n",
87 | " worst concavity | \n",
88 | " worst concave points | \n",
89 | " worst symmetry | \n",
90 | " worst fractal dimension | \n",
91 | " target | \n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | " \n",
96 | " 0 | \n",
97 | " 17.99 | \n",
98 | " 10.38 | \n",
99 | " 122.80 | \n",
100 | " 1001.0 | \n",
101 | " 0.11840 | \n",
102 | " 0.27760 | \n",
103 | " 0.3001 | \n",
104 | " 0.14710 | \n",
105 | " 0.2419 | \n",
106 | " 0.07871 | \n",
107 | " ... | \n",
108 | " 17.33 | \n",
109 | " 184.60 | \n",
110 | " 2019.0 | \n",
111 | " 0.1622 | \n",
112 | " 0.6656 | \n",
113 | " 0.7119 | \n",
114 | " 0.2654 | \n",
115 | " 0.4601 | \n",
116 | " 0.11890 | \n",
117 | " 0.0 | \n",
118 | "
\n",
119 | " \n",
120 | " 1 | \n",
121 | " 20.57 | \n",
122 | " 17.77 | \n",
123 | " 132.90 | \n",
124 | " 1326.0 | \n",
125 | " 0.08474 | \n",
126 | " 0.07864 | \n",
127 | " 0.0869 | \n",
128 | " 0.07017 | \n",
129 | " 0.1812 | \n",
130 | " 0.05667 | \n",
131 | " ... | \n",
132 | " 23.41 | \n",
133 | " 158.80 | \n",
134 | " 1956.0 | \n",
135 | " 0.1238 | \n",
136 | " 0.1866 | \n",
137 | " 0.2416 | \n",
138 | " 0.1860 | \n",
139 | " 0.2750 | \n",
140 | " 0.08902 | \n",
141 | " 0.0 | \n",
142 | "
\n",
143 | " \n",
144 | " 2 | \n",
145 | " 19.69 | \n",
146 | " 21.25 | \n",
147 | " 130.00 | \n",
148 | " 1203.0 | \n",
149 | " 0.10960 | \n",
150 | " 0.15990 | \n",
151 | " 0.1974 | \n",
152 | " 0.12790 | \n",
153 | " 0.2069 | \n",
154 | " 0.05999 | \n",
155 | " ... | \n",
156 | " 25.53 | \n",
157 | " 152.50 | \n",
158 | " 1709.0 | \n",
159 | " 0.1444 | \n",
160 | " 0.4245 | \n",
161 | " 0.4504 | \n",
162 | " 0.2430 | \n",
163 | " 0.3613 | \n",
164 | " 0.08758 | \n",
165 | " 0.0 | \n",
166 | "
\n",
167 | " \n",
168 | " 3 | \n",
169 | " 11.42 | \n",
170 | " 20.38 | \n",
171 | " 77.58 | \n",
172 | " 386.1 | \n",
173 | " 0.14250 | \n",
174 | " 0.28390 | \n",
175 | " 0.2414 | \n",
176 | " 0.10520 | \n",
177 | " 0.2597 | \n",
178 | " 0.09744 | \n",
179 | " ... | \n",
180 | " 26.50 | \n",
181 | " 98.87 | \n",
182 | " 567.7 | \n",
183 | " 0.2098 | \n",
184 | " 0.8663 | \n",
185 | " 0.6869 | \n",
186 | " 0.2575 | \n",
187 | " 0.6638 | \n",
188 | " 0.17300 | \n",
189 | " 0.0 | \n",
190 | "
\n",
191 | " \n",
192 | " 4 | \n",
193 | " 20.29 | \n",
194 | " 14.34 | \n",
195 | " 135.10 | \n",
196 | " 1297.0 | \n",
197 | " 0.10030 | \n",
198 | " 0.13280 | \n",
199 | " 0.1980 | \n",
200 | " 0.10430 | \n",
201 | " 0.1809 | \n",
202 | " 0.05883 | \n",
203 | " ... | \n",
204 | " 16.67 | \n",
205 | " 152.20 | \n",
206 | " 1575.0 | \n",
207 | " 0.1374 | \n",
208 | " 0.2050 | \n",
209 | " 0.4000 | \n",
210 | " 0.1625 | \n",
211 | " 0.2364 | \n",
212 | " 0.07678 | \n",
213 | " 0.0 | \n",
214 | "
\n",
215 | " \n",
216 | "
\n",
217 | "
5 rows × 31 columns
\n",
218 | "
"
219 | ],
220 | "text/plain": [
221 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n",
222 | "0 17.99 10.38 122.80 1001.0 0.11840 \n",
223 | "1 20.57 17.77 132.90 1326.0 0.08474 \n",
224 | "2 19.69 21.25 130.00 1203.0 0.10960 \n",
225 | "3 11.42 20.38 77.58 386.1 0.14250 \n",
226 | "4 20.29 14.34 135.10 1297.0 0.10030 \n",
227 | "\n",
228 | " mean compactness mean concavity mean concave points mean symmetry \\\n",
229 | "0 0.27760 0.3001 0.14710 0.2419 \n",
230 | "1 0.07864 0.0869 0.07017 0.1812 \n",
231 | "2 0.15990 0.1974 0.12790 0.2069 \n",
232 | "3 0.28390 0.2414 0.10520 0.2597 \n",
233 | "4 0.13280 0.1980 0.10430 0.1809 \n",
234 | "\n",
235 | " mean fractal dimension ... worst texture worst perimeter worst area \\\n",
236 | "0 0.07871 ... 17.33 184.60 2019.0 \n",
237 | "1 0.05667 ... 23.41 158.80 1956.0 \n",
238 | "2 0.05999 ... 25.53 152.50 1709.0 \n",
239 | "3 0.09744 ... 26.50 98.87 567.7 \n",
240 | "4 0.05883 ... 16.67 152.20 1575.0 \n",
241 | "\n",
242 | " worst smoothness worst compactness worst concavity worst concave points \\\n",
243 | "0 0.1622 0.6656 0.7119 0.2654 \n",
244 | "1 0.1238 0.1866 0.2416 0.1860 \n",
245 | "2 0.1444 0.4245 0.4504 0.2430 \n",
246 | "3 0.2098 0.8663 0.6869 0.2575 \n",
247 | "4 0.1374 0.2050 0.4000 0.1625 \n",
248 | "\n",
249 | " worst symmetry worst fractal dimension target \n",
250 | "0 0.4601 0.11890 0.0 \n",
251 | "1 0.2750 0.08902 0.0 \n",
252 | "2 0.3613 0.08758 0.0 \n",
253 | "3 0.6638 0.17300 0.0 \n",
254 | "4 0.2364 0.07678 0.0 \n",
255 | "\n",
256 | "[5 rows x 31 columns]"
257 | ]
258 | },
259 | "execution_count": 3,
260 | "metadata": {},
261 | "output_type": "execute_result"
262 | }
263 | ],
264 | "source": [
265 | "data.head(5)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 4,
271 | "metadata": {},
272 | "outputs": [
273 | {
274 | "data": {
275 | "text/plain": [
276 | "((455, 30), (114, 30))"
277 | ]
278 | },
279 | "execution_count": 4,
280 | "metadata": {},
281 | "output_type": "execute_result"
282 | }
283 | ],
284 | "source": [
285 | "X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
286 | " data.target, test_size=0.2,\n",
287 | " random_state=0)\n",
288 | "X_train.shape, X_test.shape"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "## Feature Shuffling\n",
296 | "permute the values of each feature, one at the time, and measure how much the permutation decreases the accuracy, or the roc_auc, or the mse of the machine learning model.\n",
297 | "If the variables are important, this is, highly predictive, a random permutation of their values will decrease dramatically any of these metrics."
298 | ]
299 | },
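The repo's feature_shuffle.feature_shuffle_rf helper is demonstrated below; for comparison, scikit-learn (0.22+) ships the same idea as sklearn.inspection.permutation_importance. A minimal sketch, assuming the split created above (the model and n_repeats are illustrative choices):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# fit a model, then measure the roc_auc drop when each feature is permuted
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
result = permutation_importance(rf, X_train, y_train, scoring='roc_auc',
                                n_repeats=5, random_state=0)
for idx in result.importances_mean.argsort()[::-1][:5]:
    print(f"{X_train.columns[idx]}: {result.importances_mean[idx]:.4f}")
```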
300 | {
301 | "cell_type": "code",
302 | "execution_count": 17,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "auc_drop, selected_features = feature_shuffle.feature_shuffle_rf(X_train=X_train,\n",
307 | " y_train=y_train,\n",
308 | " random_state=0)"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 18,
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "data": {
318 | "text/html": [
319 | "\n",
320 | "\n",
333 | "
\n",
334 | " \n",
335 | " \n",
336 | " | \n",
337 | " feature | \n",
338 | " auc_drop | \n",
339 | "
\n",
340 | " \n",
341 | " \n",
342 | " \n",
343 | " 22 | \n",
344 | " worst perimeter | \n",
345 | " 8.359457e-05 | \n",
346 | "
\n",
347 | " \n",
348 | " 27 | \n",
349 | " worst concave points | \n",
350 | " 3.134796e-05 | \n",
351 | "
\n",
352 | " \n",
353 | " 23 | \n",
354 | " worst area | \n",
355 | " 1.110223e-16 | \n",
356 | "
\n",
357 | " \n",
358 | " 12 | \n",
359 | " perimeter error | \n",
360 | " 1.110223e-16 | \n",
361 | "
\n",
362 | " \n",
363 | " 0 | \n",
364 | " mean radius | \n",
365 | " 0.000000e+00 | \n",
366 | "
\n",
367 | " \n",
368 | " 16 | \n",
369 | " concavity error | \n",
370 | " 0.000000e+00 | \n",
371 | "
\n",
372 | " \n",
373 | " 28 | \n",
374 | " worst symmetry | \n",
375 | " 0.000000e+00 | \n",
376 | "
\n",
377 | " \n",
378 | " 26 | \n",
379 | " worst concavity | \n",
380 | " 0.000000e+00 | \n",
381 | "
\n",
382 | " \n",
383 | " 25 | \n",
384 | " worst compactness | \n",
385 | " 0.000000e+00 | \n",
386 | "
\n",
387 | " \n",
388 | " 24 | \n",
389 | " worst smoothness | \n",
390 | " 0.000000e+00 | \n",
391 | "
\n",
392 | " \n",
393 | " 21 | \n",
394 | " worst texture | \n",
395 | " 0.000000e+00 | \n",
396 | "
\n",
397 | " \n",
398 | " 20 | \n",
399 | " worst radius | \n",
400 | " 0.000000e+00 | \n",
401 | "
\n",
402 | " \n",
403 | " 19 | \n",
404 | " fractal dimension error | \n",
405 | " 0.000000e+00 | \n",
406 | "
\n",
407 | " \n",
408 | " 18 | \n",
409 | " symmetry error | \n",
410 | " 0.000000e+00 | \n",
411 | "
\n",
412 | " \n",
413 | " 17 | \n",
414 | " concave points error | \n",
415 | " 0.000000e+00 | \n",
416 | "
\n",
417 | " \n",
418 | " 15 | \n",
419 | " compactness error | \n",
420 | " 0.000000e+00 | \n",
421 | "
\n",
422 | " \n",
423 | " 1 | \n",
424 | " mean texture | \n",
425 | " 0.000000e+00 | \n",
426 | "
\n",
427 | " \n",
428 | " 14 | \n",
429 | " smoothness error | \n",
430 | " 0.000000e+00 | \n",
431 | "
\n",
432 | " \n",
433 | " 13 | \n",
434 | " area error | \n",
435 | " 0.000000e+00 | \n",
436 | "
\n",
437 | " \n",
438 | " 11 | \n",
439 | " texture error | \n",
440 | " 0.000000e+00 | \n",
441 | "
\n",
442 | " \n",
443 | " 10 | \n",
444 | " radius error | \n",
445 | " 0.000000e+00 | \n",
446 | "
\n",
447 | " \n",
448 | " 9 | \n",
449 | " mean fractal dimension | \n",
450 | " 0.000000e+00 | \n",
451 | "
\n",
452 | " \n",
453 | " 8 | \n",
454 | " mean symmetry | \n",
455 | " 0.000000e+00 | \n",
456 | "
\n",
457 | " \n",
458 | " 7 | \n",
459 | " mean concave points | \n",
460 | " 0.000000e+00 | \n",
461 | "
\n",
462 | " \n",
463 | " 6 | \n",
464 | " mean concavity | \n",
465 | " 0.000000e+00 | \n",
466 | "
\n",
467 | " \n",
468 | " 5 | \n",
469 | " mean compactness | \n",
470 | " 0.000000e+00 | \n",
471 | "
\n",
472 | " \n",
473 | " 4 | \n",
474 | " mean smoothness | \n",
475 | " 0.000000e+00 | \n",
476 | "
\n",
477 | " \n",
478 | " 3 | \n",
479 | " mean area | \n",
480 | " 0.000000e+00 | \n",
481 | "
\n",
482 | " \n",
483 | " 2 | \n",
484 | " mean perimeter | \n",
485 | " 0.000000e+00 | \n",
486 | "
\n",
487 | " \n",
488 | " 29 | \n",
489 | " worst fractal dimension | \n",
490 | " 0.000000e+00 | \n",
491 | "
\n",
492 | " \n",
493 | "
\n",
494 | "
"
495 | ],
496 | "text/plain": [
497 | " feature auc_drop\n",
498 | "22 worst perimeter 8.359457e-05\n",
499 | "27 worst concave points 3.134796e-05\n",
500 | "23 worst area 1.110223e-16\n",
501 | "12 perimeter error 1.110223e-16\n",
502 | "0 mean radius 0.000000e+00\n",
503 | "16 concavity error 0.000000e+00\n",
504 | "28 worst symmetry 0.000000e+00\n",
505 | "26 worst concavity 0.000000e+00\n",
506 | "25 worst compactness 0.000000e+00\n",
507 | "24 worst smoothness 0.000000e+00\n",
508 | "21 worst texture 0.000000e+00\n",
509 | "20 worst radius 0.000000e+00\n",
510 | "19 fractal dimension error 0.000000e+00\n",
511 | "18 symmetry error 0.000000e+00\n",
512 | "17 concave points error 0.000000e+00\n",
513 | "15 compactness error 0.000000e+00\n",
514 | "1 mean texture 0.000000e+00\n",
515 | "14 smoothness error 0.000000e+00\n",
516 | "13 area error 0.000000e+00\n",
517 | "11 texture error 0.000000e+00\n",
518 | "10 radius error 0.000000e+00\n",
519 | "9 mean fractal dimension 0.000000e+00\n",
520 | "8 mean symmetry 0.000000e+00\n",
521 | "7 mean concave points 0.000000e+00\n",
522 | "6 mean concavity 0.000000e+00\n",
523 | "5 mean compactness 0.000000e+00\n",
524 | "4 mean smoothness 0.000000e+00\n",
525 | "3 mean area 0.000000e+00\n",
526 | "2 mean perimeter 0.000000e+00\n",
527 | "29 worst fractal dimension 0.000000e+00"
528 | ]
529 | },
530 | "execution_count": 18,
531 | "metadata": {},
532 | "output_type": "execute_result"
533 | }
534 | ],
535 | "source": [
536 | "# we select features that have auc_drop > 0\n",
537 | "auc_drop"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 19,
543 | "metadata": {},
544 | "outputs": [
545 | {
546 | "data": {
547 | "text/plain": [
548 | "22 worst perimeter\n",
549 | "27 worst concave points\n",
550 | "23 worst area\n",
551 | "12 perimeter error\n",
552 | "Name: feature, dtype: object"
553 | ]
554 | },
555 | "execution_count": 19,
556 | "metadata": {},
557 | "output_type": "execute_result"
558 | }
559 | ],
560 | "source": [
561 | "selected_features"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {
568 | "collapsed": true
569 | },
570 | "outputs": [],
571 | "source": []
572 | }
573 | ],
574 | "metadata": {
575 | "kernelspec": {
576 | "display_name": "Python 3",
577 | "language": "python",
578 | "name": "python3"
579 | },
580 | "language_info": {
581 | "codemirror_mode": {
582 | "name": "ipython",
583 | "version": 3
584 | },
585 | "file_extension": ".py",
586 | "mimetype": "text/x-python",
587 | "name": "python",
588 | "nbconvert_exporter": "python",
589 | "pygments_lexer": "ipython3",
590 | "version": "3.6.1"
591 | }
592 | },
593 | "nbformat": 4,
594 | "nbformat_minor": 2
595 | }
596 |
--------------------------------------------------------------------------------
/A Short Guide for Feature Engineering and Feature Selection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/A Short Guide for Feature Engineering and Feature Selection.pdf
--------------------------------------------------------------------------------
/data/pima-indians-diabetes.data.csv:
--------------------------------------------------------------------------------
1 | 6,148,72,35,0,33.6,0.627,50,1
2 | 1,85,66,29,0,26.6,0.351,31,0
3 | 8,183,64,0,0,23.3,0.672,32,1
4 | 1,89,66,23,94,28.1,0.167,21,0
5 | 0,137,40,35,168,43.1,2.288,33,1
6 | 5,116,74,0,0,25.6,0.201,30,0
7 | 3,78,50,32,88,31.0,0.248,26,1
8 | 10,115,0,0,0,35.3,0.134,29,0
9 | 2,197,70,45,543,30.5,0.158,53,1
10 | 8,125,96,0,0,0.0,0.232,54,1
11 | 4,110,92,0,0,37.6,0.191,30,0
12 | 10,168,74,0,0,38.0,0.537,34,1
13 | 10,139,80,0,0,27.1,1.441,57,0
14 | 1,189,60,23,846,30.1,0.398,59,1
15 | 5,166,72,19,175,25.8,0.587,51,1
16 | 7,100,0,0,0,30.0,0.484,32,1
17 | 0,118,84,47,230,45.8,0.551,31,1
18 | 7,107,74,0,0,29.6,0.254,31,1
19 | 1,103,30,38,83,43.3,0.183,33,0
20 | 1,115,70,30,96,34.6,0.529,32,1
21 | 3,126,88,41,235,39.3,0.704,27,0
22 | 8,99,84,0,0,35.4,0.388,50,0
23 | 7,196,90,0,0,39.8,0.451,41,1
24 | 9,119,80,35,0,29.0,0.263,29,1
25 | 11,143,94,33,146,36.6,0.254,51,1
26 | 10,125,70,26,115,31.1,0.205,41,1
27 | 7,147,76,0,0,39.4,0.257,43,1
28 | 1,97,66,15,140,23.2,0.487,22,0
29 | 13,145,82,19,110,22.2,0.245,57,0
30 | 5,117,92,0,0,34.1,0.337,38,0
31 | 5,109,75,26,0,36.0,0.546,60,0
32 | 3,158,76,36,245,31.6,0.851,28,1
33 | 3,88,58,11,54,24.8,0.267,22,0
34 | 6,92,92,0,0,19.9,0.188,28,0
35 | 10,122,78,31,0,27.6,0.512,45,0
36 | 4,103,60,33,192,24.0,0.966,33,0
37 | 11,138,76,0,0,33.2,0.420,35,0
38 | 9,102,76,37,0,32.9,0.665,46,1
39 | 2,90,68,42,0,38.2,0.503,27,1
40 | 4,111,72,47,207,37.1,1.390,56,1
41 | 3,180,64,25,70,34.0,0.271,26,0
42 | 7,133,84,0,0,40.2,0.696,37,0
43 | 7,106,92,18,0,22.7,0.235,48,0
44 | 9,171,110,24,240,45.4,0.721,54,1
45 | 7,159,64,0,0,27.4,0.294,40,0
46 | 0,180,66,39,0,42.0,1.893,25,1
47 | 1,146,56,0,0,29.7,0.564,29,0
48 | 2,71,70,27,0,28.0,0.586,22,0
49 | 7,103,66,32,0,39.1,0.344,31,1
50 | 7,105,0,0,0,0.0,0.305,24,0
51 | 1,103,80,11,82,19.4,0.491,22,0
52 | 1,101,50,15,36,24.2,0.526,26,0
53 | 5,88,66,21,23,24.4,0.342,30,0
54 | 8,176,90,34,300,33.7,0.467,58,1
55 | 7,150,66,42,342,34.7,0.718,42,0
56 | 1,73,50,10,0,23.0,0.248,21,0
57 | 7,187,68,39,304,37.7,0.254,41,1
58 | 0,100,88,60,110,46.8,0.962,31,0
59 | 0,146,82,0,0,40.5,1.781,44,0
60 | 0,105,64,41,142,41.5,0.173,22,0
61 | 2,84,0,0,0,0.0,0.304,21,0
62 | 8,133,72,0,0,32.9,0.270,39,1
63 | 5,44,62,0,0,25.0,0.587,36,0
64 | 2,141,58,34,128,25.4,0.699,24,0
65 | 7,114,66,0,0,32.8,0.258,42,1
66 | 5,99,74,27,0,29.0,0.203,32,0
67 | 0,109,88,30,0,32.5,0.855,38,1
68 | 2,109,92,0,0,42.7,0.845,54,0
69 | 1,95,66,13,38,19.6,0.334,25,0
70 | 4,146,85,27,100,28.9,0.189,27,0
71 | 2,100,66,20,90,32.9,0.867,28,1
72 | 5,139,64,35,140,28.6,0.411,26,0
73 | 13,126,90,0,0,43.4,0.583,42,1
74 | 4,129,86,20,270,35.1,0.231,23,0
75 | 1,79,75,30,0,32.0,0.396,22,0
76 | 1,0,48,20,0,24.7,0.140,22,0
77 | 7,62,78,0,0,32.6,0.391,41,0
78 | 5,95,72,33,0,37.7,0.370,27,0
79 | 0,131,0,0,0,43.2,0.270,26,1
80 | 2,112,66,22,0,25.0,0.307,24,0
81 | 3,113,44,13,0,22.4,0.140,22,0
82 | 2,74,0,0,0,0.0,0.102,22,0
83 | 7,83,78,26,71,29.3,0.767,36,0
84 | 0,101,65,28,0,24.6,0.237,22,0
85 | 5,137,108,0,0,48.8,0.227,37,1
86 | 2,110,74,29,125,32.4,0.698,27,0
87 | 13,106,72,54,0,36.6,0.178,45,0
88 | 2,100,68,25,71,38.5,0.324,26,0
89 | 15,136,70,32,110,37.1,0.153,43,1
90 | 1,107,68,19,0,26.5,0.165,24,0
91 | 1,80,55,0,0,19.1,0.258,21,0
92 | 4,123,80,15,176,32.0,0.443,34,0
93 | 7,81,78,40,48,46.7,0.261,42,0
94 | 4,134,72,0,0,23.8,0.277,60,1
95 | 2,142,82,18,64,24.7,0.761,21,0
96 | 6,144,72,27,228,33.9,0.255,40,0
97 | 2,92,62,28,0,31.6,0.130,24,0
98 | 1,71,48,18,76,20.4,0.323,22,0
99 | 6,93,50,30,64,28.7,0.356,23,0
100 | 1,122,90,51,220,49.7,0.325,31,1
101 | 1,163,72,0,0,39.0,1.222,33,1
102 | 1,151,60,0,0,26.1,0.179,22,0
103 | 0,125,96,0,0,22.5,0.262,21,0
104 | 1,81,72,18,40,26.6,0.283,24,0
105 | 2,85,65,0,0,39.6,0.930,27,0
106 | 1,126,56,29,152,28.7,0.801,21,0
107 | 1,96,122,0,0,22.4,0.207,27,0
108 | 4,144,58,28,140,29.5,0.287,37,0
109 | 3,83,58,31,18,34.3,0.336,25,0
110 | 0,95,85,25,36,37.4,0.247,24,1
111 | 3,171,72,33,135,33.3,0.199,24,1
112 | 8,155,62,26,495,34.0,0.543,46,1
113 | 1,89,76,34,37,31.2,0.192,23,0
114 | 4,76,62,0,0,34.0,0.391,25,0
115 | 7,160,54,32,175,30.5,0.588,39,1
116 | 4,146,92,0,0,31.2,0.539,61,1
117 | 5,124,74,0,0,34.0,0.220,38,1
118 | 5,78,48,0,0,33.7,0.654,25,0
119 | 4,97,60,23,0,28.2,0.443,22,0
120 | 4,99,76,15,51,23.2,0.223,21,0
121 | 0,162,76,56,100,53.2,0.759,25,1
122 | 6,111,64,39,0,34.2,0.260,24,0
123 | 2,107,74,30,100,33.6,0.404,23,0
124 | 5,132,80,0,0,26.8,0.186,69,0
125 | 0,113,76,0,0,33.3,0.278,23,1
126 | 1,88,30,42,99,55.0,0.496,26,1
127 | 3,120,70,30,135,42.9,0.452,30,0
128 | 1,118,58,36,94,33.3,0.261,23,0
129 | 1,117,88,24,145,34.5,0.403,40,1
130 | 0,105,84,0,0,27.9,0.741,62,1
131 | 4,173,70,14,168,29.7,0.361,33,1
132 | 9,122,56,0,0,33.3,1.114,33,1
133 | 3,170,64,37,225,34.5,0.356,30,1
134 | 8,84,74,31,0,38.3,0.457,39,0
135 | 2,96,68,13,49,21.1,0.647,26,0
136 | 2,125,60,20,140,33.8,0.088,31,0
137 | 0,100,70,26,50,30.8,0.597,21,0
138 | 0,93,60,25,92,28.7,0.532,22,0
139 | 0,129,80,0,0,31.2,0.703,29,0
140 | 5,105,72,29,325,36.9,0.159,28,0
141 | 3,128,78,0,0,21.1,0.268,55,0
142 | 5,106,82,30,0,39.5,0.286,38,0
143 | 2,108,52,26,63,32.5,0.318,22,0
144 | 10,108,66,0,0,32.4,0.272,42,1
145 | 4,154,62,31,284,32.8,0.237,23,0
146 | 0,102,75,23,0,0.0,0.572,21,0
147 | 9,57,80,37,0,32.8,0.096,41,0
148 | 2,106,64,35,119,30.5,1.400,34,0
149 | 5,147,78,0,0,33.7,0.218,65,0
150 | 2,90,70,17,0,27.3,0.085,22,0
151 | 1,136,74,50,204,37.4,0.399,24,0
152 | 4,114,65,0,0,21.9,0.432,37,0
153 | 9,156,86,28,155,34.3,1.189,42,1
154 | 1,153,82,42,485,40.6,0.687,23,0
155 | 8,188,78,0,0,47.9,0.137,43,1
156 | 7,152,88,44,0,50.0,0.337,36,1
157 | 2,99,52,15,94,24.6,0.637,21,0
158 | 1,109,56,21,135,25.2,0.833,23,0
159 | 2,88,74,19,53,29.0,0.229,22,0
160 | 17,163,72,41,114,40.9,0.817,47,1
161 | 4,151,90,38,0,29.7,0.294,36,0
162 | 7,102,74,40,105,37.2,0.204,45,0
163 | 0,114,80,34,285,44.2,0.167,27,0
164 | 2,100,64,23,0,29.7,0.368,21,0
165 | 0,131,88,0,0,31.6,0.743,32,1
166 | 6,104,74,18,156,29.9,0.722,41,1
167 | 3,148,66,25,0,32.5,0.256,22,0
168 | 4,120,68,0,0,29.6,0.709,34,0
169 | 4,110,66,0,0,31.9,0.471,29,0
170 | 3,111,90,12,78,28.4,0.495,29,0
171 | 6,102,82,0,0,30.8,0.180,36,1
172 | 6,134,70,23,130,35.4,0.542,29,1
173 | 2,87,0,23,0,28.9,0.773,25,0
174 | 1,79,60,42,48,43.5,0.678,23,0
175 | 2,75,64,24,55,29.7,0.370,33,0
176 | 8,179,72,42,130,32.7,0.719,36,1
177 | 6,85,78,0,0,31.2,0.382,42,0
178 | 0,129,110,46,130,67.1,0.319,26,1
179 | 5,143,78,0,0,45.0,0.190,47,0
180 | 5,130,82,0,0,39.1,0.956,37,1
181 | 6,87,80,0,0,23.2,0.084,32,0
182 | 0,119,64,18,92,34.9,0.725,23,0
183 | 1,0,74,20,23,27.7,0.299,21,0
184 | 5,73,60,0,0,26.8,0.268,27,0
185 | 4,141,74,0,0,27.6,0.244,40,0
186 | 7,194,68,28,0,35.9,0.745,41,1
187 | 8,181,68,36,495,30.1,0.615,60,1
188 | 1,128,98,41,58,32.0,1.321,33,1
189 | 8,109,76,39,114,27.9,0.640,31,1
190 | 5,139,80,35,160,31.6,0.361,25,1
191 | 3,111,62,0,0,22.6,0.142,21,0
192 | 9,123,70,44,94,33.1,0.374,40,0
193 | 7,159,66,0,0,30.4,0.383,36,1
194 | 11,135,0,0,0,52.3,0.578,40,1
195 | 8,85,55,20,0,24.4,0.136,42,0
196 | 5,158,84,41,210,39.4,0.395,29,1
197 | 1,105,58,0,0,24.3,0.187,21,0
198 | 3,107,62,13,48,22.9,0.678,23,1
199 | 4,109,64,44,99,34.8,0.905,26,1
200 | 4,148,60,27,318,30.9,0.150,29,1
201 | 0,113,80,16,0,31.0,0.874,21,0
202 | 1,138,82,0,0,40.1,0.236,28,0
203 | 0,108,68,20,0,27.3,0.787,32,0
204 | 2,99,70,16,44,20.4,0.235,27,0
205 | 6,103,72,32,190,37.7,0.324,55,0
206 | 5,111,72,28,0,23.9,0.407,27,0
207 | 8,196,76,29,280,37.5,0.605,57,1
208 | 5,162,104,0,0,37.7,0.151,52,1
209 | 1,96,64,27,87,33.2,0.289,21,0
210 | 7,184,84,33,0,35.5,0.355,41,1
211 | 2,81,60,22,0,27.7,0.290,25,0
212 | 0,147,85,54,0,42.8,0.375,24,0
213 | 7,179,95,31,0,34.2,0.164,60,0
214 | 0,140,65,26,130,42.6,0.431,24,1
215 | 9,112,82,32,175,34.2,0.260,36,1
216 | 12,151,70,40,271,41.8,0.742,38,1
217 | 5,109,62,41,129,35.8,0.514,25,1
218 | 6,125,68,30,120,30.0,0.464,32,0
219 | 5,85,74,22,0,29.0,1.224,32,1
220 | 5,112,66,0,0,37.8,0.261,41,1
221 | 0,177,60,29,478,34.6,1.072,21,1
222 | 2,158,90,0,0,31.6,0.805,66,1
223 | 7,119,0,0,0,25.2,0.209,37,0
224 | 7,142,60,33,190,28.8,0.687,61,0
225 | 1,100,66,15,56,23.6,0.666,26,0
226 | 1,87,78,27,32,34.6,0.101,22,0
227 | 0,101,76,0,0,35.7,0.198,26,0
228 | 3,162,52,38,0,37.2,0.652,24,1
229 | 4,197,70,39,744,36.7,2.329,31,0
230 | 0,117,80,31,53,45.2,0.089,24,0
231 | 4,142,86,0,0,44.0,0.645,22,1
232 | 6,134,80,37,370,46.2,0.238,46,1
233 | 1,79,80,25,37,25.4,0.583,22,0
234 | 4,122,68,0,0,35.0,0.394,29,0
235 | 3,74,68,28,45,29.7,0.293,23,0
236 | 4,171,72,0,0,43.6,0.479,26,1
237 | 7,181,84,21,192,35.9,0.586,51,1
238 | 0,179,90,27,0,44.1,0.686,23,1
239 | 9,164,84,21,0,30.8,0.831,32,1
240 | 0,104,76,0,0,18.4,0.582,27,0
241 | 1,91,64,24,0,29.2,0.192,21,0
242 | 4,91,70,32,88,33.1,0.446,22,0
243 | 3,139,54,0,0,25.6,0.402,22,1
244 | 6,119,50,22,176,27.1,1.318,33,1
245 | 2,146,76,35,194,38.2,0.329,29,0
246 | 9,184,85,15,0,30.0,1.213,49,1
247 | 10,122,68,0,0,31.2,0.258,41,0
248 | 0,165,90,33,680,52.3,0.427,23,0
249 | 9,124,70,33,402,35.4,0.282,34,0
250 | 1,111,86,19,0,30.1,0.143,23,0
251 | 9,106,52,0,0,31.2,0.380,42,0
252 | 2,129,84,0,0,28.0,0.284,27,0
253 | 2,90,80,14,55,24.4,0.249,24,0
254 | 0,86,68,32,0,35.8,0.238,25,0
255 | 12,92,62,7,258,27.6,0.926,44,1
256 | 1,113,64,35,0,33.6,0.543,21,1
257 | 3,111,56,39,0,30.1,0.557,30,0
258 | 2,114,68,22,0,28.7,0.092,25,0
259 | 1,193,50,16,375,25.9,0.655,24,0
260 | 11,155,76,28,150,33.3,1.353,51,1
261 | 3,191,68,15,130,30.9,0.299,34,0
262 | 3,141,0,0,0,30.0,0.761,27,1
263 | 4,95,70,32,0,32.1,0.612,24,0
264 | 3,142,80,15,0,32.4,0.200,63,0
265 | 4,123,62,0,0,32.0,0.226,35,1
266 | 5,96,74,18,67,33.6,0.997,43,0
267 | 0,138,0,0,0,36.3,0.933,25,1
268 | 2,128,64,42,0,40.0,1.101,24,0
269 | 0,102,52,0,0,25.1,0.078,21,0
270 | 2,146,0,0,0,27.5,0.240,28,1
271 | 10,101,86,37,0,45.6,1.136,38,1
272 | 2,108,62,32,56,25.2,0.128,21,0
273 | 3,122,78,0,0,23.0,0.254,40,0
274 | 1,71,78,50,45,33.2,0.422,21,0
275 | 13,106,70,0,0,34.2,0.251,52,0
276 | 2,100,70,52,57,40.5,0.677,25,0
277 | 7,106,60,24,0,26.5,0.296,29,1
278 | 0,104,64,23,116,27.8,0.454,23,0
279 | 5,114,74,0,0,24.9,0.744,57,0
280 | 2,108,62,10,278,25.3,0.881,22,0
281 | 0,146,70,0,0,37.9,0.334,28,1
282 | 10,129,76,28,122,35.9,0.280,39,0
283 | 7,133,88,15,155,32.4,0.262,37,0
284 | 7,161,86,0,0,30.4,0.165,47,1
285 | 2,108,80,0,0,27.0,0.259,52,1
286 | 7,136,74,26,135,26.0,0.647,51,0
287 | 5,155,84,44,545,38.7,0.619,34,0
288 | 1,119,86,39,220,45.6,0.808,29,1
289 | 4,96,56,17,49,20.8,0.340,26,0
290 | 5,108,72,43,75,36.1,0.263,33,0
291 | 0,78,88,29,40,36.9,0.434,21,0
292 | 0,107,62,30,74,36.6,0.757,25,1
293 | 2,128,78,37,182,43.3,1.224,31,1
294 | 1,128,48,45,194,40.5,0.613,24,1
295 | 0,161,50,0,0,21.9,0.254,65,0
296 | 6,151,62,31,120,35.5,0.692,28,0
297 | 2,146,70,38,360,28.0,0.337,29,1
298 | 0,126,84,29,215,30.7,0.520,24,0
299 | 14,100,78,25,184,36.6,0.412,46,1
300 | 8,112,72,0,0,23.6,0.840,58,0
301 | 0,167,0,0,0,32.3,0.839,30,1
302 | 2,144,58,33,135,31.6,0.422,25,1
303 | 5,77,82,41,42,35.8,0.156,35,0
304 | 5,115,98,0,0,52.9,0.209,28,1
305 | 3,150,76,0,0,21.0,0.207,37,0
306 | 2,120,76,37,105,39.7,0.215,29,0
307 | 10,161,68,23,132,25.5,0.326,47,1
308 | 0,137,68,14,148,24.8,0.143,21,0
309 | 0,128,68,19,180,30.5,1.391,25,1
310 | 2,124,68,28,205,32.9,0.875,30,1
311 | 6,80,66,30,0,26.2,0.313,41,0
312 | 0,106,70,37,148,39.4,0.605,22,0
313 | 2,155,74,17,96,26.6,0.433,27,1
314 | 3,113,50,10,85,29.5,0.626,25,0
315 | 7,109,80,31,0,35.9,1.127,43,1
316 | 2,112,68,22,94,34.1,0.315,26,0
317 | 3,99,80,11,64,19.3,0.284,30,0
318 | 3,182,74,0,0,30.5,0.345,29,1
319 | 3,115,66,39,140,38.1,0.150,28,0
320 | 6,194,78,0,0,23.5,0.129,59,1
321 | 4,129,60,12,231,27.5,0.527,31,0
322 | 3,112,74,30,0,31.6,0.197,25,1
323 | 0,124,70,20,0,27.4,0.254,36,1
324 | 13,152,90,33,29,26.8,0.731,43,1
325 | 2,112,75,32,0,35.7,0.148,21,0
326 | 1,157,72,21,168,25.6,0.123,24,0
327 | 1,122,64,32,156,35.1,0.692,30,1
328 | 10,179,70,0,0,35.1,0.200,37,0
329 | 2,102,86,36,120,45.5,0.127,23,1
330 | 6,105,70,32,68,30.8,0.122,37,0
331 | 8,118,72,19,0,23.1,1.476,46,0
332 | 2,87,58,16,52,32.7,0.166,25,0
333 | 1,180,0,0,0,43.3,0.282,41,1
334 | 12,106,80,0,0,23.6,0.137,44,0
335 | 1,95,60,18,58,23.9,0.260,22,0
336 | 0,165,76,43,255,47.9,0.259,26,0
337 | 0,117,0,0,0,33.8,0.932,44,0
338 | 5,115,76,0,0,31.2,0.343,44,1
339 | 9,152,78,34,171,34.2,0.893,33,1
340 | 7,178,84,0,0,39.9,0.331,41,1
341 | 1,130,70,13,105,25.9,0.472,22,0
342 | 1,95,74,21,73,25.9,0.673,36,0
343 | 1,0,68,35,0,32.0,0.389,22,0
344 | 5,122,86,0,0,34.7,0.290,33,0
345 | 8,95,72,0,0,36.8,0.485,57,0
346 | 8,126,88,36,108,38.5,0.349,49,0
347 | 1,139,46,19,83,28.7,0.654,22,0
348 | 3,116,0,0,0,23.5,0.187,23,0
349 | 3,99,62,19,74,21.8,0.279,26,0
350 | 5,0,80,32,0,41.0,0.346,37,1
351 | 4,92,80,0,0,42.2,0.237,29,0
352 | 4,137,84,0,0,31.2,0.252,30,0
353 | 3,61,82,28,0,34.4,0.243,46,0
354 | 1,90,62,12,43,27.2,0.580,24,0
355 | 3,90,78,0,0,42.7,0.559,21,0
356 | 9,165,88,0,0,30.4,0.302,49,1
357 | 1,125,50,40,167,33.3,0.962,28,1
358 | 13,129,0,30,0,39.9,0.569,44,1
359 | 12,88,74,40,54,35.3,0.378,48,0
360 | 1,196,76,36,249,36.5,0.875,29,1
361 | 5,189,64,33,325,31.2,0.583,29,1
362 | 5,158,70,0,0,29.8,0.207,63,0
363 | 5,103,108,37,0,39.2,0.305,65,0
364 | 4,146,78,0,0,38.5,0.520,67,1
365 | 4,147,74,25,293,34.9,0.385,30,0
366 | 5,99,54,28,83,34.0,0.499,30,0
367 | 6,124,72,0,0,27.6,0.368,29,1
368 | 0,101,64,17,0,21.0,0.252,21,0
369 | 3,81,86,16,66,27.5,0.306,22,0
370 | 1,133,102,28,140,32.8,0.234,45,1
371 | 3,173,82,48,465,38.4,2.137,25,1
372 | 0,118,64,23,89,0.0,1.731,21,0
373 | 0,84,64,22,66,35.8,0.545,21,0
374 | 2,105,58,40,94,34.9,0.225,25,0
375 | 2,122,52,43,158,36.2,0.816,28,0
376 | 12,140,82,43,325,39.2,0.528,58,1
377 | 0,98,82,15,84,25.2,0.299,22,0
378 | 1,87,60,37,75,37.2,0.509,22,0
379 | 4,156,75,0,0,48.3,0.238,32,1
380 | 0,93,100,39,72,43.4,1.021,35,0
381 | 1,107,72,30,82,30.8,0.821,24,0
382 | 0,105,68,22,0,20.0,0.236,22,0
383 | 1,109,60,8,182,25.4,0.947,21,0
384 | 1,90,62,18,59,25.1,1.268,25,0
385 | 1,125,70,24,110,24.3,0.221,25,0
386 | 1,119,54,13,50,22.3,0.205,24,0
387 | 5,116,74,29,0,32.3,0.660,35,1
388 | 8,105,100,36,0,43.3,0.239,45,1
389 | 5,144,82,26,285,32.0,0.452,58,1
390 | 3,100,68,23,81,31.6,0.949,28,0
391 | 1,100,66,29,196,32.0,0.444,42,0
392 | 5,166,76,0,0,45.7,0.340,27,1
393 | 1,131,64,14,415,23.7,0.389,21,0
394 | 4,116,72,12,87,22.1,0.463,37,0
395 | 4,158,78,0,0,32.9,0.803,31,1
396 | 2,127,58,24,275,27.7,1.600,25,0
397 | 3,96,56,34,115,24.7,0.944,39,0
398 | 0,131,66,40,0,34.3,0.196,22,1
399 | 3,82,70,0,0,21.1,0.389,25,0
400 | 3,193,70,31,0,34.9,0.241,25,1
401 | 4,95,64,0,0,32.0,0.161,31,1
402 | 6,137,61,0,0,24.2,0.151,55,0
403 | 5,136,84,41,88,35.0,0.286,35,1
404 | 9,72,78,25,0,31.6,0.280,38,0
405 | 5,168,64,0,0,32.9,0.135,41,1
406 | 2,123,48,32,165,42.1,0.520,26,0
407 | 4,115,72,0,0,28.9,0.376,46,1
408 | 0,101,62,0,0,21.9,0.336,25,0
409 | 8,197,74,0,0,25.9,1.191,39,1
410 | 1,172,68,49,579,42.4,0.702,28,1
411 | 6,102,90,39,0,35.7,0.674,28,0
412 | 1,112,72,30,176,34.4,0.528,25,0
413 | 1,143,84,23,310,42.4,1.076,22,0
414 | 1,143,74,22,61,26.2,0.256,21,0
415 | 0,138,60,35,167,34.6,0.534,21,1
416 | 3,173,84,33,474,35.7,0.258,22,1
417 | 1,97,68,21,0,27.2,1.095,22,0
418 | 4,144,82,32,0,38.5,0.554,37,1
419 | 1,83,68,0,0,18.2,0.624,27,0
420 | 3,129,64,29,115,26.4,0.219,28,1
421 | 1,119,88,41,170,45.3,0.507,26,0
422 | 2,94,68,18,76,26.0,0.561,21,0
423 | 0,102,64,46,78,40.6,0.496,21,0
424 | 2,115,64,22,0,30.8,0.421,21,0
425 | 8,151,78,32,210,42.9,0.516,36,1
426 | 4,184,78,39,277,37.0,0.264,31,1
427 | 0,94,0,0,0,0.0,0.256,25,0
428 | 1,181,64,30,180,34.1,0.328,38,1
429 | 0,135,94,46,145,40.6,0.284,26,0
430 | 1,95,82,25,180,35.0,0.233,43,1
431 | 2,99,0,0,0,22.2,0.108,23,0
432 | 3,89,74,16,85,30.4,0.551,38,0
433 | 1,80,74,11,60,30.0,0.527,22,0
434 | 2,139,75,0,0,25.6,0.167,29,0
435 | 1,90,68,8,0,24.5,1.138,36,0
436 | 0,141,0,0,0,42.4,0.205,29,1
437 | 12,140,85,33,0,37.4,0.244,41,0
438 | 5,147,75,0,0,29.9,0.434,28,0
439 | 1,97,70,15,0,18.2,0.147,21,0
440 | 6,107,88,0,0,36.8,0.727,31,0
441 | 0,189,104,25,0,34.3,0.435,41,1
442 | 2,83,66,23,50,32.2,0.497,22,0
443 | 4,117,64,27,120,33.2,0.230,24,0
444 | 8,108,70,0,0,30.5,0.955,33,1
445 | 4,117,62,12,0,29.7,0.380,30,1
446 | 0,180,78,63,14,59.4,2.420,25,1
447 | 1,100,72,12,70,25.3,0.658,28,0
448 | 0,95,80,45,92,36.5,0.330,26,0
449 | 0,104,64,37,64,33.6,0.510,22,1
450 | 0,120,74,18,63,30.5,0.285,26,0
451 | 1,82,64,13,95,21.2,0.415,23,0
452 | 2,134,70,0,0,28.9,0.542,23,1
453 | 0,91,68,32,210,39.9,0.381,25,0
454 | 2,119,0,0,0,19.6,0.832,72,0
455 | 2,100,54,28,105,37.8,0.498,24,0
456 | 14,175,62,30,0,33.6,0.212,38,1
457 | 1,135,54,0,0,26.7,0.687,62,0
458 | 5,86,68,28,71,30.2,0.364,24,0
459 | 10,148,84,48,237,37.6,1.001,51,1
460 | 9,134,74,33,60,25.9,0.460,81,0
461 | 9,120,72,22,56,20.8,0.733,48,0
462 | 1,71,62,0,0,21.8,0.416,26,0
463 | 8,74,70,40,49,35.3,0.705,39,0
464 | 5,88,78,30,0,27.6,0.258,37,0
465 | 10,115,98,0,0,24.0,1.022,34,0
466 | 0,124,56,13,105,21.8,0.452,21,0
467 | 0,74,52,10,36,27.8,0.269,22,0
468 | 0,97,64,36,100,36.8,0.600,25,0
469 | 8,120,0,0,0,30.0,0.183,38,1
470 | 6,154,78,41,140,46.1,0.571,27,0
471 | 1,144,82,40,0,41.3,0.607,28,0
472 | 0,137,70,38,0,33.2,0.170,22,0
473 | 0,119,66,27,0,38.8,0.259,22,0
474 | 7,136,90,0,0,29.9,0.210,50,0
475 | 4,114,64,0,0,28.9,0.126,24,0
476 | 0,137,84,27,0,27.3,0.231,59,0
477 | 2,105,80,45,191,33.7,0.711,29,1
478 | 7,114,76,17,110,23.8,0.466,31,0
479 | 8,126,74,38,75,25.9,0.162,39,0
480 | 4,132,86,31,0,28.0,0.419,63,0
481 | 3,158,70,30,328,35.5,0.344,35,1
482 | 0,123,88,37,0,35.2,0.197,29,0
483 | 4,85,58,22,49,27.8,0.306,28,0
484 | 0,84,82,31,125,38.2,0.233,23,0
485 | 0,145,0,0,0,44.2,0.630,31,1
486 | 0,135,68,42,250,42.3,0.365,24,1
487 | 1,139,62,41,480,40.7,0.536,21,0
488 | 0,173,78,32,265,46.5,1.159,58,0
489 | 4,99,72,17,0,25.6,0.294,28,0
490 | 8,194,80,0,0,26.1,0.551,67,0
491 | 2,83,65,28,66,36.8,0.629,24,0
492 | 2,89,90,30,0,33.5,0.292,42,0
493 | 4,99,68,38,0,32.8,0.145,33,0
494 | 4,125,70,18,122,28.9,1.144,45,1
495 | 3,80,0,0,0,0.0,0.174,22,0
496 | 6,166,74,0,0,26.6,0.304,66,0
497 | 5,110,68,0,0,26.0,0.292,30,0
498 | 2,81,72,15,76,30.1,0.547,25,0
499 | 7,195,70,33,145,25.1,0.163,55,1
500 | 6,154,74,32,193,29.3,0.839,39,0
501 | 2,117,90,19,71,25.2,0.313,21,0
502 | 3,84,72,32,0,37.2,0.267,28,0
503 | 6,0,68,41,0,39.0,0.727,41,1
504 | 7,94,64,25,79,33.3,0.738,41,0
505 | 3,96,78,39,0,37.3,0.238,40,0
506 | 10,75,82,0,0,33.3,0.263,38,0
507 | 0,180,90,26,90,36.5,0.314,35,1
508 | 1,130,60,23,170,28.6,0.692,21,0
509 | 2,84,50,23,76,30.4,0.968,21,0
510 | 8,120,78,0,0,25.0,0.409,64,0
511 | 12,84,72,31,0,29.7,0.297,46,1
512 | 0,139,62,17,210,22.1,0.207,21,0
513 | 9,91,68,0,0,24.2,0.200,58,0
514 | 2,91,62,0,0,27.3,0.525,22,0
515 | 3,99,54,19,86,25.6,0.154,24,0
516 | 3,163,70,18,105,31.6,0.268,28,1
517 | 9,145,88,34,165,30.3,0.771,53,1
518 | 7,125,86,0,0,37.6,0.304,51,0
519 | 13,76,60,0,0,32.8,0.180,41,0
520 | 6,129,90,7,326,19.6,0.582,60,0
521 | 2,68,70,32,66,25.0,0.187,25,0
522 | 3,124,80,33,130,33.2,0.305,26,0
523 | 6,114,0,0,0,0.0,0.189,26,0
524 | 9,130,70,0,0,34.2,0.652,45,1
525 | 3,125,58,0,0,31.6,0.151,24,0
526 | 3,87,60,18,0,21.8,0.444,21,0
527 | 1,97,64,19,82,18.2,0.299,21,0
528 | 3,116,74,15,105,26.3,0.107,24,0
529 | 0,117,66,31,188,30.8,0.493,22,0
530 | 0,111,65,0,0,24.6,0.660,31,0
531 | 2,122,60,18,106,29.8,0.717,22,0
532 | 0,107,76,0,0,45.3,0.686,24,0
533 | 1,86,66,52,65,41.3,0.917,29,0
534 | 6,91,0,0,0,29.8,0.501,31,0
535 | 1,77,56,30,56,33.3,1.251,24,0
536 | 4,132,0,0,0,32.9,0.302,23,1
537 | 0,105,90,0,0,29.6,0.197,46,0
538 | 0,57,60,0,0,21.7,0.735,67,0
539 | 0,127,80,37,210,36.3,0.804,23,0
540 | 3,129,92,49,155,36.4,0.968,32,1
541 | 8,100,74,40,215,39.4,0.661,43,1
542 | 3,128,72,25,190,32.4,0.549,27,1
543 | 10,90,85,32,0,34.9,0.825,56,1
544 | 4,84,90,23,56,39.5,0.159,25,0
545 | 1,88,78,29,76,32.0,0.365,29,0
546 | 8,186,90,35,225,34.5,0.423,37,1
547 | 5,187,76,27,207,43.6,1.034,53,1
548 | 4,131,68,21,166,33.1,0.160,28,0
549 | 1,164,82,43,67,32.8,0.341,50,0
550 | 4,189,110,31,0,28.5,0.680,37,0
551 | 1,116,70,28,0,27.4,0.204,21,0
552 | 3,84,68,30,106,31.9,0.591,25,0
553 | 6,114,88,0,0,27.8,0.247,66,0
554 | 1,88,62,24,44,29.9,0.422,23,0
555 | 1,84,64,23,115,36.9,0.471,28,0
556 | 7,124,70,33,215,25.5,0.161,37,0
557 | 1,97,70,40,0,38.1,0.218,30,0
558 | 8,110,76,0,0,27.8,0.237,58,0
559 | 11,103,68,40,0,46.2,0.126,42,0
560 | 11,85,74,0,0,30.1,0.300,35,0
561 | 6,125,76,0,0,33.8,0.121,54,1
562 | 0,198,66,32,274,41.3,0.502,28,1
563 | 1,87,68,34,77,37.6,0.401,24,0
564 | 6,99,60,19,54,26.9,0.497,32,0
565 | 0,91,80,0,0,32.4,0.601,27,0
566 | 2,95,54,14,88,26.1,0.748,22,0
567 | 1,99,72,30,18,38.6,0.412,21,0
568 | 6,92,62,32,126,32.0,0.085,46,0
569 | 4,154,72,29,126,31.3,0.338,37,0
570 | 0,121,66,30,165,34.3,0.203,33,1
571 | 3,78,70,0,0,32.5,0.270,39,0
572 | 2,130,96,0,0,22.6,0.268,21,0
573 | 3,111,58,31,44,29.5,0.430,22,0
574 | 2,98,60,17,120,34.7,0.198,22,0
575 | 1,143,86,30,330,30.1,0.892,23,0
576 | 1,119,44,47,63,35.5,0.280,25,0
577 | 6,108,44,20,130,24.0,0.813,35,0
578 | 2,118,80,0,0,42.9,0.693,21,1
579 | 10,133,68,0,0,27.0,0.245,36,0
580 | 2,197,70,99,0,34.7,0.575,62,1
581 | 0,151,90,46,0,42.1,0.371,21,1
582 | 6,109,60,27,0,25.0,0.206,27,0
583 | 12,121,78,17,0,26.5,0.259,62,0
584 | 8,100,76,0,0,38.7,0.190,42,0
585 | 8,124,76,24,600,28.7,0.687,52,1
586 | 1,93,56,11,0,22.5,0.417,22,0
587 | 8,143,66,0,0,34.9,0.129,41,1
588 | 6,103,66,0,0,24.3,0.249,29,0
589 | 3,176,86,27,156,33.3,1.154,52,1
590 | 0,73,0,0,0,21.1,0.342,25,0
591 | 11,111,84,40,0,46.8,0.925,45,1
592 | 2,112,78,50,140,39.4,0.175,24,0
593 | 3,132,80,0,0,34.4,0.402,44,1
594 | 2,82,52,22,115,28.5,1.699,25,0
595 | 6,123,72,45,230,33.6,0.733,34,0
596 | 0,188,82,14,185,32.0,0.682,22,1
597 | 0,67,76,0,0,45.3,0.194,46,0
598 | 1,89,24,19,25,27.8,0.559,21,0
599 | 1,173,74,0,0,36.8,0.088,38,1
600 | 1,109,38,18,120,23.1,0.407,26,0
601 | 1,108,88,19,0,27.1,0.400,24,0
602 | 6,96,0,0,0,23.7,0.190,28,0
603 | 1,124,74,36,0,27.8,0.100,30,0
604 | 7,150,78,29,126,35.2,0.692,54,1
605 | 4,183,0,0,0,28.4,0.212,36,1
606 | 1,124,60,32,0,35.8,0.514,21,0
607 | 1,181,78,42,293,40.0,1.258,22,1
608 | 1,92,62,25,41,19.5,0.482,25,0
609 | 0,152,82,39,272,41.5,0.270,27,0
610 | 1,111,62,13,182,24.0,0.138,23,0
611 | 3,106,54,21,158,30.9,0.292,24,0
612 | 3,174,58,22,194,32.9,0.593,36,1
613 | 7,168,88,42,321,38.2,0.787,40,1
614 | 6,105,80,28,0,32.5,0.878,26,0
615 | 11,138,74,26,144,36.1,0.557,50,1
616 | 3,106,72,0,0,25.8,0.207,27,0
617 | 6,117,96,0,0,28.7,0.157,30,0
618 | 2,68,62,13,15,20.1,0.257,23,0
619 | 9,112,82,24,0,28.2,1.282,50,1
620 | 0,119,0,0,0,32.4,0.141,24,1
621 | 2,112,86,42,160,38.4,0.246,28,0
622 | 2,92,76,20,0,24.2,1.698,28,0
623 | 6,183,94,0,0,40.8,1.461,45,0
624 | 0,94,70,27,115,43.5,0.347,21,0
625 | 2,108,64,0,0,30.8,0.158,21,0
626 | 4,90,88,47,54,37.7,0.362,29,0
627 | 0,125,68,0,0,24.7,0.206,21,0
628 | 0,132,78,0,0,32.4,0.393,21,0
629 | 5,128,80,0,0,34.6,0.144,45,0
630 | 4,94,65,22,0,24.7,0.148,21,0
631 | 7,114,64,0,0,27.4,0.732,34,1
632 | 0,102,78,40,90,34.5,0.238,24,0
633 | 2,111,60,0,0,26.2,0.343,23,0
634 | 1,128,82,17,183,27.5,0.115,22,0
635 | 10,92,62,0,0,25.9,0.167,31,0
636 | 13,104,72,0,0,31.2,0.465,38,1
637 | 5,104,74,0,0,28.8,0.153,48,0
638 | 2,94,76,18,66,31.6,0.649,23,0
639 | 7,97,76,32,91,40.9,0.871,32,1
640 | 1,100,74,12,46,19.5,0.149,28,0
641 | 0,102,86,17,105,29.3,0.695,27,0
642 | 4,128,70,0,0,34.3,0.303,24,0
643 | 6,147,80,0,0,29.5,0.178,50,1
644 | 4,90,0,0,0,28.0,0.610,31,0
645 | 3,103,72,30,152,27.6,0.730,27,0
646 | 2,157,74,35,440,39.4,0.134,30,0
647 | 1,167,74,17,144,23.4,0.447,33,1
648 | 0,179,50,36,159,37.8,0.455,22,1
649 | 11,136,84,35,130,28.3,0.260,42,1
650 | 0,107,60,25,0,26.4,0.133,23,0
651 | 1,91,54,25,100,25.2,0.234,23,0
652 | 1,117,60,23,106,33.8,0.466,27,0
653 | 5,123,74,40,77,34.1,0.269,28,0
654 | 2,120,54,0,0,26.8,0.455,27,0
655 | 1,106,70,28,135,34.2,0.142,22,0
656 | 2,155,52,27,540,38.7,0.240,25,1
657 | 2,101,58,35,90,21.8,0.155,22,0
658 | 1,120,80,48,200,38.9,1.162,41,0
659 | 11,127,106,0,0,39.0,0.190,51,0
660 | 3,80,82,31,70,34.2,1.292,27,1
661 | 10,162,84,0,0,27.7,0.182,54,0
662 | 1,199,76,43,0,42.9,1.394,22,1
663 | 8,167,106,46,231,37.6,0.165,43,1
664 | 9,145,80,46,130,37.9,0.637,40,1
665 | 6,115,60,39,0,33.7,0.245,40,1
666 | 1,112,80,45,132,34.8,0.217,24,0
667 | 4,145,82,18,0,32.5,0.235,70,1
668 | 10,111,70,27,0,27.5,0.141,40,1
669 | 6,98,58,33,190,34.0,0.430,43,0
670 | 9,154,78,30,100,30.9,0.164,45,0
671 | 6,165,68,26,168,33.6,0.631,49,0
672 | 1,99,58,10,0,25.4,0.551,21,0
673 | 10,68,106,23,49,35.5,0.285,47,0
674 | 3,123,100,35,240,57.3,0.880,22,0
675 | 8,91,82,0,0,35.6,0.587,68,0
676 | 6,195,70,0,0,30.9,0.328,31,1
677 | 9,156,86,0,0,24.8,0.230,53,1
678 | 0,93,60,0,0,35.3,0.263,25,0
679 | 3,121,52,0,0,36.0,0.127,25,1
680 | 2,101,58,17,265,24.2,0.614,23,0
681 | 2,56,56,28,45,24.2,0.332,22,0
682 | 0,162,76,36,0,49.6,0.364,26,1
683 | 0,95,64,39,105,44.6,0.366,22,0
684 | 4,125,80,0,0,32.3,0.536,27,1
685 | 5,136,82,0,0,0.0,0.640,69,0
686 | 2,129,74,26,205,33.2,0.591,25,0
687 | 3,130,64,0,0,23.1,0.314,22,0
688 | 1,107,50,19,0,28.3,0.181,29,0
689 | 1,140,74,26,180,24.1,0.828,23,0
690 | 1,144,82,46,180,46.1,0.335,46,1
691 | 8,107,80,0,0,24.6,0.856,34,0
692 | 13,158,114,0,0,42.3,0.257,44,1
693 | 2,121,70,32,95,39.1,0.886,23,0
694 | 7,129,68,49,125,38.5,0.439,43,1
695 | 2,90,60,0,0,23.5,0.191,25,0
696 | 7,142,90,24,480,30.4,0.128,43,1
697 | 3,169,74,19,125,29.9,0.268,31,1
698 | 0,99,0,0,0,25.0,0.253,22,0
699 | 4,127,88,11,155,34.5,0.598,28,0
700 | 4,118,70,0,0,44.5,0.904,26,0
701 | 2,122,76,27,200,35.9,0.483,26,0
702 | 6,125,78,31,0,27.6,0.565,49,1
703 | 1,168,88,29,0,35.0,0.905,52,1
704 | 2,129,0,0,0,38.5,0.304,41,0
705 | 4,110,76,20,100,28.4,0.118,27,0
706 | 6,80,80,36,0,39.8,0.177,28,0
707 | 10,115,0,0,0,0.0,0.261,30,1
708 | 2,127,46,21,335,34.4,0.176,22,0
709 | 9,164,78,0,0,32.8,0.148,45,1
710 | 2,93,64,32,160,38.0,0.674,23,1
711 | 3,158,64,13,387,31.2,0.295,24,0
712 | 5,126,78,27,22,29.6,0.439,40,0
713 | 10,129,62,36,0,41.2,0.441,38,1
714 | 0,134,58,20,291,26.4,0.352,21,0
715 | 3,102,74,0,0,29.5,0.121,32,0
716 | 7,187,50,33,392,33.9,0.826,34,1
717 | 3,173,78,39,185,33.8,0.970,31,1
718 | 10,94,72,18,0,23.1,0.595,56,0
719 | 1,108,60,46,178,35.5,0.415,24,0
720 | 5,97,76,27,0,35.6,0.378,52,1
721 | 4,83,86,19,0,29.3,0.317,34,0
722 | 1,114,66,36,200,38.1,0.289,21,0
723 | 1,149,68,29,127,29.3,0.349,42,1
724 | 5,117,86,30,105,39.1,0.251,42,0
725 | 1,111,94,0,0,32.8,0.265,45,0
726 | 4,112,78,40,0,39.4,0.236,38,0
727 | 1,116,78,29,180,36.1,0.496,25,0
728 | 0,141,84,26,0,32.4,0.433,22,0
729 | 2,175,88,0,0,22.9,0.326,22,0
730 | 2,92,52,0,0,30.1,0.141,22,0
731 | 3,130,78,23,79,28.4,0.323,34,1
732 | 8,120,86,0,0,28.4,0.259,22,1
733 | 2,174,88,37,120,44.5,0.646,24,1
734 | 2,106,56,27,165,29.0,0.426,22,0
735 | 2,105,75,0,0,23.3,0.560,53,0
736 | 4,95,60,32,0,35.4,0.284,28,0
737 | 0,126,86,27,120,27.4,0.515,21,0
738 | 8,65,72,23,0,32.0,0.600,42,0
739 | 2,99,60,17,160,36.6,0.453,21,0
740 | 1,102,74,0,0,39.5,0.293,42,1
741 | 11,120,80,37,150,42.3,0.785,48,1
742 | 3,102,44,20,94,30.8,0.400,26,0
743 | 1,109,58,18,116,28.5,0.219,22,0
744 | 9,140,94,0,0,32.7,0.734,45,1
745 | 13,153,88,37,140,40.6,1.174,39,0
746 | 12,100,84,33,105,30.0,0.488,46,0
747 | 1,147,94,41,0,49.3,0.358,27,1
748 | 1,81,74,41,57,46.3,1.096,32,0
749 | 3,187,70,22,200,36.4,0.408,36,1
750 | 6,162,62,0,0,24.3,0.178,50,1
751 | 4,136,70,0,0,31.2,1.182,22,1
752 | 1,121,78,39,74,39.0,0.261,28,0
753 | 3,108,62,24,0,26.0,0.223,25,0
754 | 0,181,88,44,510,43.3,0.222,26,1
755 | 8,154,78,32,0,32.4,0.443,45,1
756 | 1,128,88,39,110,36.5,1.057,37,1
757 | 7,137,90,41,0,32.0,0.391,39,0
758 | 0,123,72,0,0,36.3,0.258,52,1
759 | 1,106,76,0,0,37.5,0.197,26,0
760 | 6,190,92,0,0,35.5,0.278,66,1
761 | 2,88,58,26,16,28.4,0.766,22,0
762 | 9,170,74,31,0,44.0,0.403,43,1
763 | 9,89,62,0,0,22.5,0.142,33,0
764 | 10,101,76,48,180,32.9,0.171,63,0
765 | 2,122,70,27,0,36.8,0.340,27,0
766 | 5,121,72,23,112,26.2,0.245,30,0
767 | 1,126,60,0,0,30.1,0.349,47,1
768 | 1,93,70,31,0,30.4,0.315,23,0
--------------------------------------------------------------------------------
/data_exploration/explore.py:
--------------------------------------------------------------------------------
1 | #import pandas as pd
2 | import numpy as np
3 | import seaborn as sns
4 | import matplotlib.pyplot as plt
5 | import os
6 | plt.style.use('seaborn-v0_8-colorblind' if 'seaborn-v0_8-colorblind' in plt.style.available else 'seaborn-colorblind')  # the bare 'seaborn-colorblind' name was removed in matplotlib 3.6
7 |
8 | # 2018.11.07 Created by Eamon.Zhang
9 |
10 |
11 | def get_dtypes(data,drop_col=[]):
12 | """Return the dtypes for each column of a pandas Dataframe
13 |
14 | Parameters
15 | ----------
16 | data : pandas Dataframe
17 |
18 | drop_col : columns to omit in a list
19 |
20 | Returns
21 | -------
22 | str_var_list, num_var_list, all_var_list
23 |
24 | """
25 |
26 | name_of_col = list(data.columns)
27 | num_var_list = []
28 | str_var_list = []
29 | all_var_list = []
30 |
31 | str_var_list = name_of_col.copy()
32 | for var in name_of_col:
33 | # check if column belongs to numeric type
34 |         if np.issubdtype(data[var].dtype, np.number):
35 |             # covers all integer/float widths; the old tuple of np.int/np.float aliases fails on NumPy >= 1.24
36 | str_var_list.remove(var)
37 | num_var_list.append(var)
38 | # drop the omit column from list
39 | for var in drop_col:
40 | if var in str_var_list:
41 | str_var_list.remove(var)
42 | if var in num_var_list:
43 | num_var_list.remove(var)
44 |
45 | all_var_list.extend(str_var_list)
46 | all_var_list.extend(num_var_list)
47 | return str_var_list, num_var_list, all_var_list
48 |
49 |
50 | def describe(data,output_path=None):
51 | """output the general description of a pandas Dataframe
52 | into a csv file
53 |
54 | """
55 |
56 | result = data.describe(include='all')
57 | if output_path is not None:
58 | output = os.path.join(output_path,'describe.csv')
59 | result.to_csv(output)
60 | print('result saved at:', str(output))
61 | return result
62 |
63 |
64 | def discrete_var_barplot(x,y,data,output_path=None):
65 | """draw the barplot of a discrete variable x against y(target variable).
66 | By default the bar shows the mean value of y.
67 |
68 | Parameters
69 | ----------
70 |
71 |
72 | Returns
73 | -------
74 | figure save as PNG
75 | """
76 |
77 | plt.figure(figsize=(15,10))
78 | sns.barplot(x=x,y=y,data=data)
79 | if output_path is not None:
80 | output = os.path.join(output_path,'Barplot_'+str(x)+'_'+str(y)+'.png')
81 | plt.savefig(output)
82 | print('Image saved at', str(output))
83 |
84 |
85 | def discrete_var_countplot(x,data,output_path=None):
86 | """draw the countplot of a discrete variable x.
87 |
88 | Parameters
89 | ----------
90 |
91 |
92 | Returns
93 | -------
94 | figure save as PNG
95 | """
96 |
97 | plt.figure(figsize=(15,10))
98 | sns.countplot(x=x,data=data)
99 | if output_path is not None:
100 | output = os.path.join(output_path,'Countplot_'+str(x)+'.png')
101 | plt.savefig(output)
102 | print('Image saved at',str(output))
103 |
104 |
105 | def discrete_var_boxplot(x,y,data,output_path=None):
106 | """draw the boxplot of a discrete variable x against y.
107 |
108 | Parameters
109 | ----------
110 |
111 |
112 | Returns
113 | -------
114 | figure save as PNG
115 | """
116 |
117 | plt.figure(figsize=(15,10))
118 | sns.boxplot(x=x,y=y,data=data)
119 | if output_path is not None:
120 | output = os.path.join(output_path,'Boxplot_'+str(x)+'_'+str(y)+'.png')
121 | plt.savefig(output)
122 | print('Image saved at',str(output))
123 |
124 |
125 | def continuous_var_distplot(x,output_path=None,bins=None):
126 | """draw the distplot of a continuous variable x.
127 |
128 | Parameters
129 | ----------
130 |
131 |
132 | Returns
133 | -------
134 | figure save as PNG
135 | """
136 |
137 | plt.figure(figsize=(15,10))
138 |     sns.histplot(x=x, bins=bins)  # sns.distplot is deprecated and was removed in seaborn 0.14
139 | if output_path is not None:
140 | output=os.path.join(output_path,'Distplot_'+str(x.name)+'.png')
141 | plt.savefig(output)
142 | print('Image saved at',str(output))
143 |
144 |
145 | # 2018.11.28 Created by Eamon.Zhang
146 |
147 | def scatter_plot(x,y,data,output_path=None):
148 | """draw the scatter-plot of two variables.
149 |
150 | Parameters
151 | ----------
152 |
153 |
154 | Returns
155 | -------
156 | figure save as PNG
157 | """
158 |
159 | plt.figure(figsize=(15,10))
160 | sns.scatterplot(x=x,y=y,data=data)
161 | if output_path is not None:
162 | output = os.path.join(output_path,'Scatter_plot_'+str(x.name)+'_'+str(y.name)+'.png')
163 | plt.savefig(output)
164 | print('Image saved at',str(output))
165 |
166 |
167 | def correlation_plot(data,output_path=None):
168 | """draw the correlation plot between variables.
169 |
170 | Parameters
171 | ----------
172 |
173 |
174 | Returns
175 | -------
176 | figure save as PNG
177 | """
178 |
179 | corrmat = data.corr()
180 | fig, ax = plt.subplots()
181 | fig.set_size_inches(11,11)
182 | sns.heatmap(corrmat,cmap="YlGnBu",linewidths=.5,annot=True)
183 | if output_path is not None:
184 | output = os.path.join(output_path,'Corr_plot'+'.png')
185 | plt.savefig(output)
186 | print('Image saved at',str(output))
187 |
188 |
189 | def heatmap(data,output_path=None,fmt='d'):
190 | """draw the heatmap between 2 variables.
191 |
192 | Parameters
193 | ----------
194 |
195 |
196 | Returns
197 | -------
198 | figure save as PNG
199 | """
200 |
201 | fig, ax = plt.subplots()
202 | fig.set_size_inches(11,11)
203 | sns.heatmap(data,cmap="YlGnBu",linewidths=.5,annot=True,fmt=fmt)
204 | if output_path is not None:
205 | output = os.path.join(output_path,'Heatmap'+'.png')
206 | plt.savefig(output)
207 | print('Image saved at',str(output))
--------------------------------------------------------------------------------
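A minimal usage sketch for data_exploration/explore.py above (an illustrative example rather than repo code; it assumes the repo root is the working directory and on sys.path, with the output/ folder present):

    import pandas as pd
    from data_exploration import explore

    df = pd.DataFrame({'Pclass': [1, 2, 3, 3], 'Sex': ['m', 'f', 'f', 'm'],
                       'Fare': [71.3, 13.0, 7.9, 8.1], 'Survived': [1, 1, 0, 0]})
    str_vars, num_vars, all_vars = explore.get_dtypes(df, drop_col=['Survived'])
    print(str_vars, num_vars)                      # ['Sex'] ['Pclass', 'Fare']
    explore.describe(df, output_path='./output/')  # writes output/describe.csv
    explore.discrete_var_barplot('Pclass', 'Survived', df, output_path='./output/')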
/feature_cleaning/missing_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from warnings import warn
4 |
5 | # 2018.11.07 Created by Eamon.Zhang
6 |
7 |
8 | def check_missing(data,output_path=None):
9 | """
10 | check the total number & percentage of missing values
11 | per variable of a pandas Dataframe
12 | """
13 |
14 | result = pd.concat([data.isnull().sum(),data.isnull().mean()],axis=1)
15 | result = result.rename(index=str,columns={0:'total missing',1:'proportion'})
16 | if output_path is not None:
17 |         result.to_csv(output_path+'missing.csv')
18 |         print('result saved at:', output_path+'missing.csv')
19 | return result
20 |
21 |
22 | def drop_missing(data,axis=0):
23 | """
24 | Listwise deletion:
25 | excluding all cases (listwise) that have missing values
26 |
27 | Parameters
28 | ----------
29 | axis: drop cases(0)/columns(1),default 0
30 |
31 | Returns
32 | -------
33 | Pandas dataframe with missing cases/columns dropped
34 | """
35 |
36 | data_copy = data.copy(deep=True)
37 | data_copy = data_copy.dropna(axis=axis,inplace=False)
38 | return data_copy
39 |
40 |
41 | def add_var_denote_NA(data,NA_col=[]):
42 | """
43 | creating an additional variable indicating whether the data
44 | was missing for that observation (1) or not (0).
45 | """
46 |
47 | data_copy = data.copy(deep=True)
48 | for i in NA_col:
49 | if data_copy[i].isnull().sum()>0:
50 | data_copy[i+'_is_NA'] = np.where(data_copy[i].isnull(),1,0)
51 | else:
52 | warn("Column %s has no missing cases" % i)
53 |
54 | return data_copy
55 |
56 |
57 | def impute_NA_with_arbitrary(data,impute_value,NA_col=[]):
58 | """
59 | replacing NA with arbitrary values.
60 | """
61 |
62 | data_copy = data.copy(deep=True)
63 | for i in NA_col:
64 | if data_copy[i].isnull().sum()>0:
65 | data_copy[i+'_'+str(impute_value)] = data_copy[i].fillna(impute_value)
66 | else:
67 | warn("Column %s has no missing cases" % i)
68 | return data_copy
69 |
70 |
71 | def impute_NA_with_avg(data,strategy='mean',NA_col=[]):
72 | """
73 | replacing the NA with mean/median/most frequent values of that variable.
74 | Note it should only be performed over training set and then propagated to test set.
75 | """
76 |
77 | data_copy = data.copy(deep=True)
78 | for i in NA_col:
79 | if data_copy[i].isnull().sum()>0:
80 | if strategy=='mean':
81 | data_copy[i+'_impute_mean'] = data_copy[i].fillna(data[i].mean())
82 | elif strategy=='median':
83 | data_copy[i+'_impute_median'] = data_copy[i].fillna(data[i].median())
84 | elif strategy=='mode':
85 | data_copy[i+'_impute_mode'] = data_copy[i].fillna(data[i].mode()[0])
86 | else:
87 | warn("Column %s has no missing" % i)
88 | return data_copy
89 |
90 |
91 | def impute_NA_with_end_of_distribution(data,NA_col=[]):
92 | """
93 | replacing the NA by values that are at the far end of the distribution of that variable
94 | calculated by mean + 3*std
95 | """
96 |
97 | data_copy = data.copy(deep=True)
98 | for i in NA_col:
99 | if data_copy[i].isnull().sum()>0:
100 | data_copy[i+'_impute_end_of_distri'] = data_copy[i].fillna(data[i].mean()+3*data[i].std())
101 | else:
102 | warn("Column %s has no missing" % i)
103 | return data_copy
104 |
105 |
106 | def impute_NA_with_random(data,NA_col=[],random_state=0):
107 | """
108 | replacing the NA with random sampling from the pool of available observations of the variable
109 | """
110 |
111 | data_copy = data.copy(deep=True)
112 | for i in NA_col:
113 | if data_copy[i].isnull().sum()>0:
114 | data_copy[i+'_random'] = data_copy[i]
115 | # extract the random sample to fill the na
116 | random_sample = data_copy[i].dropna().sample(data_copy[i].isnull().sum(), random_state=random_state)
117 | random_sample.index = data_copy[data_copy[i].isnull()].index
118 | data_copy.loc[data_copy[i].isnull(), str(i)+'_random'] = random_sample
119 | else:
120 | warn("Column %s has no missing" % i)
121 | return data_copy
122 |
--------------------------------------------------------------------------------
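The imputers above are designed to be chained; each returns a copy of the frame with a new derived column and leaves the original column untouched. A small sketch on toy data (the column names are illustrative):

    import numpy as np
    import pandas as pd
    from feature_cleaning import missing_data as ms

    df = pd.DataFrame({'Age': [22.0, np.nan, 38.0, np.nan, 35.0],
                       'Fare': [7.25, 71.28, np.nan, 8.05, 53.1]})
    print(ms.check_missing(df))                    # NA counts and proportions
    df = ms.add_var_denote_NA(df, NA_col=['Age'])  # adds an Age_is_NA flag
    df = ms.impute_NA_with_avg(df, strategy='median', NA_col=['Age', 'Fare'])
    df = ms.impute_NA_with_end_of_distribution(df, NA_col=['Age'])
    df = ms.impute_NA_with_random(df, NA_col=['Age'], random_state=0)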
/feature_cleaning/outlier.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | # from warnings import warn
4 |
5 | # 2018.11.07 Created by Eamon.Zhang
6 |
7 | def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
8 | '''
9 | identify outliers based on arbitrary boundaries passed to the function.
10 | '''
11 |
12 |     para = (upper_fence, lower_fence)
13 |     tmp = pd.concat([data[col]>upper_fence,data[col]<lower_fence],axis=1)
14 |     outlier_index = tmp.any(axis=1)
15 |     print('Num of outlier detected:',outlier_index.value_counts()[1])
16 |     print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
17 |     return outlier_index, para
18 |
19 |
20 | # 2018.11.10 Created by Eamon.Zhang
21 | def outlier_detect_IQR(data,col,threshold=3):
22 |     '''
23 |     outlier detection by Interquartile Ranges Rule, also known as Tukey's test.
24 |     calculate the IQR (75th quantile - 25th quantile) and the 25th/75th quantiles.
25 |     Any value beyond:
26 |         upper bound = 75th quantile + (IQR * threshold)
27 |         lower bound = 25th quantile - (IQR * threshold)
28 |     is regarded as an outlier.
29 |     Default threshold is 3.
30 |     '''
31 |
32 |     IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
33 |     Lower_fence = data[col].quantile(0.25) - (IQR * threshold)
34 |     Upper_fence = data[col].quantile(0.75) + (IQR * threshold)
35 |     para = (Upper_fence, Lower_fence)
36 |     tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
37 |     outlier_index = tmp.any(axis=1)
38 |     print('Num of outlier detected:',outlier_index.value_counts()[1])
39 |     print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
40 |     return outlier_index, para
41 |
42 |
43 | def outlier_detect_mean_std(data,col,threshold=3):
44 |     '''
45 |     outlier detection by Mean and Standard Deviation Method.
46 |     If a value is a certain number (threshold) of standard deviations away
47 |     from the mean, that data point is identified as an outlier.
48 |     Default threshold is 3.
49 |
50 |     This method can fail to detect outliers because the outliers increase the
51 |     standard deviation; the more extreme the outlier, the more it is affected.
52 |     '''
53 |
54 |     Upper_fence = data[col].mean() + threshold * data[col].std()
55 |     Lower_fence = data[col].mean() - threshold * data[col].std()
56 |     para = (Upper_fence, Lower_fence)
57 |     tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
58 |     outlier_index = tmp.any(axis=1)
59 |     print('Num of outlier detected:',outlier_index.value_counts()[1])
60 |     print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
61 |     return outlier_index, para
62 |
63 |
64 | def outlier_detect_MAD(data,col,threshold=3.5):
65 |     """
66 |     outlier detection by Median and Median Absolute Deviation Method (MAD).
67 |     The difference between each value and the median is calculated, and these
68 |     differences are expressed as their absolute values. A new median of the
69 |     absolute differences, multiplied by an empirically derived constant
70 |     (0.6745), yields the median absolute deviation (MAD). If a value is a
71 |     certain number of MAD away from the median, it is classified as an
72 |     outlier. The default threshold is 3.5 MAD.
73 |
74 |     Note that if more than 50% of the data points have the same value, MAD is
75 |     computed to be 0, so any value different from the median is then
76 |     classified as an outlier.
77 |     """
78 |
79 |     median = data[col].median()
80 |     median_absolute_deviation = np.median([np.abs(y - median) for y in data[col]])
81 |     modified_z_scores = pd.Series([0.6745 * (y - median) / median_absolute_deviation for y in data[col]])
82 |     outlier_index = np.abs(modified_z_scores) > threshold
83 |     print('Num of outlier detected:',outlier_index.value_counts()[1])
84 |     print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
85 |     return outlier_index
86 |
87 |
88 | # 2018.11.10 outlier treatment
89 | def impute_outlier_with_arbitrary(data,outlier_index,value,col=[]):
90 | """
91 | impute outliers with arbitrary value
92 | """
93 |
94 | data_copy = data.copy(deep=True)
95 | for i in col:
96 | data_copy.loc[outlier_index,i] = value
97 | return data_copy
98 |
99 |
100 | def windsorization(data,col,para,strategy='both'):
101 | """
102 | top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value,vice versa)
103 | """
104 |
105 | data_copy = data.copy(deep=True)
106 | if strategy == 'both':
107 | data_copy.loc[data_copy[col]>para[0],col] = para[0]
108 |         data_copy.loc[data_copy[col]<para[1],col] = para[1]
109 |     elif strategy == 'top':
110 |         data_copy.loc[data_copy[col]>para[0],col] = para[0]
111 | elif strategy == 'bottom':
112 |         data_copy.loc[data_copy[col]<para[1],col] = para[1]
113 |     return data_copy
114 |
115 |
116 | def drop_outlier(data,outlier_index):
117 |     """
118 |     drop the cases that are outliers
119 |     """
120 |
121 |     data_copy = data[~outlier_index]
122 |     return data_copy
123 |
124 |
125 | def impute_outlier_with_avg(data,col,outlier_index,strategy='mean'):
126 |     """
127 |     impute outlier with mean/median/most frequent values of that variable.
128 |     """
129 |
130 |     data_copy = data.copy(deep=True)
131 |     if strategy=='mean':
132 |         data_copy.loc[outlier_index,col] = data_copy[col].mean()
133 |     elif strategy=='median':
134 |         data_copy.loc[outlier_index,col] = data_copy[col].median()
135 |     elif strategy=='mode':
136 |         data_copy.loc[outlier_index,col] = data_copy[col].mode()[0]
137 |
138 |     return data_copy
139 |
--------------------------------------------------------------------------------
/feature_cleaning/rare_values.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | #import numpy as np
3 | # from warnings import warn
4 |
5 | # created by Eamon.Zhang
6 |
7 | class GroupingRareValues():
8 |     """
9 |     Grouping the observations that show rare labels into a unique category ('rare')
10 |
11 |     Parameters
12 |     ----------
13 |
14 |     """
15 |
16 |     def __init__(self, mapping=None, cols=None, threshold=0.01):
17 |         self.cols = cols
18 |         self.mapping = mapping
19 |         self._dim = None
20 |         self.threshold = threshold
21 |
22 |
23 |     def fit(self, X, y=None, **kwargs):
24 |         """Fit encoder according to X and y.
25 |         Parameters
26 |         ----------
27 |         X : array-like, shape = [n_samples, n_features]
28 |             Training vectors, where n_samples is the number of samples
29 |             and n_features is the number of features.
30 |         y : array-like, shape = [n_samples]
31 |             Target values.
32 |         Returns
33 |         -------
34 |         self : encoder
35 |             Returns self.
36 |         """
37 |
38 |         self._dim = X.shape[1]
39 |
40 |         _, categories = self.grouping(
41 |             X,
42 |             mapping=self.mapping,
43 |             cols=self.cols,
44 |             threshold=self.threshold
45 |         )
46 |         self.mapping = categories
47 |         return self
48 |
49 |
50 |     def transform(self, X):
51 |         """Perform the transformation to new categorical data.
52 |         Will use the mapping (if available) and the column list to encode the
53 |         data.
54 |         Parameters
55 |         ----------
56 |         X : array-like, shape = [n_samples, n_features]
57 |         Returns
58 |         -------
59 |         X : Transformed values with encoding applied.
60 |         """
61 |
62 |         if self._dim is None:
63 |             raise ValueError('Must train encoder before it can be used to transform data.')
64 |
65 |         # make sure that it is the right size
66 |         if X.shape[1] != self._dim:
67 |             raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
68 |
69 |         X, _ = self.grouping(
70 |             X,
71 |             mapping=self.mapping,
72 |             cols=self.cols,
73 |             threshold=self.threshold
74 |         )
75 |
76 |         return X
77 |
78 |
79 |     def grouping(self, X_in, threshold, mapping=None, cols=None):
80 |         """
81 |         Grouping the observations that show rare labels into a unique
82 |         category ('rare')
83 |
84 |         """
85 |
86 |         X = X_in.copy(deep=True)
87 |
88 | #        if cols is None:
89 | #            cols = X.columns.values
90 |
91 |         if mapping is not None:  # transform
92 |             mapping_out = mapping
93 |             for i in mapping:
94 |                 column = i.get('col')  # get the column name
95 |                 X[column] = X[column].map(i['mapping'])
96 |
97 | #                try:
98 | #                    X[column] = X[column].astype(int)
99 | #                except ValueError as e:
100 | #                    X[column] = X[column].astype(float)
101 |         else:  # fit
102 |             mapping_out = []
103 |             for col in cols:
104 | #                if util.is_category(X[col].dtype):
105 | #                    categories = X[col].cat.categories
106 | #                else:
107 |                 temp_df = pd.Series(X[col].value_counts()/len(X))
108 |                 mapping = { k: ('rare' if k not in temp_df[temp_df >= threshold].index else k)
109 | for k in temp_df.index}
110 |
111 | mapping = pd.Series(mapping)
112 | mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
113 |
114 | return X, mapping_out
115 |
116 |
117 |
118 | #==============================================================================
119 | # def rare_imputation(X_train, X_test, variable):
120 | #
121 | # # find the most frequent category
122 | # frequent_cat = X_train.groupby(variable)[variable].count().sort_values().tail(1).index.values[0]
123 | #
124 | # # find rare labels
125 | # temp = X_train.groupby([variable])[variable].count()/np.float(len(X_train))
126 | # rare_cat = [x for x in temp.loc[temp<0.05].index.values]
127 | #
128 | # # create new variables, with Rare labels imputed
129 | #
130 | # # by the most frequent category
131 | # X_train[variable+'_freq_imp'] = np.where(X_train[variable].isin(rare_cat), frequent_cat, X_train[variable])
132 | # X_test[variable+'_freq_imp'] = np.where(X_test[variable].isin(rare_cat), frequent_cat, X_test[variable])
133 | #
134 | # # by adding a new label 'Rare'
135 | # X_train[variable+'_rare_imp'] = np.where(X_train[variable].isin(rare_cat), 'Rare', X_train[variable])
136 | # X_test[variable+'_rare_imp'] = np.where(X_test[variable].isin(rare_cat), 'Rare', X_test[variable])
137 | #==============================================================================
138 |
139 | # 2018.11.26 created by Eamon.Zhang
140 | class ModeImputation():
141 | """
142 | Replacing the rare label by most frequent label
143 |
144 | Parameters
145 | ----------
146 |
147 | """
148 |
149 | def __init__(self, mapping=None, cols=None, threshold=0.01):
150 | self.cols = cols
151 | self.mapping = mapping
152 | self._dim = None
153 | self.threshold = threshold
154 |
155 |
156 | def fit(self, X, y=None, **kwargs):
157 | """Fit encoder according to X and y.
158 | Parameters
159 | ----------
160 | X : array-like, shape = [n_samples, n_features]
161 | Training vectors, where n_samples is the number of samples
162 | and n_features is the number of features.
163 | y : array-like, shape = [n_samples]
164 | Target values.
165 | Returns
166 | -------
167 | self : encoder
168 | Returns self.
169 | """
170 |
171 | self._dim = X.shape[1]
172 |
173 | _, categories = self.impute_with_mode(
174 | X,
175 | mapping=self.mapping,
176 | cols=self.cols,
177 | threshold=self.threshold
178 | )
179 | self.mapping = categories
180 | return self
181 |
182 |
183 | def transform(self, X):
184 | """Perform the transformation to new categorical data.
185 | Will use the mapping (if available) and the column list to encode the
186 | data.
187 | Parameters
188 | ----------
189 | X : array-like, shape = [n_samples, n_features]
190 | Returns
191 | -------
192 | X : Transformed values with encoding applied.
193 | """
194 |
195 | if self._dim is None:
196 | raise ValueError('Must train encoder before it can be used to transform data.')
197 |
198 | # make sure that it is the right size
199 | if X.shape[1] != self._dim:
200 | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
201 |
202 | X, _ = self.impute_with_mode(
203 | X,
204 | mapping=self.mapping,
205 | cols=self.cols,
206 | threshold=self.threshold
207 | )
208 |
209 | return X
210 |
211 |
212 | def impute_with_mode(self, X_in, threshold, mapping=None, cols=None):
213 | """
214 | Grouping the observations that show rare labels into a unique category ('rare')
215 |
216 | """
217 |
218 | X = X_in.copy(deep=True)
219 |
220 | # if cols is None:
221 | # cols = X.columns.values
222 |
223 | if mapping is not None: # transform
224 | mapping_out = mapping
225 | for i in mapping:
226 | column = i.get('col') # get the column name
227 | X[column] = X[column].map(i['mapping'])
228 |
229 | # try:
230 | # X[column] = X[column].astype(int)
231 | # except ValueError as e:
232 | # X[column] = X[column].astype(float)
233 | else: # fit
234 | mapping_out = []
235 | for col in cols:
236 | # if util.is_category(X[col].dtype):
237 | # categories = X[col].cat.categories
238 | # else:
239 | temp_df = pd.Series(X[col].value_counts()/len(X))
240 |                 mode = X[col].mode()[0]
241 |                 mapping = { k: (mode if k not in temp_df[temp_df >= threshold].index else k)
242 | for k in temp_df.index}
243 |
244 | mapping = pd.Series(mapping)
245 | mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
246 |
247 | return X, mapping_out
248 |
--------------------------------------------------------------------------------
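GroupingRareValues and ModeImputation share the same fit/transform pattern; a sketch on a toy column where one label falls below the frequency threshold:

    import pandas as pd
    from feature_cleaning import rare_values as ra

    df = pd.DataFrame({'Pclass': [1]*50 + [2]*45 + [3]*5})
    grouper = ra.GroupingRareValues(cols=['Pclass'], threshold=0.1).fit(df)
    print(grouper.mapping)               # label 3 (5% of rows) maps to 'rare'
    df_grouped = grouper.transform(df)

    imputer = ra.ModeImputation(cols=['Pclass'], threshold=0.1).fit(df)
    df_imputed = imputer.transform(df)   # label 3 is replaced by the mode, 1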
/feature_engineering/discretization.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.tree import DecisionTreeClassifier
3 | from sklearn.model_selection import cross_val_score
4 | import numpy as np
5 |
6 | # from warnings import warn
7 |
8 | # 2018.11.17 Created by Eamon.Zhang
9 | # ChiMerge method modified from https://github.com/tatsumiw/ChiMerge/blob/master/ChiMerge.py
10 | # TODO: add more constraints to the discretized result.
11 | class ChiMerge():
12 | """
13 | supervised discretization using the ChiMerge method.
14 |
15 |
16 | Parameters
17 | ----------
18 | confidenceVal: number
19 | default=3.841, correspond to p=0.05 dof=1
20 | num_of_bins: int
21 |         number of bins after discretisation
22 | col: str
23 | the column to be performed
24 |
25 | """
26 |
27 | def __init__(self, col=None, bins=None, confidenceVal=3.841, num_of_bins=10):
28 | self.col = col
29 | self._dim = None
30 | self.confidenceVal = confidenceVal
31 | self.bins = bins
32 | self.num_of_bins = num_of_bins
33 |
34 |
35 | def fit(self, X, y, **kwargs):
36 | """Fit encoder according to X and y.
37 | Parameters
38 | ----------
39 | X : array-like, shape = [n_samples, n_features]
40 | Training vectors, where n_samples is the number of samples
41 | and n_features is the number of features.
42 | y : array-like, shape = [n_samples]
43 | Target values.
44 | Returns
45 | -------
46 | self : encoder
47 | Returns self.
48 | """
49 |
50 | self._dim = X.shape[1]
51 |
52 | _, bins = self.chimerge(
53 | X_in=X,
54 | y=y,
55 | confidenceVal=self.confidenceVal,
56 | col=self.col,
57 | num_of_bins=self.num_of_bins
58 | )
59 | self.bins = bins
60 | return self
61 |
62 |
63 | def transform(self, X):
64 | """Perform the transformation to new data.
65 | Will use the tree model and the column list to discretize the
66 | column.
67 | Parameters
68 | ----------
69 | X : array-like, shape = [n_samples, n_features]
70 | Returns
71 | -------
72 | X : new dataframe with discretized new column.
73 | """
74 |
75 | if self._dim is None:
76 | raise ValueError('Must train encoder before it can be used to transform data.')
77 |
78 | # make sure that it is the right size
79 | if X.shape[1] != self._dim:
80 | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
81 |
82 | X, _ = self.chimerge(
83 | X_in=X,
84 | col=self.col,
85 | bins=self.bins
86 | )
87 |
88 | return X
89 |
90 | def chimerge(self, X_in, y=None, confidenceVal=None, num_of_bins=None, col=None, bins=None):
91 | """
92 | discretize a variable using ChiMerge
93 |
94 | """
95 |
96 | X = X_in.copy(deep=True)
97 |
98 | if bins is not None: # transform
99 | try:
100 | X[col+'_chimerge'] = pd.cut(X[col],bins=bins,include_lowest=True)
101 | except Exception as e:
102 | print(e)
103 |
104 | else: # fit
105 | try:
106 | # create an array which save the num of 0/1 samples of the column to be chimerge
107 | total_num = X.groupby([col])[y].count()
108 | total_num = pd.DataFrame({'total_num': total_num})
109 | positive_class = X.groupby([col])[y].sum()
110 | positive_class = pd.DataFrame({'positive_class': positive_class})
111 | regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True,how='inner')
112 | regroup.reset_index(inplace=True)
113 | regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
114 | regroup = regroup.drop('total_num', axis=1)
115 | np_regroup = np.array(regroup)
116 | # merge interval that have 0 pos/neg samples
117 | i = 0
118 | while (i <= np_regroup.shape[0] - 2):
119 | if ((np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0) or ( np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0)):
120 | np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i + 1, 1] # pos
121 | np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i + 1, 2] # neg
122 | np_regroup[i, 0] = np_regroup[i + 1, 0]
123 | np_regroup = np.delete(np_regroup, i + 1, 0)
124 | i = i - 1
125 | i = i + 1
126 | # calculate chi for neighboring intervals
127 | # ∑[(yA-yB)²/yB]
128 | chi_table = np.array([])
129 | for i in np.arange(np_regroup.shape[0] - 1):
130 | chi = (np_regroup[i, 1] * np_regroup[i + 1, 2] - np_regroup[i, 2] * np_regroup[i + 1, 1]) ** 2 \
131 | * (np_regroup[i, 1] + np_regroup[i, 2] + np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) / \
132 | ((np_regroup[i, 1] + np_regroup[i, 2]) * (np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) * (
133 | np_regroup[i, 1] + np_regroup[i + 1, 1]) * (np_regroup[i, 2] + np_regroup[i + 1, 2]))
134 | chi_table = np.append(chi_table, chi)
135 | # merge intervals that have closing chi
136 | while (1):
137 | if (len(chi_table) <= (num_of_bins - 1) and min(chi_table) >= confidenceVal):
138 | break
139 | chi_min_index = np.argwhere(chi_table == min(chi_table))[0]
140 | np_regroup[chi_min_index, 1] = np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]
141 | np_regroup[chi_min_index, 2] = np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]
142 | np_regroup[chi_min_index, 0] = np_regroup[chi_min_index + 1, 0]
143 | np_regroup = np.delete(np_regroup, chi_min_index + 1, 0)
144 |
145 | if (chi_min_index == np_regroup.shape[0] - 1):
146 | chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
147 | * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
148 | ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
149 | chi_table = np.delete(chi_table, chi_min_index, axis=0)
150 |
151 | else:
152 | chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
153 | * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
154 | ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
155 | chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index + 1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index + 1, 1]) ** 2 \
156 | * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) / \
157 | ((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]))
158 | chi_table = np.delete(chi_table, chi_min_index + 1, axis=0)
159 | result_data = pd.DataFrame()
160 | result_data['variable'] = [col] * np_regroup.shape[0]
161 | bins = []
162 | tmp = []
163 | for i in np.arange(np_regroup.shape[0]):
164 | if i == 0:
165 | y = '-inf' + ',' + str(np_regroup[i, 0])
166 | #x = np_regroup[i, 0]
167 | #list_temp.append(x)
168 | elif i == np_regroup.shape[0] - 1:
169 | y = str(np_regroup[i - 1, 0]) + '+'
170 | #x = 100000000.
171 | #list_temp.append(x)
172 | else:
173 | y = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
174 | #x = np_regroup[i, 0]
175 | #list_temp.append(x)
176 | bins.append(np_regroup[i - 1, 0])
177 | tmp.append(y)
178 |
179 | #list_temp.append(df[variable].max()+0.1)
180 | bins.append(X[col].min()-0.1)
181 |
182 | result_data['interval'] = tmp
183 | result_data['flag_0'] = np_regroup[:, 2]
184 | result_data['flag_1'] = np_regroup[:, 1]
185 | bins.sort(reverse=False)
186 | print('Interval for variable %s' % col)
187 | print(result_data)
188 |
189 | except Exception as e:
190 | print(e)
191 |
192 | return X, bins
193 |
194 |
195 |
196 |
197 | # 2018.11.15 Created by Eamon.Zhang
198 | class DiscretizeByDecisionTree():
199 | """
200 | Discretisation with Decision Trees consists of using a decision tree
201 | to identify the optimal splitting points that would determine the bins
202 | or contiguous intervals:
203 |
204 | 1.train a decision tree of limited depth (2, 3 or 4) using the variable
205 | we want to discretise to predict the target.
206 | 2.the original variable values are then replaced by the
207 | probability returned by the tree.
208 |
209 | Parameters
210 | ----------
211 | col: str
212 | column to discretise
213 | max_depth: int or list of int
214 | max depth of the tree. Can be an int or a list of int we want the tree model to search
215 | for the optimal depth.
216 |
217 | """
218 |
219 | def __init__(self, col=None, max_depth=None, tree_model=None):
220 | self.col = col
221 | self._dim = None
222 | self.max_depth = max_depth
223 | self.tree_model = tree_model
224 |
225 |
226 | def fit(self, X, y, **kwargs):
227 | """Fit encoder according to X and y.
228 | Parameters
229 | ----------
230 | X : array-like, shape = [n_samples, n_features]
231 | Training vectors, where n_samples is the number of samples
232 | and n_features is the number of features.
233 | y : array-like, shape = [n_samples]
234 | Target values.
235 | Returns
236 | -------
237 | self : encoder
238 | Returns self.
239 | """
240 |
241 | self._dim = X.shape[1]
242 |
243 | _, tree = self.discretize(
244 | X_in=X,
245 | y=y,
246 | max_depth=self.max_depth,
247 | col=self.col,
248 | tree_model=self.tree_model
249 | )
250 | self.tree_model = tree
251 | return self
252 |
253 | def transform(self, X):
254 | """Perform the transformation to new categorical data.
255 | Will use the tree model and the column list to discretize the
256 | column.
257 | Parameters
258 | ----------
259 | X : array-like, shape = [n_samples, n_features]
260 | Returns
261 | -------
262 | X : new dataframe with discretized new column.
263 | """
264 |
265 | if self._dim is None:
266 | raise ValueError('Must train encoder before it can be used to transform data.')
267 |
268 | # make sure that it is the right size
269 | if X.shape[1] != self._dim:
270 | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
271 |
272 | X, _ = self.discretize(
273 | X_in=X,
274 | col=self.col,
275 | tree_model=self.tree_model
276 | )
277 |
278 | return X
279 |
280 |
281 | def discretize(self, X_in, y=None, max_depth=None, tree_model=None, col=None):
282 | """
283 | discretize a variable using DecisionTreeClassifier
284 |
285 | """
286 |
287 | X = X_in.copy(deep=True)
288 |
289 | if tree_model is not None: # transform
290 | X[col+'_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:,1]
291 |
292 | else: # fit
293 | if isinstance(max_depth,int):
294 | tree_model = DecisionTreeClassifier(max_depth=max_depth)
295 | tree_model.fit(X[col].to_frame(), y)
296 | # X[col+'_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:,1]
297 | #print(x.tree_discret.unique())
298 | # bins = pd.concat( [X.groupby([col+'_tree_discret'])[col].min(),
299 | # X.groupby([col+'_tree_discret'])[col].max()], axis=1)
300 | # print('bins:')
301 | # print(bins)
302 |
303 |             elif isinstance(max_depth,list) and len(max_depth)>=1:  # a one-element list is valid too
304 | score_ls = [] # here I will store the roc auc
305 | score_std_ls = [] # here I will store the standard deviation of the roc_auc
306 | for tree_depth in max_depth:
307 | tree_model = DecisionTreeClassifier(max_depth=tree_depth)
308 | scores = cross_val_score(tree_model, X[col].to_frame(), y, cv=3, scoring='roc_auc')
309 | score_ls.append(np.mean(scores))
310 | score_std_ls.append(np.std(scores))
311 | temp = pd.concat([pd.Series(max_depth), pd.Series(score_ls), pd.Series(score_std_ls)], axis=1)
312 | temp.columns = ['depth', 'roc_auc_mean', 'roc_auc_std']
313 | print('result ROC-AUC for each depth')
314 | print(temp)
315 | max_roc = temp.roc_auc_mean.max()
316 |                 optimal_depth = int(temp[temp.roc_auc_mean==max_roc]['depth'].values[0])  # take a scalar; passing an array breaks DecisionTreeClassifier
317 | print('optimal_depth:',optimal_depth)
318 | tree_model = DecisionTreeClassifier(max_depth=optimal_depth)
319 | tree_model.fit(X[col].to_frame(), y)
320 | # bins = pd.concat( [X.groupby([col+'_tree_discret'])[col].min(),
321 | # X.groupby([col+'_tree_discret'])[col].max()], axis=1)
322 | # print('bins:')
323 | # print(bins)
324 | else:
325 | raise ValueError('max_depth of a tree must be an integer or a list')
326 |
327 | return X, tree_model
328 |
329 |
330 |
--------------------------------------------------------------------------------
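A sketch of the tree-based discretiser on synthetic data (ChiMerge follows the same fit/transform pattern, but its fit takes the name of the target column, e.g. ChiMerge(col='Age', num_of_bins=5).fit(df, 'target')):

    import numpy as np
    import pandas as pd
    from feature_engineering import discretization as dc

    rng = np.random.RandomState(0)
    df = pd.DataFrame({'Age': rng.randint(18, 80, 500)})
    df['target'] = (df['Age'] > 45).astype(int)

    # max_depth may also be a list such as [2, 3, 4], in which case the depth
    # is chosen by 3-fold cross-validated ROC-AUC
    enc = dc.DiscretizeByDecisionTree(col='Age', max_depth=2).fit(df, df['target'])
    df_disc = enc.transform(df)
    print(df_disc['Age_tree_discret'].unique())    # one probability per leaf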
/feature_engineering/encoding.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | # 2018.11.28 Created by Eamon.Zhang
4 |
5 | class MeanEncoding():
6 | """
7 | replacing the label by the mean of the target for that label.
8 |
9 | Parameters
10 | ----------
11 |
12 | """
13 |
14 | def __init__(self, mapping=None, cols=None):
15 | self.cols = cols
16 | self.mapping = mapping
17 | self._dim = None
18 | # self.threshold = threshold
19 |
20 |
21 | def fit(self, X, y=None, **kwargs):
22 | """Fit encoder according to X and y.
23 | Parameters
24 | ----------
25 | X : array-like, shape = [n_samples, n_features]
26 | Training vectors, where n_samples is the number of samples
27 | and n_features is the number of features.
28 | y : array-like, shape = [n_samples]
29 | Target values.
30 | Returns
31 | -------
32 | self : encoder
33 | Returns self.
34 | """
35 |
36 | self._dim = X.shape[1]
37 |
38 | _, categories = self.mean_encoding(
39 | X,
40 | y,
41 | mapping=self.mapping,
42 | cols=self.cols
43 | # threshold=self.threshold
44 | )
45 | self.mapping = categories
46 | return self
47 |
48 |
49 | def transform(self, X):
50 | """Perform the transformation to new categorical data.
51 | Will use the mapping (if available) and the column list to encode the
52 | data.
53 | Parameters
54 | ----------
55 | X : array-like, shape = [n_samples, n_features]
56 | Returns
57 | -------
58 | X : Transformed values with encoding applied.
59 | """
60 |
61 | if self._dim is None:
62 | raise ValueError('Must train encoder before it can be used to transform data.')
63 |
64 | # make sure that it is the right size
65 | if X.shape[1] != self._dim:
66 | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
67 |
68 | X, _ = self.mean_encoding(
69 | X,
70 | mapping=self.mapping,
71 | cols=self.cols
72 | # threshold=self.threshold
73 | )
74 |
75 | return X
76 |
77 |
78 | def mean_encoding(self, X_in, y=None, mapping=None, cols=None):
79 | """
80 |         replacing each label of the variable by the mean of the target for that label.
81 |
82 | """
83 |
84 | X = X_in.copy(deep=True)
85 |
86 | # if cols is None:
87 | # cols = X.columns.values
88 |
89 | if mapping is not None: # transform
90 | mapping_out = mapping
91 | for i in mapping:
92 | column = i.get('col') # get the column name
93 | X[column] = X[column].map(i['mapping'])
94 |
95 | # try:
96 | # X[column] = X[column].astype(int)
97 | # except ValueError as e:
98 | # X[column] = X[column].astype(float)
99 | else: # fit
100 | mapping_out = []
101 | for col in cols:
102 | # if util.is_category(X[col].dtype):
103 | # categories = X[col].cat.categories
104 | # else:
105 | mapping = X[y.name].groupby(X[col]).mean().to_dict()
106 | mapping = pd.Series(mapping)
107 | mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
108 |
109 | return X, mapping_out
--------------------------------------------------------------------------------
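A minimal sketch of MeanEncoding; fit expects the target as a named Series that is also a column of X, and transform then replaces each label in place:

    import pandas as pd
    from feature_engineering import encoding as ec

    df = pd.DataFrame({'Sex': ['m', 'f', 'f', 'm', 'f'],
                       'Survived': [0, 1, 1, 1, 0]})
    enc = ec.MeanEncoding(cols=['Sex']).fit(df, df['Survived'])
    print(enc.mapping)           # 'f' -> 0.667, 'm' -> 0.5
    df_enc = enc.transform(df)   # 'Sex' now holds the learned target means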
/feature_engineering/transformation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import scipy.stats as stats
5 | import pylab
6 | # from warnings import warn
7 |
8 | # 2018.11.26 Created by Eamon.Zhang
9 | def diagnostic_plots(df, variable):
10 | # function to plot a histogram and a Q-Q plot
11 | # side by side, for a certain variable
12 |
13 | plt.figure(figsize=(15,6))
14 | plt.subplot(1, 2, 1)
15 | df[variable].hist()
16 |
17 | plt.subplot(1, 2, 2)
18 | stats.probplot(df[variable], dist="norm", plot=pylab)
19 |
20 | plt.show()
21 |
22 |
23 | def log_transform(data,cols=[]):
24 | """
25 | Logarithmic transformation
26 | """
27 |
28 | data_copy = data.copy(deep=True)
29 | for i in cols:
30 | data_copy[i+'_log'] = np.log(data_copy[i]+1)
31 | print('Variable ' + i +' Q-Q plot')
32 | diagnostic_plots(data_copy,str(i+'_log'))
33 | return data_copy
34 |
35 |
36 | def reciprocal_transform(data,cols=[]):
37 | """
38 | Reciprocal transformation
39 | """
40 |
41 | data_copy = data.copy(deep=True)
42 | for i in cols:
43 | data_copy[i+'_reciprocal'] = 1/(data_copy[i])
44 | print('Variable ' + i +' Q-Q plot')
45 | diagnostic_plots(data_copy,str(i+'_reciprocal'))
46 | return data_copy
47 |
48 |
49 | def square_root_transform(data,cols=[]):
50 | """
51 | square root transformation
52 | """
53 |
54 | data_copy = data.copy(deep=True)
55 | for i in cols:
56 | data_copy[i+'_square_root'] = (data_copy[i])**(0.5)
57 | print('Variable ' + i +' Q-Q plot')
58 | diagnostic_plots(data_copy,str(i+'_square_root'))
59 | return data_copy
60 |
61 |
62 | def exp_transform(data,coef,cols=[]):
63 | """
64 |     power transformation: raises the variable to the given coefficient (x**coef)
65 | """
66 |
67 | data_copy = data.copy(deep=True)
68 | for i in cols:
69 | data_copy[i+'_exp'] = (data_copy[i])**coef
70 | print('Variable ' + i +' Q-Q plot')
71 | diagnostic_plots(data_copy,str(i+'_exp'))
72 | return data_copy
73 |
74 |
--------------------------------------------------------------------------------
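Each helper above adds a transformed copy of the column and shows its histogram and Q-Q plot, so the effect on skewness can be judged visually. A sketch on a right-skewed variable:

    import numpy as np
    import pandas as pd
    from feature_engineering import transformation as tr

    rng = np.random.RandomState(42)
    df = pd.DataFrame({'Fare': rng.lognormal(mean=2.0, sigma=1.0, size=500)})
    df = tr.log_transform(df, cols=['Fare'])            # adds Fare_log
    df = tr.square_root_transform(df, cols=['Fare'])    # adds Fare_square_root
    df = tr.exp_transform(df, coef=0.3, cols=['Fare'])  # adds Fare_exp = Fare**0.3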
/feature_selection/embedded_method.py:
--------------------------------------------------------------------------------
1 | #import pandas as pd
2 | import numpy as np
3 |
4 | import matplotlib.pyplot as plt
5 | #import seaborn as sns
6 | #from sklearn.model_selection import train_test_split
7 |
8 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier #RandomForestRegressor
9 | #from sklearn.feature_selection import SelectFromModel
10 |
11 | # 2018.11.27 Created by Eamon.Zhang
12 |
13 | def rf_importance(X_train,y_train,max_depth=10,class_weight=None,top_n=15,n_estimators=50,random_state=0):
14 |
15 | model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
16 | random_state=random_state,class_weight=class_weight,
17 | n_jobs=-1)
18 | model.fit(X_train, y_train)
19 | importances = model.feature_importances_
20 | indices = np.argsort(importances)[::-1]
21 | feat_labels = X_train.columns
22 | std = np.std([tree.feature_importances_ for tree in model.estimators_],
23 | axis=0) # inter-trees variability.
24 | print("Feature ranking:")
25 | # l1,l2,l3,l4 = [],[],[],[]
26 | for f in range(X_train.shape[1]):
27 | print("%d. feature no:%d feature name:%s (%f)" % (f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]]))
28 | # l1.append(f+1)
29 | # l2.append(indices[f])
30 | # l3.append(feat_labels[indices[f]])
31 | # l4.append(importances[indices[f]])
32 | #feature_rank = pd.Dataframe(zip(l1,l2,l3,l4),columns=['id','indice','feature','importances'])
33 |
34 | # plotting
35 | indices = indices[0:top_n]
36 | plt.figure()
37 | plt.title("Feature importances top %d" % top_n)
38 | plt.bar(range(top_n), importances[indices],
39 | color="r", yerr=std[indices], align="center")
40 | plt.xticks(range(top_n), indices)
41 | plt.xlim([-1,top_n])
42 | plt.show()
43 |
44 | return model
45 |
46 |
47 | def gbt_importance(X_train,y_train,max_depth=10,top_n=15,n_estimators=50,random_state=0):
48 |
49 | model = GradientBoostingClassifier(n_estimators=n_estimators,max_depth=max_depth,
50 | random_state=random_state)
51 | model.fit(X_train, y_train)
52 | importances = model.feature_importances_
53 | indices = np.argsort(importances)[::-1]
54 | feat_labels = X_train.columns
55 | std = np.std([tree[0].feature_importances_ for tree in model.estimators_],
56 | axis=0) # inter-trees variability.
57 | print("Feature ranking:")
58 | # l1,l2,l3,l4 = [],[],[],[]
59 | for f in range(X_train.shape[1]):
60 | print("%d. feature no:%d feature name:%s (%f)" % (f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]]))
61 | # l1.append(f+1)
62 | # l2.append(indices[f])
63 | # l3.append(feat_labels[indices[f]])
64 | # l4.append(importances[indices[f]])
65 | # feature_rank = pd.Dataframe(zip(l1,l2,l3,l4),columns=['id','indice','feature','importances'])
66 | # plotting
67 | indices = indices[0:top_n]
68 | plt.figure()
69 | plt.title("Feature importances top %d" % top_n)
70 | plt.bar(range(top_n), importances[indices],
71 | color="r", yerr=std[indices], align="center")
72 | plt.xticks(range(top_n), indices)
73 | plt.xlim([-1,top_n])
74 | plt.show()
75 |
76 | return model
--------------------------------------------------------------------------------
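Both helpers fit the model, print the full importance ranking, and plot the top features with inter-tree variability as error bars. A sketch on synthetic data:

    import pandas as pd
    from sklearn.datasets import make_classification
    from feature_selection import embedded_method as em

    X, y = make_classification(n_samples=500, n_features=10, n_informative=4,
                               random_state=0)
    X_train = pd.DataFrame(X, columns=['f%d' % i for i in range(10)])
    rf_model = em.rf_importance(X_train, y, max_depth=5, top_n=10)
    gbt_model = em.gbt_importance(X_train, y, max_depth=3, top_n=10)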
/feature_selection/feature_shuffle.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | #import numpy as np
3 |
4 |
5 | from sklearn.ensemble import RandomForestClassifier #, RandomForestRegressor
6 | from sklearn.metrics import roc_auc_score #, mean_squared_error
7 |
8 | # 2018.11.28 Created by Eamon.Zhang
9 |
10 |
11 | def feature_shuffle_rf(X_train,y_train,max_depth=None,class_weight=None,top_n=15,n_estimators=50,random_state=0):
12 |
13 | model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
14 | random_state=random_state,class_weight=class_weight,
15 | n_jobs=-1)
16 | model.fit(X_train, y_train)
17 | train_auc = roc_auc_score(y_train, (model.predict_proba(X_train))[:, 1])
18 | feature_dict = {}
19 |
20 | # selection logic
21 | for feature in X_train.columns:
22 | X_train_c = X_train.copy().reset_index(drop=True)
23 | y_train_c = y_train.copy().reset_index(drop=True)
24 |
25 | # shuffle individual feature
26 | X_train_c[feature] = X_train_c[feature].sample(frac=1,random_state=random_state).reset_index(
27 | drop=True)
28 | #print(X_train_c.isnull().sum())
29 | # make prediction with shuffled feature and calculate roc-auc
30 | shuff_auc = roc_auc_score(y_train_c,
31 | (model.predict_proba(X_train_c))[:, 1])
32 | #print(shuff_auc)
33 | # save the drop in roc-auc
34 | feature_dict[feature] = (train_auc - shuff_auc)
35 | #print(feature_dict)
36 |
37 | auc_drop = pd.Series(feature_dict).reset_index()
38 | auc_drop.columns = ['feature', 'auc_drop']
39 | auc_drop.sort_values(by=['auc_drop'], ascending=False, inplace=True)
40 | selected_features = auc_drop[auc_drop.auc_drop>0]['feature']
41 |
42 | return auc_drop, selected_features
43 |
44 |
--------------------------------------------------------------------------------
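The selector keeps every feature whose permutation lowers the train ROC-AUC; note that it measures performance on the training set, so the result is optimistic. A sketch:

    import pandas as pd
    from sklearn.datasets import make_classification
    from feature_selection import feature_shuffle as fs

    X, y = make_classification(n_samples=500, n_features=8, n_informative=3,
                               random_state=0)
    X_train = pd.DataFrame(X, columns=['f%d' % i for i in range(8)])
    y_train = pd.Series(y)
    auc_drop, selected = fs.feature_shuffle_rf(X_train, y_train, max_depth=5)
    print(selected.tolist())     # features whose shuffling hurts the model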
/feature_selection/filter_method.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | #from sklearn.feature_selection import VarianceThreshold
4 | from sklearn.feature_selection import mutual_info_classif,chi2
5 | from sklearn.feature_selection import SelectKBest, SelectPercentile
6 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
7 | from sklearn.metrics import roc_auc_score, mean_squared_error
8 |
9 | # 2018.11.17 Created by Eamon.Zhang
10 |
11 | def constant_feature_detect(data,threshold=0.98):
12 | """ detect features that show the same value for the
13 | majority/all of the observations (constant/quasi-constant features)
14 |
15 | Parameters
16 | ----------
17 | data : pd.Dataframe
18 | threshold : threshold to identify the variable as constant
19 |
20 | Returns
21 | -------
22 | list of variables names
23 | """
24 |
25 | data_copy = data.copy(deep=True)
26 | quasi_constant_feature = []
27 | for feature in data_copy.columns:
28 |         predominant = (data_copy[feature].value_counts() / float(
29 | len(data_copy))).sort_values(ascending=False).values[0]
30 | if predominant >= threshold:
31 | quasi_constant_feature.append(feature)
32 | print(len(quasi_constant_feature),' variables are found to be almost constant')
33 | return quasi_constant_feature
34 |
35 |
36 | def corr_feature_detect(data,threshold=0.8):
37 | """ detect highly-correlated features of a Dataframe
38 | Parameters
39 | ----------
40 | data : pd.Dataframe
41 | threshold : threshold to identify the variable correlated
42 |
43 | Returns
44 | -------
45 | pairs of correlated variables
46 | """
47 |
48 | corrmat = data.corr()
49 | corrmat = corrmat.abs().unstack() # absolute value of corr coef
50 | corrmat = corrmat.sort_values(ascending=False)
51 | corrmat = corrmat[corrmat >= threshold]
52 |     corrmat = corrmat[corrmat < 1] # remove the diagonal
53 | corrmat = pd.DataFrame(corrmat).reset_index()
54 | corrmat.columns = ['feature1', 'feature2', 'corr']
55 |
56 | grouped_feature_ls = []
57 | correlated_groups = []
58 |
59 | for feature in corrmat.feature1.unique():
60 | if feature not in grouped_feature_ls:
61 |
62 | # find all features correlated to a single feature
63 | correlated_block = corrmat[corrmat.feature1 == feature]
64 | grouped_feature_ls = grouped_feature_ls + list(
65 | correlated_block.feature2.unique()) + [feature]
66 |
67 | # append the block of features to the list
68 | correlated_groups.append(correlated_block)
69 | return correlated_groups
70 |
71 |
72 | def mutual_info(X,y,select_k=10):
73 |
74 | # mi = mutual_info_classif(X,y)
75 | # mi = pd.Series(mi)
76 | # mi.index = X.columns
77 | # mi.sort_values(ascending=False)
78 |
79 | if select_k >= 1:
80 | sel_ = SelectKBest(mutual_info_classif, k=select_k).fit(X,y)
81 | col = X.columns[sel_.get_support()]
82 |
83 | elif 0 < select_k < 1:
84 | sel_ = SelectPercentile(mutual_info_classif, percentile=select_k*100).fit(X,y)
85 | col = X.columns[sel_.get_support()]
86 |
87 | else:
88 | raise ValueError("select_k must be a positive number")
89 |
90 | return col
91 |
92 |
93 | # 2018.11.27 edit Chi-square test
94 | def chi_square_test(X,y,select_k=10):
95 |
96 | """
97 | Compute chi-squared stats between each non-negative feature and class.
98 | This score should be used to evaluate categorical variables in a classification task
99 | """
100 | if select_k >= 1:
101 | sel_ = SelectKBest(chi2, k=select_k).fit(X,y)
102 | col = X.columns[sel_.get_support()]
103 | elif 0 < select_k < 1:
104 | sel_ = SelectPercentile(chi2, percentile=select_k*100).fit(X,y)
105 | col = X.columns[sel_.get_support()]
106 | else:
107 | raise ValueError("select_k must be a positive number")
108 |
109 | return col
110 |
111 |
112 | def univariate_roc_auc(X_train,y_train,X_test,y_test,threshold):
113 |
114 | """
115 | First, it builds one decision tree per feature, to predict the target
116 | Second, it makes predictions using the decision tree and the mentioned feature
117 | Third, it ranks the features according to the machine learning metric (roc-auc or mse)
118 | It selects the highest ranked features
119 |
120 | """
121 | roc_values = []
122 | for feature in X_train.columns:
123 | clf = DecisionTreeClassifier()
124 | clf.fit(X_train[feature].to_frame(), y_train)
125 | y_scored = clf.predict_proba(X_test[feature].to_frame())
126 | roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
127 | roc_values = pd.Series(roc_values)
128 | roc_values.index = X_train.columns
129 | print(roc_values.sort_values(ascending=False))
130 |     print(len(roc_values[roc_values > threshold]),'out of the %s features are kept'% len(X_train.columns))
131 | keep_col = roc_values[roc_values > threshold]
132 | return keep_col
133 |
134 |
135 | def univariate_mse(X_train,y_train,X_test,y_test,threshold):
136 |
137 | """
138 | First, it builds one decision tree per feature, to predict the target
139 | Second, it makes predictions using the decision tree and the mentioned feature
140 | Third, it ranks the features according to the machine learning metric (roc-auc or mse)
141 | It selects the highest ranked features
142 |
143 | """
144 | mse_values = []
145 | for feature in X_train.columns:
146 | clf = DecisionTreeRegressor()
147 | clf.fit(X_train[feature].to_frame(), y_train)
148 | y_scored = clf.predict(X_test[feature].to_frame())
149 | mse_values.append(mean_squared_error(y_test, y_scored))
150 | mse_values = pd.Series(mse_values)
151 | mse_values.index = X_train.columns
152 | print(mse_values.sort_values(ascending=False))
153 |     print(len(mse_values[mse_values > threshold]),'out of the %s features are kept'% len(X_train.columns))
154 | keep_col = mse_values[mse_values > threshold]
155 | return keep_col
156 |
--------------------------------------------------------------------------------
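A sketch chaining several of the filters (chi_square_test is omitted because chi2 requires non-negative features, which make_classification does not guarantee):

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from feature_selection import filter_method as ft

    X, y = make_classification(n_samples=500, n_features=10, n_informative=4,
                               random_state=0)
    X = pd.DataFrame(X, columns=['f%d' % i for i in range(10)])
    y = pd.Series(y)

    quasi_constant = ft.constant_feature_detect(X, threshold=0.98)
    corr_groups = ft.corr_feature_detect(X, threshold=0.8)
    top5 = ft.mutual_info(X, y, select_k=5)

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
    kept = ft.univariate_roc_auc(X_tr, y_tr, X_te, y_te, threshold=0.6)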
/feature_selection/hybrid.py:
--------------------------------------------------------------------------------
1 | #import pandas as pd
2 | #import numpy as np
3 |
4 | from sklearn.ensemble import RandomForestClassifier #, RandomForestRegressor
5 | from sklearn.metrics import roc_auc_score #, mean_squared_error
6 |
7 | # 2018.12.02 Created by Eamon.Zhang
8 |
9 |
10 | def recursive_feature_elimination_rf(X_train,y_train,X_test,y_test,
11 | tol=0.001,max_depth=None,
12 | class_weight=None,
13 | top_n=15,n_estimators=50,random_state=0):
14 |
15 |
16 | features_to_remove = []
17 | count = 1
18 | # initial model using all the features
19 | model_all_features = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
20 | random_state=random_state,class_weight=class_weight,
21 | n_jobs=-1)
22 | model_all_features.fit(X_train, y_train)
23 | y_pred_test = model_all_features.predict_proba(X_test)[:, 1]
24 | auc_score_all = roc_auc_score(y_test, y_pred_test)
25 |
26 | for feature in X_train.columns:
27 | print()
28 | print('testing feature: ', feature, ' which is feature ', count,
29 | ' out of ', len(X_train.columns))
30 | count += 1
31 | model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
32 | random_state=random_state,class_weight=class_weight,
33 | n_jobs=-1)
34 |
35 | # fit model with all variables minus the removed features
36 | # and the feature to be evaluated
37 | model.fit(X_train.drop(features_to_remove + [feature], axis=1), y_train)
38 | y_pred_test = model.predict_proba(
39 | X_test.drop(features_to_remove + [feature], axis=1))[:, 1]
40 | auc_score_int = roc_auc_score(y_test, y_pred_test)
41 | print('New Test ROC AUC={}'.format((auc_score_int)))
42 |
43 | # print the original roc-auc with all the features
44 | print('All features Test ROC AUC={}'.format((auc_score_all)))
45 |
46 | # determine the drop in the roc-auc
47 | diff_auc = auc_score_all - auc_score_int
48 |
49 | # compare the drop in roc-auc with the tolerance
50 | if diff_auc >= tol:
51 | print('Drop in ROC AUC={}'.format(diff_auc))
52 | print('keep: ', feature)
53 |
54 | else:
55 | print('Drop in ROC AUC={}'.format(diff_auc))
56 | print('remove: ', feature)
57 |
58 | # if the drop in the roc is small and we remove the
59 | # feature, we need to set the new roc to the one based on
60 | # the remaining features
61 | auc_score_all = auc_score_int
62 |
63 | # and append the feature to remove to the list
64 | features_to_remove.append(feature)
65 | print('DONE!!')
66 | print('total features to remove: ', len(features_to_remove))
67 | features_to_keep = [x for x in X_train.columns if x not in features_to_remove]
68 | print('total features to keep: ', len(features_to_keep))
69 |
70 | return features_to_keep
71 |
72 |
73 | def recursive_feature_addition_rf(X_train,y_train,X_test,y_test,
74 | tol=0.001,max_depth=None,
75 | class_weight=None,
76 | top_n=15,n_estimators=50,random_state=0):
77 |
78 |
79 | features_to_keep = [X_train.columns[0]]
80 | count = 1
81 | # initial model using only one feature
82 | model_one_feature = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
83 | random_state=random_state,class_weight=class_weight,
84 | n_jobs=-1)
85 | model_one_feature.fit(X_train[[X_train.columns[0]]], y_train)
86 | y_pred_test = model_one_feature.predict_proba(X_test[[X_train.columns[0]]])[:, 1]
87 | auc_score_all = roc_auc_score(y_test, y_pred_test)
88 |
89 | for feature in X_train.columns[1:]:
90 | print()
91 | print('testing feature: ', feature, ' which is feature ', count,
92 | ' out of ', len(X_train.columns))
93 | count += 1
94 | model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
95 | random_state=random_state,class_weight=class_weight,
96 | n_jobs=-1)
97 |
98 | # fit model with the selected features
99 | # and the feature to be evaluated
100 | model.fit(X_train[features_to_keep + [feature]], y_train)
101 | y_pred_test = model.predict_proba(
102 | X_test[features_to_keep + [feature]])[:, 1]
103 | auc_score_int = roc_auc_score(y_test, y_pred_test)
104 |         print('New Test ROC AUC={}'.format(auc_score_int))
105 |
106 |         # print the roc-auc of the model with the features kept so far
107 |         print('Previous Test ROC AUC={}'.format(auc_score_all))
108 |
109 |         # determine the increase in the roc-auc
110 |         diff_auc = auc_score_int - auc_score_all
111 | 
112 |         # compare the increase in roc-auc with the tolerance
113 | if diff_auc >= tol:
114 | # if the increase in the roc is bigger than the threshold
115 | # we keep the feature and re-adjust the roc-auc to the new value
116 | # considering the added feature
117 | print('Increase in ROC AUC={}'.format(diff_auc))
118 | print('keep: ', feature)
119 | auc_score_all = auc_score_int
120 | features_to_keep.append(feature)
121 | else:
122 | print('Increase in ROC AUC={}'.format(diff_auc))
123 | print('remove: ', feature)
124 |
125 | print('DONE!!')
126 | print('total features to keep: ', len(features_to_keep))
127 |
128 | return features_to_keep
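129 | 
130 | # Minimal usage sketch (illustrative only; not part of the original module).
131 | # The dataset and split below are assumptions chosen for demonstration.
132 | #
133 | # if __name__ == '__main__':
134 | #     import pandas as pd
135 | #     from sklearn.datasets import load_breast_cancer
136 | #     from sklearn.model_selection import train_test_split
137 | #
138 | #     data = load_breast_cancer()
139 | #     X = pd.DataFrame(data.data, columns=data.feature_names)
140 | #     y = pd.Series(data.target)
141 | #     X_train, X_test, y_train, y_test = train_test_split(
142 | #         X, y, test_size=0.3, random_state=0)
143 | #
144 | #     # backward elimination: a feature is dropped when removing it
145 | #     # costs less than `tol` in test ROC-AUC
146 | #     keep_elim = recursive_feature_elimination_rf(
147 | #         X_train, y_train, X_test, y_test, tol=0.001)
148 | #
149 | #     # forward addition: a feature is kept when it improves the
150 | #     # test ROC-AUC by at least `tol`
151 | #     keep_add = recursive_feature_addition_rf(
152 | #         X_train, y_train, X_test, y_test, tol=0.001)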
--------------------------------------------------------------------------------
/images/001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/001.png
--------------------------------------------------------------------------------
/images/IV.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/IV.png
--------------------------------------------------------------------------------
/images/box-cox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/box-cox.png
--------------------------------------------------------------------------------
/images/embedded.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/embedded.png
--------------------------------------------------------------------------------
/images/featuretools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/featuretools.png
--------------------------------------------------------------------------------
/images/filter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/filter.png
--------------------------------------------------------------------------------
/images/scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/scaling.png
--------------------------------------------------------------------------------
/images/sphx_glr_plot_map_data_to_normal_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/sphx_glr_plot_map_data_to_normal_001.png
--------------------------------------------------------------------------------
/images/workflow2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/workflow2.png
--------------------------------------------------------------------------------
/images/wrapper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/wrapper.png
--------------------------------------------------------------------------------
/output/Barplot_Pclass_Survived.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Barplot_Pclass_Survived.png
--------------------------------------------------------------------------------
/output/Boxplot_Pclass_Fare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Boxplot_Pclass_Fare.png
--------------------------------------------------------------------------------
/output/Corr_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Corr_plot.png
--------------------------------------------------------------------------------
/output/Countplot_Pclass.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Countplot_Pclass.png
--------------------------------------------------------------------------------
/output/Distplot_Fare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Distplot_Fare.png
--------------------------------------------------------------------------------
/output/Heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Heatmap.png
--------------------------------------------------------------------------------
/output/Scatter_plot_Fare_Pclass.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Scatter_plot_Fare_Pclass.png
--------------------------------------------------------------------------------
/output/describe.csv:
--------------------------------------------------------------------------------
1 | ,Survived,Pclass,Sex,Age,SibSp,Fare
2 | count,891.0,891.0,891,714.0,891.0,891.0
3 | unique,,,2,,,
4 | top,,,male,,,
5 | freq,,,577,,,
6 | mean,0.3838383838383838,2.308641975308642,,29.69911764705882,0.5230078563411896,32.2042079685746
7 | std,0.4865924542648585,0.8360712409770513,,14.526497332334044,1.1027434322934275,49.693428597180905
8 | min,0.0,1.0,,0.42,0.0,0.0
9 | 25%,0.0,2.0,,20.125,0.0,7.9104
10 | 50%,0.0,3.0,,28.0,0.0,14.4542
11 | 75%,1.0,3.0,,38.0,1.0,31.0
12 | max,1.0,3.0,,80.0,8.0,512.3292
13 |
--------------------------------------------------------------------------------
/output/missing.csv:
--------------------------------------------------------------------------------
1 | ,total missing,proportion
2 | Survived,0,0.0
3 | Pclass,0,0.0
4 | Sex,0,0.0
5 | Age,177,0.19865319865319866
6 | SibSp,0,0.0
7 | Fare,0,0.0
8 |
--------------------------------------------------------------------------------