├── .github
│   └── workflows
│       └── python-package.yml
├── .gitignore
├── 1_Demo_Data_Explore.ipynb
├── 2.1_Demo_Missing_Data.ipynb
├── 2.2_Demo_Outlier.ipynb
├── 2.3_Demo_Rare_Values.ipynb
├── 3.1_Demo_Feature_Scaling.ipynb
├── 3.2_Demo_Discretisation.ipynb
├── 3.3_Demo_Feature_Encoding.ipynb
├── 3.4_Demo_Feature_Transformation.ipynb
├── 3.5_Demo_Feature_Generation.ipynb
├── 4.1_Demo_Feature_Selection_Filter.ipynb
├── 4.2_Demo_Feature_Selection_Wrapper.ipynb
├── 4.3_Demo_Feature_Selection_Embedded.ipynb
├── 4.4_Demo_Feature_Selection_Feature_Shuffling.ipynb
├── 4.5_Demo_Feature_Selection_Hybrid_method.ipynb
├── A Short Guide for Feature Engineering and Feature Selection.md
├── A Short Guide for Feature Engineering and Feature Selection.pdf
├── README.md
├── data
│   ├── housing.data.txt
│   ├── pima-indians-diabetes.data.csv
│   └── titanic.csv
├── data_exploration
│   └── explore.py
├── feature_cleaning
│   ├── missing_data.py
│   ├── outlier.py
│   └── rare_values.py
├── feature_engineering
│   ├── discretization.py
│   ├── encoding.py
│   └── transformation.py
├── feature_selection
│   ├── embedded_method.py
│   ├── feature_shuffle.py
│   ├── filter_method.py
│   └── hybrid.py
├── images
│   ├── 001.png
│   ├── IV.png
│   ├── box-cox.png
│   ├── embedded.png
│   ├── featuretools.png
│   ├── filter.png
│   ├── scaling.png
│   ├── sphx_glr_plot_map_data_to_normal_001.png
│   ├── workflow2.png
│   └── wrapper.png
└── output
    ├── Barplot_Pclass_Survived.png
    ├── Boxplot_Pclass_Fare.png
    ├── Corr_plot.png
    ├── Countplot_Pclass.png
    ├── Distplot_Fare.png
    ├── Heatmap.png
    ├── Scatter_plot_Fare_Pclass.png
    ├── describe.csv
    └── missing.csv
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ "master" ]
9 | pull_request:
10 | branches: [ "master" ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | fail-fast: false
18 | matrix:
19 | python-version: ["3.9", "3.10", "3.11"]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | python -m pip install flake8 pytest
31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 | - name: Lint with flake8
33 | run: |
34 | # stop the build if there are Python syntax errors or undefined names
35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 | - name: Test with pytest
39 | run: |
40 | pytest
41 |
--------------------------------------------------------------------------------
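The same lint-and-test sequence can be reproduced locally before pushing. A minimal sketch in Python (an illustration, not part of the repo; assumes flake8 and pytest are installed in the active environment):

```python
# Local mirror of the CI steps above: a strict flake8 gate, a report-only
# flake8 pass, then pytest. A sketch; assumes flake8 and pytest are installed.
import subprocess
import sys

# stop on Python syntax errors or undefined names, like the CI gate
gate = subprocess.run(["flake8", ".", "--count", "--select=E9,F63,F7,F82",
                       "--show-source", "--statistics"])
if gate.returncode != 0:
    sys.exit(gate.returncode)

# style pass: report everything but never fail (mirrors --exit-zero)
subprocess.run(["flake8", ".", "--count", "--exit-zero", "--max-complexity=10",
                "--max-line-length=127", "--statistics"])

sys.exit(subprocess.run(["pytest"]).returncode)
```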
/.gitignore:
--------------------------------------------------------------------------------
1 | rule_extraction 20181014.py
2 | __pycache__
3 | .ipynb_checkpoints
4 | .gitignore.bak
5 | history
6 | README_bk.md
7 | A Short Guide for Feature Engineering and Feature Selection.docx
8 | A Short Guide for Feature Engineering and Feature Selection.html
9 |
--------------------------------------------------------------------------------
/2.3_Demo_Rare_Values.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "# plt.style.use('seaborn-colorblind')\n",
17 | "# %matplotlib inline\n",
18 | "from feature_cleaning import rare_values as ra"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Load Dataset"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "Variable Pclass label proportion:\n",
38 | "3 0.551066\n",
39 | "1 0.242424\n",
40 | "2 0.206510\n",
41 | "Name: Pclass, dtype: float64\n",
42 | "Variable SibSp label proportion:\n",
43 | "0 0.682379\n",
44 | "1 0.234568\n",
45 | "2 0.031425\n",
46 | "4 0.020202\n",
47 | "3 0.017957\n",
48 | "8 0.007856\n",
49 | "5 0.005612\n",
50 | "Name: SibSp, dtype: float64\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "use_cols = [\n",
56 | " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
57 | " 'Survived'\n",
58 | "]\n",
59 | "\n",
60 | "# see column Pclass & SibSp's distributions\n",
61 | "# SibSp has values 3/8/5 that occur rarely, under 2%\n",
62 | "# Pclass has 3 values, but no one is under 20%\n",
63 | "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
64 | "for i in ['Pclass','SibSp']:\n",
65 | " print('Variable',i,'label proportion:')\n",
66 | " print(data[i].value_counts()/len(data))"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Grouping into one new category\n",
74 | "Grouping the observations that show rare labels into a unique category ('rare')"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 3,
80 | "metadata": {
81 | "collapsed": true
82 | },
83 | "outputs": [],
84 | "source": [
85 | "# create the encoder and fit with our data\n",
86 | "enc = ra.GroupingRareValues(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 4,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "name": "stdout",
96 | "output_type": "stream",
97 | "text": [
98 | "[{'col': 'Pclass', 'mapping': 3 3\n",
99 | "1 1\n",
100 | "2 2\n",
101 | "dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n",
102 | "1 1\n",
103 | "2 2\n",
104 | "4 4\n",
105 | "3 3\n",
106 | "8 rare\n",
107 | "5 rare\n",
108 | "dtype: object, 'data_type': dtype('int64')}]\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "# let's see the mapping\n",
114 | "# for SibSp, values 5 & 8 are encoded as 'rare' as they appear less than 10%\n",
115 | "# for Pclass, nothing changed\n",
116 | "print(enc.mapping)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 5,
122 | "metadata": {
123 | "collapsed": true
124 | },
125 | "outputs": [],
126 | "source": [
127 | "# perform transformation\n",
128 | "data2 = enc.transform(data)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 6,
134 | "metadata": {},
135 | "outputs": [
136 | {
137 | "name": "stdout",
138 | "output_type": "stream",
139 | "text": [
140 | "0 608\n",
141 | "1 209\n",
142 | "2 28\n",
143 | "4 18\n",
144 | "3 16\n",
145 | "rare 12\n",
146 | "Name: SibSp, dtype: int64\n"
147 | ]
148 | }
149 | ],
150 | "source": [
151 | "# check the result\n",
152 | "print(data2.SibSp.value_counts())"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "## Mode Imputation\n",
160 | "Replacing the rare label by most frequent label"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 7,
166 | "metadata": {
167 | "collapsed": true
168 | },
169 | "outputs": [],
170 | "source": [
171 | "# create the encoder and fit with our data\n",
172 | "enc = ra.ModeImputation(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 8,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "[{'col': 'Pclass', 'mapping': 3 3\n",
185 | "1 1\n",
186 | "2 2\n",
187 | "dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n",
188 | "1 1\n",
189 | "2 2\n",
190 | "4 4\n",
191 | "3 3\n",
192 | "8 0\n",
193 | "5 0\n",
194 | "dtype: int64, 'data_type': dtype('int64')}]\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "# let's see the mapping\n",
200 | "# for SibSp, values 5 & 8 are encoded as 0, as label 0 is the most frequent label\n",
201 | "# for Pclass, nothing changed\n",
202 | "print(enc.mapping)"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 9,
208 | "metadata": {
209 | "collapsed": true
210 | },
211 | "outputs": [],
212 | "source": [
213 | "# perform transformation\n",
214 | "data3 = enc.transform(data)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 10,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "name": "stdout",
224 | "output_type": "stream",
225 | "text": [
226 | "0 620\n",
227 | "1 209\n",
228 | "2 28\n",
229 | "4 18\n",
230 | "3 16\n",
231 | "Name: SibSp, dtype: int64\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "# check the result\n",
237 | "print(data3.SibSp.value_counts())"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {
244 | "collapsed": true
245 | },
246 | "outputs": [],
247 | "source": []
248 | }
249 | ],
250 | "metadata": {
251 | "kernelspec": {
252 | "display_name": "Python 3",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.6.1"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 2
271 | }
272 |
--------------------------------------------------------------------------------
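The two encoders above live in the repo's feature_cleaning/rare_values.py. For readers without the module, a minimal stand-alone sketch of the same two strategies (pandas only; the helper names group_rare and impute_with_mode are illustrative, not the repo's API):

```python
# Stand-alone sketch of the two rare-label strategies demonstrated above.
# Illustrative only -- not the repo's feature_cleaning/rare_values.py API.
import pandas as pd

def group_rare(series: pd.Series, threshold: float = 0.01) -> pd.Series:
    """Replace labels rarer than `threshold` with the category 'rare'."""
    freq = series.value_counts(normalize=True)
    rare_labels = freq[freq < threshold].index
    return series.where(~series.isin(rare_labels), 'rare')

def impute_with_mode(series: pd.Series, threshold: float = 0.01) -> pd.Series:
    """Replace labels rarer than `threshold` with the most frequent label."""
    freq = series.value_counts(normalize=True)
    rare_labels = freq[freq < threshold].index
    return series.where(~series.isin(rare_labels), series.mode()[0])

data = pd.read_csv('./data/titanic.csv')
print(group_rare(data['SibSp']).value_counts())        # 8 and 5 become 'rare'
print(impute_with_mode(data['SibSp']).value_counts())  # 8 and 5 become 0
```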
/3.1_Demo_Feature_Scaling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "\n",
18 | "# plt.style.use('seaborn-colorblind')\n",
19 | "# %matplotlib inline\n",
20 | "#from feature_cleaning import rare_values as ra"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Load Dataset"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "use_cols = [\n",
39 | " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
40 | " 'Survived'\n",
41 | "]\n",
42 | "\n",
43 | "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/html": [
54 | "
\n",
55 | "\n",
68 | "
\n",
69 | " \n",
70 | " \n",
71 | " | \n",
72 | " Survived | \n",
73 | " Pclass | \n",
74 | " Sex | \n",
75 | " Age | \n",
76 | " SibSp | \n",
77 | " Fare | \n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " 0 | \n",
83 | " 0 | \n",
84 | " 3 | \n",
85 | " male | \n",
86 | " 22.0 | \n",
87 | " 1 | \n",
88 | " 7.2500 | \n",
89 | "
\n",
90 | " \n",
91 | " 1 | \n",
92 | " 1 | \n",
93 | " 1 | \n",
94 | " female | \n",
95 | " 38.0 | \n",
96 | " 1 | \n",
97 | " 71.2833 | \n",
98 | "
\n",
99 | " \n",
100 | " 2 | \n",
101 | " 1 | \n",
102 | " 3 | \n",
103 | " female | \n",
104 | " 26.0 | \n",
105 | " 0 | \n",
106 | " 7.9250 | \n",
107 | "
\n",
108 | " \n",
109 | "
\n",
110 | "
"
111 | ],
112 | "text/plain": [
113 | " Survived Pclass Sex Age SibSp Fare\n",
114 | "0 0 3 male 22.0 1 7.2500\n",
115 | "1 1 1 female 38.0 1 71.2833\n",
116 | "2 1 3 female 26.0 0 7.9250"
117 | ]
118 | },
119 | "execution_count": 3,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "data.head(3)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 4,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "data": {
135 | "text/plain": [
136 | "((623, 6), (268, 6))"
137 | ]
138 | },
139 | "execution_count": 4,
140 | "metadata": {},
141 | "output_type": "execute_result"
142 | }
143 | ],
144 | "source": [
145 | "# Note that we include target variable in the X_train \n",
146 | "# because we need it to supervise our discretization\n",
147 | "# this is not the standard way of using train-test-split\n",
148 | "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
149 | " random_state=0)\n",
150 | "X_train.shape, X_test.shape"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Normalization - Standardization (Z-score scaling)\n",
158 | "\n",
159 | "removes the mean and scales the data to unit variance.
z = (X - X.mean) / std"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 5,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "name": "stdout",
169 | "output_type": "stream",
170 | "text": [
171 | " Survived Pclass Sex Age SibSp Fare Fare_zscore\n",
172 | "857 1 1 male 51.0 0 26.5500 -0.122530\n",
173 | "52 1 1 female 49.0 1 76.7292 0.918124\n",
174 | "386 0 3 male 1.0 5 46.9000 0.299503\n",
175 | "124 0 1 male 54.0 0 77.2875 0.929702\n",
176 | "578 0 3 female NaN 1 14.4583 -0.373297\n",
177 | "549 1 2 male 8.0 1 36.7500 0.089005\n"
178 | ]
179 | }
180 | ],
181 | "source": [
182 | "# add the new created feature\n",
183 | "from sklearn.preprocessing import StandardScaler\n",
184 | "ss = StandardScaler().fit(X_train[['Fare']])\n",
185 | "X_train_copy = X_train.copy(deep=True)\n",
186 | "X_train_copy['Fare_zscore'] = ss.transform(X_train_copy[['Fare']])\n",
187 | "print(X_train_copy.head(6))"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 6,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "5.916437306188636e-17\n",
200 | "1.0008035356861\n"
201 | ]
202 | }
203 | ],
204 | "source": [
205 | "# check if it is with mean=0 std=1\n",
206 | "print(X_train_copy['Fare_zscore'].mean())\n",
207 | "print(X_train_copy['Fare_zscore'].std())\n"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "## Min-Max scaling\n",
215 | "transforms features by scaling each feature to a given range. Default to [0,1].
X_scaled = (X - X.min / (X.max - X.min)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 7,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | " Survived Pclass Sex Age SibSp Fare Fare_minmax\n",
228 | "857 1 1 male 51.0 0 26.5500 0.051822\n",
229 | "52 1 1 female 49.0 1 76.7292 0.149765\n",
230 | "386 0 3 male 1.0 5 46.9000 0.091543\n",
231 | "124 0 1 male 54.0 0 77.2875 0.150855\n",
232 | "578 0 3 female NaN 1 14.4583 0.028221\n",
233 | "549 1 2 male 8.0 1 36.7500 0.071731\n"
234 | ]
235 | }
236 | ],
237 | "source": [
238 | "# add the new created feature\n",
239 | "from sklearn.preprocessing import MinMaxScaler\n",
240 | "mms = MinMaxScaler().fit(X_train[['Fare']])\n",
241 | "X_train_copy = X_train.copy(deep=True)\n",
242 | "X_train_copy['Fare_minmax'] = mms.transform(X_train_copy[['Fare']])\n",
243 | "print(X_train_copy.head(6))"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 8,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "1.0\n",
256 | "0.0\n"
257 | ]
258 | }
259 | ],
260 | "source": [
261 | "# check the range of Fare_minmax\n",
262 | "print(X_train_copy['Fare_minmax'].max())\n",
263 | "print(X_train_copy['Fare_minmax'].min())"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "source": [
272 | "## Robust scaling\n",
273 | "removes the median and scales the data according to the quantile range (defaults to IQR)
X_scaled = (X - X.median) / IQR"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 9,
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | " Survived Pclass Sex Age SibSp Fare Fare_robust\n",
286 | "857 1 1 male 51.0 0 26.5500 0.492275\n",
287 | "52 1 1 female 49.0 1 76.7292 2.630973\n",
288 | "386 0 3 male 1.0 5 46.9000 1.359616\n",
289 | "124 0 1 male 54.0 0 77.2875 2.654768\n",
290 | "578 0 3 female NaN 1 14.4583 -0.023088\n",
291 | "549 1 2 male 8.0 1 36.7500 0.927011\n"
292 | ]
293 | }
294 | ],
295 | "source": [
296 | "# add the new created feature\n",
297 | "from sklearn.preprocessing import RobustScaler\n",
298 | "rs = RobustScaler().fit(X_train[['Fare']])\n",
299 | "X_train_copy = X_train.copy(deep=True)\n",
300 | "X_train_copy['Fare_robust'] = rs.transform(X_train_copy[['Fare']])\n",
301 | "print(X_train_copy.head(6))"
302 | ]
303 | }
304 | ],
305 | "metadata": {
306 | "kernelspec": {
307 | "display_name": "Python 3",
308 | "language": "python",
309 | "name": "python3"
310 | },
311 | "language_info": {
312 | "codemirror_mode": {
313 | "name": "ipython",
314 | "version": 3
315 | },
316 | "file_extension": ".py",
317 | "mimetype": "text/x-python",
318 | "name": "python",
319 | "nbconvert_exporter": "python",
320 | "pygments_lexer": "ipython3",
321 | "version": "3.6.1"
322 | }
323 | },
324 | "nbformat": 4,
325 | "nbformat_minor": 2
326 | }
327 |
--------------------------------------------------------------------------------
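The three scalers above can be cross-checked with plain pandas arithmetic. A minimal sketch of all three formulas (note that StandardScaler uses the population standard deviation, hence ddof=0):

```python
# Hand-rolled versions of the three scalings above, for cross-checking
# against StandardScaler, MinMaxScaler and RobustScaler.
import pandas as pd

fare = pd.read_csv('./data/titanic.csv')['Fare']

z_score = (fare - fare.mean()) / fare.std(ddof=0)          # StandardScaler
min_max = (fare - fare.min()) / (fare.max() - fare.min())  # MinMaxScaler
iqr = fare.quantile(0.75) - fare.quantile(0.25)
robust = (fare - fare.median()) / iqr                      # RobustScaler

print(z_score.mean(), z_score.std())   # ~0 and ~1
print(min_max.min(), min_max.max())    # 0.0 and 1.0
```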
/3.3_Demo_Feature_Encoding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "\n",
18 | "import category_encoders as ce\n",
19 | "from feature_engineering import encoding\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Load Dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/html": [
37 | "\n",
38 | "\n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " | \n",
55 | " Survived | \n",
56 | " Pclass | \n",
57 | " Sex | \n",
58 | " Age | \n",
59 | " SibSp | \n",
60 | " Fare | \n",
61 | "
\n",
62 | " \n",
63 | " \n",
64 | " \n",
65 | " 0 | \n",
66 | " 0 | \n",
67 | " 3 | \n",
68 | " male | \n",
69 | " 22.0 | \n",
70 | " 1 | \n",
71 | " 7.2500 | \n",
72 | "
\n",
73 | " \n",
74 | " 1 | \n",
75 | " 1 | \n",
76 | " 1 | \n",
77 | " female | \n",
78 | " 38.0 | \n",
79 | " 1 | \n",
80 | " 71.2833 | \n",
81 | "
\n",
82 | " \n",
83 | " 2 | \n",
84 | " 1 | \n",
85 | " 3 | \n",
86 | " female | \n",
87 | " 26.0 | \n",
88 | " 0 | \n",
89 | " 7.9250 | \n",
90 | "
\n",
91 | " \n",
92 | " 3 | \n",
93 | " 1 | \n",
94 | " 1 | \n",
95 | " female | \n",
96 | " 35.0 | \n",
97 | " 1 | \n",
98 | " 53.1000 | \n",
99 | "
\n",
100 | " \n",
101 | " 4 | \n",
102 | " 0 | \n",
103 | " 3 | \n",
104 | " male | \n",
105 | " 35.0 | \n",
106 | " 0 | \n",
107 | " 8.0500 | \n",
108 | "
\n",
109 | " \n",
110 | "
\n",
111 | "
"
112 | ],
113 | "text/plain": [
114 | " Survived Pclass Sex Age SibSp Fare\n",
115 | "0 0 3 male 22.0 1 7.2500\n",
116 | "1 1 1 female 38.0 1 71.2833\n",
117 | "2 1 3 female 26.0 0 7.9250\n",
118 | "3 1 1 female 35.0 1 53.1000\n",
119 | "4 0 3 male 35.0 0 8.0500"
120 | ]
121 | },
122 | "execution_count": 2,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "use_cols = [\n",
129 | " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
130 | " 'Survived'\n",
131 | "]\n",
132 | "\n",
133 | "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
134 | "data.head()"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 3,
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "data": {
144 | "text/plain": [
145 | "((623, 6), (268, 6))"
146 | ]
147 | },
148 | "execution_count": 3,
149 | "metadata": {},
150 | "output_type": "execute_result"
151 | }
152 | ],
153 | "source": [
154 | "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
155 | " random_state=0)\n",
156 | "X_train.shape, X_test.shape"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "## One-hot encoding\n",
164 | "replace the categorical variable by different boolean variables (0/1) to indicate whether or not certain label is true for that observation"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 4,
170 | "metadata": {
171 | "collapsed": true
172 | },
173 | "outputs": [],
174 | "source": [
175 | "data1 = pd.get_dummies(data,drop_first=True)"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 5,
181 | "metadata": {
182 | "scrolled": true
183 | },
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/html": [
188 | "\n",
189 | "\n",
202 | "
\n",
203 | " \n",
204 | " \n",
205 | " | \n",
206 | " Survived | \n",
207 | " Pclass | \n",
208 | " Age | \n",
209 | " SibSp | \n",
210 | " Fare | \n",
211 | " Sex_male | \n",
212 | "
\n",
213 | " \n",
214 | " \n",
215 | " \n",
216 | " 0 | \n",
217 | " 0 | \n",
218 | " 3 | \n",
219 | " 22.0 | \n",
220 | " 1 | \n",
221 | " 7.2500 | \n",
222 | " 1 | \n",
223 | "
\n",
224 | " \n",
225 | " 1 | \n",
226 | " 1 | \n",
227 | " 1 | \n",
228 | " 38.0 | \n",
229 | " 1 | \n",
230 | " 71.2833 | \n",
231 | " 0 | \n",
232 | "
\n",
233 | " \n",
234 | " 2 | \n",
235 | " 1 | \n",
236 | " 3 | \n",
237 | " 26.0 | \n",
238 | " 0 | \n",
239 | " 7.9250 | \n",
240 | " 0 | \n",
241 | "
\n",
242 | " \n",
243 | " 3 | \n",
244 | " 1 | \n",
245 | " 1 | \n",
246 | " 35.0 | \n",
247 | " 1 | \n",
248 | " 53.1000 | \n",
249 | " 0 | \n",
250 | "
\n",
251 | " \n",
252 | " 4 | \n",
253 | " 0 | \n",
254 | " 3 | \n",
255 | " 35.0 | \n",
256 | " 0 | \n",
257 | " 8.0500 | \n",
258 | " 1 | \n",
259 | "
\n",
260 | " \n",
261 | "
\n",
262 | "
"
263 | ],
264 | "text/plain": [
265 | " Survived Pclass Age SibSp Fare Sex_male\n",
266 | "0 0 3 22.0 1 7.2500 1\n",
267 | "1 1 1 38.0 1 71.2833 0\n",
268 | "2 1 3 26.0 0 7.9250 0\n",
269 | "3 1 1 35.0 1 53.1000 0\n",
270 | "4 0 3 35.0 0 8.0500 1"
271 | ]
272 | },
273 | "execution_count": 5,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "data1.head()"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "## Ordinal-encoding\n",
287 | "replace the labels by some ordinal number if ordinal is meaningful"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 6,
293 | "metadata": {
294 | "collapsed": true
295 | },
296 | "outputs": [],
297 | "source": [
298 | "ord_enc = ce.OrdinalEncoder(cols=['Sex']).fit(X_train,y_train)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 7,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "name": "stdout",
308 | "output_type": "stream",
309 | "text": [
310 | " Survived Pclass Sex Age SibSp Fare\n",
311 | "0 0 3 1 22.0 1 7.2500\n",
312 | "1 1 1 2 38.0 1 71.2833\n",
313 | "2 1 3 2 26.0 0 7.9250\n",
314 | "3 1 1 2 35.0 1 53.1000\n",
315 | "4 0 3 1 35.0 0 8.0500\n"
316 | ]
317 | }
318 | ],
319 | "source": [
320 | "data4 = ord_enc.transform(data)\n",
321 | "print(data4.head(5))"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "## Mean encoding\n",
329 | "replace the label by the mean of the target for that label. \n",
330 | "(the target must be 0/1 valued or continuous)\n"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 8,
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "data": {
340 | "text/plain": [
341 | "Sex\n",
342 | "female 0.753488\n",
343 | "male 0.196078\n",
344 | "Name: Survived, dtype: float64"
345 | ]
346 | },
347 | "execution_count": 8,
348 | "metadata": {},
349 | "output_type": "execute_result"
350 | }
351 | ],
352 | "source": [
353 | "# cross check-- the mean of target group by Sex\n",
354 | "X_train['Survived'].groupby(data['Sex']).mean()\n"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": 9,
360 | "metadata": {},
361 | "outputs": [],
362 | "source": [
363 | "mean_enc = encoding.MeanEncoding(cols=['Sex']).fit(X_train,y_train)"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 10,
369 | "metadata": {},
370 | "outputs": [
371 | {
372 | "name": "stdout",
373 | "output_type": "stream",
374 | "text": [
375 | " Survived Pclass Sex Age SibSp Fare\n",
376 | "0 0 3 0.196078 22.0 1 7.2500\n",
377 | "1 1 1 0.753488 38.0 1 71.2833\n",
378 | "2 1 3 0.753488 26.0 0 7.9250\n",
379 | "3 1 1 0.753488 35.0 1 53.1000\n",
380 | "4 0 3 0.196078 35.0 0 8.0500\n"
381 | ]
382 | }
383 | ],
384 | "source": [
385 | "data6 = mean_enc.transform(data)\n",
386 | "print(data6.head(5))"
387 | ]
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "metadata": {},
392 | "source": [
393 | "## Target-encoding\n",
394 | "Similar to mean encoding, but use both posterior probability and prior probability of the target"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 11,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "# create the encoder and fit with our data\n",
404 | "target_enc = ce.TargetEncoder(cols=['Sex']).fit(X_train,y_train)"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 12,
410 | "metadata": {
411 | "collapsed": true
412 | },
413 | "outputs": [],
414 | "source": [
415 | "# perform transformation\n",
416 | "# data.Survived.groupby(data['Sex']).agg(['mean'])\n",
417 | "data2 = target_enc.transform(data)"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 13,
423 | "metadata": {},
424 | "outputs": [
425 | {
426 | "data": {
427 | "text/html": [
428 | "\n",
429 | "\n",
442 | "
\n",
443 | " \n",
444 | " \n",
445 | " | \n",
446 | " Survived | \n",
447 | " Pclass | \n",
448 | " Sex | \n",
449 | " Age | \n",
450 | " SibSp | \n",
451 | " Fare | \n",
452 | "
\n",
453 | " \n",
454 | " \n",
455 | " \n",
456 | " 0 | \n",
457 | " 0 | \n",
458 | " 3 | \n",
459 | " 0.196078 | \n",
460 | " 22.0 | \n",
461 | " 1 | \n",
462 | " 7.2500 | \n",
463 | "
\n",
464 | " \n",
465 | " 1 | \n",
466 | " 1 | \n",
467 | " 1 | \n",
468 | " 0.753488 | \n",
469 | " 38.0 | \n",
470 | " 1 | \n",
471 | " 71.2833 | \n",
472 | "
\n",
473 | " \n",
474 | " 2 | \n",
475 | " 1 | \n",
476 | " 3 | \n",
477 | " 0.753488 | \n",
478 | " 26.0 | \n",
479 | " 0 | \n",
480 | " 7.9250 | \n",
481 | "
\n",
482 | " \n",
483 | " 3 | \n",
484 | " 1 | \n",
485 | " 1 | \n",
486 | " 0.753488 | \n",
487 | " 35.0 | \n",
488 | " 1 | \n",
489 | " 53.1000 | \n",
490 | "
\n",
491 | " \n",
492 | " 4 | \n",
493 | " 0 | \n",
494 | " 3 | \n",
495 | " 0.196078 | \n",
496 | " 35.0 | \n",
497 | " 0 | \n",
498 | " 8.0500 | \n",
499 | "
\n",
500 | " \n",
501 | "
\n",
502 | "
"
503 | ],
504 | "text/plain": [
505 | " Survived Pclass Sex Age SibSp Fare\n",
506 | "0 0 3 0.196078 22.0 1 7.2500\n",
507 | "1 1 1 0.753488 38.0 1 71.2833\n",
508 | "2 1 3 0.753488 26.0 0 7.9250\n",
509 | "3 1 1 0.753488 35.0 1 53.1000\n",
510 | "4 0 3 0.196078 35.0 0 8.0500"
511 | ]
512 | },
513 | "execution_count": 13,
514 | "metadata": {},
515 | "output_type": "execute_result"
516 | }
517 | ],
518 | "source": [
519 | "# check the result\n",
520 | "data2.head()"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "## WOE-encoding\n",
528 | "replace the label with Weight of Evidence of each label. WOE is computed from the basic odds ratio: \n",
529 | "\n",
530 | "ln( (Proportion of Good Outcomes) / (Proportion of Bad Outcomes))"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 14,
536 | "metadata": {
537 | "collapsed": true
538 | },
539 | "outputs": [],
540 | "source": [
541 | "woe_enc = ce.WOEEncoder(cols=['Sex']).fit(X_train,y_train)"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": 15,
547 | "metadata": {
548 | "collapsed": true
549 | },
550 | "outputs": [],
551 | "source": [
552 | "data3 = woe_enc.transform(data)"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": 16,
558 | "metadata": {},
559 | "outputs": [
560 | {
561 | "data": {
562 | "text/html": [
563 | "\n",
564 | "\n",
577 | "
\n",
578 | " \n",
579 | " \n",
580 | " | \n",
581 | " Survived | \n",
582 | " Pclass | \n",
583 | " Sex | \n",
584 | " Age | \n",
585 | " SibSp | \n",
586 | " Fare | \n",
587 | "
\n",
588 | " \n",
589 | " \n",
590 | " \n",
591 | " 0 | \n",
592 | " 0 | \n",
593 | " 3 | \n",
594 | " -0.950742 | \n",
595 | " 22.0 | \n",
596 | " 1 | \n",
597 | " 7.2500 | \n",
598 | "
\n",
599 | " \n",
600 | " 1 | \n",
601 | " 1 | \n",
602 | " 1 | \n",
603 | " 1.555633 | \n",
604 | " 38.0 | \n",
605 | " 1 | \n",
606 | " 71.2833 | \n",
607 | "
\n",
608 | " \n",
609 | " 2 | \n",
610 | " 1 | \n",
611 | " 3 | \n",
612 | " 1.555633 | \n",
613 | " 26.0 | \n",
614 | " 0 | \n",
615 | " 7.9250 | \n",
616 | "
\n",
617 | " \n",
618 | " 3 | \n",
619 | " 1 | \n",
620 | " 1 | \n",
621 | " 1.555633 | \n",
622 | " 35.0 | \n",
623 | " 1 | \n",
624 | " 53.1000 | \n",
625 | "
\n",
626 | " \n",
627 | " 4 | \n",
628 | " 0 | \n",
629 | " 3 | \n",
630 | " -0.950742 | \n",
631 | " 35.0 | \n",
632 | " 0 | \n",
633 | " 8.0500 | \n",
634 | "
\n",
635 | " \n",
636 | "
\n",
637 | "
"
638 | ],
639 | "text/plain": [
640 | " Survived Pclass Sex Age SibSp Fare\n",
641 | "0 0 3 -0.950742 22.0 1 7.2500\n",
642 | "1 1 1 1.555633 38.0 1 71.2833\n",
643 | "2 1 3 1.555633 26.0 0 7.9250\n",
644 | "3 1 1 1.555633 35.0 1 53.1000\n",
645 | "4 0 3 -0.950742 35.0 0 8.0500"
646 | ]
647 | },
648 | "execution_count": 16,
649 | "metadata": {},
650 | "output_type": "execute_result"
651 | }
652 | ],
653 | "source": [
654 | "data3.head(5)"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": null,
660 | "metadata": {
661 | "collapsed": true
662 | },
663 | "outputs": [],
664 | "source": []
665 | }
666 | ],
667 | "metadata": {
668 | "kernelspec": {
669 | "display_name": "Python 3",
670 | "language": "python",
671 | "name": "python3"
672 | },
673 | "language_info": {
674 | "codemirror_mode": {
675 | "name": "ipython",
676 | "version": 3
677 | },
678 | "file_extension": ".py",
679 | "mimetype": "text/x-python",
680 | "name": "python",
681 | "nbconvert_exporter": "python",
682 | "pygments_lexer": "ipython3",
683 | "version": "3.6.1"
684 | }
685 | },
686 | "nbformat": 4,
687 | "nbformat_minor": 2
688 | }
689 |
--------------------------------------------------------------------------------
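Mean encoding and WOE can be reproduced with plain pandas, which makes the mappings shown above easy to verify. A minimal sketch (category_encoders applies smoothing/regularisation, so its values can differ slightly from these raw estimates):

```python
# Plain-pandas versions of mean encoding and WOE encoding (a sketch;
# category_encoders adds regularisation, so values may differ slightly).
import numpy as np
import pandas as pd

data = pd.read_csv('./data/titanic.csv', usecols=['Sex', 'Survived'])
y = data['Survived']

# mean encoding: each label -> mean of the target within that label
mean_map = y.groupby(data['Sex']).mean()
data['Sex_mean'] = data['Sex'].map(mean_map)

# WOE: ln(share of positives in the label / share of negatives in the label)
pos = data.loc[y == 1, 'Sex'].value_counts(normalize=True)
neg = data.loc[y == 0, 'Sex'].value_counts(normalize=True)
data['Sex_woe'] = data['Sex'].map(np.log(pos / neg))

print(data.head())
```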
/3.5_Demo_Feature_Generation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import roc_curve, roc_auc_score\n",
18 | "\n",
19 | "# plt.style.use('seaborn-colorblind')\n",
20 | "# %matplotlib inline\n",
21 | "#from feature_cleaning import rare_values as ra"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Load Dataset"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "collapsed": true
36 | },
37 | "outputs": [],
38 | "source": [
39 | "use_cols = [\n",
40 | " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
41 | " 'Survived'\n",
42 | "]\n",
43 | "\n",
44 | "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " Survived | \n",
74 | " Pclass | \n",
75 | " Sex | \n",
76 | " Age | \n",
77 | " SibSp | \n",
78 | " Fare | \n",
79 | "
\n",
80 | " \n",
81 | " \n",
82 | " \n",
83 | " 0 | \n",
84 | " 0 | \n",
85 | " 3 | \n",
86 | " male | \n",
87 | " 22.0 | \n",
88 | " 1 | \n",
89 | " 7.2500 | \n",
90 | "
\n",
91 | " \n",
92 | " 1 | \n",
93 | " 1 | \n",
94 | " 1 | \n",
95 | " female | \n",
96 | " 38.0 | \n",
97 | " 1 | \n",
98 | " 71.2833 | \n",
99 | "
\n",
100 | " \n",
101 | " 2 | \n",
102 | " 1 | \n",
103 | " 3 | \n",
104 | " female | \n",
105 | " 26.0 | \n",
106 | " 0 | \n",
107 | " 7.9250 | \n",
108 | "
\n",
109 | " \n",
110 | "
\n",
111 | "
"
112 | ],
113 | "text/plain": [
114 | " Survived Pclass Sex Age SibSp Fare\n",
115 | "0 0 3 male 22.0 1 7.2500\n",
116 | "1 1 1 female 38.0 1 71.2833\n",
117 | "2 1 3 female 26.0 0 7.9250"
118 | ]
119 | },
120 | "execution_count": 3,
121 | "metadata": {},
122 | "output_type": "execute_result"
123 | }
124 | ],
125 | "source": [
126 | "data.head(3)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 4,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "((623, 6), (268, 6))"
138 | ]
139 | },
140 | "execution_count": 4,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "# Note that we include target variable in the X_train \n",
147 | "# because we need it to supervise our discretization\n",
148 | "# this is not the standard way of using train-test-split\n",
149 | "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
150 | " random_state=0)\n",
151 | "X_train.shape, X_test.shape"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "## Polynomial Expansion\n",
159 | "\n",
160 | "generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 5,
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
172 | " Pclass SibSp Pclass^2 Pclass SibSp SibSp^2\n",
173 | "0 1.0 0.0 1.0 0.0 0.0\n",
174 | "1 1.0 1.0 1.0 1.0 1.0\n",
175 | "2 3.0 5.0 9.0 15.0 25.0\n",
176 | "3 1.0 0.0 1.0 0.0 0.0\n",
177 | "4 3.0 1.0 9.0 3.0 1.0\n",
178 | "5 2.0 1.0 4.0 2.0 1.0\n"
179 | ]
180 | }
181 | ],
182 | "source": [
183 | "# create polynomial combinations of feature 'Pclass','SibSp' with degree 2\n",
184 | "from sklearn.preprocessing import PolynomialFeatures\n",
185 | "pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
186 | "tmp = pf.transform(X_train[['Pclass','SibSp']])\n",
187 | "X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))\n",
188 | "print(X_train_copy.head(6))"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "## Feature Learning by Trees\n",
196 | "GBDT derived feature + LR"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 6,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "name": "stdout",
206 | "output_type": "stream",
207 | "text": [
208 | "sample's belonging node of each base tree \n",
209 | "' [[ 7. 7. 6. ... 4. 7. 4.]\n",
210 | " [ 7. 7. 6. ... 14. 7. 7.]\n",
211 | " [11. 11. 11. ... 4. 6. 11.]\n",
212 | " ...\n",
213 | " [10. 10. 10. ... 4. 6. 10.]\n",
214 | " [13. 14. 13. ... 4. 7. 13.]\n",
215 | " [ 7. 7. 6. ... 6. 7. 7.]]\n",
216 | "AUC for GBDT derived feature + LR: 0.7746130952380953\n"
217 | ]
218 | },
219 | {
220 | "name": "stderr",
221 | "output_type": "stream",
222 | "text": [
223 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
224 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
225 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
226 | " warnings.warn(msg, FutureWarning)\n"
227 | ]
228 | }
229 | ],
230 | "source": [
231 | "from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
232 | "from sklearn.preprocessing import OneHotEncoder\n",
233 | "\n",
234 | "gbdt = GradientBoostingClassifier(n_estimators=20)\n",
235 | "one_hot = OneHotEncoder()\n",
236 | "\n",
237 | "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
238 | "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
239 | "\n",
240 | "gbdt.fit(X_train, y_train)\n",
241 | "\n",
242 | "X_leaf_index = gbdt.apply(X_train)[:, :, 0] # apply return the node index on each tree \n",
243 | "print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
244 | "# fit one-hot encoder\n",
245 | "one_hot.fit(X_leaf_index) \n",
246 | "X_one_hot = one_hot.transform(X_leaf_index) \n",
247 | "\n",
248 | "\n",
249 | "from sklearn.linear_model import LogisticRegression\n",
250 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
251 | "lr.fit(X_one_hot,y_train)\n",
252 | "y_pred = lr.predict_proba(\n",
253 | " one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]\n",
254 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
255 | "print(\"AUC for GBDT derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "## Feature Learning by Trees\n",
263 | "RandomForest derived feature + LR"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 7,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "name": "stdout",
273 | "output_type": "stream",
274 | "text": [
275 | "sample's belonging node of each base tree \n",
276 | "' [[212 35 79 ... 146 60 46]\n",
277 | " [307 165 266 ... 136 132 44]\n",
278 | " [285 285 320 ... 301 294 300]\n",
279 | " ...\n",
280 | " [ 13 177 133 ... 186 169 117]\n",
281 | " [190 296 311 ... 282 289 297]\n",
282 | " [264 165 243 ... 152 110 314]]\n",
283 | "AUC for RandomForest derived feature + LR: 0.759672619047619\n"
284 | ]
285 | },
286 | {
287 | "name": "stderr",
288 | "output_type": "stream",
289 | "text": [
290 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
291 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
292 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
293 | " warnings.warn(msg, FutureWarning)\n"
294 | ]
295 | }
296 | ],
297 | "source": [
298 | "rf = RandomForestClassifier(n_estimators=20)\n",
299 | "one_hot = OneHotEncoder()\n",
300 | "\n",
301 | "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
302 | "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
303 | "\n",
304 | "rf.fit(X_train, y_train)\n",
305 | "\n",
306 | "X_leaf_index = rf.apply(X_train) # apply return the node index on each tree \n",
307 | "print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
308 | "# fit one-hot encoder\n",
309 | "one_hot.fit(X_leaf_index) \n",
310 | "X_one_hot = one_hot.transform(X_leaf_index) \n",
311 | "\n",
312 | "\n",
313 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
314 | "lr.fit(X_one_hot,y_train)\n",
315 | "y_pred = lr.predict_proba(\n",
316 | " one_hot.transform(rf.apply(X_test)))[:,1]\n",
317 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
318 | "print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {
324 | "collapsed": true
325 | },
326 | "source": [
327 | "## Feature Learning by Trees\n",
328 | "GBDT derived feature + Raw feature +LR"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 8,
334 | "metadata": {},
335 | "outputs": [
336 | {
337 | "name": "stdout",
338 | "output_type": "stream",
339 | "text": [
340 | "AUC for GBDT derived feature + Raw feature +LR: 0.7603571428571428\n"
341 | ]
342 | }
343 | ],
344 | "source": [
345 | "from scipy.sparse import hstack\n",
346 | "\n",
347 | "X_train_ext = hstack([one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])\n",
348 | "X_test_ext = hstack([one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])\n",
349 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
350 | "lr.fit(X_train_ext,y_train)\n",
351 | "y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
352 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
353 | "print(\"AUC for GBDT derived feature + Raw feature +LR:\", roc_auc_score(y_test, y_pred))\n"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "## Feature Learning by Trees\n",
361 | "RandomForest derived feature + Raw feature +LR"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": 9,
367 | "metadata": {},
368 | "outputs": [
369 | {
370 | "name": "stdout",
371 | "output_type": "stream",
372 | "text": [
373 | "AUC for RandomForest derived feature + Raw feature + LR: 0.76\n"
374 | ]
375 | }
376 | ],
377 | "source": [
378 | "X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])\n",
379 | "X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])\n",
380 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
381 | "lr.fit(X_train_ext,y_train)\n",
382 | "y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
383 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
384 | "print(\"AUC for RandomForest derived feature + Raw feature + LR:\", roc_auc_score(y_test, y_pred))\n"
385 | ]
386 | },
387 | {
388 | "cell_type": "markdown",
389 | "metadata": {},
390 | "source": [
391 | "## Feature Learning by Trees\n",
392 | "Use only Raw Feature + LR"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 10,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "AUC for RandomForest derived feature + LR: 0.6988690476190476\n"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
410 | "lr.fit(X_train,y_train)\n",
411 | "y_pred = lr.predict_proba(X_test)[:,1]\n",
412 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
413 | "print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "## Feature Learning by Trees\n",
421 | "\n",
422 | "Use only Raw Feature + GBDT"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": 13,
428 | "metadata": {},
429 | "outputs": [
430 | {
431 | "name": "stdout",
432 | "output_type": "stream",
433 | "text": [
434 | "AUC for Raw feature + GBDT: 0.7613988095238096\n"
435 | ]
436 | }
437 | ],
438 | "source": [
439 | "gbdt = GradientBoostingClassifier(n_estimators=20)\n",
440 | "\n",
441 | "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
442 | "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
443 | "\n",
444 | "gbdt.fit(X_train, y_train)\n",
445 | "y_pred = gbdt.predict_proba(X_test)[:,1]\n",
446 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
447 | "print(\"AUC for Raw feature + GBDT:\", roc_auc_score(y_test, y_pred))\n"
448 | ]
449 | },
450 | {
451 | "cell_type": "markdown",
452 | "metadata": {},
453 | "source": [
454 | "## Feature Learning by Trees\n",
455 | "\n",
456 | "Use only Raw Feature + RF\n"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 16,
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "name": "stdout",
466 | "output_type": "stream",
467 | "text": [
468 | "AUC for Raw feature + RF: 0.7235119047619047\n"
469 | ]
470 | }
471 | ],
472 | "source": [
473 | "rf = RandomForestClassifier(n_estimators=20)\n",
474 | "\n",
475 | "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
476 | "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
477 | "\n",
478 | "rf.fit(X_train, y_train)\n",
479 | "y_pred = rf.predict_proba(X_test)[:,1]\n",
480 | "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
481 | "print(\"AUC for Raw feature + RF:\", roc_auc_score(y_test, y_pred))"
482 | ]
483 | },
484 | {
485 | "cell_type": "markdown",
486 | "metadata": {},
487 | "source": [
488 | "#### Without tuning, we can see GBDT derived feature + LR get the best result"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {
495 | "collapsed": true
496 | },
497 | "outputs": [],
498 | "source": []
499 | }
500 | ],
501 | "metadata": {
502 | "kernelspec": {
503 | "display_name": "Python 3",
504 | "language": "python",
505 | "name": "python3"
506 | },
507 | "language_info": {
508 | "codemirror_mode": {
509 | "name": "ipython",
510 | "version": 3
511 | },
512 | "file_extension": ".py",
513 | "mimetype": "text/x-python",
514 | "name": "python",
515 | "nbconvert_exporter": "python",
516 | "pygments_lexer": "ipython3",
517 | "version": "3.6.1"
518 | }
519 | },
520 | "nbformat": 4,
521 | "nbformat_minor": 2
522 | }
523 |
--------------------------------------------------------------------------------
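The leaf-index trick in the notebook above condenses to a short pipeline with current scikit-learn. A minimal sketch (handle_unknown='ignore' covers test-set leaves unseen during fit, and the modern OneHotEncoder defaults avoid the FutureWarning captured in the outputs):

```python
# Condensed GBDT-leaf-index -> one-hot -> logistic regression pipeline
# (a sketch with current sklearn defaults; exact AUC varies run to run).
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv('./data/titanic.csv',
                   usecols=['Pclass', 'Age', 'Fare', 'SibSp', 'Survived'])
X = data[['Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

gbdt = GradientBoostingClassifier(n_estimators=20).fit(X_train, y_train)
enc = OneHotEncoder(handle_unknown='ignore')       # leaf index -> one-hot
enc.fit(gbdt.apply(X_train)[:, :, 0])

lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(enc.transform(gbdt.apply(X_train)[:, :, 0]), y_train)
y_pred = lr.predict_proba(enc.transform(gbdt.apply(X_test)[:, :, 0]))[:, 1]
print('AUC (GBDT leaves + LR):', roc_auc_score(y_test, y_pred))
```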
/4.1_Demo_Feature_Selection_Filter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "# plt.style.use('seaborn-colorblind')\n",
18 | "# %matplotlib inline\n",
19 | "from feature_selection import filter_method as ft"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Load Dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "from sklearn.datasets import load_breast_cancer\n",
38 | "data = load_breast_cancer()\n",
39 | "data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
40 | " columns= np.append(data['feature_names'], ['target']))"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "\n",
52 | "\n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " | \n",
69 | " mean radius | \n",
70 | " mean texture | \n",
71 | " mean perimeter | \n",
72 | " mean area | \n",
73 | " mean smoothness | \n",
74 | " mean compactness | \n",
75 | " mean concavity | \n",
76 | " mean concave points | \n",
77 | " mean symmetry | \n",
78 | " mean fractal dimension | \n",
79 | " ... | \n",
80 | " worst texture | \n",
81 | " worst perimeter | \n",
82 | " worst area | \n",
83 | " worst smoothness | \n",
84 | " worst compactness | \n",
85 | " worst concavity | \n",
86 | " worst concave points | \n",
87 | " worst symmetry | \n",
88 | " worst fractal dimension | \n",
89 | " target | \n",
90 | "
\n",
91 | " \n",
92 | " \n",
93 | " \n",
94 | " 0 | \n",
95 | " 17.99 | \n",
96 | " 10.38 | \n",
97 | " 122.80 | \n",
98 | " 1001.0 | \n",
99 | " 0.11840 | \n",
100 | " 0.27760 | \n",
101 | " 0.3001 | \n",
102 | " 0.14710 | \n",
103 | " 0.2419 | \n",
104 | " 0.07871 | \n",
105 | " ... | \n",
106 | " 17.33 | \n",
107 | " 184.60 | \n",
108 | " 2019.0 | \n",
109 | " 0.1622 | \n",
110 | " 0.6656 | \n",
111 | " 0.7119 | \n",
112 | " 0.2654 | \n",
113 | " 0.4601 | \n",
114 | " 0.11890 | \n",
115 | " 0.0 | \n",
116 | "
\n",
117 | " \n",
118 | " 1 | \n",
119 | " 20.57 | \n",
120 | " 17.77 | \n",
121 | " 132.90 | \n",
122 | " 1326.0 | \n",
123 | " 0.08474 | \n",
124 | " 0.07864 | \n",
125 | " 0.0869 | \n",
126 | " 0.07017 | \n",
127 | " 0.1812 | \n",
128 | " 0.05667 | \n",
129 | " ... | \n",
130 | " 23.41 | \n",
131 | " 158.80 | \n",
132 | " 1956.0 | \n",
133 | " 0.1238 | \n",
134 | " 0.1866 | \n",
135 | " 0.2416 | \n",
136 | " 0.1860 | \n",
137 | " 0.2750 | \n",
138 | " 0.08902 | \n",
139 | " 0.0 | \n",
140 | "
\n",
141 | " \n",
142 | " 2 | \n",
143 | " 19.69 | \n",
144 | " 21.25 | \n",
145 | " 130.00 | \n",
146 | " 1203.0 | \n",
147 | " 0.10960 | \n",
148 | " 0.15990 | \n",
149 | " 0.1974 | \n",
150 | " 0.12790 | \n",
151 | " 0.2069 | \n",
152 | " 0.05999 | \n",
153 | " ... | \n",
154 | " 25.53 | \n",
155 | " 152.50 | \n",
156 | " 1709.0 | \n",
157 | " 0.1444 | \n",
158 | " 0.4245 | \n",
159 | " 0.4504 | \n",
160 | " 0.2430 | \n",
161 | " 0.3613 | \n",
162 | " 0.08758 | \n",
163 | " 0.0 | \n",
164 | "
\n",
165 | " \n",
166 | " 3 | \n",
167 | " 11.42 | \n",
168 | " 20.38 | \n",
169 | " 77.58 | \n",
170 | " 386.1 | \n",
171 | " 0.14250 | \n",
172 | " 0.28390 | \n",
173 | " 0.2414 | \n",
174 | " 0.10520 | \n",
175 | " 0.2597 | \n",
176 | " 0.09744 | \n",
177 | " ... | \n",
178 | " 26.50 | \n",
179 | " 98.87 | \n",
180 | " 567.7 | \n",
181 | " 0.2098 | \n",
182 | " 0.8663 | \n",
183 | " 0.6869 | \n",
184 | " 0.2575 | \n",
185 | " 0.6638 | \n",
186 | " 0.17300 | \n",
187 | " 0.0 | \n",
188 | "
\n",
189 | " \n",
190 | " 4 | \n",
191 | " 20.29 | \n",
192 | " 14.34 | \n",
193 | " 135.10 | \n",
194 | " 1297.0 | \n",
195 | " 0.10030 | \n",
196 | " 0.13280 | \n",
197 | " 0.1980 | \n",
198 | " 0.10430 | \n",
199 | " 0.1809 | \n",
200 | " 0.05883 | \n",
201 | " ... | \n",
202 | " 16.67 | \n",
203 | " 152.20 | \n",
204 | " 1575.0 | \n",
205 | " 0.1374 | \n",
206 | " 0.2050 | \n",
207 | " 0.4000 | \n",
208 | " 0.1625 | \n",
209 | " 0.2364 | \n",
210 | " 0.07678 | \n",
211 | " 0.0 | \n",
212 | "
\n",
213 | " \n",
214 | "
\n",
215 | "
5 rows × 31 columns
\n",
216 | "
"
217 | ],
218 | "text/plain": [
219 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n",
220 | "0 17.99 10.38 122.80 1001.0 0.11840 \n",
221 | "1 20.57 17.77 132.90 1326.0 0.08474 \n",
222 | "2 19.69 21.25 130.00 1203.0 0.10960 \n",
223 | "3 11.42 20.38 77.58 386.1 0.14250 \n",
224 | "4 20.29 14.34 135.10 1297.0 0.10030 \n",
225 | "\n",
226 | " mean compactness mean concavity mean concave points mean symmetry \\\n",
227 | "0 0.27760 0.3001 0.14710 0.2419 \n",
228 | "1 0.07864 0.0869 0.07017 0.1812 \n",
229 | "2 0.15990 0.1974 0.12790 0.2069 \n",
230 | "3 0.28390 0.2414 0.10520 0.2597 \n",
231 | "4 0.13280 0.1980 0.10430 0.1809 \n",
232 | "\n",
233 | " mean fractal dimension ... worst texture worst perimeter worst area \\\n",
234 | "0 0.07871 ... 17.33 184.60 2019.0 \n",
235 | "1 0.05667 ... 23.41 158.80 1956.0 \n",
236 | "2 0.05999 ... 25.53 152.50 1709.0 \n",
237 | "3 0.09744 ... 26.50 98.87 567.7 \n",
238 | "4 0.05883 ... 16.67 152.20 1575.0 \n",
239 | "\n",
240 | " worst smoothness worst compactness worst concavity worst concave points \\\n",
241 | "0 0.1622 0.6656 0.7119 0.2654 \n",
242 | "1 0.1238 0.1866 0.2416 0.1860 \n",
243 | "2 0.1444 0.4245 0.4504 0.2430 \n",
244 | "3 0.2098 0.8663 0.6869 0.2575 \n",
245 | "4 0.1374 0.2050 0.4000 0.1625 \n",
246 | "\n",
247 | " worst symmetry worst fractal dimension target \n",
248 | "0 0.4601 0.11890 0.0 \n",
249 | "1 0.2750 0.08902 0.0 \n",
250 | "2 0.3613 0.08758 0.0 \n",
251 | "3 0.6638 0.17300 0.0 \n",
252 | "4 0.2364 0.07678 0.0 \n",
253 | "\n",
254 | "[5 rows x 31 columns]"
255 | ]
256 | },
257 | "execution_count": 3,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "data.head(5)"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 4,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "data": {
273 | "text/plain": [
274 | "((455, 30), (114, 30))"
275 | ]
276 | },
277 | "execution_count": 4,
278 | "metadata": {},
279 | "output_type": "execute_result"
280 | }
281 | ],
282 | "source": [
283 | "X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
284 | " data.target, test_size=0.2,\n",
285 | " random_state=0)\n",
286 | "X_train.shape, X_test.shape"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "## Variance method\n",
294 | "removing features that show the same value for the majority/all of the observations (constant/quasi-constant features)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 5,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "0 variables are found to be almost constant\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "# the original dataset has no constant variable\n",
312 | "quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 6,
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/plain": [
323 | "1.0 0.923077\n",
324 | "0.0 0.068132\n",
325 | "2.0 0.008791\n",
326 | "Name: dummy, dtype: float64"
327 | ]
328 | },
329 | "execution_count": 6,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "# lets create a duumy variable that help us do the demonstration\n",
336 | "X_train['dummy'] = np.floor(X_train['worst smoothness']*10)\n",
337 | "# variable dummy has> 92% of the observations show one value, 1.0\n",
338 | "X_train.dummy.value_counts() / np.float(len(X_train))"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 7,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "name": "stdout",
348 | "output_type": "stream",
349 | "text": [
350 | "1 variables are found to be almost constant\n"
351 | ]
352 | },
353 | {
354 | "data": {
355 | "text/plain": [
356 | "['dummy']"
357 | ]
358 | },
359 | "execution_count": 7,
360 | "metadata": {},
361 | "output_type": "execute_result"
362 | }
363 | ],
364 | "source": [
365 | "quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)\n",
366 | "quasi_constant_feature"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 8,
372 | "metadata": {},
373 | "outputs": [
374 | {
375 | "name": "stdout",
376 | "output_type": "stream",
377 | "text": [
378 | "(455, 30)\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "# drop that variable\n",
384 | "X_train.drop(labels=quasi_constant_feature,axis=1,inplace=True)\n",
385 | "print(X_train.shape)"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "## Correlation method\n",
393 | "remove features that are highly correlated with each other"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 9,
399 | "metadata": {},
400 | "outputs": [
401 | {
402 | "name": "stdout",
403 | "output_type": "stream",
404 | "text": [
405 | " feature1 feature2 corr\n",
406 | "0 mean perimeter mean radius 0.998185\n",
407 | "6 mean perimeter mean area 0.986692\n",
408 | "14 mean perimeter worst perimeter 0.970507\n",
409 | "19 mean perimeter worst radius 0.969520\n",
410 | "33 mean perimeter worst area 0.941920 \n",
411 | "\n",
412 | " feature1 feature2 corr\n",
413 | "12 perimeter error radius error 0.978323\n",
414 | "30 perimeter error area error 0.944995 \n",
415 | "\n",
416 | " feature1 feature2 corr\n",
417 | "36 mean concavity mean concave points 0.914627 \n",
418 | "\n",
419 | " feature1 feature2 corr\n",
420 | "38 mean texture worst texture 0.908182 \n",
421 | "\n",
422 | " feature1 feature2 corr\n",
423 | "40 worst concave points mean concave points 0.906312 \n",
424 | "\n"
425 | ]
426 | }
427 | ],
428 | "source": [
429 | "corr = ft.corr_feature_detect(data=X_train,threshold=0.9)\n",
430 | "# print all the correlated feature groups!\n",
431 | "for i in corr:\n",
432 | " print(i,'\\n')"
433 | ]
434 | },
435 | {
436 | "cell_type": "markdown",
437 | "metadata": {},
438 | "source": [
439 | "then we can decide which ones to remove."
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "## Mutual Information Filter\n",
447 | "Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y."
448 | ]
449 | },
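ft.mutual_info wraps this kind of selection; the same result can be sketched directly with scikit-learn's SelectKBest and mutual_info_classif, assuming the X_train/y_train split created above:

```python
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# keep the 3 features with the highest estimated mutual information
selector = SelectKBest(score_func=mutual_info_classif, k=3)
selector.fit(X_train, y_train)
print(X_train.columns[selector.get_support()])
```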
450 | {
451 | "cell_type": "code",
452 | "execution_count": 10,
453 | "metadata": {},
454 | "outputs": [
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "Index(['mean concave points', 'worst perimeter', 'worst area'], dtype='object')\n"
460 | ]
461 | }
462 | ],
463 | "source": [
464 | "# select the top 3 features\n",
465 | "mi = ft.mutual_info(X=X_train,y=y_train,select_k=3)\n",
466 | "print(mi)"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 11,
472 | "metadata": {},
473 | "outputs": [
474 | {
475 | "name": "stdout",
476 | "output_type": "stream",
477 | "text": [
478 | "Index(['mean perimeter', 'mean concave points', 'worst radius',\n",
479 | " 'worst perimeter', 'worst area', 'worst concave points'],\n",
480 | " dtype='object')\n"
481 | ]
482 | }
483 | ],
484 | "source": [
485 | "# select the top 20% features\n",
486 | "mi = ft.mutual_info(X=X_train,y=y_train,select_k=0.2)\n",
487 | "print(mi)"
488 | ]
489 | },
490 | {
491 | "cell_type": "markdown",
492 | "metadata": {},
493 | "source": [
494 | "## Chi-Square Filter\n",
495 | "Compute chi-squared stats between each non-negative feature and class"
496 | ]
497 | },
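As with mutual information, this can be sketched with plain scikit-learn; chi2 requires non-negative inputs, which holds for this dataset (again assuming the X_train/y_train split from above):

```python
from sklearn.feature_selection import SelectKBest, chi2

# rank features by the chi-squared statistic and keep the top 3
selector = SelectKBest(score_func=chi2, k=3)
selector.fit(X_train, y_train)
print(X_train.columns[selector.get_support()])
```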
498 | {
499 | "cell_type": "code",
500 | "execution_count": 12,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stdout",
505 | "output_type": "stream",
506 | "text": [
507 | "Index(['mean area', 'area error', 'worst area'], dtype='object')\n"
508 | ]
509 | }
510 | ],
511 | "source": [
512 | "# select the top 3 features\n",
513 | "chi = ft.chi_square_test(X=X_train,y=y_train,select_k=3)\n",
514 | "print(chi)"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 13,
520 | "metadata": {},
521 | "outputs": [
522 | {
523 | "name": "stdout",
524 | "output_type": "stream",
525 | "text": [
526 | "Index(['mean perimeter', 'mean area', 'area error', 'worst radius',\n",
527 | " 'worst perimeter', 'worst area'],\n",
528 | " dtype='object')\n"
529 | ]
530 | }
531 | ],
532 | "source": [
533 | "# select the top 20% features\n",
534 | "chi = ft.chi_square_test(X=X_train,y=y_train,select_k=0.2)\n",
535 | "print(chi)"
536 | ]
537 | },
538 | {
539 | "cell_type": "markdown",
540 | "metadata": {},
541 | "source": [
542 | "## Univariate ROC-AUC or MSE\n",
543 | "builds one decision tree per feature, to predict the target, then make predictions and ranks the features according to the machine learning metric (roc-auc or mse)"
544 | ]
545 | },
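A minimal sketch of the idea behind ft.univariate_roc_auc, assuming the train/test split from above; the repo's helper additionally prints the kept/dropped summary shown in the output below:

```python
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# fit one decision tree per feature and score it on the test set
scores = {}
for col in X_train.columns:
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X_train[[col]], y_train)
    proba = tree.predict_proba(X_test[[col]])[:, 1]
    scores[col] = roc_auc_score(y_test, proba)

ranking = pd.Series(scores).sort_values(ascending=False)
print(ranking[ranking > 0.8])  # features kept under a 0.8 threshold
```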
546 | {
547 | "cell_type": "code",
548 | "execution_count": 18,
549 | "metadata": {},
550 | "outputs": [
551 | {
552 | "name": "stdout",
553 | "output_type": "stream",
554 | "text": [
555 | "worst perimeter 0.917275\n",
556 | "worst area 0.895840\n",
557 | "worst radius 0.893458\n",
558 | "worst concave points 0.863131\n",
559 | "mean concavity 0.856939\n",
560 | "mean radius 0.849000\n",
561 | "mean area 0.839314\n",
562 | "worst concavity 0.831375\n",
563 | "mean perimeter 0.829628\n",
564 | "mean concave points 0.826453\n",
565 | "area error 0.812321\n",
566 | "worst compactness 0.742299\n",
567 | "radius error 0.740235\n",
568 | "mean compactness 0.734360\n",
569 | "perimeter error 0.680534\n",
570 | "worst texture 0.647666\n",
571 | "worst fractal dimension 0.640997\n",
572 | "concavity error 0.640203\n",
573 | "worst symmetry 0.620991\n",
574 | "concave points error 0.618133\n",
575 | "compactness error 0.607336\n",
576 | "mean symmetry 0.591775\n",
577 | "mean texture 0.573357\n",
578 | "texture error 0.568593\n",
579 | "worst smoothness 0.565100\n",
580 | "mean smoothness 0.557637\n",
581 | "fractal dimension error 0.542077\n",
582 | "smoothness error 0.522706\n",
583 | "symmetry error 0.493649\n",
584 | "mean fractal dimension 0.475548\n",
585 | "dtype: float64\n",
586 | "11 out of the 30 featues are kept\n",
587 | "mean radius 0.849000\n",
588 | "mean perimeter 0.829628\n",
589 | "mean area 0.839314\n",
590 | "mean concavity 0.856939\n",
591 | "mean concave points 0.826453\n",
592 | "area error 0.812321\n",
593 | "worst radius 0.893458\n",
594 | "worst perimeter 0.917275\n",
595 | "worst area 0.895840\n",
596 | "worst concavity 0.831375\n",
597 | "worst concave points 0.863131\n",
598 | "dtype: float64\n"
599 | ]
600 | }
601 | ],
602 | "source": [
603 | "uni_roc_auc = ft.univariate_roc_auc(X_train=X_train,y_train=y_train,\n",
604 | " X_test=X_test,y_test=y_test,threshold=0.8)\n",
605 | "print(uni_roc_auc)"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": 17,
611 | "metadata": {},
612 | "outputs": [
613 | {
614 | "name": "stdout",
615 | "output_type": "stream",
616 | "text": [
617 | "mean fractal dimension 0.491228\n",
618 | "symmetry error 0.480750\n",
619 | "fractal dimension error 0.456140\n",
620 | "smoothness error 0.449561\n",
621 | "texture error 0.412281\n",
622 | "worst smoothness 0.403265\n",
623 | "mean smoothness 0.399123\n",
624 | "mean texture 0.396930\n",
625 | "mean symmetry 0.363060\n",
626 | "compactness error 0.361842\n",
627 | "concave points error 0.357456\n",
628 | "worst fractal dimension 0.355263\n",
629 | "worst symmetry 0.350877\n",
630 | "worst texture 0.333333\n",
631 | "concavity error 0.333333\n",
632 | "perimeter error 0.300439\n",
633 | "mean compactness 0.258772\n",
634 | "worst compactness 0.254386\n",
635 | "radius error 0.245614\n",
636 | "area error 0.179825\n",
637 | "mean perimeter 0.166667\n",
638 | "mean concave points 0.166667\n",
639 | "worst concavity 0.162281\n",
640 | "mean radius 0.146930\n",
641 | "mean concavity 0.142544\n",
642 | "mean area 0.140351\n",
643 | "worst concave points 0.123782\n",
644 | "worst area 0.103070\n",
645 | "worst radius 0.100877\n",
646 | "worst perimeter 0.098684\n",
647 | "dtype: float64\n",
648 | "6 out of the 30 featues are kept\n",
649 | "mean fractal dimension 0.491228\n",
650 | "texture error 0.412281\n",
651 | "smoothness error 0.449561\n",
652 | "symmetry error 0.480750\n",
653 | "fractal dimension error 0.456140\n",
654 | "worst smoothness 0.403265\n",
655 | "dtype: float64\n"
656 | ]
657 | }
658 | ],
659 | "source": [
660 | "uni_mse = ft.univariate_mse(X_train=X_train,y_train=y_train,\n",
661 | " X_test=X_test,y_test=y_test,threshold=0.4)\n",
662 | "print(uni_mse)"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": null,
668 | "metadata": {
669 | "collapsed": true
670 | },
671 | "outputs": [],
672 | "source": []
673 | }
674 | ],
675 | "metadata": {
676 | "kernelspec": {
677 | "display_name": "Python 3",
678 | "language": "python",
679 | "name": "python3"
680 | },
681 | "language_info": {
682 | "codemirror_mode": {
683 | "name": "ipython",
684 | "version": 3
685 | },
686 | "file_extension": ".py",
687 | "mimetype": "text/x-python",
688 | "name": "python",
689 | "nbconvert_exporter": "python",
690 | "pygments_lexer": "ipython3",
691 | "version": "3.6.1"
692 | }
693 | },
694 | "nbformat": 4,
695 | "nbformat_minor": 2
696 | }
697 |
--------------------------------------------------------------------------------
/4.2_Demo_Feature_Selection_Wrapper.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 45,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n",
18 | "from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS\n",
19 | "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
20 | "\n",
21 | "# plt.style.use('seaborn-colorblind')\n",
22 | "# %matplotlib inline\n",
23 | "# from feature_selection import filter_method as ft"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "## Load Dataset"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {
37 | "collapsed": true
38 | },
39 | "outputs": [],
40 | "source": [
41 | "from sklearn.datasets import load_breast_cancer\n",
42 | "data = load_breast_cancer()\n",
43 | "data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
44 | " columns= np.append(data['feature_names'], ['target']))"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " mean radius | \n",
74 | " mean texture | \n",
75 | " mean perimeter | \n",
76 | " mean area | \n",
77 | " mean smoothness | \n",
78 | " mean compactness | \n",
79 | " mean concavity | \n",
80 | " mean concave points | \n",
81 | " mean symmetry | \n",
82 | " mean fractal dimension | \n",
83 | " ... | \n",
84 | " worst texture | \n",
85 | " worst perimeter | \n",
86 | " worst area | \n",
87 | " worst smoothness | \n",
88 | " worst compactness | \n",
89 | " worst concavity | \n",
90 | " worst concave points | \n",
91 | " worst symmetry | \n",
92 | " worst fractal dimension | \n",
93 | " target | \n",
94 | "
\n",
95 | " \n",
96 | " \n",
97 | " \n",
98 | " 0 | \n",
99 | " 17.99 | \n",
100 | " 10.38 | \n",
101 | " 122.80 | \n",
102 | " 1001.0 | \n",
103 | " 0.11840 | \n",
104 | " 0.27760 | \n",
105 | " 0.3001 | \n",
106 | " 0.14710 | \n",
107 | " 0.2419 | \n",
108 | " 0.07871 | \n",
109 | " ... | \n",
110 | " 17.33 | \n",
111 | " 184.60 | \n",
112 | " 2019.0 | \n",
113 | " 0.1622 | \n",
114 | " 0.6656 | \n",
115 | " 0.7119 | \n",
116 | " 0.2654 | \n",
117 | " 0.4601 | \n",
118 | " 0.11890 | \n",
119 | " 0.0 | \n",
120 | "
\n",
121 | " \n",
122 | " 1 | \n",
123 | " 20.57 | \n",
124 | " 17.77 | \n",
125 | " 132.90 | \n",
126 | " 1326.0 | \n",
127 | " 0.08474 | \n",
128 | " 0.07864 | \n",
129 | " 0.0869 | \n",
130 | " 0.07017 | \n",
131 | " 0.1812 | \n",
132 | " 0.05667 | \n",
133 | " ... | \n",
134 | " 23.41 | \n",
135 | " 158.80 | \n",
136 | " 1956.0 | \n",
137 | " 0.1238 | \n",
138 | " 0.1866 | \n",
139 | " 0.2416 | \n",
140 | " 0.1860 | \n",
141 | " 0.2750 | \n",
142 | " 0.08902 | \n",
143 | " 0.0 | \n",
144 | "
\n",
145 | " \n",
146 | " 2 | \n",
147 | " 19.69 | \n",
148 | " 21.25 | \n",
149 | " 130.00 | \n",
150 | " 1203.0 | \n",
151 | " 0.10960 | \n",
152 | " 0.15990 | \n",
153 | " 0.1974 | \n",
154 | " 0.12790 | \n",
155 | " 0.2069 | \n",
156 | " 0.05999 | \n",
157 | " ... | \n",
158 | " 25.53 | \n",
159 | " 152.50 | \n",
160 | " 1709.0 | \n",
161 | " 0.1444 | \n",
162 | " 0.4245 | \n",
163 | " 0.4504 | \n",
164 | " 0.2430 | \n",
165 | " 0.3613 | \n",
166 | " 0.08758 | \n",
167 | " 0.0 | \n",
168 | "
\n",
169 | " \n",
170 | " 3 | \n",
171 | " 11.42 | \n",
172 | " 20.38 | \n",
173 | " 77.58 | \n",
174 | " 386.1 | \n",
175 | " 0.14250 | \n",
176 | " 0.28390 | \n",
177 | " 0.2414 | \n",
178 | " 0.10520 | \n",
179 | " 0.2597 | \n",
180 | " 0.09744 | \n",
181 | " ... | \n",
182 | " 26.50 | \n",
183 | " 98.87 | \n",
184 | " 567.7 | \n",
185 | " 0.2098 | \n",
186 | " 0.8663 | \n",
187 | " 0.6869 | \n",
188 | " 0.2575 | \n",
189 | " 0.6638 | \n",
190 | " 0.17300 | \n",
191 | " 0.0 | \n",
192 | "
\n",
193 | " \n",
194 | " 4 | \n",
195 | " 20.29 | \n",
196 | " 14.34 | \n",
197 | " 135.10 | \n",
198 | " 1297.0 | \n",
199 | " 0.10030 | \n",
200 | " 0.13280 | \n",
201 | " 0.1980 | \n",
202 | " 0.10430 | \n",
203 | " 0.1809 | \n",
204 | " 0.05883 | \n",
205 | " ... | \n",
206 | " 16.67 | \n",
207 | " 152.20 | \n",
208 | " 1575.0 | \n",
209 | " 0.1374 | \n",
210 | " 0.2050 | \n",
211 | " 0.4000 | \n",
212 | " 0.1625 | \n",
213 | " 0.2364 | \n",
214 | " 0.07678 | \n",
215 | " 0.0 | \n",
216 | "
\n",
217 | " \n",
218 | "
\n",
219 | "
5 rows × 31 columns
\n",
220 | "
"
221 | ],
222 | "text/plain": [
223 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n",
224 | "0 17.99 10.38 122.80 1001.0 0.11840 \n",
225 | "1 20.57 17.77 132.90 1326.0 0.08474 \n",
226 | "2 19.69 21.25 130.00 1203.0 0.10960 \n",
227 | "3 11.42 20.38 77.58 386.1 0.14250 \n",
228 | "4 20.29 14.34 135.10 1297.0 0.10030 \n",
229 | "\n",
230 | " mean compactness mean concavity mean concave points mean symmetry \\\n",
231 | "0 0.27760 0.3001 0.14710 0.2419 \n",
232 | "1 0.07864 0.0869 0.07017 0.1812 \n",
233 | "2 0.15990 0.1974 0.12790 0.2069 \n",
234 | "3 0.28390 0.2414 0.10520 0.2597 \n",
235 | "4 0.13280 0.1980 0.10430 0.1809 \n",
236 | "\n",
237 | " mean fractal dimension ... worst texture worst perimeter worst area \\\n",
238 | "0 0.07871 ... 17.33 184.60 2019.0 \n",
239 | "1 0.05667 ... 23.41 158.80 1956.0 \n",
240 | "2 0.05999 ... 25.53 152.50 1709.0 \n",
241 | "3 0.09744 ... 26.50 98.87 567.7 \n",
242 | "4 0.05883 ... 16.67 152.20 1575.0 \n",
243 | "\n",
244 | " worst smoothness worst compactness worst concavity worst concave points \\\n",
245 | "0 0.1622 0.6656 0.7119 0.2654 \n",
246 | "1 0.1238 0.1866 0.2416 0.1860 \n",
247 | "2 0.1444 0.4245 0.4504 0.2430 \n",
248 | "3 0.2098 0.8663 0.6869 0.2575 \n",
249 | "4 0.1374 0.2050 0.4000 0.1625 \n",
250 | "\n",
251 | " worst symmetry worst fractal dimension target \n",
252 | "0 0.4601 0.11890 0.0 \n",
253 | "1 0.2750 0.08902 0.0 \n",
254 | "2 0.3613 0.08758 0.0 \n",
255 | "3 0.6638 0.17300 0.0 \n",
256 | "4 0.2364 0.07678 0.0 \n",
257 | "\n",
258 | "[5 rows x 31 columns]"
259 | ]
260 | },
261 | "execution_count": 3,
262 | "metadata": {},
263 | "output_type": "execute_result"
264 | }
265 | ],
266 | "source": [
267 | "data.head(5)"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 4,
273 | "metadata": {},
274 | "outputs": [
275 | {
276 | "data": {
277 | "text/plain": [
278 | "((455, 30), (114, 30))"
279 | ]
280 | },
281 | "execution_count": 4,
282 | "metadata": {},
283 | "output_type": "execute_result"
284 | }
285 | ],
286 | "source": [
287 | "X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
288 | " data.target, test_size=0.2,\n",
289 | " random_state=0)\n",
290 | "X_train.shape, X_test.shape"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "## Forward Selection\n",
298 | " "
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 16,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "name": "stderr",
308 | "output_type": "stream",
309 | "text": [
310 | "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
311 | "[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.4s finished\n",
312 | "Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
313 | "[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
314 | "Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
315 | "[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
316 | "Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
317 | "[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.3s finished\n",
318 | "Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
319 | "[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.0s finished\n",
320 | "Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
321 | "[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
322 | "Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
323 | "[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
324 | "Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
325 | "[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
326 | "Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
327 | "[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.4s finished\n",
328 | "Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
329 | "[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.1s finished\n",
330 | "Features: 10/10"
331 | ]
332 | }
333 | ],
334 | "source": [
335 | "# step forward feature selection\n",
336 | "# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
337 | "\n",
338 | "sfs1 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
339 | " k_features=10, \n",
340 | " forward=True, \n",
341 | " floating=False, \n",
342 | " verbose=1,\n",
343 | " scoring='roc_auc',\n",
344 | " cv=3)\n",
345 | "\n",
346 | "sfs1 = sfs1.fit(np.array(X_train), y_train)"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 17,
352 | "metadata": {},
353 | "outputs": [
354 | {
355 | "data": {
356 | "text/plain": [
357 | "Index(['mean texture', 'mean perimeter', 'mean concavity',\n",
358 | " 'mean fractal dimension', 'area error', 'compactness error',\n",
359 | " 'worst perimeter', 'worst area', 'worst smoothness', 'worst symmetry'],\n",
360 | " dtype='object')"
361 | ]
362 | },
363 | "execution_count": 17,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "selected_feat1= X_train.columns[list(sfs1.k_feature_idx_)]\n",
370 | "selected_feat1"
371 | ]
372 | },
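As a quick sanity check, the selected subset can be refit and scored on the held-out set; a minimal sketch assuming the split above (the n_estimators value here is an illustrative choice, not the notebook's):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# evaluate the forward-selected subset on the held-out test set
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(X_train[selected_feat1], y_train)
proba = rf.predict_proba(X_test[selected_feat1])[:, 1]
print(f"roc_auc with {len(selected_feat1)} features: {roc_auc_score(y_test, proba):.3f}")
```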
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "## Backward Elimination"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 18,
383 | "metadata": {},
384 | "outputs": [
385 | {
386 | "name": "stderr",
387 | "output_type": "stream",
388 | "text": [
389 | "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
390 | "[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.5s finished\n",
391 | "Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
392 | "[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
393 | "Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
394 | "[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
395 | "Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
396 | "[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.2s finished\n",
397 | "Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
398 | "[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.1s finished\n",
399 | "Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
400 | "[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
401 | "Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
402 | "[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
403 | "Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
404 | "[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
405 | "Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
406 | "[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.5s finished\n",
407 | "Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
408 | "[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.2s finished\n",
409 | "Features: 10/10"
410 | ]
411 | }
412 | ],
413 | "source": [
414 | "# step backward feature selection\n",
415 | "# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
416 | "\n",
417 | "sfs2 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
418 | " k_features=10, \n",
419 | " forward=False, \n",
420 | " floating=False, \n",
421 | " verbose=1,\n",
422 | " scoring='roc_auc',\n",
423 | " cv=3)\n",
424 | "\n",
425 | "sfs2 = sfs1.fit(np.array(X_train.fillna(0)), y_train)"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 44,
431 | "metadata": {},
432 | "outputs": [
433 | {
434 | "data": {
435 | "text/plain": [
436 | "Index(['mean area', 'mean compactness', 'texture error', 'area error',\n",
437 | " 'compactness error', 'concavity error', 'worst texture',\n",
438 | " 'worst perimeter', 'worst smoothness', 'worst concavity'],\n",
439 | " dtype='object')"
440 | ]
441 | },
442 | "execution_count": 44,
443 | "metadata": {},
444 | "output_type": "execute_result"
445 | }
446 | ],
447 | "source": [
448 | "selected_feat2= X_train.columns[list(sfs2.k_feature_idx_)]\n",
449 | "selected_feat2\n"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "Note that SFS and SBE return different results"
457 | ]
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "## Exhaustive Feature Selection"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 51,
469 | "metadata": {},
470 | "outputs": [
471 | {
472 | "name": "stderr",
473 | "output_type": "stream",
474 | "text": [
475 | "Features: 847/847"
476 | ]
477 | }
478 | ],
479 | "source": [
480 | "efs1 = EFS(RandomForestClassifier(n_jobs=-1,n_estimators=5, random_state=0), \n",
481 | " min_features=1,\n",
482 | " max_features=6, \n",
483 | " scoring='roc_auc',\n",
484 | " print_progress=True,\n",
485 | " cv=2)\n",
486 | "\n",
487 | "# in order to shorter search time for the demonstration\n",
488 | "# we only try all possible 1,2,3,4,5,6\n",
489 | "# feature combinations from a dataset of 10 features\n",
490 | "\n",
491 | "efs1 = efs1.fit(np.array(X_train[X_train.columns[0:10]].fillna(0)), y_train)"
492 | ]
493 | },
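The `Features: 847/847` counter above is simply the number of candidate subsets: all combinations of 1 to 6 features drawn from the 10 supplied. A one-liner to verify (math.comb requires Python 3.8+, newer than the 3.6 kernel recorded in this notebook):

```python
from math import comb

# C(10,1) + ... + C(10,6) = 10 + 45 + 120 + 210 + 252 + 210 = 847
print(sum(comb(10, k) for k in range(1, 7)))  # 847
```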
494 | {
495 | "cell_type": "code",
496 | "execution_count": 52,
497 | "metadata": {},
498 | "outputs": [
499 | {
500 | "data": {
501 | "text/plain": [
502 | "Index(['mean radius', 'mean texture', 'mean area', 'mean smoothness',\n",
503 | " 'mean concavity'],\n",
504 | " dtype='object')"
505 | ]
506 | },
507 | "execution_count": 52,
508 | "metadata": {},
509 | "output_type": "execute_result"
510 | }
511 | ],
512 | "source": [
513 | "selected_feat3= X_train.columns[list(efs1.best_idx_)]\n",
514 | "selected_feat3"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {
521 | "collapsed": true
522 | },
523 | "outputs": [],
524 | "source": []
525 | }
526 | ],
527 | "metadata": {
528 | "kernelspec": {
529 | "display_name": "Python 3",
530 | "language": "python",
531 | "name": "python3"
532 | },
533 | "language_info": {
534 | "codemirror_mode": {
535 | "name": "ipython",
536 | "version": 3
537 | },
538 | "file_extension": ".py",
539 | "mimetype": "text/x-python",
540 | "name": "python",
541 | "nbconvert_exporter": "python",
542 | "pygments_lexer": "ipython3",
543 | "version": "3.6.1"
544 | }
545 | },
546 | "nbformat": 4,
547 | "nbformat_minor": 2
548 | }
549 |
--------------------------------------------------------------------------------
/4.4_Demo_Feature_Selection_Feature_Shuffling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "# import seaborn as sns\n",
14 | "# import matplotlib.pyplot as plt\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.feature_selection import SelectFromModel\n",
18 | "from sklearn.ensemble import RandomForestClassifier\n",
19 | "# plt.style.use('seaborn-colorblind')\n",
20 | "# %matplotlib inline\n",
21 | "from feature_selection import feature_shuffle\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Load Dataset"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "collapsed": true
36 | },
37 | "outputs": [],
38 | "source": [
39 | "from sklearn.datasets import load_breast_cancer\n",
40 | "data = load_breast_cancer()\n",
41 | "data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
42 | " columns= np.append(data['feature_names'], ['target']))"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/html": [
53 | "\n",
54 | "\n",
67 | "
\n",
68 | " \n",
69 | " \n",
70 | " | \n",
71 | " mean radius | \n",
72 | " mean texture | \n",
73 | " mean perimeter | \n",
74 | " mean area | \n",
75 | " mean smoothness | \n",
76 | " mean compactness | \n",
77 | " mean concavity | \n",
78 | " mean concave points | \n",
79 | " mean symmetry | \n",
80 | " mean fractal dimension | \n",
81 | " ... | \n",
82 | " worst texture | \n",
83 | " worst perimeter | \n",
84 | " worst area | \n",
85 | " worst smoothness | \n",
86 | " worst compactness | \n",
87 | " worst concavity | \n",
88 | " worst concave points | \n",
89 | " worst symmetry | \n",
90 | " worst fractal dimension | \n",
91 | " target | \n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | " \n",
96 | " 0 | \n",
97 | " 17.99 | \n",
98 | " 10.38 | \n",
99 | " 122.80 | \n",
100 | " 1001.0 | \n",
101 | " 0.11840 | \n",
102 | " 0.27760 | \n",
103 | " 0.3001 | \n",
104 | " 0.14710 | \n",
105 | " 0.2419 | \n",
106 | " 0.07871 | \n",
107 | " ... | \n",
108 | " 17.33 | \n",
109 | " 184.60 | \n",
110 | " 2019.0 | \n",
111 | " 0.1622 | \n",
112 | " 0.6656 | \n",
113 | " 0.7119 | \n",
114 | " 0.2654 | \n",
115 | " 0.4601 | \n",
116 | " 0.11890 | \n",
117 | " 0.0 | \n",
118 | "
\n",
119 | " \n",
120 | " 1 | \n",
121 | " 20.57 | \n",
122 | " 17.77 | \n",
123 | " 132.90 | \n",
124 | " 1326.0 | \n",
125 | " 0.08474 | \n",
126 | " 0.07864 | \n",
127 | " 0.0869 | \n",
128 | " 0.07017 | \n",
129 | " 0.1812 | \n",
130 | " 0.05667 | \n",
131 | " ... | \n",
132 | " 23.41 | \n",
133 | " 158.80 | \n",
134 | " 1956.0 | \n",
135 | " 0.1238 | \n",
136 | " 0.1866 | \n",
137 | " 0.2416 | \n",
138 | " 0.1860 | \n",
139 | " 0.2750 | \n",
140 | " 0.08902 | \n",
141 | " 0.0 | \n",
142 | "
\n",
143 | " \n",
144 | " 2 | \n",
145 | " 19.69 | \n",
146 | " 21.25 | \n",
147 | " 130.00 | \n",
148 | " 1203.0 | \n",
149 | " 0.10960 | \n",
150 | " 0.15990 | \n",
151 | " 0.1974 | \n",
152 | " 0.12790 | \n",
153 | " 0.2069 | \n",
154 | " 0.05999 | \n",
155 | " ... | \n",
156 | " 25.53 | \n",
157 | " 152.50 | \n",
158 | " 1709.0 | \n",
159 | " 0.1444 | \n",
160 | " 0.4245 | \n",
161 | " 0.4504 | \n",
162 | " 0.2430 | \n",
163 | " 0.3613 | \n",
164 | " 0.08758 | \n",
165 | " 0.0 | \n",
166 | "
\n",
167 | " \n",
168 | " 3 | \n",
169 | " 11.42 | \n",
170 | " 20.38 | \n",
171 | " 77.58 | \n",
172 | " 386.1 | \n",
173 | " 0.14250 | \n",
174 | " 0.28390 | \n",
175 | " 0.2414 | \n",
176 | " 0.10520 | \n",
177 | " 0.2597 | \n",
178 | " 0.09744 | \n",
179 | " ... | \n",
180 | " 26.50 | \n",
181 | " 98.87 | \n",
182 | " 567.7 | \n",
183 | " 0.2098 | \n",
184 | " 0.8663 | \n",
185 | " 0.6869 | \n",
186 | " 0.2575 | \n",
187 | " 0.6638 | \n",
188 | " 0.17300 | \n",
189 | " 0.0 | \n",
190 | "
\n",
191 | " \n",
192 | " 4 | \n",
193 | " 20.29 | \n",
194 | " 14.34 | \n",
195 | " 135.10 | \n",
196 | " 1297.0 | \n",
197 | " 0.10030 | \n",
198 | " 0.13280 | \n",
199 | " 0.1980 | \n",
200 | " 0.10430 | \n",
201 | " 0.1809 | \n",
202 | " 0.05883 | \n",
203 | " ... | \n",
204 | " 16.67 | \n",
205 | " 152.20 | \n",
206 | " 1575.0 | \n",
207 | " 0.1374 | \n",
208 | " 0.2050 | \n",
209 | " 0.4000 | \n",
210 | " 0.1625 | \n",
211 | " 0.2364 | \n",
212 | " 0.07678 | \n",
213 | " 0.0 | \n",
214 | "
\n",
215 | " \n",
216 | "
\n",
217 | "
5 rows × 31 columns
\n",
218 | "
"
219 | ],
220 | "text/plain": [
221 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n",
222 | "0 17.99 10.38 122.80 1001.0 0.11840 \n",
223 | "1 20.57 17.77 132.90 1326.0 0.08474 \n",
224 | "2 19.69 21.25 130.00 1203.0 0.10960 \n",
225 | "3 11.42 20.38 77.58 386.1 0.14250 \n",
226 | "4 20.29 14.34 135.10 1297.0 0.10030 \n",
227 | "\n",
228 | " mean compactness mean concavity mean concave points mean symmetry \\\n",
229 | "0 0.27760 0.3001 0.14710 0.2419 \n",
230 | "1 0.07864 0.0869 0.07017 0.1812 \n",
231 | "2 0.15990 0.1974 0.12790 0.2069 \n",
232 | "3 0.28390 0.2414 0.10520 0.2597 \n",
233 | "4 0.13280 0.1980 0.10430 0.1809 \n",
234 | "\n",
235 | " mean fractal dimension ... worst texture worst perimeter worst area \\\n",
236 | "0 0.07871 ... 17.33 184.60 2019.0 \n",
237 | "1 0.05667 ... 23.41 158.80 1956.0 \n",
238 | "2 0.05999 ... 25.53 152.50 1709.0 \n",
239 | "3 0.09744 ... 26.50 98.87 567.7 \n",
240 | "4 0.05883 ... 16.67 152.20 1575.0 \n",
241 | "\n",
242 | " worst smoothness worst compactness worst concavity worst concave points \\\n",
243 | "0 0.1622 0.6656 0.7119 0.2654 \n",
244 | "1 0.1238 0.1866 0.2416 0.1860 \n",
245 | "2 0.1444 0.4245 0.4504 0.2430 \n",
246 | "3 0.2098 0.8663 0.6869 0.2575 \n",
247 | "4 0.1374 0.2050 0.4000 0.1625 \n",
248 | "\n",
249 | " worst symmetry worst fractal dimension target \n",
250 | "0 0.4601 0.11890 0.0 \n",
251 | "1 0.2750 0.08902 0.0 \n",
252 | "2 0.3613 0.08758 0.0 \n",
253 | "3 0.6638 0.17300 0.0 \n",
254 | "4 0.2364 0.07678 0.0 \n",
255 | "\n",
256 | "[5 rows x 31 columns]"
257 | ]
258 | },
259 | "execution_count": 3,
260 | "metadata": {},
261 | "output_type": "execute_result"
262 | }
263 | ],
264 | "source": [
265 | "data.head(5)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 4,
271 | "metadata": {},
272 | "outputs": [
273 | {
274 | "data": {
275 | "text/plain": [
276 | "((455, 30), (114, 30))"
277 | ]
278 | },
279 | "execution_count": 4,
280 | "metadata": {},
281 | "output_type": "execute_result"
282 | }
283 | ],
284 | "source": [
285 | "X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
286 | " data.target, test_size=0.2,\n",
287 | " random_state=0)\n",
288 | "X_train.shape, X_test.shape"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "## Feature Shuffling\n",
296 | "permute the values of each feature, one at the time, and measure how much the permutation decreases the accuracy, or the roc_auc, or the mse of the machine learning model.\n",
297 | "If the variables are important, this is, highly predictive, a random permutation of their values will decrease dramatically any of these metrics."
298 | ]
299 | },
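The repo's feature_shuffle.feature_shuffle_rf helper is demonstrated below; for comparison, scikit-learn (0.22+) ships the same idea as sklearn.inspection.permutation_importance. A minimal sketch, assuming the split created above (the model and n_repeats are illustrative choices):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# fit a model, then measure the roc_auc drop when each feature is permuted
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
result = permutation_importance(rf, X_train, y_train, scoring='roc_auc',
                                n_repeats=5, random_state=0)
for idx in result.importances_mean.argsort()[::-1][:5]:
    print(f"{X_train.columns[idx]}: {result.importances_mean[idx]:.4f}")
```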
300 | {
301 | "cell_type": "code",
302 | "execution_count": 17,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "auc_drop, selected_features = feature_shuffle.feature_shuffle_rf(X_train=X_train,\n",
307 | " y_train=y_train,\n",
308 | " random_state=0)"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 18,
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "data": {
318 | "text/html": [
319 | "\n",
320 | "\n",
333 | "
\n",
334 | " \n",
335 | " \n",
336 | " | \n",
337 | " feature | \n",
338 | " auc_drop | \n",
339 | "
\n",
340 | " \n",
341 | " \n",
342 | " \n",
343 | " 22 | \n",
344 | " worst perimeter | \n",
345 | " 8.359457e-05 | \n",
346 | "
\n",
347 | " \n",
348 | " 27 | \n",
349 | " worst concave points | \n",
350 | " 3.134796e-05 | \n",
351 | "
\n",
352 | " \n",
353 | " 23 | \n",
354 | " worst area | \n",
355 | " 1.110223e-16 | \n",
356 | "
\n",
357 | " \n",
358 | " 12 | \n",
359 | " perimeter error | \n",
360 | " 1.110223e-16 | \n",
361 | "
\n",
362 | " \n",
363 | " 0 | \n",
364 | " mean radius | \n",
365 | " 0.000000e+00 | \n",
366 | "
\n",
367 | " \n",
368 | " 16 | \n",
369 | " concavity error | \n",
370 | " 0.000000e+00 | \n",
371 | "
\n",
372 | " \n",
373 | " 28 | \n",
374 | " worst symmetry | \n",
375 | " 0.000000e+00 | \n",
376 | "
\n",
377 | " \n",
378 | " 26 | \n",
379 | " worst concavity | \n",
380 | " 0.000000e+00 | \n",
381 | "
\n",
382 | " \n",
383 | " 25 | \n",
384 | " worst compactness | \n",
385 | " 0.000000e+00 | \n",
386 | "
\n",
387 | " \n",
388 | " 24 | \n",
389 | " worst smoothness | \n",
390 | " 0.000000e+00 | \n",
391 | "
\n",
392 | " \n",
393 | " 21 | \n",
394 | " worst texture | \n",
395 | " 0.000000e+00 | \n",
396 | "
\n",
397 | " \n",
398 | " 20 | \n",
399 | " worst radius | \n",
400 | " 0.000000e+00 | \n",
401 | "
\n",
402 | " \n",
403 | " 19 | \n",
404 | " fractal dimension error | \n",
405 | " 0.000000e+00 | \n",
406 | "
\n",
407 | " \n",
408 | " 18 | \n",
409 | " symmetry error | \n",
410 | " 0.000000e+00 | \n",
411 | "
\n",
412 | " \n",
413 | " 17 | \n",
414 | " concave points error | \n",
415 | " 0.000000e+00 | \n",
416 | "
\n",
417 | " \n",
418 | " 15 | \n",
419 | " compactness error | \n",
420 | " 0.000000e+00 | \n",
421 | "
\n",
422 | " \n",
423 | " 1 | \n",
424 | " mean texture | \n",
425 | " 0.000000e+00 | \n",
426 | "
\n",
427 | " \n",
428 | " 14 | \n",
429 | " smoothness error | \n",
430 | " 0.000000e+00 | \n",
431 | "
\n",
432 | " \n",
433 | " 13 | \n",
434 | " area error | \n",
435 | " 0.000000e+00 | \n",
436 | "
\n",
437 | " \n",
438 | " 11 | \n",
439 | " texture error | \n",
440 | " 0.000000e+00 | \n",
441 | "
\n",
442 | " \n",
443 | " 10 | \n",
444 | " radius error | \n",
445 | " 0.000000e+00 | \n",
446 | "
\n",
447 | " \n",
448 | " 9 | \n",
449 | " mean fractal dimension | \n",
450 | " 0.000000e+00 | \n",
451 | "
\n",
452 | " \n",
453 | " 8 | \n",
454 | " mean symmetry | \n",
455 | " 0.000000e+00 | \n",
456 | "
\n",
457 | " \n",
458 | " 7 | \n",
459 | " mean concave points | \n",
460 | " 0.000000e+00 | \n",
461 | "
\n",
462 | " \n",
463 | " 6 | \n",
464 | " mean concavity | \n",
465 | " 0.000000e+00 | \n",
466 | "
\n",
467 | " \n",
468 | " 5 | \n",
469 | " mean compactness | \n",
470 | " 0.000000e+00 | \n",
471 | "
\n",
472 | " \n",
473 | " 4 | \n",
474 | " mean smoothness | \n",
475 | " 0.000000e+00 | \n",
476 | "
\n",
477 | " \n",
478 | " 3 | \n",
479 | " mean area | \n",
480 | " 0.000000e+00 | \n",
481 | "
\n",
482 | " \n",
483 | " 2 | \n",
484 | " mean perimeter | \n",
485 | " 0.000000e+00 | \n",
486 | "
\n",
487 | " \n",
488 | " 29 | \n",
489 | " worst fractal dimension | \n",
490 | " 0.000000e+00 | \n",
491 | "
\n",
492 | " \n",
493 | "
\n",
494 | "
"
495 | ],
496 | "text/plain": [
497 | " feature auc_drop\n",
498 | "22 worst perimeter 8.359457e-05\n",
499 | "27 worst concave points 3.134796e-05\n",
500 | "23 worst area 1.110223e-16\n",
501 | "12 perimeter error 1.110223e-16\n",
502 | "0 mean radius 0.000000e+00\n",
503 | "16 concavity error 0.000000e+00\n",
504 | "28 worst symmetry 0.000000e+00\n",
505 | "26 worst concavity 0.000000e+00\n",
506 | "25 worst compactness 0.000000e+00\n",
507 | "24 worst smoothness 0.000000e+00\n",
508 | "21 worst texture 0.000000e+00\n",
509 | "20 worst radius 0.000000e+00\n",
510 | "19 fractal dimension error 0.000000e+00\n",
511 | "18 symmetry error 0.000000e+00\n",
512 | "17 concave points error 0.000000e+00\n",
513 | "15 compactness error 0.000000e+00\n",
514 | "1 mean texture 0.000000e+00\n",
515 | "14 smoothness error 0.000000e+00\n",
516 | "13 area error 0.000000e+00\n",
517 | "11 texture error 0.000000e+00\n",
518 | "10 radius error 0.000000e+00\n",
519 | "9 mean fractal dimension 0.000000e+00\n",
520 | "8 mean symmetry 0.000000e+00\n",
521 | "7 mean concave points 0.000000e+00\n",
522 | "6 mean concavity 0.000000e+00\n",
523 | "5 mean compactness 0.000000e+00\n",
524 | "4 mean smoothness 0.000000e+00\n",
525 | "3 mean area 0.000000e+00\n",
526 | "2 mean perimeter 0.000000e+00\n",
527 | "29 worst fractal dimension 0.000000e+00"
528 | ]
529 | },
530 | "execution_count": 18,
531 | "metadata": {},
532 | "output_type": "execute_result"
533 | }
534 | ],
535 | "source": [
536 | "# we select features that have auc_drop > 0\n",
537 | "auc_drop"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 19,
543 | "metadata": {},
544 | "outputs": [
545 | {
546 | "data": {
547 | "text/plain": [
548 | "22 worst perimeter\n",
549 | "27 worst concave points\n",
550 | "23 worst area\n",
551 | "12 perimeter error\n",
552 | "Name: feature, dtype: object"
553 | ]
554 | },
555 | "execution_count": 19,
556 | "metadata": {},
557 | "output_type": "execute_result"
558 | }
559 | ],
560 | "source": [
561 | "selected_features"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {
568 | "collapsed": true
569 | },
570 | "outputs": [],
571 | "source": []
572 | }
573 | ],
574 | "metadata": {
575 | "kernelspec": {
576 | "display_name": "Python 3",
577 | "language": "python",
578 | "name": "python3"
579 | },
580 | "language_info": {
581 | "codemirror_mode": {
582 | "name": "ipython",
583 | "version": 3
584 | },
585 | "file_extension": ".py",
586 | "mimetype": "text/x-python",
587 | "name": "python",
588 | "nbconvert_exporter": "python",
589 | "pygments_lexer": "ipython3",
590 | "version": "3.6.1"
591 | }
592 | },
593 | "nbformat": 4,
594 | "nbformat_minor": 2
595 | }
596 |
--------------------------------------------------------------------------------
/A Short Guide for Feature Engineering and Feature Selection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/A Short Guide for Feature Engineering and Feature Selection.pdf
--------------------------------------------------------------------------------
/data/pima-indians-diabetes.data.csv:
--------------------------------------------------------------------------------
1 | 6,148,72,35,0,33.6,0.627,50,1
2 | 1,85,66,29,0,26.6,0.351,31,0
3 | 8,183,64,0,0,23.3,0.672,32,1
4 | 1,89,66,23,94,28.1,0.167,21,0
5 | 0,137,40,35,168,43.1,2.288,33,1
6 | 5,116,74,0,0,25.6,0.201,30,0
7 | 3,78,50,32,88,31.0,0.248,26,1
8 | 10,115,0,0,0,35.3,0.134,29,0
9 | 2,197,70,45,543,30.5,0.158,53,1
10 | 8,125,96,0,0,0.0,0.232,54,1
11 | 4,110,92,0,0,37.6,0.191,30,0
12 | 10,168,74,0,0,38.0,0.537,34,1
13 | 10,139,80,0,0,27.1,1.441,57,0
14 | 1,189,60,23,846,30.1,0.398,59,1
15 | 5,166,72,19,175,25.8,0.587,51,1
16 | 7,100,0,0,0,30.0,0.484,32,1
17 | 0,118,84,47,230,45.8,0.551,31,1
18 | 7,107,74,0,0,29.6,0.254,31,1
19 | 1,103,30,38,83,43.3,0.183,33,0
20 | 1,115,70,30,96,34.6,0.529,32,1
21 | 3,126,88,41,235,39.3,0.704,27,0
22 | 8,99,84,0,0,35.4,0.388,50,0
23 | 7,196,90,0,0,39.8,0.451,41,1
24 | 9,119,80,35,0,29.0,0.263,29,1
25 | 11,143,94,33,146,36.6,0.254,51,1
26 | 10,125,70,26,115,31.1,0.205,41,1
27 | 7,147,76,0,0,39.4,0.257,43,1
28 | 1,97,66,15,140,23.2,0.487,22,0
29 | 13,145,82,19,110,22.2,0.245,57,0
30 | 5,117,92,0,0,34.1,0.337,38,0
31 | 5,109,75,26,0,36.0,0.546,60,0
32 | 3,158,76,36,245,31.6,0.851,28,1
33 | 3,88,58,11,54,24.8,0.267,22,0
34 | 6,92,92,0,0,19.9,0.188,28,0
35 | 10,122,78,31,0,27.6,0.512,45,0
36 | 4,103,60,33,192,24.0,0.966,33,0
37 | 11,138,76,0,0,33.2,0.420,35,0
38 | 9,102,76,37,0,32.9,0.665,46,1
39 | 2,90,68,42,0,38.2,0.503,27,1
40 | 4,111,72,47,207,37.1,1.390,56,1
41 | 3,180,64,25,70,34.0,0.271,26,0
42 | 7,133,84,0,0,40.2,0.696,37,0
43 | 7,106,92,18,0,22.7,0.235,48,0
44 | 9,171,110,24,240,45.4,0.721,54,1
45 | 7,159,64,0,0,27.4,0.294,40,0
46 | 0,180,66,39,0,42.0,1.893,25,1
47 | 1,146,56,0,0,29.7,0.564,29,0
48 | 2,71,70,27,0,28.0,0.586,22,0
49 | 7,103,66,32,0,39.1,0.344,31,1
50 | 7,105,0,0,0,0.0,0.305,24,0
51 | 1,103,80,11,82,19.4,0.491,22,0
52 | 1,101,50,15,36,24.2,0.526,26,0
53 | 5,88,66,21,23,24.4,0.342,30,0
54 | 8,176,90,34,300,33.7,0.467,58,1
55 | 7,150,66,42,342,34.7,0.718,42,0
56 | 1,73,50,10,0,23.0,0.248,21,0
57 | 7,187,68,39,304,37.7,0.254,41,1
58 | 0,100,88,60,110,46.8,0.962,31,0
59 | 0,146,82,0,0,40.5,1.781,44,0
60 | 0,105,64,41,142,41.5,0.173,22,0
61 | 2,84,0,0,0,0.0,0.304,21,0
62 | 8,133,72,0,0,32.9,0.270,39,1
63 | 5,44,62,0,0,25.0,0.587,36,0
64 | 2,141,58,34,128,25.4,0.699,24,0
65 | 7,114,66,0,0,32.8,0.258,42,1
66 | 5,99,74,27,0,29.0,0.203,32,0
67 | 0,109,88,30,0,32.5,0.855,38,1
68 | 2,109,92,0,0,42.7,0.845,54,0
69 | 1,95,66,13,38,19.6,0.334,25,0
70 | 4,146,85,27,100,28.9,0.189,27,0
71 | 2,100,66,20,90,32.9,0.867,28,1
72 | 5,139,64,35,140,28.6,0.411,26,0
73 | 13,126,90,0,0,43.4,0.583,42,1
74 | 4,129,86,20,270,35.1,0.231,23,0
75 | 1,79,75,30,0,32.0,0.396,22,0
76 | 1,0,48,20,0,24.7,0.140,22,0
77 | 7,62,78,0,0,32.6,0.391,41,0
78 | 5,95,72,33,0,37.7,0.370,27,0
79 | 0,131,0,0,0,43.2,0.270,26,1
80 | 2,112,66,22,0,25.0,0.307,24,0
81 | 3,113,44,13,0,22.4,0.140,22,0
82 | 2,74,0,0,0,0.0,0.102,22,0
83 | 7,83,78,26,71,29.3,0.767,36,0
84 | 0,101,65,28,0,24.6,0.237,22,0
85 | 5,137,108,0,0,48.8,0.227,37,1
86 | 2,110,74,29,125,32.4,0.698,27,0
87 | 13,106,72,54,0,36.6,0.178,45,0
88 | 2,100,68,25,71,38.5,0.324,26,0
89 | 15,136,70,32,110,37.1,0.153,43,1
90 | 1,107,68,19,0,26.5,0.165,24,0
91 | 1,80,55,0,0,19.1,0.258,21,0
92 | 4,123,80,15,176,32.0,0.443,34,0
93 | 7,81,78,40,48,46.7,0.261,42,0
94 | 4,134,72,0,0,23.8,0.277,60,1
95 | 2,142,82,18,64,24.7,0.761,21,0
96 | 6,144,72,27,228,33.9,0.255,40,0
97 | 2,92,62,28,0,31.6,0.130,24,0
98 | 1,71,48,18,76,20.4,0.323,22,0
99 | 6,93,50,30,64,28.7,0.356,23,0
100 | 1,122,90,51,220,49.7,0.325,31,1
101 | 1,163,72,0,0,39.0,1.222,33,1
102 | 1,151,60,0,0,26.1,0.179,22,0
103 | 0,125,96,0,0,22.5,0.262,21,0
104 | 1,81,72,18,40,26.6,0.283,24,0
105 | 2,85,65,0,0,39.6,0.930,27,0
106 | 1,126,56,29,152,28.7,0.801,21,0
107 | 1,96,122,0,0,22.4,0.207,27,0
108 | 4,144,58,28,140,29.5,0.287,37,0
109 | 3,83,58,31,18,34.3,0.336,25,0
110 | 0,95,85,25,36,37.4,0.247,24,1
111 | 3,171,72,33,135,33.3,0.199,24,1
112 | 8,155,62,26,495,34.0,0.543,46,1
113 | 1,89,76,34,37,31.2,0.192,23,0
114 | 4,76,62,0,0,34.0,0.391,25,0
115 | 7,160,54,32,175,30.5,0.588,39,1
116 | 4,146,92,0,0,31.2,0.539,61,1
117 | 5,124,74,0,0,34.0,0.220,38,1
118 | 5,78,48,0,0,33.7,0.654,25,0
119 | 4,97,60,23,0,28.2,0.443,22,0
120 | 4,99,76,15,51,23.2,0.223,21,0
121 | 0,162,76,56,100,53.2,0.759,25,1
122 | 6,111,64,39,0,34.2,0.260,24,0
123 | 2,107,74,30,100,33.6,0.404,23,0
124 | 5,132,80,0,0,26.8,0.186,69,0
125 | 0,113,76,0,0,33.3,0.278,23,1
126 | 1,88,30,42,99,55.0,0.496,26,1
127 | 3,120,70,30,135,42.9,0.452,30,0
128 | 1,118,58,36,94,33.3,0.261,23,0
129 | 1,117,88,24,145,34.5,0.403,40,1
130 | 0,105,84,0,0,27.9,0.741,62,1
131 | 4,173,70,14,168,29.7,0.361,33,1
132 | 9,122,56,0,0,33.3,1.114,33,1
133 | 3,170,64,37,225,34.5,0.356,30,1
134 | 8,84,74,31,0,38.3,0.457,39,0
135 | 2,96,68,13,49,21.1,0.647,26,0
136 | 2,125,60,20,140,33.8,0.088,31,0
137 | 0,100,70,26,50,30.8,0.597,21,0
138 | 0,93,60,25,92,28.7,0.532,22,0
139 | 0,129,80,0,0,31.2,0.703,29,0
140 | 5,105,72,29,325,36.9,0.159,28,0
141 | 3,128,78,0,0,21.1,0.268,55,0
142 | 5,106,82,30,0,39.5,0.286,38,0
143 | 2,108,52,26,63,32.5,0.318,22,0
144 | 10,108,66,0,0,32.4,0.272,42,1
145 | 4,154,62,31,284,32.8,0.237,23,0
146 | 0,102,75,23,0,0.0,0.572,21,0
147 | 9,57,80,37,0,32.8,0.096,41,0
148 | 2,106,64,35,119,30.5,1.400,34,0
149 | 5,147,78,0,0,33.7,0.218,65,0
150 | 2,90,70,17,0,27.3,0.085,22,0
151 | 1,136,74,50,204,37.4,0.399,24,0
152 | 4,114,65,0,0,21.9,0.432,37,0
153 | 9,156,86,28,155,34.3,1.189,42,1
154 | 1,153,82,42,485,40.6,0.687,23,0
155 | 8,188,78,0,0,47.9,0.137,43,1
156 | 7,152,88,44,0,50.0,0.337,36,1
157 | 2,99,52,15,94,24.6,0.637,21,0
158 | 1,109,56,21,135,25.2,0.833,23,0
159 | 2,88,74,19,53,29.0,0.229,22,0
160 | 17,163,72,41,114,40.9,0.817,47,1
161 | 4,151,90,38,0,29.7,0.294,36,0
162 | 7,102,74,40,105,37.2,0.204,45,0
163 | 0,114,80,34,285,44.2,0.167,27,0
164 | 2,100,64,23,0,29.7,0.368,21,0
165 | 0,131,88,0,0,31.6,0.743,32,1
166 | 6,104,74,18,156,29.9,0.722,41,1
167 | 3,148,66,25,0,32.5,0.256,22,0
168 | 4,120,68,0,0,29.6,0.709,34,0
169 | 4,110,66,0,0,31.9,0.471,29,0
170 | 3,111,90,12,78,28.4,0.495,29,0
171 | 6,102,82,0,0,30.8,0.180,36,1
172 | 6,134,70,23,130,35.4,0.542,29,1
173 | 2,87,0,23,0,28.9,0.773,25,0
174 | 1,79,60,42,48,43.5,0.678,23,0
175 | 2,75,64,24,55,29.7,0.370,33,0
176 | 8,179,72,42,130,32.7,0.719,36,1
177 | 6,85,78,0,0,31.2,0.382,42,0
178 | 0,129,110,46,130,67.1,0.319,26,1
179 | 5,143,78,0,0,45.0,0.190,47,0
180 | 5,130,82,0,0,39.1,0.956,37,1
181 | 6,87,80,0,0,23.2,0.084,32,0
182 | 0,119,64,18,92,34.9,0.725,23,0
183 | 1,0,74,20,23,27.7,0.299,21,0
184 | 5,73,60,0,0,26.8,0.268,27,0
185 | 4,141,74,0,0,27.6,0.244,40,0
186 | 7,194,68,28,0,35.9,0.745,41,1
187 | 8,181,68,36,495,30.1,0.615,60,1
188 | 1,128,98,41,58,32.0,1.321,33,1
189 | 8,109,76,39,114,27.9,0.640,31,1
190 | 5,139,80,35,160,31.6,0.361,25,1
191 | 3,111,62,0,0,22.6,0.142,21,0
192 | 9,123,70,44,94,33.1,0.374,40,0
193 | 7,159,66,0,0,30.4,0.383,36,1
194 | 11,135,0,0,0,52.3,0.578,40,1
195 | 8,85,55,20,0,24.4,0.136,42,0
196 | 5,158,84,41,210,39.4,0.395,29,1
197 | 1,105,58,0,0,24.3,0.187,21,0
198 | 3,107,62,13,48,22.9,0.678,23,1
199 | 4,109,64,44,99,34.8,0.905,26,1
200 | 4,148,60,27,318,30.9,0.150,29,1
201 | 0,113,80,16,0,31.0,0.874,21,0
202 | 1,138,82,0,0,40.1,0.236,28,0
203 | 0,108,68,20,0,27.3,0.787,32,0
204 | 2,99,70,16,44,20.4,0.235,27,0
205 | 6,103,72,32,190,37.7,0.324,55,0
206 | 5,111,72,28,0,23.9,0.407,27,0
207 | 8,196,76,29,280,37.5,0.605,57,1
208 | 5,162,104,0,0,37.7,0.151,52,1
209 | 1,96,64,27,87,33.2,0.289,21,0
210 | 7,184,84,33,0,35.5,0.355,41,1
211 | 2,81,60,22,0,27.7,0.290,25,0
212 | 0,147,85,54,0,42.8,0.375,24,0
213 | 7,179,95,31,0,34.2,0.164,60,0
214 | 0,140,65,26,130,42.6,0.431,24,1
215 | 9,112,82,32,175,34.2,0.260,36,1
216 | 12,151,70,40,271,41.8,0.742,38,1
217 | 5,109,62,41,129,35.8,0.514,25,1
218 | 6,125,68,30,120,30.0,0.464,32,0
219 | 5,85,74,22,0,29.0,1.224,32,1
220 | 5,112,66,0,0,37.8,0.261,41,1
221 | 0,177,60,29,478,34.6,1.072,21,1
222 | 2,158,90,0,0,31.6,0.805,66,1
223 | 7,119,0,0,0,25.2,0.209,37,0
224 | 7,142,60,33,190,28.8,0.687,61,0
225 | 1,100,66,15,56,23.6,0.666,26,0
226 | 1,87,78,27,32,34.6,0.101,22,0
227 | 0,101,76,0,0,35.7,0.198,26,0
228 | 3,162,52,38,0,37.2,0.652,24,1
229 | 4,197,70,39,744,36.7,2.329,31,0
230 | 0,117,80,31,53,45.2,0.089,24,0
231 | 4,142,86,0,0,44.0,0.645,22,1
232 | 6,134,80,37,370,46.2,0.238,46,1
233 | 1,79,80,25,37,25.4,0.583,22,0
234 | 4,122,68,0,0,35.0,0.394,29,0
235 | 3,74,68,28,45,29.7,0.293,23,0
236 | 4,171,72,0,0,43.6,0.479,26,1
237 | 7,181,84,21,192,35.9,0.586,51,1
238 | 0,179,90,27,0,44.1,0.686,23,1
239 | 9,164,84,21,0,30.8,0.831,32,1
240 | 0,104,76,0,0,18.4,0.582,27,0
241 | 1,91,64,24,0,29.2,0.192,21,0
242 | 4,91,70,32,88,33.1,0.446,22,0
243 | 3,139,54,0,0,25.6,0.402,22,1
244 | 6,119,50,22,176,27.1,1.318,33,1
245 | 2,146,76,35,194,38.2,0.329,29,0
246 | 9,184,85,15,0,30.0,1.213,49,1
247 | 10,122,68,0,0,31.2,0.258,41,0
248 | 0,165,90,33,680,52.3,0.427,23,0
249 | 9,124,70,33,402,35.4,0.282,34,0
250 | 1,111,86,19,0,30.1,0.143,23,0
251 | 9,106,52,0,0,31.2,0.380,42,0
252 | 2,129,84,0,0,28.0,0.284,27,0
253 | 2,90,80,14,55,24.4,0.249,24,0
254 | 0,86,68,32,0,35.8,0.238,25,0
255 | 12,92,62,7,258,27.6,0.926,44,1
256 | 1,113,64,35,0,33.6,0.543,21,1
257 | 3,111,56,39,0,30.1,0.557,30,0
258 | 2,114,68,22,0,28.7,0.092,25,0
259 | 1,193,50,16,375,25.9,0.655,24,0
260 | 11,155,76,28,150,33.3,1.353,51,1
261 | 3,191,68,15,130,30.9,0.299,34,0
262 | 3,141,0,0,0,30.0,0.761,27,1
263 | 4,95,70,32,0,32.1,0.612,24,0
264 | 3,142,80,15,0,32.4,0.200,63,0
265 | 4,123,62,0,0,32.0,0.226,35,1
266 | 5,96,74,18,67,33.6,0.997,43,0
267 | 0,138,0,0,0,36.3,0.933,25,1
268 | 2,128,64,42,0,40.0,1.101,24,0
269 | 0,102,52,0,0,25.1,0.078,21,0
270 | 2,146,0,0,0,27.5,0.240,28,1
271 | 10,101,86,37,0,45.6,1.136,38,1
272 | 2,108,62,32,56,25.2,0.128,21,0
273 | 3,122,78,0,0,23.0,0.254,40,0
274 | 1,71,78,50,45,33.2,0.422,21,0
275 | 13,106,70,0,0,34.2,0.251,52,0
276 | 2,100,70,52,57,40.5,0.677,25,0
277 | 7,106,60,24,0,26.5,0.296,29,1
278 | 0,104,64,23,116,27.8,0.454,23,0
279 | 5,114,74,0,0,24.9,0.744,57,0
280 | 2,108,62,10,278,25.3,0.881,22,0
281 | 0,146,70,0,0,37.9,0.334,28,1
282 | 10,129,76,28,122,35.9,0.280,39,0
283 | 7,133,88,15,155,32.4,0.262,37,0
284 | 7,161,86,0,0,30.4,0.165,47,1
285 | 2,108,80,0,0,27.0,0.259,52,1
286 | 7,136,74,26,135,26.0,0.647,51,0
287 | 5,155,84,44,545,38.7,0.619,34,0
288 | 1,119,86,39,220,45.6,0.808,29,1
289 | 4,96,56,17,49,20.8,0.340,26,0
290 | 5,108,72,43,75,36.1,0.263,33,0
291 | 0,78,88,29,40,36.9,0.434,21,0
292 | 0,107,62,30,74,36.6,0.757,25,1
293 | 2,128,78,37,182,43.3,1.224,31,1
294 | 1,128,48,45,194,40.5,0.613,24,1
295 | 0,161,50,0,0,21.9,0.254,65,0
296 | 6,151,62,31,120,35.5,0.692,28,0
297 | 2,146,70,38,360,28.0,0.337,29,1
298 | 0,126,84,29,215,30.7,0.520,24,0
299 | 14,100,78,25,184,36.6,0.412,46,1
300 | 8,112,72,0,0,23.6,0.840,58,0
301 | 0,167,0,0,0,32.3,0.839,30,1
302 | 2,144,58,33,135,31.6,0.422,25,1
303 | 5,77,82,41,42,35.8,0.156,35,0
304 | 5,115,98,0,0,52.9,0.209,28,1
305 | 3,150,76,0,0,21.0,0.207,37,0
306 | 2,120,76,37,105,39.7,0.215,29,0
307 | 10,161,68,23,132,25.5,0.326,47,1
308 | 0,137,68,14,148,24.8,0.143,21,0
309 | 0,128,68,19,180,30.5,1.391,25,1
310 | 2,124,68,28,205,32.9,0.875,30,1
311 | 6,80,66,30,0,26.2,0.313,41,0
312 | 0,106,70,37,148,39.4,0.605,22,0
313 | 2,155,74,17,96,26.6,0.433,27,1
314 | 3,113,50,10,85,29.5,0.626,25,0
315 | 7,109,80,31,0,35.9,1.127,43,1
316 | 2,112,68,22,94,34.1,0.315,26,0
317 | 3,99,80,11,64,19.3,0.284,30,0
318 | 3,182,74,0,0,30.5,0.345,29,1
319 | 3,115,66,39,140,38.1,0.150,28,0
320 | 6,194,78,0,0,23.5,0.129,59,1
321 | 4,129,60,12,231,27.5,0.527,31,0
322 | 3,112,74,30,0,31.6,0.197,25,1
323 | 0,124,70,20,0,27.4,0.254,36,1
324 | 13,152,90,33,29,26.8,0.731,43,1
325 | 2,112,75,32,0,35.7,0.148,21,0
326 | 1,157,72,21,168,25.6,0.123,24,0
327 | 1,122,64,32,156,35.1,0.692,30,1
328 | 10,179,70,0,0,35.1,0.200,37,0
329 | 2,102,86,36,120,45.5,0.127,23,1
330 | 6,105,70,32,68,30.8,0.122,37,0
331 | 8,118,72,19,0,23.1,1.476,46,0
332 | 2,87,58,16,52,32.7,0.166,25,0
333 | 1,180,0,0,0,43.3,0.282,41,1
334 | 12,106,80,0,0,23.6,0.137,44,0
335 | 1,95,60,18,58,23.9,0.260,22,0
336 | 0,165,76,43,255,47.9,0.259,26,0
337 | 0,117,0,0,0,33.8,0.932,44,0
338 | 5,115,76,0,0,31.2,0.343,44,1
339 | 9,152,78,34,171,34.2,0.893,33,1
340 | 7,178,84,0,0,39.9,0.331,41,1
341 | 1,130,70,13,105,25.9,0.472,22,0
342 | 1,95,74,21,73,25.9,0.673,36,0
343 | 1,0,68,35,0,32.0,0.389,22,0
344 | 5,122,86,0,0,34.7,0.290,33,0
345 | 8,95,72,0,0,36.8,0.485,57,0
346 | 8,126,88,36,108,38.5,0.349,49,0
347 | 1,139,46,19,83,28.7,0.654,22,0
348 | 3,116,0,0,0,23.5,0.187,23,0
349 | 3,99,62,19,74,21.8,0.279,26,0
350 | 5,0,80,32,0,41.0,0.346,37,1
351 | 4,92,80,0,0,42.2,0.237,29,0
352 | 4,137,84,0,0,31.2,0.252,30,0
353 | 3,61,82,28,0,34.4,0.243,46,0
354 | 1,90,62,12,43,27.2,0.580,24,0
355 | 3,90,78,0,0,42.7,0.559,21,0
356 | 9,165,88,0,0,30.4,0.302,49,1
357 | 1,125,50,40,167,33.3,0.962,28,1
358 | 13,129,0,30,0,39.9,0.569,44,1
359 | 12,88,74,40,54,35.3,0.378,48,0
360 | 1,196,76,36,249,36.5,0.875,29,1
361 | 5,189,64,33,325,31.2,0.583,29,1
362 | 5,158,70,0,0,29.8,0.207,63,0
363 | 5,103,108,37,0,39.2,0.305,65,0
364 | 4,146,78,0,0,38.5,0.520,67,1
365 | 4,147,74,25,293,34.9,0.385,30,0
366 | 5,99,54,28,83,34.0,0.499,30,0
367 | 6,124,72,0,0,27.6,0.368,29,1
368 | 0,101,64,17,0,21.0,0.252,21,0
369 | 3,81,86,16,66,27.5,0.306,22,0
370 | 1,133,102,28,140,32.8,0.234,45,1
371 | 3,173,82,48,465,38.4,2.137,25,1
372 | 0,118,64,23,89,0.0,1.731,21,0
373 | 0,84,64,22,66,35.8,0.545,21,0
374 | 2,105,58,40,94,34.9,0.225,25,0
375 | 2,122,52,43,158,36.2,0.816,28,0
376 | 12,140,82,43,325,39.2,0.528,58,1
377 | 0,98,82,15,84,25.2,0.299,22,0
378 | 1,87,60,37,75,37.2,0.509,22,0
379 | 4,156,75,0,0,48.3,0.238,32,1
380 | 0,93,100,39,72,43.4,1.021,35,0
381 | 1,107,72,30,82,30.8,0.821,24,0
382 | 0,105,68,22,0,20.0,0.236,22,0
383 | 1,109,60,8,182,25.4,0.947,21,0
384 | 1,90,62,18,59,25.1,1.268,25,0
385 | 1,125,70,24,110,24.3,0.221,25,0
386 | 1,119,54,13,50,22.3,0.205,24,0
387 | 5,116,74,29,0,32.3,0.660,35,1
388 | 8,105,100,36,0,43.3,0.239,45,1
389 | 5,144,82,26,285,32.0,0.452,58,1
390 | 3,100,68,23,81,31.6,0.949,28,0
391 | 1,100,66,29,196,32.0,0.444,42,0
392 | 5,166,76,0,0,45.7,0.340,27,1
393 | 1,131,64,14,415,23.7,0.389,21,0
394 | 4,116,72,12,87,22.1,0.463,37,0
395 | 4,158,78,0,0,32.9,0.803,31,1
396 | 2,127,58,24,275,27.7,1.600,25,0
397 | 3,96,56,34,115,24.7,0.944,39,0
398 | 0,131,66,40,0,34.3,0.196,22,1
399 | 3,82,70,0,0,21.1,0.389,25,0
400 | 3,193,70,31,0,34.9,0.241,25,1
401 | 4,95,64,0,0,32.0,0.161,31,1
402 | 6,137,61,0,0,24.2,0.151,55,0
403 | 5,136,84,41,88,35.0,0.286,35,1
404 | 9,72,78,25,0,31.6,0.280,38,0
405 | 5,168,64,0,0,32.9,0.135,41,1
406 | 2,123,48,32,165,42.1,0.520,26,0
407 | 4,115,72,0,0,28.9,0.376,46,1
408 | 0,101,62,0,0,21.9,0.336,25,0
409 | 8,197,74,0,0,25.9,1.191,39,1
410 | 1,172,68,49,579,42.4,0.702,28,1
411 | 6,102,90,39,0,35.7,0.674,28,0
412 | 1,112,72,30,176,34.4,0.528,25,0
413 | 1,143,84,23,310,42.4,1.076,22,0
414 | 1,143,74,22,61,26.2,0.256,21,0
415 | 0,138,60,35,167,34.6,0.534,21,1
416 | 3,173,84,33,474,35.7,0.258,22,1
417 | 1,97,68,21,0,27.2,1.095,22,0
418 | 4,144,82,32,0,38.5,0.554,37,1
419 | 1,83,68,0,0,18.2,0.624,27,0
420 | 3,129,64,29,115,26.4,0.219,28,1
421 | 1,119,88,41,170,45.3,0.507,26,0
422 | 2,94,68,18,76,26.0,0.561,21,0
423 | 0,102,64,46,78,40.6,0.496,21,0
424 | 2,115,64,22,0,30.8,0.421,21,0
425 | 8,151,78,32,210,42.9,0.516,36,1
426 | 4,184,78,39,277,37.0,0.264,31,1
427 | 0,94,0,0,0,0.0,0.256,25,0
428 | 1,181,64,30,180,34.1,0.328,38,1
429 | 0,135,94,46,145,40.6,0.284,26,0
430 | 1,95,82,25,180,35.0,0.233,43,1
431 | 2,99,0,0,0,22.2,0.108,23,0
432 | 3,89,74,16,85,30.4,0.551,38,0
433 | 1,80,74,11,60,30.0,0.527,22,0
434 | 2,139,75,0,0,25.6,0.167,29,0
435 | 1,90,68,8,0,24.5,1.138,36,0
436 | 0,141,0,0,0,42.4,0.205,29,1
437 | 12,140,85,33,0,37.4,0.244,41,0
438 | 5,147,75,0,0,29.9,0.434,28,0
439 | 1,97,70,15,0,18.2,0.147,21,0
440 | 6,107,88,0,0,36.8,0.727,31,0
441 | 0,189,104,25,0,34.3,0.435,41,1
442 | 2,83,66,23,50,32.2,0.497,22,0
443 | 4,117,64,27,120,33.2,0.230,24,0
444 | 8,108,70,0,0,30.5,0.955,33,1
445 | 4,117,62,12,0,29.7,0.380,30,1
446 | 0,180,78,63,14,59.4,2.420,25,1
447 | 1,100,72,12,70,25.3,0.658,28,0
448 | 0,95,80,45,92,36.5,0.330,26,0
449 | 0,104,64,37,64,33.6,0.510,22,1
450 | 0,120,74,18,63,30.5,0.285,26,0
451 | 1,82,64,13,95,21.2,0.415,23,0
452 | 2,134,70,0,0,28.9,0.542,23,1
453 | 0,91,68,32,210,39.9,0.381,25,0
454 | 2,119,0,0,0,19.6,0.832,72,0
455 | 2,100,54,28,105,37.8,0.498,24,0
456 | 14,175,62,30,0,33.6,0.212,38,1
457 | 1,135,54,0,0,26.7,0.687,62,0
458 | 5,86,68,28,71,30.2,0.364,24,0
459 | 10,148,84,48,237,37.6,1.001,51,1
460 | 9,134,74,33,60,25.9,0.460,81,0
461 | 9,120,72,22,56,20.8,0.733,48,0
462 | 1,71,62,0,0,21.8,0.416,26,0
463 | 8,74,70,40,49,35.3,0.705,39,0
464 | 5,88,78,30,0,27.6,0.258,37,0
465 | 10,115,98,0,0,24.0,1.022,34,0
466 | 0,124,56,13,105,21.8,0.452,21,0
467 | 0,74,52,10,36,27.8,0.269,22,0
468 | 0,97,64,36,100,36.8,0.600,25,0
469 | 8,120,0,0,0,30.0,0.183,38,1
470 | 6,154,78,41,140,46.1,0.571,27,0
471 | 1,144,82,40,0,41.3,0.607,28,0
472 | 0,137,70,38,0,33.2,0.170,22,0
473 | 0,119,66,27,0,38.8,0.259,22,0
474 | 7,136,90,0,0,29.9,0.210,50,0
475 | 4,114,64,0,0,28.9,0.126,24,0
476 | 0,137,84,27,0,27.3,0.231,59,0
477 | 2,105,80,45,191,33.7,0.711,29,1
478 | 7,114,76,17,110,23.8,0.466,31,0
479 | 8,126,74,38,75,25.9,0.162,39,0
480 | 4,132,86,31,0,28.0,0.419,63,0
481 | 3,158,70,30,328,35.5,0.344,35,1
482 | 0,123,88,37,0,35.2,0.197,29,0
483 | 4,85,58,22,49,27.8,0.306,28,0
484 | 0,84,82,31,125,38.2,0.233,23,0
485 | 0,145,0,0,0,44.2,0.630,31,1
486 | 0,135,68,42,250,42.3,0.365,24,1
487 | 1,139,62,41,480,40.7,0.536,21,0
488 | 0,173,78,32,265,46.5,1.159,58,0
489 | 4,99,72,17,0,25.6,0.294,28,0
490 | 8,194,80,0,0,26.1,0.551,67,0
491 | 2,83,65,28,66,36.8,0.629,24,0
492 | 2,89,90,30,0,33.5,0.292,42,0
493 | 4,99,68,38,0,32.8,0.145,33,0
494 | 4,125,70,18,122,28.9,1.144,45,1
495 | 3,80,0,0,0,0.0,0.174,22,0
496 | 6,166,74,0,0,26.6,0.304,66,0
497 | 5,110,68,0,0,26.0,0.292,30,0
498 | 2,81,72,15,76,30.1,0.547,25,0
499 | 7,195,70,33,145,25.1,0.163,55,1
500 | 6,154,74,32,193,29.3,0.839,39,0
501 | 2,117,90,19,71,25.2,0.313,21,0
502 | 3,84,72,32,0,37.2,0.267,28,0
503 | 6,0,68,41,0,39.0,0.727,41,1
504 | 7,94,64,25,79,33.3,0.738,41,0
505 | 3,96,78,39,0,37.3,0.238,40,0
506 | 10,75,82,0,0,33.3,0.263,38,0
507 | 0,180,90,26,90,36.5,0.314,35,1
508 | 1,130,60,23,170,28.6,0.692,21,0
509 | 2,84,50,23,76,30.4,0.968,21,0
510 | 8,120,78,0,0,25.0,0.409,64,0
511 | 12,84,72,31,0,29.7,0.297,46,1
512 | 0,139,62,17,210,22.1,0.207,21,0
513 | 9,91,68,0,0,24.2,0.200,58,0
514 | 2,91,62,0,0,27.3,0.525,22,0
515 | 3,99,54,19,86,25.6,0.154,24,0
516 | 3,163,70,18,105,31.6,0.268,28,1
517 | 9,145,88,34,165,30.3,0.771,53,1
518 | 7,125,86,0,0,37.6,0.304,51,0
519 | 13,76,60,0,0,32.8,0.180,41,0
520 | 6,129,90,7,326,19.6,0.582,60,0
521 | 2,68,70,32,66,25.0,0.187,25,0
522 | 3,124,80,33,130,33.2,0.305,26,0
523 | 6,114,0,0,0,0.0,0.189,26,0
524 | 9,130,70,0,0,34.2,0.652,45,1
525 | 3,125,58,0,0,31.6,0.151,24,0
526 | 3,87,60,18,0,21.8,0.444,21,0
527 | 1,97,64,19,82,18.2,0.299,21,0
528 | 3,116,74,15,105,26.3,0.107,24,0
529 | 0,117,66,31,188,30.8,0.493,22,0
530 | 0,111,65,0,0,24.6,0.660,31,0
531 | 2,122,60,18,106,29.8,0.717,22,0
532 | 0,107,76,0,0,45.3,0.686,24,0
533 | 1,86,66,52,65,41.3,0.917,29,0
534 | 6,91,0,0,0,29.8,0.501,31,0
535 | 1,77,56,30,56,33.3,1.251,24,0
536 | 4,132,0,0,0,32.9,0.302,23,1
537 | 0,105,90,0,0,29.6,0.197,46,0
538 | 0,57,60,0,0,21.7,0.735,67,0
539 | 0,127,80,37,210,36.3,0.804,23,0
540 | 3,129,92,49,155,36.4,0.968,32,1
541 | 8,100,74,40,215,39.4,0.661,43,1
542 | 3,128,72,25,190,32.4,0.549,27,1
543 | 10,90,85,32,0,34.9,0.825,56,1
544 | 4,84,90,23,56,39.5,0.159,25,0
545 | 1,88,78,29,76,32.0,0.365,29,0
546 | 8,186,90,35,225,34.5,0.423,37,1
547 | 5,187,76,27,207,43.6,1.034,53,1
548 | 4,131,68,21,166,33.1,0.160,28,0
549 | 1,164,82,43,67,32.8,0.341,50,0
550 | 4,189,110,31,0,28.5,0.680,37,0
551 | 1,116,70,28,0,27.4,0.204,21,0
552 | 3,84,68,30,106,31.9,0.591,25,0
553 | 6,114,88,0,0,27.8,0.247,66,0
554 | 1,88,62,24,44,29.9,0.422,23,0
555 | 1,84,64,23,115,36.9,0.471,28,0
556 | 7,124,70,33,215,25.5,0.161,37,0
557 | 1,97,70,40,0,38.1,0.218,30,0
558 | 8,110,76,0,0,27.8,0.237,58,0
559 | 11,103,68,40,0,46.2,0.126,42,0
560 | 11,85,74,0,0,30.1,0.300,35,0
561 | 6,125,76,0,0,33.8,0.121,54,1
562 | 0,198,66,32,274,41.3,0.502,28,1
563 | 1,87,68,34,77,37.6,0.401,24,0
564 | 6,99,60,19,54,26.9,0.497,32,0
565 | 0,91,80,0,0,32.4,0.601,27,0
566 | 2,95,54,14,88,26.1,0.748,22,0
567 | 1,99,72,30,18,38.6,0.412,21,0
568 | 6,92,62,32,126,32.0,0.085,46,0
569 | 4,154,72,29,126,31.3,0.338,37,0
570 | 0,121,66,30,165,34.3,0.203,33,1
571 | 3,78,70,0,0,32.5,0.270,39,0
572 | 2,130,96,0,0,22.6,0.268,21,0
573 | 3,111,58,31,44,29.5,0.430,22,0
574 | 2,98,60,17,120,34.7,0.198,22,0
575 | 1,143,86,30,330,30.1,0.892,23,0
576 | 1,119,44,47,63,35.5,0.280,25,0
577 | 6,108,44,20,130,24.0,0.813,35,0
578 | 2,118,80,0,0,42.9,0.693,21,1
579 | 10,133,68,0,0,27.0,0.245,36,0
580 | 2,197,70,99,0,34.7,0.575,62,1
581 | 0,151,90,46,0,42.1,0.371,21,1
582 | 6,109,60,27,0,25.0,0.206,27,0
583 | 12,121,78,17,0,26.5,0.259,62,0
584 | 8,100,76,0,0,38.7,0.190,42,0
585 | 8,124,76,24,600,28.7,0.687,52,1
586 | 1,93,56,11,0,22.5,0.417,22,0
587 | 8,143,66,0,0,34.9,0.129,41,1
588 | 6,103,66,0,0,24.3,0.249,29,0
589 | 3,176,86,27,156,33.3,1.154,52,1
590 | 0,73,0,0,0,21.1,0.342,25,0
591 | 11,111,84,40,0,46.8,0.925,45,1
592 | 2,112,78,50,140,39.4,0.175,24,0
593 | 3,132,80,0,0,34.4,0.402,44,1
594 | 2,82,52,22,115,28.5,1.699,25,0
595 | 6,123,72,45,230,33.6,0.733,34,0
596 | 0,188,82,14,185,32.0,0.682,22,1
597 | 0,67,76,0,0,45.3,0.194,46,0
598 | 1,89,24,19,25,27.8,0.559,21,0
599 | 1,173,74,0,0,36.8,0.088,38,1
600 | 1,109,38,18,120,23.1,0.407,26,0
601 | 1,108,88,19,0,27.1,0.400,24,0
602 | 6,96,0,0,0,23.7,0.190,28,0
603 | 1,124,74,36,0,27.8,0.100,30,0
604 | 7,150,78,29,126,35.2,0.692,54,1
605 | 4,183,0,0,0,28.4,0.212,36,1
606 | 1,124,60,32,0,35.8,0.514,21,0
607 | 1,181,78,42,293,40.0,1.258,22,1
608 | 1,92,62,25,41,19.5,0.482,25,0
609 | 0,152,82,39,272,41.5,0.270,27,0
610 | 1,111,62,13,182,24.0,0.138,23,0
611 | 3,106,54,21,158,30.9,0.292,24,0
612 | 3,174,58,22,194,32.9,0.593,36,1
613 | 7,168,88,42,321,38.2,0.787,40,1
614 | 6,105,80,28,0,32.5,0.878,26,0
615 | 11,138,74,26,144,36.1,0.557,50,1
616 | 3,106,72,0,0,25.8,0.207,27,0
617 | 6,117,96,0,0,28.7,0.157,30,0
618 | 2,68,62,13,15,20.1,0.257,23,0
619 | 9,112,82,24,0,28.2,1.282,50,1
620 | 0,119,0,0,0,32.4,0.141,24,1
621 | 2,112,86,42,160,38.4,0.246,28,0
622 | 2,92,76,20,0,24.2,1.698,28,0
623 | 6,183,94,0,0,40.8,1.461,45,0
624 | 0,94,70,27,115,43.5,0.347,21,0
625 | 2,108,64,0,0,30.8,0.158,21,0
626 | 4,90,88,47,54,37.7,0.362,29,0
627 | 0,125,68,0,0,24.7,0.206,21,0
628 | 0,132,78,0,0,32.4,0.393,21,0
629 | 5,128,80,0,0,34.6,0.144,45,0
630 | 4,94,65,22,0,24.7,0.148,21,0
631 | 7,114,64,0,0,27.4,0.732,34,1
632 | 0,102,78,40,90,34.5,0.238,24,0
633 | 2,111,60,0,0,26.2,0.343,23,0
634 | 1,128,82,17,183,27.5,0.115,22,0
635 | 10,92,62,0,0,25.9,0.167,31,0
636 | 13,104,72,0,0,31.2,0.465,38,1
637 | 5,104,74,0,0,28.8,0.153,48,0
638 | 2,94,76,18,66,31.6,0.649,23,0
639 | 7,97,76,32,91,40.9,0.871,32,1
640 | 1,100,74,12,46,19.5,0.149,28,0
641 | 0,102,86,17,105,29.3,0.695,27,0
642 | 4,128,70,0,0,34.3,0.303,24,0
643 | 6,147,80,0,0,29.5,0.178,50,1
644 | 4,90,0,0,0,28.0,0.610,31,0
645 | 3,103,72,30,152,27.6,0.730,27,0
646 | 2,157,74,35,440,39.4,0.134,30,0
647 | 1,167,74,17,144,23.4,0.447,33,1
648 | 0,179,50,36,159,37.8,0.455,22,1
649 | 11,136,84,35,130,28.3,0.260,42,1
650 | 0,107,60,25,0,26.4,0.133,23,0
651 | 1,91,54,25,100,25.2,0.234,23,0
652 | 1,117,60,23,106,33.8,0.466,27,0
653 | 5,123,74,40,77,34.1,0.269,28,0
654 | 2,120,54,0,0,26.8,0.455,27,0
655 | 1,106,70,28,135,34.2,0.142,22,0
656 | 2,155,52,27,540,38.7,0.240,25,1
657 | 2,101,58,35,90,21.8,0.155,22,0
658 | 1,120,80,48,200,38.9,1.162,41,0
659 | 11,127,106,0,0,39.0,0.190,51,0
660 | 3,80,82,31,70,34.2,1.292,27,1
661 | 10,162,84,0,0,27.7,0.182,54,0
662 | 1,199,76,43,0,42.9,1.394,22,1
663 | 8,167,106,46,231,37.6,0.165,43,1
664 | 9,145,80,46,130,37.9,0.637,40,1
665 | 6,115,60,39,0,33.7,0.245,40,1
666 | 1,112,80,45,132,34.8,0.217,24,0
667 | 4,145,82,18,0,32.5,0.235,70,1
668 | 10,111,70,27,0,27.5,0.141,40,1
669 | 6,98,58,33,190,34.0,0.430,43,0
670 | 9,154,78,30,100,30.9,0.164,45,0
671 | 6,165,68,26,168,33.6,0.631,49,0
672 | 1,99,58,10,0,25.4,0.551,21,0
673 | 10,68,106,23,49,35.5,0.285,47,0
674 | 3,123,100,35,240,57.3,0.880,22,0
675 | 8,91,82,0,0,35.6,0.587,68,0
676 | 6,195,70,0,0,30.9,0.328,31,1
677 | 9,156,86,0,0,24.8,0.230,53,1
678 | 0,93,60,0,0,35.3,0.263,25,0
679 | 3,121,52,0,0,36.0,0.127,25,1
680 | 2,101,58,17,265,24.2,0.614,23,0
681 | 2,56,56,28,45,24.2,0.332,22,0
682 | 0,162,76,36,0,49.6,0.364,26,1
683 | 0,95,64,39,105,44.6,0.366,22,0
684 | 4,125,80,0,0,32.3,0.536,27,1
685 | 5,136,82,0,0,0.0,0.640,69,0
686 | 2,129,74,26,205,33.2,0.591,25,0
687 | 3,130,64,0,0,23.1,0.314,22,0
688 | 1,107,50,19,0,28.3,0.181,29,0
689 | 1,140,74,26,180,24.1,0.828,23,0
690 | 1,144,82,46,180,46.1,0.335,46,1
691 | 8,107,80,0,0,24.6,0.856,34,0
692 | 13,158,114,0,0,42.3,0.257,44,1
693 | 2,121,70,32,95,39.1,0.886,23,0
694 | 7,129,68,49,125,38.5,0.439,43,1
695 | 2,90,60,0,0,23.5,0.191,25,0
696 | 7,142,90,24,480,30.4,0.128,43,1
697 | 3,169,74,19,125,29.9,0.268,31,1
698 | 0,99,0,0,0,25.0,0.253,22,0
699 | 4,127,88,11,155,34.5,0.598,28,0
700 | 4,118,70,0,0,44.5,0.904,26,0
701 | 2,122,76,27,200,35.9,0.483,26,0
702 | 6,125,78,31,0,27.6,0.565,49,1
703 | 1,168,88,29,0,35.0,0.905,52,1
704 | 2,129,0,0,0,38.5,0.304,41,0
705 | 4,110,76,20,100,28.4,0.118,27,0
706 | 6,80,80,36,0,39.8,0.177,28,0
707 | 10,115,0,0,0,0.0,0.261,30,1
708 | 2,127,46,21,335,34.4,0.176,22,0
709 | 9,164,78,0,0,32.8,0.148,45,1
710 | 2,93,64,32,160,38.0,0.674,23,1
711 | 3,158,64,13,387,31.2,0.295,24,0
712 | 5,126,78,27,22,29.6,0.439,40,0
713 | 10,129,62,36,0,41.2,0.441,38,1
714 | 0,134,58,20,291,26.4,0.352,21,0
715 | 3,102,74,0,0,29.5,0.121,32,0
716 | 7,187,50,33,392,33.9,0.826,34,1
717 | 3,173,78,39,185,33.8,0.970,31,1
718 | 10,94,72,18,0,23.1,0.595,56,0
719 | 1,108,60,46,178,35.5,0.415,24,0
720 | 5,97,76,27,0,35.6,0.378,52,1
721 | 4,83,86,19,0,29.3,0.317,34,0
722 | 1,114,66,36,200,38.1,0.289,21,0
723 | 1,149,68,29,127,29.3,0.349,42,1
724 | 5,117,86,30,105,39.1,0.251,42,0
725 | 1,111,94,0,0,32.8,0.265,45,0
726 | 4,112,78,40,0,39.4,0.236,38,0
727 | 1,116,78,29,180,36.1,0.496,25,0
728 | 0,141,84,26,0,32.4,0.433,22,0
729 | 2,175,88,0,0,22.9,0.326,22,0
730 | 2,92,52,0,0,30.1,0.141,22,0
731 | 3,130,78,23,79,28.4,0.323,34,1
732 | 8,120,86,0,0,28.4,0.259,22,1
733 | 2,174,88,37,120,44.5,0.646,24,1
734 | 2,106,56,27,165,29.0,0.426,22,0
735 | 2,105,75,0,0,23.3,0.560,53,0
736 | 4,95,60,32,0,35.4,0.284,28,0
737 | 0,126,86,27,120,27.4,0.515,21,0
738 | 8,65,72,23,0,32.0,0.600,42,0
739 | 2,99,60,17,160,36.6,0.453,21,0
740 | 1,102,74,0,0,39.5,0.293,42,1
741 | 11,120,80,37,150,42.3,0.785,48,1
742 | 3,102,44,20,94,30.8,0.400,26,0
743 | 1,109,58,18,116,28.5,0.219,22,0
744 | 9,140,94,0,0,32.7,0.734,45,1
745 | 13,153,88,37,140,40.6,1.174,39,0
746 | 12,100,84,33,105,30.0,0.488,46,0
747 | 1,147,94,41,0,49.3,0.358,27,1
748 | 1,81,74,41,57,46.3,1.096,32,0
749 | 3,187,70,22,200,36.4,0.408,36,1
750 | 6,162,62,0,0,24.3,0.178,50,1
751 | 4,136,70,0,0,31.2,1.182,22,1
752 | 1,121,78,39,74,39.0,0.261,28,0
753 | 3,108,62,24,0,26.0,0.223,25,0
754 | 0,181,88,44,510,43.3,0.222,26,1
755 | 8,154,78,32,0,32.4,0.443,45,1
756 | 1,128,88,39,110,36.5,1.057,37,1
757 | 7,137,90,41,0,32.0,0.391,39,0
758 | 0,123,72,0,0,36.3,0.258,52,1
759 | 1,106,76,0,0,37.5,0.197,26,0
760 | 6,190,92,0,0,35.5,0.278,66,1
761 | 2,88,58,26,16,28.4,0.766,22,0
762 | 9,170,74,31,0,44.0,0.403,43,1
763 | 9,89,62,0,0,22.5,0.142,33,0
764 | 10,101,76,48,180,32.9,0.171,63,0
765 | 2,122,70,27,0,36.8,0.340,27,0
766 | 5,121,72,23,112,26.2,0.245,30,0
767 | 1,126,60,0,0,30.1,0.349,47,1
768 | 1,93,70,31,0,30.4,0.315,23,0
--------------------------------------------------------------------------------
/data_exploration/explore.py:
--------------------------------------------------------------------------------
1 | #import pandas as pd
2 | import numpy as np
3 | import seaborn as sns
4 | import matplotlib.pyplot as plt
5 | import os
6 | plt.style.use('seaborn-v0_8-colorblind' if 'seaborn-v0_8-colorblind' in plt.style.available else 'seaborn-colorblind')  # the bare 'seaborn-colorblind' name was removed in matplotlib 3.6
7 |
8 | # 2018.11.07 Created by Eamon.Zhang
9 |
10 |
11 | def get_dtypes(data,drop_col=[]):
12 | """Return the dtypes for each column of a pandas Dataframe
13 |
14 | Parameters
15 | ----------
16 | data : pandas Dataframe
17 |
18 | drop_col : columns to omit in a list
19 |
20 | Returns
21 | -------
22 | str_var_list, num_var_list, all_var_list
23 |
24 | """
25 |
26 | name_of_col = list(data.columns)
27 | num_var_list = []
28 | str_var_list = []
29 | all_var_list = []
30 |
31 | str_var_list = name_of_col.copy()
32 | for var in name_of_col:
33 | # check if column belongs to numeric type
34 |         if np.issubdtype(data[var].dtype, np.number):
35 |             # covers all integer/float widths; the old tuple of np.int/np.float aliases fails on NumPy >= 1.24
36 | str_var_list.remove(var)
37 | num_var_list.append(var)
38 | # drop the omit column from list
39 | for var in drop_col:
40 | if var in str_var_list:
41 | str_var_list.remove(var)
42 | if var in num_var_list:
43 | num_var_list.remove(var)
44 |
45 | all_var_list.extend(str_var_list)
46 | all_var_list.extend(num_var_list)
47 | return str_var_list, num_var_list, all_var_list
48 |
49 |
50 | def describe(data,output_path=None):
51 | """output the general description of a pandas Dataframe
52 | into a csv file
53 |
54 | """
55 |
56 | result = data.describe(include='all')
57 | if output_path is not None:
58 | output = os.path.join(output_path,'describe.csv')
59 | result.to_csv(output)
60 | print('result saved at:', str(output))
61 | return result
62 |
63 |
64 | def discrete_var_barplot(x,y,data,output_path=None):
65 | """draw the barplot of a discrete variable x against y(target variable).
66 | By default the bar shows the mean value of y.
67 |
68 | Parameters
69 | ----------
70 |
71 |
72 | Returns
73 | -------
74 | figure save as PNG
75 | """
76 |
77 | plt.figure(figsize=(15,10))
78 | sns.barplot(x=x,y=y,data=data)
79 | if output_path is not None:
80 | output = os.path.join(output_path,'Barplot_'+str(x)+'_'+str(y)+'.png')
81 | plt.savefig(output)
82 | print('Image saved at', str(output))
83 |
84 |
85 | def discrete_var_countplot(x,data,output_path=None):
86 | """draw the countplot of a discrete variable x.
87 |
88 | Parameters
89 | ----------
90 |
91 |
92 | Returns
93 | -------
94 | figure save as PNG
95 | """
96 |
97 | plt.figure(figsize=(15,10))
98 | sns.countplot(x=x,data=data)
99 | if output_path is not None:
100 | output = os.path.join(output_path,'Countplot_'+str(x)+'.png')
101 | plt.savefig(output)
102 | print('Image saved at',str(output))
103 |
104 |
105 | def discrete_var_boxplot(x,y,data,output_path=None):
106 | """draw the boxplot of a discrete variable x against y.
107 |
108 | Parameters
109 | ----------
110 |
111 |
112 | Returns
113 | -------
114 | figure save as PNG
115 | """
116 |
117 | plt.figure(figsize=(15,10))
118 | sns.boxplot(x=x,y=y,data=data)
119 | if output_path is not None:
120 | output = os.path.join(output_path,'Boxplot_'+str(x)+'_'+str(y)+'.png')
121 | plt.savefig(output)
122 | print('Image saved at',str(output))
123 |
124 |
125 | def continuous_var_distplot(x,output_path=None,bins=None):
126 | """draw the distplot of a continuous variable x.
127 |
128 | Parameters
129 | ----------
130 |
131 |
132 | Returns
133 | -------
134 | figure save as PNG
135 | """
136 |
137 | plt.figure(figsize=(15,10))
138 |     sns.histplot(x=x, bins=bins)  # sns.distplot is deprecated and was removed in seaborn 0.14
139 | if output_path is not None:
140 | output=os.path.join(output_path,'Distplot_'+str(x.name)+'.png')
141 | plt.savefig(output)
142 | print('Image saved at',str(output))
143 |
144 |
145 | # 2018.11.28 Created by Eamon.Zhang
146 |
147 | def scatter_plot(x,y,data,output_path=None):
148 | """draw the scatter-plot of two variables.
149 |
150 | Parameters
151 | ----------
152 |
153 |
154 | Returns
155 | -------
156 | figure save as PNG
157 | """
158 |
159 | plt.figure(figsize=(15,10))
160 | sns.scatterplot(x=x,y=y,data=data)
161 | if output_path is not None:
162 | output = os.path.join(output_path,'Scatter_plot_'+str(x.name)+'_'+str(y.name)+'.png')
163 | plt.savefig(output)
164 | print('Image saved at',str(output))
165 |
166 |
167 | def correlation_plot(data,output_path=None):
168 | """draw the correlation plot between variables.
169 |
170 | Parameters
171 | ----------
172 |
173 |
174 | Returns
175 | -------
176 | figure save as PNG
177 | """
178 |
179 | corrmat = data.corr()
180 | fig, ax = plt.subplots()
181 | fig.set_size_inches(11,11)
182 | sns.heatmap(corrmat,cmap="YlGnBu",linewidths=.5,annot=True)
183 | if output_path is not None:
184 | output = os.path.join(output_path,'Corr_plot'+'.png')
185 | plt.savefig(output)
186 | print('Image saved at',str(output))
187 |
188 |
189 | def heatmap(data,output_path=None,fmt='d'):
190 | """draw the heatmap between 2 variables.
191 |
192 | Parameters
193 | ----------
194 |
195 |
196 | Returns
197 | -------
198 | figure save as PNG
199 | """
200 |
201 | fig, ax = plt.subplots()
202 | fig.set_size_inches(11,11)
203 | sns.heatmap(data,cmap="YlGnBu",linewidths=.5,annot=True,fmt=fmt)
204 | if output_path is not None:
205 | output = os.path.join(output_path,'Heatmap'+'.png')
206 | plt.savefig(output)
207 | print('Image saved at',str(output))
--------------------------------------------------------------------------------
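A minimal usage sketch for data_exploration/explore.py above (an illustrative example rather than repo code; it assumes the repo root is the working directory and on sys.path, with the output/ folder present):

    import pandas as pd
    from data_exploration import explore

    df = pd.DataFrame({'Pclass': [1, 2, 3, 3], 'Sex': ['m', 'f', 'f', 'm'],
                       'Fare': [71.3, 13.0, 7.9, 8.1], 'Survived': [1, 1, 0, 0]})
    str_vars, num_vars, all_vars = explore.get_dtypes(df, drop_col=['Survived'])
    print(str_vars, num_vars)                      # ['Sex'] ['Pclass', 'Fare']
    explore.describe(df, output_path='./output/')  # writes output/describe.csv
    explore.discrete_var_barplot('Pclass', 'Survived', df, output_path='./output/')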
/feature_cleaning/missing_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from warnings import warn
4 |
5 | # 2018.11.07 Created by Eamon.Zhang
6 |
7 |
8 | def check_missing(data,output_path=None):
9 | """
10 | check the total number & percentage of missing values
11 | per variable of a pandas Dataframe
12 | """
13 |
14 | result = pd.concat([data.isnull().sum(),data.isnull().mean()],axis=1)
15 | result = result.rename(index=str,columns={0:'total missing',1:'proportion'})
16 | if output_path is not None:
17 |         result.to_csv(output_path+'missing.csv')
18 |         print('result saved at:', output_path+'missing.csv')
19 | return result
20 |
21 |
22 | def drop_missing(data,axis=0):
23 | """
24 | Listwise deletion:
25 | excluding all cases (listwise) that have missing values
26 |
27 | Parameters
28 | ----------
29 | axis: drop cases(0)/columns(1),default 0
30 |
31 | Returns
32 | -------
33 | Pandas dataframe with missing cases/columns dropped
34 | """
35 |
36 | data_copy = data.copy(deep=True)
37 | data_copy = data_copy.dropna(axis=axis,inplace=False)
38 | return data_copy
39 |
40 |
41 | def add_var_denote_NA(data,NA_col=[]):
42 | """
43 | creating an additional variable indicating whether the data
44 | was missing for that observation (1) or not (0).
45 | """
46 |
47 | data_copy = data.copy(deep=True)
48 | for i in NA_col:
49 | if data_copy[i].isnull().sum()>0:
50 | data_copy[i+'_is_NA'] = np.where(data_copy[i].isnull(),1,0)
51 | else:
52 | warn("Column %s has no missing cases" % i)
53 |
54 | return data_copy
55 |
56 |
57 | def impute_NA_with_arbitrary(data,impute_value,NA_col=[]):
58 | """
59 | replacing NA with arbitrary values.
60 | """
61 |
62 | data_copy = data.copy(deep=True)
63 | for i in NA_col:
64 | if data_copy[i].isnull().sum()>0:
65 | data_copy[i+'_'+str(impute_value)] = data_copy[i].fillna(impute_value)
66 | else:
67 | warn("Column %s has no missing cases" % i)
68 | return data_copy
69 |
70 |
71 | def impute_NA_with_avg(data,strategy='mean',NA_col=[]):
72 | """
73 | replacing the NA with mean/median/most frequent values of that variable.
74 | Note it should only be performed over training set and then propagated to test set.
75 | """
76 |
77 | data_copy = data.copy(deep=True)
78 | for i in NA_col:
79 | if data_copy[i].isnull().sum()>0:
80 | if strategy=='mean':
81 | data_copy[i+'_impute_mean'] = data_copy[i].fillna(data[i].mean())
82 | elif strategy=='median':
83 | data_copy[i+'_impute_median'] = data_copy[i].fillna(data[i].median())
84 | elif strategy=='mode':
85 | data_copy[i+'_impute_mode'] = data_copy[i].fillna(data[i].mode()[0])
86 | else:
87 | warn("Column %s has no missing" % i)
88 | return data_copy
89 |
90 |
91 | def impute_NA_with_end_of_distribution(data,NA_col=[]):
92 | """
93 | replacing the NA by values that are at the far end of the distribution of that variable
94 | calculated by mean + 3*std
95 | """
96 |
97 | data_copy = data.copy(deep=True)
98 | for i in NA_col:
99 | if data_copy[i].isnull().sum()>0:
100 | data_copy[i+'_impute_end_of_distri'] = data_copy[i].fillna(data[i].mean()+3*data[i].std())
101 | else:
102 | warn("Column %s has no missing" % i)
103 | return data_copy
104 |
105 |
106 | def impute_NA_with_random(data,NA_col=[],random_state=0):
107 | """
108 | replacing the NA with random sampling from the pool of available observations of the variable
109 | """
110 |
111 | data_copy = data.copy(deep=True)
112 | for i in NA_col:
113 | if data_copy[i].isnull().sum()>0:
114 | data_copy[i+'_random'] = data_copy[i]
115 | # extract the random sample to fill the na
116 | random_sample = data_copy[i].dropna().sample(data_copy[i].isnull().sum(), random_state=random_state)
117 | random_sample.index = data_copy[data_copy[i].isnull()].index
118 | data_copy.loc[data_copy[i].isnull(), str(i)+'_random'] = random_sample
119 | else:
120 | warn("Column %s has no missing" % i)
121 | return data_copy
122 |
--------------------------------------------------------------------------------
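The imputers above are designed to be chained; each returns a copy of the frame with a new derived column and leaves the original column untouched. A small sketch on toy data (the column names are illustrative):

    import numpy as np
    import pandas as pd
    from feature_cleaning import missing_data as ms

    df = pd.DataFrame({'Age': [22.0, np.nan, 38.0, np.nan, 35.0],
                       'Fare': [7.25, 71.28, np.nan, 8.05, 53.1]})
    print(ms.check_missing(df))                    # NA counts and proportions
    df = ms.add_var_denote_NA(df, NA_col=['Age'])  # adds an Age_is_NA flag
    df = ms.impute_NA_with_avg(df, strategy='median', NA_col=['Age', 'Fare'])
    df = ms.impute_NA_with_end_of_distribution(df, NA_col=['Age'])
    df = ms.impute_NA_with_random(df, NA_col=['Age'], random_state=0)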
/feature_cleaning/outlier.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | # from warnings import warn
4 |
5 | # 2018.11.07 Created by Eamon.Zhang
6 |
7 | def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
8 | '''
9 | identify outliers based on arbitrary boundaries passed to the function.
10 | '''
11 |
12 |     para = (upper_fence, lower_fence)
13 |     tmp = pd.concat([data[col]>upper_fence,data[col]<lower_fence],axis=1)
14 |     outlier_index = tmp.any(axis=1)
15 |     print('Num of outlier detected:',outlier_index.value_counts()[1])
16 |     print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
17 |     return outlier_index, para
18 |
19 |
20 | # 2018.11.10 Created by Eamon.Zhang
21 | def outlier_detect_IQR(data,col,threshold=3):
22 |     '''
23 |     outlier detection by Interquartile Ranges Rule, also known as Tukey's test.
24 |     calculate the IQR (75th quantile - 25th quantile) and the 25th/75th quantiles.
25 |     Any value beyond:
26 |         upper bound = 75th quantile + (IQR * threshold)
27 |         lower bound = 25th quantile - (IQR * threshold)
28 |     is regarded as an outlier.
29 |     Default threshold is 3.
30 |     '''
31 |
32 |     IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
33 |     Lower_fence = data[col].quantile(0.25) - (IQR * threshold)
34 |     Upper_fence = data[col].quantile(0.75) + (IQR * threshold)
35 |     para = (Upper_fence, Lower_fence)
36 |     tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
37 |     outlier_index = tmp.any(axis=1)
38 |     print('Num of outlier detected:',outlier_index.value_counts()[1])
39 |     print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
40 |     return outlier_index, para
41 |
42 |
43 | def outlier_detect_mean_std(data,col,threshold=3):
44 |     '''
45 |     outlier detection by Mean and Standard Deviation Method.
46 |     If a value is a certain number (threshold) of standard deviations away
47 |     from the mean, that data point is identified as an outlier.
48 |     Default threshold is 3.
49 |
50 |     This method can fail to detect outliers because the outliers increase the
51 |     standard deviation; the more extreme the outlier, the more it is affected.
52 |     '''
53 |
54 |     Upper_fence = data[col].mean() + threshold * data[col].std()
55 |     Lower_fence = data[col].mean() - threshold * data[col].std()
56 |     para = (Upper_fence, Lower_fence)
57 |     tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
58 |     outlier_index = tmp.any(axis=1)
59 |     print('Num of outlier detected:',outlier_index.value_counts()[1])
60 |     print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
61 |     return outlier_index, para
62 |
63 |
64 | def outlier_detect_MAD(data,col,threshold=3.5):
65 |     """
66 |     outlier detection by Median and Median Absolute Deviation Method (MAD).
67 |     The difference between each value and the median is calculated, and these
68 |     differences are expressed as their absolute values. A new median of the
69 |     absolute differences, multiplied by an empirically derived constant
70 |     (0.6745), yields the median absolute deviation (MAD). If a value is a
71 |     certain number of MAD away from the median, it is classified as an
72 |     outlier. The default threshold is 3.5 MAD.
73 |
74 |     Note that if more than 50% of the data points have the same value, MAD is
75 |     computed to be 0, so any value different from the median is then
76 |     classified as an outlier.
77 |     """
78 |
79 |     median = data[col].median()
80 |     median_absolute_deviation = np.median([np.abs(y - median) for y in data[col]])
81 |     modified_z_scores = pd.Series([0.6745 * (y - median) / median_absolute_deviation for y in data[col]])
82 |     outlier_index = np.abs(modified_z_scores) > threshold
83 |     print('Num of outlier detected:',outlier_index.value_counts()[1])
84 |     print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
85 |     return outlier_index
86 |
87 |
88 | # 2018.11.10 outlier treatment
89 | def impute_outlier_with_arbitrary(data,outlier_index,value,col=[]):
90 | """
91 | impute outliers with arbitrary value
92 | """
93 |
94 | data_copy = data.copy(deep=True)
95 | for i in col:
96 | data_copy.loc[outlier_index,i] = value
97 | return data_copy
98 |
99 |
100 | def windsorization(data,col,para,strategy='both'):
101 | """
102 | top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value,vice versa)
103 | """
104 |
105 | data_copy = data.copy(deep=True)
106 | if strategy == 'both':
107 | data_copy.loc[data_copy[col]>para[0],col] = para[0]
108 |         data_copy.loc[data_copy[col]<para[1],col] = para[1]
109 |     elif strategy == 'top':
110 |         data_copy.loc[data_copy[col]>para[0],col] = para[0]
111 | elif strategy == 'bottom':
112 |         data_copy.loc[data_copy[col]<para[1],col] = para[1]
113 |     return data_copy
114 |
115 |
116 | def drop_outlier(data,outlier_index):
117 |     """
118 |     drop the cases that are outliers
119 |     """
120 |
121 |     data_copy = data[~outlier_index]
122 |     return data_copy
123 |
124 |
125 | def impute_outlier_with_avg(data,col,outlier_index,strategy='mean'):
126 |     """
127 |     impute outlier with mean/median/most frequent values of that variable.
128 |     """
129 |
130 |     data_copy = data.copy(deep=True)
131 |     if strategy=='mean':
132 |         data_copy.loc[outlier_index,col] = data_copy[col].mean()
133 |     elif strategy=='median':
134 |         data_copy.loc[outlier_index,col] = data_copy[col].median()
135 |     elif strategy=='mode':
136 |         data_copy.loc[outlier_index,col] = data_copy[col].mode()[0]
137 |
138 |     return data_copy
139 |
--------------------------------------------------------------------------------
/feature_cleaning/rare_values.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | #import numpy as np
3 | # from warnings import warn
4 |
5 | # created by Eamon.Zhang
6 |
7 | class GroupingRareValues():
8 |     """
9 |     Grouping the observations that show rare labels into a unique category ('rare')
10 |
11 |     Parameters
12 |     ----------
13 |
14 |     """
15 |
16 |     def __init__(self, mapping=None, cols=None, threshold=0.01):
17 |         self.cols = cols
18 |         self.mapping = mapping
19 |         self._dim = None
20 |         self.threshold = threshold
21 |
22 |
23 |     def fit(self, X, y=None, **kwargs):
24 |         """Fit encoder according to X and y.
25 |         Parameters
26 |         ----------
27 |         X : array-like, shape = [n_samples, n_features]
28 |             Training vectors, where n_samples is the number of samples
29 |             and n_features is the number of features.
30 |         y : array-like, shape = [n_samples]
31 |             Target values.
32 |         Returns
33 |         -------
34 |         self : encoder
35 |             Returns self.
36 |         """
37 |
38 |         self._dim = X.shape[1]
39 |
40 |         _, categories = self.grouping(
41 |             X,
42 |             mapping=self.mapping,
43 |             cols=self.cols,
44 |             threshold=self.threshold
45 |         )
46 |         self.mapping = categories
47 |         return self
48 |
49 |
50 |     def transform(self, X):
51 |         """Perform the transformation to new categorical data.
52 |         Will use the mapping (if available) and the column list to encode the
53 |         data.
54 |         Parameters
55 |         ----------
56 |         X : array-like, shape = [n_samples, n_features]
57 |         Returns
58 |         -------
59 |         X : Transformed values with encoding applied.
60 |         """
61 |
62 |         if self._dim is None:
63 |             raise ValueError('Must train encoder before it can be used to transform data.')
64 |
65 |         # make sure that it is the right size
66 |         if X.shape[1] != self._dim:
67 |             raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
68 |
69 |         X, _ = self.grouping(
70 |             X,
71 |             mapping=self.mapping,
72 |             cols=self.cols,
73 |             threshold=self.threshold
74 |         )
75 |
76 |         return X
77 |
78 |
79 |     def grouping(self, X_in, threshold, mapping=None, cols=None):
80 |         """
81 |         Grouping the observations that show rare labels into a unique
82 |         category ('rare')
83 |
84 |         """
85 |
86 |         X = X_in.copy(deep=True)
87 |
88 | #        if cols is None:
89 | #            cols = X.columns.values
90 |
91 |         if mapping is not None:  # transform
92 |             mapping_out = mapping
93 |             for i in mapping:
94 |                 column = i.get('col')  # get the column name
95 |                 X[column] = X[column].map(i['mapping'])
96 |
97 | #                try:
98 | #                    X[column] = X[column].astype(int)
99 | #                except ValueError as e:
100 | #                    X[column] = X[column].astype(float)
101 |         else:  # fit
102 |             mapping_out = []
103 |             for col in cols:
104 | #                if util.is_category(X[col].dtype):
105 | #                    categories = X[col].cat.categories
106 | #                else:
107 |                 temp_df = pd.Series(X[col].value_counts()/len(X))
108 |                 mapping = { k: ('rare' if k not in temp_df[temp_df >= threshold].index else k)
109 | for k in temp_df.index}
110 |
111 | mapping = pd.Series(mapping)
112 | mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
113 |
114 | return X, mapping_out
115 |
116 |
117 |
118 | #==============================================================================
119 | # def rare_imputation(X_train, X_test, variable):
120 | #
121 | # # find the most frequent category
122 | # frequent_cat = X_train.groupby(variable)[variable].count().sort_values().tail(1).index.values[0]
123 | #
124 | # # find rare labels
125 | # temp = X_train.groupby([variable])[variable].count()/np.float(len(X_train))
126 | # rare_cat = [x for x in temp.loc[temp<0.05].index.values]
127 | #
128 | # # create new variables, with Rare labels imputed
129 | #
130 | # # by the most frequent category
131 | # X_train[variable+'_freq_imp'] = np.where(X_train[variable].isin(rare_cat), frequent_cat, X_train[variable])
132 | # X_test[variable+'_freq_imp'] = np.where(X_test[variable].isin(rare_cat), frequent_cat, X_test[variable])
133 | #
134 | # # by adding a new label 'Rare'
135 | # X_train[variable+'_rare_imp'] = np.where(X_train[variable].isin(rare_cat), 'Rare', X_train[variable])
136 | # X_test[variable+'_rare_imp'] = np.where(X_test[variable].isin(rare_cat), 'Rare', X_test[variable])
137 | #==============================================================================
138 |
139 | # 2018.11.26 created by Eamon.Zhang
140 | class ModeImputation():
141 | """
142 | Replacing the rare label by most frequent label
143 |
144 | Parameters
145 | ----------
146 |
147 | """
148 |
149 | def __init__(self, mapping=None, cols=None, threshold=0.01):
150 | self.cols = cols
151 | self.mapping = mapping
152 | self._dim = None
153 | self.threshold = threshold
154 |
155 |
156 | def fit(self, X, y=None, **kwargs):
157 | """Fit encoder according to X and y.
158 | Parameters
159 | ----------
160 | X : array-like, shape = [n_samples, n_features]
161 | Training vectors, where n_samples is the number of samples
162 | and n_features is the number of features.
163 | y : array-like, shape = [n_samples]
164 | Target values.
165 | Returns
166 | -------
167 | self : encoder
168 | Returns self.
169 | """
170 |
171 | self._dim = X.shape[1]
172 |
173 | _, categories = self.impute_with_mode(
174 | X,
175 | mapping=self.mapping,
176 | cols=self.cols,
177 | threshold=self.threshold
178 | )
179 | self.mapping = categories
180 | return self
181 |
182 |
183 | def transform(self, X):
184 | """Perform the transformation to new categorical data.
185 | Will use the mapping (if available) and the column list to encode the
186 | data.
187 | Parameters
188 | ----------
189 | X : array-like, shape = [n_samples, n_features]
190 | Returns
191 | -------
192 | X : Transformed values with encoding applied.
193 | """
194 |
195 | if self._dim is None:
196 | raise ValueError('Must train encoder before it can be used to transform data.')
197 |
198 | # make sure that it is the right size
199 | if X.shape[1] != self._dim:
200 | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
201 |
202 | X, _ = self.impute_with_mode(
203 | X,
204 | mapping=self.mapping,
205 | cols=self.cols,
206 | threshold=self.threshold
207 | )
208 |
209 | return X
210 |
211 |
212 | def impute_with_mode(self, X_in, threshold, mapping=None, cols=None):
213 | """
214 | Grouping the observations that show rare labels into a unique category ('rare')
215 |
216 | """
217 |
218 | X = X_in.copy(deep=True)
219 |
220 | # if cols is None:
221 | # cols = X.columns.values
222 |
223 | if mapping is not None: # transform
224 | mapping_out = mapping
225 | for i in mapping:
226 | column = i.get('col') # get the column name
227 | X[column] = X[column].map(i['mapping'])
228 |
229 | # try:
230 | # X[column] = X[column].astype(int)
231 | # except ValueError as e:
232 | # X[column] = X[column].astype(float)
233 | else: # fit
234 | mapping_out = []
235 | for col in cols:
236 | # if util.is_category(X[col].dtype):
237 | # categories = X[col].cat.categories
238 | # else:
239 | temp_df = pd.Series(X[col].value_counts()/len(X))
240 |                 mode = X[col].mode()[0]
241 |                 mapping = { k: (mode if k not in temp_df[temp_df >= threshold].index else k)
242 | for k in temp_df.index}
243 |
244 | mapping = pd.Series(mapping)
245 | mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
246 |
247 | return X, mapping_out
248 |
--------------------------------------------------------------------------------
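GroupingRareValues and ModeImputation share the same fit/transform pattern; a sketch on a toy column where one label falls below the frequency threshold:

    import pandas as pd
    from feature_cleaning import rare_values as ra

    df = pd.DataFrame({'Pclass': [1]*50 + [2]*45 + [3]*5})
    grouper = ra.GroupingRareValues(cols=['Pclass'], threshold=0.1).fit(df)
    print(grouper.mapping)               # label 3 (5% of rows) maps to 'rare'
    df_grouped = grouper.transform(df)

    imputer = ra.ModeImputation(cols=['Pclass'], threshold=0.1).fit(df)
    df_imputed = imputer.transform(df)   # label 3 is replaced by the mode, 1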
/feature_engineering/discretization.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.tree import DecisionTreeClassifier
3 | from sklearn.model_selection import cross_val_score
4 | import numpy as np
5 |
6 | # from warnings import warn
7 |
8 | # 2018.11.17 Created by Eamon.Zhang
9 | # ChiMerge method modified from https://github.com/tatsumiw/ChiMerge/blob/master/ChiMerge.py
10 | # TODO: add more constraints to the discretized result.
11 | class ChiMerge():
12 | """
13 | supervised discretization using the ChiMerge method.
14 |
15 |
16 | Parameters
17 | ----------
18 | confidenceVal: number
19 | default=3.841, correspond to p=0.05 dof=1
20 | num_of_bins: int
21 |         number of bins after discretisation
22 | col: str
23 | the column to be performed
24 |
25 | """
26 |
27 | def __init__(self, col=None, bins=None, confidenceVal=3.841, num_of_bins=10):
28 | self.col = col
29 | self._dim = None
30 | self.confidenceVal = confidenceVal
31 | self.bins = bins
32 | self.num_of_bins = num_of_bins
33 |
34 |
35 | def fit(self, X, y, **kwargs):
36 | """Fit encoder according to X and y.
37 | Parameters
38 | ----------
39 | X : array-like, shape = [n_samples, n_features]
40 | Training vectors, where n_samples is the number of samples
41 | and n_features is the number of features.
42 | y : array-like, shape = [n_samples]
43 | Target values.
44 | Returns
45 | -------
46 | self : encoder
47 | Returns self.
48 | """
49 |
50 | self._dim = X.shape[1]
51 |
52 | _, bins = self.chimerge(
53 | X_in=X,
54 | y=y,
55 | confidenceVal=self.confidenceVal,
56 | col=self.col,
57 | num_of_bins=self.num_of_bins
58 | )
59 | self.bins = bins
60 | return self
61 |
62 |
63 | def transform(self, X):
64 | """Perform the transformation to new data.
65 | Will use the tree model and the column list to discretize the
66 | column.
67 | Parameters
68 | ----------
69 | X : array-like, shape = [n_samples, n_features]
70 | Returns
71 | -------
72 | X : new dataframe with discretized new column.
73 | """
74 |
75 | if self._dim is None:
76 | raise ValueError('Must train encoder before it can be used to transform data.')
77 |
78 | # make sure that it is the right size
79 | if X.shape[1] != self._dim:
80 | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
81 |
82 | X, _ = self.chimerge(
83 | X_in=X,
84 | col=self.col,
85 | bins=self.bins
86 | )
87 |
88 | return X
89 |
90 | def chimerge(self, X_in, y=None, confidenceVal=None, num_of_bins=None, col=None, bins=None):
91 | """
92 | discretize a variable using ChiMerge
93 |
94 | """
95 |
96 | X = X_in.copy(deep=True)
97 |
98 | if bins is not None: # transform
99 | try:
100 | X[col+'_chimerge'] = pd.cut(X[col],bins=bins,include_lowest=True)
101 | except Exception as e:
102 | print(e)
103 |
104 | else: # fit
105 | try:
106 | # create an array which save the num of 0/1 samples of the column to be chimerge
107 | total_num = X.groupby([col])[y].count()
108 | total_num = pd.DataFrame({'total_num': total_num})
109 | positive_class = X.groupby([col])[y].sum()
110 | positive_class = pd.DataFrame({'positive_class': positive_class})
111 | regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True,how='inner')
112 | regroup.reset_index(inplace=True)
113 | regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
114 | regroup = regroup.drop('total_num', axis=1)
115 | np_regroup = np.array(regroup)
116 | # merge interval that have 0 pos/neg samples
117 | i = 0
118 | while (i <= np_regroup.shape[0] - 2):
119 | if ((np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0) or ( np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0)):
120 | np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i + 1, 1] # pos
121 | np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i + 1, 2] # neg
122 | np_regroup[i, 0] = np_regroup[i + 1, 0]
123 | np_regroup = np.delete(np_regroup, i + 1, 0)
124 | i = i - 1
125 | i = i + 1
126 | # calculate chi for neighboring intervals
127 | # ∑[(yA-yB)²/yB]
128 | chi_table = np.array([])
129 | for i in np.arange(np_regroup.shape[0] - 1):
130 | chi = (np_regroup[i, 1] * np_regroup[i + 1, 2] - np_regroup[i, 2] * np_regroup[i + 1, 1]) ** 2 \
131 | * (np_regroup[i, 1] + np_regroup[i, 2] + np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) / \
132 | ((np_regroup[i, 1] + np_regroup[i, 2]) * (np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) * (
133 | np_regroup[i, 1] + np_regroup[i + 1, 1]) * (np_regroup[i, 2] + np_regroup[i + 1, 2]))
134 | chi_table = np.append(chi_table, chi)
135 | # merge intervals that have closing chi
136 | while (1):
137 | if (len(chi_table) <= (num_of_bins - 1) and min(chi_table) >= confidenceVal):
138 | break
139 | chi_min_index = np.argwhere(chi_table == min(chi_table))[0]
140 | np_regroup[chi_min_index, 1] = np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]
141 | np_regroup[chi_min_index, 2] = np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]
142 | np_regroup[chi_min_index, 0] = np_regroup[chi_min_index + 1, 0]
143 | np_regroup = np.delete(np_regroup, chi_min_index + 1, 0)
144 |
145 | if (chi_min_index == np_regroup.shape[0] - 1):
146 | chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
147 | * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
148 | ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
149 | chi_table = np.delete(chi_table, chi_min_index, axis=0)
150 |
151 | else:
152 | chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
153 | * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
154 | ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
155 | chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index + 1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index + 1, 1]) ** 2 \
156 | * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) / \
157 | ((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]))
158 | chi_table = np.delete(chi_table, chi_min_index + 1, axis=0)
159 | result_data = pd.DataFrame()
160 | result_data['variable'] = [col] * np_regroup.shape[0]
161 | bins = []
162 | tmp = []
163 | for i in np.arange(np_regroup.shape[0]):
164 | if i == 0:
165 | y = '-inf' + ',' + str(np_regroup[i, 0])
166 | #x = np_regroup[i, 0]
167 | #list_temp.append(x)
168 | elif i == np_regroup.shape[0] - 1:
169 | y = str(np_regroup[i - 1, 0]) + '+'
170 | #x = 100000000.
171 | #list_temp.append(x)
172 | else:
173 | y = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
174 | #x = np_regroup[i, 0]
175 | #list_temp.append(x)
176 | bins.append(np_regroup[i - 1, 0])
177 | tmp.append(y)
178 |
179 | #list_temp.append(df[variable].max()+0.1)
180 | bins.append(X[col].min()-0.1)
181 |
182 | result_data['interval'] = tmp
183 | result_data['flag_0'] = np_regroup[:, 2]
184 | result_data['flag_1'] = np_regroup[:, 1]
185 | bins.sort(reverse=False)
186 | print('Interval for variable %s' % col)
187 | print(result_data)
188 |
189 | except Exception as e:
190 | print(e)
191 |
192 | return X, bins
193 |
194 |
195 |
196 |
197 | # 2018.11.15 Created by Eamon.Zhang
198 | class DiscretizeByDecisionTree():
199 | """
200 | Discretisation with Decision Trees consists of using a decision tree
201 | to identify the optimal splitting points that would determine the bins
202 | or contiguous intervals:
203 |
204 | 1.train a decision tree of limited depth (2, 3 or 4) using the variable
205 | we want to discretise to predict the target.
206 | 2.the original variable values are then replaced by the
207 | probability returned by the tree.
208 |
209 | Parameters
210 | ----------
211 | col: str
212 | column to discretise
213 | max_depth: int or list of int
214 | max depth of the tree. Can be an int or a list of int we want the tree model to search
215 | for the optimal depth.
216 |
217 | """
218 |
219 | def __init__(self, col=None, max_depth=None, tree_model=None):
220 | self.col = col
221 | self._dim = None
222 | self.max_depth = max_depth
223 | self.tree_model = tree_model
224 |
225 |
226 | def fit(self, X, y, **kwargs):
227 | """Fit encoder according to X and y.
228 | Parameters
229 | ----------
230 | X : array-like, shape = [n_samples, n_features]
231 | Training vectors, where n_samples is the number of samples
232 | and n_features is the number of features.
233 | y : array-like, shape = [n_samples]
234 | Target values.
235 | Returns
236 | -------
237 | self : encoder
238 | Returns self.
239 | """
240 |
241 | self._dim = X.shape[1]
242 |
243 | _, tree = self.discretize(
244 | X_in=X,
245 | y=y,
246 | max_depth=self.max_depth,
247 | col=self.col,
248 | tree_model=self.tree_model
249 | )
250 | self.tree_model = tree
251 | return self
252 |
253 | def transform(self, X):
254 | """Perform the transformation to new categorical data.
255 | Will use the tree model and the column list to discretize the
256 | column.
257 | Parameters
258 | ----------
259 | X : array-like, shape = [n_samples, n_features]
260 | Returns
261 | -------
262 | X : new dataframe with discretized new column.
263 | """
264 |
265 | if self._dim is None:
266 | raise ValueError('Must train encoder before it can be used to transform data.')
267 |
268 | # make sure that it is the right size
269 | if X.shape[1] != self._dim:
270 | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
271 |
272 | X, _ = self.discretize(
273 | X_in=X,
274 | col=self.col,
275 | tree_model=self.tree_model
276 | )
277 |
278 | return X
279 |
280 |
281 | def discretize(self, X_in, y=None, max_depth=None, tree_model=None, col=None):
282 | """
283 | discretize a variable using DecisionTreeClassifier
284 |
285 | """
286 |
287 | X = X_in.copy(deep=True)
288 |
289 | if tree_model is not None: # transform
290 | X[col+'_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:,1]
291 |
292 | else: # fit
293 | if isinstance(max_depth,int):
294 | tree_model = DecisionTreeClassifier(max_depth=max_depth)
295 | tree_model.fit(X[col].to_frame(), y)
296 | # X[col+'_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:,1]
297 | #print(x.tree_discret.unique())
298 | # bins = pd.concat( [X.groupby([col+'_tree_discret'])[col].min(),
299 | # X.groupby([col+'_tree_discret'])[col].max()], axis=1)
300 | # print('bins:')
301 | # print(bins)
302 |
303 |             elif isinstance(max_depth,list) and len(max_depth)>=1:  # a one-element list is valid too
304 | score_ls = [] # here I will store the roc auc
305 | score_std_ls = [] # here I will store the standard deviation of the roc_auc
306 | for tree_depth in max_depth:
307 | tree_model = DecisionTreeClassifier(max_depth=tree_depth)
308 | scores = cross_val_score(tree_model, X[col].to_frame(), y, cv=3, scoring='roc_auc')
309 | score_ls.append(np.mean(scores))
310 | score_std_ls.append(np.std(scores))
311 | temp = pd.concat([pd.Series(max_depth), pd.Series(score_ls), pd.Series(score_std_ls)], axis=1)
312 | temp.columns = ['depth', 'roc_auc_mean', 'roc_auc_std']
313 | print('result ROC-AUC for each depth')
314 | print(temp)
315 | max_roc = temp.roc_auc_mean.max()
316 |                 optimal_depth = int(temp[temp.roc_auc_mean==max_roc]['depth'].values[0])  # take a scalar; passing an array breaks DecisionTreeClassifier
317 | print('optimal_depth:',optimal_depth)
318 | tree_model = DecisionTreeClassifier(max_depth=optimal_depth)
319 | tree_model.fit(X[col].to_frame(), y)
320 | # bins = pd.concat( [X.groupby([col+'_tree_discret'])[col].min(),
321 | # X.groupby([col+'_tree_discret'])[col].max()], axis=1)
322 | # print('bins:')
323 | # print(bins)
324 | else:
325 | raise ValueError('max_depth of a tree must be an integer or a list')
326 |
327 | return X, tree_model
328 |
329 |
330 |
--------------------------------------------------------------------------------
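A sketch of the tree-based discretiser on synthetic data (ChiMerge follows the same fit/transform pattern, but its fit takes the name of the target column, e.g. ChiMerge(col='Age', num_of_bins=5).fit(df, 'target')):

    import numpy as np
    import pandas as pd
    from feature_engineering import discretization as dc

    rng = np.random.RandomState(0)
    df = pd.DataFrame({'Age': rng.randint(18, 80, 500)})
    df['target'] = (df['Age'] > 45).astype(int)

    # max_depth may also be a list such as [2, 3, 4], in which case the depth
    # is chosen by 3-fold cross-validated ROC-AUC
    enc = dc.DiscretizeByDecisionTree(col='Age', max_depth=2).fit(df, df['target'])
    df_disc = enc.transform(df)
    print(df_disc['Age_tree_discret'].unique())    # one probability per leaf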
/feature_engineering/encoding.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | # 2018.11.28 Created by Eamon.Zhang
4 |
5 | class MeanEncoding():
6 | """
7 | replacing the label by the mean of the target for that label.
8 |
9 | Parameters
10 | ----------
11 |
12 | """
13 |
14 | def __init__(self, mapping=None, cols=None):
15 | self.cols = cols
16 | self.mapping = mapping
17 | self._dim = None
18 | # self.threshold = threshold
19 |
20 |
21 | def fit(self, X, y=None, **kwargs):
22 | """Fit encoder according to X and y.
23 | Parameters
24 | ----------
25 | X : array-like, shape = [n_samples, n_features]
26 | Training vectors, where n_samples is the number of samples
27 | and n_features is the number of features.
28 | y : array-like, shape = [n_samples]
29 | Target values.
30 | Returns
31 | -------
32 | self : encoder
33 | Returns self.
34 | """
35 |
36 | self._dim = X.shape[1]
37 |
38 | _, categories = self.mean_encoding(
39 | X,
40 | y,
41 | mapping=self.mapping,
42 | cols=self.cols
43 | # threshold=self.threshold
44 | )
45 | self.mapping = categories
46 | return self
47 |
48 |
49 | def transform(self, X):
50 | """Perform the transformation to new categorical data.
51 | Will use the mapping (if available) and the column list to encode the
52 | data.
53 | Parameters
54 | ----------
55 | X : array-like, shape = [n_samples, n_features]
56 | Returns
57 | -------
58 | X : Transformed values with encoding applied.
59 | """
60 |
61 | if self._dim is None:
62 | raise ValueError('Must train encoder before it can be used to transform data.')
63 |
64 | # make sure that it is the right size
65 | if X.shape[1] != self._dim:
66 | raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
67 |
68 | X, _ = self.mean_encoding(
69 | X,
70 | mapping=self.mapping,
71 | cols=self.cols
72 | # threshold=self.threshold
73 | )
74 |
75 | return X
76 |
77 |
78 | def mean_encoding(self, X_in, y=None, mapping=None, cols=None):
79 | """
80 |         replacing each label of the variable by the mean of the target for that label.
81 |
82 | """
83 |
84 | X = X_in.copy(deep=True)
85 |
86 | # if cols is None:
87 | # cols = X.columns.values
88 |
89 | if mapping is not None: # transform
90 | mapping_out = mapping
91 | for i in mapping:
92 | column = i.get('col') # get the column name
93 | X[column] = X[column].map(i['mapping'])
94 |
95 | # try:
96 | # X[column] = X[column].astype(int)
97 | # except ValueError as e:
98 | # X[column] = X[column].astype(float)
99 | else: # fit
100 | mapping_out = []
101 | for col in cols:
102 | # if util.is_category(X[col].dtype):
103 | # categories = X[col].cat.categories
104 | # else:
105 | mapping = X[y.name].groupby(X[col]).mean().to_dict()
106 | mapping = pd.Series(mapping)
107 | mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
108 |
109 | return X, mapping_out
--------------------------------------------------------------------------------
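A minimal sketch of MeanEncoding; fit expects the target as a named Series that is also a column of X, and transform then replaces each label in place:

    import pandas as pd
    from feature_engineering import encoding as ec

    df = pd.DataFrame({'Sex': ['m', 'f', 'f', 'm', 'f'],
                       'Survived': [0, 1, 1, 1, 0]})
    enc = ec.MeanEncoding(cols=['Sex']).fit(df, df['Survived'])
    print(enc.mapping)           # 'f' -> 0.667, 'm' -> 0.5
    df_enc = enc.transform(df)   # 'Sex' now holds the learned target means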
/feature_engineering/transformation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import scipy.stats as stats
5 | import pylab
6 | # from warnings import warn
7 |
8 | # 2018.11.26 Created by Eamon.Zhang
9 | def diagnostic_plots(df, variable):
10 | # function to plot a histogram and a Q-Q plot
11 | # side by side, for a certain variable
12 |
13 | plt.figure(figsize=(15,6))
14 | plt.subplot(1, 2, 1)
15 | df[variable].hist()
16 |
17 | plt.subplot(1, 2, 2)
18 | stats.probplot(df[variable], dist="norm", plot=pylab)
19 |
20 | plt.show()
21 |
22 |
23 | def log_transform(data,cols=[]):
24 | """
25 | Logarithmic transformation
26 | """
27 |
28 | data_copy = data.copy(deep=True)
29 | for i in cols:
30 | data_copy[i+'_log'] = np.log(data_copy[i]+1)
31 | print('Variable ' + i +' Q-Q plot')
32 | diagnostic_plots(data_copy,str(i+'_log'))
33 | return data_copy
34 |
35 |
36 | def reciprocal_transform(data,cols=[]):
37 | """
38 | Reciprocal transformation
39 | """
40 |
41 | data_copy = data.copy(deep=True)
42 | for i in cols:
43 | data_copy[i+'_reciprocal'] = 1/(data_copy[i])
44 | print('Variable ' + i +' Q-Q plot')
45 | diagnostic_plots(data_copy,str(i+'_reciprocal'))
46 | return data_copy
47 |
48 |
49 | def square_root_transform(data,cols=[]):
50 | """
51 | square root transformation
52 | """
53 |
54 | data_copy = data.copy(deep=True)
55 | for i in cols:
56 | data_copy[i+'_square_root'] = (data_copy[i])**(0.5)
57 | print('Variable ' + i +' Q-Q plot')
58 | diagnostic_plots(data_copy,str(i+'_square_root'))
59 | return data_copy
60 |
61 |
62 | def exp_transform(data,coef,cols=[]):
63 | """
64 |     power transformation: raises the variable to the given coefficient (x**coef)
65 | """
66 |
67 | data_copy = data.copy(deep=True)
68 | for i in cols:
69 | data_copy[i+'_exp'] = (data_copy[i])**coef
70 | print('Variable ' + i +' Q-Q plot')
71 | diagnostic_plots(data_copy,str(i+'_exp'))
72 | return data_copy
73 |
74 |
--------------------------------------------------------------------------------
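Each helper above adds a transformed copy of the column and shows its histogram and Q-Q plot, so the effect on skewness can be judged visually. A sketch on a right-skewed variable:

    import numpy as np
    import pandas as pd
    from feature_engineering import transformation as tr

    rng = np.random.RandomState(42)
    df = pd.DataFrame({'Fare': rng.lognormal(mean=2.0, sigma=1.0, size=500)})
    df = tr.log_transform(df, cols=['Fare'])            # adds Fare_log
    df = tr.square_root_transform(df, cols=['Fare'])    # adds Fare_square_root
    df = tr.exp_transform(df, coef=0.3, cols=['Fare'])  # adds Fare_exp = Fare**0.3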
/feature_selection/embedded_method.py:
--------------------------------------------------------------------------------
1 | #import pandas as pd
2 | import numpy as np
3 |
4 | import matplotlib.pyplot as plt
5 | #import seaborn as sns
6 | #from sklearn.model_selection import train_test_split
7 |
8 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier #RandomForestRegressor
9 | #from sklearn.feature_selection import SelectFromModel
10 |
11 | # 2018.11.27 Created by Eamon.Zhang
12 |
13 | def rf_importance(X_train,y_train,max_depth=10,class_weight=None,top_n=15,n_estimators=50,random_state=0):
14 |
15 | model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
16 | random_state=random_state,class_weight=class_weight,
17 | n_jobs=-1)
18 | model.fit(X_train, y_train)
19 | importances = model.feature_importances_
20 | indices = np.argsort(importances)[::-1]
21 | feat_labels = X_train.columns
22 | std = np.std([tree.feature_importances_ for tree in model.estimators_],
23 | axis=0) # inter-trees variability.
24 | print("Feature ranking:")
25 | # l1,l2,l3,l4 = [],[],[],[]
26 | for f in range(X_train.shape[1]):
27 | print("%d. feature no:%d feature name:%s (%f)" % (f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]]))
28 | # l1.append(f+1)
29 | # l2.append(indices[f])
30 | # l3.append(feat_labels[indices[f]])
31 | # l4.append(importances[indices[f]])
32 | #feature_rank = pd.Dataframe(zip(l1,l2,l3,l4),columns=['id','indice','feature','importances'])
33 |
34 | # plotting
35 | indices = indices[0:top_n]
36 | plt.figure()
37 | plt.title("Feature importances top %d" % top_n)
38 | plt.bar(range(top_n), importances[indices],
39 | color="r", yerr=std[indices], align="center")
40 | plt.xticks(range(top_n), indices)
41 | plt.xlim([-1,top_n])
42 | plt.show()
43 |
44 | return model
45 |
46 |
47 | def gbt_importance(X_train,y_train,max_depth=10,top_n=15,n_estimators=50,random_state=0):
48 |
49 | model = GradientBoostingClassifier(n_estimators=n_estimators,max_depth=max_depth,
50 | random_state=random_state)
51 | model.fit(X_train, y_train)
52 | importances = model.feature_importances_
53 | indices = np.argsort(importances)[::-1]
54 | feat_labels = X_train.columns
55 | std = np.std([tree[0].feature_importances_ for tree in model.estimators_],
56 | axis=0) # inter-trees variability.
57 | print("Feature ranking:")
58 | # l1,l2,l3,l4 = [],[],[],[]
59 | for f in range(X_train.shape[1]):
60 | print("%d. feature no:%d feature name:%s (%f)" % (f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]]))
61 | # l1.append(f+1)
62 | # l2.append(indices[f])
63 | # l3.append(feat_labels[indices[f]])
64 | # l4.append(importances[indices[f]])
65 | # feature_rank = pd.Dataframe(zip(l1,l2,l3,l4),columns=['id','indice','feature','importances'])
66 | # plotting
67 | indices = indices[0:top_n]
68 | plt.figure()
69 | plt.title("Feature importances top %d" % top_n)
70 | plt.bar(range(top_n), importances[indices],
71 | color="r", yerr=std[indices], align="center")
72 | plt.xticks(range(top_n), indices)
73 | plt.xlim([-1,top_n])
74 | plt.show()
75 |
76 | return model
--------------------------------------------------------------------------------
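Both helpers fit the model, print the full importance ranking, and plot the top features with inter-tree variability as error bars. A sketch on synthetic data:

    import pandas as pd
    from sklearn.datasets import make_classification
    from feature_selection import embedded_method as em

    X, y = make_classification(n_samples=500, n_features=10, n_informative=4,
                               random_state=0)
    X_train = pd.DataFrame(X, columns=['f%d' % i for i in range(10)])
    rf_model = em.rf_importance(X_train, y, max_depth=5, top_n=10)
    gbt_model = em.gbt_importance(X_train, y, max_depth=3, top_n=10)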
/feature_selection/feature_shuffle.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | #import numpy as np
3 |
4 |
5 | from sklearn.ensemble import RandomForestClassifier #, RandomForestRegressor
6 | from sklearn.metrics import roc_auc_score #, mean_squared_error
7 |
8 | # 2018.11.28 Created by Eamon.Zhang
9 |
10 |
11 | def feature_shuffle_rf(X_train,y_train,max_depth=None,class_weight=None,top_n=15,n_estimators=50,random_state=0):
12 |
13 | model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
14 | random_state=random_state,class_weight=class_weight,
15 | n_jobs=-1)
16 | model.fit(X_train, y_train)
17 | train_auc = roc_auc_score(y_train, (model.predict_proba(X_train))[:, 1])
18 | feature_dict = {}
19 |
20 | # selection logic
21 | for feature in X_train.columns:
22 | X_train_c = X_train.copy().reset_index(drop=True)
23 | y_train_c = y_train.copy().reset_index(drop=True)
24 |
25 | # shuffle individual feature
26 | X_train_c[feature] = X_train_c[feature].sample(frac=1,random_state=random_state).reset_index(
27 | drop=True)
28 | #print(X_train_c.isnull().sum())
29 | # make prediction with shuffled feature and calculate roc-auc
30 | shuff_auc = roc_auc_score(y_train_c,
31 | (model.predict_proba(X_train_c))[:, 1])
32 | #print(shuff_auc)
33 | # save the drop in roc-auc
34 | feature_dict[feature] = (train_auc - shuff_auc)
35 | #print(feature_dict)
36 |
37 | auc_drop = pd.Series(feature_dict).reset_index()
38 | auc_drop.columns = ['feature', 'auc_drop']
39 | auc_drop.sort_values(by=['auc_drop'], ascending=False, inplace=True)
40 | selected_features = auc_drop[auc_drop.auc_drop>0]['feature']
41 |
42 | return auc_drop, selected_features
43 |
44 |
--------------------------------------------------------------------------------
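The selector keeps every feature whose permutation lowers the train ROC-AUC; note that it measures performance on the training set, so the result is optimistic. A sketch:

    import pandas as pd
    from sklearn.datasets import make_classification
    from feature_selection import feature_shuffle as fs

    X, y = make_classification(n_samples=500, n_features=8, n_informative=3,
                               random_state=0)
    X_train = pd.DataFrame(X, columns=['f%d' % i for i in range(8)])
    y_train = pd.Series(y)
    auc_drop, selected = fs.feature_shuffle_rf(X_train, y_train, max_depth=5)
    print(selected.tolist())     # features whose shuffling hurts the model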
/feature_selection/filter_method.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | #from sklearn.feature_selection import VarianceThreshold
4 | from sklearn.feature_selection import mutual_info_classif,chi2
5 | from sklearn.feature_selection import SelectKBest, SelectPercentile
6 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
7 | from sklearn.metrics import roc_auc_score, mean_squared_error
8 |
9 | # 2018.11.17 Created by Eamon.Zhang
10 |
11 | def constant_feature_detect(data,threshold=0.98):
12 | """ detect features that show the same value for the
13 | majority/all of the observations (constant/quasi-constant features)
14 |
15 | Parameters
16 | ----------
17 | data : pd.Dataframe
18 | threshold : threshold to identify the variable as constant
19 |
20 | Returns
21 | -------
22 | list of variables names
23 | """
24 |
25 | data_copy = data.copy(deep=True)
26 | quasi_constant_feature = []
27 | for feature in data_copy.columns:
28 |         predominant = (data_copy[feature].value_counts() / float(
29 | len(data_copy))).sort_values(ascending=False).values[0]
30 | if predominant >= threshold:
31 | quasi_constant_feature.append(feature)
32 | print(len(quasi_constant_feature),' variables are found to be almost constant')
33 | return quasi_constant_feature
34 |
35 |
36 | def corr_feature_detect(data,threshold=0.8):
37 | """ detect highly-correlated features of a Dataframe
38 | Parameters
39 | ----------
40 | data : pd.Dataframe
41 | threshold : threshold to identify the variable correlated
42 |
43 | Returns
44 | -------
45 | pairs of correlated variables
46 | """
47 |
48 | corrmat = data.corr()
49 | corrmat = corrmat.abs().unstack() # absolute value of corr coef
50 | corrmat = corrmat.sort_values(ascending=False)
51 | corrmat = corrmat[corrmat >= threshold]
52 |     corrmat = corrmat[corrmat < 1] # remove the diagonal
53 | corrmat = pd.DataFrame(corrmat).reset_index()
54 | corrmat.columns = ['feature1', 'feature2', 'corr']
55 |
56 | grouped_feature_ls = []
57 | correlated_groups = []
58 |
59 | for feature in corrmat.feature1.unique():
60 | if feature not in grouped_feature_ls:
61 |
62 | # find all features correlated to a single feature
63 | correlated_block = corrmat[corrmat.feature1 == feature]
64 | grouped_feature_ls = grouped_feature_ls + list(
65 | correlated_block.feature2.unique()) + [feature]
66 |
67 | # append the block of features to the list
68 | correlated_groups.append(correlated_block)
69 | return correlated_groups
70 |
71 |
72 | def mutual_info(X,y,select_k=10):
73 |
74 | # mi = mutual_info_classif(X,y)
75 | # mi = pd.Series(mi)
76 | # mi.index = X.columns
77 | # mi.sort_values(ascending=False)
78 |
79 | if select_k >= 1:
80 | sel_ = SelectKBest(mutual_info_classif, k=select_k).fit(X,y)
81 | col = X.columns[sel_.get_support()]
82 |
83 | elif 0 < select_k < 1:
84 | sel_ = SelectPercentile(mutual_info_classif, percentile=select_k*100).fit(X,y)
85 | col = X.columns[sel_.get_support()]
86 |
87 | else:
88 | raise ValueError("select_k must be a positive number")
89 |
90 | return col
91 |
92 |
93 | # 2018.11.27 edit Chi-square test
94 | def chi_square_test(X,y,select_k=10):
95 |
96 | """
97 | Compute chi-squared stats between each non-negative feature and class.
98 | This score should be used to evaluate categorical variables in a classification task
99 | """
100 | if select_k >= 1:
101 | sel_ = SelectKBest(chi2, k=select_k).fit(X,y)
102 | col = X.columns[sel_.get_support()]
103 | elif 0 < select_k < 1:
104 | sel_ = SelectPercentile(chi2, percentile=select_k*100).fit(X,y)
105 | col = X.columns[sel_.get_support()]
106 | else:
107 | raise ValueError("select_k must be a positive number")
108 |
109 | return col
110 |
111 |
112 | def univariate_roc_auc(X_train,y_train,X_test,y_test,threshold):
113 |
114 | """
115 | First, it builds one decision tree per feature, to predict the target
116 | Second, it makes predictions using the decision tree and the mentioned feature
117 | Third, it ranks the features according to the machine learning metric (roc-auc or mse)
118 | It selects the highest ranked features
119 |
120 | """
121 | roc_values = []
122 | for feature in X_train.columns:
123 | clf = DecisionTreeClassifier()
124 | clf.fit(X_train[feature].to_frame(), y_train)
125 | y_scored = clf.predict_proba(X_test[feature].to_frame())
126 | roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
127 | roc_values = pd.Series(roc_values)
128 | roc_values.index = X_train.columns
129 | print(roc_values.sort_values(ascending=False))
130 |     print(len(roc_values[roc_values > threshold]),'out of the %s features are kept'% len(X_train.columns))
131 | keep_col = roc_values[roc_values > threshold]
132 | return keep_col
133 |
134 |
135 | def univariate_mse(X_train,y_train,X_test,y_test,threshold):
136 |
137 | """
138 | First, it builds one decision tree per feature, to predict the target
139 | Second, it makes predictions using the decision tree and the mentioned feature
140 | Third, it ranks the features according to the machine learning metric (roc-auc or mse)
141 | It selects the highest ranked features
142 |
143 | """
144 | mse_values = []
145 | for feature in X_train.columns:
146 | clf = DecisionTreeRegressor()
147 | clf.fit(X_train[feature].to_frame(), y_train)
148 | y_scored = clf.predict(X_test[feature].to_frame())
149 | mse_values.append(mean_squared_error(y_test, y_scored))
150 | mse_values = pd.Series(mse_values)
151 | mse_values.index = X_train.columns
152 | print(mse_values.sort_values(ascending=False))
153 |     print(len(mse_values[mse_values > threshold]),'out of the %s features are kept'% len(X_train.columns))
154 | keep_col = mse_values[mse_values > threshold]
155 | return keep_col
156 |
--------------------------------------------------------------------------------
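A sketch chaining several of the filters (chi_square_test is omitted because chi2 requires non-negative features, which make_classification does not guarantee):

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from feature_selection import filter_method as ft

    X, y = make_classification(n_samples=500, n_features=10, n_informative=4,
                               random_state=0)
    X = pd.DataFrame(X, columns=['f%d' % i for i in range(10)])
    y = pd.Series(y)

    quasi_constant = ft.constant_feature_detect(X, threshold=0.98)
    corr_groups = ft.corr_feature_detect(X, threshold=0.8)
    top5 = ft.mutual_info(X, y, select_k=5)

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
    kept = ft.univariate_roc_auc(X_tr, y_tr, X_te, y_te, threshold=0.6)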
/feature_selection/hybrid.py:
--------------------------------------------------------------------------------
1 | #import pandas as pd
2 | #import numpy as np
3 |
4 | from sklearn.ensemble import RandomForestClassifier #, RandomForestRegressor
5 | from sklearn.metrics import roc_auc_score #, mean_squared_error
6 |
7 | # 2018.12.02 Created by Eamon.Zhang
8 |
9 |
10 | def recursive_feature_elimination_rf(X_train,y_train,X_test,y_test,
11 | tol=0.001,max_depth=None,
12 | class_weight=None,
13 | top_n=15,n_estimators=50,random_state=0):
14 |
15 |
16 | features_to_remove = []
17 | count = 1
18 | # initial model using all the features
19 | model_all_features = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
20 | random_state=random_state,class_weight=class_weight,
21 | n_jobs=-1)
22 | model_all_features.fit(X_train, y_train)
23 | y_pred_test = model_all_features.predict_proba(X_test)[:, 1]
24 | auc_score_all = roc_auc_score(y_test, y_pred_test)
25 |
26 | for feature in X_train.columns:
27 | print()
28 | print('testing feature: ', feature, ' which is feature ', count,
29 | ' out of ', len(X_train.columns))
30 | count += 1
31 | model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
32 | random_state=random_state,class_weight=class_weight,
33 | n_jobs=-1)
34 |
35 | # fit model with all variables minus the removed features
36 | # and the feature to be evaluated
37 | model.fit(X_train.drop(features_to_remove + [feature], axis=1), y_train)
38 | y_pred_test = model.predict_proba(
39 | X_test.drop(features_to_remove + [feature], axis=1))[:, 1]
40 | auc_score_int = roc_auc_score(y_test, y_pred_test)
41 | print('New Test ROC AUC={}'.format((auc_score_int)))
42 |
43 | # print the original roc-auc with all the features
44 | print('All features Test ROC AUC={}'.format((auc_score_all)))
45 |
46 | # determine the drop in the roc-auc
47 | diff_auc = auc_score_all - auc_score_int
48 |
49 | # compare the drop in roc-auc with the tolerance
50 | if diff_auc >= tol:
51 | print('Drop in ROC AUC={}'.format(diff_auc))
52 | print('keep: ', feature)
53 |
54 | else:
55 | print('Drop in ROC AUC={}'.format(diff_auc))
56 | print('remove: ', feature)
57 |
58 | # if the drop in the roc is small and we remove the
59 | # feature, we need to set the new roc to the one based on
60 | # the remaining features
61 | auc_score_all = auc_score_int
62 |
63 | # and append the feature to remove to the list
64 | features_to_remove.append(feature)
65 | print('DONE!!')
66 | print('total features to remove: ', len(features_to_remove))
67 | features_to_keep = [x for x in X_train.columns if x not in features_to_remove]
68 | print('total features to keep: ', len(features_to_keep))
69 |
70 | return features_to_keep
71 |
72 |
73 | def recursive_feature_addition_rf(X_train,y_train,X_test,y_test,
74 | tol=0.001,max_depth=None,
75 | class_weight=None,
76 | top_n=15,n_estimators=50,random_state=0):
77 |
78 |
79 | features_to_keep = [X_train.columns[0]]
80 | count = 1
81 | # initial model using only one feature
82 | model_one_feature = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
83 | random_state=random_state,class_weight=class_weight,
84 | n_jobs=-1)
85 | model_one_feature.fit(X_train[[X_train.columns[0]]], y_train)
86 | y_pred_test = model_one_feature.predict_proba(X_test[[X_train.columns[0]]])[:, 1]
87 | auc_score_all = roc_auc_score(y_test, y_pred_test)
88 |
89 | for feature in X_train.columns[1:]:
90 | print()
91 | print('testing feature: ', feature, ' which is feature ', count,
92 | ' out of ', len(X_train.columns))
93 | count += 1
94 | model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
95 | random_state=random_state,class_weight=class_weight,
96 | n_jobs=-1)
97 |
98 | # fit model with the selected features
99 | # and the feature to be evaluated
100 | model.fit(X_train[features_to_keep + [feature]], y_train)
101 | y_pred_test = model.predict_proba(
102 | X_test[features_to_keep + [feature]])[:, 1]
103 | auc_score_int = roc_auc_score(y_test, y_pred_test)
104 |         print('New Test ROC AUC={}'.format(auc_score_int))
105 |
106 |         # print the roc-auc of the model with the features kept so far
107 |         print('Previous Test ROC AUC={}'.format(auc_score_all))
108 |
109 |         # determine the increase in the roc-auc
110 |         diff_auc = auc_score_int - auc_score_all
111 | 
112 |         # compare the increase in roc-auc with the tolerance
113 | if diff_auc >= tol:
114 | # if the increase in the roc is bigger than the threshold
115 | # we keep the feature and re-adjust the roc-auc to the new value
116 | # considering the added feature
117 | print('Increase in ROC AUC={}'.format(diff_auc))
118 | print('keep: ', feature)
119 | auc_score_all = auc_score_int
120 | features_to_keep.append(feature)
121 | else:
122 | print('Increase in ROC AUC={}'.format(diff_auc))
123 | print('remove: ', feature)
124 |
125 | print('DONE!!')
126 | print('total features to keep: ', len(features_to_keep))
127 |
128 | return features_to_keep
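129 | 
130 | # Minimal usage sketch (illustrative only; not part of the original module).
131 | # The dataset and split below are assumptions chosen for demonstration.
132 | #
133 | # if __name__ == '__main__':
134 | #     import pandas as pd
135 | #     from sklearn.datasets import load_breast_cancer
136 | #     from sklearn.model_selection import train_test_split
137 | #
138 | #     data = load_breast_cancer()
139 | #     X = pd.DataFrame(data.data, columns=data.feature_names)
140 | #     y = pd.Series(data.target)
141 | #     X_train, X_test, y_train, y_test = train_test_split(
142 | #         X, y, test_size=0.3, random_state=0)
143 | #
144 | #     # backward elimination: a feature is dropped when removing it
145 | #     # costs less than `tol` in test ROC-AUC
146 | #     keep_elim = recursive_feature_elimination_rf(
147 | #         X_train, y_train, X_test, y_test, tol=0.001)
148 | #
149 | #     # forward addition: a feature is kept when it improves the
150 | #     # test ROC-AUC by at least `tol`
151 | #     keep_add = recursive_feature_addition_rf(
152 | #         X_train, y_train, X_test, y_test, tol=0.001)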
--------------------------------------------------------------------------------
/images/001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/001.png
--------------------------------------------------------------------------------
/images/IV.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/IV.png
--------------------------------------------------------------------------------
/images/box-cox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/box-cox.png
--------------------------------------------------------------------------------
/images/embedded.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/embedded.png
--------------------------------------------------------------------------------
/images/featuretools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/featuretools.png
--------------------------------------------------------------------------------
/images/filter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/filter.png
--------------------------------------------------------------------------------
/images/scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/scaling.png
--------------------------------------------------------------------------------
/images/sphx_glr_plot_map_data_to_normal_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/sphx_glr_plot_map_data_to_normal_001.png
--------------------------------------------------------------------------------
/images/workflow2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/workflow2.png
--------------------------------------------------------------------------------
/images/wrapper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/images/wrapper.png
--------------------------------------------------------------------------------
/output/Barplot_Pclass_Survived.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Barplot_Pclass_Survived.png
--------------------------------------------------------------------------------
/output/Boxplot_Pclass_Fare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Boxplot_Pclass_Fare.png
--------------------------------------------------------------------------------
/output/Corr_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Corr_plot.png
--------------------------------------------------------------------------------
/output/Countplot_Pclass.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Countplot_Pclass.png
--------------------------------------------------------------------------------
/output/Distplot_Fare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Distplot_Fare.png
--------------------------------------------------------------------------------
/output/Heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Heatmap.png
--------------------------------------------------------------------------------
/output/Scatter_plot_Fare_Pclass.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/Amazing-Feature-Engineering/ba5eebe273d7ad30ced7dc0492fd4fd3637d33cf/output/Scatter_plot_Fare_Pclass.png
--------------------------------------------------------------------------------
/output/describe.csv:
--------------------------------------------------------------------------------
1 | ,Survived,Pclass,Sex,Age,SibSp,Fare
2 | count,891.0,891.0,891,714.0,891.0,891.0
3 | unique,,,2,,,
4 | top,,,male,,,
5 | freq,,,577,,,
6 | mean,0.3838383838383838,2.308641975308642,,29.69911764705882,0.5230078563411896,32.2042079685746
7 | std,0.4865924542648585,0.8360712409770513,,14.526497332334044,1.1027434322934275,49.693428597180905
8 | min,0.0,1.0,,0.42,0.0,0.0
9 | 25%,0.0,2.0,,20.125,0.0,7.9104
10 | 50%,0.0,3.0,,28.0,0.0,14.4542
11 | 75%,1.0,3.0,,38.0,1.0,31.0
12 | max,1.0,3.0,,80.0,8.0,512.3292
13 |
--------------------------------------------------------------------------------
/output/missing.csv:
--------------------------------------------------------------------------------
1 | ,total missing,proportion
2 | Survived,0,0.0
3 | Pclass,0,0.0
4 | Sex,0,0.0
5 | Age,177,0.19865319865319866
6 | SibSp,0,0.0
7 | Fare,0,0.0
8 |
--------------------------------------------------------------------------------