├── GUARDE_LOS_DATASETS_AQUI.txt
├── LICENSE
├── .github
    └── FUNDING.yml
├── LogoUdemy.png
├── trainindata.png
├── .gitignore
├── Section-08-Discretizacion
    ├── tree_visualisation.png
    └── tree_model.txt
├── README.md
├── Seccion-01-Introduccion
    └── 01.11-Sets-de-datos.ipynb
├── Seccion-04-Sustitucion-Datos-Faltantes
    ├── 04.20_SustitucionMuestraAleatoria_FeatureEngine.ipynb
    ├── 04.18_SustitucionModa_FeatureEngine.ipynb
    ├── 04.19_SustitucionCategoriaAdicional_FeatureEngine.ipynb
    ├── 04.15_SustitucionMediaMediana_FeatureEngine.ipynb
    ├── 04.13_IndicadorAusencia_Sklearn.ipynb
    └── 04.12_SustitucionCategoriaAdicional_Sklearn.ipynb
├── Section-09-Ingenieria-valores-extremos
    └── 09.05-Truncamiento-valores-arbitrarios.ipynb
├── Seccion-02-Tipos-de-Variables
    └── 02.4_VariablesMixtas.ipynb
└── Section-06-Codificacion-Variables-Categoricas
    └── 06.04_Codificacion-frecuencia.ipynb


/GUARDE_LOS_DATASETS_AQUI.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/ingenieria-de-variables/HEAD/LICENSE


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | github: [solegalli]
4 | 


--------------------------------------------------------------------------------
/LogoUdemy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/ingenieria-de-variables/HEAD/LogoUdemy.png


--------------------------------------------------------------------------------
/trainindata.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/ingenieria-de-variables/HEAD/trainindata.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Jupyter Notebook
2 | .ipynb_checkpoints
3 | 
4 | # datasets
5 | *.csv
6 | 
7 | # other files
8 | .DS_Store
9 | 


--------------------------------------------------------------------------------
/Section-08-Discretizacion/tree_visualisation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/ingenieria-de-variables/HEAD/Section-08-Discretizacion/tree_visualisation.png


--------------------------------------------------------------------------------
/Section-08-Discretizacion/tree_model.txt:
--------------------------------------------------------------------------------
 1 | digraph Tree {
 2 | node [shape=box] ;
 3 | 0 [label="X[0] <= 64.5\ngini = 0.474\nsamples = 916\nvalue = [563, 353]"] ;
 4 | 1 [label="X[0] <= 8.5\ngini = 0.475\nsamples = 907\nvalue = [554, 353]"] ;
 5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
 6 | 2 [label="X[0] <= 0.458\ngini = 0.497\nsamples = 52\nvalue = [24, 28]"] ;
 7 | 1 -> 2 ;
 8 | 3 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
 9 | 2 -> 3 ;
10 | 4 [label="gini = 0.498\nsamples = 51\nvalue = [24, 27]"] ;
11 | 2 -> 4 ;
12 | 5 [label="X[0] <= 44.5\ngini = 0.471\nsamples = 855\nvalue = [530, 325]"] ;
13 | 1 -> 5 ;
14 | 6 [label="gini = 0.464\nsamples = 713\nvalue = [452, 261]"] ;
15 | 5 -> 6 ;
16 | 7 [label="gini = 0.495\nsamples = 142\nvalue = [78, 64]"] ;
17 | 5 -> 7 ;
18 | 8 [label="gini = 0.0\nsamples = 9\nvalue = [9, 0]"] ;
19 | 0 -> 8 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
20 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ![PythonVersion](https://img.shields.io/badge/python-3.6%20|3.7%20|%203.8%20|%203.9-success)
  2 | [![License https://github.com/solegalli/ingenieria-de-variables/blob/master/LICENSE](https://img.shields.io/badge/license-BSD-success.svg)](https://github.com/solegalli/ingenieria-de-variables/blob/master/LICENSE)
  3 | [![Sponsorship https://www.trainindata.com/](https://img.shields.io/badge/Powered%20By-TrainInData-orange.svg)](https://www.trainindata.com/)
  4 | 
  5 | ## Ingeniería de Variables para Machine Learning - Código
  6 | 
  7 | Publicado en Junio de 2020
  8 | 
  9 | El código no se actualiza.
 10 | 
 11 | Videos en español: [Ingeniería de Variables, Playlist en YouTube](https://www.youtube.com/watch?v=fmAUVceuQu4&list=PL_7uaHXkQmKU6JyThyqyUUZdCYqJJ9SeO)
 12 | 
 13 | Curso Original en Inglés: [Feature Engineering for Machine Learning](https://www.trainindata.com/p/feature-engineering-for-machine-learning)
 14 | 
 15 | [<img src="./trainindata.png" width="248">](https://www.trainindata.com/?lang=es)
 16 | 
 17 | 
 18 | ## Links
 19 | 
 20 | - [Curso Online en Inglés](https://www.trainindata.com/p/feature-engineering-for-machine-learning)
 21 | - [Lista de videos en Español](https://www.youtube.com/watch?v=fmAUVceuQu4&list=PL_7uaHXkQmKU6JyThyqyUUZdCYqJJ9SeO)
 22 | 
 23 | 
 24 | ## Tabla de Contenidos
 25 | 
 26 | **Todas las técnicas aplicadas con Pandas, Scikit-learn y Feature-engine**
 27 | 
 28 | 1. **Tipos de variables**
 29 | 	1. Numéricas
 30 | 	2. Categóricas
 31 | 	3. Fecha y hora
 32 | 	4. Mixtas
 33 | 
 34 | 2. **Características de las variables**
 35 | 	1. Datos ausentes 
 36 | 	2. Cardinalidad
 37 | 	3. Etiquetas raras
 38 | 	4. Supuestos de los modelos
 39 | 	5. Valores extremos
 40 | 	6. Escala de las variables
 41 | 
 42 | 3. **Sustitución de datos faltantes**
 43 | 	1. Análisis de Casos Completos
 44 | 	2. Imputación con la media y la mediana
 45 | 	3. Sustitución con valor arbitrario
 46 | 	4. Imputación con valor al final de la distribución
 47 | 	5. Sustitución con la categoría más frecuente (moda)
 48 | 	6. Imputación con categoría adicional
 49 | 	7. Imputación aleatoria
 50 | 	8. Agregado de indicador de ausencia
 51 | 	9. Secuencia de imputación
 52 | 
 53 | 
 54 | 4. **Codificación de variables categóricas**
 55 | 	1. Codificación One Hot
 56 | 	2. Codificación One Hot de categorías frecuentes
 57 | 	3. Codificación Ordinal
 58 | 	4. Codificación con cuentas o frecuencias
 59 | 	5. Codificación ordinal ordenada
 60 | 	6. Codificación con la media de la variable de respuesta
 61 | 	7. Codificación con tasa de probabilidad
 62 | 	8. Peso de la evidencia
 63 | 	9. Manejo de etiquetas raras
 64 | 
 65 | 5. **Transformación de variables numéricas**
 66 | 	1. Transformación Logarítmica
 67 | 	2. Transformación de Potencia
 68 | 	3. Transformación Recíproca
 69 | 	4. Transformación de BoxCox
 70 | 	5. Transformación de Yeo-Johnson
 71 | 
 72 | 6. **Discretización**
 73 | 	1. Discretización con intervalos de igual rango 
 74 | 	2. Discretización con intervalos de igual frecuencia
 75 | 	3. Discretización arbitraria
 76 | 	4. Discretización con árboles de decisión
 77 | 
 78 | 7. **Datos Extremos**
 79 | 	1. Remoción de datos extremos
 80 | 	2. Truncamiento 
 81 | 	3. Winsorización
 82 | 
 83 | 8. **Escalamiento de variables**
 84 | 	1. Estandarización
 85 | 	2. Escalamiento por la media
 86 | 	3. Escalamiento al mínimo y máximo valor
 87 | 	4. Escalamiento al máximo absoluto
 88 | 	5. Escalamiento con mediana y rango entre-cuartil
 89 | 	6. Normalización a la norma del vector
 90 | 
 91 | 9. **Variables mixtas**
 92 | 	1. Separación en componente numérico y componente categórico
 93 | 
 94 | 10. **Variables de fecha y hora**
 95 | 	1. Extracción de componentes de día, mes y año
 96 | 	2. Extracción de hora, minutos y segundos
 97 | 	3. Captura de tiempo transcurrido
 98 | 	4. Manejo de zonas horarias
 99 | 
100 | 11. **Ensamblado de flujos de aprendizaje automático**
101 | 	1. Regresión
102 | 	2. Clasificación
103 | 
104 | - [Curso Online en Inglés](https://www.trainindata.com/p/feature-engineering-for-machine-learning)
105 | - [Lista de videos en Español](https://www.youtube.com/watch?v=fmAUVceuQu4&list=PL_7uaHXkQmKU6JyThyqyUUZdCYqJJ9SeO)
106 | 


--------------------------------------------------------------------------------
/Seccion-01-Introduccion/01.11-Sets-de-datos.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Ejemplo 1: Préstamos Peer-to-Peer (Finanzas)\n",
  8 |     "\n",
  9 |     "\n",
 10 |     "### El Modelo de Negocio\n",
 11 |     "\n",
 12 |     "Los préstamos Peer-to-peer (abreviado P2P) ocurren cuando inversores prestan dinero directamente a personas o negocios a través de una plataforma online. La plataforma online pone en contacto de manera digital y automática a los prestatarios con los inversores, y conduce también el análisis crediticio y de riesgo necesario para determinar el riesgo del préstamo y la tasa de interés adecuada. Suele haber un menor costo de operación en los préstamos P2P, por esto los inversores obtienen retornos más altos, y los prestatarios intereses más bajos. Aunque hoy en día, esto ya no suele ser siempre así.\n",
 13 |     "\n",
 14 |     "\n",
 15 |     "### El set de datos\n",
 16 |     "\n",
 17 |     "El set de datos simulado que creamos para este curso contiene datos sobre préstamos desembolsados por una compañía ficticia de peer to peer. Las variables incluyen el estado actual del préstamo al crearse el set de datos, e información acerca de pagos, así como también información acerca de la situación financiera y otros datos acerca del prestatario. \n",
 18 |     "El set de datos viene junto con los Jupyter notebooks que bajaste en la sección anterior."
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "## Ejemplo 2: Predecir el precio de venta de casas\n",
 26 |     "\n",
 27 |     "En este set de datos, tenemos variables con características de las casas y los barrios en donde se encuentran localizadas, y el objetivo es predecir el precio de venta en base a estas variables. Predecir el precio de venta suele ser un dato útil para anticipar áreas en donde hacer futuras inversiones.\n",
 28 |     "\n",
 29 |     "### Bajar y guardar\n",
 30 |     "\n",
 31 |     "Para bajar el set de datos haz lo siguiente:\n",
 32 |     "\n",
 33 |     "- Visita la [página web House Sale Price competition](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)\n",
 34 |     "- Navega hacia abajo y haz clic en “train.csv” y luego en el botón de “Download” a la derecha\n",
 35 |     "- Cámbiale el nombre al set de datos a “houseprice.csv”\n",
 36 |     "- Guarda el set de datos en la carpeta que contiene los Jupyter notebooks, en donde ves el archivo “GUARDE_LOS_DATASETS_AQUI.txt”.\n"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "## Ejemplo 3: Predecir sobrevivencia en el Titanic\n",
 44 |     "\n",
 45 |     "### Historia\n",
 46 |     "Como probablemente sabes, el hundimiento del Titanic fue un lamentable hecho en el que el barco chocó con un témpano de hielo y se hundió, terminando con la vida de 1502 de sus 2224 pasajeros. Es curioso cómo el análisis de las características de los pasajeros revela datos interesantes acerca de quiénes fueron priorizados al momento del salvataje, siendo la mayoría de los sobrevivientes mujeres y niños de clase alta. \n",
 47 |     "\n",
 48 |     "### Para generar el set de datos, sigue estas instrucciones"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 1,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "import pandas as pd\n",
 58 |     "import numpy as np"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 2,
 64 |    "metadata": {},
 65 |    "outputs": [
 66 |     {
 67 |      "data": {
 68 |       "text/html": [
 69 |        "<div>\n",
 70 |        "<style scoped>\n",
 71 |        "    .dataframe tbody tr th:only-of-type {\n",
 72 |        "        vertical-align: middle;\n",
 73 |        "    }\n",
 74 |        "\n",
 75 |        "    .dataframe tbody tr th {\n",
 76 |        "        vertical-align: top;\n",
 77 |        "    }\n",
 78 |        "\n",
 79 |        "    .dataframe thead th {\n",
 80 |        "        text-align: right;\n",
 81 |        "    }\n",
 82 |        "</style>\n",
 83 |        "<table border=\"1\" class=\"dataframe\">\n",
 84 |        "  <thead>\n",
 85 |        "    <tr style=\"text-align: right;\">\n",
 86 |        "      <th></th>\n",
 87 |        "      <th>pclass</th>\n",
 88 |        "      <th>survived</th>\n",
 89 |        "      <th>name</th>\n",
 90 |        "      <th>sex</th>\n",
 91 |        "      <th>age</th>\n",
 92 |        "      <th>sibsp</th>\n",
 93 |        "      <th>parch</th>\n",
 94 |        "      <th>ticket</th>\n",
 95 |        "      <th>fare</th>\n",
 96 |        "      <th>cabin</th>\n",
 97 |        "      <th>embarked</th>\n",
 98 |        "      <th>boat</th>\n",
 99 |        "      <th>body</th>\n",
100 |        "      <th>home.dest</th>\n",
101 |        "    </tr>\n",
102 |        "  </thead>\n",
103 |        "  <tbody>\n",
104 |        "    <tr>\n",
105 |        "      <th>0</th>\n",
106 |        "      <td>1</td>\n",
107 |        "      <td>1</td>\n",
108 |        "      <td>Allen, Miss. Elisabeth Walton</td>\n",
109 |        "      <td>female</td>\n",
110 |        "      <td>29</td>\n",
111 |        "      <td>0</td>\n",
112 |        "      <td>0</td>\n",
113 |        "      <td>24160</td>\n",
114 |        "      <td>211.3375</td>\n",
115 |        "      <td>B5</td>\n",
116 |        "      <td>S</td>\n",
117 |        "      <td>2</td>\n",
118 |        "      <td>?</td>\n",
119 |        "      <td>St Louis, MO</td>\n",
120 |        "    </tr>\n",
121 |        "    <tr>\n",
122 |        "      <th>1</th>\n",
123 |        "      <td>1</td>\n",
124 |        "      <td>1</td>\n",
125 |        "      <td>Allison, Master. Hudson Trevor</td>\n",
126 |        "      <td>male</td>\n",
127 |        "      <td>0.9167</td>\n",
128 |        "      <td>1</td>\n",
129 |        "      <td>2</td>\n",
130 |        "      <td>113781</td>\n",
131 |        "      <td>151.55</td>\n",
132 |        "      <td>C22 C26</td>\n",
133 |        "      <td>S</td>\n",
134 |        "      <td>11</td>\n",
135 |        "      <td>?</td>\n",
136 |        "      <td>Montreal, PQ / Chesterville, ON</td>\n",
137 |        "    </tr>\n",
138 |        "    <tr>\n",
139 |        "      <th>2</th>\n",
140 |        "      <td>1</td>\n",
141 |        "      <td>0</td>\n",
142 |        "      <td>Allison, Miss. Helen Loraine</td>\n",
143 |        "      <td>female</td>\n",
144 |        "      <td>2</td>\n",
145 |        "      <td>1</td>\n",
146 |        "      <td>2</td>\n",
147 |        "      <td>113781</td>\n",
148 |        "      <td>151.55</td>\n",
149 |        "      <td>C22 C26</td>\n",
150 |        "      <td>S</td>\n",
151 |        "      <td>?</td>\n",
152 |        "      <td>?</td>\n",
153 |        "      <td>Montreal, PQ / Chesterville, ON</td>\n",
154 |        "    </tr>\n",
155 |        "    <tr>\n",
156 |        "      <th>3</th>\n",
157 |        "      <td>1</td>\n",
158 |        "      <td>0</td>\n",
159 |        "      <td>Allison, Mr. Hudson Joshua Creighton</td>\n",
160 |        "      <td>male</td>\n",
161 |        "      <td>30</td>\n",
162 |        "      <td>1</td>\n",
163 |        "      <td>2</td>\n",
164 |        "      <td>113781</td>\n",
165 |        "      <td>151.55</td>\n",
166 |        "      <td>C22 C26</td>\n",
167 |        "      <td>S</td>\n",
168 |        "      <td>?</td>\n",
169 |        "      <td>135</td>\n",
170 |        "      <td>Montreal, PQ / Chesterville, ON</td>\n",
171 |        "    </tr>\n",
172 |        "    <tr>\n",
173 |        "      <th>4</th>\n",
174 |        "      <td>1</td>\n",
175 |        "      <td>0</td>\n",
176 |        "      <td>Allison, Mrs. Hudson J C (Bessie Waldo Daniels)</td>\n",
177 |        "      <td>female</td>\n",
178 |        "      <td>25</td>\n",
179 |        "      <td>1</td>\n",
180 |        "      <td>2</td>\n",
181 |        "      <td>113781</td>\n",
182 |        "      <td>151.55</td>\n",
183 |        "      <td>C22 C26</td>\n",
184 |        "      <td>S</td>\n",
185 |        "      <td>?</td>\n",
186 |        "      <td>?</td>\n",
187 |        "      <td>Montreal, PQ / Chesterville, ON</td>\n",
188 |        "    </tr>\n",
189 |        "  </tbody>\n",
190 |        "</table>\n",
191 |        "</div>"
192 |       ],
193 |       "text/plain": [
194 |        "   pclass  survived                                             name     sex  \\\n",
195 |        "0       1         1                    Allen, Miss. Elisabeth Walton  female   \n",
196 |        "1       1         1                   Allison, Master. Hudson Trevor    male   \n",
197 |        "2       1         0                     Allison, Miss. Helen Loraine  female   \n",
198 |        "3       1         0             Allison, Mr. Hudson Joshua Creighton    male   \n",
199 |        "4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   \n",
200 |        "\n",
201 |        "      age  sibsp  parch  ticket      fare    cabin embarked boat body  \\\n",
202 |        "0      29      0      0   24160  211.3375       B5        S    2    ?   \n",
203 |        "1  0.9167      1      2  113781    151.55  C22 C26        S   11    ?   \n",
204 |        "2       2      1      2  113781    151.55  C22 C26        S    ?    ?   \n",
205 |        "3      30      1      2  113781    151.55  C22 C26        S    ?  135   \n",
206 |        "4      25      1      2  113781    151.55  C22 C26        S    ?    ?   \n",
207 |        "\n",
208 |        "                         home.dest  \n",
209 |        "0                     St Louis, MO  \n",
210 |        "1  Montreal, PQ / Chesterville, ON  \n",
211 |        "2  Montreal, PQ / Chesterville, ON  \n",
212 |        "3  Montreal, PQ / Chesterville, ON  \n",
213 |        "4  Montreal, PQ / Chesterville, ON  "
214 |       ]
215 |      },
216 |      "execution_count": 2,
217 |      "metadata": {},
218 |      "output_type": "execute_result"
219 |     }
220 |    ],
221 |    "source": [
222 |     "data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')\n",
223 |     "data.head()"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 3,
229 |    "metadata": {},
230 |    "outputs": [
231 |     {
232 |      "data": {
233 |       "text/plain": [
234 |        "pclass          0\n",
235 |        "survived        0\n",
236 |        "name            0\n",
237 |        "sex             0\n",
238 |        "age           263\n",
239 |        "sibsp           0\n",
240 |        "parch           0\n",
241 |        "ticket          0\n",
242 |        "fare            1\n",
243 |        "cabin        1014\n",
244 |        "embarked        2\n",
245 |        "boat          823\n",
246 |        "body         1188\n",
247 |        "home.dest     564\n",
248 |        "dtype: int64"
249 |       ]
250 |      },
251 |      "execution_count": 3,
252 |      "metadata": {},
253 |      "output_type": "execute_result"
254 |     }
255 |    ],
256 |    "source": [
257 |     "data = data.replace('?', np.nan)\n",
258 |     "data.isnull().sum()"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": 4,
264 |    "metadata": {},
265 |    "outputs": [],
266 |    "source": [
267 |     "def get_first_cabin(row):\n",
268 |     "    try:\n",
269 |     "        return row.split()[0]\n",
270 |     "    except:\n",
271 |     "        return np.nan "
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": 5,
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "data['cabin'] = data['cabin'].apply(get_first_cabin)"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": 6,
286 |    "metadata": {},
287 |    "outputs": [],
288 |    "source": [
289 |     "data.to_csv('../titanic.csv', index=False)"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "markdown",
294 |    "metadata": {},
295 |    "source": [
296 |     "**Atención**\n",
297 |     "\n",
298 |     "Si ejecutas este Jupyter notebook desde donde se localiza dentro de la carpeta que contiene los Jupyter notebooks, el set de datos del Titanic se guardará en la carpeta adecuada.\n",
299 |     "\n",
300 |     "Si no, asegúrate de que esté guardado en la carpeta que contiene los Jupyter notebooks, en donde ves el archivo GUARDE_LOS_DATASETS_AQUI.txt\n"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": null,
306 |    "metadata": {},
307 |    "outputs": [],
308 |    "source": []
309 |   }
310 |  ],
311 |  "metadata": {
312 |   "kernelspec": {
313 |    "display_name": "Python 3",
314 |    "language": "python",
315 |    "name": "python3"
316 |   },
317 |   "toc": {
318 |    "base_numbering": 1,
319 |    "nav_menu": {},
320 |    "number_sections": true,
321 |    "sideBar": true,
322 |    "skip_h1_title": false,
323 |    "title_cell": "Table of Contents",
324 |    "title_sidebar": "Contents",
325 |    "toc_cell": false,
326 |    "toc_position": {},
327 |    "toc_section_display": true,
328 |    "toc_window_display": true
329 |   }
330 |  },
331 |  "nbformat": 4,
332 |  "nbformat_minor": 2
333 | }
334 | 


--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.20_SustitucionMuestraAleatoria_FeatureEngine.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Sustitución por Muestra Aleatoria ==> Feature-engine\n",
  8 |     "\n",
  9 |     "### ¿Qué es Feature-engine?\n",
 10 |     "\n",
 11 |     "Feature-engine es una librería de Python que hemos creado para este curso. \n",
 12 |     "\n",
 13 |     "- Feature-engine incluye todas las técnicas de ingeniería de variables descritas en este curso\n",
 14 |     "- Feature-engine funciona como Scikit-learn, por lo tanto es fácil de aprender\n",
 15 |     "- Feature-engine te permite implementar pasos de ingeniería de variables específicos para diferentes grupos de variables\n",
 16 |     "- Feature-engine puede ser integrado con las pipelines de Scikit-learn, permitiendo construir modelos fácilmente\n",
 17 |     "**Feature-engine te permite diseñar y guardar un flujo de ingeniería de variables con procesos diseñados específicamente para diferentes grupos de variables.**\n",
 18 |     "\n",
 19 |     "-------------------------------------------------------------------\n",
 20 |     "Feature-engine puede ser instalado vía pip ==> pip install feature-engine\n",
 21 |     "\n",
 22 |     "- Asegúrate de que has instalado Feature-engine antes de correr este notebook\n",
 23 |     "\n",
 24 |     "Para más detalle visita el [sitio web de Train in Data](https://www.trainindata.com/feature-engine) \n",
 25 |     "\n",
 26 |     "\n",
 27 |     "## En este demo:\n",
 28 |     "\n",
 29 |     "Vamos a usar **Feature-engine para hacer la sustitución por muestra aleatoria** usando los datos Ames House Price.\n",
 30 |     "\n",
 31 |     "- Para bajar los datos, por favor referirse a la clase **Datasets** en la  **Sección 1** del curso.\n",
 32 |     "\n",
 33 |     "### Nota: \n",
 34 |     "* 'Imputer' se deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase.\n"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 1,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import pandas as pd\n",
 44 |     "import numpy as np\n",
 45 |     "\n",
 46 |     "import matplotlib.pyplot as plt\n",
 47 |     "\n",
 48 |     "from sklearn.model_selection import train_test_split\n",
 49 |     "from sklearn.pipeline import Pipeline\n",
 50 |     "\n",
 51 |     "#  feature engine\n",
 52 |     "from feature_engine import imputation as mdi"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 2,
 58 |    "metadata": {},
 59 |    "outputs": [
 60 |     {
 61 |      "data": {
 62 |       "text/html": [
 63 |        "<div>\n",
 64 |        "<style scoped>\n",
 65 |        "    .dataframe tbody tr th:only-of-type {\n",
 66 |        "        vertical-align: middle;\n",
 67 |        "    }\n",
 68 |        "\n",
 69 |        "    .dataframe tbody tr th {\n",
 70 |        "        vertical-align: top;\n",
 71 |        "    }\n",
 72 |        "\n",
 73 |        "    .dataframe thead th {\n",
 74 |        "        text-align: right;\n",
 75 |        "    }\n",
 76 |        "</style>\n",
 77 |        "<table border=\"1\" class=\"dataframe\">\n",
 78 |        "  <thead>\n",
 79 |        "    <tr style=\"text-align: right;\">\n",
 80 |        "      <th></th>\n",
 81 |        "      <th>LotFrontage</th>\n",
 82 |        "      <th>MasVnrArea</th>\n",
 83 |        "      <th>BsmtQual</th>\n",
 84 |        "      <th>FireplaceQu</th>\n",
 85 |        "      <th>GarageYrBlt</th>\n",
 86 |        "      <th>SalePrice</th>\n",
 87 |        "    </tr>\n",
 88 |        "  </thead>\n",
 89 |        "  <tbody>\n",
 90 |        "    <tr>\n",
 91 |        "      <th>0</th>\n",
 92 |        "      <td>65.0</td>\n",
 93 |        "      <td>196.0</td>\n",
 94 |        "      <td>Gd</td>\n",
 95 |        "      <td>NaN</td>\n",
 96 |        "      <td>2003.0</td>\n",
 97 |        "      <td>208500</td>\n",
 98 |        "    </tr>\n",
 99 |        "    <tr>\n",
100 |        "      <th>1</th>\n",
101 |        "      <td>80.0</td>\n",
102 |        "      <td>0.0</td>\n",
103 |        "      <td>Gd</td>\n",
104 |        "      <td>TA</td>\n",
105 |        "      <td>1976.0</td>\n",
106 |        "      <td>181500</td>\n",
107 |        "    </tr>\n",
108 |        "    <tr>\n",
109 |        "      <th>2</th>\n",
110 |        "      <td>68.0</td>\n",
111 |        "      <td>162.0</td>\n",
112 |        "      <td>Gd</td>\n",
113 |        "      <td>TA</td>\n",
114 |        "      <td>2001.0</td>\n",
115 |        "      <td>223500</td>\n",
116 |        "    </tr>\n",
117 |        "    <tr>\n",
118 |        "      <th>3</th>\n",
119 |        "      <td>60.0</td>\n",
120 |        "      <td>0.0</td>\n",
121 |        "      <td>TA</td>\n",
122 |        "      <td>Gd</td>\n",
123 |        "      <td>1998.0</td>\n",
124 |        "      <td>140000</td>\n",
125 |        "    </tr>\n",
126 |        "    <tr>\n",
127 |        "      <th>4</th>\n",
128 |        "      <td>84.0</td>\n",
129 |        "      <td>350.0</td>\n",
130 |        "      <td>Gd</td>\n",
131 |        "      <td>TA</td>\n",
132 |        "      <td>2000.0</td>\n",
133 |        "      <td>250000</td>\n",
134 |        "    </tr>\n",
135 |        "  </tbody>\n",
136 |        "</table>\n",
137 |        "</div>"
138 |       ],
139 |       "text/plain": [
140 |        "   LotFrontage  MasVnrArea BsmtQual FireplaceQu  GarageYrBlt  SalePrice\n",
141 |        "0         65.0       196.0       Gd         NaN       2003.0     208500\n",
142 |        "1         80.0         0.0       Gd          TA       1976.0     181500\n",
143 |        "2         68.0       162.0       Gd          TA       2001.0     223500\n",
144 |        "3         60.0         0.0       TA          Gd       1998.0     140000\n",
145 |        "4         84.0       350.0       Gd          TA       2000.0     250000"
146 |       ]
147 |      },
148 |      "execution_count": 2,
149 |      "metadata": {},
150 |      "output_type": "execute_result"
151 |     }
152 |    ],
153 |    "source": [
154 |     "# carguemos los datos con unas columnas seleccionadas\n",
155 |     "\n",
156 |     "cols_to_use = [\n",
157 |     "    'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
158 |     "    'SalePrice'\n",
159 |     "]\n",
160 |     "\n",
161 |     "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
162 |     "data.head()"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 3,
168 |    "metadata": {},
169 |    "outputs": [
170 |     {
171 |      "data": {
172 |       "text/plain": [
173 |        "LotFrontage    0.177397\n",
174 |        "MasVnrArea     0.005479\n",
175 |        "BsmtQual       0.025342\n",
176 |        "FireplaceQu    0.472603\n",
177 |        "GarageYrBlt    0.055479\n",
178 |        "SalePrice      0.000000\n",
179 |        "dtype: float64"
180 |       ]
181 |      },
182 |      "execution_count": 3,
183 |      "metadata": {},
184 |      "output_type": "execute_result"
185 |     }
186 |    ],
187 |    "source": [
188 |     "data.isnull().mean()"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 4,
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "data": {
198 |       "text/plain": [
199 |        "((1022, 5), (438, 5))"
200 |       ]
201 |      },
202 |      "execution_count": 4,
203 |      "metadata": {},
204 |      "output_type": "execute_result"
205 |     }
206 |    ],
207 |    "source": [
208 |     "# separar datos en segmentos entrenamiento y prueba\n",
209 |     "\n",
210 |     "# primero, separemos el target (SalePrice) del resto de las variables\n",
211 |     "\n",
212 |     "cols_to_use.remove('SalePrice')\n",
213 |     "\n",
214 |     "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
215 |     "                                                    data['SalePrice'],\n",
216 |     "                                                    test_size=0.3,\n",
217 |     "                                                    random_state=0)\n",
218 |     "X_train.shape, X_test.shape"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "## Feature-engine Random Sampler por defecto captura todas las variables"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 5,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "# llamemos el imputer de Feature-engine\n",
235 |     "# no necesitamos especificar nada\n",
236 |     "\n",
237 |     "imputer = mdi.RandomSampleImputer(random_state = 29)"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 6,
243 |    "metadata": {},
244 |    "outputs": [
245 |     {
246 |      "data": {
247 |       "text/plain": [
248 |        "RandomSampleImputer(random_state=29,\n",
249 |        "                    variables=['BsmtQual', 'FireplaceQu', 'LotFrontage',\n",
250 |        "                               'MasVnrArea', 'GarageYrBlt'])"
251 |       ]
252 |      },
253 |      "execution_count": 6,
254 |      "metadata": {},
255 |      "output_type": "execute_result"
256 |     }
257 |    ],
258 |    "source": [
259 |     "# ajustemos el imputer\n",
260 |     "\n",
261 |     "imputer.fit(X_train)"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 7,
267 |    "metadata": {},
268 |    "outputs": [
269 |     {
270 |      "data": {
271 |       "text/plain": [
272 |        "['BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']"
273 |       ]
274 |      },
275 |      "execution_count": 7,
276 |      "metadata": {},
277 |      "output_type": "execute_result"
278 |     }
279 |    ],
280 |    "source": [
281 |     "# vemos que el imputer encontró las variables categóricas \n",
282 |     "# y numéricas\n",
283 |     "\n",
284 |     "imputer.variables"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": 8,
290 |    "metadata": {},
291 |    "outputs": [
292 |     {
293 |      "data": {
294 |       "text/html": [
295 |        "<div>\n",
296 |        "<style scoped>\n",
297 |        "    .dataframe tbody tr th:only-of-type {\n",
298 |        "        vertical-align: middle;\n",
299 |        "    }\n",
300 |        "\n",
301 |        "    .dataframe tbody tr th {\n",
302 |        "        vertical-align: top;\n",
303 |        "    }\n",
304 |        "\n",
305 |        "    .dataframe thead th {\n",
306 |        "        text-align: right;\n",
307 |        "    }\n",
308 |        "</style>\n",
309 |        "<table border=\"1\" class=\"dataframe\">\n",
310 |        "  <thead>\n",
311 |        "    <tr style=\"text-align: right;\">\n",
312 |        "      <th></th>\n",
313 |        "      <th>BsmtQual</th>\n",
314 |        "      <th>FireplaceQu</th>\n",
315 |        "      <th>LotFrontage</th>\n",
316 |        "      <th>MasVnrArea</th>\n",
317 |        "      <th>GarageYrBlt</th>\n",
318 |        "    </tr>\n",
319 |        "  </thead>\n",
320 |        "  <tbody>\n",
321 |        "    <tr>\n",
322 |        "      <th>64</th>\n",
323 |        "      <td>Gd</td>\n",
324 |        "      <td>NaN</td>\n",
325 |        "      <td>NaN</td>\n",
326 |        "      <td>573.0</td>\n",
327 |        "      <td>1998.0</td>\n",
328 |        "    </tr>\n",
329 |        "    <tr>\n",
330 |        "      <th>682</th>\n",
331 |        "      <td>Gd</td>\n",
332 |        "      <td>Gd</td>\n",
333 |        "      <td>NaN</td>\n",
334 |        "      <td>0.0</td>\n",
335 |        "      <td>1996.0</td>\n",
336 |        "    </tr>\n",
337 |        "    <tr>\n",
338 |        "      <th>960</th>\n",
339 |        "      <td>TA</td>\n",
340 |        "      <td>NaN</td>\n",
341 |        "      <td>50.0</td>\n",
342 |        "      <td>0.0</td>\n",
343 |        "      <td>NaN</td>\n",
344 |        "    </tr>\n",
345 |        "    <tr>\n",
346 |        "      <th>1384</th>\n",
347 |        "      <td>TA</td>\n",
348 |        "      <td>NaN</td>\n",
349 |        "      <td>60.0</td>\n",
350 |        "      <td>0.0</td>\n",
351 |        "      <td>1939.0</td>\n",
352 |        "    </tr>\n",
353 |        "    <tr>\n",
354 |        "      <th>1100</th>\n",
355 |        "      <td>TA</td>\n",
356 |        "      <td>NaN</td>\n",
357 |        "      <td>60.0</td>\n",
358 |        "      <td>0.0</td>\n",
359 |        "      <td>1930.0</td>\n",
360 |        "    </tr>\n",
361 |        "  </tbody>\n",
362 |        "</table>\n",
363 |        "</div>"
364 |       ],
365 |       "text/plain": [
366 |        "     BsmtQual FireplaceQu  LotFrontage  MasVnrArea  GarageYrBlt\n",
367 |        "64         Gd         NaN          NaN       573.0       1998.0\n",
368 |        "682        Gd          Gd          NaN         0.0       1996.0\n",
369 |        "960        TA         NaN         50.0         0.0          NaN\n",
370 |        "1384       TA         NaN         60.0         0.0       1939.0\n",
371 |        "1100       TA         NaN         60.0         0.0       1930.0"
372 |       ]
373 |      },
374 |      "execution_count": 8,
375 |      "metadata": {},
376 |      "output_type": "execute_result"
377 |     }
378 |    ],
379 |    "source": [
380 |     "# el imputer guarda una copia de las variables seleccionadas del\n",
381 |     "# segmento de entrenamiento, de las cuales extraer la muestra aleatoria\n",
382 |     "\n",
383 |     "imputer.X_.head()"
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "code",
388 |    "execution_count": 9,
389 |    "metadata": {},
390 |    "outputs": [
391 |     {
392 |      "data": {
393 |       "text/html": [
394 |        "<div>\n",
395 |        "<style scoped>\n",
396 |        "    .dataframe tbody tr th:only-of-type {\n",
397 |        "        vertical-align: middle;\n",
398 |        "    }\n",
399 |        "\n",
400 |        "    .dataframe tbody tr th {\n",
401 |        "        vertical-align: top;\n",
402 |        "    }\n",
403 |        "\n",
404 |        "    .dataframe thead th {\n",
405 |        "        text-align: right;\n",
406 |        "    }\n",
407 |        "</style>\n",
408 |        "<table border=\"1\" class=\"dataframe\">\n",
409 |        "  <thead>\n",
410 |        "    <tr style=\"text-align: right;\">\n",
411 |        "      <th></th>\n",
412 |        "      <th>BsmtQual</th>\n",
413 |        "      <th>FireplaceQu</th>\n",
414 |        "      <th>LotFrontage</th>\n",
415 |        "      <th>MasVnrArea</th>\n",
416 |        "      <th>GarageYrBlt</th>\n",
417 |        "    </tr>\n",
418 |        "  </thead>\n",
419 |        "  <tbody>\n",
420 |        "    <tr>\n",
421 |        "      <th>64</th>\n",
422 |        "      <td>Gd</td>\n",
423 |        "      <td>TA</td>\n",
424 |        "      <td>60.0</td>\n",
425 |        "      <td>573.0</td>\n",
426 |        "      <td>1998.0</td>\n",
427 |        "    </tr>\n",
428 |        "    <tr>\n",
429 |        "      <th>682</th>\n",
430 |        "      <td>Gd</td>\n",
431 |        "      <td>Gd</td>\n",
432 |        "      <td>90.0</td>\n",
433 |        "      <td>0.0</td>\n",
434 |        "      <td>1996.0</td>\n",
435 |        "    </tr>\n",
436 |        "    <tr>\n",
437 |        "      <th>960</th>\n",
438 |        "      <td>TA</td>\n",
439 |        "      <td>Gd</td>\n",
440 |        "      <td>50.0</td>\n",
441 |        "      <td>0.0</td>\n",
442 |        "      <td>1977.0</td>\n",
443 |        "    </tr>\n",
444 |        "    <tr>\n",
445 |        "      <th>1384</th>\n",
446 |        "      <td>TA</td>\n",
447 |        "      <td>Gd</td>\n",
448 |        "      <td>60.0</td>\n",
449 |        "      <td>0.0</td>\n",
450 |        "      <td>1939.0</td>\n",
451 |        "    </tr>\n",
452 |        "    <tr>\n",
453 |        "      <th>1100</th>\n",
454 |        "      <td>TA</td>\n",
455 |        "      <td>Gd</td>\n",
456 |        "      <td>60.0</td>\n",
457 |        "      <td>0.0</td>\n",
458 |        "      <td>1930.0</td>\n",
459 |        "    </tr>\n",
460 |        "  </tbody>\n",
461 |        "</table>\n",
462 |        "</div>"
463 |       ],
464 |       "text/plain": [
465 |        "     BsmtQual FireplaceQu  LotFrontage  MasVnrArea  GarageYrBlt\n",
466 |        "64         Gd          TA         60.0       573.0       1998.0\n",
467 |        "682        Gd          Gd         90.0         0.0       1996.0\n",
468 |        "960        TA          Gd         50.0         0.0       1977.0\n",
469 |        "1384       TA          Gd         60.0         0.0       1939.0\n",
470 |        "1100       TA          Gd         60.0         0.0       1930.0"
471 |       ]
472 |      },
473 |      "execution_count": 9,
474 |      "metadata": {},
475 |      "output_type": "execute_result"
476 |     }
477 |    ],
478 |    "source": [
479 |     "# feature engine devuelve un dataframe\n",
480 |     "\n",
481 |     "tmp = imputer.transform(X_train)\n",
482 |     "tmp.head()"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "code",
487 |    "execution_count": 10,
488 |    "metadata": {},
489 |    "outputs": [
490 |     {
491 |      "data": {
492 |       "text/plain": [
493 |        "BsmtQual       0.0\n",
494 |        "FireplaceQu    0.0\n",
495 |        "LotFrontage    0.0\n",
496 |        "MasVnrArea     0.0\n",
497 |        "GarageYrBlt    0.0\n",
498 |        "dtype: float64"
499 |       ]
500 |      },
501 |      "execution_count": 10,
502 |      "metadata": {},
503 |      "output_type": "execute_result"
504 |     }
505 |    ],
506 |    "source": [
507 |     "#revisemos que ya no tenemos valores nulos\n",
508 |     "\n",
509 |     "tmp[imputer.variables].isnull().mean()"
510 |    ]
511 |   },
512 |   {
513 |    "cell_type": "markdown",
514 |    "metadata": {},
515 |    "source": [
516 |     "Revisa la documentación del RandomSampleImputer() para aprender cómo poner semillas dependiendo de variables en el set de datos, como explicamos anteriormente:\n",
517 |     "https://feature-engine.readthedocs.io/en/latest/imputation/RandomSampleImputer.html"
518 |    ]
519 |   },
520 |   {
521 |    "cell_type": "code",
522 |    "execution_count": null,
523 |    "metadata": {},
524 |    "outputs": [],
525 |    "source": []
526 |   }
527 |  ],
528 |  "metadata": {
529 |   "kernelspec": {
530 |    "display_name": "feml",
531 |    "language": "python",
532 |    "name": "feml"
533 |   },
534 |   "language_info": {
535 |    "codemirror_mode": {
536 |     "name": "ipython",
537 |     "version": 3
538 |    },
539 |    "file_extension": ".py",
540 |    "mimetype": "text/x-python",
541 |    "name": "python",
542 |    "nbconvert_exporter": "python",
543 |    "pygments_lexer": "ipython3",
544 |    "version": "3.8.2"
545 |   },
546 |   "toc": {
547 |    "base_numbering": 1,
548 |    "nav_menu": {},
549 |    "number_sections": true,
550 |    "sideBar": true,
551 |    "skip_h1_title": false,
552 |    "title_cell": "Table of Contents",
553 |    "title_sidebar": "Contents",
554 |    "toc_cell": false,
555 |    "toc_position": {},
556 |    "toc_section_display": "block",
557 |    "toc_window_display": true
558 |   }
559 |  },
560 |  "nbformat": 4,
561 |  "nbformat_minor": 2
562 | }
563 | 


--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.18_SustitucionModa_FeatureEngine.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Sustitución categoría más frecuente  ==> Feature-engine\n",
  8 |     "\n",
  9 |     "\n",
 10 |     "### Qué es Feature-engine?\n",
 11 |     "\n",
 12 |     "Feature-engine es una librería de Python que hemos creado para este curso. \n",
 13 |     "\n",
 14 |     "- Feature-engine incluye todas las técnicas de ingeniería de variables descritas en este curso\n",
 15 |     "- Feature-engine funciona como Scikit-learn, por lo tanto es fácil de aprender\n",
 16 |     "- Feature-engine te permite implementar pasos de ingeniería de variables específicos para diferentes grupos de variables\n",
 17 |     "- Feature-engine puede ser integrado con las pipelines de Scikit-learn, permitiendo construir modelos fácilmente\n",
 18 |     "**Feature-engine te permite diseñar y guardar un flujo de ingeniería de variables con procesos diseñados específicamente para diferentes grupos de variables.**\n",
 19 |     "\n",
 20 |     "-------------------------------------------------------------------\n",
 21 |     "Feature-engine puede ser instalado vía pip ==> pip install feature-engine\n",
 22 |     "\n",
 23 |     "- Asegúrate de que has instalado Feature-engine antes de correr este notebook\n",
 24 |     "\n",
 25 |     "Para más detalle visita el [website de trainindata](https://www.trainindata.com/feature-engine) \n",
 26 |     "\n",
 27 |     "\n",
 28 |     "## En este demo:\n",
 29 |     "\n",
 30 |     "Vamos a usar **Feature Engine para hacer la sustitución por la categoría más frecuente** usando los datos Ames House Price.\n",
 31 |     "\n",
 32 |     "- Para bajar los datos, por favor referirse a la clase **Datasets** en la  **Sección 1** del curso.\n",
 33 |     "\n",
 34 |     "### Nota: \n",
 35 |     "* 'Imputer' se deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase."
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 1,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "import pandas as pd\n",
 45 |     "import numpy as np\n",
 46 |     "\n",
 47 |     "import matplotlib.pyplot as plt\n",
 48 |     "\n",
 49 |     "from sklearn.model_selection import train_test_split\n",
 50 |     "from sklearn.pipeline import Pipeline\n",
 51 |     "\n",
 52 |     "#  feature engine\n",
 53 |     "from feature_engine import imputation as mdi"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 2,
 59 |    "metadata": {},
 60 |    "outputs": [
 61 |     {
 62 |      "data": {
 63 |       "text/html": [
 64 |        "<div>\n",
 65 |        "<style scoped>\n",
 66 |        "    .dataframe tbody tr th:only-of-type {\n",
 67 |        "        vertical-align: middle;\n",
 68 |        "    }\n",
 69 |        "\n",
 70 |        "    .dataframe tbody tr th {\n",
 71 |        "        vertical-align: top;\n",
 72 |        "    }\n",
 73 |        "\n",
 74 |        "    .dataframe thead th {\n",
 75 |        "        text-align: right;\n",
 76 |        "    }\n",
 77 |        "</style>\n",
 78 |        "<table border=\"1\" class=\"dataframe\">\n",
 79 |        "  <thead>\n",
 80 |        "    <tr style=\"text-align: right;\">\n",
 81 |        "      <th></th>\n",
 82 |        "      <th>LotFrontage</th>\n",
 83 |        "      <th>MasVnrArea</th>\n",
 84 |        "      <th>BsmtQual</th>\n",
 85 |        "      <th>FireplaceQu</th>\n",
 86 |        "      <th>GarageYrBlt</th>\n",
 87 |        "      <th>SalePrice</th>\n",
 88 |        "    </tr>\n",
 89 |        "  </thead>\n",
 90 |        "  <tbody>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>0</th>\n",
 93 |        "      <td>65.0</td>\n",
 94 |        "      <td>196.0</td>\n",
 95 |        "      <td>Gd</td>\n",
 96 |        "      <td>NaN</td>\n",
 97 |        "      <td>2003.0</td>\n",
 98 |        "      <td>208500</td>\n",
 99 |        "    </tr>\n",
100 |        "    <tr>\n",
101 |        "      <th>1</th>\n",
102 |        "      <td>80.0</td>\n",
103 |        "      <td>0.0</td>\n",
104 |        "      <td>Gd</td>\n",
105 |        "      <td>TA</td>\n",
106 |        "      <td>1976.0</td>\n",
107 |        "      <td>181500</td>\n",
108 |        "    </tr>\n",
109 |        "    <tr>\n",
110 |        "      <th>2</th>\n",
111 |        "      <td>68.0</td>\n",
112 |        "      <td>162.0</td>\n",
113 |        "      <td>Gd</td>\n",
114 |        "      <td>TA</td>\n",
115 |        "      <td>2001.0</td>\n",
116 |        "      <td>223500</td>\n",
117 |        "    </tr>\n",
118 |        "    <tr>\n",
119 |        "      <th>3</th>\n",
120 |        "      <td>60.0</td>\n",
121 |        "      <td>0.0</td>\n",
122 |        "      <td>TA</td>\n",
123 |        "      <td>Gd</td>\n",
124 |        "      <td>1998.0</td>\n",
125 |        "      <td>140000</td>\n",
126 |        "    </tr>\n",
127 |        "    <tr>\n",
128 |        "      <th>4</th>\n",
129 |        "      <td>84.0</td>\n",
130 |        "      <td>350.0</td>\n",
131 |        "      <td>Gd</td>\n",
132 |        "      <td>TA</td>\n",
133 |        "      <td>2000.0</td>\n",
134 |        "      <td>250000</td>\n",
135 |        "    </tr>\n",
136 |        "  </tbody>\n",
137 |        "</table>\n",
138 |        "</div>"
139 |       ],
140 |       "text/plain": [
141 |        "   LotFrontage  MasVnrArea BsmtQual FireplaceQu  GarageYrBlt  SalePrice\n",
142 |        "0         65.0       196.0       Gd         NaN       2003.0     208500\n",
143 |        "1         80.0         0.0       Gd          TA       1976.0     181500\n",
144 |        "2         68.0       162.0       Gd          TA       2001.0     223500\n",
145 |        "3         60.0         0.0       TA          Gd       1998.0     140000\n",
146 |        "4         84.0       350.0       Gd          TA       2000.0     250000"
147 |       ]
148 |      },
149 |      "execution_count": 2,
150 |      "metadata": {},
151 |      "output_type": "execute_result"
152 |     }
153 |    ],
154 |    "source": [
155 |     "# carguemos los datos con unas columnas seleccionadas\n",
156 |     "\n",
157 |     "cols_to_use = [\n",
158 |     "    'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
159 |     "    'SalePrice'\n",
160 |     "]\n",
161 |     "\n",
162 |     "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
163 |     "data.head()"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 3,
169 |    "metadata": {},
170 |    "outputs": [
171 |     {
172 |      "data": {
173 |       "text/plain": [
174 |        "LotFrontage    0.177397\n",
175 |        "MasVnrArea     0.005479\n",
176 |        "BsmtQual       0.025342\n",
177 |        "FireplaceQu    0.472603\n",
178 |        "GarageYrBlt    0.055479\n",
179 |        "SalePrice      0.000000\n",
180 |        "dtype: float64"
181 |       ]
182 |      },
183 |      "execution_count": 3,
184 |      "metadata": {},
185 |      "output_type": "execute_result"
186 |     }
187 |    ],
188 |    "source": [
189 |     "data.isnull().mean()"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "Todas las variables predictivas tienen datos ausentes"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 4,
202 |    "metadata": {},
203 |    "outputs": [
204 |     {
205 |      "data": {
206 |       "text/plain": [
207 |        "((1022, 5), (438, 5))"
208 |       ]
209 |      },
210 |      "execution_count": 4,
211 |      "metadata": {},
212 |      "output_type": "execute_result"
213 |     }
214 |    ],
215 |    "source": [
216 |     "# separar datos en segmentos entrenamiento y prueba\n",
217 |     "\n",
218 |     "# primero, separemos el target (SalePrice) del resto de las variables\n",
219 |     "\n",
220 |     "cols_to_use.remove('SalePrice')\n",
221 |     "\n",
222 |     "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
223 |     "                                                    data['SalePrice'],\n",
224 |     "                                                    test_size=0.3,\n",
225 |     "                                                    random_state=0)\n",
226 |     "X_train.shape, X_test.shape"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "markdown",
231 |    "metadata": {},
232 |    "source": [
233 |     "## Feature-engine captura las variables categóricas automáticamente"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 5,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "# llamemos el imputer de feature engine\n",
243 |     "# no necesitamos especificar las variables\n",
244 |     "\n",
245 |     "imputer = mdi.CategoricalImputer(imputation_method='frequent')"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 6,
251 |    "metadata": {},
252 |    "outputs": [
253 |     {
254 |      "data": {
255 |       "text/plain": [
256 |        "CategoricalImputer(imputation_method='frequent',\n",
257 |        "                   variables=['BsmtQual', 'FireplaceQu'])"
258 |       ]
259 |      },
260 |      "execution_count": 6,
261 |      "metadata": {},
262 |      "output_type": "execute_result"
263 |     }
264 |    ],
265 |    "source": [
266 |     "# ajustemos el imputer\n",
267 |     "\n",
268 |     "imputer.fit(X_train)"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 7,
274 |    "metadata": {},
275 |    "outputs": [
276 |     {
277 |      "data": {
278 |       "text/plain": [
279 |        "['BsmtQual', 'FireplaceQu']"
280 |       ]
281 |      },
282 |      "execution_count": 7,
283 |      "metadata": {},
284 |      "output_type": "execute_result"
285 |     }
286 |    ],
287 |    "source": [
288 |     "# vemos que el imputer encontró las variables categóricas \n",
289 |     "# para sustituir con la categoría más frecuente o moda\n",
290 |     "\n",
291 |     "\n",
292 |     "imputer.variables"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 8,
298 |    "metadata": {},
299 |    "outputs": [
300 |     {
301 |      "data": {
302 |       "text/plain": [
303 |        "{'BsmtQual': 'TA', 'FireplaceQu': 'Gd'}"
304 |       ]
305 |      },
306 |      "execution_count": 8,
307 |      "metadata": {},
308 |      "output_type": "execute_result"
309 |     }
310 |    ],
311 |    "source": [
312 |     "# aquí vemos los valores  que serán usados\n",
313 |     "# para reemplazar los NA en cada variable\n",
314 |     "\n",
315 |     "\n",
316 |     "imputer.imputer_dict_"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 9,
322 |    "metadata": {},
323 |    "outputs": [
324 |     {
325 |      "data": {
326 |       "text/html": [
327 |        "<div>\n",
328 |        "<style scoped>\n",
329 |        "    .dataframe tbody tr th:only-of-type {\n",
330 |        "        vertical-align: middle;\n",
331 |        "    }\n",
332 |        "\n",
333 |        "    .dataframe tbody tr th {\n",
334 |        "        vertical-align: top;\n",
335 |        "    }\n",
336 |        "\n",
337 |        "    .dataframe thead th {\n",
338 |        "        text-align: right;\n",
339 |        "    }\n",
340 |        "</style>\n",
341 |        "<table border=\"1\" class=\"dataframe\">\n",
342 |        "  <thead>\n",
343 |        "    <tr style=\"text-align: right;\">\n",
344 |        "      <th></th>\n",
345 |        "      <th>BsmtQual</th>\n",
346 |        "      <th>FireplaceQu</th>\n",
347 |        "    </tr>\n",
348 |        "  </thead>\n",
349 |        "  <tbody>\n",
350 |        "    <tr>\n",
351 |        "      <th>0</th>\n",
352 |        "      <td>TA</td>\n",
353 |        "      <td>Gd</td>\n",
354 |        "    </tr>\n",
355 |        "  </tbody>\n",
356 |        "</table>\n",
357 |        "</div>"
358 |       ],
359 |       "text/plain": [
360 |        "  BsmtQual FireplaceQu\n",
361 |        "0       TA          Gd"
362 |       ]
363 |      },
364 |      "execution_count": 9,
365 |      "metadata": {},
366 |      "output_type": "execute_result"
367 |     }
368 |    ],
369 |    "source": [
370 |     "# revisemos las modas sobre el set de entrenamiento\n",
371 |     "\n",
372 |     "X_train[imputer.variables].mode()"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": 10,
378 |    "metadata": {},
379 |    "outputs": [
380 |     {
381 |      "data": {
382 |       "text/html": [
383 |        "<div>\n",
384 |        "<style scoped>\n",
385 |        "    .dataframe tbody tr th:only-of-type {\n",
386 |        "        vertical-align: middle;\n",
387 |        "    }\n",
388 |        "\n",
389 |        "    .dataframe tbody tr th {\n",
390 |        "        vertical-align: top;\n",
391 |        "    }\n",
392 |        "\n",
393 |        "    .dataframe thead th {\n",
394 |        "        text-align: right;\n",
395 |        "    }\n",
396 |        "</style>\n",
397 |        "<table border=\"1\" class=\"dataframe\">\n",
398 |        "  <thead>\n",
399 |        "    <tr style=\"text-align: right;\">\n",
400 |        "      <th></th>\n",
401 |        "      <th>BsmtQual</th>\n",
402 |        "      <th>FireplaceQu</th>\n",
403 |        "      <th>LotFrontage</th>\n",
404 |        "      <th>MasVnrArea</th>\n",
405 |        "      <th>GarageYrBlt</th>\n",
406 |        "    </tr>\n",
407 |        "  </thead>\n",
408 |        "  <tbody>\n",
409 |        "    <tr>\n",
410 |        "      <th>64</th>\n",
411 |        "      <td>Gd</td>\n",
412 |        "      <td>Gd</td>\n",
413 |        "      <td>NaN</td>\n",
414 |        "      <td>573.0</td>\n",
415 |        "      <td>1998.0</td>\n",
416 |        "    </tr>\n",
417 |        "    <tr>\n",
418 |        "      <th>682</th>\n",
419 |        "      <td>Gd</td>\n",
420 |        "      <td>Gd</td>\n",
421 |        "      <td>NaN</td>\n",
422 |        "      <td>0.0</td>\n",
423 |        "      <td>1996.0</td>\n",
424 |        "    </tr>\n",
425 |        "    <tr>\n",
426 |        "      <th>960</th>\n",
427 |        "      <td>TA</td>\n",
428 |        "      <td>Gd</td>\n",
429 |        "      <td>50.0</td>\n",
430 |        "      <td>0.0</td>\n",
431 |        "      <td>NaN</td>\n",
432 |        "    </tr>\n",
433 |        "    <tr>\n",
434 |        "      <th>1384</th>\n",
435 |        "      <td>TA</td>\n",
436 |        "      <td>Gd</td>\n",
437 |        "      <td>60.0</td>\n",
438 |        "      <td>0.0</td>\n",
439 |        "      <td>1939.0</td>\n",
440 |        "    </tr>\n",
441 |        "    <tr>\n",
442 |        "      <th>1100</th>\n",
443 |        "      <td>TA</td>\n",
444 |        "      <td>Gd</td>\n",
445 |        "      <td>60.0</td>\n",
446 |        "      <td>0.0</td>\n",
447 |        "      <td>1930.0</td>\n",
448 |        "    </tr>\n",
449 |        "  </tbody>\n",
450 |        "</table>\n",
451 |        "</div>"
452 |       ],
453 |       "text/plain": [
454 |        "     BsmtQual FireplaceQu  LotFrontage  MasVnrArea  GarageYrBlt\n",
455 |        "64         Gd          Gd          NaN       573.0       1998.0\n",
456 |        "682        Gd          Gd          NaN         0.0       1996.0\n",
457 |        "960        TA          Gd         50.0         0.0          NaN\n",
458 |        "1384       TA          Gd         60.0         0.0       1939.0\n",
459 |        "1100       TA          Gd         60.0         0.0       1930.0"
460 |       ]
461 |      },
462 |      "execution_count": 10,
463 |      "metadata": {},
464 |      "output_type": "execute_result"
465 |     }
466 |    ],
467 |    "source": [
468 |     "# feature-engine devuelve un dataframe\n",
469 |     "\n",
470 |     "tmp = imputer.transform(X_train)\n",
471 |     "tmp.head()"
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "code",
476 |    "execution_count": 11,
477 |    "metadata": {},
478 |    "outputs": [
479 |     {
480 |      "data": {
481 |       "text/plain": [
482 |        "BsmtQual       0.0\n",
483 |        "FireplaceQu    0.0\n",
484 |        "dtype: float64"
485 |       ]
486 |      },
487 |      "execution_count": 11,
488 |      "metadata": {},
489 |      "output_type": "execute_result"
490 |     }
491 |    ],
492 |    "source": [
493 |     "# revisemos que los valores nulos ya no existen\n",
494 |     "\n",
495 |     "tmp[imputer.variables].isnull().mean()"
496 |    ]
497 |   },
498 |   {
499 |    "cell_type": "markdown",
500 |    "metadata": {},
501 |    "source": [
502 |     "## Feature-engine te permite especificar grupos de variables fácilmente"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": 12,
508 |    "metadata": {},
509 |    "outputs": [
510 |     {
511 |      "data": {
512 |       "text/plain": [
513 |        "CategoricalImputer(imputation_method='frequent', variables=['BsmtQual'])"
514 |       ]
515 |      },
516 |      "execution_count": 12,
517 |      "metadata": {},
518 |      "output_type": "execute_result"
519 |     }
520 |    ],
521 |    "source": [
522 |     "# usemos la sustitución pero esta vez solo\n",
523 |     "# imputemos una variable\n",
524 |     "\n",
525 |     "imputer = mdi.CategoricalImputer(imputation_method='frequent',\n",
526 |     "                                 variables=['BsmtQual'])\n",
527 |     "\n",
528 |     "imputer.fit(X_train)"
529 |    ]
530 |   },
531 |   {
532 |    "cell_type": "code",
533 |    "execution_count": 13,
534 |    "metadata": {},
535 |    "outputs": [
536 |     {
537 |      "data": {
538 |       "text/plain": [
539 |        "['BsmtQual']"
540 |       ]
541 |      },
542 |      "execution_count": 13,
543 |      "metadata": {},
544 |      "output_type": "execute_result"
545 |     }
546 |    ],
547 |    "source": [
548 |     "# ahora el imputer solo tiene la variable indicada\n",
549 |     "\n",
550 |     "imputer.variables"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "code",
555 |    "execution_count": 14,
556 |    "metadata": {},
557 |    "outputs": [
558 |     {
559 |      "data": {
560 |       "text/plain": [
561 |        "{'BsmtQual': 'TA'}"
562 |       ]
563 |      },
564 |      "execution_count": 14,
565 |      "metadata": {},
566 |      "output_type": "execute_result"
567 |     }
568 |    ],
569 |    "source": [
570 |     "# y podemos ver el valor asignado para sustituirla\n",
571 |     "\n",
572 |     "imputer.imputer_dict_"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "code",
577 |    "execution_count": 15,
578 |    "metadata": {},
579 |    "outputs": [
580 |     {
581 |      "data": {
582 |       "text/plain": [
583 |        "BsmtQual    0.0\n",
584 |        "dtype: float64"
585 |       ]
586 |      },
587 |      "execution_count": 15,
588 |      "metadata": {},
589 |      "output_type": "execute_result"
590 |     }
591 |    ],
592 |    "source": [
593 |     "# feature-engine devuelve un dataframe\n",
594 |     "# al imputar la variable:\n",
595 |     "\n",
596 |     "tmp = imputer.transform(X_train)\n",
597 |     "\n",
598 |     "\n",
599 |     "# revisemos que la variable indicada ya no tiene valores nulos\n",
600 |     "tmp[imputer.variables].isnull().mean()"
601 |    ]
602 |   },
603 |   {
604 |    "cell_type": "markdown",
605 |    "metadata": {},
606 |    "source": [
607 |     "Funcionó!\n",
608 |     "\n",
609 |     "Más detalles aquí:\n",
610 |     "https://feature-engine.readthedocs.io/en/latest/imputation/CategoricalImputer.html"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": null,
616 |    "metadata": {},
617 |    "outputs": [],
618 |    "source": []
619 |   }
620 |  ],
621 |  "metadata": {
622 |   "kernelspec": {
623 |    "display_name": "feml",
624 |    "language": "python",
625 |    "name": "feml"
626 |   },
627 |   "language_info": {
628 |    "codemirror_mode": {
629 |     "name": "ipython",
630 |     "version": 3
631 |    },
632 |    "file_extension": ".py",
633 |    "mimetype": "text/x-python",
634 |    "name": "python",
635 |    "nbconvert_exporter": "python",
636 |    "pygments_lexer": "ipython3",
637 |    "version": "3.8.2"
638 |   },
639 |   "toc": {
640 |    "base_numbering": 1,
641 |    "nav_menu": {},
642 |    "number_sections": true,
643 |    "sideBar": true,
644 |    "skip_h1_title": false,
645 |    "title_cell": "Table of Contents",
646 |    "title_sidebar": "Contents",
647 |    "toc_cell": false,
648 |    "toc_position": {},
649 |    "toc_section_display": "block",
650 |    "toc_window_display": true
651 |   }
652 |  },
653 |  "nbformat": 4,
654 |  "nbformat_minor": 2
655 | }
656 | 


--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.19_SustitucionCategoriaAdicional_FeatureEngine.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Sustitución usando una etiqueta adicional en variables categóricas  ==> Feature-engine\n",
  8 |     "\n",
  9 |     "### Qué es Feature-engine?\n",
 10 |     "\n",
 11 |     "Feature-engine es una librería de Python que hemos creado para este curso. \n",
 12 |     "\n",
 13 |     "- Feature-engine incluye todas las técnicas de ingeniería de variables descritas en este curso\n",
 14 |     "- Feature-engine funciona como Scikit-learn, por lo tanto es fácil de aprender\n",
 15 |     "- Feature-engine te permite implementar pasos de ingeniería de variables específicos para diferentes grupos de variables\n",
 16 |     "- Feature-engine puede ser integrado con las pipelines de Scikit-learn, permitiendo construir modelos fácilmente\n",
 17 |     "**Feature-engine te permite diseñar y guardar un flujo de ingeniería de variables con procesos diseñados específicamente para diferentes grupos de variables.**\n",
 18 |     "\n",
 19 |     "-------------------------------------------------------------------\n",
 20 |     "Feature-engine puede ser instalado vía pip ==> pip install feature-engine\n",
 21 |     "\n",
 22 |     "- Asegúrate de que has instalado Feature-engine antes de correr este notebook\n",
 23 |     "\n",
 24 |     "Para más detalle visita el [sitio web de trainindata](https://www.trainindata.com/feature-engine) \n",
 25 |     "\n",
 26 |     "\n",
 27 |     "## En este demo:\n",
 28 |     "\n",
 29 |     "Vamos a usar **Feature-engine para hacer la sustitución usando una etiqueta adicional 'Missing' en variables categóricas** usando los datos Ames House Price.\n",
 30 |     "\n",
 31 |     "- Para bajar los datos, por favor referirse a la clase  **Datasets** en la  **Sección 1** del curso.\n",
 32 |     "\n",
 33 |     "### Nota: \n",
 34 |     "* 'Imputer' se deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase."
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 1,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import pandas as pd\n",
 44 |     "import numpy as np\n",
 45 |     "\n",
 46 |     "import matplotlib.pyplot as plt\n",
 47 |     "\n",
 48 |     "from sklearn.model_selection import train_test_split\n",
 49 |     "from sklearn.pipeline import Pipeline\n",
 50 |     "\n",
 51 |     "# feature engine\n",
 52 |     "from feature_engine import imputation as mdi"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 2,
 58 |    "metadata": {},
 59 |    "outputs": [
 60 |     {
 61 |      "data": {
 62 |       "text/html": [
 63 |        "<div>\n",
 64 |        "<style scoped>\n",
 65 |        "    .dataframe tbody tr th:only-of-type {\n",
 66 |        "        vertical-align: middle;\n",
 67 |        "    }\n",
 68 |        "\n",
 69 |        "    .dataframe tbody tr th {\n",
 70 |        "        vertical-align: top;\n",
 71 |        "    }\n",
 72 |        "\n",
 73 |        "    .dataframe thead th {\n",
 74 |        "        text-align: right;\n",
 75 |        "    }\n",
 76 |        "</style>\n",
 77 |        "<table border=\"1\" class=\"dataframe\">\n",
 78 |        "  <thead>\n",
 79 |        "    <tr style=\"text-align: right;\">\n",
 80 |        "      <th></th>\n",
 81 |        "      <th>LotFrontage</th>\n",
 82 |        "      <th>MasVnrArea</th>\n",
 83 |        "      <th>BsmtQual</th>\n",
 84 |        "      <th>FireplaceQu</th>\n",
 85 |        "      <th>GarageYrBlt</th>\n",
 86 |        "      <th>SalePrice</th>\n",
 87 |        "    </tr>\n",
 88 |        "  </thead>\n",
 89 |        "  <tbody>\n",
 90 |        "    <tr>\n",
 91 |        "      <th>0</th>\n",
 92 |        "      <td>65.0</td>\n",
 93 |        "      <td>196.0</td>\n",
 94 |        "      <td>Gd</td>\n",
 95 |        "      <td>NaN</td>\n",
 96 |        "      <td>2003.0</td>\n",
 97 |        "      <td>208500</td>\n",
 98 |        "    </tr>\n",
 99 |        "    <tr>\n",
100 |        "      <th>1</th>\n",
101 |        "      <td>80.0</td>\n",
102 |        "      <td>0.0</td>\n",
103 |        "      <td>Gd</td>\n",
104 |        "      <td>TA</td>\n",
105 |        "      <td>1976.0</td>\n",
106 |        "      <td>181500</td>\n",
107 |        "    </tr>\n",
108 |        "    <tr>\n",
109 |        "      <th>2</th>\n",
110 |        "      <td>68.0</td>\n",
111 |        "      <td>162.0</td>\n",
112 |        "      <td>Gd</td>\n",
113 |        "      <td>TA</td>\n",
114 |        "      <td>2001.0</td>\n",
115 |        "      <td>223500</td>\n",
116 |        "    </tr>\n",
117 |        "    <tr>\n",
118 |        "      <th>3</th>\n",
119 |        "      <td>60.0</td>\n",
120 |        "      <td>0.0</td>\n",
121 |        "      <td>TA</td>\n",
122 |        "      <td>Gd</td>\n",
123 |        "      <td>1998.0</td>\n",
124 |        "      <td>140000</td>\n",
125 |        "    </tr>\n",
126 |        "    <tr>\n",
127 |        "      <th>4</th>\n",
128 |        "      <td>84.0</td>\n",
129 |        "      <td>350.0</td>\n",
130 |        "      <td>Gd</td>\n",
131 |        "      <td>TA</td>\n",
132 |        "      <td>2000.0</td>\n",
133 |        "      <td>250000</td>\n",
134 |        "    </tr>\n",
135 |        "  </tbody>\n",
136 |        "</table>\n",
137 |        "</div>"
138 |       ],
139 |       "text/plain": [
140 |        "   LotFrontage  MasVnrArea BsmtQual FireplaceQu  GarageYrBlt  SalePrice\n",
141 |        "0         65.0       196.0       Gd         NaN       2003.0     208500\n",
142 |        "1         80.0         0.0       Gd          TA       1976.0     181500\n",
143 |        "2         68.0       162.0       Gd          TA       2001.0     223500\n",
144 |        "3         60.0         0.0       TA          Gd       1998.0     140000\n",
145 |        "4         84.0       350.0       Gd          TA       2000.0     250000"
146 |       ]
147 |      },
148 |      "execution_count": 2,
149 |      "metadata": {},
150 |      "output_type": "execute_result"
151 |     }
152 |    ],
153 |    "source": [
154 |     "# carguemos los datos con un grupo de variables seleccionadas\n",
155 |     "\n",
156 |     "cols_to_use = [\n",
157 |     "    'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
158 |     "    'SalePrice'\n",
159 |     "]\n",
160 |     "\n",
161 |     "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
162 |     "data.head()"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 3,
168 |    "metadata": {},
169 |    "outputs": [
170 |     {
171 |      "data": {
172 |       "text/plain": [
173 |        "LotFrontage    0.177397\n",
174 |        "MasVnrArea     0.005479\n",
175 |        "BsmtQual       0.025342\n",
176 |        "FireplaceQu    0.472603\n",
177 |        "GarageYrBlt    0.055479\n",
178 |        "SalePrice      0.000000\n",
179 |        "dtype: float64"
180 |       ]
181 |      },
182 |      "execution_count": 3,
183 |      "metadata": {},
184 |      "output_type": "execute_result"
185 |     }
186 |    ],
187 |    "source": [
188 |     "data.isnull().mean()"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 4,
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "data": {
198 |       "text/plain": [
199 |        "((1022, 5), (438, 5))"
200 |       ]
201 |      },
202 |      "execution_count": 4,
203 |      "metadata": {},
204 |      "output_type": "execute_result"
205 |     }
206 |    ],
207 |    "source": [
208 |     "# separar datos en segmentos entrenamiento y prueba\n",
209 |     "\n",
210 |     "# primero, separemos el target (SalePrice) del resto de las variables\n",
211 |     "\n",
212 |     "cols_to_use.remove('SalePrice')\n",
213 |     "\n",
214 |     "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
215 |     "                                                    data['SalePrice'],\n",
216 |     "                                                    test_size=0.3,\n",
217 |     "                                                    random_state=0)\n",
218 |     "X_train.shape, X_test.shape"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "## Feature-engine captura las variables categóricas automáticamente"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 5,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "# llamemos el imputer de feature-engine\n",
235 |     "# no necesitamos especificar nada\n",
236 |     "\n",
237 |     "imputer = mdi.CategoricalImputer()"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 6,
243 |    "metadata": {},
244 |    "outputs": [
245 |     {
246 |      "data": {
247 |       "text/plain": [
248 |        "CategoricalImputer(variables=['BsmtQual', 'FireplaceQu'])"
249 |       ]
250 |      },
251 |      "execution_count": 6,
252 |      "metadata": {},
253 |      "output_type": "execute_result"
254 |     }
255 |    ],
256 |    "source": [
257 |     "# ajustamos el imputer\n",
258 |     "\n",
259 |     "imputer.fit(X_train)"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 7,
265 |    "metadata": {},
266 |    "outputs": [
267 |     {
268 |      "data": {
269 |       "text/plain": [
270 |        "['BsmtQual', 'FireplaceQu']"
271 |       ]
272 |      },
273 |      "execution_count": 7,
274 |      "metadata": {},
275 |      "output_type": "execute_result"
276 |     }
277 |    ],
278 |    "source": [
279 |     "# vemos que el imputer encontró las variables categóricas \n",
280 |     "# automáticamente\n",
281 |     "\n",
282 |     "imputer.variables"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "markdown",
287 |    "metadata": {},
288 |    "source": [
289 |     "**Este imputer reemplaza la categoría ausente con una etiqueta adicional \"Missing\"**"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 8,
295 |    "metadata": {},
296 |    "outputs": [
297 |     {
298 |      "data": {
299 |       "text/html": [
300 |        "<div>\n",
301 |        "<style scoped>\n",
302 |        "    .dataframe tbody tr th:only-of-type {\n",
303 |        "        vertical-align: middle;\n",
304 |        "    }\n",
305 |        "\n",
306 |        "    .dataframe tbody tr th {\n",
307 |        "        vertical-align: top;\n",
308 |        "    }\n",
309 |        "\n",
310 |        "    .dataframe thead th {\n",
311 |        "        text-align: right;\n",
312 |        "    }\n",
313 |        "</style>\n",
314 |        "<table border=\"1\" class=\"dataframe\">\n",
315 |        "  <thead>\n",
316 |        "    <tr style=\"text-align: right;\">\n",
317 |        "      <th></th>\n",
318 |        "      <th>BsmtQual</th>\n",
319 |        "      <th>FireplaceQu</th>\n",
320 |        "      <th>LotFrontage</th>\n",
321 |        "      <th>MasVnrArea</th>\n",
322 |        "      <th>GarageYrBlt</th>\n",
323 |        "    </tr>\n",
324 |        "  </thead>\n",
325 |        "  <tbody>\n",
326 |        "    <tr>\n",
327 |        "      <th>64</th>\n",
328 |        "      <td>Gd</td>\n",
329 |        "      <td>Missing</td>\n",
330 |        "      <td>NaN</td>\n",
331 |        "      <td>573.0</td>\n",
332 |        "      <td>1998.0</td>\n",
333 |        "    </tr>\n",
334 |        "    <tr>\n",
335 |        "      <th>682</th>\n",
336 |        "      <td>Gd</td>\n",
337 |        "      <td>Gd</td>\n",
338 |        "      <td>NaN</td>\n",
339 |        "      <td>0.0</td>\n",
340 |        "      <td>1996.0</td>\n",
341 |        "    </tr>\n",
342 |        "    <tr>\n",
343 |        "      <th>960</th>\n",
344 |        "      <td>TA</td>\n",
345 |        "      <td>Missing</td>\n",
346 |        "      <td>50.0</td>\n",
347 |        "      <td>0.0</td>\n",
348 |        "      <td>NaN</td>\n",
349 |        "    </tr>\n",
350 |        "    <tr>\n",
351 |        "      <th>1384</th>\n",
352 |        "      <td>TA</td>\n",
353 |        "      <td>Missing</td>\n",
354 |        "      <td>60.0</td>\n",
355 |        "      <td>0.0</td>\n",
356 |        "      <td>1939.0</td>\n",
357 |        "    </tr>\n",
358 |        "    <tr>\n",
359 |        "      <th>1100</th>\n",
360 |        "      <td>TA</td>\n",
361 |        "      <td>Missing</td>\n",
362 |        "      <td>60.0</td>\n",
363 |        "      <td>0.0</td>\n",
364 |        "      <td>1930.0</td>\n",
365 |        "    </tr>\n",
366 |        "  </tbody>\n",
367 |        "</table>\n",
368 |        "</div>"
369 |       ],
370 |       "text/plain": [
371 |        "     BsmtQual FireplaceQu  LotFrontage  MasVnrArea  GarageYrBlt\n",
372 |        "64         Gd     Missing          NaN       573.0       1998.0\n",
373 |        "682        Gd          Gd          NaN         0.0       1996.0\n",
374 |        "960        TA     Missing         50.0         0.0          NaN\n",
375 |        "1384       TA     Missing         60.0         0.0       1939.0\n",
376 |        "1100       TA     Missing         60.0         0.0       1930.0"
377 |       ]
378 |      },
379 |      "execution_count": 8,
380 |      "metadata": {},
381 |      "output_type": "execute_result"
382 |     }
383 |    ],
384 |    "source": [
385 |     "# feature engine retorna un dataframe\n",
386 |     "\n",
387 |     "tmp = imputer.transform(X_train)\n",
388 |     "tmp.head()"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 9,
394 |    "metadata": {},
395 |    "outputs": [
396 |     {
397 |      "data": {
398 |       "text/plain": [
399 |        "BsmtQual       0.0\n",
400 |        "FireplaceQu    0.0\n",
401 |        "dtype: float64"
402 |       ]
403 |      },
404 |      "execution_count": 9,
405 |      "metadata": {},
406 |      "output_type": "execute_result"
407 |     }
408 |    ],
409 |    "source": [
410 |     "# revisemos que los valores nulos ya no existen\n",
411 |     "\n",
412 |     "tmp[imputer.variables].isnull().mean()"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "markdown",
417 |    "metadata": {},
418 |    "source": [
419 |     "## Feature-engine te permite especificar grupos de variables fácilmente"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 10,
425 |    "metadata": {},
426 |    "outputs": [
427 |     {
428 |      "data": {
429 |       "text/plain": [
430 |        "CategoricalImputer(variables=['BsmtQual'])"
431 |       ]
432 |      },
433 |      "execution_count": 10,
434 |      "metadata": {},
435 |      "output_type": "execute_result"
436 |     }
437 |    ],
438 |    "source": [
439 |     "# usemos la sustitución pero esta vez solo \n",
440 |     "# imputemos una variable\n",
441 |     "\n",
442 |     "imputer = mdi.CategoricalImputer(variables=['BsmtQual'])\n",
443 |     "\n",
444 |     "imputer.fit(X_train)"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "code",
449 |    "execution_count": 11,
450 |    "metadata": {},
451 |    "outputs": [
452 |     {
453 |      "data": {
454 |       "text/plain": [
455 |        "['BsmtQual']"
456 |       ]
457 |      },
458 |      "execution_count": 11,
459 |      "metadata": {},
460 |      "output_type": "execute_result"
461 |     }
462 |    ],
463 |    "source": [
464 |     "# ahora el imputer solo tiene la variable indicada\n",
465 |     "\n",
466 |     "imputer.variables"
467 |    ]
468 |   },
469 |   {
470 |    "cell_type": "code",
471 |    "execution_count": 12,
472 |    "metadata": {},
473 |    "outputs": [
474 |     {
475 |      "data": {
476 |       "text/plain": [
477 |        "BsmtQual    0.0\n",
478 |        "dtype: float64"
479 |       ]
480 |      },
481 |      "execution_count": 12,
482 |      "metadata": {},
483 |      "output_type": "execute_result"
484 |     }
485 |    ],
486 |    "source": [
487 |     "# feature-engine devuelve un dataframe\n",
488 |     "# al imputar la variable:\n",
489 |     "\n",
490 |     "tmp = imputer.transform(X_train)\n",
491 |     "\n",
492 |     "\n",
493 |     "# revisemos que la variable indicada ya no tiene valores nulos\n",
494 |     "tmp[imputer.variables].isnull().mean()"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "markdown",
499 |    "metadata": {
500 |     "scrolled": true
501 |    },
502 |    "source": [
503 |     "## Feature-engine puede ser usado con los flujos de Scikit-learn (pipeline)"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": 13,
509 |    "metadata": {},
510 |    "outputs": [
511 |     {
512 |      "data": {
513 |       "text/plain": [
514 |        "BsmtQual       0.023483\n",
515 |        "FireplaceQu    0.467710\n",
516 |        "LotFrontage    0.184932\n",
517 |        "MasVnrArea     0.004892\n",
518 |        "GarageYrBlt    0.052838\n",
519 |        "dtype: float64"
520 |       ]
521 |      },
522 |      "execution_count": 13,
523 |      "metadata": {},
524 |      "output_type": "execute_result"
525 |     }
526 |    ],
527 |    "source": [
528 |     "# revisemos los valores nulos\n",
529 |     "\n",
530 |     "X_train.isnull().mean()"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "markdown",
535 |    "metadata": {},
536 |    "source": [
537 |     "Vamos a realizar las siguientes imputaciones\n",
538 |     "\n",
539 |     "- BsmtQual ==> categoría frecuente\n",
540 |     "- FireplaceQu ==> etiqueta missing"
541 |    ]
542 |   },
543 |   {
544 |    "cell_type": "code",
545 |    "execution_count": 14,
546 |    "metadata": {},
547 |    "outputs": [],
548 |    "source": [
549 |     "pipe = Pipeline([\n",
550 |     "    ('imputer_mode', mdi.CategoricalImputer(\n",
551 |     "        imputation_method='frequent', variables=['BsmtQual'])),\n",
552 |     "    \n",
553 |     "    ('imputer_missing', mdi.CategoricalImputer(\n",
554 |     "        variables=['FireplaceQu'])),\n",
555 |     "])"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": 15,
561 |    "metadata": {},
562 |    "outputs": [
563 |     {
564 |      "data": {
565 |       "text/plain": [
566 |        "Pipeline(steps=[('imputer_mode',\n",
567 |        "                 CategoricalImputer(imputation_method='frequent',\n",
568 |        "                                    variables=['BsmtQual'])),\n",
569 |        "                ('imputer_missing',\n",
570 |        "                 CategoricalImputer(variables=['FireplaceQu']))])"
571 |       ]
572 |      },
573 |      "execution_count": 15,
574 |      "metadata": {},
575 |      "output_type": "execute_result"
576 |     }
577 |    ],
578 |    "source": [
579 |     "pipe.fit(X_train)"
580 |    ]
581 |   },
582 |   {
583 |    "cell_type": "code",
584 |    "execution_count": 16,
585 |    "metadata": {},
586 |    "outputs": [
587 |     {
588 |      "data": {
589 |       "text/plain": [
590 |        "['BsmtQual']"
591 |       ]
592 |      },
593 |      "execution_count": 16,
594 |      "metadata": {},
595 |      "output_type": "execute_result"
596 |     }
597 |    ],
598 |    "source": [
599 |     "pipe.named_steps['imputer_mode'].variables"
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "code",
604 |    "execution_count": 17,
605 |    "metadata": {
606 |     "scrolled": true
607 |    },
608 |    "outputs": [
609 |     {
610 |      "data": {
611 |       "text/plain": [
612 |        "['FireplaceQu']"
613 |       ]
614 |      },
615 |      "execution_count": 17,
616 |      "metadata": {},
617 |      "output_type": "execute_result"
618 |     }
619 |    ],
620 |    "source": [
621 |     "pipe.named_steps['imputer_missing'].variables"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "code",
626 |    "execution_count": 18,
627 |    "metadata": {},
628 |    "outputs": [
629 |     {
630 |      "data": {
631 |       "text/plain": [
632 |        "BsmtQual       0.000000\n",
633 |        "FireplaceQu    0.000000\n",
634 |        "LotFrontage    0.184932\n",
635 |        "MasVnrArea     0.004892\n",
636 |        "GarageYrBlt    0.052838\n",
637 |        "dtype: float64"
638 |       ]
639 |      },
640 |      "execution_count": 18,
641 |      "metadata": {},
642 |      "output_type": "execute_result"
643 |     }
644 |    ],
645 |    "source": [
646 |     "# transformemos los datos con la pipeline\n",
647 |     "tmp = pipe.transform(X_train)\n",
648 |     "\n",
649 |     "# revisemos que las variables categóricas ya no tienen valores nulos\n",
650 |     "tmp.isnull().mean()"
651 |    ]
652 |   }
653 |  ],
654 |  "metadata": {
655 |   "kernelspec": {
656 |    "display_name": "feml",
657 |    "language": "python",
658 |    "name": "feml"
659 |   },
660 |   "language_info": {
661 |    "codemirror_mode": {
662 |     "name": "ipython",
663 |     "version": 3
664 |    },
665 |    "file_extension": ".py",
666 |    "mimetype": "text/x-python",
667 |    "name": "python",
668 |    "nbconvert_exporter": "python",
669 |    "pygments_lexer": "ipython3",
670 |    "version": "3.8.2"
671 |   },
672 |   "toc": {
673 |    "base_numbering": 1,
674 |    "nav_menu": {},
675 |    "number_sections": true,
676 |    "sideBar": true,
677 |    "skip_h1_title": false,
678 |    "title_cell": "Table of Contents",
679 |    "title_sidebar": "Contents",
680 |    "toc_cell": false,
681 |    "toc_position": {},
682 |    "toc_section_display": "block",
683 |    "toc_window_display": true
684 |   }
685 |  },
686 |  "nbformat": 4,
687 |  "nbformat_minor": 2
688 | }
689 | 


--------------------------------------------------------------------------------
/Section-09-Ingenieria-valores-extremos/09.05-Truncamiento-valores-arbitrarios.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Tratamiento de valores atípicos\n",
  8 |     "\n",
  9 |     "Un valor atípico o valor extremo (outlier) es un valor el cual es significativamente diferente del resto de los datos. “Un outlier es una observación la cual se desvía tanto del resto de las observaciones que levanta sospechas sobre el mecanismo que lo generó” [D. Hawkins. Identification of Outliers, Chapman and Hall, 1980].\n",
 10 |     "\n",
 11 |     "Valores estadísticos como la media y la varianza son susceptibles a los valores extremos. Además, **algunos modelos de Machine Learning son susceptibles a los outliers**, lo cual disminuye su desempeño. Por lo tanto, dependiendo de cuál algoritmo deseas usar para entrenar un modelo, es muy común que sea necesario remover los valores atípicos de las variables.\n",
 12 |     "\n",
 13 |     "Discutimos en la sección 3, cómo identificar los outliers. En esta sección vamos a discutir cómo podemos procesarlos para entrenar nuestros modelos de machine learning o aprendizaje automático. \n",
 14 |     "\n",
 15 |     "Es importante resaltar que con cada modificación que hacemos en nuestros datos, introducimos algún sesgo. Por eso es muy importante saber cuáles son las implicaciones de cada método. Si es una buena decisión o no dependerá de la naturaleza de los datos que estemos analizando.\n",
 16 |     "\n",
 17 |     "\n",
 18 |     "## ¿Cómo podemos pre-procesar los valores extremos?\n",
 19 |     "\n",
 20 |     "- Removerlos: eliminar los valores extremos de nuestro conjunto de datos\n",
 21 |     "- Tratar los outliers como datos faltantes y proceder con cualquiera de las técnicas de sustitución\n",
 22 |     "- Discretización: los datos son discretizados (ver sección 8) y los valores atípicos son colocados en los segmentos extremos junto con los valores más bajos y altos del conjunto de datos\n",
 23 |     "- Truncamiento de valores: Limitar la distribución de la variable a unos valores máximos y mínimos. También se le conoce como codificación Top / Bottom \n",
 24 |     "\n",
 25 |     "\n",
 26 |     "**El truncamiento de valores** se conoce en inglés como capping, trimming, censoring o winsorization.\n",
 27 |     "\n",
 28 |     "\n",
 29 |     "## Truncamiento de outliers.\n",
 30 |     "\n",
 31 |     "**Truncar**, significa limitar los valores máximos y/o mínimos de una distribución a un valor arbitrario. En otras palabras, los valores más grandes o más pequeños que los que arbitrariamente se han determinado, son truncados.\n",
 32 |     "\n",
 33 |     "Truncar puede hacerse en ambos extremos de la distribución, o solo en un extremo, dependiendo de la variable y el caso de uso.\n",
 34 |     "\n",
 35 |     "Puedes ver la charla de Soledad en pydata Londres [pydata](https://www.youtube.com/watch?v=KHGGlozsRtA), donde ella presenta un ejemplo de truncamiento de los valores extremos en una compañía financiera.\n",
 36 |     "\n",
 37 |     "Los números en los cuales se debe truncar la distribución pueden ser determinados: \n",
 38 |     "\n",
 39 |     "- arbitrariamente\n",
 40 |     "- usando la regla de proximidad del rango inter-cuartil \n",
 41 |     "- usando la aproximación gaussiana \n",
 42 |     "- usando los cuartiles\n",
 43 |     "\n",
 44 |     "### Ventajas\n",
 45 |     "\n",
 46 |     "- no remueve las observaciones\n",
 47 |     "\n",
 48 |     "### Limitaciones\n",
 49 |     "\n",
 50 |     "- distorsiona la distribución de las variables \n",
 51 |     "- distorsiona la relación entre las variables\n",
 52 |     "\n",
 53 |     "## En este Demo\n",
 54 |     "\n",
 55 |     "Vas a aprender cómo truncar los valores extremos en las variables utilizando valores arbitrarios y el conjunto de datos del Titanic.\n",
 56 |     "\n",
 57 |     "## Importante\n",
 58 |     "\n",
 59 |     "Cuando truncamos nuestros datos, tendemos a limitar los valores en el set de entrenamiento y en el set de prueba. Es importante recordar que los valores de truncamiento DEBEN SER derivados del set de entrenamiento. Y luego esos mismos valores se usan para truncar las variables en el set de prueba.\n",
 60 |     "\n",
 61 |     "Para simplificar el demo, no lo haremos, pero por favor, ten eso en cuenta cuando construyas tus pipelines de machine learning."
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 1,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "import pandas as pd\n",
 71 |     "import numpy as np\n",
 72 |     "\n",
 73 |     "import matplotlib.pyplot as plt\n",
 74 |     "\n",
 75 |     "from feature_engine import imputation  as msi\n",
 76 |     "from feature_engine import outliers as outr"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 2,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "# función para cargar los datos del titanic \n",
 86 |     "\n",
 87 |     "def load_titanic():\n",
 88 |     "    data = pd.read_csv('../titanic.csv')\n",
 89 |     "    data['cabin'] = data['cabin'].astype(str).str[0]\n",
 90 |     "    data['pclass'] = data['pclass'].astype('O')\n",
 91 |     "    data['embarked'].fillna('C', inplace=True)\n",
 92 |     "    return data"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 3,
 98 |    "metadata": {},
 99 |    "outputs": [
100 |     {
101 |      "data": {
102 |       "text/html": [
103 |        "<div>\n",
104 |        "<style scoped>\n",
105 |        "    .dataframe tbody tr th:only-of-type {\n",
106 |        "        vertical-align: middle;\n",
107 |        "    }\n",
108 |        "\n",
109 |        "    .dataframe tbody tr th {\n",
110 |        "        vertical-align: top;\n",
111 |        "    }\n",
112 |        "\n",
113 |        "    .dataframe thead th {\n",
114 |        "        text-align: right;\n",
115 |        "    }\n",
116 |        "</style>\n",
117 |        "<table border=\"1\" class=\"dataframe\">\n",
118 |        "  <thead>\n",
119 |        "    <tr style=\"text-align: right;\">\n",
120 |        "      <th></th>\n",
121 |        "      <th>pclass</th>\n",
122 |        "      <th>survived</th>\n",
123 |        "      <th>name</th>\n",
124 |        "      <th>sex</th>\n",
125 |        "      <th>age</th>\n",
126 |        "      <th>sibsp</th>\n",
127 |        "      <th>parch</th>\n",
128 |        "      <th>ticket</th>\n",
129 |        "      <th>fare</th>\n",
130 |        "      <th>cabin</th>\n",
131 |        "      <th>embarked</th>\n",
132 |        "      <th>boat</th>\n",
133 |        "      <th>body</th>\n",
134 |        "      <th>home.dest</th>\n",
135 |        "    </tr>\n",
136 |        "  </thead>\n",
137 |        "  <tbody>\n",
138 |        "    <tr>\n",
139 |        "      <th>0</th>\n",
140 |        "      <td>1</td>\n",
141 |        "      <td>1</td>\n",
142 |        "      <td>Allen, Miss. Elisabeth Walton</td>\n",
143 |        "      <td>female</td>\n",
144 |        "      <td>29.0000</td>\n",
145 |        "      <td>0</td>\n",
146 |        "      <td>0</td>\n",
147 |        "      <td>24160</td>\n",
148 |        "      <td>211.3375</td>\n",
149 |        "      <td>B</td>\n",
150 |        "      <td>S</td>\n",
151 |        "      <td>2</td>\n",
152 |        "      <td>NaN</td>\n",
153 |        "      <td>St Louis, MO</td>\n",
154 |        "    </tr>\n",
155 |        "    <tr>\n",
156 |        "      <th>1</th>\n",
157 |        "      <td>1</td>\n",
158 |        "      <td>1</td>\n",
159 |        "      <td>Allison, Master. Hudson Trevor</td>\n",
160 |        "      <td>male</td>\n",
161 |        "      <td>0.9167</td>\n",
162 |        "      <td>1</td>\n",
163 |        "      <td>2</td>\n",
164 |        "      <td>113781</td>\n",
165 |        "      <td>151.5500</td>\n",
166 |        "      <td>C</td>\n",
167 |        "      <td>S</td>\n",
168 |        "      <td>11</td>\n",
169 |        "      <td>NaN</td>\n",
170 |        "      <td>Montreal, PQ / Chesterville, ON</td>\n",
171 |        "    </tr>\n",
172 |        "    <tr>\n",
173 |        "      <th>2</th>\n",
174 |        "      <td>1</td>\n",
175 |        "      <td>0</td>\n",
176 |        "      <td>Allison, Miss. Helen Loraine</td>\n",
177 |        "      <td>female</td>\n",
178 |        "      <td>2.0000</td>\n",
179 |        "      <td>1</td>\n",
180 |        "      <td>2</td>\n",
181 |        "      <td>113781</td>\n",
182 |        "      <td>151.5500</td>\n",
183 |        "      <td>C</td>\n",
184 |        "      <td>S</td>\n",
185 |        "      <td>NaN</td>\n",
186 |        "      <td>NaN</td>\n",
187 |        "      <td>Montreal, PQ / Chesterville, ON</td>\n",
188 |        "    </tr>\n",
189 |        "    <tr>\n",
190 |        "      <th>3</th>\n",
191 |        "      <td>1</td>\n",
192 |        "      <td>0</td>\n",
193 |        "      <td>Allison, Mr. Hudson Joshua Creighton</td>\n",
194 |        "      <td>male</td>\n",
195 |        "      <td>30.0000</td>\n",
196 |        "      <td>1</td>\n",
197 |        "      <td>2</td>\n",
198 |        "      <td>113781</td>\n",
199 |        "      <td>151.5500</td>\n",
200 |        "      <td>C</td>\n",
201 |        "      <td>S</td>\n",
202 |        "      <td>NaN</td>\n",
203 |        "      <td>135.0</td>\n",
204 |        "      <td>Montreal, PQ / Chesterville, ON</td>\n",
205 |        "    </tr>\n",
206 |        "    <tr>\n",
207 |        "      <th>4</th>\n",
208 |        "      <td>1</td>\n",
209 |        "      <td>0</td>\n",
210 |        "      <td>Allison, Mrs. Hudson J C (Bessie Waldo Daniels)</td>\n",
211 |        "      <td>female</td>\n",
212 |        "      <td>25.0000</td>\n",
213 |        "      <td>1</td>\n",
214 |        "      <td>2</td>\n",
215 |        "      <td>113781</td>\n",
216 |        "      <td>151.5500</td>\n",
217 |        "      <td>C</td>\n",
218 |        "      <td>S</td>\n",
219 |        "      <td>NaN</td>\n",
220 |        "      <td>NaN</td>\n",
221 |        "      <td>Montreal, PQ / Chesterville, ON</td>\n",
222 |        "    </tr>\n",
223 |        "  </tbody>\n",
224 |        "</table>\n",
225 |        "</div>"
226 |       ],
227 |       "text/plain": [
228 |        "  pclass  survived                                             name     sex  \\\n",
229 |        "0      1         1                    Allen, Miss. Elisabeth Walton  female   \n",
230 |        "1      1         1                   Allison, Master. Hudson Trevor    male   \n",
231 |        "2      1         0                     Allison, Miss. Helen Loraine  female   \n",
232 |        "3      1         0             Allison, Mr. Hudson Joshua Creighton    male   \n",
233 |        "4      1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   \n",
234 |        "\n",
235 |        "       age  sibsp  parch  ticket      fare cabin embarked boat   body  \\\n",
236 |        "0  29.0000      0      0   24160  211.3375     B        S    2    NaN   \n",
237 |        "1   0.9167      1      2  113781  151.5500     C        S   11    NaN   \n",
238 |        "2   2.0000      1      2  113781  151.5500     C        S  NaN    NaN   \n",
239 |        "3  30.0000      1      2  113781  151.5500     C        S  NaN  135.0   \n",
240 |        "4  25.0000      1      2  113781  151.5500     C        S  NaN    NaN   \n",
241 |        "\n",
242 |        "                         home.dest  \n",
243 |        "0                     St Louis, MO  \n",
244 |        "1  Montreal, PQ / Chesterville, ON  \n",
245 |        "2  Montreal, PQ / Chesterville, ON  \n",
246 |        "3  Montreal, PQ / Chesterville, ON  \n",
247 |        "4  Montreal, PQ / Chesterville, ON  "
248 |       ]
249 |      },
250 |      "execution_count": 3,
251 |      "metadata": {},
252 |      "output_type": "execute_result"
253 |     }
254 |    ],
255 |    "source": [
256 |     "data = load_titanic()\n",
257 |     "data.head()"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "markdown",
262 |    "metadata": {},
263 |    "source": [
264 |     "## Truncador de outliers con valores arbitrarios con Feature-engine\n",
265 |     "\n",
266 |     "Los límites para truncar los valores extremos son determinados por el usuario. "
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "markdown",
271 |    "metadata": {},
272 |    "source": [
273 |     "### Truncando el extremo superior"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": 4,
279 |    "metadata": {},
280 |    "outputs": [
281 |     {
282 |      "data": {
283 |       "text/plain": [
284 |        "(80.0, 512.3292)"
285 |       ]
286 |      },
287 |      "execution_count": 4,
288 |      "metadata": {},
289 |      "output_type": "execute_result"
290 |     }
291 |    ],
292 |    "source": [
293 |     "# encontremos cuál es el valor máximo de las variables Age y \n",
294 |     "# Fare en los datos del Titanic\n",
295 |     "\n",
296 |     "data.age.max(), data.fare.max()"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": 5,
302 |    "metadata": {},
303 |    "outputs": [
304 |     {
305 |      "data": {
306 |       "text/plain": [
307 |        "ArbitraryOutlierCapper(max_capping_dict={'age': 50, 'fare': 200},\n",
308 |        "                       missing_values='ignore')"
309 |       ]
310 |      },
311 |      "execution_count": 5,
312 |      "metadata": {},
313 |      "output_type": "execute_result"
314 |     }
315 |    ],
316 |    "source": [
317 |     "# inicialicemos el ArbitraryOutlierCapper de feature-engine\n",
318 |     "capper = outr.ArbitraryOutlierCapper(\n",
319 |     "    max_capping_dict = {'age':50, 'fare':200},\n",
320 |     "    min_capping_dict = None,\n",
321 |     "    missing_values='ignore')\n",
322 |     "\n",
323 |     "capper.fit(data)"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 6,
329 |    "metadata": {},
330 |    "outputs": [
331 |     {
332 |      "data": {
333 |       "text/plain": [
334 |        "{'age': 50, 'fare': 200}"
335 |       ]
336 |      },
337 |      "execution_count": 6,
338 |      "metadata": {},
339 |      "output_type": "execute_result"
340 |     }
341 |    ],
342 |    "source": [
343 |     "capper.right_tail_caps_"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": 7,
349 |    "metadata": {},
350 |    "outputs": [
351 |     {
352 |      "data": {
353 |       "text/plain": [
354 |        "{}"
355 |       ]
356 |      },
357 |      "execution_count": 7,
358 |      "metadata": {},
359 |      "output_type": "execute_result"
360 |     }
361 |    ],
362 |    "source": [
363 |     "capper.left_tail_caps_"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 8,
369 |    "metadata": {},
370 |    "outputs": [
371 |     {
372 |      "data": {
373 |       "text/plain": [
374 |        "(50.0, 200.0)"
375 |       ]
376 |      },
377 |      "execution_count": 8,
378 |      "metadata": {},
379 |      "output_type": "execute_result"
380 |     }
381 |    ],
382 |    "source": [
383 |     "temp = capper.transform(data)\n",
384 |     "\n",
385 |     "temp.age.max(), temp.fare.max()"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "markdown",
390 |    "metadata": {},
391 |    "source": [
392 |     "### Truncando el extremo inferior"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 9,
398 |    "metadata": {},
399 |    "outputs": [
400 |     {
401 |      "data": {
402 |       "text/plain": [
403 |        "ArbitraryOutlierCapper(min_capping_dict={'age': 10, 'fare': 100},\n",
404 |        "                       missing_values='ignore')"
405 |       ]
406 |      },
407 |      "execution_count": 9,
408 |      "metadata": {},
409 |      "output_type": "execute_result"
410 |     }
411 |    ],
412 |    "source": [
413 |     "capper = outr.ArbitraryOutlierCapper(\n",
414 |     "    max_capping_dict=None,\n",
415 |     "    min_capping_dict={'age': 10,'fare': 100},\n",
416 |     "    missing_values='ignore')\n",
417 |     "\n",
418 |     "capper.fit(data)"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": 10,
424 |    "metadata": {},
425 |    "outputs": [
426 |     {
427 |      "data": {
428 |       "text/plain": [
429 |        "['age', 'fare']"
430 |       ]
431 |      },
432 |      "execution_count": 10,
433 |      "metadata": {},
434 |      "output_type": "execute_result"
435 |     }
436 |    ],
437 |    "source": [
438 |     "capper.variables"
439 |    ]
440 |   },
441 |   {
442 |    "cell_type": "code",
443 |    "execution_count": 11,
444 |    "metadata": {},
445 |    "outputs": [
446 |     {
447 |      "data": {
448 |       "text/plain": [
449 |        "{}"
450 |       ]
451 |      },
452 |      "execution_count": 11,
453 |      "metadata": {},
454 |      "output_type": "execute_result"
455 |     }
456 |    ],
457 |    "source": [
458 |     "capper.right_tail_caps_"
459 |    ]
460 |   },
461 |   {
462 |    "cell_type": "code",
463 |    "execution_count": 12,
464 |    "metadata": {},
465 |    "outputs": [
466 |     {
467 |      "data": {
468 |       "text/plain": [
469 |        "{'age': 10, 'fare': 100}"
470 |       ]
471 |      },
472 |      "execution_count": 12,
473 |      "metadata": {},
474 |      "output_type": "execute_result"
475 |     }
476 |    ],
477 |    "source": [
478 |     "capper.left_tail_caps_"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": 13,
484 |    "metadata": {},
485 |    "outputs": [
486 |     {
487 |      "data": {
488 |       "text/plain": [
489 |        "(10.0, 100.0)"
490 |       ]
491 |      },
492 |      "execution_count": 13,
493 |      "metadata": {},
494 |      "output_type": "execute_result"
495 |     }
496 |    ],
497 |    "source": [
498 |     "temp = capper.transform(data)\n",
499 |     "\n",
500 |     "temp.age.min(), temp.fare.min()"
501 |    ]
502 |   },
503 |   {
504 |    "cell_type": "markdown",
505 |    "metadata": {},
506 |    "source": [
507 |     "###  Truncando ambos extremos "
508 |    ]
509 |   },
510 |   {
511 |    "cell_type": "code",
512 |    "execution_count": 14,
513 |    "metadata": {},
514 |    "outputs": [
515 |     {
516 |      "data": {
517 |       "text/plain": [
518 |        "ArbitraryOutlierCapper(max_capping_dict={'age': 50, 'fare': 200},\n",
519 |        "                       min_capping_dict={'age': 10, 'fare': 100},\n",
520 |        "                       missing_values='ignore')"
521 |       ]
522 |      },
523 |      "execution_count": 14,
524 |      "metadata": {},
525 |      "output_type": "execute_result"
526 |     }
527 |    ],
528 |    "source": [
529 |     "capper = outr.ArbitraryOutlierCapper(max_capping_dict={\n",
530 |     "                                     'age': 50, 'fare': 200},\n",
531 |     "                                     min_capping_dict={\n",
532 |     "                                     'age': 10, 'fare': 100},\n",
533 |     "                                     missing_values='ignore')\n",
534 |     "capper.fit(data)"
535 |    ]
536 |   },
537 |   {
538 |    "cell_type": "code",
539 |    "execution_count": 15,
540 |    "metadata": {},
541 |    "outputs": [
542 |     {
543 |      "data": {
544 |       "text/plain": [
545 |        "{'age': 50, 'fare': 200}"
546 |       ]
547 |      },
548 |      "execution_count": 15,
549 |      "metadata": {},
550 |      "output_type": "execute_result"
551 |     }
552 |    ],
553 |    "source": [
554 |     "capper.right_tail_caps_"
555 |    ]
556 |   },
557 |   {
558 |    "cell_type": "code",
559 |    "execution_count": 16,
560 |    "metadata": {},
561 |    "outputs": [
562 |     {
563 |      "data": {
564 |       "text/plain": [
565 |        "{'age': 10, 'fare': 100}"
566 |       ]
567 |      },
568 |      "execution_count": 16,
569 |      "metadata": {},
570 |      "output_type": "execute_result"
571 |     }
572 |    ],
573 |    "source": [
574 |     "capper.left_tail_caps_"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "code",
579 |    "execution_count": 17,
580 |    "metadata": {},
581 |    "outputs": [
582 |     {
583 |      "data": {
584 |       "text/plain": [
585 |        "(10.0, 100.0)"
586 |       ]
587 |      },
588 |      "execution_count": 17,
589 |      "metadata": {},
590 |      "output_type": "execute_result"
591 |     }
592 |    ],
593 |    "source": [
594 |     "temp = capper.transform(data)\n",
595 |     "\n",
596 |     "temp.age.min(), temp.fare.min()"
597 |    ]
598 |   },
599 |   {
600 |    "cell_type": "code",
601 |    "execution_count": 18,
602 |    "metadata": {},
603 |    "outputs": [
604 |     {
605 |      "data": {
606 |       "text/plain": [
607 |        "(50.0, 200.0)"
608 |       ]
609 |      },
610 |      "execution_count": 18,
611 |      "metadata": {},
612 |      "output_type": "execute_result"
613 |     }
614 |    ],
615 |    "source": [
616 |     "temp.age.max(), temp.fare.max()"
617 |    ]
618 |   },
619 |   {
620 |    "cell_type": "code",
621 |    "execution_count": null,
622 |    "metadata": {},
623 |    "outputs": [],
624 |    "source": []
625 |   }
626 |  ],
627 |  "metadata": {
628 |   "kernelspec": {
629 |    "display_name": "feml",
630 |    "language": "python",
631 |    "name": "feml"
632 |   },
633 |   "language_info": {
634 |    "codemirror_mode": {
635 |     "name": "ipython",
636 |     "version": 3
637 |    },
638 |    "file_extension": ".py",
639 |    "mimetype": "text/x-python",
640 |    "name": "python",
641 |    "nbconvert_exporter": "python",
642 |    "pygments_lexer": "ipython3",
643 |    "version": "3.8.2"
644 |   },
645 |   "toc": {
646 |    "base_numbering": 1,
647 |    "nav_menu": {},
648 |    "number_sections": true,
649 |    "sideBar": true,
650 |    "skip_h1_title": false,
651 |    "title_cell": "Table of Contents",
652 |    "title_sidebar": "Contents",
653 |    "toc_cell": false,
654 |    "toc_position": {
655 |     "height": "803px",
656 |     "left": "0px",
657 |     "right": "1681px",
658 |     "top": "107px",
659 |     "width": "239px"
660 |    },
661 |    "toc_section_display": "block",
662 |    "toc_window_display": true
663 |   }
664 |  },
665 |  "nbformat": 4,
666 |  "nbformat_minor": 2
667 | }
668 | 


--------------------------------------------------------------------------------
/Seccion-02-Tipos-de-Variables/02.4_VariablesMixtas.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Variables Mixtas\n",
  8 |     "\n",
  9 |     "Las variables mixtas son aquellas que contienen valores numéricos y categorías.\n",
 10 |     "\n",
 11 |     "Las variables pueden ser mixtas por una multitud de razones. Por ejemplo, cuando entidades financieras recogen y guardan información acerca de usuarios, generalmente utilizan números. Sin embargo, a veces sucede que dichas entidades no pueden recuperar la información de un cliente. En estos casos, las agencias codifican cada razón con un código diferente. Esto genera estas variables combinadas que contienen códigos alfanuméricos cuando el valor no pudo ser recuperado y números para los valores usuales.\n",
 12 |     "\n",
 13 |     "Un ejemplo es el número de cuentas abiertas 'number_of_open_accounts', que en principio puede ser cualquier número que represente el número de productos financieros que tenga el prestamista. Cuando la información no está disponible, cada razón se codifica con una letra diferente, por ejemplo: 'A': no se pudo identificar a la persona, 'B': datos no relevantes, 'C': la persona parece no tener ninguna cuenta abierta.\n",
 14 |     "\n",
 15 |     "Otro ejemplo de variables mixtas, es la variable missed_payment_status, que indica si el prestamista está retrasado con uno o varios pagos. Por ejemplo, si el prestamista tiene una tarjeta de crédito, esta variable indicaría si está retrasado con sus pagos mensuales. Valores 0, 1, 2, 3 significan que el cliente no ha pagado 0-3 cuotas del préstamo. El valor 'D', se usa cuando el cliente incumple con el número máximo de pagos retrasados permitidos (D por la palabra en inglés 'defaulted'), que típicamente son tres pagos.\n",
 16 |     "\n",
 17 |     "Para este demo, necesitan descargar el archivo CSV llamado sample_s2.csv que se encuentra en el folder con los Jupyter Notebooks de la clase **\"Jupyter Notebooks\"** en la **Sección 1**. También se encuentra disponible para descargar junto al video correspondiente a este Notebook.\n"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 1,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "import pandas as pd\n",
 27 |     "\n",
 28 |     "import matplotlib.pyplot as plt"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 2,
 34 |    "metadata": {},
 35 |    "outputs": [
 36 |     {
 37 |      "data": {
 38 |       "text/html": [
 39 |        "<div>\n",
 40 |        "<style scoped>\n",
 41 |        "    .dataframe tbody tr th:only-of-type {\n",
 42 |        "        vertical-align: middle;\n",
 43 |        "    }\n",
 44 |        "\n",
 45 |        "    .dataframe tbody tr th {\n",
 46 |        "        vertical-align: top;\n",
 47 |        "    }\n",
 48 |        "\n",
 49 |        "    .dataframe thead th {\n",
 50 |        "        text-align: right;\n",
 51 |        "    }\n",
 52 |        "</style>\n",
 53 |        "<table border=\"1\" class=\"dataframe\">\n",
 54 |        "  <thead>\n",
 55 |        "    <tr style=\"text-align: right;\">\n",
 56 |        "      <th></th>\n",
 57 |        "      <th>id</th>\n",
 58 |        "      <th>open_il_24m</th>\n",
 59 |        "    </tr>\n",
 60 |        "  </thead>\n",
 61 |        "  <tbody>\n",
 62 |        "    <tr>\n",
 63 |        "      <th>0</th>\n",
 64 |        "      <td>1077501</td>\n",
 65 |        "      <td>C</td>\n",
 66 |        "    </tr>\n",
 67 |        "    <tr>\n",
 68 |        "      <th>1</th>\n",
 69 |        "      <td>1077430</td>\n",
 70 |        "      <td>A</td>\n",
 71 |        "    </tr>\n",
 72 |        "    <tr>\n",
 73 |        "      <th>2</th>\n",
 74 |        "      <td>1077175</td>\n",
 75 |        "      <td>A</td>\n",
 76 |        "    </tr>\n",
 77 |        "    <tr>\n",
 78 |        "      <th>3</th>\n",
 79 |        "      <td>1076863</td>\n",
 80 |        "      <td>A</td>\n",
 81 |        "    </tr>\n",
 82 |        "    <tr>\n",
 83 |        "      <th>4</th>\n",
 84 |        "      <td>1075358</td>\n",
 85 |        "      <td>A</td>\n",
 86 |        "    </tr>\n",
 87 |        "  </tbody>\n",
 88 |        "</table>\n",
 89 |        "</div>"
 90 |       ],
 91 |       "text/plain": [
 92 |        "        id open_il_24m\n",
 93 |        "0  1077501           C\n",
 94 |        "1  1077430           A\n",
 95 |        "2  1077175           A\n",
 96 |        "3  1076863           A\n",
 97 |        "4  1075358           A"
 98 |       ]
 99 |      },
100 |      "execution_count": 2,
101 |      "metadata": {},
102 |      "output_type": "execute_result"
103 |     }
104 |    ],
105 |    "source": [
106 |     "# open_il_24m indica:\n",
107 |     "# \"Número de líneas de crédito a término fijo abiertas en los últimos 24 meses\".\n",
108 |     "# Estas líneas de crédito a término fijo tienen un número de cuotas fijas por un valor \n",
109 |     "# predeterminado entre el prestamista y la entidad financiera.\n",
110 |     "# Ejemplos son los préstamos para vehículos o préstamos para estudiantes.\n",
111 |     "\n",
112 |     "data = pd.read_csv('../sample_s2.csv')\n",
113 |     "\n",
114 |     "data.head()"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 3,
120 |    "metadata": {},
121 |    "outputs": [
122 |     {
123 |      "data": {
124 |       "text/plain": [
125 |        "(887379, 2)"
126 |       ]
127 |      },
128 |      "execution_count": 3,
129 |      "metadata": {},
130 |      "output_type": "execute_result"
131 |     }
132 |    ],
133 |    "source": [
134 |     "data.shape"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 4,
140 |    "metadata": {},
141 |    "outputs": [
142 |     {
143 |      "data": {
144 |       "text/plain": [
145 |        "array(['C', 'A', 'B', '0.0', '1.0', '2.0', '4.0', '3.0', '6.0', '5.0',\n",
146 |        "       '9.0', '7.0', '8.0', '13.0', '10.0', '19.0', '11.0', '12.0',\n",
147 |        "       '14.0', '15.0'], dtype=object)"
148 |       ]
149 |      },
150 |      "execution_count": 4,
151 |      "metadata": {},
152 |      "output_type": "execute_result"
153 |     }
154 |    ],
155 |    "source": [
156 |     "# Adicionalmente, la variable toma los siguientes códigos:\n",
157 |     "# 'A': no se pudo identificar a la persona \n",
158 |     "# 'B': no hay datos relevantes\n",
159 |     "# 'C': la persona parece no tener ninguna cuenta abierta\n",
160 |     " \n",
161 |     "data.open_il_24m.unique()"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 5,
167 |    "metadata": {},
168 |    "outputs": [
169 |     {
170 |      "data": {
171 |       "text/plain": [
172 |        "Text(0, 0.5, 'Número de prestamistas')"
173 |       ]
174 |      },
175 |      "execution_count": 5,
176 |      "metadata": {},
177 |      "output_type": "execute_result"
178 |     },
179 |     {
180 |      "data": {
181 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZgAAAEVCAYAAADdFfNTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de7xVdZ3/8dcb8IIXEBDxgnq80DRqasmgTc0vLw1SXmuswXIkI5nMRmeaS1j9htI0mxm18Vc4Ud7NC1km6Zgips00XsD7BR1QUQhSFFQyNcHP74/vd+tis8/e68BZh30O7+fjsR577e9a38/+7M1mf85a33VRRGBmZtbd+q3vBMzMrG9ygTEzs0q4wJiZWSVcYMzMrBIuMGZmVgkXGDMzq4QLjK0zSVtKmidpn/WdSxmSOiSFpAHrO5e+QlI/STMkndjNcR+VdGB3xsxxR0j6laQVks6R9BVJP8zLdpL0O0n9u/t1NzT+D2YNSVoADAR2jYhXc9vngOMi4sC61c8G/i0iHuzRJDcQkj4DfC4iPrie8/g6sHtEHNdg8ZnArIj4QXe+ZkTs2Z3xCiYBLwCDou5kwIh4FtiiotfdoHgLxpoZAJzabAVJA4FHIuL7PZMSeMuj/UTEaRHx783WabN/t52Bx+qLi3WziPDkaY0JWABMBpYBW+W2zwG35/kOIIABhT63k/7SBvgM8GvgPOAl4CngT3P7QuB5YEKh7ybAvwHPAs8B/wEMzMsOBBYBXwZ+C1ye1/8OsDhP3wE26eS99M+xX8h5nFzMHRgMXAgsAX4DfBPo3yTWV4AngRXAvcCOrT6P/PyzwFxgOXAzsHNhWQCfB+bl5d8DBPwx8DqwCvgd8FJe/zDgfuCV/Hl+vRBrU+AK4MX82c8GRnTyfiYX3stjwMc6WW8c8AfgzZzHg60+u7rvwLK87BJgKnBTjvNrYNv877cceBx4b9338MN5/uvAdOCynO+jwOjCun+cP/OX8rIjO3kvl+T38Yecw4dz7CsafbeB7YEZ+T3MB05c3/8/e8vkLRhrZg7pP+w/rGX//YGHgGHAlcDVwJ8AuwPHAd+VVNsV8W3gXcC+efkOwD8XYm0LDCX95TkJ+CpwQF5/H2AM8LVO8jgROBx4LzAaOKZu+aXAyvy67wXGkoppI18CjgU+CgwiFY3fd/oJZJKOJhWmjwPDgf8Crqpb7XDS57MP8Eng0IiYSyo8d0bEFhGxVV73VeB4YCtSsTkpvwbABNIP/46kz/7zwGudpPYk8Gd5/W8AV0jarn6liPgFcBZwTc6jNt7W6rPbn1TUtyHtRiO/t68BWwNvAHcC9+Xn1wLndpIrwJGk79FWpB/97wJI2gj4OXBLfq2/AX4k6Y8avJfPAD8C/iW/l1ubvB6kf6dFpEJzDHCWpENa9DHwFoynxhP5L0dgL+Bl0o9iV7dg5hWWvSevP6LQ9iKpQIj0g7lbYdn7gafz/IGkvzY3LSx/Evho4fmhwIJO3sttwOcLz8fWcgdGkH7kBhaWHwv8spNYTwBHNWhv9XncBEwsLOtHKkw75+cBfLCwfDowufBZ/neLf6/vAOfl+c8C/wPsvRb/7g80en952dfJf+Xn500/u5z3s3UxLgF+UHj+N8Dcuu/JS/Xfw8Lr31pYtgfwWp7/M9LWbb/C8qsobNk1yOObjd5b8d+SVKRXAVsW1v0WcElP/n/srVM77RO1NhQRj0i6gbQrZW4Xuz9XmH8tx6tv24JUvDYD7pVUWybS7qiapRHxeuH59sAzhefP5LZGtiftRiquW7MzsBGwpPDa/erWL9qRVNy6amfg3yWdU2gTaUutls9vC8t+T5OBZkn7kw6u2AvYmLTL8Md58eU5z6slbUXaXfbViHizQZzjSVtlHblpC9KWRNn31Oqza/Q51n8HGn0nOlP/GW2ax3a2BxZGxFuF5c+QPt91sT2wLCJW1M
UdvY5xNwjeRWZlTCHtZir+Z301P25WaNt2LeO/QPph2TMitsrT4Igo/tDUD8YuJv3A1eyU2xpZQvrBLa5bs5D0V/jWhdceFJ0fvbQQ2K1Be6vPYyHw14XX2CoiBkbE/3TyOkWNBqKvJO0i2jEiBpPGrAQQEW9GxDciYg/SuNfhpN1pq5G0M/AD4IvAsEi73x6pxSmRR5nPrqcG0RcDO0oq/qbtRBoXWte4QyVt2c1xNwguMNZSRMwHrgFOKbQtJf0nO05Sf0mfpfEPb5n4b5F+6M6TtA2ApB0kHdqk21XA1yQNl7Q1abzmik7WnQ6cImmkpCGkrbHaay8h7bc/R9KgfD7HbpI+1EmsHwJnSBqlZG9Jw0p8Hv8BnCZpz/z+Bkv6RPNP5m3PASMlbVxo25L0l/XrksYAn6otkHSQpPfk8zheIQ1or2oQd3NSAVia+51A2iJqlkdH7Ud8LT67Kt1NKvL/JGmjfO7MEaTxmrUWEQtJuxu/JWlTSXsDE0ljONaCC4yVdTrpB6noROAfSWMpe5L+I66tL5OO0LlL0ivArcAaA7QF3yQdhPAQ8DBpkPibnaz7A9JRWw/m9X5at/x40m6mx0hHMl0LrDHQnZ1LKli3kH68LySdLwRNPo+IuI50IMPV+f09Anykyfsruo10VNRvJb2Q274AnC5pBam4Ti+sv21+D6+QdmveQYPiGxGPAeeQBtmfI41//LpJHrVdcC9Kui/Pd+Wzq0xE/IF0AMBHSFvEU4HjI+Lxbgh/LGkX4mLgOmBKRMzshrh9nvKglZmZAZJ2JR0uPiD8A7lOvAVjZra6vUhHJLq4rCMXGDOzTNKXgGkUxuls7XkXmZmZVcJbMGZmVgkXGDMzq4TP5C/Yeuuto6OjY32nYWbWq9x7770vRMTw+nYXmIKOjg7mzJmzvtMwM+tVJD3TqN27yMzMrBIuMGZmVgkXGDMzq4QLjJmZVcIFxszMKlF5gZG0QNLDkh6QNCe3DZU0U9K8/DiksP5pkuZLeqJ4uXZJ++U48yWdr3yHI0mbSLomt98tqaPQZ0J+jXmSJlT9Xs3M7B09tQVzUETsGxG1u8BNBmZFxChgVn6OpD2A8aRLnY8DpuZ7WgBcQLoX+6g8jcvtE4HlEbE7cB7pkuhIGkq6Udb+pPu1TykWMjMzq9b62kV2FHBpnr8UOLrQfnVEvBERT5PuDzJG0nbAoIi4M1/h9LK6PrVY1wKH5K2bQ4GZEbEsIpYDM3mnKJmZWcV64kTLAG6RFMD3I2IaMCLfDY+IWFK7iyHplrx3Ffouym1v5vn69lqfhTnWSkkvA8OK7Q36lNIx+caW6yw4+7CuhDQz22D0RIH5QEQszkVkpqRmd5hrdC/waNK+tn3eeUFpEmnXGzvttNMaHczMbO1UvossIhbnx+dJtxsdAzyXd3uRH5/Pqy8Cdix0H0m6TemiPF/fvlofSQOAwcCyJrHq85sWEaMjYvTw4WtcSsfMzNZSpQVG0uaStqzNA2NJ9yKfAdSO6poAXJ/nZwDj85Fhu5AG8+/Ju9NWSDogj68cX9enFusY4LY8TnMzMFbSkDy4Pza3mZlZD6h6F9kI4Lp8RPEA4MqI+IWk2cB0SROBZ4FPAETEo5KmA48BK4GTI2JVjnUScAkwELgpTwAXApdLmk/achmfYy2TdAYwO693ekQsq/LNmpnZOyotMBHxFLBPg/YXgUM66XMmcGaD9jmke2XXt79OLlANll0EXNS1rM3MrDv4TH4zM6uEC4yZmVXCBcbMzCrhAmNmZpVwgTEzs0q4wJiZWSVcYMzMrBIuMGZmVgkXGDMzq4QLjJmZVcIFxszMKuECY2ZmlXCBMTOzSrjAmJlZJVxgzMysEi4wZmZWCRcYMzOrhAuMmZlVwgXGzMwq4QJjZmaVcIExM7NKuMCYmVklXGDMzKwSLjBmZlYJFxgzM6uEC4yZmVXCBcbMzCrhAmNmZpVwgTEzs0q4wJiZWSVcYMzMrBIuMGZmVo
kuFxhJQyTt3YX1+0u6X9IN+flQSTMlzcuPQwrrniZpvqQnJB1aaN9P0sN52fmSlNs3kXRNbr9bUkehz4T8GvMkTejq+zQzs3VTqsBIul3SIElDgQeBiyWdW/I1TgXmFp5PBmZFxChgVn6OpD2A8cCewDhgqqT+uc8FwCRgVJ7G5faJwPKI2B04D/h2jjUUmALsD4wBphQLmZmZVa/sFszgiHgF+DhwcUTsB3y4VSdJI4HDgB8Wmo8CLs3zlwJHF9qvjog3IuJpYD4wRtJ2wKCIuDMiArisrk8t1rXAIXnr5lBgZkQsi4jlwEzeKUpmZtYDyhaYAfmH/pPADV2I/x3gn4C3Cm0jImIJQH7cJrfvACwsrLcot+2Q5+vbV+sTESuBl4FhTWKZmVkPKVtgTgduBuZHxGxJuwLzmnWQdDjwfETcW/I11KAtmrSvbZ/VX1SaJGmOpDlLly4tlaiZmbVWqsBExI8jYu+I+EJ+/lRE/EWLbh8AjpS0ALgaOFjSFcBzeWuI/Ph8Xn8RsGOh/0hgcW4f2aB9tT6SBgCDgWVNYjV6b9MiYnREjB4+fHiLt2RmZmWVHeTfVNLJkqZKuqg2NesTEadFxMiI6CAN3t8WEccBM4DaUV0TgOvz/AxgfD4ybBfSYP49eTfaCkkH5PGV4+v61GIdk18jSFtbY/MRb0OAsbnNzMx6SNldZJcD25IGz+8gbRGsWMvXPBv4c0nzgD/Pz4mIR4HpwGPAL4CTI2JV7nMS6UCB+cCTwE25/UJgmKT5wJfIR6RFxDLgDGB2nk7PbWZm1kOU/uBvsZJ0f0S8V9JDEbG3pI2AmyPi4OpT7DmjR4+OOXPmvP28Y/KNLfssOPuwKlMyM2t7ku6NiNH17WW3YN7Mjy9J2os01tHRTbmZmVkfNKDketPyWMbXSOMeWwD/t7KszMys1ytbYGblExZ/BewKkAfizczMGiq7i+wnDdqu7c5EzMysb2m6BSPp3aRrgw2W9PHCokHAplUmZmZmvVurXWR/BBwObAUcUWhfAZxYVVJmZtb7NS0wEXE9cL2k90fEnT2Uk5mZ9QFlx2A+li/Xv5GkWZJekHRcpZmZmVmvVrbAjM2X6z+cdJ2vdwH/WFlWZmbW65UtMBvlx48CV/myK2Zm1krZ82B+Lulx4DXgC5KGA69Xl5aZmfV2ZS/XPxl4PzA6It4EXiXdTdLMzKyhVufBHBwRtxXPgUlXzH/bT6tKzMzMerdWu8g+BNzG6ufA1AQuMGZm1olW58FMyY8n9Ew6ZmbWV5Qa5Je0FelOkh3FPhFxSjVpmZlZb1f2KLL/BO4CHgbeqi4dMzPrK8oWmE0j4kuVZmJmZn1K2RMtL5d0oqTtJA2tTZVmZmZmvVrZLZg/AP8KfJV09Bj5cdcqkjIzs96vbIH5ErB7RLxQZTJmZtZ3lN1F9ijw+yoTMTOzvqXsFswq4AFJvwTeqDX6MGUzM+tM2QLzszyZmZmVUqrARMSlVSdiZmZ9S9kz+UcB3wL2ADattUeEjyIzM7OGyg7yXwxcAKwEDgIuAy6vKikzM+v9yhaYgRExC1BEPBMRXwcOri4tMzPr7coO8r8uqR8wT9IXgd8A21SXlpmZ9XZlt2D+FtgMOAXYDziOdHVlMzOzhsoWmI6I+F1ELIqIEyLiL4CdqkzMzMx6t7IF5rSSbWZmZkCLMRhJHwE+Cuwg6fzCokGkI8rMzMwaarUFsxiYA7wO3FuYZgCHNusoaVNJ90h6UNKjkr6R24dKmilpXn4cUuhzmqT5kp6QdGihfT9JD+dl50tSbt9E0jW5/W5JHYU+E/JrzJM0oSsfipmZrbumBSYiHsxn8e8eEZfm+RnA/IhY3iL2G8DBEbEPsC8wTtIBwGRgVkSMAmbl50jaAxgP7AmMA6ZK6p9jXQBMAkblaVxunwgsj4jdgfOAb+dYQ4EpwP7AGGBKsZCZmVn1yo7BzJQ0KP9wPwhcLOncZh0i+V1+ulGeAjgKqF165lLg6D
x/FHB1RLwREU8D84ExkrYDBkXEnRERpJM8i31qsa4FDslbN4cCMyNiWS6EM3mnKJmZWQ8oex7M4Ih4RdLngIsjYoqkh1p1ylsg9wK7A9+LiLsljYiIJQARsURS7XyaHYC7Ct0X5bY383x9e63PwhxrpaSXgWHF9gZ96nOcRNo6Yqeduv/AuI7JNzZdvuDsw7r9Nc3M2kHZLZgBeUvik8ANZYNHxKqI2BcYSdoa2avJ6moUokn72vapz3FaRIyOiNHDhw9vkp6ZmXVF2QJzOnAz8GREzJa0KzCv7ItExEvA7aTdVM/lYkV+fD6vtgjYsdBtJOkgg0V5vr59tT6SBgCDgWVNYpmZWQ8pVWAi4scRsXdEnJSfP5VPtuyUpOGStsrzA4EPA4+TDhKoHdU1Abg+z88Axucjw3YhDebfk3enrZB0QB5fOb6uTy3WMcBteZzmZmCspCF5cH9sbjMzsx5S9nL97yIdyTUiIvaStDdwZER8s0m37YBL8zhMP2B6RNwg6U5guqSJwLPAJwAi4lFJ04HHSOfYnBwRq3Ksk4BLgIHATXkCuBC4XNJ80pbL+BxrmaQzgNl5vdMjYlmZ92pmZt2j7CD/D4B/BL4PEBEPSboS6LTARMRDwHsbtL8IHNJJnzOBMxu0zwHWGL+JiNfJBarBsouAizrLz8zMqlV2DGaziLinrs1n8puZWafKFpgXJO1GPhJL0jHAksqyMjOzXq/sLrKTgWnAuyX9Bnga+HRlWZmZWa/XssDkQfqTIuLDkjYH+kXEiupTMzOz3qxlgYmIVZL2y/OvVp+SmZn1BWV3kd0vaQbwY+DtIhMRP60kKzMz6/XKFpihwIvAwYW2AFxgzMysoVIFJiJOqDoRMzPrW0odpixpV0k/l7RU0vOSrs+XczEzM2uo7HkwVwLTSZd/2Z40FnN1VUmZmVnvV7bAKCIuj4iVebqCTi5/b2ZmBuUH+X8paTJpqyWAvwRuzHe4xBeSNDOzemULzF/mx7+ua/8sqeDs2m0ZmZlZn1D2KDIP6JuZWZeUHYMxMzPrEhcYMzOrhAuMmZlVouyJlpJ0nKR/zs93kjSm2tTMzKw3K7sFMxV4P3Bsfr4C+F4lGZmZWZ9Q9jDl/SPifZLuB4iI5ZI2rjAvMzPr5cpuwbyZbzxWu2XycOCtyrIyM7Ner2yBOR+4DthG0pnAfwNnVZaVmZn1emVPtPyRpHuBQwABR0fE3EozMzOzXq1pgaldayx7HriquMzXIDMzs8602oK5lzTuImAnYHme3wp4FvAlZMzMrKGmYzARsUtE7ArcDBwREVtHxDDgcHy7ZDMza6LsIP+fRMR/1p5ExE3Ah6pJyczM+oKy58G8IOlrQO1GY8cBL1aWlZmZ9Xplt2COBYaTDlW+Ls8f27SHmZlt0MoeprwMOLXiXMzMrA/x1ZTNzKwSLjBmZlaJygqMpB0l/VLSXEmPSjo1tw+VNFPSvPw4pNDnNEnzJT0h6dBC+36SHs7Lzpek3L6JpGty+92SOgp9JuTXmCdpQlXv08zMGit7P5iRkq6TtFTSc5J+Imlki24rgb+PiD8GDgBOlrQHMBmYFRGjgFn5OXnZeGBPYBwwNV9gE+ACYBIwKk/jcvtEYHlE7A6cB3w7xxoKTAH2B8YAU4qFzMzMqld2C+ZiYAawHbAD8PPc1qmIWBIR9+X5FcDc3Pco4NK82qXA0Xn+KODqiHgjIp4G5gNjJG0HDIqIOyMigMvq+tRiXQsckrduDgVmRsSyiFgOzOSdomRmZj2gbIEZHhEXR8TKPF1COlS5lLzr6r3A3cCIiFgCqQgB2+TVdgAWFrotym075Pn69tX6RMRK4GVgWJNYZmbWQ8oWmBfyLZP756n0iZaStgB+AvxtRLzSbNUGbdGkfW371Oc3SdIcSXOWLl3aJD0zM+uKsgXms8Angd8CS4BjcltTkjYiFZcfRUTt2mXP5d1e5Mfnc/siYMdC95HA4tw+skH7an0kDQAGA8uaxF
pDREyLiNERMXr48NIbZWZm1kLLApMH2s+KiCMjYnhEbBMRR0fEMy36CbgQmBsR5xYWzQBqR3VNAK4vtI/PR4btQhrMvyfvRlsh6YAc8/i6PrVYxwC35XGam4Gxkobkwf2xuc3MzHpIyzP5I2KVpOGSNo6IP3Qh9geAvwIelvRAbvsKcDYwXdJE0iX/P5Ff51FJ04HHSEegnRwRq3K/k4BLgIHATXmCVMAulzSftOUyPsdaJukMYHZe73Tfu8bMrGeVvdjlAuDXkmYAr9Ya67ZMVhMR/03jsRBId8Zs1OdM4MwG7XOAvRq0v04uUA2WXQRc1Fl+ZmZWrbIFZnGe+gFbVpeOmZn1FWUvdvkNAEmbR8SrrdY3MzMreyb/+yU9RjpZEkn7SJpaaWZmZtarlT1M+Tuks+NfBIiIB4H/U1VSZmbW+5W+2GVELKxrWtVwRTMzM8oP8i+U9KdASNoYOIW8u8zMzKyRslswnwdO5p3rgu2bn5uZmTVU9iiyF4BPV5yLmZn1IaUKTL50y98AHcU+EXFkNWmZmVlvV3YM5meky7L8HHirunTMzKyvKFtgXo+I8yvNxMzM+pSyBebfJU0BbgHeqDXW7lhpZmZWr2yBeQ/pysgH884ussjPzczM1lC2wHwM2LWLl+s3M7MNWNnzYB4EtqoyETMz61vKbsGMAB6XNJvVx2B8mLKZmTVUtsBMqTQLMzPrc8qeyX9H1YmYmVnf0mmBkbRZRPw+z68gHTUGsDGwEfBqRAyqPkUzM+uNmm3BfEbSkIg4MyJWu02ypKOBMdWmZmZmvVmnR5FFxFTgGUnHN1j2M3wOjJmZNdF0DCYirgCQ9PFCcz9gNO/sMjMzM1tD2aPIjijMrwQWAEd1ezZmZtZnlD2K7ISqEzEzs76laYGR9M9NFkdEnNHN+ZiZWR/Ragvm1QZtmwMTgWGAC4yZmTXUapD/nNq8pC2BU4ETgKuBczrrZ2Zm1nIMRtJQ4EvAp4FLgfdFxPKqEzMzs96t1RjMvwIfB6YB74mI3/VIVmZm1uu1ulz/3wPbA18DFkt6JU8rJL1SfXpmZtZbtRqDKXu/GDMzs9W4gJiZWSUqLTCSLpL0vKRHCm1DJc2UNC8/DiksO03SfElPSDq00L6fpIfzsvMlKbdvIuma3H63pI5Cnwn5NeZJmlDl+zQzszVVvQVzCTCurm0yMCsiRgGz8nMk7QGMB/bMfaZK6p/7XABMAkblqRZzIrA8InYHzgO+nWMNJd0kbX/SVZ+nFAuZmZlVr9ICExG/ApbVNR9FOtyZ/Hh0of3qiHgjIp4G5gNjJG0HDIqIOyMigMvq+tRiXQsckrduDgVmRsSyfEj1TNYsdGZmVqH1MQYzIiKWAOTHbXL7DsDCwnqLctsOeb6+fbU+EbESeJl0hYHOYpmZWQ9pp0F+NWiLJu1r22f1F5UmSZojac7SpUtLJWpmZq2tjwLzXN7tRX58PrcvAnYsrDcSWJzbRzZoX62PpAHAYNIuuc5irSEipkXE6IgYPXz48HV4W2ZmVrQ+CswMoHZU1wTg+kL7+Hxk2C6kwfx78m60FZIOyOMrx9f1qcU6Brgtj9PcDIyVNCQP7o/NbWZm1kPK3nBsrUi6CjgQ2FrSItKRXWcD0yVNBJ4FPgEQEY9Kmg48Rrqp2ckRsSqHOol0RNpA4KY8AVwIXC5pPmnLZXyOtUzSGcDsvN7pEVF/sIGZmVWo0gITEcd2suiQTtY/EzizQfscYK8G7a+TC1SDZRcBF5VO1szMulU7DfKbmVkf4gJjZmaVcIExM7NKuMCYmVklXGDMzKwSLjBmZlYJFxgzM6uEC4yZmVXCBcbMzCpR6Zn81j06Jt/YdPmCsw/roUzMzMrzFoyZmVXCBcbMzCrhAmNmZpVwgTEzs0q4wJiZWSVcYMzMrBIuMGZmVgkXGDMzq4QLjJmZVcIFxszMKuECY2ZmlXCBMTOzSrjAmJlZJVxgzMysEi4wZmZWCRcYMzOrhAuMmZ
lVwgXGzMwq4QJjZmaVcIExM7NKuMCYmVklXGDMzKwSLjBmZlaJPl1gJI2T9ISk+ZImr+98zMw2JAPWdwJVkdQf+B7w58AiYLakGRHx2PrNrOd1TL6x5ToLzj5snWK06m9mG54+W2CAMcD8iHgKQNLVwFHABldg2kV3FCkXOrPeQxGxvnOohKRjgHER8bn8/K+A/SPii3XrTQIm5ad/BDzRJOzWwAvrmFpfidEOObRLjHbIoTtitEMO7RKjHXJolxhl+u8cEcPrG/vyFowatK1RTSNiGjCtVEBpTkSMXqek+kiMdsihXWK0Qw7dEaMdcmiXGO2QQ7vEWJf+fXmQfxGwY+H5SGDxesrFzGyD05cLzGxglKRdJG0MjAdmrOeczMw2GH12F1lErJT0ReBmoD9wUUQ8uo5hS+1K20BitEMO7RKjHXLojhjtkEO7xGiHHNolxlr377OD/GZmtn715V1kZma2HrnAmJlZJVxgzMysEi4wXSDpA5K+14X1d5f0gQbtfyZpt+7NzsysvfTZo8i6i6R9gU8BnwSeBn7ahe7fAb7SoP21vOyItcxpa+DF6OEjNCSNAHYgnbC6OCKe6+kY7ZBDX4rRDjlY3+UC04Ckd5HOmzkWeBG4hnTE3UFdDNUREQ/VN0bEHEkdJXM5ADgbWAacAVxOunRDP0nHR8Qvyiaztj8Eucj+BzAY+E1uHinpJeALEXFf1THaIYe+FKMdcijEeTfpOoFvfzeBGRExt0z/donRDjm0UwwAIsJT3QS8BdwB7F5oe2ot4sxfm2V1680BxgKfAJYDB+T2dwP3l4yxL3AXMBe4NU+P57b3lej/AOk6bvXtBwAPlsxhnWK0Qw59KUY75JDX/XKOMxk4Lk+Ta229JUY75NBOMd6O1ZWVN5QJ+Bhpq2Uh8APgEODptYhzFXBig/aJwDUlYzxQmJ9bt6xsgVnXH6N5TZaVLZTrFKMdcuhLMdohh7ze/wIbNWjfuFn8dovRDjm0U4za5F1kDUTEdcB1kjYHjgb+Dhgh6QLguoi4pWSov81xPg3cm9tGk/6hPlYyxluF+dfqUy0ZY/OIuLu+MSLuyu+xlZsk3QhcRiq6kK7zdjxQdhfdusZohxz6Uox2yAHS93t74Jm69u1Y/bvf7rljnOkAAAbFSURBVDHaIYd2igH4TP7SJA0l7ab6y4g4uIt9DwL2yk8fjYjbutB3FfAq6erQA4Hf1xYBm0bERiVinA/sRuMfgqej7hYGncT4CO/skxXpYqIzIuI/u/Be1ilGO+TQl2K0SQ7jgO8C83jnu7kTsDvwxSgxxtgOMdohh3aK8XYsF5gNQ3f8mJhVQVI/0g0Ci9/N2RGxqjfFaIcc2ikGuMDYOpI0KdI9ddZbjHbIoS/FaIccrG/wiZYbuHxHz3UK0R1p9IEc+lKMdsgBSTf0hRjtkMP6iuEtmA2cpL+OiO+XWK+7jq3fAbg7In5XaB9Xct/wGCAiYrakPYBxwOPrsptP0mURcfw69P8gaVfCI2UP/pC0P+mIwFckDSQdAvo+4DHgrIh4uUSMU0gHnCxstW4n/Wv3SFocEbdK+hTwp6RD2adFxJsl4+xGOmBlR2Alab/9VWXeQ4nY20XEkt4eox1yWF8xXGA2cJJOiIiLW6zzZdJJp1eT9sVCukPoeODqiDi7xOucApxM+gHbFzg1Iq7Py+6LiPe16D8F+Ajp5OCZwP7A7cCHgZsj4swSOdTfcE7AQcBtABFxZIkY90TEmDx/Yn5P15HOVfp5yc/iUWCfSPcsmkY6cONa0uHw+0TEx0vEeJl08MeTpMPhfxwRS1v1K/T/Eemz3Ax4CdiCdJWKQwAi4jMlYpxCuhrFHcBHSYfDLycVnC9ExO1l87HGJG0TEc+v7zzWWleOafbU9ybg2RLrdMex9Q8DW+T5DtIJpKfm5y3P58n9+5N+EF8BBuX2gcBDJXO4D7gCOBD4UH5ckuc/VDLG/YX52cDwPL858HDJGHOLOdUte6BsHq
Rd3GOBC4GlpEODJwBbluj/UH4cADwH9M/P1YXP8+FCv82A2/P8TmX+TfO6g0lXqnicdNWMF0l/hJwNbNUN3++bSq43CPgW6UoZn6pbNrVE/22BC4DvAcOAr+fPZzqwXckchtZNw4AFwBBgaMkY4+o+2wuBh4ArgRElY2wBnA48Crycv1t3AZ/p6ufvMZgNgKSHOpkeBkaUCFE7Lr5eV46L7x95t1hELCD9uH9E0rmU21+/MiJWRcTvgScj4pUc67Uu5DCadD7SV4GXI/2F/VpE3BERd5SM0U/SEEnDSHsAluY8XiXtIirjEUkn5PkHJY0GapcoKrVrKr1kvBURt0TERNK/z1TSbsOnSr6PjYEtScVhcG7fBGh56HtB7Vy6TXIsIuLZLsSYTtrqOTAihkXEMNJW5XLgx2UCSHpfJ9N+pK3lMi4mfQ9/AoyX9BNJm+RlB5TofwlpF+dC4Jekc9YOA/6LdDmdMl4gfT9r0xzSLuX78nwZZxXmzyH9AXUE6Y+hlrvCsx+RvkOHAt8Azgf+CjhI0lnNOq5hXf9C8NT+E+kv1H2BneumDtI++Fb9xwHzgZtIt0+dRvpreT6Fv5haxLgN2LeubQDp3JxVJfrfDWyW5/sV2gdTtxVQItZI0o/XdymxBVfXd0H+z/d0ftw2t29B+a2PwaQfpCfz+3ozx7qDtIusTIxOtxCAgSX6/11+zWeAU4BZpKtWPAxMKZnDqaS/jqeRtkBOyO3DgV+VjPHE2iyrW29V/n79ssH0WskYD9Q9/yrwa9JWRMvvF6tv2T7bLHaTGP+Q/1+9p9D2dBe/n/d19rpdyOPBuuez82M/0phn+Xy6srKn3jmRNpM/2MmyK0vG6Ef6S+4vgGPyfP8u5DCy9mPcYNkHSvTfpJP2rYv/Ibv4uRxGGlTvjs94M2CXLvbZEtgH2I+Suy8Kfd/VDTlvD2yf57fK/65juhhjz9zv3WuZwy3APxXfP2mr+svArSVjPAKM6mTZwpIx5lL4wyW3TSDtJnqmRP8HC/PfrFtWatdpXrf2x8+5+fvRpWsgksZIvwT8PekPCBWWld31+T+13wvS1s/NhWWlin5t8iC/ma03koaQjqI7CtgmNz8HzADOjojlJWIcQ/oRf6LBsqMj4mclYvwLcEtE3FrXPg74fxExqkX/04F/icLRkbl99/w+jmmVQ12/I0hbUR0RsW0X+k2pa5oaEUslbZvza3nEpKS9gR8C7yIV789GxP9KGg4cGxHnl87HBcbM2lGZIxx7Q4y17Z8PYd8tIh5ph/exNjFcYMysLUl6NiJ26u0x2iGH9RXDV1M2s/VG0ho35KstotwRjm0Rox1yaKcYNS4wZrY+jSAdDls/1iLSYHNvidEOObRTDMAFxszWrxtIJ+A+UL9A0u29KEY75NBOMdL6HoMxM7Mq+Ex+MzOrhAuMmZlVwgXGzMwq4QJjZmaVcIExM7NK/H98D8ciLq5XEgAAAABJRU5ErkJggg==\n",
182 |       "text/plain": [
183 |        "<Figure size 432x288 with 1 Axes>"
184 |       ]
185 |      },
186 |      "metadata": {
187 |       "needs_background": "light"
188 |      },
189 |      "output_type": "display_data"
190 |     }
191 |    ],
192 |    "source": [
193 |     "# Creemos un gráfico de barras, con el número de prestamistas\n",
194 |     "# por cada uno de los valores de la variable combinada\n",
195 |     "\n",
196 |     "fig = data.open_il_24m.value_counts().plot.bar()\n",
197 |     "fig.set_title('Número de cuentas a término fijo')\n",
198 |     "fig.set_ylabel('Número de prestamistas')"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {
204 |     "collapsed": true
205 |    },
206 |    "source": [
207 |     "Este es un ejemplo de una variable combinada!"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {
213 |     "collapsed": true
214 |    },
215 |    "source": [
216 |     "**Esto es todo por este demo, esperamos lo hayan disfrutado y nos vemos en el siguiente.**"
217 |    ]
218 |   }
219 |  ],
220 |  "metadata": {
221 |   "kernelspec": {
222 |    "display_name": "feml",
223 |    "language": "python",
224 |    "name": "feml"
225 |   },
226 |   "language_info": {
227 |    "codemirror_mode": {
228 |     "name": "ipython",
229 |     "version": 3
230 |    },
231 |    "file_extension": ".py",
232 |    "mimetype": "text/x-python",
233 |    "name": "python",
234 |    "nbconvert_exporter": "python",
235 |    "pygments_lexer": "ipython3",
236 |    "version": "3.8.2"
237 |   },
238 |   "toc": {
239 |    "base_numbering": 1,
240 |    "nav_menu": {},
241 |    "number_sections": true,
242 |    "sideBar": true,
243 |    "skip_h1_title": false,
244 |    "title_cell": "Table of Contents",
245 |    "title_sidebar": "Contents",
246 |    "toc_cell": false,
247 |    "toc_position": {
248 |     "height": "550px",
249 |     "left": "0px",
250 |     "right": "869.4px",
251 |     "top": "107px",
252 |     "width": "151px"
253 |    },
254 |    "toc_section_display": "block",
255 |    "toc_window_display": true
256 |   }
257 |  },
258 |  "nbformat": 4,
259 |  "nbformat_minor": 1
260 | }
261 | 


--------------------------------------------------------------------------------
/Section-06-Codificacion-Variables-Categoricas/06.04_Codificacion-frecuencia.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Codificación por número de observaciones o frecuencia \n",
  8 |     "\n",
  9 |     "\n",
 10 |     "En la primera codificación reemplazamos las categorías por el número de observaciones por categoría en los datos. Similarmente, podemos reemplazar la categoría por la frecuencia -o porcentaje- de observaciones en los datos. Eso es, si 10 de nuestras 100 observaciones muestran el color azul, entonces reemplazamos el color azul por 10 o por 0.1 si reemplazamos por la frecuencia. Estas técnicas capturan la representación de cada etiqueta en los datos, pero la codificación puede que no necesariamente tenga poder predictivo en el target. Sin embargo, estos métodos son bastante populares en las competiciones de Kaggle.\n",
 11 |     "\n",
 12 |     "El supuesto de esta técnica es que el número de observaciones presentes en cada una de las categorías de una variable es de alguna forma representativo del poder predictivo de dicha etiqueta.\n",
 13 |     "\n",
 14 |     "### Ventajas\n",
 15 |     "\n",
 16 |     "- Simple\n",
 17 |     "- No extiende el espacio de los datos (número de variables)\n",
 18 |     "\n",
 19 |     "### Desventajas\n",
 20 |     "\n",
 21 |     "- Si dos categorías aparecen el mismo número de veces u observaciones en los datos, serán reemplazadas por el mismo número; la consecuencia es que puede que perdamos información importante.\n",
 22 |     "\n",
 23 |     "Por ejemplo, si hay 10 observaciones para la categoría azul y 10 observaciones para la categoría roja, ambas serán reemplazadas por 10, y por lo tanto, luego de codificarlas, parecerán ser la misma cosa\n",
 24 |     "\n",
 25 |     "\n",
 26 |     "Sigue esta conversación [en Kaggle](https://www.kaggle.com/general/16927) para más información.\n",
 27 |     "\n",
 28 |     "\n",
 29 |     "\n",
 30 |     "## En este demo:\n",
 31 |     "\n",
 32 |     "Vamos a realizar la codificación por número de observaciones o frecuencia con:\n",
 33 |     "- pandas\n",
 34 |     "- Feature-Engine\n",
 35 |     "\n",
 36 |     "y veremos las ventajas y limitaciones de cada una de estas implementaciones usando el dataset House Prices.\n",
 37 |     "\n"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 1,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "import numpy as np\n",
 47 |     "import pandas as pd\n",
 48 |     "\n",
 49 |     "# separar datos\n",
 50 |     "from sklearn.model_selection import train_test_split\n",
 51 |     "\n",
 52 |     "# codificar con feature-engine\n",
 53 |     "from feature_engine.encoding import CountFrequencyEncoder"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 2,
 59 |    "metadata": {},
 60 |    "outputs": [
 61 |     {
 62 |      "data": {
 63 |       "text/html": [
 64 |        "<div>\n",
 65 |        "<style scoped>\n",
 66 |        "    .dataframe tbody tr th:only-of-type {\n",
 67 |        "        vertical-align: middle;\n",
 68 |        "    }\n",
 69 |        "\n",
 70 |        "    .dataframe tbody tr th {\n",
 71 |        "        vertical-align: top;\n",
 72 |        "    }\n",
 73 |        "\n",
 74 |        "    .dataframe thead th {\n",
 75 |        "        text-align: right;\n",
 76 |        "    }\n",
 77 |        "</style>\n",
 78 |        "<table border=\"1\" class=\"dataframe\">\n",
 79 |        "  <thead>\n",
 80 |        "    <tr style=\"text-align: right;\">\n",
 81 |        "      <th></th>\n",
 82 |        "      <th>Neighborhood</th>\n",
 83 |        "      <th>Exterior1st</th>\n",
 84 |        "      <th>Exterior2nd</th>\n",
 85 |        "      <th>SalePrice</th>\n",
 86 |        "    </tr>\n",
 87 |        "  </thead>\n",
 88 |        "  <tbody>\n",
 89 |        "    <tr>\n",
 90 |        "      <th>0</th>\n",
 91 |        "      <td>CollgCr</td>\n",
 92 |        "      <td>VinylSd</td>\n",
 93 |        "      <td>VinylSd</td>\n",
 94 |        "      <td>208500</td>\n",
 95 |        "    </tr>\n",
 96 |        "    <tr>\n",
 97 |        "      <th>1</th>\n",
 98 |        "      <td>Veenker</td>\n",
 99 |        "      <td>MetalSd</td>\n",
100 |        "      <td>MetalSd</td>\n",
101 |        "      <td>181500</td>\n",
102 |        "    </tr>\n",
103 |        "    <tr>\n",
104 |        "      <th>2</th>\n",
105 |        "      <td>CollgCr</td>\n",
106 |        "      <td>VinylSd</td>\n",
107 |        "      <td>VinylSd</td>\n",
108 |        "      <td>223500</td>\n",
109 |        "    </tr>\n",
110 |        "    <tr>\n",
111 |        "      <th>3</th>\n",
112 |        "      <td>Crawfor</td>\n",
113 |        "      <td>Wd Sdng</td>\n",
114 |        "      <td>Wd Shng</td>\n",
115 |        "      <td>140000</td>\n",
116 |        "    </tr>\n",
117 |        "    <tr>\n",
118 |        "      <th>4</th>\n",
119 |        "      <td>NoRidge</td>\n",
120 |        "      <td>VinylSd</td>\n",
121 |        "      <td>VinylSd</td>\n",
122 |        "      <td>250000</td>\n",
123 |        "    </tr>\n",
124 |        "  </tbody>\n",
125 |        "</table>\n",
126 |        "</div>"
127 |       ],
128 |       "text/plain": [
129 |        "  Neighborhood Exterior1st Exterior2nd  SalePrice\n",
130 |        "0      CollgCr     VinylSd     VinylSd     208500\n",
131 |        "1      Veenker     MetalSd     MetalSd     181500\n",
132 |        "2      CollgCr     VinylSd     VinylSd     223500\n",
133 |        "3      Crawfor     Wd Sdng     Wd Shng     140000\n",
134 |        "4      NoRidge     VinylSd     VinylSd     250000"
135 |       ]
136 |      },
137 |      "execution_count": 2,
138 |      "metadata": {},
139 |      "output_type": "execute_result"
140 |     }
141 |    ],
142 |    "source": [
143 |     "# cargar dataset\n",
144 |     "\n",
145 |     "data = pd.read_csv(\n",
146 |     "    '../houseprice.csv',\n",
147 |     "    usecols=['Neighborhood', 'Exterior1st', 'Exterior2nd', 'SalePrice'])\n",
148 |     "\n",
149 |     "data.head()"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 3,
155 |    "metadata": {},
156 |    "outputs": [
157 |     {
158 |      "name": "stdout",
159 |      "output_type": "stream",
160 |      "text": [
161 |       "Neighborhood :  25  etiquetas\n",
162 |       "Exterior1st :  15  etiquetas\n",
163 |       "Exterior2nd :  16  etiquetas\n",
164 |       "SalePrice :  663  etiquetas\n"
165 |      ]
166 |     }
167 |    ],
168 |    "source": [
169 |     "# miremos cuantas etiquetas tiene cada variable\n",
170 |     "\n",
171 |     "for col in data.columns:\n",
172 |     "    print(col, ': ', len(data[col].unique()), ' etiquetas')"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "markdown",
177 |    "metadata": {},
178 |    "source": [
179 |     "### Importante sobre codificación\n",
180 |     "\n",
181 |     "Cuando hacemos el conteo de observaciones para transformar las variables categóricas, es importante calcular el número (o frecuencia = número de observaciones / observaciones totales) usando el set de entrenamiento, y luego usar estos números para codificar las variables en el set de prueba.\n",
182 |     "\n"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 4,
188 |    "metadata": {},
189 |    "outputs": [
190 |     {
191 |      "data": {
192 |       "text/plain": [
193 |        "((1022, 3), (438, 3))"
194 |       ]
195 |      },
196 |      "execution_count": 4,
197 |      "metadata": {},
198 |      "output_type": "execute_result"
199 |     }
200 |    ],
201 |    "source": [
202 |     "# separemos en sets de prueba y entrenamiento\n",
203 |     "\n",
204 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
205 |     "    data[['Neighborhood', 'Exterior1st', 'Exterior2nd']],  # predictores\n",
206 |     "    data['SalePrice'],  # target\n",
207 |     "    test_size=0.3,  # porcentaje observaciones prueba\n",
208 |     "    random_state=0)  # semilla para asegurar reproducibilidad\n",
209 |     "\n",
210 |     "X_train.shape, X_test.shape"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     "## Codificación por número de observaciones o frecuencia con pandas"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": 5,
223 |    "metadata": {
224 |     "scrolled": true
225 |    },
226 |    "outputs": [
227 |     {
228 |      "data": {
229 |       "text/plain": [
230 |        "{'NAmes': 151,\n",
231 |        " 'CollgCr': 105,\n",
232 |        " 'OldTown': 73,\n",
233 |        " 'Edwards': 71,\n",
234 |        " 'Sawyer': 61,\n",
235 |        " 'Somerst': 56,\n",
236 |        " 'Gilbert': 55,\n",
237 |        " 'NridgHt': 51,\n",
238 |        " 'NWAmes': 51,\n",
239 |        " 'SawyerW': 45,\n",
240 |        " 'BrkSide': 41,\n",
241 |        " 'Mitchel': 36,\n",
242 |        " 'Crawfor': 35,\n",
243 |        " 'Timber': 30,\n",
244 |        " 'NoRidge': 30,\n",
245 |        " 'IDOTRR': 24,\n",
246 |        " 'ClearCr': 24,\n",
247 |        " 'SWISU': 18,\n",
248 |        " 'StoneBr': 16,\n",
249 |        " 'MeadowV': 12,\n",
250 |        " 'Blmngtn': 12,\n",
251 |        " 'BrDale': 10,\n",
252 |        " 'NPkVill': 7,\n",
253 |        " 'Veenker': 6,\n",
254 |        " 'Blueste': 2}"
255 |       ]
256 |      },
257 |      "execution_count": 5,
258 |      "metadata": {},
259 |      "output_type": "execute_result"
260 |     }
261 |    ],
262 |    "source": [
263 |     "# calculemos para cada una de las etiquetas el número de observaciones\n",
264 |     "# para la variable Neighborhood\n",
265 |     "\n",
266 |     "count_map = X_train['Neighborhood'].value_counts().to_dict()\n",
267 |     "\n",
268 |     "count_map"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "markdown",
273 |    "metadata": {},
274 |    "source": [
275 |     "El diccionario contiene el número de observaciones por cada categoría de la variable Neighborhood."
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": 6,
281 |    "metadata": {},
282 |    "outputs": [],
283 |    "source": [
284 |     "# reemplacemos las etiquetas con el conteo que hicimos\n",
285 |     "\n",
286 |     "X_train['Neighborhood'] = X_train['Neighborhood'].map(count_map)\n",
287 |     "X_test['Neighborhood'] = X_test['Neighborhood'].map(count_map)"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": 7,
293 |    "metadata": {},
294 |    "outputs": [
295 |     {
296 |      "data": {
297 |       "text/plain": [
298 |        "64      105\n",
299 |        "682      24\n",
300 |        "960      41\n",
301 |        "1384     71\n",
302 |        "1100     18\n",
303 |        "416      61\n",
304 |        "1034     35\n",
305 |        "853     151\n",
306 |        "472      71\n",
307 |        "1011     71\n",
308 |        "Name: Neighborhood, dtype: int64"
309 |       ]
310 |      },
311 |      "execution_count": 7,
312 |      "metadata": {},
313 |      "output_type": "execute_result"
314 |     }
315 |    ],
316 |    "source": [
317 |     "# exploremos los resultados\n",
318 |     "\n",
319 |     "X_train['Neighborhood'].head(10)"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": 8,
325 |    "metadata": {},
326 |    "outputs": [
327 |     {
328 |      "data": {
329 |       "text/plain": [
330 |        "{'VinylSd': 0.3561643835616438,\n",
331 |        " 'HdBoard': 0.149706457925636,\n",
332 |        " 'Wd Sdng': 0.14481409001956946,\n",
333 |        " 'MetalSd': 0.1350293542074364,\n",
334 |        " 'Plywood': 0.08414872798434442,\n",
335 |        " 'CemntBd': 0.03816046966731898,\n",
336 |        " 'BrkFace': 0.03424657534246575,\n",
337 |        " 'WdShing': 0.02054794520547945,\n",
338 |        " 'Stucco': 0.016634050880626222,\n",
339 |        " 'AsbShng': 0.014677103718199608,\n",
340 |        " 'Stone': 0.0019569471624266144,\n",
341 |        " 'ImStucc': 0.0009784735812133072,\n",
342 |        " 'AsphShn': 0.0009784735812133072,\n",
343 |        " 'BrkComm': 0.0009784735812133072,\n",
344 |        " 'CBlock': 0.0009784735812133072}"
345 |       ]
346 |      },
347 |      "execution_count": 8,
348 |      "metadata": {},
349 |      "output_type": "execute_result"
350 |     }
351 |    ],
352 |    "source": [
353 |     "# si en lugar del número de observaciones queremos reemplazar por la frecuencia\n",
354 |     "# solo necesitamos dividir el conteo por el número total de observaciones\n",
355 |     "\n",
356 |     "frequency_map = (X_train['Exterior1st'].value_counts() / len(X_train) ).to_dict()\n",
357 |     "frequency_map"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 9,
363 |    "metadata": {},
364 |    "outputs": [],
365 |    "source": [
366 |     "# reemplacemos las categorías por las frecuencias\n",
367 |     "\n",
368 |     "X_train['Exterior1st'] = X_train['Exterior1st'].map(frequency_map)\n",
369 |     "X_test['Exterior1st'] = X_test['Exterior1st'].map(frequency_map)"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "markdown",
374 |    "metadata": {},
375 |    "source": [
376 |     "Podemos agrupar estos comandos en dos funciones como hicimos en los notebooks anteriores y repetir el proceso (con un ciclo) para cada una de las variables categóricas. Si no sabes cómo hacer eso, revisa los notebooks anteriores.\n",
377 |     "\n",
378 |     "## Codificación por número de observaciones o frecuencia con Feature-Engine"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": 10,
384 |    "metadata": {},
385 |    "outputs": [
386 |     {
387 |      "data": {
388 |       "text/plain": [
389 |        "((1022, 3), (438, 3))"
390 |       ]
391 |      },
392 |      "execution_count": 10,
393 |      "metadata": {},
394 |      "output_type": "execute_result"
395 |     }
396 |    ],
397 |    "source": [
398 |     "# separemos en sets de prueba y entrenamiento\n",
399 |     "\n",
400 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
401 |     "    data[['Neighborhood', 'Exterior1st', 'Exterior2nd']],  # variables\n",
402 |     "    data['SalePrice'],  # target\n",
403 |     "    test_size=0.3,  # porcentaje observaciones prueba\n",
404 |     "    random_state=0)  # semilla para asegurar reproducibilidad\n",
405 |     "\n",
406 |     "X_train.shape, X_test.shape"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 11,
412 |    "metadata": {},
413 |    "outputs": [
414 |     {
415 |      "data": {
416 |       "text/plain": [
417 |        "CountFrequencyEncoder(variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'])"
418 |       ]
419 |      },
420 |      "execution_count": 11,
421 |      "metadata": {},
422 |      "output_type": "execute_result"
423 |     }
424 |    ],
425 |    "source": [
426 |     "count_enc = CountFrequencyEncoder(\n",
427 |     "    encoding_method='count', # para codificar por frecuencia ==> encoding_method='frequency'\n",
428 |     "    variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'])\n",
429 |     "\n",
430 |     "count_enc.fit(X_train)"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": 12,
436 |    "metadata": {
437 |     "scrolled": true
438 |    },
439 |    "outputs": [
440 |     {
441 |      "data": {
442 |       "text/plain": [
443 |        "{'Neighborhood': {'NAmes': 151,\n",
444 |        "  'CollgCr': 105,\n",
445 |        "  'OldTown': 73,\n",
446 |        "  'Edwards': 71,\n",
447 |        "  'Sawyer': 61,\n",
448 |        "  'Somerst': 56,\n",
449 |        "  'Gilbert': 55,\n",
450 |        "  'NridgHt': 51,\n",
451 |        "  'NWAmes': 51,\n",
452 |        "  'SawyerW': 45,\n",
453 |        "  'BrkSide': 41,\n",
454 |        "  'Mitchel': 36,\n",
455 |        "  'Crawfor': 35,\n",
456 |        "  'Timber': 30,\n",
457 |        "  'NoRidge': 30,\n",
458 |        "  'IDOTRR': 24,\n",
459 |        "  'ClearCr': 24,\n",
460 |        "  'SWISU': 18,\n",
461 |        "  'StoneBr': 16,\n",
462 |        "  'MeadowV': 12,\n",
463 |        "  'Blmngtn': 12,\n",
464 |        "  'BrDale': 10,\n",
465 |        "  'NPkVill': 7,\n",
466 |        "  'Veenker': 6,\n",
467 |        "  'Blueste': 2},\n",
468 |        " 'Exterior1st': {'VinylSd': 364,\n",
469 |        "  'HdBoard': 153,\n",
470 |        "  'Wd Sdng': 148,\n",
471 |        "  'MetalSd': 138,\n",
472 |        "  'Plywood': 86,\n",
473 |        "  'CemntBd': 39,\n",
474 |        "  'BrkFace': 35,\n",
475 |        "  'WdShing': 21,\n",
476 |        "  'Stucco': 17,\n",
477 |        "  'AsbShng': 15,\n",
478 |        "  'Stone': 2,\n",
479 |        "  'ImStucc': 1,\n",
480 |        "  'AsphShn': 1,\n",
481 |        "  'BrkComm': 1,\n",
482 |        "  'CBlock': 1},\n",
483 |        " 'Exterior2nd': {'VinylSd': 353,\n",
484 |        "  'Wd Sdng': 142,\n",
485 |        "  'HdBoard': 141,\n",
486 |        "  'MetalSd': 136,\n",
487 |        "  'Plywood': 112,\n",
488 |        "  'CmentBd': 39,\n",
489 |        "  'Wd Shng': 29,\n",
490 |        "  'BrkFace': 18,\n",
491 |        "  'AsbShng': 17,\n",
492 |        "  'Stucco': 16,\n",
493 |        "  'ImStucc': 8,\n",
494 |        "  'Brk Cmn': 4,\n",
495 |        "  'Stone': 4,\n",
496 |        "  'AsphShn': 1,\n",
497 |        "  'Other': 1,\n",
498 |        "  'CBlock': 1}}"
499 |       ]
500 |      },
501 |      "execution_count": 12,
502 |      "metadata": {},
503 |      "output_type": "execute_result"
504 |     }
505 |    ],
506 |    "source": [
507 |     "# en el atributo encoder_dict_ del codificador\n",
508 |     "# podemos ver el número de observaciones por categoría de cada variable\n",
509 |     "\n",
510 |     "count_enc.encoder_dict_"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": 13,
516 |    "metadata": {},
517 |    "outputs": [
518 |     {
519 |      "data": {
520 |       "text/html": [
521 |        "<div>\n",
522 |        "<style scoped>\n",
523 |        "    .dataframe tbody tr th:only-of-type {\n",
524 |        "        vertical-align: middle;\n",
525 |        "    }\n",
526 |        "\n",
527 |        "    .dataframe tbody tr th {\n",
528 |        "        vertical-align: top;\n",
529 |        "    }\n",
530 |        "\n",
531 |        "    .dataframe thead th {\n",
532 |        "        text-align: right;\n",
533 |        "    }\n",
534 |        "</style>\n",
535 |        "<table border=\"1\" class=\"dataframe\">\n",
536 |        "  <thead>\n",
537 |        "    <tr style=\"text-align: right;\">\n",
538 |        "      <th></th>\n",
539 |        "      <th>Neighborhood</th>\n",
540 |        "      <th>Exterior1st</th>\n",
541 |        "      <th>Exterior2nd</th>\n",
542 |        "    </tr>\n",
543 |        "  </thead>\n",
544 |        "  <tbody>\n",
545 |        "    <tr>\n",
546 |        "      <th>64</th>\n",
547 |        "      <td>105</td>\n",
548 |        "      <td>364</td>\n",
549 |        "      <td>353</td>\n",
550 |        "    </tr>\n",
551 |        "    <tr>\n",
552 |        "      <th>682</th>\n",
553 |        "      <td>24</td>\n",
554 |        "      <td>148</td>\n",
555 |        "      <td>142</td>\n",
556 |        "    </tr>\n",
557 |        "    <tr>\n",
558 |        "      <th>960</th>\n",
559 |        "      <td>41</td>\n",
560 |        "      <td>148</td>\n",
561 |        "      <td>112</td>\n",
562 |        "    </tr>\n",
563 |        "    <tr>\n",
564 |        "      <th>1384</th>\n",
565 |        "      <td>71</td>\n",
566 |        "      <td>21</td>\n",
567 |        "      <td>29</td>\n",
568 |        "    </tr>\n",
569 |        "    <tr>\n",
570 |        "      <th>1100</th>\n",
571 |        "      <td>18</td>\n",
572 |        "      <td>148</td>\n",
573 |        "      <td>142</td>\n",
574 |        "    </tr>\n",
575 |        "  </tbody>\n",
576 |        "</table>\n",
577 |        "</div>"
578 |       ],
579 |       "text/plain": [
580 |        "      Neighborhood  Exterior1st  Exterior2nd\n",
581 |        "64             105          364          353\n",
582 |        "682             24          148          142\n",
583 |        "960             41          148          112\n",
584 |        "1384            71           21           29\n",
585 |        "1100            18          148          142"
586 |       ]
587 |      },
588 |      "execution_count": 13,
589 |      "metadata": {},
590 |      "output_type": "execute_result"
591 |     }
592 |    ],
593 |    "source": [
594 |     "X_train = count_enc.transform(X_train)\n",
595 |     "X_test = count_enc.transform(X_test)\n",
596 |     "\n",
597 |     "# exploremos el resultado\n",
598 |     "X_train.head()"
599 |    ]
600 |   },
601 |   {
602 |    "cell_type": "markdown",
603 |    "metadata": {},
604 |    "source": [
605 |     "**Nota**\n",
606 |     "\n",
607 |     "Si el argumento 'variables' se fija en 'None' (ninguno), entonces el codificador automáticamente identificará **todas las variables categóricas**. ¿Maravilloso, verdad?\n",
608 |     "\n",
609 |     "El codificador no codificará las variables numéricas. Entonces si algunas de tus variables categóricas son de hecho numéricas, necesitas hacer el 're-cast' o cambio a tipo 'object' antes de usar el codificador.\n",
610 |     "\n",
611 |     "Si hay una categoría en el set de prueba para la cual el codificador no tiene un número para asignar (la categoría no estaba presente en el set de entrenamiento), el codificador devolverá un error.\n",
612 |     "\n"
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "code",
617 |    "execution_count": null,
618 |    "metadata": {},
619 |    "outputs": [],
620 |    "source": []
621 |   }
622 |  ],
623 |  "metadata": {
624 |   "kernelspec": {
625 |    "display_name": "feml",
626 |    "language": "python",
627 |    "name": "feml"
628 |   },
629 |   "language_info": {
630 |    "codemirror_mode": {
631 |     "name": "ipython",
632 |     "version": 3
633 |    },
634 |    "file_extension": ".py",
635 |    "mimetype": "text/x-python",
636 |    "name": "python",
637 |    "nbconvert_exporter": "python",
638 |    "pygments_lexer": "ipython3",
639 |    "version": "3.8.2"
640 |   },
641 |   "toc": {
642 |    "base_numbering": 1,
643 |    "nav_menu": {},
644 |    "number_sections": true,
645 |    "sideBar": true,
646 |    "skip_h1_title": false,
647 |    "title_cell": "Table of Contents",
648 |    "title_sidebar": "Contents",
649 |    "toc_cell": false,
650 |    "toc_position": {},
651 |    "toc_section_display": "block",
652 |    "toc_window_display": true
653 |   }
654 |  },
655 |  "nbformat": 4,
656 |  "nbformat_minor": 2
657 | }
658 | 


--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.15_SustitucionMediaMediana_FeatureEngine.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Sustitución por la Media / Mediana  ==> Feature-engine\n",
  8 |     "\n",
  9 |     "\n",
 10 |     "### Qué es Feature-engine?\n",
 11 |     "\n",
 12 |     "Feature-engine es una librería de Python que hemos creado para este curso. \n",
 13 |     "\n",
 14 |     "- Feature-engine incluye todas las técnicas de ingeniería de variables descritas en este curso\n",
 15 |     "- Feature-engine funciona como Scikit-learn, por lo tanto es fácil de aprender\n",
 16 |     "- Feature-engine te permite implementar pasos de ingeniería de variables específicos para diferentes grupos de variables\n",
 17 |     "- Feature-engine puede ser integrado con las pipelines de Scikit-learn, permitiendo construir modelos fácilmente\n",
 18 |     "**Feature-engine te permite diseñar y guardar un flujo de ingeniería de variables con procesos diseñados específicamente para diferentes grupos de variables.**\n",
 19 |     "\n",
 20 |     "-------------------------------------------------------------------\n",
 21 |     "Feature-engine puede ser instalado vía pip ==> pip install feature-engine\n",
 22 |     "\n",
 23 |     "- Asegúrate de que has instalado Feature-engine antes de correr este notebook\n",
 24 |     "\n",
 25 |     "Para más detalle visita el [website de trainindata]( https://www.trainindata.com/feature-engine) \n",
 26 |     "\n",
 27 |     "\n",
 28 |     "## En este demo:\n",
 29 |     "\n",
 30 |     "Vamos a usar **Feature-engine para hacer la sustitución por la media o la mediana** usando los datos Ames House Price.\n",
 31 |     "\n",
 32 |     "- Para bajar los datos, por favor referirse a la clase de **Datasets** en la  **Sección 1** del curso.\n",
 33 |     "\n",
 34 |     "### Nota: \n",
 35 |     "* 'Imputer' deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase."
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 1,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "import pandas as pd\n",
 45 |     "import numpy as np\n",
 46 |     "\n",
 47 |     "import matplotlib.pyplot as plt\n",
 48 |     "\n",
 49 |     "from sklearn.model_selection import train_test_split\n",
 50 |     "from sklearn.pipeline import Pipeline\n",
 51 |     "\n",
 52 |     "# feature-engine\n",
 53 |     "from feature_engine import imputation as mdi"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 2,
 59 |    "metadata": {},
 60 |    "outputs": [
 61 |     {
 62 |      "data": {
 63 |       "text/html": [
 64 |        "<div>\n",
 65 |        "<style scoped>\n",
 66 |        "    .dataframe tbody tr th:only-of-type {\n",
 67 |        "        vertical-align: middle;\n",
 68 |        "    }\n",
 69 |        "\n",
 70 |        "    .dataframe tbody tr th {\n",
 71 |        "        vertical-align: top;\n",
 72 |        "    }\n",
 73 |        "\n",
 74 |        "    .dataframe thead th {\n",
 75 |        "        text-align: right;\n",
 76 |        "    }\n",
 77 |        "</style>\n",
 78 |        "<table border=\"1\" class=\"dataframe\">\n",
 79 |        "  <thead>\n",
 80 |        "    <tr style=\"text-align: right;\">\n",
 81 |        "      <th></th>\n",
 82 |        "      <th>LotFrontage</th>\n",
 83 |        "      <th>MasVnrArea</th>\n",
 84 |        "      <th>BsmtQual</th>\n",
 85 |        "      <th>FireplaceQu</th>\n",
 86 |        "      <th>GarageYrBlt</th>\n",
 87 |        "      <th>SalePrice</th>\n",
 88 |        "    </tr>\n",
 89 |        "  </thead>\n",
 90 |        "  <tbody>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>0</th>\n",
 93 |        "      <td>65.0</td>\n",
 94 |        "      <td>196.0</td>\n",
 95 |        "      <td>Gd</td>\n",
 96 |        "      <td>NaN</td>\n",
 97 |        "      <td>2003.0</td>\n",
 98 |        "      <td>208500</td>\n",
 99 |        "    </tr>\n",
100 |        "    <tr>\n",
101 |        "      <th>1</th>\n",
102 |        "      <td>80.0</td>\n",
103 |        "      <td>0.0</td>\n",
104 |        "      <td>Gd</td>\n",
105 |        "      <td>TA</td>\n",
106 |        "      <td>1976.0</td>\n",
107 |        "      <td>181500</td>\n",
108 |        "    </tr>\n",
109 |        "    <tr>\n",
110 |        "      <th>2</th>\n",
111 |        "      <td>68.0</td>\n",
112 |        "      <td>162.0</td>\n",
113 |        "      <td>Gd</td>\n",
114 |        "      <td>TA</td>\n",
115 |        "      <td>2001.0</td>\n",
116 |        "      <td>223500</td>\n",
117 |        "    </tr>\n",
118 |        "    <tr>\n",
119 |        "      <th>3</th>\n",
120 |        "      <td>60.0</td>\n",
121 |        "      <td>0.0</td>\n",
122 |        "      <td>TA</td>\n",
123 |        "      <td>Gd</td>\n",
124 |        "      <td>1998.0</td>\n",
125 |        "      <td>140000</td>\n",
126 |        "    </tr>\n",
127 |        "    <tr>\n",
128 |        "      <th>4</th>\n",
129 |        "      <td>84.0</td>\n",
130 |        "      <td>350.0</td>\n",
131 |        "      <td>Gd</td>\n",
132 |        "      <td>TA</td>\n",
133 |        "      <td>2000.0</td>\n",
134 |        "      <td>250000</td>\n",
135 |        "    </tr>\n",
136 |        "  </tbody>\n",
137 |        "</table>\n",
138 |        "</div>"
139 |       ],
140 |       "text/plain": [
141 |        "   LotFrontage  MasVnrArea BsmtQual FireplaceQu  GarageYrBlt  SalePrice\n",
142 |        "0         65.0       196.0       Gd         NaN       2003.0     208500\n",
143 |        "1         80.0         0.0       Gd          TA       1976.0     181500\n",
144 |        "2         68.0       162.0       Gd          TA       2001.0     223500\n",
145 |        "3         60.0         0.0       TA          Gd       1998.0     140000\n",
146 |        "4         84.0       350.0       Gd          TA       2000.0     250000"
147 |       ]
148 |      },
149 |      "execution_count": 2,
150 |      "metadata": {},
151 |      "output_type": "execute_result"
152 |     }
153 |    ],
154 |    "source": [
155 |     "# carguemos los datos con las variables seleccionadas\n",
156 |     "\n",
157 |     "cols_to_use = [\n",
158 |     "    'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
159 |     "    'SalePrice'\n",
160 |     "]\n",
161 |     "\n",
162 |     "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
163 |     "data.head()"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 3,
169 |    "metadata": {},
170 |    "outputs": [
171 |     {
172 |      "data": {
173 |       "text/plain": [
174 |        "LotFrontage    0.177397\n",
175 |        "MasVnrArea     0.005479\n",
176 |        "BsmtQual       0.025342\n",
177 |        "FireplaceQu    0.472603\n",
178 |        "GarageYrBlt    0.055479\n",
179 |        "SalePrice      0.000000\n",
180 |        "dtype: float64"
181 |       ]
182 |      },
183 |      "execution_count": 3,
184 |      "metadata": {},
185 |      "output_type": "execute_result"
186 |     }
187 |    ],
188 |    "source": [
189 |     "# porcentaje de valores nulos\n",
190 |     "\n",
191 |     "data.isnull().mean()"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "markdown",
196 |    "metadata": {},
197 |    "source": [
198 |     "Todas las variables predictivas tienen datos ausentes\n"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": 4,
204 |    "metadata": {},
205 |    "outputs": [
206 |     {
207 |      "data": {
208 |       "text/plain": [
209 |        "((1022, 5), (438, 5))"
210 |       ]
211 |      },
212 |      "execution_count": 4,
213 |      "metadata": {},
214 |      "output_type": "execute_result"
215 |     }
216 |    ],
217 |    "source": [
218 |     "# separar datos en segmentos de entrenamiento y prueba\n",
219 |     "\n",
220 |     "# primero, separemos el target (SalePrice) del resto de las variables\n",
221 |     "cols_to_use.remove('SalePrice')\n",
222 |     "\n",
223 |     "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
224 |     "                                                    data['SalePrice'],\n",
225 |     "                                                    test_size=0.3,\n",
226 |     "                                                    random_state=0)\n",
227 |     "X_train.shape, X_test.shape"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "markdown",
232 |    "metadata": {},
233 |    "source": [
234 |     "### Feature-engine captura las variables numéricas automáticamente"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 5,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "# llamamos el imputer de Feature-engine\n",
244 |     "# especificamos la estrategia de sustitución, mediana en este caso\n",
245 |     "\n",
246 |     "imputer = mdi.MeanMedianImputer(imputation_method='median')"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 6,
252 |    "metadata": {},
253 |    "outputs": [
254 |     {
255 |      "data": {
256 |       "text/plain": [
257 |        "MeanMedianImputer(variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])"
258 |       ]
259 |      },
260 |      "execution_count": 6,
261 |      "metadata": {},
262 |      "output_type": "execute_result"
263 |     }
264 |    ],
265 |    "source": [
266 |     "# ajustamos el imputer\n",
267 |     "imputer.fit(X_train)"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 7,
273 |    "metadata": {},
274 |    "outputs": [
275 |     {
276 |      "data": {
277 |       "text/plain": [
278 |        "['LotFrontage', 'MasVnrArea', 'GarageYrBlt']"
279 |       ]
280 |      },
281 |      "execution_count": 7,
282 |      "metadata": {},
283 |      "output_type": "execute_result"
284 |     }
285 |    ],
286 |    "source": [
287 |     "# vemos que el imputer automáticamente encontró las variables numéricas para \n",
288 |     "# sustituir con la mediana\n",
289 |     "\n",
290 |     "imputer.variables"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": 8,
296 |    "metadata": {},
297 |    "outputs": [
298 |     {
299 |      "data": {
300 |       "text/plain": [
301 |        "{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}"
302 |       ]
303 |      },
304 |      "execution_count": 8,
305 |      "metadata": {},
306 |      "output_type": "execute_result"
307 |     }
308 |    ],
309 |    "source": [
310 |     "# aquí podemos ver la mediana asignada a cada variable\n",
311 |     "\n",
312 |     "imputer.imputer_dict_"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": 9,
318 |    "metadata": {},
319 |    "outputs": [
320 |     {
321 |      "data": {
322 |       "text/html": [
323 |        "<div>\n",
324 |        "<style scoped>\n",
325 |        "    .dataframe tbody tr th:only-of-type {\n",
326 |        "        vertical-align: middle;\n",
327 |        "    }\n",
328 |        "\n",
329 |        "    .dataframe tbody tr th {\n",
330 |        "        vertical-align: top;\n",
331 |        "    }\n",
332 |        "\n",
333 |        "    .dataframe thead th {\n",
334 |        "        text-align: right;\n",
335 |        "    }\n",
336 |        "</style>\n",
337 |        "<table border=\"1\" class=\"dataframe\">\n",
338 |        "  <thead>\n",
339 |        "    <tr style=\"text-align: right;\">\n",
340 |        "      <th></th>\n",
341 |        "      <th>BsmtQual</th>\n",
342 |        "      <th>FireplaceQu</th>\n",
343 |        "      <th>LotFrontage</th>\n",
344 |        "      <th>MasVnrArea</th>\n",
345 |        "      <th>GarageYrBlt</th>\n",
346 |        "    </tr>\n",
347 |        "  </thead>\n",
348 |        "  <tbody>\n",
349 |        "    <tr>\n",
350 |        "      <th>64</th>\n",
351 |        "      <td>Gd</td>\n",
352 |        "      <td>NaN</td>\n",
353 |        "      <td>69.0</td>\n",
354 |        "      <td>573.0</td>\n",
355 |        "      <td>1998.0</td>\n",
356 |        "    </tr>\n",
357 |        "    <tr>\n",
358 |        "      <th>682</th>\n",
359 |        "      <td>Gd</td>\n",
360 |        "      <td>Gd</td>\n",
361 |        "      <td>69.0</td>\n",
362 |        "      <td>0.0</td>\n",
363 |        "      <td>1996.0</td>\n",
364 |        "    </tr>\n",
365 |        "    <tr>\n",
366 |        "      <th>960</th>\n",
367 |        "      <td>TA</td>\n",
368 |        "      <td>NaN</td>\n",
369 |        "      <td>50.0</td>\n",
370 |        "      <td>0.0</td>\n",
371 |        "      <td>1979.0</td>\n",
372 |        "    </tr>\n",
373 |        "    <tr>\n",
374 |        "      <th>1384</th>\n",
375 |        "      <td>TA</td>\n",
376 |        "      <td>NaN</td>\n",
377 |        "      <td>60.0</td>\n",
378 |        "      <td>0.0</td>\n",
379 |        "      <td>1939.0</td>\n",
380 |        "    </tr>\n",
381 |        "    <tr>\n",
382 |        "      <th>1100</th>\n",
383 |        "      <td>TA</td>\n",
384 |        "      <td>NaN</td>\n",
385 |        "      <td>60.0</td>\n",
386 |        "      <td>0.0</td>\n",
387 |        "      <td>1930.0</td>\n",
388 |        "    </tr>\n",
389 |        "  </tbody>\n",
390 |        "</table>\n",
391 |        "</div>"
392 |       ],
393 |       "text/plain": [
394 |        "     BsmtQual FireplaceQu  LotFrontage  MasVnrArea  GarageYrBlt\n",
395 |        "64         Gd         NaN         69.0       573.0       1998.0\n",
396 |        "682        Gd          Gd         69.0         0.0       1996.0\n",
397 |        "960        TA         NaN         50.0         0.0       1979.0\n",
398 |        "1384       TA         NaN         60.0         0.0       1939.0\n",
399 |        "1100       TA         NaN         60.0         0.0       1930.0"
400 |       ]
401 |      },
402 |      "execution_count": 9,
403 |      "metadata": {},
404 |      "output_type": "execute_result"
405 |     }
406 |    ],
407 |    "source": [
408 |     "# Feature-engine retorna un dataframe \n",
409 |     "\n",
410 |     "tmp = imputer.transform(X_train)\n",
411 |     "tmp.head()"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": 10,
417 |    "metadata": {},
418 |    "outputs": [
419 |     {
420 |      "data": {
421 |       "text/plain": [
422 |        "LotFrontage    0.0\n",
423 |        "MasVnrArea     0.0\n",
424 |        "GarageYrBlt    0.0\n",
425 |        "dtype: float64"
426 |       ]
427 |      },
428 |      "execution_count": 10,
429 |      "metadata": {},
430 |      "output_type": "execute_result"
431 |     }
432 |    ],
433 |    "source": [
434 |     "# revisemos que las variables numéricas no tengan \n",
435 |     "# valores nulos NA \n",
436 |     "\n",
437 |     "tmp[imputer.variables].isnull().mean()"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "markdown",
442 |    "metadata": {},
443 |    "source": [
444 |     "## Feature-engine te permite especificar grupos de variables fácilmente"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "code",
449 |    "execution_count": 11,
450 |    "metadata": {},
451 |    "outputs": [
452 |     {
453 |      "data": {
454 |       "text/plain": [
455 |        "MeanMedianImputer(imputation_method='mean',\n",
456 |        "                  variables=['LotFrontage', 'MasVnrArea'])"
457 |       ]
458 |      },
459 |      "execution_count": 11,
460 |      "metadata": {},
461 |      "output_type": "execute_result"
462 |     }
463 |    ],
464 |    "source": [
465 |     "# usemos la sustitución por la media \n",
466 |     "# para 2 de las 3 variables numéricas\n",
467 |     "\n",
468 |     "imputer = mdi.MeanMedianImputer(imputation_method='mean',\n",
469 |     "                                variables=['LotFrontage', 'MasVnrArea'])\n",
470 |     "\n",
471 |     "imputer.fit(X_train)"
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "code",
476 |    "execution_count": 12,
477 |    "metadata": {},
478 |    "outputs": [
479 |     {
480 |      "data": {
481 |       "text/plain": [
482 |        "['LotFrontage', 'MasVnrArea']"
483 |       ]
484 |      },
485 |      "execution_count": 12,
486 |      "metadata": {},
487 |      "output_type": "execute_result"
488 |     }
489 |    ],
490 |    "source": [
491 |     "# ahora el imputer solo imputa las variables que indicamos\n",
492 |     "\n",
493 |     "imputer.variables"
494 |    ]
495 |   },
496 |   {
497 |    "cell_type": "code",
498 |    "execution_count": 13,
499 |    "metadata": {},
500 |    "outputs": [
501 |     {
502 |      "data": {
503 |       "text/plain": [
504 |        "{'LotFrontage': 69.66866746698679, 'MasVnrArea': 103.55358898721731}"
505 |       ]
506 |      },
507 |      "execution_count": 13,
508 |      "metadata": {},
509 |      "output_type": "execute_result"
510 |     }
511 |    ],
512 |    "source": [
513 |     "# y podemos ver el valor asignado a cada variable\n",
514 |     "imputer.imputer_dict_"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "code",
519 |    "execution_count": 14,
520 |    "metadata": {},
521 |    "outputs": [
522 |     {
523 |      "data": {
524 |       "text/plain": [
525 |        "LotFrontage     69.668667\n",
526 |        "MasVnrArea     103.553589\n",
527 |        "dtype: float64"
528 |       ]
529 |      },
530 |      "execution_count": 14,
531 |      "metadata": {},
532 |      "output_type": "execute_result"
533 |     }
534 |    ],
535 |    "source": [
536 |     "# corroboremos que el diccionario anterior contiene los valores promedio\n",
537 |     "# de las variables\n",
538 |     "\n",
539 |     "X_train[imputer.variables].mean()"
540 |    ]
541 |   },
542 |   {
543 |    "cell_type": "code",
544 |    "execution_count": 15,
545 |    "metadata": {},
546 |    "outputs": [
547 |     {
548 |      "data": {
549 |       "text/plain": [
550 |        "LotFrontage    0.0\n",
551 |        "MasVnrArea     0.0\n",
552 |        "dtype: float64"
553 |       ]
554 |      },
555 |      "execution_count": 15,
556 |      "metadata": {},
557 |      "output_type": "execute_result"
558 |     }
559 |    ],
560 |    "source": [
561 |     "# Feature-engine devuelve un dataframe\n",
562 |     "\n",
563 |     "tmp = imputer.transform(X_train)\n",
564 |     "\n",
565 |     "# miremos que los valores nulos efectivamente ya no existen\n",
566 |     "tmp[imputer.variables].isnull().mean()"
567 |    ]
568 |   },
569 |   {
570 |    "cell_type": "markdown",
571 |    "metadata": {},
572 |    "source": [
573 |     "## Feature-engine puede ser usado con los flujos de Scikit-learn (pipeline)"
574 |    ]
575 |   },
576 |   {
577 |    "cell_type": "code",
578 |    "execution_count": 16,
579 |    "metadata": {},
580 |    "outputs": [],
581 |    "source": [
582 |     "pipe = Pipeline([\n",
583 |     "    ('median_imputer', mdi.MeanMedianImputer(imputation_method='median',\n",
584 |     "                                             variables = ['LotFrontage', 'GarageYrBlt'])),\n",
585 |     "     \n",
586 |     "    ('mean_imputer', mdi.MeanMedianImputer(imputation_method='mean',\n",
587 |     "                                          variables = ['MasVnrArea'])),\n",
588 |     "     ])"
589 |    ]
590 |   },
591 |   {
592 |    "cell_type": "code",
593 |    "execution_count": 17,
594 |    "metadata": {},
595 |    "outputs": [
596 |     {
597 |      "data": {
598 |       "text/plain": [
599 |        "Pipeline(steps=[('median_imputer',\n",
600 |        "                 MeanMedianImputer(variables=['LotFrontage', 'GarageYrBlt'])),\n",
601 |        "                ('mean_imputer',\n",
602 |        "                 MeanMedianImputer(imputation_method='mean',\n",
603 |        "                                   variables=['MasVnrArea']))])"
604 |       ]
605 |      },
606 |      "execution_count": 17,
607 |      "metadata": {},
608 |      "output_type": "execute_result"
609 |     }
610 |    ],
611 |    "source": [
612 |     "pipe.fit(X_train)"
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "code",
617 |    "execution_count": 18,
618 |    "metadata": {},
619 |    "outputs": [
620 |     {
621 |      "data": {
622 |       "text/plain": [
623 |        "{'LotFrontage': 69.0, 'GarageYrBlt': 1979.0}"
624 |       ]
625 |      },
626 |      "execution_count": 18,
627 |      "metadata": {},
628 |      "output_type": "execute_result"
629 |     }
630 |    ],
631 |    "source": [
632 |     "pipe.named_steps['median_imputer'].imputer_dict_"
633 |    ]
634 |   },
635 |   {
636 |    "cell_type": "code",
637 |    "execution_count": 19,
638 |    "metadata": {},
639 |    "outputs": [
640 |     {
641 |      "data": {
642 |       "text/plain": [
643 |        "{'MasVnrArea': 103.55358898721731}"
644 |       ]
645 |      },
646 |      "execution_count": 19,
647 |      "metadata": {},
648 |      "output_type": "execute_result"
649 |     }
650 |    ],
651 |    "source": [
652 |     "pipe.named_steps['mean_imputer'].imputer_dict_"
653 |    ]
654 |   },
655 |   {
656 |    "cell_type": "code",
657 |    "execution_count": 20,
658 |    "metadata": {},
659 |    "outputs": [
660 |     {
661 |      "data": {
662 |       "text/plain": [
663 |        "BsmtQual       0.023483\n",
664 |        "FireplaceQu    0.467710\n",
665 |        "LotFrontage    0.000000\n",
666 |        "MasVnrArea     0.000000\n",
667 |        "GarageYrBlt    0.000000\n",
668 |        "dtype: float64"
669 |       ]
670 |      },
671 |      "execution_count": 20,
672 |      "metadata": {},
673 |      "output_type": "execute_result"
674 |     }
675 |    ],
676 |    "source": [
677 |     "# transformemos los datos con la pipeline\n",
678 |     "tmp = pipe.transform(X_train)\n",
679 |     "\n",
680 |     "# revisemos que las variables numéricas ya no tienen valores nulos\n",
681 |     "tmp.isnull().mean()"
682 |    ]
683 |   }
684 |  ],
685 |  "metadata": {
686 |   "kernelspec": {
687 |    "display_name": "feml",
688 |    "language": "python",
689 |    "name": "feml"
690 |   },
691 |   "language_info": {
692 |    "codemirror_mode": {
693 |     "name": "ipython",
694 |     "version": 3
695 |    },
696 |    "file_extension": ".py",
697 |    "mimetype": "text/x-python",
698 |    "name": "python",
699 |    "nbconvert_exporter": "python",
700 |    "pygments_lexer": "ipython3",
701 |    "version": "3.8.2"
702 |   },
703 |   "toc": {
704 |    "base_numbering": 1,
705 |    "nav_menu": {},
706 |    "number_sections": true,
707 |    "sideBar": true,
708 |    "skip_h1_title": false,
709 |    "title_cell": "Table of Contents",
710 |    "title_sidebar": "Contents",
711 |    "toc_cell": false,
712 |    "toc_position": {},
713 |    "toc_section_display": "block",
714 |    "toc_window_display": true
715 |   }
716 |  },
717 |  "nbformat": 4,
718 |  "nbformat_minor": 2
719 | }
720 | 


--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.13_IndicadorAusencia_Sklearn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Agregar un Indicador de ausencia con Scikit-learn ==> MissingIndicator\n",
  8 |     "\n",
  9 |     "Scikit-learn tiene una clase **MissingIndicator** para añadir una variable binaria que marque los valores ausentes.\n",
 10 |     "\n",
 11 |     "MissingIndicator tiene la opción de añadir un indicador binario (variable) para cada variable en un conjunto de datos o solamente para aquellas que tienen NA en el segmento de entrenamiento.\n",
 12 |     "\n",
 13 |     "### Atención!\n",
 14 |     "\n",
 15 |     "El transformer solo devuelve las variables binarias, que luego deben ser añadidas a los datos originales de entrenamiento.\n",
 16 |     "\n",
 17 |     "### Más detalles acerca de los transformadores\n",
 18 |     "\n",
 19 |     "- [MissingIndicator](https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html#sklearn.impute.MissingIndicator)\n",
 20 |     "\n",
 21 |     "## En este demo:\n",
 22 |     "\n",
 23 |     "Vamos a añadir un Indicador de ausencia (Missing Indicator) para las variables en el Ames House Price Dataset\n",
 24 |     "\n",
 25 |     "- Para bajar los datos, por favor referirse a la clase **Datasets** en la  **Sección 1** del curso."
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 1,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "import pandas as pd\n",
 35 |     "import numpy as np\n",
 36 |     "\n",
 37 |     "import matplotlib.pyplot as plt\n",
 38 |     "\n",
 39 |     "# estas son las clases para sustitución con sklearn\n",
 40 |     "from sklearn.impute import SimpleImputer, MissingIndicator\n",
 41 |     "\n",
 42 |     "# separar segmentos prueba/entrenamiento\n",
 43 |     "from sklearn.model_selection import train_test_split"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 2,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "# solo usaremos las siguientes variables en el demo:\n",
 53 |     "\n",
 54 |     "# una mezcla de variables categóricas y numéricas\n",
 55 |     "\n",
 56 |     "cols_to_use = ['BsmtQual', 'FireplaceQu', 'MSZoning',\n",
 57 |     "               'BsmtUnfSF', 'LotFrontage', 'MasVnrArea',\n",
 58 |     "               'Street', 'Alley', 'SalePrice']"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 3,
 64 |    "metadata": {},
 65 |    "outputs": [
 66 |     {
 67 |      "name": "stdout",
 68 |      "output_type": "stream",
 69 |      "text": [
 70 |       "(1460, 9)\n"
 71 |      ]
 72 |     },
 73 |     {
 74 |      "data": {
 75 |       "text/html": [
 76 |        "<div>\n",
 77 |        "<style scoped>\n",
 78 |        "    .dataframe tbody tr th:only-of-type {\n",
 79 |        "        vertical-align: middle;\n",
 80 |        "    }\n",
 81 |        "\n",
 82 |        "    .dataframe tbody tr th {\n",
 83 |        "        vertical-align: top;\n",
 84 |        "    }\n",
 85 |        "\n",
 86 |        "    .dataframe thead th {\n",
 87 |        "        text-align: right;\n",
 88 |        "    }\n",
 89 |        "</style>\n",
 90 |        "<table border=\"1\" class=\"dataframe\">\n",
 91 |        "  <thead>\n",
 92 |        "    <tr style=\"text-align: right;\">\n",
 93 |        "      <th></th>\n",
 94 |        "      <th>MSZoning</th>\n",
 95 |        "      <th>LotFrontage</th>\n",
 96 |        "      <th>Street</th>\n",
 97 |        "      <th>Alley</th>\n",
 98 |        "      <th>MasVnrArea</th>\n",
 99 |        "      <th>BsmtQual</th>\n",
100 |        "      <th>BsmtUnfSF</th>\n",
101 |        "      <th>FireplaceQu</th>\n",
102 |        "      <th>SalePrice</th>\n",
103 |        "    </tr>\n",
104 |        "  </thead>\n",
105 |        "  <tbody>\n",
106 |        "    <tr>\n",
107 |        "      <th>0</th>\n",
108 |        "      <td>RL</td>\n",
109 |        "      <td>65.0</td>\n",
110 |        "      <td>Pave</td>\n",
111 |        "      <td>NaN</td>\n",
112 |        "      <td>196.0</td>\n",
113 |        "      <td>Gd</td>\n",
114 |        "      <td>150</td>\n",
115 |        "      <td>NaN</td>\n",
116 |        "      <td>208500</td>\n",
117 |        "    </tr>\n",
118 |        "    <tr>\n",
119 |        "      <th>1</th>\n",
120 |        "      <td>RL</td>\n",
121 |        "      <td>80.0</td>\n",
122 |        "      <td>Pave</td>\n",
123 |        "      <td>NaN</td>\n",
124 |        "      <td>0.0</td>\n",
125 |        "      <td>Gd</td>\n",
126 |        "      <td>284</td>\n",
127 |        "      <td>TA</td>\n",
128 |        "      <td>181500</td>\n",
129 |        "    </tr>\n",
130 |        "    <tr>\n",
131 |        "      <th>2</th>\n",
132 |        "      <td>RL</td>\n",
133 |        "      <td>68.0</td>\n",
134 |        "      <td>Pave</td>\n",
135 |        "      <td>NaN</td>\n",
136 |        "      <td>162.0</td>\n",
137 |        "      <td>Gd</td>\n",
138 |        "      <td>434</td>\n",
139 |        "      <td>TA</td>\n",
140 |        "      <td>223500</td>\n",
141 |        "    </tr>\n",
142 |        "    <tr>\n",
143 |        "      <th>3</th>\n",
144 |        "      <td>RL</td>\n",
145 |        "      <td>60.0</td>\n",
146 |        "      <td>Pave</td>\n",
147 |        "      <td>NaN</td>\n",
148 |        "      <td>0.0</td>\n",
149 |        "      <td>TA</td>\n",
150 |        "      <td>540</td>\n",
151 |        "      <td>Gd</td>\n",
152 |        "      <td>140000</td>\n",
153 |        "    </tr>\n",
154 |        "    <tr>\n",
155 |        "      <th>4</th>\n",
156 |        "      <td>RL</td>\n",
157 |        "      <td>84.0</td>\n",
158 |        "      <td>Pave</td>\n",
159 |        "      <td>NaN</td>\n",
160 |        "      <td>350.0</td>\n",
161 |        "      <td>Gd</td>\n",
162 |        "      <td>490</td>\n",
163 |        "      <td>TA</td>\n",
164 |        "      <td>250000</td>\n",
165 |        "    </tr>\n",
166 |        "  </tbody>\n",
167 |        "</table>\n",
168 |        "</div>"
169 |       ],
170 |       "text/plain": [
171 |        "  MSZoning  LotFrontage Street Alley  MasVnrArea BsmtQual  BsmtUnfSF  \\\n",
172 |        "0       RL         65.0   Pave   NaN       196.0       Gd        150   \n",
173 |        "1       RL         80.0   Pave   NaN         0.0       Gd        284   \n",
174 |        "2       RL         68.0   Pave   NaN       162.0       Gd        434   \n",
175 |        "3       RL         60.0   Pave   NaN         0.0       TA        540   \n",
176 |        "4       RL         84.0   Pave   NaN       350.0       Gd        490   \n",
177 |        "\n",
178 |        "  FireplaceQu  SalePrice  \n",
179 |        "0         NaN     208500  \n",
180 |        "1          TA     181500  \n",
181 |        "2          TA     223500  \n",
182 |        "3          Gd     140000  \n",
183 |        "4          TA     250000  "
184 |       ]
185 |      },
186 |      "execution_count": 3,
187 |      "metadata": {},
188 |      "output_type": "execute_result"
189 |     }
190 |    ],
191 |    "source": [
192 |     "# carguemos los datos \n",
193 |     "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
194 |     "print(data.shape)\n",
195 |     "data.head()"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": 4,
201 |    "metadata": {},
202 |    "outputs": [
203 |     {
204 |      "data": {
205 |       "text/plain": [
206 |        "MSZoning       0.000000\n",
207 |        "LotFrontage    0.177397\n",
208 |        "Street         0.000000\n",
209 |        "Alley          0.937671\n",
210 |        "MasVnrArea     0.005479\n",
211 |        "BsmtQual       0.025342\n",
212 |        "BsmtUnfSF      0.000000\n",
213 |        "FireplaceQu    0.472603\n",
214 |        "SalePrice      0.000000\n",
215 |        "dtype: float64"
216 |       ]
217 |      },
218 |      "execution_count": 4,
219 |      "metadata": {},
220 |      "output_type": "execute_result"
221 |     }
222 |    ],
223 |    "source": [
224 |     "# revisemos los valores nulos\n",
225 |     "data.isnull().mean()"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 5,
231 |    "metadata": {},
232 |    "outputs": [
233 |     {
234 |      "data": {
235 |       "text/plain": [
236 |        "((1022, 8), (438, 8))"
237 |       ]
238 |      },
239 |      "execution_count": 5,
240 |      "metadata": {},
241 |      "output_type": "execute_result"
242 |     }
243 |    ],
244 |    "source": [
245 |     "# separar datos en segmentos entrenamiento y prueba\n",
246 |     "\n",
247 |     "# primero, separemos el target (SalePrice) del resto de las variables (features)\n",
248 |     "\n",
249 |     "cols_to_use.remove('SalePrice')\n",
250 |     "\n",
251 |     "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use], # solo las variables\n",
252 |     "                                                    data['SalePrice'], # el target\n",
253 |     "                                                    test_size=0.3, # el porcentaje de obs en el segmento de prueba\n",
254 |     "                                                    random_state=0) # para reproducir\n",
255 |     "X_train.shape, X_test.shape"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 6,
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "data": {
265 |       "text/plain": [
266 |        "BsmtQual       0.023483\n",
267 |        "FireplaceQu    0.467710\n",
268 |        "MSZoning       0.000000\n",
269 |        "BsmtUnfSF      0.000000\n",
270 |        "LotFrontage    0.184932\n",
271 |        "MasVnrArea     0.004892\n",
272 |        "Street         0.000000\n",
273 |        "Alley          0.939335\n",
274 |        "dtype: float64"
275 |       ]
276 |      },
277 |      "execution_count": 6,
278 |      "metadata": {},
279 |      "output_type": "execute_result"
280 |     }
281 |    ],
282 |    "source": [
283 |     "# evaluemos el porcentaje de datos ausentes nuevamente\n",
284 |     "X_train.isnull().mean()"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "markdown",
289 |    "metadata": {},
290 |    "source": [
291 |     "## Añadir un Indicador de Ausencia (Missing Indicator)"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": 7,
297 |    "metadata": {},
298 |    "outputs": [
299 |     {
300 |      "data": {
301 |       "text/plain": [
302 |        "MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,\n",
303 |        "                 sparse='auto')"
304 |       ]
305 |      },
306 |      "execution_count": 7,
307 |      "metadata": {},
308 |      "output_type": "execute_result"
309 |     }
310 |    ],
311 |    "source": [
312 |     "indicator = MissingIndicator(error_on_new=True, features='missing-only')\n",
313 |     "indicator.fit(X_train)  "
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": 8,
319 |    "metadata": {},
320 |    "outputs": [
321 |     {
322 |      "data": {
323 |       "text/plain": [
324 |        "array([0, 1, 4, 5, 7], dtype=int64)"
325 |       ]
326 |      },
327 |      "execution_count": 8,
328 |      "metadata": {},
329 |      "output_type": "execute_result"
330 |     }
331 |    ],
332 |    "source": [
333 |     "# podemos ver las variables con valores nulos na:\n",
334 |     "# el resultado muestra el índice (index)\n",
335 |     "\n",
336 |     "indicator.features_"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": 9,
342 |    "metadata": {},
343 |    "outputs": [
344 |     {
345 |      "data": {
346 |       "text/plain": [
347 |        "Index(['BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'Alley'], dtype='object')"
348 |       ]
349 |      },
350 |      "execution_count": 9,
351 |      "metadata": {},
352 |      "output_type": "execute_result"
353 |     }
354 |    ],
355 |    "source": [
356 |     "# podemos encontrar el nombre de las variables pasando el índice \n",
357 |     "# a la lista de columnas\n",
358 |     "X_train.columns[indicator.features_]"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 10,
364 |    "metadata": {},
365 |    "outputs": [
366 |     {
367 |      "data": {
368 |       "text/plain": [
369 |        "array([[False,  True,  True, False,  True],\n",
370 |        "       [False, False,  True, False,  True],\n",
371 |        "       [False,  True, False, False,  True],\n",
372 |        "       ...,\n",
373 |        "       [ True,  True, False, False,  True],\n",
374 |        "       [False, False,  True, False,  True],\n",
375 |        "       [False,  True, False, False,  True]])"
376 |       ]
377 |      },
378 |      "execution_count": 10,
379 |      "metadata": {},
380 |      "output_type": "execute_result"
381 |     }
382 |    ],
383 |    "source": [
384 |     "# el 'indicator' devuelve solamente los indicadores adicionales\n",
385 |     "# cuando transformamos los datos\n",
386 |     "\n",
387 |     "tmp = indicator.transform(X_train)\n",
388 |     "\n",
389 |     "tmp"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "code",
394 |    "execution_count": 11,
395 |    "metadata": {},
396 |    "outputs": [
397 |     {
398 |      "data": {
399 |       "text/html": [
400 |        "<div>\n",
401 |        "<style scoped>\n",
402 |        "    .dataframe tbody tr th:only-of-type {\n",
403 |        "        vertical-align: middle;\n",
404 |        "    }\n",
405 |        "\n",
406 |        "    .dataframe tbody tr th {\n",
407 |        "        vertical-align: top;\n",
408 |        "    }\n",
409 |        "\n",
410 |        "    .dataframe thead th {\n",
411 |        "        text-align: right;\n",
412 |        "    }\n",
413 |        "</style>\n",
414 |        "<table border=\"1\" class=\"dataframe\">\n",
415 |        "  <thead>\n",
416 |        "    <tr style=\"text-align: right;\">\n",
417 |        "      <th></th>\n",
418 |        "      <th>index</th>\n",
419 |        "      <th>BsmtQual</th>\n",
420 |        "      <th>FireplaceQu</th>\n",
421 |        "      <th>MSZoning</th>\n",
422 |        "      <th>BsmtUnfSF</th>\n",
423 |        "      <th>LotFrontage</th>\n",
424 |        "      <th>MasVnrArea</th>\n",
425 |        "      <th>Street</th>\n",
426 |        "      <th>Alley</th>\n",
427 |        "      <th>BsmtQual_NA</th>\n",
428 |        "      <th>FireplaceQu_NA</th>\n",
429 |        "      <th>LotFrontage_NA</th>\n",
430 |        "      <th>MasVnrArea_NA</th>\n",
431 |        "      <th>Alley_NA</th>\n",
432 |        "    </tr>\n",
433 |        "  </thead>\n",
434 |        "  <tbody>\n",
435 |        "    <tr>\n",
436 |        "      <th>0</th>\n",
437 |        "      <td>64</td>\n",
438 |        "      <td>Gd</td>\n",
439 |        "      <td>NaN</td>\n",
440 |        "      <td>RL</td>\n",
441 |        "      <td>318</td>\n",
442 |        "      <td>NaN</td>\n",
443 |        "      <td>573.0</td>\n",
444 |        "      <td>Pave</td>\n",
445 |        "      <td>NaN</td>\n",
446 |        "      <td>False</td>\n",
447 |        "      <td>True</td>\n",
448 |        "      <td>True</td>\n",
449 |        "      <td>False</td>\n",
450 |        "      <td>True</td>\n",
451 |        "    </tr>\n",
452 |        "    <tr>\n",
453 |        "      <th>1</th>\n",
454 |        "      <td>682</td>\n",
455 |        "      <td>Gd</td>\n",
456 |        "      <td>Gd</td>\n",
457 |        "      <td>RL</td>\n",
458 |        "      <td>288</td>\n",
459 |        "      <td>NaN</td>\n",
460 |        "      <td>0.0</td>\n",
461 |        "      <td>Pave</td>\n",
462 |        "      <td>NaN</td>\n",
463 |        "      <td>False</td>\n",
464 |        "      <td>False</td>\n",
465 |        "      <td>True</td>\n",
466 |        "      <td>False</td>\n",
467 |        "      <td>True</td>\n",
468 |        "    </tr>\n",
469 |        "    <tr>\n",
470 |        "      <th>2</th>\n",
471 |        "      <td>960</td>\n",
472 |        "      <td>TA</td>\n",
473 |        "      <td>NaN</td>\n",
474 |        "      <td>RL</td>\n",
475 |        "      <td>162</td>\n",
476 |        "      <td>50.0</td>\n",
477 |        "      <td>0.0</td>\n",
478 |        "      <td>Pave</td>\n",
479 |        "      <td>NaN</td>\n",
480 |        "      <td>False</td>\n",
481 |        "      <td>True</td>\n",
482 |        "      <td>False</td>\n",
483 |        "      <td>False</td>\n",
484 |        "      <td>True</td>\n",
485 |        "    </tr>\n",
486 |        "    <tr>\n",
487 |        "      <th>3</th>\n",
488 |        "      <td>1384</td>\n",
489 |        "      <td>TA</td>\n",
490 |        "      <td>NaN</td>\n",
491 |        "      <td>RL</td>\n",
492 |        "      <td>356</td>\n",
493 |        "      <td>60.0</td>\n",
494 |        "      <td>0.0</td>\n",
495 |        "      <td>Pave</td>\n",
496 |        "      <td>NaN</td>\n",
497 |        "      <td>False</td>\n",
498 |        "      <td>True</td>\n",
499 |        "      <td>False</td>\n",
500 |        "      <td>False</td>\n",
501 |        "      <td>True</td>\n",
502 |        "    </tr>\n",
503 |        "    <tr>\n",
504 |        "      <th>4</th>\n",
505 |        "      <td>1100</td>\n",
506 |        "      <td>TA</td>\n",
507 |        "      <td>NaN</td>\n",
508 |        "      <td>RL</td>\n",
509 |        "      <td>0</td>\n",
510 |        "      <td>60.0</td>\n",
511 |        "      <td>0.0</td>\n",
512 |        "      <td>Pave</td>\n",
513 |        "      <td>NaN</td>\n",
514 |        "      <td>False</td>\n",
515 |        "      <td>True</td>\n",
516 |        "      <td>False</td>\n",
517 |        "      <td>False</td>\n",
518 |        "      <td>True</td>\n",
519 |        "    </tr>\n",
520 |        "  </tbody>\n",
521 |        "</table>\n",
522 |        "</div>"
523 |       ],
524 |       "text/plain": [
525 |        "   index BsmtQual FireplaceQu MSZoning  BsmtUnfSF  LotFrontage  MasVnrArea  \\\n",
526 |        "0     64       Gd         NaN       RL        318          NaN       573.0   \n",
527 |        "1    682       Gd          Gd       RL        288          NaN         0.0   \n",
528 |        "2    960       TA         NaN       RL        162         50.0         0.0   \n",
529 |        "3   1384       TA         NaN       RL        356         60.0         0.0   \n",
530 |        "4   1100       TA         NaN       RL          0         60.0         0.0   \n",
531 |        "\n",
532 |        "  Street Alley  BsmtQual_NA  FireplaceQu_NA  LotFrontage_NA  MasVnrArea_NA  \\\n",
533 |        "0   Pave   NaN        False            True            True          False   \n",
534 |        "1   Pave   NaN        False           False            True          False   \n",
535 |        "2   Pave   NaN        False            True           False          False   \n",
536 |        "3   Pave   NaN        False            True           False          False   \n",
537 |        "4   Pave   NaN        False            True           False          False   \n",
538 |        "\n",
539 |        "   Alley_NA  \n",
540 |        "0      True  \n",
541 |        "1      True  \n",
542 |        "2      True  \n",
543 |        "3      True  \n",
544 |        "4      True  "
545 |       ]
546 |      },
547 |      "execution_count": 11,
548 |      "metadata": {},
549 |      "output_type": "execute_result"
550 |     }
551 |    ],
552 |    "source": [
553 |     "# ahora necesitamos unirlo manualmente al segmento X_train\n",
554 |     "\n",
555 |     "# creemos una columna por cada uno de los nuevos indicadores MissingIndicators\n",
556 |     "indicator_cols = [c+'_NA' for c in X_train.columns[indicator.features_]]\n",
557 |     "\n",
558 |     "# y ahora concatenamos\n",
559 |     "X_train = pd.concat([\n",
560 |     "    X_train.reset_index(),\n",
561 |     "    pd.DataFrame(tmp, columns = indicator_cols)],\n",
562 |     "    axis=1)\n",
563 |     "\n",
564 |     "X_train.head()"
565 |    ]
566 |   },
567 |   {
568 |    "cell_type": "code",
569 |    "execution_count": 12,
570 |    "metadata": {},
571 |    "outputs": [
572 |     {
573 |      "data": {
574 |       "text/html": [
575 |        "<div>\n",
576 |        "<style scoped>\n",
577 |        "    .dataframe tbody tr th:only-of-type {\n",
578 |        "        vertical-align: middle;\n",
579 |        "    }\n",
580 |        "\n",
581 |        "    .dataframe tbody tr th {\n",
582 |        "        vertical-align: top;\n",
583 |        "    }\n",
584 |        "\n",
585 |        "    .dataframe thead th {\n",
586 |        "        text-align: right;\n",
587 |        "    }\n",
588 |        "</style>\n",
589 |        "<table border=\"1\" class=\"dataframe\">\n",
590 |        "  <thead>\n",
591 |        "    <tr style=\"text-align: right;\">\n",
592 |        "      <th></th>\n",
593 |        "      <th>index</th>\n",
594 |        "      <th>BsmtQual</th>\n",
595 |        "      <th>FireplaceQu</th>\n",
596 |        "      <th>MSZoning</th>\n",
597 |        "      <th>BsmtUnfSF</th>\n",
598 |        "      <th>LotFrontage</th>\n",
599 |        "      <th>MasVnrArea</th>\n",
600 |        "      <th>Street</th>\n",
601 |        "      <th>Alley</th>\n",
602 |        "      <th>BsmtQual_NA</th>\n",
603 |        "      <th>FireplaceQu_NA</th>\n",
604 |        "      <th>LotFrontage_NA</th>\n",
605 |        "      <th>MasVnrArea_NA</th>\n",
606 |        "      <th>Alley_NA</th>\n",
607 |        "    </tr>\n",
608 |        "  </thead>\n",
609 |        "  <tbody>\n",
610 |        "    <tr>\n",
611 |        "      <th>0</th>\n",
612 |        "      <td>529</td>\n",
613 |        "      <td>TA</td>\n",
614 |        "      <td>TA</td>\n",
615 |        "      <td>RL</td>\n",
616 |        "      <td>816</td>\n",
617 |        "      <td>NaN</td>\n",
618 |        "      <td>NaN</td>\n",
619 |        "      <td>Pave</td>\n",
620 |        "      <td>NaN</td>\n",
621 |        "      <td>False</td>\n",
622 |        "      <td>False</td>\n",
623 |        "      <td>True</td>\n",
624 |        "      <td>True</td>\n",
625 |        "      <td>True</td>\n",
626 |        "    </tr>\n",
627 |        "    <tr>\n",
628 |        "      <th>1</th>\n",
629 |        "      <td>491</td>\n",
630 |        "      <td>TA</td>\n",
631 |        "      <td>TA</td>\n",
632 |        "      <td>RL</td>\n",
633 |        "      <td>238</td>\n",
634 |        "      <td>79.0</td>\n",
635 |        "      <td>0.0</td>\n",
636 |        "      <td>Pave</td>\n",
637 |        "      <td>NaN</td>\n",
638 |        "      <td>False</td>\n",
639 |        "      <td>False</td>\n",
640 |        "      <td>False</td>\n",
641 |        "      <td>False</td>\n",
642 |        "      <td>True</td>\n",
643 |        "    </tr>\n",
644 |        "    <tr>\n",
645 |        "      <th>2</th>\n",
646 |        "      <td>459</td>\n",
647 |        "      <td>TA</td>\n",
648 |        "      <td>TA</td>\n",
649 |        "      <td>RL</td>\n",
650 |        "      <td>524</td>\n",
651 |        "      <td>NaN</td>\n",
652 |        "      <td>161.0</td>\n",
653 |        "      <td>Pave</td>\n",
654 |        "      <td>NaN</td>\n",
655 |        "      <td>False</td>\n",
656 |        "      <td>False</td>\n",
657 |        "      <td>True</td>\n",
658 |        "      <td>False</td>\n",
659 |        "      <td>True</td>\n",
660 |        "    </tr>\n",
661 |        "    <tr>\n",
662 |        "      <th>3</th>\n",
663 |        "      <td>279</td>\n",
664 |        "      <td>Gd</td>\n",
665 |        "      <td>TA</td>\n",
666 |        "      <td>RL</td>\n",
667 |        "      <td>768</td>\n",
668 |        "      <td>83.0</td>\n",
669 |        "      <td>299.0</td>\n",
670 |        "      <td>Pave</td>\n",
671 |        "      <td>NaN</td>\n",
672 |        "      <td>False</td>\n",
673 |        "      <td>False</td>\n",
674 |        "      <td>False</td>\n",
675 |        "      <td>False</td>\n",
676 |        "      <td>True</td>\n",
677 |        "    </tr>\n",
678 |        "    <tr>\n",
679 |        "      <th>4</th>\n",
680 |        "      <td>655</td>\n",
681 |        "      <td>TA</td>\n",
682 |        "      <td>NaN</td>\n",
683 |        "      <td>RM</td>\n",
684 |        "      <td>525</td>\n",
685 |        "      <td>21.0</td>\n",
686 |        "      <td>381.0</td>\n",
687 |        "      <td>Pave</td>\n",
688 |        "      <td>NaN</td>\n",
689 |        "      <td>False</td>\n",
690 |        "      <td>True</td>\n",
691 |        "      <td>False</td>\n",
692 |        "      <td>False</td>\n",
693 |        "      <td>True</td>\n",
694 |        "    </tr>\n",
695 |        "  </tbody>\n",
696 |        "</table>\n",
697 |        "</div>"
698 |       ],
699 |       "text/plain": [
700 |        "   index BsmtQual FireplaceQu MSZoning  BsmtUnfSF  LotFrontage  MasVnrArea  \\\n",
701 |        "0    529       TA          TA       RL        816          NaN         NaN   \n",
702 |        "1    491       TA          TA       RL        238         79.0         0.0   \n",
703 |        "2    459       TA          TA       RL        524          NaN       161.0   \n",
704 |        "3    279       Gd          TA       RL        768         83.0       299.0   \n",
705 |        "4    655       TA         NaN       RM        525         21.0       381.0   \n",
706 |        "\n",
707 |        "  Street Alley  BsmtQual_NA  FireplaceQu_NA  LotFrontage_NA  MasVnrArea_NA  \\\n",
708 |        "0   Pave   NaN        False           False            True           True   \n",
709 |        "1   Pave   NaN        False           False           False          False   \n",
710 |        "2   Pave   NaN        False           False            True          False   \n",
711 |        "3   Pave   NaN        False           False           False          False   \n",
712 |        "4   Pave   NaN        False            True           False          False   \n",
713 |        "\n",
714 |        "   Alley_NA  \n",
715 |        "0      True  \n",
716 |        "1      True  \n",
717 |        "2      True  \n",
718 |        "3      True  \n",
719 |        "4      True  "
720 |       ]
721 |      },
722 |      "execution_count": 12,
723 |      "metadata": {},
724 |      "output_type": "execute_result"
725 |     }
726 |    ],
727 |    "source": [
728 |     "# repetimos para el segmento de prueba\n",
729 |     "tmp = indicator.transform(X_test)\n",
730 |     "\n",
731 |     "X_test = pd.concat([\n",
732 |     "    X_test.reset_index(),\n",
733 |     "    pd.DataFrame(tmp, columns = indicator_cols)],\n",
734 |     "    axis=1)\n",
735 |     "\n",
736 |     "X_test.head()"
737 |    ]
738 |   },
739 |   {
740 |    "cell_type": "markdown",
741 |    "metadata": {},
742 |    "source": [
743 |     "### SimpleImputer en un conjunto de datos diferente"
744 |    ]
745 |   },
746 |   {
747 |    "cell_type": "code",
748 |    "execution_count": 13,
749 |    "metadata": {},
750 |    "outputs": [
751 |     {
752 |      "data": {
753 |       "text/plain": [
754 |        "SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n",
755 |        "              missing_values=nan, strategy='most_frequent', verbose=0)"
756 |       ]
757 |      },
758 |      "execution_count": 13,
759 |      "metadata": {},
760 |      "output_type": "execute_result"
761 |     }
762 |    ],
763 |    "source": [
764 |     "# Ahora sustituimos los valores ausentes con el SimpleImputer\n",
765 |     "\n",
766 |     "# creamos una instancia del SimpleImputer\n",
767 |     "# indicamos que queremos sustituir con la \n",
768 |     "# categoría más frecuente\n",
769 |     "imputer = SimpleImputer(strategy='most_frequent')\n",
770 |     "\n",
771 |     "# ajustamos el 'imputer' al set de entrenamiento así aprende\n",
772 |     "# la moda\n",
773 |     "imputer.fit(X_train)"
774 |    ]
775 |   },
776 |   {
777 |    "cell_type": "code",
778 |    "execution_count": 14,
779 |    "metadata": {},
780 |    "outputs": [
781 |     {
782 |      "data": {
783 |       "text/plain": [
784 |        "array([0, 'TA', 'Gd', 'RL', 0, 60.0, 0.0, 'Pave', 'Pave', False, False,\n",
785 |        "       False, False, True], dtype=object)"
786 |       ]
787 |      },
788 |      "execution_count": 14,
789 |      "metadata": {},
790 |      "output_type": "execute_result"
791 |     }
792 |    ],
793 |    "source": [
794 |     "# podemos ver cuáles fueron los valores frecuentes aprendidos:\n",
795 |     "imputer.statistics_"
796 |    ]
797 |   },
798 |   {
799 |    "cell_type": "markdown",
800 |    "metadata": {},
801 |    "source": [
802 |     "**Nota:** el transformer aprende cuál es el valor más frecuente (la moda) para AMBOS tipos de variables: las categóricas y las numéricas."
803 |    ]
804 |   },
805 |   {
806 |    "cell_type": "code",
807 |    "execution_count": 15,
808 |    "metadata": {},
809 |    "outputs": [
810 |     {
811 |      "data": {
812 |       "text/plain": [
813 |        "array([[64, 'Gd', 'Gd', ..., True, False, True],\n",
814 |        "       [682, 'Gd', 'Gd', ..., True, False, True],\n",
815 |        "       [960, 'TA', 'Gd', ..., False, False, True],\n",
816 |        "       ...,\n",
817 |        "       [1216, 'TA', 'Gd', ..., False, False, True],\n",
818 |        "       [559, 'Gd', 'TA', ..., True, False, True],\n",
819 |        "       [684, 'Gd', 'Gd', ..., False, False, True]], dtype=object)"
820 |       ]
821 |      },
822 |      "execution_count": 15,
823 |      "metadata": {},
824 |      "output_type": "execute_result"
825 |     }
826 |    ],
827 |    "source": [
828 |     "# y ahora sustituimos ambos segmentos de prueba y entrenamiento\n",
829 |     "\n",
830 |     "# NOTA: los datos se devuelven como un numpy array!!!\n",
831 |     "X_train = imputer.transform(X_train)\n",
832 |     "X_test = imputer.transform(X_test)\n",
833 |     "\n",
834 |     "X_train"
835 |    ]
836 |   },
837 |   {
838 |    "cell_type": "code",
839 |    "execution_count": null,
840 |    "metadata": {},
841 |    "outputs": [],
842 |    "source": []
843 |   }
844 |  ],
845 |  "metadata": {
846 |   "kernelspec": {
847 |    "display_name": "feml",
848 |    "language": "python",
849 |    "name": "feml"
850 |   },
851 |   "language_info": {
852 |    "codemirror_mode": {
853 |     "name": "ipython",
854 |     "version": 3
855 |    },
856 |    "file_extension": ".py",
857 |    "mimetype": "text/x-python",
858 |    "name": "python",
859 |    "nbconvert_exporter": "python",
860 |    "pygments_lexer": "ipython3",
861 |    "version": "3.8.2"
862 |   },
863 |   "toc": {
864 |    "base_numbering": 1,
865 |    "nav_menu": {},
866 |    "number_sections": true,
867 |    "sideBar": true,
868 |    "skip_h1_title": false,
869 |    "title_cell": "Table of Contents",
870 |    "title_sidebar": "Contents",
871 |    "toc_cell": false,
872 |    "toc_position": {},
873 |    "toc_section_display": true,
874 |    "toc_window_display": true
875 |   }
876 |  },
877 |  "nbformat": 4,
878 |  "nbformat_minor": 2
879 | }
880 | 


--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.12_SustitucionCategoriaAdicional_Sklearn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Sustitución usando una etiqueta adicional 'Missing' con Scikit-learn ==> SimpleImputer \n",
  8 |     "\n",
  9 |     "En la librería Scikit-learn hay una clase para manejar una gran variedad de métodos de sustitución.\n",
 10 |     "\n",
 11 |     "El **SimpleImputer** es una clase que provee funcionalidad básica para la sustitución de valores ausentes, incluyendo:\n",
 12 |     "\n",
 13 |     "- Sustitución por la media y la mediana para variables numéricas\n",
 14 |     "- Sustitución por la categoría más frecuente para variables categóricas.\n",
 15 |     "- Sustitución por valores arbitrarios para variables numéricas y categóricas.\n",
 16 |     "\n",
 17 |     "### Ventajas\n",
 18 |     "\n",
 19 |     "- Fácil de usar si se aplica a todo el dataframe\n",
 20 |     "- Código mantenido por desarrolladores de Scikit-learn: buena calidad\n",
 21 |     "- Rápida computación (usa NumPy para los cálculos)\n",
 22 |     "- Permite usar grid-search (búsqueda en cuadrículas) para varios métodos de sustitución\n",
 23 |     "- Permite usar diferentes valores para codificar ausencia de datos (se puede indicar si por ejemplo los valores nulos son np.nan, ceros, cadenas de caracteres vacías, u otros)\n",
 24 |     "\n",
 25 |     "### Limitaciones\n",
 26 |     "\n",
 27 |     "- Retorna un arreglo de NumPy en lugar de un dataframe de pandas, lo cual es inconveniente para el análisis de datos\n",
 28 |     "- Necesita usar clases adicionales para seleccionar cuáles variables se deben sustituir ==>\n",
 29 |     "    - requiere líneas de código adicional\n",
 30 |     "    - requiere ser usado con otras clases\n",
 31 |     "    - no es tan sencillo de usar\n",
 32 |     "    \n",
 33 |     "### Más detalles acerca de los transformadores (transformers en inglés)\n",
 34 |     "\n",
 35 |     "- [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer)\n",
 36 |     "- [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)\n",
 37 |     "- [Stackoverflow](https://stackoverflow.com/questions/54160370/how-to-use-sklearn-column-transformer)\n",
 38 |     "\n",
 39 |     "\n",
 40 |     "## En este demo:\n",
 41 |     "\n",
 42 |     "Vamos a aprender a realizar la **sustitución con una etiqueta adicional 'Missing' usando Scikit-learn** con los datos Ames House Price.\n",
 43 |     "\n",
 44 |     "- Para bajar los datos, por favor referirse a la clase  **Datasets** en la  **Sección 1** del curso.\n",
 45 |     "\n",
 46 |     "### Nota: \n",
 47 |     "* 'Imputer' se deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase.\n",
 48 |     "* 'slicing' significa seleccionar conjuntos de datos (columnas/filas) de un ‘DataFrame’.\n",
 49 |     "* 'Missing' -> ausente"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 1,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "import pandas as pd\n",
 59 |     "import numpy as np\n",
 60 |     "\n",
 61 |     "import matplotlib.pyplot as plt\n",
 62 |     "\n",
 63 |     "# estas son las clases para sustitución con sklearn\n",
 64 |     "from sklearn.impute import SimpleImputer\n",
 65 |     "from sklearn.compose import ColumnTransformer\n",
 66 |     "from sklearn.pipeline import Pipeline\n",
 67 |     "\n",
 68 |     "# dividir dataset\n",
 69 |     "from sklearn.model_selection import train_test_split"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": 2,
 75 |    "metadata": {},
 76 |    "outputs": [
 77 |     {
 78 |      "data": {
 79 |       "text/html": [
 80 |        "<div>\n",
 81 |        "<style scoped>\n",
 82 |        "    .dataframe tbody tr th:only-of-type {\n",
 83 |        "        vertical-align: middle;\n",
 84 |        "    }\n",
 85 |        "\n",
 86 |        "    .dataframe tbody tr th {\n",
 87 |        "        vertical-align: top;\n",
 88 |        "    }\n",
 89 |        "\n",
 90 |        "    .dataframe thead th {\n",
 91 |        "        text-align: right;\n",
 92 |        "    }\n",
 93 |        "</style>\n",
 94 |        "<table border=\"1\" class=\"dataframe\">\n",
 95 |        "  <thead>\n",
 96 |        "    <tr style=\"text-align: right;\">\n",
 97 |        "      <th></th>\n",
 98 |        "      <th>BsmtQual</th>\n",
 99 |        "      <th>FireplaceQu</th>\n",
100 |        "      <th>SalePrice</th>\n",
101 |        "    </tr>\n",
102 |        "  </thead>\n",
103 |        "  <tbody>\n",
104 |        "    <tr>\n",
105 |        "      <th>0</th>\n",
106 |        "      <td>Gd</td>\n",
107 |        "      <td>NaN</td>\n",
108 |        "      <td>208500</td>\n",
109 |        "    </tr>\n",
110 |        "    <tr>\n",
111 |        "      <th>1</th>\n",
112 |        "      <td>Gd</td>\n",
113 |        "      <td>TA</td>\n",
114 |        "      <td>181500</td>\n",
115 |        "    </tr>\n",
116 |        "    <tr>\n",
117 |        "      <th>2</th>\n",
118 |        "      <td>Gd</td>\n",
119 |        "      <td>TA</td>\n",
120 |        "      <td>223500</td>\n",
121 |        "    </tr>\n",
122 |        "    <tr>\n",
123 |        "      <th>3</th>\n",
124 |        "      <td>TA</td>\n",
125 |        "      <td>Gd</td>\n",
126 |        "      <td>140000</td>\n",
127 |        "    </tr>\n",
128 |        "    <tr>\n",
129 |        "      <th>4</th>\n",
130 |        "      <td>Gd</td>\n",
131 |        "      <td>TA</td>\n",
132 |        "      <td>250000</td>\n",
133 |        "    </tr>\n",
134 |        "  </tbody>\n",
135 |        "</table>\n",
136 |        "</div>"
137 |       ],
138 |       "text/plain": [
139 |        "  BsmtQual FireplaceQu  SalePrice\n",
140 |        "0       Gd         NaN     208500\n",
141 |        "1       Gd          TA     181500\n",
142 |        "2       Gd          TA     223500\n",
143 |        "3       TA          Gd     140000\n",
144 |        "4       Gd          TA     250000"
145 |       ]
146 |      },
147 |      "execution_count": 2,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "# solo usaremos las siguientes variables categóricas en el demo:\n",
154 |     "\n",
155 |     "# estas son las variables categóricas y el target SalePrice\n",
156 |     "cols_to_use = ['BsmtQual', 'FireplaceQu', 'SalePrice']\n",
157 |     "\n",
158 |     "# carguemos los datos \n",
159 |     "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
160 |     "data.head()"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 3,
166 |    "metadata": {},
167 |    "outputs": [
168 |     {
169 |      "data": {
170 |       "text/plain": [
171 |        "BsmtQual       0.025342\n",
172 |        "FireplaceQu    0.472603\n",
173 |        "SalePrice      0.000000\n",
174 |        "dtype: float64"
175 |       ]
176 |      },
177 |      "execution_count": 3,
178 |      "metadata": {},
179 |      "output_type": "execute_result"
180 |     }
181 |    ],
182 |    "source": [
183 |     "# revisemos los valores nulos\n",
184 |     "data.isnull().mean()"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "Las variables categóricas BsmtQual y FireplaceQu tienen datos ausentes\n"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 4,
197 |    "metadata": {},
198 |    "outputs": [
199 |     {
200 |      "data": {
201 |       "text/plain": [
202 |        "((1022, 2), (438, 2))"
203 |       ]
204 |      },
205 |      "execution_count": 4,
206 |      "metadata": {},
207 |      "output_type": "execute_result"
208 |     }
209 |    ],
210 |    "source": [
211 |     "# separar datos en segmentos entrenamiento y prueba\n",
212 |     "\n",
213 |     "# primero, separemos el target (SalePrice) del resto de las variables (features)\n",
214 |     "cols_to_use.remove('SalePrice')\n",
215 |     "\n",
216 |     "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use], # solo las variables\n",
217 |     "                                                    data['SalePrice'], # el target\n",
218 |     "                                                    test_size=0.3, # el porcentaje de obs en el segmento de prueba\n",
219 |     "                                                    random_state=0) # para reproducir\n",
220 |     "X_train.shape, X_test.shape"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 5,
226 |    "metadata": {},
227 |    "outputs": [
228 |     {
229 |      "data": {
230 |       "text/plain": [
231 |        "BsmtQual       0.023483\n",
232 |        "FireplaceQu    0.467710\n",
233 |        "dtype: float64"
234 |       ]
235 |      },
236 |      "execution_count": 5,
237 |      "metadata": {},
238 |      "output_type": "execute_result"
239 |     }
240 |    ],
241 |    "source": [
242 |     "# evaluemos el porcentaje de datos ausentes nuevamente\n",
243 |     "X_train.isnull().mean()"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": 6,
249 |    "metadata": {},
250 |    "outputs": [
251 |     {
252 |      "data": {
253 |       "text/plain": [
254 |        "array(['Gd', 'TA', 'Fa', nan, 'Ex'], dtype=object)"
255 |       ]
256 |      },
257 |      "execution_count": 6,
258 |      "metadata": {},
259 |      "output_type": "execute_result"
260 |     }
261 |    ],
262 |    "source": [
263 |     "# exploremos los valores de la variable categórica\n",
264 |     "X_train['BsmtQual'].unique()"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": 7,
270 |    "metadata": {},
271 |    "outputs": [
272 |     {
273 |      "data": {
274 |       "text/plain": [
275 |        "array([nan, 'Gd', 'TA', 'Fa', 'Po', 'Ex'], dtype=object)"
276 |       ]
277 |      },
278 |      "execution_count": 7,
279 |      "metadata": {},
280 |      "output_type": "execute_result"
281 |     }
282 |    ],
283 |    "source": [
284 |     "# exploremos los valores de la variable categórica\n",
285 |     "X_train['FireplaceQu'].unique()"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 8,
291 |    "metadata": {},
292 |    "outputs": [
293 |     {
294 |      "data": {
295 |       "text/plain": [
296 |        "SimpleImputer(add_indicator=False, copy=True, fill_value='Missing',\n",
297 |        "              missing_values=nan, strategy='constant', verbose=0)"
298 |       ]
299 |      },
300 |      "execution_count": 8,
301 |      "metadata": {},
302 |      "output_type": "execute_result"
303 |     }
304 |    ],
305 |    "source": [
306 |     "# Ahora sustituyamos los valores faltantes con  SimpleImputer\n",
307 |     "\n",
308 |     "# creemos una instancia de la clase SimpleImputer\n",
309 |     "# indicaremos que queremos sustituir los valores nulos\n",
310 |     "# con la categoría 'Missing'\n",
311 |     "\n",
312 |     "imputer = SimpleImputer(strategy='constant', \n",
313 |     "                       fill_value = 'Missing')\n",
314 |     "\n",
315 |     "# ajustamos el imputer al segmento de entrenamiento\n",
316 |     "# en este caso simplemente reemplaza los valores nulos con el valor 'Missing'\n",
317 |     "\n",
318 |     "imputer.fit(X_train)"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 9,
324 |    "metadata": {},
325 |    "outputs": [
326 |     {
327 |      "data": {
328 |       "text/plain": [
329 |        "array(['Missing', 'Missing'], dtype=object)"
330 |       ]
331 |      },
332 |      "execution_count": 9,
333 |      "metadata": {},
334 |      "output_type": "execute_result"
335 |     }
336 |    ],
337 |    "source": [
338 |     "# veamos los valores ajustados:\n",
339 |     "imputer.statistics_"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": 10,
345 |    "metadata": {},
346 |    "outputs": [
347 |     {
348 |      "data": {
349 |       "text/plain": [
350 |        "array([['Gd', 'Missing'],\n",
351 |        "       ['Gd', 'Gd'],\n",
352 |        "       ['TA', 'Missing'],\n",
353 |        "       ...,\n",
354 |        "       ['Missing', 'Missing'],\n",
355 |        "       ['Gd', 'TA'],\n",
356 |        "       ['Gd', 'Missing']], dtype=object)"
357 |       ]
358 |      },
359 |      "execution_count": 10,
360 |      "metadata": {},
361 |      "output_type": "execute_result"
362 |     }
363 |    ],
364 |    "source": [
365 |     "# ahora sustituyamos en los segmentos de entrenamiento y prueba\n",
366 |     "\n",
367 |     "# NOTA: los datos son devueltos como un numpy array!!\n",
368 |     "X_train = imputer.transform(X_train)\n",
369 |     "X_test = imputer.transform(X_test)\n",
370 |     "\n",
371 |     "X_train"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": 11,
377 |    "metadata": {},
378 |    "outputs": [
379 |     {
380 |      "data": {
381 |       "text/html": [
382 |        "<div>\n",
383 |        "<style scoped>\n",
384 |        "    .dataframe tbody tr th:only-of-type {\n",
385 |        "        vertical-align: middle;\n",
386 |        "    }\n",
387 |        "\n",
388 |        "    .dataframe tbody tr th {\n",
389 |        "        vertical-align: top;\n",
390 |        "    }\n",
391 |        "\n",
392 |        "    .dataframe thead th {\n",
393 |        "        text-align: right;\n",
394 |        "    }\n",
395 |        "</style>\n",
396 |        "<table border=\"1\" class=\"dataframe\">\n",
397 |        "  <thead>\n",
398 |        "    <tr style=\"text-align: right;\">\n",
399 |        "      <th></th>\n",
400 |        "      <th>BsmtQual</th>\n",
401 |        "      <th>FireplaceQu</th>\n",
402 |        "    </tr>\n",
403 |        "  </thead>\n",
404 |        "  <tbody>\n",
405 |        "    <tr>\n",
406 |        "      <th>0</th>\n",
407 |        "      <td>Gd</td>\n",
408 |        "      <td>Missing</td>\n",
409 |        "    </tr>\n",
410 |        "    <tr>\n",
411 |        "      <th>1</th>\n",
412 |        "      <td>Gd</td>\n",
413 |        "      <td>Gd</td>\n",
414 |        "    </tr>\n",
415 |        "    <tr>\n",
416 |        "      <th>2</th>\n",
417 |        "      <td>TA</td>\n",
418 |        "      <td>Missing</td>\n",
419 |        "    </tr>\n",
420 |        "    <tr>\n",
421 |        "      <th>3</th>\n",
422 |        "      <td>TA</td>\n",
423 |        "      <td>Missing</td>\n",
424 |        "    </tr>\n",
425 |        "    <tr>\n",
426 |        "      <th>4</th>\n",
427 |        "      <td>TA</td>\n",
428 |        "      <td>Missing</td>\n",
429 |        "    </tr>\n",
430 |        "  </tbody>\n",
431 |        "</table>\n",
432 |        "</div>"
433 |       ],
434 |       "text/plain": [
435 |        "  BsmtQual FireplaceQu\n",
436 |        "0       Gd     Missing\n",
437 |        "1       Gd          Gd\n",
438 |        "2       TA     Missing\n",
439 |        "3       TA     Missing\n",
440 |        "4       TA     Missing"
441 |       ]
442 |      },
443 |      "execution_count": 11,
444 |      "metadata": {},
445 |      "output_type": "execute_result"
446 |     }
447 |    ],
448 |    "source": [
449 |     "# transformemos el segmento de entrenamiento en un dataframe:\n",
450 |     "\n",
451 |     "X_train = pd.DataFrame(X_train, columns=cols_to_use)\n",
452 |     "X_train.head()"
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "code",
457 |    "execution_count": 12,
458 |    "metadata": {},
459 |    "outputs": [
460 |     {
461 |      "data": {
462 |       "text/plain": [
463 |        "array(['Gd', 'TA', 'Fa', 'Missing', 'Ex'], dtype=object)"
464 |       ]
465 |      },
466 |      "execution_count": 12,
467 |      "metadata": {},
468 |      "output_type": "execute_result"
469 |     }
470 |    ],
471 |    "source": [
472 |     "X_train['BsmtQual'].unique()"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": 13,
478 |    "metadata": {},
479 |    "outputs": [
480 |     {
481 |      "data": {
482 |       "text/plain": [
483 |        "BsmtQual       0.0\n",
484 |        "FireplaceQu    0.0\n",
485 |        "dtype: float64"
486 |       ]
487 |      },
488 |      "execution_count": 13,
489 |      "metadata": {},
490 |      "output_type": "execute_result"
491 |     }
492 |    ],
493 |    "source": [
494 |     "X_train.isnull().mean()"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "markdown",
499 |    "metadata": {},
500 |    "source": [
501 |     "**ADVERTENCIA**:\n",
502 |     "\n",
503 |     "Cuando usamos SimpleImputer y fijamos los parámetros:\n",
504 |     "- strategy='constant'\n",
505 |     "- fill_value = 'Missing'\n",
506 |     "\n",
507 |     "Si el dataframe contiene variables numéricas y categóricas, los valores nulos NA en ambos tipos de variables serán reemplazados con 'Missing' y, por lo tanto, las variables numéricas se convertirán en categóricas, lo cual probablemente no es el efecto deseado.\n",
508 |     "\n",
509 |     "La mayoría de los conjuntos de datos contienen variables numéricas y categóricas; por lo tanto, lo más probable es que tengas que usar un transformador para seleccionar las columnas, como mostramos en los notebooks previos y en las siguientes celdas.\n"
510 |    ]
511 |   },
512 |   {
513 |    "cell_type": "code",
514 |    "execution_count": 14,
515 |    "metadata": {},
516 |    "outputs": [
517 |     {
518 |      "data": {
519 |       "text/html": [
520 |        "<div>\n",
521 |        "<style scoped>\n",
522 |        "    .dataframe tbody tr th:only-of-type {\n",
523 |        "        vertical-align: middle;\n",
524 |        "    }\n",
525 |        "\n",
526 |        "    .dataframe tbody tr th {\n",
527 |        "        vertical-align: top;\n",
528 |        "    }\n",
529 |        "\n",
530 |        "    .dataframe thead th {\n",
531 |        "        text-align: right;\n",
532 |        "    }\n",
533 |        "</style>\n",
534 |        "<table border=\"1\" class=\"dataframe\">\n",
535 |        "  <thead>\n",
536 |        "    <tr style=\"text-align: right;\">\n",
537 |        "      <th></th>\n",
538 |        "      <th>LotFrontage</th>\n",
539 |        "      <th>MasVnrArea</th>\n",
540 |        "      <th>BsmtQual</th>\n",
541 |        "      <th>FireplaceQu</th>\n",
542 |        "      <th>GarageYrBlt</th>\n",
543 |        "      <th>SalePrice</th>\n",
544 |        "    </tr>\n",
545 |        "  </thead>\n",
546 |        "  <tbody>\n",
547 |        "    <tr>\n",
548 |        "      <th>0</th>\n",
549 |        "      <td>65.0</td>\n",
550 |        "      <td>196.0</td>\n",
551 |        "      <td>Gd</td>\n",
552 |        "      <td>NaN</td>\n",
553 |        "      <td>2003.0</td>\n",
554 |        "      <td>208500</td>\n",
555 |        "    </tr>\n",
556 |        "    <tr>\n",
557 |        "      <th>1</th>\n",
558 |        "      <td>80.0</td>\n",
559 |        "      <td>0.0</td>\n",
560 |        "      <td>Gd</td>\n",
561 |        "      <td>TA</td>\n",
562 |        "      <td>1976.0</td>\n",
563 |        "      <td>181500</td>\n",
564 |        "    </tr>\n",
565 |        "    <tr>\n",
566 |        "      <th>2</th>\n",
567 |        "      <td>68.0</td>\n",
568 |        "      <td>162.0</td>\n",
569 |        "      <td>Gd</td>\n",
570 |        "      <td>TA</td>\n",
571 |        "      <td>2001.0</td>\n",
572 |        "      <td>223500</td>\n",
573 |        "    </tr>\n",
574 |        "    <tr>\n",
575 |        "      <th>3</th>\n",
576 |        "      <td>60.0</td>\n",
577 |        "      <td>0.0</td>\n",
578 |        "      <td>TA</td>\n",
579 |        "      <td>Gd</td>\n",
580 |        "      <td>1998.0</td>\n",
581 |        "      <td>140000</td>\n",
582 |        "    </tr>\n",
583 |        "    <tr>\n",
584 |        "      <th>4</th>\n",
585 |        "      <td>84.0</td>\n",
586 |        "      <td>350.0</td>\n",
587 |        "      <td>Gd</td>\n",
588 |        "      <td>TA</td>\n",
589 |        "      <td>2000.0</td>\n",
590 |        "      <td>250000</td>\n",
591 |        "    </tr>\n",
592 |        "  </tbody>\n",
593 |        "</table>\n",
594 |        "</div>"
595 |       ],
596 |       "text/plain": [
597 |        "   LotFrontage  MasVnrArea BsmtQual FireplaceQu  GarageYrBlt  SalePrice\n",
598 |        "0         65.0       196.0       Gd         NaN       2003.0     208500\n",
599 |        "1         80.0         0.0       Gd          TA       1976.0     181500\n",
600 |        "2         68.0       162.0       Gd          TA       2001.0     223500\n",
601 |        "3         60.0         0.0       TA          Gd       1998.0     140000\n",
602 |        "4         84.0       350.0       Gd          TA       2000.0     250000"
603 |       ]
604 |      },
605 |      "execution_count": 14,
606 |      "metadata": {},
607 |      "output_type": "execute_result"
608 |     }
609 |    ],
610 |    "source": [
611 |     "# carguemos los datos con variables numéricas y categóricas\n",
612 |     "\n",
613 |     "cols_to_use = [\n",
614 |     "    'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
615 |     "    'SalePrice'\n",
616 |     "]\n",
617 |     "\n",
618 |     "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
619 |     "data.head()"
620 |    ]
621 |   },
622 |   {
623 |    "cell_type": "code",
624 |    "execution_count": 15,
625 |    "metadata": {},
626 |    "outputs": [
627 |     {
628 |      "data": {
629 |       "text/plain": [
630 |        "((1022, 5), (438, 5))"
631 |       ]
632 |      },
633 |      "execution_count": 15,
634 |      "metadata": {},
635 |      "output_type": "execute_result"
636 |     }
637 |    ],
638 |    "source": [
639 |     "# separar datos en segmentos entrenamiento y prueba\n",
640 |     "\n",
641 |     "# primero descartemos el target de la lista de variables\n",
642 |     "cols_to_use.remove('SalePrice')\n",
643 |     "\n",
644 |     "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
645 |     "                                                    data['SalePrice'],\n",
646 |     "                                                    test_size=0.3,\n",
647 |     "                                                    random_state=0)\n",
648 |     "X_train.shape, X_test.shape"
649 |    ]
650 |   },
651 |   {
652 |    "cell_type": "code",
653 |    "execution_count": 16,
654 |    "metadata": {},
655 |    "outputs": [
656 |     {
657 |      "data": {
658 |       "text/plain": [
659 |        "BsmtQual       0.023483\n",
660 |        "FireplaceQu    0.467710\n",
661 |        "LotFrontage    0.184932\n",
662 |        "MasVnrArea     0.004892\n",
663 |        "GarageYrBlt    0.052838\n",
664 |        "dtype: float64"
665 |       ]
666 |      },
667 |      "execution_count": 16,
668 |      "metadata": {},
669 |      "output_type": "execute_result"
670 |     }
671 |    ],
672 |    "source": [
673 |     "# revisemos los valores nulos\n",
674 |     "X_train.isnull().mean()"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "markdown",
679 |    "metadata": {},
680 |    "source": [
681 |     "En este demo, vamos a sustituir los valores nulos de las variables numéricas por la media y los de las variables categóricas por la nueva etiqueta 'Missing'."
682 |    ]
683 |   },
684 |   {
685 |    "cell_type": "code",
686 |    "execution_count": 17,
687 |    "metadata": {},
688 |    "outputs": [],
689 |    "source": [
690 |     "# primero vamos a crear una lista, indicando cuales son las\n",
691 |     "# variables a sustituir con cada método\n",
692 |     "\n",
693 |     "features_numeric = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']\n",
694 |     "features_categoric = ['BsmtQual', 'FireplaceQu']\n",
695 |     "\n",
696 |     "# luego vamos a instanciar imputers\n",
697 |     "# creamos un imputer por cada tipo de variable:\n",
698 |     "# uno con la media para las numéricas y otro con la etiqueta 'Missing' para las categóricas\n",
699 |     "\n",
700 |     "\n",
701 |     "# luego agrupamos las listas de variables con sus transformadores\n",
702 |     "# usando el ColumnTransformer\n",
703 |     "\n",
704 |     "preprocessor = ColumnTransformer(transformers=[\n",
705 |     "    ('imputer_numeric', SimpleImputer(strategy='mean'), features_numeric),\n",
706 |     "    ('imputer_categoric', SimpleImputer(strategy='constant', fill_value='Missing'), features_categoric)])"
707 |    ]
708 |   },
709 |   {
710 |    "cell_type": "code",
711 |    "execution_count": 18,
712 |    "metadata": {},
713 |    "outputs": [
714 |     {
715 |      "data": {
716 |       "text/plain": [
717 |        "ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\n",
718 |        "                  transformer_weights=None,\n",
719 |        "                  transformers=[('imputer_numeric',\n",
720 |        "                                 SimpleImputer(add_indicator=False, copy=True,\n",
721 |        "                                               fill_value=None,\n",
722 |        "                                               missing_values=nan,\n",
723 |        "                                               strategy='mean', verbose=0),\n",
724 |        "                                 ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']),\n",
725 |        "                                ('imputer_categoric',\n",
726 |        "                                 SimpleImputer(add_indicator=False, copy=True,\n",
727 |        "                                               fill_value='Missing',\n",
728 |        "                                               missing_values=nan,\n",
729 |        "                                               strategy='constant', verbose=0),\n",
730 |        "                                 ['BsmtQual', 'FireplaceQu'])],\n",
731 |        "                  verbose=False)"
732 |       ]
733 |      },
734 |      "execution_count": 18,
735 |      "metadata": {},
736 |      "output_type": "execute_result"
737 |     }
738 |    ],
739 |    "source": [
740 |     "# ajustemos el preprocessor\n",
741 |     "preprocessor.fit(X_train)"
742 |    ]
743 |   },
744 |   {
745 |    "cell_type": "code",
746 |    "execution_count": 19,
747 |    "metadata": {},
748 |    "outputs": [
749 |     {
750 |      "data": {
751 |       "text/plain": [
752 |        "[('imputer_numeric',\n",
753 |        "  SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n",
754 |        "                missing_values=nan, strategy='mean', verbose=0),\n",
755 |        "  ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']),\n",
756 |        " ('imputer_categoric',\n",
757 |        "  SimpleImputer(add_indicator=False, copy=True, fill_value='Missing',\n",
758 |        "                missing_values=nan, strategy='constant', verbose=0),\n",
759 |        "  ['BsmtQual', 'FireplaceQu'])]"
760 |       ]
761 |      },
762 |      "execution_count": 19,
763 |      "metadata": {},
764 |      "output_type": "execute_result"
765 |     }
766 |    ],
767 |    "source": [
768 |     "# podemos explorar el transformador:\n",
769 |     "preprocessor.transformers"
770 |    ]
771 |   },
772 |   {
773 |    "cell_type": "code",
774 |    "execution_count": 20,
775 |    "metadata": {},
776 |    "outputs": [
777 |     {
778 |      "data": {
779 |       "text/plain": [
780 |        "array([  69.66866747,  103.55358899, 1978.01239669])"
781 |       ]
782 |      },
783 |      "execution_count": 20,
784 |      "metadata": {},
785 |      "output_type": "execute_result"
786 |     }
787 |    ],
788 |    "source": [
789 |     "# podemos ver los parámetros ajustados:\n",
790 |     "\n",
791 |     "# para el imputer de las variables numéricas\n",
792 |     "preprocessor.named_transformers_['imputer_numeric'].statistics_"
793 |    ]
794 |   },
795 |   {
796 |    "cell_type": "code",
797 |    "execution_count": 21,
798 |    "metadata": {},
799 |    "outputs": [
800 |     {
801 |      "data": {
802 |       "text/plain": [
803 |        "array(['Missing', 'Missing'], dtype=object)"
804 |       ]
805 |      },
806 |      "execution_count": 21,
807 |      "metadata": {},
808 |      "output_type": "execute_result"
809 |     }
810 |    ],
811 |    "source": [
812 |     "# para el imputer de las variables categóricas\n",
813 |     "preprocessor.named_transformers_['imputer_categoric'].statistics_"
814 |    ]
815 |   },
816 |   {
817 |    "cell_type": "code",
818 |    "execution_count": 22,
819 |    "metadata": {},
820 |    "outputs": [],
821 |    "source": [
822 |     "# y ahora podemos sustituir los valores nulos en los segmentos de entrenamiento y prueba\n",
823 |     "# recuerda que los datos retornados son numpy arrays\n",
824 |     "\n",
825 |     "X_train = preprocessor.transform(X_train)\n",
826 |     "X_test = preprocessor.transform(X_test)"
827 |    ]
828 |   },
829 |   {
830 |    "cell_type": "code",
831 |    "execution_count": 23,
832 |    "metadata": {},
833 |    "outputs": [
834 |     {
835 |      "data": {
836 |       "text/html": [
837 |        "<div>\n",
838 |        "<style scoped>\n",
839 |        "    .dataframe tbody tr th:only-of-type {\n",
840 |        "        vertical-align: middle;\n",
841 |        "    }\n",
842 |        "\n",
843 |        "    .dataframe tbody tr th {\n",
844 |        "        vertical-align: top;\n",
845 |        "    }\n",
846 |        "\n",
847 |        "    .dataframe thead th {\n",
848 |        "        text-align: right;\n",
849 |        "    }\n",
850 |        "</style>\n",
851 |        "<table border=\"1\" class=\"dataframe\">\n",
852 |        "  <thead>\n",
853 |        "    <tr style=\"text-align: right;\">\n",
854 |        "      <th></th>\n",
855 |        "      <th>LotFrontage</th>\n",
856 |        "      <th>MasVnrArea</th>\n",
857 |        "      <th>GarageYrBlt</th>\n",
858 |        "      <th>BsmtQual</th>\n",
859 |        "      <th>FireplaceQu</th>\n",
860 |        "    </tr>\n",
861 |        "  </thead>\n",
862 |        "  <tbody>\n",
863 |        "    <tr>\n",
864 |        "      <th>0</th>\n",
865 |        "      <td>69.6687</td>\n",
866 |        "      <td>573</td>\n",
867 |        "      <td>1998</td>\n",
868 |        "      <td>Gd</td>\n",
869 |        "      <td>Missing</td>\n",
870 |        "    </tr>\n",
871 |        "    <tr>\n",
872 |        "      <th>1</th>\n",
873 |        "      <td>69.6687</td>\n",
874 |        "      <td>0</td>\n",
875 |        "      <td>1996</td>\n",
876 |        "      <td>Gd</td>\n",
877 |        "      <td>Gd</td>\n",
878 |        "    </tr>\n",
879 |        "    <tr>\n",
880 |        "      <th>2</th>\n",
881 |        "      <td>50</td>\n",
882 |        "      <td>0</td>\n",
883 |        "      <td>1978.01</td>\n",
884 |        "      <td>TA</td>\n",
885 |        "      <td>Missing</td>\n",
886 |        "    </tr>\n",
887 |        "    <tr>\n",
888 |        "      <th>3</th>\n",
889 |        "      <td>60</td>\n",
890 |        "      <td>0</td>\n",
891 |        "      <td>1939</td>\n",
892 |        "      <td>TA</td>\n",
893 |        "      <td>Missing</td>\n",
894 |        "    </tr>\n",
895 |        "    <tr>\n",
896 |        "      <th>4</th>\n",
897 |        "      <td>60</td>\n",
898 |        "      <td>0</td>\n",
899 |        "      <td>1930</td>\n",
900 |        "      <td>TA</td>\n",
901 |        "      <td>Missing</td>\n",
902 |        "    </tr>\n",
903 |        "  </tbody>\n",
904 |        "</table>\n",
905 |        "</div>"
906 |       ],
907 |       "text/plain": [
908 |        "  LotFrontage MasVnrArea GarageYrBlt BsmtQual FireplaceQu\n",
909 |        "0     69.6687        573        1998       Gd     Missing\n",
910 |        "1     69.6687          0        1996       Gd          Gd\n",
911 |        "2          50          0     1978.01       TA     Missing\n",
912 |        "3          60          0        1939       TA     Missing\n",
913 |        "4          60          0        1930       TA     Missing"
914 |       ]
915 |      },
916 |      "execution_count": 23,
917 |      "metadata": {},
918 |      "output_type": "execute_result"
919 |     }
920 |    ],
921 |    "source": [
922 |     "# ahora convirtamos el resultado en un dataframe\n",
923 |     "pd.DataFrame(X_train,\n",
924 |     "             columns=features_numeric+features_categoric).head()"
925 |    ]
926 |   },
927 |   {
928 |    "cell_type": "code",
929 |    "execution_count": 24,
930 |    "metadata": {},
931 |    "outputs": [
932 |     {
933 |      "data": {
934 |       "text/plain": [
935 |        "LotFrontage    0.0\n",
936 |        "MasVnrArea     0.0\n",
937 |        "GarageYrBlt    0.0\n",
938 |        "BsmtQual       0.0\n",
939 |        "FireplaceQu    0.0\n",
940 |        "dtype: float64"
941 |       ]
942 |      },
943 |      "execution_count": 24,
944 |      "metadata": {},
945 |      "output_type": "execute_result"
946 |     }
947 |    ],
948 |    "source": [
949 |     "# ahora convertimos el resultado en un dataframe\n",
950 |     "# y exploramos los valores ausentes\n",
951 |     "# (no debería haber ninguno)\n",
952 |     "\n",
953 |     "\n",
954 |     "X_train = pd.DataFrame(X_train,\n",
955 |     "             columns=features_numeric+features_categoric)\n",
956 |     "\n",
957 |     "X_train.isnull().mean()"
958 |    ]
959 |   }
960 |  ],
961 |  "metadata": {
962 |   "kernelspec": {
963 |    "display_name": "feml",
964 |    "language": "python",
965 |    "name": "feml"
966 |   },
967 |   "language_info": {
968 |    "codemirror_mode": {
969 |     "name": "ipython",
970 |     "version": 3
971 |    },
972 |    "file_extension": ".py",
973 |    "mimetype": "text/x-python",
974 |    "name": "python",
975 |    "nbconvert_exporter": "python",
976 |    "pygments_lexer": "ipython3",
977 |    "version": "3.8.2"
978 |   },
979 |   "toc": {
980 |    "base_numbering": 1,
981 |    "nav_menu": {},
982 |    "number_sections": true,
983 |    "sideBar": true,
984 |    "skip_h1_title": false,
985 |    "title_cell": "Table of Contents",
986 |    "title_sidebar": "Contents",
987 |    "toc_cell": false,
988 |    "toc_position": {},
989 |    "toc_section_display": true,
990 |    "toc_window_display": true
991 |   }
992 |  },
993 |  "nbformat": 4,
994 |  "nbformat_minor": 2
995 | }
996 | 


--------------------------------------------------------------------------------