├── GUARDE_LOS_DATASETS_AQUI.txt
├── LICENSE
├── .github
└── FUNDING.yml
├── LogoUdemy.png
├── trainindata.png
├── .gitignore
├── Section-08-Discretizacion
├── tree_visualisation.png
└── tree_model.txt
├── README.md
├── Seccion-01-Introduccion
└── 01.11-Sets-de-datos.ipynb
├── Seccion-04-Sustitucion-Datos-Faltantes
├── 04.20_SustitucionMuestraAleatoria_FeatureEngine.ipynb
├── 04.18_SustitucionModa_FeatureEngine.ipynb
├── 04.19_SustitucionCategoriaAdicional_FeatureEngine.ipynb
├── 04.15_SustitucionMediaMediana_FeatureEngine.ipynb
├── 04.13_IndicadorAusencia_Sklearn.ipynb
└── 04.12_SustitucionCategoriaAdicional_Sklearn.ipynb
├── Section-09-Ingenieria-valores-extremos
└── 09.05-Truncamiento-valores-arbitrarios.ipynb
├── Seccion-02-Tipos-de-Variables
└── 02.4_VariablesMixtas.ipynb
└── Section-06-Codificacion-Variables-Categoricas
└── 06.04_Codificacion-frecuencia.ipynb
/GUARDE_LOS_DATASETS_AQUI.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/ingenieria-de-variables/HEAD/LICENSE
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [solegalli]
4 |
--------------------------------------------------------------------------------
/LogoUdemy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/ingenieria-de-variables/HEAD/LogoUdemy.png
--------------------------------------------------------------------------------
/trainindata.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/ingenieria-de-variables/HEAD/trainindata.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Jupyter Notebook
2 | .ipynb_checkpoints
3 |
4 | # datasets
5 | *.csv
6 |
7 | # other files
8 | .DS_Store
9 |
--------------------------------------------------------------------------------
/Section-08-Discretizacion/tree_visualisation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/ingenieria-de-variables/HEAD/Section-08-Discretizacion/tree_visualisation.png
--------------------------------------------------------------------------------
/Section-08-Discretizacion/tree_model.txt:
--------------------------------------------------------------------------------
1 | digraph Tree {
2 | node [shape=box] ;
3 | 0 [label="X[0] <= 64.5\ngini = 0.474\nsamples = 916\nvalue = [563, 353]"] ;
4 | 1 [label="X[0] <= 8.5\ngini = 0.475\nsamples = 907\nvalue = [554, 353]"] ;
5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
6 | 2 [label="X[0] <= 0.458\ngini = 0.497\nsamples = 52\nvalue = [24, 28]"] ;
7 | 1 -> 2 ;
8 | 3 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
9 | 2 -> 3 ;
10 | 4 [label="gini = 0.498\nsamples = 51\nvalue = [24, 27]"] ;
11 | 2 -> 4 ;
12 | 5 [label="X[0] <= 44.5\ngini = 0.471\nsamples = 855\nvalue = [530, 325]"] ;
13 | 1 -> 5 ;
14 | 6 [label="gini = 0.464\nsamples = 713\nvalue = [452, 261]"] ;
15 | 5 -> 6 ;
16 | 7 [label="gini = 0.495\nsamples = 142\nvalue = [78, 64]"] ;
17 | 5 -> 7 ;
18 | 8 [label="gini = 0.0\nsamples = 9\nvalue = [9, 0]"] ;
19 | 0 -> 8 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
20 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | [Licencia](https://github.com/solegalli/ingenieria-de-variables/blob/master/LICENSE)
3 | [Train in Data](https://www.trainindata.com/)
4 |
5 | ## Ingeniería de Variables para Machine Learning - Código
6 |
7 | Publicado en Junio de 2020
8 |
9 | El código no se actualiza.
10 |
11 | Videos en español: [Ingeniería de Variables, Playlist en YouTube](https://www.youtube.com/watch?v=fmAUVceuQu4&list=PL_7uaHXkQmKU6JyThyqyUUZdCYqJJ9SeO)
12 |
13 | Curso Original en Inglés: [Feature Engineering for Machine Learning](https://www.trainindata.com/p/feature-engineering-for-machine-learning)
14 |
15 | [Train in Data](https://www.trainindata.com/?lang=es)
16 |
17 |
18 | ## Links
19 |
20 | - [Curso Online en Inglés](https://www.trainindata.com/p/feature-engineering-for-machine-learning)
21 | - [Lista de videos en Español](https://www.youtube.com/watch?v=fmAUVceuQu4&list=PL_7uaHXkQmKU6JyThyqyUUZdCYqJJ9SeO)
22 |
23 |
24 | ## Tabla de Contenidos
25 |
26 | **Todas las técnicas aplicadas con Pandas, Scikit-learn y Feature-engine**
27 |
28 | 1. **Tipos de variables**
29 | 1. Numéricas
30 | 2. Categóricas
31 | 3. Fecha y hora
32 | 4. Mixtas
33 |
34 | 2. **Características de las variables**
35 | 1. Datos ausentes
36 | 2. Cardinalidad
37 | 3. Etiquetas raras
38 | 4. Supuestos de los modelos
39 | 5. Valores extremos
40 | 6. Escala de las variables
41 |
42 | 3. **Sustitución de datos faltantes**
43 | 1. Análisis de Casos Completos
44 | 2. Imputación con la media y la mediana
45 | 3. Sustitución con valor arbitrario
46 | 4. Imputación con valor al final de la distribución
47 | 5. Sustitución con la categoría más frecuente (moda)
48 | 6. Imputación con categoría adicional
49 | 7. Imputación aleatoria
50 | 8. Agregado de indicador de ausencia
51 | 9. Secuencia de imputación
52 |
53 |
54 | 4. **Codificación de variables categóricas**
55 | 1. Codificación One Hot
56 | 2. Codificación One Hot de categorías frecuentes
57 | 3. Codificación Ordinal
58 | 4. Codificación con cuentas o frecuencias
59 | 5. Codificación ordinal ordenada
60 | 6. Codificación con la media de la variable de respuesta
61 | 7. Codificación con tasa de probabilidad
62 | 8. Peso de la evidencia
63 | 9. Manejo de etiquetas raras
64 |
65 | 5. **Transformación de variables numéricas**
66 | 1. Transformación Logarítmica
67 | 2. Transformación de Potencia
68 | 3. Transformación Reciproca
68 | 3. Transformación Recíproca
69 | 4. Transformación de Box-Cox
71 |
72 | 6. **Discretización**
73 | 1. Discretización con intervalos de igual rango
74 | 2. Discretización con intervalos de igual frecuencia
75 | 3. Discretización arbitraria
76 | 4. Discretización con árboles de decisión
77 |
78 | 7. **Datos Extremos**
79 | 1. Remoción de datos extremos
80 | 2. Truncamiento
81 | 3. Winsorización
82 |
83 | 8. **Escalamiento de variables**
84 | 1. Estandarización
85 | 2. Escalamiento por la media
86 | 3. Escalamiento al mínimo y máximo valor
87 | 4. Escalamiento al máximo absoluto
88 | 5. Escalamiento con mediana y rango intercuartil
89 | 6. Normalización a la norma del vector
90 |
91 | 9. **Variables mixtas**
92 | 1. Separación en componente numérico y componente categórico
93 |
94 | 10. **Variables de fecha y hora**
95 | 1. Extracción de componentes de día, mes y año
96 | 2. Extracción de hora, minutos y segundos
97 | 3. Captura de tiempo transcurrido
98 | 4. Manejo de zonas horarias
99 |
100 | 11. **Ensamblado de flujos de aprendizaje automático**
101 | 1. Regresión
102 | 2. Clasificación
103 |
104 | - [Curso Online en Inglés](https://www.trainindata.com/p/feature-engineering-for-machine-learning)
105 | - [Lista de videos en Español](https://www.youtube.com/watch?v=fmAUVceuQu4&list=PL_7uaHXkQmKU6JyThyqyUUZdCYqJJ9SeO)
106 |
--------------------------------------------------------------------------------
/Seccion-01-Introduccion/01.11-Sets-de-datos.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Ejemplo 1: Préstamos Peer-to-Peer (Finanzas)\n",
8 | "\n",
9 | "\n",
10 | "### El Modelo de Negocio\n",
11 | "\n",
12 | "Los préstamos peer-to-peer (abreviado P2P) ocurren cuando inversores prestan dinero directamente a personas o negocios a través de una plataforma online. La plataforma online pone en contacto de manera digital y automática a los prestatarios con los inversores, y conduce también el análisis crediticio y de riesgo necesario para determinar el riesgo del préstamo y la tasa de interés adecuada. Suele haber un menor costo de operación en los préstamos P2P; por esto los inversores obtienen retornos más altos, y los prestatarios, intereses más bajos. Aunque hoy en día esto ya no siempre es así.\n",
13 | "\n",
14 | "\n",
15 | "### El set de datos\n",
16 | "\n",
17 | "El set de datos simulado que creamos para este curso contiene datos sobre préstamos desembolsados por una compañía ficticia de préstamos peer-to-peer. Las variables incluyen el estado actual del préstamo al crearse el set de datos, e información acerca de pagos, así como también información acerca de la situación financiera y otros datos acerca del prestatario. \n",
18 | "El set de datos viene junto con los Jupyter notebooks que bajaste en la sección anterior."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Ejemplo 2: Predecir el precio de venta de casas\n",
26 | "\n",
27 | "En este set de datos, tenemos variables con características de las casas y los barrios en donde se encuentran localizadas, y el objetivo es predecir el precio de venta en base a estas variables. Predecir el precio de venta suele ser un dato útil para anticipar áreas en donde hacer futuras inversiones.\n",
28 | "\n",
29 | "### Bajar y guardar\n",
30 | "\n",
31 | "Para bajar el set de datos haz lo siguiente:\n",
32 | "\n",
33 | "- Visita la [página web House Sale Price competition](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)\n",
34 | "- Navega hacia abajo y haz clic en “train.csv” y luego en el botón de “Download” a la derecha\n",
35 | "- Cámbiale el nombre al set de datos a “houseprice.csv”\n",
36 | "- Guarda el set de datos en la carpeta que contiene los Jupyter notebooks, en donde ves el archivo “GUARDE_LOS_DATASETS_AQUI.txt”.\n"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## Ejemplo 3: Predecir sobrevivencia en el Titanic\n",
44 | "\n",
45 | "### Historia\n",
46 | "Como probablemente sabes, el hundimiento del Titanic fue un lamentable hecho en el que el barco chocó con un témpano de hielo y se hundió, terminando con la vida de 1502 de sus 2224 pasajeros. Es curioso cómo el análisis de las características de los pasajeros revela datos interesantes acerca de quiénes fueron priorizados al momento del salvataje, siendo la mayoría de los sobrevivientes mujeres y niños de clase alta. \n",
47 | "\n",
48 | "### Para generar el set de datos, sigue estas instrucciones"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 1,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "import pandas as pd\n",
58 | "import numpy as np"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 2,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/html": [
69 | "
\n",
70 | "\n",
83 | "
\n",
84 | " \n",
85 | " \n",
86 | " \n",
87 | " pclass \n",
88 | " survived \n",
89 | " name \n",
90 | " sex \n",
91 | " age \n",
92 | " sibsp \n",
93 | " parch \n",
94 | " ticket \n",
95 | " fare \n",
96 | " cabin \n",
97 | " embarked \n",
98 | " boat \n",
99 | " body \n",
100 | " home.dest \n",
101 | " \n",
102 | " \n",
103 | " \n",
104 | " \n",
105 | " 0 \n",
106 | " 1 \n",
107 | " 1 \n",
108 | " Allen, Miss. Elisabeth Walton \n",
109 | " female \n",
110 | " 29 \n",
111 | " 0 \n",
112 | " 0 \n",
113 | " 24160 \n",
114 | " 211.3375 \n",
115 | " B5 \n",
116 | " S \n",
117 | " 2 \n",
118 | " ? \n",
119 | " St Louis, MO \n",
120 | " \n",
121 | " \n",
122 | " 1 \n",
123 | " 1 \n",
124 | " 1 \n",
125 | " Allison, Master. Hudson Trevor \n",
126 | " male \n",
127 | " 0.9167 \n",
128 | " 1 \n",
129 | " 2 \n",
130 | " 113781 \n",
131 | " 151.55 \n",
132 | " C22 C26 \n",
133 | " S \n",
134 | " 11 \n",
135 | " ? \n",
136 | " Montreal, PQ / Chesterville, ON \n",
137 | " \n",
138 | " \n",
139 | " 2 \n",
140 | " 1 \n",
141 | " 0 \n",
142 | " Allison, Miss. Helen Loraine \n",
143 | " female \n",
144 | " 2 \n",
145 | " 1 \n",
146 | " 2 \n",
147 | " 113781 \n",
148 | " 151.55 \n",
149 | " C22 C26 \n",
150 | " S \n",
151 | " ? \n",
152 | " ? \n",
153 | " Montreal, PQ / Chesterville, ON \n",
154 | " \n",
155 | " \n",
156 | " 3 \n",
157 | " 1 \n",
158 | " 0 \n",
159 | " Allison, Mr. Hudson Joshua Creighton \n",
160 | " male \n",
161 | " 30 \n",
162 | " 1 \n",
163 | " 2 \n",
164 | " 113781 \n",
165 | " 151.55 \n",
166 | " C22 C26 \n",
167 | " S \n",
168 | " ? \n",
169 | " 135 \n",
170 | " Montreal, PQ / Chesterville, ON \n",
171 | " \n",
172 | " \n",
173 | " 4 \n",
174 | " 1 \n",
175 | " 0 \n",
176 | " Allison, Mrs. Hudson J C (Bessie Waldo Daniels) \n",
177 | " female \n",
178 | " 25 \n",
179 | " 1 \n",
180 | " 2 \n",
181 | " 113781 \n",
182 | " 151.55 \n",
183 | " C22 C26 \n",
184 | " S \n",
185 | " ? \n",
186 | " ? \n",
187 | " Montreal, PQ / Chesterville, ON \n",
188 | " \n",
189 | " \n",
190 | "
\n",
191 | "
"
192 | ],
193 | "text/plain": [
194 | " pclass survived name sex \\\n",
195 | "0 1 1 Allen, Miss. Elisabeth Walton female \n",
196 | "1 1 1 Allison, Master. Hudson Trevor male \n",
197 | "2 1 0 Allison, Miss. Helen Loraine female \n",
198 | "3 1 0 Allison, Mr. Hudson Joshua Creighton male \n",
199 | "4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n",
200 | "\n",
201 | " age sibsp parch ticket fare cabin embarked boat body \\\n",
202 | "0 29 0 0 24160 211.3375 B5 S 2 ? \n",
203 | "1 0.9167 1 2 113781 151.55 C22 C26 S 11 ? \n",
204 | "2 2 1 2 113781 151.55 C22 C26 S ? ? \n",
205 | "3 30 1 2 113781 151.55 C22 C26 S ? 135 \n",
206 | "4 25 1 2 113781 151.55 C22 C26 S ? ? \n",
207 | "\n",
208 | " home.dest \n",
209 | "0 St Louis, MO \n",
210 | "1 Montreal, PQ / Chesterville, ON \n",
211 | "2 Montreal, PQ / Chesterville, ON \n",
212 | "3 Montreal, PQ / Chesterville, ON \n",
213 | "4 Montreal, PQ / Chesterville, ON "
214 | ]
215 | },
216 | "execution_count": 2,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')\n",
223 | "data.head()"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 3,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | "pclass 0\n",
235 | "survived 0\n",
236 | "name 0\n",
237 | "sex 0\n",
238 | "age 263\n",
239 | "sibsp 0\n",
240 | "parch 0\n",
241 | "ticket 0\n",
242 | "fare 1\n",
243 | "cabin 1014\n",
244 | "embarked 2\n",
245 | "boat 823\n",
246 | "body 1188\n",
247 | "home.dest 564\n",
248 | "dtype: int64"
249 | ]
250 | },
251 | "execution_count": 3,
252 | "metadata": {},
253 | "output_type": "execute_result"
254 | }
255 | ],
256 | "source": [
257 | "data = data.replace('?', np.nan)\n",
258 | "data.isnull().sum()"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 4,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "def get_first_cabin(row):\n",
268 | " try:\n",
269 | " return row.split()[0]\n",
270 | " except:\n",
271 | " return np.nan "
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 5,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "data['cabin'] = data['cabin'].apply(get_first_cabin)"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 6,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "data.to_csv('../titanic.csv', index=False)"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "**Atención**\n",
297 | "\n",
298 | "Si ejecutas este Jupyter notebook desde su ubicación original dentro de la carpeta que contiene los Jupyter notebooks, el set de datos del Titanic se guardará en la carpeta adecuada.\n",
299 | "\n",
300 | "Si no, asegúrate de que esté guardado en la carpeta que contiene los Jupyter notebooks, en donde ves el archivo GUARDE_LOS_DATASETS_AQUI.txt\n"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": []
309 | }
310 | ],
311 | "metadata": {
312 | "kernelspec": {
313 | "display_name": "Python 3",
314 | "language": "python",
315 | "name": "python3"
316 | },
317 | "toc": {
318 | "base_numbering": 1,
319 | "nav_menu": {},
320 | "number_sections": true,
321 | "sideBar": true,
322 | "skip_h1_title": false,
323 | "title_cell": "Table of Contents",
324 | "title_sidebar": "Contents",
325 | "toc_cell": false,
326 | "toc_position": {},
327 | "toc_section_display": true,
328 | "toc_window_display": true
329 | }
330 | },
331 | "nbformat": 4,
332 | "nbformat_minor": 2
333 | }
334 |
--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.20_SustitucionMuestraAleatoria_FeatureEngine.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Sustitución por Muestra Aleatoria ==> Feature-engine\n",
8 | "\n",
9 | "### ¿Qué es Feature-engine?\n",
10 | "\n",
11 | "Feature-engine es una librería de Python que hemos creado para este curso. \n",
12 | "\n",
13 | "- Feature-engine incluye todas las técnicas de ingeniería de variables descritas en este curso\n",
14 | "- Feature-engine funciona como Scikit-learn, por lo tanto es fácil de aprender\n",
15 | "- Feature-engine te permite implementar pasos de ingeniería de variables específicos para diferentes grupos de variables\n",
16 | "- Feature-engine puede ser integrado con las pipelines de Scikit-learn, permitiendo construir modelos fácilmente\n",
17 | "**Feature-engine te permite diseñar y guardar un flujo de ingeniería de variables con procesos diseñados específicamente para diferentes grupos de variables.**\n",
18 | "\n",
19 | "-------------------------------------------------------------------\n",
20 | "Feature-engine puede ser instalado vía pip ==> pip install feature-engine\n",
21 | "\n",
22 | "- Asegúrate de que has instalado Feature-engine antes de correr este notebook\n",
23 | "\n",
24 | "Para más detalle visita el [sitio web de Train in Data](https://www.trainindata.com/feature-engine) \n",
25 | "\n",
26 | "\n",
27 | "## En este demo:\n",
28 | "\n",
29 | "Vamos a usar **Feature-engine para hacer la sustitución por muestra aleatoria** usando los datos Ames House Price.\n",
30 | "\n",
31 | "- Para bajar los datos, por favor referirse a la clase **Datasets** en la **Sección 1** del curso.\n",
32 | "\n",
33 | "### Nota: \n",
34 | "* 'Imputer' se deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase.\n"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 1,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import pandas as pd\n",
44 | "import numpy as np\n",
45 | "\n",
46 | "import matplotlib.pyplot as plt\n",
47 | "\n",
48 | "from sklearn.model_selection import train_test_split\n",
49 | "from sklearn.pipeline import Pipeline\n",
50 | "\n",
51 | "# feature engine\n",
52 | "from feature_engine import imputation as mdi"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "data": {
62 | "text/html": [
63 | "\n",
64 | "\n",
77 | "
\n",
78 | " \n",
79 | " \n",
80 | " \n",
81 | " LotFrontage \n",
82 | " MasVnrArea \n",
83 | " BsmtQual \n",
84 | " FireplaceQu \n",
85 | " GarageYrBlt \n",
86 | " SalePrice \n",
87 | " \n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " 0 \n",
92 | " 65.0 \n",
93 | " 196.0 \n",
94 | " Gd \n",
95 | " NaN \n",
96 | " 2003.0 \n",
97 | " 208500 \n",
98 | " \n",
99 | " \n",
100 | " 1 \n",
101 | " 80.0 \n",
102 | " 0.0 \n",
103 | " Gd \n",
104 | " TA \n",
105 | " 1976.0 \n",
106 | " 181500 \n",
107 | " \n",
108 | " \n",
109 | " 2 \n",
110 | " 68.0 \n",
111 | " 162.0 \n",
112 | " Gd \n",
113 | " TA \n",
114 | " 2001.0 \n",
115 | " 223500 \n",
116 | " \n",
117 | " \n",
118 | " 3 \n",
119 | " 60.0 \n",
120 | " 0.0 \n",
121 | " TA \n",
122 | " Gd \n",
123 | " 1998.0 \n",
124 | " 140000 \n",
125 | " \n",
126 | " \n",
127 | " 4 \n",
128 | " 84.0 \n",
129 | " 350.0 \n",
130 | " Gd \n",
131 | " TA \n",
132 | " 2000.0 \n",
133 | " 250000 \n",
134 | " \n",
135 | " \n",
136 | "
\n",
137 | "
"
138 | ],
139 | "text/plain": [
140 | " LotFrontage MasVnrArea BsmtQual FireplaceQu GarageYrBlt SalePrice\n",
141 | "0 65.0 196.0 Gd NaN 2003.0 208500\n",
142 | "1 80.0 0.0 Gd TA 1976.0 181500\n",
143 | "2 68.0 162.0 Gd TA 2001.0 223500\n",
144 | "3 60.0 0.0 TA Gd 1998.0 140000\n",
145 | "4 84.0 350.0 Gd TA 2000.0 250000"
146 | ]
147 | },
148 | "execution_count": 2,
149 | "metadata": {},
150 | "output_type": "execute_result"
151 | }
152 | ],
153 | "source": [
154 | "# carguemos los datos con unas columnas seleccionadas\n",
155 | "\n",
156 | "cols_to_use = [\n",
157 | " 'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
158 | " 'SalePrice'\n",
159 | "]\n",
160 | "\n",
161 | "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
162 | "data.head()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 3,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "data": {
172 | "text/plain": [
173 | "LotFrontage 0.177397\n",
174 | "MasVnrArea 0.005479\n",
175 | "BsmtQual 0.025342\n",
176 | "FireplaceQu 0.472603\n",
177 | "GarageYrBlt 0.055479\n",
178 | "SalePrice 0.000000\n",
179 | "dtype: float64"
180 | ]
181 | },
182 | "execution_count": 3,
183 | "metadata": {},
184 | "output_type": "execute_result"
185 | }
186 | ],
187 | "source": [
188 | "data.isnull().mean()"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 4,
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/plain": [
199 | "((1022, 5), (438, 5))"
200 | ]
201 | },
202 | "execution_count": 4,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "# separar datos en segmentos entrenamiento y prueba\n",
209 | "\n",
210 | "# primero, separemos el target (SalePrice) del resto de las variables\n",
211 | "\n",
212 | "cols_to_use.remove('SalePrice')\n",
213 | "\n",
214 | "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
215 | " data['SalePrice'],\n",
216 | " test_size=0.3,\n",
217 | " random_state=0)\n",
218 | "X_train.shape, X_test.shape"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "## Feature-engine Random Sampler por defecto captura todas las variables"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 5,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "# llamemos el imputer de Feature-engine\n",
235 | "# no necesitamos especificar nada\n",
236 | "\n",
237 | "imputer = mdi.RandomSampleImputer(random_state = 29)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 6,
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "data": {
247 | "text/plain": [
248 | "RandomSampleImputer(random_state=29,\n",
249 | " variables=['BsmtQual', 'FireplaceQu', 'LotFrontage',\n",
250 | " 'MasVnrArea', 'GarageYrBlt'])"
251 | ]
252 | },
253 | "execution_count": 6,
254 | "metadata": {},
255 | "output_type": "execute_result"
256 | }
257 | ],
258 | "source": [
259 | "# ajustemos el imputer\n",
260 | "\n",
261 | "imputer.fit(X_train)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 7,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "data": {
271 | "text/plain": [
272 | "['BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']"
273 | ]
274 | },
275 | "execution_count": 7,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "# vemos que el imputer encontró las variables categóricas \n",
282 | "# y numéricas\n",
283 | "\n",
284 | "imputer.variables"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 8,
290 | "metadata": {},
291 | "outputs": [
292 | {
293 | "data": {
294 | "text/html": [
295 | "\n",
296 | "\n",
309 | "
\n",
310 | " \n",
311 | " \n",
312 | " \n",
313 | " BsmtQual \n",
314 | " FireplaceQu \n",
315 | " LotFrontage \n",
316 | " MasVnrArea \n",
317 | " GarageYrBlt \n",
318 | " \n",
319 | " \n",
320 | " \n",
321 | " \n",
322 | " 64 \n",
323 | " Gd \n",
324 | " NaN \n",
325 | " NaN \n",
326 | " 573.0 \n",
327 | " 1998.0 \n",
328 | " \n",
329 | " \n",
330 | " 682 \n",
331 | " Gd \n",
332 | " Gd \n",
333 | " NaN \n",
334 | " 0.0 \n",
335 | " 1996.0 \n",
336 | " \n",
337 | " \n",
338 | " 960 \n",
339 | " TA \n",
340 | " NaN \n",
341 | " 50.0 \n",
342 | " 0.0 \n",
343 | " NaN \n",
344 | " \n",
345 | " \n",
346 | " 1384 \n",
347 | " TA \n",
348 | " NaN \n",
349 | " 60.0 \n",
350 | " 0.0 \n",
351 | " 1939.0 \n",
352 | " \n",
353 | " \n",
354 | " 1100 \n",
355 | " TA \n",
356 | " NaN \n",
357 | " 60.0 \n",
358 | " 0.0 \n",
359 | " 1930.0 \n",
360 | " \n",
361 | " \n",
362 | "
\n",
363 | "
"
364 | ],
365 | "text/plain": [
366 | " BsmtQual FireplaceQu LotFrontage MasVnrArea GarageYrBlt\n",
367 | "64 Gd NaN NaN 573.0 1998.0\n",
368 | "682 Gd Gd NaN 0.0 1996.0\n",
369 | "960 TA NaN 50.0 0.0 NaN\n",
370 | "1384 TA NaN 60.0 0.0 1939.0\n",
371 | "1100 TA NaN 60.0 0.0 1930.0"
372 | ]
373 | },
374 | "execution_count": 8,
375 | "metadata": {},
376 | "output_type": "execute_result"
377 | }
378 | ],
379 | "source": [
380 | "# el imputer guarda una copia de las variables seleccionadas del\n",
381 | "# segmento de entrenamiento, de las cuales extraer la muestra aleatoria\n",
382 | "\n",
383 | "imputer.X_.head()"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 9,
389 | "metadata": {},
390 | "outputs": [
391 | {
392 | "data": {
393 | "text/html": [
394 | "\n",
395 | "\n",
408 | "
\n",
409 | " \n",
410 | " \n",
411 | " \n",
412 | " BsmtQual \n",
413 | " FireplaceQu \n",
414 | " LotFrontage \n",
415 | " MasVnrArea \n",
416 | " GarageYrBlt \n",
417 | " \n",
418 | " \n",
419 | " \n",
420 | " \n",
421 | " 64 \n",
422 | " Gd \n",
423 | " TA \n",
424 | " 60.0 \n",
425 | " 573.0 \n",
426 | " 1998.0 \n",
427 | " \n",
428 | " \n",
429 | " 682 \n",
430 | " Gd \n",
431 | " Gd \n",
432 | " 90.0 \n",
433 | " 0.0 \n",
434 | " 1996.0 \n",
435 | " \n",
436 | " \n",
437 | " 960 \n",
438 | " TA \n",
439 | " Gd \n",
440 | " 50.0 \n",
441 | " 0.0 \n",
442 | " 1977.0 \n",
443 | " \n",
444 | " \n",
445 | " 1384 \n",
446 | " TA \n",
447 | " Gd \n",
448 | " 60.0 \n",
449 | " 0.0 \n",
450 | " 1939.0 \n",
451 | " \n",
452 | " \n",
453 | " 1100 \n",
454 | " TA \n",
455 | " Gd \n",
456 | " 60.0 \n",
457 | " 0.0 \n",
458 | " 1930.0 \n",
459 | " \n",
460 | " \n",
461 | "
\n",
462 | "
"
463 | ],
464 | "text/plain": [
465 | " BsmtQual FireplaceQu LotFrontage MasVnrArea GarageYrBlt\n",
466 | "64 Gd TA 60.0 573.0 1998.0\n",
467 | "682 Gd Gd 90.0 0.0 1996.0\n",
468 | "960 TA Gd 50.0 0.0 1977.0\n",
469 | "1384 TA Gd 60.0 0.0 1939.0\n",
470 | "1100 TA Gd 60.0 0.0 1930.0"
471 | ]
472 | },
473 | "execution_count": 9,
474 | "metadata": {},
475 | "output_type": "execute_result"
476 | }
477 | ],
478 | "source": [
479 | "# feature engine devuelve un dataframe\n",
480 | "\n",
481 | "tmp = imputer.transform(X_train)\n",
482 | "tmp.head()"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": 10,
488 | "metadata": {},
489 | "outputs": [
490 | {
491 | "data": {
492 | "text/plain": [
493 | "BsmtQual 0.0\n",
494 | "FireplaceQu 0.0\n",
495 | "LotFrontage 0.0\n",
496 | "MasVnrArea 0.0\n",
497 | "GarageYrBlt 0.0\n",
498 | "dtype: float64"
499 | ]
500 | },
501 | "execution_count": 10,
502 | "metadata": {},
503 | "output_type": "execute_result"
504 | }
505 | ],
506 | "source": [
507 | "#revisemos que ya no tenemos valores nulos\n",
508 | "\n",
509 | "tmp[imputer.variables].isnull().mean()"
510 | ]
511 | },
512 | {
513 | "cell_type": "markdown",
514 | "metadata": {},
515 | "source": [
516 | "Revisa la documentación del RandomSampleImputer() para aprender cómo poner semillas dependiendo de variables en el set de datos, como explicamos anteriormente:\n",
517 | "https://feature-engine.readthedocs.io/en/latest/imputation/RandomSampleImputer.html"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": null,
523 | "metadata": {},
524 | "outputs": [],
525 | "source": []
526 | }
527 | ],
528 | "metadata": {
529 | "kernelspec": {
530 | "display_name": "feml",
531 | "language": "python",
532 | "name": "feml"
533 | },
534 | "language_info": {
535 | "codemirror_mode": {
536 | "name": "ipython",
537 | "version": 3
538 | },
539 | "file_extension": ".py",
540 | "mimetype": "text/x-python",
541 | "name": "python",
542 | "nbconvert_exporter": "python",
543 | "pygments_lexer": "ipython3",
544 | "version": "3.8.2"
545 | },
546 | "toc": {
547 | "base_numbering": 1,
548 | "nav_menu": {},
549 | "number_sections": true,
550 | "sideBar": true,
551 | "skip_h1_title": false,
552 | "title_cell": "Table of Contents",
553 | "title_sidebar": "Contents",
554 | "toc_cell": false,
555 | "toc_position": {},
556 | "toc_section_display": "block",
557 | "toc_window_display": true
558 | }
559 | },
560 | "nbformat": 4,
561 | "nbformat_minor": 2
562 | }
563 |
--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.18_SustitucionModa_FeatureEngine.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Sustitución categoría más frecuente ==> Feature-engine\n",
8 | "\n",
9 | "\n",
10 | "### ¿Qué es Feature-engine?\n",
11 | "\n",
12 | "Feature-engine es una librería de Python que hemos creado para este curso. \n",
13 | "\n",
14 | "- Feature-engine incluye todas las técnicas de ingeniería de variables descritas en este curso\n",
15 | "- Feature-engine funciona como Scikit-learn, por lo tanto es fácil de aprender\n",
16 | "- Feature-engine te permite implementar pasos de ingeniería de variables específicos para diferentes grupos de variables\n",
17 | "- Feature-engine puede ser integrado con las pipelines de Scikit-learn, permitiendo construir modelos fácilmente\n",
18 | "**Feature-engine te permite diseñar y guardar un flujo de ingeniería de variables con procesos diseñados específicamente para diferentes grupos de variables.**\n",
19 | "\n",
20 | "-------------------------------------------------------------------\n",
21 | "Feature-engine puede ser instalado vía pip ==> pip install feature-engine\n",
22 | "\n",
23 | "- Asegúrate de que has instalado Feature-engine antes de correr este notebook\n",
24 | "\n",
25 | "Para más detalle visita el [sitio web de Train in Data](https://www.trainindata.com/feature-engine) \n",
26 | "\n",
27 | "\n",
28 | "## En este demo:\n",
29 | "\n",
30 | "Vamos a usar **Feature-engine para hacer la sustitución por la categoría más frecuente** usando los datos Ames House Price.\n",
31 | "\n",
32 | "- Para bajar los datos, por favor referirse a la clase **Datasets** en la **Sección 1** del curso.\n",
33 | "\n",
34 | "### Nota: \n",
35 | "* 'Imputer' se deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import pandas as pd\n",
45 | "import numpy as np\n",
46 | "\n",
47 | "import matplotlib.pyplot as plt\n",
48 | "\n",
49 | "from sklearn.model_selection import train_test_split\n",
50 | "from sklearn.pipeline import Pipeline\n",
51 | "\n",
52 | "# feature engine\n",
53 | "from feature_engine import imputation as mdi"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/html": [
64 | "\n",
65 | "\n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " LotFrontage \n",
83 | " MasVnrArea \n",
84 | " BsmtQual \n",
85 | " FireplaceQu \n",
86 | " GarageYrBlt \n",
87 | " SalePrice \n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " \n",
92 | " 0 \n",
93 | " 65.0 \n",
94 | " 196.0 \n",
95 | " Gd \n",
96 | " NaN \n",
97 | " 2003.0 \n",
98 | " 208500 \n",
99 | " \n",
100 | " \n",
101 | " 1 \n",
102 | " 80.0 \n",
103 | " 0.0 \n",
104 | " Gd \n",
105 | " TA \n",
106 | " 1976.0 \n",
107 | " 181500 \n",
108 | " \n",
109 | " \n",
110 | " 2 \n",
111 | " 68.0 \n",
112 | " 162.0 \n",
113 | " Gd \n",
114 | " TA \n",
115 | " 2001.0 \n",
116 | " 223500 \n",
117 | " \n",
118 | " \n",
119 | " 3 \n",
120 | " 60.0 \n",
121 | " 0.0 \n",
122 | " TA \n",
123 | " Gd \n",
124 | " 1998.0 \n",
125 | " 140000 \n",
126 | " \n",
127 | " \n",
128 | " 4 \n",
129 | " 84.0 \n",
130 | " 350.0 \n",
131 | " Gd \n",
132 | " TA \n",
133 | " 2000.0 \n",
134 | " 250000 \n",
135 | " \n",
136 | " \n",
137 | "
\n",
138 | "
"
139 | ],
140 | "text/plain": [
141 | " LotFrontage MasVnrArea BsmtQual FireplaceQu GarageYrBlt SalePrice\n",
142 | "0 65.0 196.0 Gd NaN 2003.0 208500\n",
143 | "1 80.0 0.0 Gd TA 1976.0 181500\n",
144 | "2 68.0 162.0 Gd TA 2001.0 223500\n",
145 | "3 60.0 0.0 TA Gd 1998.0 140000\n",
146 | "4 84.0 350.0 Gd TA 2000.0 250000"
147 | ]
148 | },
149 | "execution_count": 2,
150 | "metadata": {},
151 | "output_type": "execute_result"
152 | }
153 | ],
154 | "source": [
155 | "# carguemos los datos con unas columnas seleccionadas\n",
156 | "\n",
157 | "cols_to_use = [\n",
158 | " 'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
159 | " 'SalePrice'\n",
160 | "]\n",
161 | "\n",
162 | "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
163 | "data.head()"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 3,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/plain": [
174 | "LotFrontage 0.177397\n",
175 | "MasVnrArea 0.005479\n",
176 | "BsmtQual 0.025342\n",
177 | "FireplaceQu 0.472603\n",
178 | "GarageYrBlt 0.055479\n",
179 | "SalePrice 0.000000\n",
180 | "dtype: float64"
181 | ]
182 | },
183 | "execution_count": 3,
184 | "metadata": {},
185 | "output_type": "execute_result"
186 | }
187 | ],
188 | "source": [
189 | "data.isnull().mean()"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "Todas las variables predictivas tienen datos ausentes"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 4,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/plain": [
207 | "((1022, 5), (438, 5))"
208 | ]
209 | },
210 | "execution_count": 4,
211 | "metadata": {},
212 | "output_type": "execute_result"
213 | }
214 | ],
215 | "source": [
216 | "# separar datos en segmentos entrenamiento y prueba\n",
217 | "\n",
218 | "# primero, separemos el target (SalePrice) del resto de las variables\n",
219 | "\n",
220 | "cols_to_use.remove('SalePrice')\n",
221 | "\n",
222 | "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
223 | " data['SalePrice'],\n",
224 | " test_size=0.3,\n",
225 | " random_state=0)\n",
226 | "X_train.shape, X_test.shape"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "## Feature-engine captura las variables categóricas automáticamente"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 5,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "# llamemos el imputer de feature engine\n",
243 | "# no necesitamos especificar nada\n",
244 | "\n",
245 | "imputer = mdi.CategoricalImputer(imputation_method='frequent')"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 6,
251 | "metadata": {},
252 | "outputs": [
253 | {
254 | "data": {
255 | "text/plain": [
256 | "CategoricalImputer(imputation_method='frequent',\n",
257 | " variables=['BsmtQual', 'FireplaceQu'])"
258 | ]
259 | },
260 | "execution_count": 6,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "# ajustemos el imputer\n",
267 | "\n",
268 | "imputer.fit(X_train)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 7,
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "['BsmtQual', 'FireplaceQu']"
280 | ]
281 | },
282 | "execution_count": 7,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "# vemos que el imputer encontró las variables categóricas \n",
289 | "# para sustituir con la categoría más frecuente o moda\n",
290 | "\n",
291 | "\n",
292 | "imputer.variables"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 8,
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "data": {
302 | "text/plain": [
303 | "{'BsmtQual': 'TA', 'FireplaceQu': 'Gd'}"
304 | ]
305 | },
306 | "execution_count": 8,
307 | "metadata": {},
308 | "output_type": "execute_result"
309 | }
310 | ],
311 | "source": [
312 | "# aquí vemos los valores que serán usados\n",
313 | "# para reemplazar los NA en cada variable\n",
314 | "\n",
315 | "\n",
316 | "imputer.imputer_dict_"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 9,
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/html": [
327 | "\n",
328 | "\n",
341 | "
\n",
342 | " \n",
343 | " \n",
344 | " \n",
345 | " BsmtQual \n",
346 | " FireplaceQu \n",
347 | " \n",
348 | " \n",
349 | " \n",
350 | " \n",
351 | " 0 \n",
352 | " TA \n",
353 | " Gd \n",
354 | " \n",
355 | " \n",
356 | "
\n",
357 | "
"
358 | ],
359 | "text/plain": [
360 | " BsmtQual FireplaceQu\n",
361 | "0 TA Gd"
362 | ]
363 | },
364 | "execution_count": 9,
365 | "metadata": {},
366 | "output_type": "execute_result"
367 | }
368 | ],
369 | "source": [
370 | "# revisemos las modas sobre el set de entrenamiento\n",
371 | "\n",
372 | "X_train[imputer.variables].mode()"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 10,
378 | "metadata": {},
379 | "outputs": [
380 | {
381 | "data": {
382 | "text/html": [
383 | "\n",
384 | "\n",
397 | "
\n",
398 | " \n",
399 | " \n",
400 | " \n",
401 | " BsmtQual \n",
402 | " FireplaceQu \n",
403 | " LotFrontage \n",
404 | " MasVnrArea \n",
405 | " GarageYrBlt \n",
406 | " \n",
407 | " \n",
408 | " \n",
409 | " \n",
410 | " 64 \n",
411 | " Gd \n",
412 | " Gd \n",
413 | " NaN \n",
414 | " 573.0 \n",
415 | " 1998.0 \n",
416 | " \n",
417 | " \n",
418 | " 682 \n",
419 | " Gd \n",
420 | " Gd \n",
421 | " NaN \n",
422 | " 0.0 \n",
423 | " 1996.0 \n",
424 | " \n",
425 | " \n",
426 | " 960 \n",
427 | " TA \n",
428 | " Gd \n",
429 | " 50.0 \n",
430 | " 0.0 \n",
431 | " NaN \n",
432 | " \n",
433 | " \n",
434 | " 1384 \n",
435 | " TA \n",
436 | " Gd \n",
437 | " 60.0 \n",
438 | " 0.0 \n",
439 | " 1939.0 \n",
440 | " \n",
441 | " \n",
442 | " 1100 \n",
443 | " TA \n",
444 | " Gd \n",
445 | " 60.0 \n",
446 | " 0.0 \n",
447 | " 1930.0 \n",
448 | " \n",
449 | " \n",
450 | "
\n",
451 | "
"
452 | ],
453 | "text/plain": [
454 | " BsmtQual FireplaceQu LotFrontage MasVnrArea GarageYrBlt\n",
455 | "64 Gd Gd NaN 573.0 1998.0\n",
456 | "682 Gd Gd NaN 0.0 1996.0\n",
457 | "960 TA Gd 50.0 0.0 NaN\n",
458 | "1384 TA Gd 60.0 0.0 1939.0\n",
459 | "1100 TA Gd 60.0 0.0 1930.0"
460 | ]
461 | },
462 | "execution_count": 10,
463 | "metadata": {},
464 | "output_type": "execute_result"
465 | }
466 | ],
467 | "source": [
468 | "# feature-engine devuelve un dataframe\n",
469 | "\n",
470 | "tmp = imputer.transform(X_train)\n",
471 | "tmp.head()"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": 11,
477 | "metadata": {},
478 | "outputs": [
479 | {
480 | "data": {
481 | "text/plain": [
482 | "BsmtQual 0.0\n",
483 | "FireplaceQu 0.0\n",
484 | "dtype: float64"
485 | ]
486 | },
487 | "execution_count": 11,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "# revisemos que los valores nulos ya no existen\n",
494 | "\n",
495 | "tmp[imputer.variables].isnull().mean()"
496 | ]
497 | },
498 | {
499 | "cell_type": "markdown",
500 | "metadata": {},
501 | "source": [
502 | "## Feature-engine te permite especificar grupos de variables fácilmente"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 12,
508 | "metadata": {},
509 | "outputs": [
510 | {
511 | "data": {
512 | "text/plain": [
513 | "CategoricalImputer(imputation_method='frequent', variables=['BsmtQual'])"
514 | ]
515 | },
516 | "execution_count": 12,
517 | "metadata": {},
518 | "output_type": "execute_result"
519 | }
520 | ],
521 | "source": [
522 | "# usemos la sustitución pero esta vez solo\n",
523 | "# imputemos una variable\n",
524 | "\n",
525 | "imputer = mdi.CategoricalImputer(imputation_method='frequent',\n",
526 | " variables=['BsmtQual'])\n",
527 | "\n",
528 | "imputer.fit(X_train)"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": 13,
534 | "metadata": {},
535 | "outputs": [
536 | {
537 | "data": {
538 | "text/plain": [
539 | "['BsmtQual']"
540 | ]
541 | },
542 | "execution_count": 13,
543 | "metadata": {},
544 | "output_type": "execute_result"
545 | }
546 | ],
547 | "source": [
548 | "# ahora el imputer solo tiene la variable indicada\n",
549 | "\n",
550 | "imputer.variables"
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": 14,
556 | "metadata": {},
557 | "outputs": [
558 | {
559 | "data": {
560 | "text/plain": [
561 | "{'BsmtQual': 'TA'}"
562 | ]
563 | },
564 | "execution_count": 14,
565 | "metadata": {},
566 | "output_type": "execute_result"
567 | }
568 | ],
569 | "source": [
570 | "# y podemos ver el valor asignado para sustituirla\n",
571 | "\n",
572 | "imputer.imputer_dict_"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 15,
578 | "metadata": {},
579 | "outputs": [
580 | {
581 | "data": {
582 | "text/plain": [
583 | "BsmtQual 0.0\n",
584 | "dtype: float64"
585 | ]
586 | },
587 | "execution_count": 15,
588 | "metadata": {},
589 | "output_type": "execute_result"
590 | }
591 | ],
592 | "source": [
593 | "# feature-engine devuelve un dataframe\n",
594 | "# al imputar la variable:\n",
595 | "\n",
596 | "tmp = imputer.transform(X_train)\n",
597 | "\n",
598 | "\n",
599 | "# revisemos que la variable indicada ya no tiene valores nulos\n",
600 | "tmp[imputer.variables].isnull().mean()"
601 | ]
602 | },
603 | {
604 | "cell_type": "markdown",
605 | "metadata": {},
606 | "source": [
607 | "Funcionó!\n",
608 | "\n",
609 | "Más detalles aquí:\n",
610 | "https://feature-engine.readthedocs.io/en/latest/imputation/CategoricalImputer.html"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": null,
616 | "metadata": {},
617 | "outputs": [],
618 | "source": []
619 | }
620 | ],
621 | "metadata": {
622 | "kernelspec": {
623 | "display_name": "feml",
624 | "language": "python",
625 | "name": "feml"
626 | },
627 | "language_info": {
628 | "codemirror_mode": {
629 | "name": "ipython",
630 | "version": 3
631 | },
632 | "file_extension": ".py",
633 | "mimetype": "text/x-python",
634 | "name": "python",
635 | "nbconvert_exporter": "python",
636 | "pygments_lexer": "ipython3",
637 | "version": "3.8.2"
638 | },
639 | "toc": {
640 | "base_numbering": 1,
641 | "nav_menu": {},
642 | "number_sections": true,
643 | "sideBar": true,
644 | "skip_h1_title": false,
645 | "title_cell": "Table of Contents",
646 | "title_sidebar": "Contents",
647 | "toc_cell": false,
648 | "toc_position": {},
649 | "toc_section_display": "block",
650 | "toc_window_display": true
651 | }
652 | },
653 | "nbformat": 4,
654 | "nbformat_minor": 2
655 | }
656 |
--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.19_SustitucionCategoriaAdicional_FeatureEngine.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Sustitución usando una etiqueta adicional en variables categóricas ==> Feature-engine\n",
8 | "\n",
9 | "### Qué es Feature-engine?\n",
10 | "\n",
11 | "Feature-engine es una librería de Python que hemos creado para este curso. \n",
12 | "\n",
13 | "- Feature-engine incluye todas las técnicas de ingeniería de variables descritas en este curso\n",
14 | "- Feature-engine funciona como Scikit-learn, por lo tanto es fácil de aprender\n",
15 | "- Feature-engine te permite implementar pasos de ingeniería de variables específicos para diferentes grupos de variables\n",
16 | "- Feature-engine puede ser integrado con las pipelines de Scikit-learn, permitiendo construir modelos fácilmente\n",
17 | "**Feature-engine te permite diseñar y guardar un flujo de ingeniería de variables con procesos diseñados específicamente para diferentes grupos de variables.**\n",
18 | "\n",
19 | "-------------------------------------------------------------------\n",
20 | "Feature-engine puede ser instalado vía pip ==> pip install feature-engine\n",
21 | "\n",
22 | "- Asegúrate de que has instalado Feature-engine antes de correr este notebook\n",
23 | "\n",
24 | "Para más detalles, visita el [sitio web de Train in Data](https://www.trainindata.com/feature-engine)\n",
25 | "\n",
26 | "\n",
27 | "## En este demo:\n",
28 | "\n",
29 | "Vamos a usar **Feature-engine para hacer la sustitución usando una etiqueta adicional 'Missing' en variables categóricas** usando los datos Ames House Price.\n",
30 | "\n",
31 | "- Para bajar los datos, por favor referirse a la clase **Datasets** en la **Sección 1** del curso.\n",
32 | "\n",
33 | "### Nota: \n",
34 | "* 'Imputer' se deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 1,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import pandas as pd\n",
44 | "import numpy as np\n",
45 | "\n",
46 | "import matplotlib.pyplot as plt\n",
47 | "\n",
48 | "from sklearn.model_selection import train_test_split\n",
49 | "from sklearn.pipeline import Pipeline\n",
50 | "\n",
51 | "# feature engine\n",
52 | "from feature_engine import imputation as mdi"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "data": {
62 | "text/html": [
63 | "\n",
64 | "\n",
77 | "
\n",
78 | " \n",
79 | " \n",
80 | " \n",
81 | " LotFrontage \n",
82 | " MasVnrArea \n",
83 | " BsmtQual \n",
84 | " FireplaceQu \n",
85 | " GarageYrBlt \n",
86 | " SalePrice \n",
87 | " \n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " 0 \n",
92 | " 65.0 \n",
93 | " 196.0 \n",
94 | " Gd \n",
95 | " NaN \n",
96 | " 2003.0 \n",
97 | " 208500 \n",
98 | " \n",
99 | " \n",
100 | " 1 \n",
101 | " 80.0 \n",
102 | " 0.0 \n",
103 | " Gd \n",
104 | " TA \n",
105 | " 1976.0 \n",
106 | " 181500 \n",
107 | " \n",
108 | " \n",
109 | " 2 \n",
110 | " 68.0 \n",
111 | " 162.0 \n",
112 | " Gd \n",
113 | " TA \n",
114 | " 2001.0 \n",
115 | " 223500 \n",
116 | " \n",
117 | " \n",
118 | " 3 \n",
119 | " 60.0 \n",
120 | " 0.0 \n",
121 | " TA \n",
122 | " Gd \n",
123 | " 1998.0 \n",
124 | " 140000 \n",
125 | " \n",
126 | " \n",
127 | " 4 \n",
128 | " 84.0 \n",
129 | " 350.0 \n",
130 | " Gd \n",
131 | " TA \n",
132 | " 2000.0 \n",
133 | " 250000 \n",
134 | " \n",
135 | " \n",
136 | "
\n",
137 | "
"
138 | ],
139 | "text/plain": [
140 | " LotFrontage MasVnrArea BsmtQual FireplaceQu GarageYrBlt SalePrice\n",
141 | "0 65.0 196.0 Gd NaN 2003.0 208500\n",
142 | "1 80.0 0.0 Gd TA 1976.0 181500\n",
143 | "2 68.0 162.0 Gd TA 2001.0 223500\n",
144 | "3 60.0 0.0 TA Gd 1998.0 140000\n",
145 | "4 84.0 350.0 Gd TA 2000.0 250000"
146 | ]
147 | },
148 | "execution_count": 2,
149 | "metadata": {},
150 | "output_type": "execute_result"
151 | }
152 | ],
153 | "source": [
154 | "# carguemos los datos con un grupo de variables seleccionadas\n",
155 | "\n",
156 | "cols_to_use = [\n",
157 | " 'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
158 | " 'SalePrice'\n",
159 | "]\n",
160 | "\n",
161 | "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
162 | "data.head()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 3,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "data": {
172 | "text/plain": [
173 | "LotFrontage 0.177397\n",
174 | "MasVnrArea 0.005479\n",
175 | "BsmtQual 0.025342\n",
176 | "FireplaceQu 0.472603\n",
177 | "GarageYrBlt 0.055479\n",
178 | "SalePrice 0.000000\n",
179 | "dtype: float64"
180 | ]
181 | },
182 | "execution_count": 3,
183 | "metadata": {},
184 | "output_type": "execute_result"
185 | }
186 | ],
187 | "source": [
188 | "data.isnull().mean()"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 4,
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/plain": [
199 | "((1022, 5), (438, 5))"
200 | ]
201 | },
202 | "execution_count": 4,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "# separar datos en segmentos entrenamiento y prueba\n",
209 | "\n",
210 | "# primero, separemos el target (SalePrice) del resto de las variables\n",
211 | "\n",
212 | "cols_to_use.remove('SalePrice')\n",
213 | "\n",
214 | "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
215 | " data['SalePrice'],\n",
216 | " test_size=0.3,\n",
217 | " random_state=0)\n",
218 | "X_train.shape, X_test.shape"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "## Feature-engine captura las variables categóricas automáticamente"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 5,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "# llamemos el imputer de feature-engine\n",
235 | "# no necesitamos especificar nada\n",
236 | "\n",
237 | "imputer = mdi.CategoricalImputer()"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 6,
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "data": {
247 | "text/plain": [
248 | "CategoricalImputer(variables=['BsmtQual', 'FireplaceQu'])"
249 | ]
250 | },
251 | "execution_count": 6,
252 | "metadata": {},
253 | "output_type": "execute_result"
254 | }
255 | ],
256 | "source": [
257 | "# ajustamos el imputer\n",
258 | "\n",
259 | "imputer.fit(X_train)"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 7,
265 | "metadata": {},
266 | "outputs": [
267 | {
268 | "data": {
269 | "text/plain": [
270 | "['BsmtQual', 'FireplaceQu']"
271 | ]
272 | },
273 | "execution_count": 7,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "# vemos que el imputer encontró las variables categóricas \n",
280 | "# automáticamente\n",
281 | "\n",
282 | "imputer.variables"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 | "**Este imputer reemplaza la categoría ausente con una etiqueta adicional \"Missing\"**"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 8,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "data": {
299 | "text/html": [
300 | "\n",
301 | "\n",
314 | "
\n",
315 | " \n",
316 | " \n",
317 | " \n",
318 | " BsmtQual \n",
319 | " FireplaceQu \n",
320 | " LotFrontage \n",
321 | " MasVnrArea \n",
322 | " GarageYrBlt \n",
323 | " \n",
324 | " \n",
325 | " \n",
326 | " \n",
327 | " 64 \n",
328 | " Gd \n",
329 | " Missing \n",
330 | " NaN \n",
331 | " 573.0 \n",
332 | " 1998.0 \n",
333 | " \n",
334 | " \n",
335 | " 682 \n",
336 | " Gd \n",
337 | " Gd \n",
338 | " NaN \n",
339 | " 0.0 \n",
340 | " 1996.0 \n",
341 | " \n",
342 | " \n",
343 | " 960 \n",
344 | " TA \n",
345 | " Missing \n",
346 | " 50.0 \n",
347 | " 0.0 \n",
348 | " NaN \n",
349 | " \n",
350 | " \n",
351 | " 1384 \n",
352 | " TA \n",
353 | " Missing \n",
354 | " 60.0 \n",
355 | " 0.0 \n",
356 | " 1939.0 \n",
357 | " \n",
358 | " \n",
359 | " 1100 \n",
360 | " TA \n",
361 | " Missing \n",
362 | " 60.0 \n",
363 | " 0.0 \n",
364 | " 1930.0 \n",
365 | " \n",
366 | " \n",
367 | "
\n",
368 | "
"
369 | ],
370 | "text/plain": [
371 | " BsmtQual FireplaceQu LotFrontage MasVnrArea GarageYrBlt\n",
372 | "64 Gd Missing NaN 573.0 1998.0\n",
373 | "682 Gd Gd NaN 0.0 1996.0\n",
374 | "960 TA Missing 50.0 0.0 NaN\n",
375 | "1384 TA Missing 60.0 0.0 1939.0\n",
376 | "1100 TA Missing 60.0 0.0 1930.0"
377 | ]
378 | },
379 | "execution_count": 8,
380 | "metadata": {},
381 | "output_type": "execute_result"
382 | }
383 | ],
384 | "source": [
385 | "# feature engine retorna un dataframe\n",
386 | "\n",
387 | "tmp = imputer.transform(X_train)\n",
388 | "tmp.head()"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 9,
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "BsmtQual 0.0\n",
400 | "FireplaceQu 0.0\n",
401 | "dtype: float64"
402 | ]
403 | },
404 | "execution_count": 9,
405 | "metadata": {},
406 | "output_type": "execute_result"
407 | }
408 | ],
409 | "source": [
410 | "# revisemos que los valores nulos ya no existen\n",
411 | "\n",
412 | "tmp[imputer.variables].isnull().mean()"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "## Feature-engine te permite especificar grupos de variables fácilmente"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 10,
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "data": {
429 | "text/plain": [
430 | "CategoricalImputer(variables=['BsmtQual'])"
431 | ]
432 | },
433 | "execution_count": 10,
434 | "metadata": {},
435 | "output_type": "execute_result"
436 | }
437 | ],
438 | "source": [
439 | "# usemos la sustitución pero esta vez solo \n",
440 | "# imputemos una variable\n",
441 | "\n",
442 | "imputer = mdi.CategoricalImputer(variables=['BsmtQual'])\n",
443 | "\n",
444 | "imputer.fit(X_train)"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 11,
450 | "metadata": {},
451 | "outputs": [
452 | {
453 | "data": {
454 | "text/plain": [
455 | "['BsmtQual']"
456 | ]
457 | },
458 | "execution_count": 11,
459 | "metadata": {},
460 | "output_type": "execute_result"
461 | }
462 | ],
463 | "source": [
464 | "# ahora el imputer solo tiene la variable indicada\n",
465 | "\n",
466 | "imputer.variables"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 12,
472 | "metadata": {},
473 | "outputs": [
474 | {
475 | "data": {
476 | "text/plain": [
477 | "BsmtQual 0.0\n",
478 | "dtype: float64"
479 | ]
480 | },
481 | "execution_count": 12,
482 | "metadata": {},
483 | "output_type": "execute_result"
484 | }
485 | ],
486 | "source": [
487 | "# feature-engine devuelve un dataframe\n",
488 | "# al imputar la variable:\n",
489 | "\n",
490 | "tmp = imputer.transform(X_train)\n",
491 | "\n",
492 | "\n",
493 | "# revisemos que la variable indicada ya no tiene valores nulos\n",
494 | "tmp[imputer.variables].isnull().mean()"
495 | ]
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {
500 | "scrolled": true
501 | },
502 | "source": [
503 | "## Feature-engine puede ser usado con los flujos de Scikit-learn (pipeline)"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": 13,
509 | "metadata": {},
510 | "outputs": [
511 | {
512 | "data": {
513 | "text/plain": [
514 | "BsmtQual 0.023483\n",
515 | "FireplaceQu 0.467710\n",
516 | "LotFrontage 0.184932\n",
517 | "MasVnrArea 0.004892\n",
518 | "GarageYrBlt 0.052838\n",
519 | "dtype: float64"
520 | ]
521 | },
522 | "execution_count": 13,
523 | "metadata": {},
524 | "output_type": "execute_result"
525 | }
526 | ],
527 | "source": [
528 | "# revisemos los valores nulos\n",
529 | "\n",
530 | "X_train.isnull().mean()"
531 | ]
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "metadata": {},
536 | "source": [
537 | "Vamos a realizar las siguientes imputaciones\n",
538 | "\n",
539 | "- BsmtQual ==> categoría frecuente\n",
540 | "- FireplaceQu ==> etiqueta missing"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": 14,
546 | "metadata": {},
547 | "outputs": [],
548 | "source": [
549 | "pipe = Pipeline([\n",
550 | " ('imputer_mode', mdi.CategoricalImputer(\n",
551 | " imputation_method='frequent', variables=['BsmtQual'])),\n",
552 | " \n",
553 | " ('imputer_missing', mdi.CategoricalImputer(\n",
554 | " variables=['FireplaceQu'])),\n",
555 | "])"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 15,
561 | "metadata": {},
562 | "outputs": [
563 | {
564 | "data": {
565 | "text/plain": [
566 | "Pipeline(steps=[('imputer_mode',\n",
567 | " CategoricalImputer(imputation_method='frequent',\n",
568 | " variables=['BsmtQual'])),\n",
569 | " ('imputer_missing',\n",
570 | " CategoricalImputer(variables=['FireplaceQu']))])"
571 | ]
572 | },
573 | "execution_count": 15,
574 | "metadata": {},
575 | "output_type": "execute_result"
576 | }
577 | ],
578 | "source": [
579 | "pipe.fit(X_train)"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 16,
585 | "metadata": {},
586 | "outputs": [
587 | {
588 | "data": {
589 | "text/plain": [
590 | "['BsmtQual']"
591 | ]
592 | },
593 | "execution_count": 16,
594 | "metadata": {},
595 | "output_type": "execute_result"
596 | }
597 | ],
598 | "source": [
599 | "pipe.named_steps['imputer_mode'].variables"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 17,
605 | "metadata": {
606 | "scrolled": true
607 | },
608 | "outputs": [
609 | {
610 | "data": {
611 | "text/plain": [
612 | "['FireplaceQu']"
613 | ]
614 | },
615 | "execution_count": 17,
616 | "metadata": {},
617 | "output_type": "execute_result"
618 | }
619 | ],
620 | "source": [
621 | "pipe.named_steps['imputer_missing'].variables"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": 18,
627 | "metadata": {},
628 | "outputs": [
629 | {
630 | "data": {
631 | "text/plain": [
632 | "BsmtQual 0.000000\n",
633 | "FireplaceQu 0.000000\n",
634 | "LotFrontage 0.184932\n",
635 | "MasVnrArea 0.004892\n",
636 | "GarageYrBlt 0.052838\n",
637 | "dtype: float64"
638 | ]
639 | },
640 | "execution_count": 18,
641 | "metadata": {},
642 | "output_type": "execute_result"
643 | }
644 | ],
645 | "source": [
646 | "# transformemos los datos con la pipeline\n",
647 | "tmp = pipe.transform(X_train)\n",
648 | "\n",
649 | "# revisemos que ya no tenemos valores nulos\n",
650 | "tmp.isnull().mean()"
651 | ]
652 | }
653 | ],
654 | "metadata": {
655 | "kernelspec": {
656 | "display_name": "feml",
657 | "language": "python",
658 | "name": "feml"
659 | },
660 | "language_info": {
661 | "codemirror_mode": {
662 | "name": "ipython",
663 | "version": 3
664 | },
665 | "file_extension": ".py",
666 | "mimetype": "text/x-python",
667 | "name": "python",
668 | "nbconvert_exporter": "python",
669 | "pygments_lexer": "ipython3",
670 | "version": "3.8.2"
671 | },
672 | "toc": {
673 | "base_numbering": 1,
674 | "nav_menu": {},
675 | "number_sections": true,
676 | "sideBar": true,
677 | "skip_h1_title": false,
678 | "title_cell": "Table of Contents",
679 | "title_sidebar": "Contents",
680 | "toc_cell": false,
681 | "toc_position": {},
682 | "toc_section_display": "block",
683 | "toc_window_display": true
684 | }
685 | },
686 | "nbformat": 4,
687 | "nbformat_minor": 2
688 | }
689 |
--------------------------------------------------------------------------------
/Section-09-Ingenieria-valores-extremos/09.05-Truncamiento-valores-arbitrarios.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Tratamiento de valores atípicos\n",
8 | "\n",
9 | "Un valor atípico o valor extremo (outlier) es un valor el cual es significativamente diferente del resto de los datos. “Un outlier es una observación la cual se desvía tanto del resto de las observaciones que levanta sospechas sobre el mecanismo que lo generó” [D. Hawkins. Identification of Outliers, Chapman and Hall, 1980].\n",
10 | "\n",
11 | "Valores estadísticos como la media y la varianza son susceptibles a los valores extremos. Además, **algunos modelos de Machine Learning son susceptibles a los outliers**, lo cual reduce su desempeño. Por lo tanto, dependiendo de cuál algoritmo deseas usar para entrenar un modelo, es muy común que sea necesario remover los valores atípicos de las variables.\n",
12 | "\n",
13 | "Discutimos en la sección 3, cómo identificar los outliers. En esta sección vamos a discutir cómo podemos procesarlos para entrenar nuestros modelos de machine learning o aprendizaje automático. \n",
14 | "\n",
15 | "Es importante resaltar que con cada modificación que hacemos en nuestros datos, introducimos algún sesgo. Por eso es muy importante saber cuáles son las implicaciones de cada método. Si es una buena decisión o no dependerá de la naturaleza de los datos que estemos analizando.\n",
16 | "\n",
17 | "\n",
18 | "## Cómo podemos pre-procesar los valores extremos?\n",
19 | "\n",
20 | "- Removerlos: eliminar los valores extremos de nuestro conjunto de datos\n",
21 | "- Tratar los outliers como datos faltantes y proceder con cualquiera de las técnicas de sustitución\n",
22 | "- Discretización: los datos son discretizados (ver sección 8) y los valores atípicos son colocados en los segmentos extremos junto con los valores más bajos y altos del conjunto de datos\n",
23 | "- Truncamiento de valores: Limitar la distribución de la variable a unos valores máximos y mínimos. También se le conoce como codificación Top / Bottom \n",
24 | "\n",
25 | "\n",
26 | "**El truncamiento de valores** se conoce en inglés como capping, trimming, censoring o winsorization.\n",
27 | "\n",
28 | "\n",
29 | "## Truncamiento de outliers.\n",
30 | "\n",
31 | "**Truncar**, significa limitar los valores máximos y/o mínimos de una distribución a un valor arbitrario. En otras palabras, los valores más grandes o más pequeños que los que arbitrariamente se han determinado, son truncados.\n",
32 | "\n",
33 | "Truncar puede hacerse en ambos extremos de la distribución, o solo en un extremo, dependiendo de la variable y el caso de uso.\n",
34 | "\n",
35 | "Puedes ver la charla de Soledad en pydata Londres [pydata](https://www.youtube.com/watch?v=KHGGlozsRtA), donde ella presenta un ejemplo de truncamiento de los valores extremos en una compañía financiera.\n",
36 | "\n",
37 | "Los números en los cuales se debe truncar la distribución pueden ser determinados: \n",
38 | "\n",
39 | "- arbitrariamente\n",
40 | "- usando la regla de proximidad del rango inter-cuartil \n",
41 | "- usando la aproximación gaussiana \n",
42 | "- usando los cuartiles\n",
43 | "\n",
44 | "### Ventajas\n",
45 | "\n",
46 | "- no remueve las observaciones\n",
47 | "\n",
48 | "### Limitaciones\n",
49 | "\n",
50 | "- distorsiona la distribución de las variables \n",
51 | "- distorsiona la relación entre las variables\n",
52 | "\n",
53 | "## En este Demo\n",
54 | "\n",
55 | "Vas a aprender cómo truncar los valores extremos en las variables utilizando valores arbitrarios y el conjunto de datos del Titanic.\n",
56 | "\n",
57 | "## Importante\n",
58 | "\n",
59 | "Cuando truncamos nuestros datos, tendemos a limitar los valores en el set de entrenamiento y en el set de prueba. Es importante recordar que los valores de truncamiento DEBEN SER derivados del set de entrenamiento. Y luego esos mismos valores se usan para truncar las variables en el set de prueba.\n",
60 | "\n",
61 | "Para simplificar el demo, no lo haremos, pero por favor, ten eso en cuenta cuando construyas tus pipelines de machine learning."
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 1,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "import pandas as pd\n",
71 | "import numpy as np\n",
72 | "\n",
73 | "import matplotlib.pyplot as plt\n",
74 | "\n",
75 | "from feature_engine import imputation as msi\n",
76 | "from feature_engine import outliers as outr"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 2,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "# función para cargar los datos del titanic \n",
86 | "\n",
87 | "def load_titanic():\n",
88 | " data = pd.read_csv('../titanic.csv')\n",
89 | " data['cabin'] = data['cabin'].astype(str).str[0]\n",
90 | " data['pclass'] = data['pclass'].astype('O')\n",
91 | " data['embarked'].fillna('C', inplace=True)\n",
92 | " return data"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 3,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/html": [
103 | "\n",
104 | "\n",
117 | "
\n",
118 | " \n",
119 | " \n",
120 | " \n",
121 | " pclass \n",
122 | " survived \n",
123 | " name \n",
124 | " sex \n",
125 | " age \n",
126 | " sibsp \n",
127 | " parch \n",
128 | " ticket \n",
129 | " fare \n",
130 | " cabin \n",
131 | " embarked \n",
132 | " boat \n",
133 | " body \n",
134 | " home.dest \n",
135 | " \n",
136 | " \n",
137 | " \n",
138 | " \n",
139 | " 0 \n",
140 | " 1 \n",
141 | " 1 \n",
142 | " Allen, Miss. Elisabeth Walton \n",
143 | " female \n",
144 | " 29.0000 \n",
145 | " 0 \n",
146 | " 0 \n",
147 | " 24160 \n",
148 | " 211.3375 \n",
149 | " B \n",
150 | " S \n",
151 | " 2 \n",
152 | " NaN \n",
153 | " St Louis, MO \n",
154 | " \n",
155 | " \n",
156 | " 1 \n",
157 | " 1 \n",
158 | " 1 \n",
159 | " Allison, Master. Hudson Trevor \n",
160 | " male \n",
161 | " 0.9167 \n",
162 | " 1 \n",
163 | " 2 \n",
164 | " 113781 \n",
165 | " 151.5500 \n",
166 | " C \n",
167 | " S \n",
168 | " 11 \n",
169 | " NaN \n",
170 | " Montreal, PQ / Chesterville, ON \n",
171 | " \n",
172 | " \n",
173 | " 2 \n",
174 | " 1 \n",
175 | " 0 \n",
176 | " Allison, Miss. Helen Loraine \n",
177 | " female \n",
178 | " 2.0000 \n",
179 | " 1 \n",
180 | " 2 \n",
181 | " 113781 \n",
182 | " 151.5500 \n",
183 | " C \n",
184 | " S \n",
185 | " NaN \n",
186 | " NaN \n",
187 | " Montreal, PQ / Chesterville, ON \n",
188 | " \n",
189 | " \n",
190 | " 3 \n",
191 | " 1 \n",
192 | " 0 \n",
193 | " Allison, Mr. Hudson Joshua Creighton \n",
194 | " male \n",
195 | " 30.0000 \n",
196 | " 1 \n",
197 | " 2 \n",
198 | " 113781 \n",
199 | " 151.5500 \n",
200 | " C \n",
201 | " S \n",
202 | " NaN \n",
203 | " 135.0 \n",
204 | " Montreal, PQ / Chesterville, ON \n",
205 | " \n",
206 | " \n",
207 | " 4 \n",
208 | " 1 \n",
209 | " 0 \n",
210 | " Allison, Mrs. Hudson J C (Bessie Waldo Daniels) \n",
211 | " female \n",
212 | " 25.0000 \n",
213 | " 1 \n",
214 | " 2 \n",
215 | " 113781 \n",
216 | " 151.5500 \n",
217 | " C \n",
218 | " S \n",
219 | " NaN \n",
220 | " NaN \n",
221 | " Montreal, PQ / Chesterville, ON \n",
222 | " \n",
223 | " \n",
224 | "
\n",
225 | "
"
226 | ],
227 | "text/plain": [
228 | " pclass survived name sex \\\n",
229 | "0 1 1 Allen, Miss. Elisabeth Walton female \n",
230 | "1 1 1 Allison, Master. Hudson Trevor male \n",
231 | "2 1 0 Allison, Miss. Helen Loraine female \n",
232 | "3 1 0 Allison, Mr. Hudson Joshua Creighton male \n",
233 | "4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n",
234 | "\n",
235 | " age sibsp parch ticket fare cabin embarked boat body \\\n",
236 | "0 29.0000 0 0 24160 211.3375 B S 2 NaN \n",
237 | "1 0.9167 1 2 113781 151.5500 C S 11 NaN \n",
238 | "2 2.0000 1 2 113781 151.5500 C S NaN NaN \n",
239 | "3 30.0000 1 2 113781 151.5500 C S NaN 135.0 \n",
240 | "4 25.0000 1 2 113781 151.5500 C S NaN NaN \n",
241 | "\n",
242 | " home.dest \n",
243 | "0 St Louis, MO \n",
244 | "1 Montreal, PQ / Chesterville, ON \n",
245 | "2 Montreal, PQ / Chesterville, ON \n",
246 | "3 Montreal, PQ / Chesterville, ON \n",
247 | "4 Montreal, PQ / Chesterville, ON "
248 | ]
249 | },
250 | "execution_count": 3,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "data = load_titanic()\n",
257 | "data.head()"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "## Truncador de outliers con valores arbitrarios con Feature-engine\n",
265 | "\n",
266 | "Los límites para truncar los valores extremos son determinados por el usuario. "
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "### Truncando el extremo superior"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 4,
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "data": {
283 | "text/plain": [
284 | "(80.0, 512.3292)"
285 | ]
286 | },
287 | "execution_count": 4,
288 | "metadata": {},
289 | "output_type": "execute_result"
290 | }
291 | ],
292 | "source": [
293 | "# encontremos cuál es el valor máximo de las variables Age y \n",
294 | "# Fare en los datos del titanic\n",
295 | "\n",
296 | "data.age.max(), data.fare.max()"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 5,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "data": {
306 | "text/plain": [
307 | "ArbitraryOutlierCapper(max_capping_dict={'age': 50, 'fare': 200},\n",
308 | " missing_values='ignore')"
309 | ]
310 | },
311 | "execution_count": 5,
312 | "metadata": {},
313 | "output_type": "execute_result"
314 | }
315 | ],
316 | "source": [
317 | "# inicialicemos el ArbitraryOutlierCapper de feature-engine\n",
318 | "capper = outr.ArbitraryOutlierCapper(\n",
319 | " max_capping_dict = {'age':50, 'fare':200},\n",
320 | " min_capping_dict = None,\n",
321 | " missing_values='ignore')\n",
322 | "\n",
323 | "capper.fit(data)"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 6,
329 | "metadata": {},
330 | "outputs": [
331 | {
332 | "data": {
333 | "text/plain": [
334 | "{'age': 50, 'fare': 200}"
335 | ]
336 | },
337 | "execution_count": 6,
338 | "metadata": {},
339 | "output_type": "execute_result"
340 | }
341 | ],
342 | "source": [
343 | "capper.right_tail_caps_"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 7,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "data": {
353 | "text/plain": [
354 | "{}"
355 | ]
356 | },
357 | "execution_count": 7,
358 | "metadata": {},
359 | "output_type": "execute_result"
360 | }
361 | ],
362 | "source": [
363 | "capper.left_tail_caps_"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 8,
369 | "metadata": {},
370 | "outputs": [
371 | {
372 | "data": {
373 | "text/plain": [
374 | "(50.0, 200.0)"
375 | ]
376 | },
377 | "execution_count": 8,
378 | "metadata": {},
379 | "output_type": "execute_result"
380 | }
381 | ],
382 | "source": [
383 | "temp = capper.transform(data)\n",
384 | "\n",
385 | "temp.age.max(), temp.fare.max()"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "### Truncando el extremo inferior"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 9,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "data": {
402 | "text/plain": [
403 | "ArbitraryOutlierCapper(min_capping_dict={'age': 10, 'fare': 100},\n",
404 | " missing_values='ignore')"
405 | ]
406 | },
407 | "execution_count": 9,
408 | "metadata": {},
409 | "output_type": "execute_result"
410 | }
411 | ],
412 | "source": [
413 | "capper = outr.ArbitraryOutlierCapper(\n",
414 | " max_capping_dict=None,\n",
415 | " min_capping_dict={'age': 10,'fare': 100},\n",
416 | " missing_values='ignore')\n",
417 | "\n",
418 | "capper.fit(data)"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 10,
424 | "metadata": {},
425 | "outputs": [
426 | {
427 | "data": {
428 | "text/plain": [
429 | "['age', 'fare']"
430 | ]
431 | },
432 | "execution_count": 10,
433 | "metadata": {},
434 | "output_type": "execute_result"
435 | }
436 | ],
437 | "source": [
438 | "capper.variables"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": 11,
444 | "metadata": {},
445 | "outputs": [
446 | {
447 | "data": {
448 | "text/plain": [
449 | "{}"
450 | ]
451 | },
452 | "execution_count": 11,
453 | "metadata": {},
454 | "output_type": "execute_result"
455 | }
456 | ],
457 | "source": [
458 | "capper.right_tail_caps_"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": 12,
464 | "metadata": {},
465 | "outputs": [
466 | {
467 | "data": {
468 | "text/plain": [
469 | "{'age': 10, 'fare': 100}"
470 | ]
471 | },
472 | "execution_count": 12,
473 | "metadata": {},
474 | "output_type": "execute_result"
475 | }
476 | ],
477 | "source": [
478 | "capper.left_tail_caps_"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 13,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "data": {
488 | "text/plain": [
489 | "(10.0, 100.0)"
490 | ]
491 | },
492 | "execution_count": 13,
493 | "metadata": {},
494 | "output_type": "execute_result"
495 | }
496 | ],
497 | "source": [
498 | "temp = capper.transform(data)\n",
499 | "\n",
500 | "temp.age.min(), temp.fare.min()"
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "### Truncando ambos extremos "
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 14,
513 | "metadata": {},
514 | "outputs": [
515 | {
516 | "data": {
517 | "text/plain": [
518 | "ArbitraryOutlierCapper(max_capping_dict={'age': 50, 'fare': 200},\n",
519 | " min_capping_dict={'age': 10, 'fare': 100},\n",
520 | " missing_values='ignore')"
521 | ]
522 | },
523 | "execution_count": 14,
524 | "metadata": {},
525 | "output_type": "execute_result"
526 | }
527 | ],
528 | "source": [
529 | "capper = outr.ArbitraryOutlierCapper(max_capping_dict={\n",
530 | " 'age': 50, 'fare': 200},\n",
531 | " min_capping_dict={\n",
532 | " 'age': 10, 'fare': 100},\n",
533 | " missing_values='ignore')\n",
534 | "capper.fit(data)"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 15,
540 | "metadata": {},
541 | "outputs": [
542 | {
543 | "data": {
544 | "text/plain": [
545 | "{'age': 50, 'fare': 200}"
546 | ]
547 | },
548 | "execution_count": 15,
549 | "metadata": {},
550 | "output_type": "execute_result"
551 | }
552 | ],
553 | "source": [
554 | "capper.right_tail_caps_"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 16,
560 | "metadata": {},
561 | "outputs": [
562 | {
563 | "data": {
564 | "text/plain": [
565 | "{'age': 10, 'fare': 100}"
566 | ]
567 | },
568 | "execution_count": 16,
569 | "metadata": {},
570 | "output_type": "execute_result"
571 | }
572 | ],
573 | "source": [
574 | "capper.left_tail_caps_"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 17,
580 | "metadata": {},
581 | "outputs": [
582 | {
583 | "data": {
584 | "text/plain": [
585 | "(10.0, 100.0)"
586 | ]
587 | },
588 | "execution_count": 17,
589 | "metadata": {},
590 | "output_type": "execute_result"
591 | }
592 | ],
593 | "source": [
594 | "temp = capper.transform(data)\n",
595 | "\n",
596 | "temp.age.min(), temp.fare.min()"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 18,
602 | "metadata": {},
603 | "outputs": [
604 | {
605 | "data": {
606 | "text/plain": [
607 | "(50.0, 200.0)"
608 | ]
609 | },
610 | "execution_count": 18,
611 | "metadata": {},
612 | "output_type": "execute_result"
613 | }
614 | ],
615 | "source": [
616 | "temp.age.max(), temp.fare.max()"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": null,
622 | "metadata": {},
623 | "outputs": [],
624 | "source": []
625 | }
626 | ],
627 | "metadata": {
628 | "kernelspec": {
629 | "display_name": "feml",
630 | "language": "python",
631 | "name": "feml"
632 | },
633 | "language_info": {
634 | "codemirror_mode": {
635 | "name": "ipython",
636 | "version": 3
637 | },
638 | "file_extension": ".py",
639 | "mimetype": "text/x-python",
640 | "name": "python",
641 | "nbconvert_exporter": "python",
642 | "pygments_lexer": "ipython3",
643 | "version": "3.8.2"
644 | },
645 | "toc": {
646 | "base_numbering": 1,
647 | "nav_menu": {},
648 | "number_sections": true,
649 | "sideBar": true,
650 | "skip_h1_title": false,
651 | "title_cell": "Table of Contents",
652 | "title_sidebar": "Contents",
653 | "toc_cell": false,
654 | "toc_position": {
655 | "height": "803px",
656 | "left": "0px",
657 | "right": "1681px",
658 | "top": "107px",
659 | "width": "239px"
660 | },
661 | "toc_section_display": "block",
662 | "toc_window_display": true
663 | }
664 | },
665 | "nbformat": 4,
666 | "nbformat_minor": 2
667 | }
668 |
--------------------------------------------------------------------------------
/Seccion-02-Tipos-de-Variables/02.4_VariablesMixtas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Variables Mixtas\n",
8 | "\n",
9 | "Las variables mixtas son aquellas que contienen valores numéricos y categorías.\n",
10 | "\n",
11 | "Las variables pueden ser mixtas por una multitud de razones. Por ejemplo, cuando entidades financieras recogen y guardan información acerca de usuarios, generalmente utilizan números. Sin embargo, a veces sucede que dichas entidades no pueden recuperar la información de un cliente. En estos casos, las agencias codifican cada razón con un código diferente. Esto genera estas variables combinadas que contienen códigos alfanuméricos cuando el valor no pudo ser recuperado y números para los valores usuales.\n",
12 | "\n",
13 | "Un ejemplo es el número de cuentas abiertas 'number_of_open_accounts', que en principio puede ser cualquier número que represente el número de productos financieros que tenga el prestamista. Cuando la información no está disponible, cada razón se codifica con una letra diferente, por ejemplo: 'A': no se pudo identificar a la persona, 'B': datos no relevantes, 'C': la persona parece no tener ninguna cuenta abierta.\n",
14 | "\n",
15 | "Otro ejemplo de variable mixta es la variable missed_payment_status, que indica si el prestamista está retrasado con uno o varios pagos. Por ejemplo, si el prestamista tiene una tarjeta de crédito, esta variable indicaría si está retrasado con sus pagos mensuales. Los valores 0, 1, 2, 3 significan que el cliente no ha pagado 0-3 cuotas del préstamo. El valor 'D' se usa cuando el cliente incumple con el número máximo de pagos retrasados permitidos (D por la palabra en inglés 'defaulted'), que típicamente son tres pagos.\n",
16 | "\n",
17 | "Para este demo, necesitan descargar el archivo csv llamado sample_s2.csv que se encuentra en el folder con los Jupyter Notebooks de la clase **\"Jupyter Notebooks\"** en la **Sección 1**. También se encuentra disponible para descargar junto al video correspondiente a este Notebook.\n"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import pandas as pd\n",
27 | "\n",
28 | "import matplotlib.pyplot as plt"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "data": {
38 | "text/html": [
39 | "\n",
40 | "\n",
53 | "
\n",
54 | " \n",
55 | " \n",
56 | " \n",
57 | " id \n",
58 | " open_il_24m \n",
59 | " \n",
60 | " \n",
61 | " \n",
62 | " \n",
63 | " 0 \n",
64 | " 1077501 \n",
65 | " C \n",
66 | " \n",
67 | " \n",
68 | " 1 \n",
69 | " 1077430 \n",
70 | " A \n",
71 | " \n",
72 | " \n",
73 | " 2 \n",
74 | " 1077175 \n",
75 | " A \n",
76 | " \n",
77 | " \n",
78 | " 3 \n",
79 | " 1076863 \n",
80 | " A \n",
81 | " \n",
82 | " \n",
83 | " 4 \n",
84 | " 1075358 \n",
85 | " A \n",
86 | " \n",
87 | " \n",
88 | "
\n",
89 | "
"
90 | ],
91 | "text/plain": [
92 | " id open_il_24m\n",
93 | "0 1077501 C\n",
94 | "1 1077430 A\n",
95 | "2 1077175 A\n",
96 | "3 1076863 A\n",
97 | "4 1075358 A"
98 | ]
99 | },
100 | "execution_count": 2,
101 | "metadata": {},
102 | "output_type": "execute_result"
103 | }
104 | ],
105 | "source": [
106 | "# open_il_24m indica:\n",
107 | "# \"Número de líneas de crédito a término fijo abiertas en los últimos 24 meses\".\n",
108 | "# Estas líneas de crédito a término fijo tienen un número de cuotas fijas por un valor \n",
109 | "# predeterminado entre el prestamista y la entidad financiera.\n",
110 | "# Ejemplos son los préstamos para vehículos o préstamos para estudiantes.\n",
111 | "\n",
112 | "data = pd.read_csv('../sample_s2.csv')\n",
113 | "\n",
114 | "data.head()"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 3,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "(887379, 2)"
126 | ]
127 | },
128 | "execution_count": 3,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "data.shape"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 4,
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "data": {
144 | "text/plain": [
145 | "array(['C', 'A', 'B', '0.0', '1.0', '2.0', '4.0', '3.0', '6.0', '5.0',\n",
146 | " '9.0', '7.0', '8.0', '13.0', '10.0', '19.0', '11.0', '12.0',\n",
147 | " '14.0', '15.0'], dtype=object)"
148 | ]
149 | },
150 | "execution_count": 4,
151 | "metadata": {},
152 | "output_type": "execute_result"
153 | }
154 | ],
155 | "source": [
156 | "# Adicionalmente, la variable toma los siguientes códigos:\n",
157 | "# 'A': no se pudo identificar a la persona \n",
158 | "# 'B': no hay datos relevantes\n",
159 | "# 'C': la persona parece no tener ninguna cuenta abierta\n",
160 | " \n",
161 | "data.open_il_24m.unique()"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 5,
167 | "metadata": {},
168 | "outputs": [
169 | {
170 | "data": {
171 | "text/plain": [
172 | "Text(0, 0.5, 'Número de prestamistas')"
173 | ]
174 | },
175 | "execution_count": 5,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | },
179 | {
180 | "data": {
181 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZgAAAEVCAYAAADdFfNTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de7xVdZ3/8dcb8IIXEBDxgnq80DRqasmgTc0vLw1SXmuswXIkI5nMRmeaS1j9htI0mxm18Vc4Ud7NC1km6Zgips00XsD7BR1QUQhSFFQyNcHP74/vd+tis8/e68BZh30O7+fjsR577e9a38/+7M1mf85a33VRRGBmZtbd+q3vBMzMrG9ygTEzs0q4wJiZWSVcYMzMrBIuMGZmVgkXGDMzq4QLjK0zSVtKmidpn/WdSxmSOiSFpAHrO5e+QlI/STMkndjNcR+VdGB3xsxxR0j6laQVks6R9BVJP8zLdpL0O0n9u/t1NzT+D2YNSVoADAR2jYhXc9vngOMi4sC61c8G/i0iHuzRJDcQkj4DfC4iPrie8/g6sHtEHNdg8ZnArIj4QXe+ZkTs2Z3xCiYBLwCDou5kwIh4FtiiotfdoHgLxpoZAJzabAVJA4FHIuL7PZMSeMuj/UTEaRHx783WabN/t52Bx+qLi3WziPDkaY0JWABMBpYBW+W2zwG35/kOIIABhT63k/7SBvgM8GvgPOAl4CngT3P7QuB5YEKh7ybAvwHPAs8B/wEMzMsOBBYBXwZ+C1ye1/8OsDhP3wE26eS99M+xX8h5nFzMHRgMXAgsAX4DfBPo3yTWV4AngRXAvcCOrT6P/PyzwFxgOXAzsHNhWQCfB+bl5d8DBPwx8DqwCvgd8FJe/zDgfuCV/Hl+vRBrU+AK4MX82c8GRnTyfiYX3stjwMc6WW8c8AfgzZzHg60+u7rvwLK87BJgKnBTjvNrYNv877cceBx4b9338MN5/uvAdOCynO+jwOjCun+cP/OX8rIjO3kvl+T38Yecw4dz7CsafbeB7YEZ+T3MB05c3/8/e8vkLRhrZg7pP+w/rGX//YGHgGHAlcDVwJ8AuwPHAd+VVNsV8W3gXcC+efkOwD8XYm0LDCX95TkJ+CpwQF5/H2AM8LVO8jgROBx4LzAaOKZu+aXAyvy67wXGkoppI18CjgU+CgwiFY3fd/oJZJKOJhWmjwPDgf8Crqpb7XDS57MP8Eng0IiYSyo8d0bEFhGxVV73VeB4YCtSsTkpvwbABNIP/46kz/7zwGudpPYk8Gd5/W8AV0jarn6liPgFcBZwTc6jNt7W6rPbn1TUtyHtRiO/t68BWwNvAHcC9+Xn1wLndpIrwJGk79FWpB/97wJI2gj4OXBLfq2/AX4k6Y8avJfPAD8C/iW/l1ubvB6kf6dFpEJzDHCWpENa9DHwFoynxhP5L0dgL+Bl0o9iV7dg5hWWvSevP6LQ9iKpQIj0g7lbYdn7gafz/IGkvzY3LSx/Evho4fmhwIJO3sttwOcLz8fWcgdGkH7kBhaWHwv8spNYTwBHNWhv9XncBEwsLOtHKkw75+cBfLCwfDowufBZ/neLf6/vAOfl+c8C/wPsvRb/7g80en952dfJf+Xn500/u5z3s3UxLgF+UHj+N8Dcuu/JS/Xfw8Lr31pYtgfwWp7/M9LWbb/C8qsobNk1yOObjd5b8d+SVKRXAVsW1v0WcElP/n/srVM77RO1NhQRj0i6gbQrZW4Xuz9XmH8tx6tv24JUvDYD7pVUWybS7qiapRHxeuH59sAzhefP5LZGtiftRiquW7MzsBGwpPDa/erWL9qRVNy6amfg3yWdU2gTaUutls9vC8t+T5OBZkn7kw6u2AvYmLTL8Md58eU5z6slbUXaXfbViHizQZzjSVtlHblpC9KWRNn31Oqza/Q51n8HGn0nOlP/GW2ax3a2BxZGxFuF5c+QPt91sT2wLCJW1MUdvY5x
NwjeRWZlTCHtZir+Z301P25WaNt2LeO/QPph2TMitsrT4Igo/tDUD8YuJv3A1eyU2xpZQvrBLa5bs5D0V/jWhdceFJ0fvbQQ2K1Be6vPYyHw14XX2CoiBkbE/3TyOkWNBqKvJO0i2jEiBpPGrAQQEW9GxDciYg/SuNfhpN1pq5G0M/AD4IvAsEi73x6pxSmRR5nPrqcG0RcDO0oq/qbtRBoXWte4QyVt2c1xNwguMNZSRMwHrgFOKbQtJf0nO05Sf0mfpfEPb5n4b5F+6M6TtA2ApB0kHdqk21XA1yQNl7Q1abzmik7WnQ6cImmkpCGkrbHaay8h7bc/R9KgfD7HbpI+1EmsHwJnSBqlZG9Jw0p8Hv8BnCZpz/z+Bkv6RPNP5m3PASMlbVxo25L0l/XrksYAn6otkHSQpPfk8zheIQ1or2oQd3NSAVia+51A2iJqlkdH7Ud8LT67Kt1NKvL/JGmjfO7MEaTxmrUWEQtJuxu/JWlTSXsDE0ljONaCC4yVdTrpB6noROAfSWMpe5L+I66tL5OO0LlL0ivArcAaA7QF3yQdhPAQ8DBpkPibnaz7A9JRWw/m9X5at/x40m6mx0hHMl0LrDHQnZ1LKli3kH68LySdLwRNPo+IuI50IMPV+f09Anykyfsruo10VNRvJb2Q274AnC5pBam4Ti+sv21+D6+QdmveQYPiGxGPAeeQBtmfI41//LpJHrVdcC9Kui/Pd+Wzq0xE/IF0AMBHSFvEU4HjI+Lxbgh/LGkX4mLgOmBKRMzshrh9nvKglZmZAZJ2JR0uPiD8A7lOvAVjZra6vUhHJLq4rCMXGDOzTNKXgGkUxuls7XkXmZmZVcJbMGZmVgkXGDMzq4TP5C/Yeuuto6OjY32nYWbWq9x7770vRMTw+nYXmIKOjg7mzJmzvtMwM+tVJD3TqN27yMzMrBIuMGZmVgkXGDMzq4QLjJmZVcIFxszMKlF5gZG0QNLDkh6QNCe3DZU0U9K8/DiksP5pkuZLeqJ4uXZJ++U48yWdr3yHI0mbSLomt98tqaPQZ0J+jXmSJlT9Xs3M7B09tQVzUETsGxG1u8BNBmZFxChgVn6OpD2A8aRLnY8DpuZ7WgBcQLoX+6g8jcvtE4HlEbE7cB7pkuhIGkq6Udb+pPu1TykWMjMzq9b62kV2FHBpnr8UOLrQfnVEvBERT5PuDzJG0nbAoIi4M1/h9LK6PrVY1wKH5K2bQ4GZEbEsIpYDM3mnKJmZWcV64kTLAG6RFMD3I2IaMCLfDY+IWFK7iyHplrx3Ffouym1v5vn69lqfhTnWSkkvA8OK7Q36lNIx+caW6yw4+7CuhDQz22D0RIH5QEQszkVkpqRmd5hrdC/waNK+tn3eeUFpEmnXGzvttNMaHczMbO1UvossIhbnx+dJtxsdAzyXd3uRH5/Pqy8Cdix0H0m6TemiPF/fvlofSQOAwcCyJrHq85sWEaMjYvTw4WtcSsfMzNZSpQVG0uaStqzNA2NJ9yKfAdSO6poAXJ/nZwDj85Fhu5AG8+/Ju9NWSDogj68cX9enFusY4LY8TnMzMFbSkDy4Pza3mZlZD6h6F9kI4Lp8RPEA4MqI+IWk2cB0SROBZ4FPAETEo5KmA48BK4GTI2JVjnUScAkwELgpTwAXApdLmk/achmfYy2TdAYwO693ekQsq/LNmpnZOyotMBHxFLBPg/YXgUM66XMmcGaD9jmke2XXt79OLlANll0EXNS1rM3MrDv4TH4zM6uEC4yZmVXCBcbMzCrhAmNmZpVwgTEzs0q4wJiZWSVcYMzMrBIuMGZmVgkXGDMzq4QLjJmZVcIFxszMKuECY2ZmlXCBMTOzSrjAmJlZJVxgzMysEi4wZmZWCRcYMzOrhAuMmZlVwgXGzMwq4QJjZmaVcIExM7NKuMCYmVklXGDMzKwSLjBmZlYJFxgzM6uEC4yZmVXCBcbMzCrhAmNmZpVwgTEzs0q4wJiZWSVcYMzMrBIuMGZmVokuFxhJ
QyTt3YX1+0u6X9IN+flQSTMlzcuPQwrrniZpvqQnJB1aaN9P0sN52fmSlNs3kXRNbr9bUkehz4T8GvMkTejq+zQzs3VTqsBIul3SIElDgQeBiyWdW/I1TgXmFp5PBmZFxChgVn6OpD2A8cCewDhgqqT+uc8FwCRgVJ7G5faJwPKI2B04D/h2jjUUmALsD4wBphQLmZmZVa/sFszgiHgF+DhwcUTsB3y4VSdJI4HDgB8Wmo8CLs3zlwJHF9qvjog3IuJpYD4wRtJ2wKCIuDMiArisrk8t1rXAIXnr5lBgZkQsi4jlwEzeKUpmZtYDyhaYAfmH/pPADV2I/x3gn4C3Cm0jImIJQH7cJrfvACwsrLcot+2Q5+vbV+sTESuBl4FhTWKZmVkPKVtgTgduBuZHxGxJuwLzmnWQdDjwfETcW/I11KAtmrSvbZ/VX1SaJGmOpDlLly4tlaiZmbVWqsBExI8jYu+I+EJ+/lRE/EWLbh8AjpS0ALgaOFjSFcBzeWuI/Ph8Xn8RsGOh/0hgcW4f2aB9tT6SBgCDgWVNYjV6b9MiYnREjB4+fHiLt2RmZmWVHeTfVNLJkqZKuqg2NesTEadFxMiI6CAN3t8WEccBM4DaUV0TgOvz/AxgfD4ybBfSYP49eTfaCkkH5PGV4+v61GIdk18jSFtbY/MRb0OAsbnNzMx6SNldZJcD25IGz+8gbRGsWMvXPBv4c0nzgD/Pz4mIR4HpwGPAL4CTI2JV7nMS6UCB+cCTwE25/UJgmKT5wJfIR6RFxDLgDGB2nk7PbWZm1kOU/uBvsZJ0f0S8V9JDEbG3pI2AmyPi4OpT7DmjR4+OOXPmvP28Y/KNLfssOPuwKlMyM2t7ku6NiNH17WW3YN7Mjy9J2os01tHRTbmZmVkfNKDketPyWMbXSOMeWwD/t7KszMys1ytbYGblExZ/BewKkAfizczMGiq7i+wnDdqu7c5EzMysb2m6BSPp3aRrgw2W9PHCokHAplUmZmZmvVurXWR/BBwObAUcUWhfAZxYVVJmZtb7NS0wEXE9cL2k90fEnT2Uk5mZ9QFlx2A+li/Xv5GkWZJekHRcpZmZmVmvVrbAjM2X6z+cdJ2vdwH/WFlWZmbW65UtMBvlx48CV/myK2Zm1krZ82B+Lulx4DXgC5KGA69Xl5aZmfV2ZS/XPxl4PzA6It4EXiXdTdLMzKyhVufBHBwRtxXPgUlXzH/bT6tKzMzMerdWu8g+BNzG6ufA1AQuMGZm1olW58FMyY8n9Ew6ZmbWV5Qa5Je0FelOkh3FPhFxSjVpmZlZb1f2KLL/BO4CHgbeqi4dMzPrK8oWmE0j4kuVZmJmZn1K2RMtL5d0oqTtJA2tTZVmZmZmvVrZLZg/AP8KfJV09Bj5cdcqkjIzs96vbIH5ErB7RLxQZTJmZtZ3lN1F9ijw+yoTMTOzvqXsFswq4AFJvwTeqDX6MGUzM+tM2QLzszyZmZmVUqrARMSlVSdiZmZ9S9kz+UcB3wL2ADattUeEjyIzM7OGyg7yXwxcAKwEDgIuAy6vKikzM+v9yhaYgRExC1BEPBMRXwcOri4tMzPr7coO8r8uqR8wT9IXgd8A21SXlpmZ9XZlt2D+FtgMOAXYDziOdHVlMzOzhsoWmI6I+F1ELIqIEyLiL4CdqkzMzMx6t7IF5rSSbWZmZkCLMRhJHwE+Cuwg6fzCokGkI8rMzMwaarUFsxiYA7wO3FuYZgCHNusoaVNJ90h6UNKjkr6R24dKmilpXn4cUuhzmqT5kp6QdGihfT9JD+dl50tSbt9E0jW5/W5JHYU+E/JrzJM0oSsfipmZrbumBSYiHsxn8e8eEZfm+RnA/IhY3iL2G8DBEbEPsC8wTtIBwGRgVkSMAmbl50jaAxgP7AmMA6ZK6p9jXQBMAkblaVxunwgsj4jdgfOAb+dYQ4EpwP7AGGBKsZCZmVn1yo7BzJQ0KP9wPwhcLOncZh0i+V1+ulGeAjgKqF165lLg6Dx/FHB1
RLwREU8D84ExkrYDBkXEnRERpJM8i31qsa4FDslbN4cCMyNiWS6EM3mnKJmZWQ8oex7M4Ih4RdLngIsjYoqkh1p1ylsg9wK7A9+LiLsljYiIJQARsURS7XyaHYC7Ct0X5bY383x9e63PwhxrpaSXgWHF9gZ96nOcRNo6Yqeduv/AuI7JNzZdvuDsw7r9Nc3M2kHZLZgBeUvik8ANZYNHxKqI2BcYSdoa2avJ6moUokn72vapz3FaRIyOiNHDhw9vkp6ZmXVF2QJzOnAz8GREzJa0KzCv7ItExEvA7aTdVM/lYkV+fD6vtgjYsdBtJOkgg0V5vr59tT6SBgCDgWVNYpmZWQ8pVWAi4scRsXdEnJSfP5VPtuyUpOGStsrzA4EPA4+TDhKoHdU1Abg+z88Axucjw3YhDebfk3enrZB0QB5fOb6uTy3WMcBteZzmZmCspCF5cH9sbjMzsx5S9nL97yIdyTUiIvaStDdwZER8s0m37YBL8zhMP2B6RNwg6U5guqSJwLPAJwAi4lFJ04HHSOfYnBwRq3Ksk4BLgIHATXkCuBC4XNJ80pbL+BxrmaQzgNl5vdMjYlmZ92pmZt2j7CD/D4B/BL4PEBEPSboS6LTARMRDwHsbtL8IHNJJnzOBMxu0zwHWGL+JiNfJBarBsouAizrLz8zMqlV2DGaziLinrs1n8puZWafKFpgXJO1GPhJL0jHAksqyMjOzXq/sLrKTgWnAuyX9Bnga+HRlWZmZWa/XssDkQfqTIuLDkjYH+kXEiupTMzOz3qxlgYmIVZL2y/OvVp+SmZn1BWV3kd0vaQbwY+DtIhMRP60kKzMz6/XKFpihwIvAwYW2AFxgzMysoVIFJiJOqDoRMzPrW0odpixpV0k/l7RU0vOSrs+XczEzM2uo7HkwVwLTSZd/2Z40FnN1VUmZmVnvV7bAKCIuj4iVebqCTi5/b2ZmBuUH+X8paTJpqyWAvwRuzHe4xBeSNDOzemULzF/mx7+ua/8sqeDs2m0ZmZlZn1D2KDIP6JuZWZeUHYMxMzPrEhcYMzOrhAuMmZlVouyJlpJ0nKR/zs93kjSm2tTMzKw3K7sFMxV4P3Bsfr4C+F4lGZmZWZ9Q9jDl/SPifZLuB4iI5ZI2rjAvMzPr5cpuwbyZbzxWu2XycOCtyrIyM7Ner2yBOR+4DthG0pnAfwNnVZaVmZn1emVPtPyRpHuBQwABR0fE3EozMzOzXq1pgaldayx7HriquMzXIDMzs8602oK5lzTuImAnYHme3wp4FvAlZMzMrKGmYzARsUtE7ArcDBwREVtHxDDgcHy7ZDMza6LsIP+fRMR/1p5ExE3Ah6pJyczM+oKy58G8IOlrQO1GY8cBL1aWlZmZ9Xplt2COBYaTDlW+Ls8f27SHmZlt0MoeprwMOLXiXMzMrA/x1ZTNzKwSLjBmZlaJygqMpB0l/VLSXEmPSjo1tw+VNFPSvPw4pNDnNEnzJT0h6dBC+36SHs7Lzpek3L6JpGty+92SOgp9JuTXmCdpQlXv08zMGit7P5iRkq6TtFTSc5J+Imlki24rgb+PiD8GDgBOlrQHMBmYFRGjgFn5OXnZeGBPYBwwNV9gE+ACYBIwKk/jcvtEYHlE7A6cB3w7xxoKTAH2B8YAU4qFzMzMqld2C+ZiYAawHbAD8PPc1qmIWBIR9+X5FcDc3Pco4NK82qXA0Xn+KODqiHgjIp4G5gNjJG0HDIqIOyMigMvq+tRiXQsckrduDgVmRsSyiFgOzOSdomRmZj2gbIEZHhEXR8TKPF1COlS5lLzr6r3A3cCIiFgCqQgB2+TVdgAWFrotym075Pn69tX6RMRK4GVgWJNYZmbWQ8oWmBfyLZP756n0iZaStgB+AvxtRLzSbNUGbdGkfW371Oc3SdIcSXOWLl3aJD0zM+uKsgXms8Angd8CS4BjcltTkjYiFZcfRUTt2mXP5d1e5Mfnc/siYMdC95HA4tw+skH7an0kDQAGA8uaxFpDREyL
iNERMXr48NIbZWZm1kLLApMH2s+KiCMjYnhEbBMRR0fEMy36CbgQmBsR5xYWzQBqR3VNAK4vtI/PR4btQhrMvyfvRlsh6YAc8/i6PrVYxwC35XGam4Gxkobkwf2xuc3MzHpIyzP5I2KVpOGSNo6IP3Qh9geAvwIelvRAbvsKcDYwXdJE0iX/P5Ff51FJ04HHSEegnRwRq3K/k4BLgIHATXmCVMAulzSftOUyPsdaJukMYHZe73Tfu8bMrGeVvdjlAuDXkmYAr9Ya67ZMVhMR/03jsRBId8Zs1OdM4MwG7XOAvRq0v04uUA2WXQRc1Fl+ZmZWrbIFZnGe+gFbVpeOmZn1FWUvdvkNAEmbR8SrrdY3MzMreyb/+yU9RjpZEkn7SJpaaWZmZtarlT1M+Tuks+NfBIiIB4H/U1VSZmbW+5W+2GVELKxrWtVwRTMzM8oP8i+U9KdASNoYOIW8u8zMzKyRslswnwdO5p3rgu2bn5uZmTVU9iiyF4BPV5yLmZn1IaUKTL50y98AHcU+EXFkNWmZmVlvV3YM5meky7L8HHirunTMzKyvKFtgXo+I8yvNxMzM+pSyBebfJU0BbgHeqDXW7lhpZmZWr2yBeQ/pysgH884ussjPzczM1lC2wHwM2LWLl+s3M7MNWNnzYB4EtqoyETMz61vKbsGMAB6XNJvVx2B8mLKZmTVUtsBMqTQLMzPrc8qeyX9H1YmYmVnf0mmBkbRZRPw+z68gHTUGsDGwEfBqRAyqPkUzM+uNmm3BfEbSkIg4MyJWu02ypKOBMdWmZmZmvVmnR5FFxFTgGUnHN1j2M3wOjJmZNdF0DCYirgCQ9PFCcz9gNO/sMjMzM1tD2aPIjijMrwQWAEd1ezZmZtZnlD2K7ISqEzEzs76laYGR9M9NFkdEnNHN+ZiZWR/Ragvm1QZtmwMTgWGAC4yZmTXUapD/nNq8pC2BU4ETgKuBczrrZ2Zm1nIMRtJQ4EvAp4FLgfdFxPKqEzMzs96t1RjMvwIfB6YB74mI3/VIVmZm1uu1ulz/3wPbA18DFkt6JU8rJL1SfXpmZtZbtRqDKXu/GDMzs9W4gJiZWSUqLTCSLpL0vKRHCm1DJc2UNC8/DiksO03SfElPSDq00L6fpIfzsvMlKbdvIuma3H63pI5Cnwn5NeZJmlDl+zQzszVVvQVzCTCurm0yMCsiRgGz8nMk7QGMB/bMfaZK6p/7XABMAkblqRZzIrA8InYHzgO+nWMNJd0kbX/SVZ+nFAuZmZlVr9ICExG/ApbVNR9FOtyZ/Hh0of3qiHgjIp4G5gNjJG0HDIqIOyMigMvq+tRiXQsckrduDgVmRsSyfEj1TNYsdGZmVqH1MQYzIiKWAOTHbXL7DsDCwnqLctsOeb6+fbU+EbESeJl0hYHOYpmZWQ9pp0F+NWiLJu1r22f1F5UmSZojac7SpUtLJWpmZq2tjwLzXN7tRX58PrcvAnYsrDcSWJzbRzZoX62PpAHAYNIuuc5irSEipkXE6IgYPXz48HV4W2ZmVrQ+CswMoHZU1wTg+kL7+Hxk2C6kwfx78m60FZIOyOMrx9f1qcU6Brgtj9PcDIyVNCQP7o/NbWZm1kPK3nBsrUi6CjgQ2FrSItKRXWcD0yVNBJ4FPgEQEY9Kmg48Rrqp2ckRsSqHOol0RNpA4KY8AVwIXC5pPmnLZXyOtUzSGcDsvN7pEVF/sIGZmVWo0gITEcd2suiQTtY/EzizQfscYK8G7a+TC1SDZRcBF5VO1szMulU7DfKbmVkf4gJjZmaVcIExM7NKuMCYmVklXGDMzKwSLjBmZlYJFxgzM6uEC4yZmVXCBcbMzCpR6Zn81j06Jt/YdPmCsw/roUzMzMrzFoyZmVXCBcbMzCrhAmNmZpVwgTEzs0q4wJiZWSVcYMzMrBIuMGZmVgkXGDMzq4QLjJmZVcIFxszMKuECY2ZmlXCBMTOzSrjAmJlZJVxgzMysEi4wZmZWCRcYMzOrhAuMmZlVwgXG
zMwq4QJjZmaVcIExM7NKuMCYmVklXGDMzKwSLjBmZlaJPl1gJI2T9ISk+ZImr+98zMw2JAPWdwJVkdQf+B7w58AiYLakGRHx2PrNrOd1TL6x5ToLzj5snWK06m9mG54+W2CAMcD8iHgKQNLVwFHABldg2kV3FCkXOrPeQxGxvnOohKRjgHER8bn8/K+A/SPii3XrTQIm5ad/BDzRJOzWwAvrmFpfidEOObRLjHbIoTtitEMO7RKjHXJolxhl+u8cEcPrG/vyFowatK1RTSNiGjCtVEBpTkSMXqek+kiMdsihXWK0Qw7dEaMdcmiXGO2QQ7vEWJf+fXmQfxGwY+H5SGDxesrFzGyD05cLzGxglKRdJG0MjAdmrOeczMw2GH12F1lErJT0ReBmoD9wUUQ8uo5hS+1K20BitEMO7RKjHXLojhjtkEO7xGiHHNolxlr377OD/GZmtn715V1kZma2HrnAmJlZJVxgzMysEi4wXSDpA5K+14X1d5f0gQbtfyZpt+7NzsysvfTZo8i6i6R9gU8BnwSeBn7ahe7fAb7SoP21vOyItcxpa+DF6OEjNCSNAHYgnbC6OCKe6+kY7ZBDX4rRDjlY3+UC04Ckd5HOmzkWeBG4hnTE3UFdDNUREQ/VN0bEHEkdJXM5ADgbWAacAVxOunRDP0nHR8Qvyiaztj8Eucj+BzAY+E1uHinpJeALEXFf1THaIYe+FKMdcijEeTfpOoFvfzeBGRExt0z/donRDjm0UwwAIsJT3QS8BdwB7F5oe2ot4sxfm2V1680BxgKfAJYDB+T2dwP3l4yxL3AXMBe4NU+P57b3lej/AOk6bvXtBwAPlsxhnWK0Qw59KUY75JDX/XKOMxk4Lk+Ta229JUY75NBOMd6O1ZWVN5QJ+Bhpq2Uh8APgEODptYhzFXBig/aJwDUlYzxQmJ9bt6xsgVnXH6N5TZaVLZTrFKMdcuhLMdohh7ze/wIbNWjfuFn8dovRDjm0U4za5F1kDUTEdcB1kjYHjgb+Dhgh6QLguoi4pWSov81xPg3cm9tGk/6hPlYyxluF+dfqUy0ZY/OIuLu+MSLuyu+xlZsk3QhcRiq6kK7zdjxQdhfdusZohxz6Uox2yAHS93t74Jm69u1Y/bvf7rljnOkAAAbFSURBVDHaIYd2igH4TP7SJA0l7ab6y4g4uIt9DwL2yk8fjYjbutB3FfAq6erQA4Hf1xYBm0bERiVinA/sRuMfgqej7hYGncT4CO/skxXpYqIzIuI/u/Be1ilGO+TQl2K0SQ7jgO8C83jnu7kTsDvwxSgxxtgOMdohh3aK8XYsF5gNQ3f8mJhVQVI/0g0Ci9/N2RGxqjfFaIcc2ikGuMDYOpI0KdI9ddZbjHbIoS/FaIccrG/wiZYbuHxHz3UK0R1p9IEc+lKMdsgBSTf0hRjtkMP6iuEtmA2cpL+OiO+XWK+7jq3fAbg7In5XaB9Xct/wGCAiYrakPYBxwOPrsptP0mURcfw69P8gaVfCI2UP/pC0P+mIwFckDSQdAvo+4DHgrIh4uUSMU0gHnCxstW4n/Wv3SFocEbdK+hTwp6RD2adFxJsl4+xGOmBlR2Alab/9VWXeQ4nY20XEkt4eox1yWF8xXGA2cJJOiIiLW6zzZdJJp1eT9sVCukPoeODqiDi7xOucApxM+gHbFzg1Iq7Py+6LiPe16D8F+Ajp5OCZwP7A7cCHgZsj4swSOdTfcE7AQcBtABFxZIkY90TEmDx/Yn5P15HOVfp5yc/iUWCfSPcsmkY6cONa0uHw+0TEx0vEeJl08MeTpMPhfxwRS1v1K/T/Eemz3Ax4CdiCdJWKQwAi4jMlYpxCuhrFHcBHSYfDLycVnC9ExO1l87HGJG0TEc+v7zzWWleOafbU9ybg2RLrdMex9Q8DW+T5DtIJpKfm5y3P58n9+5N+EF8BBuX2gcBDJXO4D7gCOBD4UH5ckuc/VDLG/YX52cDwPL858HDJGHOLOdUte6BsHqRd3GOB
C4GlpEODJwBbluj/UH4cADwH9M/P1YXP8+FCv82A2/P8TmX+TfO6g0lXqnicdNWMF0l/hJwNbNUN3++bSq43CPgW6UoZn6pbNrVE/22BC4DvAcOAr+fPZzqwXckchtZNw4AFwBBgaMkY4+o+2wuBh4ArgRElY2wBnA48Crycv1t3AZ/p6ufvMZgNgKSHOpkeBkaUCFE7Lr5eV46L7x95t1hELCD9uH9E0rmU21+/MiJWRcTvgScj4pUc67Uu5DCadD7SV4GXI/2F/VpE3BERd5SM0U/SEEnDSHsAluY8XiXtIirjEUkn5PkHJY0GapcoKrVrKr1kvBURt0TERNK/z1TSbsOnSr6PjYEtScVhcG7fBGh56HtB7Vy6TXIsIuLZLsSYTtrqOTAihkXEMNJW5XLgx2UCSHpfJ9N+pK3lMi4mfQ9/AoyX9BNJm+RlB5TofwlpF+dC4Jekc9YOA/6LdDmdMl4gfT9r0xzSLuX78nwZZxXmzyH9AXUE6Y+hlrvCsx+RvkOHAt8Azgf+CjhI0lnNOq5hXf9C8NT+E+kv1H2BneumDtI++Fb9xwHzgZtIt0+dRvpreT6Fv5haxLgN2LeubQDp3JxVJfrfDWyW5/sV2gdTtxVQItZI0o/XdymxBVfXd0H+z/d0ftw2t29B+a2PwaQfpCfz+3ozx7qDtIusTIxOtxCAgSX6/11+zWeAU4BZpKtWPAxMKZnDqaS/jqeRtkBOyO3DgV+VjPHE2iyrW29V/n79ssH0WskYD9Q9/yrwa9JWRMvvF6tv2T7bLHaTGP+Q/1+9p9D2dBe/n/d19rpdyOPBuuez82M/0phn+Xy6srKn3jmRNpM/2MmyK0vG6Ef6S+4vgGPyfP8u5DCy9mPcYNkHSvTfpJP2rYv/Ibv4uRxGGlTvjs94M2CXLvbZEtgH2I+Suy8Kfd/VDTlvD2yf57fK/65juhhjz9zv3WuZwy3APxXfP2mr+svArSVjPAKM6mTZwpIx5lL4wyW3TSDtJnqmRP8HC/PfrFtWatdpXrf2x8+5+fvRpWsgksZIvwT8PekPCBWWld31+T+13wvS1s/NhWWlin5t8iC/ma03koaQjqI7CtgmNz8HzADOjojlJWIcQ/oRf6LBsqMj4mclYvwLcEtE3FrXPg74fxExqkX/04F/icLRkbl99/w+jmmVQ12/I0hbUR0RsW0X+k2pa5oaEUslbZvza3nEpKS9gR8C7yIV789GxP9KGg4cGxHnl87HBcbM2lGZIxx7Q4y17Z8PYd8tIh5ph/exNjFcYMysLUl6NiJ26u0x2iGH9RXDV1M2s/VG0ho35KstotwRjm0Rox1yaKcYNS4wZrY+jSAdDls/1iLSYHNvidEOObRTDMAFxszWrxtIJ+A+UL9A0u29KEY75NBOMdL6HoMxM7Mq+Ex+MzOrhAuMmZlVwgXGzMwq4QJjZmaVcIExM7NK/H98D8ciLq5XEgAAAABJRU5ErkJggg==\n",
182 | "text/plain": [
183 | ""
184 | ]
185 | },
186 | "metadata": {
187 | "needs_background": "light"
188 | },
189 | "output_type": "display_data"
190 | }
191 | ],
192 | "source": [
193 | "# Creemos un gráfico de barras, con el número de prestamistas\n",
194 | "# por cada uno de los valores de la variable combinada\n",
195 | "\n",
196 | "fig = data.open_il_24m.value_counts().plot.bar()\n",
197 | "fig.set_title('Número de cuentas a término fijo')\n",
198 | "fig.set_ylabel('Número de prestamistas')"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {
204 | "collapsed": true
205 | },
206 | "source": [
207 | "Este es un ejemplo de una variable combinada!"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {
213 | "collapsed": true
214 | },
215 | "source": [
216 | "**Esto es todo por este demo, esperamos lo hayan disfrutado y nos vemos en el siguiente.**"
217 | ]
218 | }
219 | ],
220 | "metadata": {
221 | "kernelspec": {
222 | "display_name": "feml",
223 | "language": "python",
224 | "name": "feml"
225 | },
226 | "language_info": {
227 | "codemirror_mode": {
228 | "name": "ipython",
229 | "version": 3
230 | },
231 | "file_extension": ".py",
232 | "mimetype": "text/x-python",
233 | "name": "python",
234 | "nbconvert_exporter": "python",
235 | "pygments_lexer": "ipython3",
236 | "version": "3.8.2"
237 | },
238 | "toc": {
239 | "base_numbering": 1,
240 | "nav_menu": {},
241 | "number_sections": true,
242 | "sideBar": true,
243 | "skip_h1_title": false,
244 | "title_cell": "Table of Contents",
245 | "title_sidebar": "Contents",
246 | "toc_cell": false,
247 | "toc_position": {
248 | "height": "550px",
249 | "left": "0px",
250 | "right": "869.4px",
251 | "top": "107px",
252 | "width": "151px"
253 | },
254 | "toc_section_display": "block",
255 | "toc_window_display": true
256 | }
257 | },
258 | "nbformat": 4,
259 | "nbformat_minor": 1
260 | }
261 |
--------------------------------------------------------------------------------
/Section-06-Codificacion-Variables-Categoricas/06.04_Codificacion-frecuencia.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Codificación por número de observaciones o frecuencia \n",
8 | "\n",
9 | "\n",
10 | "En la primera codificación reemplazamos las categorías por el número de observaciones por categoría en los datos. Similarmente, podemos reemplazar la categoría por la frecuencia -o porcentaje- de observaciones en los datos. Eso es, si 10 de nuestras 100 observaciones muestran el color azul, entonces reemplazamos el color azul por 10 o por 0.1 si reemplazamos por la frecuencia. Estas técnicas capturan la representación de cada etiqueta en los datos, pero la codificación puede que no necesariamente tenga poder predictivo en el target. Sin embargo, estos métodos son bastante populares en las competiciones de Kaggle.\n",
11 | "\n",
12 | "El supuesto de esta técnica es que el número de observaciones presentes en cada una de las categorías de una variable es de alguna forma representativo del poder predictivo de dicha etiqueta.\n",
13 | "\n",
14 | "### Ventajas\n",
15 | "\n",
16 | "- Simple\n",
17 | "- No extiende el espacio de los datos (número de variables)\n",
18 | "\n",
19 | "### Desventajas\n",
20 | "\n",
21 | "- Si dos categorías aparecen el mismo número de veces u observaciones en los datos, serán reemplazadas por el mismo número; la consecuencia es que puede que perdamos información importante.\n",
22 | "\n",
23 | "Por ejemplo, si hay 10 observaciones para la categoría azul y 10 observaciones para la categoría roja, ambas serán reemplazadas por 10, y por lo tanto, luego de codificarlas, parecerán ser la misma cosa\n",
24 | "\n",
25 | "\n",
26 | "Sigue esta conversación [en Kaggle](https://www.kaggle.com/general/16927) para más información.\n",
27 | "\n",
28 | "\n",
29 | "\n",
30 | "## En este demo:\n",
31 | "\n",
32 | "Vamos a realizar codificación por número de observaciones o frecuencia con:\n",
33 | "- pandas\n",
34 | "- Feature-Engine\n",
35 | "\n",
36 | "y veremos las ventajas y limitaciones de cada una de estas implementaciones usando el dataset House Prices.\n",
37 | "\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 1,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import numpy as np\n",
47 | "import pandas as pd\n",
48 | "\n",
49 | "# separar datos\n",
50 | "from sklearn.model_selection import train_test_split\n",
51 | "\n",
52 | "# codificar con feature-engine\n",
53 | "from feature_engine.encoding import CountFrequencyEncoder"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/html": [
64 | "\n",
65 | "\n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " Neighborhood \n",
83 | " Exterior1st \n",
84 | " Exterior2nd \n",
85 | " SalePrice \n",
86 | " \n",
87 | " \n",
88 | " \n",
89 | " \n",
90 | " 0 \n",
91 | " CollgCr \n",
92 | " VinylSd \n",
93 | " VinylSd \n",
94 | " 208500 \n",
95 | " \n",
96 | " \n",
97 | " 1 \n",
98 | " Veenker \n",
99 | " MetalSd \n",
100 | " MetalSd \n",
101 | " 181500 \n",
102 | " \n",
103 | " \n",
104 | " 2 \n",
105 | " CollgCr \n",
106 | " VinylSd \n",
107 | " VinylSd \n",
108 | " 223500 \n",
109 | " \n",
110 | " \n",
111 | " 3 \n",
112 | " Crawfor \n",
113 | " Wd Sdng \n",
114 | " Wd Shng \n",
115 | " 140000 \n",
116 | " \n",
117 | " \n",
118 | " 4 \n",
119 | " NoRidge \n",
120 | " VinylSd \n",
121 | " VinylSd \n",
122 | " 250000 \n",
123 | " \n",
124 | " \n",
125 | "
\n",
126 | "
"
127 | ],
128 | "text/plain": [
129 | " Neighborhood Exterior1st Exterior2nd SalePrice\n",
130 | "0 CollgCr VinylSd VinylSd 208500\n",
131 | "1 Veenker MetalSd MetalSd 181500\n",
132 | "2 CollgCr VinylSd VinylSd 223500\n",
133 | "3 Crawfor Wd Sdng Wd Shng 140000\n",
134 | "4 NoRidge VinylSd VinylSd 250000"
135 | ]
136 | },
137 | "execution_count": 2,
138 | "metadata": {},
139 | "output_type": "execute_result"
140 | }
141 | ],
142 | "source": [
143 | "# cargar dataset\n",
144 | "\n",
145 | "data = pd.read_csv(\n",
146 | " '../houseprice.csv',\n",
147 | " usecols=['Neighborhood', 'Exterior1st', 'Exterior2nd', 'SalePrice'])\n",
148 | "\n",
149 | "data.head()"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 3,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "Neighborhood : 25 etiquetas\n",
162 | "Exterior1st : 15 etiquetas\n",
163 | "Exterior2nd : 16 etiquetas\n",
164 | "SalePrice : 663 etiquetas\n"
165 | ]
166 | }
167 | ],
168 | "source": [
169 | "# miremos cuantas etiquetas tiene cada variable\n",
170 | "\n",
171 | "for col in data.columns:\n",
172 | " print(col, ': ', len(data[col].unique()), ' etiquetas')"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### Importante sobre codificación\n",
180 | "\n",
181 | "Cuando hacemos el conteo de observaciones para transformar las variables categóricas, es importante calcular el número ( o frecuencia = número observaciones / observaciones totales) usando el set de entrenamiento; y luego usar estos números para codificar las variables en el set de prueba\n",
182 | "\n"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 4,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "data": {
192 | "text/plain": [
193 | "((1022, 3), (438, 3))"
194 | ]
195 | },
196 | "execution_count": 4,
197 | "metadata": {},
198 | "output_type": "execute_result"
199 | }
200 | ],
201 | "source": [
202 | "# separemos en sets de prueba y entrenamiento\n",
203 | "\n",
204 | "X_train, X_test, y_train, y_test = train_test_split(\n",
205 | " data[['Neighborhood', 'Exterior1st', 'Exterior2nd']], # predictores\n",
206 | " data['SalePrice'], # target\n",
207 | " test_size=0.3, # porcentaje observaciones prueba\n",
208 | " random_state=0) # semilla para asegurar reproducibilidad\n",
209 | "\n",
210 | "X_train.shape, X_test.shape"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "## Codificación por número de observaciones o frecuencia con pandas"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 5,
223 | "metadata": {
224 | "scrolled": true
225 | },
226 | "outputs": [
227 | {
228 | "data": {
229 | "text/plain": [
230 | "{'NAmes': 151,\n",
231 | " 'CollgCr': 105,\n",
232 | " 'OldTown': 73,\n",
233 | " 'Edwards': 71,\n",
234 | " 'Sawyer': 61,\n",
235 | " 'Somerst': 56,\n",
236 | " 'Gilbert': 55,\n",
237 | " 'NridgHt': 51,\n",
238 | " 'NWAmes': 51,\n",
239 | " 'SawyerW': 45,\n",
240 | " 'BrkSide': 41,\n",
241 | " 'Mitchel': 36,\n",
242 | " 'Crawfor': 35,\n",
243 | " 'Timber': 30,\n",
244 | " 'NoRidge': 30,\n",
245 | " 'IDOTRR': 24,\n",
246 | " 'ClearCr': 24,\n",
247 | " 'SWISU': 18,\n",
248 | " 'StoneBr': 16,\n",
249 | " 'MeadowV': 12,\n",
250 | " 'Blmngtn': 12,\n",
251 | " 'BrDale': 10,\n",
252 | " 'NPkVill': 7,\n",
253 | " 'Veenker': 6,\n",
254 | " 'Blueste': 2}"
255 | ]
256 | },
257 | "execution_count": 5,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "# calculemos para cada una de las etiquetas el número de observaciones\n",
264 | "# para la variable Neighborhood\n",
265 | "\n",
266 | "count_map = X_train['Neighborhood'].value_counts().to_dict()\n",
267 | "\n",
268 | "count_map"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {},
274 | "source": [
275 | "El diccionario contiene el número de observaciones por cada categoría de la variable Neighborhood."
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 6,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "# reemplacemos las etiquetas con el conteo que hicimos\n",
285 | "\n",
286 | "X_train['Neighborhood'] = X_train['Neighborhood'].map(count_map)\n",
287 | "X_test['Neighborhood'] = X_test['Neighborhood'].map(count_map)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 7,
293 | "metadata": {},
294 | "outputs": [
295 | {
296 | "data": {
297 | "text/plain": [
298 | "64 105\n",
299 | "682 24\n",
300 | "960 41\n",
301 | "1384 71\n",
302 | "1100 18\n",
303 | "416 61\n",
304 | "1034 35\n",
305 | "853 151\n",
306 | "472 71\n",
307 | "1011 71\n",
308 | "Name: Neighborhood, dtype: int64"
309 | ]
310 | },
311 | "execution_count": 7,
312 | "metadata": {},
313 | "output_type": "execute_result"
314 | }
315 | ],
316 | "source": [
317 | "# exploremos los resultados\n",
318 | "\n",
319 | "X_train['Neighborhood'].head(10)"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 8,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "data": {
329 | "text/plain": [
330 | "{'VinylSd': 0.3561643835616438,\n",
331 | " 'HdBoard': 0.149706457925636,\n",
332 | " 'Wd Sdng': 0.14481409001956946,\n",
333 | " 'MetalSd': 0.1350293542074364,\n",
334 | " 'Plywood': 0.08414872798434442,\n",
335 | " 'CemntBd': 0.03816046966731898,\n",
336 | " 'BrkFace': 0.03424657534246575,\n",
337 | " 'WdShing': 0.02054794520547945,\n",
338 | " 'Stucco': 0.016634050880626222,\n",
339 | " 'AsbShng': 0.014677103718199608,\n",
340 | " 'Stone': 0.0019569471624266144,\n",
341 | " 'ImStucc': 0.0009784735812133072,\n",
342 | " 'AsphShn': 0.0009784735812133072,\n",
343 | " 'BrkComm': 0.0009784735812133072,\n",
344 | " 'CBlock': 0.0009784735812133072}"
345 | ]
346 | },
347 | "execution_count": 8,
348 | "metadata": {},
349 | "output_type": "execute_result"
350 | }
351 | ],
352 | "source": [
353 | "# si en lugar del número de observaciones queremos reemplazar por la frecuencia\n",
354 | "# solo necesitamos dividir el conteo por el número total de observaciones\n",
355 | "\n",
356 | "frequency_map = (X_train['Exterior1st'].value_counts() / len(X_train) ).to_dict()\n",
357 | "frequency_map"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 9,
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "# reemplacemos las categorías por las frecuencias\n",
367 | "\n",
368 | "X_train['Exterior1st'] = X_train['Exterior1st'].map(frequency_map)\n",
369 | "X_test['Exterior1st'] = X_test['Exterior1st'].map(frequency_map)"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "Podemos agrupar estos comandos en dos funciones como hicimos en los notebooks anteriores y repetir el proceso (con un ciclo) para cada una de las variables categóricas. Si no sabes cómo hacer eso, revisa los notebooks anteriores.\n",
377 | "\n",
378 | "## Codificación por número de observaciones o frecuencia con Feature-Engine"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 10,
384 | "metadata": {},
385 | "outputs": [
386 | {
387 | "data": {
388 | "text/plain": [
389 | "((1022, 3), (438, 3))"
390 | ]
391 | },
392 | "execution_count": 10,
393 | "metadata": {},
394 | "output_type": "execute_result"
395 | }
396 | ],
397 | "source": [
398 | "# separemos en sets de prueba y entrenamiento\n",
399 | "\n",
400 | "X_train, X_test, y_train, y_test = train_test_split(\n",
401 | " data[['Neighborhood', 'Exterior1st', 'Exterior2nd']], # variables\n",
402 | " data['SalePrice'], # target\n",
403 | " test_size=0.3, # porcentaje observaciones prueba\n",
404 | " random_state=0) # semilla para asegurar reproducibilidad\n",
405 | "\n",
406 | "X_train.shape, X_test.shape"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 11,
412 | "metadata": {},
413 | "outputs": [
414 | {
415 | "data": {
416 | "text/plain": [
417 | "CountFrequencyEncoder(variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'])"
418 | ]
419 | },
420 | "execution_count": 11,
421 | "metadata": {},
422 | "output_type": "execute_result"
423 | }
424 | ],
425 | "source": [
426 | "count_enc = CountFrequencyEncoder(\n",
427 | " encoding_method='count', # para codificar por frecuencia ==> encoding_method='frequency'\n",
428 | " variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'])\n",
429 | "\n",
430 | "count_enc.fit(X_train)"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 12,
436 | "metadata": {
437 | "scrolled": true
438 | },
439 | "outputs": [
440 | {
441 | "data": {
442 | "text/plain": [
443 | "{'Neighborhood': {'NAmes': 151,\n",
444 | " 'CollgCr': 105,\n",
445 | " 'OldTown': 73,\n",
446 | " 'Edwards': 71,\n",
447 | " 'Sawyer': 61,\n",
448 | " 'Somerst': 56,\n",
449 | " 'Gilbert': 55,\n",
450 | " 'NridgHt': 51,\n",
451 | " 'NWAmes': 51,\n",
452 | " 'SawyerW': 45,\n",
453 | " 'BrkSide': 41,\n",
454 | " 'Mitchel': 36,\n",
455 | " 'Crawfor': 35,\n",
456 | " 'Timber': 30,\n",
457 | " 'NoRidge': 30,\n",
458 | " 'IDOTRR': 24,\n",
459 | " 'ClearCr': 24,\n",
460 | " 'SWISU': 18,\n",
461 | " 'StoneBr': 16,\n",
462 | " 'MeadowV': 12,\n",
463 | " 'Blmngtn': 12,\n",
464 | " 'BrDale': 10,\n",
465 | " 'NPkVill': 7,\n",
466 | " 'Veenker': 6,\n",
467 | " 'Blueste': 2},\n",
468 | " 'Exterior1st': {'VinylSd': 364,\n",
469 | " 'HdBoard': 153,\n",
470 | " 'Wd Sdng': 148,\n",
471 | " 'MetalSd': 138,\n",
472 | " 'Plywood': 86,\n",
473 | " 'CemntBd': 39,\n",
474 | " 'BrkFace': 35,\n",
475 | " 'WdShing': 21,\n",
476 | " 'Stucco': 17,\n",
477 | " 'AsbShng': 15,\n",
478 | " 'Stone': 2,\n",
479 | " 'ImStucc': 1,\n",
480 | " 'AsphShn': 1,\n",
481 | " 'BrkComm': 1,\n",
482 | " 'CBlock': 1},\n",
483 | " 'Exterior2nd': {'VinylSd': 353,\n",
484 | " 'Wd Sdng': 142,\n",
485 | " 'HdBoard': 141,\n",
486 | " 'MetalSd': 136,\n",
487 | " 'Plywood': 112,\n",
488 | " 'CmentBd': 39,\n",
489 | " 'Wd Shng': 29,\n",
490 | " 'BrkFace': 18,\n",
491 | " 'AsbShng': 17,\n",
492 | " 'Stucco': 16,\n",
493 | " 'ImStucc': 8,\n",
494 | " 'Brk Cmn': 4,\n",
495 | " 'Stone': 4,\n",
496 | " 'AsphShn': 1,\n",
497 | " 'Other': 1,\n",
498 | " 'CBlock': 1}}"
499 | ]
500 | },
501 | "execution_count": 12,
502 | "metadata": {},
503 | "output_type": "execute_result"
504 | }
505 | ],
506 | "source": [
507 | "# en el atributo encoder_dict_ del codificador \n",
508 | "# podemos ver el número de observaciones por categoría de cada variable\n",
509 | "\n",
510 | "count_enc.encoder_dict_"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 13,
516 | "metadata": {},
517 | "outputs": [
518 | {
519 | "data": {
520 | "text/html": [
521 | "\n",
522 | "\n",
535 | "
\n",
536 | " \n",
537 | " \n",
538 | " \n",
539 | " Neighborhood \n",
540 | " Exterior1st \n",
541 | " Exterior2nd \n",
542 | " \n",
543 | " \n",
544 | " \n",
545 | " \n",
546 | " 64 \n",
547 | " 105 \n",
548 | " 364 \n",
549 | " 353 \n",
550 | " \n",
551 | " \n",
552 | " 682 \n",
553 | " 24 \n",
554 | " 148 \n",
555 | " 142 \n",
556 | " \n",
557 | " \n",
558 | " 960 \n",
559 | " 41 \n",
560 | " 148 \n",
561 | " 112 \n",
562 | " \n",
563 | " \n",
564 | " 1384 \n",
565 | " 71 \n",
566 | " 21 \n",
567 | " 29 \n",
568 | " \n",
569 | " \n",
570 | " 1100 \n",
571 | " 18 \n",
572 | " 148 \n",
573 | " 142 \n",
574 | " \n",
575 | " \n",
576 | "
\n",
577 | "
"
578 | ],
579 | "text/plain": [
580 | " Neighborhood Exterior1st Exterior2nd\n",
581 | "64 105 364 353\n",
582 | "682 24 148 142\n",
583 | "960 41 148 112\n",
584 | "1384 71 21 29\n",
585 | "1100 18 148 142"
586 | ]
587 | },
588 | "execution_count": 13,
589 | "metadata": {},
590 | "output_type": "execute_result"
591 | }
592 | ],
593 | "source": [
594 | "X_train = count_enc.transform(X_train)\n",
595 | "X_test = count_enc.transform(X_test)\n",
596 | "\n",
597 | "# exploremos el resultado\n",
598 | "X_train.head()"
599 | ]
600 | },
601 | {
602 | "cell_type": "markdown",
603 | "metadata": {},
604 | "source": [
605 | "**Nota**\n",
606 | "\n",
607 | "Si el argumento 'variables' se fija en 'None' (ninguno), entonces el codificador automáticamente identificará **todas las variables categóricas**. Maravilloso, ¿verdad?\n",
608 | "\n",
609 | "El codificador no codificará las variables numéricas. Entonces si algunas de tus variables categóricas son de hecho numéricas, necesitas hacer el 're-cast' o cambio a tipo 'object' antes de usar el codificador.\n",
610 | "\n",
611 | "Si hay una categoría en el set de prueba para la cual el codificador no tiene un número para asignar (la categoría no estaba presente en el set de entrenamiento), el codificador devolverá un error.\n",
612 | "\n"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": null,
618 | "metadata": {},
619 | "outputs": [],
620 | "source": []
621 | }
622 | ],
623 | "metadata": {
624 | "kernelspec": {
625 | "display_name": "feml",
626 | "language": "python",
627 | "name": "feml"
628 | },
629 | "language_info": {
630 | "codemirror_mode": {
631 | "name": "ipython",
632 | "version": 3
633 | },
634 | "file_extension": ".py",
635 | "mimetype": "text/x-python",
636 | "name": "python",
637 | "nbconvert_exporter": "python",
638 | "pygments_lexer": "ipython3",
639 | "version": "3.8.2"
640 | },
641 | "toc": {
642 | "base_numbering": 1,
643 | "nav_menu": {},
644 | "number_sections": true,
645 | "sideBar": true,
646 | "skip_h1_title": false,
647 | "title_cell": "Table of Contents",
648 | "title_sidebar": "Contents",
649 | "toc_cell": false,
650 | "toc_position": {},
651 | "toc_section_display": "block",
652 | "toc_window_display": true
653 | }
654 | },
655 | "nbformat": 4,
656 | "nbformat_minor": 2
657 | }
658 |
--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.15_SustitucionMediaMediana_FeatureEngine.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Sustitución por la Media / Mediana ==> Feature-engine\n",
8 | "\n",
9 | "\n",
10 | "### Qué es Feature-engine?\n",
11 | "\n",
12 | "Feature-engine es una librería de Python que hemos creado para este curso. \n",
13 | "\n",
14 | "- Feature-engine incluye todas las técnicas de ingeniería de variables descritas en este curso\n",
15 | "- Feature-engine funciona como Scikit-learn, por lo tanto es fácil de aprender\n",
16 | "- Feature-engine te permite implementar pasos de ingeniería de variables específicos para diferentes grupos de variables\n",
17 | "- Feature-engine puede ser integrado con las pipelines de Scikit-learn, permitiendo construir modelos fácilmente\n",
18 | "**Feature-engine te permite diseñar y guardar un flujo de ingeniería de variables con procesos diseñados específicamente para diferentes grupos de variables.**\n",
19 | "\n",
20 | "-------------------------------------------------------------------\n",
21 | "Feature-engine puede ser instalado vía pip ==> pip install feature-engine\n",
22 | "\n",
23 | "- Asegúrate de haber instalado Feature-engine antes de correr este notebook\n",
24 | "\n",
25 | "Para más detalle visita el [website de trainindata]( https://www.trainindata.com/feature-engine) \n",
26 | "\n",
27 | "\n",
28 | "## En este demo:\n",
29 | "\n",
30 | "Vamos a usar **Feature-engine para hacer la sustitución por la media o la mediana** usando los datos Ames House Price.\n",
31 | "\n",
32 | "- Para bajar los datos, por favor referirse a la clase de **Datasets** en la **Sección 1** del curso.\n",
33 | "\n",
34 | "### Nota: \n",
35 | "* 'Imputer' deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import pandas as pd\n",
45 | "import numpy as np\n",
46 | "\n",
47 | "import matplotlib.pyplot as plt\n",
48 | "\n",
49 | "from sklearn.model_selection import train_test_split\n",
50 | "from sklearn.pipeline import Pipeline\n",
51 | "\n",
52 | "# feature-engine\n",
53 | "from feature_engine import imputation as mdi"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/html": [
64 | "\n",
65 | "\n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " LotFrontage \n",
83 | " MasVnrArea \n",
84 | " BsmtQual \n",
85 | " FireplaceQu \n",
86 | " GarageYrBlt \n",
87 | " SalePrice \n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " \n",
92 | " 0 \n",
93 | " 65.0 \n",
94 | " 196.0 \n",
95 | " Gd \n",
96 | " NaN \n",
97 | " 2003.0 \n",
98 | " 208500 \n",
99 | " \n",
100 | " \n",
101 | " 1 \n",
102 | " 80.0 \n",
103 | " 0.0 \n",
104 | " Gd \n",
105 | " TA \n",
106 | " 1976.0 \n",
107 | " 181500 \n",
108 | " \n",
109 | " \n",
110 | " 2 \n",
111 | " 68.0 \n",
112 | " 162.0 \n",
113 | " Gd \n",
114 | " TA \n",
115 | " 2001.0 \n",
116 | " 223500 \n",
117 | " \n",
118 | " \n",
119 | " 3 \n",
120 | " 60.0 \n",
121 | " 0.0 \n",
122 | " TA \n",
123 | " Gd \n",
124 | " 1998.0 \n",
125 | " 140000 \n",
126 | " \n",
127 | " \n",
128 | " 4 \n",
129 | " 84.0 \n",
130 | " 350.0 \n",
131 | " Gd \n",
132 | " TA \n",
133 | " 2000.0 \n",
134 | " 250000 \n",
135 | " \n",
136 | " \n",
137 | "
\n",
138 | "
"
139 | ],
140 | "text/plain": [
141 | " LotFrontage MasVnrArea BsmtQual FireplaceQu GarageYrBlt SalePrice\n",
142 | "0 65.0 196.0 Gd NaN 2003.0 208500\n",
143 | "1 80.0 0.0 Gd TA 1976.0 181500\n",
144 | "2 68.0 162.0 Gd TA 2001.0 223500\n",
145 | "3 60.0 0.0 TA Gd 1998.0 140000\n",
146 | "4 84.0 350.0 Gd TA 2000.0 250000"
147 | ]
148 | },
149 | "execution_count": 2,
150 | "metadata": {},
151 | "output_type": "execute_result"
152 | }
153 | ],
154 | "source": [
155 | "# carguemos los datos con las variables seleccionadas\n",
156 | "\n",
157 | "cols_to_use = [\n",
158 | " 'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
159 | " 'SalePrice'\n",
160 | "]\n",
161 | "\n",
162 | "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
163 | "data.head()"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 3,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/plain": [
174 | "LotFrontage 0.177397\n",
175 | "MasVnrArea 0.005479\n",
176 | "BsmtQual 0.025342\n",
177 | "FireplaceQu 0.472603\n",
178 | "GarageYrBlt 0.055479\n",
179 | "SalePrice 0.000000\n",
180 | "dtype: float64"
181 | ]
182 | },
183 | "execution_count": 3,
184 | "metadata": {},
185 | "output_type": "execute_result"
186 | }
187 | ],
188 | "source": [
189 | "# porcentaje de valores nulos\n",
190 | "\n",
191 | "data.isnull().mean()"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "Todas las variables predictivas tienen datos ausentes\n"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 4,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "data": {
208 | "text/plain": [
209 | "((1022, 5), (438, 5))"
210 | ]
211 | },
212 | "execution_count": 4,
213 | "metadata": {},
214 | "output_type": "execute_result"
215 | }
216 | ],
217 | "source": [
218 | "# separar datos en segmentos de entrenamiento y prueba\n",
219 | "\n",
220 | "# primero, separemos el target (SalePrice) del resto de las variables\n",
221 | "cols_to_use.remove('SalePrice')\n",
222 | "\n",
223 | "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
224 | " data['SalePrice'],\n",
225 | " test_size=0.3,\n",
226 | " random_state=0)\n",
227 | "X_train.shape, X_test.shape"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "### Feature-engine captura las variables numéricas automáticamente"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 5,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "# llamamos el imputer de Feature-engine\n",
244 | "# especificamos la estrategia de sustitución, mediana en este caso\n",
245 | "\n",
246 | "imputer = mdi.MeanMedianImputer(imputation_method='median')"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 6,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "data": {
256 | "text/plain": [
257 | "MeanMedianImputer(variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])"
258 | ]
259 | },
260 | "execution_count": 6,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "# ajustamos el imputer\n",
267 | "imputer.fit(X_train)"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 7,
273 | "metadata": {},
274 | "outputs": [
275 | {
276 | "data": {
277 | "text/plain": [
278 | "['LotFrontage', 'MasVnrArea', 'GarageYrBlt']"
279 | ]
280 | },
281 | "execution_count": 7,
282 | "metadata": {},
283 | "output_type": "execute_result"
284 | }
285 | ],
286 | "source": [
287 | "# vemos que el imputer automáticamente encontró las variables numéricas para \n",
288 | "# sustituir con la mediana\n",
289 | "\n",
290 | "imputer.variables"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 8,
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "data": {
300 | "text/plain": [
301 | "{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}"
302 | ]
303 | },
304 | "execution_count": 8,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "# aquí podemos ver la mediana asignada a cada variable\n",
311 | "\n",
312 | "imputer.imputer_dict_"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 9,
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/html": [
323 | "\n",
324 | "\n",
337 | "
\n",
338 | " \n",
339 | " \n",
340 | " \n",
341 | " BsmtQual \n",
342 | " FireplaceQu \n",
343 | " LotFrontage \n",
344 | " MasVnrArea \n",
345 | " GarageYrBlt \n",
346 | " \n",
347 | " \n",
348 | " \n",
349 | " \n",
350 | " 64 \n",
351 | " Gd \n",
352 | " NaN \n",
353 | " 69.0 \n",
354 | " 573.0 \n",
355 | " 1998.0 \n",
356 | " \n",
357 | " \n",
358 | " 682 \n",
359 | " Gd \n",
360 | " Gd \n",
361 | " 69.0 \n",
362 | " 0.0 \n",
363 | " 1996.0 \n",
364 | " \n",
365 | " \n",
366 | " 960 \n",
367 | " TA \n",
368 | " NaN \n",
369 | " 50.0 \n",
370 | " 0.0 \n",
371 | " 1979.0 \n",
372 | " \n",
373 | " \n",
374 | " 1384 \n",
375 | " TA \n",
376 | " NaN \n",
377 | " 60.0 \n",
378 | " 0.0 \n",
379 | " 1939.0 \n",
380 | " \n",
381 | " \n",
382 | " 1100 \n",
383 | " TA \n",
384 | " NaN \n",
385 | " 60.0 \n",
386 | " 0.0 \n",
387 | " 1930.0 \n",
388 | " \n",
389 | " \n",
390 | "
\n",
391 | "
"
392 | ],
393 | "text/plain": [
394 | " BsmtQual FireplaceQu LotFrontage MasVnrArea GarageYrBlt\n",
395 | "64 Gd NaN 69.0 573.0 1998.0\n",
396 | "682 Gd Gd 69.0 0.0 1996.0\n",
397 | "960 TA NaN 50.0 0.0 1979.0\n",
398 | "1384 TA NaN 60.0 0.0 1939.0\n",
399 | "1100 TA NaN 60.0 0.0 1930.0"
400 | ]
401 | },
402 | "execution_count": 9,
403 | "metadata": {},
404 | "output_type": "execute_result"
405 | }
406 | ],
407 | "source": [
408 | "# Feature-engine retorna un dataframe \n",
409 | "\n",
410 | "tmp = imputer.transform(X_train)\n",
411 | "tmp.head()"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 10,
417 | "metadata": {},
418 | "outputs": [
419 | {
420 | "data": {
421 | "text/plain": [
422 | "LotFrontage 0.0\n",
423 | "MasVnrArea 0.0\n",
424 | "GarageYrBlt 0.0\n",
425 | "dtype: float64"
426 | ]
427 | },
428 | "execution_count": 10,
429 | "metadata": {},
430 | "output_type": "execute_result"
431 | }
432 | ],
433 | "source": [
434 | "# revisemos que las variables numéricas no tengan \n",
435 | "# valores nulos NA \n",
436 | "\n",
437 | "tmp[imputer.variables].isnull().mean()"
438 | ]
439 | },
440 | {
441 | "cell_type": "markdown",
442 | "metadata": {},
443 | "source": [
444 | "## Feature-engine te permite especificar grupos de variables fácilmente"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 11,
450 | "metadata": {},
451 | "outputs": [
452 | {
453 | "data": {
454 | "text/plain": [
455 | "MeanMedianImputer(imputation_method='mean',\n",
456 | " variables=['LotFrontage', 'MasVnrArea'])"
457 | ]
458 | },
459 | "execution_count": 11,
460 | "metadata": {},
461 | "output_type": "execute_result"
462 | }
463 | ],
464 | "source": [
465 | "# usemos la sustitución por la media \n",
466 | "# para 2 de las 3 variables numéricas\n",
467 | "\n",
468 | "imputer = mdi.MeanMedianImputer(imputation_method='mean',\n",
469 | " variables=['LotFrontage', 'MasVnrArea'])\n",
470 | "\n",
471 | "imputer.fit(X_train)"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": 12,
477 | "metadata": {},
478 | "outputs": [
479 | {
480 | "data": {
481 | "text/plain": [
482 | "['LotFrontage', 'MasVnrArea']"
483 | ]
484 | },
485 | "execution_count": 12,
486 | "metadata": {},
487 | "output_type": "execute_result"
488 | }
489 | ],
490 | "source": [
491 | "# ahora el imputer solo imputa las variables que indicamos\n",
492 | "\n",
493 | "imputer.variables"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 13,
499 | "metadata": {},
500 | "outputs": [
501 | {
502 | "data": {
503 | "text/plain": [
504 | "{'LotFrontage': 69.66866746698679, 'MasVnrArea': 103.55358898721731}"
505 | ]
506 | },
507 | "execution_count": 13,
508 | "metadata": {},
509 | "output_type": "execute_result"
510 | }
511 | ],
512 | "source": [
513 | "# y podemos ver el valor asignado a cada variable\n",
514 | "imputer.imputer_dict_"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 14,
520 | "metadata": {},
521 | "outputs": [
522 | {
523 | "data": {
524 | "text/plain": [
525 | "LotFrontage 69.668667\n",
526 | "MasVnrArea 103.553589\n",
527 | "dtype: float64"
528 | ]
529 | },
530 | "execution_count": 14,
531 | "metadata": {},
532 | "output_type": "execute_result"
533 | }
534 | ],
535 | "source": [
536 | "# corroboremos que el diccionario anterior contiene los valores promedio\n",
537 | "# de las variables\n",
538 | "\n",
539 | "X_train[imputer.variables].mean()"
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": 15,
545 | "metadata": {},
546 | "outputs": [
547 | {
548 | "data": {
549 | "text/plain": [
550 | "LotFrontage 0.0\n",
551 | "MasVnrArea 0.0\n",
552 | "dtype: float64"
553 | ]
554 | },
555 | "execution_count": 15,
556 | "metadata": {},
557 | "output_type": "execute_result"
558 | }
559 | ],
560 | "source": [
561 | "# Feature-engine devuelve un dataframe\n",
562 | "\n",
563 | "tmp = imputer.transform(X_train)\n",
564 | "\n",
565 | "# miremos que los valores nulos efectivamente ya no existen\n",
566 | "tmp[imputer.variables].isnull().mean()"
567 | ]
568 | },
569 | {
570 | "cell_type": "markdown",
571 | "metadata": {},
572 | "source": [
573 | "## Feature-engine puede ser usado con los flujos de Scikit-learn (pipeline)"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 16,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "pipe = Pipeline([\n",
583 | " ('median_imputer', mdi.MeanMedianImputer(imputation_method='median',\n",
584 | " variables = ['LotFrontage', 'GarageYrBlt'])),\n",
585 | " \n",
586 | " ('mean_imputer', mdi.MeanMedianImputer(imputation_method='mean',\n",
587 | " variables = ['MasVnrArea'])),\n",
588 | " ])"
589 | ]
590 | },
591 | {
592 | "cell_type": "code",
593 | "execution_count": 17,
594 | "metadata": {},
595 | "outputs": [
596 | {
597 | "data": {
598 | "text/plain": [
599 | "Pipeline(steps=[('median_imputer',\n",
600 | " MeanMedianImputer(variables=['LotFrontage', 'GarageYrBlt'])),\n",
601 | " ('mean_imputer',\n",
602 | " MeanMedianImputer(imputation_method='mean',\n",
603 | " variables=['MasVnrArea']))])"
604 | ]
605 | },
606 | "execution_count": 17,
607 | "metadata": {},
608 | "output_type": "execute_result"
609 | }
610 | ],
611 | "source": [
612 | "pipe.fit(X_train)"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": 18,
618 | "metadata": {},
619 | "outputs": [
620 | {
621 | "data": {
622 | "text/plain": [
623 | "{'LotFrontage': 69.0, 'GarageYrBlt': 1979.0}"
624 | ]
625 | },
626 | "execution_count": 18,
627 | "metadata": {},
628 | "output_type": "execute_result"
629 | }
630 | ],
631 | "source": [
632 | "pipe.named_steps['median_imputer'].imputer_dict_"
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": 19,
638 | "metadata": {},
639 | "outputs": [
640 | {
641 | "data": {
642 | "text/plain": [
643 | "{'MasVnrArea': 103.55358898721731}"
644 | ]
645 | },
646 | "execution_count": 19,
647 | "metadata": {},
648 | "output_type": "execute_result"
649 | }
650 | ],
651 | "source": [
652 | "pipe.named_steps['mean_imputer'].imputer_dict_"
653 | ]
654 | },
655 | {
656 | "cell_type": "code",
657 | "execution_count": 20,
658 | "metadata": {},
659 | "outputs": [
660 | {
661 | "data": {
662 | "text/plain": [
663 | "BsmtQual 0.023483\n",
664 | "FireplaceQu 0.467710\n",
665 | "LotFrontage 0.000000\n",
666 | "MasVnrArea 0.000000\n",
667 | "GarageYrBlt 0.000000\n",
668 | "dtype: float64"
669 | ]
670 | },
671 | "execution_count": 20,
672 | "metadata": {},
673 | "output_type": "execute_result"
674 | }
675 | ],
676 | "source": [
677 | "# transformemos los datos con la pipeline\n",
678 | "tmp = pipe.transform(X_train)\n",
679 | "\n",
680 | "# revisemos que ya no tenemos valores nulos en las variables numéricas imputadas\n",
681 | "tmp.isnull().mean()"
682 | ]
683 | }
684 | ],
685 | "metadata": {
686 | "kernelspec": {
687 | "display_name": "feml",
688 | "language": "python",
689 | "name": "feml"
690 | },
691 | "language_info": {
692 | "codemirror_mode": {
693 | "name": "ipython",
694 | "version": 3
695 | },
696 | "file_extension": ".py",
697 | "mimetype": "text/x-python",
698 | "name": "python",
699 | "nbconvert_exporter": "python",
700 | "pygments_lexer": "ipython3",
701 | "version": "3.8.2"
702 | },
703 | "toc": {
704 | "base_numbering": 1,
705 | "nav_menu": {},
706 | "number_sections": true,
707 | "sideBar": true,
708 | "skip_h1_title": false,
709 | "title_cell": "Table of Contents",
710 | "title_sidebar": "Contents",
711 | "toc_cell": false,
712 | "toc_position": {},
713 | "toc_section_display": "block",
714 | "toc_window_display": true
715 | }
716 | },
717 | "nbformat": 4,
718 | "nbformat_minor": 2
719 | }
720 |
--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.13_IndicadorAusencia_Sklearn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Agregar un Indicador de ausencia con Scikit-learn ==> MissingIndicator\n",
8 | "\n",
9 | "Scikit-learn tiene una clase **MissingIndicator** para añadir una variable binaria que marque los valores ausentes.\n",
10 | "\n",
11 | "MissingIndicator tiene la opción de añadir un indicador binario (variable) para cada variable en un conjunto de datos o solamente para aquellas que tienen NA en el segmento de entrenamiento.\n",
12 | "\n",
13 | "### Atención!\n",
14 | "\n",
15 | "El transformer solo devuelve las variables binarias, que luego deben ser añadidas a los datos originales de entrenamiento.\n",
16 | "\n",
17 | "### Más detalles acerca de los transformadores\n",
18 | "\n",
19 | "- [MissingIndicator](https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html#sklearn.impute.MissingIndicator)\n",
20 | "\n",
21 | "## En este demo:\n",
22 | "\n",
23 | "Vamos a añadir un Indicador de ausencia (Missing Indicator) para las variables en el Ames House Price Dataset\n",
24 | "\n",
25 | "- Para bajar los datos, por favor referirse a la clase **Datasets** en la **Sección 1** del curso."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import pandas as pd\n",
35 | "import numpy as np\n",
36 | "\n",
37 | "import matplotlib.pyplot as plt\n",
38 | "\n",
39 | "# estas son las clases para sustitución con sklearn\n",
40 | "from sklearn.impute import SimpleImputer, MissingIndicator\n",
41 | "\n",
42 | "# separar segmentos prueba/entrenamiento\n",
43 | "from sklearn.model_selection import train_test_split"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "# solo usaremos las siguientes variables en el demo:\n",
53 | "\n",
54 | "# una mezcla de variables categóricas y numéricas\n",
55 | "\n",
56 | "cols_to_use = ['BsmtQual', 'FireplaceQu', 'MSZoning',\n",
57 | " 'BsmtUnfSF', 'LotFrontage', 'MasVnrArea',\n",
58 | " 'Street', 'Alley', 'SalePrice']"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "(1460, 9)\n"
71 | ]
72 | },
73 | {
74 | "data": {
75 | "text/html": [
76 | "\n",
77 | "\n",
90 | "
\n",
91 | " \n",
92 | " \n",
93 | " \n",
94 | " MSZoning \n",
95 | " LotFrontage \n",
96 | " Street \n",
97 | " Alley \n",
98 | " MasVnrArea \n",
99 | " BsmtQual \n",
100 | " BsmtUnfSF \n",
101 | " FireplaceQu \n",
102 | " SalePrice \n",
103 | " \n",
104 | " \n",
105 | " \n",
106 | " \n",
107 | " 0 \n",
108 | " RL \n",
109 | " 65.0 \n",
110 | " Pave \n",
111 | " NaN \n",
112 | " 196.0 \n",
113 | " Gd \n",
114 | " 150 \n",
115 | " NaN \n",
116 | " 208500 \n",
117 | " \n",
118 | " \n",
119 | " 1 \n",
120 | " RL \n",
121 | " 80.0 \n",
122 | " Pave \n",
123 | " NaN \n",
124 | " 0.0 \n",
125 | " Gd \n",
126 | " 284 \n",
127 | " TA \n",
128 | " 181500 \n",
129 | " \n",
130 | " \n",
131 | " 2 \n",
132 | " RL \n",
133 | " 68.0 \n",
134 | " Pave \n",
135 | " NaN \n",
136 | " 162.0 \n",
137 | " Gd \n",
138 | " 434 \n",
139 | " TA \n",
140 | " 223500 \n",
141 | " \n",
142 | " \n",
143 | " 3 \n",
144 | " RL \n",
145 | " 60.0 \n",
146 | " Pave \n",
147 | " NaN \n",
148 | " 0.0 \n",
149 | " TA \n",
150 | " 540 \n",
151 | " Gd \n",
152 | " 140000 \n",
153 | " \n",
154 | " \n",
155 | " 4 \n",
156 | " RL \n",
157 | " 84.0 \n",
158 | " Pave \n",
159 | " NaN \n",
160 | " 350.0 \n",
161 | " Gd \n",
162 | " 490 \n",
163 | " TA \n",
164 | " 250000 \n",
165 | " \n",
166 | " \n",
167 | "
\n",
168 | "
"
169 | ],
170 | "text/plain": [
171 | " MSZoning LotFrontage Street Alley MasVnrArea BsmtQual BsmtUnfSF \\\n",
172 | "0 RL 65.0 Pave NaN 196.0 Gd 150 \n",
173 | "1 RL 80.0 Pave NaN 0.0 Gd 284 \n",
174 | "2 RL 68.0 Pave NaN 162.0 Gd 434 \n",
175 | "3 RL 60.0 Pave NaN 0.0 TA 540 \n",
176 | "4 RL 84.0 Pave NaN 350.0 Gd 490 \n",
177 | "\n",
178 | " FireplaceQu SalePrice \n",
179 | "0 NaN 208500 \n",
180 | "1 TA 181500 \n",
181 | "2 TA 223500 \n",
182 | "3 Gd 140000 \n",
183 | "4 TA 250000 "
184 | ]
185 | },
186 | "execution_count": 3,
187 | "metadata": {},
188 | "output_type": "execute_result"
189 | }
190 | ],
191 | "source": [
192 | "# carguemos los datos \n",
193 | "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
194 | "print(data.shape)\n",
195 | "data.head()"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 4,
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "data": {
205 | "text/plain": [
206 | "MSZoning 0.000000\n",
207 | "LotFrontage 0.177397\n",
208 | "Street 0.000000\n",
209 | "Alley 0.937671\n",
210 | "MasVnrArea 0.005479\n",
211 | "BsmtQual 0.025342\n",
212 | "BsmtUnfSF 0.000000\n",
213 | "FireplaceQu 0.472603\n",
214 | "SalePrice 0.000000\n",
215 | "dtype: float64"
216 | ]
217 | },
218 | "execution_count": 4,
219 | "metadata": {},
220 | "output_type": "execute_result"
221 | }
222 | ],
223 | "source": [
224 | "# revisemos los valores nulos\n",
225 | "data.isnull().mean()"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 5,
231 | "metadata": {},
232 | "outputs": [
233 | {
234 | "data": {
235 | "text/plain": [
236 | "((1022, 8), (438, 8))"
237 | ]
238 | },
239 | "execution_count": 5,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "# separar datos en segmentos entrenamiento y prueba\n",
246 | "\n",
247 | "# primero, separemos el target (SalePrice) del resto de las variables (features)\n",
248 | "\n",
249 | "cols_to_use.remove('SalePrice')\n",
250 | "\n",
251 | "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use], # solo las variables\n",
252 | " data['SalePrice'], # el target\n",
253 | " test_size=0.3, # el porcentaje de obs en el segmento de prueba\n",
254 | " random_state=0) # para reproducir\n",
255 | "X_train.shape, X_test.shape"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 6,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "data": {
265 | "text/plain": [
266 | "BsmtQual 0.023483\n",
267 | "FireplaceQu 0.467710\n",
268 | "MSZoning 0.000000\n",
269 | "BsmtUnfSF 0.000000\n",
270 | "LotFrontage 0.184932\n",
271 | "MasVnrArea 0.004892\n",
272 | "Street 0.000000\n",
273 | "Alley 0.939335\n",
274 | "dtype: float64"
275 | ]
276 | },
277 | "execution_count": 6,
278 | "metadata": {},
279 | "output_type": "execute_result"
280 | }
281 | ],
282 | "source": [
283 | "# evaluemos el porcentaje de datos ausentes nuevamente\n",
284 | "X_train.isnull().mean()"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "## Añadir un Indicador de Ausencia (Missing Indicator)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 7,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "data": {
301 | "text/plain": [
302 | "MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,\n",
303 | " sparse='auto')"
304 | ]
305 | },
306 | "execution_count": 7,
307 | "metadata": {},
308 | "output_type": "execute_result"
309 | }
310 | ],
311 | "source": [
312 | "indicator = MissingIndicator(error_on_new=True, features='missing-only')\n",
313 | "indicator.fit(X_train) "
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 8,
319 | "metadata": {},
320 | "outputs": [
321 | {
322 | "data": {
323 | "text/plain": [
324 | "array([0, 1, 4, 5, 7], dtype=int64)"
325 | ]
326 | },
327 | "execution_count": 8,
328 | "metadata": {},
329 | "output_type": "execute_result"
330 | }
331 | ],
332 | "source": [
333 | "# podemos ver las variables con valores nulos na:\n",
334 | "# el resultado muestra el índice (index)\n",
335 | "\n",
336 | "indicator.features_"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 9,
342 | "metadata": {},
343 | "outputs": [
344 | {
345 | "data": {
346 | "text/plain": [
347 | "Index(['BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'Alley'], dtype='object')"
348 | ]
349 | },
350 | "execution_count": 9,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "# podemos encontrar el nombre de las variables pasando el índice \n",
357 | "# a la lista de columnas\n",
358 | "X_train.columns[indicator.features_]"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 10,
364 | "metadata": {},
365 | "outputs": [
366 | {
367 | "data": {
368 | "text/plain": [
369 | "array([[False, True, True, False, True],\n",
370 | " [False, False, True, False, True],\n",
371 | " [False, True, False, False, True],\n",
372 | " ...,\n",
373 | " [ True, True, False, False, True],\n",
374 | " [False, False, True, False, True],\n",
375 | " [False, True, False, False, True]])"
376 | ]
377 | },
378 | "execution_count": 10,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "# el 'indicator' devuelve solamente los indicadores adicionales\n",
385 | "# cuando transformamos los datos\n",
386 | "\n",
387 | "tmp = indicator.transform(X_train)\n",
388 | "\n",
389 | "tmp"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 11,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "data": {
399 | "text/html": [
400 | "\n",
401 | "\n",
414 | "
\n",
415 | " \n",
416 | " \n",
417 | " \n",
418 | " index \n",
419 | " BsmtQual \n",
420 | " FireplaceQu \n",
421 | " MSZoning \n",
422 | " BsmtUnfSF \n",
423 | " LotFrontage \n",
424 | " MasVnrArea \n",
425 | " Street \n",
426 | " Alley \n",
427 | " BsmtQual_NA \n",
428 | " FireplaceQu_NA \n",
429 | " LotFrontage_NA \n",
430 | " MasVnrArea_NA \n",
431 | " Alley_NA \n",
432 | " \n",
433 | " \n",
434 | " \n",
435 | " \n",
436 | " 0 \n",
437 | " 64 \n",
438 | " Gd \n",
439 | " NaN \n",
440 | " RL \n",
441 | " 318 \n",
442 | " NaN \n",
443 | " 573.0 \n",
444 | " Pave \n",
445 | " NaN \n",
446 | " False \n",
447 | " True \n",
448 | " True \n",
449 | " False \n",
450 | " True \n",
451 | " \n",
452 | " \n",
453 | " 1 \n",
454 | " 682 \n",
455 | " Gd \n",
456 | " Gd \n",
457 | " RL \n",
458 | " 288 \n",
459 | " NaN \n",
460 | " 0.0 \n",
461 | " Pave \n",
462 | " NaN \n",
463 | " False \n",
464 | " False \n",
465 | " True \n",
466 | " False \n",
467 | " True \n",
468 | " \n",
469 | " \n",
470 | " 2 \n",
471 | " 960 \n",
472 | " TA \n",
473 | " NaN \n",
474 | " RL \n",
475 | " 162 \n",
476 | " 50.0 \n",
477 | " 0.0 \n",
478 | " Pave \n",
479 | " NaN \n",
480 | " False \n",
481 | " True \n",
482 | " False \n",
483 | " False \n",
484 | " True \n",
485 | " \n",
486 | " \n",
487 | " 3 \n",
488 | " 1384 \n",
489 | " TA \n",
490 | " NaN \n",
491 | " RL \n",
492 | " 356 \n",
493 | " 60.0 \n",
494 | " 0.0 \n",
495 | " Pave \n",
496 | " NaN \n",
497 | " False \n",
498 | " True \n",
499 | " False \n",
500 | " False \n",
501 | " True \n",
502 | " \n",
503 | " \n",
504 | " 4 \n",
505 | " 1100 \n",
506 | " TA \n",
507 | " NaN \n",
508 | " RL \n",
509 | " 0 \n",
510 | " 60.0 \n",
511 | " 0.0 \n",
512 | " Pave \n",
513 | " NaN \n",
514 | " False \n",
515 | " True \n",
516 | " False \n",
517 | " False \n",
518 | " True \n",
519 | " \n",
520 | " \n",
521 | "
\n",
522 | "
"
523 | ],
524 | "text/plain": [
525 | " index BsmtQual FireplaceQu MSZoning BsmtUnfSF LotFrontage MasVnrArea \\\n",
526 | "0 64 Gd NaN RL 318 NaN 573.0 \n",
527 | "1 682 Gd Gd RL 288 NaN 0.0 \n",
528 | "2 960 TA NaN RL 162 50.0 0.0 \n",
529 | "3 1384 TA NaN RL 356 60.0 0.0 \n",
530 | "4 1100 TA NaN RL 0 60.0 0.0 \n",
531 | "\n",
532 | " Street Alley BsmtQual_NA FireplaceQu_NA LotFrontage_NA MasVnrArea_NA \\\n",
533 | "0 Pave NaN False True True False \n",
534 | "1 Pave NaN False False True False \n",
535 | "2 Pave NaN False True False False \n",
536 | "3 Pave NaN False True False False \n",
537 | "4 Pave NaN False True False False \n",
538 | "\n",
539 | " Alley_NA \n",
540 | "0 True \n",
541 | "1 True \n",
542 | "2 True \n",
543 | "3 True \n",
544 | "4 True "
545 | ]
546 | },
547 | "execution_count": 11,
548 | "metadata": {},
549 | "output_type": "execute_result"
550 | }
551 | ],
552 | "source": [
553 | "# ahora necesitamos unirlo manualmente al segmento X_train\n",
554 | "\n",
555 | "# creemos una columna por cada uno de los nuevos indicadores MissingIndicators\n",
556 | "indicator_cols = [c+'_NA' for c in X_train.columns[indicator.features_]]\n",
557 | "\n",
558 | "# y ahora concatenamos\n",
559 | "X_train = pd.concat([\n",
560 | " X_train.reset_index(),\n",
561 | " pd.DataFrame(tmp, columns = indicator_cols)],\n",
562 | " axis=1)\n",
563 | "\n",
564 | "X_train.head()"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": 12,
570 | "metadata": {},
571 | "outputs": [
572 | {
573 | "data": {
574 | "text/html": [
575 | "\n",
576 | "\n",
589 | "
\n",
590 | " \n",
591 | " \n",
592 | " \n",
593 | " index \n",
594 | " BsmtQual \n",
595 | " FireplaceQu \n",
596 | " MSZoning \n",
597 | " BsmtUnfSF \n",
598 | " LotFrontage \n",
599 | " MasVnrArea \n",
600 | " Street \n",
601 | " Alley \n",
602 | " BsmtQual_NA \n",
603 | " FireplaceQu_NA \n",
604 | " LotFrontage_NA \n",
605 | " MasVnrArea_NA \n",
606 | " Alley_NA \n",
607 | " \n",
608 | " \n",
609 | " \n",
610 | " \n",
611 | " 0 \n",
612 | " 529 \n",
613 | " TA \n",
614 | " TA \n",
615 | " RL \n",
616 | " 816 \n",
617 | " NaN \n",
618 | " NaN \n",
619 | " Pave \n",
620 | " NaN \n",
621 | " False \n",
622 | " False \n",
623 | " True \n",
624 | " True \n",
625 | " True \n",
626 | " \n",
627 | " \n",
628 | " 1 \n",
629 | " 491 \n",
630 | " TA \n",
631 | " TA \n",
632 | " RL \n",
633 | " 238 \n",
634 | " 79.0 \n",
635 | " 0.0 \n",
636 | " Pave \n",
637 | " NaN \n",
638 | " False \n",
639 | " False \n",
640 | " False \n",
641 | " False \n",
642 | " True \n",
643 | " \n",
644 | " \n",
645 | " 2 \n",
646 | " 459 \n",
647 | " TA \n",
648 | " TA \n",
649 | " RL \n",
650 | " 524 \n",
651 | " NaN \n",
652 | " 161.0 \n",
653 | " Pave \n",
654 | " NaN \n",
655 | " False \n",
656 | " False \n",
657 | " True \n",
658 | " False \n",
659 | " True \n",
660 | " \n",
661 | " \n",
662 | " 3 \n",
663 | " 279 \n",
664 | " Gd \n",
665 | " TA \n",
666 | " RL \n",
667 | " 768 \n",
668 | " 83.0 \n",
669 | " 299.0 \n",
670 | " Pave \n",
671 | " NaN \n",
672 | " False \n",
673 | " False \n",
674 | " False \n",
675 | " False \n",
676 | " True \n",
677 | " \n",
678 | " \n",
679 | " 4 \n",
680 | " 655 \n",
681 | " TA \n",
682 | " NaN \n",
683 | " RM \n",
684 | " 525 \n",
685 | " 21.0 \n",
686 | " 381.0 \n",
687 | " Pave \n",
688 | " NaN \n",
689 | " False \n",
690 | " True \n",
691 | " False \n",
692 | " False \n",
693 | " True \n",
694 | " \n",
695 | " \n",
696 | "
\n",
697 | "
"
698 | ],
699 | "text/plain": [
700 | " index BsmtQual FireplaceQu MSZoning BsmtUnfSF LotFrontage MasVnrArea \\\n",
701 | "0 529 TA TA RL 816 NaN NaN \n",
702 | "1 491 TA TA RL 238 79.0 0.0 \n",
703 | "2 459 TA TA RL 524 NaN 161.0 \n",
704 | "3 279 Gd TA RL 768 83.0 299.0 \n",
705 | "4 655 TA NaN RM 525 21.0 381.0 \n",
706 | "\n",
707 | " Street Alley BsmtQual_NA FireplaceQu_NA LotFrontage_NA MasVnrArea_NA \\\n",
708 | "0 Pave NaN False False True True \n",
709 | "1 Pave NaN False False False False \n",
710 | "2 Pave NaN False False True False \n",
711 | "3 Pave NaN False False False False \n",
712 | "4 Pave NaN False True False False \n",
713 | "\n",
714 | " Alley_NA \n",
715 | "0 True \n",
716 | "1 True \n",
717 | "2 True \n",
718 | "3 True \n",
719 | "4 True "
720 | ]
721 | },
722 | "execution_count": 12,
723 | "metadata": {},
724 | "output_type": "execute_result"
725 | }
726 | ],
727 | "source": [
728 | "# repetimos para el segmento de prueba\n",
729 | "tmp = indicator.transform(X_test)\n",
730 | "\n",
731 | "X_test = pd.concat([\n",
732 | " X_test.reset_index(),\n",
733 | " pd.DataFrame(tmp, columns = indicator_cols)],\n",
734 | " axis=1)\n",
735 | "\n",
736 | "X_test.head()"
737 | ]
738 | },
739 | {
740 | "cell_type": "markdown",
741 | "metadata": {},
742 | "source": [
743 | "### SimpleImputer en un conjunto de datos diferente"
744 | ]
745 | },
746 | {
747 | "cell_type": "code",
748 | "execution_count": 13,
749 | "metadata": {},
750 | "outputs": [
751 | {
752 | "data": {
753 | "text/plain": [
754 | "SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n",
755 | " missing_values=nan, strategy='most_frequent', verbose=0)"
756 | ]
757 | },
758 | "execution_count": 13,
759 | "metadata": {},
760 | "output_type": "execute_result"
761 | }
762 | ],
763 | "source": [
764 | "# Ahora sustituimos los valores ausentes con el SimpleImputer\n",
765 | "\n",
766 | "# creamos una instancia del SimpleImputer\n",
767 | "# indicamos que queremos sustituir con la \n",
768 | "# categoría más frecuente\n",
769 | "imputer = SimpleImputer(strategy='most_frequent')\n",
770 | "\n",
771 | "# ajustamos el 'imputer' al set de entrenamiento así aprende\n",
772 | "# la moda\n",
773 | "imputer.fit(X_train)"
774 | ]
775 | },
776 | {
777 | "cell_type": "code",
778 | "execution_count": 14,
779 | "metadata": {},
780 | "outputs": [
781 | {
782 | "data": {
783 | "text/plain": [
784 | "array([0, 'TA', 'Gd', 'RL', 0, 60.0, 0.0, 'Pave', 'Pave', False, False,\n",
785 | " False, False, True], dtype=object)"
786 | ]
787 | },
788 | "execution_count": 14,
789 | "metadata": {},
790 | "output_type": "execute_result"
791 | }
792 | ],
793 | "source": [
794 | "# podemos ver cuáles fueron los valores frecuentes aprendidos:\n",
795 | "imputer.statistics_"
796 | ]
797 | },
798 | {
799 | "cell_type": "markdown",
800 | "metadata": {},
801 | "source": [
802 | "**Nota:** el transformer aprende cuál es el valor más frecuente para AMBOS tipos de variables: las categóricas y las numéricas."
803 | ]
804 | },
805 | {
806 | "cell_type": "code",
807 | "execution_count": 15,
808 | "metadata": {},
809 | "outputs": [
810 | {
811 | "data": {
812 | "text/plain": [
813 | "array([[64, 'Gd', 'Gd', ..., True, False, True],\n",
814 | " [682, 'Gd', 'Gd', ..., True, False, True],\n",
815 | " [960, 'TA', 'Gd', ..., False, False, True],\n",
816 | " ...,\n",
817 | " [1216, 'TA', 'Gd', ..., False, False, True],\n",
818 | " [559, 'Gd', 'TA', ..., True, False, True],\n",
819 | " [684, 'Gd', 'Gd', ..., False, False, True]], dtype=object)"
820 | ]
821 | },
822 | "execution_count": 15,
823 | "metadata": {},
824 | "output_type": "execute_result"
825 | }
826 | ],
827 | "source": [
828 | "# y ahora sustituimos ambos segmentos de prueba y entrenamiento\n",
829 | "\n",
830 | "# NOTA: los datos se devuelven como un numpy array!!!\n",
831 | "X_train = imputer.transform(X_train)\n",
832 | "X_test = imputer.transform(X_test)\n",
833 | "\n",
834 | "X_train"
835 | ]
836 | },
837 | {
838 | "cell_type": "code",
839 | "execution_count": null,
840 | "metadata": {},
841 | "outputs": [],
842 | "source": []
843 | }
844 | ],
845 | "metadata": {
846 | "kernelspec": {
847 | "display_name": "feml",
848 | "language": "python",
849 | "name": "feml"
850 | },
851 | "language_info": {
852 | "codemirror_mode": {
853 | "name": "ipython",
854 | "version": 3
855 | },
856 | "file_extension": ".py",
857 | "mimetype": "text/x-python",
858 | "name": "python",
859 | "nbconvert_exporter": "python",
860 | "pygments_lexer": "ipython3",
861 | "version": "3.8.2"
862 | },
863 | "toc": {
864 | "base_numbering": 1,
865 | "nav_menu": {},
866 | "number_sections": true,
867 | "sideBar": true,
868 | "skip_h1_title": false,
869 | "title_cell": "Table of Contents",
870 | "title_sidebar": "Contents",
871 | "toc_cell": false,
872 | "toc_position": {},
873 | "toc_section_display": true,
874 | "toc_window_display": true
875 | }
876 | },
877 | "nbformat": 4,
878 | "nbformat_minor": 2
879 | }
880 |
--------------------------------------------------------------------------------
/Seccion-04-Sustitucion-Datos-Faltantes/04.12_SustitucionCategoriaAdicional_Sklearn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Sustitución usando una etiqueta adicional 'Missing' con Scikit-learn ==> SimpleImputer \n",
8 | "\n",
9 | "En la librería Scikit-learn hay una clase para manejar una gran variedad de métodos de sustitución.\n",
10 | "\n",
11 | "El **SimpleImputer** es una clase que provee funcionalidad básica para la sustitución de valores ausentes, incluyendo:\n",
12 | "\n",
13 | "- Sustitución por la media y la mediana para variables numéricas\n",
14 | "- Sustitución por la categoría más frecuente para variables categóricas.\n",
15 | "- Sustitución por valores arbitrarios para variables numéricas y categóricas.\n",
16 | "\n",
17 | "### Ventajas\n",
18 | "\n",
19 | "- Fácil de usar si se aplica a todo el dataframe\n",
20 | "- Código mantenido por desarrolladores de Scikit-learn: buena calidad\n",
21 | "- Rápida computación (usa NumPy para los cálculos)\n",
22 | "- Permite usar grid-search (búsqueda en cuadrículas) para varios métodos de sustitución\n",
23 | "- Permite usar diferentes valores para codificar ausencia de datos (se puede indicar si por ejemplo los valores nulos son np.nan, ceros, cadenas de caracteres vacías, u otros)\n",
24 | "\n",
25 | "### Limitaciones\n",
26 | "\n",
27 | "- Retorna un arreglo de NumPy en lugar de un dataframe de pandas, lo cual es inconveniente para el análisis de datos\n",
28 | "- Necesita usar clases adicionales para seleccionar cuáles variables se deben sustituir ==>\n",
29 | " - requiere líneas de código adicional\n",
30 | " - requiere ser usado con otras clases\n",
31 | " - no es tan sencillo de usar\n",
32 | " \n",
33 | "### Más detalles acerca de los transformadores (transformers en inglés)\n",
34 | "\n",
35 | "- [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer)\n",
36 | "- [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)\n",
37 | "- [Stackoverflow](https://stackoverflow.com/questions/54160370/how-to-use-sklearn-column-transformer)\n",
38 | "\n",
39 | "\n",
40 | "## En este demo:\n",
41 | "\n",
42 | "Vamos a aprender **sustitución con una etiqueta adicional 'Missing' usando Scikit-learn** con los datos Ames House Price.\n",
43 | "\n",
44 | "- Para bajar los datos, por favor referirse a la clase **Datasets** en la **Sección 1** del curso.\n",
45 | "\n",
46 | "### Nota: \n",
47 | "* 'Imputer' se deriva del verbo en inglés 'to impute' que quiere decir sustituir o reemplazar. Imputer es el objeto que completa la sustitución, de ahí el nombre dado a la clase.\n",
48 | "* 'slicing' significa seleccionar conjuntos de datos (columnas/filas) de un ‘DataFrame’.\n",
49 | "* 'Missing' -> ausente"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 1,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "import pandas as pd\n",
59 | "import numpy as np\n",
60 | "\n",
61 | "import matplotlib.pyplot as plt\n",
62 | "\n",
63 | "# estas son las clases para sustitución con sklearn\n",
64 | "from sklearn.impute import SimpleImputer\n",
65 | "from sklearn.compose import ColumnTransformer\n",
66 | "from sklearn.pipeline import Pipeline\n",
67 | "\n",
68 | "# dividir dataset\n",
69 | "from sklearn.model_selection import train_test_split"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 2,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "data": {
79 | "text/html": [
80 | "\n",
81 | "\n",
94 | "
\n",
95 | " \n",
96 | " \n",
97 | " \n",
98 | " BsmtQual \n",
99 | " FireplaceQu \n",
100 | " SalePrice \n",
101 | " \n",
102 | " \n",
103 | " \n",
104 | " \n",
105 | " 0 \n",
106 | " Gd \n",
107 | " NaN \n",
108 | " 208500 \n",
109 | " \n",
110 | " \n",
111 | " 1 \n",
112 | " Gd \n",
113 | " TA \n",
114 | " 181500 \n",
115 | " \n",
116 | " \n",
117 | " 2 \n",
118 | " Gd \n",
119 | " TA \n",
120 | " 223500 \n",
121 | " \n",
122 | " \n",
123 | " 3 \n",
124 | " TA \n",
125 | " Gd \n",
126 | " 140000 \n",
127 | " \n",
128 | " \n",
129 | " 4 \n",
130 | " Gd \n",
131 | " TA \n",
132 | " 250000 \n",
133 | " \n",
134 | " \n",
135 | "
\n",
136 | "
"
137 | ],
138 | "text/plain": [
139 | " BsmtQual FireplaceQu SalePrice\n",
140 | "0 Gd NaN 208500\n",
141 | "1 Gd TA 181500\n",
142 | "2 Gd TA 223500\n",
143 | "3 TA Gd 140000\n",
144 | "4 Gd TA 250000"
145 | ]
146 | },
147 | "execution_count": 2,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "# solo usaremos las siguientes variables categóricas en el demo:\n",
154 | "\n",
155 | "# estas son las variables categóricas y el target SalePrice\n",
156 | "cols_to_use = ['BsmtQual', 'FireplaceQu', 'SalePrice']\n",
157 | "\n",
158 | "# carguemos los datos \n",
159 | "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
160 | "data.head()"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 3,
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "data": {
170 | "text/plain": [
171 | "BsmtQual 0.025342\n",
172 | "FireplaceQu 0.472603\n",
173 | "SalePrice 0.000000\n",
174 | "dtype: float64"
175 | ]
176 | },
177 | "execution_count": 3,
178 | "metadata": {},
179 | "output_type": "execute_result"
180 | }
181 | ],
182 | "source": [
183 | "# revisemos los valores nulos\n",
184 | "data.isnull().mean()"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "metadata": {},
190 | "source": [
191 | "Las variables categóricas BsmtQual y FireplaceQu tienen datos ausentes\n"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 4,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/plain": [
202 | "((1022, 2), (438, 2))"
203 | ]
204 | },
205 | "execution_count": 4,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | }
209 | ],
210 | "source": [
211 | "# separar datos en segmentos entrenamiento y prueba\n",
212 | "\n",
213 | "# primero, separemos el target (SalePrice) del resto de las variables (features)\n",
214 | "cols_to_use.remove('SalePrice')\n",
215 | "\n",
216 | "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use], # solo las variables\n",
217 | " data['SalePrice'], # el target\n",
218 | " test_size=0.3, # el porcentaje de obs en el segmento de prueba\n",
219 | " random_state=0) # para reproducir\n",
220 | "X_train.shape, X_test.shape"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 5,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "data": {
230 | "text/plain": [
231 | "BsmtQual 0.023483\n",
232 | "FireplaceQu 0.467710\n",
233 | "dtype: float64"
234 | ]
235 | },
236 | "execution_count": 5,
237 | "metadata": {},
238 | "output_type": "execute_result"
239 | }
240 | ],
241 | "source": [
242 | "# evaluemos el porcentaje de datos ausentes nuevamente\n",
243 | "X_train.isnull().mean()"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 6,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "data": {
253 | "text/plain": [
254 | "array(['Gd', 'TA', 'Fa', nan, 'Ex'], dtype=object)"
255 | ]
256 | },
257 | "execution_count": 6,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "# exploremos los valores de la variable categórica\n",
264 | "X_train['BsmtQual'].unique()"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 7,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/plain": [
275 | "array([nan, 'Gd', 'TA', 'Fa', 'Po', 'Ex'], dtype=object)"
276 | ]
277 | },
278 | "execution_count": 7,
279 | "metadata": {},
280 | "output_type": "execute_result"
281 | }
282 | ],
283 | "source": [
284 | "# exploremos los valores de la variable categórica\n",
285 | "X_train['FireplaceQu'].unique()"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 8,
291 | "metadata": {},
292 | "outputs": [
293 | {
294 | "data": {
295 | "text/plain": [
296 | "SimpleImputer(add_indicator=False, copy=True, fill_value='Missing',\n",
297 | " missing_values=nan, strategy='constant', verbose=0)"
298 | ]
299 | },
300 | "execution_count": 8,
301 | "metadata": {},
302 | "output_type": "execute_result"
303 | }
304 | ],
305 | "source": [
306 | "# Ahora sustituyamos los valores faltantes con SimpleImputer\n",
307 | "\n",
308 | "# creemos una instancia de la clase SimpleImputer\n",
309 | "# indicaremos que queremos sustituir los valores nulos\n",
310 | "# con la categoría 'Missing'\n",
311 | "\n",
312 | "imputer = SimpleImputer(strategy='constant', \n",
313 | " fill_value = 'Missing')\n",
314 | "\n",
315 | "# ajustamos el imputer al segmento de entrenamiento\n",
316 | "# en este caso simplemente reemplaza los valores nulos con el valor 'Missing'\n",
317 | "\n",
318 | "imputer.fit(X_train)"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 9,
324 | "metadata": {},
325 | "outputs": [
326 | {
327 | "data": {
328 | "text/plain": [
329 | "array(['Missing', 'Missing'], dtype=object)"
330 | ]
331 | },
332 | "execution_count": 9,
333 | "metadata": {},
334 | "output_type": "execute_result"
335 | }
336 | ],
337 | "source": [
338 | "# veamos los valores ajustados:\n",
339 | "imputer.statistics_"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 10,
345 | "metadata": {},
346 | "outputs": [
347 | {
348 | "data": {
349 | "text/plain": [
350 | "array([['Gd', 'Missing'],\n",
351 | " ['Gd', 'Gd'],\n",
352 | " ['TA', 'Missing'],\n",
353 | " ...,\n",
354 | " ['Missing', 'Missing'],\n",
355 | " ['Gd', 'TA'],\n",
356 | " ['Gd', 'Missing']], dtype=object)"
357 | ]
358 | },
359 | "execution_count": 10,
360 | "metadata": {},
361 | "output_type": "execute_result"
362 | }
363 | ],
364 | "source": [
365 | "# ahora sustituyamos en los segmentos de entrenamiento y prueba\n",
366 | "\n",
367 | "# NOTA: los datos son devueltos como un numpy array!!\n",
368 | "X_train = imputer.transform(X_train)\n",
369 | "X_test = imputer.transform(X_test)\n",
370 | "\n",
371 | "X_train"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": 11,
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "data": {
381 | "text/html": [
382 | "\n",
383 | "\n",
396 | "
\n",
397 | " \n",
398 | " \n",
399 | " \n",
400 | " BsmtQual \n",
401 | " FireplaceQu \n",
402 | " \n",
403 | " \n",
404 | " \n",
405 | " \n",
406 | " 0 \n",
407 | " Gd \n",
408 | " Missing \n",
409 | " \n",
410 | " \n",
411 | " 1 \n",
412 | " Gd \n",
413 | " Gd \n",
414 | " \n",
415 | " \n",
416 | " 2 \n",
417 | " TA \n",
418 | " Missing \n",
419 | " \n",
420 | " \n",
421 | " 3 \n",
422 | " TA \n",
423 | " Missing \n",
424 | " \n",
425 | " \n",
426 | " 4 \n",
427 | " TA \n",
428 | " Missing \n",
429 | " \n",
430 | " \n",
431 | "
\n",
432 | "
"
433 | ],
434 | "text/plain": [
435 | " BsmtQual FireplaceQu\n",
436 | "0 Gd Missing\n",
437 | "1 Gd Gd\n",
438 | "2 TA Missing\n",
439 | "3 TA Missing\n",
440 | "4 TA Missing"
441 | ]
442 | },
443 | "execution_count": 11,
444 | "metadata": {},
445 | "output_type": "execute_result"
446 | }
447 | ],
448 | "source": [
449 | "# transformemos el segmento de entrenamiento en un dataframe:\n",
450 | "\n",
451 | "X_train = pd.DataFrame(X_train, columns=cols_to_use)\n",
452 | "X_train.head()"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": 12,
458 | "metadata": {},
459 | "outputs": [
460 | {
461 | "data": {
462 | "text/plain": [
463 | "array(['Gd', 'TA', 'Fa', 'Missing', 'Ex'], dtype=object)"
464 | ]
465 | },
466 | "execution_count": 12,
467 | "metadata": {},
468 | "output_type": "execute_result"
469 | }
470 | ],
471 | "source": [
472 | "X_train['BsmtQual'].unique()"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 13,
478 | "metadata": {},
479 | "outputs": [
480 | {
481 | "data": {
482 | "text/plain": [
483 | "BsmtQual 0.0\n",
484 | "FireplaceQu 0.0\n",
485 | "dtype: float64"
486 | ]
487 | },
488 | "execution_count": 13,
489 | "metadata": {},
490 | "output_type": "execute_result"
491 | }
492 | ],
493 | "source": [
494 | "X_train.isnull().mean()"
495 | ]
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {},
500 | "source": [
501 | "**ADVERTENCIA**:\n",
502 | "\n",
503 | "Cuando usamos SimpleImputer y fijamos los parámetros:\n",
504 | "- strategy='constant'\n",
505 | "- fill_value = 'Missing'\n",
506 | "\n",
507 | "Si el dataframe contiene variables numéricas y categóricas, los valores nulos (NA) de ambos tipos serán reemplazados con 'Missing' y, por lo tanto, las variables numéricas se convertirán en categóricas, lo que probablemente no es el efecto deseado.\n",
508 | "\n",
509 | "La mayoría de los sets de datos contienen variables numéricas y categóricas; por lo tanto, lo más probable es que tengas que usar un transformador (ColumnTransformer) para seleccionar las columnas, como mostramos en los notebooks previos y en las siguientes celdas.\n"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 14,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "data": {
519 | "text/html": [
520 | "\n",
521 | "\n",
534 | "
\n",
535 | " \n",
536 | " \n",
537 | " \n",
538 | " LotFrontage \n",
539 | " MasVnrArea \n",
540 | " BsmtQual \n",
541 | " FireplaceQu \n",
542 | " GarageYrBlt \n",
543 | " SalePrice \n",
544 | " \n",
545 | " \n",
546 | " \n",
547 | " \n",
548 | " 0 \n",
549 | " 65.0 \n",
550 | " 196.0 \n",
551 | " Gd \n",
552 | " NaN \n",
553 | " 2003.0 \n",
554 | " 208500 \n",
555 | " \n",
556 | " \n",
557 | " 1 \n",
558 | " 80.0 \n",
559 | " 0.0 \n",
560 | " Gd \n",
561 | " TA \n",
562 | " 1976.0 \n",
563 | " 181500 \n",
564 | " \n",
565 | " \n",
566 | " 2 \n",
567 | " 68.0 \n",
568 | " 162.0 \n",
569 | " Gd \n",
570 | " TA \n",
571 | " 2001.0 \n",
572 | " 223500 \n",
573 | " \n",
574 | " \n",
575 | " 3 \n",
576 | " 60.0 \n",
577 | " 0.0 \n",
578 | " TA \n",
579 | " Gd \n",
580 | " 1998.0 \n",
581 | " 140000 \n",
582 | " \n",
583 | " \n",
584 | " 4 \n",
585 | " 84.0 \n",
586 | " 350.0 \n",
587 | " Gd \n",
588 | " TA \n",
589 | " 2000.0 \n",
590 | " 250000 \n",
591 | " \n",
592 | " \n",
593 | "
\n",
594 | "
"
595 | ],
596 | "text/plain": [
597 | " LotFrontage MasVnrArea BsmtQual FireplaceQu GarageYrBlt SalePrice\n",
598 | "0 65.0 196.0 Gd NaN 2003.0 208500\n",
599 | "1 80.0 0.0 Gd TA 1976.0 181500\n",
600 | "2 68.0 162.0 Gd TA 2001.0 223500\n",
601 | "3 60.0 0.0 TA Gd 1998.0 140000\n",
602 | "4 84.0 350.0 Gd TA 2000.0 250000"
603 | ]
604 | },
605 | "execution_count": 14,
606 | "metadata": {},
607 | "output_type": "execute_result"
608 | }
609 | ],
610 | "source": [
611 | "# carguemos los datos con variables numéricas y categóricas\n",
612 | "\n",
613 | "cols_to_use = [\n",
614 | " 'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',\n",
615 | " 'SalePrice'\n",
616 | "]\n",
617 | "\n",
618 | "data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)\n",
619 | "data.head()"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": 15,
625 | "metadata": {},
626 | "outputs": [
627 | {
628 | "data": {
629 | "text/plain": [
630 | "((1022, 5), (438, 5))"
631 | ]
632 | },
633 | "execution_count": 15,
634 | "metadata": {},
635 | "output_type": "execute_result"
636 | }
637 | ],
638 | "source": [
639 | "# separar datos en segmentos entrenamiento y prueba\n",
640 | "\n",
641 | "# primero descartemos el target de la lista de variables\n",
642 | "cols_to_use.remove('SalePrice')\n",
643 | "\n",
644 | "X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],\n",
645 | " data['SalePrice'],\n",
646 | " test_size=0.3,\n",
647 | " random_state=0)\n",
648 | "X_train.shape, X_test.shape"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": 16,
654 | "metadata": {},
655 | "outputs": [
656 | {
657 | "data": {
658 | "text/plain": [
659 | "BsmtQual 0.023483\n",
660 | "FireplaceQu 0.467710\n",
661 | "LotFrontage 0.184932\n",
662 | "MasVnrArea 0.004892\n",
663 | "GarageYrBlt 0.052838\n",
664 | "dtype: float64"
665 | ]
666 | },
667 | "execution_count": 16,
668 | "metadata": {},
669 | "output_type": "execute_result"
670 | }
671 | ],
672 | "source": [
673 | "# revisemos los valores nulos\n",
674 | "X_train.isnull().mean()"
675 | ]
676 | },
677 | {
678 | "cell_type": "markdown",
679 | "metadata": {},
680 | "source": [
681 | "En este demo, vamos a sustituir los valores nulos de las variables numéricas por la media y los de las variables categóricas por la nueva etiqueta 'Missing'."
682 | ]
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": 17,
687 | "metadata": {},
688 | "outputs": [],
689 | "source": [
690 | "# primero vamos a crear dos listas, indicando cuáles son las\n",
691 | "# variables a sustituir con cada método\n",
692 | "\n",
693 | "features_numeric = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']\n",
694 | "features_categoric = ['BsmtQual', 'FireplaceQu']\n",
695 | "\n",
696 | "# luego vamos a instanciar los imputers:\n",
697 | "# creamos un imputer por cada tipo de variable,\n",
698 | "# uno que sustituye con la media (numéricas) y otro con la etiqueta 'Missing' (categóricas)\n",
699 | "\n",
700 | "\n",
701 | "# luego juntamos los transformadores con sus listas de variables\n",
702 | "# usando el ColumnTransformer\n",
703 | "\n",
704 | "preprocessor = ColumnTransformer(transformers=[\n",
705 | " ('imputer_numeric', SimpleImputer(strategy='mean'), features_numeric),\n",
706 | " ('imputer_categoric', SimpleImputer(strategy='constant', fill_value='Missing'), features_categoric)])"
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": 18,
712 | "metadata": {},
713 | "outputs": [
714 | {
715 | "data": {
716 | "text/plain": [
717 | "ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\n",
718 | " transformer_weights=None,\n",
719 | " transformers=[('imputer_numeric',\n",
720 | " SimpleImputer(add_indicator=False, copy=True,\n",
721 | " fill_value=None,\n",
722 | " missing_values=nan,\n",
723 | " strategy='mean', verbose=0),\n",
724 | " ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']),\n",
725 | " ('imputer_categoric',\n",
726 | " SimpleImputer(add_indicator=False, copy=True,\n",
727 | " fill_value='Missing',\n",
728 | " missing_values=nan,\n",
729 | " strategy='constant', verbose=0),\n",
730 | " ['BsmtQual', 'FireplaceQu'])],\n",
731 | " verbose=False)"
732 | ]
733 | },
734 | "execution_count": 18,
735 | "metadata": {},
736 | "output_type": "execute_result"
737 | }
738 | ],
739 | "source": [
740 | "# ajustemos el preprocessor\n",
741 | "preprocessor.fit(X_train)"
742 | ]
743 | },
744 | {
745 | "cell_type": "code",
746 | "execution_count": 19,
747 | "metadata": {},
748 | "outputs": [
749 | {
750 | "data": {
751 | "text/plain": [
752 | "[('imputer_numeric',\n",
753 | " SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n",
754 | " missing_values=nan, strategy='mean', verbose=0),\n",
755 | " ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']),\n",
756 | " ('imputer_categoric',\n",
757 | " SimpleImputer(add_indicator=False, copy=True, fill_value='Missing',\n",
758 | " missing_values=nan, strategy='constant', verbose=0),\n",
759 | " ['BsmtQual', 'FireplaceQu'])]"
760 | ]
761 | },
762 | "execution_count": 19,
763 | "metadata": {},
764 | "output_type": "execute_result"
765 | }
766 | ],
767 | "source": [
768 | "# podemos explorar el transformador:\n",
769 | "preprocessor.transformers"
770 | ]
771 | },
772 | {
773 | "cell_type": "code",
774 | "execution_count": 20,
775 | "metadata": {},
776 | "outputs": [
777 | {
778 | "data": {
779 | "text/plain": [
780 | "array([ 69.66866747, 103.55358899, 1978.01239669])"
781 | ]
782 | },
783 | "execution_count": 20,
784 | "metadata": {},
785 | "output_type": "execute_result"
786 | }
787 | ],
788 | "source": [
789 | "# podemos ver los parámetros ajustados:\n",
790 | "\n",
791 | "# para el imputer de las variables numéricas\n",
792 | "preprocessor.named_transformers_['imputer_numeric'].statistics_"
793 | ]
794 | },
795 | {
796 | "cell_type": "code",
797 | "execution_count": 21,
798 | "metadata": {},
799 | "outputs": [
800 | {
801 | "data": {
802 | "text/plain": [
803 | "array(['Missing', 'Missing'], dtype=object)"
804 | ]
805 | },
806 | "execution_count": 21,
807 | "metadata": {},
808 | "output_type": "execute_result"
809 | }
810 | ],
811 | "source": [
812 | "# para el imputer de las variables categóricas\n",
813 | "preprocessor.named_transformers_['imputer_categoric'].statistics_"
814 | ]
815 | },
816 | {
817 | "cell_type": "code",
818 | "execution_count": 22,
819 | "metadata": {},
820 | "outputs": [],
821 | "source": [
822 | "# y ahora podemos sustituir los valores nulos en los segmentos de entrenamiento y prueba\n",
823 | "# recuerda que los datos retornados son numpy arrays\n",
824 | "\n",
825 | "X_train = preprocessor.transform(X_train)\n",
826 | "X_test = preprocessor.transform(X_test)"
827 | ]
828 | },
829 | {
830 | "cell_type": "code",
831 | "execution_count": 23,
832 | "metadata": {},
833 | "outputs": [
834 | {
835 | "data": {
836 | "text/html": [
837 | "\n",
838 | "\n",
851 | "
\n",
852 | " \n",
853 | " \n",
854 | " \n",
855 | " LotFrontage \n",
856 | " MasVnrArea \n",
857 | " GarageYrBlt \n",
858 | " BsmtQual \n",
859 | " FireplaceQu \n",
860 | " \n",
861 | " \n",
862 | " \n",
863 | " \n",
864 | " 0 \n",
865 | " 69.6687 \n",
866 | " 573 \n",
867 | " 1998 \n",
868 | " Gd \n",
869 | " Missing \n",
870 | " \n",
871 | " \n",
872 | " 1 \n",
873 | " 69.6687 \n",
874 | " 0 \n",
875 | " 1996 \n",
876 | " Gd \n",
877 | " Gd \n",
878 | " \n",
879 | " \n",
880 | " 2 \n",
881 | " 50 \n",
882 | " 0 \n",
883 | " 1978.01 \n",
884 | " TA \n",
885 | " Missing \n",
886 | " \n",
887 | " \n",
888 | " 3 \n",
889 | " 60 \n",
890 | " 0 \n",
891 | " 1939 \n",
892 | " TA \n",
893 | " Missing \n",
894 | " \n",
895 | " \n",
896 | " 4 \n",
897 | " 60 \n",
898 | " 0 \n",
899 | " 1930 \n",
900 | " TA \n",
901 | " Missing \n",
902 | " \n",
903 | " \n",
904 | "
\n",
905 | "
"
906 | ],
907 | "text/plain": [
908 | " LotFrontage MasVnrArea GarageYrBlt BsmtQual FireplaceQu\n",
909 | "0 69.6687 573 1998 Gd Missing\n",
910 | "1 69.6687 0 1996 Gd Gd\n",
911 | "2 50 0 1978.01 TA Missing\n",
912 | "3 60 0 1939 TA Missing\n",
913 | "4 60 0 1930 TA Missing"
914 | ]
915 | },
916 | "execution_count": 23,
917 | "metadata": {},
918 | "output_type": "execute_result"
919 | }
920 | ],
921 | "source": [
922 | "# ahora convirtamos el resultado en un dataframe (las columnas siguen el orden de los transformadores: numéricas y luego categóricas)\n",
923 | "pd.DataFrame(X_train,\n",
924 | " columns=features_numeric+features_categoric).head()"
925 | ]
926 | },
927 | {
928 | "cell_type": "code",
929 | "execution_count": 24,
930 | "metadata": {},
931 | "outputs": [
932 | {
933 | "data": {
934 | "text/plain": [
935 | "LotFrontage 0.0\n",
936 | "MasVnrArea 0.0\n",
937 | "GarageYrBlt 0.0\n",
938 | "BsmtQual 0.0\n",
939 | "FireplaceQu 0.0\n",
940 | "dtype: float64"
941 | ]
942 | },
943 | "execution_count": 24,
944 | "metadata": {},
945 | "output_type": "execute_result"
946 | }
947 | ],
948 | "source": [
949 | "# ahora convertimos el resultado en un dataframe\n",
950 | "# y exploramos los valores ausentes\n",
951 | "# (no debería haber ninguno)\n",
952 | "\n",
953 | "\n",
954 | "X_train = pd.DataFrame(X_train,\n",
955 | " columns=features_numeric+features_categoric)\n",
956 | "\n",
957 | "X_train.isnull().mean()"
958 | ]
959 | }
960 | ],
961 | "metadata": {
962 | "kernelspec": {
963 | "display_name": "feml",
964 | "language": "python",
965 | "name": "feml"
966 | },
967 | "language_info": {
968 | "codemirror_mode": {
969 | "name": "ipython",
970 | "version": 3
971 | },
972 | "file_extension": ".py",
973 | "mimetype": "text/x-python",
974 | "name": "python",
975 | "nbconvert_exporter": "python",
976 | "pygments_lexer": "ipython3",
977 | "version": "3.8.2"
978 | },
979 | "toc": {
980 | "base_numbering": 1,
981 | "nav_menu": {},
982 | "number_sections": true,
983 | "sideBar": true,
984 | "skip_h1_title": false,
985 | "title_cell": "Table of Contents",
986 | "title_sidebar": "Contents",
987 | "toc_cell": false,
988 | "toc_position": {},
989 | "toc_section_display": true,
990 | "toc_window_display": true
991 | }
992 | },
993 | "nbformat": 4,
994 | "nbformat_minor": 2
995 | }
996 |
--------------------------------------------------------------------------------