├── Basic_Pandas_Test ├── Basic_Pandas_test.ipynb └── readme.md ├── CheatSheet - Exploración de datos usando Pandas ├── CheatSheet, Exploración de datos usando Pandas.ipynb └── README.md ├── EDA Iris ├── Lab. EDA Iris .ipynb └── readme.md ├── Filtrado con Pandas ├── Técnicas de filtrado de datos con Pandas.ipynb └── readme.md ├── Librerias_EDA ├── EDA utilizando la librería de Python Sweetviz.ipynb ├── Librería Datatile .ipynb ├── README.md └── fast EDA.ipynb ├── Pandas Style. ├── Pandas-Style.ipynb └── readme.md ├── Pandas_Tricks ├── Código │ ├── Trucos para usar en la librería pandas (parte1).ipynb │ ├── Trucos para usar en la librería pandas(parte2).ipynb │ ├── pivot_table().ipynb │ └── readme.md └── readme.md ├── README.md ├── Sales Data Analysis ├── - ├── CSV │ ├── README.md │ ├── Sales_April_2019.csv │ ├── Sales_August_2019.csv │ ├── Sales_December_2019.csv │ ├── Sales_February_2019.csv │ ├── Sales_January_2019.csv │ ├── Sales_July_2019.csv │ ├── Sales_June_2019.csv │ ├── Sales_March_2019.csv │ ├── Sales_May_2019.csv │ ├── Sales_November_2019.csv │ ├── Sales_October_2019.csv │ └── Sales_September_2019.csv ├── README.md └── Sales Data Analysis.ipynb ├── Series y dataframes ├── Series en Python.ipynb └── readme.md ├── Tabula ├── Tabula python.pdf └── readme.md ├── Titanic Dataset ├── EDA - Dataset Titanic.ipynb ├── Outliers.ipynb ├── Portada │ ├── - │ └── 2.png ├── README.md ├── Titanic ejercicio.ipynb ├── Titanic group by.ipynb ├── _Visualizar datos faltantes con missingno.ipynb └── titanic_train.csv └── Web_scraping_BeautifulSoup ├── BeautifulSoup_Ejemplo_libros.ipynb └── readme.md /Basic_Pandas_Test/Basic_Pandas_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "cJ19HeAGZnzw" 7 | }, 8 | "source": [ 9 | "# Basic Pandas Test" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "id": 
"ttELfGTVAAx_" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "url = 'https://raw.githubusercontent.com/bcamandone/Python_Analisis_datos/main/Titanic%20Dataset/titanic_train.csv'\n", 22 | "titanic = pd.read_csv(url)\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "cellView": "form", 30 | "id": "vSID9Bmj-MJH" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "#@markdown ### Preguntas { display-mode: \"form\" }\n", 35 | "#@markdown 1. ¿Cuántas filas se muestran al ejecutar este código, asumiendo que el dataframe (titanic) contiene 1000 registros??\n", 36 | "\n", 37 | " #@markdown data.head()\n", 38 | "\n", 39 | "#@markdown Marque la correcta:\n", 40 | "#@markdown - a. 1\n", 41 | "#@markdown - b. 5\n", 42 | "#@markdown - c. 10\n", 43 | "#@markdown - d. 100\n", 44 | "\n", 45 | "\n", 46 | "item_1 = \"Seleccionar respuesta\" # @param [\"a\", \"b\", \"c\", \"d\",\"Seleccionar respuesta\"]" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "cellView": "form", 54 | "id": "kpnyTCIu_M8q" 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "#@markdown{ display-mode: \"form\" }\n", 59 | "#@markdown 2.¿Qué instrucción me permite conocer el tamaño que tiene el Dataframe del titanic?\n", 60 | "\n", 61 | "\n", 62 | "#@markdown Marque la correcta:\n", 63 | "#@markdown - a. len(titanic)\n", 64 | "#@markdown - b. titanic.columns\n", 65 | "#@markdown - c. 
titanic.shape\n", 66 | "\n", 67 | "\n", 68 | "\n", 69 | "item_2 = \"Seleccionar respuesta\" # @param [\"a\", \"b\", \"c\",\"Seleccionar respuesta\"]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "cellView": "form", 77 | "id": "PmRHv-Q9DdVb" 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "#@markdown{ display-mode: \"form\" }\n", 82 | "#@markdown 3.Se cuenta con información en un archivo CSV, sin embargo al momento de crear este archivo los datos fueron separados no con comas (,) sino con punto y comas (;). ¿Qué instrucción me permite cargar esta data?\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "#@markdown Marque la correcta:\n", 87 | "#@markdown - a. titanic = pd.read_csv('titanic.csv', divide=';')\n", 88 | "#@markdown - b. titanic = pd.read_csv('titanic.csv', separator=';')\n", 89 | "#@markdown - c. titanic = pd.read_csv('titanic.csv', delimiter=';')\n", 90 | "\n", 91 | "\n", 92 | "\n", 93 | "item_3 = \"Seleccionar respuesta\" # @param [\"a\", \"b\", \"c\",\"Seleccionar respuesta\"]" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "cellView": "form", 101 | "id": "nbchipYuDnM3" 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "#@markdown{ display-mode: \"form\" }\n", 106 | "#@markdown 4. ¿Cuál de las sentencias muestra la siguiente información del conjunto de datos?\n", 107 | "\n", 108 | "\n", 109 | "\n", 110 | "#@markdown Marque la correcta:\n", 111 | "#@markdown - a. titanic.info()\n", 112 | "#@markdown - b. titanic.summary()\n", 113 | "#@markdown - c. 
titanic.describe()\n", 114 | "\n", 115 | "\n", 116 | "\n", 117 | "item_4 = \"Seleccionar respuesta\" # @param [\"a\", \"b\", \"c\",\"Seleccionar respuesta\"]" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "id": "k_oXKh1AFACi" 124 | }, 125 | "source": [ 126 | "![5.png]()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "cellView": "form", 134 | "id": "WUXS656ODn67" 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "#@markdown{ display-mode: \"form\" }\n", 139 | "#@markdown 5. ¿Qué instrucción puedo utilizar para obtener los 10 últimos elementos del Dataframe titanic?\n", 140 | "\n", 141 | "\n", 142 | "#@markdown Marque la correcta:\n", 143 | "#@markdown - a. titanic.last(10)\n", 144 | "#@markdown - b. titanic.tail(10)\n", 145 | "#@markdown - c. titanic.limit(10)\n", 146 | "\n", 147 | "\n", 148 | "\n", 149 | "item_5 = \"Seleccionar respuesta\" # @param [\"a\", \"b\", \"c\",\"Seleccionar respuesta\"]" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "cellView": "form", 157 | "id": "QPx5zVpYDogD" 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "#@markdown{ display-mode: \"form\" }\n", 162 | "#@markdown 6. Obtener aquellos pasajeros que pagaron un ticket > 100, ¿Cuál es la instrucción que nos permite obtener este conjunto de datos?\n", 163 | "\n", 164 | "\n", 165 | "\n", 166 | "#@markdown Marque la correcta:\n", 167 | "#@markdown - a. titanic['Fare' > 100]\n", 168 | "#@markdown - b. titanic['Fare' < 100]\n", 169 | "#@markdown - c. titanic[titanic['Fare'] > 100]\n", 170 | "#@markdown - d. 
titanic[titanic('Fare') > 100]\n", 171 | "\n", 172 | "\n", 173 | "\n", 174 | "item_6 = \"Seleccionar respuesta\" # @param [\"a\", \"b\", \"c\",\"d\",\"Seleccionar respuesta\"]" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "cellView": "form", 182 | "id": "tSof1Sn1G4Zf" 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "#@markdown{ display-mode: \"form\" }\n", 187 | "#@markdown 7. La siguiente línea de código: titanic.sort_values('Fare') ordena el dataframe del titanic en base a la columna Fare de mayor a menor\n", 188 | "\n", 189 | "\n", 190 | "\n", 191 | "#@markdown Marque la correcta:\n", 192 | "#@markdown - a. Verdadero\n", 193 | "#@markdown - b. Falso\n", 194 | "\n", 195 | "\n", 196 | "item_7 = \"Seleccionar respuesta\" # @param [\"a\", \"b\",\"Seleccionar respuesta\"]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "cellView": "form", 204 | "id": "pGIdlfNVHVcP" 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "#@markdown{ display-mode: \"form\" }\n", 209 | "#@markdown 8. La siguiente línea de código: titanic['Survived'].value_counts() me permite saber cuántos sobrevivientes tuvo el Titanic\n", 210 | "\n", 211 | "\n", 212 | "\n", 213 | "#@markdown Marque la correcta:\n", 214 | "#@markdown - a. Verdadero\n", 215 | "#@markdown - b. 
Falso\n", 216 | "\n", 217 | "\n", 218 | "\n", 219 | "\n", 220 | "item_8 = \"Seleccionar respuesta\" # @param [\"a\", \"b\",\"Seleccionar respuesta\"]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "cellView": "form", 228 | "id": "MZBvJNaPIfMY" 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "#@markdown{ display-mode: \"form\" }\n", 233 | "#@markdown 9. La siguiente línea de código: titanic.loc[titanic['Sex'] =='female','Fare'].mean() devuelve la tarifa que pagaron en promedio los hombres\n", 234 | "\n", 235 | "\n", 236 | "\n", 237 | "#@markdown Marque la correcta:\n", 238 | "#@markdown - a. Verdadero\n", 239 | "#@markdown - b. Falso\n", 240 | "\n", 241 | "\n", 242 | "\n", 243 | "\n", 244 | "item_9 = \"Seleccionar respuesta\" # @param [\"a\", \"b\",\"Seleccionar respuesta\"]" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "cellView": "form", 252 | "id": "IH4s4Z6gIf7I" 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "#@markdown{ display-mode: \"form\" }\n", 257 | "#@markdown 10. La siguiente línea de código: titanic.groupby('Sex')['Age'].mean() devuelve el promedio de edad agrupado por sexo\n", 258 | "\n", 259 | "\n", 260 | "\n", 261 | "#@markdown Marque la correcta:\n", 262 | "#@markdown - a. Verdadero\n", 263 | "#@markdown - b. 
Falso\n", 264 | "\n", 265 | "\n", 266 | "\n", 267 | "\n", 268 | "item_10 = \"Seleccionar respuesta\" # @param [\"a\", \"b\",\"Seleccionar respuesta\"]" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "colab": { 274 | "provenance": [] 275 | }, 276 | "kernelspec": { 277 | "display_name": "Python 3", 278 | "name": "python3" 279 | }, 280 | "language_info": { 281 | "name": "python" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 0 286 | } 287 | -------------------------------------------------------------------------------- /Basic_Pandas_Test/readme.md: -------------------------------------------------------------------------------- 1 | 🐼 Test básico de Pandas - Pon a prueba tus conocimientos 🚀 2 | 3 | ¿Estás aprendiendo Pandas y quieres poner a prueba tus conocimientos? ¡He diseñado un test interactivo de 10 preguntas en Google Colab! 4 | 5 | 📊 ¿Por qué Pandas? 6 | Pandas es una biblioteca de Python ampliamente utilizada en ciencia de datos para la manipulación y análisis de datos. 7 | 8 | 👩‍💻 ¿Cómo funciona? 9 | Simplemente sigue este enlace [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bcamandone/Python_Analisis_datos/blob/main/Basic_Pandas_Test/Basic_Pandas_test.ipynb) 10 | y ¡comienza a responder las preguntas! 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /CheatSheet - Exploración de datos usando Pandas/README.md: -------------------------------------------------------------------------------- 1 | ## Exploración de datos usando Pandas 2 | 3 | 4 | Este CheatSheet que realicé en Jupyter Notebook contiene código de ejemplo para ejecutar una Exploración de datos usando Pandas. 5 | 6 | ¡El dataset que utilicé es el famoso del Titanic! 7 | 8 | El CheatSheet cuenta con 12 secciones: 9 | 10 | 11 | 1. Lectura de los datos 12 | 13 | 2. Obtener una primera vista de los datos 14 | 15 | 3. 
Modificar el tipo de datos de una columna 16 | 17 | 4. Renombrar una columna 18 | 19 | 5. Ordenar 20 | 21 | 6. Seleccionar y filtrar 22 | 23 | 7. Chequear duplicados 24 | 25 | 8. Manejo de valores faltantes 26 | 27 | 9. Crear una nueva columna 28 | 29 | 10. Identificar valores únicos 30 | 31 | 11. Realizar cálculos y agrupaciones 32 | 33 | 12. Estadísticos básicos 34 | 35 | 36 | 👏 ¡Por favor, dame un ⭐️ si te gusta este repo! 37 | -------------------------------------------------------------------------------- /EDA Iris/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Filtrado con Pandas/readme.md: -------------------------------------------------------------------------------- 1 | ## Técnicas de filtrado con pandas 🔍🔍 2 | 3 | En el análisis de datos cotidiano con Python, consultar y filtrar datos es una de las tareas más comunes. Esto nos ayuda a extraer los datos específicos que necesitamos para nuestro análisis. 4 | 5 | En este notebook vas a encontrar 6 técnicas de uso común de filtrado y consulta de datos. 
6 | -------------------------------------------------------------------------------- /Librerias_EDA/EDA utilizando la librería de Python Sweetviz.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8ea3099a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "2c83fe77", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "#Utilizamos pandas para importar el CSV\n", 21 | "df = pd.read_csv('Advertising.csv')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "id": "a54d842e", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "#importamos la libreria sweetviz\n", 32 | "import sweetviz as sv" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "id": "41594b26", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "application/vnd.jupyter.widget-view+json": { 44 | "model_id": "ba70029db0a140ac94ab8790584f8351", 45 | "version_major": 2, 46 | "version_minor": 0 47 | }, 48 | "text/plain": [ 49 | " | | [ 0%] 00:00 -> (? left)" 50 | ] 51 | }, 52 | "metadata": {}, 53 | "output_type": "display_data" 54 | } 55 | ], 56 | "source": [ 57 | "#Analizamos el dataset\n", 58 | "reporte = sv.analyze(df)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "id": "db61fbdc", 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "Report Advertising.html was generated! 
NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "#Mostramos el informe\n", 77 | "reporte.show_html('Advertising.html')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "4bd7c5fa", 83 | "metadata": {}, 84 | "source": [ 85 | "By María Belén Camandone" 86 | ] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "Python 3 (ipykernel)", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 | "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "3.9.13" 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 5 110 | } 111 | -------------------------------------------------------------------------------- /Librerias_EDA/Librería Datatile .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "8ea3099a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 3, 16 | "id": "2c83fe77", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "#Utilizamos pandas para importar el CSV\n", 21 | "df = pd.read_csv('Advertising.csv')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 4, 27 | "id": "7b13f36e", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | "
TVRadioNewspaperSales
0230.137.869.222.1
144.539.345.110.4
217.245.969.39.3
3151.541.358.518.5
\n", 88 | "
" 89 | ], 90 | "text/plain": [ 91 | " TV Radio Newspaper Sales\n", 92 | "0 230.1 37.8 69.2 22.1\n", 93 | "1 44.5 39.3 45.1 10.4\n", 94 | "2 17.2 45.9 69.3 9.3\n", 95 | "3 151.5 41.3 58.5 18.5" 96 | ] 97 | }, 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "df.head(4)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "id": "a54d842e", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "#importamos datatile\n", 115 | "from datatile.summary.df import DataFrameSummary" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "id": "41594b26", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "#Analizamos el dataset\n", 126 | "dff = DataFrameSummary(df)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "cdcbd93d", 132 | "metadata": {}, 133 | "source": [ 134 | "El método .summary() devuelve la cantidad y el porcentaje de valores faltantes, \n", 135 | "la cantidad de valores únicos, el tipo de datos y las estadísticas de resumen para cada columna. " 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 11, 141 | "id": "09ac9ca9", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/html": [ 147 | "
\n", 148 | "\n", 161 | "\n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | "
TVRadioNewspaperSales
count200.0200.0200.0200.0
mean147.042523.26430.55414.0225
std85.85423614.84680921.7786215.217457
min0.70.00.31.6
25%74.3759.97512.7510.375
50%149.7522.925.7512.9
75%218.82536.52545.117.4
max296.449.6114.027.0
counts200200200200
uniques190167172121
missing0000
missing_perc0%0%0%0%
typesnumericnumericnumericnumeric
\n", 265 | "
" 266 | ], 267 | "text/plain": [ 268 | " TV Radio Newspaper Sales\n", 269 | "count 200.0 200.0 200.0 200.0\n", 270 | "mean 147.0425 23.264 30.554 14.0225\n", 271 | "std 85.854236 14.846809 21.778621 5.217457\n", 272 | "min 0.7 0.0 0.3 1.6\n", 273 | "25% 74.375 9.975 12.75 10.375\n", 274 | "50% 149.75 22.9 25.75 12.9\n", 275 | "75% 218.825 36.525 45.1 17.4\n", 276 | "max 296.4 49.6 114.0 27.0\n", 277 | "counts 200 200 200 200\n", 278 | "uniques 190 167 172 121\n", 279 | "missing 0 0 0 0\n", 280 | "missing_perc 0% 0% 0% 0%\n", 281 | "types numeric numeric numeric numeric" 282 | ] 283 | }, 284 | "execution_count": 11, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "\n", 291 | "dff.summary()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "id": "1e0eac80", 297 | "metadata": {}, 298 | "source": [ 299 | "También podemos obtener información solo para columnas seleccionadas. Esto es útil si el conjunto de datos es grande y lleva mucho tiempo calcular la información de todas las columnas, o si queremos analizar solo algunas columnas en particular. 
" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 8, 305 | "id": "c88b924b", 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "mean 147.0425\n", 312 | "std 85.854236\n", 313 | "variance 7370.949893\n", 314 | "min 0.7\n", 315 | "max 296.4\n", 316 | "mode 17.2\n", 317 | "5% 13.195\n", 318 | "25% 74.375\n", 319 | "50% 149.75\n", 320 | "75% 218.825\n", 321 | "95% 280.735\n", 322 | "iqr 144.45\n", 323 | "kurtosis -1.226495\n", 324 | "skewness -0.069853\n", 325 | "sum 29408.5\n", 326 | "mad 75.10465\n", 327 | "cv 0.583874\n", 328 | "zeros_num 0\n", 329 | "zeros_perc 0%\n", 330 | "deviating_of_mean 0\n", 331 | "deviating_of_mean_perc 0%\n", 332 | "deviating_of_median 0\n", 333 | "deviating_of_median_perc 0%\n", 334 | "top_correlations Sales: 78.22%\n", 335 | "counts 200\n", 336 | "uniques 190\n", 337 | "missing 0\n", 338 | "missing_perc 0%\n", 339 | "types numeric\n", 340 | "Name: TV, dtype: object" 341 | ] 342 | }, 343 | "execution_count": 8, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "dff['TV']" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "id": "4bd7c5fa", 355 | "metadata": {}, 356 | "source": [ 357 | "By María Belén Camandone" 358 | ] 359 | } 360 | ], 361 | "metadata": { 362 | "kernelspec": { 363 | "display_name": "Python 3 (ipykernel)", 364 | "language": "python", 365 | "name": "python3" 366 | }, 367 | "language_info": { 368 | "codemirror_mode": { 369 | "name": "ipython", 370 | "version": 3 371 | }, 372 | "file_extension": ".py", 373 | "mimetype": "text/x-python", 374 | "name": "python", 375 | "nbconvert_exporter": "python", 376 | "pygments_lexer": "ipython3", 377 | "version": "3.9.13" 378 | } 379 | }, 380 | "nbformat": 4, 381 | "nbformat_minor": 5 382 | } 383 | -------------------------------------------------------------------------------- /Librerias_EDA/README.md: 
-------------------------------------------------------------------------------- 1 | ## EDA / Librerías 2 | 3 | En esta carpeta vas a poder encontrar código de ejemplo sobre librerías de Python, que nos permiten en pocas líneas realizar un Análisis exploratorio de datos. 4 | -------------------------------------------------------------------------------- /Pandas Style./readme.md: -------------------------------------------------------------------------------- 1 | ## Pandas Style 🐼 - Transformando nuestro dataset con estilo 😎 2 | 3 | Si trabajaste con Excel, seguro personalizaste tus hojas, agregaste colores a las celdas y marcaste cifras importantes. ¿Sabías que eso también lo podés hacer en la librería Pandas? 4 | -------------------------------------------------------------------------------- /Pandas_Tricks /Código/Trucos para usar en la librería pandas (parte1).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "42bd20b2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import seaborn as sns" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "a01a2bcc", 17 | "metadata": {}, 18 | "source": [ 19 | "### 1.Filtrar datos de manera sencilla con query()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "eefd5dfe", 25 | "metadata": {}, 26 | "source": [ 27 | "La función de Pandas query() nos brindará una forma concisa de seleccionar filas de un dataframe según la condición" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "id": "3840cffa", 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
\n", 40 | "\n", 53 | "\n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "
ProductoVentas
0Leche2700
2Harina3750
\n", 74 | "
" 75 | ], 76 | "text/plain": [ 77 | " Producto Ventas\n", 78 | "0 Leche 2700\n", 79 | "2 Harina 3750" 80 | ] 81 | }, 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# dataset \n", 89 | "data = {'Producto': ['Leche', 'Manteca', 'Harina', 'Tomate'],\n", 90 | " 'Ventas': [2700, 2000, 3750, 1200]}\n", 91 | "df = pd.DataFrame(data)\n", 92 | "\n", 93 | "# Buscamos los valores de venta mayores a 2500\n", 94 | "ventas_mayores = df.query('Ventas > 2500')\n", 95 | "\n", 96 | "ventas_mayores" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "ba2d3106", 102 | "metadata": {}, 103 | "source": [ 104 | "### 2.Datos categóricos" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "79789080", 110 | "metadata": {}, 111 | "source": [ 112 | "El uso del tipo de datos categóricos puede hacer que el análisis sea más eficiente y que sea más fácil trabajar con los datos, como por ejemplo a la hora de aplicar un modelo de ML o filtrar fácilmente." 
113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "db0477a7", 118 | "metadata": {}, 119 | "source": [ 120 | "Supongamos que estamos trabajando con el dataset de un call center donde tenemos 3 categorías:\n", 121 | "\n", 122 | " - Contactado\n", 123 | " \n", 124 | " - No contactado\n", 125 | " \n", 126 | " - Asesorado" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 4, 132 | "id": "6cdcea97", 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "0 Contactado\n", 140 | "1 Asesorado\n", 141 | "2 Contactado\n", 142 | "3 No contactado\n", 143 | "Name: Feedback, dtype: category\n", 144 | "Categories (3, object): ['Asesorado', 'Contactado', 'No contactado']\n", 145 | "0 1\n", 146 | "1 0\n", 147 | "2 1\n", 148 | "3 2\n", 149 | "dtype: int8\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "# dataset \n", 155 | "data1 = {'ID_Cliente': ['456231', '987443', '332167', '594361'],\n", 156 | " 'Feedback': ['Contactado','Asesorado','Contactado','No contactado']}\n", 157 | "df1 = pd.DataFrame(data1)\n", 158 | "\n", 159 | "# Convertimos la columna Feedback a tipo de dato categorico\n", 160 | "df1['Feedback'] = df1['Feedback'].astype('category')\n", 161 | "\n", 162 | "print(df1['Feedback'])\n", 163 | "print(df1['Feedback'].cat.codes)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "2233d64d", 169 | "metadata": {}, 170 | "source": [ 171 | "### 3.Agregación" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "b676d343", 177 | "metadata": {}, 178 | "source": [ 179 | "Podemos utilizar la función agg para aplicar diferentes agregaciones a diferentes columnas de un DataFrame." 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 5, 185 | "id": "0fbfdb63", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/html": [ 191 | "
\n", 192 | "\n", 205 | "\n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | "
agefare
min0.420000NaN
max80.000000NaN
mean29.69911832.204208
sumNaN28693.949300
\n", 236 | "
" 237 | ], 238 | "text/plain": [ 239 | " age fare\n", 240 | "min 0.420000 NaN\n", 241 | "max 80.000000 NaN\n", 242 | "mean 29.699118 32.204208\n", 243 | "sum NaN 28693.949300" 244 | ] 245 | }, 246 | "execution_count": 5, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "df_titanic = sns.load_dataset('titanic')\n", 253 | "\n", 254 | "aggregaciones = df_titanic.agg({\n", 255 | " 'age': ['min', 'max', 'mean'],\n", 256 | " 'fare': ['sum', 'mean']\n", 257 | "})\n", 258 | "\n", 259 | "agregaciones1 = pd.DataFrame(aggregaciones)\n", 260 | "\n", 261 | "agregaciones1" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "94ea2b2d", 267 | "metadata": {}, 268 | "source": [ 269 | "### 4.Multi-Index" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "09b5a334", 275 | "metadata": {}, 276 | "source": [ 277 | "Pandas tiene una herramienta llamada MultiIndex o indexación jerárquica. Te permite manejar datos con más de dos niveles, como en una tabla" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 6, 283 | "id": "b6a25094", 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
\n", 290 | "\n", 303 | "\n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | "
Ventas
AñoMes
20241250000
2300000
20233400000
4500000
5324000
6127000
\n", 345 | "
" 346 | ], 347 | "text/plain": [ 348 | " Ventas\n", 349 | "Año Mes \n", 350 | "2024 1 250000\n", 351 | " 2 300000\n", 352 | "2023 3 400000\n", 353 | " 4 500000\n", 354 | " 5 324000\n", 355 | " 6 127000" 356 | ] 357 | }, 358 | "execution_count": 6, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "# dataset\n", 365 | "data3 = {\n", 366 | " 'Año': [2024, 2024, 2023, 2023,2023,2023],\n", 367 | " 'Mes': ['1', '2', '3', '4','5','6'],\n", 368 | " 'Ventas': [250000, 300000, 400000, 500000,324000,127000]\n", 369 | "}\n", 370 | "\n", 371 | "df3 = pd.DataFrame(data3)\n", 372 | "\n", 373 | "# Set MultiIndex\n", 374 | "df3.set_index(['Año', 'Mes'], inplace=True)\n", 375 | "\n", 376 | "df3" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "id": "da3bbe39", 382 | "metadata": {}, 383 | "source": [ 384 | "### 5.Filtrar con el método isin" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "id": "af33b645", 390 | "metadata": {}, 391 | "source": [ 392 | "Podemos utilizar el método isin para filtrar filas según una lista de valores.\n", 393 | "Este método es útil cuando trabajamos con datos categóricos y deseamos seleccionar filas que pertenecen a determinadas categorías. \n", 394 | "Es más conciso que encadenar múltiples condiciones de igualdad con el operador | (OR)." 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 7, 400 | "id": "a6711ee7", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "titanic_filtrado = df_titanic[df_titanic['class'].isin(['First', 'Second'])]" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 8, 410 | "id": "ca4403d4", 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/html": [ 416 | "
\n", 417 | "\n", 430 | "\n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse
311female35.01053.1000SFirstwomanFalseCSouthamptonyesFalse
601male54.00051.8625SFirstmanTrueESouthamptonnoTrue
912female14.01030.0708CSecondchildFalseNaNCherbourgyesFalse
1111female58.00026.5500SFirstwomanFalseCSouthamptonyesTrue
\n", 544 | "
" 545 | ], 546 | "text/plain": [ 547 | " survived pclass sex age sibsp parch fare embarked class \\\n", 548 | "1 1 1 female 38.0 1 0 71.2833 C First \n", 549 | "3 1 1 female 35.0 1 0 53.1000 S First \n", 550 | "6 0 1 male 54.0 0 0 51.8625 S First \n", 551 | "9 1 2 female 14.0 1 0 30.0708 C Second \n", 552 | "11 1 1 female 58.0 0 0 26.5500 S First \n", 553 | "\n", 554 | " who adult_male deck embark_town alive alone \n", 555 | "1 woman False C Cherbourg yes False \n", 556 | "3 woman False C Southampton yes False \n", 557 | "6 man True E Southampton no True \n", 558 | "9 child False NaN Cherbourg yes False \n", 559 | "11 woman False C Southampton yes True " 560 | ] 561 | }, 562 | "execution_count": 8, 563 | "metadata": {}, 564 | "output_type": "execute_result" 565 | } 566 | ], 567 | "source": [ 568 | "titanic_filtrado.head()" 569 | ] 570 | } 571 | ], 572 | "metadata": { 573 | "kernelspec": { 574 | "display_name": "Python 3 (ipykernel)", 575 | "language": "python", 576 | "name": "python3" 577 | }, 578 | "language_info": { 579 | "codemirror_mode": { 580 | "name": "ipython", 581 | "version": 3 582 | }, 583 | "file_extension": ".py", 584 | "mimetype": "text/x-python", 585 | "name": "python", 586 | "nbconvert_exporter": "python", 587 | "pygments_lexer": "ipython3", 588 | "version": "3.11.4" 589 | } 590 | }, 591 | "nbformat": 4, 592 | "nbformat_minor": 5 593 | } 594 | -------------------------------------------------------------------------------- /Pandas_Tricks /Código/Trucos para usar en la librería pandas(parte2).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 73, 6 | "id": "5bd69ad6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import seaborn as sns" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 74, 17 | "id": "47f83173", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 
| "text/html": [ 23 | "
\n", 24 | "\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse
213female26.0007.9250SThirdwomanFalseNaNSouthamptonyesTrue
311female35.01053.1000SFirstwomanFalseCSouthamptonyesFalse
403male35.0008.0500SThirdmanTrueNaNSouthamptonnoTrue
\n", 151 | "
" 152 | ], 153 | "text/plain": [ 154 | " survived pclass sex age sibsp parch fare embarked class \\\n", 155 | "0 0 3 male 22.0 1 0 7.2500 S Third \n", 156 | "1 1 1 female 38.0 1 0 71.2833 C First \n", 157 | "2 1 3 female 26.0 0 0 7.9250 S Third \n", 158 | "3 1 1 female 35.0 1 0 53.1000 S First \n", 159 | "4 0 3 male 35.0 0 0 8.0500 S Third \n", 160 | "\n", 161 | " who adult_male deck embark_town alive alone \n", 162 | "0 man True NaN Southampton no False \n", 163 | "1 woman False C Cherbourg yes False \n", 164 | "2 woman False NaN Southampton yes True \n", 165 | "3 woman False C Southampton yes False \n", 166 | "4 man True NaN Southampton no True " 167 | ] 168 | }, 169 | "execution_count": 74, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "df = sns.load_dataset('titanic')\n", 176 | "df.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "id": "82bf88db", 182 | "metadata": {}, 183 | "source": [ 184 | "# 1) Cut()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "78de0b8d", 190 | "metadata": {}, 191 | "source": [ 192 | "cut() es increíblemente útil para agrupar datos numéricos en intervalos discretos.\n", 193 | "\n", 194 | "Vamos a clasificar a las personas en diferentes grupos de edad. cut() simplifica este proceso creando contenedores personalizados y asignando etiquetas en consecuencia:" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 75, 200 | "id": "8ec37fc1", 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/html": [ 206 | "
\n", 207 | "\n", 220 | "\n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealoneAge Group
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse0-30
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse31-40
213female26.0007.9250SThirdwomanFalseNaNSouthamptonyesTrue0-30
311female35.01053.1000SFirstwomanFalseCSouthamptonyesFalse31-40
403male35.0008.0500SThirdmanTrueNaNSouthamptonnoTrue31-40
\n", 340 | "
" 341 | ], 342 | "text/plain": [ 343 | " survived pclass sex age sibsp parch fare embarked class \\\n", 344 | "0 0 3 male 22.0 1 0 7.2500 S Third \n", 345 | "1 1 1 female 38.0 1 0 71.2833 C First \n", 346 | "2 1 3 female 26.0 0 0 7.9250 S Third \n", 347 | "3 1 1 female 35.0 1 0 53.1000 S First \n", 348 | "4 0 3 male 35.0 0 0 8.0500 S Third \n", 349 | "\n", 350 | " who adult_male deck embark_town alive alone Age Group \n", 351 | "0 man True NaN Southampton no False 0-30 \n", 352 | "1 woman False C Cherbourg yes False 31-40 \n", 353 | "2 woman False NaN Southampton yes True 0-30 \n", 354 | "3 woman False C Southampton yes False 31-40 \n", 355 | "4 man True NaN Southampton no True 31-40 " 356 | ] 357 | }, 358 | "execution_count": 75, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "bins = [0, 30, 40, 50, 60, 100]\n", 365 | "labels = ['0-30', '31-40', '41-50', '51-60', '61-100']\n", 366 | "\n", 367 | "# Creamos una nueva columna Age Group y usamos cut\n", 368 | "df['Age Group'] = pd.cut(df['age'], bins=bins, labels=labels)\n", 369 | "df.head()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "0c69884a", 375 | "metadata": {}, 376 | "source": [ 377 | "# 2) Filtrar en negativo " 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "id": "1865e428", 383 | "metadata": {}, 384 | "source": [ 385 | "El operador ~ en Pandas se utiliza para filtrar en negativo, es decir, para seleccionar las filas que no cumplen con una condición determinada." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 76, 391 | "id": "dc9d4143", 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/html": [ 397 | "
\n", 398 | "\n", 411 | "\n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " 
\n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealoneAge Group
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse0-30
403male35.0008.0500SThirdmanTrueNaNSouthamptonnoTrue31-40
503maleNaN008.4583QThirdmanTrueNaNQueenstownnoTrueNaN
601male54.00051.8625SFirstmanTrueESouthamptonnoTrue51-60
703male2.03121.0750SThirdchildFalseNaNSouthamptonnoFalse0-30
...................................................
88403male25.0007.0500SThirdmanTrueNaNSouthamptonnoTrue0-30
88503female39.00529.1250QThirdwomanFalseNaNQueenstownnoFalse31-40
88602male27.00013.0000SSecondmanTrueNaNSouthamptonnoTrue0-30
88803femaleNaN1223.4500SThirdwomanFalseNaNSouthamptonnoFalseNaN
89003male32.0007.7500QThirdmanTrueNaNQueenstownnoTrue31-40
\n", 645 | "

549 rows × 16 columns

\n", 646 | "
" 647 | ], 648 | "text/plain": [ 649 | " survived pclass sex age sibsp parch fare embarked class \\\n", 650 | "0 0 3 male 22.0 1 0 7.2500 S Third \n", 651 | "4 0 3 male 35.0 0 0 8.0500 S Third \n", 652 | "5 0 3 male NaN 0 0 8.4583 Q Third \n", 653 | "6 0 1 male 54.0 0 0 51.8625 S First \n", 654 | "7 0 3 male 2.0 3 1 21.0750 S Third \n", 655 | ".. ... ... ... ... ... ... ... ... ... \n", 656 | "884 0 3 male 25.0 0 0 7.0500 S Third \n", 657 | "885 0 3 female 39.0 0 5 29.1250 Q Third \n", 658 | "886 0 2 male 27.0 0 0 13.0000 S Second \n", 659 | "888 0 3 female NaN 1 2 23.4500 S Third \n", 660 | "890 0 3 male 32.0 0 0 7.7500 Q Third \n", 661 | "\n", 662 | " who adult_male deck embark_town alive alone Age Group \n", 663 | "0 man True NaN Southampton no False 0-30 \n", 664 | "4 man True NaN Southampton no True 31-40 \n", 665 | "5 man True NaN Queenstown no True NaN \n", 666 | "6 man True E Southampton no True 51-60 \n", 667 | "7 child False NaN Southampton no False 0-30 \n", 668 | ".. ... ... ... ... ... ... ... 
\n", 669 | "884 man True NaN Southampton no True 0-30 \n", 670 | "885 woman False NaN Queenstown no False 31-40 \n", 671 | "886 man True NaN Southampton no True 0-30 \n", 672 | "888 woman False NaN Southampton no False NaN \n", 673 | "890 man True NaN Queenstown no True 31-40 \n", 674 | "\n", 675 | "[549 rows x 16 columns]" 676 | ] 677 | }, 678 | "execution_count": 76, 679 | "metadata": {}, 680 | "output_type": "execute_result" 681 | } 682 | ], 683 | "source": [ 684 | "# Filtrar las filas donde el valor de la columna 'alive' no sea yes\n", 685 | "df_filtrado = df[~(df['alive'] =='yes')]\n", 686 | "\n", 687 | "df_filtrado\n" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "id": "93d51977", 693 | "metadata": {}, 694 | "source": [ 695 | "# 3) Crosstab() " 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "id": "1ae8c8b9", 701 | "metadata": {}, 702 | "source": [ 703 | "La función crosstab() proporciona una manera poderosa (y súper simple) de explorar las relaciones entre diferentes variables en un conjunto de datos." 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 77, 709 | "id": "4809b239", 710 | "metadata": {}, 711 | "outputs": [ 712 | { 713 | "data": { 714 | "text/html": [ 715 | "
\n", 716 | "\n", 729 | "\n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | "
pclass123
alive
no8097372
yes13687119
\n", 759 | "
" 760 | ], 761 | "text/plain": [ 762 | "pclass 1 2 3\n", 763 | "alive \n", 764 | "no 80 97 372\n", 765 | "yes 136 87 119" 766 | ] 767 | }, 768 | "execution_count": 77, 769 | "metadata": {}, 770 | "output_type": "execute_result" 771 | } 772 | ], 773 | "source": [ 774 | "cross_table = pd.crosstab(df['alive'], df['pclass'])\n", 775 | "cross_table" 776 | ] 777 | }, 778 | { 779 | "cell_type": "markdown", 780 | "id": "3bfaba10", 781 | "metadata": {}, 782 | "source": [ 783 | "# 4) Dividir una columna" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "id": "6d13cf14", 789 | "metadata": {}, 790 | "source": [ 791 | "Supongamos que queremos dividir la columna «Country» en dos columnas usando el espacio como separador:" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": 78, 797 | "id": "b1c970b2", 798 | "metadata": {}, 799 | "outputs": [ 800 | { 801 | "data": { 802 | "text/html": [ 803 | "
\n", 804 | "\n", 817 | "\n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | "
RevenueEmployeesCityCountry
Company
Apple274515147000CaliforniaUnited States
Samsung Electronics200734267937SuwonSouth Korea
Alphabet182527135301CaliforniaUnited States
Foxconn181945878429New Taipei CityTaiwan
Microsoft143015163000WashingtonUnited States
\n", 872 | "
" 873 | ], 874 | "text/plain": [ 875 | " Revenue Employees City Country\n", 876 | "Company \n", 877 | "Apple 274515 147000 California United States\n", 878 | "Samsung Electronics 200734 267937 Suwon South Korea\n", 879 | "Alphabet 182527 135301 California United States\n", 880 | "Foxconn 181945 878429 New Taipei City Taiwan\n", 881 | "Microsoft 143015 163000 Washington United States" 882 | ] 883 | }, 884 | "execution_count": 78, 885 | "metadata": {}, 886 | "output_type": "execute_result" 887 | } 888 | ], 889 | "source": [ 890 | "df_tech = pd.read_csv('tech.csv', delimiter='\\t', index_col='Company')\n", 891 | "df_tech.head()" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": 79, 897 | "id": "83f97c25", 898 | "metadata": {}, 899 | "outputs": [ 900 | { 901 | "data": { 902 | "text/html": [ 903 | "
\n", 904 | "\n", 917 | "\n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | "
01
Company
AppleUnitedStates
Samsung ElectronicsSouthKorea
AlphabetUnitedStates
FoxconnTaiwanNone
MicrosoftUnitedStates
HuaweiChinaNone
Dell TechnologiesUnitedStates
FacebookUnitedStates
SonyJapanNone
HitachiJapanNone
IntelUnitedStates
IBMUnitedStates
TencentChinaNone
PanasonicJapanNone
LenovoChinaNone
HP Inc.UnitedStates
LG ElectronicsSouthKorea
\n", 1018 | "
" 1019 | ], 1020 | "text/plain": [ 1021 | " 0 1\n", 1022 | "Company \n", 1023 | "Apple United States\n", 1024 | "Samsung Electronics South Korea\n", 1025 | "Alphabet United States\n", 1026 | "Foxconn Taiwan None\n", 1027 | "Microsoft United States\n", 1028 | "Huawei China None\n", 1029 | "Dell Technologies United States\n", 1030 | "Facebook United States\n", 1031 | "Sony Japan None\n", 1032 | "Hitachi Japan None\n", 1033 | "Intel United States\n", 1034 | "IBM United States\n", 1035 | "Tencent China None\n", 1036 | "Panasonic Japan None\n", 1037 | "Lenovo China None\n", 1038 | "HP Inc. United States\n", 1039 | "LG Electronics South Korea" 1040 | ] 1041 | }, 1042 | "execution_count": 79, 1043 | "metadata": {}, 1044 | "output_type": "execute_result" 1045 | } 1046 | ], 1047 | "source": [ 1048 | "df_tech['Country'].str.split(' ', expand=True)\n" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "markdown", 1053 | "id": "96dc72fb", 1054 | "metadata": {}, 1055 | "source": [ 1056 | "# 5) Melt" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "markdown", 1061 | "id": "556184c3", 1062 | "metadata": {}, 1063 | "source": [ 1064 | "Los datos suelen venir en formato amplio, donde las variables están distribuidas en columnas. El método melt() proporciona una forma concisa de remodelar su DataFrame de formato ancho a largo, lo que facilita la realización de ciertos análisis y visualizaciones.\n", 1065 | "\n", 1066 | "Considere un DataFrame con varias columnas que representan datos de ventas de diferentes meses. Si desea remodelar el DataFrame para que tenga una sola columna para los meses y otra para los valores de ventas correspondientes, melt() puede simplificar el proceso:" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "code", 1071 | "execution_count": 80, 1072 | "id": "d7b30ea6", 1073 | "metadata": {}, 1074 | "outputs": [ 1075 | { 1076 | "data": { 1077 | "text/html": [ 1078 | "
\n", 1079 | "\n", 1092 | "\n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | "
ProductoEnero_SalesFebrero_SalesMarzo_Sales
0Remera11015080
1Musculosa190140190
2Top220180210
\n", 1126 | "
" 1127 | ], 1128 | "text/plain": [ 1129 | " Producto Enero_Sales Febrero_Sales Marzo_Sales\n", 1130 | "0 Remera 110 150 80\n", 1131 | "1 Musculosa 190 140 190\n", 1132 | "2 Top 220 180 210" 1133 | ] 1134 | }, 1135 | "execution_count": 80, 1136 | "metadata": {}, 1137 | "output_type": "execute_result" 1138 | } 1139 | ], 1140 | "source": [ 1141 | "data = {'Producto': ['Remera', 'Musculosa', 'Top'],\n", 1142 | " 'Enero_Sales': [110, 190, 220],\n", 1143 | " 'Febrero_Sales': [150, 140, 180],\n", 1144 | " 'Marzo_Sales': [80, 190, 210]}\n", 1145 | "\n", 1146 | "df_ventas = pd.DataFrame(data)\n", 1147 | "df_ventas" 1148 | ] 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "execution_count": 81, 1153 | "id": "cf1cc0ea", 1154 | "metadata": {}, 1155 | "outputs": [ 1156 | { 1157 | "data": { 1158 | "text/html": [ 1159 | "
\n", 1160 | "\n", 1173 | "\n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | "
ProductoMesVentas
0RemeraEnero_Sales110
1MusculosaEnero_Sales190
2TopEnero_Sales220
3RemeraFebrero_Sales150
4MusculosaFebrero_Sales140
5TopFebrero_Sales180
6RemeraMarzo_Sales80
7MusculosaMarzo_Sales190
8TopMarzo_Sales210
\n", 1239 | "
" 1240 | ], 1241 | "text/plain": [ 1242 | " Producto Mes Ventas\n", 1243 | "0 Remera Enero_Sales 110\n", 1244 | "1 Musculosa Enero_Sales 190\n", 1245 | "2 Top Enero_Sales 220\n", 1246 | "3 Remera Febrero_Sales 150\n", 1247 | "4 Musculosa Febrero_Sales 140\n", 1248 | "5 Top Febrero_Sales 180\n", 1249 | "6 Remera Marzo_Sales 80\n", 1250 | "7 Musculosa Marzo_Sales 190\n", 1251 | "8 Top Marzo_Sales 210" 1252 | ] 1253 | }, 1254 | "execution_count": 81, 1255 | "metadata": {}, 1256 | "output_type": "execute_result" 1257 | } 1258 | ], 1259 | "source": [ 1260 | "# Melt\n", 1261 | "df_melted = pd.melt(df_ventas, id_vars=['Producto'], \n", 1262 | "var_name='Mes', value_name='Ventas')\n", 1263 | "\n", 1264 | "df_melted" 1265 | ] 1266 | } 1267 | ], 1268 | "metadata": { 1269 | "kernelspec": { 1270 | "display_name": "Python 3 (ipykernel)", 1271 | "language": "python", 1272 | "name": "python3" 1273 | }, 1274 | "language_info": { 1275 | "codemirror_mode": { 1276 | "name": "ipython", 1277 | "version": 3 1278 | }, 1279 | "file_extension": ".py", 1280 | "mimetype": "text/x-python", 1281 | "name": "python", 1282 | "nbconvert_exporter": "python", 1283 | "pygments_lexer": "ipython3", 1284 | "version": "3.11.4" 1285 | } 1286 | }, 1287 | "nbformat": 4, 1288 | "nbformat_minor": 5 1289 | } 1290 | -------------------------------------------------------------------------------- /Pandas_Tricks /Código/pivot_table().ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "555bdacf", 6 | "metadata": {}, 7 | "source": [ 8 | "## Función pivot_table()" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 29, 14 | "id": "005dae81", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "
\n", 21 | "\n", 34 | "\n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "
companyrankrank_changerevenueprofitnum. of employeessectorcitystatenewcomerceo_founderceo_womanprofitableprev_rankCEOWebsiteTickerMarket Cap
0Walmart10.0572754.013673.02300000.0RetailingBentonvilleARnononoyes1.0C. Douglas McMillonhttps://www.stock.walmart.comWMT352037
1Amazon20.0469822.033364.01608000.0RetailingSeattleWAnononoyes2.0Andrew R. Jassywww.amazon.comAMZN1202717
2Apple30.0365817.094680.0154000.0TechnologyCupertinoCAnononoyes3.0Timothy D. Cookwww.apple.comAAPL2443962
3CVS Health40.0292111.07910.0258000.0Health CareWoonsocketRInonoyesyes4.0Karen Lynchhttps://www.cvshealth.comCVS125204
4UnitedHealth Group50.0287597.017285.0350000.0Health CareMinnetonkaMNnononoyes5.0Andrew P. Wittywww.unitedhealthgroup.comUNH500468
\n", 166 | "
" 167 | ], 168 | "text/plain": [ 169 | " company rank rank_change revenue profit \\\n", 170 | "0 Walmart 1 0.0 572754.0 13673.0 \n", 171 | "1 Amazon 2 0.0 469822.0 33364.0 \n", 172 | "2 Apple 3 0.0 365817.0 94680.0 \n", 173 | "3 CVS Health 4 0.0 292111.0 7910.0 \n", 174 | "4 UnitedHealth Group 5 0.0 287597.0 17285.0 \n", 175 | "\n", 176 | " num. of employees sector city state newcomer ceo_founder \\\n", 177 | "0 2300000.0 Retailing Bentonville AR no no \n", 178 | "1 1608000.0 Retailing Seattle WA no no \n", 179 | "2 154000.0 Technology Cupertino CA no no \n", 180 | "3 258000.0 Health Care Woonsocket RI no no \n", 181 | "4 350000.0 Health Care Minnetonka MN no no \n", 182 | "\n", 183 | " ceo_woman profitable prev_rank CEO \\\n", 184 | "0 no yes 1.0 C. Douglas McMillon \n", 185 | "1 no yes 2.0 Andrew R. Jassy \n", 186 | "2 no yes 3.0 Timothy D. Cook \n", 187 | "3 yes yes 4.0 Karen Lynch \n", 188 | "4 no yes 5.0 Andrew P. Witty \n", 189 | "\n", 190 | " Website Ticker Market Cap \n", 191 | "0 https://www.stock.walmart.com WMT 352037 \n", 192 | "1 www.amazon.com AMZN 1202717 \n", 193 | "2 www.apple.com AAPL 2443962 \n", 194 | "3 https://www.cvshealth.com CVS 125204 \n", 195 | "4 www.unitedhealthgroup.com UNH 500468 " 196 | ] 197 | }, 198 | "execution_count": 29, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "import pandas as pd\n", 205 | "import numpy as np\n", 206 | "fortune = pd.read_csv(\"Fortune.csv\")\n", 207 | "fortune.head()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 30, 213 | "id": "c0d511a3", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "mean_Profits = fortune[\"revenue\"].mean() # calculo el promedio \n", 218 | "fortune[\"Revenue_segmentation\"] = np.where(fortune[\"revenue\"] > mean_Profits, \"High\", \"Low\")" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 31, 224 | "id": "78b24dd3", 225 | "metadata": {}, 226 | 
"outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "Low 798\n", 231 | "High 202\n", 232 | "Name: Revenue_segmentation, dtype: int64" 233 | ] 234 | }, 235 | "execution_count": 31, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "fortune[\"Revenue_segmentation\"].value_counts()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 32, 247 | "id": "a89baeee", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | "
Revenue_segmentationHighLow
sector
Aerospace & Defense512
Apparel115
Business Services448
Chemicals326
Energy2179
Engineering & Construction230
Financials39127
Food & Drug Stores54
Food, Beverages & Tobacco1024
Health Care2453
Hotels, Restaurants & Leisure226
Household Products221
Industrials842
Materials739
Media325
Motor Vehicles & Parts614
Retailing1760
Technology2299
Telecommunications54
Transportation728
Wholesalers922
\n", 388 | "
" 389 | ], 390 | "text/plain": [ 391 | "Revenue_segmentation High Low\n", 392 | "sector \n", 393 | "Aerospace & Defense 5 12\n", 394 | "Apparel 1 15\n", 395 | "Business Services 4 48\n", 396 | "Chemicals 3 26\n", 397 | "Energy 21 79\n", 398 | "Engineering & Construction 2 30\n", 399 | "Financials 39 127\n", 400 | "Food & Drug Stores 5 4\n", 401 | "Food, Beverages & Tobacco 10 24\n", 402 | "Health Care 24 53\n", 403 | "Hotels, Restaurants & Leisure 2 26\n", 404 | "Household Products 2 21\n", 405 | "Industrials 8 42\n", 406 | "Materials 7 39\n", 407 | "Media 3 25\n", 408 | "Motor Vehicles & Parts 6 14\n", 409 | "Retailing 17 60\n", 410 | "Technology 22 99\n", 411 | "Telecommunications 5 4\n", 412 | "Transportation 7 28\n", 413 | "Wholesalers 9 22" 414 | ] 415 | }, 416 | "execution_count": 32, 417 | "metadata": {}, 418 | "output_type": "execute_result" 419 | } 420 | ], 421 | "source": [ 422 | "pd.pivot_table(data = fortune, \n", 423 | " index = ['sector'],\n", 424 | " columns = [\"Revenue_segmentation\"],\n", 425 | " aggfunc = 'size',\n", 426 | " fill_value = 0) # 'size' indica que se está contando el número de ocurrencias (tamaño) " 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 33, 432 | "id": "619596d9", 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/html": [ 438 | "
\n", 439 | "\n", 452 | "\n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | "
size_Highsize_Lowsum_Highsum_Low
sectorcity
Aerospace & DefenseBethesda1067044.00.0
Centreville010.03660.8
Chicago1062286.00.0
Cleveland010.04798.0
East Aurora010.02852.0
..................
WholesalersRoswell010.03475.7
St. Louis020.013771.3
Tucker010.03298.8
Waltham010.013248.3
Winona010.06010.9
\n", 554 | "

739 rows × 4 columns

\n", 555 | "
" 556 | ], 557 | "text/plain": [ 558 | " size_High size_Low sum_High sum_Low\n", 559 | "sector city \n", 560 | "Aerospace & Defense Bethesda 1 0 67044.0 0.0\n", 561 | " Centreville 0 1 0.0 3660.8\n", 562 | " Chicago 1 0 62286.0 0.0\n", 563 | " Cleveland 0 1 0.0 4798.0\n", 564 | " East Aurora 0 1 0.0 2852.0\n", 565 | "... ... ... ... ...\n", 566 | "Wholesalers Roswell 0 1 0.0 3475.7\n", 567 | " St. Louis 0 2 0.0 13771.3\n", 568 | " Tucker 0 1 0.0 3298.8\n", 569 | " Waltham 0 1 0.0 13248.3\n", 570 | " Winona 0 1 0.0 6010.9\n", 571 | "\n", 572 | "[739 rows x 4 columns]" 573 | ] 574 | }, 575 | "execution_count": 33, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "# Crear la tabla pivotada\n", 582 | "pivot_table = pd.pivot_table(data = fortune,\n", 583 | " values = 'revenue', \n", 584 | " index = ['sector','city'],\n", 585 | " columns = [\"Revenue_segmentation\"],\n", 586 | " aggfunc = {\"revenue\": [\"sum\", 'size']}, \n", 587 | " fill_value = 0)\n", 588 | "\n", 589 | "\n", 590 | "pivot_table.columns = ['_'.join(col).strip() for col in pivot_table.columns]\n", 591 | "\n", 592 | "\n", 593 | "pivot_table\n" 594 | ] 595 | } 596 | ], 597 | "metadata": { 598 | "kernelspec": { 599 | "display_name": "Python 3 (ipykernel)", 600 | "language": "python", 601 | "name": "python3" 602 | }, 603 | "language_info": { 604 | "codemirror_mode": { 605 | "name": "ipython", 606 | "version": 3 607 | }, 608 | "file_extension": ".py", 609 | "mimetype": "text/x-python", 610 | "name": "python", 611 | "nbconvert_exporter": "python", 612 | "pygments_lexer": "ipython3", 613 | "version": "3.11.4" 614 | } 615 | }, 616 | "nbformat": 4, 617 | "nbformat_minor": 5 618 | } 619 | -------------------------------------------------------------------------------- /Pandas_Tricks /Código/readme.md: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /Pandas_Tricks /readme.md: -------------------------------------------------------------------------------- 1 | ## Trucos para utilizar en la librería Pandas de Python 2 | 3 | ![pandas](https://github.com/bcamandone/Python_Analisis_datos/assets/86261762/85ad9585-13e6-4e57-8dca-a0978a9c6088) 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Análisis de datos con Python 2 | 3 | ![5](https://user-images.githubusercontent.com/86261762/209863473-89108b92-ca1a-4ad2-92b5-25703119aa10.png) 4 | 5 | 6 | Python se ha convertido en el lenguaje de programación más utilizado a la hora de hacer análisis de datos. Tiene miles de librerías para las más variadas tareas, lo que permite realizarlas con relativa facilidad. En el entorno de análisis de datos, las tres más conocidas son : Numpy, Pandas y Matplotlib. 7 | 8 | 9 | Librerías utilizadas en este repositorio: 10 | 11 | - Pandas 12 | - Numpy 13 | - Matplotlib 14 | - Seaborn 15 | - Missingno 16 | - Sweetviz 17 | - BeautifulSoup 18 | - Tabula 19 | - Datatile 20 | - FastEDA 21 | 22 | # ¡Por favor, dame un ⭐️ si te gusta y utilizas este repo! 
👏 23 | -------------------------------------------------------------------------------- /Sales Data Analysis/-: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Sales Data Analysis/CSV/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Sales Data Analysis/README.md: -------------------------------------------------------------------------------- 1 | # Análisis de datos de ventas 2 | 3 | ![pr1_A84BA97F19EC6E907968](https://user-images.githubusercontent.com/86261762/197585785-7ab95ab9-a991-4aa8-bddd-d3b7870d6abf.png) 4 | 5 | 6 | Se realiza un análisis sobre las ventas de una empresa para responder a las siguientes preguntas: 7 | 8 | - P: ¿Cuánto se ganó en 2019? 9 | - P: ¿Cuál fue el mejor mes para las ventas? ¿Cuánto se ganó ese mes? 10 | - P: ¿Qué ciudad tuvo el mayor número de ventas? 11 | - P: ¿A qué hora debemos mostrar publicidad para maximizar la probabilidad de que el cliente compre el producto? 12 | - P: ¿Qué producto vendió más? ¿Por qué crees que vendió más? 
13 | 14 | Dataset: https://www.kaggle.com/code/linocondor/sales-data-analysis/data 15 | 16 | Librerías 17 | - pandas 18 | - numpy 19 | - seaborn 20 | - matplotlib 21 | -------------------------------------------------------------------------------- /Series y dataframes/Series en Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "pRw9aCmeTseK" 7 | }, 8 | "source": [ 9 | "## Series" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 63, 15 | "metadata": { 16 | "id": "DDzqASMDS1a3" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "La serie es una estructura de datos unidimensional en donde cada dato tiene además una etiqueta." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 64, 33 | "metadata": { 34 | "colab": { 35 | "base_uri": "https://localhost:8080/" 36 | }, 37 | "id": "PfxKoWtcU-FK", 38 | "outputId": "e29538d6-45f0-481c-f4ba-01eed27d4121" 39 | }, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "['Celeste', 'Rosa', 'Violeta', 'Rojo']" 45 | ] 46 | }, 47 | "execution_count": 64, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "Colores = [\"Celeste\" , \"Rosa\", \"Violeta\", \"Rojo\"]\n", 54 | "Colores" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "Para crear una serie se usa el constructor pd.Series() que es capaz de convertir a serie distintas estructuras como listas,\n", 62 | "diccionarios,etc." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 65, 68 | "metadata": { 69 | "colab": { 70 | "base_uri": "https://localhost:8080/" 71 | }, 72 | "id": "kWEkH24kVhtN", 73 | "outputId": "66dd3b3c-d905-4daf-af25-827fca975141" 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0 Celeste\n", 80 | "1 Rosa\n", 81 | "2 Violeta\n", 82 | "3 Rojo\n", 83 | "dtype: object" 84 | ] 85 | }, 86 | "execution_count": 65, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "pd.Series(Colores) " 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 66, 98 | "metadata": { 99 | "colab": { 100 | "base_uri": "https://localhost:8080/" 101 | }, 102 | "id": "DbxRNopZV1By", 103 | "outputId": "f8f17f6f-01ed-467a-a319-d2fabf460e27" 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "0 1\n", 110 | "1 2\n", 111 | "2 3\n", 112 | "3 65\n", 113 | "4 34\n", 114 | "5 11\n", 115 | "dtype: int64" 116 | ] 117 | }, 118 | "execution_count": 66, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "numeros = [1,2,3,65,34,11]\n", 125 | "\n", 126 | "pd.Series(numeros)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 67, 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "RWH4EPrJWc8T", 137 | "outputId": "b4b9b94d-37f5-4570-c20e-bafd32a397ac" 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "0 True\n", 144 | "1 False\n", 145 | "2 True\n", 146 | "3 True\n", 147 | "4 False\n", 148 | "dtype: bool" 149 | ] 150 | }, 151 | "execution_count": 67, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "registros = [True, False, True, True, False]\n", 158 | "\n", 159 | "pd.Series(registros)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "id": 
"k8FLvA5LXNGR" 166 | }, 167 | "source": [ 168 | "## Atributos de una serie" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 95, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "AAPL 9\n", 180 | "AMZN 13\n", 181 | "GOOGL 26\n", 182 | "MELI 15\n", 183 | "dtype: int64" 184 | ] 185 | }, 186 | "execution_count": 95, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "precios = [ 9, 13, 26, 15]\n", 193 | "acciones = [\"AAPL\", \"AMZN\", \"GOOGL\",\"MELI\"]\n", 194 | "\n", 195 | "acc = pd.Series(data = precios, index = acciones )\n", 196 | "\n", 197 | "acc" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 96, 203 | "metadata": { 204 | "colab": { 205 | "base_uri": "https://localhost:8080/" 206 | }, 207 | "id": "XUwc6eOoW61P", 208 | "outputId": "14bbc48e-1acc-441c-dd20-fb1a41024b69" 209 | }, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "Index(['AAPL', 'AMZN', 'GOOGL', 'MELI'], dtype='object')" 215 | ] 216 | }, 217 | "execution_count": 96, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "acc.index" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 97, 229 | "metadata": { 230 | "colab": { 231 | "base_uri": "https://localhost:8080/" 232 | }, 233 | "id": "E-tDGqPUW65j", 234 | "outputId": "25093043-4fb0-4c3c-a8ab-b0e98e669415" 235 | }, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "array([ 9, 13, 26, 15], dtype=int64)" 241 | ] 242 | }, 243 | "execution_count": 97, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "acc.values # Me muestra los valores de la serie" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 98, 255 | "metadata": { 256 | "colab": { 257 | "base_uri": "https://localhost:8080/" 258 | }, 259 | "id": 
"7oeK00_uW6vH", 260 | "outputId": "ed814af3-c3f8-4bb4-bf7b-752074f807ae" 261 | }, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "dtype('int64')" 267 | ] 268 | }, 269 | "execution_count": 98, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "acc.dtype" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 99, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "(4,)" 287 | ] 288 | }, 289 | "execution_count": 99, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "acc.shape" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 100, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "4" 307 | ] 308 | }, 309 | "execution_count": 100, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "acc.size" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 101, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "1" 327 | ] 328 | }, 329 | "execution_count": 101, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "acc.ndim" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "La serie tiene, además, un atributo name, atributo que también encontramos en el índice. 
Una vez los hemos fijado, se muestran junto con la estructura al imprimir la serie" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 102, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "AAPL 9\n", 354 | "AMZN 13\n", 355 | "GOOGL 26\n", 356 | "MELI 15\n", 357 | "Name: Mi lista de acciones, dtype: int64" 358 | ] 359 | }, 360 | "execution_count": 102, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "acc.name = \"Mi lista de acciones\"\n", 367 | "acc.head(4) " 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "id": "GhVIldoQYSpa" 374 | }, 375 | "source": [ 376 | "## Selección de registros" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 103, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "data": { 386 | "text/plain": [ 387 | "9" 388 | ] 389 | }, 390 | "execution_count": 103, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "acc[0]" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 104, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "AAPL 9\n", 408 | "AMZN 13\n", 409 | "Name: Mi lista de acciones, dtype: int64" 410 | ] 411 | }, 412 | "execution_count": 104, 413 | "metadata": {}, 414 | "output_type": "execute_result" 415 | } 416 | ], 417 | "source": [ 418 | "acc[0:2]" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 105, 424 | "metadata": {}, 425 | "outputs": [ 426 | { 427 | "data": { 428 | "text/plain": [ 429 | "15" 430 | ] 431 | }, 432 | "execution_count": 105, 433 | "metadata": {}, 434 | "output_type": "execute_result" 435 | } 436 | ], 437 | "source": [ 438 | "acc['MELI']" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 106, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "data": { 448 | 
"text/plain": [ 449 | "" 454 | ] 455 | }, 456 | "execution_count": 106, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "acc.items" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "## Operaciones con series" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "**Operaciones logicas:**" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 107, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | "AAPL False\n", 488 | "AMZN True\n", 489 | "GOOGL True\n", 490 | "MELI True\n", 491 | "Name: Mi lista de acciones, dtype: bool" 492 | ] 493 | }, 494 | "execution_count": 107, 495 | "metadata": {}, 496 | "output_type": "execute_result" 497 | } 498 | ], 499 | "source": [ 500 | "acc > 10" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 109, 506 | "metadata": {}, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/plain": [ 511 | "AMZN 13\n", 512 | "GOOGL 26\n", 513 | "MELI 15\n", 514 | "Name: Mi lista de acciones, dtype: int64" 515 | ] 516 | }, 517 | "execution_count": 109, 518 | "metadata": {}, 519 | "output_type": "execute_result" 520 | } 521 | ], 522 | "source": [ 523 | "acc[acc > 9]" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "**Operaciones Aritméticas**:\n", 531 | "\n", 532 | "Podemos operar entre series y escalares sin ningún tipo de problema:" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 110, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/plain": [ 543 | "AAPL 4.5\n", 544 | "AMZN 6.5\n", 545 | "GOOGL 13.0\n", 546 | "MELI 7.5\n", 547 | "Name: Mi lista de acciones, dtype: float64" 548 | ] 549 | }, 550 | "execution_count": 110, 551 | "metadata": {}, 552 | "output_type": "execute_result" 553 | } 554 | ], 555 
| "source": [ 556 | "acc/2" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 114, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "comisiones = [0.15,0.18,0.02,0.03]" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 115, 571 | "metadata": {}, 572 | "outputs": [ 573 | { 574 | "data": { 575 | "text/plain": [ 576 | "AAPL 1.35\n", 577 | "AMZN 2.34\n", 578 | "GOOGL 0.52\n", 579 | "MELI 0.45\n", 580 | "Name: Mi lista de acciones, dtype: float64" 581 | ] 582 | }, 583 | "execution_count": 115, 584 | "metadata": {}, 585 | "output_type": "execute_result" 586 | } 587 | ], 588 | "source": [ 589 | "acc*comisiones" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "## Ordenación" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 116, 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "text/plain": [ 607 | "GOOGL 26\n", 608 | "MELI 15\n", 609 | "AMZN 13\n", 610 | "AAPL 9\n", 611 | "Name: Mi lista de acciones, dtype: int64" 612 | ] 613 | }, 614 | "execution_count": 116, 615 | "metadata": {}, 616 | "output_type": "execute_result" 617 | } 618 | ], 619 | "source": [ 620 | "acc.sort_values(ascending = False)" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "## Contando valores" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 117, 633 | "metadata": {}, 634 | "outputs": [ 635 | { 636 | "data": { 637 | "text/plain": [ 638 | "9 1\n", 639 | "13 1\n", 640 | "26 1\n", 641 | "15 1\n", 642 | "Name: Mi lista de acciones, dtype: int64" 643 | ] 644 | }, 645 | "execution_count": 117, 646 | "metadata": {}, 647 | "output_type": "execute_result" 648 | } 649 | ], 650 | "source": [ 651 | "acc.value_counts()" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "## Funciones estadisticas\n", 659 | "A 
modo de ejemplo:" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": 118, 665 | "metadata": { 666 | "colab": { 667 | "base_uri": "https://localhost:8080/" 668 | }, 669 | "id": "H6sP7NYdYgIF", 670 | "outputId": "3c2e07bf-9193-4f9b-d6b8-c0c2656df219" 671 | }, 672 | "outputs": [ 673 | { 674 | "data": { 675 | "text/plain": [ 676 | "15.75" 677 | ] 678 | }, 679 | "execution_count": 118, 680 | "metadata": {}, 681 | "output_type": "execute_result" 682 | } 683 | ], 684 | "source": [ 685 | "acc.mean()" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 119, 691 | "metadata": { 692 | "colab": { 693 | "base_uri": "https://localhost:8080/" 694 | }, 695 | "id": "EZpWA4myYsBW", 696 | "outputId": "18a22017-f18c-4cd1-a417-1eba5a7929c9" 697 | }, 698 | "outputs": [ 699 | { 700 | "data": { 701 | "text/plain": [ 702 | "14.0" 703 | ] 704 | }, 705 | "execution_count": 119, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "acc.median()" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 121, 717 | "metadata": {}, 718 | "outputs": [ 719 | { 720 | "data": { 721 | "text/plain": [ 722 | "count 4.000000\n", 723 | "mean 15.750000\n", 724 | "std 7.274384\n", 725 | "min 9.000000\n", 726 | "25% 12.000000\n", 727 | "50% 14.000000\n", 728 | "75% 17.750000\n", 729 | "max 26.000000\n", 730 | "Name: Mi lista de acciones, dtype: float64" 731 | ] 732 | }, 733 | "execution_count": 121, 734 | "metadata": {}, 735 | "output_type": "execute_result" 736 | } 737 | ], 738 | "source": [ 739 | "acc.describe()" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "[Pandas Documentación de Series](https://pandas.pydata.org/pandas-docs/stable/reference/series.html)" 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": {}, 752 | "source": [ 753 | "by María Belén Camandone" 754 | ] 755 | } 756 | ], 757 | "metadata": { 758 
| "colab": { 759 | "name": "Laboratorio_2_Series", 760 | "provenance": [] 761 | }, 762 | "kernelspec": { 763 | "display_name": "Python 3 (ipykernel)", 764 | "language": "python", 765 | "name": "python3" 766 | }, 767 | "language_info": { 768 | "codemirror_mode": { 769 | "name": "ipython", 770 | "version": 3 771 | }, 772 | "file_extension": ".py", 773 | "mimetype": "text/x-python", 774 | "name": "python", 775 | "nbconvert_exporter": "python", 776 | "pygments_lexer": "ipython3", 777 | "version": "3.11.4" 778 | } 779 | }, 780 | "nbformat": 4, 781 | "nbformat_minor": 1 782 | } 783 | -------------------------------------------------------------------------------- /Series y dataframes/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ![series](https://github.com/bcamandone/Python_Analisis_datos/assets/86261762/8647c5b2-29b7-4e23-abe1-4f84850f3604) 3 | -------------------------------------------------------------------------------- /Tabula/Tabula python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcamandone/Python_Analisis_datos/164c777861caa37db598fff37659e9ac99edc8f2/Tabula/Tabula python.pdf -------------------------------------------------------------------------------- /Tabula/readme.md: -------------------------------------------------------------------------------- 1 | # Tabula 2 | Con la librería Tabula podemos leer tablas de archivos PDF 3 | -------------------------------------------------------------------------------- /Titanic Dataset/Portada/-: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Titanic Dataset/Portada/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcamandone/Python_Analisis_datos/164c777861caa37db598fff37659e9ac99edc8f2/Titanic 
Dataset/Portada/2.png -------------------------------------------------------------------------------- /Titanic Dataset/README.md: -------------------------------------------------------------------------------- 1 | # Titanic 2 | 3 | ![2](https://user-images.githubusercontent.com/86261762/197585477-a58d5c2f-de0e-498d-81ea-055fe0a3e338.png) 4 | 5 | 6 | Sin dudas el Titanic es el dataset que, por excelencia, usamos por primera vez para aprender y adentrarnos al mundo del análisis de datos. 7 | Este repositorio cuenta con 5 notebooks y el archivo csv. 8 | 9 | Notebooks: 10 | 11 | ## 1) Titanic ejercicio: 12 | En el cual se responde a las siguientes preguntas: 13 | a) ¿Cuál era la edad promedio de los pasajeros de cada clase (Pclass)? 14 | b) ¿Cuál fue la tarifa que pagaron en promedio los hombres? 15 | c) ¿Cuánto pagaron en total los pasajeros de primera clase para subir al Titanic? ¿Y los de tercera? 16 | d) ¿Cuántos pasajeros había en cada tipo de clase? 17 | 18 | ## 2) Titanic group by: 19 | En el cual se analizan las distintas posibilidades para agrupar datos. 20 | 21 | 22 | ## 3) EDA - Dataset Titanic: 23 | Se realiza un análisis exploratorio de datos, para responder con estos últimos a preguntas como: ¿sobrevivieron mayor cantidad de hombres? ¿Sobrevivieron en mayor cantidad pasajeros de la primera clase? ¿Qué edad tenían los pasajeros? etc. 24 | 25 | ## 4) Outliers: 26 | Detección de outliers en la columna "Age", a través del método IQR y Z-score 27 | 28 | ## 5) Visualizar datos faltantes con missingno: 29 | Se utiliza la librería missingno, que proporciona una serie de visualizaciones para comprender la presencia y distribución de datos faltantes. 30 | Se pintan: un gráfico de barras, gráfico de matriz, mapa de calor y dendrograma. 
31 | 32 | 33 | Librerías 34 | - pandas 35 | - numpy 36 | - matplotlib 37 | - seaborn 38 | - missingno 39 | -------------------------------------------------------------------------------- /Titanic Dataset/Titanic ejercicio.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b8231186", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "id": "8325f278", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "df = pd.read_csv(\"titanic_train.csv\")" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 12, 27 | "id": "340ac1c5", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "(891, 12)" 34 | ] 35 | }, 36 | "execution_count": 12, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "df.shape" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 5, 48 | "id": "1d4dbb12", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", 164 | "
" 165 | ], 166 | "text/plain": [ 167 | " PassengerId Survived Pclass \\\n", 168 | "0 1 0 3 \n", 169 | "1 2 1 1 \n", 170 | "2 3 1 3 \n", 171 | "3 4 1 1 \n", 172 | "4 5 0 3 \n", 173 | "\n", 174 | " Name Sex Age SibSp \\\n", 175 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 176 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 177 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 178 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 179 | "4 Allen, Mr. William Henry male 35.0 0 \n", 180 | "\n", 181 | " Parch Ticket Fare Cabin Embarked \n", 182 | "0 0 A/5 21171 7.2500 NaN S \n", 183 | "1 0 PC 17599 71.2833 C85 C \n", 184 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 185 | "3 0 113803 53.1000 C123 S \n", 186 | "4 0 373450 8.0500 NaN S " 187 | ] 188 | }, 189 | "execution_count": 5, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "df.head()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "id": "4387d8e7", 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "PassengerId int64\n", 208 | "Survived int64\n", 209 | "Pclass int64\n", 210 | "Name object\n", 211 | "Sex object\n", 212 | "Age float64\n", 213 | "SibSp int64\n", 214 | "Parch int64\n", 215 | "Ticket object\n", 216 | "Fare float64\n", 217 | "Cabin object\n", 218 | "Embarked object\n", 219 | "dtype: object" 220 | ] 221 | }, 222 | "execution_count": 9, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "df.dtypes" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 11, 234 | "id": "03d24511", 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "\n", 242 | "RangeIndex: 891 entries, 0 to 890\n", 243 | "Data columns (total 12 columns):\n", 244 | " # Column Non-Null Count Dtype \n", 245 | "--- ------ 
-------------- ----- \n", 246 | " 0 PassengerId 891 non-null int64 \n", 247 | " 1 Survived 891 non-null int64 \n", 248 | " 2 Pclass 891 non-null int64 \n", 249 | " 3 Name 891 non-null object \n", 250 | " 4 Sex 891 non-null object \n", 251 | " 5 Age 714 non-null float64\n", 252 | " 6 SibSp 891 non-null int64 \n", 253 | " 7 Parch 891 non-null int64 \n", 254 | " 8 Ticket 891 non-null object \n", 255 | " 9 Fare 891 non-null float64\n", 256 | " 10 Cabin 204 non-null object \n", 257 | " 11 Embarked 889 non-null object \n", 258 | "dtypes: float64(2), int64(5), object(5)\n", 259 | "memory usage: 83.7+ KB\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "df.info()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 14, 270 | "id": "57ec56a1", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "0" 277 | ] 278 | }, 279 | "execution_count": 14, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "df.duplicated().sum(axis = 0)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 15, 291 | "id": "dbf3cc0a", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "PassengerId 0\n", 298 | "Survived 0\n", 299 | "Pclass 0\n", 300 | "Name 0\n", 301 | "Sex 0\n", 302 | "Age 177\n", 303 | "SibSp 0\n", 304 | "Parch 0\n", 305 | "Ticket 0\n", 306 | "Fare 0\n", 307 | "Cabin 687\n", 308 | "Embarked 2\n", 309 | "dtype: int64" 310 | ] 311 | }, 312 | "execution_count": 15, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "df.isnull().sum(axis = 0)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 16, 324 | "id": "105b36cd", 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "30" 331 | ] 332 | }, 333 | "execution_count": 16, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | 
], 338 | "source": [ 339 | "media_edad = round(df['Age'].mean()) #Obtenemos la media de la columna edad \n", 340 | "media_edad " 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 17, 346 | "id": "c78bebd1", 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "df['Age'].fillna(media_edad, inplace=True) #Reemplazamos los valores nulos de la columna edad por la media de la edad " 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "id": "178c5fab", 356 | "metadata": {}, 357 | "source": [ 358 | "### Ejercicio: Volviendo al ejemplo del Titanic.\n", 359 | "\n", 360 | "1) ¿Cuál era la edad promedio de los pasajeros de cada clase (Pclass)?\n", 361 | "\n", 362 | "2) ¿Cuál fue la tarifa que pagaron en promedio los hombres?\n", 363 | "\n", 364 | "3) ¿Cuánto pagaron en total los pasajeros de primera clase para subir al Titanic? ¿Y los de tercera?\n", 365 | "\n", 366 | "4) ¿Cuántos pasajeros había en cada tipo de clase?" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "id": "632bcccd", 372 | "metadata": {}, 373 | "source": [ 374 | "\n", 375 | "1) ¿Cuál era la edad promedio de los pasajeros de cada clase (Pclass)?\n" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 19, 381 | "id": "bf23b224", 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "data": { 386 | "text/html": [ 387 | "
\n", 388 | "\n", 401 | "\n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | "
Age
Pclass
137.0
230.0
326.0
\n", 427 | "
" 428 | ], 429 | "text/plain": [ 430 | " Age\n", 431 | "Pclass \n", 432 | "1 37.0\n", 433 | "2 30.0\n", 434 | "3 26.0" 435 | ] 436 | }, 437 | "execution_count": 19, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "round(df[['Pclass', 'Age']].groupby('Pclass').mean())" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "id": "f13d1507", 449 | "metadata": {}, 450 | "source": [ 451 | "2) ¿Cuál fue la tarifa que pagaron en promedio los hombres?" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 20, 457 | "id": "430dea69", 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "25.52" 464 | ] 465 | }, 466 | "execution_count": 20, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "round(df.loc[df['Sex'] =='male','Fare'].mean(),2)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "id": "77eb607f", 478 | "metadata": {}, 479 | "source": [ 480 | "3) ¿Cuánto pagaron en total los pasajeros de primera clase para subir al Titanic? ¿Y los de tercera?" 
481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 21, 486 | "id": "d8f20d10", 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "text/plain": [ 492 | "18177" 493 | ] 494 | }, 495 | "execution_count": 21, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "round(df.loc[df['Pclass'] == 1,'Fare'].sum())" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 22, 507 | "id": "9f0701ec", 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "6715" 514 | ] 515 | }, 516 | "execution_count": 22, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "round(df.loc[df['Pclass'] == 3,'Fare'].sum())" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "id": "cb17bb4c", 528 | "metadata": {}, 529 | "source": [ 530 | "\n", 531 | "4) ¿Cuántos pasajeros había en cada tipo de clase?" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 23, 537 | "id": "a90a5829", 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/html": [ 543 | "
\n", 544 | "\n", 557 | "\n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | "
PassengerId
Pclass
1216
2184
3491
\n", 583 | "
" 584 | ], 585 | "text/plain": [ 586 | " PassengerId\n", 587 | "Pclass \n", 588 | "1 216\n", 589 | "2 184\n", 590 | "3 491" 591 | ] 592 | }, 593 | "execution_count": 23, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "df[['Pclass', 'PassengerId']].groupby('Pclass').count()" 600 | ] 601 | } 602 | ], 603 | "metadata": { 604 | "kernelspec": { 605 | "display_name": "Python 3 (ipykernel)", 606 | "language": "python", 607 | "name": "python3" 608 | }, 609 | "language_info": { 610 | "codemirror_mode": { 611 | "name": "ipython", 612 | "version": 3 613 | }, 614 | "file_extension": ".py", 615 | "mimetype": "text/x-python", 616 | "name": "python", 617 | "nbconvert_exporter": "python", 618 | "pygments_lexer": "ipython3", 619 | "version": "3.11.4" 620 | } 621 | }, 622 | "nbformat": 4, 623 | "nbformat_minor": 5 624 | } 625 | -------------------------------------------------------------------------------- /Titanic Dataset/Titanic group by.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f40fd8c3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "d103538c", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "df = pd.read_csv(\"titanic_train.csv\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "id": "6e5741c8", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
\n", 33 | "\n", 46 | "\n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " PassengerId Survived Pclass \\\n", 146 | "0 1 0 3 \n", 147 | "1 2 1 1 \n", 148 | "2 3 1 3 \n", 149 | "3 4 1 1 \n", 150 | "4 5 0 3 \n", 151 | "\n", 152 | " Name Sex Age SibSp \\\n", 153 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 154 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 155 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 156 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 157 | "4 Allen, Mr. William Henry male 35.0 0 \n", 158 | "\n", 159 | " Parch Ticket Fare Cabin Embarked \n", 160 | "0 0 A/5 21171 7.2500 NaN S \n", 161 | "1 0 PC 17599 71.2833 C85 C \n", 162 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 163 | "3 0 113803 53.1000 C123 S \n", 164 | "4 0 373450 8.0500 NaN S " 165 | ] 166 | }, 167 | "execution_count": 3, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "df.head()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "7194ccdb", 179 | "metadata": {}, 180 | "source": [ 181 | "Group by\n", 182 | "\n", 183 | "Usaremos group by para agrupar los datos según los valores de una o más columnas.\n", 184 | "Este método crea una estructura de datos particular a la que le podemos aplicar diferentes operaciones.\n", 185 | "Group by sólo crea la estructura de datos a la que se le puede aplicar la función de agregación o estadísticas. \n" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 4, 191 | "id": "4c21610b", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/html": [ 197 | "
\n", 198 | "\n", 211 | "\n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | "
PassengerIdSurvivedPclassNameAgeSibSpParchTicketFareCabinEmbarked
Sex
female31431431431426131431431431497312
male577577577577453577577577577107577
\n", 273 | "
" 274 | ], 275 | "text/plain": [ 276 | " PassengerId Survived Pclass Name Age SibSp Parch Ticket Fare \\\n", 277 | "Sex \n", 278 | "female 314 314 314 314 261 314 314 314 314 \n", 279 | "male 577 577 577 577 453 577 577 577 577 \n", 280 | "\n", 281 | " Cabin Embarked \n", 282 | "Sex \n", 283 | "female 97 312 \n", 284 | "male 107 577 " 285 | ] 286 | }, 287 | "execution_count": 4, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "df.groupby('Sex').count()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "id": "11cae7d9", 299 | "metadata": {}, 300 | "source": [ 301 | "Si la operación a realizar solo admite números, se aplica a las columnas numéricas exclusivamente" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 5, 307 | "id": "8628f419", 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/html": [ 313 | "
\n", 314 | "\n", 327 | "\n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | "
PassengerIdSurvivedPclassAgeSibSpParchFare
Sex
female431.0286620.7420382.15923627.9157090.6942680.64968244.479818
male454.1473140.1889082.38994830.7266450.4298090.23570225.523893
\n", 373 | "
" 374 | ], 375 | "text/plain": [ 376 | " PassengerId Survived Pclass Age SibSp Parch \\\n", 377 | "Sex \n", 378 | "female 431.028662 0.742038 2.159236 27.915709 0.694268 0.649682 \n", 379 | "male 454.147314 0.188908 2.389948 30.726645 0.429809 0.235702 \n", 380 | "\n", 381 | " Fare \n", 382 | "Sex \n", 383 | "female 44.479818 \n", 384 | "male 25.523893 " 385 | ] 386 | }, 387 | "execution_count": 5, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | } 391 | ], 392 | "source": [ 393 | "df.groupby('Sex').mean()" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "id": "0ca0b5ca", 399 | "metadata": {}, 400 | "source": [ 401 | "Tres maneras de restringir el resultado a ciertas columnas:\n", 402 | " \n", 403 | "1) Seleccionar la columna de interés, luego de hacer las operaciones.\n", 404 | "2) Seleccionar las columnas necesarias antes de hacer el groupby\n", 405 | "(se debe incluir la columna con la que agrupar).\n", 406 | "3) Seleccionar la columna de interés después de hacer el groupby, \n", 407 | "pero antes de aplicar la función de agregación." 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 10, 413 | "id": "9ca3515a", 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "Sex\n", 420 | "female 28.0\n", 421 | "male 31.0\n", 422 | "Name: Age, dtype: float64" 423 | ] 424 | }, 425 | "execution_count": 10, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "round(df.groupby('Sex').mean()['Age']) #Opción 1 " 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 11, 437 | "id": "99829739", 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/html": [ 443 | "
\n", 444 | "\n", 457 | "\n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | "
Age
Sex
female28.0
male31.0
\n", 479 | "
" 480 | ], 481 | "text/plain": [ 482 | " Age\n", 483 | "Sex \n", 484 | "female 28.0\n", 485 | "male 31.0" 486 | ] 487 | }, 488 | "execution_count": 11, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "round(df[['Sex','Age']].groupby('Sex').mean()) #Opción 2 " 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 12, 500 | "id": "a1b273b9", 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "Sex\n", 507 | "female 28.0\n", 508 | "male 31.0\n", 509 | "Name: Age, dtype: float64" 510 | ] 511 | }, 512 | "execution_count": 12, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "round(df.groupby('Sex')['Age'].mean()) #Opción 3 " 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "id": "31f26f8b", 524 | "metadata": {}, 525 | "source": [ 526 | "En mi caso prefiero utilizar la opción dos porque me parece mas ordenada, \n", 527 | "pero con cualquiera de las 3 llegamos al mismo resultado. " 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "kernelspec": { 533 | "display_name": "Python 3 (ipykernel)", 534 | "language": "python", 535 | "name": "python3" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 3 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython3", 547 | "version": "3.11.4" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 5 552 | } 553 | -------------------------------------------------------------------------------- /Web_scraping_BeautifulSoup/readme.md: -------------------------------------------------------------------------------- 1 | Beautiful Soup es una librería de python que facilita la extracción de información de páginas web. 
2 | --------------------------------------------------------------------------------