├── README.md ├── movies.dat └── 01 INTRODUCTION TO VARIABLES.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # PRE-PROCESSING-DATA 2 | -------------------------------------------------------------------------------- /movies.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ssilvacris/PRE-PROCESSING-DATA/main/movies.dat -------------------------------------------------------------------------------- /01 INTRODUCTION TO VARIABLES.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": { 5 | "IT%20Logo.png": { 6 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAABwCAMAAAC6s4C9AAAAzFBMVEXaAH/////XAHT/6fb/2O7ZAHveB4nldqreD4fyi8LZAHrjHJDXAHf/6/XaAH350ub/zOffJov5nc/xiLz9stjeL4vuq8n4wt7/4fHqlrzrca7jNpT5qdT1qtH32uboiLTwudH3u9ryZbPzlMb88fb/+fzoJpn/8fjkU5zxgL3xmMXqTaP/5PL/0ertZ67iKpD1er7wS6rqeLPlX6HsVantcrTnNZriPpb5ttjkYqHofrHursrzn8rpWaPWAGzscrD3gMH0brj8mc/fRZJGHTsEAAALsElEQVR4nO2ce1viuhaHU21rKSWCMJZDZe9B6HCnI15AB47jzPf/Trs3IGslvahny8ST3/P4h22aJnmzkpWVFFI7VZJaNUJOlKQWISZRklqmQii7FELppRBKL4VQeimE0kshlF4KofRSCKWXQii9FELppRBKL4VQeimE0kshlF4KofRSCKWXQii9FELppRBKL4VQeimE0kshlF4KofRSCKWXQii9FELppRBKL4VQeimE0kshlF5HRUhZlUlMLMsiRlHa1+T6CVQOoQXEXchSUa6BF+zlNfKSUmp5o8ezZaVSWZ7d/m6QHDYNNtdAUAhRSam4Y5StZFHF8e3c9K9rxXIIrbsKo3uTWtNKGf3ONQJjurli9Cs7JSVep9KdD7VEp+5mObKMjLTBTzbXiy1OR6djvqTLs87IsziKxnVxJR/Ch6wXcOkZv3IFX7kNvoP/xys2sQcTX+f2bVISYeNCY3RepVZfK6O/7bxcjTVM7WWm9JatIcpa7z9awg5CO6cg4SVOYDdxXrEGeqv/NAqgeTtrUVKo67CSX+bgUu0aVtyawEd+BD14YcKa2sYH9ypFZng8hPQbaspKRjrruSXKvNYXGrm1gcnqU5TKbtYyC6xvOisWoiN8M5AfI6zDi3PYvRZtePvypAOvzLf79MYU3mqzFirU8RA6E9jbtLawu9FVf5CRvXvLT4nGVoeJhs1XIAyNcb0MmDK+EeFwzLyUrnromUvHQqbGmGEX3PHvC+kcD6E1w8mvRcm2uP6MZgt+MP0bJ+oFME0+wrDRutt92rciDG2HmYKbqK9ql7axPQdX9maIjbD1pdCpPhpC+zs3Jbl8KmOhizLeqTZGlsv3eG32/DqEYXsud7m+GaG/PhRsNMfPXNqEohl5Z4YWNMLhlhTqaAgpVzFN44Z9ivokp+E95EOXvK/Sh
ZiLER56xpsRarVFWjDa2HDPRAgD6Cm1b5PCLeDY1G8Ur2yPhZDeCPzCK5zI45oGq33HvoM2rvgk+gKsK0ogDBkmad+OUKtX08ZHHnKkEGHY2WAxrmJYlgsutlclYhPHQmh38QQRNUeAUhX79L7LLpvorcBq/Q18cwmEuwHsHQiH/eR1/MieIkRDZjsyW/seGiGeJ/4khLQqbMfvoNMZixLvGI6Zl+AFWCIX9OVSCDU3eCdCbTaNCmZVBF01Qkhs5NFEZoiM0C1jhMdC6FSEKwUd9DqT81l5DdynwzBJR8KBt9Z8PcJklfoehIlHg9c4sWKEYX8DbTBbULoEg+5wUQLgmxHe1929wNjlnx9u1DuZnciCPXCvKZPG+I7vtqO36ky39s8nHttNOPc90ZpdVyCEw+iXIAXPnAY8wh9fef2IQgcihOGESmmjKypPgpCuoM11GybMZlPCl3kzQmJV9zKBxzVbmodbmSO5vRAGucJFHOt5ICOsrRcPYabb7r5jz7pTA6yhM+bOdofhjBD2ms3m02Wvzpnmgkf4X0egqABChNp5QJ+FFp8gJMYYdqZn+L/O9uf/PUJmQ8f5D2qtEjs9his2FzZQSm/gndOxFWdt0EWyHhn0libwNWkno2OAdQVCOCa2bTuW13TR0N5rcAhPsuojRjicIDvbKUWIFxZwPCvny5C3I2SaBCHM2EFgRR8yp6OLfd42Wh+M9xkbXsjQr08CNNXi8OhBc2a/AiFMo8jU8SawA/gej9DgFT8tRqidZowKKUJiw1Ap7NZuYXA01VEQGpssI9SG++IYsEVB6GZRa29WBrJyKnQdksbpM+UVIox2tK5gqe44hLcdXvHjGQi1jGruEBIcKmUbojg4muoYCGkVdL4eqMb1LqpRhY1xw5gcbYw7fITbqrAPzIDD1PMO1p2BMBzX4ED2k0PY5qV/y54LM7VHaEwz3Lpw7C8OjqY6BkJ7zBrYALacvkMIp8IZqhD/Fujg+RvgndaW+wcyERIH7olclFlU+O9CSGjWCmdWIjia6hgILRAebRG4+r1JcjeuQUttnMJsl2w27QWMLh8cmmyE9Ab6qo0SCAfvRFjtiZMUbvQedASExh1oqWtSBQhbqcf9wl70X3IPAJBoGATuj2taYEl2CJTmIPwG7tQ/ACExlqJFqTbPP7MCdASE9hqYS0Bt4Hr71aRuL+DidSHCW7YthhPDgHHkzQ7Vn4UQ7y6lxcenb/L08QhpgJvQvga1+Bnn8FqEMOY3n1K0dejuDmD8+wgH3D5aDfrKLEKhR1Nio/egj0cInQY/nLapCQaTWVK1F/aaPy5ASOHUF8Un4TJxfwAjDyEox5sR/rVAUQJ/DWPKl6AyfS4gMSsXHE314QhpAHqdG8UBHbgiu4vyh+6MVkeREdxLrSZog2XYSPYjcHV3gdJshPY1uHNVxiMVIrRQZLS9hcdBAELqcSGcMhu9B304QnTg4j4K1dkPAGE9yh8tKobQQzNX0ENFo2Y9MMJsYfxqdwAjB2ELFGP81kXFXydTMHCGE3MOQkKxRzPHh+7y9fFWCMKj/k0SKocR7eh0LTFhU70wzGijX38yQTuA8KjfjWPtZh8gSR2aTIT2FDblC4dw0+W0qYoQWmP2zW7g5CJs9GBVm+UXFJE+GqEB22ng9mKhQwgRLgdiPTeZt94PtVp3cVjfowMqfjvJ9RwgTAOlWQjtAMXe+RgpMaumCf+SpxFCh3jMADBbUDsPIbHhOdP6KxYUkT4cociHxhpEuOxf8OLVjhhNj9a2D1tNdFtid9jvx+0tQhiO5WQBA33a/M07FSFCZqt3GL42HyGtgtnwKvijEaLwaJaihTx9gNf89YNhU2rYVj/Nw59PvHi7DoVHs5QESrnNJscxq6vxGhfsO+ERUoHECA/u1Twk8jqE3h+N0OE9aJHacR2xZdW7y+l0MXEPWQzcZrTjRPmjmiIN4zMaCOH5169fXXfOxUhOPR4hs9HNSIhwH7GtPYcv/
UwIzczIPFQUKOUPXviz+VyHa65aL9ruWZbLtRvwCLMkOjvTcwXqOSKEhCQnQgdxcPYTIbTvhBFBXuto2jFzT3Kn8tcmCo9mq/1ol0WoewKE4gKcCBHSRnya7jwZvD8PQtIr4czEit6BVvdi1e5D96aUXWnJsfdSCP0kPlIG4UCMMBlK002uz4OQPpRwHBP9ipuh2LoGE1L2SGSo+oiWQ5g4r+9CSMhzzW+la9FPg9DpZn1nxmkYR2gC8fGhg/xegxqjUl5unLxZDuEmPSL+LoThUHo+TVrj0yCkwV+gJm6LVQ+2TLw1QVcFuzj1lYHCo9opyHUN7X7t0WKE/mR3yP99Vki8Xbj60yC04TlJ/9YDgp6OHudDp7kMo2giCo/6vQDkCkMJtWUxQv0Q4HonwkPNPwtC9MXAugGWyA6KxsSB0pBh9nzod+N48D2ANLt32FxtdKxtYzn5CP2r20OJFUIo4w4OamhTjH6DTbNOwlk0GGdMdbVK5Pjj79HmqD4m+o7v8SQPYa11xoa3FEIoAxy40HT82Y6BfJc0rk3J7YVgNVm7SL7Splv4KckE7Q3TJ0isTzIR1taVRw/sRH5WhHoewtNMhHQKjfAX/j0VGy0DK7uK0sZjH+HVN4t0WxSWTdPxXhv2iOorwY+WzHS31X96XOFfninzoyXx0h4E+GZFCH9ghKCIrX8DIfl9xugRbmdRePNLZiaNDpvwif+ZGesMaHSoCbVWZ5Wuq+vDtq67V5XOqLFrbGsEHrrl9tpQgsfGCpQjeWq08izCfwRCR1xSXtFTt+AVPILfTxk1S4oIHh9ltqBY5RBS9isCXMDcm5kJRSmN7ATRL3itRqPH7Wj0OwDGUpgrTkDhhfhi1jc8fFL+2TJtUFDI0k0okkS/hPh/9dt4r5BECJXEUgill0IovRRC6aUQSi+FUHophNJLIZReCqH0Ugill0IovRRC6aUQSi+FUHophNJLIZReCqH0Ugill0IovRRC6aUQSi+FUHophNJLIZReCqH0Ugill0IovRRC6aUQSi+FUHophNIrBHiiJLUI0ZUk1z9dUGPVsW+BpQAAAABJRU5ErkJggg==" 7 | } 8 | }, 9 | "cell_type": "markdown", 10 | "metadata": {}, 11 | "source": [ 12 | "![IT%20Logo.png](attachment:IT%20Logo.png)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "
Introduction to Variables
\n", 20 | "\n", 21 | "\\begin{align*}Alex\\:Kumenius\\end{align*}\n", 22 | "\\begin{align*}Business\\hspace{2mm}Intelligence\\hspace{2mm}and\\hspace{2mm}Data\\hspace{2mm}Scientist\\hspace{2mm}Project\\hspace{2mm}Integrator\\end{align*}\n", 23 | "$%$ \n", 24 | "\\begin{align*}Date : Gener\\hspace{2mm}2021\\end{align*}
" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# TYPES OF VARIABLES" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "Generally, in Math and Statistics, variables may be Numerical or Categorical Variables.\n", 39 | "\n", 40 | "A Variable is a quantity whose value changes. " 41 | ] 42 | }, 43 | { 44 | "attachments": { 45 | "Types%20of%20Variables.jpg": { 46 | "image/jpeg": "" 47 | } 48 | }, 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "![Types%20of%20Variables.jpg](attachment:Types%20of%20Variables.jpg)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "### NUMERICAL" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "A Numerical variable can take a wide range of numerical values, and it is sensible to add, subtract, or take averages with those values. On the other hand, we would not classify a variable reporting \"telephone area codes\" as numerical since it makes **no** sense to *average, sum*, or *difference* them." 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "#### Discrete Variables" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "A Discrete variable is a variable whose value is obtained by counting. \n", 81 | "\n", 82 | "Over a particular range of real values ($\\:\\mathbb {R}\\:$), a discrete variable is one for which, for any value in the range that the variable is permitted to take on, there is a positive minimum distance to the nearest other permissible value. The number of permitted values is either finite or countably infinite. \n", 83 | "\n", 84 | "Common examples are variables that must be integers, non-negative integers, positive integers, or only the integers 0 and 1."
85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Examples: \n", 92 | "- number of students present\n", 93 | "- number of red marbles in a jar\n", 94 | "- number of heads when flipping three coins\n", 95 | "- students’ grade level" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "#### Continuous Variables" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "A Continuous variable is a variable whose value is obtained by measuring." 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "A Continuous variable is one which can take on infinitely many, uncountable values.\n", 117 | "\n", 118 | "For example, a variable over a non-empty range of the real numbers ($\\:\\mathbb {R}\\:$) between ***a*** and ***b*** is continuous if it can take on *any value in that range*. The reason is that any range of real numbers between ***a*** and ***b*** with $\\hspace{3mm}{\\displaystyle a,b\\in \\mathbb {R} ;\\hspace{4mm}a\\neq b}$\n", 119 | "is infinite and uncountable." 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Examples: \n", 127 | "- height of students in class\n", 128 | "- weight of students in class\n", 129 | "- time it takes to get to school\n", 130 | "- distance traveled between classes" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "### CATEGORICAL" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "A Categorical Variable takes on a limited, and usually fixed, number of possible values (categories); the possible values are called the variable's levels."
145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Categorical Variables where their levels have a natural order are \"Ordinal Variables\". \n", 152 | "\n", 153 | "1- The ``categories`` are deduced from the data . \n", 154 | "2- The ``categories`` are unordered . \n", 155 | "\n", 156 | "Examples are *gender, social class, blood type, country affiliation, observation time or rating via Likert scales*. " 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "A Categorical Variable without this type of special ordering is called a \"Nominal Variable\"." 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Categorical data might have an order (e.g. ‘strongly agree’ vs ‘agree’ or ‘first observation’ vs. ‘second observation’), but **numerical operations** (additions, divisions, …) are not possible." 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "#### Categorical - \"Nominal Variables\"" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "ExecuteTime": { 185 | "end_time": "2021-01-26T19:52:05.746971Z", 186 | "start_time": "2021-01-26T19:52:05.742982Z" 187 | } 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "import os\n", 192 | "import pandas as pd\n", 193 | "import numpy as np" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "ExecuteTime": { 201 | "end_time": "2021-01-26T18:43:52.984928Z", 202 | "start_time": "2021-01-26T18:43:52.474293Z" 203 | } 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "df = pd.DataFrame({\"A\": [\"a\", \"b\", \"c\", \"a\"]})\n", 208 | "df" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "ExecuteTime": { 216 | "end_time": "2020-10-07T19:05:33.105215Z", 217 |
"start_time": "2020-10-07T19:05:33.098229Z" 218 | } 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "type(df)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "ExecuteTime": { 230 | "end_time": "2020-10-07T19:05:44.793545Z", 231 | "start_time": "2020-10-07T19:05:44.784570Z" 232 | } 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "df.dtypes" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "ExecuteTime": { 244 | "end_time": "2020-10-07T19:09:05.228799Z", 245 | "start_time": "2020-10-07T19:09:05.208853Z" 246 | } 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "# passing astype('category'), as the default behavior\n", 251 | "df[\"B\"] = df[\"A\"].astype('category')\n", 252 | "df" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "ExecuteTime": { 260 | "end_time": "2020-10-07T19:09:24.257645Z", 261 | "start_time": "2020-10-07T19:09:24.251662Z" 262 | } 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "type(df)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "ExecuteTime": { 274 | "end_time": "2020-10-07T19:09:37.949002Z", 275 | "start_time": "2020-10-07T19:09:37.940032Z" 276 | } 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "df.dtypes" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "ExecuteTime": { 288 | "end_time": "2020-10-07T19:09:55.301924Z", 289 | "start_time": "2020-10-07T19:09:55.292946Z" 290 | } 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "df['A']" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "ExecuteTime": { 302 | "end_time": "2020-10-07T19:10:04.229271Z", 303 | "start_time": "2020-10-07T19:10:04.218298Z" 304 | } 305 | }, 306 | "outputs": [], 307 | "source": [
308 | "df['B']" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "#### Categorical - \"Ordinal Variables\"" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "How do we control the behavior of a Categorical - \"Ordinal Variable\"?" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "ExecuteTime": { 330 | "end_time": "2020-10-07T19:18:25.445046Z", 331 | "start_time": "2020-10-07T19:18:25.441060Z" 332 | } 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "from pandas.api.types import CategoricalDtype" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "ExecuteTime": { 344 | "end_time": "2020-10-07T19:18:27.155759Z", 345 | "start_time": "2020-10-07T19:18:27.143791Z" 346 | } 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "s = pd.Series([\"Wednesday\", \"Monday\", \"Thursday\", \"Sunday\", \"Friday\"])\n", 351 | "s\n", 352 | "s.sort_values(inplace=True)\n", 353 | "s" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "ExecuteTime": { 361 | "end_time": "2020-10-07T19:18:28.398988Z", 362 | "start_time": "2020-10-07T19:18:28.384027Z" 363 | } 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "sc = pd.Series([\"Wednesday\", \"Saturday\", \"Monday\", \"Sunday\", \"Thursday\", \"Tuesday\", \"Friday\"], \n", 368 | " dtype=\"category\")\n", 369 | "sc\n", 370 | "sc.sort_values(inplace=True)\n", 371 | "sc" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": { 378 | "ExecuteTime": { 379 | "end_time": "2020-10-07T19:18:30.706258Z", 380 | "start_time": "2020-10-07T19:18:30.697284Z" 381 | } 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "cat_s = CategoricalDtype(categories=[\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\",
\"Saturday\", \"Sunday\"], \n", 386 | " ordered=True)\n", 387 | "cat_s" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": { 394 | "ExecuteTime": { 395 | "end_time": "2020-10-07T19:19:37.669725Z", 396 | "start_time": "2020-10-07T19:19:37.658755Z" 397 | } 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "s_cat =s.astype(cat_s)\n", 402 | "s_cat.sort_values(inplace=True)\n", 403 | "s_cat" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "The categorical data type is useful in the following ``cases``:\n", 411 | "\n", 412 | "- A string variable consisting of only a few different values. \n", 413 | "Converting such a string variable to a categorical variable will save some memory. \n", 414 | "$%$\n", 415 | "- The lexical order of a variable is not the same as the logical order (“one”, “two”, “three”). \n", 416 | "By converting to a categorical and specifying an order on the categories, sorting and min/max will use the logical order instead of the lexical order. \n", 417 | "$%$\n", 418 | "- As a signal to other Python libraries that this column should be treated as a categorical variable \n", 419 | "e.g. to use suitable statistical methods or plot types." 
420 | ] 421 | }, 422 | { 423 | "attachments": { 424 | "IT%20Logo.png": { 425 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAABwCAMAAAC6s4C9AAAAzFBMVEXaAH/////XAHT/6fb/2O7ZAHveB4nldqreD4fyi8LZAHrjHJDXAHf/6/XaAH350ub/zOffJov5nc/xiLz9stjeL4vuq8n4wt7/4fHqlrzrca7jNpT5qdT1qtH32uboiLTwudH3u9ryZbPzlMb88fb/+fzoJpn/8fjkU5zxgL3xmMXqTaP/5PL/0ertZ67iKpD1er7wS6rqeLPlX6HsVantcrTnNZriPpb5ttjkYqHofrHursrzn8rpWaPWAGzscrD3gMH0brj8mc/fRZJGHTsEAAALsElEQVR4nO2ce1viuhaHU21rKSWCMJZDZe9B6HCnI15AB47jzPf/Trs3IGslvahny8ST3/P4h22aJnmzkpWVFFI7VZJaNUJOlKQWISZRklqmQii7FELppRBKL4VQeimE0kshlF4KofRSCKWXQii9FELppRBKL4VQeimE0kshlF4KofRSCKWXQii9FELppRBKL4VQeimE0kshlF4KofRSCKWXQii9FELppRBKL4VQeimE0kshlF5HRUhZlUlMLMsiRlHa1+T6CVQOoQXEXchSUa6BF+zlNfKSUmp5o8ezZaVSWZ7d/m6QHDYNNtdAUAhRSam4Y5StZFHF8e3c9K9rxXIIrbsKo3uTWtNKGf3ONQJjurli9Cs7JSVep9KdD7VEp+5mObKMjLTBTzbXiy1OR6djvqTLs87IsziKxnVxJR/Ch6wXcOkZv3IFX7kNvoP/xys2sQcTX+f2bVISYeNCY3RepVZfK6O/7bxcjTVM7WWm9JatIcpa7z9awg5CO6cg4SVOYDdxXrEGeqv/NAqgeTtrUVKo67CSX+bgUu0aVtyawEd+BD14YcKa2sYH9ypFZng8hPQbaspKRjrruSXKvNYXGrm1gcnqU5TKbtYyC6xvOisWoiN8M5AfI6zDi3PYvRZtePvypAOvzLf79MYU3mqzFirU8RA6E9jbtLawu9FVf5CRvXvLT4nGVoeJhs1XIAyNcb0MmDK+EeFwzLyUrnromUvHQqbGmGEX3PHvC+kcD6E1w8mvRcm2uP6MZgt+MP0bJ+oFME0+wrDRutt92rciDG2HmYKbqK9ql7axPQdX9maIjbD1pdCpPhpC+zs3Jbl8KmOhizLeqTZGlsv3eG32/DqEYXsud7m+GaG/PhRsNMfPXNqEohl5Z4YWNMLhlhTqaAgpVzFN44Z9ivokp+E95EOXvK/ShZiLER56xpsRarVFWjDa2HDPRAgD6Cm1b5PCLeDY1G8Ur2yPhZDeCPzCK5zI45oGq33HvoM2rvgk+gKsK0ogDBkmad+OUKtX08ZHHnKkEGHY2WAxrmJYlgsutlclYhPHQmh38QQRNUeAUhX79L7LLpvorcBq/Q18cwmEuwHsHQiH/eR1/MieIkRDZjsyW/seGiGeJ/4khLQqbMfvoNMZixLvGI6Zl+AFWCIX9OVSCDU3eCdCbTaNCmZVBF01Qkhs5NFEZoiM0C1jhMdC6FSEKwUd9DqT81l5DdynwzBJR8KBt9Z8PcJklfoehIlHg9c4sWKEYX8DbTBbULoEg+5wUQLgmxHe1929wNjlnx9u1DuZnciCPXCvKZPG+I7vtqO36ky39s8nHttNOPc90ZpdVyCEw+iXIAXPnAY8wh9fef2IQgcihOGESmmjKypPgpCuoM11GybMZlPCl3kzQmJV9zKBxzVbmodbmSO5vRAGucJFHOt5ICOsrRcPYabb7r5jz7pTA6yhM+bOdofhjBD2ms3m02Wvzpnmgkf4X0egqABChNp5QJ+FFp8gJMYYdqZn+L/O9uf/PUJmQ8f5D2qtEjs9his2FzZQSm/gndOxFWdt0EWyHhn0libwNWkno2OAdQV
COCa2bTuW13TR0N5rcAhPsuojRjicIDvbKUWIFxZwPCvny5C3I2SaBCHM2EFgRR8yp6OLfd42Wh+M9xkbXsjQr08CNNXi8OhBc2a/AiFMo8jU8SawA/gej9DgFT8tRqidZowKKUJiw1Ap7NZuYXA01VEQGpssI9SG++IYsEVB6GZRa29WBrJyKnQdksbpM+UVIox2tK5gqe44hLcdXvHjGQi1jGruEBIcKmUbojg4muoYCGkVdL4eqMb1LqpRhY1xw5gcbYw7fITbqrAPzIDD1PMO1p2BMBzX4ED2k0PY5qV/y54LM7VHaEwz3Lpw7C8OjqY6BkJ7zBrYALacvkMIp8IZqhD/Fujg+RvgndaW+wcyERIH7olclFlU+O9CSGjWCmdWIjia6hgILRAebRG4+r1JcjeuQUttnMJsl2w27QWMLh8cmmyE9Ab6qo0SCAfvRFjtiZMUbvQedASExh1oqWtSBQhbqcf9wl70X3IPAJBoGATuj2taYEl2CJTmIPwG7tQ/ACExlqJFqTbPP7MCdASE9hqYS0Bt4Hr71aRuL+DidSHCW7YthhPDgHHkzQ7Vn4UQ7y6lxcenb/L08QhpgJvQvga1+Bnn8FqEMOY3n1K0dejuDmD8+wgH3D5aDfrKLEKhR1Nio/egj0cInQY/nLapCQaTWVK1F/aaPy5ASOHUF8Un4TJxfwAjDyEox5sR/rVAUQJ/DWPKl6AyfS4gMSsXHE314QhpAHqdG8UBHbgiu4vyh+6MVkeREdxLrSZog2XYSPYjcHV3gdJshPY1uHNVxiMVIrRQZLS9hcdBAELqcSGcMhu9B304QnTg4j4K1dkPAGE9yh8tKobQQzNX0ENFo2Y9MMJsYfxqdwAjB2ELFGP81kXFXydTMHCGE3MOQkKxRzPHh+7y9fFWCMKj/k0SKocR7eh0LTFhU70wzGijX38yQTuA8KjfjWPtZh8gSR2aTIT2FDblC4dw0+W0qYoQWmP2zW7g5CJs9GBVm+UXFJE+GqEB22ng9mKhQwgRLgdiPTeZt94PtVp3cVjfowMqfjvJ9RwgTAOlWQjtAMXe+RgpMaumCf+SpxFCh3jMADBbUDsPIbHhOdP6KxYUkT4cociHxhpEuOxf8OLVjhhNj9a2D1tNdFtid9jvx+0tQhiO5WQBA33a/M07FSFCZqt3GL42HyGtgtnwKvijEaLwaJaihTx9gNf89YNhU2rYVj/Nw59PvHi7DoVHs5QESrnNJscxq6vxGhfsO+ERUoHECA/u1Twk8jqE3h+N0OE9aJHacR2xZdW7y+l0MXEPWQzcZrTjRPmjmiIN4zMaCOH5169fXXfOxUhOPR4hs9HNSIhwH7GtPYcv/UwIzczIPFQUKOUPXviz+VyHa65aL9ruWZbLtRvwCLMkOjvTcwXqOSKEhCQnQgdxcPYTIbTvhBFBXuto2jFzT3Kn8tcmCo9mq/1ol0WoewKE4gKcCBHSRnya7jwZvD8PQtIr4czEit6BVvdi1e5D96aUXWnJsfdSCP0kPlIG4UCMMBlK002uz4OQPpRwHBP9ipuh2LoGE1L2SGSo+oiWQ5g4r+9CSMhzzW+la9FPg9DpZn1nxmkYR2gC8fGhg/xegxqjUl5unLxZDuEmPSL+LoThUHo+TVrj0yCkwV+gJm6LVQ+2TLw1QVcFuzj1lYHCo9opyHUN7X7t0WKE/mR3yP99Vki8Xbj60yC04TlJ/9YDgp6OHudDp7kMo2giCo/6vQDkCkMJtWUxQv0Q4HonwkPNPwtC9MXAugGWyA6KxsSB0pBh9nzod+N48D2ANLt32FxtdKxtYzn5CP2r20OJFUIo4w4OamhTjH6DTbNOwlk0GGdMdbVK5Pjj79HmqD4m+o7v8SQPYa11xoa3FEIoAxy40HT82Y6BfJc0rk3J7YVgNVm7SL7Splv4KckE7Q3TJ0isTzIR1taVRw/sRH5WhHoewtNMhHQKjfAX/j0VGy0DK7uK0sZjH+HVN4t0WxSWTdP
xXhv2iOorwY+WzHS31X96XOFfninzoyXx0h4E+GZFCH9ghKCIrX8DIfl9xugRbmdRePNLZiaNDpvwif+ZGesMaHSoCbVWZ5Wuq+vDtq67V5XOqLFrbGsEHrrl9tpQgsfGCpQjeWq08izCfwRCR1xSXtFTt+AVPILfTxk1S4oIHh9ltqBY5RBS9isCXMDcm5kJRSmN7ATRL3itRqPH7Wj0OwDGUpgrTkDhhfhi1jc8fFL+2TJtUFDI0k0okkS/hPh/9dt4r5BECJXEUgill0IovRRC6aUQSi+FUHophNJLIZReCqH0Ugill0IovRRC6aUQSi+FUHophNJLIZReCqH0Ugill0IovRRC6aUQSi+FUHophNJLIZReCqH0Ugill0IovRRC6aUQSi+FUHophNIrBHiiJLUI0ZUk1z9dUGPVsW+BpQAAAABJRU5ErkJggg==" 426 | } 427 | }, 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "![IT%20Logo.png](attachment:IT%20Logo.png)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "# Detectando y Filtrando Outliers" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "Filtrar o transformar Outliers - Valores Atípicos es en gran medida la aplicación de operaciones de ``matriz - arrays``." 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "ExecuteTime": { 453 | "end_time": "2021-01-26T19:38:04.526327Z", 454 | "start_time": "2021-01-26T19:38:04.521341Z" 455 | } 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "data = pd.DataFrame(np.random.randn(1000, 4))" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "ExecuteTime": { 467 | "end_time": "2021-01-26T19:38:09.397310Z", 468 | "start_time": "2021-01-26T19:38:09.386337Z" 469 | } 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "data.shape" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": { 480 | "ExecuteTime": { 481 | "end_time": "2021-01-26T19:38:11.196500Z", 482 | "start_time": "2021-01-26T19:38:11.186527Z" 483 | } 484 | }, 485 | "outputs": [], 486 | "source": [ 487 | "data.info()" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": { 494 | 
"ExecuteTime": { 495 | "end_time": "2021-01-26T19:38:15.867015Z", 496 | "start_time": "2021-01-26T19:38:15.851058Z" 497 | } 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "data.head()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "ExecuteTime": { 509 | "end_time": "2021-01-26T19:38:19.857350Z", 510 | "start_time": "2021-01-26T19:38:19.824441Z" 511 | } 512 | }, 513 | "outputs": [], 514 | "source": [ 515 | "data.describe()" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "Queremos encontrar, en una de las columnas, los valores que excedan de 3 en ``valor absoluto``." 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": { 529 | "ExecuteTime": { 530 | "end_time": "2021-01-26T19:38:31.166125Z", 531 | "start_time": "2021-01-26T19:38:31.161139Z" 532 | } 533 | }, 534 | "outputs": [], 535 | "source": [ 536 | "col = data[1]" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "metadata": { 543 | "ExecuteTime": { 544 | "end_time": "2021-01-26T19:38:33.032140Z", 545 | "start_time": "2021-01-26T19:38:33.020170Z" 546 | } 547 | }, 548 | "outputs": [], 549 | "source": [ 550 | "col[np.abs(col) > 3]" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "Para seleccionar todas las observaciones - cases con valores que excedan los límites o rangos ``3`` o ``-3``, usaremos el método any() en un DataFrame booleano : " 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "ExecuteTime": { 565 | "end_time": "2021-01-26T19:38:46.240833Z", 566 | "start_time": "2021-01-26T19:38:46.217894Z" 567 | } 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "data[(np.abs(data) > 3).any(axis=1)]" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578
| "Los valores se pueden establecer en función de estos criterios. \n", 579 | " \n", 580 | "Si queremos limitar los valores del intervalo a ``–3`` a ``3``:" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": { 587 | "ExecuteTime": { 588 | "end_time": "2021-01-26T19:38:59.391682Z", 589 | "start_time": "2021-01-26T19:38:59.375726Z" 590 | } 591 | }, 592 | "outputs": [], 593 | "source": [ 594 | "data[np.abs(data) > 3] = np.sign(data) * 3" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "np.sign(data)" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": { 610 | "ExecuteTime": { 611 | "end_time": "2021-01-26T19:42:20.168046Z", 612 | "start_time": "2021-01-26T19:42:20.153085Z" 613 | } 614 | }, 615 | "outputs": [], 616 | "source": [ 617 | "data.head()" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": { 624 | "ExecuteTime": { 625 | "end_time": "2021-01-26T19:40:33.842241Z", 626 | "start_time": "2021-01-26T19:40:33.767435Z" 627 | } 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "# observamos 'min' y 'max' en el resumen estadístico,\n", 632 | "# no supera el intervalo -3 y 3\n", 633 | "data.describe()" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "La declaración np.sign(data) produce valores ``1`` y ``-1``, basandose si los valores en data son positivos o negativos :" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": { 647 | "ExecuteTime": { 648 | "end_time": "2021-01-26T19:41:44.333824Z", 649 | "start_time": "2021-01-26T19:41:44.317867Z" 650 | } 651 | }, 652 | "outputs": [], 653 | "source": [ 654 | "np.sign(data).head()" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": null, 660 | 
"metadata": { 661 | "ExecuteTime": { 662 | "end_time": "2021-01-26T19:42:47.525923Z", 663 | "start_time": "2021-01-26T19:42:47.505976Z" 664 | } 665 | }, 666 | "outputs": [], 667 | "source": [ 668 | "(np.sign(data) * 3).head()" 669 | ] 670 | }, 671 | { 672 | "cell_type": "markdown", 673 | "metadata": {}, 674 | "source": [ 675 | "# Computing Indicator/Dummy Variables" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "Otro tipo de transformación para aplicaciones de Statistical Modeling o Machine Learning es convertir una variable Categórica en una matriz \"dummy / ficticia\" o \"indicator / indicadora\". \n", 683 | "\n", 684 | "Si una columna en un DataFrame tiene k valores distintos, derivaría una matriz o DataFrame con k columnas que contienen todos los 1s y 0s. Pandas tiene una función get_dummies() para hacerlo :" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "## pd.get_dummies() method" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "metadata": { 698 | "ExecuteTime": { 699 | "end_time": "2021-01-26T19:45:31.598953Z", 700 | "start_time": "2021-01-26T19:45:31.584992Z" 701 | } 702 | }, 703 | "outputs": [], 704 | "source": [ 705 | "df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n", 706 | " 'data1': range(6)})\n", 707 | "df" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": { 714 | "ExecuteTime": { 715 | "end_time": "2021-01-26T19:46:19.737984Z", 716 | "start_time": "2021-01-26T19:46:19.732995Z" 717 | } 718 | }, 719 | "outputs": [], 720 | "source": [ 721 | "type(df)" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": null, 727 | "metadata": { 728 | "ExecuteTime": { 729 | "end_time": "2021-01-26T19:48:54.368489Z", 730 | "start_time": "2021-01-26T19:48:54.362506Z" 731 | } 732 | }, 733 | "outputs": [], 734 | "source": [ 735 | 
"df.shape" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": { 742 | "ExecuteTime": { 743 | "end_time": "2021-01-26T19:46:34.025999Z", 744 | "start_time": "2021-01-26T19:46:34.011036Z" 745 | } 746 | }, 747 | "outputs": [], 748 | "source": [ 749 | "df_dummies = pd.get_dummies(df['key'])\n", 750 | "df_dummies" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": { 757 | "ExecuteTime": { 758 | "end_time": "2021-01-26T19:49:23.562460Z", 759 | "start_time": "2021-01-26T19:49:23.556482Z" 760 | } 761 | }, 762 | "outputs": [], 763 | "source": [ 764 | "df_dummies.shape" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "Podemos agregar un prefijo a las columnas en el dummy DataFrame, que luego se puede combinar con los otros datos. get_dummies() tiene un argumento prefix para hacer esto:" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": { 778 | "ExecuteTime": { 779 | "end_time": "2021-01-26T19:47:30.770852Z", 780 | "start_time": "2021-01-26T19:47:30.756890Z" 781 | } 782 | }, 783 | "outputs": [], 784 | "source": [ 785 | "dummies = pd.get_dummies(df['key'], prefix = 'cod')\n", 786 | "dummies" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": null, 792 | "metadata": { 793 | "ExecuteTime": { 794 | "end_time": "2021-01-26T19:49:42.793060Z", 795 | "start_time": "2021-01-26T19:49:42.787075Z" 796 | } 797 | }, 798 | "outputs": [], 799 | "source": [ 800 | "dummies.shape" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": { 807 | "ExecuteTime": { 808 | "end_time": "2021-01-26T19:47:58.184658Z", 809 | "start_time": "2021-01-26T19:47:58.167706Z" 810 | } 811 | }, 812 | "outputs": [], 813 | "source": [ 814 | "df_con_dummies = df[['data1']].join(dummies)\n", 815 | "df_con_dummies" 816 | ] 817 | }, 818 | { 819 | 
"cell_type": "code", 820 | "execution_count": null, 821 | "metadata": { 822 | "ExecuteTime": { 823 | "end_time": "2021-01-26T19:50:01.634701Z", 824 | "start_time": "2021-01-26T19:50:01.628716Z" 825 | } 826 | }, 827 | "outputs": [], 828 | "source": [ 829 | "df_con_dummies.shape" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "metadata": { 836 | "ExecuteTime": { 837 | "end_time": "2021-01-26T19:52:16.802424Z", 838 | "start_time": "2021-01-26T19:52:16.796440Z" 839 | } 840 | }, 841 | "outputs": [], 842 | "source": [ 843 | "os.getcwd()" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "ExecuteTime": { 851 | "end_time": "2021-01-26T19:52:30.322285Z", 852 | "start_time": "2021-01-26T19:52:30.231530Z" 853 | } 854 | }, 855 | "outputs": [], 856 | "source": [ 857 | "os.chdir('D:\\\\Documents\\\\Python\\\\Python for Data Analysis-Pandas Jupyter Notebook\\\\pydata-Notebooks\\\\datasets\\\\movielens')" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "metadata": { 864 | "ExecuteTime": { 865 | "end_time": "2021-01-26T19:52:43.313563Z", 866 | "start_time": "2021-01-26T19:52:43.307579Z" 867 | } 868 | }, 869 | "outputs": [], 870 | "source": [ 871 | "os.listdir()" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": null, 877 | "metadata": { 878 | "ExecuteTime": { 879 | "end_time": "2021-01-26T19:53:05.057445Z", 880 | "start_time": "2021-01-26T19:53:05.051462Z" 881 | } 882 | }, 883 | "outputs": [], 884 | "source": [ 885 | "mcabecera = ['movie_id', 'titulo', 'genero']\n", 886 | "mcabecera" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": null, 892 | "metadata": { 893 | "ExecuteTime": { 894 | "end_time": "2021-01-26T19:53:25.872811Z", 895 | "start_time": "2021-01-26T19:53:25.778064Z" 896 | } 897 | }, 898 | "outputs": [], 899 | "source": [ 900 | "movies = pd.read_table('movies.dat', sep = '::', 
header = None, names = mcabecera)\n", 901 | "movies.head()" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": null, 907 | "metadata": { 908 | "ExecuteTime": { 909 | "end_time": "2021-01-26T19:53:48.110373Z", 910 | "start_time": "2021-01-26T19:53:48.104392Z" 911 | } 912 | }, 913 | "outputs": [], 914 | "source": [ 915 | "movies.shape" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": null, 921 | "metadata": { 922 | "ExecuteTime": { 923 | "end_time": "2021-01-26T19:54:26.564593Z", 924 | "start_time": "2021-01-26T19:54:26.545644Z" 925 | } 926 | }, 927 | "outputs": [], 928 | "source": [ 929 | "movies.describe()" 930 | ] 931 | }, 932 | { 933 | "cell_type": "markdown", 934 | "metadata": {}, 935 | "source": [ 936 | "Agregar dummy variables para cada género requiere un poco de transformación. \n", 937 | "\n", 938 | "Primero, extraemos la lista de géneros únicos en el dataset:" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": null, 944 | "metadata": { 945 | "ExecuteTime": { 946 | "end_time": "2021-01-26T19:55:22.492109Z", 947 | "start_time": "2021-01-26T19:55:22.486126Z" 948 | } 949 | }, 950 | "outputs": [], 951 | "source": [ 952 | "todos_generos = []\n", 953 | "todos_generos" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": { 960 | "ExecuteTime": { 961 | "end_time": "2021-01-26T19:55:36.133648Z", 962 | "start_time": "2021-01-26T19:55:36.125669Z" 963 | } 964 | }, 965 | "outputs": [], 966 | "source": [ 967 | "for x in movies.genero:\n", 968 | " todos_generos.extend(x.split('|'))" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": null, 974 | "metadata": { 975 | "ExecuteTime": { 976 | "end_time": "2021-01-26T19:56:10.333382Z", 977 | "start_time": "2021-01-26T19:56:10.327401Z" 978 | } 979 | }, 980 | "outputs": [], 981 | "source": [ 982 | "type(todos_generos)" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | 
"execution_count": null, 988 | "metadata": { 989 | "ExecuteTime": { 990 | "end_time": "2021-01-26T19:55:49.812087Z", 991 | "start_time": "2021-01-26T19:55:49.806104Z" 992 | } 993 | }, 994 | "outputs": [], 995 | "source": [ 996 | "todos_generos[:8]" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": null, 1002 | "metadata": { 1003 | "ExecuteTime": { 1004 | "end_time": "2021-01-26T19:56:33.090558Z", 1005 | "start_time": "2021-01-26T19:56:33.084574Z" 1006 | } 1007 | }, 1008 | "outputs": [], 1009 | "source": [ 1010 | "len(todos_generos)" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": null, 1016 | "metadata": { 1017 | "ExecuteTime": { 1018 | "end_time": "2021-01-26T19:56:52.893627Z", 1019 | "start_time": "2021-01-26T19:56:52.884651Z" 1020 | } 1021 | }, 1022 | "outputs": [], 1023 | "source": [ 1024 | "generos = pd.unique(todos_generos)\n", 1025 | "generos" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": null, 1031 | "metadata": { 1032 | "ExecuteTime": { 1033 | "end_time": "2021-01-26T19:57:06.461364Z", 1034 | "start_time": "2021-01-26T19:57:06.456376Z" 1035 | } 1036 | }, 1037 | "outputs": [], 1038 | "source": [ 1039 | "len(generos)" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "metadata": { 1046 | "ExecuteTime": { 1047 | "end_time": "2021-01-26T19:57:44.381011Z", 1048 | "start_time": "2021-01-26T19:57:44.369044Z" 1049 | } 1050 | }, 1051 | "outputs": [], 1052 | "source": [ 1053 | "movies.head(10)" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "markdown", 1058 | "metadata": {}, 1059 | "source": [ 1060 | "Para construir un Dummy DataFrame, se empieza creando una matriz/array 'zeros', para finalmente crear un DaFrame de 'zeros' :" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": null, 1066 | "metadata": { 1067 | "ExecuteTime": { 1068 | "end_time": "2021-01-26T19:58:32.461502Z", 1069 | 
"start_time": "2021-01-26T19:58:32.455517Z" 1070 | } 1071 | }, 1072 | "outputs": [], 1073 | "source": [ 1074 | "len(movies)" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "metadata": { 1081 | "ExecuteTime": { 1082 | "end_time": "2021-01-26T19:58:46.455099Z", 1083 | "start_time": "2021-01-26T19:58:46.449115Z" 1084 | } 1085 | }, 1086 | "outputs": [], 1087 | "source": [ 1088 | "cero_matriz = np.zeros((len(movies), len(generos)))\n", 1089 | "cero_matriz.shape" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "code", 1094 | "execution_count": null, 1095 | "metadata": { 1096 | "ExecuteTime": { 1097 | "end_time": "2021-01-26T19:59:55.580340Z", 1098 | "start_time": "2021-01-26T19:59:55.573359Z" 1099 | } 1100 | }, 1101 | "outputs": [], 1102 | "source": [ 1103 | "cero_matriz" 1104 | ] 1105 | }, 1106 | { 1107 | "cell_type": "code", 1108 | "execution_count": null, 1109 | "metadata": { 1110 | "ExecuteTime": { 1111 | "end_time": "2021-01-26T19:59:19.859815Z", 1112 | "start_time": "2021-01-26T19:59:19.848844Z" 1113 | } 1114 | }, 1115 | "outputs": [], 1116 | "source": [ 1117 | "sum(cero_matriz)" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "code", 1122 | "execution_count": null, 1123 | "metadata": { 1124 | "ExecuteTime": { 1125 | "end_time": "2021-01-26T19:59:37.990355Z", 1126 | "start_time": "2021-01-26T19:59:37.979386Z" 1127 | } 1128 | }, 1129 | "outputs": [], 1130 | "source": [ 1131 | "len(sum(cero_matriz))" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "execution_count": null, 1137 | "metadata": { 1138 | "ExecuteTime": { 1139 | "end_time": "2021-01-26T20:00:12.919390Z", 1140 | "start_time": "2021-01-26T20:00:12.891466Z" 1141 | } 1142 | }, 1143 | "outputs": [], 1144 | "source": [ 1145 | "dummies = pd.DataFrame(cero_matriz, columns = generos)\n", 1146 | "dummies.head()" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "code", 1151 | "execution_count": null, 1152 | "metadata": { 1153 | "ExecuteTime": { 1154 | 
"end_time": "2021-01-26T20:00:33.202711Z", 1155 | "start_time": "2021-01-26T20:00:33.192738Z" 1156 | } 1157 | }, 1158 | "outputs": [], 1159 | "source": [ 1160 | "dummies.sum()" 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "code", 1165 | "execution_count": null, 1166 | "metadata": { 1167 | "ExecuteTime": { 1168 | "end_time": "2021-01-26T20:01:12.865319Z", 1169 | "start_time": "2021-01-26T20:01:12.762592Z" 1170 | } 1171 | }, 1172 | "outputs": [], 1173 | "source": [ 1174 | "dummies.describe()" 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "markdown", 1179 | "metadata": {}, 1180 | "source": [ 1181 | "Ahora, iteramos cada película y configuramos las entradas en cada fila de dummies a 1. Para hacer esto, usamos dummies.columns para calcular los índices de columna para cada género:" 1182 | ] 1183 | }, 1184 | { 1185 | "cell_type": "code", 1186 | "execution_count": null, 1187 | "metadata": { 1188 | "ExecuteTime": { 1189 | "end_time": "2021-01-26T20:02:06.012470Z", 1190 | "start_time": "2021-01-26T20:02:06.005490Z" 1191 | } 1192 | }, 1193 | "outputs": [], 1194 | "source": [ 1195 | "dummies.columns" 1196 | ] 1197 | }, 1198 | { 1199 | "cell_type": "code", 1200 | "execution_count": null, 1201 | "metadata": { 1202 | "ExecuteTime": { 1203 | "end_time": "2021-01-26T20:02:36.185479Z", 1204 | "start_time": "2021-01-26T20:02:36.178496Z" 1205 | } 1206 | }, 1207 | "outputs": [], 1208 | "source": [ 1209 | "gen = movies.genero[0]\n", 1210 | "gen" 1211 | ] 1212 | }, 1213 | { 1214 | "cell_type": "code", 1215 | "execution_count": null, 1216 | "metadata": { 1217 | "ExecuteTime": { 1218 | "end_time": "2021-01-26T20:02:57.306554Z", 1219 | "start_time": "2021-01-26T20:02:57.301568Z" 1220 | } 1221 | }, 1222 | "outputs": [], 1223 | "source": [ 1224 | "gen.split('|')" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": null, 1230 | "metadata": { 1231 | "ExecuteTime": { 1232 | "end_time": "2021-01-26T20:03:10.256940Z", 1233 | "start_time": 
"2021-01-26T20:03:10.249960Z" 1234 | } 1235 | }, 1236 | "outputs": [], 1237 | "source": [ 1238 | "dummies.columns.get_indexer(gen.split('|'))" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "markdown", 1243 | "metadata": {}, 1244 | "source": [ 1245 | "Ahora podemos utilizar .iloc para establecer valores basados en estos índices:" 1246 | ] 1247 | }, 1248 | { 1249 | "cell_type": "code", 1250 | "execution_count": null, 1251 | "metadata": { 1252 | "ExecuteTime": { 1253 | "end_time": "2021-01-26T20:04:18.141497Z", 1254 | "start_time": "2021-01-26T20:04:15.569373Z" 1255 | } 1256 | }, 1257 | "outputs": [], 1258 | "source": [ 1259 | "for i, gen in enumerate(movies.genero):\n", 1260 | " indices = dummies.columns.get_indexer(gen.split('|'))\n", 1261 | " dummies.iloc[i, indices] = 1" 1262 | ] 1263 | }, 1264 | { 1265 | "cell_type": "code", 1266 | "execution_count": null, 1267 | "metadata": { 1268 | "ExecuteTime": { 1269 | "end_time": "2021-01-26T20:04:26.117181Z", 1270 | "start_time": "2021-01-26T20:04:26.104216Z" 1271 | } 1272 | }, 1273 | "outputs": [], 1274 | "source": [ 1275 | "movies.head()" 1276 | ] 1277 | }, 1278 | { 1279 | "cell_type": "code", 1280 | "execution_count": null, 1281 | "metadata": { 1282 | "ExecuteTime": { 1283 | "end_time": "2021-01-26T20:04:43.303247Z", 1284 | "start_time": "2021-01-26T20:04:43.272330Z" 1285 | } 1286 | }, 1287 | "outputs": [], 1288 | "source": [ 1289 | "dummies.head()" 1290 | ] 1291 | }, 1292 | { 1293 | "cell_type": "code", 1294 | "execution_count": null, 1295 | "metadata": { 1296 | "ExecuteTime": { 1297 | "end_time": "2021-01-26T20:05:18.217925Z", 1298 | "start_time": "2021-01-26T20:05:18.208948Z" 1299 | } 1300 | }, 1301 | "outputs": [], 1302 | "source": [ 1303 | "dummies.sum()" 1304 | ] 1305 | }, 1306 | { 1307 | "cell_type": "markdown", 1308 | "metadata": {}, 1309 | "source": [ 1310 | "Finalmente, podemos combinar 'dummies' con 'movies':" 1311 | ] 1312 | }, 1313 | { 1314 | "cell_type": "code", 1315 | "execution_count": null, 1316 | 
"metadata": { 1317 | "ExecuteTime": { 1318 | "end_time": "2021-01-26T20:05:58.156177Z", 1319 | "start_time": "2021-01-26T20:05:58.116286Z" 1320 | } 1321 | }, 1322 | "outputs": [], 1323 | "source": [ 1324 | "movies_dummies = movies.join(dummies.add_prefix('Genero_'))\n", 1325 | "movies_dummies.head()" 1326 | ] 1327 | }, 1328 | { 1329 | "cell_type": "code", 1330 | "execution_count": null, 1331 | "metadata": { 1332 | "ExecuteTime": { 1333 | "end_time": "2021-01-26T20:06:25.775396Z", 1334 | "start_time": "2021-01-26T20:06:25.767415Z" 1335 | } 1336 | }, 1337 | "outputs": [], 1338 | "source": [ 1339 | "movies_dummies.iloc[1]" 1340 | ] 1341 | }, 1342 | { 1343 | "cell_type": "code", 1344 | "execution_count": null, 1345 | "metadata": { 1346 | "ExecuteTime": { 1347 | "end_time": "2021-01-26T20:07:41.326460Z", 1348 | "start_time": "2021-01-26T20:07:41.322473Z" 1349 | } 1350 | }, 1351 | "outputs": [], 1352 | "source": [ 1353 | "np.random.seed(12345)" 1354 | ] 1355 | }, 1356 | { 1357 | "cell_type": "code", 1358 | "execution_count": null, 1359 | "metadata": { 1360 | "ExecuteTime": { 1361 | "end_time": "2021-01-26T20:07:43.477712Z", 1362 | "start_time": "2021-01-26T20:07:43.469733Z" 1363 | } 1364 | }, 1365 | "outputs": [], 1366 | "source": [ 1367 | "values = np.random.rand(10)\n", 1368 | "values" 1369 | ] 1370 | }, 1371 | { 1372 | "cell_type": "code", 1373 | "execution_count": null, 1374 | "metadata": { 1375 | "ExecuteTime": { 1376 | "end_time": "2021-01-26T20:08:01.157459Z", 1377 | "start_time": "2021-01-26T20:08:01.151471Z" 1378 | } 1379 | }, 1380 | "outputs": [], 1381 | "source": [ 1382 | "bins = [0, 0.2, 0.4, 0.6, 0.8, 1]\n", 1383 | "bins" 1384 | ] 1385 | }, 1386 | { 1387 | "cell_type": "code", 1388 | "execution_count": null, 1389 | "metadata": { 1390 | "ExecuteTime": { 1391 | "end_time": "2021-01-26T20:08:13.478527Z", 1392 | "start_time": "2021-01-26T20:08:13.453592Z" 1393 | } 1394 | }, 1395 | "outputs": [], 1396 | "source": [ 1397 | "pd.get_dummies(pd.cut(values, bins))" 
1398 | ] 1399 | } 1400 | ], 1401 | "metadata": { 1402 | "kernelspec": { 1403 | "display_name": "Python 3", 1404 | "language": "python", 1405 | "name": "python3" 1406 | }, 1407 | "language_info": { 1408 | "codemirror_mode": { 1409 | "name": "ipython", 1410 | "version": 3 1411 | }, 1412 | "file_extension": ".py", 1413 | "mimetype": "text/x-python", 1414 | "name": "python", 1415 | "nbconvert_exporter": "python", 1416 | "pygments_lexer": "ipython3", 1417 | "version": "3.7.0" 1418 | }, 1419 | "latex_envs": { 1420 | "LaTeX_envs_menu_present": true, 1421 | "autoclose": false, 1422 | "autocomplete": true, 1423 | "bibliofile": "biblio.bib", 1424 | "cite_by": "apalike", 1425 | "current_citInitial": 1, 1426 | "eqLabelWithNumbers": true, 1427 | "eqNumInitial": 1, 1428 | "hotkeys": { 1429 | "equation": "Ctrl-E", 1430 | "itemize": "Ctrl-I" 1431 | }, 1432 | "labels_anchors": false, 1433 | "latex_user_defs": false, 1434 | "report_style_numbering": false, 1435 | "user_envs_cfg": false 1436 | }, 1437 | "toc": { 1438 | "base_numbering": 1, 1439 | "nav_menu": {}, 1440 | "number_sections": true, 1441 | "sideBar": true, 1442 | "skip_h1_title": false, 1443 | "title_cell": "Table of Contents", 1444 | "title_sidebar": "Contents", 1445 | "toc_cell": false, 1446 | "toc_position": { 1447 | "height": "calc(100% - 180px)", 1448 | "left": "10px", 1449 | "top": "150px", 1450 | "width": "165px" 1451 | }, 1452 | "toc_section_display": true, 1453 | "toc_window_display": false 1454 | }, 1455 | "varInspector": { 1456 | "cols": { 1457 | "lenName": 16, 1458 | "lenType": 16, 1459 | "lenVar": 40 1460 | }, 1461 | "kernels_config": { 1462 | "python": { 1463 | "delete_cmd_postfix": "", 1464 | "delete_cmd_prefix": "del ", 1465 | "library": "var_list.py", 1466 | "varRefreshCmd": "print(var_dic_list())" 1467 | }, 1468 | "r": { 1469 | "delete_cmd_postfix": ") ", 1470 | "delete_cmd_prefix": "rm(", 1471 | "library": "var_list.r", 1472 | "varRefreshCmd": "cat(var_dic_list()) " 1473 | } 1474 | }, 1475 | 
"types_to_exclude": [ 1476 | "module", 1477 | "function", 1478 | "builtin_function_or_method", 1479 | "instance", 1480 | "_Feature" 1481 | ], 1482 | "window_display": false 1483 | } 1484 | }, 1485 | "nbformat": 4, 1486 | "nbformat_minor": 2 1487 | } 1488 | --------------------------------------------------------------------------------