├── .gitignore
├── LICENSE
├── README.md
├── ch01-missing-data-imputation
│   ├── Recipe-01-Removing-observations-with-missing-data.ipynb
│   ├── Recipe-02-Performing-mean-or-median-imputation.ipynb
│   ├── Recipe-03-Imputing-categorical-variables.ipynb
│   ├── Recipe-04-Replacing-missing-values-with-an-arbitrary-number.ipynb
│   ├── Recipe-05-Finding-extreme-values-for-imputation.ipynb
│   ├── Recipe-06-Marking-imputed-values.ipynb
│   ├── Recipe-07-Performing-multivariate-imputation-by-chained-equations.ipynb
│   ├── Recipe-08-Estimating-missing-data-with-K-nearest-neighbours.ipynb
│   └── donwload-prepare-store-credit-approval-dataset.ipynb
├── ch02-categorical-encoding
│   ├── Recipe-01-One-hot-encoding.ipynb
│   ├── Recipe-02-One-hot-encoding-frequent-categories.ipynb
│   ├── Recipe-03-Replacing-categories-by-counts-frequency.ipynb
│   ├── Recipe-04-Ordinal-encoding.ipynb
│   ├── Recipe-05-Ordered-ordinal-encoding.ipynb
│   ├── Recipe-06-Target-mean-encoding.ipynb
│   ├── Recipe-07-Weight-of-evidence.ipynb
│   ├── Recipe-08-Grouping-rare-categories.ipynb
│   ├── Recipe-09-Binary-Encoding.ipynb
│   └── donwload-prepare-store-credit-approval-dataset.ipynb
├── ch03-variable-transformation
│   ├── Recipe-1-logarithmic-transformation.ipynb
│   ├── Recipe-2-reciprocal-transformation.ipynb
│   ├── Recipe-3-square-root-transformation.ipynb
│   ├── Recipe-4-power-transformation.ipynb
│   ├── Recipe-5-Box-Cox-transformation.ipynb
│   └── Recipe-6-Yeo-Johnson-transformation.ipynb
├── ch04-discretization
│   ├── Recipe-1-Equal-width-discretization.ipynb
│   ├── Recipe-2-Equal-frequency-discretisation.ipynb
│   ├── Recipe-3-User-defined-interval-discretization.ipynb
│   ├── Recipe-4-Discretization-k-means.ipynb
│   ├── Recipe-5-Binarization.ipynb
│   ├── Recipe-6-Discretization-with-decision-trees.ipynb
│   └── donwload-prepare-store-enron-data.ipynb
├── ch05-outliers
│   ├── Recipe-1-Visualizing-outliers-with-boxplots.ipynb
│   ├── Recipe-2-Finding-outliers-with-mean-and-std.ipynb
│   ├── Recipe-3-Finding-outliers-with-the-IQR.ipynb
│   ├── Recipe-4-Removing-outliers.ipynb
│   ├── Recipe-5-Capping-outliers.ipynb
│   └── Recipe-6-Capping-outliers-with-quantiles.ipynb
├── ch06-datetime
│   ├── Recipe-1-Extracting-features-from-dates-with-pandas.ipynb
│   ├── Recipe-2-Extracting-features-from-time-with-pandas.ipynb
│   ├── Recipe-3-Capturing-elapsed-time-between-2-variables.ipynb
│   ├── Recipe-4-Working-with-different-time-zones.ipynb
│   └── Recipe-5-Automating-datetime-features-with-Feature-engine.ipynb
├── ch07-scaling
│   ├── Recipe-1-standardization.ipynb
│   ├── Recipe-2-min-max-scaling.ipynb
│   ├── Recipe-3-robust-scaling.ipynb
│   ├── Recipe-4-mean-normalization.ipynb
│   ├── Recipe-5-maximum-absolute-scaling.ipynb
│   └── Recipe-6-scaling-to-unit-length.ipynb
├── ch08-creation
│   ├── Cyclical-features-figures.ipynb
│   ├── Recipe1-Combine-features-with-functions.ipynb
│   ├── Recipe2-Comparing-features-to-reference-variable.ipynb
│   ├── Recipe3-PolynomialExpansion.ipynb
│   ├── Recipe4-Combining-features-with-trees.ipynb
│   ├── Recipe5-Periodic-features.ipynb
│   ├── Recipe6-Spline-features.ipynb
│   ├── Spline-features-figures.ipynb
│   └── polynomial_features_figures.ipynb
├── ch09-featuretools
│   ├── Recipe1-Setting-up-an-entitity-set.ipynb
│   ├── Recipe2-Creating-features-with-cumulative-primitives.ipynb
│   ├── Recipe3-Combining-numerical-features.ipynb
│   ├── Recipe4-Creating-features-from-datetime.ipynb
│   ├── Recipe5-Extracting-features-from-text.ipynb
│   ├── Recipe6-Creating-features-with-aggregation-primitives.ipynb
│   └── prepare-retail-dataset.ipynb
├── ch10-tsfresh
│   ├── Recipe1-extract-features-automatically-with-tsfresh.ipynb
│   ├── Recipe2-extract-relevant-features-with-tsfresh.ipynb
│   ├── Recipe3-extract-specific-features-with-tsfresh.ipynb
│   ├── Recipe4-extract-features-after-feature-selection.ipynb
│   ├── Recipe5-extract-features-automatically-within-pipeline.ipynb
│   └── prepare-occupancy-dataset.ipynb
├── ch11-text
│   ├── Recipe1-Capturing-text-complexity-in-features.ipynb
│   ├── Recipe2-Sentence-tokenization.ipynb
│   ├── Recipe3-bag-of-words.ipynb
│   ├── Recipe4-TFIDF.ipynb
│   └── Recipe5-cleaning-text.ipynb
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | *.csv
3 | *.data
4 | *.txt
5 | *.gz
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Packt
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | ### [Packt Conference : Put Generative AI to work on Oct 11-13 (Virtual)](https://packt.link/JGIEY)
3 | 
4 | 

[![Packt Conference](https://hub.packtpub.com/wp-content/uploads/2023/08/put-generative-ai-to-work-packt.png)](https://packt.link/JGIEY)

5 | 3 Days, 20+ AI Experts, 25+ Workshops and Power Talks
6 | 
7 | Code: USD75OFF
8 | 
9 | 
10 | 
11 | 
12 | # Python Feature Engineering Cookbook-Second Edition
13 | 
14 | Python Feature Engineering Cookbook-Second Edition
15 | 
16 | This is the code repository for [Python Feature Engineering Cookbook-Second Edition](https://www.packtpub.com/product/python-feature-engineering-cookbook-second-edition/9781804611302), published by Packt.
17 | 
18 | **Over 70 recipes for creating, engineering, and transforming features to build machine learning models**
19 | 
20 | ## What is this book about?
21 | Feature engineering, the process of transforming variables and creating features, albeit time-consuming, ensures that your machine learning models perform seamlessly. This second edition of Python Feature Engineering Cookbook will take the struggle out of feature engineering by showing you how to use open source Python libraries to accelerate the process via a plethora of practical, hands-on recipes.
22 | 
23 | This updated edition begins by addressing fundamental data challenges such as missing data and categorical values, before moving on to strategies for dealing with skewed distributions and outliers. The concluding chapters show you how to develop new features from various types of data, including text, time series, and relational databases. With the help of numerous open source Python libraries, you'll learn how to implement each feature engineering method in a performant, reproducible, and elegant manner.
24 | 
25 | By the end of this Python book, you will have the tools and expertise needed to confidently build end-to-end and reproducible feature engineering pipelines that can be deployed into production.
26 | 
27 | This book covers the following exciting features:
28 | * Impute missing data using various univariate and multivariate methods
29 | * Encode categorical variables with one-hot, ordinal, and count encoding
30 | * Handle highly cardinal categorical variables
31 | * Transform, discretize, and scale your variables
32 | * Create variables from date and time with pandas and Feature-engine
33 | * Combine variables into new features
34 | * Extract features from text as well as from transactional data with Featuretools
35 | * Create features from time series data with tsfresh
36 | 
37 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1804611301) today!
38 | 
39 | https://www.packtpub.com/
40 | 
41 | ## Instructions and Navigations
42 | All of the code is organized into folders.
43 | 
44 | The code will look like the following:
45 | ```
46 | X_train = pd.DataFrame(
47 | X_train,
48 | columns=numeric_vars + remaining_vars,
49 | )
50 | ```
51 | 
52 | **Following is what you need for this book:**
53 | This book is for machine learning and data science students and professionals, as well as software engineers working on machine learning model deployment, who want to learn more about how to transform their data and create new features to train machine learning models in a better way.
54 | 
55 | With the following software and hardware list you can run all code files present in the book (Chapter 1-11).
56 | 
57 | ### Software and Hardware List
58 | 
59 | | Chapter | Software required | OS required |
60 | | -------- | -------------------------------------------------------------------------------------| -----------------------------------|
61 | | 1-11 | Python 3.3 or greater | Windows, Mac OS, or Linux |
62 | | 1-11 | Jupyter Notebook | Windows, Mac OS, or Linux |
63 | 
64 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://packt.link/UXyxc).
65 | 
66 | ## Errata
67 | 
68 | * Page 332 : **Scikit-learn dataset website: z** should be **Scikit-learn dataset website: https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset**
69 | 
70 | ### Related products
71 | * Data Cleaning and Exploration with Machine Learning [[Packt]](https://www.packtpub.com/product/data-cleaning-and-exploration-with-machine-learning/9781803241678) [[Amazon]](https://www.amazon.com/dp/1803241675)
72 | 
73 | * Production-Ready Applied Deep Learning [[Packt]](https://www.packtpub.com/product/production-ready-applied-deep-learning/9781803243665) [[Amazon]](https://www.amazon.com/dp/180324366X)
74 | 
75 | ## Get to Know the Author
76 | **Soledad Galli** is a data scientist, instructor, and software developer with more than 10 years of experience in world-class academic institutions and renowned businesses. She has developed and put into production machine learning models to assess insurance claims and credit risk and prevent fraud. She teaches multiple online courses on machine learning, which have enrolled 44,000+ students worldwide and consistently receive good student reviews. She is also the developer and maintainer of the open source Python library Feature-engine, which is currently downloaded 100,000+ times per month. Soledad received a Data Science Leaders Award in 2018 and was recognized as one of LinkedIn's voices in data science and analytics in 2019.
77 | 
--------------------------------------------------------------------------------
/ch01-missing-data-imputation/Recipe-05-Finding-extreme-values-for-imputation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Finding extreme values for imputation\n",
8 | "\n",
9 | "In this recipe, we will replace missing values by a value at the end of the distribution, estimated with a Gaussian approximation or the inter-quartile range proximity rule, utilizing pandas and Feature-engine."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "\n",
20 | "# to split the datasets:\n",
21 | "from sklearn.model_selection import train_test_split\n",
22 | "\n",
23 | "# to impute missing data with Feature-engine:\n",
24 | "from feature_engine.imputation import EndTailImputer"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Load data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/html": [
42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | "
A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15target
0b30.830.000ugwv1.25tt1fg202.001
1a58.674.460ugqh3.04tt6fg43.05601
2a24.50NaNugqhNaNNaNNaN0fg280.08241
3b27.831.540ugwv3.75tt5tg100.031
4b20.175.625ugwv1.71tf0fs120.001
\n", 176 | "
" 177 | ], 178 | "text/plain": [ 179 | " A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 \\\n", 180 | "0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 \n", 181 | "1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 \n", 182 | "2 a 24.50 NaN u g q h NaN NaN NaN 0 f g 280.0 824 \n", 183 | "3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 \n", 184 | "4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 \n", 185 | "\n", 186 | " target \n", 187 | "0 1 \n", 188 | "1 1 \n", 189 | "2 1 \n", 190 | "3 1 \n", 191 | "4 1 " 192 | ] 193 | }, 194 | "execution_count": 2, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "data = pd.read_csv(\"credit_approval_uci.csv\")\n", 201 | "\n", 202 | "data.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Select numerical variables" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 3, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# We exclude the target variable:\n", 219 | "\n", 220 | "numeric_vars = [\n", 221 | " var for var in data.select_dtypes(exclude=\"O\").columns.to_list() if var != \"target\"\n", 222 | "]" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Split data into train and test" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 4, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "((483, 6), (207, 6))" 241 | ] 242 | }, 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "X_train, X_test, y_train, y_test = train_test_split(\n", 250 | " data[numeric_vars],\n", 251 | " data[\"target\"],\n", 252 | " test_size=0.3,\n", 253 | " random_state=0,\n", 254 | ")\n", 255 | "\n", 256 | "X_train.shape, X_test.shape" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "## Find inter-quartile range" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 5, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "A2 16.4200\n", 275 | "A3 6.5825\n", 276 | "A8 2.8350\n", 277 | "A11 3.0000\n", 278 | "A14 212.0000\n", 279 | "A15 450.0000\n", 280 | "dtype: float64" 281 | ] 282 | }, 283 | "execution_count": 5, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "IQR = X_train.quantile(0.75) - X_train.quantile(0.25)\n", 290 | "\n", 291 | "IQR" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Find values beyond the right end of the distribution" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 6, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "{'A2': 63.550000000000004,\n", 310 | " 'A3': 17.43625,\n", 311 | " 'A8': 7.2524999999999995,\n", 312 | " 'A11': 7.5,\n", 313 | " 'A14': 590.0,\n", 314 | " 'A15': 1125.0}" 315 | ] 316 | }, 317 | "execution_count": 6, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "imputation_dict = (X_train.quantile(0.75) + 1.5 * IQR).to_dict()\n", 324 | "\n", 325 | "imputation_dict" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 7, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "# Replace missing data with estimated 
values:\n", 335 | "\n", 336 | "X_train = X_train.fillna(value=imputation_dict)\n", 337 | "X_test = X_test.fillna(value=imputation_dict)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "## Find imputation values with mean and standard deviation" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 8, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": [ 355 | "((483, 6), (207, 6))" 356 | ] 357 | }, 358 | "execution_count": 8, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "# Split the data:\n", 365 | "\n", 366 | "X_train, X_test, y_train, y_test = train_test_split(\n", 367 | " data[numeric_vars],\n", 368 | " data[\"target\"],\n", 369 | " test_size=0.3,\n", 370 | " random_state=0,\n", 371 | ")\n", 372 | "\n", 373 | "X_train.shape, X_test.shape" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 9, 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "data": { 383 | "text/plain": [ 384 | "{'A2': 68.35771260807589,\n", 385 | " 'A3': 19.98993346546277,\n", 386 | " 'A8': 12.418567732660225,\n", 387 | " 'A11': 18.320547522636247,\n", 388 | " 'A14': 710.6258760585449,\n", 389 | " 'A15': 12740.850618383225}" 390 | ] 391 | }, 392 | "execution_count": 9, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "imputation_dict = (X_train.mean() + 3 * X_train.std()).to_dict()\n", 399 | "\n", 400 | "imputation_dict" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 10, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "# Replace missing data with estimated values:\n", 410 | "\n", 411 | "X_train = X_train.fillna(value=imputation_dict)\n", 412 | "X_test = X_test.fillna(value=imputation_dict)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "## End tail imputation with Feature-engine" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 11, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "# Let's separate into train and test sets:\n", 429 | "\n", 430 | "X_train, X_test, y_train, y_test = train_test_split(\n", 431 | " data[numeric_vars],\n", 432 | " data[\"target\"],\n", 433 | " test_size=0.3,\n", 434 | " random_state=0,\n", 435 | ")" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 12, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "EndTailImputer(imputation_method='iqr')" 447 | ] 448 | }, 449 | "execution_count": 12, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "# Set up the imputer to find extreme values based of\n", 456 | "# the inter-quartile range proximity rule, placing\n", 457 | "# estimates at the right tail, using 3 times the IQR:\n", 458 | "\n", 459 | "imputer = EndTailImputer(\n", 460 | " imputation_method=\"iqr\",\n", 461 | " tail=\"right\",\n", 462 | " fold=3,\n", 463 | " variables=None,\n", 464 | ")\n", 465 | "\n", 466 | "imputer.fit(X_train)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 13, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "data": { 476 | "text/plain": [ 477 | "{'A2': 88.18,\n", 478 | " 'A3': 27.31,\n", 479 | " 'A8': 11.504999999999999,\n", 480 | " 'A11': 12.0,\n", 481 | " 'A14': 908.0,\n", 482 | " 'A15': 1800.0}" 483 | ] 484 | }, 485 | 
"execution_count": 13, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "# The values to use for the imputation:\n", 492 | "\n", 493 | "imputer.imputer_dict_" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 14, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "# Replace missing data:\n", 503 | "\n", 504 | "X_train = imputer.transform(X_train)\n", 505 | "X_test = imputer.transform(X_test)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [] 514 | } 515 | ], 516 | "metadata": { 517 | "kernelspec": { 518 | "display_name": "fenotebook", 519 | "language": "python", 520 | "name": "fenotebook" 521 | }, 522 | "language_info": { 523 | "codemirror_mode": { 524 | "name": "ipython", 525 | "version": 3 526 | }, 527 | "file_extension": ".py", 528 | "mimetype": "text/x-python", 529 | "name": "python", 530 | "nbconvert_exporter": "python", 531 | "pygments_lexer": "ipython3", 532 | "version": "3.8.2" 533 | }, 534 | "toc": { 535 | "base_numbering": 1, 536 | "nav_menu": {}, 537 | "number_sections": true, 538 | "sideBar": true, 539 | "skip_h1_title": false, 540 | "title_cell": "Table of Contents", 541 | "title_sidebar": "Contents", 542 | "toc_cell": false, 543 | "toc_position": {}, 544 | "toc_section_display": true, 545 | "toc_window_display": true 546 | } 547 | }, 548 | "nbformat": 4, 549 | "nbformat_minor": 2 550 | } 551 | -------------------------------------------------------------------------------- /ch01-missing-data-imputation/Recipe-08-Estimating-missing-data-with-K-nearest-neighbours.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Estimating missing data with K-nearest neighbors\n", 8 | "\n", 9 | "In this notebook, we will replace missing data, by the mean value shown by their closest k neighbors." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import matplotlib.pyplot as plt\n", 19 | "import pandas as pd\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.impute import KNNImputer\n", 22 | "from feature_engine.wrappers import SklearnTransformerWrapper" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Load data" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | "
A2A3A8A11A14A15target
030.830.0001.251202.001
158.674.4603.04643.05601
224.50NaNNaN0280.08241
327.831.5403.755100.031
420.175.6251.710120.001
\n", 120 | "
" 121 | ], 122 | "text/plain": [ 123 | " A2 A3 A8 A11 A14 A15 target\n", 124 | "0 30.83 0.000 1.25 1 202.0 0 1\n", 125 | "1 58.67 4.460 3.04 6 43.0 560 1\n", 126 | "2 24.50 NaN NaN 0 280.0 824 1\n", 127 | "3 27.83 1.540 3.75 5 100.0 3 1\n", 128 | "4 20.17 5.625 1.71 0 120.0 0 1" 129 | ] 130 | }, 131 | "execution_count": 2, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "# Load data with numerical variables\n", 138 | "\n", 139 | "variables = [\"A2\", \"A3\", \"A8\", \"A11\", \"A14\", \"A15\", \"target\"]\n", 140 | "\n", 141 | "data = pd.read_csv(\"credit_approval_uci.csv\", usecols=variables)\n", 142 | "\n", 143 | "data.head()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Split data into train and test sets" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 3, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "((483, 6), (207, 6))" 162 | ] 163 | }, 164 | "execution_count": 3, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "# Let's separate into training and testing set\n", 171 | "\n", 172 | "X_train, X_test, y_train, y_test = train_test_split(\n", 173 | " data.drop(\"target\", axis=1),\n", 174 | " data[\"target\"],\n", 175 | " test_size=0.3,\n", 176 | " random_state=0,\n", 177 | ")\n", 178 | "\n", 179 | "X_train.shape, X_test.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "A2 0.022774\n", 191 | "A3 0.140787\n", 192 | "A8 0.140787\n", 193 | "A11 0.000000\n", 194 | "A14 0.014493\n", 195 | "A15 0.000000\n", 196 | "dtype: float64" 197 | ] 198 | }, 199 | "execution_count": 4, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "# Find the fraction of missing data:\n", 206 | "\n", 207 | "X_train.isnull().mean()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 5, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# Set up the imputer to find the closes 5 neighbors\n", 217 | "# utilizing euclidean distance, and weighting the\n", 218 | "# neighbours so that furthest neighbors have smaller\n", 219 | "# influence:\n", 220 | "\n", 221 | "imputer = KNNImputer(\n", 222 | " n_neighbors=5,\n", 223 | " weights=\"distance\",\n", 224 | ")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 6, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "KNNImputer(weights='distance')" 236 | ] 237 | }, 238 | "execution_count": 6, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "# Find the closest neighbors:\n", 245 | "\n", 246 | "imputer.fit(X_train)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 7, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Replace the missing values by the weighted\n", 256 | "# mean of the values shown by the neighbors:\n", 257 | "\n", 258 | "X_train = imputer.transform(X_train)\n", 259 | "X_test = imputer.transform(X_test)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 8, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "array([[4.608e+01, 3.000e+00, 2.375e+00, 8.000e+00, 3.960e+02, 
4.159e+03],\n", 271 | " [1.592e+01, 2.875e+00, 8.500e-02, 0.000e+00, 1.200e+02, 0.000e+00],\n", 272 | " [3.633e+01, 2.125e+00, 8.500e-02, 1.000e+00, 5.000e+01, 1.187e+03],\n", 273 | " ...,\n", 274 | " [1.958e+01, 6.650e-01, 1.665e+00, 0.000e+00, 2.200e+02, 5.000e+00],\n", 275 | " [2.283e+01, 2.290e+00, 2.290e+00, 7.000e+00, 1.400e+02, 2.384e+03],\n", 276 | " [4.058e+01, 3.290e+00, 3.500e+00, 0.000e+00, 4.000e+02, 0.000e+00]])" 277 | ] 278 | }, 279 | "execution_count": 8, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "# The result is a NumPy array:\n", 286 | "X_train" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "0 0\n", 298 | "1 0\n", 299 | "2 0\n", 300 | "3 0\n", 301 | "4 0\n", 302 | "5 0\n", 303 | "dtype: int64" 304 | ] 305 | }, 306 | "execution_count": 9, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "# We can corroborate that there is no missing data:\n", 313 | "\n", 314 | "pd.DataFrame(X_train).isnull().sum()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "## Find neighbors base on specific variables" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 10, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# Let's separate into training and testing set\n", 331 | "\n", 332 | "X_train, X_test, y_train, y_test = train_test_split(\n", 333 | " data.drop(\"target\", axis=1),\n", 334 | " data[\"target\"],\n", 335 | " test_size=0.3,\n", 336 | " random_state=0,\n", 337 | ")" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 11, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# Set up the imputer to find neighbous based on\n", 347 | "# 4 numerical variables:\n", 348 | "\n", 349 | "imputer = SklearnTransformerWrapper(\n", 350 | " transformer=KNNImputer(),\n", 351 | " variables=[\"A2\", \"A3\", \"A8\", \"A11\"],\n", 352 | ")" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 12, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "# Find neighbors and replace missing data\n", 362 | "# by their estimates:\n", 363 | "\n", 364 | "X_train = imputer.fit_transform(X_train)\n", 365 | "X_test = imputer.transform(X_test)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "fenotebook", 379 | "language": "python", 380 | "name": "fenotebook" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.8.2" 393 | }, 394 | "toc": { 395 | "base_numbering": 1, 396 | "nav_menu": {}, 397 | "number_sections": true, 398 | "sideBar": true, 399 | "skip_h1_title": false, 400 | "title_cell": "Table of Contents", 401 | "title_sidebar": "Contents", 402 | "toc_cell": false, 403 | "toc_position": {}, 404 | "toc_section_display": true, 405 | "toc_window_display": true 406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 2 410 | } 411 | 
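
A note on Recipe 8 above: `KNNImputer.transform()` returns NumPy arrays, so the original column names are lost. Below is a minimal sketch, not one of the repository's notebooks, that reuses the variable list and the train/test split from the recipe and shows one way the DataFrame structure could be restored after imputation; `credit_approval_uci.csv` is the file produced by the donwload-prepare-store-credit-approval-dataset.ipynb notebook that follows.

```
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

# Same variable list and split as in Recipe 8:
variables = ["A2", "A3", "A8", "A11", "A14", "A15", "target"]
data = pd.read_csv("credit_approval_uci.csv", usecols=variables)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

# 5 closest neighbours, weighted by distance, as in the recipe:
imputer = KNNImputer(n_neighbors=5, weights="distance")

# KNNImputer returns NumPy arrays; wrap them back into DataFrames so the
# column names and row indexes are preserved for the rest of the pipeline.
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)
```

Alternatively, Feature-engine's `SklearnTransformerWrapper`, used at the end of Recipe 8 to impute only selected variables, returns a DataFrame directly, so no wrapping is needed there.
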
-------------------------------------------------------------------------------- /ch01-missing-data-imputation/donwload-prepare-store-credit-approval-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download, prepare and save the Credit Approval Dataset\n", 8 | "\n", 9 | "\n", 10 | "\n", 11 | "In this notebook, you will find guidelines to download, prepare, and store the Credit Approval Dataset from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml).\n", 12 | "\n", 13 | "\n", 14 | "## Download the data\n", 15 | "\n", 16 | "Follow these guidelines to download the data:\n", 17 | "\n", 18 | "- Visit [the UCI website](http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/)\n", 19 | "- Click on **crx.data** to download the data. \n", 20 | "- Save crx.data in the same folder that contains this notebook.\n", 21 | "\n", 22 | "\n", 23 | "You can find more information about this particular dataset [here](https://archive.ics.uci.edu/ml/datasets/credit+approval)." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import random\n", 33 | "import numpy as np\n", 34 | "import pandas as pd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | "
A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15target
0b30.830.000ugwv1.25tt1fg202.001
1a58.674.460ugqh3.04tt6fg43.05601
2a24.500.500ugqh1.50tf0fg280.08241
3b27.831.540ugwv3.75tt5tg100.031
4b20.175.625ugwv1.71tf0fs120.001
\n", 179 | "
" 180 | ], 181 | "text/plain": [ 182 | " A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 target\n", 183 | "0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 1\n", 184 | "1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 1\n", 185 | "2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 1\n", 186 | "3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 1\n", 187 | "4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 1" 188 | ] 189 | }, 190 | "execution_count": 2, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "# Load data\n", 197 | "data = pd.read_csv(\"crx.data\", header=None)\n", 198 | "\n", 199 | "# Create variable names according to UCI Machine Learning\n", 200 | "# Repository's information:\n", 201 | "varnames = [f\"A{s}\" for s in range(1, 17)]\n", 202 | "\n", 203 | "# Add column names to dataset:\n", 204 | "data.columns = varnames\n", 205 | "\n", 206 | "# Replace ? by np.nan:\n", 207 | "data = data.replace(\"?\", np.nan)\n", 208 | "\n", 209 | "# Cast variables to correct datatypes:\n", 210 | "data[\"A2\"] = data[\"A2\"].astype(\"float\")\n", 211 | "data[\"A14\"] = data[\"A14\"].astype(\"float\")\n", 212 | "\n", 213 | "# Encode target to binary notation:\n", 214 | "data[\"A16\"] = data[\"A16\"].map({\"+\": 1, \"-\": 0})\n", 215 | "\n", 216 | "# Rename target:\n", 217 | "data.rename(columns={\"A16\": \"target\"}, inplace=True)\n", 218 | "\n", 219 | "# Display first 5 rows of data:\n", 220 | "data.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 3, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "A1 12\n", 232 | "A2 12\n", 233 | "A3 92\n", 234 | "A4 6\n", 235 | "A5 6\n", 236 | "A6 9\n", 237 | "A7 9\n", 238 | "A8 92\n", 239 | "A9 92\n", 240 | "A10 92\n", 241 | "A11 0\n", 242 | "A12 0\n", 243 | "A13 0\n", 244 | "A14 13\n", 245 | "A15 0\n", 246 | "target 0\n", 247 | "dtype: int64" 248 | ] 249 | }, 250 | "execution_count": 3, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "# Add missing values at random positions.\n", 257 | "\n", 258 | "# Set seed for reproducibility:\n", 259 | "random.seed(9001)\n", 260 | "\n", 261 | "# Get the reandom position indexes:\n", 262 | "values = list(set([random.randint(0, len(data)) for p in range(0, 100)]))\n", 263 | "\n", 264 | "# Add missing data:\n", 265 | "data.loc[values, [\"A3\", \"A8\", \"A9\", \"A10\"]] = np.nan\n", 266 | "\n", 267 | "# Check proportion of missing data:\n", 268 | "data.isnull().sum()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 4, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# Save dataset\n", 278 | "\n", 279 | "data.to_csv(\"credit_approval_uci.csv\", index=False)" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "fsml", 286 | "language": "python", 287 | "name": "fsml" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.10.5" 300 | }, 301 | "toc": { 302 | "base_numbering": 1, 303 | "nav_menu": {}, 304 | "number_sections": true, 305 | "sideBar": true, 306 | "skip_h1_title": false, 307 | "title_cell": "Table of Contents", 308 | "title_sidebar": "Contents", 309 | "toc_cell": false, 310 | "toc_position": {}, 311 | "toc_section_display": 
true, 312 | "toc_window_display": false 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 2 317 | } 318 | -------------------------------------------------------------------------------- /ch02-categorical-encoding/donwload-prepare-store-credit-approval-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download, prepare and save the Credit Approval Dataset\n", 8 | "\n", 9 | "\n", 10 | "In this notebook, you will find guidelines to download, prepare, and store the Credit Approval Dataset from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml).\n", 11 | "\n", 12 | "\n", 13 | "## Download the data\n", 14 | "\n", 15 | "Follow these guidelines to download the data:\n", 16 | "\n", 17 | "- Visit [the UCI website](http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/)\n", 18 | "- Click on **crx.data** to download the data. \n", 19 | "- Save crx.data in the same folder that contains this notebook.\n", 20 | "\n", 21 | "\n", 22 | "You can find more information about this particular dataset [here](https://archive.ics.uci.edu/ml/datasets/credit+approval)." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import random\n", 32 | "import numpy as np\n", 33 | "import pandas as pd" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15target
0b30.830.000ugwv1.25tt1fg202.001
1a58.674.460ugqh3.04tt6fg43.05601
2a24.500.500ugqh1.50tf0fg280.08241
3b27.831.540ugwv3.75tt5tg100.031
4b20.175.625ugwv1.71tf0fs120.001
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 target\n", 182 | "0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 1\n", 183 | "1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 1\n", 184 | "2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 1\n", 185 | "3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 1\n", 186 | "4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 1" 187 | ] 188 | }, 189 | "execution_count": 2, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "# Load data\n", 196 | "data = pd.read_csv(\"crx.data\", header=None)\n", 197 | "\n", 198 | "# Create variable names according to UCI Machine Learning\n", 199 | "# Repository's information:\n", 200 | "varnames = [f\"A{s}\" for s in range(1, 17)]\n", 201 | "\n", 202 | "# Add column names to dataset:\n", 203 | "data.columns = varnames\n", 204 | "\n", 205 | "# Replace ? by np.nan:\n", 206 | "data = data.replace(\"?\", np.nan)\n", 207 | "\n", 208 | "# Cast variables to correct data types:\n", 209 | "data[\"A2\"] = data[\"A2\"].astype(\"float\")\n", 210 | "data[\"A14\"] = data[\"A14\"].astype(\"float\")\n", 211 | "\n", 212 | "# Encode target to binary notation:\n", 213 | "data[\"A16\"] = data[\"A16\"].map({\"+\": 1, \"-\": 0})\n", 214 | "\n", 215 | "# Rename target:\n", 216 | "data.rename(columns={\"A16\": \"target\"}, inplace=True)\n", 217 | "\n", 218 | "# Display first 5 rows of data:\n", 219 | "data.head()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 3, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# find categorical variables\n", 229 | "cat_cols = [c for c in data.columns if data[c].dtypes == \"O\"]\n", 230 | "\n", 231 | "# find numerical variables\n", 232 | "num_cols = [c for c in data.columns if data[c].dtypes != \"O\"]" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 4, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# fill in missing values\n", 242 | "\n", 243 | "data[num_cols] = data[num_cols].fillna(0)\n", 244 | "data[cat_cols] = data[cat_cols].fillna(\"Missing\")" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 5, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "# Save dataset\n", 254 | "\n", 255 | "data.to_csv(\"credit_approval_uci.csv\", index=False)" 256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": "fenotebook", 262 | "language": "python", 263 | "name": "fenotebook" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "file_extension": ".py", 271 | "mimetype": "text/x-python", 272 | "name": "python", 273 | "nbconvert_exporter": "python", 274 | "pygments_lexer": "ipython3", 275 | "version": "3.8.2" 276 | }, 277 | "toc": { 278 | "base_numbering": 1, 279 | "nav_menu": {}, 280 | "number_sections": true, 281 | "sideBar": true, 282 | "skip_h1_title": false, 283 | "title_cell": "Table of Contents", 284 | "title_sidebar": "Contents", 285 | "toc_cell": false, 286 | "toc_position": {}, 287 | "toc_section_display": true, 288 | "toc_window_display": false 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 2 293 | } 294 | -------------------------------------------------------------------------------- /ch04-discretization/donwload-prepare-store-enron-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download, prepare and save the Bag of Words Data Set\n", 8 | "\n", 9 | "In this notebook, you will find guidelines to download, prepare, and store the Bag of Words Data Set from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml).\n", 10 | "\n", 11 | "\n", 12 | "## Download the data\n", 13 | "\n", 14 | "Follow these guidelines to download the data:\n", 15 | "\n", 16 | "- Visit [the UCI website](https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/)\n", 17 | "- Click on **docword.enron.txt.gz** to download the data.\n", 18 | "- Unzip the data and save it in the same folder that contains this notebook.\n", 19 | "- Then click on **vocab.enron.txt** to download the word names.\n", 20 | "- Save vocab.enron.txt in the same folder that contains this notebook.\n", 21 | "\n", 22 | "You can find more information about this particular dataset [here](https://archive.ics.uci.edu/ml/datasets/Bag+of+Words)." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | "
docIDwordIDcount
011181
112851
2112291
3116881
4120681
\n", 98 | "
" 99 | ], 100 | "text/plain": [ 101 | " docID wordID count\n", 102 | "0 1 118 1\n", 103 | "1 1 285 1\n", 104 | "2 1 1229 1\n", 105 | "3 1 1688 1\n", 106 | "4 1 2068 1" 107 | ] 108 | }, 109 | "execution_count": 2, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "# load the word counts\n", 116 | "\n", 117 | "data = pd.read_csv(\"docword.enron.txt\", sep=\" \", skiprows=3, header=None)\n", 118 | "data.columns = [\"docID\", \"wordID\", \"count\"]\n", 119 | "\n", 120 | "data.head()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 3, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/html": [ 131 | "
\n", 132 | "\n", 145 | "\n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | "
words
0aaa
1aaas
2aactive
3aadvantage
4aaker
\n", 175 | "
" 176 | ], 177 | "text/plain": [ 178 | " words\n", 179 | "0 aaa\n", 180 | "1 aaas\n", 181 | "2 aactive\n", 182 | "3 aadvantage\n", 183 | "4 aaker" 184 | ] 185 | }, 186 | "execution_count": 3, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "# load the words\n", 193 | "\n", 194 | "words = pd.read_csv(\"vocab.enron.txt\", header=None)\n", 195 | "words.columns = [\"words\"]\n", 196 | "\n", 197 | "words.head()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 4, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/html": [ 208 | "
\n", 209 | "\n", 222 | "\n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | "
words
8704eurobond
13618keen
11114halligan
19968pvr
23327soda
20714refundable
390advice
6257decker
8680etis
3370cab
\n", 272 | "
" 273 | ], 274 | "text/plain": [ 275 | " words\n", 276 | "8704 eurobond\n", 277 | "13618 keen\n", 278 | "11114 halligan\n", 279 | "19968 pvr\n", 280 | "23327 soda\n", 281 | "20714 refundable\n", 282 | "390 advice\n", 283 | "6257 decker\n", 284 | "8680 etis\n", 285 | "3370 cab" 286 | ] 287 | }, 288 | "execution_count": 4, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "# select at random 10 words\n", 295 | "\n", 296 | "words = words.sample(10, random_state=290917)\n", 297 | "\n", 298 | "words" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 5, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/html": [ 309 | "
\n", 310 | "\n", 323 | "\n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | "
wordsdocIDwordIDcount
137715eurobond202187042
140167eurobond2050870411
151530eurobond226987042
155066eurobond235287042
156247eurobond237587042
\n", 371 | "
" 372 | ], 373 | "text/plain": [ 374 | " words docID wordID count\n", 375 | "137715 eurobond 2021 8704 2\n", 376 | "140167 eurobond 2050 8704 11\n", 377 | "151530 eurobond 2269 8704 2\n", 378 | "155066 eurobond 2352 8704 2\n", 379 | "156247 eurobond 2375 8704 2" 380 | ] 381 | }, 382 | "execution_count": 5, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "data = words.merge(data, left_index=True, right_on=\"wordID\")\n", 389 | "\n", 390 | "data.head()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 6, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "(1388, 10)" 402 | ] 403 | }, 404 | "execution_count": 6, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "# reconstitute the bag of words dataset\n", 411 | "\n", 412 | "bow = data.pivot(index=\"docID\", columns=\"words\", values=\"count\")\n", 413 | "bow.fillna(0, inplace=True)\n", 414 | "bow.reset_index(inplace=True, drop=True)\n", 415 | "bow.shape" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 7, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/html": [ 426 | "
\n", 427 | "\n", 440 | "\n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | "
wordsadvicecabdeckeretiseurobondhalligankeenpvrrefundablesoda
00.00.02.00.00.00.00.00.00.00.0
10.00.02.00.00.00.00.00.00.00.0
21.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.01.00.0
40.00.02.00.00.00.00.00.00.00.0
\n", 524 | "
" 525 | ], 526 | "text/plain": [ 527 | "words advice cab decker etis eurobond halligan keen pvr refundable \\\n", 528 | "0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 529 | "1 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 530 | "2 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 531 | "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", 532 | "4 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 533 | "\n", 534 | "words soda \n", 535 | "0 0.0 \n", 536 | "1 0.0 \n", 537 | "2 0.0 \n", 538 | "3 0.0 \n", 539 | "4 0.0 " 540 | ] 541 | }, 542 | "execution_count": 7, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "bow.head()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 8, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "bow.to_csv(\"bag_of_words.csv\", index=False)" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [] 566 | } 567 | ], 568 | "metadata": { 569 | "kernelspec": { 570 | "display_name": "fets", 571 | "language": "python", 572 | "name": "fets" 573 | }, 574 | "language_info": { 575 | "codemirror_mode": { 576 | "name": "ipython", 577 | "version": 3 578 | }, 579 | "file_extension": ".py", 580 | "mimetype": "text/x-python", 581 | "name": "python", 582 | "nbconvert_exporter": "python", 583 | "pygments_lexer": "ipython3", 584 | "version": "3.8.2" 585 | }, 586 | "toc": { 587 | "base_numbering": 1, 588 | "nav_menu": {}, 589 | "number_sections": true, 590 | "sideBar": true, 591 | "skip_h1_title": false, 592 | "title_cell": "Table of Contents", 593 | "title_sidebar": "Contents", 594 | "toc_cell": false, 595 | "toc_position": {}, 596 | "toc_section_display": true, 597 | "toc_window_display": false 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 4 602 | } 603 | -------------------------------------------------------------------------------- /ch05-outliers/Recipe-2-Finding-outliers-with-mean-and-std.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding outliers with the mean and standard deviation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "\n", 19 | "# boston house dataset for the demo\n", 20 | "from sklearn.datasets import load_breast_cancer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 195 | "

5 rows × 30 columns

\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n", 200 | "0 17.99 10.38 122.80 1001.0 0.11840 \n", 201 | "1 20.57 17.77 132.90 1326.0 0.08474 \n", 202 | "2 19.69 21.25 130.00 1203.0 0.10960 \n", 203 | "3 11.42 20.38 77.58 386.1 0.14250 \n", 204 | "4 20.29 14.34 135.10 1297.0 0.10030 \n", 205 | "\n", 206 | " mean compactness mean concavity mean concave points mean symmetry \\\n", 207 | "0 0.27760 0.3001 0.14710 0.2419 \n", 208 | "1 0.07864 0.0869 0.07017 0.1812 \n", 209 | "2 0.15990 0.1974 0.12790 0.2069 \n", 210 | "3 0.28390 0.2414 0.10520 0.2597 \n", 211 | "4 0.13280 0.1980 0.10430 0.1809 \n", 212 | "\n", 213 | " mean fractal dimension ... worst radius worst texture worst perimeter \\\n", 214 | "0 0.07871 ... 25.38 17.33 184.60 \n", 215 | "1 0.05667 ... 24.99 23.41 158.80 \n", 216 | "2 0.05999 ... 23.57 25.53 152.50 \n", 217 | "3 0.09744 ... 14.91 26.50 98.87 \n", 218 | "4 0.05883 ... 22.54 16.67 152.20 \n", 219 | "\n", 220 | " worst area worst smoothness worst compactness worst concavity \\\n", 221 | "0 2019.0 0.1622 0.6656 0.7119 \n", 222 | "1 1956.0 0.1238 0.1866 0.2416 \n", 223 | "2 1709.0 0.1444 0.4245 0.4504 \n", 224 | "3 567.7 0.2098 0.8663 0.6869 \n", 225 | "4 1575.0 0.1374 0.2050 0.4000 \n", 226 | "\n", 227 | " worst concave points worst symmetry worst fractal dimension \n", 228 | "0 0.2654 0.4601 0.11890 \n", 229 | "1 0.1860 0.2750 0.08902 \n", 230 | "2 0.2430 0.3613 0.08758 \n", 231 | "3 0.2575 0.6638 0.17300 \n", 232 | "4 0.1625 0.2364 0.07678 \n", 233 | "\n", 234 | "[5 rows x 30 columns]" 235 | ] 236 | }, 237 | "execution_count": 2, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "breast_cancer = load_breast_cancer()\n", 244 | "X = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)\n", 245 | "\n", 246 | "# display top 5 rows\n", 247 | "X.head()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 3, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# mean plus 3 * std\n", 257 | "\n", 258 | "\n", 259 | "def find_limits(df, variable, fold):\n", 260 | " lower_limit = df[variable].mean() - fold * df[variable].std()\n", 261 | " upper_limit = df[variable].mean() + fold * df[variable].std()\n", 262 | " return lower_limit, upper_limit" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 4, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "(0.05416789678205824, 0.13855266560809995)" 274 | ] 275 | }, 276 | "execution_count": 4, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "# we find the limits\n", 283 | "\n", 284 | "lower_limit, upper_limit = find_limits(X, \"mean smoothness\", 3)\n", 285 | "lower_limit, upper_limit" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 5, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "# let's flag the outliers in the data set\n", 295 | "\n", 296 | "outliers = np.where(\n", 297 | " (X[\"mean smoothness\"] > upper_limit) | \n", 298 | " (X[\"mean smoothness\"] < lower_limit),\n", 299 | " True,\n", 300 | " False,\n", 301 | ")" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 6, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "5" 313 | ] 314 | }, 315 | "execution_count": 6, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 
| ], 320 | "source": [ 321 | "# how many outliers did we find?\n", 322 | "\n", 323 | "outliers.sum()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 7, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "(7.238450329479068, 44.11599606770898)" 335 | ] 336 | }, 337 | "execution_count": 7, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "# we find the limits in another variable\n", 344 | "\n", 345 | "lower_limit, upper_limit = find_limits(X, \"worst texture\", 3)\n", 346 | "lower_limit, upper_limit" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 8, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "# let's flag the outliers in the data set\n", 356 | "\n", 357 | "outliers = np.where(\n", 358 | " (X[\"worst texture\"] > upper_limit) |\n", 359 | " (X[\"worst texture\"] < lower_limit),\n", 360 | " True,\n", 361 | " False,\n", 362 | ")" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 9, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "4" 374 | ] 375 | }, 376 | "execution_count": 9, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "# how many outliers did we find?\n", 383 | "\n", 384 | "outliers.sum()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [] 393 | } 394 | ], 395 | "metadata": { 396 | "kernelspec": { 397 | "display_name": "fsml", 398 | "language": "python", 399 | "name": "fsml" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 3 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython3", 411 | "version": "3.10.5" 412 | }, 413 | "toc": { 414 | "base_numbering": 1, 415 | "nav_menu": {}, 416 | "number_sections": true, 417 | "sideBar": true, 418 | "skip_h1_title": false, 419 | "title_cell": "Table of Contents", 420 | "title_sidebar": "Contents", 421 | "toc_cell": false, 422 | "toc_position": {}, 423 | "toc_section_display": true, 424 | "toc_window_display": false 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 2 429 | } 430 | -------------------------------------------------------------------------------- /ch05-outliers/Recipe-3-Finding-outliers-with-the-IQR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding outliers with the IQR proximity rule" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "\n", 19 | "# boston house dataset for the demo\n", 20 | "from sklearn.datasets import fetch_california_housing" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
08.325241.06.9841271.023810322.02.55555637.88-122.23
18.301421.06.2381370.9718802401.02.10984237.86-122.22
27.257452.08.2881361.073446496.02.80226037.85-122.24
35.643152.05.8173521.073059558.02.54794537.85-122.25
43.846252.06.2818531.081081565.02.18146737.85-122.25
\n", 117 | "
" 118 | ], 119 | "text/plain": [ 120 | " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", 121 | "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", 122 | "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", 123 | "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", 124 | "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", 125 | "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", 126 | "\n", 127 | " Longitude \n", 128 | "0 -122.23 \n", 129 | "1 -122.22 \n", 130 | "2 -122.24 \n", 131 | "3 -122.25 \n", 132 | "4 -122.25 " 133 | ] 134 | }, 135 | "execution_count": 2, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# load the California House price data from Scikit-learn\n", 142 | "X, y = fetch_california_housing(return_X_y=True, as_frame=True)\n", 143 | "\n", 144 | "# display top 5 rows\n", 145 | "X.head()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "def find_limits(df, variable, fold):\n", 155 | "\n", 156 | " IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)\n", 157 | "\n", 158 | " lower_limit = df[variable].quantile(0.25) - (IQR * fold)\n", 159 | " upper_limit = df[variable].quantile(0.75) + (IQR * fold)\n", 160 | "\n", 161 | " return lower_limit, upper_limit" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 4, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "(-3.9761500000000005, 11.2828)" 173 | ] 174 | }, 175 | "execution_count": 4, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "# we find the limits\n", 182 | "\n", 183 | "lower_limit, upper_limit = find_limits(X, \"MedInc\", 3)\n", 184 | "lower_limit, upper_limit" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# let's flag the outliers in the data set\n", 194 | "\n", 195 | "outliers = np.where(\n", 196 | " (X[\"MedInc\"] > upper_limit) |\n", 197 | " (X[\"MedInc\"] < lower_limit),\n", 198 | " True,\n", 199 | " False,\n", 200 | ")" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "140" 212 | ] 213 | }, 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "# how many outliers did we find?\n", 221 | "\n", 222 | "outliers.sum()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 7, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "(-39.0, 94.0)" 234 | ] 235 | }, 236 | "execution_count": 7, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "# we find the limits in another variable\n", 243 | "\n", 244 | "lower_limit, upper_limit = find_limits(X, \"HouseAge\", 3)\n", 245 | "lower_limit, upper_limit" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 8, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# let's flag the outliers in the data set\n", 255 | "\n", 256 | "outliers = np.where(\n", 257 | " (X[\"HouseAge\"] > upper_limit) |\n", 258 | " (X[\"HouseAge\"] < lower_limit),\n", 259 | " True,\n", 260 | " False,\n", 261 | ")" 262 | ] 263 | }, 
264 | { 265 | "cell_type": "code", 266 | "execution_count": 9, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "0" 273 | ] 274 | }, 275 | "execution_count": 9, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "# how many outliers did we find?\n", 282 | "\n", 283 | "outliers.sum()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "fsml", 297 | "language": "python", 298 | "name": "fsml" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.10.5" 311 | }, 312 | "toc": { 313 | "base_numbering": 1, 314 | "nav_menu": {}, 315 | "number_sections": true, 316 | "sideBar": true, 317 | "skip_h1_title": false, 318 | "title_cell": "Table of Contents", 319 | "title_sidebar": "Contents", 320 | "toc_cell": false, 321 | "toc_position": {}, 322 | "toc_section_display": true, 323 | "toc_window_display": false 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 2 328 | } 329 | -------------------------------------------------------------------------------- /ch05-outliers/Recipe-4-Removing-outliers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Removing outliers - outlier trimming" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "from sklearn.datasets import fetch_california_housing\n", 19 | "from sklearn.model_selection import train_test_split\n", 20 | "from feature_engine.outliers import OutlierTrimmer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "((14448, 8), (6192, 8))" 32 | ] 33 | }, 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "# load the California House price data from Scikit-learn\n", 41 | "X, y = fetch_california_housing(return_X_y=True, as_frame=True)\n", 42 | "\n", 43 | "# let's separate the data into training and testing sets\n", 44 | "\n", 45 | "X_train, X_test, y_train, y_test = train_test_split(\n", 46 | " X,\n", 47 | " y,\n", 48 | " test_size=0.3,\n", 49 | " random_state=0,\n", 50 | ")\n", 51 | "\n", 52 | "X_train.shape, X_test.shape" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
19891.975052.02.8000000.700000193.04.82500036.73-119.79
2562.260443.03.6714801.184116836.03.01805137.77-122.21
78876.299017.06.4780221.0879121387.03.81044033.87-118.04
45811.719917.02.5180001.1960003051.03.05100034.06-118.28
19932.220650.04.6227541.161677606.03.62874336.73-119.81
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", 153 | "1989 1.9750 52.0 2.800000 0.700000 193.0 4.825000 36.73 \n", 154 | "256 2.2604 43.0 3.671480 1.184116 836.0 3.018051 37.77 \n", 155 | "7887 6.2990 17.0 6.478022 1.087912 1387.0 3.810440 33.87 \n", 156 | "4581 1.7199 17.0 2.518000 1.196000 3051.0 3.051000 34.06 \n", 157 | "1993 2.2206 50.0 4.622754 1.161677 606.0 3.628743 36.73 \n", 158 | "\n", 159 | " Longitude \n", 160 | "1989 -119.79 \n", 161 | "256 -122.21 \n", 162 | "7887 -118.04 \n", 163 | "4581 -118.28 \n", 164 | "1993 -119.81 " 165 | ] 166 | }, 167 | "execution_count": 3, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "X_train.head()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 4, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "def find_limits(df, variable, fold):\n", 183 | "\n", 184 | " IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)\n", 185 | "\n", 186 | " lower_limit = df[variable].quantile(0.25) - (IQR * fold)\n", 187 | " upper_limit = df[variable].quantile(0.75) + (IQR * fold)\n", 188 | "\n", 189 | " return lower_limit, upper_limit" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "(-3.925900000000002, 11.232600000000001)" 201 | ] 202 | }, 203 | "execution_count": 5, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "# we find the limits\n", 210 | "\n", 211 | "lower_limit, upper_limit = find_limits(X_train, \"MedInc\", 3)\n", 212 | "lower_limit, upper_limit" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Remove outliers on the right\n", 222 | "\n", 223 | "inliers = X_train[\"MedInc\"].ge(lower_limit)\n", 224 | "X_train = X_train.loc[inliers]\n", 225 | "\n", 226 | "inliers = X_test[\"MedInc\"].ge(lower_limit)\n", 227 | "X_test = X_test.loc[inliers]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# Remove outliers on the left\n", 237 | "\n", 238 | "inliers = X_train[\"MedInc\"].le(upper_limit)\n", 239 | "X_train = X_train.loc[inliers]\n", 240 | "\n", 241 | "inliers = X_test[\"MedInc\"].le(upper_limit)\n", 242 | "X_test = X_test.loc[inliers]" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## Feature-engine" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 8, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "((14448, 8), (6192, 8))" 261 | ] 262 | }, 263 | "execution_count": 8, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "# let's separate the data into training and testing sets\n", 270 | "\n", 271 | "X_train, X_test, y_train, y_test = train_test_split(\n", 272 | " X,\n", 273 | " y,\n", 274 | " test_size=0.3,\n", 275 | " random_state=0,\n", 276 | ")\n", 277 | "\n", 278 | "X_train.shape, X_test.shape" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 9, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
OutlierTrimmer(capping_method='iqr', fold=1.5, tail='both',\n",
290 |        "               variables=['MedInc', 'HouseAge', 'Population'])
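The fitted transformer's text representation follows below. As an editorial aside, and assuming the `trimmer` fitted on X_train in the cells below, the rows it removes can be listed by comparing indexes before and after the transformation:

```python
# assumes `trimmer` is the OutlierTrimmer fitted on X_train below
X_train_trimmed = trimmer.transform(X_train)

# indexes of the training rows the trimmer dropped
dropped = X_train.index.difference(X_train_trimmed.index)
print(len(dropped), "rows removed from the train set")
```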
" 292 | ], 293 | "text/plain": [ 294 | "OutlierTrimmer(capping_method='iqr', fold=1.5, tail='both',\n", 295 | " variables=['MedInc', 'HouseAge', 'Population'])" 296 | ] 297 | }, 298 | "execution_count": 9, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "trimmer = OutlierTrimmer(\n", 305 | " variables=[\"MedInc\", \"HouseAge\", \"Population\"],\n", 306 | " capping_method=\"iqr\",\n", 307 | " tail=\"both\",\n", 308 | " fold=1.5,\n", 309 | ")\n", 310 | "\n", 311 | "trimmer.fit(X_train)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 10, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "{'MedInc': -0.6776500000000012, 'HouseAge': -10.5, 'Population': -626.0}" 323 | ] 324 | }, 325 | "execution_count": 10, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "trimmer.left_tail_caps_" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 11, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "{'MedInc': 7.984350000000001, 'HouseAge': 65.5, 'Population': 3134.0}" 343 | ] 344 | }, 345 | "execution_count": 11, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | } 349 | ], 350 | "source": [ 351 | "trimmer.right_tail_caps_" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 12, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "(14448, 8) (6192, 8)\n", 364 | "(13165, 8) (5619, 8)\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "print(X_train.shape, X_test.shape)\n", 370 | "\n", 371 | "X_train = trimmer.transform(X_train)\n", 372 | "X_test = trimmer.transform(X_test)\n", 373 | "\n", 374 | "print(X_train.shape, X_test.shape)" 375 | ] 376 | } 377 | ], 378 | "metadata": { 379 | "kernelspec": { 380 | "display_name": "fsml", 381 | "language": "python", 382 | "name": "fsml" 383 | }, 384 | "language_info": { 385 | "codemirror_mode": { 386 | "name": "ipython", 387 | "version": 3 388 | }, 389 | "file_extension": ".py", 390 | "mimetype": "text/x-python", 391 | "name": "python", 392 | "nbconvert_exporter": "python", 393 | "pygments_lexer": "ipython3", 394 | "version": "3.10.5" 395 | }, 396 | "toc": { 397 | "base_numbering": 1, 398 | "nav_menu": {}, 399 | "number_sections": true, 400 | "sideBar": true, 401 | "skip_h1_title": false, 402 | "title_cell": "Table of Contents", 403 | "title_sidebar": "Contents", 404 | "toc_cell": false, 405 | "toc_position": {}, 406 | "toc_section_display": "block", 407 | "toc_window_display": true 408 | } 409 | }, 410 | "nbformat": 4, 411 | "nbformat_minor": 1 412 | } 413 | -------------------------------------------------------------------------------- /ch06-datetime/Recipe-2-Extracting-features-from-time-with-pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Extracting features from time with pandas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/html": [ 28 | "
\n", 29 | "\n", 42 | "\n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | "
date
02019-03-05 00:00:00
12019-03-05 01:15:10
22019-03-05 02:30:20
32019-03-05 03:45:30
42019-03-05 05:00:40
\n", 72 | "
" 73 | ], 74 | "text/plain": [ 75 | " date\n", 76 | "0 2019-03-05 00:00:00\n", 77 | "1 2019-03-05 01:15:10\n", 78 | "2 2019-03-05 02:30:20\n", 79 | "3 2019-03-05 03:45:30\n", 80 | "4 2019-03-05 05:00:40" 81 | ] 82 | }, 83 | "execution_count": 2, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "# let's create a toy dataframe with some date variables\n", 90 | "\n", 91 | "rng_ = pd.date_range(\"2019-03-05\", periods=20, freq=\"1h15min10s\")\n", 92 | "df = pd.DataFrame({\"date\": rng_})\n", 93 | "df.head()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 3, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/html": [ 104 | "
\n", 105 | "\n", 118 | "\n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "
datehourminsec
02019-03-05 00:00:00000
12019-03-05 01:15:1011510
22019-03-05 02:30:2023020
32019-03-05 03:45:3034530
42019-03-05 05:00:405040
\n", 166 | "
" 167 | ], 168 | "text/plain": [ 169 | " date hour min sec\n", 170 | "0 2019-03-05 00:00:00 0 0 0\n", 171 | "1 2019-03-05 01:15:10 1 15 10\n", 172 | "2 2019-03-05 02:30:20 2 30 20\n", 173 | "3 2019-03-05 03:45:30 3 45 30\n", 174 | "4 2019-03-05 05:00:40 5 0 40" 175 | ] 176 | }, 177 | "execution_count": 3, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "# extract hr, min and sec\n", 184 | "\n", 185 | "df[\"hour\"] = df[\"date\"].dt.hour\n", 186 | "df[\"min\"] = df[\"date\"].dt.minute\n", 187 | "df[\"sec\"] = df[\"date\"].dt.second\n", 188 | "\n", 189 | "df.head()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 4, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/html": [ 200 | "
\n", 201 | "\n", 214 | "\n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
datehourminsechms
02019-03-05 00:00:00000000
12019-03-05 01:15:101151011510
22019-03-05 02:30:202302023020
32019-03-05 03:45:303453034530
42019-03-05 05:00:4050405040
\n", 280 | "
" 281 | ], 282 | "text/plain": [ 283 | " date hour min sec h m s\n", 284 | "0 2019-03-05 00:00:00 0 0 0 0 0 0\n", 285 | "1 2019-03-05 01:15:10 1 15 10 1 15 10\n", 286 | "2 2019-03-05 02:30:20 2 30 20 2 30 20\n", 287 | "3 2019-03-05 03:45:30 3 45 30 3 45 30\n", 288 | "4 2019-03-05 05:00:40 5 0 40 5 0 40" 289 | ] 290 | }, 291 | "execution_count": 4, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "# the same in one line\n", 298 | "\n", 299 | "df[[\"h\", \"m\", \"s\"]] = pd.DataFrame([(x.hour, x.minute, x.second) for x in df[\"date\"]])\n", 300 | "\n", 301 | "df.head()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 5, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "array([ 0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18, 20,\n", 313 | " 21, 22, 23], dtype=int64)" 314 | ] 315 | }, 316 | "execution_count": 5, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "df[\"hour\"].unique()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 6, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/html": [ 333 | "
\n", 334 | "\n", 347 | "\n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | "
datehourminsechmsis_morning
02019-03-05 00:00:000000000
12019-03-05 01:15:1011510115100
22019-03-05 02:30:2023020230200
32019-03-05 03:45:3034530345300
42019-03-05 05:00:40504050400
\n", 419 | "
" 420 | ], 421 | "text/plain": [ 422 | " date hour min sec h m s is_morning\n", 423 | "0 2019-03-05 00:00:00 0 0 0 0 0 0 0\n", 424 | "1 2019-03-05 01:15:10 1 15 10 1 15 10 0\n", 425 | "2 2019-03-05 02:30:20 2 30 20 2 30 20 0\n", 426 | "3 2019-03-05 03:45:30 3 45 30 3 45 30 0\n", 427 | "4 2019-03-05 05:00:40 5 0 40 5 0 40 0" 428 | ] 429 | }, 430 | "execution_count": 6, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "# is it morning?\n", 437 | "\n", 438 | "df[\"is_morning\"] = np.where((df[\"hour\"] < 12) & (df[\"hour\"] > 6), 1, 0)\n", 439 | "\n", 440 | "df.head()" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [] 449 | } 450 | ], 451 | "metadata": { 452 | "kernelspec": { 453 | "display_name": "fsml", 454 | "language": "python", 455 | "name": "fsml" 456 | }, 457 | "language_info": { 458 | "codemirror_mode": { 459 | "name": "ipython", 460 | "version": 3 461 | }, 462 | "file_extension": ".py", 463 | "mimetype": "text/x-python", 464 | "name": "python", 465 | "nbconvert_exporter": "python", 466 | "pygments_lexer": "ipython3", 467 | "version": "3.10.5" 468 | }, 469 | "toc": { 470 | "base_numbering": 1, 471 | "nav_menu": {}, 472 | "number_sections": true, 473 | "sideBar": true, 474 | "skip_h1_title": false, 475 | "title_cell": "Table of Contents", 476 | "title_sidebar": "Contents", 477 | "toc_cell": false, 478 | "toc_position": {}, 479 | "toc_section_display": "block", 480 | "toc_window_display": false 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 2 485 | } 486 | -------------------------------------------------------------------------------- /ch06-datetime/Recipe-3-Capturing-elapsed-time-between-2-variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Capture elapsed time" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import datetime\n", 17 | "import numpy as np\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | "
date1date2
02019-03-05 00:00:002019-03-31
12019-03-05 01:00:002019-04-30
22019-03-05 02:00:002019-05-31
32019-03-05 03:00:002019-06-30
42019-03-05 04:00:002019-07-31
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " date1 date2\n", 83 | "0 2019-03-05 00:00:00 2019-03-31\n", 84 | "1 2019-03-05 01:00:00 2019-04-30\n", 85 | "2 2019-03-05 02:00:00 2019-05-31\n", 86 | "3 2019-03-05 03:00:00 2019-06-30\n", 87 | "4 2019-03-05 04:00:00 2019-07-31" 88 | ] 89 | }, 90 | "execution_count": 2, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "# let's create a toy dataframe with some date variables\n", 97 | "\n", 98 | "rng_hr = pd.date_range(\"2019-03-05\", periods=20, freq=\"H\")\n", 99 | "rng_month = pd.date_range(\"2019-03-05\", periods=20, freq=\"M\")\n", 100 | "\n", 101 | "df = pd.DataFrame({\"date1\": rng_hr, \"date2\": rng_month})\n", 102 | "df.head()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/html": [ 113 | "
\n", 114 | "\n", 127 | "\n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | "
date1date2elapsed_days
02019-03-05 00:00:002019-03-3126
12019-03-05 01:00:002019-04-3055
22019-03-05 02:00:002019-05-3186
32019-03-05 03:00:002019-06-30116
42019-03-05 04:00:002019-07-31147
\n", 169 | "
" 170 | ], 171 | "text/plain": [ 172 | " date1 date2 elapsed_days\n", 173 | "0 2019-03-05 00:00:00 2019-03-31 26\n", 174 | "1 2019-03-05 01:00:00 2019-04-30 55\n", 175 | "2 2019-03-05 02:00:00 2019-05-31 86\n", 176 | "3 2019-03-05 03:00:00 2019-06-30 116\n", 177 | "4 2019-03-05 04:00:00 2019-07-31 147" 178 | ] 179 | }, 180 | "execution_count": 3, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "# let's capture the difference in days between the 2 variables\n", 187 | "\n", 188 | "df[\"elapsed_days\"] = (df[\"date2\"] - df[\"date1\"]).dt.days\n", 189 | "\n", 190 | "df.head()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 4, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/html": [ 201 | "
\n", 202 | "\n", 215 | "\n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | "
date1date2elapsed_daysmonths_passed
02019-03-05 00:00:002019-03-31261.0
12019-03-05 01:00:002019-04-30552.0
22019-03-05 02:00:002019-05-31863.0
32019-03-05 03:00:002019-06-301164.0
42019-03-05 04:00:002019-07-311475.0
\n", 263 | "
" 264 | ], 265 | "text/plain": [ 266 | " date1 date2 elapsed_days months_passed\n", 267 | "0 2019-03-05 00:00:00 2019-03-31 26 1.0\n", 268 | "1 2019-03-05 01:00:00 2019-04-30 55 2.0\n", 269 | "2 2019-03-05 02:00:00 2019-05-31 86 3.0\n", 270 | "3 2019-03-05 03:00:00 2019-06-30 116 4.0\n", 271 | "4 2019-03-05 04:00:00 2019-07-31 147 5.0" 272 | ] 273 | }, 274 | "execution_count": 4, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "# let's capture the difference in months between the 2 variables\n", 281 | "\n", 282 | "df[\"months_passed\"] = (df[\"date2\"] - df[\"date1\"]) / np.timedelta64(1, \"M\")\n", 283 | "df[\"months_passed\"] = np.round(df[\"months_passed\"], 0)\n", 284 | "\n", 285 | "df.head()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 5, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/html": [ 296 | "
\n", 297 | "\n", 310 | "\n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
date1date2elapsed_daysmonths_passeddiff_secondsdiff_minutes
02019-03-05 00:00:002019-03-31261.02246400.037440.0
12019-03-05 01:00:002019-04-30552.04834800.080580.0
22019-03-05 02:00:002019-05-31863.07509600.0125160.0
32019-03-05 03:00:002019-06-301164.010098000.0168300.0
42019-03-05 04:00:002019-07-311475.012772800.0212880.0
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " date1 date2 elapsed_days months_passed diff_seconds \\\n", 374 | "0 2019-03-05 00:00:00 2019-03-31 26 1.0 2246400.0 \n", 375 | "1 2019-03-05 01:00:00 2019-04-30 55 2.0 4834800.0 \n", 376 | "2 2019-03-05 02:00:00 2019-05-31 86 3.0 7509600.0 \n", 377 | "3 2019-03-05 03:00:00 2019-06-30 116 4.0 10098000.0 \n", 378 | "4 2019-03-05 04:00:00 2019-07-31 147 5.0 12772800.0 \n", 379 | "\n", 380 | " diff_minutes \n", 381 | "0 37440.0 \n", 382 | "1 80580.0 \n", 383 | "2 125160.0 \n", 384 | "3 168300.0 \n", 385 | "4 212880.0 " 386 | ] 387 | }, 388 | "execution_count": 5, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "# calculate difference in seconds and minutes\n", 395 | "\n", 396 | "df[\"diff_seconds\"] = (df[\"date2\"] - df[\"date1\"]) / np.timedelta64(1, \"s\")\n", 397 | "df[\"diff_minutes\"] = (df[\"date2\"] - df[\"date1\"]) / np.timedelta64(1, \"m\")\n", 398 | "\n", 399 | "df.head()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 6, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/html": [ 410 | "
\n", 411 | "\n", 424 | "\n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | "
date1date2elapsed_daysmonths_passeddiff_secondsdiff_minutesto_today
02019-03-05 00:00:002019-03-31261.02246400.037440.01265 days 11:41:24.651022
12019-03-05 01:00:002019-04-30552.04834800.080580.01265 days 10:41:24.651022
22019-03-05 02:00:002019-05-31863.07509600.0125160.01265 days 09:41:24.651022
32019-03-05 03:00:002019-06-301164.010098000.0168300.01265 days 08:41:24.651022
42019-03-05 04:00:002019-07-311475.012772800.0212880.01265 days 07:41:24.651022
\n", 490 | "
" 491 | ], 492 | "text/plain": [ 493 | " date1 date2 elapsed_days months_passed diff_seconds \\\n", 494 | "0 2019-03-05 00:00:00 2019-03-31 26 1.0 2246400.0 \n", 495 | "1 2019-03-05 01:00:00 2019-04-30 55 2.0 4834800.0 \n", 496 | "2 2019-03-05 02:00:00 2019-05-31 86 3.0 7509600.0 \n", 497 | "3 2019-03-05 03:00:00 2019-06-30 116 4.0 10098000.0 \n", 498 | "4 2019-03-05 04:00:00 2019-07-31 147 5.0 12772800.0 \n", 499 | "\n", 500 | " diff_minutes to_today \n", 501 | "0 37440.0 1265 days 11:41:24.651022 \n", 502 | "1 80580.0 1265 days 10:41:24.651022 \n", 503 | "2 125160.0 1265 days 09:41:24.651022 \n", 504 | "3 168300.0 1265 days 08:41:24.651022 \n", 505 | "4 212880.0 1265 days 07:41:24.651022 " 506 | ] 507 | }, 508 | "execution_count": 6, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "# calculate difference to today\n", 515 | "\n", 516 | "df[\"to_today\"] = datetime.datetime.today() - df[\"date1\"]\n", 517 | "\n", 518 | "df.head()" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [] 527 | } 528 | ], 529 | "metadata": { 530 | "kernelspec": { 531 | "display_name": "fsml", 532 | "language": "python", 533 | "name": "fsml" 534 | }, 535 | "language_info": { 536 | "codemirror_mode": { 537 | "name": "ipython", 538 | "version": 3 539 | }, 540 | "file_extension": ".py", 541 | "mimetype": "text/x-python", 542 | "name": "python", 543 | "nbconvert_exporter": "python", 544 | "pygments_lexer": "ipython3", 545 | "version": "3.10.5" 546 | }, 547 | "toc": { 548 | "base_numbering": 1, 549 | "nav_menu": {}, 550 | "number_sections": true, 551 | "sideBar": true, 552 | "skip_h1_title": false, 553 | "title_cell": "Table of Contents", 554 | "title_sidebar": "Contents", 555 | "toc_cell": false, 556 | "toc_position": {}, 557 | "toc_section_display": "block", 558 | "toc_window_display": false 559 | } 560 | }, 561 | "nbformat": 4, 562 | "nbformat_minor": 2 563 | } 564 | -------------------------------------------------------------------------------- /ch06-datetime/Recipe-4-Working-with-different-time-zones.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Working with different time zones" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
\n", 28 | "\n", 41 | "\n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | "
time1
02015-06-10 09:00:00+02:00
12015-06-10 10:00:00+02:00
22015-06-10 11:00:00+02:00
02015-09-10 09:00:00-05:00
12015-09-10 10:00:00-05:00
22015-09-10 11:00:00-05:00
\n", 75 | "
" 76 | ], 77 | "text/plain": [ 78 | " time1\n", 79 | "0 2015-06-10 09:00:00+02:00\n", 80 | "1 2015-06-10 10:00:00+02:00\n", 81 | "2 2015-06-10 11:00:00+02:00\n", 82 | "0 2015-09-10 09:00:00-05:00\n", 83 | "1 2015-09-10 10:00:00-05:00\n", 84 | "2 2015-09-10 11:00:00-05:00" 85 | ] 86 | }, 87 | "execution_count": 2, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "# first, let's create a toy dataframe with some timestamps in different time zones\n", 94 | "# variable 1\n", 95 | "\n", 96 | "df = pd.DataFrame()\n", 97 | "\n", 98 | "df[\"time1\"] = pd.concat(\n", 99 | " [\n", 100 | " pd.Series(\n", 101 | " pd.date_range(\n", 102 | " start=\"2015-06-10 09:00\", freq=\"H\", periods=3, tz=\"Europe/Berlin\"\n", 103 | " )\n", 104 | " ),\n", 105 | " pd.Series(\n", 106 | " pd.date_range(\n", 107 | " start=\"2015-09-10 09:00\", freq=\"H\", periods=3, tz=\"US/Central\"\n", 108 | " )\n", 109 | " ),\n", 110 | " ],\n", 111 | " axis=0,\n", 112 | ")\n", 113 | "\n", 114 | "df" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 3, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 139 | "\n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
time1time2
02015-06-10 09:00:00+02:002015-07-01 09:00:00+02:00
12015-06-10 10:00:00+02:002015-07-01 10:00:00+02:00
22015-06-10 11:00:00+02:002015-07-01 11:00:00+02:00
02015-09-10 09:00:00-05:002015-08-01 09:00:00-05:00
12015-09-10 10:00:00-05:002015-08-01 10:00:00-05:00
22015-09-10 11:00:00-05:002015-08-01 11:00:00-05:00
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " time1 time2\n", 184 | "0 2015-06-10 09:00:00+02:00 2015-07-01 09:00:00+02:00\n", 185 | "1 2015-06-10 10:00:00+02:00 2015-07-01 10:00:00+02:00\n", 186 | "2 2015-06-10 11:00:00+02:00 2015-07-01 11:00:00+02:00\n", 187 | "0 2015-09-10 09:00:00-05:00 2015-08-01 09:00:00-05:00\n", 188 | "1 2015-09-10 10:00:00-05:00 2015-08-01 10:00:00-05:00\n", 189 | "2 2015-09-10 11:00:00-05:00 2015-08-01 11:00:00-05:00" 190 | ] 191 | }, 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "# first, let's create a toy dataframe with some timestamps in different time zones\n", 199 | "# variable 2\n", 200 | "\n", 201 | "df[\"time2\"] = pd.concat(\n", 202 | " [\n", 203 | " pd.Series(\n", 204 | " pd.date_range(\n", 205 | " start=\"2015-07-01 09:00\", freq=\"H\", periods=3, tz=\"Europe/Berlin\"\n", 206 | " )\n", 207 | " ),\n", 208 | " pd.Series(\n", 209 | " pd.date_range(\n", 210 | " start=\"2015-08-01 09:00\", freq=\"H\", periods=3, tz=\"US/Central\"\n", 211 | " )\n", 212 | " ),\n", 213 | " ],\n", 214 | " axis=0,\n", 215 | ")\n", 216 | "\n", 217 | "df" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 4, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/html": [ 228 | "
\n", 229 | "\n", 242 | "\n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | "
time1time2time1_utctime2_utc
02015-06-10 09:00:00+02:002015-07-01 09:00:00+02:002015-06-10 07:00:00+00:002015-07-01 07:00:00+00:00
12015-06-10 10:00:00+02:002015-07-01 10:00:00+02:002015-06-10 08:00:00+00:002015-07-01 08:00:00+00:00
22015-06-10 11:00:00+02:002015-07-01 11:00:00+02:002015-06-10 09:00:00+00:002015-07-01 09:00:00+00:00
02015-09-10 09:00:00-05:002015-08-01 09:00:00-05:002015-09-10 14:00:00+00:002015-08-01 14:00:00+00:00
12015-09-10 10:00:00-05:002015-08-01 10:00:00-05:002015-09-10 15:00:00+00:002015-08-01 15:00:00+00:00
22015-09-10 11:00:00-05:002015-08-01 11:00:00-05:002015-09-10 16:00:00+00:002015-08-01 16:00:00+00:00
\n", 297 | "
" 298 | ], 299 | "text/plain": [ 300 | " time1 time2 \\\n", 301 | "0 2015-06-10 09:00:00+02:00 2015-07-01 09:00:00+02:00 \n", 302 | "1 2015-06-10 10:00:00+02:00 2015-07-01 10:00:00+02:00 \n", 303 | "2 2015-06-10 11:00:00+02:00 2015-07-01 11:00:00+02:00 \n", 304 | "0 2015-09-10 09:00:00-05:00 2015-08-01 09:00:00-05:00 \n", 305 | "1 2015-09-10 10:00:00-05:00 2015-08-01 10:00:00-05:00 \n", 306 | "2 2015-09-10 11:00:00-05:00 2015-08-01 11:00:00-05:00 \n", 307 | "\n", 308 | " time1_utc time2_utc \n", 309 | "0 2015-06-10 07:00:00+00:00 2015-07-01 07:00:00+00:00 \n", 310 | "1 2015-06-10 08:00:00+00:00 2015-07-01 08:00:00+00:00 \n", 311 | "2 2015-06-10 09:00:00+00:00 2015-07-01 09:00:00+00:00 \n", 312 | "0 2015-09-10 14:00:00+00:00 2015-08-01 14:00:00+00:00 \n", 313 | "1 2015-09-10 15:00:00+00:00 2015-08-01 15:00:00+00:00 \n", 314 | "2 2015-09-10 16:00:00+00:00 2015-08-01 16:00:00+00:00 " 315 | ] 316 | }, 317 | "execution_count": 4, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "# to work with different time zones, first we unify the timezone to the central one\n", 324 | "# setting utc = True\n", 325 | "\n", 326 | "df[\"time1_utc\"] = pd.to_datetime(df[\"time1\"], utc=True)\n", 327 | "df[\"time2_utc\"] = pd.to_datetime(df[\"time2\"], utc=True)\n", 328 | "\n", 329 | "df" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 5, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "0 21\n", 341 | "1 21\n", 342 | "2 21\n", 343 | "0 -40\n", 344 | "1 -40\n", 345 | "Name: elapsed_days, dtype: int64" 346 | ] 347 | }, 348 | "execution_count": 5, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "# let's explore the variable type\n", 355 | "\n", 356 | "df[\"elapsed_days\"] = (df[\"time2_utc\"] - df[\"time1_utc\"]).dt.days\n", 357 | "\n", 358 | "df[\"elapsed_days\"].head()" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 6, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/html": [ 369 | "
\n", 370 | "\n", 383 | "\n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | "
time1_londontime2_berlin
02015-06-10 08:00:00+01:002015-06-10 09:00:00+02:00
12015-06-10 09:00:00+01:002015-06-10 10:00:00+02:00
22015-06-10 10:00:00+01:002015-06-10 11:00:00+02:00
02015-09-10 15:00:00+01:002015-09-10 16:00:00+02:00
12015-09-10 16:00:00+01:002015-09-10 17:00:00+02:00
22015-09-10 17:00:00+01:002015-09-10 18:00:00+02:00
\n", 424 | "
" 425 | ], 426 | "text/plain": [ 427 | " time1_london time2_berlin\n", 428 | "0 2015-06-10 08:00:00+01:00 2015-06-10 09:00:00+02:00\n", 429 | "1 2015-06-10 09:00:00+01:00 2015-06-10 10:00:00+02:00\n", 430 | "2 2015-06-10 10:00:00+01:00 2015-06-10 11:00:00+02:00\n", 431 | "0 2015-09-10 15:00:00+01:00 2015-09-10 16:00:00+02:00\n", 432 | "1 2015-09-10 16:00:00+01:00 2015-09-10 17:00:00+02:00\n", 433 | "2 2015-09-10 17:00:00+01:00 2015-09-10 18:00:00+02:00" 434 | ] 435 | }, 436 | "execution_count": 6, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "# next we change all timestamps to the desired timezone, eg Europe/London\n", 443 | "# in this example\n", 444 | "\n", 445 | "df[\"time1_london\"] = df[\"time1_utc\"].dt.tz_convert(\"Europe/London\")\n", 446 | "df[\"time2_berlin\"] = df[\"time1_utc\"].dt.tz_convert(\"Europe/Berlin\")\n", 447 | "\n", 448 | "df[[\"time1_london\", \"time2_berlin\"]]" 449 | ] 450 | } 451 | ], 452 | "metadata": { 453 | "kernelspec": { 454 | "display_name": "fsml", 455 | "language": "python", 456 | "name": "fsml" 457 | }, 458 | "language_info": { 459 | "codemirror_mode": { 460 | "name": "ipython", 461 | "version": 3 462 | }, 463 | "file_extension": ".py", 464 | "mimetype": "text/x-python", 465 | "name": "python", 466 | "nbconvert_exporter": "python", 467 | "pygments_lexer": "ipython3", 468 | "version": "3.10.5" 469 | }, 470 | "toc": { 471 | "base_numbering": 1, 472 | "nav_menu": {}, 473 | "number_sections": true, 474 | "sideBar": true, 475 | "skip_h1_title": false, 476 | "title_cell": "Table of Contents", 477 | "title_sidebar": "Contents", 478 | "toc_cell": false, 479 | "toc_position": {}, 480 | "toc_section_display": "block", 481 | "toc_window_display": false 482 | } 483 | }, 484 | "nbformat": 4, 485 | "nbformat_minor": 2 486 | } 487 | -------------------------------------------------------------------------------- /ch07-scaling/Recipe-6-scaling-to-unit-length.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Scaling to vector unit length / unit norm\n", 8 | "\n", 9 | "Scaling to unit norm is achieved by dividing each feature vector by either the Manhattan distance (l1 norm) or the Euclidean distance of the vector (l2 norm):\n", 10 | "\n", 11 | "X_scaled_l1 = X / l1(X)\n", 12 | "\n", 13 | "X_scaled_l2 = X / l2(X)\n", 14 | "\n", 15 | "\n", 16 | "The **Manhattan distance** is given by the sum of the absolute components of the vector:\n", 17 | "\n", 18 | "l1(X) = |x1| + |x2| + ... + |xn|\n", 19 | "\n", 20 | "\n", 21 | "Whereas the **Euclidean distance** is given by the square root of the square sum of the component of the vector:\n", 22 | "\n", 23 | "l2(X) = sqr( x1^2 + x2^2 + ... + xn^2 )\n", 24 | "\n", 25 | "\n", 26 | "In the above example, x1 is variable 1, x2 variable 2, and xn variable n, and X is the data for 1 observation across variables (a row in other words).\n", 27 | "\n", 28 | "\n", 29 | "### Scaling to unit norm, examples\n", 30 | "\n", 31 | "For example, if our data has 1 observations (1 row) and 3 variables:\n", 32 | "\n", 33 | "- number of pets\n", 34 | "- number of children\n", 35 | "- age\n", 36 | "\n", 37 | "The values for each variable for that single observation are 10, 15 and 20. Our vector X = [10, 15, 20]. 
Then:\n", 38 | "\n", 39 | "l1(X) = 10 + 15 + 20 = 45\n", 40 | "\n", 41 | "l2(X) = sqrt( 10^2 + 15^2 + 20^2) = sqrt( 100 + 225 + 400) = **26.9**\n", 42 | "\n", 43 | "The Euclidean distance is always smaller than (or at most equal to) the Manhattan distance.\n", 44 | "\n", 45 | "\n", 46 | "The normalized vector values are therefore:\n", 47 | "\n", 48 | "X_scaled_l1 = [ 10/45, 15/45, 20/45 ] = [0.22, 0.33, 0.44]\n", 49 | "\n", 50 | "X_scaled_l2 = [10/26.9, 15/26.9, 20/26.9 ] = [0.37, 0.56, 0.74]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import matplotlib.pyplot as plt\n", 60 | "import numpy as np\n", 61 | "import pandas as pd\n", 62 | "from sklearn.datasets import fetch_california_housing\n", 63 | "from sklearn.model_selection import train_test_split\n", 64 | "\n", 65 | "# the scaler - for scaling to unit norm\n", 66 | "from sklearn.preprocessing import Normalizer" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 2, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/html": [ 77 | "
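As a quick sanity check of the arithmetic above, the following is a minimal NumPy / scikit-learn sketch (variable names here are illustrative only) that reproduces the l1 and l2 norms of X = [10, 15, 20] and the resulting unit-norm vectors:

import numpy as np
from sklearn.preprocessing import Normalizer

# one observation (row) with 3 variables: pets, children, age
x = np.array([10.0, 15.0, 20.0])

l1 = np.abs(x).sum()          # Manhattan norm: 10 + 15 + 20 = 45
l2 = np.sqrt((x ** 2).sum())  # Euclidean norm: sqrt(725) ≈ 26.93

x_scaled_l1 = x / l1          # ≈ [0.22, 0.33, 0.44]
x_scaled_l2 = x / l2          # ≈ [0.37, 0.56, 0.74]

# scikit-learn's Normalizer performs the same row-wise division
print(Normalizer(norm="l1").fit_transform(x.reshape(1, -1)))
print(Normalizer(norm="l2").fit_transform(x.reshape(1, -1)))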
\n", 78 | "\n", 91 | "\n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccup
08.325241.06.9841271.023810322.02.555556
18.301421.06.2381370.9718802401.02.109842
27.257452.08.2881361.073446496.02.802260
35.643152.05.8173521.073059558.02.547945
43.846252.06.2818531.081081565.02.181467
\n", 151 | "
" 152 | ], 153 | "text/plain": [ 154 | " MedInc HouseAge AveRooms AveBedrms Population AveOccup\n", 155 | "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556\n", 156 | "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842\n", 157 | "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260\n", 158 | "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945\n", 159 | "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467" 160 | ] 161 | }, 162 | "execution_count": 2, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# load the California House price data from Scikit-learn\n", 169 | "X, y = fetch_california_housing(return_X_y=True, as_frame=True)\n", 170 | "\n", 171 | "# Remove 2 variables:\n", 172 | "X.drop(labels=[\"Latitude\", \"Longitude\"], axis=1, inplace=True)\n", 173 | "\n", 174 | "# display top 5 rows\n", 175 | "X.head()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 3, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "((14448, 6), (6192, 6))" 187 | ] 188 | }, 189 | "execution_count": 3, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "# let's separate the data into training and testing sets\n", 196 | "\n", 197 | "X_train, X_test, y_train, y_test = train_test_split(\n", 198 | " X,\n", 199 | " y,\n", 200 | " test_size=0.3,\n", 201 | " random_state=0,\n", 202 | ")\n", 203 | "\n", 204 | "X_train.shape, X_test.shape" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "### Scaling to l1" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 4, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# set up the scaler\n", 221 | "scaler = Normalizer(norm=\"l1\") # for euclidean distance we change to norm='l2'\n", 222 | "\n", 223 | "# fit the scaler, this procedure does NOTHING\n", 224 | "scaler.fit(X_train)\n", 225 | "\n", 226 | "# transform train and test sets\n", 227 | "X_train_scaled = scaler.transform(X_train)\n", 228 | "X_test_scaled = scaler.transform(X_test)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 5, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "array([ 255.3, 889.1, 1421.7, ..., 744.6, 1099.5, 1048.9])" 240 | ] 241 | }, 242 | "execution_count": 5, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "# let's calculate the norm for each observation (feature vector)\n", 249 | "# original data\n", 250 | "\n", 251 | "np.round(np.linalg.norm(X_train, ord=1, axis=1), 1)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 6, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "array([1., 1., 1., ..., 1., 1., 1.])" 263 | ] 264 | }, 265 | "execution_count": 6, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "# let's calculate the norm for each observation (feature vector)\n", 272 | "# scaled data\n", 273 | "\n", 274 | "np.round(np.linalg.norm(X_train_scaled, ord=1, axis=1), 1)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### Scaling to l2" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 7, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# set up the scaler\n", 291 | "scaler = Normalizer(norm=\"l2\")\n", 292 | 
"\n", 293 | "# fit the scaler, this procedure does NOTHING\n", 294 | "scaler.fit(X_train)\n", 295 | "\n", 296 | "# transform train and test sets\n", 297 | "X_train_scaled = scaler.transform(X_train)\n", 298 | "X_test_scaled = scaler.transform(X_test)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 8, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "array([ 200. , 837.1, 1387.1, ..., 704.7, 1052.6, 1024.1])" 310 | ] 311 | }, 312 | "execution_count": 8, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "# let's calculate the norm for each observation (feature vector)\n", 319 | "# original data\n", 320 | "\n", 321 | "np.round(np.linalg.norm(X_train, ord=2, axis=1), 1)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 9, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "array([1., 1., 1., ..., 1., 1., 1.])" 333 | ] 334 | }, 335 | "execution_count": 9, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "# let's calculate the norm for each observation (feature vector)\n", 342 | "# scaled data\n", 343 | "\n", 344 | "np.round(np.linalg.norm(X_train_scaled, ord=2, axis=1), 1)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "feml", 358 | "language": "python", 359 | "name": "feml" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 3 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython3", 371 | "version": "3.8.2" 372 | }, 373 | "toc": { 374 | "base_numbering": 1, 375 | "nav_menu": {}, 376 | "number_sections": true, 377 | "sideBar": true, 378 | "skip_h1_title": false, 379 | "title_cell": "Table of Contents", 380 | "title_sidebar": "Contents", 381 | "toc_cell": false, 382 | "toc_position": {}, 383 | "toc_section_display": "block", 384 | "toc_window_display": true 385 | } 386 | }, 387 | "nbformat": 4, 388 | "nbformat_minor": 2 389 | } 390 | -------------------------------------------------------------------------------- /ch09-featuretools/Recipe3-Combining-numerical-features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import featuretools as ft\n", 11 | "from woodwork.logical_types import Categorical" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# let's load the data again\n", 21 | "\n", 22 | "df = pd.read_csv(\"retail.csv\", parse_dates=[\"invoice_date\"])" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# create and entity set\n", 32 | "\n", 33 | "es = ft.EntitySet(id=\"data\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Add the data to the entity\n", 43 | "\n", 44 | "es = es.add_dataframe(\n", 45 | " dataframe=df, # the dataframe 
with the data\n", 46 | " dataframe_name=\"data\", # unique name to associate with this dataframe\n", 47 | " index=\"rows\", # column name to index the items\n", 48 | " make_index=True, # if true, create a new column with unique values\n", 49 | " time_index=\"invoice_date\", # column containing time data\n", 50 | " logical_types={\n", 51 | " \"customer_id\": Categorical, # the id is numerical, but should be handled as categorical\n", 52 | " },\n", 53 | ")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "Entityset: data\n", 65 | " DataFrames:\n", 66 | " data [Rows: 741301, Columns: 8]\n", 67 | " invoices [Rows: 40505, Columns: 3]\n", 68 | " Relationships:\n", 69 | " data.invoice -> invoices.invoice" 70 | ] 71 | }, 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "# Create a new dataframe with invoices\n", 79 | "# indicating its relationship to the main data\n", 80 | "\n", 81 | "es.normalize_dataframe(\n", 82 | " base_dataframe_name=\"data\", # Dataframe name from which to split.\n", 83 | " new_dataframe_name=\"invoices\", # Name of the new dataframe.\n", 84 | " index=\"invoice\", # relationship will be created across this column.\n", 85 | " copy_columns=[\"customer_id\"], # columns to copy over to the new dataframe.\n", 86 | ")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "[<Feature: customer_id>,\n", 98 | " <Feature: invoice>,\n", 99 | " <Feature: stock_code>,\n", 100 | " <Feature: description>,\n", 101 | " <Feature: quantity>,\n", 102 | " <Feature: price>,\n", 103 | " <Feature: price * quantity>]" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "# Obtain new variable \"amount\" by multiplying\n", 113 | "# price and quantity.\n", 114 | "\n", 115 | "feature_matrix, feature_defs = ft.dfs(\n", 116 | " entityset=es, # the entity set\n", 117 | " target_dataframe_name=\"data\", # the dataframe for which to create the features\n", 118 | " agg_primitives=[], # an empty list avoids returning the default aggregation primitives\n", 119 | " trans_primitives=[\"multiply_numeric\"], # the operation to create the new features\n", 120 | " primitive_options={ # the features that we want to multiply\n", 121 | " (\"multiply_numeric\"): {\n", 122 | " 'include_columns': {\n", 123 | " 'data': [\"quantity\", \"price\"]\n", 124 | " }\n", 125 | " }\n", 126 | " },\n", 127 | " ignore_dataframes=[\"invoices\"],\n", 128 | ")\n", 129 | "\n", 130 | "# display name of created features\n", 131 | "feature_defs" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/html": [ 142 | "
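As an aside, the feature definitions returned by ft.dfs can be reused to recompute the same features later; a minimal sketch, assuming es and feature_defs as created above, using featuretools' calculate_feature_matrix:

import featuretools as ft

# recompute the exact same features (including "price * quantity")
# from the stored definitions, e.g. on a refreshed entity set
feature_matrix_again = ft.calculate_feature_matrix(
    features=feature_defs,  # list returned by ft.dfs above
    entityset=es,           # entity set holding the data
)

feature_matrix_again.head()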
\n", 143 | "\n", 156 | "\n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | "
customer_idinvoicestock_codedescriptionquantitypriceprice * quantity
rows
013085.04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS126.9583.4
113085.048943479323PPINK CHERRY LIGHTS126.7581.0
213085.048943479323WWHITE CHERRY LIGHTS126.7581.0
313085.048943422041RECORD FRAME 7\" SINGLE SIZE482.10100.8
413085.048943421232STRAWBERRY CERAMIC TRINKET BOX241.2530.0
\n", 232 | "
" 233 | ], 234 | "text/plain": [ 235 | " customer_id invoice stock_code description \\\n", 236 | "rows \n", 237 | "0 13085.0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS \n", 238 | "1 13085.0 489434 79323P PINK CHERRY LIGHTS \n", 239 | "2 13085.0 489434 79323W WHITE CHERRY LIGHTS \n", 240 | "3 13085.0 489434 22041 RECORD FRAME 7\" SINGLE SIZE \n", 241 | "4 13085.0 489434 21232 STRAWBERRY CERAMIC TRINKET BOX \n", 242 | "\n", 243 | " quantity price price * quantity \n", 244 | "rows \n", 245 | "0 12 6.95 83.4 \n", 246 | "1 12 6.75 81.0 \n", 247 | "2 12 6.75 81.0 \n", 248 | "3 48 2.10 100.8 \n", 249 | "4 24 1.25 30.0 " 250 | ] 251 | }, 252 | "execution_count": 7, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "feature_matrix.head()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## In relation to pandas" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 8, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/html": [ 276 | "
\n", 277 | "\n", 290 | "\n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | "
customer_idinvoiceinvoice_datestock_codedescriptionquantityprice
013085.04894342009-12-01 07:45:008504815CM CHRISTMAS GLASS BALL 20 LIGHTS126.95
113085.04894342009-12-01 07:45:0079323PPINK CHERRY LIGHTS126.75
213085.04894342009-12-01 07:45:0079323WWHITE CHERRY LIGHTS126.75
313085.04894342009-12-01 07:45:0022041RECORD FRAME 7\" SINGLE SIZE482.10
413085.04894342009-12-01 07:45:0021232STRAWBERRY CERAMIC TRINKET BOX241.25
\n", 356 | "
" 357 | ], 358 | "text/plain": [ 359 | " customer_id invoice invoice_date stock_code \\\n", 360 | "0 13085.0 489434 2009-12-01 07:45:00 85048 \n", 361 | "1 13085.0 489434 2009-12-01 07:45:00 79323P \n", 362 | "2 13085.0 489434 2009-12-01 07:45:00 79323W \n", 363 | "3 13085.0 489434 2009-12-01 07:45:00 22041 \n", 364 | "4 13085.0 489434 2009-12-01 07:45:00 21232 \n", 365 | "\n", 366 | " description quantity price \n", 367 | "0 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 6.95 \n", 368 | "1 PINK CHERRY LIGHTS 12 6.75 \n", 369 | "2 WHITE CHERRY LIGHTS 12 6.75 \n", 370 | "3 RECORD FRAME 7\" SINGLE SIZE 48 2.10 \n", 371 | "4 STRAWBERRY CERAMIC TRINKET BOX 24 1.25 " 372 | ] 373 | }, 374 | "execution_count": 8, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "# load data\n", 381 | "\n", 382 | "df = pd.read_csv(\"retail.csv\", parse_dates=[\"invoice_date\"])\n", 383 | "\n", 384 | "df.head()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 9, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/html": [ 395 | "
\n", 396 | "\n", 409 | "\n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | "
customer_idinvoiceinvoice_datestock_codedescriptionquantitypriceamount
013085.04894342009-12-01 07:45:008504815CM CHRISTMAS GLASS BALL 20 LIGHTS126.9583.4
113085.04894342009-12-01 07:45:0079323PPINK CHERRY LIGHTS126.7581.0
213085.04894342009-12-01 07:45:0079323WWHITE CHERRY LIGHTS126.7581.0
313085.04894342009-12-01 07:45:0022041RECORD FRAME 7\" SINGLE SIZE482.10100.8
413085.04894342009-12-01 07:45:0021232STRAWBERRY CERAMIC TRINKET BOX241.2530.0
\n", 481 | "
" 482 | ], 483 | "text/plain": [ 484 | " customer_id invoice invoice_date stock_code \\\n", 485 | "0 13085.0 489434 2009-12-01 07:45:00 85048 \n", 486 | "1 13085.0 489434 2009-12-01 07:45:00 79323P \n", 487 | "2 13085.0 489434 2009-12-01 07:45:00 79323W \n", 488 | "3 13085.0 489434 2009-12-01 07:45:00 22041 \n", 489 | "4 13085.0 489434 2009-12-01 07:45:00 21232 \n", 490 | "\n", 491 | " description quantity price amount \n", 492 | "0 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 6.95 83.4 \n", 493 | "1 PINK CHERRY LIGHTS 12 6.75 81.0 \n", 494 | "2 WHITE CHERRY LIGHTS 12 6.75 81.0 \n", 495 | "3 RECORD FRAME 7\" SINGLE SIZE 48 2.10 100.8 \n", 496 | "4 STRAWBERRY CERAMIC TRINKET BOX 24 1.25 30.0 " 497 | ] 498 | }, 499 | "execution_count": 9, 500 | "metadata": {}, 501 | "output_type": "execute_result" 502 | } 503 | ], 504 | "source": [ 505 | "# Add total amount of transaction\n", 506 | "\n", 507 | "df[\"amount\"] = df[\"quantity\"].mul(df[\"price\"])\n", 508 | "\n", 509 | "df.head()" 510 | ] 511 | } 512 | ], 513 | "metadata": { 514 | "kernelspec": { 515 | "display_name": "fsml", 516 | "language": "python", 517 | "name": "fsml" 518 | }, 519 | "language_info": { 520 | "codemirror_mode": { 521 | "name": "ipython", 522 | "version": 3 523 | }, 524 | "file_extension": ".py", 525 | "mimetype": "text/x-python", 526 | "name": "python", 527 | "nbconvert_exporter": "python", 528 | "pygments_lexer": "ipython3", 529 | "version": "3.10.5" 530 | }, 531 | "toc": { 532 | "base_numbering": 1, 533 | "nav_menu": {}, 534 | "number_sections": true, 535 | "sideBar": true, 536 | "skip_h1_title": false, 537 | "title_cell": "Table of Contents", 538 | "title_sidebar": "Contents", 539 | "toc_cell": false, 540 | "toc_position": { 541 | "height": "calc(100% - 180px)", 542 | "left": "10px", 543 | "top": "150px", 544 | "width": "165px" 545 | }, 546 | "toc_section_display": "block", 547 | "toc_window_display": true 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 2 552 | } 553 | -------------------------------------------------------------------------------- /ch09-featuretools/prepare-retail-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Online Retail II Data Set\n", 8 | "\n", 9 | "In this notebook we will prepare and store the Online Retail II Data Set from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail+II)\n", 10 | "\n", 11 | "\n", 12 | "**Citation:**\n", 13 | "\n", 14 | "Chen, D. Sain, S.L., and Guo, K. (2012), Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and Customer Strategy Management, Vol. 19, No. 3, pp. 197-208. https://link.springer.com/article/10.1057/dbm.2012.17. \n", 15 | "\n", 16 | "## Download the data\n", 17 | "\n", 18 | "- Navigate to the [data folder](https://archive.ics.uci.edu/ml/machine-learning-databases/00502/).\n", 19 | "- Download the file called **online_retail_II.xlsx**.\n", 20 | "- Save the file in the same folder that contains this notebook." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "(1067371, 8)" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "# Load the data\n", 50 | "\n", 51 | "# The data is provided as two sheets in a single Excel file.\n", 52 | "# Load both and join into a single dataframe.\n", 53 | "\n", 54 | "# It takes a while...\n", 55 | "\n", 56 | "file = 'online_retail_II.xlsx'\n", 57 | "\n", 58 | "df_1 = pd.read_excel(file, sheet_name='Year 2009-2010')\n", 59 | "df_2 = pd.read_excel(file, sheet_name='Year 2010-2011')\n", 60 | "\n", 61 | "df = pd.concat([df_1, df_2])\n", 62 | "\n", 63 | "df.shape" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/html": [ 74 | "
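A small side note on the loading step above: pandas can also read every sheet of the workbook at once, which avoids hard-coding the two sheet names; a brief sketch, assuming the same online_retail_II.xlsx file:

import pandas as pd

# sheet_name=None returns a dict of dataframes, one per sheet
sheets = pd.read_excel("online_retail_II.xlsx", sheet_name=None)

# concatenate all sheets into a single dataframe
df = pd.concat(sheets.values(), ignore_index=True)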
\n", 75 | "\n", 88 | "\n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountry
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom
\n", 160 | "
" 161 | ], 162 | "text/plain": [ 163 | " Invoice StockCode Description Quantity \\\n", 164 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 165 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 166 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 167 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 168 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 169 | "\n", 170 | " InvoiceDate Price Customer ID Country \n", 171 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom \n", 172 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 173 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 174 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom \n", 175 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom " 176 | ] 177 | }, 178 | "execution_count": 3, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "# Inspect dataframe\n", 185 | "\n", 186 | "df.head()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 4, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "# Retain customers from the UK\n", 196 | "\n", 197 | "df = df[df[\"Country\"]==\"United Kingdom\"]\n", 198 | "df.drop(\"Country\", axis=1, inplace=True)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 5, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# Remove transactions without Customer ID\n", 208 | "\n", 209 | "df.dropna(subset=[\"Customer ID\"], inplace=True)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# Rename columns\n", 219 | "\n", 220 | "df.columns = [\n", 221 | " \"invoice\",\n", 222 | " \"stock_code\",\n", 223 | " \"description\",\n", 224 | " \"quantity\",\n", 225 | " \"invoice_date\",\n", 226 | " \"price\",\n", 227 | " \"customer_id\",\n", 228 | "]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 7, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# index rows with unique values\n", 238 | "\n", 239 | "df.reset_index(inplace=True, drop=True)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 8, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# re-order columns\n", 249 | "\n", 250 | "ordered_cols = [\n", 251 | " \"customer_id\",\n", 252 | " \"invoice\",\n", 253 | " \"invoice_date\",\n", 254 | " \"stock_code\",\n", 255 | " \"description\",\n", 256 | " \"quantity\",\n", 257 | " \"price\",\n", 258 | "]" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 9, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# save data with re-ordered columns\n", 268 | "\n", 269 | "df[ordered_cols].to_csv('retail.csv', index=False)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "fsml", 283 | "language": "python", 284 | "name": "fsml" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.10.5" 297 | }, 298 | "toc": { 299 | "base_numbering": 1, 300 | "nav_menu": {}, 301 | "number_sections": true, 302 | "sideBar": true, 303 | 
"skip_h1_title": false, 304 | "title_cell": "Table of Contents", 305 | "title_sidebar": "Contents", 306 | "toc_cell": false, 307 | "toc_position": {}, 308 | "toc_section_display": true, 309 | "toc_window_display": false 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 4 314 | } 315 | -------------------------------------------------------------------------------- /ch10-tsfresh/Recipe4-extract-features-after-feature-selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "6bdb68a4", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "\n", 12 | "from sklearn.feature_selection import SelectFromModel\n", 13 | "from sklearn.linear_model import LogisticRegression\n", 14 | "\n", 15 | "from tsfresh import extract_features, extract_relevant_features\n", 16 | "from tsfresh.feature_extraction import settings" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "id": "77e5e5e6", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# load data\n", 27 | "\n", 28 | "X = pd.read_csv(\"occupancy.csv\", parse_dates=[\"date\"])\n", 29 | "y = pd.read_csv(\"occupancy_target.csv\", index_col=\"id\")\n", 30 | "y = pd.Series(y[\"occupancy\"])" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "id": "7d6e26b5", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:29<00:00, 2.97s/it]\n" 44 | ] 45 | }, 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(135, 969)" 50 | ] 51 | }, 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "# create and select features\n", 59 | "\n", 60 | "features = extract_relevant_features(\n", 61 | " X,\n", 62 | " y,\n", 63 | " column_id=\"id\",\n", 64 | " column_sort=\"date\",\n", 65 | ")\n", 66 | "\n", 67 | "features.shape" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "id": "b4d73915", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "array(['light__sum_of_reoccurring_data_points',\n", 80 | " 'co2__spkt_welch_density__coeff_2', 'co2__variance',\n", 81 | " 'temperature__c3__lag_1', 'temperature__abs_energy',\n", 82 | " 'temperature__c3__lag_2', 'temperature__c3__lag_3',\n", 83 | " 'co2__sum_of_reoccurring_data_points',\n", 84 | " 'light__spkt_welch_density__coeff_8', 'light__variance',\n", 85 | " 'light__agg_linear_trend__attr_\"slope\"__chunk_len_50__f_agg_\"var\"',\n", 86 | " 'light__agg_linear_trend__attr_\"intercept\"__chunk_len_10__f_agg_\"var\"'],\n", 87 | " dtype=object)" 88 | ] 89 | }, 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "# select features with lasso\n", 97 | "\n", 98 | "cls = LogisticRegression(\n", 99 | " penalty=\"l1\", \n", 100 | " solver=\"liblinear\",\n", 101 | " random_state=10,\n", 102 | " C=0.05,\n", 103 | " max_iter=1000,\n", 104 | ")\n", 105 | "\n", 106 | "selector = SelectFromModel(cls)\n", 107 | "\n", 108 | "selector.fit(features, y)\n", 109 | "\n", 110 | "features = selector.get_feature_names_out()\n", 111 | "\n", 112 | "features" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "id": "72ddeff0", 119 | 
"metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "{'light': {'sum_of_reoccurring_data_points': None,\n", 125 | " 'spkt_welch_density': [{'coeff': 8}],\n", 126 | " 'variance': None,\n", 127 | " 'agg_linear_trend': [{'attr': 'slope', 'chunk_len': 50, 'f_agg': 'var'},\n", 128 | " {'attr': 'intercept', 'chunk_len': 10, 'f_agg': 'var'}]},\n", 129 | " 'co2': {'spkt_welch_density': [{'coeff': 2}],\n", 130 | " 'variance': None,\n", 131 | " 'sum_of_reoccurring_data_points': None},\n", 132 | " 'temperature': {'c3': [{'lag': 1}, {'lag': 2}, {'lag': 3}],\n", 133 | " 'abs_energy': None}}" 134 | ] 135 | }, 136 | "execution_count": 5, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "# capture selected features for each time series\n", 143 | "\n", 144 | "kind_to_fc_parameters = settings.from_columns(selector.get_feature_names_out())\n", 145 | "\n", 146 | "kind_to_fc_parameters" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "id": "f91d6eb8", 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:03<00:00, 2.60it/s]\n" 160 | ] 161 | }, 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "(135, 12)" 166 | ] 167 | }, 168 | "execution_count": 6, 169 | "metadata": {}, 170 | "output_type": "execute_result" 171 | } 172 | ], 173 | "source": [ 174 | "# create selected features for each time series\n", 175 | "\n", 176 | "features = extract_features(\n", 177 | " X,\n", 178 | " column_id=\"id\",\n", 179 | " column_sort=\"date\",\n", 180 | " kind_to_fc_parameters=kind_to_fc_parameters,\n", 181 | ")\n", 182 | "\n", 183 | "features.shape" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "id": "065af937", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/html": [ 195 | "
\n", 196 | "\n", 209 | "\n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | "
temperature__c3__lag_1temperature__c3__lag_2temperature__c3__lag_3temperature__abs_energylight__sum_of_reoccurring_data_pointslight__spkt_welch_density__coeff_8light__variancelight__agg_linear_trend__attr_\"slope\"__chunk_len_50__f_agg_\"var\"light__agg_linear_trend__attr_\"intercept\"__chunk_len_10__f_agg_\"var\"co2__spkt_welch_density__coeff_2co2__varianceco2__sum_of_reoccurring_data_points
111585.12793411581.20359011578.17880730721.5687032514.0332.22129518086.371875-21130.342519305.13751523.529443756.70066414124.000000
210751.99961010752.68250810753.11981229225.2543740.00.0000000.0000000.00000.0000350.067478377.28089513202.000000
310187.86522610187.20696310186.28404028198.1399500.00.0000000.0000000.00000.000091.896894115.26929820885.666667
49908.9002249909.7313889910.16394727680.8001840.00.0000000.0000000.00000.000042.39490535.47321618285.500000
59705.9897899706.4045519706.70796327299.0974690.00.0000000.0000000.00000.00004.81130327.53908017670.500000
\n", 305 | "
" 306 | ], 307 | "text/plain": [ 308 | " temperature__c3__lag_1 temperature__c3__lag_2 temperature__c3__lag_3 \\\n", 309 | "1 11585.127934 11581.203590 11578.178807 \n", 310 | "2 10751.999610 10752.682508 10753.119812 \n", 311 | "3 10187.865226 10187.206963 10186.284040 \n", 312 | "4 9908.900224 9909.731388 9910.163947 \n", 313 | "5 9705.989789 9706.404551 9706.707963 \n", 314 | "\n", 315 | " temperature__abs_energy light__sum_of_reoccurring_data_points \\\n", 316 | "1 30721.568703 2514.0 \n", 317 | "2 29225.254374 0.0 \n", 318 | "3 28198.139950 0.0 \n", 319 | "4 27680.800184 0.0 \n", 320 | "5 27299.097469 0.0 \n", 321 | "\n", 322 | " light__spkt_welch_density__coeff_8 light__variance \\\n", 323 | "1 332.221295 18086.371875 \n", 324 | "2 0.000000 0.000000 \n", 325 | "3 0.000000 0.000000 \n", 326 | "4 0.000000 0.000000 \n", 327 | "5 0.000000 0.000000 \n", 328 | "\n", 329 | " light__agg_linear_trend__attr_\"slope\"__chunk_len_50__f_agg_\"var\" \\\n", 330 | "1 -21130.3425 \n", 331 | "2 0.0000 \n", 332 | "3 0.0000 \n", 333 | "4 0.0000 \n", 334 | "5 0.0000 \n", 335 | "\n", 336 | " light__agg_linear_trend__attr_\"intercept\"__chunk_len_10__f_agg_\"var\" \\\n", 337 | "1 19305.1375 \n", 338 | "2 0.0000 \n", 339 | "3 0.0000 \n", 340 | "4 0.0000 \n", 341 | "5 0.0000 \n", 342 | "\n", 343 | " co2__spkt_welch_density__coeff_2 co2__variance \\\n", 344 | "1 1523.529443 756.700664 \n", 345 | "2 350.067478 377.280895 \n", 346 | "3 91.896894 115.269298 \n", 347 | "4 42.394905 35.473216 \n", 348 | "5 4.811303 27.539080 \n", 349 | "\n", 350 | " co2__sum_of_reoccurring_data_points \n", 351 | "1 14124.000000 \n", 352 | "2 13202.000000 \n", 353 | "3 20885.666667 \n", 354 | "4 18285.500000 \n", 355 | "5 17670.500000 " 356 | ] 357 | }, 358 | "execution_count": 7, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "features.head()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "cae5efb1", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "fsml", 379 | "language": "python", 380 | "name": "fsml" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.10.5" 393 | }, 394 | "toc": { 395 | "base_numbering": 1, 396 | "nav_menu": {}, 397 | "number_sections": true, 398 | "sideBar": true, 399 | "skip_h1_title": false, 400 | "title_cell": "Table of Contents", 401 | "title_sidebar": "Contents", 402 | "toc_cell": false, 403 | "toc_position": {}, 404 | "toc_section_display": true, 405 | "toc_window_display": false 406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 5 410 | } 411 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | category-encoders==2.4.0 2 | feature-engine==1.4.0 3 | featuretools==1.5.0 4 | matplotlib==3.4.2 5 | matplotlib-inline==0.1.2 6 | numpy==1.22.0 7 | pandas==1.5.0 8 | scikit-learn==1.1.0 9 | scipy==1.7.0 10 | seaborn==0.11.1 11 | statsmodels==0.12.2 12 | tsfresh==0.19.0 --------------------------------------------------------------------------------