├── .gitignore
├── LICENSE
├── README.md
├── ch01-missing-data-imputation
│   ├── Recipe-01-Removing-observations-with-missing-data.ipynb
│   ├── Recipe-02-Performing-mean-or-median-imputation.ipynb
│   ├── Recipe-03-Imputing-categorical-variables.ipynb
│   ├── Recipe-04-Replacing-missing-values-with-an-arbitrary-number.ipynb
│   ├── Recipe-05-Finding-extreme-values-for-imputation.ipynb
│   ├── Recipe-06-Marking-imputed-values.ipynb
│   ├── Recipe-07-Performing-multivariate-imputation-by-chained-equations.ipynb
│   ├── Recipe-08-Estimating-missing-data-with-K-nearest-neighbours.ipynb
│   └── donwload-prepare-store-credit-approval-dataset.ipynb
├── ch02-categorical-encoding
│   ├── Recipe-01-One-hot-encoding.ipynb
│   ├── Recipe-02-One-hot-encoding-frequent-categories.ipynb
│   ├── Recipe-03-Replacing-categories-by-counts-frequency.ipynb
│   ├── Recipe-04-Ordinal-encoding.ipynb
│   ├── Recipe-05-Ordered-ordinal-encoding.ipynb
│   ├── Recipe-06-Target-mean-encoding.ipynb
│   ├── Recipe-07-Weight-of-evidence.ipynb
│   ├── Recipe-08-Grouping-rare-categories.ipynb
│   ├── Recipe-09-Binary-Encoding.ipynb
│   └── donwload-prepare-store-credit-approval-dataset.ipynb
├── ch03-variable-transformation
│   ├── Recipe-1-logarithmic-transformation.ipynb
│   ├── Recipe-2-reciprocal-transformation.ipynb
│   ├── Recipe-3-square-root-transformation.ipynb
│   ├── Recipe-4-power-transformation.ipynb
│   ├── Recipe-5-Box-Cox-transformation.ipynb
│   └── Recipe-6-Yeo-Johnson-transformation.ipynb
├── ch04-discretization
│   ├── Recipe-1-Equal-width-discretization.ipynb
│   ├── Recipe-2-Equal-frequency-discretisation.ipynb
│   ├── Recipe-3-User-defined-interval-discretization.ipynb
│   ├── Recipe-4-Discretization-k-means.ipynb
│   ├── Recipe-5-Binarization.ipynb
│   ├── Recipe-6-Discretization-with-decision-trees.ipynb
│   └── donwload-prepare-store-enron-data.ipynb
├── ch05-outliers
│   ├── Recipe-1-Visualizing-outliers-with-boxplots.ipynb
│   ├── Recipe-2-Finding-outliers-with-mean-and-std.ipynb
│   ├── Recipe-3-Finding-outliers-with-the-IQR.ipynb
│   ├── Recipe-4-Removing-outliers.ipynb
│   ├── Recipe-5-Capping-outliers.ipynb
│   └── Recipe-6-Capping-outliers-with-quantiles.ipynb
├── ch06-datetime
│   ├── Recipe-1-Extracting-features-from-dates-with-pandas.ipynb
│   ├── Recipe-2-Extracting-features-from-time-with-pandas.ipynb
│   ├── Recipe-3-Capturing-elapsed-time-between-2-variables.ipynb
│   ├── Recipe-4-Working-with-different-time-zones.ipynb
│   └── Recipe-5-Automating-datetime-features-with-Feature-engine.ipynb
├── ch07-scaling
│   ├── Recipe-1-standardization.ipynb
│   ├── Recipe-2-min-max-scaling.ipynb
│   ├── Recipe-3-robust-scaling.ipynb
│   ├── Recipe-4-mean-normalization.ipynb
│   ├── Recipe-5-maximum-absolute-scaling.ipynb
│   └── Recipe-6-scaling-to-unit-length.ipynb
├── ch08-creation
│   ├── Cyclical-features-figures.ipynb
│   ├── Recipe1-Combine-features-with-functions.ipynb
│   ├── Recipe2-Comparing-features-to-reference-variable.ipynb
│   ├── Recipe3-PolynomialExpansion.ipynb
│   ├── Recipe4-Combining-features-with-trees.ipynb
│   ├── Recipe5-Periodic-features.ipynb
│   ├── Recipe6-Spline-features.ipynb
│   ├── Spline-features-figures.ipynb
│   └── polynomial_features_figures.ipynb
├── ch09-featuretools
│   ├── Recipe1-Setting-up-an-entitity-set.ipynb
│   ├── Recipe2-Creating-features-with-cumulative-primitives.ipynb
│   ├── Recipe3-Combining-numerical-features.ipynb
│   ├── Recipe4-Creating-features-from-datetime.ipynb
│   ├── Recipe5-Extracting-features-from-text.ipynb
│   ├── Recipe6-Creating-features-with-aggregation-primitives.ipynb
│   └── prepare-retail-dataset.ipynb
├── ch10-tsfresh
│   ├── Recipe1-extract-features-automatically-with-tsfresh.ipynb
│   ├── Recipe2-extract-relevant-features-with-tsfresh.ipynb
│   ├── Recipe3-extract-specific-features-with-tsfresh.ipynb
│   ├── Recipe4-extract-features-after-feature-selection.ipynb
│   ├── Recipe5-extract-features-automatically-within-pipeline.ipynb
│   └── prepare-occupancy-dataset.ipynb
├── ch11-text
│   ├── Recipe1-Capturing-text-complexity-in-features.ipynb
│   ├── Recipe2-Sentence-tokenization.ipynb
│   ├── Recipe3-bag-of-words.ipynb
│   ├── Recipe4-TFIDF.ipynb
│   └── Recipe5-cleaning-text.ipynb
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | *.csv
3 | *.data
4 | *.txt
5 | *.gz
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Packt
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | ### [Packt Conference : Put Generative AI to work on Oct 11-13 (Virtual)](https://packt.link/JGIEY)
3 | 
4 | 

[![Packt Conference](https://hub.packtpub.com/wp-content/uploads/2023/08/put-generative-ai-to-work-packt.png)](https://packt.link/JGIEY)

5 | 3 Days, 20+ AI Experts, 25+ Workshops and Power Talks
6 | 
7 | Code: USD75OFF
8 | 
9 | 
10 | 
11 | 
12 | # Python Feature Engineering Cookbook-Second Edition
13 | 
14 | Python Feature Engineering Cookbook-Second Edition
15 | 
16 | This is the code repository for [Python Feature Engineering Cookbook-Second Edition](https://www.packtpub.com/product/python-feature-engineering-cookbook-second-edition/9781804611302), published by Packt.
17 | 
18 | **Over 70 recipes for creating, engineering, and transforming features to build machine learning models**
19 | 
20 | ## What is this book about?
21 | Feature engineering, the process of transforming variables and creating features, albeit time-consuming, ensures that your machine learning models perform seamlessly. This second edition of Python Feature Engineering Cookbook will take the struggle out of feature engineering by showing you how to use open source Python libraries to accelerate the process via a plethora of practical, hands-on recipes.
22 | 
23 | This updated edition begins by addressing fundamental data challenges such as missing data and categorical values, before moving on to strategies for dealing with skewed distributions and outliers. The concluding chapters show you how to develop new features from various types of data, including text, time series, and relational databases. With the help of numerous open source Python libraries, you'll learn how to implement each feature engineering method in a performant, reproducible, and elegant manner.
24 | 
25 | By the end of this Python book, you will have the tools and expertise needed to confidently build end-to-end and reproducible feature engineering pipelines that can be deployed into production.
26 | 
27 | This book covers the following exciting features:
28 | * Impute missing data using various univariate and multivariate methods
29 | * Encode categorical variables with one-hot, ordinal, and count encoding
30 | * Handle highly cardinal categorical variables
31 | * Transform, discretize, and scale your variables
32 | * Create variables from date and time with pandas and Feature-engine
33 | * Combine variables into new features
34 | * Extract features from text as well as from transactional data with Featuretools
35 | * Create features from time series data with tsfresh
36 | 
37 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1804611301) today!
38 | 
39 | https://www.packtpub.com/
40 | 
41 | ## Instructions and Navigations
42 | All of the code is organized into folders.
43 | 
44 | The code will look like the following:
45 | ```
46 | X_train = pd.DataFrame(
47 | X_train,
48 | columns=numeric_vars + remaining_vars,
49 | )
50 | ```
51 | 
52 | **Following is what you need for this book:**
53 | This book is for machine learning and data science students and professionals, as well as software engineers working on machine learning model deployment, who want to learn more about how to transform their data and create new features to train machine learning models in a better way.
54 | 
55 | With the following software and hardware list you can run all code files present in the book (Chapter 1-11).
56 | 
57 | ### Software and Hardware List
58 | 
59 | | Chapter | Software required | OS required |
60 | | -------- | -------------------------------------------------------------------------------------| -----------------------------------|
61 | | 1-11 | Python 3.3 or greater | Windows, Mac OS, or Linux |
62 | | 1-11 | Jupyter Notebook | Windows, Mac OS, or Linux |
63 | 
64 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://packt.link/UXyxc).
65 | 
66 | ## Errata
67 | 
68 | * Page 332 : **Scikit-learn dataset website: z** should be **Scikit-learn dataset website: https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset**
69 | 
70 | ### Related products
71 | * Data Cleaning and Exploration with Machine Learning [[Packt]](https://www.packtpub.com/product/data-cleaning-and-exploration-with-machine-learning/9781803241678) [[Amazon]](https://www.amazon.com/dp/1803241675)
72 | 
73 | * Production-Ready Applied Deep Learning [[Packt]](https://www.packtpub.com/product/production-ready-applied-deep-learning/9781803243665) [[Amazon]](https://www.amazon.com/dp/180324366X)
74 | 
75 | ## Get to Know the Author
76 | **Soledad Galli** is a data scientist, instructor, and software developer with more than 10 years of experience in world-class academic institutions and renowned businesses. She has developed and put into production machine learning models to assess insurance claims and credit risk and prevent fraud. She teaches multiple online courses on machine learning, which have enrolled 44,000+ students worldwide and consistently receive good student reviews. She is also the developer and maintainer of the open source Python library Feature-engine, which is currently downloaded 100,000+ times per month. Soledad received a Data Science Leaders Award in 2018 and was recognized as one of LinkedIn's voices in data science and analytics in 2019.
77 | 
--------------------------------------------------------------------------------
/ch01-missing-data-imputation/Recipe-05-Finding-extreme-values-for-imputation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Finding extreme values for imputation\n",
8 | "\n",
9 | "In this recipe, we will replace missing values by a value at the end of the distribution, estimated with a Gaussian approximation or the inter-quartile range proximity rule, utilizing pandas and Feature-engine."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "\n",
20 | "# to split the datasets:\n",
21 | "from sklearn.model_selection import train_test_split\n",
22 | "\n",
23 | "# to impute missing data with Feature-engine:\n",
24 | "from feature_engine.imputation import EndTailImputer"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Load data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/html": [
42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | "
A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15target
0b30.830.000ugwv1.25tt1fg202.001
1a58.674.460ugqh3.04tt6fg43.05601
2a24.50NaNugqhNaNNaNNaN0fg280.08241
3b27.831.540ugwv3.75tt5tg100.031
4b20.175.625ugwv1.71tf0fs120.001
\n", 176 | "
" 177 | ], 178 | "text/plain": [ 179 | " A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 \\\n", 180 | "0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 \n", 181 | "1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 \n", 182 | "2 a 24.50 NaN u g q h NaN NaN NaN 0 f g 280.0 824 \n", 183 | "3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 \n", 184 | "4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 \n", 185 | "\n", 186 | " target \n", 187 | "0 1 \n", 188 | "1 1 \n", 189 | "2 1 \n", 190 | "3 1 \n", 191 | "4 1 " 192 | ] 193 | }, 194 | "execution_count": 2, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "data = pd.read_csv(\"credit_approval_uci.csv\")\n", 201 | "\n", 202 | "data.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Select numerical variables" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 3, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# We exclude the target variable:\n", 219 | "\n", 220 | "numeric_vars = [\n", 221 | " var for var in data.select_dtypes(exclude=\"O\").columns.to_list() if var != \"target\"\n", 222 | "]" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Split data into train and test" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 4, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "((483, 6), (207, 6))" 241 | ] 242 | }, 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "X_train, X_test, y_train, y_test = train_test_split(\n", 250 | " data[numeric_vars],\n", 251 | " data[\"target\"],\n", 252 | " test_size=0.3,\n", 253 | " random_state=0,\n", 254 | ")\n", 255 | "\n", 256 | "X_train.shape, X_test.shape" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "## Find inter-quartile range" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 5, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "A2 16.4200\n", 275 | "A3 6.5825\n", 276 | "A8 2.8350\n", 277 | "A11 3.0000\n", 278 | "A14 212.0000\n", 279 | "A15 450.0000\n", 280 | "dtype: float64" 281 | ] 282 | }, 283 | "execution_count": 5, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "IQR = X_train.quantile(0.75) - X_train.quantile(0.25)\n", 290 | "\n", 291 | "IQR" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Find values beyond the right end of the distribution" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 6, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "{'A2': 63.550000000000004,\n", 310 | " 'A3': 17.43625,\n", 311 | " 'A8': 7.2524999999999995,\n", 312 | " 'A11': 7.5,\n", 313 | " 'A14': 590.0,\n", 314 | " 'A15': 1125.0}" 315 | ] 316 | }, 317 | "execution_count": 6, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "imputation_dict = (X_train.quantile(0.75) + 1.5 * IQR).to_dict()\n", 324 | "\n", 325 | "imputation_dict" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 7, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "# Replace missing data with estimated 
values:\n", 335 | "\n", 336 | "X_train = X_train.fillna(value=imputation_dict)\n", 337 | "X_test = X_test.fillna(value=imputation_dict)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "## Find imputation values with mean and standard deviation" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 8, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": [ 355 | "((483, 6), (207, 6))" 356 | ] 357 | }, 358 | "execution_count": 8, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "# Split the data:\n", 365 | "\n", 366 | "X_train, X_test, y_train, y_test = train_test_split(\n", 367 | " data[numeric_vars],\n", 368 | " data[\"target\"],\n", 369 | " test_size=0.3,\n", 370 | " random_state=0,\n", 371 | ")\n", 372 | "\n", 373 | "X_train.shape, X_test.shape" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 9, 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "data": { 383 | "text/plain": [ 384 | "{'A2': 68.35771260807589,\n", 385 | " 'A3': 19.98993346546277,\n", 386 | " 'A8': 12.418567732660225,\n", 387 | " 'A11': 18.320547522636247,\n", 388 | " 'A14': 710.6258760585449,\n", 389 | " 'A15': 12740.850618383225}" 390 | ] 391 | }, 392 | "execution_count": 9, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "imputation_dict = (X_train.mean() + 3 * X_train.std()).to_dict()\n", 399 | "\n", 400 | "imputation_dict" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 10, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "# Replace missing data with estimated values:\n", 410 | "\n", 411 | "X_train = X_train.fillna(value=imputation_dict)\n", 412 | "X_test = X_test.fillna(value=imputation_dict)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "## End tail imputation with Feature-engine" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 11, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "# Let's separate into train and test sets:\n", 429 | "\n", 430 | "X_train, X_test, y_train, y_test = train_test_split(\n", 431 | " data[numeric_vars],\n", 432 | " data[\"target\"],\n", 433 | " test_size=0.3,\n", 434 | " random_state=0,\n", 435 | ")" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 12, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "EndTailImputer(imputation_method='iqr')" 447 | ] 448 | }, 449 | "execution_count": 12, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "# Set up the imputer to find extreme values based of\n", 456 | "# the inter-quartile range proximity rule, placing\n", 457 | "# estimates at the right tail, using 3 times the IQR:\n", 458 | "\n", 459 | "imputer = EndTailImputer(\n", 460 | " imputation_method=\"iqr\",\n", 461 | " tail=\"right\",\n", 462 | " fold=3,\n", 463 | " variables=None,\n", 464 | ")\n", 465 | "\n", 466 | "imputer.fit(X_train)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 13, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "data": { 476 | "text/plain": [ 477 | "{'A2': 88.18,\n", 478 | " 'A3': 27.31,\n", 479 | " 'A8': 11.504999999999999,\n", 480 | " 'A11': 12.0,\n", 481 | " 'A14': 908.0,\n", 482 | " 'A15': 1800.0}" 483 | ] 484 | }, 485 | 
"execution_count": 13, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "# The values to use for the imputation:\n", 492 | "\n", 493 | "imputer.imputer_dict_" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 14, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "# Replace missing data:\n", 503 | "\n", 504 | "X_train = imputer.transform(X_train)\n", 505 | "X_test = imputer.transform(X_test)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [] 514 | } 515 | ], 516 | "metadata": { 517 | "kernelspec": { 518 | "display_name": "fenotebook", 519 | "language": "python", 520 | "name": "fenotebook" 521 | }, 522 | "language_info": { 523 | "codemirror_mode": { 524 | "name": "ipython", 525 | "version": 3 526 | }, 527 | "file_extension": ".py", 528 | "mimetype": "text/x-python", 529 | "name": "python", 530 | "nbconvert_exporter": "python", 531 | "pygments_lexer": "ipython3", 532 | "version": "3.8.2" 533 | }, 534 | "toc": { 535 | "base_numbering": 1, 536 | "nav_menu": {}, 537 | "number_sections": true, 538 | "sideBar": true, 539 | "skip_h1_title": false, 540 | "title_cell": "Table of Contents", 541 | "title_sidebar": "Contents", 542 | "toc_cell": false, 543 | "toc_position": {}, 544 | "toc_section_display": true, 545 | "toc_window_display": true 546 | } 547 | }, 548 | "nbformat": 4, 549 | "nbformat_minor": 2 550 | } 551 | -------------------------------------------------------------------------------- /ch01-missing-data-imputation/Recipe-08-Estimating-missing-data-with-K-nearest-neighbours.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Estimating missing data with K-nearest neighbors\n", 8 | "\n", 9 | "In this notebook, we will replace missing data, by the mean value shown by their closest k neighbors." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import matplotlib.pyplot as plt\n", 19 | "import pandas as pd\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.impute import KNNImputer\n", 22 | "from feature_engine.wrappers import SklearnTransformerWrapper" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Load data" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | "
A2A3A8A11A14A15target
030.830.0001.251202.001
158.674.4603.04643.05601
224.50NaNNaN0280.08241
327.831.5403.755100.031
420.175.6251.710120.001
\n", 120 | "
" 121 | ], 122 | "text/plain": [ 123 | " A2 A3 A8 A11 A14 A15 target\n", 124 | "0 30.83 0.000 1.25 1 202.0 0 1\n", 125 | "1 58.67 4.460 3.04 6 43.0 560 1\n", 126 | "2 24.50 NaN NaN 0 280.0 824 1\n", 127 | "3 27.83 1.540 3.75 5 100.0 3 1\n", 128 | "4 20.17 5.625 1.71 0 120.0 0 1" 129 | ] 130 | }, 131 | "execution_count": 2, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "# Load data with numerical variables\n", 138 | "\n", 139 | "variables = [\"A2\", \"A3\", \"A8\", \"A11\", \"A14\", \"A15\", \"target\"]\n", 140 | "\n", 141 | "data = pd.read_csv(\"credit_approval_uci.csv\", usecols=variables)\n", 142 | "\n", 143 | "data.head()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Split data into train and test sets" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 3, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "((483, 6), (207, 6))" 162 | ] 163 | }, 164 | "execution_count": 3, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "# Let's separate into training and testing set\n", 171 | "\n", 172 | "X_train, X_test, y_train, y_test = train_test_split(\n", 173 | " data.drop(\"target\", axis=1),\n", 174 | " data[\"target\"],\n", 175 | " test_size=0.3,\n", 176 | " random_state=0,\n", 177 | ")\n", 178 | "\n", 179 | "X_train.shape, X_test.shape" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "A2 0.022774\n", 191 | "A3 0.140787\n", 192 | "A8 0.140787\n", 193 | "A11 0.000000\n", 194 | "A14 0.014493\n", 195 | "A15 0.000000\n", 196 | "dtype: float64" 197 | ] 198 | }, 199 | "execution_count": 4, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "# Find the fraction of missing data:\n", 206 | "\n", 207 | "X_train.isnull().mean()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 5, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# Set up the imputer to find the closes 5 neighbors\n", 217 | "# utilizing euclidean distance, and weighting the\n", 218 | "# neighbours so that furthest neighbors have smaller\n", 219 | "# influence:\n", 220 | "\n", 221 | "imputer = KNNImputer(\n", 222 | " n_neighbors=5,\n", 223 | " weights=\"distance\",\n", 224 | ")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 6, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "KNNImputer(weights='distance')" 236 | ] 237 | }, 238 | "execution_count": 6, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "# Find the closest neighbors:\n", 245 | "\n", 246 | "imputer.fit(X_train)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 7, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Replace the missing values by the weighted\n", 256 | "# mean of the values shown by the neighbors:\n", 257 | "\n", 258 | "X_train = imputer.transform(X_train)\n", 259 | "X_test = imputer.transform(X_test)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 8, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "array([[4.608e+01, 3.000e+00, 2.375e+00, 8.000e+00, 3.960e+02, 
4.159e+03],\n", 271 | " [1.592e+01, 2.875e+00, 8.500e-02, 0.000e+00, 1.200e+02, 0.000e+00],\n", 272 | " [3.633e+01, 2.125e+00, 8.500e-02, 1.000e+00, 5.000e+01, 1.187e+03],\n", 273 | " ...,\n", 274 | " [1.958e+01, 6.650e-01, 1.665e+00, 0.000e+00, 2.200e+02, 5.000e+00],\n", 275 | " [2.283e+01, 2.290e+00, 2.290e+00, 7.000e+00, 1.400e+02, 2.384e+03],\n", 276 | " [4.058e+01, 3.290e+00, 3.500e+00, 0.000e+00, 4.000e+02, 0.000e+00]])" 277 | ] 278 | }, 279 | "execution_count": 8, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "# The result is a NumPy array:\n", 286 | "X_train" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "0 0\n", 298 | "1 0\n", 299 | "2 0\n", 300 | "3 0\n", 301 | "4 0\n", 302 | "5 0\n", 303 | "dtype: int64" 304 | ] 305 | }, 306 | "execution_count": 9, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "# We can corroborate that there is no missing data:\n", 313 | "\n", 314 | "pd.DataFrame(X_train).isnull().sum()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "## Find neighbors base on specific variables" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 10, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# Let's separate into training and testing set\n", 331 | "\n", 332 | "X_train, X_test, y_train, y_test = train_test_split(\n", 333 | " data.drop(\"target\", axis=1),\n", 334 | " data[\"target\"],\n", 335 | " test_size=0.3,\n", 336 | " random_state=0,\n", 337 | ")" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 11, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# Set up the imputer to find neighbous based on\n", 347 | "# 4 numerical variables:\n", 348 | "\n", 349 | "imputer = SklearnTransformerWrapper(\n", 350 | " transformer=KNNImputer(),\n", 351 | " variables=[\"A2\", \"A3\", \"A8\", \"A11\"],\n", 352 | ")" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 12, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "# Find neighbors and replace missing data\n", 362 | "# by their estimates:\n", 363 | "\n", 364 | "X_train = imputer.fit_transform(X_train)\n", 365 | "X_test = imputer.transform(X_test)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "fenotebook", 379 | "language": "python", 380 | "name": "fenotebook" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.8.2" 393 | }, 394 | "toc": { 395 | "base_numbering": 1, 396 | "nav_menu": {}, 397 | "number_sections": true, 398 | "sideBar": true, 399 | "skip_h1_title": false, 400 | "title_cell": "Table of Contents", 401 | "title_sidebar": "Contents", 402 | "toc_cell": false, 403 | "toc_position": {}, 404 | "toc_section_display": true, 405 | "toc_window_display": true 406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 2 410 | } 411 | 
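
A note on Recipe 8 above: `KNNImputer.transform()` returns NumPy arrays, so the original column names are lost. Below is a minimal sketch, not one of the repository's notebooks, that reuses the variable list and the train/test split from the recipe and shows one way the DataFrame structure could be restored after imputation; `credit_approval_uci.csv` is the file produced by the donwload-prepare-store-credit-approval-dataset.ipynb notebook that follows.

```
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

# Same variable list and split as in Recipe 8:
variables = ["A2", "A3", "A8", "A11", "A14", "A15", "target"]
data = pd.read_csv("credit_approval_uci.csv", usecols=variables)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

# 5 closest neighbours, weighted by distance, as in the recipe:
imputer = KNNImputer(n_neighbors=5, weights="distance")

# KNNImputer returns NumPy arrays; wrap them back into DataFrames so the
# column names and row indexes are preserved for the rest of the pipeline.
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)
```

Alternatively, Feature-engine's `SklearnTransformerWrapper`, used at the end of Recipe 8 to impute only selected variables, returns a DataFrame directly, so no wrapping is needed there.
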
-------------------------------------------------------------------------------- /ch01-missing-data-imputation/donwload-prepare-store-credit-approval-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download, prepare and save the Credit Approval Dataset\n", 8 | "\n", 9 | "\n", 10 | "\n", 11 | "In this notebook, you will find guidelines to download, prepare, and store the Credit Approval Dataset from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml).\n", 12 | "\n", 13 | "\n", 14 | "## Download the data\n", 15 | "\n", 16 | "Follow these guidelines to download the data:\n", 17 | "\n", 18 | "- Visit [the UCI website](http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/)\n", 19 | "- Click on **crx.data** to download the data. \n", 20 | "- Save crx.data in the same folder that contains this notebook.\n", 21 | "\n", 22 | "\n", 23 | "You can find more information about this particular dataset [here](https://archive.ics.uci.edu/ml/datasets/credit+approval)." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import random\n", 33 | "import numpy as np\n", 34 | "import pandas as pd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | "
A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15target
0b30.830.000ugwv1.25tt1fg202.001
1a58.674.460ugqh3.04tt6fg43.05601
2a24.500.500ugqh1.50tf0fg280.08241
3b27.831.540ugwv3.75tt5tg100.031
4b20.175.625ugwv1.71tf0fs120.001
\n", 179 | "
" 180 | ], 181 | "text/plain": [ 182 | " A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 target\n", 183 | "0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 1\n", 184 | "1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 1\n", 185 | "2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 1\n", 186 | "3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 1\n", 187 | "4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 1" 188 | ] 189 | }, 190 | "execution_count": 2, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "# Load data\n", 197 | "data = pd.read_csv(\"crx.data\", header=None)\n", 198 | "\n", 199 | "# Create variable names according to UCI Machine Learning\n", 200 | "# Repository's information:\n", 201 | "varnames = [f\"A{s}\" for s in range(1, 17)]\n", 202 | "\n", 203 | "# Add column names to dataset:\n", 204 | "data.columns = varnames\n", 205 | "\n", 206 | "# Replace ? by np.nan:\n", 207 | "data = data.replace(\"?\", np.nan)\n", 208 | "\n", 209 | "# Cast variables to correct datatypes:\n", 210 | "data[\"A2\"] = data[\"A2\"].astype(\"float\")\n", 211 | "data[\"A14\"] = data[\"A14\"].astype(\"float\")\n", 212 | "\n", 213 | "# Encode target to binary notation:\n", 214 | "data[\"A16\"] = data[\"A16\"].map({\"+\": 1, \"-\": 0})\n", 215 | "\n", 216 | "# Rename target:\n", 217 | "data.rename(columns={\"A16\": \"target\"}, inplace=True)\n", 218 | "\n", 219 | "# Display first 5 rows of data:\n", 220 | "data.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 3, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "A1 12\n", 232 | "A2 12\n", 233 | "A3 92\n", 234 | "A4 6\n", 235 | "A5 6\n", 236 | "A6 9\n", 237 | "A7 9\n", 238 | "A8 92\n", 239 | "A9 92\n", 240 | "A10 92\n", 241 | "A11 0\n", 242 | "A12 0\n", 243 | "A13 0\n", 244 | "A14 13\n", 245 | "A15 0\n", 246 | "target 0\n", 247 | "dtype: int64" 248 | ] 249 | }, 250 | "execution_count": 3, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "# Add missing values at random positions.\n", 257 | "\n", 258 | "# Set seed for reproducibility:\n", 259 | "random.seed(9001)\n", 260 | "\n", 261 | "# Get the reandom position indexes:\n", 262 | "values = list(set([random.randint(0, len(data)) for p in range(0, 100)]))\n", 263 | "\n", 264 | "# Add missing data:\n", 265 | "data.loc[values, [\"A3\", \"A8\", \"A9\", \"A10\"]] = np.nan\n", 266 | "\n", 267 | "# Check proportion of missing data:\n", 268 | "data.isnull().sum()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 4, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# Save dataset\n", 278 | "\n", 279 | "data.to_csv(\"credit_approval_uci.csv\", index=False)" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "fsml", 286 | "language": "python", 287 | "name": "fsml" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.10.5" 300 | }, 301 | "toc": { 302 | "base_numbering": 1, 303 | "nav_menu": {}, 304 | "number_sections": true, 305 | "sideBar": true, 306 | "skip_h1_title": false, 307 | "title_cell": "Table of Contents", 308 | "title_sidebar": "Contents", 309 | "toc_cell": false, 310 | "toc_position": {}, 311 | "toc_section_display": 
true, 312 | "toc_window_display": false 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 2 317 | } 318 | -------------------------------------------------------------------------------- /ch02-categorical-encoding/donwload-prepare-store-credit-approval-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download, prepare and save the Credit Approval Dataset\n", 8 | "\n", 9 | "\n", 10 | "In this notebook, you will find guidelines to download, prepare, and store the Credit Approval Dataset from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml).\n", 11 | "\n", 12 | "\n", 13 | "## Download the data\n", 14 | "\n", 15 | "Follow these guidelines to download the data:\n", 16 | "\n", 17 | "- Visit [the UCI website](http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/)\n", 18 | "- Click on **crx.data** to download the data. \n", 19 | "- Save crx.data in the same folder that contains this notebook.\n", 20 | "\n", 21 | "\n", 22 | "You can find more information about this particular dataset [here](https://archive.ics.uci.edu/ml/datasets/credit+approval)." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import random\n", 32 | "import numpy as np\n", 33 | "import pandas as pd" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15target
0b30.830.000ugwv1.25tt1fg202.001
1a58.674.460ugqh3.04tt6fg43.05601
2a24.500.500ugqh1.50tf0fg280.08241
3b27.831.540ugwv3.75tt5tg100.031
4b20.175.625ugwv1.71tf0fs120.001
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 target\n", 182 | "0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 1\n", 183 | "1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 1\n", 184 | "2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 1\n", 185 | "3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 1\n", 186 | "4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 1" 187 | ] 188 | }, 189 | "execution_count": 2, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "# Load data\n", 196 | "data = pd.read_csv(\"crx.data\", header=None)\n", 197 | "\n", 198 | "# Create variable names according to UCI Machine Learning\n", 199 | "# Repository's information:\n", 200 | "varnames = [f\"A{s}\" for s in range(1, 17)]\n", 201 | "\n", 202 | "# Add column names to dataset:\n", 203 | "data.columns = varnames\n", 204 | "\n", 205 | "# Replace ? by np.nan:\n", 206 | "data = data.replace(\"?\", np.nan)\n", 207 | "\n", 208 | "# Cast variables to correct data types:\n", 209 | "data[\"A2\"] = data[\"A2\"].astype(\"float\")\n", 210 | "data[\"A14\"] = data[\"A14\"].astype(\"float\")\n", 211 | "\n", 212 | "# Encode target to binary notation:\n", 213 | "data[\"A16\"] = data[\"A16\"].map({\"+\": 1, \"-\": 0})\n", 214 | "\n", 215 | "# Rename target:\n", 216 | "data.rename(columns={\"A16\": \"target\"}, inplace=True)\n", 217 | "\n", 218 | "# Display first 5 rows of data:\n", 219 | "data.head()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 3, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# find categorical variables\n", 229 | "cat_cols = [c for c in data.columns if data[c].dtypes == \"O\"]\n", 230 | "\n", 231 | "# find numerical variables\n", 232 | "num_cols = [c for c in data.columns if data[c].dtypes != \"O\"]" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 4, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# fill in missing values\n", 242 | "\n", 243 | "data[num_cols] = data[num_cols].fillna(0)\n", 244 | "data[cat_cols] = data[cat_cols].fillna(\"Missing\")" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 5, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "# Save dataset\n", 254 | "\n", 255 | "data.to_csv(\"credit_approval_uci.csv\", index=False)" 256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": "fenotebook", 262 | "language": "python", 263 | "name": "fenotebook" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "file_extension": ".py", 271 | "mimetype": "text/x-python", 272 | "name": "python", 273 | "nbconvert_exporter": "python", 274 | "pygments_lexer": "ipython3", 275 | "version": "3.8.2" 276 | }, 277 | "toc": { 278 | "base_numbering": 1, 279 | "nav_menu": {}, 280 | "number_sections": true, 281 | "sideBar": true, 282 | "skip_h1_title": false, 283 | "title_cell": "Table of Contents", 284 | "title_sidebar": "Contents", 285 | "toc_cell": false, 286 | "toc_position": {}, 287 | "toc_section_display": true, 288 | "toc_window_display": false 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 2 293 | } 294 | -------------------------------------------------------------------------------- /ch04-discretization/donwload-prepare-store-enron-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download, prepare and save the Bag of Words Data Set\n", 8 | "\n", 9 | "In this notebook, you will find guidelines to download, prepare, and store the Bag of Words Data Set from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml).\n", 10 | "\n", 11 | "\n", 12 | "## Download the data\n", 13 | "\n", 14 | "Follow these guidelines to download the data:\n", 15 | "\n", 16 | "- Visit [the UCI website](https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/)\n", 17 | "- Click on **docword.enron.txt.gz** to download the data.\n", 18 | "- Unzip the data and save it in the same folder that contains this notebook.\n", 19 | "- Then click on **vocab.enron.txt** to download the word names.\n", 20 | "- Save vocab.enron.txt in the same folder that contains this notebook.\n", 21 | "\n", 22 | "You can find more information about this particular dataset [here](https://archive.ics.uci.edu/ml/datasets/Bag+of+Words)." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | "
docIDwordIDcount
011181
112851
2112291
3116881
4120681
\n", 98 | "
" 99 | ], 100 | "text/plain": [ 101 | " docID wordID count\n", 102 | "0 1 118 1\n", 103 | "1 1 285 1\n", 104 | "2 1 1229 1\n", 105 | "3 1 1688 1\n", 106 | "4 1 2068 1" 107 | ] 108 | }, 109 | "execution_count": 2, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "# load the word counts\n", 116 | "\n", 117 | "data = pd.read_csv(\"docword.enron.txt\", sep=\" \", skiprows=3, header=None)\n", 118 | "data.columns = [\"docID\", \"wordID\", \"count\"]\n", 119 | "\n", 120 | "data.head()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 3, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/html": [ 131 | "
\n", 132 | "\n", 145 | "\n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | "
words
0aaa
1aaas
2aactive
3aadvantage
4aaker
\n", 175 | "
" 176 | ], 177 | "text/plain": [ 178 | " words\n", 179 | "0 aaa\n", 180 | "1 aaas\n", 181 | "2 aactive\n", 182 | "3 aadvantage\n", 183 | "4 aaker" 184 | ] 185 | }, 186 | "execution_count": 3, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "# load the words\n", 193 | "\n", 194 | "words = pd.read_csv(\"vocab.enron.txt\", header=None)\n", 195 | "words.columns = [\"words\"]\n", 196 | "\n", 197 | "words.head()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 4, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/html": [ 208 | "
\n", 209 | "\n", 222 | "\n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | "
words
8704eurobond
13618keen
11114halligan
19968pvr
23327soda
20714refundable
390advice
6257decker
8680etis
3370cab
\n", 272 | "
" 273 | ], 274 | "text/plain": [ 275 | " words\n", 276 | "8704 eurobond\n", 277 | "13618 keen\n", 278 | "11114 halligan\n", 279 | "19968 pvr\n", 280 | "23327 soda\n", 281 | "20714 refundable\n", 282 | "390 advice\n", 283 | "6257 decker\n", 284 | "8680 etis\n", 285 | "3370 cab" 286 | ] 287 | }, 288 | "execution_count": 4, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "# select at random 10 words\n", 295 | "\n", 296 | "words = words.sample(10, random_state=290917)\n", 297 | "\n", 298 | "words" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 5, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/html": [ 309 | "
\n", 310 | "\n", 323 | "\n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | "
wordsdocIDwordIDcount
137715eurobond202187042
140167eurobond2050870411
151530eurobond226987042
155066eurobond235287042
156247eurobond237587042
\n", 371 | "
" 372 | ], 373 | "text/plain": [ 374 | " words docID wordID count\n", 375 | "137715 eurobond 2021 8704 2\n", 376 | "140167 eurobond 2050 8704 11\n", 377 | "151530 eurobond 2269 8704 2\n", 378 | "155066 eurobond 2352 8704 2\n", 379 | "156247 eurobond 2375 8704 2" 380 | ] 381 | }, 382 | "execution_count": 5, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "data = words.merge(data, left_index=True, right_on=\"wordID\")\n", 389 | "\n", 390 | "data.head()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 6, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "(1388, 10)" 402 | ] 403 | }, 404 | "execution_count": 6, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "# reconstitute the bag of words dataset\n", 411 | "\n", 412 | "bow = data.pivot(index=\"docID\", columns=\"words\", values=\"count\")\n", 413 | "bow.fillna(0, inplace=True)\n", 414 | "bow.reset_index(inplace=True, drop=True)\n", 415 | "bow.shape" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 7, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/html": [ 426 | "
\n", 427 | "\n", 440 | "\n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | "
wordsadvicecabdeckeretiseurobondhalligankeenpvrrefundablesoda
00.00.02.00.00.00.00.00.00.00.0
10.00.02.00.00.00.00.00.00.00.0
21.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.01.00.0
40.00.02.00.00.00.00.00.00.00.0
\n", 524 | "
" 525 | ], 526 | "text/plain": [ 527 | "words advice cab decker etis eurobond halligan keen pvr refundable \\\n", 528 | "0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 529 | "1 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 530 | "2 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 531 | "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 \n", 532 | "4 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 533 | "\n", 534 | "words soda \n", 535 | "0 0.0 \n", 536 | "1 0.0 \n", 537 | "2 0.0 \n", 538 | "3 0.0 \n", 539 | "4 0.0 " 540 | ] 541 | }, 542 | "execution_count": 7, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "bow.head()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 8, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "bow.to_csv(\"bag_of_words.csv\", index=False)" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [] 566 | } 567 | ], 568 | "metadata": { 569 | "kernelspec": { 570 | "display_name": "fets", 571 | "language": "python", 572 | "name": "fets" 573 | }, 574 | "language_info": { 575 | "codemirror_mode": { 576 | "name": "ipython", 577 | "version": 3 578 | }, 579 | "file_extension": ".py", 580 | "mimetype": "text/x-python", 581 | "name": "python", 582 | "nbconvert_exporter": "python", 583 | "pygments_lexer": "ipython3", 584 | "version": "3.8.2" 585 | }, 586 | "toc": { 587 | "base_numbering": 1, 588 | "nav_menu": {}, 589 | "number_sections": true, 590 | "sideBar": true, 591 | "skip_h1_title": false, 592 | "title_cell": "Table of Contents", 593 | "title_sidebar": "Contents", 594 | "toc_cell": false, 595 | "toc_position": {}, 596 | "toc_section_display": true, 597 | "toc_window_display": false 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 4 602 | } 603 | -------------------------------------------------------------------------------- /ch05-outliers/Recipe-2-Finding-outliers-with-mean-and-std.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding outliers with the mean and standard deviation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "\n", 19 | "# boston house dataset for the demo\n", 20 | "from sklearn.datasets import load_breast_cancer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 195 | "

5 rows × 30 columns

\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n", 200 | "0 17.99 10.38 122.80 1001.0 0.11840 \n", 201 | "1 20.57 17.77 132.90 1326.0 0.08474 \n", 202 | "2 19.69 21.25 130.00 1203.0 0.10960 \n", 203 | "3 11.42 20.38 77.58 386.1 0.14250 \n", 204 | "4 20.29 14.34 135.10 1297.0 0.10030 \n", 205 | "\n", 206 | " mean compactness mean concavity mean concave points mean symmetry \\\n", 207 | "0 0.27760 0.3001 0.14710 0.2419 \n", 208 | "1 0.07864 0.0869 0.07017 0.1812 \n", 209 | "2 0.15990 0.1974 0.12790 0.2069 \n", 210 | "3 0.28390 0.2414 0.10520 0.2597 \n", 211 | "4 0.13280 0.1980 0.10430 0.1809 \n", 212 | "\n", 213 | " mean fractal dimension ... worst radius worst texture worst perimeter \\\n", 214 | "0 0.07871 ... 25.38 17.33 184.60 \n", 215 | "1 0.05667 ... 24.99 23.41 158.80 \n", 216 | "2 0.05999 ... 23.57 25.53 152.50 \n", 217 | "3 0.09744 ... 14.91 26.50 98.87 \n", 218 | "4 0.05883 ... 22.54 16.67 152.20 \n", 219 | "\n", 220 | " worst area worst smoothness worst compactness worst concavity \\\n", 221 | "0 2019.0 0.1622 0.6656 0.7119 \n", 222 | "1 1956.0 0.1238 0.1866 0.2416 \n", 223 | "2 1709.0 0.1444 0.4245 0.4504 \n", 224 | "3 567.7 0.2098 0.8663 0.6869 \n", 225 | "4 1575.0 0.1374 0.2050 0.4000 \n", 226 | "\n", 227 | " worst concave points worst symmetry worst fractal dimension \n", 228 | "0 0.2654 0.4601 0.11890 \n", 229 | "1 0.1860 0.2750 0.08902 \n", 230 | "2 0.2430 0.3613 0.08758 \n", 231 | "3 0.2575 0.6638 0.17300 \n", 232 | "4 0.1625 0.2364 0.07678 \n", 233 | "\n", 234 | "[5 rows x 30 columns]" 235 | ] 236 | }, 237 | "execution_count": 2, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "breast_cancer = load_breast_cancer()\n", 244 | "X = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)\n", 245 | "\n", 246 | "# display top 5 rows\n", 247 | "X.head()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 3, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# mean plus 3 * std\n", 257 | "\n", 258 | "\n", 259 | "def find_limits(df, variable, fold):\n", 260 | " lower_limit = df[variable].mean() - fold * df[variable].std()\n", 261 | " upper_limit = df[variable].mean() + fold * df[variable].std()\n", 262 | " return lower_limit, upper_limit" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 4, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "(0.05416789678205824, 0.13855266560809995)" 274 | ] 275 | }, 276 | "execution_count": 4, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "# we find the limits\n", 283 | "\n", 284 | "lower_limit, upper_limit = find_limits(X, \"mean smoothness\", 3)\n", 285 | "lower_limit, upper_limit" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 5, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "# let's flag the outliers in the data set\n", 295 | "\n", 296 | "outliers = np.where(\n", 297 | " (X[\"mean smoothness\"] > upper_limit) | \n", 298 | " (X[\"mean smoothness\"] < lower_limit),\n", 299 | " True,\n", 300 | " False,\n", 301 | ")" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 6, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "5" 313 | ] 314 | }, 315 | "execution_count": 6, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 
| ], 320 | "source": [ 321 | "# how many outliers did we find?\n", 322 | "\n", 323 | "outliers.sum()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 7, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "(7.238450329479068, 44.11599606770898)" 335 | ] 336 | }, 337 | "execution_count": 7, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "# we find the limits in another variable\n", 344 | "\n", 345 | "lower_limit, upper_limit = find_limits(X, \"worst texture\", 3)\n", 346 | "lower_limit, upper_limit" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 8, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "# let's flag the outliers in the data set\n", 356 | "\n", 357 | "outliers = np.where(\n", 358 | " (X[\"worst texture\"] > upper_limit) |\n", 359 | " (X[\"worst texture\"] < lower_limit),\n", 360 | " True,\n", 361 | " False,\n", 362 | ")" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 9, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "4" 374 | ] 375 | }, 376 | "execution_count": 9, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "# how many outliers did we find?\n", 383 | "\n", 384 | "outliers.sum()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [] 393 | } 394 | ], 395 | "metadata": { 396 | "kernelspec": { 397 | "display_name": "fsml", 398 | "language": "python", 399 | "name": "fsml" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 3 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython3", 411 | "version": "3.10.5" 412 | }, 413 | "toc": { 414 | "base_numbering": 1, 415 | "nav_menu": {}, 416 | "number_sections": true, 417 | "sideBar": true, 418 | "skip_h1_title": false, 419 | "title_cell": "Table of Contents", 420 | "title_sidebar": "Contents", 421 | "toc_cell": false, 422 | "toc_position": {}, 423 | "toc_section_display": true, 424 | "toc_window_display": false 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 2 429 | } 430 | -------------------------------------------------------------------------------- /ch05-outliers/Recipe-3-Finding-outliers-with-the-IQR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding outliers with the IQR proximity rule" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "\n", 19 | "# boston house dataset for the demo\n", 20 | "from sklearn.datasets import fetch_california_housing" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
08.325241.06.9841271.023810322.02.55555637.88-122.23
18.301421.06.2381370.9718802401.02.10984237.86-122.22
27.257452.08.2881361.073446496.02.80226037.85-122.24
35.643152.05.8173521.073059558.02.54794537.85-122.25
43.846252.06.2818531.081081565.02.18146737.85-122.25
\n", 117 | "
" 118 | ], 119 | "text/plain": [ 120 | " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", 121 | "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", 122 | "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", 123 | "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", 124 | "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", 125 | "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", 126 | "\n", 127 | " Longitude \n", 128 | "0 -122.23 \n", 129 | "1 -122.22 \n", 130 | "2 -122.24 \n", 131 | "3 -122.25 \n", 132 | "4 -122.25 " 133 | ] 134 | }, 135 | "execution_count": 2, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# load the California House price data from Scikit-learn\n", 142 | "X, y = fetch_california_housing(return_X_y=True, as_frame=True)\n", 143 | "\n", 144 | "# display top 5 rows\n", 145 | "X.head()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "def find_limits(df, variable, fold):\n", 155 | "\n", 156 | " IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)\n", 157 | "\n", 158 | " lower_limit = df[variable].quantile(0.25) - (IQR * fold)\n", 159 | " upper_limit = df[variable].quantile(0.75) + (IQR * fold)\n", 160 | "\n", 161 | " return lower_limit, upper_limit" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 4, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "(-3.9761500000000005, 11.2828)" 173 | ] 174 | }, 175 | "execution_count": 4, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "# we find the limits\n", 182 | "\n", 183 | "lower_limit, upper_limit = find_limits(X, \"MedInc\", 3)\n", 184 | "lower_limit, upper_limit" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# let's flag the outliers in the data set\n", 194 | "\n", 195 | "outliers = np.where(\n", 196 | " (X[\"MedInc\"] > upper_limit) |\n", 197 | " (X[\"MedInc\"] < lower_limit),\n", 198 | " True,\n", 199 | " False,\n", 200 | ")" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "140" 212 | ] 213 | }, 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "# how many outliers did we find?\n", 221 | "\n", 222 | "outliers.sum()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 7, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "(-39.0, 94.0)" 234 | ] 235 | }, 236 | "execution_count": 7, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "# we find the limits in another variable\n", 243 | "\n", 244 | "lower_limit, upper_limit = find_limits(X, \"HouseAge\", 3)\n", 245 | "lower_limit, upper_limit" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 8, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# let's flag the outliers in the data set\n", 255 | "\n", 256 | "outliers = np.where(\n", 257 | " (X[\"HouseAge\"] > upper_limit) |\n", 258 | " (X[\"HouseAge\"] < lower_limit),\n", 259 | " True,\n", 260 | " False,\n", 261 | ")" 262 | ] 263 | }, 
264 | { 265 | "cell_type": "code", 266 | "execution_count": 9, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "0" 273 | ] 274 | }, 275 | "execution_count": 9, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "# how many outliers did we find?\n", 282 | "\n", 283 | "outliers.sum()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "fsml", 297 | "language": "python", 298 | "name": "fsml" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.10.5" 311 | }, 312 | "toc": { 313 | "base_numbering": 1, 314 | "nav_menu": {}, 315 | "number_sections": true, 316 | "sideBar": true, 317 | "skip_h1_title": false, 318 | "title_cell": "Table of Contents", 319 | "title_sidebar": "Contents", 320 | "toc_cell": false, 321 | "toc_position": {}, 322 | "toc_section_display": true, 323 | "toc_window_display": false 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 2 328 | } 329 | -------------------------------------------------------------------------------- /ch05-outliers/Recipe-4-Removing-outliers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Removing outliers - outlier trimming" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "from sklearn.datasets import fetch_california_housing\n", 19 | "from sklearn.model_selection import train_test_split\n", 20 | "from feature_engine.outliers import OutlierTrimmer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "((14448, 8), (6192, 8))" 32 | ] 33 | }, 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "# load the California House price data from Scikit-learn\n", 41 | "X, y = fetch_california_housing(return_X_y=True, as_frame=True)\n", 42 | "\n", 43 | "# let's separate the data into training and testing sets\n", 44 | "\n", 45 | "X_train, X_test, y_train, y_test = train_test_split(\n", 46 | " X,\n", 47 | " y,\n", 48 | " test_size=0.3,\n", 49 | " random_state=0,\n", 50 | ")\n", 51 | "\n", 52 | "X_train.shape, X_test.shape" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
19891.975052.02.8000000.700000193.04.82500036.73-119.79
2562.260443.03.6714801.184116836.03.01805137.77-122.21
78876.299017.06.4780221.0879121387.03.81044033.87-118.04
45811.719917.02.5180001.1960003051.03.05100034.06-118.28
19932.220650.04.6227541.161677606.03.62874336.73-119.81
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", 153 | "1989 1.9750 52.0 2.800000 0.700000 193.0 4.825000 36.73 \n", 154 | "256 2.2604 43.0 3.671480 1.184116 836.0 3.018051 37.77 \n", 155 | "7887 6.2990 17.0 6.478022 1.087912 1387.0 3.810440 33.87 \n", 156 | "4581 1.7199 17.0 2.518000 1.196000 3051.0 3.051000 34.06 \n", 157 | "1993 2.2206 50.0 4.622754 1.161677 606.0 3.628743 36.73 \n", 158 | "\n", 159 | " Longitude \n", 160 | "1989 -119.79 \n", 161 | "256 -122.21 \n", 162 | "7887 -118.04 \n", 163 | "4581 -118.28 \n", 164 | "1993 -119.81 " 165 | ] 166 | }, 167 | "execution_count": 3, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "X_train.head()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 4, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "def find_limits(df, variable, fold):\n", 183 | "\n", 184 | " IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)\n", 185 | "\n", 186 | " lower_limit = df[variable].quantile(0.25) - (IQR * fold)\n", 187 | " upper_limit = df[variable].quantile(0.75) + (IQR * fold)\n", 188 | "\n", 189 | " return lower_limit, upper_limit" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "(-3.925900000000002, 11.232600000000001)" 201 | ] 202 | }, 203 | "execution_count": 5, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "# we find the limits\n", 210 | "\n", 211 | "lower_limit, upper_limit = find_limits(X_train, \"MedInc\", 3)\n", 212 | "lower_limit, upper_limit" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Remove outliers on the right\n", 222 | "\n", 223 | "inliers = X_train[\"MedInc\"].ge(lower_limit)\n", 224 | "X_train = X_train.loc[inliers]\n", 225 | "\n", 226 | "inliers = X_test[\"MedInc\"].ge(lower_limit)\n", 227 | "X_test = X_test.loc[inliers]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# Remove outliers on the left\n", 237 | "\n", 238 | "inliers = X_train[\"MedInc\"].le(upper_limit)\n", 239 | "X_train = X_train.loc[inliers]\n", 240 | "\n", 241 | "inliers = X_test[\"MedInc\"].le(upper_limit)\n", 242 | "X_test = X_test.loc[inliers]" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## Feature-engine" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 8, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "((14448, 8), (6192, 8))" 261 | ] 262 | }, 263 | "execution_count": 8, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "# let's separate the data into training and testing sets\n", 270 | "\n", 271 | "X_train, X_test, y_train, y_test = train_test_split(\n", 272 | " X,\n", 273 | " y,\n", 274 | " test_size=0.3,\n", 275 | " random_state=0,\n", 276 | ")\n", 277 | "\n", 278 | "X_train.shape, X_test.shape" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 9, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
OutlierTrimmer(capping_method='iqr', fold=1.5, tail='both',\n",
290 |        "               variables=['MedInc', 'HouseAge', 'Population'])
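The fitted transformer's text representation follows below. As an editorial aside, and assuming the `trimmer` fitted on X_train in the cells below, the rows it removes can be listed by comparing indexes before and after the transformation:

```python
# assumes `trimmer` is the OutlierTrimmer fitted on X_train below
X_train_trimmed = trimmer.transform(X_train)

# indexes of the training rows the trimmer dropped
dropped = X_train.index.difference(X_train_trimmed.index)
print(len(dropped), "rows removed from the train set")
```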
" 292 | ], 293 | "text/plain": [ 294 | "OutlierTrimmer(capping_method='iqr', fold=1.5, tail='both',\n", 295 | " variables=['MedInc', 'HouseAge', 'Population'])" 296 | ] 297 | }, 298 | "execution_count": 9, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "trimmer = OutlierTrimmer(\n", 305 | " variables=[\"MedInc\", \"HouseAge\", \"Population\"],\n", 306 | " capping_method=\"iqr\",\n", 307 | " tail=\"both\",\n", 308 | " fold=1.5,\n", 309 | ")\n", 310 | "\n", 311 | "trimmer.fit(X_train)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 10, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "{'MedInc': -0.6776500000000012, 'HouseAge': -10.5, 'Population': -626.0}" 323 | ] 324 | }, 325 | "execution_count": 10, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "trimmer.left_tail_caps_" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 11, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "{'MedInc': 7.984350000000001, 'HouseAge': 65.5, 'Population': 3134.0}" 343 | ] 344 | }, 345 | "execution_count": 11, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | } 349 | ], 350 | "source": [ 351 | "trimmer.right_tail_caps_" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 12, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "(14448, 8) (6192, 8)\n", 364 | "(13165, 8) (5619, 8)\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "print(X_train.shape, X_test.shape)\n", 370 | "\n", 371 | "X_train = trimmer.transform(X_train)\n", 372 | "X_test = trimmer.transform(X_test)\n", 373 | "\n", 374 | "print(X_train.shape, X_test.shape)" 375 | ] 376 | } 377 | ], 378 | "metadata": { 379 | "kernelspec": { 380 | "display_name": "fsml", 381 | "language": "python", 382 | "name": "fsml" 383 | }, 384 | "language_info": { 385 | "codemirror_mode": { 386 | "name": "ipython", 387 | "version": 3 388 | }, 389 | "file_extension": ".py", 390 | "mimetype": "text/x-python", 391 | "name": "python", 392 | "nbconvert_exporter": "python", 393 | "pygments_lexer": "ipython3", 394 | "version": "3.10.5" 395 | }, 396 | "toc": { 397 | "base_numbering": 1, 398 | "nav_menu": {}, 399 | "number_sections": true, 400 | "sideBar": true, 401 | "skip_h1_title": false, 402 | "title_cell": "Table of Contents", 403 | "title_sidebar": "Contents", 404 | "toc_cell": false, 405 | "toc_position": {}, 406 | "toc_section_display": "block", 407 | "toc_window_display": true 408 | } 409 | }, 410 | "nbformat": 4, 411 | "nbformat_minor": 1 412 | } 413 | -------------------------------------------------------------------------------- /ch06-datetime/Recipe-2-Extracting-features-from-time-with-pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Extracting features from time with pandas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/html": [ 28 | "
\n", 29 | "\n", 42 | "\n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | "
date
02019-03-05 00:00:00
12019-03-05 01:15:10
22019-03-05 02:30:20
32019-03-05 03:45:30
42019-03-05 05:00:40
\n", 72 | "
" 73 | ], 74 | "text/plain": [ 75 | " date\n", 76 | "0 2019-03-05 00:00:00\n", 77 | "1 2019-03-05 01:15:10\n", 78 | "2 2019-03-05 02:30:20\n", 79 | "3 2019-03-05 03:45:30\n", 80 | "4 2019-03-05 05:00:40" 81 | ] 82 | }, 83 | "execution_count": 2, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "# let's create a toy dataframe with some date variables\n", 90 | "\n", 91 | "rng_ = pd.date_range(\"2019-03-05\", periods=20, freq=\"1h15min10s\")\n", 92 | "df = pd.DataFrame({\"date\": rng_})\n", 93 | "df.head()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 3, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/html": [ 104 | "
\n", 105 | "\n", 118 | "\n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "
datehourminsec
02019-03-05 00:00:00000
12019-03-05 01:15:1011510
22019-03-05 02:30:2023020
32019-03-05 03:45:3034530
42019-03-05 05:00:405040
\n", 166 | "
" 167 | ], 168 | "text/plain": [ 169 | " date hour min sec\n", 170 | "0 2019-03-05 00:00:00 0 0 0\n", 171 | "1 2019-03-05 01:15:10 1 15 10\n", 172 | "2 2019-03-05 02:30:20 2 30 20\n", 173 | "3 2019-03-05 03:45:30 3 45 30\n", 174 | "4 2019-03-05 05:00:40 5 0 40" 175 | ] 176 | }, 177 | "execution_count": 3, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "# extract hr, min and sec\n", 184 | "\n", 185 | "df[\"hour\"] = df[\"date\"].dt.hour\n", 186 | "df[\"min\"] = df[\"date\"].dt.minute\n", 187 | "df[\"sec\"] = df[\"date\"].dt.second\n", 188 | "\n", 189 | "df.head()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 4, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/html": [ 200 | "
\n", 201 | "\n", 214 | "\n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
datehourminsechms
02019-03-05 00:00:00000000
12019-03-05 01:15:101151011510
22019-03-05 02:30:202302023020
32019-03-05 03:45:303453034530
42019-03-05 05:00:4050405040
\n", 280 | "
" 281 | ], 282 | "text/plain": [ 283 | " date hour min sec h m s\n", 284 | "0 2019-03-05 00:00:00 0 0 0 0 0 0\n", 285 | "1 2019-03-05 01:15:10 1 15 10 1 15 10\n", 286 | "2 2019-03-05 02:30:20 2 30 20 2 30 20\n", 287 | "3 2019-03-05 03:45:30 3 45 30 3 45 30\n", 288 | "4 2019-03-05 05:00:40 5 0 40 5 0 40" 289 | ] 290 | }, 291 | "execution_count": 4, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "# the same in one line\n", 298 | "\n", 299 | "df[[\"h\", \"m\", \"s\"]] = pd.DataFrame([(x.hour, x.minute, x.second) for x in df[\"date\"]])\n", 300 | "\n", 301 | "df.head()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 5, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "array([ 0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18, 20,\n", 313 | " 21, 22, 23], dtype=int64)" 314 | ] 315 | }, 316 | "execution_count": 5, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "df[\"hour\"].unique()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 6, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/html": [ 333 | "
\n", 334 | "\n", 347 | "\n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | "
datehourminsechmsis_morning
02019-03-05 00:00:000000000
12019-03-05 01:15:1011510115100
22019-03-05 02:30:2023020230200
32019-03-05 03:45:3034530345300
42019-03-05 05:00:40504050400
\n", 419 | "
" 420 | ], 421 | "text/plain": [ 422 | " date hour min sec h m s is_morning\n", 423 | "0 2019-03-05 00:00:00 0 0 0 0 0 0 0\n", 424 | "1 2019-03-05 01:15:10 1 15 10 1 15 10 0\n", 425 | "2 2019-03-05 02:30:20 2 30 20 2 30 20 0\n", 426 | "3 2019-03-05 03:45:30 3 45 30 3 45 30 0\n", 427 | "4 2019-03-05 05:00:40 5 0 40 5 0 40 0" 428 | ] 429 | }, 430 | "execution_count": 6, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "# is it morning?\n", 437 | "\n", 438 | "df[\"is_morning\"] = np.where((df[\"hour\"] < 12) & (df[\"hour\"] > 6), 1, 0)\n", 439 | "\n", 440 | "df.head()" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [] 449 | } 450 | ], 451 | "metadata": { 452 | "kernelspec": { 453 | "display_name": "fsml", 454 | "language": "python", 455 | "name": "fsml" 456 | }, 457 | "language_info": { 458 | "codemirror_mode": { 459 | "name": "ipython", 460 | "version": 3 461 | }, 462 | "file_extension": ".py", 463 | "mimetype": "text/x-python", 464 | "name": "python", 465 | "nbconvert_exporter": "python", 466 | "pygments_lexer": "ipython3", 467 | "version": "3.10.5" 468 | }, 469 | "toc": { 470 | "base_numbering": 1, 471 | "nav_menu": {}, 472 | "number_sections": true, 473 | "sideBar": true, 474 | "skip_h1_title": false, 475 | "title_cell": "Table of Contents", 476 | "title_sidebar": "Contents", 477 | "toc_cell": false, 478 | "toc_position": {}, 479 | "toc_section_display": "block", 480 | "toc_window_display": false 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 2 485 | } 486 | -------------------------------------------------------------------------------- /ch06-datetime/Recipe-3-Capturing-elapsed-time-between-2-variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Capture elapsed time" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import datetime\n", 17 | "import numpy as np\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | "
date1date2
02019-03-05 00:00:002019-03-31
12019-03-05 01:00:002019-04-30
22019-03-05 02:00:002019-05-31
32019-03-05 03:00:002019-06-30
42019-03-05 04:00:002019-07-31
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " date1 date2\n", 83 | "0 2019-03-05 00:00:00 2019-03-31\n", 84 | "1 2019-03-05 01:00:00 2019-04-30\n", 85 | "2 2019-03-05 02:00:00 2019-05-31\n", 86 | "3 2019-03-05 03:00:00 2019-06-30\n", 87 | "4 2019-03-05 04:00:00 2019-07-31" 88 | ] 89 | }, 90 | "execution_count": 2, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "# let's create a toy dataframe with some date variables\n", 97 | "\n", 98 | "rng_hr = pd.date_range(\"2019-03-05\", periods=20, freq=\"H\")\n", 99 | "rng_month = pd.date_range(\"2019-03-05\", periods=20, freq=\"M\")\n", 100 | "\n", 101 | "df = pd.DataFrame({\"date1\": rng_hr, \"date2\": rng_month})\n", 102 | "df.head()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/html": [ 113 | "
\n", 114 | "\n", 127 | "\n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | "
date1date2elapsed_days
02019-03-05 00:00:002019-03-3126
12019-03-05 01:00:002019-04-3055
22019-03-05 02:00:002019-05-3186
32019-03-05 03:00:002019-06-30116
42019-03-05 04:00:002019-07-31147
\n", 169 | "
" 170 | ], 171 | "text/plain": [ 172 | " date1 date2 elapsed_days\n", 173 | "0 2019-03-05 00:00:00 2019-03-31 26\n", 174 | "1 2019-03-05 01:00:00 2019-04-30 55\n", 175 | "2 2019-03-05 02:00:00 2019-05-31 86\n", 176 | "3 2019-03-05 03:00:00 2019-06-30 116\n", 177 | "4 2019-03-05 04:00:00 2019-07-31 147" 178 | ] 179 | }, 180 | "execution_count": 3, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "# let's capture the difference in days between the 2 variables\n", 187 | "\n", 188 | "df[\"elapsed_days\"] = (df[\"date2\"] - df[\"date1\"]).dt.days\n", 189 | "\n", 190 | "df.head()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 4, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/html": [ 201 | "
\n", 202 | "\n", 215 | "\n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | "
date1date2elapsed_daysmonths_passed
02019-03-05 00:00:002019-03-31261.0
12019-03-05 01:00:002019-04-30552.0
22019-03-05 02:00:002019-05-31863.0
32019-03-05 03:00:002019-06-301164.0
42019-03-05 04:00:002019-07-311475.0
\n", 263 | "
" 264 | ], 265 | "text/plain": [ 266 | " date1 date2 elapsed_days months_passed\n", 267 | "0 2019-03-05 00:00:00 2019-03-31 26 1.0\n", 268 | "1 2019-03-05 01:00:00 2019-04-30 55 2.0\n", 269 | "2 2019-03-05 02:00:00 2019-05-31 86 3.0\n", 270 | "3 2019-03-05 03:00:00 2019-06-30 116 4.0\n", 271 | "4 2019-03-05 04:00:00 2019-07-31 147 5.0" 272 | ] 273 | }, 274 | "execution_count": 4, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "# let's capture the difference in months between the 2 variables\n", 281 | "\n", 282 | "df[\"months_passed\"] = (df[\"date2\"] - df[\"date1\"]) / np.timedelta64(1, \"M\")\n", 283 | "df[\"months_passed\"] = np.round(df[\"months_passed\"], 0)\n", 284 | "\n", 285 | "df.head()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 5, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/html": [ 296 | "
\n", 297 | "\n", 310 | "\n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
date1date2elapsed_daysmonths_passeddiff_secondsdiff_minutes
02019-03-05 00:00:002019-03-31261.02246400.037440.0
12019-03-05 01:00:002019-04-30552.04834800.080580.0
22019-03-05 02:00:002019-05-31863.07509600.0125160.0
32019-03-05 03:00:002019-06-301164.010098000.0168300.0
42019-03-05 04:00:002019-07-311475.012772800.0212880.0
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " date1 date2 elapsed_days months_passed diff_seconds \\\n", 374 | "0 2019-03-05 00:00:00 2019-03-31 26 1.0 2246400.0 \n", 375 | "1 2019-03-05 01:00:00 2019-04-30 55 2.0 4834800.0 \n", 376 | "2 2019-03-05 02:00:00 2019-05-31 86 3.0 7509600.0 \n", 377 | "3 2019-03-05 03:00:00 2019-06-30 116 4.0 10098000.0 \n", 378 | "4 2019-03-05 04:00:00 2019-07-31 147 5.0 12772800.0 \n", 379 | "\n", 380 | " diff_minutes \n", 381 | "0 37440.0 \n", 382 | "1 80580.0 \n", 383 | "2 125160.0 \n", 384 | "3 168300.0 \n", 385 | "4 212880.0 " 386 | ] 387 | }, 388 | "execution_count": 5, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "# calculate difference in seconds and minutes\n", 395 | "\n", 396 | "df[\"diff_seconds\"] = (df[\"date2\"] - df[\"date1\"]) / np.timedelta64(1, \"s\")\n", 397 | "df[\"diff_minutes\"] = (df[\"date2\"] - df[\"date1\"]) / np.timedelta64(1, \"m\")\n", 398 | "\n", 399 | "df.head()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 6, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/html": [ 410 | "
\n", 411 | "\n", 424 | "\n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | "
date1date2elapsed_daysmonths_passeddiff_secondsdiff_minutesto_today
02019-03-05 00:00:002019-03-31261.02246400.037440.01265 days 11:41:24.651022
12019-03-05 01:00:002019-04-30552.04834800.080580.01265 days 10:41:24.651022
22019-03-05 02:00:002019-05-31863.07509600.0125160.01265 days 09:41:24.651022
32019-03-05 03:00:002019-06-301164.010098000.0168300.01265 days 08:41:24.651022
42019-03-05 04:00:002019-07-311475.012772800.0212880.01265 days 07:41:24.651022
\n", 490 | "
" 491 | ], 492 | "text/plain": [ 493 | " date1 date2 elapsed_days months_passed diff_seconds \\\n", 494 | "0 2019-03-05 00:00:00 2019-03-31 26 1.0 2246400.0 \n", 495 | "1 2019-03-05 01:00:00 2019-04-30 55 2.0 4834800.0 \n", 496 | "2 2019-03-05 02:00:00 2019-05-31 86 3.0 7509600.0 \n", 497 | "3 2019-03-05 03:00:00 2019-06-30 116 4.0 10098000.0 \n", 498 | "4 2019-03-05 04:00:00 2019-07-31 147 5.0 12772800.0 \n", 499 | "\n", 500 | " diff_minutes to_today \n", 501 | "0 37440.0 1265 days 11:41:24.651022 \n", 502 | "1 80580.0 1265 days 10:41:24.651022 \n", 503 | "2 125160.0 1265 days 09:41:24.651022 \n", 504 | "3 168300.0 1265 days 08:41:24.651022 \n", 505 | "4 212880.0 1265 days 07:41:24.651022 " 506 | ] 507 | }, 508 | "execution_count": 6, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "# calculate difference to today\n", 515 | "\n", 516 | "df[\"to_today\"] = datetime.datetime.today() - df[\"date1\"]\n", 517 | "\n", 518 | "df.head()" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [] 527 | } 528 | ], 529 | "metadata": { 530 | "kernelspec": { 531 | "display_name": "fsml", 532 | "language": "python", 533 | "name": "fsml" 534 | }, 535 | "language_info": { 536 | "codemirror_mode": { 537 | "name": "ipython", 538 | "version": 3 539 | }, 540 | "file_extension": ".py", 541 | "mimetype": "text/x-python", 542 | "name": "python", 543 | "nbconvert_exporter": "python", 544 | "pygments_lexer": "ipython3", 545 | "version": "3.10.5" 546 | }, 547 | "toc": { 548 | "base_numbering": 1, 549 | "nav_menu": {}, 550 | "number_sections": true, 551 | "sideBar": true, 552 | "skip_h1_title": false, 553 | "title_cell": "Table of Contents", 554 | "title_sidebar": "Contents", 555 | "toc_cell": false, 556 | "toc_position": {}, 557 | "toc_section_display": "block", 558 | "toc_window_display": false 559 | } 560 | }, 561 | "nbformat": 4, 562 | "nbformat_minor": 2 563 | } 564 | -------------------------------------------------------------------------------- /ch06-datetime/Recipe-4-Working-with-different-time-zones.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Working with different time zones" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
\n", 28 | "\n", 41 | "\n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | "
time1
02015-06-10 09:00:00+02:00
12015-06-10 10:00:00+02:00
22015-06-10 11:00:00+02:00
02015-09-10 09:00:00-05:00
12015-09-10 10:00:00-05:00
22015-09-10 11:00:00-05:00
\n", 75 | "
" 76 | ], 77 | "text/plain": [ 78 | " time1\n", 79 | "0 2015-06-10 09:00:00+02:00\n", 80 | "1 2015-06-10 10:00:00+02:00\n", 81 | "2 2015-06-10 11:00:00+02:00\n", 82 | "0 2015-09-10 09:00:00-05:00\n", 83 | "1 2015-09-10 10:00:00-05:00\n", 84 | "2 2015-09-10 11:00:00-05:00" 85 | ] 86 | }, 87 | "execution_count": 2, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "# first, let's create a toy dataframe with some timestamps in different time zones\n", 94 | "# variable 1\n", 95 | "\n", 96 | "df = pd.DataFrame()\n", 97 | "\n", 98 | "df[\"time1\"] = pd.concat(\n", 99 | " [\n", 100 | " pd.Series(\n", 101 | " pd.date_range(\n", 102 | " start=\"2015-06-10 09:00\", freq=\"H\", periods=3, tz=\"Europe/Berlin\"\n", 103 | " )\n", 104 | " ),\n", 105 | " pd.Series(\n", 106 | " pd.date_range(\n", 107 | " start=\"2015-09-10 09:00\", freq=\"H\", periods=3, tz=\"US/Central\"\n", 108 | " )\n", 109 | " ),\n", 110 | " ],\n", 111 | " axis=0,\n", 112 | ")\n", 113 | "\n", 114 | "df" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 3, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 139 | "\n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
time1time2
02015-06-10 09:00:00+02:002015-07-01 09:00:00+02:00
12015-06-10 10:00:00+02:002015-07-01 10:00:00+02:00
22015-06-10 11:00:00+02:002015-07-01 11:00:00+02:00
02015-09-10 09:00:00-05:002015-08-01 09:00:00-05:00
12015-09-10 10:00:00-05:002015-08-01 10:00:00-05:00
22015-09-10 11:00:00-05:002015-08-01 11:00:00-05:00
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " time1 time2\n", 184 | "0 2015-06-10 09:00:00+02:00 2015-07-01 09:00:00+02:00\n", 185 | "1 2015-06-10 10:00:00+02:00 2015-07-01 10:00:00+02:00\n", 186 | "2 2015-06-10 11:00:00+02:00 2015-07-01 11:00:00+02:00\n", 187 | "0 2015-09-10 09:00:00-05:00 2015-08-01 09:00:00-05:00\n", 188 | "1 2015-09-10 10:00:00-05:00 2015-08-01 10:00:00-05:00\n", 189 | "2 2015-09-10 11:00:00-05:00 2015-08-01 11:00:00-05:00" 190 | ] 191 | }, 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "# first, let's create a toy dataframe with some timestamps in different time zones\n", 199 | "# variable 2\n", 200 | "\n", 201 | "df[\"time2\"] = pd.concat(\n", 202 | " [\n", 203 | " pd.Series(\n", 204 | " pd.date_range(\n", 205 | " start=\"2015-07-01 09:00\", freq=\"H\", periods=3, tz=\"Europe/Berlin\"\n", 206 | " )\n", 207 | " ),\n", 208 | " pd.Series(\n", 209 | " pd.date_range(\n", 210 | " start=\"2015-08-01 09:00\", freq=\"H\", periods=3, tz=\"US/Central\"\n", 211 | " )\n", 212 | " ),\n", 213 | " ],\n", 214 | " axis=0,\n", 215 | ")\n", 216 | "\n", 217 | "df" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 4, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/html": [ 228 | "
\n", 229 | "\n", 242 | "\n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | "
time1time2time1_utctime2_utc
02015-06-10 09:00:00+02:002015-07-01 09:00:00+02:002015-06-10 07:00:00+00:002015-07-01 07:00:00+00:00
12015-06-10 10:00:00+02:002015-07-01 10:00:00+02:002015-06-10 08:00:00+00:002015-07-01 08:00:00+00:00
22015-06-10 11:00:00+02:002015-07-01 11:00:00+02:002015-06-10 09:00:00+00:002015-07-01 09:00:00+00:00
02015-09-10 09:00:00-05:002015-08-01 09:00:00-05:002015-09-10 14:00:00+00:002015-08-01 14:00:00+00:00
12015-09-10 10:00:00-05:002015-08-01 10:00:00-05:002015-09-10 15:00:00+00:002015-08-01 15:00:00+00:00
22015-09-10 11:00:00-05:002015-08-01 11:00:00-05:002015-09-10 16:00:00+00:002015-08-01 16:00:00+00:00
\n", 297 | "
" 298 | ], 299 | "text/plain": [ 300 | " time1 time2 \\\n", 301 | "0 2015-06-10 09:00:00+02:00 2015-07-01 09:00:00+02:00 \n", 302 | "1 2015-06-10 10:00:00+02:00 2015-07-01 10:00:00+02:00 \n", 303 | "2 2015-06-10 11:00:00+02:00 2015-07-01 11:00:00+02:00 \n", 304 | "0 2015-09-10 09:00:00-05:00 2015-08-01 09:00:00-05:00 \n", 305 | "1 2015-09-10 10:00:00-05:00 2015-08-01 10:00:00-05:00 \n", 306 | "2 2015-09-10 11:00:00-05:00 2015-08-01 11:00:00-05:00 \n", 307 | "\n", 308 | " time1_utc time2_utc \n", 309 | "0 2015-06-10 07:00:00+00:00 2015-07-01 07:00:00+00:00 \n", 310 | "1 2015-06-10 08:00:00+00:00 2015-07-01 08:00:00+00:00 \n", 311 | "2 2015-06-10 09:00:00+00:00 2015-07-01 09:00:00+00:00 \n", 312 | "0 2015-09-10 14:00:00+00:00 2015-08-01 14:00:00+00:00 \n", 313 | "1 2015-09-10 15:00:00+00:00 2015-08-01 15:00:00+00:00 \n", 314 | "2 2015-09-10 16:00:00+00:00 2015-08-01 16:00:00+00:00 " 315 | ] 316 | }, 317 | "execution_count": 4, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "# to work with different time zones, first we unify the timezone to the central one\n", 324 | "# setting utc = True\n", 325 | "\n", 326 | "df[\"time1_utc\"] = pd.to_datetime(df[\"time1\"], utc=True)\n", 327 | "df[\"time2_utc\"] = pd.to_datetime(df[\"time2\"], utc=True)\n", 328 | "\n", 329 | "df" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 5, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "0 21\n", 341 | "1 21\n", 342 | "2 21\n", 343 | "0 -40\n", 344 | "1 -40\n", 345 | "Name: elapsed_days, dtype: int64" 346 | ] 347 | }, 348 | "execution_count": 5, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "# let's explore the variable type\n", 355 | "\n", 356 | "df[\"elapsed_days\"] = (df[\"time2_utc\"] - df[\"time1_utc\"]).dt.days\n", 357 | "\n", 358 | "df[\"elapsed_days\"].head()" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 6, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/html": [ 369 | "
\n", 370 | "\n", 383 | "\n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | "
time1_londontime2_berlin
02015-06-10 08:00:00+01:002015-06-10 09:00:00+02:00
12015-06-10 09:00:00+01:002015-06-10 10:00:00+02:00
22015-06-10 10:00:00+01:002015-06-10 11:00:00+02:00
02015-09-10 15:00:00+01:002015-09-10 16:00:00+02:00
12015-09-10 16:00:00+01:002015-09-10 17:00:00+02:00
22015-09-10 17:00:00+01:002015-09-10 18:00:00+02:00
\n", 424 | "
" 425 | ], 426 | "text/plain": [ 427 | " time1_london time2_berlin\n", 428 | "0 2015-06-10 08:00:00+01:00 2015-06-10 09:00:00+02:00\n", 429 | "1 2015-06-10 09:00:00+01:00 2015-06-10 10:00:00+02:00\n", 430 | "2 2015-06-10 10:00:00+01:00 2015-06-10 11:00:00+02:00\n", 431 | "0 2015-09-10 15:00:00+01:00 2015-09-10 16:00:00+02:00\n", 432 | "1 2015-09-10 16:00:00+01:00 2015-09-10 17:00:00+02:00\n", 433 | "2 2015-09-10 17:00:00+01:00 2015-09-10 18:00:00+02:00" 434 | ] 435 | }, 436 | "execution_count": 6, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "# next we change all timestamps to the desired timezone, eg Europe/London\n", 443 | "# in this example\n", 444 | "\n", 445 | "df[\"time1_london\"] = df[\"time1_utc\"].dt.tz_convert(\"Europe/London\")\n", 446 | "df[\"time2_berlin\"] = df[\"time1_utc\"].dt.tz_convert(\"Europe/Berlin\")\n", 447 | "\n", 448 | "df[[\"time1_london\", \"time2_berlin\"]]" 449 | ] 450 | } 451 | ], 452 | "metadata": { 453 | "kernelspec": { 454 | "display_name": "fsml", 455 | "language": "python", 456 | "name": "fsml" 457 | }, 458 | "language_info": { 459 | "codemirror_mode": { 460 | "name": "ipython", 461 | "version": 3 462 | }, 463 | "file_extension": ".py", 464 | "mimetype": "text/x-python", 465 | "name": "python", 466 | "nbconvert_exporter": "python", 467 | "pygments_lexer": "ipython3", 468 | "version": "3.10.5" 469 | }, 470 | "toc": { 471 | "base_numbering": 1, 472 | "nav_menu": {}, 473 | "number_sections": true, 474 | "sideBar": true, 475 | "skip_h1_title": false, 476 | "title_cell": "Table of Contents", 477 | "title_sidebar": "Contents", 478 | "toc_cell": false, 479 | "toc_position": {}, 480 | "toc_section_display": "block", 481 | "toc_window_display": false 482 | } 483 | }, 484 | "nbformat": 4, 485 | "nbformat_minor": 2 486 | } 487 | -------------------------------------------------------------------------------- /ch07-scaling/Recipe-6-scaling-to-unit-length.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Scaling to vector unit length / unit norm\n", 8 | "\n", 9 | "Scaling to unit norm is achieved by dividing each feature vector by either the Manhattan distance (l1 norm) or the Euclidean distance of the vector (l2 norm):\n", 10 | "\n", 11 | "X_scaled_l1 = X / l1(X)\n", 12 | "\n", 13 | "X_scaled_l2 = X / l2(X)\n", 14 | "\n", 15 | "\n", 16 | "The **Manhattan distance** is given by the sum of the absolute components of the vector:\n", 17 | "\n", 18 | "l1(X) = |x1| + |x2| + ... + |xn|\n", 19 | "\n", 20 | "\n", 21 | "Whereas the **Euclidean distance** is given by the square root of the square sum of the component of the vector:\n", 22 | "\n", 23 | "l2(X) = sqr( x1^2 + x2^2 + ... + xn^2 )\n", 24 | "\n", 25 | "\n", 26 | "In the above example, x1 is variable 1, x2 variable 2, and xn variable n, and X is the data for 1 observation across variables (a row in other words).\n", 27 | "\n", 28 | "\n", 29 | "### Scaling to unit norm, examples\n", 30 | "\n", 31 | "For example, if our data has 1 observations (1 row) and 3 variables:\n", 32 | "\n", 33 | "- number of pets\n", 34 | "- number of children\n", 35 | "- age\n", 36 | "\n", 37 | "The values for each variable for that single observation are 10, 15 and 20. Our vector X = [10, 15, 20]. 
Then:\n", 38 | "\n", 39 | "l1(X) = 10 + 15 + 20 = 45\n", 40 | "\n", 41 | "l2(X) = sqrt( 10^2 + 15^2 + 20^2) = sqrt( 100 + 225 + 400) = **26.9**\n", 42 | "\n", 43 | "The Euclidean distance is always smaller than (or at most equal to) the Manhattan distance.\n", 44 | "\n", 45 | "\n", 46 | "The normalized vector values are therefore:\n", 47 | "\n", 48 | "X_scaled_l1 = [ 10/45, 15/45, 20/45 ] = [0.22, 0.33, 0.44]\n", 49 | "\n", 50 | "X_scaled_l2 = [10/26.9, 15/26.9, 20/26.9 ] = [0.37, 0.56, 0.74]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import matplotlib.pyplot as plt\n", 60 | "import numpy as np\n", 61 | "import pandas as pd\n", 62 | "from sklearn.datasets import fetch_california_housing\n", 63 | "from sklearn.model_selection import train_test_split\n", 64 | "\n", 65 | "# the scaler - for scaling to unit norm\n", 66 | "from sklearn.preprocessing import Normalizer" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 2, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/html": [ 77 | "
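As a quick sanity check of the arithmetic above, the following is a minimal NumPy / scikit-learn sketch (variable names here are illustrative only) that reproduces the l1 and l2 norms of X = [10, 15, 20] and the resulting unit-norm vectors:

import numpy as np
from sklearn.preprocessing import Normalizer

# one observation (row) with 3 variables: pets, children, age
x = np.array([10.0, 15.0, 20.0])

l1 = np.abs(x).sum()          # Manhattan norm: 10 + 15 + 20 = 45
l2 = np.sqrt((x ** 2).sum())  # Euclidean norm: sqrt(725) ≈ 26.93

x_scaled_l1 = x / l1          # ≈ [0.22, 0.33, 0.44]
x_scaled_l2 = x / l2          # ≈ [0.37, 0.56, 0.74]

# scikit-learn's Normalizer performs the same row-wise division
print(Normalizer(norm="l1").fit_transform(x.reshape(1, -1)))
print(Normalizer(norm="l2").fit_transform(x.reshape(1, -1)))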
\n", 78 | "\n", 91 | "\n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccup
08.325241.06.9841271.023810322.02.555556
18.301421.06.2381370.9718802401.02.109842
27.257452.08.2881361.073446496.02.802260
35.643152.05.8173521.073059558.02.547945
43.846252.06.2818531.081081565.02.181467
\n", 151 | "
" 152 | ], 153 | "text/plain": [ 154 | " MedInc HouseAge AveRooms AveBedrms Population AveOccup\n", 155 | "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556\n", 156 | "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842\n", 157 | "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260\n", 158 | "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945\n", 159 | "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467" 160 | ] 161 | }, 162 | "execution_count": 2, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# load the California House price data from Scikit-learn\n", 169 | "X, y = fetch_california_housing(return_X_y=True, as_frame=True)\n", 170 | "\n", 171 | "# Remove 2 variables:\n", 172 | "X.drop(labels=[\"Latitude\", \"Longitude\"], axis=1, inplace=True)\n", 173 | "\n", 174 | "# display top 5 rows\n", 175 | "X.head()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 3, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "((14448, 6), (6192, 6))" 187 | ] 188 | }, 189 | "execution_count": 3, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "# let's separate the data into training and testing sets\n", 196 | "\n", 197 | "X_train, X_test, y_train, y_test = train_test_split(\n", 198 | " X,\n", 199 | " y,\n", 200 | " test_size=0.3,\n", 201 | " random_state=0,\n", 202 | ")\n", 203 | "\n", 204 | "X_train.shape, X_test.shape" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "### Scaling to l1" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 4, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# set up the scaler\n", 221 | "scaler = Normalizer(norm=\"l1\") # for euclidean distance we change to norm='l2'\n", 222 | "\n", 223 | "# fit the scaler, this procedure does NOTHING\n", 224 | "scaler.fit(X_train)\n", 225 | "\n", 226 | "# transform train and test sets\n", 227 | "X_train_scaled = scaler.transform(X_train)\n", 228 | "X_test_scaled = scaler.transform(X_test)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 5, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "array([ 255.3, 889.1, 1421.7, ..., 744.6, 1099.5, 1048.9])" 240 | ] 241 | }, 242 | "execution_count": 5, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "# let's calculate the norm for each observation (feature vector)\n", 249 | "# original data\n", 250 | "\n", 251 | "np.round(np.linalg.norm(X_train, ord=1, axis=1), 1)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 6, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "array([1., 1., 1., ..., 1., 1., 1.])" 263 | ] 264 | }, 265 | "execution_count": 6, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "# let's calculate the norm for each observation (feature vector)\n", 272 | "# scaled data\n", 273 | "\n", 274 | "np.round(np.linalg.norm(X_train_scaled, ord=1, axis=1), 1)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### Scaling to l2" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 7, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# set up the scaler\n", 291 | "scaler = Normalizer(norm=\"l2\")\n", 292 | 
"\n", 293 | "# fit the scaler, this procedure does NOTHING\n", 294 | "scaler.fit(X_train)\n", 295 | "\n", 296 | "# transform train and test sets\n", 297 | "X_train_scaled = scaler.transform(X_train)\n", 298 | "X_test_scaled = scaler.transform(X_test)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 8, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "array([ 200. , 837.1, 1387.1, ..., 704.7, 1052.6, 1024.1])" 310 | ] 311 | }, 312 | "execution_count": 8, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "# let's calculate the norm for each observation (feature vector)\n", 319 | "# original data\n", 320 | "\n", 321 | "np.round(np.linalg.norm(X_train, ord=2, axis=1), 1)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 9, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "array([1., 1., 1., ..., 1., 1., 1.])" 333 | ] 334 | }, 335 | "execution_count": 9, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "# let's calculate the norm for each observation (feature vector)\n", 342 | "# scaled data\n", 343 | "\n", 344 | "np.round(np.linalg.norm(X_train_scaled, ord=2, axis=1), 1)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "feml", 358 | "language": "python", 359 | "name": "feml" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 3 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython3", 371 | "version": "3.8.2" 372 | }, 373 | "toc": { 374 | "base_numbering": 1, 375 | "nav_menu": {}, 376 | "number_sections": true, 377 | "sideBar": true, 378 | "skip_h1_title": false, 379 | "title_cell": "Table of Contents", 380 | "title_sidebar": "Contents", 381 | "toc_cell": false, 382 | "toc_position": {}, 383 | "toc_section_display": "block", 384 | "toc_window_display": true 385 | } 386 | }, 387 | "nbformat": 4, 388 | "nbformat_minor": 2 389 | } 390 | -------------------------------------------------------------------------------- /ch09-featuretools/Recipe3-Combining-numerical-features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import featuretools as ft\n", 11 | "from woodwork.logical_types import Categorical" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# let's load the data again\n", 21 | "\n", 22 | "df = pd.read_csv(\"retail.csv\", parse_dates=[\"invoice_date\"])" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# create and entity set\n", 32 | "\n", 33 | "es = ft.EntitySet(id=\"data\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Add the data to the entity\n", 43 | "\n", 44 | "es = es.add_dataframe(\n", 45 | " dataframe=df, # the dataframe 
with the data\n", 46 | " dataframe_name=\"data\", # unique name to associate with this dataframe\n", 47 | " index=\"rows\", # column name to index the items\n", 48 | " make_index=True, # if true, create a new column with unique values\n", 49 | " time_index=\"invoice_date\", # column containing time data\n", 50 | " logical_types={\n", 51 | " \"customer_id\": Categorical, # the id is numerical, but should be handled as categorical\n", 52 | " },\n", 53 | ")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "Entityset: data\n", 65 | " DataFrames:\n", 66 | " data [Rows: 741301, Columns: 8]\n", 67 | " invoices [Rows: 40505, Columns: 3]\n", 68 | " Relationships:\n", 69 | " data.invoice -> invoices.invoice" 70 | ] 71 | }, 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "# Create a new dataframe with invoices\n", 79 | "# indicating its relationship to the main data\n", 80 | "\n", 81 | "es.normalize_dataframe(\n", 82 | " base_dataframe_name=\"data\", # Dataframe name from which to split.\n", 83 | " new_dataframe_name=\"invoices\", # Name of the new dataframe.\n", 84 | " index=\"invoice\", # relationship will be created across this column.\n", 85 | " copy_columns=[\"customer_id\"], # columns to copy over to the new dataframe.\n", 86 | ")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "[<Feature: customer_id>,\n", 98 | " <Feature: invoice>,\n", 99 | " <Feature: stock_code>,\n", 100 | " <Feature: description>,\n", 101 | " <Feature: quantity>,\n", 102 | " <Feature: price>,\n", 103 | " <Feature: price * quantity>]" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "# Obtain new variable \"amount\" by multiplying\n", 113 | "# price and quantity.\n", 114 | "\n", 115 | "feature_matrix, feature_defs = ft.dfs(\n", 116 | " entityset=es, # the entity set\n", 117 | " target_dataframe_name=\"data\", # the dataframe for which to create the features\n", 118 | " agg_primitives=[], # an empty list avoids returning the default aggregation primitives\n", 119 | " trans_primitives=[\"multiply_numeric\"], # the operation to create the new features\n", 120 | " primitive_options={ # the features that we want to multiply\n", 121 | " (\"multiply_numeric\"): {\n", 122 | " 'include_columns': {\n", 123 | " 'data': [\"quantity\", \"price\"]\n", 124 | " }\n", 125 | " }\n", 126 | " },\n", 127 | " ignore_dataframes=[\"invoices\"],\n", 128 | ")\n", 129 | "\n", 130 | "# display name of created features\n", 131 | "feature_defs" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/html": [ 142 | "
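As an aside, the feature definitions returned by ft.dfs can be reused to recompute the same features later; a minimal sketch, assuming es and feature_defs as created above, using featuretools' calculate_feature_matrix:

import featuretools as ft

# recompute the exact same features (including "price * quantity")
# from the stored definitions, e.g. on a refreshed entity set
feature_matrix_again = ft.calculate_feature_matrix(
    features=feature_defs,  # list returned by ft.dfs above
    entityset=es,           # entity set holding the data
)

feature_matrix_again.head()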
\n", 143 | "\n", 156 | "\n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | "
customer_idinvoicestock_codedescriptionquantitypriceprice * quantity
rows
013085.04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS126.9583.4
113085.048943479323PPINK CHERRY LIGHTS126.7581.0
213085.048943479323WWHITE CHERRY LIGHTS126.7581.0
313085.048943422041RECORD FRAME 7\" SINGLE SIZE482.10100.8
413085.048943421232STRAWBERRY CERAMIC TRINKET BOX241.2530.0
\n", 232 | "
" 233 | ], 234 | "text/plain": [ 235 | " customer_id invoice stock_code description \\\n", 236 | "rows \n", 237 | "0 13085.0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS \n", 238 | "1 13085.0 489434 79323P PINK CHERRY LIGHTS \n", 239 | "2 13085.0 489434 79323W WHITE CHERRY LIGHTS \n", 240 | "3 13085.0 489434 22041 RECORD FRAME 7\" SINGLE SIZE \n", 241 | "4 13085.0 489434 21232 STRAWBERRY CERAMIC TRINKET BOX \n", 242 | "\n", 243 | " quantity price price * quantity \n", 244 | "rows \n", 245 | "0 12 6.95 83.4 \n", 246 | "1 12 6.75 81.0 \n", 247 | "2 12 6.75 81.0 \n", 248 | "3 48 2.10 100.8 \n", 249 | "4 24 1.25 30.0 " 250 | ] 251 | }, 252 | "execution_count": 7, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "feature_matrix.head()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## In relation to pandas" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 8, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/html": [ 276 | "
\n", 277 | "\n", 290 | "\n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | "
customer_idinvoiceinvoice_datestock_codedescriptionquantityprice
013085.04894342009-12-01 07:45:008504815CM CHRISTMAS GLASS BALL 20 LIGHTS126.95
113085.04894342009-12-01 07:45:0079323PPINK CHERRY LIGHTS126.75
213085.04894342009-12-01 07:45:0079323WWHITE CHERRY LIGHTS126.75
313085.04894342009-12-01 07:45:0022041RECORD FRAME 7\" SINGLE SIZE482.10
413085.04894342009-12-01 07:45:0021232STRAWBERRY CERAMIC TRINKET BOX241.25
\n", 356 | "
" 357 | ], 358 | "text/plain": [ 359 | " customer_id invoice invoice_date stock_code \\\n", 360 | "0 13085.0 489434 2009-12-01 07:45:00 85048 \n", 361 | "1 13085.0 489434 2009-12-01 07:45:00 79323P \n", 362 | "2 13085.0 489434 2009-12-01 07:45:00 79323W \n", 363 | "3 13085.0 489434 2009-12-01 07:45:00 22041 \n", 364 | "4 13085.0 489434 2009-12-01 07:45:00 21232 \n", 365 | "\n", 366 | " description quantity price \n", 367 | "0 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 6.95 \n", 368 | "1 PINK CHERRY LIGHTS 12 6.75 \n", 369 | "2 WHITE CHERRY LIGHTS 12 6.75 \n", 370 | "3 RECORD FRAME 7\" SINGLE SIZE 48 2.10 \n", 371 | "4 STRAWBERRY CERAMIC TRINKET BOX 24 1.25 " 372 | ] 373 | }, 374 | "execution_count": 8, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "# load data\n", 381 | "\n", 382 | "df = pd.read_csv(\"retail.csv\", parse_dates=[\"invoice_date\"])\n", 383 | "\n", 384 | "df.head()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 9, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/html": [ 395 | "
\n", 396 | "\n", 409 | "\n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | "
customer_idinvoiceinvoice_datestock_codedescriptionquantitypriceamount
013085.04894342009-12-01 07:45:008504815CM CHRISTMAS GLASS BALL 20 LIGHTS126.9583.4
113085.04894342009-12-01 07:45:0079323PPINK CHERRY LIGHTS126.7581.0
213085.04894342009-12-01 07:45:0079323WWHITE CHERRY LIGHTS126.7581.0
313085.04894342009-12-01 07:45:0022041RECORD FRAME 7\" SINGLE SIZE482.10100.8
413085.04894342009-12-01 07:45:0021232STRAWBERRY CERAMIC TRINKET BOX241.2530.0
\n", 481 | "
" 482 | ], 483 | "text/plain": [ 484 | " customer_id invoice invoice_date stock_code \\\n", 485 | "0 13085.0 489434 2009-12-01 07:45:00 85048 \n", 486 | "1 13085.0 489434 2009-12-01 07:45:00 79323P \n", 487 | "2 13085.0 489434 2009-12-01 07:45:00 79323W \n", 488 | "3 13085.0 489434 2009-12-01 07:45:00 22041 \n", 489 | "4 13085.0 489434 2009-12-01 07:45:00 21232 \n", 490 | "\n", 491 | " description quantity price amount \n", 492 | "0 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 6.95 83.4 \n", 493 | "1 PINK CHERRY LIGHTS 12 6.75 81.0 \n", 494 | "2 WHITE CHERRY LIGHTS 12 6.75 81.0 \n", 495 | "3 RECORD FRAME 7\" SINGLE SIZE 48 2.10 100.8 \n", 496 | "4 STRAWBERRY CERAMIC TRINKET BOX 24 1.25 30.0 " 497 | ] 498 | }, 499 | "execution_count": 9, 500 | "metadata": {}, 501 | "output_type": "execute_result" 502 | } 503 | ], 504 | "source": [ 505 | "# Add total amount of transaction\n", 506 | "\n", 507 | "df[\"amount\"] = df[\"quantity\"].mul(df[\"price\"])\n", 508 | "\n", 509 | "df.head()" 510 | ] 511 | } 512 | ], 513 | "metadata": { 514 | "kernelspec": { 515 | "display_name": "fsml", 516 | "language": "python", 517 | "name": "fsml" 518 | }, 519 | "language_info": { 520 | "codemirror_mode": { 521 | "name": "ipython", 522 | "version": 3 523 | }, 524 | "file_extension": ".py", 525 | "mimetype": "text/x-python", 526 | "name": "python", 527 | "nbconvert_exporter": "python", 528 | "pygments_lexer": "ipython3", 529 | "version": "3.10.5" 530 | }, 531 | "toc": { 532 | "base_numbering": 1, 533 | "nav_menu": {}, 534 | "number_sections": true, 535 | "sideBar": true, 536 | "skip_h1_title": false, 537 | "title_cell": "Table of Contents", 538 | "title_sidebar": "Contents", 539 | "toc_cell": false, 540 | "toc_position": { 541 | "height": "calc(100% - 180px)", 542 | "left": "10px", 543 | "top": "150px", 544 | "width": "165px" 545 | }, 546 | "toc_section_display": "block", 547 | "toc_window_display": true 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 2 552 | } 553 | -------------------------------------------------------------------------------- /ch09-featuretools/prepare-retail-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Online Retail II Data Set\n", 8 | "\n", 9 | "In this notebook we will prepare and store the Online Retail II Data Set from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail+II)\n", 10 | "\n", 11 | "\n", 12 | "**Citation:**\n", 13 | "\n", 14 | "Chen, D. Sain, S.L., and Guo, K. (2012), Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and Customer Strategy Management, Vol. 19, No. 3, pp. 197-208. https://link.springer.com/article/10.1057/dbm.2012.17. \n", 15 | "\n", 16 | "## Download the data\n", 17 | "\n", 18 | "- Navigate to the [data folder](https://archive.ics.uci.edu/ml/machine-learning-databases/00502/).\n", 19 | "- Download the file called **online_retail_II.xlsx**.\n", 20 | "- Save the file in the same folder that contains this notebook." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "(1067371, 8)" 41 | ] 42 | }, 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "# Load the data\n", 50 | "\n", 51 | "# The data is provided as two sheets in a single Excel file.\n", 52 | "# Load both and join into a single dataframe.\n", 53 | "\n", 54 | "# It takes a while...\n", 55 | "\n", 56 | "file = 'online_retail_II.xlsx'\n", 57 | "\n", 58 | "df_1 = pd.read_excel(file, sheet_name='Year 2009-2010')\n", 59 | "df_2 = pd.read_excel(file, sheet_name='Year 2010-2011')\n", 60 | "\n", 61 | "df = pd.concat([df_1, df_2])\n", 62 | "\n", 63 | "df.shape" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/html": [ 74 | "
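A small side note on the loading step above: pandas can also read every sheet of the workbook at once, which avoids hard-coding the two sheet names; a brief sketch, assuming the same online_retail_II.xlsx file:

import pandas as pd

# sheet_name=None returns a dict of dataframes, one per sheet
sheets = pd.read_excel("online_retail_II.xlsx", sheet_name=None)

# concatenate all sheets into a single dataframe
df = pd.concat(sheets.values(), ignore_index=True)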
\n", 75 | "\n", 88 | "\n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | "
InvoiceStockCodeDescriptionQuantityInvoiceDatePriceCustomer IDCountry
04894348504815CM CHRISTMAS GLASS BALL 20 LIGHTS122009-12-01 07:45:006.9513085.0United Kingdom
148943479323PPINK CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
248943479323WWHITE CHERRY LIGHTS122009-12-01 07:45:006.7513085.0United Kingdom
348943422041RECORD FRAME 7\" SINGLE SIZE482009-12-01 07:45:002.1013085.0United Kingdom
448943421232STRAWBERRY CERAMIC TRINKET BOX242009-12-01 07:45:001.2513085.0United Kingdom
\n", 160 | "
" 161 | ], 162 | "text/plain": [ 163 | " Invoice StockCode Description Quantity \\\n", 164 | "0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 \n", 165 | "1 489434 79323P PINK CHERRY LIGHTS 12 \n", 166 | "2 489434 79323W WHITE CHERRY LIGHTS 12 \n", 167 | "3 489434 22041 RECORD FRAME 7\" SINGLE SIZE 48 \n", 168 | "4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 \n", 169 | "\n", 170 | " InvoiceDate Price Customer ID Country \n", 171 | "0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom \n", 172 | "1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 173 | "2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom \n", 174 | "3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom \n", 175 | "4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom " 176 | ] 177 | }, 178 | "execution_count": 3, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "# Inspect dataframe\n", 185 | "\n", 186 | "df.head()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 4, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "# Retain customers from the UK\n", 196 | "\n", 197 | "df = df[df[\"Country\"]==\"United Kingdom\"]\n", 198 | "df.drop(\"Country\", axis=1, inplace=True)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 5, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# Remove transactions without Customer ID\n", 208 | "\n", 209 | "df.dropna(subset=[\"Customer ID\"], inplace=True)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# Rename columns\n", 219 | "\n", 220 | "df.columns = [\n", 221 | " \"invoice\",\n", 222 | " \"stock_code\",\n", 223 | " \"description\",\n", 224 | " \"quantity\",\n", 225 | " \"invoice_date\",\n", 226 | " \"price\",\n", 227 | " \"customer_id\",\n", 228 | "]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 7, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# index rows with unique values\n", 238 | "\n", 239 | "df.reset_index(inplace=True, drop=True)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 8, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# re-order columns\n", 249 | "\n", 250 | "ordered_cols = [\n", 251 | " \"customer_id\",\n", 252 | " \"invoice\",\n", 253 | " \"invoice_date\",\n", 254 | " \"stock_code\",\n", 255 | " \"description\",\n", 256 | " \"quantity\",\n", 257 | " \"price\",\n", 258 | "]" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 9, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# save data with re-ordered columns\n", 268 | "\n", 269 | "df[ordered_cols].to_csv('retail.csv', index=False)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "fsml", 283 | "language": "python", 284 | "name": "fsml" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.10.5" 297 | }, 298 | "toc": { 299 | "base_numbering": 1, 300 | "nav_menu": {}, 301 | "number_sections": true, 302 | "sideBar": true, 303 | 
"skip_h1_title": false, 304 | "title_cell": "Table of Contents", 305 | "title_sidebar": "Contents", 306 | "toc_cell": false, 307 | "toc_position": {}, 308 | "toc_section_display": true, 309 | "toc_window_display": false 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 4 314 | } 315 | -------------------------------------------------------------------------------- /ch10-tsfresh/Recipe4-extract-features-after-feature-selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "6bdb68a4", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "\n", 12 | "from sklearn.feature_selection import SelectFromModel\n", 13 | "from sklearn.linear_model import LogisticRegression\n", 14 | "\n", 15 | "from tsfresh import extract_features, extract_relevant_features\n", 16 | "from tsfresh.feature_extraction import settings" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "id": "77e5e5e6", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# load data\n", 27 | "\n", 28 | "X = pd.read_csv(\"occupancy.csv\", parse_dates=[\"date\"])\n", 29 | "y = pd.read_csv(\"occupancy_target.csv\", index_col=\"id\")\n", 30 | "y = pd.Series(y[\"occupancy\"])" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "id": "7d6e26b5", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:29<00:00, 2.97s/it]\n" 44 | ] 45 | }, 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(135, 969)" 50 | ] 51 | }, 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "# create and select features\n", 59 | "\n", 60 | "features = extract_relevant_features(\n", 61 | " X,\n", 62 | " y,\n", 63 | " column_id=\"id\",\n", 64 | " column_sort=\"date\",\n", 65 | ")\n", 66 | "\n", 67 | "features.shape" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "id": "b4d73915", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "array(['light__sum_of_reoccurring_data_points',\n", 80 | " 'co2__spkt_welch_density__coeff_2', 'co2__variance',\n", 81 | " 'temperature__c3__lag_1', 'temperature__abs_energy',\n", 82 | " 'temperature__c3__lag_2', 'temperature__c3__lag_3',\n", 83 | " 'co2__sum_of_reoccurring_data_points',\n", 84 | " 'light__spkt_welch_density__coeff_8', 'light__variance',\n", 85 | " 'light__agg_linear_trend__attr_\"slope\"__chunk_len_50__f_agg_\"var\"',\n", 86 | " 'light__agg_linear_trend__attr_\"intercept\"__chunk_len_10__f_agg_\"var\"'],\n", 87 | " dtype=object)" 88 | ] 89 | }, 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "# select features with lasso\n", 97 | "\n", 98 | "cls = LogisticRegression(\n", 99 | " penalty=\"l1\", \n", 100 | " solver=\"liblinear\",\n", 101 | " random_state=10,\n", 102 | " C=0.05,\n", 103 | " max_iter=1000,\n", 104 | ")\n", 105 | "\n", 106 | "selector = SelectFromModel(cls)\n", 107 | "\n", 108 | "selector.fit(features, y)\n", 109 | "\n", 110 | "features = selector.get_feature_names_out()\n", 111 | "\n", 112 | "features" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "id": "72ddeff0", 119 | 
"metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "{'light': {'sum_of_reoccurring_data_points': None,\n", 125 | " 'spkt_welch_density': [{'coeff': 8}],\n", 126 | " 'variance': None,\n", 127 | " 'agg_linear_trend': [{'attr': 'slope', 'chunk_len': 50, 'f_agg': 'var'},\n", 128 | " {'attr': 'intercept', 'chunk_len': 10, 'f_agg': 'var'}]},\n", 129 | " 'co2': {'spkt_welch_density': [{'coeff': 2}],\n", 130 | " 'variance': None,\n", 131 | " 'sum_of_reoccurring_data_points': None},\n", 132 | " 'temperature': {'c3': [{'lag': 1}, {'lag': 2}, {'lag': 3}],\n", 133 | " 'abs_energy': None}}" 134 | ] 135 | }, 136 | "execution_count": 5, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "# capture selected features for each time series\n", 143 | "\n", 144 | "kind_to_fc_parameters = settings.from_columns(selector.get_feature_names_out())\n", 145 | "\n", 146 | "kind_to_fc_parameters" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "id": "f91d6eb8", 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:03<00:00, 2.60it/s]\n" 160 | ] 161 | }, 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "(135, 12)" 166 | ] 167 | }, 168 | "execution_count": 6, 169 | "metadata": {}, 170 | "output_type": "execute_result" 171 | } 172 | ], 173 | "source": [ 174 | "# create selected features for each time series\n", 175 | "\n", 176 | "features = extract_features(\n", 177 | " X,\n", 178 | " column_id=\"id\",\n", 179 | " column_sort=\"date\",\n", 180 | " kind_to_fc_parameters=kind_to_fc_parameters,\n", 181 | ")\n", 182 | "\n", 183 | "features.shape" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "id": "065af937", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/html": [ 195 | "
\n", 196 | "\n", 209 | "\n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | "
temperature__c3__lag_1temperature__c3__lag_2temperature__c3__lag_3temperature__abs_energylight__sum_of_reoccurring_data_pointslight__spkt_welch_density__coeff_8light__variancelight__agg_linear_trend__attr_\"slope\"__chunk_len_50__f_agg_\"var\"light__agg_linear_trend__attr_\"intercept\"__chunk_len_10__f_agg_\"var\"co2__spkt_welch_density__coeff_2co2__varianceco2__sum_of_reoccurring_data_points
111585.12793411581.20359011578.17880730721.5687032514.0332.22129518086.371875-21130.342519305.13751523.529443756.70066414124.000000
210751.99961010752.68250810753.11981229225.2543740.00.0000000.0000000.00000.0000350.067478377.28089513202.000000
310187.86522610187.20696310186.28404028198.1399500.00.0000000.0000000.00000.000091.896894115.26929820885.666667
49908.9002249909.7313889910.16394727680.8001840.00.0000000.0000000.00000.000042.39490535.47321618285.500000
59705.9897899706.4045519706.70796327299.0974690.00.0000000.0000000.00000.00004.81130327.53908017670.500000
\n", 305 | "
" 306 | ], 307 | "text/plain": [ 308 | " temperature__c3__lag_1 temperature__c3__lag_2 temperature__c3__lag_3 \\\n", 309 | "1 11585.127934 11581.203590 11578.178807 \n", 310 | "2 10751.999610 10752.682508 10753.119812 \n", 311 | "3 10187.865226 10187.206963 10186.284040 \n", 312 | "4 9908.900224 9909.731388 9910.163947 \n", 313 | "5 9705.989789 9706.404551 9706.707963 \n", 314 | "\n", 315 | " temperature__abs_energy light__sum_of_reoccurring_data_points \\\n", 316 | "1 30721.568703 2514.0 \n", 317 | "2 29225.254374 0.0 \n", 318 | "3 28198.139950 0.0 \n", 319 | "4 27680.800184 0.0 \n", 320 | "5 27299.097469 0.0 \n", 321 | "\n", 322 | " light__spkt_welch_density__coeff_8 light__variance \\\n", 323 | "1 332.221295 18086.371875 \n", 324 | "2 0.000000 0.000000 \n", 325 | "3 0.000000 0.000000 \n", 326 | "4 0.000000 0.000000 \n", 327 | "5 0.000000 0.000000 \n", 328 | "\n", 329 | " light__agg_linear_trend__attr_\"slope\"__chunk_len_50__f_agg_\"var\" \\\n", 330 | "1 -21130.3425 \n", 331 | "2 0.0000 \n", 332 | "3 0.0000 \n", 333 | "4 0.0000 \n", 334 | "5 0.0000 \n", 335 | "\n", 336 | " light__agg_linear_trend__attr_\"intercept\"__chunk_len_10__f_agg_\"var\" \\\n", 337 | "1 19305.1375 \n", 338 | "2 0.0000 \n", 339 | "3 0.0000 \n", 340 | "4 0.0000 \n", 341 | "5 0.0000 \n", 342 | "\n", 343 | " co2__spkt_welch_density__coeff_2 co2__variance \\\n", 344 | "1 1523.529443 756.700664 \n", 345 | "2 350.067478 377.280895 \n", 346 | "3 91.896894 115.269298 \n", 347 | "4 42.394905 35.473216 \n", 348 | "5 4.811303 27.539080 \n", 349 | "\n", 350 | " co2__sum_of_reoccurring_data_points \n", 351 | "1 14124.000000 \n", 352 | "2 13202.000000 \n", 353 | "3 20885.666667 \n", 354 | "4 18285.500000 \n", 355 | "5 17670.500000 " 356 | ] 357 | }, 358 | "execution_count": 7, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "features.head()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "cae5efb1", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "fsml", 379 | "language": "python", 380 | "name": "fsml" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.10.5" 393 | }, 394 | "toc": { 395 | "base_numbering": 1, 396 | "nav_menu": {}, 397 | "number_sections": true, 398 | "sideBar": true, 399 | "skip_h1_title": false, 400 | "title_cell": "Table of Contents", 401 | "title_sidebar": "Contents", 402 | "toc_cell": false, 403 | "toc_position": {}, 404 | "toc_section_display": true, 405 | "toc_window_display": false 406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 5 410 | } 411 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | category-encoders==2.4.0 2 | feature-engine==1.4.0 3 | featuretools==1.5.0 4 | matplotlib==3.4.2 5 | matplotlib-inline==0.1.2 6 | numpy==1.22.0 7 | pandas==1.5.0 8 | scikit-learn==1.1.0 9 | scipy==1.7.0 10 | seaborn==0.11.1 11 | statsmodels==0.12.2 12 | tsfresh==0.19.0 --------------------------------------------------------------------------------