├── .Rproj.user
    └── 69A1DB49
    │   ├── pcs
    │       ├── debug-breakpoints.pper
    │       ├── files-pane.pper
    │       ├── source-pane.pper
    │       ├── windowlayoutstate.pper
    │       └── workbench-pane.pper
    │   └── sources
    │       └── prop
    │           └── 91DA5C98
├── .gitignore
├── Chapter 10
    ├── airpassengers.xlsx
    ├── forecasting_lstm.py
    ├── forecasting_prophet.py
    ├── forecasting_statsmodel.py
    ├── imgs
    │   ├── AirPassengers.png
    │   ├── acf_plot.png
    │   ├── auto_arima_calibration_plot.png
    │   ├── pacf_apts.png
    │   ├── ts_obj_rnorm_25.png
    │   └── tuning_grid_plot.png
    ├── plots.py
    ├── sample_data.py
    ├── statistics.py
    └── time_series.R
├── Chapter 11
    ├── call_plumber.R
    ├── fastapi_add.py
    ├── imgs
    │   ├── BERT_Console.png
    │   ├── BERT_Console_Excell_Addins.png
    │   ├── BERT_VBA_to_R_density_plot_rnorm.png
    │   └── RAND.png
    ├── matrix_multiplication.py
    ├── matrix_multiplication.xlsm
    ├── multiply.py
    ├── multiply.xlsx
    ├── plumber_api.R
    └── vba_plumber_curl_request.bas
├── Chapter 3
    ├── Sub_MultiplyByRandom.bas
    ├── executing_VBA.py
    ├── interacting_Excel_objects.py
    ├── mult_by_rand_ch3.xlsm
    ├── retreiving_data.py
    ├── run_MultByRand_macro.R
    └── testing_environment.py
├── Chapter 4
    ├── apscheduler.py
    ├── case_study.py
    ├── get_user_input.R
    ├── hello_world.R
    ├── hello_world_schedule.R
    ├── install_taskscheduleR.R
    ├── schedule.py
    └── send_basic_email.py
├── Chapter 5
    ├── aligning_text_openpyxl.py
    ├── aligning_text_pandas.py
    ├── background_colors_openpyxl.py
    ├── background_colors_pandas.py
    ├── conditional_formatting.py
    ├── font_properties_openpyxl.py
    ├── font_properties_pandas.py
    ├── heatmap.py
    ├── install_ch5_packages.R
    ├── pivot_table.py
    ├── using_basictabler.R
    └── using_styledTables.R
├── Chapter 6
    ├── ch6_barplot.R
    ├── ch6_cowplot.R
    ├── ch6_dumbell_plot.R
    ├── ch6_ggplot2.R
    ├── ch6_timeseries.R
    ├── insert_image_pywin32.py
    ├── matplotlib_basics.py
    ├── matplotlib_customizations.py
    ├── plotnine_additional_layers.py
    ├── plotnine_basics.py
    └── plotnine_customizations.py
├── Chapter 7
    ├── create_pivot.py
    ├── grouping.py
    └── manipulate_pivot.py
├── Chapter 8
    ├── ch8.R
    ├── clean_data.py
    ├── create_sample_data.py
    ├── data_distribution.py
    ├── relationships.py
    └── summary_statistics.py
├── Chapter 9
    ├── ch9_linear_reg.R
    ├── ch9_linear_reg_tidymodels.R
    ├── ch9_logistic_reg.R
    ├── ch9_logistic_reg_tidymodels.R
    ├── linear_regression.py
    └── logistic_regression.py
├── Chapter1
    ├── ch1_create_iris_dataset.R
    ├── ch1_pkgs.R
    ├── ch1_save_xlsx_as_xlsb.R
    ├── excel_sheet_reader.R
    ├── iris_data.xlsb
    ├── multisheet_openpyxl.py
    ├── open_excel_openpyxl.py
    ├── open_excel_pandas.py
    └── read_xlsx_files.R
├── Chapter12
    ├── call_plumber.R
    ├── imgs
    │   ├── api_histogram.png
    │   ├── enter_api_argument.png
    │   ├── get_api.png
    │   ├── swagger_plumber_api_screen.png
    │   └── vba_curl_request.png
    ├── plumber_api.R
    └── vba_plumber_curl_request.bas
├── Chapter14
    ├── auto_xgb.rar
    ├── ch14_data.R
    ├── ch14_diamonds_eda.R
    ├── ch14_diamonds_modeling.R
    ├── imgs
    │   ├── ggplot_diamonds_boxplot.png
    │   ├── ggplot_diamonds_hex_plot.png
    │   ├── ggplot_diamonds_hist_by_cut.png
    │   ├── ggplot_diamonds_mean_price.png
    │   ├── ggplot_diamonds_mean_price_per_carat.png
    │   └── hist_default_and_optbin.png
    └── xgb_wflw_fit.rds
├── Chapter2
    ├── adding_sheets.py
    ├── cell_update.py
    ├── create_workbook.py
    ├── deleting_sheet.py
    ├── excel_write_bench.R
    ├── export2excel_pandas.py
    └── output_file_size_compare.R
├── Chapter7
    └── ch7_tables_with_R.R
├── Chapter8
    └── ch8.R
├── Chapter9
    ├── ch9_linear_reg.R
    ├── ch9_linear_reg_tidymodels.R
    ├── ch9_logistic_reg.R
    └── ch9_logistic_reg_tidymodels.R
├── Extending-Excel-with-Python-and-R.Rproj
├── GroupingExample.xlsx
├── LICENSE
├── README.md
├── aligned_table_openpyxl.xlsx
├── aligned_table_pandas.xlsx
├── chapter6
    ├── ch6_barplot.R
    ├── ch6_cowplot.R
    ├── ch6_dumbell_plot.R
    ├── ch6_ggplot2.R
    ├── ch6_timeseries.R
    └── imgs
    │   └── payergroup_barplot.png
├── colored_table_openpyxl.xlsx
├── colored_table_pandas.xlsx
├── conditional_formatting.xlsx
├── data.xlsx
├── dirty_data.xlsx
├── example.xlsx
├── heatmap_with_conditional_formatting.xlsx
├── iris_data.xlsm
├── iris_data.xlsx
├── linear_regression_input.xlsx
├── logistic_regression_input.xlsx
├── requirements.txt
├── requirements.txt.bak
├── styled_table_openpyxl.xlsx
├── styled_table_pandas.xlsx
└── time_series_data.xlsx


/.Rproj.user/69A1DB49/pcs/debug-breakpoints.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "debugBreakpointsState": {
3 |         "breakpoints": []
4 |     }
5 | }


--------------------------------------------------------------------------------
/.Rproj.user/69A1DB49/pcs/files-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "sortOrder": [
3 |         {
4 |             "columnIndex": 2,
5 |             "ascending": true
6 |         }
7 |     ],
8 |     "path": "C:/Users/steve/Documents/GitHub/Extending-Excel-with-Python-and-R/Chapter 10"
9 | }


--------------------------------------------------------------------------------
/.Rproj.user/69A1DB49/pcs/source-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "activeTab": 2
3 | }


--------------------------------------------------------------------------------
/.Rproj.user/69A1DB49/pcs/windowlayoutstate.pper:
--------------------------------------------------------------------------------
 1 | {
 2 |     "left": {
 3 |         "splitterpos": 417,
 4 |         "topwindowstate": "NORMAL",
 5 |         "panelheight": 822,
 6 |         "windowheight": 860
 7 |     },
 8 |     "right": {
 9 |         "splitterpos": 520,
10 |         "topwindowstate": "NORMAL",
11 |         "panelheight": 822,
12 |         "windowheight": 860
13 |     }
14 | }


--------------------------------------------------------------------------------
/.Rproj.user/69A1DB49/pcs/workbench-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "TabSet1": 0,
3 |     "TabSet2": 0,
4 |     "TabZoom": {}
5 | }


--------------------------------------------------------------------------------
/.Rproj.user/69A1DB49/sources/prop/91DA5C98:
--------------------------------------------------------------------------------
1 | {
2 |     "tempName": "Untitled2",
3 |     "source_window_id": "",
4 |     "Source": "Source",
5 |     "cursorPosition": "38,12",
6 |     "scrollLine": "25"
7 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .bookenv/*
2 | *.xlsx
3 | .Rproj.user
4 | *.png
5 | .Rproj.user/69A1DB49/sources/prop/91DA5C98
6 | *.pper
7 | *.pper
8 | 


--------------------------------------------------------------------------------
/Chapter 10/airpassengers.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/airpassengers.xlsx


--------------------------------------------------------------------------------
/Chapter 10/forecasting_lstm.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import matplotlib.pyplot as plt
 4 | from keras.models import Sequential
 5 | from keras.layers import LSTM, Dense
 6 | from sklearn.preprocessing import MinMaxScaler
 7 | 
 8 | # Load the time series from Excel; the 'Value' column is the series to forecast (replace with your data)
 9 | time_series_data = pd.read_excel('time_series_data.xlsx')
10 | 
11 | # Scale 'Value' to [0, 1] as a single-column 2-D array. NOTE(review): the scaler is fit on the full series, test rows included
12 | scaler = MinMaxScaler()
13 | data = scaler.fit_transform(time_series_data['Value'].to_numpy().reshape(-1, 1))
14 | 
15 | # Chronological split: the first 67% of the rows are for training, the remaining rows for testing
16 | train_size = int(len(data) * 0.67)
17 | train, test = data[0:train_size, :], data[train_size:len(data), :]
18 | 
19 | # Create sequences and labels for training
20 | def create_dataset(dataset, look_back=1):
21 |     X, Y = [], []
22 |     for i in range(len(dataset) - look_back):
23 |         a = dataset[i:(i + look_back), 0]
24 |         X.append(a)
25 |         Y.append(dataset[i + look_back, 0])
26 |     return np.array(X), np.array(Y)
27 | 
28 | look_back = 3
29 | X_train, Y_train = create_dataset(train, look_back)
30 | X_test, Y_test = create_dataset(test, look_back)
31 | 
32 | # Reshape the data for LSTM input
33 | X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
34 | X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
35 | 
36 | # Create and train an LSTM model
37 | model = Sequential()
38 | model.add(LSTM(4, input_shape=(1, look_back)))
39 | model.add(Dense(1))
40 | model.compile(loss='mean_squared_error', optimizer='adam')
41 | model.fit(X_train, Y_train, epochs=100, batch_size=1, verbose=2)
42 | 
43 | # Make predictions:
44 | trainPredict = model.predict(X_train)
45 | testPredict = model.predict(X_test)
46 | 
47 | # Inverse transform the predictions to the original scale
48 | trainPredict = scaler.inverse_transform(trainPredict)
49 | testPredict = scaler.inverse_transform(testPredict)
50 | 
51 | # Plot the training predictions
52 | trainPredictPlot = np.empty_like(data)
53 | trainPredictPlot[:, :] = np.nan
54 | trainPredictPlot[look_back:len(trainPredict) + look_back, :] = trainPredict
55 | 
56 | # Plot the test predictions
57 | testPredictPlot = np.empty_like(data)
58 | testPredictPlot[:, :] = np.nan
59 | testPredictPlot[len(trainPredict) + (look_back * 2):len(data), :] = testPredict
60 | 
61 | # Plot the training data in blue
62 | plt.plot(time_series_data['Value'], color='blue', label='Actual Data')
63 | 
64 | # Create shaded regions for the training and test data
65 | plt.fill_between(range(len(data)), 0, trainPredictPlot, color='lightgray', label='Training Data')
66 | plt.fill_between(range(len(data)), trainPredictPlot, testPredictPlot, color='lightcoral', label='Test Data')
67 | 
68 | # Overlay the predictions in green
69 | plt.plot(testPredictPlot, color='green', label='Predictions')
70 | 
71 | plt.title('Time Series Analysis with LSTM')
72 | plt.legend()
73 | plt.show()


--------------------------------------------------------------------------------
/Chapter 10/forecasting_prophet.py:
--------------------------------------------------------------------------------
 1 | # Import necessary libraries
 2 | import pandas as pd
 3 | from prophet import Prophet
 4 | from prophet.plot import plot
 5 | 
 6 | # Load the time series data (replace with your data)
 7 | time_series_data = pd.read_excel('time_series_data.xlsx')
 8 | 
 9 | # Create a DataFrame with 'ds' and 'y' columns
10 | df = pd.DataFrame({'ds': time_series_data['Date'], 'y': time_series_data['Value']})
11 | 
12 | # Initialize the Prophet model without weekly seasonality (the model is fitted further down)
13 | model = Prophet(weekly_seasonality=False)
14 | 
15 | # Add a custom 365-day seasonality taken from domain knowledge (here: we generated the data ourselves)
16 | model.add_seasonality(name='custom_season', period=365, fourier_order=5)
17 | 
18 | # Fit the customized model
19 | model.fit(df)
20 | 
21 | # Create a dataframe for future dates
22 | forecast_steps = 150  # Adjust the number of forecast steps as needed
23 | future = model.make_future_dataframe(periods=forecast_steps, freq='D')
24 | 
25 | # Make predictions
26 | forecast = model.predict(future)
27 | 
28 | # Plot the forecast
29 | fig = model.plot(forecast)
30 | 
31 | fig.show()
32 | 
33 | # Plot the components of the forecast (trend and the configured seasonalities; weekly is disabled above)
34 | fig2 = model.plot_components(forecast)
35 | 
36 | fig2.show()
37 | 


--------------------------------------------------------------------------------
/Chapter 10/forecasting_statsmodel.py:
--------------------------------------------------------------------------------
 1 | # Import necessary libraries
 2 | import pandas as pd
 3 | import numpy as np
 4 | import statsmodels.api as sm
 5 | from scipy.stats import norm
 6 | import matplotlib.pyplot as plt
 7 | 
 8 | # Load the time series data (replace with your data)
 9 | time_series_data = pd.read_excel('time_series_data.xlsx')['Value']
10 | 
11 | # Perform the Augmented Dickey-Fuller test to check for stationarity
12 | result = sm.tsa.adfuller(time_series_data, autolag='AIC')
13 | 
14 | # If the p-value is greater than a threshold (e.g., 0.05), perform differencing to make the data stationary
15 | if result[1] > 0.05:
16 |     differenced_data = np.diff(time_series_data, n=1)
17 | else:
18 |     differenced_data = time_series_data
19 | 
20 | # Build an ARIMA model
21 | order = (1, 1, 1)  # Replace with appropriate values based on ACF and PACF analysis
22 | model = sm.tsa.ARIMA(differenced_data, order=order)
23 | 
24 | # Fit the ARIMA model
25 | model_fit = model.fit()
26 | 
27 | # Make forecasts
28 | forecast_steps = 50  # Adjust the number of forecast steps as needed
29 | forecast = model_fit.forecast(steps=forecast_steps)
30 | 
31 | # If the p-value is greater than a threshold (e.g., 0.05), perform differencing to make the data stationary
32 | if result[1] > 0.05:
33 |     # The model was trained on the differenced data so the forecasts have to be added to the last data point
34 |     cumsum_forecasts = np.cumsum(forecast)
35 | 
36 |     # Add this cumulative sum to the last observed value in your raw data
37 |     real_forecasts = cumsum_forecasts + time_series_data[len(time_series_data)-1]
38 | 
39 | else:
40 |     real_forecasts = forecast
41 | 
42 | # Retrieve ARIMA model parameters
43 | params = model_fit.params
44 | p, d, q = order
45 | resid = model_fit.resid
46 | 
47 | # Compute the standard errors
48 | stderr = np.std(resid)
49 | 
50 | # Calculate the confidence intervals
51 | z_score = norm.ppf(0.975)  # For a 95% confidence interval
52 | conf_int = np.column_stack((real_forecasts - z_score * stderr, real_forecasts + z_score * stderr))
53 | 
54 | # Separate the forecasts into point forecasts and confidence intervals
55 | point_forecasts = real_forecasts  # The point forecasts
56 | forecast_stderr = stderr  # The standard errors of the forecasts
57 | lower_bound = conf_int[:, 0]  # Lower confidence interval bounds
58 | upper_bound = conf_int[:, 1]  # Upper confidence interval bounds
59 | 
60 | # Visualize the original time series and forecasts
61 | plt.figure(figsize=(12, 6))
62 | plt.plot(time_series_data, label='Original Time Series', color='blue')
63 | plt.plot(range(len(time_series_data), len(time_series_data) + forecast_steps), real_forecasts, label='Forecast', color='red')
64 | plt.fill_between(range(len(time_series_data), len(time_series_data) + forecast_steps), conf_int[:, 0], conf_int[:, 1], color='pink', alpha=0.5)
65 | plt.xlabel('Time Steps')
66 | plt.ylabel('Value')
67 | plt.title('ARIMA Time Series Forecast')
68 | plt.legend()
69 | plt.show()
70 | 


--------------------------------------------------------------------------------
/Chapter 10/imgs/AirPassengers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/AirPassengers.png


--------------------------------------------------------------------------------
/Chapter 10/imgs/acf_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/acf_plot.png


--------------------------------------------------------------------------------
/Chapter 10/imgs/auto_arima_calibration_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/auto_arima_calibration_plot.png


--------------------------------------------------------------------------------
/Chapter 10/imgs/pacf_apts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/pacf_apts.png


--------------------------------------------------------------------------------
/Chapter 10/imgs/ts_obj_rnorm_25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/ts_obj_rnorm_25.png


--------------------------------------------------------------------------------
/Chapter 10/imgs/tuning_grid_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/tuning_grid_plot.png


--------------------------------------------------------------------------------
/Chapter 10/plots.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import statsmodels.api as sm
 5 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
 6 | 
 7 | # Load time series data (replace 'time_series_data.xlsx' with your data file)
 8 | data = pd.read_excel('time_series_data.xlsx')
 9 | 
10 | # Convert the 'Date' column to datetime format and set it as the index
11 | data['Date'] = pd.to_datetime(data['Date'])
12 | data.set_index('Date', inplace=True)
13 | 
14 | # Plot the time series
15 | plt.figure(figsize=(12, 6))
16 | plt.plot(data['Value'])
17 | plt.title('Time Series Plot')
18 | plt.xlabel('Date')
19 | plt.ylabel('Value')
20 | plt.grid(True)
21 | plt.show()
22 | 
23 | # ACF and PACF plots
24 | fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
25 | 
26 | # ACF plot
27 | plot_acf(data['Value'], lags=10, ax=ax1)
28 | ax1.set_title('Autocorrelation Function (ACF)')
29 | 
30 | # PACF plot
31 | plot_pacf(data['Value'], lags=10, ax=ax2)
32 | ax2.set_title('Partial Autocorrelation Function (PACF)')
33 | 
34 | plt.tight_layout()
35 | plt.show()
36 | 


--------------------------------------------------------------------------------
/Chapter 10/sample_data.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | # Create a time index
 6 | date_rng = pd.date_range(start='2022-01-01', end='2023-12-31', freq='D')
 7 | 
 8 | # Create a trend component
 9 | trend = 0.05 * np.arange(len(date_rng))
10 | 
11 | # Create a seasonal component (cyclicality)
12 | seasonal = 2.5 * np.sin(2 * np.pi * np.arange(len(date_rng)) / 365)
13 | 
14 | # Add some random noise
15 | noise = np.random.normal(0, 0.5, len(date_rng))
16 | 
17 | # Combine all components to create the time series
18 | time_series = trend + seasonal + noise
19 | 
20 | # Create a DataFrame
21 | df = pd.DataFrame({'Date': date_rng, 'Value': time_series})
22 | 
23 | # Save the data to an Excel file
24 | df.to_excel('time_series_data.xlsx', index=False)
25 | 
26 | # Read the data back into pandas
27 | loaded_df = pd.read_excel('time_series_data.xlsx')
28 | 
29 | # Display the first few rows
30 | print(loaded_df.head())
31 | 


--------------------------------------------------------------------------------
/Chapter 10/statistics.py:
--------------------------------------------------------------------------------
 1 | # Load the data
 2 | import pandas as pd
 3 | 
 4 | # Read the data back into pandas
 5 | df = pd.read_excel('time_series_data.xlsx')
 6 | 
 7 | # Augmented Dickey-Fuller Test
 8 | 
 9 | from statsmodels.tsa.stattools import adfuller
10 | 
11 | adf_result = adfuller(df['Value'])
12 | print("\nAugmented Dickey-Fuller Test:")
13 | print(f"ADF Statistic: {adf_result[0]}")
14 | print(f"P-value: {adf_result[1]}")
15 | print("Null Hypothesis (H0): Data is non-stationary")
16 | print("Alternative Hypothesis (H1): Data is stationary")
17 | 
18 | if adf_result[1] <= 0.05:
19 |     print("Result: Reject the null hypothesis. Data is stationary.")
20 | else:
21 |     print("Result: Fail to reject the null hypothesis. Data is non-stationary.")
22 | 
23 | # Time Series Decomposition
24 | 
25 | from statsmodels.tsa.seasonal import seasonal_decompose
26 | import matplotlib.pyplot as plt
27 | 
28 | decomposition = seasonal_decompose(df['Value'], model='additive', period=365)
29 | trend = decomposition.trend
30 | seasonal = decomposition.seasonal
31 | residual = decomposition.resid
32 | 
33 | # Plot the decomposition components
34 | plt.figure(figsize=(12, 8))
35 | plt.subplot(411)
36 | plt.plot(df['Date'], df['Value'], label='Original')
37 | plt.legend(loc='best')
38 | plt.subplot(412)
39 | plt.plot(df['Date'], trend, label='Trend')
40 | plt.legend(loc='best')
41 | plt.subplot(413)
42 | plt.plot(df['Date'], seasonal, label='Seasonal')
43 | plt.legend(loc='best')
44 | plt.subplot(414)
45 | plt.plot(df['Date'], residual, label='Residual')
46 | plt.legend(loc='best')
47 | plt.suptitle("Time Series Decomposition")
48 | plt.show()
49 | 


--------------------------------------------------------------------------------
/Chapter 10/time_series.R:
--------------------------------------------------------------------------------
 1 | # Generate a Random Time Series
 2 | # Set seed to make results reproducible
 3 | set.seed(123)
 4 | # Draw n = 25 random points from a Gaussian distribution (rnorm defaults: mean 0, sd 1)
 5 | n <- 25
 6 | x <- rnorm(n)
 7 | head(x)
 8 | 
 9 | # Make x a ts object
10 | ts_obj <- ts(x)
11 | # Inspect the object: class, structure, attributes, and a default plot
12 | class(ts_obj)
13 | str(ts_obj)
14 | attributes(ts_obj)
15 | plot(ts_obj)
16 | 
17 | # Change Start: same data, different start points and sampling frequencies
18 | ts(x, start = 1980)
19 | ts(x, start = c(1980, 05))
20 | ts(x, start = 1980, frequency = 12)
21 | ts(x, start = 1980, frequency = 12/3)  # 12/3 = 4 observations per unit of time
22 | # Change End: anchor the series by its last observation instead of its first
23 | ts(x, end = 2023)
24 | ts(x, end = 2023, frequency = 12)
25 | ts(x, end = 2023, frequency = 12/3)
26 | 
27 | # AirPassengers - Plotting, ACF/PACF
28 | library(readxl)
29 | library(writexl)
30 | 
31 | # Write the AirPassengers dataset to Excel as a data.frame (path is relative to the working directory)
32 | write_xlsx(AirPassengers |> as.data.frame(), "./Chapter 10/airpassengers.xlsx")
33 | 
34 | # Read airpassengers.xlsx back in and convert it to a ts object starting at 1949 with frequency 12
35 | ap_ts <- read_xlsx("./Chapter 10/airpassengers.xlsx") |>
36 |   ts(start = 1949, frequency = 12)
37 | 
38 | class(ap_ts)
39 | 
40 | # Plot the ts object
41 | plot(ap_ts)
42 | 
43 | # Decompose the series and plot the decomposition
44 | plot(decompose(ap_ts))
45 | 
46 | # Autocorrelation (ACF) and partial autocorrelation (PACF) plots
47 | acf(ap_ts)
48 | acf(ap_ts, type = "partial")
49 | 
50 | # Auto Arima Modeling
51 | library(healthyR.ts)
52 | library(dplyr)
53 | library(timetk)
54 | library(modeltime)
55 | # Convert the ts object to a table and drop its `index` column
56 | ap_tbl <- ts_to_tbl(ap_ts) |>
57 |   select(-index)
58 | 
59 | class(ap_tbl)
60 | 
61 | # Time Series Split on date_col, with an assessment window of 12 observations (assess = 12)
62 | splits <- time_series_split(
63 |   ap_tbl
64 |   , date_col
65 |   , assess = 12
66 |   , skip = 3
67 |   , cumulative = TRUE
68 | )
69 | 
70 | splits
71 | # Tune an auto ARIMA model. NOTE(review): the result is stored under the same name as the function it calls
72 | ts_auto_arima <- ts_auto_arima(
73 |   .data = ap_tbl,
74 |   .num_cores = 10,  # NOTE(review): hard-coded; confirm the machine has this many cores
75 |   .date_col = date_col,
76 |   .value_col = x,
77 |   .rsamp_obj = splits,
78 |   .formula = x ~ .,
79 |   .grid_size = 20,
80 |   .cv_slice_limit = 5,
81 |   .tune = TRUE
82 | )
83 | 
84 | # Brownian Motion: simulate the paths and pipe them into the plotting helper (columns t and y)
85 | ts_brownian_motion() |>
86 |   ts_brownian_motion_plot(t, y)
87 | 


--------------------------------------------------------------------------------
/Chapter 11/call_plumber.R:
--------------------------------------------------------------------------------
 1 | # Library Load
 2 | library(plumber)
 3 | 
 4 | # Build the path to the API definition, starting from the current working directory
 5 | wd <- getwd()
 6 | sub_dir <- "/Chapter 11/"
 7 | f <- "plumber_api.R"
 8 | full_dir <- paste0(wd, sub_dir)
 9 | f_path <- paste0(wd, sub_dir, f)
10 | 
11 | # Create the router from the file and print it
12 | root <- pr(f_path)
13 | root
14 | # Run the router
15 | pr_run(root)
16 | 


--------------------------------------------------------------------------------
/Chapter 11/fastapi_add.py:
--------------------------------------------------------------------------------
 1 | from fastapi import FastAPI, Query
 2 | 
 3 | app = FastAPI()
 4 | 
 5 | @app.get("/api/add")
 6 | def add_numbers(
 7 |     num1: int = Query(..., description="First number"),
 8 |     num2: int = Query(..., description="Second number"),
 9 | ):
10 |     result = num1 + num2
11 |     return {"result": result}
12 | 


--------------------------------------------------------------------------------
/Chapter 11/imgs/BERT_Console.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 11/imgs/BERT_Console.png


--------------------------------------------------------------------------------
/Chapter 11/imgs/BERT_Console_Excell_Addins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 11/imgs/BERT_Console_Excell_Addins.png


--------------------------------------------------------------------------------
/Chapter 11/imgs/BERT_VBA_to_R_density_plot_rnorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 11/imgs/BERT_VBA_to_R_density_plot_rnorm.png


--------------------------------------------------------------------------------
/Chapter 11/imgs/RAND.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 11/imgs/RAND.png


--------------------------------------------------------------------------------
/Chapter 11/matrix_multiplication.py:
--------------------------------------------------------------------------------
1 | import xlwings as xw
2 | import numpy as np
3 | 
4 | @xw.func
5 | @xw.arg('x', np.array, ndim=2)
6 | @xw.arg('y', np.array, ndim=2)
7 | def matrix_mult(x, y):
8 |     return x @ y
9 | 


--------------------------------------------------------------------------------
/Chapter 11/matrix_multiplication.xlsm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 11/matrix_multiplication.xlsm


--------------------------------------------------------------------------------
/Chapter 11/multiply.py:
--------------------------------------------------------------------------------
 1 | import xlwings as xw
 2 | 
 3 | 
 4 | def main():
 5 |     wb = xw.Book.caller()
 6 |     a = wb.sheets[0]['A1'].value
 7 |     b = wb.sheets[0]['B1'].value
 8 |     wb.sheets[0]['C1'].value = a * b
 9 |     pass
10 | 


--------------------------------------------------------------------------------
/Chapter 11/multiply.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 11/multiply.xlsx


--------------------------------------------------------------------------------
/Chapter 11/plumber_api.R:
--------------------------------------------------------------------------------
1 | #* Plot out data from a random normal distribution
2 | #* @param .mean The mean of the standard normal distribution
3 | #* @get /plot
4 | #* @serializer png
5 | function(.mean) {
6 |   mu <- as.numeric(.mean)  # coerce the request value to numeric before handing it to rnorm
7 |   hist(rnorm(n = 1000, mean = mu, sd = 1))  # histogram of 1000 draws with mean mu and sd 1
8 | }


--------------------------------------------------------------------------------
/Chapter 11/vba_plumber_curl_request.bas:
--------------------------------------------------------------------------------
 1 | Sub MakeCurlRequestAndInsertImage()
 2 |     ' Downloads a PNG from the local plumber API with curl and inserts it into Sheet1
 3 |     ' of the active workbook.
 4 | 
 5 |     ' File the downloaded image is saved to
 6 |     Dim imagePath As String
 7 |     imagePath = Environ("TEMP") & "\temp_image.png"
 8 | 
 9 |     ' Define the curl command (the output path is quoted in case TEMP contains spaces)
10 |     Dim curlCommand As String
11 |     curlCommand = "curl -X GET ""http://127.0.0.1:6855/plot?.mean=0"" -H ""accept: image/png"" -o """ & imagePath & """"
12 | 
13 |     ' Run curl hidden (0) and wait for it to finish (True). The built-in Shell function
14 |     ' returns immediately, so the picture could be inserted before the download completes.
15 |     Dim exitCode As Long
16 |     exitCode = CreateObject("WScript.Shell").Run("cmd /c " & curlCommand, 0, True)
17 |     If exitCode <> 0 Then
18 |         MsgBox "curl failed with exit code " & exitCode, vbExclamation
19 |         Exit Sub
20 |     End If
21 | 
22 |     ' Refer to the existing worksheet Sheet1
23 |     Dim ws As Worksheet
24 |     Set ws = ActiveWorkbook.Worksheets("Sheet1")
25 | 
26 |     ' Clear previous content in Sheet1
27 |     ws.Cells.Clear
28 | 
29 |     ' Insert the image into the worksheet
30 |     ws.Pictures.Insert(imagePath).Select
31 | End Sub
32 | 


--------------------------------------------------------------------------------
/Chapter 3/Sub_MultiplyByRandom.bas:
--------------------------------------------------------------------------------
 1 | Sub MultiplyByRandom()
 2 |     ' Multiplies each value in Sheet2!C3:C13 by a random number from Rnd() and
 3 |     ' writes the product into the cell immediately to its right.
 4 |     Dim rng As Range
 5 |     Dim cell As Range
 6 |     
 7 |     ' Seed the generator from the system timer; without this, Rnd repeats the
 8 |     ' same sequence in every new Excel session
 9 |     Randomize
10 |     
11 |     ' Set the range to the desired range on Sheet2
12 |     Set rng = Sheets("Sheet2").Range("C3:C13")
13 |     
14 |     ' Loop through each cell in the range
15 |     For Each cell In rng
16 |         ' Multiply the cell value by Rnd() and store the result in the adjacent cell
17 |         cell.Offset(0, 1).Value = cell.Value * Rnd()
18 |     Next cell
19 | End Sub


--------------------------------------------------------------------------------
/Chapter 3/executing_VBA.py:
--------------------------------------------------------------------------------
 1 | import win32com.client as win32
 2 | import os
 3 | 
 4 | excel_app = win32.Dispatch("Excel.Application")
 5 | 
 6 | path =  os.getcwd().replace('\'','\\') + '\\'
 7 | 
 8 | workbook = excel_app.Workbooks.Open(path+"iris_data.xlsm")
 9 | excel_app.Run("examplePythonVBA")
10 | workbook.Close(SaveChanges=True)
11 | excel_app.Quit()
12 | 


--------------------------------------------------------------------------------
/Chapter 3/interacting_Excel_objects.py:
--------------------------------------------------------------------------------
 1 | import win32com.client as win32
 2 | import os
 3 | 
 4 | excel_app = win32.Dispatch("Excel.Application")
 5 | path =  os.getcwd().replace('\'','\\') + '\\'
 6 | 
 7 | workbook = excel_app.Workbooks.Open(path+"iris_data.xlsx")
 8 | worksheet = workbook.Worksheets("Sheet1")
 9 | 
10 | data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
11 | for row_index, row_data in enumerate(data, start=1):
12 |     for col_index, value in enumerate(row_data, start=1):
13 |         worksheet.Cells(row_index, col_index).Value = value
14 | 
15 | workbook.Close(SaveChanges=True)
16 | excel_app.Quit()
17 | 


--------------------------------------------------------------------------------
/Chapter 3/mult_by_rand_ch3.xlsm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 3/mult_by_rand_ch3.xlsm


--------------------------------------------------------------------------------
/Chapter 3/retreiving_data.py:
--------------------------------------------------------------------------------
 1 | import win32com.client as win32
 2 | import os
 3 | 
 4 | excel_app = win32.Dispatch("Excel.Application")
 5 | path =  os.getcwd().replace('\'','\\') + '\\'
 6 | 
 7 | workbook = excel_app.Workbooks.Open(path+"iris_data.xlsx")
 8 | worksheet = workbook.Worksheets("Sheet1")
 9 | 
10 | # Access multiple cells using Range notation
11 | range_of_cells = worksheet.Range('A1:C3')
12 | 
13 | # Read the values from the range of cells
14 | values = range_of_cells.Value
15 | 
16 | workbook.Close(SaveChanges=False)
17 | excel_app.Quit()
18 | 
19 | print(values)
20 | 


--------------------------------------------------------------------------------
/Chapter 3/run_MultByRand_macro.R:
--------------------------------------------------------------------------------
 1 | # Load the library
 2 | library(RDCOMClient)
 3 | 
 4 | # Set file path: f_path is machine-specific and must point at the local copy
 5 | # of the repository; the workbook lives in the "Chapter 3" folder
 6 | f_path <- "C:/Users/steve/Documents/GitHub/Extending-Excel-with-Python-and-R/"
 7 | f_chapter <- "Chapter 3/"
 8 | f_name <- "mult_by_rand_ch3.xlsm"
 9 | f <- paste0(f_path, f_chapter, f_name)
10 | 
11 | # Fail early with a readable message instead of an opaque COM error
12 | if (!file.exists(f)) {
13 |   stop("Workbook not found: ", f)
14 | }
15 | 
16 | # Make Excel App, open the workbook and show the Excel window
17 | xl_app <- COMCreate("Excel.Application")
18 | xl_wkbk <- xl_app$Workbooks()$Open(f)
19 | xl_app[['Visible']] <- TRUE
20 | 
21 | macro_name <- "MultiplyByRandom"
22 | 
23 | # Run the macro
24 | xl_app$Run(macro_name)
25 | 
26 | # Save (TRUE = save changes) and Quit
27 | xl_wkbk$Close(TRUE)
28 | xl_app$Quit()
29 | 


--------------------------------------------------------------------------------
/Chapter 3/testing_environment.py:
--------------------------------------------------------------------------------
1 | import win32com.client as win32
2 | # Obtain an Excel Application object through COM
3 | excel_app = win32.Dispatch("Excel.Application")
4 | # NOTE(review): presumably this is the actual environment check, i.e. it fails unless Excel allows programmatic access to the VBA project -- confirm
5 | vba_interface = excel_app.VBE
6 | 
7 | 


--------------------------------------------------------------------------------
/Chapter 4/apscheduler.py:
--------------------------------------------------------------------------------
 1 | from apscheduler.schedulers.blocking import BlockingScheduler
 2 | 
 3 | # Create a scheduler instance
 4 | scheduler = BlockingScheduler()
 5 | 
 6 | # Define a task function
 7 | def send_email():
 8 |     # Code to send an email
 9 |     print("Email sent!")
10 | 
11 | # Schedule the task to run every hour
12 | scheduler.add_job(send_email, 'interval', hours=1)
13 | 
14 | # Start the scheduler
15 | scheduler.start()
16 | 


--------------------------------------------------------------------------------
/Chapter 4/case_study.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import smtplib
 3 | from email.mime.multipart import MIMEMultipart
 4 | from email.mime.text import MIMEText
 5 | 
 6 | def task1():
 7 |     # Simulate data processing for task 1
 8 |     print("Task 1 in progress...")
 9 |     # ... your code here ...
10 |     print("Task 1 completed successfully")
11 | 
12 | def task2():
13 |     # Simulate data processing for task 2
14 |     print("Task 2 in progress...")
15 |     # ... your code here ...
16 |     print("Task 2 completed successfully")
17 | 
18 | def send_email_notification(task_name, status):
19 |     sender_email = os.environ.get("from_email")
20 |     recipient_email = os.environ.get("to_email")
21 |     
22 |     # Create a multi-part email message
23 |     message = MIMEMultipart()
24 |     message["From"] = sender_email
25 |     message["To"] = recipient_email
26 |     
27 |     if status == "success":
28 |         subject = f"Task {task_name} completed successfully"
29 |         body = f"The task {task_name} has been completed successfully."
30 |     elif status == "error":
31 |         subject = f"Error in task {task_name}"
32 |         body = f"There was an error while executing task {task_name}. Please check the log files or attachments for more information."
33 |         
34 |         # Attach log files or other relevant attachments
35 |         attachment = MIMEText("... attachment content ...")
36 |         attachment.add_header("Content-Disposition", "attachment", filename="log.txt")
37 |         message.attach(attachment)
38 |     
39 |     message["Subject"] = subject
40 |     message.attach(MIMEText(body, "plain"))
41 |     
42 |     # Connect to the SMTP server and send the email
43 |     with smtplib.SMTP("smtp.example.com", 587) as server:
44 |         server.starttls()
45 |         server.login(sender_email, os.environ.get("password"))
46 |         server.send_message(message)
47 | 
48 | # Usage example
49 | task1()
50 | send_email_notification("task1", "success")
51 | 
52 | task2()
53 | send_email_notification("task2", "error")
54 | 


--------------------------------------------------------------------------------
/Chapter 4/get_user_input.R:
--------------------------------------------------------------------------------
 1 | # Install svDialogs only when it is missing, then load it
 2 | if (!requireNamespace("svDialogs", quietly = TRUE)) {
 3 |   install.packages("svDialogs")
 4 | }
 5 | library(svDialogs)
 6 | 
 7 | # Show an input dialog asking for the user's name
 8 | name <- dlg_input(message = "What is your name? ")
 9 | 
10 | # Print the name that the user entered
11 | print(name$res)
12 | 
13 | 


--------------------------------------------------------------------------------
/Chapter 4/hello_world.R:
--------------------------------------------------------------------------------
1 | library("tcltk")
2 | 
3 | # Text shown in the dialog: a greeting followed by the current system time
4 | msg <- paste0("Hello, it is: ", Sys.time())
5 | 
6 | # Show a message box titled 'Message' with a single OK button
7 | tkmessageBox(title = 'Message', message = msg, type = "ok")
8 | 


--------------------------------------------------------------------------------
/Chapter 4/hello_world_schedule.R:
--------------------------------------------------------------------------------
 1 | library(taskscheduleR)
 2 | 
 3 | # The scheduled task runs outside this R session, so hand over the full path
 4 | # of the script; mustWork = TRUE stops right here if the file is missing
 5 | script_path <- normalizePath("hello_world.R", mustWork = TRUE)
 6 | 
 7 | # Create a task scheduler job that runs the script every hour.
 8 | # `schedule` takes a keyword such as "HOURLY" or "DAILY" (not a cron
 9 | # expression), and the task name must not contain spaces.
10 | taskscheduler_create(
11 |   taskname = "Hello_World_Hourly",
12 |   rscript = script_path,
13 |   schedule = "HOURLY"
14 | )
15 | 
16 | # Create a task scheduler job that runs the script once a day at 10:00 AM
17 | taskscheduler_create(
18 |   taskname = "Hello_World_Daily",
19 |   rscript = script_path,
20 |   schedule = "DAILY",
21 |   starttime = "10:00"
22 | )


--------------------------------------------------------------------------------
/Chapter 4/install_taskscheduleR.R:
--------------------------------------------------------------------------------
1 | # The package itself
2 | install.packages("taskscheduleR")
3 | 
4 | # Extra packages, only needed if you want to use the GUI
5 | for (pkg in c('miniUI', 'shiny')) {
6 |   install.packages(pkg)
7 | }
8 | 


--------------------------------------------------------------------------------
/Chapter 4/schedule.py:
--------------------------------------------------------------------------------
 1 | import schedule
 2 | import time
 3 | 
 4 | def job():
 5 |     print("This job is executed every day at 8:00 AM.")
 6 | 
 7 | # Schedule the job to run every day at 8:00 AM
 8 | schedule.every().day.at("08:00").do(job)
 9 | 
10 | # Keep the program running
11 | while True:
12 |     schedule.run_pending()
13 |     time.sleep(1)
14 | 


--------------------------------------------------------------------------------
/Chapter 4/send_basic_email.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import smtplib
 3 | from email.mime.text import MIMEText
 4 | from email.mime.multipart import MIMEMultipart
 5 | 
 6 | # Define email server and credentials
 7 | smtp_server = 'smtp.gmail.com'
 8 | smtp_port = 587
 9 | smtp_username = 'your_username'
10 | smtp_password = 'your_password'
11 | 
12 | # Create a MIME message
13 | message = MIMEMultipart()
14 | message['From'] = 'sender@example.com'
15 | message['To'] = 'recipient@example.com'
16 | message['Subject'] = 'Test Email'
17 | 
18 | # Add the email body
19 | body = MIMEText('This is the email body.')
20 | message.attach(body)
21 | 
22 | # Establish a connection with the email server
23 | with smtplib.SMTP(smtp_server, smtp_port) as server:
24 |     # Start the TLS encryption
25 |     server.starttls()
26 |     
27 |     # Log in to the email server
28 |     server.login(smtp_username, smtp_password)
29 |     
30 |     # Send the email
31 |     server.send_message(message)
32 | 


--------------------------------------------------------------------------------
/Chapter 5/aligning_text_openpyxl.py:
--------------------------------------------------------------------------------
 1 | # OpenPyXL example for aligning text within cells
 2 | from openpyxl import Workbook
 3 | from openpyxl.styles import Alignment
 4 | 
 5 | wb = Workbook()
 6 | ws = wb.active
 7 | 
 8 | # Applying text alignment
 9 | alignment = Alignment(horizontal='center', vertical='center')
10 | ws['A1'].alignment = alignment
11 | 
12 | ws['A1'] = 'Name'
13 | ws['B1'] = 'Age'
14 | ws['C1'] = 'City'
15 | 
16 | wb.save('aligned_table_openpyxl.xlsx')
17 | 


--------------------------------------------------------------------------------
/Chapter 5/aligning_text_pandas.py:
--------------------------------------------------------------------------------
 1 | # Pandas example for aligning text within cells
 2 | import pandas as pd
 3 | 
 4 | data = {'Name': ['John', 'Alice', 'Michael'],
 5 |         'Age': [25, 30, 22],
 6 |         'City': ['New York', 'London', 'Paris']}
 7 | 
 8 | df = pd.DataFrame(data)
 9 | 
10 | # Applying text alignment
11 | alignment_styles = {'text-align': 'center'}
12 | styled_df = df.style.set_properties(subset=['Name', 'Age', 'City'], **alignment_styles)
13 | styled_df.to_excel('aligned_table_pandas.xlsx', index=False)
14 | 


--------------------------------------------------------------------------------
/Chapter 5/background_colors_openpyxl.py:
--------------------------------------------------------------------------------
 1 | # openpyxl example for cell background colors
 2 | from openpyxl import Workbook
 3 | from openpyxl.styles import PatternFill
 4 | 
 5 | wb = Workbook()
 6 | ws = wb.active
 7 | 
 8 | # Applying cell background colors
 9 | yellow_fill = PatternFill(start_color='FFFF00', end_color='FFFF00', fill_type='solid')
10 | ws['A1'].fill = yellow_fill
11 | 
12 | ws['A1'] = 'Name'
13 | ws['B1'] = 'Age'
14 | ws['C1'] = 'City'
15 | 
16 | wb.save('colored_table_openpyxl.xlsx')
17 | 


--------------------------------------------------------------------------------
/Chapter 5/background_colors_pandas.py:
--------------------------------------------------------------------------------
 1 | # Pandas example for cell background colors
 2 | import pandas as pd
 3 | 
 4 | data = {'Name': ['John', 'Alice', 'Michael'],
 5 |         'Age': [25, 30, 22],
 6 |         'City': ['New York', 'London', 'Paris']}
 7 | 
 8 | df = pd.DataFrame(data)
 9 | 
10 | # Create a styler object
11 | styled_df = df.style
12 | 
13 | # Define the style for the cells
14 | styled_df = styled_df.applymap(lambda _: 'background-color: yellow', subset=pd.IndexSlice[0, ['Name', 'Age']])
15 | 
16 | # Save the styled DataFrame to an Excel file
17 | styled_df.to_excel('colored_table_pandas.xlsx', index=False)
18 | 


--------------------------------------------------------------------------------
/Chapter 5/conditional_formatting.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import openpyxl
 3 | from openpyxl.formatting.rule import ColorScaleRule, CellIsRule, FormulaRule
 4 | 
 5 | # Create some sample data
 6 | data = {'Name': ['John', 'Alice', 'Michael', 'Emily'],
 7 |         'Age': [25, 30, 22, 28],
 8 |         'City': ['New York', 'London', 'Paris', 'Sydney'],
 9 |         'Sales': [1000, 800, 1200, 900]}
10 | 
11 | df = pd.DataFrame(data)
12 | 
13 | # Write the DataFrame to a worksheet
14 | df.to_excel("conditional_formatting.xlsx", index=False)
15 | 
16 | # Load the workbook
17 | wb = openpyxl.load_workbook('conditional_formatting.xlsx')
18 | ws = wb.active
19 | 
20 | # Define conditional formatting rules
21 | red_text_rule = CellIsRule(operator="lessThan", formula=["1000"], stopIfTrue=True, font=openpyxl.styles.Font(color="FF0000"))
22 | ws.conditional_formatting.add(f"D2:D{len(df)+1}", red_text_rule)
23 | 
24 | # Define the condition for the green fill color scale
25 | min_sales = min(df['Age'])
26 | max_sales = max(df['Age'])
27 | 
28 | green_fill_rule = ColorScaleRule(
29 |     start_type='num', start_value=min_sales, start_color='0000FF00',
30 |     end_type='num', end_value=max_sales, end_color='00FFFF00')
31 | 
32 | ws.conditional_formatting.add(f"B2:B{len(df)+1}", green_fill_rule)
33 | 
34 | # Save the Excel workbook
35 | wb.save('conditional_formatting.xlsx')
36 | 


--------------------------------------------------------------------------------
/Chapter 5/font_properties_openpyxl.py:
--------------------------------------------------------------------------------
 1 | # OpenPyXL example for setting font properties
 2 | from openpyxl import Workbook
 3 | from openpyxl.styles import Font
 4 | 
 5 | wb = Workbook()
 6 | ws = wb.active
 7 | 
 8 | # Applying font properties
 9 | font = Font(size=14, bold=True, italic=True, color='0000FF')
10 | ws['A1'].font = font
11 | 
12 | ws['A1'] = 'Name'
13 | ws['B1'] = 'Age'
14 | ws['C1'] = 'City'
15 | 
16 | wb.save('styled_table_openpyxl.xlsx')
17 | 


--------------------------------------------------------------------------------
/Chapter 5/font_properties_pandas.py:
--------------------------------------------------------------------------------
 1 | # Pandas example for setting font properties
 2 | import pandas as pd
 3 | 
 4 | data = {'Name': ['John', 'Alice', 'Michael'],
 5 |         'Age': [25, 30, 22],
 6 |         'City': ['New York', 'London', 'Paris']}
 7 | 
 8 | df = pd.DataFrame(data)
 9 | 
10 | # Define a function to apply font properties
11 | def apply_font_properties(value):
12 |     return 'font-weight: bold; font-size: 14px; font-style: italic; color: blue'
13 | 
14 | # Applying font properties
15 | styled_df = df.style.applymap(apply_font_properties, subset='Name')
16 | 
17 | # Save the styled DataFrame to an Excel file
18 | styled_df.to_excel('styled_table_pandas.xlsx', index=False)
19 | 


--------------------------------------------------------------------------------
/Chapter 5/heatmap.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import openpyxl
 3 | from openpyxl.utils.dataframe import dataframe_to_rows
 4 | from openpyxl.formatting.rule import ColorScaleRule
 5 | 
 6 | # Sample data for the heatmap
 7 | data = {
 8 |     'Category': ['A', 'B', 'C', 'D'],
 9 |     'Jan': [10, 20, 30, 40],
10 |     'Feb': [15, 25, 35, 45],
11 |     'Mar': [12, 22, 32, 42],
12 |     'Apr': [18, 28, 38, 48]
13 | }
14 | 
15 | # Convert data to a pandas DataFrame
16 | df = pd.DataFrame(data)
17 | 
18 | # Write the DataFrame to a worksheet
19 | df.to_excel("heatmap_with_conditional_formatting.xlsx", index=False)
20 | 
21 | # Load the workbook
22 | wb = openpyxl.load_workbook('heatmap_with_conditional_formatting.xlsx')
23 | ws = wb.active
24 | 
25 | # Define the range for conditional formatting (excluding the 'Category' column)
26 | data_range = f'B2:E{len(df) + 1}'  # Adjust the range based on the DataFrame size
27 | 
28 | # Apply color scale conditional formatting to the range
29 | color_scale_rule = ColorScaleRule(start_type='min', start_color='FFFFFF', end_type='max', end_color='FF0000')
30 | ws.conditional_formatting.add(data_range, color_scale_rule)
31 | 
32 | # Save the workbook
33 | wb.save('heatmap_with_conditional_formatting.xlsx')
34 | 


--------------------------------------------------------------------------------
/Chapter 5/install_ch5_packages.R:
--------------------------------------------------------------------------------
 1 | # styledTables: installed from GitHub, which needs devtools first
 2 | install.packages("devtools")
 3 | 
 4 | # Install development version from GitHub
 5 | devtools::install_github("R-package/styledTables", build_opts = NULL)
 6 | 
 7 | # tidyxl and basictabler
 8 | for (pkg in c('tidyxl', 'basictabler')) {
 9 |   install.packages(pkg)
10 | }
11 | 


--------------------------------------------------------------------------------
/Chapter 5/pivot_table.py:
--------------------------------------------------------------------------------
 1 | # Import the required modules from the `win32com.client` package:
 2 | import win32com.client as win32
 3 | 
 4 | # Obtain the Excel application through COM and make its window visible:
 5 | excel = win32.Dispatch('Excel.Application')
 6 | excel.Visible = True
 7 | 
 8 | # Create a new workbook or open an existing one:
 9 | workbook = excel.Workbooks.Add()  # Create a new workbook
10 | # Or to open an existing workbook:
11 | # workbook = excel.Workbooks.Open('path/to/your/workbook.xlsx')
12 | 
13 | # Get the reference to the sheet that holds the source data for the Pivot Table:
14 | sheet = workbook.ActiveSheet  # Get the active sheet
15 | # Or specify the sheet by its name:
16 | # sheet = workbook.Sheets('Sheet1')
17 | 
18 | # Populate the data into the sheet (optional, if you have data to analyze):
19 | # Sample data: the first row is the header, each later row is one record
20 | data = [
21 |     ['Product', 'Category', 'Sales'],
22 |     ['Product A', 'Category 1', 100],
23 |     ['Product B', 'Category 2', 200],
24 |     ['Product C', 'Category 1', 150],
25 |     ['Product D', 'Category 2', 50],
26 |     # Add more data rows here...
27 | ]
28 | 
29 | # Write the data to the sheet, cell by cell (row and column indices start at 1)
30 | for row_index, row in enumerate(data, start=1):
31 |     for col_index, value in enumerate(row, start=1):
32 |         sheet.Cells(row_index, col_index).Value = value
33 | 
34 | # Add a new worksheet to the workbook to hold the Pivot Table:
35 | pivot_table_sheet = workbook.Worksheets.Add()
36 | pivot_table_sheet.Name = 'Pivot Table'
37 | 
38 | # Create a Pivot Cache using the data range (SourceType=1 is Excel's xlDatabase constant):
39 | pivot_cache = workbook.PivotCaches().Create(SourceType=1, SourceData=sheet.UsedRange)
40 | 
41 | # Create the Pivot Table on the new sheet using the Pivot Cache, anchored at row 3, column 1:
42 | pivot_table = pivot_cache.CreatePivotTable(TableDestination=pivot_table_sheet.Cells(3, 1), TableName='MyPivotTable')
43 | 
44 | # Add fields to the Pivot Table, specifying their orientation (rows, columns, data, etc.):
45 | pivot_table.PivotFields('Product').Orientation = 1 # row field (xlRowField)
46 | pivot_table.PivotFields('Category').Orientation = 2 # column field (xlColumnField)
47 | pivot_table.PivotFields('Sales').Orientation = 4 # data field (xlDataField)
48 | 
49 | # Control row and column grandtotals
50 | pivot_table.ColumnGrand = True
51 | pivot_table.RowGrand = True
52 | 
53 | # Decide which fields have Subtotals (each field is given a list of 12 boolean flags)
54 | pivot_table.PivotFields('Sales').Subtotals = [False]*12
55 | pivot_table.PivotFields('Product').Subtotals = [False]*12
56 | pivot_table.PivotFields('Category').Subtotals = [True]*12
57 | 
58 | # Customize labels and styles
59 | pivot_table.ShowTableStyleRowStripes = False
60 | pivot_table.PivotFields('Product').Caption = 'Product Name'
61 | pivot_table.PivotFields('Sales').NumberFormat = '#,##0'
62 | pivot_table.PivotFields('Sales').Caption = 'Total Sales'
63 | # NOTE(review): the relative path below is resolved by Excel, presumably against Excel's own current folder rather than Python's working directory -- confirm or pass an absolute path
64 | # Save the workbook and close Excel:
65 | workbook.SaveAs('./pivot_table.xlsx')
66 | workbook.Close()
67 | excel.Quit()
68 | 


--------------------------------------------------------------------------------
/Chapter 5/using_basictabler.R:
--------------------------------------------------------------------------------
 1 | library(basictabler)
 2 | 
 3 | # Create a data frame with two example rows
 4 | data <- data.frame(
 5 |   name = c("John Doe", "Jane Doe"),
 6 |   age = c(30, 25),
 7 |   salary = c(100000, 50000)
 8 | )
 9 | 
10 | # Plain table: quick table built from the data frame with the "largeplain" theme
11 | table_plain <- qhtbl(data, theme = "largeplain")
12 | table_plain
13 | 
14 | # Same quick table, now with custom table, heading and cell styles (maroon / cornsilk)
15 | table <- qhtbl(data,
16 |   theme = "largeplain",
17 |   tableStyle = list("border-color" = "maroon"),
18 |   headingStyle = list(
19 |     "color" = "cornsilk", "background-color" = "maroon",
20 |     "font-style" = "italic", "border-color" = "maroon"
21 |   ),
22 |   cellStyle = list(
23 |     "color" = "maroon", "background-color" = "cornsilk",
24 |     "border-color" = "maroon"
25 |   )
26 | )
27 | 
28 | # Render the table to HTML
29 | table
30 | 
31 | # A longer example: table built from the output of tidy_normal(.n = 10)
32 | library(TidyDensity)
33 | tn <- tidy_normal(.n = 10)
34 | 
35 | tbl <- BasicTable$new()
36 | # formatting values (explained in the introduction vignette): one entry per column, NULL = no format
37 | columnFormats <- list(
38 |   NULL,
39 |   NULL,
40 |   "%.4f",
41 |   "%.4f",
42 |   "%.4f",
43 |   "%.4f",
44 |   "%.4f"
45 | )
46 | tbl$addData(tn,
47 |   firstColumnAsRowHeaders = TRUE,
48 |   explicitColumnHeaders = c("Simulation", "x", "y", "dx", "dy", "p", "q"),
49 |   columnFormats = columnFormats
50 | )
51 | tbl$renderTable()
52 | 
53 | # Add some conditional formatting to the cells in rows 2-11, columns 3-7
54 | cells <- tbl$getCells(rowNumbers = 2:11, columnNumbers = 3:7, matchMode = "combinations")
55 | # Each mapping pairs a condition on the cell value v with a background colour
56 | tbl$mapStyling(
57 |   cells = cells, styleProperty = "background-color", valueType = "color",
58 |   mapType = "logic",
59 |   mappings = list(
60 |     "v<=-3", "red",
61 |     "-3<v<=-2", "orange",
62 |     "-2<v<=-1", "pink",
63 |     "-1<v<= 0", "white",
64 |     "0<v<=1", "white",
65 |     "1<v<=2", "lightgreen",
66 |     "2<v<=3", "lightblue",
67 |     "3<v", "green"
68 |   )
69 | )
70 | 
71 | tbl$renderTable()
72 | 
73 | # Write styled table out to excel
74 | library(openxlsx)
75 | 
76 | # Create Workbook
77 | wb <- createWorkbook()
78 | # Add a sheet called Data
79 | addWorksheet(wb, "Data")
80 | # Use basictabler to write the tbl to excel, starting at the top-left cell and keeping its styles
81 | tbl$writeToExcelWorksheet(
82 |   wb = wb, 
83 |   wsName = "Data",
84 |   topRowNumber = 1, 
85 |   leftMostColumnNumber = 1, 
86 |   applyStyles = TRUE
87 | )
88 | # Use openxlsx to save the file (NOTE(review): the relative path assumes a chapter5/ folder under the working directory, while the repository folder is named "Chapter 5" -- confirm)
89 | saveWorkbook(
90 |   wb, 
91 |   file="chapter5/basictabler_excel.xlsx", 
92 |   overwrite = TRUE
93 | )
94 | 
95 | 


--------------------------------------------------------------------------------
/Chapter 5/using_styledTables.R:
--------------------------------------------------------------------------------
 1 | library(TidyDensity)
 2 | library(styledTables)
 3 | library(xlsx)
 4 | # Style the tidy_normal() output: borders and bold on row 1, fill colour #00FF00 in column 3 where the value is >= 0.5
 5 | st <- tidy_normal() |>
 6 |   styled_table(keep_header = TRUE) |>
 7 |   set_border_position("all", row_id = 1) |>
 8 |   set_bold(row_id = 1) |>
 9 |   set_fill_color("#00FF00", col_id = 3, condition = X >= 0.5)
10 | 
11 | # open new xlsx workbook and create a worksheet named "tidy_normal"
12 | wb <- createWorkbook()
13 | sheet <- createSheet(wb, "tidy_normal")
14 | 
15 | # insert the styled table in the worksheet
16 | write_excel(sheet, st)
17 | 
18 | # save the workbook (NOTE(review): the relative path assumes a chapter5/ folder under the working directory, while the repository folder is named "Chapter 5" -- confirm)
19 | saveWorkbook(wb, "chapter5/styledTables_test.xlsx")
20 | 


--------------------------------------------------------------------------------
/Chapter 6/ch6_barplot.R:
--------------------------------------------------------------------------------
 1 | library(healthyR.data)
 2 | library(healthyR)
 3 | library(ggplot2)
 4 | library(dplyr)
 5 | library(forcats)
 6 | library(purrr)
 7 | 
 8 | # Drop the unknown ('?') payer, count with category_counts_tbl() (presumably
 9 | # per payer grouping within each ip_op_flag), then build a per-group sort key
10 | # of the form "<rank> - <payer>" so bars can be ordered inside each facet
11 | df <- healthyR_data |>
12 |   filter(payer_grouping != '?') |>
13 |   category_counts_tbl(
14 |     .count_col = payer_grouping
15 |     , .arrange = TRUE
16 |     , ip_op_flag
17 |   ) |>
18 |   group_by(ip_op_flag) |>
19 |   mutate(order_var = paste0(
20 |     sprintf("%02i", as.integer(rank(n))),
21 |     " - ",
22 |     payer_grouping
23 |     )) |>
24 |   ungroup()
25 | 
26 | # Bars are positioned by the sort key but labelled with the plain payer name
27 | axis_labels <- with(df, as.character(payer_grouping) |>
28 |                       set_names(order_var))
29 | 
30 | ggplot(df, aes(x = order_var, y = n)) +
31 |   geom_col(alpha = 0.328) +
32 |   labs(x = NULL, y = "") +
33 |   facet_wrap(~ ip_op_flag, scales = "free") +
34 |   scale_x_discrete(labels = axis_labels) +
35 |   coord_flip() +
36 |   # theme_minimal() is a complete theme and replaces earlier theme() settings,
37 |   # so it has to be added before the individual tweaks below
38 |   theme_minimal() +
39 |   theme(
40 |     legend.position = "none",
41 |     axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)
42 |   )
43 | 


--------------------------------------------------------------------------------
/Chapter 6/ch6_cowplot.R:
--------------------------------------------------------------------------------
 1 | # Install Libraries (this runs on every execution of the script)
 2 | install.packages("ggplot2")
 3 | install.packages("cowplot")
 4 | 
 5 | # Load required libraries
 6 | library(ggplot2)
 7 | library(cowplot)
 8 | 
 9 | # Load the Iris dataset
10 | data(iris)
11 | 
12 | # Create separate histograms for each species, stored in a list keyed by species name
13 | histograms <- list()
14 | for (species in unique(iris$Species)) {
15 |   data_subset <- iris[iris$Species == species, ]
16 |   
17 |   histogram <- ggplot(data_subset, aes(x = Sepal.Width)) +
18 |     geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") +
19 |     labs(title = paste("Sepal Width Histogram for", species)) +
20 |     labs(x = "", y = "") +
21 |     theme_minimal()
22 |   
23 |   histograms[[species]] <- histogram
24 | }
25 | 
26 | # Create histogram for all species combined
27 | all_species_hist <- ggplot(iris, aes(x = Sepal.Width)) +
28 |   geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") +
29 |   labs(title = "Sepal Width Histogram for All Species") +
30 |   theme_minimal()
31 | 
32 | # Arrange the three species histograms and the combined one in two columns using cowplot
33 | plot_grid(
34 |   histograms[["setosa"]], 
35 |   histograms[["versicolor"]], 
36 |   histograms[["virginica"]], 
37 |   all_species_hist,
38 |   ncol = 2,
39 |   align = "hv"
40 | )
41 | # Alternative to the for loop above using lapply(); this assignment replaces the earlier `histograms` list
42 | histograms <- lapply(unique(iris$Species), function(species) {
43 |   data_subset <- iris[iris$Species == species, ]
44 |   
45 |   histogram <- ggplot(data_subset, aes(x = Sepal.Width)) +
46 |     geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") +
47 |     labs(title = paste("Sepal Width Histogram for", species)) +
48 |     labs(x = "", y = "") +
49 |     theme_minimal()
50 |   
51 |   return(histogram)
52 | })
53 | # Print the list of plots produced by lapply()
54 | histograms
55 | 


--------------------------------------------------------------------------------
/Chapter 6/ch6_dumbell_plot.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | library(dplyr)
 3 | 
 4 | # Sample data: an initial and a final value for each category
 5 | data <- data.frame(
 6 |   Category = c("A", "B", "C", "D"),
 7 |   Initial = c(10, 15, 8, 12),
 8 |   Final = c(18, 22, 14, 16)
 9 | )
10 | 
11 | # Calculate the midpoint of Initial and Final, used for the green marker and its label
12 | data <- data %>%
13 |   mutate(Midpoint = (Initial + Final) / 2)
14 | 
15 | # Create the dumbbell plot using ggplot2: one vertical segment per category from Initial to Final
16 | dumbbell_plot <- ggplot(data, aes(x = Category, xend = Category, 
17 |                                   y = Initial, yend = Final)) +
18 |   geom_segment(color = "gray50") +  # Lines connecting dots
19 |   geom_point(color = "blue", size = 3) +  # Initial values
20 |   geom_point(aes(y = Final), color = "orange", size = 3) +  # Final values
21 |   geom_point(aes(y = Midpoint), color = "green", size = 3) + # Midpoint Values
22 |   geom_text(aes(label = Midpoint), 
23 |             y = data$Midpoint, vjust = -.5, size = 3) +  # Midpoint labels
24 |   labs(title = "Dumbbell Plot",
25 |        x = "Category",
26 |        y = "Values") +
27 |   theme_minimal()
28 | 
29 | # Print the plot
30 | dumbbell_plot
31 | 


--------------------------------------------------------------------------------
/Chapter 6/ch6_ggplot2.R:
--------------------------------------------------------------------------------
 1 | install.packages("ggplot2")
 2 | library(ggplot2)
 3 | 
 4 | # Make a histogram of the sepal width for all species (base R hist())
 5 | hist(iris$Sepal.Width)
 6 | 
 7 | # Make a histogram of the sepal width for each species (base R, 2 x 2 panel layout)
 8 | par(mfrow = c(2,2))
 9 | for (species in unique(iris$Species)) {
10 |   hist(iris$Sepal.Width[iris$Species == species], main = species,
11 |        xlab = species)
12 | }
13 | hist(iris$Sepal.Width, main = "All Species")
14 | par(mfrow = c(1,1))
15 | 
16 | # Make a histogram of the sepal width for all species (ggplot2)
17 | iris |>
18 | ggplot(aes(x = Sepal.Width)) + 
19 |   geom_histogram(alpha = 0.328) +
20 |   theme_minimal()
21 | 
22 | # Make a histogram of the sepal width with the fill colour mapped to species (single panel)
23 | iris |>
24 | ggplot(aes(x = Sepal.Width, fill = Species)) + 
25 |   geom_histogram(alpha = 0.328) +
26 |   theme_minimal()
27 | 
28 | # Make a histogram of the sepal width for each species and facet them (one panel per species, free scales)
29 | iris |>
30 |   ggplot(aes(x = Sepal.Width, fill = Species)) +
31 |   geom_histogram(alpha = 0.328) +
32 |   facet_wrap(~ Species, scales = "free") +
33 |   theme_minimal()
34 | 


--------------------------------------------------------------------------------
/Chapter 6/ch6_timeseries.R:
--------------------------------------------------------------------------------
1 | plot.ts(AirPassengers)  # plot the AirPassengers series
2 | plot(decompose(AirPassengers))  # plot the result of decompose() on the same series
3 | # healthyR.ts provides the two ts_brownian_motion* functions used below
4 | library(healthyR.ts)
5 | # Generate data with ts_brownian_motion() (default arguments) and plot it, passing columns t and y
6 | ts_brownian_motion() |>
7 |   ts_brownian_motion_plot(t, y)
8 | 


--------------------------------------------------------------------------------
/Chapter 6/insert_image_pywin32.py:
--------------------------------------------------------------------------------
 1 | import win32com.client as win32
 2 | 
 3 | # Initialize Excel through COM (EnsureDispatch) and make its window visible
 4 | excel = win32.gencache.EnsureDispatch('Excel.Application')
 5 | excel.Visible = True
 6 | 
 7 | # Create a new workbook
 8 | workbook = excel.Workbooks.Add()
 9 | 
10 | # Define the image path (placeholder value: replace it with the path of a real image file)
11 | image_path = 'path\\to\\your\\image.png'
12 | 
13 | # Pick the sheet and the anchor cell for the image
14 | sheet = workbook.ActiveSheet
15 | cell = sheet.Range("A1")  # You can specify the cell where you want to insert the image
16 | 
17 | # Add the image to the worksheet: saved with the document (not linked), 300 x 200, positioned at the anchor cell's Left/Top
18 | sheet.Shapes.AddPicture(image_path, LinkToFile=False, SaveWithDocument=True, Left=cell.Left, Top=cell.Top, Width=300, Height=200)
19 | 
20 | # Save the workbook (NOTE(review): a relative name is resolved by Excel, presumably against Excel's own current folder -- confirm or pass an absolute path)
21 | workbook.SaveAs('your_excel_with_image.xlsx')
22 | 
23 | # Close Excel
24 | excel.Application.Quit()
25 | 


--------------------------------------------------------------------------------
/Chapter 6/matplotlib_basics.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import pandas as pd
  3 | import matplotlib.pyplot as plt
  4 | 
  5 | ### scatter plot
  6 | data = {
  7 |     'Height': [155, 162, 168, 173, 179],
  8 |     'Weight': [50, 56, 61, 65, 72]
  9 | }
 10 | 
 11 | df = pd.DataFrame(data)
 12 | 
 13 | # Create a scatter plot
 14 | df.plot.scatter(x='Height', y='Weight', title='Scatter Plot of Height vs. Weight')
 15 | 
 16 | # Save the plot to a file (e.g., .png)
 17 | plt.savefig('matplotlib_scatter_plot.png')
 18 | 
 19 | # Show the plot
 20 | plt.show()
 21 | 
 22 | 
 23 | ### bar chart
 24 | 
 25 | data = {'Category': ['A', 'B', 'C', 'D', 'E'],
 26 |         'Values': [15, 28, 24, 20, 32]}
 27 | 
 28 | df = pd.DataFrame(data)
 29 | 
 30 | # Create a basic bar chart
 31 | plt.figure(figsize=(8, 6))
 32 | plt.bar(df['Category'], df['Values'], color='skyblue')
 33 | plt.xlabel('Categories')
 34 | plt.ylabel('Values')
 35 | plt.title('Basic Bar Chart')
 36 | 
 37 | # Save the plot to a file (e.g., .png)
 38 | plt.savefig('matplotlib_bar_chart.png')
 39 | 
 40 | plt.show()
 41 | 
 42 | ### histogram
 43 | 
 44 | # Generate some random data for the histogram
 45 | data = numpy.random.normal(0, 1, 1000)
 46 | 
 47 | import matplotlib.pyplot as plt
 48 | 
 49 | # Create a basic histogram
 50 | plt.figure(figsize=(8, 6))
 51 | plt.hist(data, bins=20, color='lightblue', edgecolor='black')
 52 | plt.xlabel('Values')
 53 | plt.ylabel('Frequency')
 54 | plt.title('Basic Histogram')
 55 | 
 56 | # Save the plot to a file (e.g., .png)
 57 | plt.savefig('matplotlib_histogram.png')
 58 | 
 59 | plt.show()
 60 | 
 61 | ### box plot
 62 | 
 63 | # Generate some random data for the box plot
 64 | data = [numpy.random.normal(0, 1, 100) for _ in range(3)]  # Three sets of random data
 65 | 
 66 | # Create a basic box plot
 67 | plt.figure(figsize=(8, 6))
 68 | plt.boxplot(data, vert=False, labels=['Set 1', 'Set 2', 'Set 3'])
 69 | plt.xlabel('Values')
 70 | plt.ylabel('Data Sets')
 71 | plt.title('Basic Box Plot')
 72 | 
 73 | # Save the plot to a file (e.g., .png)
 74 | plt.savefig('matplotlib_boxplot.png')
 75 | 
 76 | plt.show()
 77 | 
 78 | ### heatmap
 79 | 
 80 | # Generate some random data for the heatmap
 81 | data = numpy.random.rand(5, 5)  # Create a 5x5 matrix of random values
 82 | 
 83 | # Create a heatmap
 84 | plt.figure(figsize=(8, 6))
 85 | heatmap = plt.imshow(data, cmap='viridis', interpolation='nearest')
 86 | plt.colorbar(heatmap)
 87 | plt.title('Heatmap Example')
 88 | 
 89 | # Save the plot to a file (e.g., .png)
 90 | plt.savefig('matplotlib_heatmap.png')
 91 | 
 92 | plt.show()
 93 | 
 94 | ### violinplot
 95 | 
 96 | # Generate some random data for the violin plot
 97 | data = [numpy.random.normal(0, std, 100) for std in range(1, 4)]
 98 | 
 99 | # Create a violin plot
100 | plt.figure(figsize=(8, 6))
101 | plt.violinplot(data, showmedians=True)
102 | plt.title('Violin Plot Example')
103 | plt.xticks([1, 2, 3], ['Group 1', 'Group 2', 'Group 3'])
104 | plt.xlabel('Groups')
105 | plt.ylabel('Values')
106 | 
107 | # Save the plot to a file (e.g., .png)
108 | plt.savefig('matplotlib_violinplot.png')
109 | 
110 | plt.show()
111 | 


--------------------------------------------------------------------------------
/Chapter 6/matplotlib_customizations.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | 
 3 | ### labels and titles
 4 | 
 5 | # Sample data
 6 | x = [1, 2, 3, 4, 5]
 7 | y = [10, 20, 25, 30, 35]
 8 | 
 9 | # Create a scatter plot
10 | plt.scatter(x, y)
11 | 
12 | # Customize labels and titles
13 | plt.xlabel('X-axis Label')
14 | plt.ylabel('Y-axis Label')
15 | plt.title('Custom Title')
16 | plt.suptitle('Subtitle for Additional Context')
17 | 
18 | # Save the plot to a file (e.g., .png)
19 | plt.savefig('matplotlib_labels.png')
20 | 
21 | # Display the plot
22 | plt.show()
23 | 
24 | ### axes and legend
25 | 
26 | # Sample data
27 | x = [1, 2, 3, 4, 5]
28 | y = [10, 20, 25, 30, 35]
29 | 
30 | # Create a line plot
31 | plt.plot(x, y, label='Data Series A')
32 | 
33 | # Customize axes and legend
34 | plt.xlim(0, 6)
35 | plt.ylim(0, 40)
36 | plt.xticks([1, 2, 3, 4, 5])
37 | plt.yticks([0, 10, 20, 30, 40])
38 | plt.legend()
39 | 
40 | # Save the plot to a file (e.g., .png)
41 | plt.savefig('matplotlib_axes_legends.png')
42 | 
43 | # Display the plot
44 | plt.show()
45 | 
46 | ### themes
47 | 
48 | # Apply a different theme
49 | plt.style.use('ggplot')
50 | 
51 | # Sample data and plot
52 | x = [1, 2, 3, 4, 5]
53 | y = [10, 20, 25, 30, 35]
54 | plt.plot(x, y)
55 | 
56 | # Save the plot to a file (e.g., .png)
57 | plt.savefig('matplotlib_themes.png')
58 | 
59 | # Display the plot
60 | plt.show()
61 | 
62 | ### text formatting
63 | 
64 | # Sample data and plot
65 | x = [1, 2, 3, 4, 5]
66 | y = [10, 20, 25, 30, 35]
67 | plt.plot(x, y)
68 | 
69 | # Customize text formatting
70 | plt.title('Custom Title', fontsize=16, fontweight='bold', color='blue')
71 | plt.xlabel('X-axis Label', fontsize=12, fontstyle='italic', color='green')
72 | plt.ylabel('Y-axis Label', fontsize=12, fontweight='bold', color='red')
73 | 
74 | # Save the plot to a file (e.g., .png)
75 | plt.savefig('matplotlib_text_formatting.png')
76 | 
77 | # Display the plot
78 | plt.show()
79 | 


--------------------------------------------------------------------------------
/Chapter 6/plotnine_additional_layers.py:
--------------------------------------------------------------------------------
 1 | from plotnine import ggplot, aes, geom_line, geom_point, geom_errorbar, position_dodge, geom_text, labs, theme_minimal, geom_smooth
 2 | import pandas
 3 | import numpy
 4 | 
 5 | ### Error bars
 6 | # Sample data
 7 | data = pandas.DataFrame({
 8 |     'x': [1, 2, 3, 4, 5],
 9 |     'y': [10, 15, 8, 12, 18],
10 |     'group': ['A', 'A', 'B', 'B', 'C'],
11 |     'error': [1, 2, 1.5, 1, 2.5],
12 |     'label_x': [2, 4, 3, 1, 5],
13 |     'label_y': [16, 11, 6, 13, 17],
14 |     'annotation_text': ['Peak', 'Valley', 'Low', 'High', 'Bottom']
15 | })
16 | 
17 | # Create a ggplot object
18 | gg = ggplot(data, aes(x='x', y='y', group='group')) + \
19 |     geom_line() + \
20 |     geom_point() + \
21 |     geom_errorbar(aes(ymin='y - error', ymax='y + error'), width=0.1, size=0.5, position=position_dodge(width=0.2)) + \
22 |     geom_text(aes(x='label_x', y='label_y', label='annotation_text'), size=10)
23 | 
24 | # Draw the plot
25 | print(gg)
26 | 
27 | ### Trendline
28 | # Sample data
29 | data = pandas.DataFrame({
30 |     'X': numpy.arange(1, 21),
31 |     'Y': numpy.random.randint(1, 101, size=20)
32 | })
33 | 
34 | # Create a base plot
35 | gg = (ggplot(data, aes(x='X', y='Y')) +
36 |       geom_point() +
37 |       labs(title='Scatter Plot with Trendline') +
38 |       theme_minimal()
39 |      )
40 | 
41 | # Add a trendline
42 | gg = gg + geom_smooth(method='lm', se=False, linetype='dashed', color='red', size=1)
43 | 
44 | print(gg)
45 | 
46 | ### Annotations
47 | # Sample data
48 | data = pandas.DataFrame({
49 |     'X': numpy.arange(1, 11),
50 |     'Y': numpy.random.randint(1, 101, size=10)
51 | })
52 | 
53 | # Create a base plot
54 | gg = (ggplot(data, aes(x='X', y='Y')) +
55 |       geom_point() +
56 |       labs(title='Scatter Plot with Annotations') +
57 |       theme_minimal()
58 |      )
59 | 
60 | # Add an annotation
61 | gg = gg + geom_text(aes(label='Y'), nudge_y=5, color='blue')
62 | 
63 | print(gg)
64 | 


--------------------------------------------------------------------------------
/Chapter 6/plotnine_basics.py:
--------------------------------------------------------------------------------
 1 | # Scatter Plot:
 2 | 
 3 | from plotnine import ggplot, aes, geom_point, geom_bar, geom_histogram, geom_boxplot, geom_tile, geom_violin, theme_minimal, labs
 4 | import pandas
 5 | 
 6 | # Sample data
 7 | data = pandas.DataFrame({'x': [1, 2, 3, 4, 5],
 8 |                          'y': [2, 4, 1, 3, 5]})
 9 | 
10 | # Create a scatter plot
11 | gg = ggplot(aes(x='x', y='y'), data) + geom_point()
12 | print(gg)
13 | 
14 | 
15 | # Bar Chart:
16 | 
17 | # Sample data
18 | data = pandas.DataFrame({'category': ['A', 'B', 'C', 'D'],
19 |                          'value': [10, 25, 15, 30]})
20 | 
21 | # Create a bar chart
22 | gg = ggplot(aes(x='category', y='value'), data) + geom_bar(stat='identity')
23 | print(gg)
24 | 
25 | 
26 | # Histogram:
27 | 
28 | # Sample data
29 | data = pandas.DataFrame({'values': [1, 2, 2, 3, 3, 3, 4, 4, 5]})
30 | 
31 | # Create a histogram
32 | gg = ggplot(aes(x='values'), data) + geom_histogram(binwidth=1, fill='blue', color='black')
33 | print(gg)
34 | 
35 | 
36 | # Box Plot:
37 | 
38 | # Sample data
39 | data = pandas.DataFrame({'category': ['A', 'A', 'B', 'B', 'C', 'C'],
40 |                          'value': [10, 15, 20, 25, 30, 35]})
41 | 
42 | # Create a box plot
43 | gg = ggplot(aes(x='category', y='value'), data) + geom_boxplot()
44 | print(gg)
45 | 
46 | 
47 | # Heatmap:
48 | 
49 | # Sample data
50 | data = {
51 |     'x': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D', 'D'],
52 |     'y': ['W', 'X', 'Y', 'Z', 'W', 'X', 'Y', 'Z', 'W', 'X', 'Y', 'Z', 'W', 'X', 'Y', 'Z'],
53 |     'value': [10, 15, 5, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
54 | }
55 | 
56 | # Convert data to a DataFrame
57 | data = pandas.DataFrame(data)
58 | 
59 | # Create a heatmap
60 | gg = (ggplot(data, aes(x='x', y='y', fill='value'))
61 |       + geom_tile()
62 |       + theme_minimal()
63 |       + labs(title='Heatmap Example', x='X-Axis', y='Y-Axis', fill='Values'))
64 | print(gg)
65 | 
66 | 
67 | # Violin Plot:
68 | 
69 | # Sample data
70 | data = {
71 |     'Category': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'D', 'D', 'D'],
72 |     'Value': [10, 15, 25, 30, 35, 45, 50, 65, 70, 75]
73 | }
74 | 
75 | # Convert data to a DataFrame
76 | df = pandas.DataFrame(data)
77 | 
78 | # Create a violin plot
79 | gg = (ggplot(df, aes(x='Category', y='Value', fill='Category'))
80 |       + geom_violin()
81 |       + theme_minimal()
82 |       + labs(title='Violin Plot Example', x='Category', y='Value', fill='Category'))
83 | print(gg)
84 | 


--------------------------------------------------------------------------------
/Chapter 6/plotnine_customizations.py:
--------------------------------------------------------------------------------
 1 | # Import necessary libraries
 2 | from plotnine import ggplot, aes, geom_point, xlab, ylab, ggtitle, labs, scale_x_continuous, scale_y_continuous, scale_color_manual, theme_minimal, theme_light, theme, element_text
 3 | import pandas
 4 | 
 5 | # Sample data
 6 | data = pandas.DataFrame({'X': [1, 2, 3, 4, 5],
 7 |                      'Y': [10, 15, 5, 20, 25],
 8 |                      'Category': ['A', 'B', 'A', 'B', 'A']})
 9 | 
10 | # Create a base scatter plot
11 | gg = (ggplot(data, aes(x='X', y='Y', color='Category')) +
12 |       geom_point())
13 | 
14 | # Customize labels and titles
15 | gg = gg + xlab("Custom X Label") + ylab("Custom Y Label")
16 | gg = gg + ggtitle("Custom Plot Title") + labs(subtitle="Custom Subtitle")
17 | 
18 | # Customize axes and legends
19 | gg = gg + scale_x_continuous(breaks=[0, 1, 2, 3, 4], labels=["Zero", "One", "Two", "Three", "Four"])
20 | gg = gg + scale_y_continuous(limits=(0, 30))
21 | gg = gg + scale_color_manual(values={'A': 'red', 'B': 'blue'})
22 | 
23 | # Customize color palettes
24 | # Map the 'category' variable to the 'fill' aesthetic
25 | gg = gg + aes(fill='Category')
26 | 
27 | # Apply themes
28 | gg = gg + theme_minimal()
29 | gg = gg + theme_light()
30 | 
31 | # Control text formatting
32 | gg = gg + theme(text=element_text(size=12, family="Arial", face="bold", color="black"),
33 |                 axis_text_x=element_text(angle=45, hjust=1))
34 | 
35 | print(gg)
36 | 


--------------------------------------------------------------------------------
/Chapter 7/create_pivot.py:
--------------------------------------------------------------------------------
 1 | import win32com.client as win32
 2 | 
 3 | # Create an Excel workbook and add a sheet
 4 | excel = win32.gencache.EnsureDispatch('Excel.Application')
 5 | workbook = excel.Workbooks.Add()
 6 | worksheet = workbook.Worksheets(1)
 7 | 
 8 | # Add some test data
 9 | worksheet.Cells(1, 1).Value = 'Name'
10 | worksheet.Cells(1, 2).Value = 'Category'
11 | worksheet.Cells(1, 3).Value = 'Sales'
12 | 
13 | worksheet.Cells(2, 1).Value = 'John'
14 | worksheet.Cells(2, 2).Value = 'Electronics'
15 | worksheet.Cells(2, 3).Value = 1000
16 | 
17 | worksheet.Cells(3, 1).Value = 'Alice'
18 | worksheet.Cells(3, 2).Value = 'Clothing'
19 | worksheet.Cells(3, 3).Value = 800
20 | 
21 | worksheet.Cells(4, 1).Value = 'John'
22 | worksheet.Cells(4, 2).Value = 'Clothing'
23 | worksheet.Cells(4, 3).Value = 300
24 | 
25 | # Add more data as needed
26 | 
27 | # Define the range of data to be used as input for the pivot table
28 | data_range = worksheet.Range('A1:C4')  # Adjust the range as needed
29 | 
30 | # Add a new worksheet to the workbook to hold the Pivot Table:
31 | pivot_table_sheet = workbook.Worksheets.Add()
32 | pivot_table_sheet.Name = 'Pivot Table'
33 | 
34 | # Create a Pivot Cache using the data range:
35 | pivot_cache = workbook.PivotCaches().Create(SourceType=1, SourceData=data_range)
36 | 
37 | # Create the Pivot Table on the new sheet using the Pivot Cache:
38 | pivot_table = pivot_cache.CreatePivotTable(TableDestination=pivot_table_sheet.Cells(3, 1), TableName='MyPivotTable')
39 | 
40 | # Add the row, column and data fields
41 | pivot_table.PivotFields('Name').Orientation = 1 # row field
42 | pivot_table.PivotFields('Category').Orientation = 2 # column field
43 | pivot_table.PivotFields('Sales').Orientation = 4 # data field
44 | 
45 | # Add the calculated fields
46 | calculated_field = pivot_table.CalculatedFields().Add("Total Sales", "=SUM(Sales)")
47 | 
48 | # Refresh the PivotTable to apply changes
49 | pivot_table.RefreshTable()
50 | 
51 | # Save the Workbook and close Excel
52 | workbook.SaveAs('PivotTableExample.xlsx')
53 | workbook.Close()
54 | excel.Quit()
55 | 


--------------------------------------------------------------------------------
/Chapter 7/grouping.py:
--------------------------------------------------------------------------------
 1 | # Sample Data Generation
 2 | import pandas as pd
 3 | import random
 4 | from datetime import datetime, timedelta
 5 | import win32com.client as win32
 6 | import os 
 7 | import numpy as np
 8 | 
 9 | data = {
10 |     'Date': [datetime(2023, 1, 1) + timedelta(days=i) for i in range(365)],
11 |     'Sales': [random.randint(100, 1000) for _ in range(365)]
12 | }
13 | 
14 | df = pd.DataFrame(data)
15 | 
16 | # Create an ExcelWriter object and write the DataFrame to the Excel worksheet
17 | df.to_excel("GroupingExample.xlsx", sheet_name='Sheet1', index=False)
18 | 
19 | # Connect to Excel
20 | excel = win32.gencache.EnsureDispatch('Excel.Application')
21 | 
22 | # Open the Excel workbook and add a sheet
23 | wd = os.getcwd()
24 | workbook = excel.Workbooks.Open(os.path.join(wd, 'GroupingExample.xlsx'))  # Replace with your workbook path
25 | worksheet = workbook.Worksheets(1)
26 | 
27 | # Add a new worksheet to the workbook to hold the Pivot Table:
28 | pivot_table_sheet = workbook.Worksheets.Add()
29 | pivot_table_sheet.Name = 'Pivot Table'
30 | 
31 | # Define the range of data to be used as input for the pivot table
32 | data_range = worksheet.Range('A1:B365') 
33 | 
34 | # Create a Pivot Cache using the data range:
35 | pivot_cache = workbook.PivotCaches().Create(SourceType=1, SourceData=data_range)
36 | 
37 | starting_row = 3
38 | 
39 | # Create the Pivot Table on the new sheet using the Pivot Cache:
40 | pivot_table = pivot_cache.CreatePivotTable(TableDestination=pivot_table_sheet.Cells(starting_row, 1), TableName='MyPivotTable')
41 | 
42 | # Add the 'Date' field to Rows and define the date_field variable as done with name_field in the example above.
43 | date_field = pivot_table.PivotFields('Date')
44 | date_field.Orientation = 1 # row field
45 | pivot_table.PivotFields('Sales').Orientation = 4 # data field
46 | 
47 | # Add the calculated fields
48 | calculated_field = pivot_table.CalculatedFields().Add("Total Sales", "=SUM(Sales)")
49 | 
50 | # Group by months
51 | date_field.Subtotals = [False]*12
52 | date_field.NumberFormat = 'MMMM YYYY'
53 | 
54 | # Sort Rows
55 | date_field.AutoSort(1, "Date")
56 | 
57 | # count the unique values for each value of the date column in the pivot
58 | date_values = pd.DataFrame([item.Value for item in date_field.PivotItems()], columns = ['date'])
59 | unique_values = pd.DataFrame(np.transpose(np.unique(date_values, return_counts=True)), columns=['date', 'count'])
60 | date_values_count = date_values.merge(unique_values).drop_duplicates()
61 | 
62 | # Group by months
63 | # Set the GroupOn property
64 | date_range = pivot_table_sheet.Range(f"A4:A{starting_row + date_values_count['count'].iloc[0]}")
65 | date_range.Group()
66 | 
67 | # You can use the above method to group the other months as well if you want to
68 | # Note: the pivot is now changed, the second group starts at row starting_row + 2, instead of starting_row + 32
69 | 
70 | # change the formatting of the grouped column to show only month and year and change back the original date column to show the full date
71 | pivot_table.PivotFields('Date2').NumberFormat = 'MMMM YYYY'
72 | date_field.NumberFormat = 'DD MMMM YYYY'
73 | 
74 | # hide the details of the grouped values
75 | for item in pivot_table.PivotFields('Date2').PivotItems():
76 |     item.ShowDetail = False
77 | 
78 | # Refresh data
79 | pivot_table.RefreshTable()
80 | 
81 | #pivot_table.PivotFields('Date2').Orientation = 2
82 | 
83 | # Save and close
84 | workbook.Save()
85 | workbook.Close()
86 | excel.Quit()
87 | 


--------------------------------------------------------------------------------
/Chapter 7/manipulate_pivot.py:
--------------------------------------------------------------------------------
 1 | import win32com.client as win32
 2 | 
 3 | # Connect to Excel
 4 | excel = win32.gencache.EnsureDispatch('Excel.Application')
 5 | 
 6 | # Open the workbook with the pivot table
 7 | workbook = excel.Workbooks.Open('PivotTableExample.xlsx')  # Replace with your workbook path
 8 | worksheet = workbook.Worksheets(1)
 9 | 
10 | # Access the Pivot Table
11 | pivot_table = worksheet.PivotTables('MyPivotTable')  # Use the name of your pivot table
12 | 
13 | # Filter by value (need to make the field a Page field instaed of a column field)
14 | category_field = pivot_table.PivotFields('Category')
15 | category_field.Orientation = 3 # page field
16 | category_field.CurrentPage = "Electronics"
17 | 
18 | # Sort Rows or Columns
19 | name_field = pivot_table.PivotFields('Name')
20 | name_field.AutoSort(1, "Name")
21 | 
22 | # Define the new source data range 
23 | new_source_data_range = 'Sheet1!A1:C2'
24 | 
25 | # Update the SourceData property of the pivot table's Table object
26 | pivot_table.TableRange2(workbook.Sheets('Sheet1').Range(new_source_data_range))
27 | 
28 | # Refresh data
29 | pivot_table.RefreshTable()
30 | 
31 | # Save and close
32 | workbook.Save()
33 | workbook.Close()
34 | excel.Quit()
35 | 


--------------------------------------------------------------------------------
/Chapter 8/ch8.R:
--------------------------------------------------------------------------------
 1 | # Each section installs its package only when it is missing and then attaches it.
 2 | # require() cannot attach a package that was not installed when it ran, so
 3 | # library() is called explicitly before the package is first used.
 4 | 
 5 | # The skimr package
 6 | if(!require(skimr)){install.packages("skimr")}
 7 | library(skimr)
 8 | skim(iris)
 9 | 
10 | # The TidyDensity package
11 | if(!require(TidyDensity)){install.packages("TidyDensity")}
12 | library(TidyDensity)
13 | tidy_normal() |> skim()
14 | 
15 | # The GGally package
16 | if(!require(GGally)){install.packages("GGally")}
17 | library(GGally)
18 | library(TidyDensity)
19 | tidy_normal(.n = 200) |> 
20 |   ggpairs(columns = c("y","p","q","dx","dy"))
21 | 
22 | # The DataExplorer package
23 | if(!require(DataExplorer)){install.packages("DataExplorer")}
24 | library(DataExplorer)
25 | library(TidyDensity)
26 | library(dplyr)
27 | library(ggplot2) # theme_minimal() below comes from ggplot2
28 | 
29 | df <- tidy_normal(.n = 200)
30 | 
31 | # Basic facts about the data set
32 | df |>
33 |   introduce() |>
34 |   glimpse()
35 | 
36 | # The same facts as a chart
37 | df |>
38 |   plot_intro() +
39 |   theme_minimal()
40 | 
41 | # Q-Q plots for the data frame
42 | df |> 
43 |   plot_qq()
44 | 
45 | # Q-Q plots restricted to the q and y columns
46 | df[c("q","y")] |> 
47 |   plot_qq()
35 | 


--------------------------------------------------------------------------------
/Chapter 8/clean_data.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | # Load Excel data into a pandas DataFrame
 5 | df = pd.read_excel('dirty_data.xlsx')
 6 | 
 7 | # Handling Missing Data
 8 | 
 9 | # Identify missing values
10 | missing_values = df.isnull().sum()
11 | 
12 | # Replace missing values with the mean (for numeric columns)
13 | df['Age'].fillna(df['Age'].mean(), inplace=True)
14 | 
15 | # Replace missing values with the mode (for categorical columns)
16 | # df['Salary'].fillna(df['Salary'].mode()[0], inplace=True)
17 | 
18 | # Forward-fill or backward-fill missing values
19 | # df['ColumnWithMissingValues'].fillna(method='ffill', inplace=True)
20 | 
21 | # Interpolate missing values based on trends
22 | # df['NumericColumn'].interpolate(method='linear', inplace=True)
23 | 
24 | # Remove rows or columns with missing data
25 | df.dropna(axis=0, inplace=True)  # Remove rows with missing data
26 | df.dropna(axis=1, inplace=True)  # Remove columns with missing data
27 | 
28 | # Handling Duplicates
29 | 
30 | # Detect and display duplicate rows
31 | duplicate_rows = df[df.duplicated()]
32 | print("Duplicate Rows:")
33 | print(duplicate_rows)
34 | 
35 | # Remove duplicate rows
36 | df.drop_duplicates(inplace=True)
37 | 
38 | # Handling Data Type Conversion
39 | 
40 | # Check data types
41 | print(df.dtypes)
42 | 
43 | # Convert a column to a different data type (e.g., float)
44 | df.loc[df['Salary']=='Missing', 'Salary'] = np.NaN
45 | df.loc[:, 'Salary'] = df['Salary'].str.replace("$", "")
46 | df.loc[:, 'Salary'] = df['Salary'].str.replace(",", "")
47 | df['Salary'] = df['Salary'].astype(float)
48 | 
49 | # Now that Salary is a numeric column, we can fill the missing values with mean
50 | df['Salary'].fillna(df['Salary'].mean(), inplace=True)
51 | 
52 | # Excel-Specific Data Issues
53 | 
54 | # No code needed, ensure Excel data is cleaned (e.g., merged cells unmerged, empty cells removed) before import
55 | 


--------------------------------------------------------------------------------
/Chapter 8/create_sample_data.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | # Create a DataFrame with missing data, duplicates, and mixed data types
 5 | data = {
 6 |     'ID': [1, 2, 3, 4, 5, 6],
 7 |     'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Eva', 'Eva'],
 8 |     'Age': [25, np.nan, 30, 28, 22, 23],
 9 |     'Salary': ['$50,000', '$60,000', 'Missing', '$65,000', '$55,000', '$75,000']
10 | }
11 | 
12 | df = pd.DataFrame(data)
13 | 
14 | # Introduce some missing data
15 | df.loc[1, 'Age'] = np.nan
16 | df.loc[3, 'Salary'] = np.nan
17 | 
18 | # Introduce duplicates
19 | df = pd.concat([df, df.iloc[1:3]], ignore_index=True)
20 | 
21 | # Save the sample data
22 | df.to_excel('dirty_data.xlsx')
23 | 


--------------------------------------------------------------------------------
/Chapter 8/data_distribution.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | from scipy import stats
 5 | import statsmodels.api as sm
 6 | 
 7 | # Generate sample data from a lognormal distribution
 8 | np.random.seed(0)
 9 | data = np.random.lognormal(mean=0, sigma=1, size=1000)
10 | 
11 | # Create a Pandas DataFrame
12 | df = pd.DataFrame({'Data': data})
13 | 
14 | # Plot a histogram of the data
15 | plt.hist(data, bins=30, color='skyblue', edgecolor='black')
16 | plt.title('Histogram of Data')
17 | plt.xlabel('Value')
18 | plt.ylabel('Frequency')
19 | plt.show()
20 | 
21 | # Perform the Shapiro-Wilk test for normality
22 | shapiro_stat, shapiro_p = stats.shapiro(data)
23 | is_normal = shapiro_p > 0.05  # Check if data is normally distributed
24 | print(f'Shapiro-Wilk p-value: {shapiro_p}')
25 | print(f'Is data normally distributed? {is_normal}')
26 | 
27 | # Create Q-Q plot with a Normal distribution
28 | sm.qqplot(data, line='s', color='skyblue')
29 | plt.title('Q-Q Plot (Normal)')
30 | plt.xlabel('Theoretical Quantiles')
31 | plt.ylabel('Sample Quantiles')
32 | plt.show()
33 | 
34 | # Create Q-Q plot with a lognormal distribution
35 | log_data = np.log(data)
36 | sm.qqplot(log_data, line='s', color='skyblue')
37 | plt.title('Q-Q Plot (Lognormal)')
38 | plt.xlabel('Theoretical Quantiles')
39 | plt.ylabel('Sample Quantiles')
40 | plt.show()
41 | 
42 | # Calculate skewness and kurtosis
43 | skewness = stats.skew(data)
44 | kurtosis = stats.kurtosis(data)
45 | 
46 | print(f'Skewness: {skewness}')
47 | print(f'Kurtosis: {kurtosis}')
48 | 


--------------------------------------------------------------------------------
/Chapter 8/relationships.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import seaborn as sns
 4 | import matplotlib.pyplot as plt
 5 | import ppscore as pps
 6 | 
 7 | # Generate test data with three variables
 8 | np.random.seed(0)
 9 | data = {
10 |     'Feature1': np.random.randn(100),
11 |     'Feature2': np.random.randn(100) * 2,
12 | }
13 | 
14 | # Create a linear Target variable based on Feature1 and a non-linear function of Feature2
15 | data['Target'] = data['Feature1'] * 2 + np.sin(data['Feature2']) + np.random.randn(100) * 0.5
16 | 
17 | # Create a DataFrame
18 | df = pd.DataFrame(data)
19 | 
20 | # Calculate and plot the correlation heatmap
21 | corr_matrix = df.corr()
22 | plt.figure(figsize=(8, 6))
23 | sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
24 | plt.title('Correlation Heatmap')
25 | plt.show()
26 | 
27 | # Calculate the Predictive Power Score (PPS)
28 | plt.figure(figsize=(8, 6))
29 | matrix_df = pps.matrix(df)[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')
30 | sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
31 | plt.title("Predictive Power Score (PPS) Heatmap")
32 | plt.show()
33 | 
34 | # Additional insights
35 | correlation_target = df['Feature1'].corr(df['Target'])
36 | pps_target = pps.score(df, 'Feature1', 'Target')
37 | 
38 | print(f'Correlation between Feature1 and Target: {correlation_target:.2f}')
39 | print(f'Predictive Power Score (PPS) between Feature1 and Target: {pps_target:.2f}')
40 | 


--------------------------------------------------------------------------------
/Chapter 8/summary_statistics.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import random
 3 | 
 4 | # Create a sample DataFrame
 5 | data = {
 6 |     'Age': [random.randint(18, 60) for _ in range(100)],
 7 |     'Gender': ['Male', 'Female'] * 50,
 8 |     'Income': [random.randint(20000, 100000) for _ in range(100)],
 9 |     'Region': ['North', 'South', 'East', 'West'] * 25
10 | }
11 | 
12 | df = pd.DataFrame(data)
13 | 
14 | # Calculate summary statistics for numerical features
15 | numerical_summary = df.describe()
16 | 
17 | # Calculate frequency counts and percentages for categorical features
18 | categorical_summary = df['Gender'].value_counts(normalize=True)
19 | 
20 | print("Summary Statistics for Numerical Features:")
21 | print(numerical_summary)
22 | 
23 | print("\nFrequency Counts and Percentages for Categorical Features (Gender):")
24 | print(categorical_summary)
25 | 


--------------------------------------------------------------------------------
/Chapter 9/ch9_linear_reg.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Library Load ------------------------------------------------------------
 3 | 
 4 | library(readxl)
 5 | 
 6 | 
 7 | # Get Data ----------------------------------------------------------------
 8 | 
 9 | # Read the "iris" sheet of the workbook
10 | df <- read_xlsx(path = "chapter1/iris_data.xlsx", sheet = "iris")
11 | 
12 | head(df)
13 | 
14 | # One data frame per species
15 | iris_split <- split(df, df$species)
16 | 
17 | # Response and predictors, combined into the model formula
18 | # petal_length ~ petal_width + sepal_length + sepal_width
19 | dependent_variable <- "petal_length"
20 | independent_variables <- c("petal_width", "sepal_length", "sepal_width")
21 | f_x <- reformulate(independent_variables, response = dependent_variable)
22 | 
23 | # Fit the linear model described by f_x on one subset of the data
24 | perform_linear_regression <- function(data) {
25 |   lm(f_x, data = data)
26 | }
27 | 
28 | # Fit one model per species
29 | results <- lapply(iris_split, perform_linear_regression)
30 | 
31 | # Summary of each fitted model
32 | lapply(results, summary)
33 | 
34 | # Diagnostic plots for each model on a 2 x 2 grid, then back to a single panel
35 | par(mfrow = c(2,2))
36 | lapply(results, plot)
37 | par(mfrow = c(1, 1))
38 | 
39 | # The same analysis written with an anonymous function instead of a named helper
40 | lm_models <- lapply(iris_split, function(df) lm(f_x, data = df))
41 | 
42 | # Summarize the results
43 | lapply(lm_models, summary)
56 | 


--------------------------------------------------------------------------------
/Chapter 9/ch9_linear_reg_tidymodels.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Library Load ------------------------------------------------------------
 3 | 
 4 | library(readxl)
 5 | library(tidymodels)
 6 | library(purrr)
 7 | library(performance)
 8 | 
 9 | 
10 | # Get Data ----------------------------------------------------------------
11 | 
12 | df <- read_xlsx(path = "chapter1/iris_data.xlsx", sheet = "iris")
13 | 
14 | # Split the data by Species -----------------------------------------------
15 | 
16 | iris_list <- split(df, df$species)
17 | 
18 | # Specify the Model -------------------------------------------------------
19 | 
20 | lm_model <- linear_reg(mode = "regression", engine = "lm")
21 | 
22 | # Define Formula ----------------------------------------------------------
23 | 
24 | f_x <- petal_width ~ petal_length + sepal_width + sepal_length
25 | 
26 | # Perform Linear Regression using purrr -----------------------------------
27 | # Model specification used by the workflow
28 | lm_mod <- linear_reg(mode = "regression", engine = "lm")
29 | 
30 | # Workflow holding only the model; the recipe is attached per data subset
31 | wf <- add_model(workflow(), lm_mod)
32 | 
33 | # Fit the workflow on one data subset; this is the function that gets mapped
34 | lm_fit_list <- function(df) {
35 |   # recipe: model formula plus normalization of every predictor
36 |   recipe_train <- step_normalize(recipe(f_x, data = df), all_predictors())
37 | 
38 |   # attach the recipe and fit on the same data
39 |   fit(add_recipe(wf, recipe_train), data = df)
40 | }
41 | 
42 | # Map the linear model ----------------------------------------------------
43 | 
44 | model_list <- map(iris_list, lm_fit_list)
45 | lapply(model_list, tidy)
46 | lapply(model_list, glance)
47 | 
48 | # Check the Model: pull out the underlying lm fits and run the diagnostics
49 | engine_fits <- map(model_list, extract_fit_engine)
50 | map(engine_fits, check_model)
51 | 
52 | # Alternate Nested Method: one row per species holding its data, an 80/20
53 | # split, the fitted model and the predictions on the held-out part
54 | nested_lm <- df |>
55 |   nest(data = -species) |>
56 |   mutate(
57 |     split = map(data, \(d) initial_split(d, prop = 8/10)),
58 |     train = map(split, training),
59 |     test  = map(split, testing),
60 |     fit   = map(train, \(d) lm(f_x, data = d)),
61 |     pred  = map2(fit, test, \(model, holdout) predict(object = model, newdata = holdout))
62 |   )
63 | 
64 | nested_lm |>
65 |   select(species, pred) |>
66 |   unnest(pred)


--------------------------------------------------------------------------------
/Chapter 9/ch9_logistic_reg.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Library Load ------------------------------------------------------------
 3 | 
 4 | 
 5 | library(tidyverse)
 6 | 
 7 | # Expand the Titanic frequency table to one row per counted case
 8 | df <- uncount(as.data.frame(Titanic), Freq)
 9 | 
10 | 
11 | # Splits ------------------------------------------------------------------
12 | 
13 | # 80% of the rows go to training, the remainder to testing
14 | set.seed(123)
15 | n_rows <- nrow(df)
16 | train_index <- sample(n_rows, floor(n_rows * 0.8), replace = FALSE)
17 | train <- df[train_index, ]
18 | test <- df[-train_index, ]
19 | 
20 | # Train a model -----------------------------------------------------------
21 | 
22 | # Logistic regression of survival on sex, age and class
23 | model <- glm(Survived ~ Sex + Age + Class, data = train, family = "binomial")
24 | 
25 | # Predict -----------------------------------------------------------------
26 | 
27 | # Predicted probabilities on the test set, cut at 0.5 into "Yes" / "No"
28 | predictions <- predict(model, newdata = test, type = "response")
29 | pred_resp <- ifelse(predictions > 0.5, "Yes", "No")
30 | 
31 | # Share of test rows whose predicted label matches the recorded outcome
32 | accuracy <- mean(pred_resp == test$Survived)
33 | 
34 | # Print the accuracy of the model
35 | print(accuracy)
36 | 
37 | # Print the confusion matrix
38 | table(pred_resp, test$Survived)
39 | 


--------------------------------------------------------------------------------
/Chapter 9/ch9_logistic_reg_tidymodels.R:
--------------------------------------------------------------------------------
 1 | # Library Load ------------------------------------------------------------
 2 | 
 3 | library(tidymodels)
 4 | library(healthyR.ai)
 5 | 
 6 | # Convert to a tibble for tidymodels
 7 | df <- Titanic |>
 8 |   as_tibble() |>
 9 |   uncount(n)  |>
10 |   mutate(across(where(is.character), as.factor))
11 | 
12 | # Splits ------------------------------------------------------------------
13 | 
14 | # Set seed for reproducibility
15 | set.seed(123)
16 | 
17 | # Split the data into training and test sets
18 | split <- initial_split(df, prop = 0.8)
19 | train <- training(split)
20 | test <- testing(split)
21 | 
22 | # Train a model -----------------------------------------------------------
23 | 
24 | # Create a recipe for pre-processing
25 | recipe <- recipe(Survived ~ Sex + Age + Class, data = train)
26 | 
27 | # Specify logistic regression as the model
28 | log_reg <- logistic_reg() |>
29 |   set_engine("glm", family = "binomial")
30 | 
31 | # Combine the recipe and model into a workflow
32 | workflow <- workflow() %>%
33 |   add_recipe(recipe) %>%
34 |   add_model(log_reg)
35 | 
36 | # Train the logistic regression model
37 | fit <- fit(workflow, data = train)
38 | 
39 | # Predict -----------------------------------------------------------------
40 | 
41 | # Predict on the test set
42 | predictions <- predict(fit, new_data = test) |>
43 |   bind_cols(test) |>
44 |   select(Class:Survived, .pred_class)
45 | 
46 | # Better method
47 | pred_fit_tbl <- fit |>
48 |   augment(new_data = test)
49 | 
50 | # Accuracy Check ----------------------------------------------------------
51 | 
52 | # Accuracy metrics for the model to be scored against from the healthyR.ai package
53 | perf <- hai_default_classification_metric_set()
54 | 
55 | # Calculate the accuracy metrics
56 | perf(pred_fit_tbl, truth = Survived, estimate = .pred_class)
57 | 
58 | # Print the confusion matrix
59 | predictions |>
60 |   conf_mat(truth = Survived, estimate = .pred_class)
61 | 
62 | # Use broom to tidy and glance the fitted model
63 | tidy(fit, exponentiate = TRUE, conf.int = TRUE)
64 | glance(fit)
65 | 
66 | # Visualize ---------------------------------------------------------------
67 | 
68 | # ROC Curve
69 | roc_curve(pred_fit_tbl, truth = Survived, .pred_Yes, event_level = "second") |> 
70 |   autoplot()


--------------------------------------------------------------------------------
/Chapter 9/linear_regression.py:
--------------------------------------------------------------------------------
 1 | # Import necessary libraries
 2 | import pandas as pd
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 | from sklearn.model_selection import train_test_split
 6 | import statsmodels.api as sm 
 7 | from statsmodels.graphics.regressionplots import plot_regress_exog
 8 | from statsmodels.graphics.gofplots import qqplot
 9 | 
10 | # Step 0: Generate sample data and save as Excel file
11 | np.random.seed(0)
12 | n_samples = 100
13 | X = np.random.rand(n_samples, 2)  # Two features
14 | y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(n_samples)  # Linear relationship with noise
15 | 
16 | # Create a pandas DataFrame
17 | data = {'Feature1': X[:, 0], 'Feature2': X[:, 1], 'Target': y}
18 | df = pd.DataFrame(data)
19 | 
20 | # Save the data to Excel
21 | df.to_excel("linear_regression_input.xlsx")
22 | 
23 | # Step 1: Import Excel data into a pandas DataFrame
24 | excel_file = "linear_regression_input.xlsx"
25 | df = pd.read_excel(excel_file)
26 | 
27 | # Step 2: Explore the data
28 | # Use the tools learned in the previous chapter on EDA
29 | 
30 | # Step 3: Data Preparation (if needed)
31 | # Use the tools learned in the previous chapter on data cleaning
32 | 
33 | # Step 4: Split data into training and testing sets
34 | X = df[['Feature1', 'Feature2']] # Independent variables
35 | y = df['Target'] # Dependent variable
36 | 
37 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
38 | 
39 | # Step 5: Fit the Linear Regression model
40 | # Add a constant (intercept) to the independent variables
41 | X_train = sm.add_constant(X_train)
42 | X_test = sm.add_constant(X_test)
43 | 
44 | # Fit the linear model
45 | model = sm.OLS(y_train, X_train).fit()
46 | 
47 | # Step 6: Model Evaluation
48 | y_pred = model.predict(X_test)
49 | 
50 | # Print the model summary
51 | print(model.summary())
52 | 
53 | # Step 7: Visualization
54 | plt.scatter(X_test['Feature1'], y_test, color='blue', label='Actual')
55 | plt.scatter(X_test['Feature1'], y_pred, color='red', label='Predicted')
56 | plt.xlabel('Feature1')
57 | plt.ylabel('Target')
58 | plt.title('Linear Regression Prediction')
59 | plt.legend()
60 | plt.show()
61 | 
62 | # Switch to the interactive 'TkAgg' backend before generating the diagnostic plots
63 | plt.switch_backend('TkAgg')
64 | 
65 | # Residuals
66 | fig, ax = plt.subplots(figsize=(12, 8))
67 | plot_regress_exog(model, "Feature1", fig=fig)
68 | plt.show()  
69 | 
70 | # Q-Q plot
71 | qqplot(model.resid, line="s")
72 | plt.show()


--------------------------------------------------------------------------------
/Chapter 9/logistic_regression.py:
--------------------------------------------------------------------------------
 1 | # Import necessary libraries
 2 | import pandas as pd
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 | from sklearn.model_selection import train_test_split
 6 | from sklearn.linear_model import LogisticRegression
 7 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
 8 | 
 9 | # Step 0: Generate sample data
10 | np.random.seed(0)
11 | n_samples = 100
12 | X = np.random.rand(n_samples, 2)  # Two features
13 | y = (X[:, 0] + X[:, 1] > 1).astype(int)  # Binary classification based on a condition
14 | 
15 | # Create a pandas DataFrame
16 | data = {'Feature1': X[:, 0], 'Feature2': X[:, 1], 'Target': y}
17 | df = pd.DataFrame(data)
18 | 
19 | df.to_excel("logistic_regression_input.xlsx")
20 | 
21 | # Step 1: Import Excel data into a pandas DataFrame
22 | excel_file = "logistic_regression_input.xlsx"
23 | df = pd.read_excel(excel_file)
24 | 
25 | # Step 2: Split data into training and testing sets
26 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
27 | 
28 | # Step 3: Create and train the logistic regression model
29 | model = LogisticRegression()
30 | model.fit(X_train, y_train)
31 | 
32 | # Step 4: Visualization
33 | 
34 | # Visualization for binary classification
35 | plt.scatter(X_test[y_test == 1][:, 0], X_test[y_test == 1][:, 1], color='blue', label='Class 1 (Actual)')
36 | plt.scatter(X_test[y_test == 0][:, 0], X_test[y_test == 0][:, 1], color='red', label='Class 0 (Actual)')
37 | plt.xlabel('Feature1')
38 | plt.ylabel('Feature2')
39 | plt.title('Logistic Regression Prediction')
40 | plt.legend()
41 | plt.show()
42 | 
43 | # Step 5: Model Evaluation and Interpretation
44 | y_pred = model.predict(X_test)
45 | 
46 | accuracy = accuracy_score(y_test, y_pred)
47 | conf_matrix = confusion_matrix(y_test, y_pred)
48 | class_report = classification_report(y_test, y_pred)
49 | 
50 | print("Accuracy:", accuracy)
51 | print("Confusion Matrix:\n", conf_matrix)
52 | print("Classification Report:\n", class_report)
53 | 


--------------------------------------------------------------------------------
/Chapter1/ch1_create_iris_dataset.R:
--------------------------------------------------------------------------------
 1 | library(writexl)
 2 | library(janitor)
 3 | library(dplyr)
 4 | 
 5 | # Standardise the column names once (e.g. Sepal.Length -> sepal_length)
 6 | df <- iris |> clean_names()
 7 | 
 8 | # One data frame per species, plus the complete data set as an "iris" sheet
 9 | l <- split(df, f = df$species)
10 | lt <- c(l, iris = list(df))
11 | 
12 | # Write to the path the reader scripts use ("Chapter1/iris_data.xlsx")
13 | write_xlsx(lt, path = "Chapter1/iris_data.xlsx")
14 | 


--------------------------------------------------------------------------------
/Chapter1/ch1_pkgs.R:
--------------------------------------------------------------------------------
1 | pkgs <- c("openxlsx", "xlsx", "readxl")
2 | install.packages(pkgs, dependencies = TRUE)
3 | lapply(pkgs, library, character.only = TRUE)
4 | 


--------------------------------------------------------------------------------
/Chapter1/ch1_save_xlsx_as_xlsb.R:
--------------------------------------------------------------------------------
 1 | # Load the openxlsx package
 2 | library(openxlsx)
 3 | 
 4 | # Set the path to the xlsx file. NOTE(review): machine-specific absolute path; adjust before running elsewhere
 5 | xlsx_file <- "C:/Users/steve/OneDrive/Desktop/Extending_Excel/ch1/iris_data.xlsx"
 6 | 
 7 | # Load the xlsx file into an openxlsx workbook object
 8 | wb <- openxlsx::loadWorkbook(xlsx_file)
 9 | 
10 | # Save the workbook under an .xlsb file name. NOTE(review): confirm openxlsx really writes the binary xlsb format here and not xlsx content with a different extension
11 | openxlsx::saveWorkbook(wb, "C:/Users/steve/OneDrive/Desktop/Extending_Excel/ch1/iris_data.xlsb")
12 | 


--------------------------------------------------------------------------------
/Chapter1/excel_sheet_reader.R:
--------------------------------------------------------------------------------
 1 | read_excel_sheets <- function(filename, single_tbl = FALSE) {
 2 |   # Read every sheet of `filename`: one row-bound tibble, or a list named by sheet
 3 |   sheets <- readxl::excel_sheets(filename)
 4 |   if (single_tbl) {
 5 |     x <- purrr::map_df(sheets, readxl::read_excel, path = filename)
 6 |   } else {
 7 |     x <- purrr::map(sheets, ~ readxl::read_excel(filename, sheet = .x))
 8 |     # set_names() returns a new object, so the result must be assigned back
 9 |     x <- purrr::set_names(x, sheets)
10 |   }
11 |   x
12 | }
13 | 
14 | f <- "Chapter1/iris_data.xlsx"
15 | 
16 | read_excel_sheets(f, F)
17 | 


--------------------------------------------------------------------------------
/Chapter1/iris_data.xlsb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter1/iris_data.xlsb


--------------------------------------------------------------------------------
/Chapter1/multisheet_openpyxl.py:
--------------------------------------------------------------------------------
 1 | from openpyxl import load_workbook
 2 | import pandas as pd
 3 | 
 4 | def read_single_sheet(workbook, sheet_name):
 5 |         
 6 |         # Load the sheet from the workbook
 7 |         sheet = workbook[sheet_name]
 8 | 
 9 |         # Read out the raaw data including headers
10 |         sheet_data_raw = sheet.values
11 | 
12 |         # Separate the headers into a variable
13 |         columns = next(sheet_data_raw)[0:]
14 | 
15 |         # Create a DataFrame based on the second and subsequent lines of data with the header as column names and return it
16 |         return pd.DataFrame(sheet_data_raw, columns=columns)
17 | 
18 | 
19 | def read_multiple_sheets(file_path):
20 | 
21 |     # Load the workbook
22 |     workbook = load_workbook(file_path)
23 | 
24 |     # Get a list of all sheet names in the workbook
25 |     sheet_names = workbook.sheetnames
26 | 
27 |     # Cycle through the sheet names, load the data for each and concatenate them into a single DataFrame
28 |     return pd.concat([read_single_sheet(workbook=workbook, sheet_name=sheet_name) for sheet_name in sheet_names], ignore_index=True)
29 | 
30 | # Define the file path (the sheet names are read from the workbook itself)
31 | file_path = 'iris_data.xlsx'
32 | 
33 | # Read the data from multiple sheets
34 | consolidated_data = read_multiple_sheets(file_path)
35 | 
36 | # Display the consolidated data
37 | print(consolidated_data.head())
38 | 


--------------------------------------------------------------------------------
/Chapter1/open_excel_openpyxl.py:
--------------------------------------------------------------------------------
 1 | import openpyxl
 2 | import pandas as pd
 3 | 
 4 | # Load the workbook
 5 | wb = openpyxl.load_workbook('iris_data.xlsx')
 6 | 
 7 | # Select the sheet
 8 | sheet = wb['versicolor']
 9 | 
10 | # Extract the values (including header)
11 | sheet_data_raw = sheet.values
12 | 
13 | # Separate the headers into a variable
14 | header = next(sheet_data_raw)[0:]
15 | 
16 | # Create a DataFrame based on the second and subsequent lines of data with the header as column names
17 | sheet_data = pd.DataFrame(sheet_data_raw, columns=header)
18 | 
19 | print(sheet_data.head())


--------------------------------------------------------------------------------
/Chapter1/open_excel_pandas.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | # Read the Excel file
4 | df = pd.read_excel('iris_data.xlsx', sheet_name='setosa')
5 | 
6 | # Display the first few rows of the DataFrame
7 | print(df.head())
8 | 


--------------------------------------------------------------------------------
/Chapter1/read_xlsx_files.R:
--------------------------------------------------------------------------------
 1 | # Load Libraries
 2 | pkgs <- c("openxlsx", "xlsx", "readxl")
 3 | lapply(pkgs, library, character.only = TRUE)
 4 | 
 5 | f_path <- "Chapter1/iris_data.xlsx"
 6 | 
 7 | # Use openxlsx
 8 | openxlsx::read.xlsx(f_path) |> head(5)
 9 | openxlsx::read.xlsx(f_path, sheet = "iris") |> head(5)
10 | 
11 | # Use xlsx
12 | xlsx::read.xlsx(file = f_path, sheetIndex = 1) |> head(5)
13 | xlsx::read.xlsx(file = f_path, sheetName = "iris") |> head(5)
14 | 
15 | # Use readxl
16 | readxl::read_excel(f_path) |> head(5)
17 | readxl::read_excel(f_path, "iris") |> head(5)
18 | 


--------------------------------------------------------------------------------
/Chapter12/call_plumber.R:
--------------------------------------------------------------------------------
 1 | # Library Load
 2 | library(plumber)
 3 | 
 4 | # Set dir and file path
 5 | wd <- getwd()
 6 | sub_dir <- paste0("/Chapter12/")
 7 | full_dir <- paste0(wd, sub_dir)
 8 | f <- "plumber_api.R"
 9 | f_path <- paste0(full_dir, f)
10 | 
11 | # Initiate root
12 | root <- pr(f_path)
13 | root
14 | 
15 | root |> pr_run()
16 | 


--------------------------------------------------------------------------------
/Chapter12/imgs/api_histogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/api_histogram.png


--------------------------------------------------------------------------------
/Chapter12/imgs/enter_api_argument.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/enter_api_argument.png


--------------------------------------------------------------------------------
/Chapter12/imgs/get_api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/get_api.png


--------------------------------------------------------------------------------
/Chapter12/imgs/swagger_plumber_api_screen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/swagger_plumber_api_screen.png


--------------------------------------------------------------------------------
/Chapter12/imgs/vba_curl_request.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/vba_curl_request.png


--------------------------------------------------------------------------------
/Chapter12/plumber_api.R:
--------------------------------------------------------------------------------
1 | #* Plot out data from a random normal distribution
2 | #* @param .mean The mean of the normal distribution to sample from (sd is fixed at 1)
3 | #* @get /plot
4 | #* @serializer png
5 | function(.mean = 0) {
6 |   mu <- suppressWarnings(as.numeric(.mean))  # NA (instead of a warning) when the value is not numeric
7 |   if (length(mu) != 1 || !is.finite(mu)) stop("`.mean` must be a single finite number")
8 |   hist(rnorm(n = 1000, mean = mu, sd = 1))
9 | }


--------------------------------------------------------------------------------
/Chapter12/vba_plumber_curl_request.bas:
--------------------------------------------------------------------------------
 1 | Sub MakeCurlRequestAndInsertImage()
 2 |     ' Download a plot from the local plumber API with curl and insert it into Sheet1
 3 | 
 4 |     ' Temporary file that receives the PNG
 5 |     Dim imagePath As String
 6 |     imagePath = Environ("TEMP") & "\temp_image.png"
 7 | 
 8 |     ' Define the curl command (the output path is quoted in case it contains spaces)
 9 |     Dim curlCommand As String
10 |     curlCommand = "curl -X GET ""http://127.0.0.1:6855/plot?.mean=0"" -H ""accept: image/png"" -o """ & imagePath & """"
11 | 
12 |     ' Run curl and wait for it to finish. The built-in Shell function returns
13 |     ' immediately, so the picture could be inserted before the download is complete.
14 |     Dim exitCode As Long
15 |     exitCode = CreateObject("WScript.Shell").Run("cmd /c " & curlCommand, 0, True)
16 |     If exitCode <> 0 Then
17 |         MsgBox "curl failed with exit code " & exitCode & "; is the API running?", vbExclamation
18 |         Exit Sub
19 |     End If
20 | 
21 |     ' Refer to the existing worksheet (Sheet1)
22 |     Dim ws As Worksheet
23 |     Set ws = ActiveWorkbook.Worksheets("Sheet1")
24 | 
25 |     ' Clear previous content in Sheet1
26 |     ws.Cells.Clear
27 | 
28 |     ' Insert the image into the worksheet
29 |     ws.Pictures.Insert(imagePath).Select
30 | End Sub
31 | 


--------------------------------------------------------------------------------
/Chapter14/auto_xgb.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/auto_xgb.rar


--------------------------------------------------------------------------------
/Chapter14/ch14_data.R:
--------------------------------------------------------------------------------
 1 | # Library Load
 2 | library(tidyverse)
 3 | library(writexl)
 4 | library(janitor)
 5 | 
 6 | # Write File to disk
 7 | file_path <- paste0(getwd(), "/Chapter14/")
 8 | 
 9 | # Split data by cut and clean names of the list
10 | df_list <- split(diamonds, diamonds$cut) |>
11 |   clean_names()
12 | 
13 | # Write to xlsx
14 | df_list |>
15 |   write_xlsx(paste0(file_path, "diamonds_split.xlsx"))
16 | 


--------------------------------------------------------------------------------
/Chapter14/ch14_diamonds_eda.R:
--------------------------------------------------------------------------------
 1 | # Library Load
 2 | library(ggplot2)
 3 | library(dplyr)
 4 | library(healthyR)
 5 | library(readxl)
 6 | 
 7 | # Source Functions (directory name spelled as on disk so it also resolves on case-sensitive file systems)
 8 | source(paste0(getwd(),"/Chapter1/excel_sheet_reader.R"))
 9 | 
10 | # Read data
11 | file_path <- paste0(getwd(), "/Chapter14/")
12 | 
13 | df <- read_excel_sheets(
14 |   filename = paste0(file_path, "diamonds_split.xlsx"),
15 |   single_tbl = TRUE
16 | )
17 | 
18 | # Visualize Data
19 | # Create optimal binning via the opt_bin() function from healthyR
20 | breaks <- tibble(x = df$price) |>
21 |   opt_bin(x) |>
22 |   pull(value)
23 | head(breaks)
24 | 
25 | par(mfrow = c(1, 2))
26 | hist(df$price, main = "Price Histogram - Default binning",
27 |      xlab = "Price", ylab = "Frequency")
28 | hist(df$price, breaks = breaks, main = "Price Histogram - Optimal binning",
29 |      xlab = "Price", ylab = "Frequency")
30 | par(mfrow = c(1, 1))
31 | 
32 | df |>
33 |   ggplot(aes(x = carat, y = price, fill = cut)) +
34 |   geom_hex(bins = length(breaks), alpha = 1/5) +
35 |   facet_wrap(~ clarity, scales = "free") +
36 |   theme_minimal() +
37 |   labs(
38 |     x = "Carat",
39 |     y = "Price",
40 |     title = "Diamonds Data",
41 |     fill = "Cut"
42 |   ) +
43 |   hr_scale_color_colorblind()
44 | 
45 | # Box plots of price against carat, filled by cut, one panel per clarity
46 | df |> ggplot(aes(x = carat, y = price, fill = cut)) +
47 |   geom_boxplot(alpha = 1/5, outlier.color = "lightgrey") +
48 |   facet_wrap(~ clarity, scales = "free") +
49 |   theme_minimal() +
50 |   labs(
51 |     x = "Carat",
52 |     y = "Price",
53 |     title = "Diamonds Data",
54 |     fill = "Cut"
55 |   ) +
56 |   hr_scale_color_colorblind()
57 | 
58 | df |>
59 |   summarize(m = mean(price), .by = c(clarity, cut)) |> 
60 |   ggplot(aes(x = clarity, y = m, group = cut, color = cut)) +
61 |   geom_point() +
62 |   geom_line() +
63 |   geom_smooth() +
64 |   facet_wrap(~cut, ncol = 2) +
65 |   labs(x= "Clarity", 
66 |        y = "Mean Price", 
67 |        title = "Mean Price by Clarity and Cut",
68 |        color = "Cut") +
69 |   theme_minimal() +
70 |   hr_scale_color_colorblind()
71 | 
72 | # Mean price per carat by color, one line per clarity, one panel per cut
73 | df |> summarize(m = mean(price/carat), .by = c(cut, color, clarity)) |>
74 |   ggplot(aes(x = color, y = m, group = clarity, color = clarity)) +
75 |   geom_point() +
76 |   geom_line() +
77 |   facet_wrap(~ cut, ncol = 2, scales = "free") +
78 |   labs(x= "Color", 
79 |        y = "Mean Price per Carat", 
80 |        title = "Mean Price per Carat by Clarity, Color and Cut",
81 |        color = "Clarity") +
82 |   theme_minimal() +
83 |   hr_scale_color_colorblind()
84 | 
85 | df |>
86 |   ggplot(aes(x = price)) +
87 |   geom_histogram(breaks = breaks, fill = "lightblue",
88 |                  color = "black") +
89 |   theme_minimal() +
90 |   facet_wrap(~ cut, ncol = 2, scales = 'free') +
91 |   labs(x = "Price", y = "Frequency", title = "Price Histogram by Cut")
92 | 
93 | 


--------------------------------------------------------------------------------
/Chapter14/ch14_diamonds_modeling.R:
--------------------------------------------------------------------------------
 1 | # Lib Load
 2 | library(healthyR.ai)
 3 | library(dplyr)
 4 | 
 5 | glimpse(head(df, 2))
 6 | 
 7 | # Pass data through pre-processor
 8 | rec_obj <- hai_xgboost_data_prepper(
 9 |   .data = df, 
10 |   .recipe_formula = price ~ .
11 | )
12 | rec_obj
13 | 
14 | # Now see the juiced output
15 | get_juiced_data(rec_obj) |>
16 |   head(2) |>
17 |   glimpse()
18 | 
19 | # Now perform modeling using the hai_auto_xgboost() function
20 | auto_xgb <- hai_auto_xgboost(
21 |   .data = df,
22 |   .rec_obj = rec_obj,
23 |   .best_metric = "rsq",
24 |   .num_cores = 10,
25 |   .model_type = "regression"
26 | )
27 | 
28 | xgb_wflw_fit <- auto_xgb$model_info$fitted_wflw
29 | class(xgb_wflw_fit)
30 | mod_spec <- xgb_wflw_fit[["fit"]][["actions"]][["model"]][["spec"]]
31 | mod_spec
32 | 
33 | # Save the model
34 | save_path <- paste0(getwd(), "/Chapter14/")
35 | saveRDS(xgb_wflw_fit, paste0(save_path, "xgb_wflw_fit.rds"))
36 |         


--------------------------------------------------------------------------------
/Chapter14/imgs/ggplot_diamonds_boxplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_boxplot.png


--------------------------------------------------------------------------------
/Chapter14/imgs/ggplot_diamonds_hex_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_hex_plot.png


--------------------------------------------------------------------------------
/Chapter14/imgs/ggplot_diamonds_hist_by_cut.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_hist_by_cut.png


--------------------------------------------------------------------------------
/Chapter14/imgs/ggplot_diamonds_mean_price.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_mean_price.png


--------------------------------------------------------------------------------
/Chapter14/imgs/ggplot_diamonds_mean_price_per_carat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_mean_price_per_carat.png


--------------------------------------------------------------------------------
/Chapter14/imgs/hist_default_and_optbin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/hist_default_and_optbin.png


--------------------------------------------------------------------------------
/Chapter14/xgb_wflw_fit.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/xgb_wflw_fit.rds


--------------------------------------------------------------------------------
/Chapter2/adding_sheets.py:
--------------------------------------------------------------------------------
 1 | import openpyxl
 2 | 
 3 | # Create a new workbook
 4 | workbook = openpyxl.Workbook()
 5 | 
 6 | # Add a new sheet
 7 | workbook.create_sheet(title="Sheet2")
 8 | 
 9 | # Save the changes
10 | workbook.save("example.xlsx")
11 | 


--------------------------------------------------------------------------------
/Chapter2/cell_update.py:
--------------------------------------------------------------------------------
 1 | import openpyxl
 2 | 
 3 | # Load an existing workbook
 4 | workbook = openpyxl.load_workbook("example.xlsx")
 5 | 
 6 | # Add a new sheet
 7 | workbook.create_sheet(title="Sheet1")
 8 | 
 9 | # Select a sheet
10 | sheet_name = "Sheet1"
11 | 
12 | sheet = workbook[sheet_name]
13 | 
14 | # Update a cell value
15 | sheet["A1"] = "Hello, World!"
16 | 
17 | # Save the changes
18 | workbook.save("example.xlsx")
19 | 


--------------------------------------------------------------------------------
/Chapter2/create_workbook.py:
--------------------------------------------------------------------------------
1 | import openpyxl
2 | 
3 | # Create a new workbook
4 | workbook = openpyxl.Workbook()
5 | 


--------------------------------------------------------------------------------
/Chapter2/deleting_sheet.py:
--------------------------------------------------------------------------------
 1 | import openpyxl
 2 | 
 3 | # Load an existing workbook
 4 | workbook = openpyxl.load_workbook("example.xlsx")
 5 | 
 6 | # Delete a sheet
 7 | sheet_name = "Sheet2"
 8 | sheet = workbook[sheet_name]
 9 | workbook.remove(sheet)
10 | 


--------------------------------------------------------------------------------
/Chapter2/excel_write_bench.R:
--------------------------------------------------------------------------------
 1 | library(rbenchmark)
 2 | library(xlsx)
 3 | library(writexl)
 4 | library(openxlsx)
 5 | library(dplyr)
 6 | 
 7 | n <- 5
 8 | 
 9 | benchmark(
10 |   "writexl" = {
11 |     writexl::write_xlsx(iris, tempfile())
12 |   },
13 |   "openxlsx" = {
14 |     openxlsx::write.xlsx(iris, tempfile())
15 |   },
16 |   "xlsx" = {
17 |     xlsx::write.xlsx(iris, paste0(tempfile(),".xlsx"))
18 |   },
19 |   replications = n,
20 |   columns = c(
21 |     "test","replications","elapsed","relative","user.self","sys.self")
22 | ) |>
23 |   arrange(relative)
24 | 


--------------------------------------------------------------------------------
/Chapter2/export2excel_pandas.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | # Create a DataFrame with sample data
 4 | data = {
 5 |     'Name': ['John', 'Jane', 'Mike'],
 6 |     'Age': [25, 30, 35],
 7 |     'City': ['New York', 'London', 'Sydney']
 8 | }
 9 | df = pd.DataFrame(data)
10 | 
11 | # Export the DataFrame to an Excel file
12 | df.to_excel('data.xlsx', index=False)
13 | 


--------------------------------------------------------------------------------
/Chapter2/output_file_size_compare.R:
--------------------------------------------------------------------------------
1 | writexl::write_xlsx(iris, tmp1 <- tempfile())
2 | file.info(tmp1)$size
3 | 
4 | openxlsx::write.xlsx(iris, tmp2 <- tempfile())
5 | file.info(tmp2)$size
6 | 
7 | xlsx::write.xlsx(iris, tmp3 <- paste0(tempfile(),".xlsx"))
8 | file.info(tmp3)$size
9 | 


--------------------------------------------------------------------------------
/Chapter7/ch7_tables_with_R.R:
--------------------------------------------------------------------------------
 1 | # Convert the dataset to a data frame
 2 | df <- as.data.frame(UCBAdmissions)
 3 | # Create a contingency table using xtabs()
 4 | xtabs(Freq ~ Gender + Admit, df)
 5 | 
 6 | # The gt package (attach it after a first-time install; a failed require() leaves it unloaded)
 7 | if(!require(gt)){install.packages("gt", dependencies = TRUE); library(gt)}
 8 | library(dplyr)
 9 | library(tibble)
10 | 
11 | tab <- mtcars |>
12 |   rownames_to_column() |>
13 |   arrange(factor(cyl), mpg) |>
14 |   group_by(cyl) |>
15 |   slice(1:3) |>
16 |   gt() 
17 | 
18 | tab <- tab |>
19 |   tab_spanner(
20 |     label = "Performance",
21 |     columns = c(mpg, disp, hp, drat, wt, qsec)
22 |   ) 
23 | 
24 | tab <- tab |>
25 |   tab_spanner(
26 |     label = "Specs",
27 |     columns = c(vs, am, gear, carb)
28 |   ) 
29 | 
30 | tab <- tab |>
31 |   tab_header(
32 |     title = md("The Cars of **mtcars**"),
33 |     subtitle = "These are some fine automobiles"
34 |   )
35 | 
36 | tab
37 | 
38 | # pivot_table() with tidyquant
39 | library(tidyquant)
40 | library(purrr)
41 | 
42 | pivot_table(.data = iris,
43 |             .rows = ~ Species,
44 |             .values = c(~ mean(Sepal.Length), ~ mean(Sepal.Width))) |>
45 |   set_names("Species","Mean_Sepal_Length","Mean_Sepal_Width")
46 | 


--------------------------------------------------------------------------------
/Chapter8/ch8.R:
--------------------------------------------------------------------------------
 1 | # The skimr package
 2 | if(!require(skimr)){install.packages("skimr")}
 3 | library(skimr)
 4 | skim(iris)
 5 | 
 6 | if(!require(TidyDensity)){install.packages("TidyDensity"); library(TidyDensity)} # attach after a first-time install
 7 | tidy_normal() |> skim()
 8 | 
 9 | if(!require(GGally)){install.packages("GGally")}
10 | library(GGally)
11 | library(TidyDensity)
12 | tidy_normal(.n = 200) |> 
13 |   ggpairs(columns = c("y","p","q","dx","dy"))
14 | 
15 | if(!require(DataExplorer)){install.packages("DataExplorer")}
16 | library(DataExplorer)
17 | library(TidyDensity)
18 | library(dplyr)
19 | 
20 | df <- tidy_normal(.n = 200)
21 | 
22 | df |>
23 |   introduce() |>
24 |   glimpse()
25 | 
26 | df |>
27 |   plot_intro() +
28 |   theme_minimal()
29 | 
30 | df |> 
31 |   plot_qq()
32 | 
33 | df[c("q","y")] |> 
34 |   plot_qq()
35 | 


--------------------------------------------------------------------------------
/Chapter9/ch9_linear_reg.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Library Load ------------------------------------------------------------
 3 | 
 4 | library(readxl)
 5 | 
 6 | 
 7 | # Get Data ----------------------------------------------------------------
 8 | 
 9 | df <- read_xlsx(
10 |   path = "Chapter1/iris_data.xlsx",  # directory name spelled as on disk (matters on case-sensitive file systems)
11 |   sheet = "iris"
12 | )
13 | 
14 | head(df)
15 | 
16 | # Split the dataset by species
17 | iris_split <- split(df, df$species)
18 | 
19 | # Define the dependent variable and independent variables
20 | dependent_variable <- "petal_length"
21 | independent_variables <- c("petal_width", "sepal_length", "sepal_width")
22 | f_x <- formula(
23 |   paste(
24 |     dependent_variable, 
25 |     "~", 
26 |     paste(
27 |       independent_variables, 
28 |       collapse = " + "
29 |       )
30 |     )
31 |   )
32 | 
33 | # Create a function to perform linear regression on each subset
34 | perform_linear_regression <- function(data) {
35 |   # Fit the shared formula f_x to the supplied data and return the lm object
36 |   lm(f_x, data = data)
37 | }
38 | 
39 | # Apply the linear regression to each subset using lapply
40 | results <- lapply(iris_split, perform_linear_regression)
41 | 
42 | # Get the summary of each linear model
43 | lapply(results, summary)
44 | 
45 | # Plot the model performance
46 | par(mfrow = c(2,2))
47 | lapply(results, plot)
48 | par(mfrow = c(1, 1))
49 | 
50 | # The above can also be rewritten as follows
51 | # Fit a linear model for each species
52 | lm_models <- lapply(iris_split, function(df) lm(f_x, data = df))
53 | 
54 | # Summarize the results
55 | lapply(lm_models, summary)
56 | 


--------------------------------------------------------------------------------
/Chapter9/ch9_linear_reg_tidymodels.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Library Load ------------------------------------------------------------
 3 | 
 4 | library(readxl)
 5 | library(tidymodels)
 6 | library(purrr)
 7 | library(performance)
 8 | 
 9 | 
10 | # Get Data ----------------------------------------------------------------
11 | 
12 | df <- read_xlsx(
13 |   path = "chapter1/iris_data.xlsx",
14 |   sheet = "iris"
15 | )
16 | 
17 | # Split the data by Species -----------------------------------------------
18 | 
19 | iris_list <- split(df, df$species)
20 | 
21 | # Specify the Model -------------------------------------------------------
22 | 
23 | # Linear regression specification fitted with the "lm" engine
24 | lm_model <- linear_reg(mode = "regression", engine = "lm")
25 | 
26 | # Define Formula ----------------------------------------------------------
27 | 
28 | # petal_width modelled on petal_length, sepal_width and sepal_length
29 | f_x <- formula(paste("petal_width", "~", "petal_length + sepal_width + sepal_length"))
30 | 
31 | # Perform Linear Regression using purrr -----------------------------------
32 | # Reuse the specification defined above instead of building an identical copy
33 | lm_mod <- lm_model
34 | 
35 | # Make the workflow
36 | wf <- workflow() |>
37 |   add_model(lm_mod)
38 | 
39 | # Fit the workflow to one data frame and return the fitted workflow.
40 | # The recipe is created inside the function, so it is built from the same
41 | # data frame that the workflow is then fitted on.
42 | lm_fit_list <- function(df) {
43 |   # create recipe
44 |   recipe_train <- recipe(f_x, data = df) |>
45 |     step_normalize(all_predictors())
46 | 
47 |   # fit workflow on the data
48 |   fit_wf <- wf |>
49 |     add_recipe(recipe_train) |>
50 |     fit(data = df)
51 | 
52 |   fit_wf
53 | }
54 | 
55 | # Map the linear model ----------------------------------------------------
56 | 
57 | # One fitted workflow per species
58 | model_list <- map(iris_list, lm_fit_list)
59 | lapply(model_list, tidy)
60 | lapply(model_list, glance)
61 | 
62 | # Check the Model
63 | model_list |>
64 |   map(extract_fit_engine) |>
65 |   map(check_model)
66 | 
67 | # Alternate Nested Method
68 | # Seed the RNG so the random 80/20 train/test split is reproducible
69 | set.seed(123)
70 | 
71 | nested_lm <- df |>
72 |   nest(data = -species) |>
73 |   mutate(split = map(data, ~ initial_split(., prop = 8/10)),
74 |          train = map(split, ~ training(.)),
75 |          test  = map(split, ~ testing(.)),
76 |          fit   = map(train, ~ lm(f_x, data = .)),
77 |          pred  = map2(.x = fit, .y = test, ~ predict(object = .x, newdata = .y)))
78 | 
79 | # One row per test-set prediction, labelled by species
80 | nested_lm |>
81 |   select(species, pred) |>
82 |   unnest(pred)


--------------------------------------------------------------------------------
/Chapter9/ch9_logistic_reg.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Library Load ------------------------------------------------------------
 3 | 
 4 | 
 5 | library(tidyverse)
 6 | 
 7 | # Expand the Titanic contingency table: each row is repeated Freq times
 8 | df <- uncount(as.data.frame(Titanic), Freq)
 9 | 
10 | 
11 | # Splits ------------------------------------------------------------------
12 | 
13 | # Draw 80% of the row indices at random for training; the rest form the test set
14 | set.seed(123)
15 | train_index <- sample(
16 |   nrow(df),
17 |   size = floor(nrow(df) * 0.8),
18 |   replace = FALSE
19 | )
20 | train <- df[train_index, ]
21 | test <- df[-train_index, ]
22 | 
23 | # Train a model -----------------------------------------------------------
24 | 
25 | # Logistic regression of Survived on Sex, Age and Class
26 | model <- glm(
27 |   Survived ~ Sex + Age + Class,
28 |   data = train,
29 |   family = "binomial"
30 | )
31 | 
32 | # Predict -----------------------------------------------------------------
33 | 
34 | # Predicted probabilities on the test set; values above 0.5 are labelled "Yes"
35 | predictions <- predict(model, newdata = test, type = "response")
36 | pred_resp <- ifelse(predictions > 0.5, "Yes", "No")
37 | 
38 | # Share of test rows whose predicted label matches the observed one
39 | accuracy <- mean(test$Survived == pred_resp)
40 | 
41 | # Print the accuracy of the model
42 | print(accuracy)
43 | 
44 | # Print the confusion matrix
45 | table(pred_resp, test$Survived)
46 | 


--------------------------------------------------------------------------------
/Chapter9/ch9_logistic_reg_tidymodels.R:
--------------------------------------------------------------------------------
 1 | # Library Load ------------------------------------------------------------
 2 | 
 3 | library(tidymodels)
 4 | library(healthyR.ai)
 5 | 
 6 | # Tibble version of Titanic: rows repeated n times, character columns turned
 7 | # into factors
 8 | df <- Titanic |>
 9 |   as_tibble() |>
10 |   uncount(n) |>
11 |   mutate(across(where(is.character), as.factor))
12 | 
13 | # Splits ------------------------------------------------------------------
14 | 
15 | # Fixed seed so the random split is reproducible
16 | set.seed(123)
17 | 
18 | # 80% of the rows go to training, the rest to testing
19 | split <- initial_split(df, prop = 0.8)
20 | train <- training(split)
21 | test <- testing(split)
22 | 
23 | # Train a model -----------------------------------------------------------
24 | 
25 | # Recipe: Survived as outcome; Sex, Age and Class as predictors
26 | recipe <- recipe(Survived ~ Sex + Age + Class, data = train)
27 | 
28 | # Logistic regression fitted through the glm engine
29 | log_reg <- set_engine(logistic_reg(), "glm", family = "binomial")
30 | 
31 | # Bundle the recipe and the model specification into one workflow
32 | workflow <- workflow() |>
33 |   add_recipe(recipe) |>
34 |   add_model(log_reg)
35 | 
36 | # Fit the workflow on the training data
37 | fit <- workflow |>
38 |   fit(data = train)
39 | 
40 | # Predict -----------------------------------------------------------------
41 | 
42 | # Class predictions for the test set, bound to the test columns
43 | predictions <- fit |>
44 |   predict(new_data = test) |>
45 |   bind_cols(test) |>
46 |   select(Class:Survived, .pred_class)
47 | 
48 | # Better method: augment() adds the prediction columns to the test data
49 | pred_fit_tbl <- augment(fit, new_data = test)
50 | 
51 | # Accuracy Check ----------------------------------------------------------
52 | 
53 | # Default classification metric set from the healthyR.ai package
54 | perf <- hai_default_classification_metric_set()
55 | 
56 | # Score the predictions against the observed outcome
57 | perf(pred_fit_tbl, truth = Survived, estimate = .pred_class)
58 | 
59 | # Print the confusion matrix
60 | conf_mat(predictions, truth = Survived, estimate = .pred_class)
61 | 
62 | # Use broom to tidy and glance the fitted model
63 | tidy(fit, exponentiate = TRUE, conf.int = TRUE)
64 | glance(fit)
65 | 
66 | # Visualize ---------------------------------------------------------------
67 | 
68 | # ROC curve, treating the second factor level of Survived as the event
69 | autoplot(
70 |   roc_curve(pred_fit_tbl, truth = Survived, .pred_Yes, event_level = "second")
71 | )


--------------------------------------------------------------------------------
/Extending-Excel-with-Python-and-R.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/GroupingExample.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/GroupingExample.xlsx


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Packt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <p align="center"><a href="https://packt.link/mlsumgh"><img src="https://static.packt-cdn.com/assets/images/ML Summit Banner v3 1200x627.png" alt="Machine Learning Summit 2025"/></a></p>
 2 | 
 3 | ## Machine Learning Summit 2025
 4 | **Bridging Theory and Practice: ML Solutions for Today’s Challenges**
 5 | 
 6 | 3 days, 20+ experts, and 25+ tech sessions and talks covering critical aspects of:
 7 | - **Agentic and Generative AI**
 8 | - **Applied Machine Learning in the Real World**
 9 | - **ML Engineering and Optimization**
10 | 
11 | 👉 [Book your ticket now >>](https://packt.link/mlsumgh)
12 | 
13 | ---
14 | 
15 | ## Join Our Newsletters 📬
16 | 
17 | ### DataPro  
18 | *The future of AI is unfolding. Don’t fall behind.*
19 | 
20 | <p><a href="https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes"><img src="https://static.packt-cdn.com/assets/images/DataPro NL QR Code.png" alt="DataPro QR" width="150"/></a></p>
21 | 
22 | Stay ahead with [**DataPro**](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes), the free weekly newsletter for data scientists, AI/ML researchers, and data engineers.  
23 | From trending tools like **PyTorch**, **scikit-learn**, **XGBoost**, and **BentoML** to hands-on insights on **database optimization** and real-world **ML workflows**, you’ll get what matters, fast.
24 | 
25 | > Stay sharp with [DataPro](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes). Join **115K+ data professionals** who never miss a beat.
26 | 
27 | ---
28 | 
29 | ### BIPro  
30 | *Business runs on data. Make sure yours tells the right story.*
31 | 
32 | <p><a href="https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes"><img src="https://static.packt-cdn.com/assets/images/BIPro NL QR Code.png" alt="BIPro QR" width="150"/></a></p>
33 | 
34 | [**BIPro**](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes) is your free weekly newsletter for BI professionals, analysts, and data leaders.  
35 | Get practical tips on **dashboarding**, **data visualization**, and **analytics strategy** with tools like **Power BI**, **Tableau**, **Looker**, **SQL**, and **dbt**.
36 | 
37 | > Get smarter with [BIPro](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes). Trusted by **35K+ BI professionals**, see what you’re missing.
38 | 
39 | # Extending Excel with Python and R
40 | 
41 | <a href="https://www.packtpub.com/product/extending-excel-with-python-and-r/9781804610695"><img src="https://m.media-amazon.com/images/I/61L8WSsuu3L._SL1360_.jpg" alt="Extending Excel with Python and R" height="256px" align="right"></a>
42 | 
43 | This is the code repository for [Extending Excel with Python and R](https://www.packtpub.com/product/extending-excel-with-python-and-r/9781804610695), published by Packt.
44 | 
45 | **Unlock the potential of analytics languages for advanced data manipulation and visualization**
46 | 
47 | ## What is this book about?
48 | 
49 | For businesses, data analysis and visualization are crucial for informed decision-making; however, Excel’s limitations can make these tasks time-consuming and challenging. Extending Excel with Python and R is a game-changing resource, written by experts Steven Sanderson, the author of the healthyverse suite of R packages, and David Kun, co-founder of Functional Analytics, the company behind the ownR platform engineering solution for R, Python, and other data science languages.
50 | 
51 | This book covers the following exciting features: 
52 | * Read and write Excel files with R and Python libraries
53 | * Automate Excel tasks with R and Python scripts
54 | * Use R and Python to execute Excel VBA macros
55 | * Format Excel sheets using R and Python packages
56 | * Create graphs with ggplot2 and Matplotlib in Excel
57 | * Analyze Excel data with statistical methods and time series analysis
58 | * Explore various methods to call R and Python functions from Excel
59 | 
60 | If you feel this book is for you, get your [copy](https://www.amazon.com/Extending-Excel-Python-manipulation-visualization/dp/1804610690/ref=sr_1_1?sr=8-1) today!
61 | 
62 | 
63 | ## Instructions and Navigations
64 | All of the code is organized into folders.
65 | 
66 | The code will look like the following:
67 | ```
68 | install.packages("devtools")
69 | # Install development version from GitHub
70 | devtools::install_github(
71 | 'R-package/styledTables',
72 | build_vignettes = TRUE
73 | )
74 | ```
75 | 
76 | **Following is what you need for this book:**
77 | If you’re a data analyst, data scientist, quant, actuary, or other data practitioner looking to enhance your Excel skills and expand your data analysis capabilities with R and Python, this book is for you. It provides a comprehensive introduction to the topics covered, making it suitable for both beginners and intermediate learners. A basic understanding of Excel, Python, and R is all you need to get started.
78 | 
79 | With the following software and hardware list you can run all code files present in the book (Chapter 1-12).
80 | 
81 | ### Software and Hardware List
82 | 
83 | | Chapter  | Software required                                                                    | OS required                        |
84 | | -------- | -------------------------------------------------------------------------------------| -----------------------------------|
85 | |  	1-12	   |  R | Windows (for the VBA parts), macOS, or Linux (for all content excluding VBA) 		|
86 | |  	1-12	   |  Python 3.11 | |
87 | |  	1-12	   | Excel (including VBA) | |
88 | 
89 | ### Related products <Other books you may enjoy>
90 | * Data Modeling with Microsoft Excel [[Packt]](https://www.packtpub.com/product/data-modeling-with-microsoft-excel/9781803240282) [[Amazon]](https://www.amazon.com/Data-Modeling-Microsoft-Excel-comprehensive/dp/1803240288/ref=sr_1_1?sr=8-1)
91 |   
92 | * Building Interactive Dashboards in Microsoft 365 Excel  [[Packt]](https://www.packtpub.com/product/building-interactive-dashboards-in-microsoft-365-excel/9781803237299) [[Amazon]](https://www.amazon.com/Building-Interactive-Dashboards-Microsoft-Excel/dp/1803237295/ref=sr_1_1?sr=8-1)
93 |   
94 | ## Get to Know the Authors
95 | **Steven Sanderson** has been working in healthcare for almost 20 years with a focus in the last 12 years on analytics. Steve has spent those years working on dashboards, automations, and visualizations for clinical, finance and IT operations. Steven is also the author of the healthyverse suite of R packages which are in active development. Steven received his MPH from Stony Brook University School of Medicine Graduate Program in Public Health.
96 | 
97 | **David Kun** is the co-founder of Functional Analytics, the company behind the ownR platform engineering solution for R, Python and other data science languages. He is a qualified Actuary with two MSc’s concentrated on Mathematics. He has been using R since his MSc thesis in 2006 and Python since 2018.
98 | 
99 | 


--------------------------------------------------------------------------------
/aligned_table_openpyxl.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/aligned_table_openpyxl.xlsx


--------------------------------------------------------------------------------
/aligned_table_pandas.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/aligned_table_pandas.xlsx


--------------------------------------------------------------------------------
/chapter6/ch6_barplot.R:
--------------------------------------------------------------------------------
 1 | library(healthyR.data)
 2 | library(healthyR)
 3 | library(ggplot2)
 4 | library(dplyr)
 5 | library(forcats)
 6 | library(purrr)
 7 | 
 8 | # Build the plotting data: drop the '?' payer grouping, then pass
 9 | # payer_grouping and ip_op_flag to category_counts_tbl() (the plot below uses
10 | # its n column).
11 | df <- healthyR_data |>
12 |   filter(payer_grouping != '?') |>
13 |   category_counts_tbl(
14 |     .count_col = payer_grouping
15 |     , .arrange = TRUE
16 |     , ip_op_flag
17 |   ) |>
18 |   group_by(ip_op_flag) |>
19 |   # order_var is "<zero-padded rank of n> - <payer_grouping>"; it is the
20 |   # x variable of the plot, and rank is computed within each ip_op_flag
21 |   mutate(order_var = paste0(
22 |     sprintf("%02i", as.integer(rank(n))),
23 |     " - ",
24 |     payer_grouping
25 |     )) |>
26 |   ungroup()
27 | 
28 | # Flipped bar chart of n per payer grouping, one facet per ip_op_flag
29 | ggplot(df, aes(x = order_var, y = n)) +
30 |   geom_col(alpha = 0.328) +
31 |   labs(x = "", y = "") +
32 |   theme(legend.position = "none") +
33 |   # `scales` is spelled out in full; `scale =` only worked through R's
34 |   # partial argument matching
35 |   facet_wrap(~ ip_op_flag, scales = "free") +
36 |   # Label the ticks with the plain payer_grouping instead of order_var
37 |   scale_x_discrete(labels =  with(df, as.character(payer_grouping) |> 
38 |                                     set_names(order_var))) +
39 |   xlab(NULL) +
40 |   theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) +
41 |   coord_flip() +
42 |   # NOTE(review): theme_minimal() is a complete theme, so adding it last
43 |   # appears to replace the theme() settings above -- confirm this is intended
44 |   theme_minimal()
45 | 


--------------------------------------------------------------------------------
/chapter6/ch6_cowplot.R:
--------------------------------------------------------------------------------
 1 | # Install Libraries
 2 | # Only install a package when it is missing, so re-running the script does
 3 | # not download and reinstall it every time
 4 | if (!requireNamespace("ggplot2", quietly = TRUE)) {
 5 |   install.packages("ggplot2")
 6 | }
 7 | if (!requireNamespace("cowplot", quietly = TRUE)) {
 8 |   install.packages("cowplot")
 9 | }
10 | 
11 | # Load required libraries
12 | library(ggplot2)
13 | library(cowplot)
14 | 
15 | # Load the Iris dataset
16 | data(iris)
17 | 
18 | # Create separate histograms for each species, stored in a list keyed by
19 | # species name
20 | histograms <- list()
21 | for (species in unique(iris$Species)) {
22 |   data_subset <- iris[iris$Species == species, ]
23 |   
24 |   histogram <- ggplot(data_subset, aes(x = Sepal.Width)) +
25 |     geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") +
26 |     labs(title = paste("Sepal Width Histogram for", species)) +
27 |     labs(x = "", y = "") +
28 |     theme_minimal()
29 |   
30 |   histograms[[species]] <- histogram
31 | }
32 | 
33 | # Create histogram for all species combined
34 | all_species_hist <- ggplot(iris, aes(x = Sepal.Width)) +
35 |   geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") +
36 |   labs(title = "Sepal Width Histogram for All Species") +
37 |   theme_minimal()
38 | 
39 | # Arrange histograms using cowplot: three species plots plus the combined
40 | # plot on a two-column grid
41 | plot_grid(
42 |   histograms[["setosa"]], 
43 |   histograms[["versicolor"]], 
44 |   histograms[["virginica"]], 
45 |   all_species_hist,
46 |   ncol = 2,
47 |   align = "hv"
48 | )
49 | 
50 | # Same per-species histograms built with lapply instead of a for loop.
51 | # Note: this overwrites `histograms` with an unnamed list, so the
52 | # name-based lookups used above no longer apply after this point.
53 | histograms <- lapply(unique(iris$Species), function(species) {
54 |   data_subset <- iris[iris$Species == species, ]
55 |   
56 |   histogram <- ggplot(data_subset, aes(x = Sepal.Width)) +
57 |     geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") +
58 |     labs(title = paste("Sepal Width Histogram for", species)) +
59 |     labs(x = "", y = "") +
60 |     theme_minimal()
61 |   
62 |   return(histogram)
63 | })
64 | 
65 | histograms
66 | 


--------------------------------------------------------------------------------
/chapter6/ch6_dumbell_plot.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | library(dplyr)
 3 | 
 4 | # Sample data
 5 | data <- data.frame(
 6 |   Category = c("A", "B", "C", "D"),
 7 |   Initial = c(10, 15, 8, 12),
 8 |   Final = c(18, 22, 14, 16)
 9 | )
10 | 
11 | # Calculate the midpoint for positioning the dots and lines
12 | data <- data %>%
13 |   mutate(Midpoint = (Initial + Final) / 2)
14 | 
15 | # Create the dumbbell plot using ggplot2
16 | # The base mapping supplies x/xend and y/yend for the connecting segment;
17 | # later layers override y where they draw a different column.
18 | dumbbell_plot <- ggplot(data, aes(x = Category, xend = Category, 
19 |                                   y = Initial, yend = Final)) +
20 |   geom_segment(color = "gray50") +  # Lines connecting dots
21 |   geom_point(color = "blue", size = 3) +  # Initial values
22 |   geom_point(aes(y = Final), color = "orange", size = 3) +  # Final values
23 |   geom_point(aes(y = Midpoint), color = "green", size = 3) + # Midpoint Values
24 |   # Map y inside aes() like the other layers, rather than passing
25 |   # data$Midpoint as a constant that reaches back into the global data frame
26 |   geom_text(aes(y = Midpoint, label = Midpoint), 
27 |             vjust = -.5, size = 3) +  # Midpoint labels
28 |   labs(title = "Dumbbell Plot",
29 |        x = "Category",
30 |        y = "Values") +
31 |   theme_minimal()
32 | 
33 | # Print the plot
34 | dumbbell_plot
35 | 


--------------------------------------------------------------------------------
/chapter6/ch6_ggplot2.R:
--------------------------------------------------------------------------------
 1 | # Only install ggplot2 when it is missing, so re-running the script does not
 2 | # reinstall it every time
 3 | if (!requireNamespace("ggplot2", quietly = TRUE)) {
 4 |   install.packages("ggplot2")
 5 | }
 6 | library(ggplot2)
 7 | 
 8 | # Make a histogram of the sepal width for all species
 9 | hist(iris$Sepal.Width)
10 | 
11 | # Make a histogram of the sepal width for each species
12 | # 2 x 2 grid: one panel per species plus one for all species combined
13 | par(mfrow = c(2,2))
14 | for (species in unique(iris$Species)) {
15 |   hist(iris$Sepal.Width[iris$Species == species], main = species,
16 |        xlab = species)
17 | }
18 | hist(iris$Sepal.Width, main = "All Species")
19 | par(mfrow = c(1,1))  # restore the single-panel layout
20 | 
21 | # Make a histogram of the sepal width for all species
22 | # bins = 30 is the value geom_histogram() falls back to when none is given;
23 | # stating it explicitly avoids the "pick better value" message
24 | iris |>
25 | ggplot(aes(x = Sepal.Width)) + 
26 |   geom_histogram(alpha = 0.328, bins = 30) +
27 |   theme_minimal()
28 | 
29 | # Make a histogram of the sepal width for each species
30 | iris |>
31 | ggplot(aes(x = Sepal.Width, fill = Species)) + 
32 |   geom_histogram(alpha = 0.328, bins = 30) +
33 |   theme_minimal()
34 | 
35 | # Make a histogram of the sepal width for each species and facet them
36 | iris |>
37 |   ggplot(aes(x = Sepal.Width, fill = Species)) +
38 |   geom_histogram(alpha = 0.328, bins = 30) +
39 |   facet_wrap(~ Species, scales = "free") +
40 |   theme_minimal()
41 | 


--------------------------------------------------------------------------------
/chapter6/ch6_timeseries.R:
--------------------------------------------------------------------------------
1 | # Base R: plot the raw series, then its decomposition
2 | plot.ts(AirPassengers)
3 | AirPassengers |> decompose() |> plot()
4 | 
5 | library(healthyR.ts)
6 | 
7 | # Generate Brownian motion data and plot it using its t and y columns
8 | ts_brownian_motion_plot(ts_brownian_motion(), t, y)
9 | 


--------------------------------------------------------------------------------
/chapter6/imgs/payergroup_barplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/chapter6/imgs/payergroup_barplot.png


--------------------------------------------------------------------------------
/colored_table_openpyxl.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/colored_table_openpyxl.xlsx


--------------------------------------------------------------------------------
/colored_table_pandas.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/colored_table_pandas.xlsx


--------------------------------------------------------------------------------
/conditional_formatting.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/conditional_formatting.xlsx


--------------------------------------------------------------------------------
/data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/data.xlsx


--------------------------------------------------------------------------------
/dirty_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/dirty_data.xlsx


--------------------------------------------------------------------------------
/example.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/example.xlsx


--------------------------------------------------------------------------------
/heatmap_with_conditional_formatting.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/heatmap_with_conditional_formatting.xlsx


--------------------------------------------------------------------------------
/iris_data.xlsm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/iris_data.xlsm


--------------------------------------------------------------------------------
/iris_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/iris_data.xlsx


--------------------------------------------------------------------------------
/linear_regression_input.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/linear_regression_input.xlsx


--------------------------------------------------------------------------------
/logistic_regression_input.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/logistic_regression_input.xlsx


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/requirements.txt


--------------------------------------------------------------------------------
/requirements.txt.bak:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/requirements.txt.bak


--------------------------------------------------------------------------------
/styled_table_openpyxl.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/styled_table_openpyxl.xlsx


--------------------------------------------------------------------------------
/styled_table_pandas.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/styled_table_pandas.xlsx


--------------------------------------------------------------------------------
/time_series_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/time_series_data.xlsx


--------------------------------------------------------------------------------