├── .Rproj.user └── 69A1DB49 │ ├── pcs │ ├── debug-breakpoints.pper │ ├── files-pane.pper │ ├── source-pane.pper │ ├── windowlayoutstate.pper │ └── workbench-pane.pper │ └── sources │ └── prop │ └── 91DA5C98 ├── .gitignore ├── Chapter 10 ├── airpassengers.xlsx ├── forecasting_lstm.py ├── forecasting_prophet.py ├── forecasting_statsmodel.py ├── imgs │ ├── AirPassengers.png │ ├── acf_plot.png │ ├── auto_arima_calibration_plot.png │ ├── pacf_apts.png │ ├── ts_obj_rnorm_25.png │ └── tuning_grid_plot.png ├── plots.py ├── sample_data.py ├── statistics.py └── time_series.R ├── Chapter 11 ├── call_plumber.R ├── fastapi_add.py ├── imgs │ ├── BERT_Console.png │ ├── BERT_Console_Excell_Addins.png │ ├── BERT_VBA_to_R_density_plot_rnorm.png │ └── RAND.png ├── matrix_multiplication.py ├── matrix_multiplication.xlsm ├── multiply.py ├── multiply.xlsx ├── plumber_api.R └── vba_plumber_curl_request.bas ├── Chapter 3 ├── Sub_MultiplyByRandom.bas ├── executing_VBA.py ├── interacting_Excel_objects.py ├── mult_by_rand_ch3.xlsm ├── retreiving_data.py ├── run_MultByRand_macro.R └── testing_environment.py ├── Chapter 4 ├── apscheduler.py ├── case_study.py ├── get_user_input.R ├── hello_world.R ├── hello_world_schedule.R ├── install_taskscheduleR.R ├── schedule.py └── send_basic_email.py ├── Chapter 5 ├── aligning_text_openpyxl.py ├── aligning_text_pandas.py ├── background_colors_openpyxl.py ├── background_colors_pandas.py ├── conditional_formatting.py ├── font_properties_openpyxl.py ├── font_properties_pandas.py ├── heatmap.py ├── install_ch5_packages.R ├── pivot_table.py ├── using_basictabler.R └── using_styledTables.R ├── Chapter 6 ├── ch6_barplot.R ├── ch6_cowplot.R ├── ch6_dumbell_plot.R ├── ch6_ggplot2.R ├── ch6_timeseries.R ├── insert_image_pywin32.py ├── matplotlib_basics.py ├── matplotlib_customizations.py ├── plotnine_additional_layers.py ├── plotnine_basics.py └── plotnine_customizations.py ├── Chapter 7 ├── create_pivot.py ├── grouping.py └── manipulate_pivot.py ├── Chapter 8 
├── ch8.R ├── clean_data.py ├── create_sample_data.py ├── data_distribution.py ├── relationships.py └── summary_statistics.py ├── Chapter 9 ├── ch9_linear_reg.R ├── ch9_linear_reg_tidymodels.R ├── ch9_logistic_reg.R ├── ch9_logistic_reg_tidymodels.R ├── linear_regression.py └── logistic_regression.py ├── Chapter1 ├── ch1_create_iris_dataset.R ├── ch1_pkgs.R ├── ch1_save_xlsx_as_xlsb.R ├── excel_sheet_reader.R ├── iris_data.xlsb ├── multisheet_openpyxl.py ├── open_excel_openpyxl.py ├── open_excel_pandas.py └── read_xlsx_files.R ├── Chapter12 ├── call_plumber.R ├── imgs │ ├── api_histogram.png │ ├── enter_api_argument.png │ ├── get_api.png │ ├── swagger_plumber_api_screen.png │ └── vba_curl_request.png ├── plumber_api.R └── vba_plumber_curl_request.bas ├── Chapter14 ├── auto_xgb.rar ├── ch14_data.R ├── ch14_diamonds_eda.R ├── ch14_diamonds_modeling.R ├── imgs │ ├── ggplot_diamonds_boxplot.png │ ├── ggplot_diamonds_hex_plot.png │ ├── ggplot_diamonds_hist_by_cut.png │ ├── ggplot_diamonds_mean_price.png │ ├── ggplot_diamonds_mean_price_per_carat.png │ └── hist_default_and_optbin.png └── xgb_wflw_fit.rds ├── Chapter2 ├── adding_sheets.py ├── cell_update.py ├── create_workbook.py ├── deleting_sheet.py ├── excel_write_bench.R ├── export2excel_pandas.py └── output_file_size_compare.R ├── Chapter7 └── ch7_tables_with_R.R ├── Chapter8 └── ch8.R ├── Chapter9 ├── ch9_linear_reg.R ├── ch9_linear_reg_tidymodels.R ├── ch9_logistic_reg.R └── ch9_logistic_reg_tidymodels.R ├── Extending-Excel-with-Python-and-R.Rproj ├── GroupingExample.xlsx ├── LICENSE ├── README.md ├── aligned_table_openpyxl.xlsx ├── aligned_table_pandas.xlsx ├── chapter6 ├── ch6_barplot.R ├── ch6_cowplot.R ├── ch6_dumbell_plot.R ├── ch6_ggplot2.R ├── ch6_timeseries.R └── imgs │ └── payergroup_barplot.png ├── colored_table_openpyxl.xlsx ├── colored_table_pandas.xlsx ├── conditional_formatting.xlsx ├── data.xlsx ├── dirty_data.xlsx ├── example.xlsx ├── heatmap_with_conditional_formatting.xlsx ├── iris_data.xlsm ├── 
iris_data.xlsx ├── linear_regression_input.xlsx ├── logistic_regression_input.xlsx ├── requirements.txt ├── requirements.txt.bak ├── styled_table_openpyxl.xlsx ├── styled_table_pandas.xlsx └── time_series_data.xlsx /.Rproj.user/69A1DB49/pcs/debug-breakpoints.pper: -------------------------------------------------------------------------------- 1 | { 2 | "debugBreakpointsState": { 3 | "breakpoints": [] 4 | } 5 | } -------------------------------------------------------------------------------- /.Rproj.user/69A1DB49/pcs/files-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "sortOrder": [ 3 | { 4 | "columnIndex": 2, 5 | "ascending": true 6 | } 7 | ], 8 | "path": "C:/Users/steve/Documents/GitHub/Extending-Excel-with-Python-and-R/Chapter 10" 9 | } -------------------------------------------------------------------------------- /.Rproj.user/69A1DB49/pcs/source-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "activeTab": 2 3 | } -------------------------------------------------------------------------------- /.Rproj.user/69A1DB49/pcs/windowlayoutstate.pper: -------------------------------------------------------------------------------- 1 | { 2 | "left": { 3 | "splitterpos": 417, 4 | "topwindowstate": "NORMAL", 5 | "panelheight": 822, 6 | "windowheight": 860 7 | }, 8 | "right": { 9 | "splitterpos": 520, 10 | "topwindowstate": "NORMAL", 11 | "panelheight": 822, 12 | "windowheight": 860 13 | } 14 | } -------------------------------------------------------------------------------- /.Rproj.user/69A1DB49/pcs/workbench-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "TabSet1": 0, 3 | "TabSet2": 0, 4 | "TabZoom": {} 5 | } -------------------------------------------------------------------------------- /.Rproj.user/69A1DB49/sources/prop/91DA5C98: 
def create_dataset(dataset, look_back=1):
    """Build sliding-window samples and next-step labels from column 0.

    Sample ``X[i]`` holds the ``look_back`` consecutive values that start at
    row ``i`` of ``dataset``; its label ``Y[i]`` is the value that follows
    that window.

    Args:
        dataset: 2-D array-like; only its first column is read.
        look_back: Number of consecutive values in each input window.
            Must not be negative.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays. ``X`` has shape
        ``(n_samples, look_back)`` and ``Y`` has shape ``(n_samples,)``,
        where ``n_samples = max(len(dataset) - look_back, 0)``.

    Raises:
        ValueError: If ``look_back`` is negative.
    """
    # A negative window length would make the slices wrap around the end of
    # the series and silently produce meaningless samples.
    if look_back < 0:
        raise ValueError("look_back must be non-negative")

    series = np.asarray(dataset)[:, 0]
    n_samples = max(len(series) - look_back, 0)

    # Row i of the index grid is [i, i + 1, ..., i + look_back - 1]. Building
    # X by fancy indexing keeps it two-dimensional even when there are no
    # samples, so callers can always rely on X.shape[1] == look_back.
    window_idx = np.arange(n_samples)[:, None] + np.arange(look_back)[None, :]
    X = series[window_idx]

    # Copy so the labels do not alias the caller's array.
    Y = series[look_back:look_back + n_samples].copy()
    return X, Y
-------------------------------------------------------------------------------- /Chapter 10/forecasting_prophet.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import pandas as pd 3 | from prophet import Prophet 4 | from prophet.plot import plot 5 | 6 | # Load the time series data (replace with your data) 7 | time_series_data = pd.read_excel('time_series_data.xlsx') 8 | 9 | # Create a DataFrame with 'ds' and 'y' columns 10 | df = pd.DataFrame({'ds': time_series_data['Date'], 'y': time_series_data['Value']}) 11 | 12 | # Initialize and fit the Prophet model without weekly seasonality 13 | model = Prophet(weekly_seasonality=False) 14 | 15 | # Add custom seasonality obtained from domain knowledge (in this case: we generated the data so) 16 | model.add_seasonality(name='custom_season', period=365, fourier_order=5) 17 | 18 | # Fit the customized model 19 | model.fit(df) 20 | 21 | # Create a dataframe for future dates 22 | forecast_steps = 150 # Adjust the number of forecast steps as needed 23 | future = model.make_future_dataframe(periods=forecast_steps, freq='D') 24 | 25 | # Make predictions 26 | forecast = model.predict(future) 27 | 28 | # Plot the forecast 29 | fig = model.plot(forecast) 30 | 31 | fig.show() 32 | 33 | # Plot components of the forecast (trend, yearly, and weekly seasonality) 34 | fig2 = model.plot_components(forecast) 35 | 36 | fig2.show() 37 | -------------------------------------------------------------------------------- /Chapter 10/forecasting_statsmodel.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import pandas as pd 3 | import numpy as np 4 | import statsmodels.api as sm 5 | from scipy.stats import norm 6 | import matplotlib.pyplot as plt 7 | 8 | # Load the time series data (replace with your data) 9 | time_series_data = pd.read_excel('time_series_data.xlsx')['Value'] 10 | 11 | # Perform the 
Augmented Dickey-Fuller test to check for stationarity 12 | result = sm.tsa.adfuller(time_series_data, autolag='AIC') 13 | 14 | # If the p-value is greater than a threshold (e.g., 0.05), perform differencing to make the data stationary 15 | if result[1] > 0.05: 16 | differenced_data = np.diff(time_series_data, n=1) 17 | else: 18 | differenced_data = time_series_data 19 | 20 | # Build an ARIMA model 21 | order = (1, 1, 1) # Replace with appropriate values based on ACF and PACF analysis 22 | model = sm.tsa.ARIMA(differenced_data, order=order) 23 | 24 | # Fit the ARIMA model 25 | model_fit = model.fit() 26 | 27 | # Make forecasts 28 | forecast_steps = 50 # Adjust the number of forecast steps as needed 29 | forecast = model_fit.forecast(steps=forecast_steps) 30 | 31 | # If the p-value is greater than a threshold (e.g., 0.05), perform differencing to make the data stationary 32 | if result[1] > 0.05: 33 | # The model was trained on the differenced data so the forecasts have to be added to the last data point 34 | cumsum_forecasts = np.cumsum(forecast) 35 | 36 | # Add this cumulative sum to the last observed value in your raw data 37 | real_forecasts = cumsum_forecasts + time_series_data[len(time_series_data)-1] 38 | 39 | else: 40 | real_forecasts = forecast 41 | 42 | # Retrieve ARIMA model parameters 43 | params = model_fit.params 44 | p, d, q = order 45 | resid = model_fit.resid 46 | 47 | # Compute the standard errors 48 | stderr = np.std(resid) 49 | 50 | # Calculate the confidence intervals 51 | z_score = norm.ppf(0.975) # For a 95% confidence interval 52 | conf_int = np.column_stack((real_forecasts - z_score * stderr, real_forecasts + z_score * stderr)) 53 | 54 | # Separate the forecasts into point forecasts and confidence intervals 55 | point_forecasts = real_forecasts # The point forecasts 56 | forecast_stderr = stderr # The standard errors of the forecasts 57 | lower_bound = conf_int[:, 0] # Lower confidence interval bounds 58 | upper_bound = conf_int[:, 1] # 
Upper confidence interval bounds 59 | 60 | # Visualize the original time series and forecasts 61 | plt.figure(figsize=(12, 6)) 62 | plt.plot(time_series_data, label='Original Time Series', color='blue') 63 | plt.plot(range(len(time_series_data), len(time_series_data) + forecast_steps), real_forecasts, label='Forecast', color='red') 64 | plt.fill_between(range(len(time_series_data), len(time_series_data) + forecast_steps), conf_int[:, 0], conf_int[:, 1], color='pink', alpha=0.5) 65 | plt.xlabel('Time Steps') 66 | plt.ylabel('Value') 67 | plt.title('ARIMA Time Series Forecast') 68 | plt.legend() 69 | plt.show() 70 | -------------------------------------------------------------------------------- /Chapter 10/imgs/AirPassengers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/AirPassengers.png -------------------------------------------------------------------------------- /Chapter 10/imgs/acf_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/acf_plot.png -------------------------------------------------------------------------------- /Chapter 10/imgs/auto_arima_calibration_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/auto_arima_calibration_plot.png -------------------------------------------------------------------------------- /Chapter 10/imgs/pacf_apts.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/pacf_apts.png -------------------------------------------------------------------------------- /Chapter 10/imgs/ts_obj_rnorm_25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/ts_obj_rnorm_25.png -------------------------------------------------------------------------------- /Chapter 10/imgs/tuning_grid_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter 10/imgs/tuning_grid_plot.png -------------------------------------------------------------------------------- /Chapter 10/plots.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import statsmodels.api as sm 5 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 6 | 7 | # Load time series data (replace 'time_series_data.xlsx' with your data file) 8 | data = pd.read_excel('time_series_data.xlsx') 9 | 10 | # Convert the 'Date' column to datetime format and set it as the index 11 | data['Date'] = pd.to_datetime(data['Date']) 12 | data.set_index('Date', inplace=True) 13 | 14 | # Plot the time series 15 | plt.figure(figsize=(12, 6)) 16 | plt.plot(data['Value']) 17 | plt.title('Time Series Plot') 18 | plt.xlabel('Date') 19 | plt.ylabel('Value') 20 | plt.grid(True) 21 | plt.show() 22 | 23 | # ACF and PACF plots 24 | fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8)) 25 | 26 | # ACF plot 27 | plot_acf(data['Value'], lags=10, ax=ax1) 28 | ax1.set_title('Autocorrelation Function (ACF)') 29 | 30 | # PACF plot 
31 | plot_pacf(data['Value'], lags=10, ax=ax2) 32 | ax2.set_title('Partial Autocorrelation Function (PACF)') 33 | 34 | plt.tight_layout() 35 | plt.show() 36 | -------------------------------------------------------------------------------- /Chapter 10/sample_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Create a time index 6 | date_rng = pd.date_range(start='2022-01-01', end='2023-12-31', freq='D') 7 | 8 | # Create a trend component 9 | trend = 0.05 * np.arange(len(date_rng)) 10 | 11 | # Create a seasonal component (cyclicality) 12 | seasonal = 2.5 * np.sin(2 * np.pi * np.arange(len(date_rng)) / 365) 13 | 14 | # Add some random noise 15 | noise = np.random.normal(0, 0.5, len(date_rng)) 16 | 17 | # Combine all components to create the time series 18 | time_series = trend + seasonal + noise 19 | 20 | # Create a DataFrame 21 | df = pd.DataFrame({'Date': date_rng, 'Value': time_series}) 22 | 23 | # Save the data to an Excel file 24 | df.to_excel('time_series_data.xlsx', index=False) 25 | 26 | # Read the data back into pandas 27 | loaded_df = pd.read_excel('time_series_data.xlsx') 28 | 29 | # Display the first few rows 30 | print(loaded_df.head()) 31 | -------------------------------------------------------------------------------- /Chapter 10/statistics.py: -------------------------------------------------------------------------------- 1 | # Load the data 2 | import pandas as pd 3 | 4 | # Read the data back into pandas 5 | df = pd.read_excel('time_series_data.xlsx') 6 | 7 | # Augmented Dickey-Fuller Test 8 | 9 | from statsmodels.tsa.stattools import adfuller 10 | 11 | adf_result = adfuller(df['Value']) 12 | print("\nAugmented Dickey-Fuller Test:") 13 | print(f"ADF Statistic: {adf_result[0]}") 14 | print(f"P-value: {adf_result[1]}") 15 | print("Null Hypothesis (H0): Data is non-stationary") 16 | print("Alternative Hypothesis (H1): 
Data is stationary") 17 | 18 | if adf_result[1] <= 0.05: 19 | print("Result: Reject the null hypothesis. Data is stationary.") 20 | else: 21 | print("Result: Fail to reject the null hypothesis. Data is non-stationary.") 22 | 23 | # Time Series Decomposition 24 | 25 | from statsmodels.tsa.seasonal import seasonal_decompose 26 | import matplotlib.pyplot as plt 27 | 28 | decomposition = seasonal_decompose(df['Value'], model='additive', period=365) 29 | trend = decomposition.trend 30 | seasonal = decomposition.seasonal 31 | residual = decomposition.resid 32 | 33 | # Plot the decomposition components 34 | plt.figure(figsize=(12, 8)) 35 | plt.subplot(411) 36 | plt.plot(df['Date'], df['Value'], label='Original') 37 | plt.legend(loc='best') 38 | plt.subplot(412) 39 | plt.plot(df['Date'], trend, label='Trend') 40 | plt.legend(loc='best') 41 | plt.subplot(413) 42 | plt.plot(df['Date'], seasonal, label='Seasonal') 43 | plt.legend(loc='best') 44 | plt.subplot(414) 45 | plt.plot(df['Date'], residual, label='Residual') 46 | plt.legend(loc='best') 47 | plt.suptitle("Time Series Decomposition") 48 | plt.show() 49 | -------------------------------------------------------------------------------- /Chapter 10/time_series.R: -------------------------------------------------------------------------------- 1 | # Generate a Random Time Series 2 | # Set seed to make results reproducible 3 | set.seed(123) 4 | # Generate Random Points using a gaussian distribution with mean 0 and sd = 1 5 | n <- 25 6 | x <- rnorm(n) 7 | head(x) 8 | 9 | # Make x a ts object 10 | ts_obj <- ts(x) 11 | 12 | class(ts_obj) 13 | str(ts_obj) 14 | attributes(ts_obj) 15 | plot(ts_obj) 16 | 17 | # Change Start 18 | ts(x, start = 1980) 19 | ts(x, start = c(1980, 05)) 20 | ts(x, start = 1980, frequency = 12) 21 | ts(x, start = 1980, frequency = 12/3) 22 | # Change End 23 | ts(x, end = 2023) 24 | ts(x, end = 2023, frequency = 12) 25 | ts(x, end = 2023, frequency = 12/3) 26 | 27 | # AirPassengers - Plotting, ACF/PACF 28 | 
library(readxl) 29 | library(writexl) 30 | 31 | # Write Out the AirPassengers dataset to Excel as a data.frame object 32 | write_xlsx(AirPassengers |> as.data.frame(), "./Chapter 10/airpassengers.xlsx") 33 | 34 | # Read the airpassengers.xlsx file in and convert to a ts object starting at 1949 35 | ap_ts <- read_xlsx("./Chapter 10/airpassengers.xlsx") |> 36 | ts(start = 1949, frequency = 12) 37 | 38 | class(ap_ts) 39 | 40 | # Plot the ts object 41 | plot(ap_ts) 42 | 43 | # Decomposition and Visualization 44 | plot(decompose(ap_ts)) 45 | 46 | # P/ACF 47 | acf(ap_ts) 48 | acf(ap_ts, type = "partial") 49 | 50 | # Auto Arima Modeling 51 | library(healthyR.ts) 52 | library(dplyr) 53 | library(timetk) 54 | library(modeltime) 55 | 56 | ap_tbl <- ts_to_tbl(ap_ts) |> 57 | select(-index) 58 | 59 | class(ap_tbl) 60 | 61 | # Time Series Split 62 | splits <- time_series_split( 63 | ap_tbl 64 | , date_col 65 | , assess = 12 66 | , skip = 3 67 | , cumulative = TRUE 68 | ) 69 | 70 | splits 71 | 72 | ts_auto_arima <- ts_auto_arima( 73 | .data = ap_tbl, 74 | .num_cores = 10, 75 | .date_col = date_col, 76 | .value_col = x, 77 | .rsamp_obj = splits, 78 | .formula = x ~ ., 79 | .grid_size = 20, 80 | .cv_slice_limit = 5, 81 | .tune = TRUE 82 | ) 83 | 84 | # Brownian Motion 85 | ts_brownian_motion() 86 | ts_brownian_motion_plot(t, y) 87 | -------------------------------------------------------------------------------- /Chapter 11/call_plumber.R: -------------------------------------------------------------------------------- 1 | # Library Load 2 | library(plumber) 3 | 4 | # Set dir and file path 5 | wd <- getwd() 6 | sub_dir <- paste0("/Chapter 11/") 7 | full_dir <- paste0(wd, sub_dir) 8 | f <- "plumber_api.R" 9 | f_path <- paste0(full_dir, f) 10 | 11 | # Initiate root 12 | root <- pr(f_path) 13 | root 14 | 15 | root |> pr_run() 16 | -------------------------------------------------------------------------------- /Chapter 11/fastapi_add.py: 
@app.get("/api/add")
def add_numbers(
    num1: int = Query(..., description="First number"),
    num2: int = Query(..., description="Second number"),
):
    """Add the two required integer query parameters.

    Registered on ``app`` for ``GET /api/add``. Returns a dictionary with
    the sum stored under the ``"result"`` key.
    """
    return {"result": num1 + num2}
def main():
    """Write the product of cells A1 and B1 into C1 on the first sheet.

    The workbook is the one returned by ``xw.Book.caller()``.
    """
    sheet = xw.Book.caller().sheets[0]
    sheet['C1'].value = sheet['A1'].value * sheet['B1'].value
import os

import win32com.client as win32

# Build the workbook path with os.path.join. The previous
# ``os.getcwd().replace('\'', '\\')`` replaced apostrophes (not slashes) with
# backslashes, which corrupts any working directory containing a "'".
workbook_path = os.path.join(os.getcwd(), "iris_data.xlsm")

excel_app = win32.Dispatch("Excel.Application")
try:
    workbook = excel_app.Workbooks.Open(workbook_path)
    try:
        # Run the VBA macro stored in the opened workbook.
        excel_app.Run("examplePythonVBA")
    except Exception:
        # Do not persist a half-applied macro; close without saving and
        # let the original error propagate.
        workbook.Close(SaveChanges=False)
        raise
    workbook.Close(SaveChanges=True)
finally:
    # Always shut Excel down, otherwise a failed run leaves an Excel process
    # behind.
    excel_app.Quit()
import os

import win32com.client as win32

# Build the workbook path with os.path.join. The previous
# ``os.getcwd().replace('\'', '\\')`` replaced apostrophes (not slashes) with
# backslashes, which corrupts any working directory containing a "'".
workbook_path = os.path.join(os.getcwd(), "iris_data.xlsx")

# Values written to the top-left 3x3 cells of the sheet, row by row.
data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

excel_app = win32.Dispatch("Excel.Application")
try:
    workbook = excel_app.Workbooks.Open(workbook_path)
    try:
        worksheet = workbook.Worksheets("Sheet1")
        # Excel cell coordinates are 1-based, hence start=1 on both axes.
        for row_index, row_data in enumerate(data, start=1):
            for col_index, value in enumerate(row_data, start=1):
                worksheet.Cells(row_index, col_index).Value = value
    except Exception:
        # Do not save a partially written sheet; close and re-raise.
        workbook.Close(SaveChanges=False)
        raise
    workbook.Close(SaveChanges=True)
finally:
    # Always shut Excel down, otherwise a failed run leaves an Excel process
    # behind.
    excel_app.Quit()
"""Case study: run two placeholder tasks and e-mail a status notification."""
import os
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText


def task1():
    """Simulate the data processing for task 1 (placeholder)."""
    print("Task 1 in progress...")
    # ... your code here ...
    print("Task 1 completed successfully")


def task2():
    """Simulate the data processing for task 2 (placeholder)."""
    print("Task 2 in progress...")
    # ... your code here ...
    print("Task 2 completed successfully")


def send_email_notification(task_name, status):
    """Send an e-mail reporting the outcome of a task.

    The sender address, recipient address and SMTP password are read from the
    environment variables ``from_email``, ``to_email`` and ``password``.

    Args:
        task_name: Name of the task, interpolated into subject and body.
        status: Either ``"success"`` or ``"error"``.  For ``"error"`` a
            ``log.txt`` attachment is added to the message.

    Raises:
        ValueError: If ``status`` is neither ``"success"`` nor ``"error"``.
    """
    # Validate up front: previously an unknown status fell through both
    # branches and crashed later with UnboundLocalError on ``subject``.
    if status not in ("success", "error"):
        raise ValueError(
            f"status must be 'success' or 'error', got {status!r}"
        )

    sender_email = os.environ.get("from_email")
    recipient_email = os.environ.get("to_email")

    # Create a multi-part email message
    message = MIMEMultipart()
    message["From"] = sender_email
    message["To"] = recipient_email

    if status == "success":
        subject = f"Task {task_name} completed successfully"
        body = f"The task {task_name} has been completed successfully."
    else:
        subject = f"Error in task {task_name}"
        body = f"There was an error while executing task {task_name}. Please check the log files or attachments for more information."

        # Attach log files or other relevant attachments
        attachment = MIMEText("... attachment content ...")
        attachment.add_header("Content-Disposition", "attachment", filename="log.txt")
        message.attach(attachment)

    message["Subject"] = subject
    message.attach(MIMEText(body, "plain"))

    # Connect to the SMTP server and send the email
    with smtplib.SMTP("smtp.example.com", 587) as server:
        server.starttls()
        server.login(sender_email, os.environ.get("password"))
        server.send_message(message)


# Usage example.  Guarded so that importing this module does not run the
# tasks or open an SMTP connection.
if __name__ == "__main__":
    task1()
    send_email_notification("task1", "success")

    task2()
    send_email_notification("task2", "error")
library(taskscheduleR)

# taskscheduler_create() needs the full path to the script, because the
# scheduled task is not started from this session's working directory.
hello_script <- normalizePath("hello_world.R")

# Create a task scheduler job that runs the script every hour.
# `schedule` takes a Windows Task Scheduler keyword ("HOURLY", "DAILY", ...),
# not a cron expression, and task names should not contain spaces.
# No `starttime` is given, so the package default start time is used.
taskscheduler_create(
  taskname = "hello_world_hourly",
  rscript = hello_script,
  schedule = "HOURLY"
)

# Create a task scheduler job that runs the script once a day at 10:00 AM
taskscheduler_create(
  taskname = "hello_world_daily",
  rscript = hello_script,
  schedule = "DAILY",
  starttime = "10:00"
)
4/send_basic_email.py: -------------------------------------------------------------------------------- 1 | import os 2 | import smtplib 3 | from email.mime.text import MIMEText 4 | from email.mime.multipart import MIMEMultipart 5 | 6 | # Define email server and credentials 7 | smtp_server = 'smtp.gmail.com' 8 | smtp_port = 587 9 | smtp_username = 'your_username' 10 | smtp_password = 'your_password' 11 | 12 | # Create a MIME message 13 | message = MIMEMultipart() 14 | message['From'] = 'sender@example.com' 15 | message['To'] = 'recipient@example.com' 16 | message['Subject'] = 'Test Email' 17 | 18 | # Add the email body 19 | body = MIMEText('This is the email body.') 20 | message.attach(body) 21 | 22 | # Establish a connection with the email server 23 | with smtplib.SMTP(smtp_server, smtp_port) as server: 24 | # Start the TLS encryption 25 | server.starttls() 26 | 27 | # Log in to the email server 28 | server.login(smtp_username, smtp_password) 29 | 30 | # Send the email 31 | server.send_message(message) 32 | -------------------------------------------------------------------------------- /Chapter 5/aligning_text_openpyxl.py: -------------------------------------------------------------------------------- 1 | # OpenPyXL example for aligning text within cells 2 | from openpyxl import Workbook 3 | from openpyxl.styles import Alignment 4 | 5 | wb = Workbook() 6 | ws = wb.active 7 | 8 | # Applying text alignment 9 | alignment = Alignment(horizontal='center', vertical='center') 10 | ws['A1'].alignment = alignment 11 | 12 | ws['A1'] = 'Name' 13 | ws['B1'] = 'Age' 14 | ws['C1'] = 'City' 15 | 16 | wb.save('aligned_table_openpyxl.xlsx') 17 | -------------------------------------------------------------------------------- /Chapter 5/aligning_text_pandas.py: -------------------------------------------------------------------------------- 1 | # Pandas example for aligning text within cells 2 | import pandas as pd 3 | 4 | data = {'Name': ['John', 'Alice', 'Michael'], 5 | 'Age': 
[25, 30, 22], 6 | 'City': ['New York', 'London', 'Paris']} 7 | 8 | df = pd.DataFrame(data) 9 | 10 | # Applying text alignment 11 | alignment_styles = {'text-align': 'center'} 12 | styled_df = df.style.set_properties(subset=['Name', 'Age', 'City'], **alignment_styles) 13 | styled_df.to_excel('aligned_table_pandas.xlsx', index=False) 14 | -------------------------------------------------------------------------------- /Chapter 5/background_colors_openpyxl.py: -------------------------------------------------------------------------------- 1 | # openpyxl example for cell background colors 2 | from openpyxl import Workbook 3 | from openpyxl.styles import PatternFill 4 | 5 | wb = Workbook() 6 | ws = wb.active 7 | 8 | # Applying cell background colors 9 | yellow_fill = PatternFill(start_color='FFFF00', end_color='FFFF00', fill_type='solid') 10 | ws['A1'].fill = yellow_fill 11 | 12 | ws['A1'] = 'Name' 13 | ws['B1'] = 'Age' 14 | ws['C1'] = 'City' 15 | 16 | wb.save('colored_table_openpyxl.xlsx') 17 | -------------------------------------------------------------------------------- /Chapter 5/background_colors_pandas.py: -------------------------------------------------------------------------------- 1 | # Pandas example for cell background colors 2 | import pandas as pd 3 | 4 | data = {'Name': ['John', 'Alice', 'Michael'], 5 | 'Age': [25, 30, 22], 6 | 'City': ['New York', 'London', 'Paris']} 7 | 8 | df = pd.DataFrame(data) 9 | 10 | # Create a styler object 11 | styled_df = df.style 12 | 13 | # Define the style for the cells 14 | styled_df = styled_df.applymap(lambda _: 'background-color: yellow', subset=pd.IndexSlice[0, ['Name', 'Age']]) 15 | 16 | # Save the styled DataFrame to an Excel file 17 | styled_df.to_excel('colored_table_pandas.xlsx', index=False) 18 | -------------------------------------------------------------------------------- /Chapter 5/conditional_formatting.py: -------------------------------------------------------------------------------- 1 | import pandas 
as pd 2 | import openpyxl 3 | from openpyxl.formatting.rule import ColorScaleRule, CellIsRule, FormulaRule 4 | 5 | # Create some sample data 6 | data = {'Name': ['John', 'Alice', 'Michael', 'Emily'], 7 | 'Age': [25, 30, 22, 28], 8 | 'City': ['New York', 'London', 'Paris', 'Sydney'], 9 | 'Sales': [1000, 800, 1200, 900]} 10 | 11 | df = pd.DataFrame(data) 12 | 13 | # Write the DataFrame to a worksheet 14 | df.to_excel("conditional_formatting.xlsx", index=False) 15 | 16 | # Load the workbook 17 | wb = openpyxl.load_workbook('conditional_formatting.xlsx') 18 | ws = wb.active 19 | 20 | # Define conditional formatting rules 21 | red_text_rule = CellIsRule(operator="lessThan", formula=["1000"], stopIfTrue=True, font=openpyxl.styles.Font(color="FF0000")) 22 | ws.conditional_formatting.add(f"D2:D{len(df)+1}", red_text_rule) 23 | 24 | # Define the condition for the green fill color scale 25 | min_sales = min(df['Age']) 26 | max_sales = max(df['Age']) 27 | 28 | green_fill_rule = ColorScaleRule( 29 | start_type='num', start_value=min_sales, start_color='0000FF00', 30 | end_type='num', end_value=max_sales, end_color='00FFFF00') 31 | 32 | ws.conditional_formatting.add(f"B2:B{len(df)+1}", green_fill_rule) 33 | 34 | # Save the Excel workbook 35 | wb.save('conditional_formatting.xlsx') 36 | -------------------------------------------------------------------------------- /Chapter 5/font_properties_openpyxl.py: -------------------------------------------------------------------------------- 1 | # OpenPyXL example for setting font properties 2 | from openpyxl import Workbook 3 | from openpyxl.styles import Font 4 | 5 | wb = Workbook() 6 | ws = wb.active 7 | 8 | # Applying font properties 9 | font = Font(size=14, bold=True, italic=True, color='0000FF') 10 | ws['A1'].font = font 11 | 12 | ws['A1'] = 'Name' 13 | ws['B1'] = 'Age' 14 | ws['C1'] = 'City' 15 | 16 | wb.save('styled_table_openpyxl.xlsx') 17 | -------------------------------------------------------------------------------- 
/Chapter 5/font_properties_pandas.py: -------------------------------------------------------------------------------- 1 | # Pandas example for setting font properties 2 | import pandas as pd 3 | 4 | data = {'Name': ['John', 'Alice', 'Michael'], 5 | 'Age': [25, 30, 22], 6 | 'City': ['New York', 'London', 'Paris']} 7 | 8 | df = pd.DataFrame(data) 9 | 10 | # Define a function to apply font properties 11 | def apply_font_properties(value): 12 | return 'font-weight: bold; font-size: 14px; font-style: italic; color: blue' 13 | 14 | # Applying font properties 15 | styled_df = df.style.applymap(apply_font_properties, subset='Name') 16 | 17 | # Save the styled DataFrame to an Excel file 18 | styled_df.to_excel('styled_table_pandas.xlsx', index=False) 19 | -------------------------------------------------------------------------------- /Chapter 5/heatmap.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import openpyxl 3 | from openpyxl.utils.dataframe import dataframe_to_rows 4 | from openpyxl.formatting.rule import ColorScaleRule 5 | 6 | # Sample data for the heatmap 7 | data = { 8 | 'Category': ['A', 'B', 'C', 'D'], 9 | 'Jan': [10, 20, 30, 40], 10 | 'Feb': [15, 25, 35, 45], 11 | 'Mar': [12, 22, 32, 42], 12 | 'Apr': [18, 28, 38, 48] 13 | } 14 | 15 | # Convert data to a pandas DataFrame 16 | df = pd.DataFrame(data) 17 | 18 | # Write the DataFrame to a worksheet 19 | df.to_excel("heatmap_with_conditional_formatting.xlsx", index=False) 20 | 21 | # Load the workbook 22 | wb = openpyxl.load_workbook('heatmap_with_conditional_formatting.xlsx') 23 | ws = wb.active 24 | 25 | # Define the range for conditional formatting (excluding the 'Category' column) 26 | data_range = f'B2:E{len(df) + 1}' # Adjust the range based on the DataFrame size 27 | 28 | # Apply color scale conditional formatting to the range 29 | color_scale_rule = ColorScaleRule(start_type='min', start_color='FFFFFF', end_type='max', end_color='FF0000') 30 
"""Create a Pivot Table in a new Excel workbook through the COM interface."""
# Import the required modules:
import os

import win32com.client as win32

# Excel enumeration values used below (XlPivotTableSourceType and
# XlPivotFieldOrientation), named so the calls are self-explanatory.
XL_DATABASE = 1
XL_ROW_FIELD = 1
XL_COLUMN_FIELD = 2
XL_DATA_FIELD = 4

# Excel resolves a relative file name against its own current directory, not
# the Python working directory, so hand it an absolute path.
output_path = os.path.abspath('./pivot_table.xlsx')

# Create a new instance of Excel and make it visible:
excel = win32.Dispatch('Excel.Application')
excel.Visible = True

try:
    # Create a new workbook or open an existing one:
    workbook = excel.Workbooks.Add()  # Create a new workbook
    # Or to open an existing workbook:
    # workbook = excel.Workbooks.Open('path/to/your/workbook.xlsx')

    try:
        # Get the reference to the sheet where you want to create the Pivot Table:
        sheet = workbook.ActiveSheet  # Get the active sheet
        # Or specify the sheet by its name:
        # sheet = workbook.Sheets('Sheet1')

        # Populate the data into the sheet (optional, if you have data to analyze):
        # Sample data
        data = [
            ['Product', 'Category', 'Sales'],
            ['Product A', 'Category 1', 100],
            ['Product B', 'Category 2', 200],
            ['Product C', 'Category 1', 150],
            ['Product D', 'Category 2', 50],
            # Add more data rows here...
        ]

        # Write the data to the sheet (Excel cells are 1-indexed)
        for row_index, row in enumerate(data, start=1):
            for col_index, value in enumerate(row, start=1):
                sheet.Cells(row_index, col_index).Value = value

        # Add a new worksheet to the workbook to hold the Pivot Table:
        pivot_table_sheet = workbook.Worksheets.Add()
        pivot_table_sheet.Name = 'Pivot Table'

        # Create a Pivot Cache using the data range:
        pivot_cache = workbook.PivotCaches().Create(
            SourceType=XL_DATABASE, SourceData=sheet.UsedRange)

        # Create the Pivot Table on the new sheet using the Pivot Cache:
        pivot_table = pivot_cache.CreatePivotTable(
            TableDestination=pivot_table_sheet.Cells(3, 1),
            TableName='MyPivotTable')

        # Add fields to the Pivot Table, specifying their orientation:
        pivot_table.PivotFields('Product').Orientation = XL_ROW_FIELD
        pivot_table.PivotFields('Category').Orientation = XL_COLUMN_FIELD
        pivot_table.PivotFields('Sales').Orientation = XL_DATA_FIELD

        # Control row and column grandtotals
        pivot_table.ColumnGrand = True
        pivot_table.RowGrand = True

        # Decide which fields have Subtotals
        pivot_table.PivotFields('Sales').Subtotals = [False]*12
        pivot_table.PivotFields('Product').Subtotals = [False]*12
        pivot_table.PivotFields('Category').Subtotals = [True]*12

        # Customize labels and styles
        pivot_table.ShowTableStyleRowStripes = False
        pivot_table.PivotFields('Product').Caption = 'Product Name'
        pivot_table.PivotFields('Sales').NumberFormat = '#,##0'
        pivot_table.PivotFields('Sales').Caption = 'Total Sales'

        # Save the workbook:
        workbook.SaveAs(output_path)
    finally:
        # Everything worth keeping was written by SaveAs; closing without
        # saving avoids a blocking "save changes?" prompt after a failure.
        workbook.Close(SaveChanges=False)
finally:
    # Always close Excel so a failed run does not leave the process behind.
    excel.Quit()
library(basictabler) 2 | 3 | # Create a data frame 4 | data <- data.frame( 5 | name = c("John Doe", "Jane Doe"), 6 | age = c(30, 25), 7 | salary = c(100000, 50000) 8 | ) 9 | 10 | # Plain table 11 | table_plain <- qhtbl(data, theme = "largeplain") 12 | table_plain 13 | 14 | # Create a basictabler object 15 | table <- qhtbl(data, 16 | theme = "largeplain", 17 | tableStyle = list("border-color" = "maroon"), 18 | headingStyle = list( 19 | "color" = "cornsilk", "background-color" = "maroon", 20 | "font-style" = "italic", "border-color" = "maroon" 21 | ), 22 | cellStyle = list( 23 | "color" = "maroon", "background-color" = "cornsilk", 24 | "border-color" = "maroon" 25 | ) 26 | ) 27 | 28 | # Render the table to HTML 29 | table 30 | 31 | # A longer example 32 | library(TidyDensity) 33 | tn <- tidy_normal(.n = 10) 34 | 35 | tbl <- BasicTable$new() 36 | # formatting values (explained in the introduction vignette) 37 | columnFormats <- list( 38 | NULL, 39 | NULL, 40 | "%.4f", 41 | "%.4f", 42 | "%.4f", 43 | "%.4f", 44 | "%.4f" 45 | ) 46 | tbl$addData(tn, 47 | firstColumnAsRowHeaders = TRUE, 48 | explicitColumnHeaders = c("Simulation", "x", "y", "dx", "dy", "p", "q"), 49 | columnFormats = columnFormats 50 | ) 51 | tbl$renderTable() 52 | 53 | # Add some conditional formatting 54 | cells <- tbl$getCells(rowNumbers = 2:11, columnNumbers = 3:7, matchMode = "combinations") 55 | 56 | tbl$mapStyling( 57 | cells = cells, styleProperty = "background-color", valueType = "color", 58 | mapType = "logic", 59 | mappings = list( 60 | "v<=-3", "red", 61 | "-3 6 | styled_table(keep_header = TRUE) |> 7 | set_border_position("all", row_id = 1) |> 8 | set_bold(row_id = 1) |> 9 | set_fill_color("#00FF00", col_id = 3, condition = X >= 0.5) 10 | 11 | # open new xlsx workbook and create a worksheet 12 | wb <- createWorkbook() 13 | sheet <- createSheet(wb, "tidy_normal") 14 | 15 | # insert the styled table in the worksheet 16 | write_excel(sheet, st) 17 | 18 | # save the workbook 19 | saveWorkbook(wb, 
# Bar plot of payer-group counts, faceted by inpatient/outpatient flag, with
# the bars of each facet ordered by count.
library(healthyR.data)
library(healthyR)
library(ggplot2)
library(dplyr)
library(forcats)
library(purrr)

# Count records per payer grouping within each ip_op_flag and build a
# zero-padded "<rank> - <payer>" key so the bars sort by count inside a facet.
df <- healthyR_data |>
  filter(payer_grouping != '?') |>
  category_counts_tbl(
    .count_col = payer_grouping
    , .arrange = TRUE
    , ip_op_flag
  ) |>
  group_by(ip_op_flag) |>
  mutate(order_var = paste0(
    sprintf("%02i", as.integer(rank(n))),
    " - ",
    payer_grouping
  )) |>
  ungroup()

ggplot(df, aes(x = order_var, y = n)) +
  geom_col(alpha = 0.328) +
  labs(x = NULL, y = "") +
  facet_wrap(~ ip_op_flag, scales = "free") +
  # Show the plain payer name on the axis instead of the ordering key
  scale_x_discrete(labels = with(df, as.character(payer_grouping) |>
                                   set_names(order_var))) +
  coord_flip() +
  # A complete theme replaces every theme() setting added before it, so it
  # has to come first and the individual tweaks after it.
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)
  )
<- histogram 24 | } 25 | 26 | # Create histogram for all species combined 27 | all_species_hist <- ggplot(iris, aes(x = Sepal.Width)) + 28 | geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") + 29 | labs(title = "Sepal Width Histogram for All Species") + 30 | theme_minimal() 31 | 32 | # Arrange histograms using cowplot 33 | plot_grid( 34 | histograms[["setosa"]], 35 | histograms[["versicolor"]], 36 | histograms[["virginica"]], 37 | all_species_hist, 38 | ncol = 2, 39 | align = "hv" 40 | ) 41 | 42 | histograms <- lapply(unique(iris$Species), function(species) { 43 | data_subset <- iris[iris$Species == species, ] 44 | 45 | histogram <- ggplot(data_subset, aes(x = Sepal.Width)) + 46 | geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") + 47 | labs(title = paste("Sepal Width Histogram for", species)) + 48 | labs(x = "", y = "") + 49 | theme_minimal() 50 | 51 | return(histogram) 52 | }) 53 | 54 | histograms 55 | -------------------------------------------------------------------------------- /Chapter 6/ch6_dumbell_plot.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(dplyr) 3 | 4 | # Sample data 5 | data <- data.frame( 6 | Category = c("A", "B", "C", "D"), 7 | Initial = c(10, 15, 8, 12), 8 | Final = c(18, 22, 14, 16) 9 | ) 10 | 11 | # Calculate the midpoint for positioning the dots and lines 12 | data <- data %>% 13 | mutate(Midpoint = (Initial + Final) / 2) 14 | 15 | # Create the dumbbell plot using ggplot2 16 | dumbbell_plot <- ggplot(data, aes(x = Category, xend = Category, 17 | y = Initial, yend = Final)) + 18 | geom_segment(color = "gray50") + # Lines connecting dots 19 | geom_point(color = "blue", size = 3) + # Initial values 20 | geom_point(aes(y = Final), color = "orange", size = 3) + # Final values 21 | geom_point(aes(y = Midpoint), color = "green", size = 3) + # Midpoint Values 22 | geom_text(aes(label = Midpoint), 23 | y = data$Midpoint, vjust = -.5, size = 3) 
+ # Midpoint labels 24 | labs(title = "Dumbbell Plot", 25 | x = "Category", 26 | y = "Values") + 27 | theme_minimal() 28 | 29 | # Print the plot 30 | dumbbell_plot 31 | -------------------------------------------------------------------------------- /Chapter 6/ch6_ggplot2.R: -------------------------------------------------------------------------------- 1 | install.packages("ggplot2") 2 | library(ggplot2) 3 | 4 | # Make a histogram of the sepal width for all species 5 | hist(iris$Sepal.Width) 6 | 7 | # Make a histogram of the sepal width for each species 8 | par(mfrow = c(2,2)) 9 | for (species in unique(iris$Species)) { 10 | hist(iris$Sepal.Width[iris$Species == species], main = species, 11 | xlab = species) 12 | } 13 | hist(iris$Sepal.Width, main = "All Species") 14 | par(mfrow = c(1,1)) 15 | 16 | # Make a histogram of the sepal width for all species 17 | iris |> 18 | ggplot(aes(x = Sepal.Width)) + 19 | geom_histogram(alpha = 0.328) + 20 | theme_minimal() 21 | 22 | # Make a histogram of the sepal width for each species 23 | iris |> 24 | ggplot(aes(x = Sepal.Width, fill = Species)) + 25 | geom_histogram(alpha = 0.328) + 26 | theme_minimal() 27 | 28 | # Make a histogram of the sepal width for each species and facet them 29 | iris |> 30 | ggplot(aes(x = Sepal.Width, fill = Species)) + 31 | geom_histogram(alpha = 0.328) + 32 | facet_wrap(~ Species, scales = "free") + 33 | theme_minimal() 34 | -------------------------------------------------------------------------------- /Chapter 6/ch6_timeseries.R: -------------------------------------------------------------------------------- 1 | plot.ts(AirPassengers) 2 | plot(decompose(AirPassengers)) 3 | 4 | library(healthyR.ts) 5 | 6 | ts_brownian_motion() |> 7 | ts_brownian_motion_plot(t, y) 8 | -------------------------------------------------------------------------------- /Chapter 6/insert_image_pywin32.py: -------------------------------------------------------------------------------- 1 | import win32com.client as 
win32 2 | 3 | # Initialize Excel 4 | excel = win32.gencache.EnsureDispatch('Excel.Application') 5 | excel.Visible = True 6 | 7 | # Create a new workbook 8 | workbook = excel.Workbooks.Add() 9 | 10 | # Define the image path 11 | image_path = 'path\\to\\your\\image.png' 12 | 13 | # Insert the image into a specific sheet and cell 14 | sheet = workbook.ActiveSheet 15 | cell = sheet.Range("A1") # You can specify the cell where you want to insert the image 16 | 17 | # Add the image to the worksheet 18 | sheet.Shapes.AddPicture(image_path, LinkToFile=False, SaveWithDocument=True, Left=cell.Left, Top=cell.Top, Width=300, Height=200) 19 | 20 | # Save the workbook 21 | workbook.SaveAs('your_excel_with_image.xlsx') 22 | 23 | # Close Excel 24 | excel.Application.Quit() 25 | -------------------------------------------------------------------------------- /Chapter 6/matplotlib_basics.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | ### scatter plot 6 | data = { 7 | 'Height': [155, 162, 168, 173, 179], 8 | 'Weight': [50, 56, 61, 65, 72] 9 | } 10 | 11 | df = pd.DataFrame(data) 12 | 13 | # Create a scatter plot 14 | df.plot.scatter(x='Height', y='Weight', title='Scatter Plot of Height vs. 
Weight') 15 | 16 | # Save the plot to a file (e.g., .png) 17 | plt.savefig('matplotlib_scatter_plot.png') 18 | 19 | # Show the plot 20 | plt.show() 21 | 22 | 23 | ### bar chart 24 | 25 | data = {'Category': ['A', 'B', 'C', 'D', 'E'], 26 | 'Values': [15, 28, 24, 20, 32]} 27 | 28 | df = pd.DataFrame(data) 29 | 30 | # Create a basic bar chart 31 | plt.figure(figsize=(8, 6)) 32 | plt.bar(df['Category'], df['Values'], color='skyblue') 33 | plt.xlabel('Categories') 34 | plt.ylabel('Values') 35 | plt.title('Basic Bar Chart') 36 | 37 | # Save the plot to a file (e.g., .png) 38 | plt.savefig('matplotlib_bar_chart.png') 39 | 40 | plt.show() 41 | 42 | ### histogram 43 | 44 | # Generate some random data for the histogram 45 | data = numpy.random.normal(0, 1, 1000) 46 | 47 | import matplotlib.pyplot as plt 48 | 49 | # Create a basic histogram 50 | plt.figure(figsize=(8, 6)) 51 | plt.hist(data, bins=20, color='lightblue', edgecolor='black') 52 | plt.xlabel('Values') 53 | plt.ylabel('Frequency') 54 | plt.title('Basic Histogram') 55 | 56 | # Save the plot to a file (e.g., .png) 57 | plt.savefig('matplotlib_histogram.png') 58 | 59 | plt.show() 60 | 61 | ### box plot 62 | 63 | # Generate some random data for the box plot 64 | data = [numpy.random.normal(0, 1, 100) for _ in range(3)] # Three sets of random data 65 | 66 | # Create a basic box plot 67 | plt.figure(figsize=(8, 6)) 68 | plt.boxplot(data, vert=False, labels=['Set 1', 'Set 2', 'Set 3']) 69 | plt.xlabel('Values') 70 | plt.ylabel('Data Sets') 71 | plt.title('Basic Box Plot') 72 | 73 | # Save the plot to a file (e.g., .png) 74 | plt.savefig('matplotlib_boxplot.png') 75 | 76 | plt.show() 77 | 78 | ### heatmap 79 | 80 | # Generate some random data for the heatmap 81 | data = numpy.random.rand(5, 5) # Create a 5x5 matrix of random values 82 | 83 | # Create a heatmap 84 | plt.figure(figsize=(8, 6)) 85 | heatmap = plt.imshow(data, cmap='viridis', interpolation='nearest') 86 | plt.colorbar(heatmap) 87 | plt.title('Heatmap Example') 
88 | 89 | # Save the plot to a file (e.g., .png) 90 | plt.savefig('matplotlib_heatmap.png') 91 | 92 | plt.show() 93 | 94 | ### violinplot 95 | 96 | # Generate some random data for the violin plot 97 | data = [numpy.random.normal(0, std, 100) for std in range(1, 4)] 98 | 99 | # Create a violin plot 100 | plt.figure(figsize=(8, 6)) 101 | plt.violinplot(data, showmedians=True) 102 | plt.title('Violin Plot Example') 103 | plt.xticks([1, 2, 3], ['Group 1', 'Group 2', 'Group 3']) 104 | plt.xlabel('Groups') 105 | plt.ylabel('Values') 106 | 107 | # Save the plot to a file (e.g., .png) 108 | plt.savefig('matplotlib_violinplot.png') 109 | 110 | plt.show() 111 | -------------------------------------------------------------------------------- /Chapter 6/matplotlib_customizations.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | ### labels and titles 4 | 5 | # Sample data 6 | x = [1, 2, 3, 4, 5] 7 | y = [10, 20, 25, 30, 35] 8 | 9 | # Create a scatter plot 10 | plt.scatter(x, y) 11 | 12 | # Customize labels and titles 13 | plt.xlabel('X-axis Label') 14 | plt.ylabel('Y-axis Label') 15 | plt.title('Custom Title') 16 | plt.suptitle('Subtitle for Additional Context') 17 | 18 | # Save the plot to a file (e.g., .png) 19 | plt.savefig('matplotlib_labels.png') 20 | 21 | # Display the plot 22 | plt.show() 23 | 24 | ### axes and legend 25 | 26 | # Sample data 27 | x = [1, 2, 3, 4, 5] 28 | y = [10, 20, 25, 30, 35] 29 | 30 | # Create a line plot 31 | plt.plot(x, y, label='Data Series A') 32 | 33 | # Customize axes and legend 34 | plt.xlim(0, 6) 35 | plt.ylim(0, 40) 36 | plt.xticks([1, 2, 3, 4, 5]) 37 | plt.yticks([0, 10, 20, 30, 40]) 38 | plt.legend() 39 | 40 | # Save the plot to a file (e.g., .png) 41 | plt.savefig('matplotlib_axes_legends.png') 42 | 43 | # Display the plot 44 | plt.show() 45 | 46 | ### themes 47 | 48 | # Apply a different theme 49 | plt.style.use('ggplot') 50 | 51 | # Sample data and plot 52 | 
x = [1, 2, 3, 4, 5] 53 | y = [10, 20, 25, 30, 35] 54 | plt.plot(x, y) 55 | 56 | # Save the plot to a file (e.g., .png) 57 | plt.savefig('matplotlib_themes.png') 58 | 59 | # Display the plot 60 | plt.show() 61 | 62 | ### text formatting 63 | 64 | # Sample data and plot 65 | x = [1, 2, 3, 4, 5] 66 | y = [10, 20, 25, 30, 35] 67 | plt.plot(x, y) 68 | 69 | # Customize text formatting 70 | plt.title('Custom Title', fontsize=16, fontweight='bold', color='blue') 71 | plt.xlabel('X-axis Label', fontsize=12, fontstyle='italic', color='green') 72 | plt.ylabel('Y-axis Label', fontsize=12, fontweight='bold', color='red') 73 | 74 | # Save the plot to a file (e.g., .png) 75 | plt.savefig('matplotlib_text_formatting.png') 76 | 77 | # Display the plot 78 | plt.show() 79 | -------------------------------------------------------------------------------- /Chapter 6/plotnine_additional_layers.py: -------------------------------------------------------------------------------- 1 | from plotnine import ggplot, aes, geom_line, geom_point, geom_errorbar, position_dodge, geom_text, labs, theme_minimal, geom_smooth 2 | import pandas 3 | import numpy 4 | 5 | ### Error bars 6 | # Sample data 7 | data = pandas.DataFrame({ 8 | 'x': [1, 2, 3, 4, 5], 9 | 'y': [10, 15, 8, 12, 18], 10 | 'group': ['A', 'A', 'B', 'B', 'C'], 11 | 'error': [1, 2, 1.5, 1, 2.5], 12 | 'label_x': [2, 4, 3, 1, 5], 13 | 'label_y': [16, 11, 6, 13, 17], 14 | 'annotation_text': ['Peak', 'Valley', 'Low', 'High', 'Bottom'] 15 | }) 16 | 17 | # Create a ggplot object 18 | gg = ggplot(data, aes(x='x', y='y', group='group')) + \ 19 | geom_line() + \ 20 | geom_point() + \ 21 | geom_errorbar(aes(ymin='y - error', ymax='y + error'), width=0.1, size=0.5, position=position_dodge(width=0.2)) + \ 22 | geom_text(aes(x='label_x', y='label_y', label='annotation_text'), size=10) 23 | 24 | # Draw the plot 25 | print(gg) 26 | 27 | ### Trendline 28 | # Sample data 29 | data = pandas.DataFrame({ 30 | 'X': numpy.arange(1, 21), 31 | 'Y': 
numpy.random.randint(1, 101, size=20) 32 | }) 33 | 34 | # Create a base plot 35 | gg = (ggplot(data, aes(x='X', y='Y')) + 36 | geom_point() + 37 | labs(title='Scatter Plot with Trendline') + 38 | theme_minimal() 39 | ) 40 | 41 | # Add a trendline 42 | gg = gg + geom_smooth(method='lm', se=False, linetype='dashed', color='red', size=1) 43 | 44 | print(gg) 45 | 46 | ### Annotations 47 | # Sample data 48 | data = pandas.DataFrame({ 49 | 'X': numpy.arange(1, 11), 50 | 'Y': numpy.random.randint(1, 101, size=10) 51 | }) 52 | 53 | # Create a base plot 54 | gg = (ggplot(data, aes(x='X', y='Y')) + 55 | geom_point() + 56 | labs(title='Scatter Plot with Annotations') + 57 | theme_minimal() 58 | ) 59 | 60 | # Add an annotation 61 | gg = gg + geom_text(aes(label='Y'), nudge_y=5, color='blue') 62 | 63 | print(gg) 64 | -------------------------------------------------------------------------------- /Chapter 6/plotnine_basics.py: -------------------------------------------------------------------------------- 1 | # Scatter Plot: 2 | 3 | from plotnine import ggplot, aes, geom_point, geom_bar, geom_histogram, geom_boxplot, geom_tile, geom_violin, theme_minimal, labs 4 | import pandas 5 | 6 | # Sample data 7 | data = pandas.DataFrame({'x': [1, 2, 3, 4, 5], 8 | 'y': [2, 4, 1, 3, 5]}) 9 | 10 | # Create a scatter plot 11 | gg = ggplot(aes(x='x', y='y'), data) + geom_point() 12 | print(gg) 13 | 14 | 15 | # Bar Chart: 16 | 17 | # Sample data 18 | data = pandas.DataFrame({'category': ['A', 'B', 'C', 'D'], 19 | 'value': [10, 25, 15, 30]}) 20 | 21 | # Create a bar chart 22 | gg = ggplot(aes(x='category', y='value'), data) + geom_bar(stat='identity') 23 | print(gg) 24 | 25 | 26 | # Histogram: 27 | 28 | # Sample data 29 | data = pandas.DataFrame({'values': [1, 2, 2, 3, 3, 3, 4, 4, 5]}) 30 | 31 | # Create a histogram 32 | gg = ggplot(aes(x='values'), data) + geom_histogram(binwidth=1, fill='blue', color='black') 33 | print(gg) 34 | 35 | 36 | # Box Plot: 37 | 38 | # Sample data 39 | data = 
pandas.DataFrame({'category': ['A', 'A', 'B', 'B', 'C', 'C'], 40 | 'value': [10, 15, 20, 25, 30, 35]}) 41 | 42 | # Create a box plot 43 | gg = ggplot(aes(x='category', y='value'), data) + geom_boxplot() 44 | print(gg) 45 | 46 | 47 | # Heatmap: 48 | 49 | # Sample data 50 | data = { 51 | 'x': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D', 'D'], 52 | 'y': ['W', 'X', 'Y', 'Z', 'W', 'X', 'Y', 'Z', 'W', 'X', 'Y', 'Z', 'W', 'X', 'Y', 'Z'], 53 | 'value': [10, 15, 5, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80] 54 | } 55 | 56 | # Convert data to a DataFrame 57 | data = pandas.DataFrame(data) 58 | 59 | # Create a heatmap 60 | gg = (ggplot(data, aes(x='x', y='y', fill='value')) 61 | + geom_tile() 62 | + theme_minimal() 63 | + labs(title='Heatmap Example', x='X-Axis', y='Y-Axis', fill='Values')) 64 | print(gg) 65 | 66 | 67 | # Violin Plot: 68 | 69 | # Sample data 70 | data = { 71 | 'Category': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'D', 'D', 'D'], 72 | 'Value': [10, 15, 25, 30, 35, 45, 50, 65, 70, 75] 73 | } 74 | 75 | # Convert data to a DataFrame 76 | df = pandas.DataFrame(data) 77 | 78 | # Create a violin plot 79 | gg = (ggplot(df, aes(x='Category', y='Value', fill='Category')) 80 | + geom_violin() 81 | + theme_minimal() 82 | + labs(title='Violin Plot Example', x='Category', y='Value', fill='Category')) 83 | print(gg) 84 | -------------------------------------------------------------------------------- /Chapter 6/plotnine_customizations.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from plotnine import ggplot, aes, geom_point, xlab, ylab, ggtitle, labs, scale_x_continuous, scale_y_continuous, scale_color_manual, theme_minimal, theme_light, theme, element_text 3 | import pandas 4 | 5 | # Sample data 6 | data = pandas.DataFrame({'X': [1, 2, 3, 4, 5], 7 | 'Y': [10, 15, 5, 20, 25], 8 | 'Category': ['A', 'B', 'A', 'B', 'A']}) 9 | 10 | # Create a base scatter plot 11 | gg = 
(ggplot(data, aes(x='X', y='Y', color='Category')) + 12 | geom_point()) 13 | 14 | # Customize labels and titles 15 | gg = gg + xlab("Custom X Label") + ylab("Custom Y Label") 16 | gg = gg + ggtitle("Custom Plot Title") + labs(subtitle="Custom Subtitle") 17 | 18 | # Customize axes and legends 19 | gg = gg + scale_x_continuous(breaks=[0, 1, 2, 3, 4], labels=["Zero", "One", "Two", "Three", "Four"]) 20 | gg = gg + scale_y_continuous(limits=(0, 30)) 21 | gg = gg + scale_color_manual(values={'A': 'red', 'B': 'blue'}) 22 | 23 | # Customize color palettes 24 | # Map the 'category' variable to the 'fill' aesthetic 25 | gg = gg + aes(fill='Category') 26 | 27 | # Apply themes 28 | gg = gg + theme_minimal() 29 | gg = gg + theme_light() 30 | 31 | # Control text formatting 32 | gg = gg + theme(text=element_text(size=12, family="Arial", face="bold", color="black"), 33 | axis_text_x=element_text(angle=45, hjust=1)) 34 | 35 | print(gg) 36 | -------------------------------------------------------------------------------- /Chapter 7/create_pivot.py: -------------------------------------------------------------------------------- 1 | import win32com.client as win32 2 | 3 | # Create an Excel workbook and add a sheet 4 | excel = win32.gencache.EnsureDispatch('Excel.Application') 5 | workbook = excel.Workbooks.Add() 6 | worksheet = workbook.Worksheets(1) 7 | 8 | # Add some test data 9 | worksheet.Cells(1, 1).Value = 'Name' 10 | worksheet.Cells(1, 2).Value = 'Category' 11 | worksheet.Cells(1, 3).Value = 'Sales' 12 | 13 | worksheet.Cells(2, 1).Value = 'John' 14 | worksheet.Cells(2, 2).Value = 'Electronics' 15 | worksheet.Cells(2, 3).Value = 1000 16 | 17 | worksheet.Cells(3, 1).Value = 'Alice' 18 | worksheet.Cells(3, 2).Value = 'Clothing' 19 | worksheet.Cells(3, 3).Value = 800 20 | 21 | worksheet.Cells(4, 1).Value = 'John' 22 | worksheet.Cells(4, 2).Value = 'Clothing' 23 | worksheet.Cells(4, 3).Value = 300 24 | 25 | # Add more data as needed 26 | 27 | # Define the range of data to be used 
as input for the pivot table 28 | data_range = worksheet.Range('A1:C4') # Adjust the range as needed 29 | 30 | # Add a new worksheet to the workbook to hold the Pivot Table: 31 | pivot_table_sheet = workbook.Worksheets.Add() 32 | pivot_table_sheet.Name = 'Pivot Table' 33 | 34 | # Create a Pivot Cache using the data range: 35 | pivot_cache = workbook.PivotCaches().Create(SourceType=1, SourceData=data_range) 36 | 37 | # Create the Pivot Table on the new sheet using the Pivot Cache: 38 | pivot_table = pivot_cache.CreatePivotTable(TableDestination=pivot_table_sheet.Cells(3, 1), TableName='MyPivotTable') 39 | 40 | # Add the row, column and data fields 41 | pivot_table.PivotFields('Name').Orientation = 1 # row field 42 | pivot_table.PivotFields('Category').Orientation = 2 # column field 43 | pivot_table.PivotFields('Sales').Orientation = 4 # data field 44 | 45 | # Add the calculated fields 46 | calculated_field = pivot_table.CalculatedFields().Add("Total Sales", "=SUM(Sales)") 47 | 48 | # Refresh the PivotTable to apply changes 49 | pivot_table.RefreshTable() 50 | 51 | # Save the Workbook and close Excel 52 | workbook.SaveAs('PivotTableExample.xlsx') 53 | workbook.Close() 54 | excel.Quit() 55 | -------------------------------------------------------------------------------- /Chapter 7/grouping.py: -------------------------------------------------------------------------------- 1 | # Sample Data Generation 2 | import pandas as pd 3 | import random 4 | from datetime import datetime, timedelta 5 | import win32com.client as win32 6 | import os 7 | import numpy as np 8 | 9 | data = { 10 | 'Date': [datetime(2023, 1, 1) + timedelta(days=i) for i in range(365)], 11 | 'Sales': [random.randint(100, 1000) for _ in range(365)] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | 16 | # Create an ExcelWriter object and write the DataFrame to the Excel worksheet 17 | df.to_excel("GroupingExample.xlsx", sheet_name='Sheet1', index=False) 18 | 19 | # Connect to Excel 20 | excel = 
win32.gencache.EnsureDispatch('Excel.Application') 21 | 22 | # Open the Excel workbook and add a sheet 23 | wd = os.getcwd() 24 | workbook = excel.Workbooks.Open(os.path.join(wd, 'GroupingExample.xlsx')) # Replace with your workbook path 25 | worksheet = workbook.Worksheets(1) 26 | 27 | # Add a new worksheet to the workbook to hold the Pivot Table: 28 | pivot_table_sheet = workbook.Worksheets.Add() 29 | pivot_table_sheet.Name = 'Pivot Table' 30 | 31 | # Define the range of data to be used as input for the pivot table 32 | data_range = worksheet.Range('A1:B365') 33 | 34 | # Create a Pivot Cache using the data range: 35 | pivot_cache = workbook.PivotCaches().Create(SourceType=1, SourceData=data_range) 36 | 37 | starting_row = 3 38 | 39 | # Create the Pivot Table on the new sheet using the Pivot Cache: 40 | pivot_table = pivot_cache.CreatePivotTable(TableDestination=pivot_table_sheet.Cells(starting_row, 1), TableName='MyPivotTable') 41 | 42 | # Add the 'Date' field to Rows and define the date_field variable as done with name_field in the example above. 
43 | date_field = pivot_table.PivotFields('Date') 44 | date_field.Orientation = 1 # row field 45 | pivot_table.PivotFields('Sales').Orientation = 4 # data field 46 | 47 | # Add the calculated fields 48 | calculated_field = pivot_table.CalculatedFields().Add("Total Sales", "=SUM(Sales)") 49 | 50 | # Group by months 51 | date_field.Subtotals = [False]*12 52 | date_field.NumberFormat = 'MMMM YYYY' 53 | 54 | # Sort Rows 55 | date_field.AutoSort(1, "Date") 56 | 57 | # count the unique values for each value of the date column in the pivot 58 | date_values = pd.DataFrame([item.Value for item in date_field.PivotItems()], columns = ['date']) 59 | unique_values = pd.DataFrame(np.transpose(np.unique(date_values, return_counts=True)), columns=['date', 'count']) 60 | date_values_count = date_values.merge(unique_values).drop_duplicates() 61 | 62 | # Group by months 63 | # Set the GroupOn property 64 | date_range = pivot_table_sheet.Range(f"A4:A{starting_row + date_values_count['count'].iloc[0]}") 65 | date_range.Group() 66 | 67 | # You can use the above method to group the other months as well if you want to 68 | # Note: the pivot is now changed, the second group starts at row starting_row + 2, instead of starting_row + 32 69 | 70 | # change the formatting of the grouped column to show only month and year and change back the original date column to show the full date 71 | pivot_table.PivotFields('Date2').NumberFormat = 'MMMM YYYY' 72 | date_field.NumberFormat = 'DD MMMM YYYY' 73 | 74 | # hide the details of the grouped values 75 | for item in pivot_table.PivotFields('Date2').PivotItems(): 76 | item.ShowDetail = False 77 | 78 | # Refresh data 79 | pivot_table.RefreshTable() 80 | 81 | #pivot_table.PivotFields('Date2').Orientation = 2 82 | 83 | # Save and close 84 | workbook.Save() 85 | workbook.Close() 86 | excel.Quit() 87 | -------------------------------------------------------------------------------- /Chapter 7/manipulate_pivot.py: 
-------------------------------------------------------------------------------- 1 | import win32com.client as win32 2 | 3 | # Connect to Excel 4 | excel = win32.gencache.EnsureDispatch('Excel.Application') 5 | 6 | # Open the workbook with the pivot table 7 | workbook = excel.Workbooks.Open('PivotTableExample.xlsx') # Replace with your workbook path 8 | worksheet = workbook.Worksheets(1) 9 | 10 | # Access the Pivot Table 11 | pivot_table = worksheet.PivotTables('MyPivotTable') # Use the name of your pivot table 12 | 13 | # Filter by value (need to make the field a Page field instaed of a column field) 14 | category_field = pivot_table.PivotFields('Category') 15 | category_field.Orientation = 3 # page field 16 | category_field.CurrentPage = "Electronics" 17 | 18 | # Sort Rows or Columns 19 | name_field = pivot_table.PivotFields('Name') 20 | name_field.AutoSort(1, "Name") 21 | 22 | # Define the new source data range 23 | new_source_data_range = 'Sheet1!A1:C2' 24 | 25 | # Update the SourceData property of the pivot table's Table object 26 | pivot_table.TableRange2(workbook.Sheets('Sheet1').Range(new_source_data_range)) 27 | 28 | # Refresh data 29 | pivot_table.RefreshTable() 30 | 31 | # Save and close 32 | workbook.Save() 33 | workbook.Close() 34 | excel.Quit() 35 | -------------------------------------------------------------------------------- /Chapter 8/ch8.R: -------------------------------------------------------------------------------- 1 | # The skimr package 2 | if(!require(skimr)){install.packages("skimr")} 3 | library(skimr) 4 | skim(iris) 5 | 6 | if(!require(TidyDensity)){install.packages("TidyDensity")} 7 | tidy_normal() |> skim() 8 | 9 | if(!require(GGally)){install.packages("GGally")} 10 | library(GGally) 11 | library(TidyDensity) 12 | tidy_normal(.n = 200) |> 13 | ggpairs(columns = c("y","p","q","dx","dy")) 14 | 15 | if(!require(DataExplorer)){install.packages("DataExplorer")} 16 | library(DataExplorer) 17 | library(TidyDensity) 18 | library(dplyr) 19 | 
20 | df <- tidy_normal(.n = 200) 21 | 22 | df |> 23 | introduce() |> 24 | glimpse() 25 | 26 | df |> 27 | plot_intro() + 28 | theme_minimal() 29 | 30 | df |> 31 | plot_qq() 32 | 33 | df[c("q","y")] |> 34 | plot_qq() 35 | -------------------------------------------------------------------------------- /Chapter 8/clean_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Load Excel data into a pandas DataFrame 5 | df = pd.read_excel('dirty_data.xlsx') 6 | 7 | # Handling Missing Data 8 | 9 | # Identify missing values 10 | missing_values = df.isnull().sum() 11 | 12 | # Replace missing values with the mean (for numeric columns) 13 | df['Age'].fillna(df['Age'].mean(), inplace=True) 14 | 15 | # Replace missing values with the mode (for categorical columns) 16 | # df['Salary'].fillna(df['Salary'].mode()[0], inplace=True) 17 | 18 | # Forward-fill or backward-fill missing values 19 | # df['ColumnWithMissingValues'].fillna(method='ffill', inplace=True) 20 | 21 | # Interpolate missing values based on trends 22 | # df['NumericColumn'].interpolate(method='linear', inplace=True) 23 | 24 | # Remove rows or columns with missing data 25 | df.dropna(axis=0, inplace=True) # Remove rows with missing data 26 | df.dropna(axis=1, inplace=True) # Remove columns with missing data 27 | 28 | # Handling Duplicates 29 | 30 | # Detect and display duplicate rows 31 | duplicate_rows = df[df.duplicated()] 32 | print("Duplicate Rows:") 33 | print(duplicate_rows) 34 | 35 | # Remove duplicate rows 36 | df.drop_duplicates(inplace=True) 37 | 38 | # Handling Data Type Conversion 39 | 40 | # Check data types 41 | print(df.dtypes) 42 | 43 | # Convert a column to a different data type (e.g., float) 44 | df.loc[df['Salary']=='Missing', 'Salary'] = np.NaN 45 | df.loc[:, 'Salary'] = df['Salary'].str.replace("$", "") 46 | df.loc[:, 'Salary'] = df['Salary'].str.replace(",", "") 47 | df['Salary'] = 
df['Salary'].astype(float) 48 | 49 | # Now that Salary is a numeric column, we can fill the missing values with mean 50 | df['Salary'].fillna(df['Salary'].mean(), inplace=True) 51 | 52 | # Excel-Specific Data Issues 53 | 54 | # No code needed, ensure Excel data is cleaned (e.g., merged cells unmerged, empty cells removed) before import 55 | -------------------------------------------------------------------------------- /Chapter 8/create_sample_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Create a DataFrame with missing data, duplicates, and mixed data types 5 | data = { 6 | 'ID': [1, 2, 3, 4, 5, 6], 7 | 'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Eva', 'Eva'], 8 | 'Age': [25, np.nan, 30, 28, 22, 23], 9 | 'Salary': ['$50,000', '$60,000', 'Missing', '$65,000', '$55,000', '$75,000'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Introduce some missing data 15 | df.loc[1, 'Age'] = np.nan 16 | df.loc[3, 'Salary'] = np.nan 17 | 18 | # Introduce duplicates 19 | df = pd.concat([df, df.iloc[1:3]], ignore_index=True) 20 | 21 | # Save the sample data 22 | df.to_excel('dirty_data.xlsx') 23 | -------------------------------------------------------------------------------- /Chapter 8/data_distribution.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from scipy import stats 5 | import statsmodels.api as sm 6 | 7 | # Generate sample data from a lognormal distribution 8 | np.random.seed(0) 9 | data = np.random.lognormal(mean=0, sigma=1, size=1000) 10 | 11 | # Create a Pandas DataFrame 12 | df = pd.DataFrame({'Data': data}) 13 | 14 | # Plot a histogram of the data 15 | plt.hist(data, bins=30, color='skyblue', edgecolor='black') 16 | plt.title('Histogram of Data') 17 | plt.xlabel('Value') 18 | plt.ylabel('Frequency') 19 | plt.show() 20 | 21 | # Perform 
the Shapiro-Wilk test for normality 22 | shapiro_stat, shapiro_p = stats.shapiro(data) 23 | is_normal = shapiro_p > 0.05 # Check if data is normally distributed 24 | print(f'Shapiro-Wilk p-value: {shapiro_p}') 25 | print(f'Is data normally distributed? {is_normal}') 26 | 27 | # Create Q-Q plot with a Normal distribution 28 | sm.qqplot(data, line='s', color='skyblue') 29 | plt.title('Q-Q Plot (Normal)') 30 | plt.xlabel('Theoretical Quantiles') 31 | plt.ylabel('Sample Quantiles') 32 | plt.show() 33 | 34 | # Create Q-Q plot with a lognormal distribution 35 | log_data = np.log(data) 36 | sm.qqplot(log_data, line='s', color='skyblue') 37 | plt.title('Q-Q Plot (Lognormal)') 38 | plt.xlabel('Theoretical Quantiles') 39 | plt.ylabel('Sample Quantiles') 40 | plt.show() 41 | 42 | # Calculate skewness and kurtosis 43 | skewness = stats.skew(data) 44 | kurtosis = stats.kurtosis(data) 45 | 46 | print(f'Skewness: {skewness}') 47 | print(f'Kurtosis: {kurtosis}') 48 | -------------------------------------------------------------------------------- /Chapter 8/relationships.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | import ppscore as pps 6 | 7 | # Generate test data with three variables 8 | np.random.seed(0) 9 | data = { 10 | 'Feature1': np.random.randn(100), 11 | 'Feature2': np.random.randn(100) * 2, 12 | } 13 | 14 | # Create a linear Target variable based on Feature1 and a non-linear function of Feature2 15 | data['Target'] = data['Feature1'] * 2 + np.sin(data['Feature2']) + np.random.randn(100) * 0.5 16 | 17 | # Create a DataFrame 18 | df = pd.DataFrame(data) 19 | 20 | # Calculate and plot the correlation heatmap 21 | corr_matrix = df.corr() 22 | plt.figure(figsize=(8, 6)) 23 | sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5) 24 | plt.title('Correlation Heatmap') 25 | plt.show() 26 | 27 | # 
Calculate the Predictive Power Score (PPS) 28 | plt.figure(figsize=(8, 6)) 29 | matrix_df = pps.matrix(df)[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore') 30 | sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True) 31 | plt.title("Predictive Power Score (PPS) Heatmap") 32 | plt.show() 33 | 34 | # Additional insights 35 | correlation_target = df['Feature1'].corr(df['Target']) 36 | pps_target = pps.score(df, 'Feature1', 'Target') 37 | 38 | print(f'Correlation between Feature1 and Target: {correlation_target:.2f}') 39 | print(f'Predictive Power Score (PPS) between Feature1 and Target: {pps_target:.2f}') 40 | -------------------------------------------------------------------------------- /Chapter 8/summary_statistics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import random 3 | 4 | # Create a sample DataFrame 5 | data = { 6 | 'Age': [random.randint(18, 60) for _ in range(100)], 7 | 'Gender': ['Male', 'Female'] * 50, 8 | 'Income': [random.randint(20000, 100000) for _ in range(100)], 9 | 'Region': ['North', 'South', 'East', 'West'] * 25 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Calculate summary statistics for numerical features 15 | numerical_summary = df.describe() 16 | 17 | # Calculate frequency counts and percentages for categorical features 18 | categorical_summary = df['Gender'].value_counts(normalize=True) 19 | 20 | print("Summary Statistics for Numerical Features:") 21 | print(numerical_summary) 22 | 23 | print("\nFrequency Counts and Percentages for Categorical Features (Gender):") 24 | print(categorical_summary) 25 | -------------------------------------------------------------------------------- /Chapter 9/ch9_linear_reg.R: -------------------------------------------------------------------------------- 1 | 2 | # Library Load ------------------------------------------------------------ 3 | 4 | library(readxl) 5 | 6 | 7 | # Get Data 
---------------------------------------------------------------- 8 | 9 | df <- read_xlsx( 10 | path = "chapter1/iris_data.xlsx", 11 | sheet = "iris" 12 | ) 13 | 14 | head(df) 15 | 16 | # Split the dataset by species 17 | iris_split <- split(df, df$species) 18 | 19 | # Define the dependent variable and independent variables 20 | dependent_variable <- "petal_length" 21 | independent_variables <- c("petal_width", "sepal_length", "sepal_width") 22 | f_x <- formula( 23 | paste( 24 | dependent_variable, 25 | "~", 26 | paste( 27 | independent_variables, 28 | collapse = " + " 29 | ) 30 | ) 31 | ) 32 | 33 | # Create a function to perform linear regression on each subset 34 | perform_linear_regression <- function(data) { 35 | lm_model <- lm(f_x, data = data) 36 | return(lm_model) 37 | } 38 | 39 | # Apply the linear regression to each subset using lapply 40 | results <- lapply(iris_split, perform_linear_regression) 41 | 42 | # Get the summary of each linear model 43 | lapply(results, summary) 44 | 45 | # Plot the model performance 46 | par(mfrow = c(2,2)) 47 | lapply(results, plot) 48 | par(mfrow = c(1, 1)) 49 | 50 | # The above can also be rewritten as follows 51 | # Fit a linear model for each species 52 | lm_models <- lapply(iris_split, function(df) lm(f_x, data = df)) 53 | 54 | # Summarize the results 55 | lapply(lm_models, summary) 56 | -------------------------------------------------------------------------------- /Chapter 9/ch9_linear_reg_tidymodels.R: -------------------------------------------------------------------------------- 1 | 2 | # Library Load ------------------------------------------------------------ 3 | 4 | library(readxl) 5 | library(tidymodels) 6 | library(purrr) 7 | library(performance) 8 | 9 | 10 | # Get Data ---------------------------------------------------------------- 11 | 12 | df <- read_xlsx( 13 | path = "chapter1/iris_data.xlsx", 14 | sheet = "iris" 15 | ) 16 | 17 | # Split the data by Species ----------------------------------------------- 
18 | 19 | iris_list <- split(df, df$species) 20 | 21 | # Specify the Model ------------------------------------------------------- 22 | 23 | lm_model <- linear_reg(mode = "regression", engine = "lm") 24 | 25 | # Define Formula ---------------------------------------------------------- 26 | 27 | f_x <- formula(paste("petal_width", "~", "petal_length + sepal_width + sepal_length")) 28 | 29 | # Perform Linear Regression using purrr ----------------------------------- 30 | # Create The Model 31 | lm_mod <- linear_reg(mode = "regression", engine = "lm") 32 | 33 | # Make the workflow 34 | wf <- workflow() |> 35 | add_model(lm_mod) 36 | 37 | # Make the function that will get mapped 38 | lm_fit_list <- function(df) { 39 | #create recipe 40 | recipe_train <- recipe(f_x, data = df) %>% 41 | step_normalize(all_predictors()) 42 | 43 | #fit workflow on the data 44 | fit_wf <- wf |> 45 | add_recipe(recipe_train) |> 46 | fit(data = df) 47 | 48 | fit_wf 49 | } 50 | 51 | # Map the linear model ---------------------------------------------------- 52 | 53 | model_list <- map(iris_list, lm_fit_list) 54 | lapply(model_list, tidy) 55 | lapply(model_list, glance) 56 | 57 | # Check the Model 58 | model_list |> 59 | map(extract_fit_engine) |> 60 | map(check_model) 61 | 62 | # Alternate Nested Method 63 | nested_lm <- df |> 64 | nest(data = -species) |> 65 | mutate(split = map(data, ~ initial_split(., prop = 8/10)), 66 | train = map(split, ~ training(.)), 67 | test = map(split, ~ testing(.)), 68 | fit = map(train, ~ lm(f_x, data = .)), 69 | pred = map2(.x = fit, .y = test, ~ predict(object = .x, newdata = .y))) 70 | 71 | nested_lm |> 72 | select(species, pred) |> 73 | unnest(pred) -------------------------------------------------------------------------------- /Chapter 9/ch9_logistic_reg.R: -------------------------------------------------------------------------------- 1 | 2 | # Library Load ------------------------------------------------------------ 3 | 4 | 5 | library(tidyverse) 6 | 7 | 
df <- Titanic |> 8 | as.data.frame() |> 9 | uncount(Freq) 10 | 11 | 12 | # Splits ------------------------------------------------------------------ 13 | 14 | # Split the data into training and test sets 15 | set.seed(123) 16 | train_index <- sample(nrow(df), floor(nrow(df) * 0.8), replace = FALSE) 17 | train <- df[train_index, ] 18 | test <- df[-train_index, ] 19 | 20 | # Train a model ----------------------------------------------------------- 21 | 22 | # Train the logistic regression model 23 | model <- glm(Survived ~ Sex + Age + Class, data = train, family = "binomial") 24 | 25 | # Predict ----------------------------------------------------------------- 26 | 27 | # Evaluate the model on the test set 28 | predictions <- predict(model, newdata = test, type = "response") 29 | pred_resp <- ifelse(predictions <= 0.5, "No", "Yes") 30 | 31 | # Calculate the accuracy of the model 32 | accuracy <- mean(pred_resp == test$Survived) 33 | 34 | # Print the accuracy of the model 35 | print(accuracy) 36 | 37 | # Print the confusion matrix 38 | table(pred_resp, test$Survived) 39 | -------------------------------------------------------------------------------- /Chapter 9/ch9_logistic_reg_tidymodels.R: -------------------------------------------------------------------------------- 1 | # Library Load ------------------------------------------------------------ 2 | 3 | library(tidymodels) 4 | library(healthyR.ai) 5 | 6 | # Convert to a tibble for tidymodels 7 | df <- Titanic |> 8 | as_tibble() |> 9 | uncount(n) |> 10 | mutate(across(where(is.character), as.factor)) 11 | 12 | # Splits ------------------------------------------------------------------ 13 | 14 | # Set seed for reproducibility 15 | set.seed(123) 16 | 17 | # Split the data into training and test sets 18 | split <- initial_split(df, prop = 0.8) 19 | train <- training(split) 20 | test <- testing(split) 21 | 22 | # Train a model ----------------------------------------------------------- 23 | 24 | # Create a recipe for 
pre-processing 25 | recipe <- recipe(Survived ~ Sex + Age + Class, data = train) 26 | 27 | # Specify logistic regression as the model 28 | log_reg <- logistic_reg() |> 29 | set_engine("glm", family = "binomial") 30 | 31 | # Combine the recipe and model into a workflow 32 | workflow <- workflow() %>% 33 | add_recipe(recipe) %>% 34 | add_model(log_reg) 35 | 36 | # Train the logistic regression model 37 | fit <- fit(workflow, data = train) 38 | 39 | # Predict ----------------------------------------------------------------- 40 | 41 | # Predict on the test set 42 | predictions <- predict(fit, new_data = test) |> 43 | bind_cols(test) |> 44 | select(Class:Survived, .pred_class) 45 | 46 | # Better method 47 | pred_fit_tbl <- fit |> 48 | augment(new_data = test) 49 | 50 | # Accuracy Check ---------------------------------------------------------- 51 | 52 | # Accuracy metrics for the model to be scored against from the healthyR.ai package 53 | perf <- hai_default_classification_metric_set() 54 | 55 | # Calculate the accuracy metrics 56 | perf(pred_fit_tbl, truth = Survived, estimate = .pred_class) 57 | 58 | # Print the confusion matrix 59 | predictions |> 60 | conf_mat(truth = Survived, estimate = .pred_class) 61 | 62 | # Use broom to tidy and glance the fitted model 63 | tidy(fit, exponentiate = TRUE, conf.int = TRUE) 64 | glance(fit) 65 | 66 | # Visualize' -------------------------------------------------------------- 67 | 68 | # ROC Curve 69 | roc_curve(pred_fit_tbl, truth = Survived, .pred_Yes, event_level = "second") |> 70 | autoplot() -------------------------------------------------------------------------------- /Chapter 9/linear_regression.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.model_selection import train_test_split 6 | import statsmodels.api as sm 7 | from 
statsmodels.graphics.regressionplots import plot_regress_exog 8 | from statsmodels.graphics.gofplots import qqplot 9 | 10 | # Step 0: Generate sample data and save as Excel file 11 | np.random.seed(0) 12 | n_samples = 100 13 | X = np.random.rand(n_samples, 2) # Two features 14 | y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(n_samples) # Linear relationship with noise 15 | 16 | # Create a pandas DataFrame 17 | data = {'Feature1': X[:, 0], 'Feature2': X[:, 1], 'Target': y} 18 | df = pd.DataFrame(data) 19 | 20 | # Save the data to Excel 21 | df.to_excel("linear_regression_input.xlsx") 22 | 23 | # Step 1: Import Excel data into a pandas DataFrame 24 | excel_file = "linear_regression_input.xlsx" 25 | df = pd.read_excel(excel_file) 26 | 27 | # Step 2: Explore the data 28 | # Use the tools learned in the previous chapter on EDA 29 | 30 | # Step 3: Data Preparation (if needed) 31 | # Use the tools learned in the previous chapter on data cleaning 32 | 33 | # Step 4: Split data into training and testing sets 34 | X = df[['Feature1', 'Feature2']] # Independent variables 35 | y = df['Target'] # Dependent variable 36 | 37 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 38 | 39 | # Step 5: Fit the Linear Regression model 40 | # Add a constant (intercept) to the independent variables 41 | X_train = sm.add_constant(X_train) 42 | X_test = sm.add_constant(X_test) 43 | 44 | # Fit the linear model 45 | model = sm.OLS(y_train, X_train).fit() 46 | 47 | # Step 6: Model Evaluation 48 | y_pred = model.predict(X_test) 49 | 50 | # Print the model summary 51 | print(model.summary()) 52 | 53 | # Step 7: Visualization 54 | plt.scatter(X_test['Feature1'], y_test, color='blue', label='Actual') 55 | plt.scatter(X_test['Feature1'], y_pred, color='red', label='Predicted') 56 | plt.xlabel('Feature1') 57 | plt.ylabel('Target') 58 | plt.title('Linear Regression Prediction') 59 | plt.legend() 60 | plt.show() 61 | 62 | # Set the backend to 'TkAgg' before 
generating the plots 63 | plt.switch_backend('TkAgg') 64 | 65 | # Residuals 66 | fig, ax = plt.subplots(figsize=(12, 8)) 67 | plot_regress_exog(model, "Feature1", fig=fig) 68 | plt.show() 69 | 70 | # Q-Q plot 71 | qqplot(model.resid, line="s") 72 | plt.show() -------------------------------------------------------------------------------- /Chapter 9/logistic_regression.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 8 | 9 | # Step 0: Generate sample data 10 | np.random.seed(0) 11 | n_samples = 100 12 | X = np.random.rand(n_samples, 2) # Two features 13 | y = (X[:, 0] + X[:, 1] > 1).astype(int) # Binary classification based on a condition 14 | 15 | # Create a pandas DataFrame 16 | data = {'Feature1': X[:, 0], 'Feature2': X[:, 1], 'Target': y} 17 | df = pd.DataFrame(data) 18 | 19 | df.to_excel("logistic_regression_input.xlsx") 20 | 21 | # Step 1: Import Excel data into a pandas DataFrame 22 | excel_file = "logistic_regression_input.xlsx" 23 | df = pd.read_excel(excel_file) 24 | 25 | # Step 2: Split data into training and testing sets 26 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 27 | 28 | # Step 3: Create and train the logistic regression model 29 | model = LogisticRegression() 30 | model.fit(X_train, y_train) 31 | 32 | # Step 4: Visualization 33 | 34 | # Visualization for binary classification 35 | plt.scatter(X_test[y_test == 1][:, 0], X_test[y_test == 1][:, 1], color='blue', label='Class 1 (Actual)') 36 | plt.scatter(X_test[y_test == 0][:, 0], X_test[y_test == 0][:, 1], color='red', label='Class 0 (Actual)') 37 | plt.xlabel('Feature1') 38 | plt.ylabel('Feature2') 39 | 
plt.title('Logistic Regression Prediction') 40 | plt.legend() 41 | plt.show() 42 | 43 | # Step 5: Model Evaluation and Interpretation 44 | y_pred = model.predict(X_test) 45 | 46 | accuracy = accuracy_score(y_test, y_pred) 47 | conf_matrix = confusion_matrix(y_test, y_pred) 48 | class_report = classification_report(y_test, y_pred) 49 | 50 | print("Accuracy:", accuracy) 51 | print("Confusion Matrix:\n", conf_matrix) 52 | print("Classification Report:\n", class_report) 53 | -------------------------------------------------------------------------------- /Chapter1/ch1_create_iris_dataset.R: -------------------------------------------------------------------------------- 1 | library(writexl) 2 | library(janitor) 3 | library(dplyr) 4 | 5 | df <- iris |> clean_names() 6 | 7 | l <- df |> 8 | clean_names() |> 9 | split(f = df$species) 10 | 11 | lt <- c(l, iris = list(df)) 12 | 13 | write_xlsx(lt, path = "ch1/iris_data.xlsx") 14 | -------------------------------------------------------------------------------- /Chapter1/ch1_pkgs.R: -------------------------------------------------------------------------------- 1 | pkgs <- c("openxlsx", "xlsx", "readxl") 2 | install.packages(pkgs, dependencies = TRUE) 3 | lapply(pkgs, library, character.only = TRUE) 4 | -------------------------------------------------------------------------------- /Chapter1/ch1_save_xlsx_as_xlsb.R: -------------------------------------------------------------------------------- 1 | # Load the openxlsx package 2 | library(openxlsx) 3 | 4 | # Set the path to the xlsx file 5 | xlsx_file <- "C:/Users/steve/OneDrive/Desktop/Extending_Excel/ch1/iris_data.xlsx" 6 | 7 | # Open the xlsx file 8 | wb <- openxlsx::loadWorkbook(xlsx_file) 9 | 10 | # Save the xlsx file as an xlsb file 11 | openxlsx::saveWorkbook(wb, "C:/Users/steve/OneDrive/Desktop/Extending_Excel/ch1/iris_data.xlsb") 12 | -------------------------------------------------------------------------------- /Chapter1/excel_sheet_reader.R: 
-------------------------------------------------------------------------------- 1 | read_excel_sheets <- function(filename, single_tbl = FALSE) { 2 | sheets <- readxl::excel_sheets(filename) 3 | 4 | if (single_tbl){ 5 | x <- purrr::map_df(sheets, readxl::read_excel, path = filename) 6 | } else { 7 | x <- purrr::map(sheets, ~ readxl::read_excel(filename, sheet = .x)) 8 | x <- purrr::set_names(x, sheets) # set_names() returns a new list; assign it so the sheet names are kept 9 | } 10 | 11 | x 12 | } 13 | 14 | f <- "Chapter1/iris_data.xlsx" 15 | 16 | read_excel_sheets(f, F) 17 | -------------------------------------------------------------------------------- /Chapter1/iris_data.xlsb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter1/iris_data.xlsb -------------------------------------------------------------------------------- /Chapter1/multisheet_openpyxl.py: -------------------------------------------------------------------------------- 1 | from openpyxl import load_workbook 2 | import pandas as pd 3 | 4 | def read_single_sheet(workbook, sheet_name): 5 | 6 | # Load the sheet from the workbook 7 | sheet = workbook[sheet_name] 8 | 9 | # Read out the raw data including headers 10 | sheet_data_raw = sheet.values 11 | 12 | # Separate the headers into a variable 13 | columns = next(sheet_data_raw)[0:] 14 | 15 | # Create a DataFrame based on the second and subsequent lines of data with the header as column names and return it 16 | return pd.DataFrame(sheet_data_raw, columns=columns) 17 | 18 | 19 | def read_multiple_sheets(file_path): 20 | 21 | # Load the workbook 22 | workbook = load_workbook(file_path) 23 | 24 | # Get a list of all sheet names in the workbook 25 | sheet_names = workbook.sheetnames 26 | 27 | # Cycle through the sheet names, load the data for each and concatenate them into a single DataFrame 28 | return pd.concat([read_single_sheet(workbook=workbook, 
sheet_name=sheet_name) for sheet_name in sheet_names], ignore_index=True) 29 | 30 | # Define the file path and sheet names 31 | file_path = 'iris_data.xlsx' 32 | 33 | # Read the data from multiple sheets 34 | consolidated_data = read_multiple_sheets(file_path) 35 | 36 | # Display the consolidated data 37 | print(consolidated_data.head()) 38 | -------------------------------------------------------------------------------- /Chapter1/open_excel_openpyxl.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | import pandas as pd 3 | 4 | # Load the workbook 5 | wb = openpyxl.load_workbook('iris_data.xlsx') 6 | 7 | # Select the sheet 8 | sheet = wb['versicolor'] 9 | 10 | # Extract the values (including header) 11 | sheet_data_raw = sheet.values 12 | 13 | # Separate the headers into a variable 14 | header = next(sheet_data_raw)[0:] 15 | 16 | # Create a DataFrame based on the second and subsequent lines of data with the header as column names 17 | sheet_data = pd.DataFrame(sheet_data_raw, columns=header) 18 | 19 | print(sheet_data.head()) -------------------------------------------------------------------------------- /Chapter1/open_excel_pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Read the Excel file 4 | df = pd.read_excel('iris_data.xlsx', sheet_name='setosa') 5 | 6 | # Display the first few rows of the DataFrame 7 | print(df.head()) 8 | -------------------------------------------------------------------------------- /Chapter1/read_xlsx_files.R: -------------------------------------------------------------------------------- 1 | # Load Libraries 2 | pkgs <- c("openxlsx", "xlsx", "readxl") 3 | lapply(pkgs, library, character.only = TRUE) 4 | 5 | f_path <- "Chapter1/iris_data.xlsx" 6 | 7 | # Use openxlsx 8 | openxlsx::read.xlsx(f_path) |> head(5) 9 | openxlsx::read.xlsx(f_path, sheet = "iris") |> head(5) 10 | 11 | # Use xlsx 12 | 
xlsx::read.xlsx(file = f_path, sheetIndex = 1) |> head(5) 13 | xlsx::read.xlsx(file = f_path, sheetName = "iris") |> head(5) 14 | 15 | # Use readxl 16 | readxl::read_excel(f_path) |> head(5) 17 | readxl::read_excel(f_path, "iris") |> head(5) 18 | -------------------------------------------------------------------------------- /Chapter12/call_plumber.R: -------------------------------------------------------------------------------- 1 | # Library Load 2 | library(plumber) 3 | 4 | # Set dir and file path 5 | wd <- getwd() 6 | sub_dir <- paste0("/Chapter12/") 7 | full_dir <- paste0(wd, sub_dir) 8 | f <- "plumber_api.R" 9 | f_path <- paste0(full_dir, f) 10 | 11 | # Initiate root 12 | root <- pr(f_path) 13 | root 14 | 15 | root |> pr_run() 16 | -------------------------------------------------------------------------------- /Chapter12/imgs/api_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/api_histogram.png -------------------------------------------------------------------------------- /Chapter12/imgs/enter_api_argument.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/enter_api_argument.png -------------------------------------------------------------------------------- /Chapter12/imgs/get_api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/get_api.png -------------------------------------------------------------------------------- /Chapter12/imgs/swagger_plumber_api_screen.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/swagger_plumber_api_screen.png -------------------------------------------------------------------------------- /Chapter12/imgs/vba_curl_request.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter12/imgs/vba_curl_request.png -------------------------------------------------------------------------------- /Chapter12/plumber_api.R: -------------------------------------------------------------------------------- 1 | #* Plot out data from a random normal distribution 2 | #* @param .mean The mean of the standard normal distribution 3 | #* @get /plot 4 | #* @serializer png 5 | function(.mean) { 6 | mu <- as.numeric(.mean) 7 | hist(rnorm(n = 1000, mean = mu, sd = 1)) 8 | } -------------------------------------------------------------------------------- /Chapter12/vba_plumber_curl_request.bas: -------------------------------------------------------------------------------- 1 | Sub MakeCurlRequestAndInsertImage() 2 | ' Define the curl command 3 | Dim curlCommand As String 4 | curlCommand = "curl -X GET ""http://127.0.0.1:6855/plot?.mean=0"" -H ""accept: image/png"" -o " & Environ("TEMP") & "\temp_image.png" 5 | 6 | ' Run the curl command using Shell 7 | Shell "cmd /c " & curlCommand, vbHide 8 | 9 | ' Create a new worksheet or refer to an existing one (Sheet1) 10 | Dim ws As Worksheet 11 | Set ws = ActiveWorkbook.Worksheets("Sheet1") 12 | 13 | ' Clear previous content in Sheet1 14 | ws.Cells.Clear 15 | 16 | ' Insert the image into the worksheet 17 | ws.Pictures.Insert(Environ("TEMP") & "\temp_image.png").Select 18 | End Sub 19 | 
-------------------------------------------------------------------------------- /Chapter14/auto_xgb.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/auto_xgb.rar -------------------------------------------------------------------------------- /Chapter14/ch14_data.R: -------------------------------------------------------------------------------- 1 | # Library Load 2 | library(tidyverse) 3 | library(writexl) 4 | library(janitor) 5 | 6 | # Write File to disk 7 | file_path <- paste0(getwd(), "/Chapter14/") 8 | 9 | # Split data by cut and clean names of the list 10 | df_list <- split(diamonds, diamonds$cut) |> 11 | clean_names() 12 | 13 | # Write to xlsx 14 | df_list |> 15 | write_xlsx(paste0(file_path, "diamonds_split.xlsx")) 16 | -------------------------------------------------------------------------------- /Chapter14/ch14_diamonds_eda.R: -------------------------------------------------------------------------------- 1 | # Library Load 2 | library(ggplot2) 3 | library(dplyr) 4 | library(healthyR) 5 | library(readxl) 6 | 7 | # Source Functions 8 | source(paste0(getwd(),"/chapter1/excel_sheet_reader.R")) 9 | 10 | # Read data 11 | file_path <- paste0(getwd(), "/Chapter14/") 12 | 13 | df <- read_excel_sheets( 14 | filename = paste0(file_path, "diamonds_split.xlsx"), 15 | single_tbl = TRUE 16 | ) 17 | 18 | # Visualize Data 19 | # Create optimal binning via the opt_bin() function from healthyR 20 | breaks <- tibble(x = df$price) |> 21 | opt_bin(x) |> 22 | pull(value) 23 | head(breaks) 24 | 25 | par(mfrow = c(1, 2)) 26 | hist(df$price, main = "Price Histogram - Default binning", 27 | xlab = "Price", ylab = "Frequency") 28 | hist(df$price, breaks = breaks, main = "Price Histogram - Optimal binning", 29 | xlab = "Price", ylab = "Frequency") 30 | par(mfrow = c(1, 1)) 31 | 32 | df |> 33 | ggplot(aes(x = 
carat, y = price, fill = cut)) + 34 | geom_hex(bins = length(breaks), alpha = 1/5) + 35 | facet_wrap(~ clarity, scales = "free") + 36 | theme_minimal() + 37 | labs( 38 | x = "Carat", 39 | y = "Price", 40 | title = "Diamonds Data", 41 | fill = "Cut" 42 | ) + 43 | hr_scale_color_colorblind() 44 | 45 | df |> 46 | ggplot(aes(x = carat, y = price, fill = cut)) + 47 | geom_boxplot(alpha = 1/5, outlier.color = "lightgrey") + 48 | facet_wrap(~ clarity, scales = "free") + 49 | theme_minimal() + 50 | labs( 51 | x = "Carat", 52 | y = "Price", 53 | title = "Diamonds Data", 54 | fill = "Cut" 55 | ) + 56 | hr_scale_color_colorblind() 57 | 58 | df |> 59 | summarize(m = mean(price), .by = c(clarity, cut)) |> 60 | ggplot(aes(x = clarity, y = m, group = cut, color = cut)) + 61 | geom_point() + 62 | geom_line() + 63 | geom_smooth() + 64 | facet_wrap(~cut, ncol = 2) + 65 | labs(x= "Clarity", 66 | y = "Mean Price", 67 | title = "Mean Price by Clarity and Cut", 68 | color = "Cut") + 69 | theme_minimal() + 70 | hr_scale_color_colorblind() 71 | 72 | df |> 73 | summarize(m = mean(price/carat), .by = c(cut, color, clarity)) |> 74 | ggplot(aes(x = color, y = m, group = clarity, color = clarity)) + 75 | geom_point() + 76 | geom_line() + 77 | facet_wrap(~ cut, ncol = 2, scales = "free") + 78 | labs(x= "Color", 79 | y = "Mean Price per Carat", 80 | title = "Mean Price per Carat by Clarity, Color and Cut", 81 | color = "Clarity") + 82 | theme_minimal() + 83 | hr_scale_color_colorblind() 84 | 85 | df |> 86 | ggplot(aes(x = price)) + 87 | geom_histogram(breaks = breaks, fill = "lightblue", 88 | color = "black") + 89 | theme_minimal() + 90 | facet_wrap(~ cut, ncol = 2, scales = 'free') + 91 | labs(x = "Price", y = "Frequency", title = "Price Histogram by Cut") 92 | 93 | -------------------------------------------------------------------------------- /Chapter14/ch14_diamonds_modeling.R: -------------------------------------------------------------------------------- 1 | # Lib Load 2 | library(healthyR.ai) 3 | 
library(dplyr) 4 | 5 | glimpse(head(df, 2)) 6 | 7 | # Pass data through pre-processor 8 | rec_obj <- hai_xgboost_data_prepper( 9 | .data = df, 10 | .recipe_formula = price ~ . 11 | ) 12 | rec_obj 13 | 14 | # Now see the juiced output 15 | get_juiced_data(rec_obj) |> 16 | head(2) |> 17 | glimpse() 18 | 19 | # Now perform modeling using the hai_auto_xgboost() function 20 | auto_xgb <- hai_auto_xgboost( 21 | .data = df, 22 | .rec_obj = rec_obj, 23 | .best_metric = "rsq", 24 | .num_cores = 10, 25 | .model_type = "regression" 26 | ) 27 | 28 | xgb_wflw_fit <- auto_xgb$model_info$fitted_wflw 29 | class(xgb_wflw_fit) 30 | mod_spec <- xgb_wflw_fit[["fit"]][["actions"]][["model"]][["spec"]] 31 | mod_spec 32 | 33 | # Save the model 34 | save_path <- paste0(getwd(), "/Chapter14/") 35 | saveRDS(xgb_wflw_fit, paste0(save_path, "xgb_wflw_fit.rds")) 36 | -------------------------------------------------------------------------------- /Chapter14/imgs/ggplot_diamonds_boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_boxplot.png -------------------------------------------------------------------------------- /Chapter14/imgs/ggplot_diamonds_hex_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_hex_plot.png -------------------------------------------------------------------------------- /Chapter14/imgs/ggplot_diamonds_hist_by_cut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_hist_by_cut.png 
-------------------------------------------------------------------------------- /Chapter14/imgs/ggplot_diamonds_mean_price.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_mean_price.png -------------------------------------------------------------------------------- /Chapter14/imgs/ggplot_diamonds_mean_price_per_carat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/ggplot_diamonds_mean_price_per_carat.png -------------------------------------------------------------------------------- /Chapter14/imgs/hist_default_and_optbin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/imgs/hist_default_and_optbin.png -------------------------------------------------------------------------------- /Chapter14/xgb_wflw_fit.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/Chapter14/xgb_wflw_fit.rds -------------------------------------------------------------------------------- /Chapter2/adding_sheets.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | # Create a new workbook 4 | workbook = openpyxl.Workbook() 5 | 6 | # Add a new sheet 7 | workbook.create_sheet(title="Sheet2") 8 | 9 | # Save the changes 10 | workbook.save("example.xlsx") 11 | -------------------------------------------------------------------------------- 
/Chapter2/cell_update.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | # Load an existing workbook 4 | workbook = openpyxl.load_workbook("example.xlsx") 5 | 6 | # Add a new sheet 7 | workbook.create_sheet(title="Sheet1") 8 | 9 | # Select a sheet 10 | sheet_name = "Sheet1" 11 | 12 | sheet = workbook[sheet_name] 13 | 14 | # Update a cell value 15 | sheet["A1"] = "Hello, World!" 16 | 17 | # Save the changes 18 | workbook.save("example.xlsx") 19 | -------------------------------------------------------------------------------- /Chapter2/create_workbook.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | # Create a new workbook 4 | workbook = openpyxl.Workbook() 5 | -------------------------------------------------------------------------------- /Chapter2/deleting_sheet.py: -------------------------------------------------------------------------------- 1 | import openpyxl 2 | 3 | # Load an existing workbook 4 | workbook = openpyxl.load_workbook("example.xlsx") 5 | 6 | # Delete a sheet 7 | sheet_name = "Sheet2" 8 | sheet = workbook[sheet_name] 9 | workbook.remove(sheet) 10 | -------------------------------------------------------------------------------- /Chapter2/excel_write_bench.R: -------------------------------------------------------------------------------- 1 | library(rbenchmark) 2 | library(xlsx) 3 | library(writexl) 4 | library(openxlsx) 5 | library(dplyr) 6 | 7 | n <- 5 8 | 9 | benchmark( 10 | "writexl" = { 11 | writexl::write_xlsx(iris, tempfile()) 12 | }, 13 | "openxlsx" = { 14 | openxlsx::write.xlsx(iris, tempfile()) 15 | }, 16 | "xlsx" = { 17 | xlsx::write.xlsx(iris, paste0(tempfile(),".xlsx")) 18 | }, 19 | replications = n, 20 | columns = c( 21 | "test","replications","elapsed","relative","user.self","sys.self") 22 | ) |> 23 | arrange(relative) 24 | -------------------------------------------------------------------------------- 
/Chapter2/export2excel_pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create a DataFrame with sample data 4 | data = { 5 | 'Name': ['John', 'Jane', 'Mike'], 6 | 'Age': [25, 30, 35], 7 | 'City': ['New York', 'London', 'Sydney'] 8 | } 9 | df = pd.DataFrame(data) 10 | 11 | # Export the DataFrame to an Excel file 12 | df.to_excel('data.xlsx', index=False) 13 | -------------------------------------------------------------------------------- /Chapter2/output_file_size_compare.R: -------------------------------------------------------------------------------- 1 | writexl::write_xlsx(iris, tmp1 <- tempfile()) 2 | file.info(tmp1)$size 3 | 4 | openxlsx::write.xlsx(iris, tmp2 <- tempfile()) 5 | file.info(tmp2)$size 6 | 7 | xlsx::write.xlsx(iris, tmp3 <- paste0(tempfile(),".xlsx")) 8 | file.info(tmp3)$size 9 | -------------------------------------------------------------------------------- /Chapter7/ch7_tables_with_R.R: -------------------------------------------------------------------------------- 1 | # Convert the dataset to a data frame 2 | df <- as.data.frame(UCBAdmissions) 3 | # Create a contingency table using xtabs() 4 | xtabs(Freq ~ Gender + Admit, df) 5 | 6 | # The gt package 7 | if(!require(gt)){install.packages("gt", dependencies = TRUE)} 8 | library(dplyr) 9 | library(tibble) 10 | 11 | tab <- mtcars |> 12 | rownames_to_column() |> 13 | arrange(factor(cyl), mpg) |> 14 | group_by(cyl) |> 15 | slice(1:3) |> 16 | gt() 17 | 18 | tab <- tab |> 19 | tab_spanner( 20 | label = "Performance", 21 | columns = c(mpg, disp, hp, drat, wt, qsec) 22 | ) 23 | 24 | tab <- tab |> 25 | tab_spanner( 26 | label = "Specs", 27 | columns = c(vs, am, gear, carb) 28 | ) 29 | 30 | tab <- tab |> 31 | tab_header( 32 | title = md("The Cars of **mtcars**"), 33 | subtitle = "These are some fine automobiles" 34 | ) 35 | 36 | tab 37 | 38 | # pivot_table() with tidyquant 39 | library(tidyquant) 40 | library(purrr) 41 | 42 
| pivot_table(.data = iris, 43 | .rows = ~ Species, 44 | .values = c(~ mean(Sepal.Length), ~ mean(Sepal.Width))) |> 45 | set_names("Species","Mean_Sepal_Length","Mean_Sepal_Width") 46 | -------------------------------------------------------------------------------- /Chapter8/ch8.R: -------------------------------------------------------------------------------- 1 | # The skimr package 2 | if(!require(skimr)){install.packages("skimr")} 3 | library(skimr) 4 | skim(iris) 5 | 6 | if(!require(TidyDensity)){install.packages("TidyDensity")} 7 | tidy_normal() |> skim() 8 | 9 | if(!require(GGally)){install.packages("GGally")} 10 | library(GGally) 11 | library(TidyDensity) 12 | tidy_normal(.n = 200) |> 13 | ggpairs(columns = c("y","p","q","dx","dy")) 14 | 15 | if(!require(DataExplorer)){install.packages("DataExplorer")} 16 | library(DataExplorer) 17 | library(TidyDensity) 18 | library(dplyr) 19 | 20 | df <- tidy_normal(.n = 200) 21 | 22 | df |> 23 | introduce() |> 24 | glimpse() 25 | 26 | df |> 27 | plot_intro() + 28 | theme_minimal() 29 | 30 | df |> 31 | plot_qq() 32 | 33 | df[c("q","y")] |> 34 | plot_qq() 35 | -------------------------------------------------------------------------------- /Chapter9/ch9_linear_reg.R: -------------------------------------------------------------------------------- 1 | 2 | # Library Load ------------------------------------------------------------ 3 | 4 | library(readxl) 5 | 6 | 7 | # Get Data ---------------------------------------------------------------- 8 | 9 | df <- read_xlsx( 10 | path = "chapter1/iris_data.xlsx", 11 | sheet = "iris" 12 | ) 13 | 14 | head(df) 15 | 16 | # Split the dataset by species 17 | iris_split <- split(df, df$species) 18 | 19 | # Define the dependent variable and independent variables 20 | dependent_variable <- "petal_length" 21 | independent_variables <- c("petal_width", "sepal_length", "sepal_width") 22 | f_x <- formula( 23 | paste( 24 | dependent_variable, 25 | "~", 26 | paste( 27 | independent_variables, 28 | 
collapse = " + " 29 | ) 30 | ) 31 | ) 32 | 33 | # Create a function to perform linear regression on each subset 34 | perform_linear_regression <- function(data) { 35 | lm_model <- lm(f_x, data = data) 36 | return(lm_model) 37 | } 38 | 39 | # Apply the linear regression to each subset using lapply 40 | results <- lapply(iris_split, perform_linear_regression) 41 | 42 | # Get the summary of each linear model 43 | lapply(results, summary) 44 | 45 | # Plot the model performance 46 | par(mfrow = c(2,2)) 47 | lapply(results, plot) 48 | par(mfrow = c(1, 1)) 49 | 50 | # The above can also be rewritten as follows 51 | # Fit a linear model for each species 52 | lm_models <- lapply(iris_split, function(df) lm(f_x, data = df)) 53 | 54 | # Summarize the results 55 | lapply(lm_models, summary) 56 | -------------------------------------------------------------------------------- /Chapter9/ch9_linear_reg_tidymodels.R: -------------------------------------------------------------------------------- 1 | 2 | # Library Load ------------------------------------------------------------ 3 | 4 | library(readxl) 5 | library(tidymodels) 6 | library(purrr) 7 | library(performance) 8 | 9 | 10 | # Get Data ---------------------------------------------------------------- 11 | 12 | df <- read_xlsx( 13 | path = "chapter1/iris_data.xlsx", 14 | sheet = "iris" 15 | ) 16 | 17 | # Split the data by Species ----------------------------------------------- 18 | 19 | iris_list <- split(df, df$species) 20 | 21 | # Specify the Model ------------------------------------------------------- 22 | 23 | lm_model <- linear_reg(mode = "regression", engine = "lm") 24 | 25 | # Define Formula ---------------------------------------------------------- 26 | 27 | f_x <- formula(paste("petal_width", "~", "petal_length + sepal_width + sepal_length")) 28 | 29 | # Perform Linear Regression using purrr ----------------------------------- 30 | # Create The Model 31 | lm_mod <- linear_reg(mode = "regression", engine = "lm") 32 | 
33 | # Make the workflow 34 | wf <- workflow() |> 35 | add_model(lm_mod) 36 | 37 | # Make the function that will get mapped 38 | lm_fit_list <- function(df) { 39 | #create recipe 40 | recipe_train <- recipe(f_x, data = df) %>% 41 | step_normalize(all_predictors()) 42 | 43 | #fit workflow on the data 44 | fit_wf <- wf |> 45 | add_recipe(recipe_train) |> 46 | fit(data = df) 47 | 48 | fit_wf 49 | } 50 | 51 | # Map the linear model ---------------------------------------------------- 52 | 53 | model_list <- map(iris_list, lm_fit_list) 54 | lapply(model_list, tidy) 55 | lapply(model_list, glance) 56 | 57 | # Check the Model 58 | model_list |> 59 | map(extract_fit_engine) |> 60 | map(check_model) 61 | 62 | # Alternate Nested Method 63 | nested_lm <- df |> 64 | nest(data = -species) |> 65 | mutate(split = map(data, ~ initial_split(., prop = 8/10)), 66 | train = map(split, ~ training(.)), 67 | test = map(split, ~ testing(.)), 68 | fit = map(train, ~ lm(f_x, data = .)), 69 | pred = map2(.x = fit, .y = test, ~ predict(object = .x, newdata = .y))) 70 | 71 | nested_lm |> 72 | select(species, pred) |> 73 | unnest(pred) -------------------------------------------------------------------------------- /Chapter9/ch9_logistic_reg.R: -------------------------------------------------------------------------------- 1 | 2 | # Library Load ------------------------------------------------------------ 3 | 4 | 5 | library(tidyverse) 6 | 7 | df <- Titanic |> 8 | as.data.frame() |> 9 | uncount(Freq) 10 | 11 | 12 | # Splits ------------------------------------------------------------------ 13 | 14 | # Split the data into training and test sets 15 | set.seed(123) 16 | train_index <- sample(nrow(df), floor(nrow(df) * 0.8), replace = FALSE) 17 | train <- df[train_index, ] 18 | test <- df[-train_index, ] 19 | 20 | # Train a model ----------------------------------------------------------- 21 | 22 | # Train the logistic regression model 23 | model <- glm(Survived ~ Sex + Age + Class, data = train, 
family = "binomial") 24 | 25 | # Predict ----------------------------------------------------------------- 26 | 27 | # Evaluate the model on the test set 28 | predictions <- predict(model, newdata = test, type = "response") 29 | pred_resp <- ifelse(predictions <= 0.5, "No", "Yes") 30 | 31 | # Calculate the accuracy of the model 32 | accuracy <- mean(pred_resp == test$Survived) 33 | 34 | # Print the accuracy of the model 35 | print(accuracy) 36 | 37 | # Print the confusion matrix 38 | table(pred_resp, test$Survived) 39 | -------------------------------------------------------------------------------- /Chapter9/ch9_logistic_reg_tidymodels.R: -------------------------------------------------------------------------------- 1 | # Library Load ------------------------------------------------------------ 2 | 3 | library(tidymodels) 4 | library(healthyR.ai) 5 | 6 | # Convert to a tibble for tidymodels 7 | df <- Titanic |> 8 | as_tibble() |> 9 | uncount(n) |> 10 | mutate(across(where(is.character), as.factor)) 11 | 12 | # Splits ------------------------------------------------------------------ 13 | 14 | # Set seed for reproducibility 15 | set.seed(123) 16 | 17 | # Split the data into training and test sets 18 | split <- initial_split(df, prop = 0.8) 19 | train <- training(split) 20 | test <- testing(split) 21 | 22 | # Train a model ----------------------------------------------------------- 23 | 24 | # Create a recipe for pre-processing 25 | recipe <- recipe(Survived ~ Sex + Age + Class, data = train) 26 | 27 | # Specify logistic regression as the model 28 | log_reg <- logistic_reg() |> 29 | set_engine("glm", family = "binomial") 30 | 31 | # Combine the recipe and model into a workflow 32 | workflow <- workflow() %>% 33 | add_recipe(recipe) %>% 34 | add_model(log_reg) 35 | 36 | # Train the logistic regression model 37 | fit <- fit(workflow, data = train) 38 | 39 | # Predict ----------------------------------------------------------------- 40 | 41 | # Predict on the test 
set 42 | predictions <- predict(fit, new_data = test) |> 43 | bind_cols(test) |> 44 | select(Class:Survived, .pred_class) 45 | 46 | # Better method 47 | pred_fit_tbl <- fit |> 48 | augment(new_data = test) 49 | 50 | # Accuracy Check ---------------------------------------------------------- 51 | 52 | # Accuracy metrics for the model to be scored against from the healthyR.ai package 53 | perf <- hai_default_classification_metric_set() 54 | 55 | # Calculate the accuracy metrics 56 | perf(pred_fit_tbl, truth = Survived, estimate = .pred_class) 57 | 58 | # Print the confusion matrix 59 | predictions |> 60 | conf_mat(truth = Survived, estimate = .pred_class) 61 | 62 | # Use broom to tidy and glance the fitted model 63 | tidy(fit, exponentiate = TRUE, conf.int = TRUE) 64 | glance(fit) 65 | 66 | # Visualize' -------------------------------------------------------------- 67 | 68 | # ROC Curve 69 | roc_curve(pred_fit_tbl, truth = Survived, .pred_Yes, event_level = "second") |> 70 | autoplot() -------------------------------------------------------------------------------- /Extending-Excel-with-Python-and-R.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /GroupingExample.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/GroupingExample.xlsx -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 
Copyright (c) 2023 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Machine Learning Summit 2025

2 | 3 | ## Machine Learning Summit 2025 4 | **Bridging Theory and Practice: ML Solutions for Today’s Challenges** 5 | 6 | 3 days, 20+ experts, and 25+ tech sessions and talks covering critical aspects of: 7 | - **Agentic and Generative AI** 8 | - **Applied Machine Learning in the Real World** 9 | - **ML Engineering and Optimization** 10 | 11 | 👉 [Book your ticket now >>](https://packt.link/mlsumgh) 12 | 13 | --- 14 | 15 | ## Join Our Newsletters 📬 16 | 17 | ### DataPro 18 | *The future of AI is unfolding. Don’t fall behind.* 19 | 20 |

DataPro QR

21 | 22 | Stay ahead with [**DataPro**](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes), the free weekly newsletter for data scientists, AI/ML researchers, and data engineers. 23 | From trending tools like **PyTorch**, **scikit-learn**, **XGBoost**, and **BentoML** to hands-on insights on **database optimization** and real-world **ML workflows**, you’ll get what matters, fast. 24 | 25 | > Stay sharp with [DataPro](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes). Join **115K+ data professionals** who never miss a beat. 26 | 27 | --- 28 | 29 | ### BIPro 30 | *Business runs on data. Make sure yours tells the right story.* 31 | 32 |

BIPro QR

33 | 34 | [**BIPro**](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes) is your free weekly newsletter for BI professionals, analysts, and data leaders. 35 | Get practical tips on **dashboarding**, **data visualization**, and **analytics strategy** with tools like **Power BI**, **Tableau**, **Looker**, **SQL**, and **dbt**. 36 | 37 | > Get smarter with [BIPro](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes). Trusted by **35K+ BI professionals**, see what you’re missing. 38 | 39 | # Extending Excel with Python and R 40 | 41 | Extending Excel with Python and R 42 | 43 | This is the code repository for [Extending Excel with Python and R](https://www.packtpub.com/product/extending-excel-with-python-and-r/9781804610695), published by Packt. 44 | 45 | **Unlock the potential of analytics languages for advanced data manipulation and visualization** 46 | 47 | ## What is this book about? 48 | 49 | For businesses, data analysis and visualization are crucial for informed decision-making; however, Excel’s limitations can make these tasks time-consuming and challenging. Extending Excel with Python and R is a game-changer resource, written by experts Steven Sanderson, the author of the healthyverse suite of R packages, and David Kun, co-founder of Functional Analytics, the company behind the ownR platform engineering solution for R, Python, and other data science languages. 
50 | 51 | This book covers the following exciting features: 52 | * Read and write Excel files with R and Python libraries 53 | * Automate Excel tasks with R and Python scripts 54 | * Use R and Python to execute Excel VBA macros 55 | * Format Excel sheets using R and Python packages 56 | * Create graphs with ggplot2 and Matplotlib in Excel 57 | * Analyze Excel data with statistical methods and time series analysis 58 | * Explore various methods to call R and Python functions from Excel 59 | 60 | If you feel this book is for you, get your [copy](https://www.amazon.com/Extending-Excel-Python-manipulation-visualization/dp/1804610690/ref=sr_1_1?sr=8-1) today! 61 | 62 | 63 | ## Instructions and Navigations 64 | All of the code is organized into folders. 65 | 66 | The code will look like the following: 67 | ``` 68 | install.packages("devtools") 69 | # Install development version from GitHub 70 | devtools::install_github( 71 | 'R-package/styledTables', 72 | build_vignettes = TRUE 73 | ) 74 | ``` 75 | 76 | **Following is what you need for this book:** 77 | If you’re a data analyst, data scientist, quant, actuary, or other data practitioner looking to enhance your Excel skills and expand your data analysis capabilities with R and Python, this book is for you. It provides a comprehensive introduction to the topics covered, making it suitable for both beginners and intermediate learners. A basic understanding of Excel, Python, and R is all you need to get started. 78 | 79 | With the following software and hardware list you can run all code files present in the book (Chapters 1-12). 
80 | 81 | ### Software and Hardware List 82 | 83 | | Chapter | Software required | OS required | 84 | | -------- | -------------------------------------------------------------------------------------| -----------------------------------| 85 | | 1-12 | R | Windows (for the VBA parts), macOS, or Linux (for all content excluding VBA) | 86 | | 1-12 | Python 3.11 | | 87 | | 1-12 | Excel (including VBA) | | 88 | 89 | ### Related products 90 | * Data Modeling with Microsoft Excel [[Packt]](https://www.packtpub.com/product/data-modeling-with-microsoft-excel/9781803240282) [[Amazon]](https://www.amazon.com/Data-Modeling-Microsoft-Excel-comprehensive/dp/1803240288/ref=sr_1_1?sr=8-1) 91 | 92 | * Building Interactive Dashboards in Microsoft 365 Excel [[Packt]](https://www.packtpub.com/product/building-interactive-dashboards-in-microsoft-365-excel/9781803237299) [[Amazon]](https://www.amazon.com/Building-Interactive-Dashboards-Microsoft-Excel/dp/1803237295/ref=sr_1_1?sr=8-1) 93 | 94 | ## Get to Know the Authors 95 | **Steven Sanderson** has been working in healthcare for almost 20 years with a focus in the last 12 years on analytics. Steven has spent those years working on dashboards, automations, and visualizations for clinical, finance, and IT operations. Steven is also the author of the healthyverse suite of R packages, which are in active development. Steven received his MPH from Stony Brook University School of Medicine Graduate Program in Public Health. 96 | 97 | **David Kun** is the co-founder of Functional Analytics, the company behind the ownR platform engineering solution for R, Python, and other data science languages. He is a qualified actuary with two MSc degrees focused on mathematics. He has been using R since his MSc thesis in 2006 and Python since 2018. 
98 | 99 | -------------------------------------------------------------------------------- /aligned_table_openpyxl.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/aligned_table_openpyxl.xlsx -------------------------------------------------------------------------------- /aligned_table_pandas.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/aligned_table_pandas.xlsx -------------------------------------------------------------------------------- /chapter6/ch6_barplot.R: -------------------------------------------------------------------------------- 1 | library(healthyR.data) 2 | library(healthyR) 3 | library(ggplot2) 4 | library(dplyr) 5 | library(forcats) 6 | library(purrr) 7 | 8 | df <- healthyR_data |> 9 | filter(payer_grouping != '?') |> 10 | category_counts_tbl( 11 | .count_col = payer_grouping 12 | , .arrange = TRUE 13 | , ip_op_flag 14 | ) |> 15 | group_by(ip_op_flag) |> 16 | mutate(order_var = paste0( 17 | sprintf("%02i", as.integer(rank(n))), 18 | " - ", 19 | payer_grouping 20 | )) |> 21 | ungroup() 22 | 23 | ggplot(df, aes(x = order_var, y = n)) + 24 | geom_col(alpha = 0.328) + 25 | labs(x = "", y = "") + 26 | theme(legend.position = "none") + 27 | facet_wrap(~ ip_op_flag, scale = "free") + 28 | scale_x_discrete(labels = with(df, as.character(payer_grouping) |> 29 | set_names(order_var))) + 30 | xlab(NULL) + 31 | theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=.5)) + 32 | coord_flip() + 33 | theme_minimal() 34 | -------------------------------------------------------------------------------- /chapter6/ch6_cowplot.R: -------------------------------------------------------------------------------- 1 | # Install Libraries 2 
| install.packages("ggplot2") 3 | install.packages("cowplot") 4 | 5 | # Load required libraries 6 | library(ggplot2) 7 | library(cowplot) 8 | 9 | # Load the Iris dataset 10 | data(iris) 11 | 12 | # Create separate histograms for each species 13 | histograms <- list() 14 | for (species in unique(iris$Species)) { 15 | data_subset <- iris[iris$Species == species, ] 16 | 17 | histogram <- ggplot(data_subset, aes(x = Sepal.Width)) + 18 | geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") + 19 | labs(title = paste("Sepal Width Histogram for", species)) + 20 | labs(x = "", y = "") + 21 | theme_minimal() 22 | 23 | histograms[[species]] <- histogram 24 | } 25 | 26 | # Create histogram for all species combined 27 | all_species_hist <- ggplot(iris, aes(x = Sepal.Width)) + 28 | geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") + 29 | labs(title = "Sepal Width Histogram for All Species") + 30 | theme_minimal() 31 | 32 | # Arrange histograms using cowplot 33 | plot_grid( 34 | histograms[["setosa"]], 35 | histograms[["versicolor"]], 36 | histograms[["virginica"]], 37 | all_species_hist, 38 | ncol = 2, 39 | align = "hv" 40 | ) 41 | 42 | histograms <- lapply(unique(iris$Species), function(species) { 43 | data_subset <- iris[iris$Species == species, ] 44 | 45 | histogram <- ggplot(data_subset, aes(x = Sepal.Width)) + 46 | geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") + 47 | labs(title = paste("Sepal Width Histogram for", species)) + 48 | labs(x = "", y = "") + 49 | theme_minimal() 50 | 51 | return(histogram) 52 | }) 53 | 54 | histograms 55 | -------------------------------------------------------------------------------- /chapter6/ch6_dumbell_plot.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(dplyr) 3 | 4 | # Sample data 5 | data <- data.frame( 6 | Category = c("A", "B", "C", "D"), 7 | Initial = c(10, 15, 8, 12), 8 | Final = c(18, 22, 14, 16) 9 | ) 10 | 11 
| # Calculate the midpoint for positioning the dots and lines 12 | data <- data %>% 13 | mutate(Midpoint = (Initial + Final) / 2) 14 | 15 | # Create the dumbbell plot using ggplot2 16 | dumbbell_plot <- ggplot(data, aes(x = Category, xend = Category, 17 | y = Initial, yend = Final)) + 18 | geom_segment(color = "gray50") + # Lines connecting dots 19 | geom_point(color = "blue", size = 3) + # Initial values 20 | geom_point(aes(y = Final), color = "orange", size = 3) + # Final values 21 | geom_point(aes(y = Midpoint), color = "green", size = 3) + # Midpoint Values 22 | geom_text(aes(label = Midpoint), 23 | y = data$Midpoint, vjust = -.5, size = 3) + # Midpoint labels 24 | labs(title = "Dumbbell Plot", 25 | x = "Category", 26 | y = "Values") + 27 | theme_minimal() 28 | 29 | # Print the plot 30 | dumbbell_plot 31 | -------------------------------------------------------------------------------- /chapter6/ch6_ggplot2.R: -------------------------------------------------------------------------------- 1 | install.packages("ggplot2") 2 | library(ggplot2) 3 | 4 | # Make a histogram of the sepal width for all species 5 | hist(iris$Sepal.Width) 6 | 7 | # Make a histogram of the sepal width for each species 8 | par(mfrow = c(2,2)) 9 | for (species in unique(iris$Species)) { 10 | hist(iris$Sepal.Width[iris$Species == species], main = species, 11 | xlab = species) 12 | } 13 | hist(iris$Sepal.Width, main = "All Species") 14 | par(mfrow = c(1,1)) 15 | 16 | # Make a histogram of the sepal width for all species 17 | iris |> 18 | ggplot(aes(x = Sepal.Width)) + 19 | geom_histogram(alpha = 0.328) + 20 | theme_minimal() 21 | 22 | # Make a histogram of the sepal width for each species 23 | iris |> 24 | ggplot(aes(x = Sepal.Width, fill = Species)) + 25 | geom_histogram(alpha = 0.328) + 26 | theme_minimal() 27 | 28 | # Make a histogram of the sepal width for each species and facet them 29 | iris |> 30 | ggplot(aes(x = Sepal.Width, fill = Species)) + 31 | geom_histogram(alpha = 0.328) + 32 | 
facet_wrap(~ Species, scales = "free") + 33 | theme_minimal() 34 | -------------------------------------------------------------------------------- /chapter6/ch6_timeseries.R: -------------------------------------------------------------------------------- 1 | plot.ts(AirPassengers) 2 | plot(decompose(AirPassengers)) 3 | 4 | library(healthyR.ts) 5 | 6 | ts_brownian_motion() |> 7 | ts_brownian_motion_plot(t, y) 8 | -------------------------------------------------------------------------------- /chapter6/imgs/payergroup_barplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/chapter6/imgs/payergroup_barplot.png -------------------------------------------------------------------------------- /colored_table_openpyxl.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/colored_table_openpyxl.xlsx -------------------------------------------------------------------------------- /colored_table_pandas.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/colored_table_pandas.xlsx -------------------------------------------------------------------------------- /conditional_formatting.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/conditional_formatting.xlsx -------------------------------------------------------------------------------- /data.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/data.xlsx -------------------------------------------------------------------------------- /dirty_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/dirty_data.xlsx -------------------------------------------------------------------------------- /example.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/example.xlsx -------------------------------------------------------------------------------- /heatmap_with_conditional_formatting.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/heatmap_with_conditional_formatting.xlsx -------------------------------------------------------------------------------- /iris_data.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/iris_data.xlsm -------------------------------------------------------------------------------- /iris_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/iris_data.xlsx -------------------------------------------------------------------------------- /linear_regression_input.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/linear_regression_input.xlsx -------------------------------------------------------------------------------- /logistic_regression_input.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/logistic_regression_input.xlsx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/requirements.txt -------------------------------------------------------------------------------- /requirements.txt.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/requirements.txt.bak -------------------------------------------------------------------------------- /styled_table_openpyxl.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/styled_table_openpyxl.xlsx -------------------------------------------------------------------------------- /styled_table_pandas.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/styled_table_pandas.xlsx -------------------------------------------------------------------------------- /time_series_data.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Extending-Excel-with-Python-and-R/8410bd18b9355182d60173f5b27f9019a42b4196/time_series_data.xlsx --------------------------------------------------------------------------------