├── notebooks ├── model_scores.p ├── arima_model_scores.p ├── 04_viewing_results.ipynb └── 03_arima_modeling.ipynb ├── model_output ├── lstm_forecast.png ├── XGBoost_forecast.png ├── arima_forecast.png ├── compare_models.png ├── RandomForest_forecast.png └── LinearRegression_forecast.png ├── README.md ├── results.py ├── data_preprocessing.py └── models.py /notebooks/model_scores.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mollyryanruby/sales_forecasting/HEAD/notebooks/model_scores.p -------------------------------------------------------------------------------- /model_output/lstm_forecast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mollyryanruby/sales_forecasting/HEAD/model_output/lstm_forecast.png -------------------------------------------------------------------------------- /notebooks/arima_model_scores.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mollyryanruby/sales_forecasting/HEAD/notebooks/arima_model_scores.p -------------------------------------------------------------------------------- /model_output/XGBoost_forecast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mollyryanruby/sales_forecasting/HEAD/model_output/XGBoost_forecast.png -------------------------------------------------------------------------------- /model_output/arima_forecast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mollyryanruby/sales_forecasting/HEAD/model_output/arima_forecast.png -------------------------------------------------------------------------------- /model_output/compare_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mollyryanruby/sales_forecasting/HEAD/model_output/compare_models.png -------------------------------------------------------------------------------- /model_output/RandomForest_forecast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mollyryanruby/sales_forecasting/HEAD/model_output/RandomForest_forecast.png -------------------------------------------------------------------------------- /model_output/LinearRegression_forecast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mollyryanruby/sales_forecasting/HEAD/model_output/LinearRegression_forecast.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NEW LOCATION FOR UPDATED CODE: https://github.com/mollyryanruby/auto_forecast 2 | # 5 Machine Learning Techniques for Forecasting Sales 3 | 4 | ## Objective: 5 | Predict the number of monthly product sales using regressive and time-series modeling techniques. 6 | Paper: 7 | https://medium.com/towards-data-science/5-machine-learning-techniques-for-sales-forecasting-598e4984b109 8 | 9 | ## Featured Techniques: 10 | * EDA 11 | * Linear Regression 12 | * Random Forest Regression 13 | * XGBoost 14 | * Long Short Term Memory (artifical recurrent neural network) 15 | * ARIMA Time Series Forecasting 16 | 17 | ## Results: 18 | * Best results were obtained from the XGBoost and LSTM models 19 | * All models predicted within 2% of monthly mean sales for 12 month prediction 20 | 21 | ## Data Source: 22 | https://www.kaggle.com/c/demand-forecasting-kernels-only/data 23 | 24 | -------------------------------------------------------------------------------- /results.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script creates a csv of the model scores and outputs a plot to visualize 3 | the comparison. 4 | 5 | Models include Linear Regression, Random Forest, XGBoost, LSTM,and ARIMA. 6 | """ 7 | 8 | import pickle 9 | import pandas as pd 10 | import numpy as np 11 | 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | 15 | def create_results_df(): 16 | """Returns a pandas dataframe with the root mean squared error, mean 17 | absolute error, and R2 score for each model. 18 | """ 19 | # Load pickled scores for each model 20 | results_dict = pickle.load(open("model_scores.p", "rb")) 21 | 22 | # Create pandas df and save as csv 23 | results_df = pd.DataFrame.from_dict(results_dict, orient='index', 24 | columns=['RMSE', 'MAE', 'R2']) 25 | 26 | results_df = results_df.sort_values(by='RMSE', 27 | ascending=False).reset_index() 28 | 29 | results_df.to_csv('../data/results.csv') 30 | 31 | return results_df 32 | 33 | def plot_results(results_df): 34 | """Generates and saves and lineplot with one line indicating RMSE scores 35 | for each model and one line indicating MAE scores for each model. 36 | """ 37 | fig, ax = plt.subplots(figsize=(12, 5)) 38 | sns.lineplot(np.arange(len(results_df)), 'RMSE', data=results_df, ax=ax, 39 | label='RMSE', color='mediumblue') 40 | sns.lineplot(np.arange(len(results_df)), 'MAE', data=results_df, ax=ax, 41 | label='MAE', color='Cyan') 42 | 43 | plt.xticks(np.arange(len(results_df)), rotation=45) 44 | ax.set_xticklabels(results_df['index']) 45 | ax.set(xlabel="Model", 46 | ylabel="Scores", 47 | title="Model Error Comparison") 48 | sns.despine() 49 | 50 | plt.savefig(f'../model_output/compare_models.png') 51 | 52 | def main(): 53 | """Calls functions to compare modelling results""" 54 | 55 | results = create_results_df() 56 | plot_results(results) 57 | 58 | main() 59 | -------------------------------------------------------------------------------- /data_preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script loads data from Kaggle, generates monthly dataframe and performs 3 | differencing to create stationarity. Exports csv files for regression 4 | modeling and for Arima modeling. 5 | 6 | Data: https://www.kaggle.com/c/demand-forecasting-kernels-only/data 7 | 8 | Output CSV files 9 | -- ../data/monthly_data.csv 10 | -- ../data/stationary_df.csv 11 | -- ../data/model_df.csv 12 | -- ../data/arima_df.csv 13 | """ 14 | 15 | import pandas as pd 16 | 17 | def load_data(): 18 | """Returns a pandas dataframe from the train data set in Kaggle's Demand 19 | Forecasting competition. 20 | """ 21 | 22 | url = """https://www.kaggle.com/c/demand-forecasting-kernels-only/download/ 23 | ryQFx3IEtFjqjv3s0dXL%2Fversions%2FzjbSfpE39fdJlMotCpen%2Ffiles%2 24 | Ftrain.csv""" 25 | 26 | return pd.read_csv(url) 27 | 28 | 29 | def monthly_sales(data): 30 | """Returns a dataframe where each row represents total sales for a given 31 | month. Columns include 'date' by month and 'sales'. 32 | """ 33 | monthly_data = data.copy() 34 | 35 | # Drop the day indicator from the date column 36 | monthly_data.date = monthly_data.date.apply(lambda x: str(x)[:-3]) 37 | 38 | # Sum sales per month 39 | monthly_data = monthly_data.groupby('date')['sales'].sum().reset_index() 40 | monthly_data.date = pd.to_datetime(monthly_data.date) 41 | 42 | monthly_data.to_csv('../data/monthly_data.csv') 43 | 44 | return monthly_data 45 | 46 | def get_diff(data): 47 | """Returns the dataframe with a column for sales difference between each 48 | month. Results in a stationary time series dataframe. Prior EDA revealed 49 | that the monthly data was not stationary as it had a time-dependent mean. 50 | """ 51 | data['sales_diff'] = data.sales.diff() 52 | data = data.dropna() 53 | 54 | data.to_csv('../data/stationary_df.csv') 55 | 56 | return data 57 | 58 | 59 | def generate_supervised(data): 60 | """Generates a csv file where each row represents a month and columns 61 | include sales, the dependent variable, and prior sales for each lag. Based 62 | on EDA, 12 lag features are generated. Data is used for regression modeling. 63 | 64 | Output df: 65 | month1 sales lag1 lag2 lag3 ... lag11 lag12 66 | month2 sales lag1 lag2 lag3 ... lag11 lag12 67 | """ 68 | supervised_df = data.copy() 69 | 70 | #create column for each lag 71 | for i in range(1, 13): 72 | col_name = 'lag_' + str(i) 73 | supervised_df[col_name] = supervised_df['sales_diff'].shift(i) 74 | 75 | #drop null values 76 | supervised_df = supervised_df.dropna().reset_index(drop=True) 77 | 78 | supervised_df.to_csv('../data/model_df.csv', index=False) 79 | 80 | def generate_arima_data(data): 81 | """Generates a csv file with a datetime index and a dependent sales column 82 | for ARIMA modeling. 83 | """ 84 | dt_data = data.set_index('date').drop('sales', axis=1) 85 | dt_data.dropna(axis=0) 86 | 87 | dt_data.to_csv('../data/arima_df.csv') 88 | 89 | 90 | def main(): 91 | """Loads data from Kaggle, generates monthly dataframe and performs 92 | differencing to create stationarity. Exports csv files for regression 93 | modeling and for Arima modeling. 94 | """ 95 | sales_data = load_data() 96 | monthly_df = monthly_sales(sales_data) 97 | stationary_df = get_diff(monthly_df) 98 | 99 | generate_supervised(stationary_df) 100 | generate_arima_data(stationary_df) 101 | 102 | main() 103 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script predicts the number of monthly product sales using regressive and 3 | time-series modeling techniques. A graph of predicted values against actual 4 | values is plotted for each model and the root mean squared error, mean absolute 5 | error, and R2 scores are pickled for comparison. 6 | 7 | Modeling techniques include: 8 | -- Linear Regression 9 | -- Random Forest Regression 10 | -- XGBoost 11 | -- Long Short Term Memory (artifical recurrent neural network) 12 | -- ARIMA Time Series Forecasting 13 | 14 | """ 15 | 16 | import pickle 17 | 18 | import pandas as pd 19 | import numpy as np 20 | 21 | import matplotlib.pyplot as plt 22 | import seaborn as sns 23 | 24 | from sklearn.preprocessing import MinMaxScaler 25 | from sklearn.linear_model import LinearRegression 26 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 27 | from sklearn.ensemble import RandomForestRegressor 28 | from xgboost.sklearn import XGBRegressor 29 | 30 | import keras 31 | from keras.layers import Dense 32 | from keras.models import Sequential 33 | from keras.layers import LSTM 34 | 35 | import statsmodels.api as sm 36 | 37 | def load_data(file_name): 38 | """Returns a pandas dataframe from a csv file.""" 39 | return pd.read_csv(file_name) 40 | 41 | model_scores = {} 42 | 43 | def tts(data): 44 | """Splits the data into train and test. Test set consists of the last 12 45 | months of data. 46 | """ 47 | data = data.drop(['sales', 'date'], axis=1) 48 | train, test = data[0:-12].values, data[-12:].values 49 | 50 | return train, test 51 | 52 | def scale_data(train_set, test_set): 53 | """Scales data using MinMaxScaler and separates data into X_train, y_train, 54 | X_test, and y_test. 55 | 56 | Keyword Arguments: 57 | -- train_set: dataset used to train the model 58 | -- test_set: dataset used to test the model 59 | """ 60 | 61 | #apply Min Max Scaler 62 | scaler = MinMaxScaler(feature_range=(-1, 1)) 63 | scaler = scaler.fit(train_set) 64 | 65 | # reshape training set 66 | train_set = train_set.reshape(train_set.shape[0], train_set.shape[1]) 67 | train_set_scaled = scaler.transform(train_set) 68 | 69 | # reshape test set 70 | test_set = test_set.reshape(test_set.shape[0], test_set.shape[1]) 71 | test_set_scaled = scaler.transform(test_set) 72 | 73 | X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1].ravel() 74 | X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1].ravel() 75 | 76 | return X_train, y_train, X_test, y_test, scaler 77 | 78 | def undo_scaling(y_pred, x_test, scaler_obj, lstm=False): 79 | """For visualizing and comparing results, undoes the scaling effect on 80 | predictions. 81 | 82 | Keyword arguments: 83 | -- y_pred: model predictions 84 | -- x_test: features from the test set used for predictions 85 | -- scaler_obj: the scaler objects used for min-max scaling 86 | -- lstm: indicate if the model run is the lstm. If True, additional 87 | transformation occurs 88 | """ 89 | 90 | #reshape y_pred 91 | y_pred = y_pred.reshape(y_pred.shape[0], 1, 1) 92 | 93 | if not lstm: 94 | x_test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1]) 95 | 96 | #rebuild test set for inverse transform 97 | pred_test_set = [] 98 | for index in range(0, len(y_pred)): 99 | pred_test_set.append(np.concatenate([y_pred[index], x_test[index]], 100 | axis=1)) 101 | 102 | #reshape pred_test_set 103 | pred_test_set = np.array(pred_test_set) 104 | pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], 105 | pred_test_set.shape[2]) 106 | 107 | #inverse transform 108 | pred_test_set_inverted = scaler_obj.inverse_transform(pred_test_set) 109 | 110 | return pred_test_set_inverted 111 | 112 | def predict_df(unscaled_predictions, original_df): 113 | """Generates a dataframe that shows the predicted sales for each month 114 | for plotting results. 115 | 116 | Keyword arguments: 117 | -- unscaled_predictions: the model predictions that do not have min-max or 118 | other scaling applied 119 | -- original_df: the original monthly sales dataframe 120 | """ 121 | #create dataframe that shows the predicted sales 122 | result_list = [] 123 | sales_dates = list(original_df[-13:].date) 124 | act_sales = list(original_df[-13:].sales) 125 | 126 | for index in range(0, len(unscaled_predictions)): 127 | result_dict = {} 128 | result_dict['pred_value'] = int(unscaled_predictions[index][0] + 129 | act_sales[index]) 130 | result_dict['date'] = sales_dates[index+1] 131 | result_list.append(result_dict) 132 | 133 | df_result = pd.DataFrame(result_list) 134 | 135 | return df_result 136 | 137 | def get_scores(unscaled_df, original_df, model_name): 138 | """Prints the root mean squared error, mean absolute error, and r2 scores 139 | for each model. Saves all results in a model_scores dictionary for 140 | comparison. 141 | 142 | Keyword arguments: 143 | -- unscaled_predictions: the model predictions that do not have min-max or 144 | other scaling applied 145 | -- original_df: the original monthly sales dataframe 146 | -- model_name: the name that will be used to store model scores 147 | """ 148 | rmse = np.sqrt(mean_squared_error(original_df.sales[-12:], unscaled_df.pred_value[-12:])) 149 | mae = mean_absolute_error(original_df.sales[-12:], unscaled_df.pred_value[-12:]) 150 | r2 = r2_score(original_df.sales[-12:], unscaled_df.pred_value[-12:]) 151 | model_scores[model_name] = [rmse, mae, r2] 152 | 153 | print(f"RMSE: {rmse}") 154 | print(f"MAE: {mae}") 155 | print(f"R2 Score: {r2}") 156 | 157 | def plot_results(results, original_df, model_name): 158 | """Plots predictions over original data to visualize results. Saves each 159 | plot as a png. 160 | 161 | Keyword arguments: 162 | -- results: a dataframe with unscaled predictions 163 | -- original_df: the original monthly sales dataframe 164 | -- model_name: the name that will be used in the plot title 165 | """ 166 | fig, ax = plt.subplots(figsize=(15, 5)) 167 | sns.lineplot(original_df.date, original_df.sales, data=original_df, ax=ax, 168 | label='Original', color='mediumblue') 169 | sns.lineplot(results.date, results.pred_value, data=results, ax=ax, 170 | label='Predicted', color='red') 171 | ax.set(xlabel="Date", 172 | ylabel="Sales", 173 | title=f"{model_name} Sales Forecasting Prediction") 174 | ax.legend() 175 | sns.despine() 176 | 177 | plt.savefig(f'../model_output/{model_name}_forecast.png') 178 | 179 | def regressive_model(train_data, test_data, model, model_name): 180 | """Runs regressive models in SKlearn framework. First calls scale_data 181 | to split into X and y and scale the data. Then fits and predicts. Finally, 182 | predictions are unscaled, scores are printed, and results are plotted and 183 | saved. 184 | 185 | Keyword arguments: 186 | -- train_set: dataset used to train the model 187 | -- test_set: dataset used to test the model 188 | -- model: the sklearn model and model arguments in the form of 189 | model(kwarga) 190 | -- model_name: the name that will be used to store model scores and plotting 191 | """ 192 | 193 | # Split into X & y and scale data 194 | X_train, y_train, X_test, y_test, scaler_object = scale_data(train_data, 195 | test_data) 196 | # Run sklearn models 197 | mod = model 198 | mod.fit(X_train, y_train) 199 | predictions = mod.predict(X_test) 200 | 201 | # Undo scaling to compare predictions against original data 202 | original_df = load_data('../data/monthly_data.csv') 203 | unscaled = undo_scaling(predictions, X_test, scaler_object) 204 | unscaled_df = predict_df(unscaled, original_df) 205 | 206 | # print scores and plot results 207 | get_scores(unscaled_df, original_df, model_name) 208 | plot_results(unscaled_df, original_df, model_name) 209 | 210 | def lstm_model(train_data, test_data): 211 | """Runs a long-short-term-memory nueral net with 2 dense layers. Generates 212 | predictions that are then unscaled. Scores are printed and results are 213 | plotted and saved. 214 | 215 | Keyword arguments: 216 | -- train_set: dataset used to train the model 217 | -- test_set: dataset used to test the model 218 | """ 219 | 220 | # Split into X & y and scale data 221 | X_train, y_train, X_test, y_test, scaler_object = scale_data(train_data, test_data) 222 | 223 | X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1]) 224 | X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1]) 225 | 226 | # Build LSTM 227 | model = Sequential() 228 | model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True)) 229 | model.add(Dense(1)) 230 | model.add(Dense(1)) 231 | model.compile(loss='mean_squared_error', optimizer='adam') 232 | model.fit(X_train, y_train, epochs=200, batch_size=1, verbose=1, shuffle=False) 233 | predictions = model.predict(X_test, batch_size=1) 234 | 235 | # Undo scaling to compare predictions against original data 236 | original_df = load_data('../data/monthly_data.csv') 237 | unscaled = undo_scaling(predictions, X_test, scaler_object, lstm=True) 238 | unscaled_df = predict_df(unscaled, original_df) 239 | 240 | # print scores and plot results 241 | get_scores(unscaled_df, original_df, 'LSTM') 242 | plot_results(unscaled_df, original_df, 'LSTM') 243 | 244 | def sarimax_model(data): 245 | """Runs an arima model with 12 lags and yearly seasonal impact. Generates 246 | dynamic predictions for last 12 months. Prints and saves scores and plots 247 | results. 248 | """ 249 | # Model 250 | sar = sm.tsa.statespace.SARIMAX(data.sales_diff, order=(12, 0, 0), 251 | seasonal_order=(0, 1, 0, 12), 252 | trend='c').fit() 253 | 254 | # Generate predictions 255 | start, end, dynamic = 40, 100, 7 256 | data['pred_value'] = sar.predict(start=start, end=end, dynamic=dynamic) 257 | 258 | # Generate predictions dataframe 259 | original_df = load_data('../data/monthly_data.csv') 260 | unscaled_df = predict_df(data, original_df) 261 | 262 | # print scores and plot results 263 | get_scores(unscaled_df, original_df, 'ARIMA') 264 | plot_results(unscaled_df, original_df, 'ARIMA') 265 | 266 | def main(): 267 | """Calls all functions to load data, run regression models, run lstm model, 268 | and run arima model. 269 | """ 270 | # Regression models 271 | model_df = load_data('../data/model_df.csv') 272 | train, test = tts(model_df) 273 | 274 | # Sklearn 275 | regressive_model(train, test, LinearRegression(), 'LinearRegression') 276 | regressive_model(train, test, RandomForestRegressor(n_estimators=100, 277 | max_depth=20), 278 | 'RandomForest') 279 | regressive_model(train, test, XGBRegressor(n_estimators=100, 280 | learning_rate=0.2, 281 | objective='reg:squarederror'), 282 | 'XGBoost') 283 | # Keras 284 | lstm_model(train, test) 285 | 286 | # Arima 287 | ts_data = load_data('../data/arima_df.csv').set_index('date') 288 | ts_data.index = pd.to_datetime(ts_data.index) 289 | 290 | sarimax_model(ts_data) 291 | 292 | main() 293 | 294 | # Save mmodel scores to compare all model results in results.py 295 | pickle.dump(model_scores, open("model_scores.p", "wb")) 296 | 297 | 298 | # 299 | -------------------------------------------------------------------------------- /notebooks/04_viewing_results.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 87, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import seaborn as sns\n", 14 | "%matplotlib inline\n", 15 | "\n", 16 | "import pickle" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Create Results Dataframe" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 98, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "def create_results_df():\n", 33 | " results_dict = pickle.load(open(\"model_scores.p\", \"rb\"))\n", 34 | " \n", 35 | " results_dict.update(pickle.load(open(\"arima_model_scores.p\", \"rb\")))\n", 36 | " \n", 37 | " restults_df = pd.DataFrame.from_dict(results_dict, orient='index', \n", 38 | " columns=['RMSE', 'MAE','R2'])\n", 39 | " \n", 40 | " restults_df = restults_df.sort_values(by='RMSE', ascending=False).reset_index()\n", 41 | " \n", 42 | " return restults_df" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 99, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/html": [ 53 | "
\n", 54 | "\n", 67 | "\n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | "
indexRMSEMAER2
0Random Forest18599.23296615832.7500000.987794
1LinearRegression16221.04079112433.0000000.990716
2ARIMA14959.89346711265.3357490.983564
3LSTM14638.74835011951.0833330.992439
4XGBoost13574.79263211649.6666670.993498
\n", 115 | "
" 116 | ], 117 | "text/plain": [ 118 | " index RMSE MAE R2\n", 119 | "0 Random Forest 18599.232966 15832.750000 0.987794\n", 120 | "1 LinearRegression 16221.040791 12433.000000 0.990716\n", 121 | "2 ARIMA 14959.893467 11265.335749 0.983564\n", 122 | "3 LSTM 14638.748350 11951.083333 0.992439\n", 123 | "4 XGBoost 13574.792632 11649.666667 0.993498" 124 | ] 125 | }, 126 | "execution_count": 99, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "results = create_results_df()\n", 133 | "results" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "# Plot Results" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 117, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "def plot_results(results_df):\n", 150 | " fig, ax = plt.subplots(figsize=(12, 5))\n", 151 | " sns.lineplot(np.arange(len(results_df)), 'RMSE', data=results_df, ax=ax, \n", 152 | " label='RMSE', color='mediumblue')\n", 153 | " sns.lineplot(np.arange(len(results_df)), 'MAE', data=results_df, ax=ax, \n", 154 | " label='MAE', color='Cyan')\n", 155 | " \n", 156 | " plt.xticks(np.arange(len(results_df)),rotation=45)\n", 157 | " ax.set_xticklabels(results_df['index'])\n", 158 | " ax.set(xlabel = \"Model\",\n", 159 | " ylabel = \"Scores\",\n", 160 | " title = \"Model Error Comparison\")\n", 161 | " sns.despine()\n", 162 | " \n", 163 | " plt.savefig(f'../model_output/compare_models.png')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 118, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "image/png": "\n", 174 | "text/plain": [ 175 | "
" 176 | ] 177 | }, 178 | "metadata": { 179 | "needs_background": "light" 180 | }, 181 | "output_type": "display_data" 182 | } 183 | ], 184 | "source": [ 185 | "plot_results(results)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 111, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "With XGBoost, prediction is within 1.3% of the actual.\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "average_monthly_sales = 894478 #see eda notebook\n", 203 | "gboost = 11649.666667\n", 204 | "percentage_off = round(gboost/average_monthly_sales*100, 2)\n", 205 | "\n", 206 | "print(f\"With XGBoost, prediction is within {percentage_off}% of the actual.\")" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python [conda env:metis] *", 220 | "language": "python", 221 | "name": "conda-env-metis-py" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.7.4" 234 | }, 235 | "toc": { 236 | "base_numbering": 1, 237 | "nav_menu": {}, 238 | "number_sections": true, 239 | "sideBar": true, 240 | "skip_h1_title": false, 241 | "title_cell": "Table of Contents", 242 | "title_sidebar": "Contents", 243 | "toc_cell": false, 244 | "toc_position": {}, 245 | "toc_section_display": true, 246 | "toc_window_display": true 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 2 251 | } 252 | -------------------------------------------------------------------------------- /notebooks/03_arima_modeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import seaborn as sns\n", 14 | "%matplotlib inline\n", 15 | "\n", 16 | "import statsmodels.tsa.api as smt\n", 17 | "import statsmodels.api as sm\n", 18 | "from statsmodels.tools.eval_measures import rmse\n", 19 | "\n", 20 | "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", 21 | "\n", 22 | "import pickle" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# Load Data" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "def load_data():\n", 39 | " return pd.read_csv('../data/arima_df.csv').set_index('date')\n", 40 | "\n", 41 | "ts_data = load_data()" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "ts_data.index = pd.to_datetime(ts_data.index)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# SARIMAX Modeling" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 162, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "def get_scores(data):\n", 67 | " \n", 68 | " model_scores = {}\n", 69 | " \n", 70 | " rmse = np.sqrt(mean_squared_error(data.sales_diff[-12:], data.forecast[-12:]))\n", 71 | " mae = mean_absolute_error(data.sales_diff[-12:], data.forecast[-12:])\n", 72 | " r2 = r2_score(data.sales_diff[-12:], data.forecast[-12:])\n", 73 | " model_scores['ARIMA'] = [rmse, mae, r2]\n", 74 | " \n", 75 | " print(f\"RMSE: {rmse}\")\n", 76 | " print(f\"MAE: {mae}\")\n", 77 | " print(f\"R2 Score: {r2}\")\n", 78 | " \n", 79 | " pickle.dump(model_scores, open( \"arima_model_scores.p\", \"wb\" ))" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 175, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stderr", 89 | "output_type": "stream", 90 | "text": [ 91 | "/Users/mollyliebeskind/opt/anaconda3/envs/metis/lib/python3.7/site-packages/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.\n", 92 | " % freq, ValueWarning)\n", 93 | "/Users/mollyliebeskind/opt/anaconda3/envs/metis/lib/python3.7/site-packages/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.\n", 94 | " % freq, ValueWarning)\n" 95 | ] 96 | }, 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "RMSE: 14959.893467320022\n", 102 | "MAE: 11265.335748850031\n", 103 | "R2 Score: 0.9835642876210896\n" 104 | ] 105 | }, 106 | { 107 | "data": { 108 | "image/png": "\n", 109 | "text/plain": [ 110 | "
" 111 | ] 112 | }, 113 | "metadata": { 114 | "needs_background": "light" 115 | }, 116 | "output_type": "display_data" 117 | } 118 | ], 119 | "source": [ 120 | "def sarimax_model(data):\n", 121 | " \n", 122 | " # Model\n", 123 | " sar = sm.tsa.statespace.SARIMAX(ts_data.sales_diff, order=(12,0,0), seasonal_order=(0,1,0,12), trend='c').fit()\n", 124 | "\n", 125 | " # Predictions\n", 126 | " start, end, dynamic = 40, 100, 7\n", 127 | " data['forecast'] = sar.predict(start=start, end=end, dynamic=dynamic) \n", 128 | " pred_df = data.forecast[start+dynamic:end]\n", 129 | " \n", 130 | " data[['sales_diff', 'forecast']].plot(color=['mediumblue', 'Red'])\n", 131 | " \n", 132 | " get_scores(data)\n", 133 | "\n", 134 | " return sar, data, pred_df\n", 135 | "\n", 136 | "sar, ts_data, predictions = sarimax_model(ts_data)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 164, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "image/png": "\n", 147 | "text/plain": [ 148 | "
" 149 | ] 150 | }, 151 | "metadata": { 152 | "needs_background": "light" 153 | }, 154 | "output_type": "display_data" 155 | } 156 | ], 157 | "source": [ 158 | "sar.plot_diagnostics(figsize=(10, 8));" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Plot Results" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 165, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "def predict_df(prediction_df):\n", 175 | " \n", 176 | " #load in original dataframe without scaling applied\n", 177 | " original_df = pd.read_csv('../data/train.csv')\n", 178 | " original_df.date = original_df.date.apply(lambda x: str(x)[:-3])\n", 179 | " original_df = original_df.groupby('date')['sales'].sum().reset_index()\n", 180 | " original_df.date = pd.to_datetime(original_df.date)\n", 181 | " \n", 182 | " #create dataframe that shows the predicted sales\n", 183 | " result_list = []\n", 184 | " sales_dates = list(original_df[-13:].date)\n", 185 | " act_sales = list(original_df[-13:].sales)\n", 186 | " \n", 187 | " for index in range(0,len(prediction_df)):\n", 188 | " result_dict = {}\n", 189 | " result_dict['pred_value'] = int(prediction_df[index] + act_sales[index])\n", 190 | " result_dict['date'] = sales_dates[index+1]\n", 191 | " result_list.append(result_dict)\n", 192 | " \n", 193 | " df_result = pd.DataFrame(result_list)\n", 194 | " \n", 195 | " return df_result, original_df" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 173, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "def plot_results(results, original_df, model_name):\n", 205 | "\n", 206 | " fig, ax = plt.subplots(figsize=(15,5))\n", 207 | " sns.lineplot(original_df.date, original_df.sales, data=original_df, ax=ax, \n", 208 | " label='Original', color='mediumblue')\n", 209 | " sns.lineplot(results.date, results.pred_value, data=results, ax=ax, \n", 210 | " label='Predicted', color='Red')\n", 211 | " \n", 212 | " ax.set(xlabel = \"Date\",\n", 213 | " ylabel = \"Sales\",\n", 214 | " title = f\"{model_name} Sales Forecasting Prediction\")\n", 215 | " \n", 216 | " ax.legend()\n", 217 | " \n", 218 | " sns.despine()\n", 219 | " \n", 220 | "\n", 221 | " plt.savefig(f'../model_output/{model_name}_forecast.png')" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 174, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "image/png": "\n", 232 | "text/plain": [ 233 | "
" 234 | ] 235 | }, 236 | "metadata": { 237 | "needs_background": "light" 238 | }, 239 | "output_type": "display_data" 240 | } 241 | ], 242 | "source": [ 243 | "prediction_df, original_df = predict_df(predictions)\n", 244 | "plot_results(prediction_df, original_df, 'arima')" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "We can also get dynamic predictions. One-step-ahead prediction uses the true values of the endogenous values at each step to predict the next in-sample value. Dynamic predictions use one-step-ahead prediction up to some point in the dataset (specified by the dynamic argument); after that, the previous predicted endogenous values are used in place of the true endogenous values for each new predicted element." 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [] 260 | } 261 | ], 262 | "metadata": { 263 | "kernelspec": { 264 | "display_name": "Python [conda env:metis] *", 265 | "language": "python", 266 | "name": "conda-env-metis-py" 267 | }, 268 | "language_info": { 269 | "codemirror_mode": { 270 | "name": "ipython", 271 | "version": 3 272 | }, 273 | "file_extension": ".py", 274 | "mimetype": "text/x-python", 275 | "name": "python", 276 | "nbconvert_exporter": "python", 277 | "pygments_lexer": "ipython3", 278 | "version": "3.7.4" 279 | }, 280 | "toc": { 281 | "base_numbering": 1, 282 | "nav_menu": {}, 283 | "number_sections": true, 284 | "sideBar": true, 285 | "skip_h1_title": false, 286 | "title_cell": "Table of Contents", 287 | "title_sidebar": "Contents", 288 | "toc_cell": false, 289 | "toc_position": {}, 290 | "toc_section_display": true, 291 | "toc_window_display": true 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 2 296 | } 297 | --------------------------------------------------------------------------------