├── xgboost_model.pkl
├── __pycache__
│   ├── viz.cpython-312.pyc
│   ├── utils.cpython-312.pyc
│   ├── config.cpython-312.pyc
│   ├── models.cpython-312.pyc
│   ├── evaluation.cpython-312.pyc
│   ├── advanced_viz.cpython-312.pyc
│   ├── data_loader.cpython-312.pyc
│   ├── advanced_models.cpython-312.pyc
│   └── advanced_evaluation.cpython-312.pyc
├── requirements.txt
├── evaluation.py
├── viz.py
├── data_loader.py
├── phyton_project (1).py
├── config.py
├── TODO.md
├── models.py
├── utils.py
├── advanced_models.py
├── advanced_evaluation.py
├── advanced_viz.py
├── README.md
├── app.py
└── app_advanced.py

/xgboost_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/xgboost_model.pkl
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | tensorflow==2.16.1
3 | scikit-learn
4 | pandas
5 | numpy
6 | matplotlib
7 | seaborn
8 | xgboost
9 | plotly
10 | statsmodels
11 | scipy
12 | joblib
--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
3 | 
4 | def mean_absolute_percentage_error(y_true, y_pred):
5 |     """
6 |     Calculate Mean Absolute Percentage Error (MAPE).
7 |     Zero-valued targets are masked out to avoid division by zero.
8 |     """
9 |     y_true, y_pred = np.array(y_true), np.array(y_pred)
10 |     nonzero = y_true != 0
11 |     return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
12 | 
13 | def evaluate_model(y_true, y_pred):
14 |     """
15 |     Evaluate model with MAE, RMSE, MAPE, and R2 Score.
16 |     """
17 |     mae = mean_absolute_error(y_true, y_pred)
18 |     rmse = np.sqrt(mean_squared_error(y_true, y_pred))
19 |     mape = mean_absolute_percentage_error(y_true, y_pred)
20 |     r2 = r2_score(y_true, y_pred)
21 |     return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}
--------------------------------------------------------------------------------
/viz.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import seaborn as sns
3 | import pandas as pd
4 | 
5 | def plot_forecast(y_true, y_pred, title='Forecast vs Actual'):
6 |     """
7 |     Plot observed vs predicted values.
8 |     """
9 |     plt.figure(figsize=(10, 6))
10 |     plt.plot(y_true.index, y_true, label='Observed', color='blue')
11 |     plt.plot(y_true.index, y_pred, label='Predicted', color='red')
12 |     plt.xlabel('Date')
13 |     plt.ylabel('Value')
14 |     plt.title(title)
15 |     plt.legend()
16 |     plt.show()
17 | 
18 | def plot_residuals(y_true, y_pred):
19 |     """
20 |     Plot residuals.
21 |     """
22 |     residuals = y_true - y_pred
23 |     plt.figure(figsize=(10, 6))
24 |     plt.plot(residuals.index, residuals, color='green')
25 |     plt.axhline(0, color='black', linestyle='--')
26 |     plt.xlabel('Date')
27 |     plt.ylabel('Residuals')
28 |     plt.title('Residuals Plot')
29 |     plt.show()
30 | 
31 | def plot_distribution(residuals):
32 |     """
33 |     Plot distribution of residuals.
34 |     """
35 |     plt.figure(figsize=(8, 6))
36 |     sns.histplot(residuals, kde=True)
37 |     plt.title('Residuals Distribution')
38 |     plt.show()
--------------------------------------------------------------------------------
/data_loader.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from sklearn.model_selection import train_test_split
4 | 
5 | def load_data(file_path, date_col='date', target_col='target'):
6 |     """
7 |     Load time series data from CSV.
8 |     Assumes the CSV has a date column and a target column.
9 |     """
10 |     df = pd.read_csv(file_path, parse_dates=[date_col])
11 |     df.set_index(date_col, inplace=True)
12 |     return df
13 | 
14 | def add_lag_features(df, target_col='target', lags=5):
15 |     """
16 |     Add lag features to the dataframe.
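    Illustrative example (a tiny frame with a 'target' column; with lags=2 the
    first two rows are dropped because their lags are NaN):

        >>> df = pd.DataFrame({'target': range(10)},
        ...                   index=pd.date_range('2022-01-01', periods=10))
        >>> add_lag_features(df, 'target', lags=2).columns.tolist()
        ['target', 'lag_1', 'lag_2']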
17 | """ 18 | for lag in range(1, lags + 1): 19 | df[f'lag_{lag}'] = df[target_col].shift(lag) 20 | df.dropna(inplace=True) 21 | return df 22 | 23 | def preprocess_data(df, target_col='target', lags=5, test_size=0.2): 24 | """ 25 | Preprocess data: add lags, split into train and test. 26 | """ 27 | df = add_lag_features(df, target_col, lags) 28 | X = df.drop(columns=[target_col]) 29 | y = df[target_col] 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) 31 | return X_train, X_test, y_train, y_test 32 | 33 | def create_sample_data(): 34 | """ 35 | Create sample data for demonstration. 36 | """ 37 | date_range = pd.date_range(start='2022-01-01', periods=100, freq='D') 38 | time_series_data = np.cumsum(np.random.randn(100)) 39 | df = pd.DataFrame({'date': date_range, 'target': time_series_data}) 40 | df.set_index('date', inplace=True) 41 | return df 42 | -------------------------------------------------------------------------------- /phyton_project (1).py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """phyton-project 3 | 4 | Automatically generated by Colab. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/1ejf8lG6yOr-FjL99Nn2dNliPjCHgpxPz 8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | import xgboost as xgb 13 | import matplotlib.pyplot as plt 14 | import seaborn as sns 15 | 16 | # Load your actual time series data into a pandas DataFrame (replace this with your data) 17 | # For demonstration purposes, let's create a sample dataset 18 | date_range = pd.date_range(start='2022-01-01', periods=100, freq='D') 19 | time_series_data = np.cumsum(np.random.randn(100)) 20 | df = pd.DataFrame({'date': date_range, 'target': time_series_data}) 21 | 22 | # Convert datetime column to features (year, month, day) 23 | df['year'] = df['date'].dt.year 24 | df['month'] = df['date'].dt.month 25 | df['day'] = df['date'].dt.day 26 | 27 | # Drop the original datetime column 28 | df.drop(columns=['date'], inplace=True) 29 | 30 | # Split data into train and validation sets 31 | train_size = int(0.8 * len(df)) 32 | train, val = df[:train_size], df[train_size:] 33 | 34 | # Define features and target 35 | X_train, y_train = train.drop(columns=['target']), train['target'] 36 | X_val, y_val = val.drop(columns=['target']), val['target'] 37 | 38 | # Train an XGBoost model 39 | model = xgb.XGBRegressor() 40 | model.fit(X_train, y_train) 41 | 42 | # Make predictions 43 | y_pred = model.predict(X_val) 44 | 45 | # Visualize observed vs. 
predicted values 46 | plt.plot(val.index, y_val, label='Observed', color='blue') 47 | plt.plot(val.index, y_pred, label='Predicted', color='red') 48 | plt.xlabel('Date') 49 | plt.ylabel('Value') 50 | plt.title('XGBoost Time Series Forecasting') 51 | plt.legend() 52 | plt.show() -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Configuration settings for the Time Series Forecasting App 4 | 5 | # Default hyperparameters for models 6 | DEFAULT_PARAMS = { 7 | 'XGBoost': { 8 | 'n_estimators': 100, 9 | 'learning_rate': 0.1, 10 | 'max_depth': 6 11 | }, 12 | 'Random Forest': { 13 | 'n_estimators': 100, 14 | 'max_depth': 10, 15 | 'min_samples_split': 2 16 | }, 17 | 'LSTM': { 18 | 'units': 50, 19 | 'epochs': 10, 20 | 'batch_size': 32, 21 | 'dropout': 0.2 22 | } 23 | } 24 | 25 | # Hyperparameter tuning grids 26 | TUNING_GRIDS = { 27 | 'XGBoost': { 28 | 'n_estimators': [50, 100, 150], 29 | 'learning_rate': [0.01, 0.1, 0.2], 30 | 'max_depth': [3, 6, 9] 31 | }, 32 | 'Random Forest': { 33 | 'n_estimators': [50, 100, 150], 34 | 'max_depth': [5, 10, 15], 35 | 'min_samples_split': [2, 5, 10] 36 | } 37 | } 38 | 39 | # Data preprocessing settings 40 | DATA_SETTINGS = { 41 | 'default_lags': 5, 42 | 'test_size': 0.2, 43 | 'date_col': 'date', 44 | 'target_col': 'target' 45 | } 46 | 47 | # Visualization settings 48 | VIZ_SETTINGS = { 49 | 'figsize': (10, 6), 50 | 'colors': { 51 | 'observed': 'blue', 52 | 'predicted': 'red', 53 | 'residuals': 'green' 54 | } 55 | } 56 | 57 | # App settings 58 | APP_SETTINGS = { 59 | 'title': 'Advanced Time Series Forecasting Web App', 60 | 'sidebar_title': 'Configuration', 61 | 'default_model': 'XGBoost' 62 | } 63 | 64 | # File paths 65 | DATA_DIR = 'data' 66 | MODEL_DIR = 'models' 67 | RESULTS_DIR = 'results' 68 | 69 | # Create directories if they don't exist 70 | for dir_path in [DATA_DIR, MODEL_DIR, RESULTS_DIR]: 71 | if not os.path.exists(dir_path): 72 | os.makedirs(dir_path) 73 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # TODO List for Enhancing Time Series Forecasting Project 2 | 3 | ## Step 1: Create requirements.txt ✅ 4 | - List all necessary dependencies: streamlit, tensorflow, scikit-learn, pandas, numpy, matplotlib, seaborn, xgboost 5 | 6 | ## Step 2: Create data_loader.py ✅ 7 | - Implement functions for loading CSV data, preprocessing (handling dates, adding lag features), splitting into train/val 8 | 9 | ## Step 3: Create models.py ✅ 10 | - Implement XGBoost, Random Forest, and LSTM models with hyperparameter tuning support 11 | 12 | ## Step 4: Create evaluation.py ✅ 13 | - Implement evaluation metrics: MAE, RMSE, MAPE 14 | 15 | ## Step 5: Create viz.py ✅ 16 | - Implement visualization functions: observed vs predicted plots, residuals, etc. 
17 | 18 | ## Step 6: Create app.py ✅ 19 | - Build Streamlit web app with UI for data upload, feature selection, model choice, hyperparams, and displaying results 20 | 21 | ## Step 7: Install dependencies ✅ 22 | - Run pip install -r requirements.txt 23 | 24 | ## Step 8: Run and test the Streamlit app ✅ 25 | - Execute streamlit run app.py and verify all features work 26 | 27 | ## Step 9: Verify and finalize ✅ 28 | - Check for any bugs, ensure performance optimizations are in place 29 | 30 | ## Step 10: Add Advanced Features ✅ 31 | - Create config.py for configuration management 32 | - Create utils.py for utility functions (validation, scaling, logging, etc.) 33 | - Create advanced_models.py with additional models (Gradient Boosting, AdaBoost, SVR, MLP, Bidirectional LSTM, GRU, ARIMA, SARIMA) 34 | - Create advanced_evaluation.py with comprehensive metrics (SMAPE, MASE, MDA, etc.) 35 | - Create advanced_viz.py with interactive plots and advanced visualizations 36 | - Update requirements.txt with additional dependencies (plotly, statsmodels, scipy, joblib) 37 | - Enhance app.py with new models, advanced metrics, and interactive visualizations 38 | 39 | ## Step 11: Final Testing and Documentation ✅ 40 | - Test all new features and models 41 | - Ensure backward compatibility 42 | - Add proper error handling 43 | - Create README.md with project description and usage instructions 44 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | from sklearn.ensemble import RandomForestRegressor 3 | from sklearn.model_selection import GridSearchCV 4 | from tensorflow.keras.models import Sequential 5 | from tensorflow.keras.layers import LSTM, Dense 6 | import numpy as np 7 | 8 | def train_xgboost(X_train, y_train, params=None): 9 | """ 10 | Train XGBoost model with optional hyperparameter tuning. 11 | """ 12 | if params is None: 13 | params = {'n_estimators': 100, 'learning_rate': 0.1} 14 | model = xgb.XGBRegressor(**params) 15 | model.fit(X_train, y_train) 16 | return model 17 | 18 | def train_random_forest(X_train, y_train, params=None): 19 | """ 20 | Train Random Forest model with optional hyperparameter tuning. 21 | """ 22 | if params is None: 23 | params = {'n_estimators': 100, 'max_depth': 10} 24 | model = RandomForestRegressor(**params) 25 | model.fit(X_train, y_train) 26 | return model 27 | 28 | def train_lstm(X_train, y_train, params=None): 29 | """ 30 | Train LSTM model. 31 | Note: LSTM requires 3D input (samples, timesteps, features). 32 | Assuming X_train is reshaped appropriately. 33 | """ 34 | if params is None: 35 | params = {'units': 50, 'epochs': 10, 'batch_size': 32} 36 | # Reshape for LSTM: assuming univariate, timesteps=1 for simplicity 37 | X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1])) 38 | model = Sequential() 39 | model.add(LSTM(params['units'], input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]))) 40 | model.add(Dense(1)) 41 | model.compile(optimizer='adam', loss='mse') 42 | model.fit(X_train_reshaped, y_train, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0) 43 | return model 44 | 45 | def tune_hyperparameters(model_type, X_train, y_train, param_grid): 46 | """ 47 | Perform hyperparameter tuning using GridSearchCV. 
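    Note: an integer cv makes GridSearchCV use an unshuffled K-fold split; for
    strictly chronological validation, consider a TimeSeriesSplit instead.
    Illustrative example (hypothetical grid):

        >>> grid = {'n_estimators': [50, 100], 'max_depth': [3, 6]}
        >>> model, best = tune_hyperparameters('xgboost', X_train, y_train, grid)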
48 | """ 49 | if model_type == 'xgboost': 50 | model = xgb.XGBRegressor() 51 | elif model_type == 'random_forest': 52 | model = RandomForestRegressor() 53 | else: 54 | raise ValueError("Unsupported model type for tuning") 55 | grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error') 56 | grid_search.fit(X_train, y_train) 57 | return grid_search.best_estimator_, grid_search.best_params_ 58 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import StandardScaler, MinMaxScaler 4 | import logging 5 | 6 | # Set up logging 7 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 8 | logger = logging.getLogger(__name__) 9 | 10 | def setup_logging(log_level=logging.INFO): 11 | """ 12 | Set up logging configuration. 13 | """ 14 | logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s') 15 | 16 | def validate_data(df, date_col='date', target_col='target'): 17 | """ 18 | Validate the input data. 19 | """ 20 | if df.empty: 21 | raise ValueError("DataFrame is empty") 22 | if date_col not in df.columns: 23 | raise ValueError(f"Date column '{date_col}' not found in data") 24 | if target_col not in df.columns: 25 | raise ValueError(f"Target column '{target_col}' not found in data") 26 | if not pd.api.types.is_datetime64_any_dtype(df[date_col]): 27 | raise ValueError(f"Date column '{date_col}' is not in datetime format") 28 | logger.info("Data validation passed") 29 | 30 | def scale_features(X_train, X_test, method='standard'): 31 | """ 32 | Scale features using StandardScaler or MinMaxScaler. 33 | """ 34 | if method == 'standard': 35 | scaler = StandardScaler() 36 | elif method == 'minmax': 37 | scaler = MinMaxScaler() 38 | else: 39 | raise ValueError("Invalid scaling method. Choose 'standard' or 'minmax'") 40 | 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled, scaler 44 | 45 | def inverse_scale_predictions(scaled_pred, scaler, original_y): 46 | """ 47 | Inverse scale predictions if target was scaled. 48 | """ 49 | # Assuming target is not scaled in this implementation 50 | return scaled_pred 51 | 52 | def save_model(model, filename): 53 | """ 54 | Save trained model to file. 55 | """ 56 | import joblib 57 | joblib.dump(model, filename) 58 | logger.info(f"Model saved to {filename}") 59 | 60 | def load_model(filename): 61 | """ 62 | Load model from file. 63 | """ 64 | import joblib 65 | model = joblib.load(filename) 66 | logger.info(f"Model loaded from {filename}") 67 | return model 68 | 69 | def calculate_forecast_accuracy(y_true, y_pred, threshold=0.1): 70 | """ 71 | Calculate forecast accuracy based on a threshold. 72 | """ 73 | accuracy = np.mean(np.abs((y_true - y_pred) / y_true) < threshold) * 100 74 | return accuracy 75 | 76 | def generate_forecast_report(metrics, model_name): 77 | """ 78 | Generate a summary report of the forecast results. 79 | """ 80 | report = f""" 81 | Forecast Report for {model_name} 82 | ================================ 83 | MAE: {metrics['MAE']:.4f} 84 | RMSE: {metrics['RMSE']:.4f} 85 | MAPE: {metrics['MAPE']:.4f}% 86 | """ 87 | return report 88 | 89 | def detect_outliers(df, column, method='iqr', threshold=1.5): 90 | """ 91 | Detect outliers in a column using IQR or Z-score method. 
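    Illustrative example (IQR fences at Q1 - threshold*IQR and Q3 + threshold*IQR):

        >>> spikes = detect_outliers(df, 'target', method='iqr', threshold=1.5)
        >>> zspikes = detect_outliers(df, 'target', method='zscore', threshold=3.0)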
92 | """ 93 | if method == 'iqr': 94 | Q1 = df[column].quantile(0.25) 95 | Q3 = df[column].quantile(0.75) 96 | IQR = Q3 - Q1 97 | lower_bound = Q1 - threshold * IQR 98 | upper_bound = Q3 + threshold * IQR 99 | outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)] 100 | elif method == 'zscore': 101 | z_scores = np.abs((df[column] - df[column].mean()) / df[column].std()) 102 | outliers = df[z_scores > threshold] 103 | else: 104 | raise ValueError("Invalid outlier detection method. Choose 'iqr' or 'zscore'") 105 | return outliers 106 | 107 | def handle_missing_values(df, method='interpolate'): 108 | """ 109 | Handle missing values in the dataframe. 110 | """ 111 | if method == 'interpolate': 112 | df = df.interpolate(method='linear') 113 | elif method == 'forward_fill': 114 | df = df.fillna(method='ffill') 115 | elif method == 'backward_fill': 116 | df = df.fillna(method='bfill') 117 | elif method == 'drop': 118 | df = df.dropna() 119 | else: 120 | raise ValueError("Invalid missing value handling method") 121 | return df 122 | 123 | def create_time_features(df, date_col='date'): 124 | """ 125 | Create additional time-based features from date column. 126 | """ 127 | df = df.copy() 128 | df['year'] = df[date_col].dt.year 129 | df['month'] = df[date_col].dt.month 130 | df['day'] = df[date_col].dt.day 131 | df['day_of_week'] = df[date_col].dt.dayofweek 132 | df['quarter'] = df[date_col].dt.quarter 133 | df['is_weekend'] = df[date_col].dt.dayofweek.isin([5, 6]).astype(int) 134 | return df 135 | -------------------------------------------------------------------------------- /advanced_models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor 4 | from sklearn.svm import SVR 5 | from sklearn.neural_network import MLPRegressor 6 | from tensorflow.keras.models import Sequential 7 | from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, GRU 8 | from tensorflow.keras.callbacks import EarlyStopping 9 | from statsmodels.tsa.arima.model import ARIMA 10 | from statsmodels.tsa.statespace.sarimax import SARIMAX 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | def train_gradient_boosting(X_train, y_train, params=None): 15 | """ 16 | Train Gradient Boosting model. 17 | """ 18 | if params is None: 19 | params = {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3} 20 | model = GradientBoostingRegressor(**params) 21 | model.fit(X_train, y_train) 22 | return model 23 | 24 | def train_ada_boost(X_train, y_train, params=None): 25 | """ 26 | Train AdaBoost model. 27 | """ 28 | if params is None: 29 | params = {'n_estimators': 50, 'learning_rate': 1.0} 30 | model = AdaBoostRegressor(**params) 31 | model.fit(X_train, y_train) 32 | return model 33 | 34 | def train_svr(X_train, y_train, params=None): 35 | """ 36 | Train Support Vector Regression model. 37 | """ 38 | if params is None: 39 | params = {'kernel': 'rbf', 'C': 1.0, 'epsilon': 0.1} 40 | model = SVR(**params) 41 | model.fit(X_train, y_train) 42 | return model 43 | 44 | def train_mlp(X_train, y_train, params=None): 45 | """ 46 | Train Multi-Layer Perceptron model. 
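    Illustrative example (hypothetical layer sizes; remaining MLPRegressor
    defaults apply):

        >>> mlp = train_mlp(X_train, y_train,
        ...                 params={'hidden_layer_sizes': (64, 32), 'max_iter': 300})
        >>> y_hat = mlp.predict(X_test)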
47 | """ 48 | if params is None: 49 | params = {'hidden_layer_sizes': (100, 50), 'activation': 'relu', 'max_iter': 500} 50 | model = MLPRegressor(**params) 51 | model.fit(X_train, y_train) 52 | return model 53 | 54 | def train_bidirectional_lstm(X_train, y_train, params=None): 55 | """ 56 | Train Bidirectional LSTM model. 57 | """ 58 | if params is None: 59 | params = {'units': 50, 'epochs': 10, 'batch_size': 32, 'dropout': 0.2} 60 | 61 | X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1])) 62 | model = Sequential() 63 | model.add(Bidirectional(LSTM(params['units'], return_sequences=True), input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]))) 64 | model.add(Dropout(params['dropout'])) 65 | model.add(LSTM(params['units'] // 2)) 66 | model.add(Dropout(params['dropout'])) 67 | model.add(Dense(1)) 68 | model.compile(optimizer='adam', loss='mse') 69 | 70 | early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True) 71 | model.fit(X_train_reshaped, y_train, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0, callbacks=[early_stopping]) 72 | return model 73 | 74 | def train_gru(X_train, y_train, params=None): 75 | """ 76 | Train GRU model. 77 | """ 78 | if params is None: 79 | params = {'units': 50, 'epochs': 10, 'batch_size': 32, 'dropout': 0.2} 80 | 81 | X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1])) 82 | model = Sequential() 83 | model.add(GRU(params['units'], input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]))) 84 | model.add(Dropout(params['dropout'])) 85 | model.add(Dense(1)) 86 | model.compile(optimizer='adam', loss='mse') 87 | 88 | early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True) 89 | model.fit(X_train_reshaped, y_train, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0, callbacks=[early_stopping]) 90 | return model 91 | 92 | def train_arima(y_train, order=(5, 1, 0)): 93 | """ 94 | Train ARIMA model. 95 | """ 96 | model = ARIMA(y_train, order=order) 97 | model_fit = model.fit() 98 | return model_fit 99 | 100 | def train_sarima(y_train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)): 101 | """ 102 | Train SARIMA model. 103 | """ 104 | model = SARIMAX(y_train, order=order, seasonal_order=seasonal_order) 105 | model_fit = model.fit(disp=False) 106 | return model_fit 107 | 108 | def ensemble_forecast(models, X_test, weights=None): 109 | """ 110 | Create ensemble forecast from multiple models. 111 | """ 112 | if weights is None: 113 | weights = [1/len(models)] * len(models) 114 | 115 | predictions = [] 116 | for model in models: 117 | if hasattr(model, 'predict'): 118 | pred = model.predict(X_test) 119 | else: 120 | # For statsmodels models 121 | pred = model.forecast(steps=len(X_test)) 122 | predictions.append(pred) 123 | 124 | # Weighted average 125 | ensemble_pred = np.average(predictions, axis=0, weights=weights) 126 | return ensemble_pred 127 | 128 | def train_stacked_model(base_models, meta_model, X_train, y_train, X_val, y_val): 129 | """ 130 | Train a stacked ensemble model. 
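    Illustrative example (hypothetical base/meta choice, with Ridge as the
    meta learner fitted on the base models' validation predictions):

        >>> from sklearn.linear_model import Ridge
        >>> from sklearn.ensemble import RandomForestRegressor
        >>> bases = [RandomForestRegressor(), GradientBoostingRegressor()]
        >>> bases, meta = train_stacked_model(bases, Ridge(),
        ...                                   X_train, y_train, X_val, y_val)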
131 | """ 132 | # Get predictions from base models 133 | base_predictions = [] 134 | for model in base_models: 135 | model.fit(X_train, y_train) 136 | pred = model.predict(X_val) 137 | base_predictions.append(pred) 138 | 139 | # Create meta features 140 | meta_features = np.column_stack(base_predictions) 141 | 142 | # Train meta model 143 | meta_model.fit(meta_features, y_val) 144 | 145 | return base_models, meta_model 146 | -------------------------------------------------------------------------------- /advanced_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 4 | from scipy.stats import shapiro, normaltest 5 | import warnings 6 | warnings.filterwarnings('ignore') 7 | 8 | def mean_absolute_percentage_error(y_true, y_pred): 9 | """ 10 | Calculate Mean Absolute Percentage Error (MAPE). 11 | """ 12 | y_true, y_pred = np.array(y_true), np.array(y_pred) 13 | return np.mean(np.abs((y_true - y_pred) / y_true)) * 100 14 | 15 | def symmetric_mean_absolute_percentage_error(y_true, y_pred): 16 | """ 17 | Calculate Symmetric Mean Absolute Percentage Error (SMAPE). 18 | """ 19 | y_true, y_pred = np.array(y_true), np.array(y_pred) 20 | return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))) 21 | 22 | def mean_absolute_scaled_error(y_true, y_pred, y_train=None, m=1): 23 | """ 24 | Calculate Mean Absolute Scaled Error (MASE). 25 | """ 26 | y_true, y_pred = np.array(y_true), np.array(y_pred) 27 | mae = mean_absolute_error(y_true, y_pred) 28 | 29 | if y_train is not None: 30 | # Calculate naive forecast MAE 31 | y_train = np.array(y_train) 32 | naive_errors = [] 33 | for i in range(m, len(y_train)): 34 | naive_errors.append(abs(y_train[i] - y_train[i-m])) 35 | naive_mae = np.mean(naive_errors) 36 | else: 37 | # Use simple naive method (previous value) 38 | naive_mae = np.mean(np.abs(np.diff(y_true))) 39 | 40 | return mae / naive_mae 41 | 42 | def root_mean_squared_percentage_error(y_true, y_pred): 43 | """ 44 | Calculate Root Mean Squared Percentage Error (RMSPE). 45 | """ 46 | y_true, y_pred = np.array(y_true), np.array(y_pred) 47 | return np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2)) * 100 48 | 49 | def mean_directional_accuracy(y_true, y_pred): 50 | """ 51 | Calculate Mean Directional Accuracy (MDA). 52 | """ 53 | y_true, y_pred = np.array(y_true), np.array(y_pred) 54 | actual_direction = np.sign(np.diff(y_true)) 55 | pred_direction = np.sign(np.diff(y_pred)) 56 | return np.mean(actual_direction == pred_direction) * 100 57 | 58 | def theil_u_statistic(y_true, y_pred): 59 | """ 60 | Calculate Theil's U statistic. 61 | """ 62 | y_true, y_pred = np.array(y_true), np.array(y_pred) 63 | naive_pred = np.roll(y_true, 1)[1:] # Naive forecast: previous value 64 | y_true = y_true[1:] 65 | y_pred = y_pred[1:] 66 | 67 | rmse_model = np.sqrt(mean_squared_error(y_true, y_pred)) 68 | rmse_naive = np.sqrt(mean_squared_error(y_true, naive_pred)) 69 | 70 | return rmse_model / rmse_naive 71 | 72 | def forecast_bias(y_true, y_pred): 73 | """ 74 | Calculate forecast bias (mean error). 75 | """ 76 | return np.mean(y_pred - y_true) 77 | 78 | def tracking_signal(y_true, y_pred, cumulative=True): 79 | """ 80 | Calculate tracking signal. 
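    Note: this cumulative variant divides the running error sum by the running
    sum of absolute errors, so values are bounded in [-1, 1]; the classical
    tracking signal divides the cumulative error by the mean absolute
    deviation instead.

    Illustrative example:

        >>> ts = tracking_signal(np.array(y_test), np.array(y_pred))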
81 | """ 82 | errors = y_pred - y_true 83 | if cumulative: 84 | cum_errors = np.cumsum(errors) 85 | cum_abs_errors = np.cumsum(np.abs(errors)) 86 | return cum_errors / cum_abs_errors 87 | else: 88 | return errors / np.abs(errors) 89 | 90 | def residual_analysis(y_true, y_pred): 91 | """ 92 | Perform comprehensive residual analysis. 93 | """ 94 | residuals = y_true - y_pred 95 | 96 | # Normality tests 97 | shapiro_stat, shapiro_p = shapiro(residuals) 98 | dagostino_stat, dagostino_p = normaltest(residuals) 99 | 100 | # Autocorrelation (simple lag-1) 101 | autocorr = np.corrcoef(residuals[:-1], residuals[1:])[0, 1] 102 | 103 | # Heteroscedasticity test (simple: correlation between |residuals| and predictions) 104 | hetero_corr = np.corrcoef(np.abs(residuals), y_pred)[0, 1] 105 | 106 | analysis = { 107 | 'mean_residual': np.mean(residuals), 108 | 'std_residual': np.std(residuals), 109 | 'shapiro_normality': {'statistic': shapiro_stat, 'p_value': shapiro_p}, 110 | 'dagostino_normality': {'statistic': dagostino_stat, 'p_value': dagostino_p}, 111 | 'autocorrelation_lag1': autocorr, 112 | 'heteroscedasticity_corr': hetero_corr 113 | } 114 | 115 | return analysis 116 | 117 | def comprehensive_evaluation(y_true, y_pred, y_train=None): 118 | """ 119 | Comprehensive model evaluation with multiple metrics. 120 | """ 121 | metrics = { 122 | 'MAE': mean_absolute_error(y_true, y_pred), 123 | 'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)), 124 | 'MAPE': mean_absolute_percentage_error(y_true, y_pred), 125 | 'SMAPE': symmetric_mean_absolute_percentage_error(y_true, y_pred), 126 | 'RMSPE': root_mean_squared_percentage_error(y_true, y_pred), 127 | 'R2': r2_score(y_true, y_pred), 128 | 'MDA': mean_directional_accuracy(y_true, y_pred), 129 | 'Theil_U': theil_u_statistic(y_true, y_pred), 130 | 'Forecast_Bias': forecast_bias(y_true, y_pred), 131 | 'MASE': mean_absolute_scaled_error(y_true, y_pred, y_train) 132 | } 133 | 134 | # Residual analysis 135 | metrics['Residual_Analysis'] = residual_analysis(y_true, y_pred) 136 | 137 | return metrics 138 | 139 | def print_evaluation_report(metrics, model_name="Model"): 140 | """ 141 | Print a formatted evaluation report. 
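    Illustrative example (pairs with comprehensive_evaluation above):

        >>> metrics = comprehensive_evaluation(y_test, y_pred, y_train)
        >>> _ = print_evaluation_report(metrics, model_name="XGBoost")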
142 | """ 143 | report = f""" 144 | === {model_name} Evaluation Report === 145 | ====================================== 146 | 147 | Error Metrics: 148 | -------------- 149 | MAE: {metrics['MAE']:.4f} 150 | RMSE: {metrics['RMSE']:.4f} 151 | MAPE: {metrics['MAPE']:.4f}% 152 | SMAPE: {metrics['SMAPE']:.4f}% 153 | RMSPE: {metrics['RMSPE']:.4f}% 154 | MASE: {metrics['MASE']:.4f} 155 | 156 | Accuracy Metrics: 157 | ----------------- 158 | R² Score: {metrics['R2']:.4f} 159 | MDA: {metrics['MDA']:.4f}% 160 | 161 | Forecast Quality: 162 | ----------------- 163 | Theil's U: {metrics['Theil_U']:.4f} 164 | Forecast Bias: {metrics['Forecast_Bias']:.4f} 165 | 166 | Residual Analysis: 167 | ------------------ 168 | Mean Residual: {metrics['Residual_Analysis']['mean_residual']:.4f} 169 | Std Residual: {metrics['Residual_Analysis']['std_residual']:.4f} 170 | Autocorr (lag1): {metrics['Residual_Analysis']['autocorrelation_lag1']:.4f} 171 | Hetero Corr: {metrics['Residual_Analysis']['heteroscedasticity_corr']:.4f} 172 | 173 | Normality Tests: 174 | ---------------- 175 | Shapiro-Wilk: p-value = {metrics['Residual_Analysis']['shapiro_normality']['p_value']:.4f} 176 | D'Agostino: p-value = {metrics['Residual_Analysis']['dagostino_normality']['p_value']:.4f} 177 | """ 178 | 179 | print(report) 180 | return report 181 | -------------------------------------------------------------------------------- /advanced_viz.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 6 | import plotly.graph_objects as go 7 | import plotly.express as px 8 | from plotly.subplots import make_subplots 9 | import warnings 10 | warnings.filterwarnings('ignore') 11 | 12 | def plot_forecast_interactive(y_true, y_pred, title='Interactive Forecast vs Actual'): 13 | """ 14 | Create interactive forecast plot using Plotly. 15 | """ 16 | fig = go.Figure() 17 | 18 | fig.add_trace(go.Scatter(x=y_true.index, y=y_true, mode='lines', name='Observed', 19 | line=dict(color='blue', width=2))) 20 | fig.add_trace(go.Scatter(x=y_true.index, y=y_pred, mode='lines', name='Predicted', 21 | line=dict(color='red', width=2, dash='dash'))) 22 | 23 | fig.update_layout(title=title, 24 | xaxis_title='Date', 25 | yaxis_title='Value', 26 | hovermode='x unified') 27 | 28 | return fig 29 | 30 | def plot_residuals_analysis(y_true, y_pred, figsize=(15, 10)): 31 | """ 32 | Comprehensive residuals analysis plots. 
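    Illustrative example:

        >>> fig = plot_residuals_analysis(y_test, y_pred)
        >>> fig.savefig('residuals_analysis.png')  # or st.pyplot(fig) in Streamlit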
33 | """ 34 | residuals = y_true - y_pred 35 | 36 | fig, axes = plt.subplots(2, 3, figsize=figsize) 37 | 38 | # Residuals over time 39 | axes[0, 0].plot(residuals.index, residuals, color='green', alpha=0.7) 40 | axes[0, 0].axhline(0, color='black', linestyle='--') 41 | axes[0, 0].set_title('Residuals Over Time') 42 | axes[0, 0].set_xlabel('Date') 43 | axes[0, 0].set_ylabel('Residuals') 44 | 45 | # Residuals distribution 46 | sns.histplot(residuals, kde=True, ax=axes[0, 1]) 47 | axes[0, 1].set_title('Residuals Distribution') 48 | 49 | # Q-Q plot 50 | from scipy import stats 51 | stats.probplot(residuals, dist="norm", plot=axes[0, 2]) 52 | axes[0, 2].set_title('Q-Q Plot') 53 | 54 | # Residuals vs Fitted 55 | axes[1, 0].scatter(y_pred, residuals, alpha=0.5) 56 | axes[1, 0].axhline(0, color='red', linestyle='--') 57 | axes[1, 0].set_title('Residuals vs Fitted Values') 58 | axes[1, 0].set_xlabel('Fitted Values') 59 | axes[1, 0].set_ylabel('Residuals') 60 | 61 | # Autocorrelation 62 | max_lags_acf = min(20, len(residuals) - 1) 63 | plot_acf(residuals, ax=axes[1, 1], lags=max_lags_acf) 64 | axes[1, 1].set_title('Residuals Autocorrelation') 65 | 66 | # Partial Autocorrelation 67 | max_lags_pacf = min(10, len(residuals) // 2 - 1) 68 | if max_lags_pacf > 0: 69 | plot_pacf(residuals, ax=axes[1, 2], lags=max_lags_pacf) 70 | axes[1, 2].set_title('Residuals Partial Autocorrelation') 71 | else: 72 | axes[1, 2].text(0.5, 0.5, 'Not enough data\nfor PACF', ha='center', va='center', transform=axes[1, 2].transAxes) 73 | axes[1, 2].set_title('Residuals Partial Autocorrelation') 74 | 75 | plt.tight_layout() 76 | return fig 77 | 78 | def plot_model_comparison(models_metrics, metric='RMSE'): 79 | """ 80 | Plot comparison of different models. 81 | """ 82 | model_names = list(models_metrics.keys()) 83 | values = [models_metrics[name][metric] for name in model_names] 84 | 85 | fig, ax = plt.subplots(figsize=(10, 6)) 86 | bars = ax.bar(model_names, values, color='skyblue', edgecolor='navy', linewidth=1) 87 | 88 | ax.set_title(f'Model Comparison - {metric}') 89 | ax.set_xlabel('Models') 90 | ax.set_ylabel(metric) 91 | ax.tick_params(axis='x', rotation=45) 92 | 93 | # Add value labels on bars 94 | for bar, value in zip(bars, values): 95 | ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(), 96 | f'{value:.4f}', ha='center', va='bottom') 97 | 98 | plt.tight_layout() 99 | return fig 100 | 101 | def plot_feature_importance(model, feature_names, top_n=20): 102 | """ 103 | Plot feature importance for tree-based models. 104 | """ 105 | if hasattr(model, 'feature_importances_'): 106 | importance = model.feature_importances_ 107 | indices = np.argsort(importance)[::-1][:top_n] 108 | 109 | plt.figure(figsize=(10, 8)) 110 | plt.title('Feature Importances') 111 | plt.bar(range(top_n), importance[indices], align='center') 112 | plt.xticks(range(top_n), [feature_names[i] for i in indices], rotation=90) 113 | plt.tight_layout() 114 | return plt.gcf() 115 | else: 116 | print("Model does not have feature_importances_ attribute") 117 | return None 118 | 119 | def plot_learning_curve(model, X_train, y_train, cv=5): 120 | """ 121 | Plot learning curve for a model. 
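    Note: an integer cv here means an unshuffled K-fold; pass
    cv=TimeSeriesSplit(n_splits=5) for chronological folds.
    Illustrative example:

        >>> from sklearn.ensemble import RandomForestRegressor
        >>> fig = plot_learning_curve(RandomForestRegressor(), X_train, y_train)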
122 |     """
123 |     from sklearn.model_selection import learning_curve
124 | 
125 |     train_sizes, train_scores, val_scores = learning_curve(
126 |         model, X_train, y_train, cv=cv, n_jobs=-1,
127 |         train_sizes=np.linspace(0.1, 1.0, 10), scoring='neg_mean_squared_error'
128 |     )
129 | 
130 |     train_scores_mean = -np.mean(train_scores, axis=1)
131 |     train_scores_std = np.std(train_scores, axis=1)
132 |     val_scores_mean = -np.mean(val_scores, axis=1)
133 |     val_scores_std = np.std(val_scores, axis=1)
134 | 
135 |     plt.figure(figsize=(10, 6))
136 |     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
137 |                      train_scores_mean + train_scores_std, alpha=0.1, color="r")
138 |     plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
139 |                      val_scores_mean + val_scores_std, alpha=0.1, color="g")
140 |     plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
141 |     plt.plot(train_sizes, val_scores_mean, 'o-', color="g", label="Cross-validation score")
142 | 
143 |     plt.title('Learning Curve')
144 |     plt.xlabel('Training Size')
145 |     plt.ylabel('MSE')
146 |     plt.legend(loc="best")
147 |     plt.grid(True)
148 |     plt.tight_layout()
149 |     return plt.gcf()
150 | 
151 | def plot_prediction_intervals(y_true, y_pred, lower_bound, upper_bound, title='Prediction Intervals'):
152 |     """
153 |     Plot predictions with confidence intervals.
154 |     """
155 |     plt.figure(figsize=(12, 6))
156 |     plt.plot(y_true.index, y_true, label='Observed', color='blue', linewidth=2)
157 |     plt.plot(y_true.index, y_pred, label='Predicted', color='red', linewidth=2)
158 |     plt.fill_between(y_true.index, lower_bound, upper_bound, alpha=0.3, color='red', label='95% Prediction Interval')
159 | 
160 |     plt.title(title)
161 |     plt.xlabel('Date')
162 |     plt.ylabel('Value')
163 |     plt.legend()
164 |     plt.grid(True, alpha=0.3)
165 |     plt.tight_layout()
166 |     return plt.gcf()
167 | 
168 | def create_dashboard(y_true, y_pred, metrics, model_name):
169 |     """
170 |     Create a comprehensive dashboard with multiple plots.
171 |     """
172 |     fig = make_subplots(
173 |         rows=3, cols=2,
174 |         subplot_titles=('Forecast vs Actual', 'Residuals Distribution',
175 |                         'Residuals Over Time', 'Q-Q Plot',
176 |                         'Model Metrics', 'Feature Importance (if available)'),
177 |         specs=[[{"secondary_y": False}, {"secondary_y": False}],
178 |                [{"secondary_y": False}, {"secondary_y": False}],
179 |                [{"type": "table"}, {"secondary_y": False}]]
180 |     )
181 | 
182 |     # Forecast vs Actual
183 |     fig.add_trace(go.Scatter(x=y_true.index, y=y_true, mode='lines', name='Observed',
184 |                              line=dict(color='blue')), row=1, col=1)
185 |     fig.add_trace(go.Scatter(x=y_true.index, y=y_pred, mode='lines', name='Predicted',
186 |                              line=dict(color='red')), row=1, col=1)
187 | 
188 |     # Residuals Distribution
189 |     residuals = y_true - y_pred
190 |     fig.add_trace(go.Histogram(x=residuals, nbinsx=30, name='Residuals'), row=1, col=2)
191 | 
192 |     # Residuals Over Time
193 |     fig.add_trace(go.Scatter(x=y_true.index, y=residuals, mode='lines', name='Residuals Over Time',
194 |                              line=dict(color='green')), row=2, col=1)
195 | 
196 |     # Q-Q Plot
197 |     from scipy import stats
198 |     qq = stats.probplot(residuals, dist="norm")
199 |     fig.add_trace(go.Scatter(x=qq[0][0], y=qq[0][1], mode='markers', name='Q-Q Plot'), row=2, col=2)
200 | 
201 |     # Model Metrics (rendered as a table; requires the 'table' spec above)
202 |     fig.add_trace(go.Table(
203 |         header=dict(values=['Metric', 'Value']),
204 |         cells=dict(values=[list(metrics.keys()), [f"{v:.4f}" if isinstance(v, (int, float)) else str(v) for v in metrics.values()]])
205 |     ), row=3, col=1)
206 | 
207 |     # Placeholder for Feature Importance
208 |     fig.add_trace(go.Bar(x=['Feature 1', 'Feature 2'], y=[0.5, 0.3], name='Feature Importance'), row=3, col=2)
209 | 
210 |     fig.update_layout(height=1200, title_text=f"{model_name} - Comprehensive Dashboard")
211 |     return fig
212 | 
213 | def plot_seasonal_decomposition(ts, model='additive', period=None):
214 |     """
215 |     Plot seasonal decomposition of time series.
216 |     """
217 |     from statsmodels.tsa.seasonal import seasonal_decompose
218 | 
219 |     if period is None:
220 |         period = 12  # Assume monthly data
221 | 
222 |     decomposition = seasonal_decompose(ts, model=model, period=period)
223 | 
224 |     fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
225 | 
226 |     axes[0].plot(ts, label='Original')
227 |     axes[0].legend()
228 |     axes[0].set_title('Original Time Series')
229 | 
230 |     axes[1].plot(decomposition.trend, label='Trend')
231 |     axes[1].legend()
232 |     axes[1].set_title('Trend Component')
233 | 
234 |     axes[2].plot(decomposition.seasonal, label='Seasonal')
235 |     axes[2].legend()
236 |     axes[2].set_title('Seasonal Component')
237 | 
238 |     axes[3].plot(decomposition.resid, label='Residual')
239 |     axes[3].legend()
240 |     axes[3].set_title('Residual Component')
241 | 
242 |     plt.tight_layout()
243 |     return fig
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🚀 Advanced Time Series Forecasting Tool
2 | 
3 | A comprehensive, enterprise-grade Streamlit web application for professional time series forecasting with cutting-edge AI capabilities. This tool combines 13+ forecasting models, advanced evaluation metrics, interactive visualizations, and AI-powered insights to deliver state-of-the-art time series analysis.
4 | 5 | ## 🌟 Key Features 6 | 7 | ### 🤖 **13 Advanced Forecasting Models** 8 | - **Machine Learning**: XGBoost, Random Forest, Gradient Boosting, AdaBoost, SVR, MLP 9 | - **Deep Learning**: LSTM, Bidirectional LSTM, GRU 10 | - **Statistical**: ARIMA, SARIMA 11 | - **Ensemble Methods**: Voting Regressor, Stacking Ensemble 12 | 13 | ### 📊 **Comprehensive Evaluation Suite** 14 | - **Basic Metrics**: MAE, RMSE, MAPE, R² Score 15 | - **Advanced Metrics**: SMAPE, RMSPE, MASE, MDA, Theil's U, Forecast Bias 16 | - **Statistical Tests**: Normality, autocorrelation, heteroscedasticity analysis 17 | 18 | ### 🎨 **Interactive Visualizations** 19 | - Plotly-powered interactive forecast plots 20 | - Comprehensive residual analysis (distribution, ACF, Q-Q plots) 21 | - Seasonal decomposition with trend/seasonal/residual components 22 | - Model comparison dashboards 23 | - Feature importance analysis with SHAP values 24 | 25 | ### 🛠️ **Advanced Data Processing** 26 | - Automatic data validation and intelligent cleaning 27 | - Smart lag feature generation for temporal dependencies 28 | - Multiple missing value imputation strategies 29 | - Outlier detection and handling 30 | - Feature scaling (Standard, MinMax, Robust) 31 | - Time-based feature engineering 32 | 33 | ### 🎛️ **Professional User Interface** 34 | - Drag-and-drop CSV upload with real-time validation 35 | - Dynamic parameter tuning with live updates 36 | - Hyperparameter optimization with GridSearchCV 37 | - Model comparison and benchmarking 38 | - Export capabilities (predictions, metrics, models) 39 | 40 | ### 🔬 **AI-Powered Features** 41 | - **SHAP Analysis**: Explainable AI for model interpretability 42 | - **Feature Importance**: Global and local feature impact analysis 43 | - **Time Series Cross-Validation**: Rolling forecast validation 44 | - **Automated Model Selection**: Performance-based recommendations 45 | 46 | ## 🚀 Quick Start 47 | 48 | ### Installation 49 | 50 | 1. **Clone the repository**: 51 | ```bash 52 | git clone https://github.com/your-username/time-series-forecasting-tool.git 53 | cd time-series-forecasting-tool 54 | ``` 55 | 56 | 2. **Install dependencies**: 57 | ```bash 58 | pip install -r requirements.txt 59 | ``` 60 | 61 | 3. **Launch the application**: 62 | ```bash 63 | streamlit run app_advanced.py 64 | ``` 65 | 66 | 4. **Open your browser** to `http://localhost:8501` 67 | 68 | ### Usage Guide 69 | 70 | 1. **📁 Data Upload**: Upload your CSV file or use the built-in sample dataset 71 | 2. **⚙️ Preprocessing**: Configure lag features, test size, and scaling options 72 | 3. **🤖 Model Selection**: Choose from 13 forecasting models with pre-configured parameters 73 | 4. **🔬 Advanced Features**: Enable SHAP analysis, cross-validation, and feature importance 74 | 5. **📊 Evaluation**: Review comprehensive metrics and statistical analysis 75 | 6. **📈 Visualization**: Explore interactive charts and residual diagnostics 76 | 7. 
**💾 Export**: Download predictions, metrics reports, and trained models 77 | 78 | ## 🏗️ Project Architecture 79 | 80 | ``` 81 | ├── app_advanced.py # Main advanced Streamlit application 82 | ├── app.py # Basic Streamlit application 83 | ├── config.py # Configuration and hyperparameters 84 | ├── data_loader.py # Data loading and preprocessing utilities 85 | ├── models.py # Core ML models (XGBoost, RF, LSTM) 86 | ├── advanced_models.py # Additional models and ensemble methods 87 | ├── evaluation.py # Basic evaluation metrics 88 | ├── advanced_evaluation.py # Comprehensive evaluation suite 89 | ├── viz.py # Basic visualization functions 90 | ├── advanced_viz.py # Interactive and advanced visualizations 91 | ├── utils.py # Utility functions (validation, scaling, etc.) 92 | ├── requirements.txt # Python dependencies 93 | ├── TODO.md # Development roadmap 94 | └── README.md # This documentation 95 | ``` 96 | 97 | ## 📋 Requirements 98 | 99 | ### Core Dependencies 100 | - **streamlit** (>=1.28.0): Web application framework 101 | - **tensorflow** (>=2.13.0): Deep learning models 102 | - **scikit-learn** (>=1.3.0): Machine learning algorithms 103 | - **pandas** (>=2.0.0): Data manipulation 104 | - **numpy** (>=1.24.0): Numerical computing 105 | 106 | ### Visualization & Analysis 107 | - **plotly** (>=5.15.0): Interactive visualizations 108 | - **matplotlib** (>=3.7.0): Static plotting 109 | - **seaborn** (>=0.12.0): Statistical visualization 110 | - **statsmodels** (>=0.14.0): Statistical models 111 | 112 | ### Specialized Libraries 113 | - **xgboost** (>=1.7.0): Gradient boosting 114 | - **shap** (>=0.42.0): Explainable AI (optional) 115 | - **joblib** (>=1.3.0): Model serialization 116 | - **scipy** (>=1.11.0): Scientific computing 117 | 118 | ## 🎯 Model Capabilities 119 | 120 | ### Machine Learning Models 121 | | Model | Description | Best For | 122 | |-------|-------------|----------| 123 | | XGBoost | Gradient boosting with trees | High accuracy, feature importance | 124 | | Random Forest | Ensemble of decision trees | Robust, handles missing data | 125 | | Gradient Boosting | Sequential ensemble method | Competitive accuracy | 126 | | AdaBoost | Adaptive boosting | Binary classification adaptation | 127 | | SVR | Support Vector Regression | Non-linear relationships | 128 | | MLP | Neural network | Complex patterns | 129 | 130 | ### Deep Learning Models 131 | | Model | Description | Best For | 132 | |-------|-------------|----------| 133 | | LSTM | Long Short-Term Memory | Sequential dependencies | 134 | | Bidirectional LSTM | Forward + backward LSTM | Context-aware forecasting | 135 | | GRU | Gated Recurrent Units | Efficient sequential modeling | 136 | 137 | ### Statistical Models 138 | | Model | Description | Best For | 139 | |-------|-------------|----------| 140 | | ARIMA | AutoRegressive Integrated MA | Stationary time series | 141 | | SARIMA | Seasonal ARIMA | Seasonal patterns | 142 | 143 | ### Ensemble Methods 144 | | Model | Description | Best For | 145 | |-------|-------------|----------| 146 | | Voting Ensemble | Weighted average of models | Improved stability | 147 | | Stacking Ensemble | Meta-model on base predictions | Maximum accuracy | 148 | 149 | ## 📊 Evaluation Framework 150 | 151 | ### Performance Metrics 152 | - **MAE**: Mean Absolute Error - Average magnitude of errors 153 | - **RMSE**: Root Mean Squared Error - Penalizes large errors 154 | - **MAPE**: Mean Absolute Percentage Error - Scale-independent 155 | - **SMAPE**: Symmetric MAPE - Handles zero values 156 | - 
**MASE**: Mean Absolute Scaled Error - Compares to naive forecast 157 | - **MDA**: Mean Directional Accuracy - Direction prediction accuracy 158 | 159 | ### Statistical Analysis 160 | - **Normality Tests**: Shapiro-Wilk, Kolmogorov-Smirnov 161 | - **Autocorrelation**: ACF/PACF analysis for residuals 162 | - **Heteroscedasticity**: Breusch-Pagan, White tests 163 | - **Stationarity**: Augmented Dickey-Fuller test 164 | 165 | ## 🔬 Advanced Features 166 | 167 | ### SHAP Explainability 168 | - Global feature importance across all predictions 169 | - Local explanations for individual forecasts 170 | - Waterfall plots showing feature contributions 171 | - Summary plots for feature impact analysis 172 | 173 | ### Cross-Validation 174 | - Time series split validation 175 | - Rolling forecast evaluation 176 | - Performance stability assessment 177 | - Overfitting detection 178 | 179 | ### Feature Engineering 180 | - Automatic lag feature creation 181 | - Rolling statistics (mean, std, min, max) 182 | - Seasonal indicators 183 | - Calendar features (day of week, month, quarter) 184 | 185 | ## 🎨 User Interface 186 | 187 | ### Dashboard Layout 188 | - **Header**: Professional branding with gradient styling 189 | - **Sidebar**: Organized controls for data, preprocessing, and models 190 | - **Main Panel**: Metrics cards, visualizations, and results 191 | - **Export Section**: Download options for results and models 192 | 193 | ### Responsive Design 194 | - Mobile-friendly layout 195 | - Collapsible sidebar 196 | - Progressive disclosure of advanced options 197 | - Real-time feedback and progress indicators 198 | 199 | ## 🚀 Deployment Options 200 | 201 | ### Local Development 202 | ```bash 203 | streamlit run app_advanced.py --server.port 8501 --server.address 0.0.0.0 204 | ``` 205 | 206 | ### Docker Deployment 207 | ```dockerfile 208 | FROM python:3.11-slim 209 | COPY . /app 210 | WORKDIR /app 211 | RUN pip install -r requirements.txt 212 | EXPOSE 8501 213 | CMD ["streamlit", "run", "app_advanced.py", "--server.address", "0.0.0.0"] 214 | ``` 215 | 216 | ### Cloud Platforms 217 | - **Streamlit Cloud**: Direct deployment from GitHub 218 | - **Heroku**: Container-based deployment 219 | - **AWS/GCP/Azure**: Scalable cloud deployment 220 | - **Docker Hub**: Containerized distribution 221 | 222 | ## 🤝 Contributing 223 | 224 | We welcome contributions! Please follow these steps: 225 | 226 | 1. **Fork** the repository 227 | 2. **Create** a feature branch (`git checkout -b feature/AmazingFeature`) 228 | 3. **Commit** changes (`git commit -m 'Add AmazingFeature'`) 229 | 4. **Push** to branch (`git push origin feature/AmazingFeature`) 230 | 5. **Open** a Pull Request 231 | 232 | ### Development Guidelines 233 | - Follow PEP 8 style guidelines 234 | - Add docstrings to all functions 235 | - Include unit tests for new features 236 | - Update documentation for API changes 237 | - Ensure backward compatibility 238 | 239 | ## 📄 License 240 | 241 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
242 | 243 | ## 🙏 Acknowledgments 244 | 245 | - **Streamlit** for the amazing web app framework 246 | - **TensorFlow/Keras** for deep learning capabilities 247 | - **scikit-learn** for comprehensive ML algorithms 248 | - **Plotly** for interactive visualizations 249 | - **SHAP** for model explainability 250 | - **statsmodels** for statistical modeling 251 | 252 | ## 🔮 Future Roadmap 253 | 254 | ### Phase 1 (Completed) 255 | - ✅ 13 forecasting models implementation 256 | - ✅ Comprehensive evaluation metrics 257 | - ✅ Interactive visualizations 258 | - ✅ Professional UI/UX 259 | 260 | ### Phase 2 (In Progress) 261 | - 🔄 Prophet model integration 262 | - 🔄 Automated model selection 263 | - 🔄 Prediction intervals 264 | - 🔄 Multi-step forecasting 265 | 266 | ### Phase 3 (Planned) 267 | - 📋 Real-time forecasting dashboard 268 | - 📋 Anomaly detection system 269 | - 📋 Model deployment API 270 | - 📋 Database integration 271 | - 📋 Performance monitoring 272 | 273 | ## 📞 Support 274 | 275 | For questions, issues, or contributions: 276 | 277 | - **GitHub Issues**: Bug reports and feature requests 278 | - **Discussions**: General questions and community support 279 | - **Pull Requests**: Code contributions welcome 280 | 281 | ## 🎉 Getting Started 282 | 283 | Ready to forecast? Get started in minutes: 284 | 285 | 1. Clone the repo 286 | 2. Install dependencies 287 | 3. Run `streamlit run app_advanced.py` 288 | 4. Upload your data and start forecasting! 289 | 290 | --- 291 | 292 | **Built with ❤️ for the data science community** 293 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | from data_loader import load_data, preprocess_data, create_sample_data 5 | from models import train_xgboost, train_random_forest, train_lstm, tune_hyperparameters 6 | from advanced_models import (train_gradient_boosting, train_ada_boost, train_svr, train_mlp, 7 | train_bidirectional_lstm, train_gru, train_arima, train_sarima, 8 | ensemble_forecast) 9 | from evaluation import evaluate_model 10 | from advanced_evaluation import comprehensive_evaluation, print_evaluation_report 11 | from viz import plot_forecast, plot_residuals, plot_distribution 12 | from advanced_viz import (plot_forecast_interactive, plot_residuals_analysis, plot_model_comparison, 13 | create_dashboard, plot_seasonal_decomposition) 14 | from config import DEFAULT_PARAMS, TUNING_GRIDS, DATA_SETTINGS, APP_SETTINGS 15 | from utils import (validate_data, scale_features, save_model, load_model, 16 | handle_missing_values, create_time_features, detect_outliers) 17 | import matplotlib.pyplot as plt 18 | import plotly.graph_objects as go 19 | 20 | # Set page configuration 21 | st.set_page_config( 22 | page_title="Advanced Time Series Forecasting", 23 | page_icon="📈", 24 | layout="wide", 25 | initial_sidebar_state="expanded" 26 | ) 27 | 28 | # Custom CSS for better styling 29 | st.markdown(""" 30 | 193 | """, unsafe_allow_html=True) 194 | 195 | # Main header with custom styling 196 | st.markdown('
<h1 class="main-header">📈 Advanced Time Series Forecasting Tool</h1>', unsafe_allow_html=True)
197 | st.markdown('<p class="sub-header">Professional time series analysis with 11+ forecasting models and interactive visualizations</p>', unsafe_allow_html=True)
198 | 
199 | # Sidebar for inputs
200 | st.sidebar.markdown('', unsafe_allow_html=True)
201 | uploaded_file = st.sidebar.file_uploader('Upload CSV file', type=['csv'])
202 | if uploaded_file is not None:
203 |     df = load_data(uploaded_file)
204 |     st.sidebar.success('✅ Data loaded successfully!')
205 | else:
206 |     st.sidebar.info('ℹ️ Using sample data.')
207 |     df = create_sample_data()
208 | 
209 | st.sidebar.markdown('', unsafe_allow_html=True)
210 | lags = st.sidebar.slider('Number of lag features', 1, 10, 5)
211 | test_size = st.sidebar.slider('Test size', 0.1, 0.5, 0.2)
212 | 
213 | X_train, X_test, y_train, y_test = preprocess_data(df, lags=lags, test_size=test_size)
214 | 
215 | st.sidebar.markdown('', unsafe_allow_html=True)
216 | model_options = ['XGBoost', 'Random Forest', 'LSTM', 'Gradient Boosting', 'AdaBoost', 'SVR', 'MLP', 'Bidirectional LSTM', 'GRU', 'ARIMA', 'SARIMA']
217 | model_choice = st.sidebar.selectbox('Choose model', model_options)
218 | 
219 | st.sidebar.markdown('', unsafe_allow_html=True)
220 | if model_choice in DEFAULT_PARAMS:
221 |     params = DEFAULT_PARAMS[model_choice].copy()
222 |     for param_name, default_value in params.items():
223 |         if isinstance(default_value, int):
224 |             params[param_name] = st.sidebar.slider(param_name, 1, 200, default_value)
225 |         elif isinstance(default_value, float):
226 |             params[param_name] = st.sidebar.slider(param_name, 0.001, 1.0, default_value)
227 | else:
228 |     st.sidebar.warning(f"Default parameters not set for {model_choice}. Using basic settings.")
229 |     params = {}
230 | 
231 | tune = st.sidebar.checkbox('Tune hyperparameters?')
232 | if tune and model_choice in TUNING_GRIDS:
233 |     param_grid = TUNING_GRIDS[model_choice]
234 |     model, best_params = tune_hyperparameters(model_choice.lower().replace(' ', '_'), X_train, y_train, param_grid)
235 |     st.sidebar.write('Best params:', best_params)
236 | else:
237 |     if model_choice == 'XGBoost':
238 |         model = train_xgboost(X_train, y_train, params)
239 |     elif model_choice == 'Random Forest':
240 |         model = train_random_forest(X_train, y_train, params)
241 |     elif model_choice == 'LSTM':
242 |         model = train_lstm(X_train, y_train, params)
243 |     elif model_choice == 'Gradient Boosting':
244 |         model = train_gradient_boosting(X_train, y_train, params)
245 |     elif model_choice == 'AdaBoost':
246 |         model = train_ada_boost(X_train, y_train, params)
247 |     elif model_choice == 'SVR':
248 |         model = train_svr(X_train, y_train, params)
249 |     elif model_choice == 'MLP':
250 |         model = train_mlp(X_train, y_train, params)
251 |     elif model_choice == 'Bidirectional LSTM':
252 |         model = train_bidirectional_lstm(X_train, y_train, params)
253 |     elif model_choice == 'GRU':
254 |         model = train_gru(X_train, y_train, params)
255 |     elif model_choice == 'ARIMA':
256 |         model = train_arima(y_train)
257 |     elif model_choice == 'SARIMA':
258 |         model = train_sarima(y_train)
259 |     else:
260 |         st.error(f"Model {model_choice} not implemented yet.")
261 |         st.stop()
262 | 
263 | # Train and predict
264 | if model_choice in ['LSTM', 'Bidirectional LSTM', 'GRU']:
265 |     X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))
266 |     y_pred = model.predict(X_test_reshaped).flatten()
267 | elif model_choice in ['ARIMA', 'SARIMA']:
268 |     # For ARIMA/SARIMA, forecast the test period
269 |     y_pred = model.forecast(steps=len(y_test))
270 |     y_pred.index = y_test.index
271 | else:
272 |     y_pred = model.predict(X_test)
273 | 
274 | # Evaluate
275 | metrics = evaluate_model(y_test, y_pred)
276 | 
# Display metrics in styled cards
st.markdown('<h2>📊 Model Evaluation</h2>', unsafe_allow_html=True)

col1, col2, col3, col4 = st.columns(4)
with col1:
    st.markdown(f"""
    <div>
        <div>MAE</div>
        <div>{metrics['MAE']:.4f}</div>
    </div>
    """, unsafe_allow_html=True)

with col2:
    st.markdown(f"""
    <div>
        <div>RMSE</div>
        <div>{metrics['RMSE']:.4f}</div>
    </div>
    """, unsafe_allow_html=True)

with col3:
    st.markdown(f"""
    <div>
        <div>MAPE</div>
        <div>{metrics['MAPE']:.2f}%</div>
    </div>
    """, unsafe_allow_html=True)

with col4:
    st.markdown(f"""
    <div>
        <div>R² Score</div>
        <div>{metrics['R2']:.4f}</div>
    </div>
    """, unsafe_allow_html=True)

# Advanced evaluation option
if st.checkbox('🔬 Show Advanced Metrics'):
    adv_metrics = comprehensive_evaluation(y_test, y_pred, y_train)
    st.markdown('<h3>Advanced Performance Metrics</h3>', unsafe_allow_html=True)
    for key, value in adv_metrics.items():
        if isinstance(value, dict):
            st.markdown(f"**{key}:**")
            for sub_key, sub_value in value.items():
                st.write(f" - {sub_key}: {sub_value}")
        else:
            st.write(f"**{key}:** {value}")

# Visualize
st.markdown('<h2>📈 Visualizations</h2>', unsafe_allow_html=True)

# Interactive Plot
if st.checkbox('📊 Show Interactive Plot'):
    interactive_fig = plot_forecast_interactive(y_test, y_pred)
    st.plotly_chart(interactive_fig)

# Static plots
st.markdown('<h3>Basic Analysis</h3>', unsafe_allow_html=True)

col1, col2 = st.columns(2)

with col1:
    fig1, ax1 = plt.subplots(figsize=(8, 5))
    ax1.plot(y_test.index, y_test, label='Observed', color='blue', linewidth=2)
    ax1.plot(y_test.index, y_pred, label='Predicted', color='red', linewidth=2, linestyle='--')
    ax1.set_xlabel('Date', fontsize=12)
    ax1.set_ylabel('Value', fontsize=12)
    ax1.set_title('Forecast vs Actual', fontsize=14, fontweight='bold')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    st.pyplot(fig1)

with col2:
    residuals = y_test - y_pred
    fig2, ax2 = plt.subplots(figsize=(8, 5))
    ax2.plot(residuals.index, residuals, color='green', linewidth=2)
    ax2.axhline(0, color='black', linestyle='--', linewidth=1)
    ax2.set_xlabel('Date', fontsize=12)
    ax2.set_ylabel('Residuals', fontsize=12)
    ax2.set_title('Residuals Over Time', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3)
    st.pyplot(fig2)

# Residuals distribution
st.markdown('<h3>Residuals Distribution</h3>', unsafe_allow_html=True)

fig3, ax3 = plt.subplots(figsize=(10, 6))
ax3.hist(residuals, bins=20, alpha=0.7, color='purple', edgecolor='black')
ax3.set_title('Residuals Distribution', fontsize=14, fontweight='bold')
ax3.set_xlabel('Residual Value', fontsize=12)
ax3.set_ylabel('Frequency', fontsize=12)
ax3.grid(True, alpha=0.3)
st.pyplot(fig3)

# Advanced visualizations
if st.checkbox('🔬 Show Advanced Visualizations'):
    st.markdown('<h3>Comprehensive Residuals Analysis</h3>', unsafe_allow_html=True)

    res_fig = plot_residuals_analysis(y_test, y_pred)
    st.pyplot(res_fig)
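    # plot_seasonal_decomposition presumably wraps statsmodels' seasonal_decompose,
    # which raises unless the series spans at least two full seasonal cycles
    # (e.g. 2 x period=12 = 24 monthly points), hence the warning below.
    # Illustrative sketch of the underlying call (period=12 is an assumption):
    #
    #   from statsmodels.tsa.seasonal import seasonal_decompose
    #   result = seasonal_decompose(series, model='additive', period=12)
    #   fig = result.plot()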

    st.markdown('<h3>Seasonal Decomposition</h3>', unsafe_allow_html=True)
    try:
        seasonal_fig = plot_seasonal_decomposition(df[DATA_SETTINGS['target_col']])
        st.pyplot(seasonal_fig)
    except Exception:
        st.warning("⚠️ Seasonal decomposition requires more data points (minimum 24 observations recommended).")
--------------------------------------------------------------------------------
/app_advanced.py:
--------------------------------------------------------------------------------
import streamlit as st
import pandas as pd
import numpy as np
from data_loader import load_data, preprocess_data, create_sample_data
from models import train_xgboost, train_random_forest, train_lstm, tune_hyperparameters
from advanced_models import (train_gradient_boosting, train_ada_boost, train_svr, train_mlp,
                             train_bidirectional_lstm, train_gru, train_arima, train_sarima,
                             ensemble_forecast)
from evaluation import evaluate_model
from advanced_evaluation import comprehensive_evaluation, print_evaluation_report
from viz import plot_forecast, plot_residuals, plot_distribution
from advanced_viz import (plot_forecast_interactive, plot_residuals_analysis, plot_model_comparison,
                          create_dashboard, plot_seasonal_decomposition)
from config import DEFAULT_PARAMS, TUNING_GRIDS, DATA_SETTINGS, APP_SETTINGS
from utils import (validate_data, scale_features, save_model, load_model,
                   handle_missing_values, create_time_features, detect_outliers)
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

# Import additional libraries for advanced features
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False

try:
    from sklearn.ensemble import VotingRegressor
    from sklearn.model_selection import TimeSeriesSplit
    VOTING_AVAILABLE = True
except ImportError:
    VOTING_AVAILABLE = False

# Set page configuration
st.set_page_config(
    page_title="Advanced Time Series Forecasting Pro",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better styling
st.markdown("""
""", unsafe_allow_html=True)

# Main header with custom styling

st.markdown('<h1>🚀 Advanced Time Series Forecasting Pro</h1>', unsafe_allow_html=True)

st.markdown('<p>Professional time series analysis with 11+ forecasting models, ensemble methods, and AI-powered insights</p>', unsafe_allow_html=True)

# Initialize session state for advanced features
if 'models_trained' not in st.session_state:
    st.session_state.models_trained = {}
if 'ensemble_models' not in st.session_state:
    st.session_state.ensemble_models = {}
if 'predictions' not in st.session_state:
    st.session_state.predictions = {}

# Sidebar for inputs
st.sidebar.markdown('', unsafe_allow_html=True)
uploaded_file = st.sidebar.file_uploader('Upload CSV file', type=['csv'])
if uploaded_file is not None:
    df = load_data(uploaded_file)
    st.sidebar.success('✅ Data loaded successfully!')
else:
    st.sidebar.info('ℹ️ Using sample data.')
    df = create_sample_data()

st.sidebar.markdown('', unsafe_allow_html=True)
lags = st.sidebar.slider('Number of lag features', 1, 20, 5)
test_size = st.sidebar.slider('Test size', 0.1, 0.5, 0.2)
scaling_method = st.sidebar.selectbox('Feature scaling', ['none', 'standard', 'minmax'], index=0)

X_train, X_test, y_train, y_test = preprocess_data(df, lags=lags, test_size=test_size)

# Apply scaling if selected
if scaling_method != 'none':
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test, method=scaling_method)
    X_train, X_test = X_train_scaled, X_test_scaled

st.sidebar.markdown('', unsafe_allow_html=True)
model_options = ['XGBoost', 'Random Forest', 'LSTM', 'Gradient Boosting', 'AdaBoost', 'SVR', 'MLP',
                 'Bidirectional LSTM', 'GRU', 'ARIMA', 'SARIMA', 'Ensemble (Voting)', 'Ensemble (Stacking)']
model_choice = st.sidebar.selectbox('Choose model', model_options)

# Advanced features
st.sidebar.markdown('', unsafe_allow_html=True)
enable_shap = st.sidebar.checkbox('Enable SHAP analysis', value=False, disabled=not SHAP_AVAILABLE)
enable_cross_validation = st.sidebar.checkbox('Time series cross-validation', value=False)
enable_feature_importance = st.sidebar.checkbox('Feature importance analysis', value=False)

st.sidebar.markdown('', unsafe_allow_html=True)
if model_choice in DEFAULT_PARAMS:
    params = DEFAULT_PARAMS[model_choice].copy()
    for param_name, default_value in params.items():
        if isinstance(default_value, int):
            params[param_name] = st.sidebar.slider(param_name, 1, 200, default_value)
        elif isinstance(default_value, float):
            params[param_name] = st.sidebar.slider(param_name, 0.001, 1.0, default_value)
else:
    st.sidebar.warning(f"Default parameters not set for {model_choice}. Using basic settings.")
Using basic settings.") 266 | params = {} 267 | 268 | tune = st.sidebar.checkbox('Tune hyperparameters?') 269 | if tune and model_choice in TUNING_GRIDS: 270 | param_grid = TUNING_GRIDS[model_choice] 271 | model, best_params = tune_hyperparameters(model_choice.lower().replace(' ', '_'), X_train, y_train, param_grid) 272 | st.sidebar.write('Best params:', best_params) 273 | else: 274 | # Train individual models 275 | if model_choice == 'XGBoost': 276 | model = train_xgboost(X_train, y_train, params) 277 | elif model_choice == 'Random Forest': 278 | model = train_random_forest(X_train, y_train, params) 279 | elif model_choice == 'LSTM': 280 | model = train_lstm(X_train, y_train, params) 281 | elif model_choice == 'Gradient Boosting': 282 | model = train_gradient_boosting(X_train, y_train, params) 283 | elif model_choice == 'AdaBoost': 284 | model = train_ada_boost(X_train, y_train, params) 285 | elif model_choice == 'SVR': 286 | model = train_svr(X_train, y_train, params) 287 | elif model_choice == 'MLP': 288 | model = train_mlp(X_train, y_train, params) 289 | elif model_choice == 'Bidirectional LSTM': 290 | model = train_bidirectional_lstm(X_train, y_train, params) 291 | elif model_choice == 'GRU': 292 | model = train_gru(X_train, y_train, params) 293 | elif model_choice == 'ARIMA': 294 | model = train_arima(y_train) 295 | elif model_choice == 'SARIMA': 296 | model = train_sarima(y_train) 297 | elif model_choice == 'Ensemble (Voting)': 298 | if VOTING_AVAILABLE: 299 | # Train multiple models for ensemble 300 | models = [] 301 | model_names = ['XGBoost', 'Random Forest', 'Gradient Boosting'] 302 | 303 | for name in model_names: 304 | if name == 'XGBoost': 305 | m = train_xgboost(X_train, y_train, DEFAULT_PARAMS.get('XGBoost', {})) 306 | elif name == 'Random Forest': 307 | m = train_random_forest(X_train, y_train, DEFAULT_PARAMS.get('Random Forest', {})) 308 | elif name == 'Gradient Boosting': 309 | m = train_gradient_boosting(X_train, y_train, DEFAULT_PARAMS.get('Gradient Boosting', {})) 310 | models.append((name, m)) 311 | 312 | model = VotingRegressor(estimators=models) 313 | model.fit(X_train, y_train) 314 | else: 315 | st.error("VotingRegressor not available. 
Install scikit-learn.") 316 | st.stop() 317 | elif model_choice == 'Ensemble (Stacking)': 318 | # Implement stacking ensemble 319 | base_models = [] 320 | model_names = ['XGBoost', 'Random Forest', 'Gradient Boosting'] 321 | 322 | for name in model_names: 323 | if name == 'XGBoost': 324 | m = train_xgboost(X_train, y_train, DEFAULT_PARAMS.get('XGBoost', {})) 325 | elif name == 'Random Forest': 326 | m = train_random_forest(X_train, y_train, DEFAULT_PARAMS.get('Random Forest', {})) 327 | elif name == 'Gradient Boosting': 328 | m = train_gradient_boosting(X_train, y_train, DEFAULT_PARAMS.get('Gradient Boosting', {})) 329 | base_models.append(m) 330 | 331 | # Use XGBoost as meta model 332 | meta_model = train_xgboost(X_train, y_train, DEFAULT_PARAMS.get('XGBoost', {})) 333 | 334 | # Simple stacking implementation 335 | base_predictions = np.column_stack([m.predict(X_train) for m in base_models]) 336 | meta_model.fit(base_predictions, y_train) 337 | model = {'base_models': base_models, 'meta_model': meta_model} 338 | else: 339 | st.error(f"Model {model_choice} not implemented yet.") 340 | st.stop() 341 | 342 | # Train and predict 343 | if model_choice in ['LSTM', 'Bidirectional LSTM', 'GRU']: 344 | X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1])) 345 | y_pred = model.predict(X_test_reshaped).flatten() 346 | elif model_choice in ['ARIMA', 'SARIMA']: 347 | # For ARIMA/SARIMA, forecast the test period 348 | y_pred = model.forecast(steps=len(y_test)) 349 | y_pred.index = y_test.index 350 | elif model_choice == 'Ensemble (Stacking)': 351 | # Stacking prediction 352 | base_predictions = np.column_stack([m.predict(X_test) for m in model['base_models']]) 353 | y_pred = model['meta_model'].predict(base_predictions) 354 | else: 355 | y_pred = model.predict(X_test) 356 | 357 | # Evaluate 358 | metrics = evaluate_model(y_test, y_pred) 359 | 360 | # Display metrics in styled cards 361 | st.markdown('

# Display metrics in styled cards
st.markdown('<h2>📊 Model Evaluation</h2>', unsafe_allow_html=True)

col1, col2, col3, col4 = st.columns(4)
with col1:
    st.markdown(f"""
    <div>
        <div>MAE</div>
        <div>{metrics['MAE']:.4f}</div>
    </div>
    """, unsafe_allow_html=True)

with col2:
    st.markdown(f"""
    <div>
        <div>RMSE</div>
        <div>{metrics['RMSE']:.4f}</div>
    </div>
    """, unsafe_allow_html=True)

with col3:
    st.markdown(f"""
    <div>
        <div>MAPE</div>
        <div>{metrics['MAPE']:.2f}%</div>
    </div>
    """, unsafe_allow_html=True)

with col4:
    st.markdown(f"""
    <div>
        <div>R² Score</div>
        <div>{metrics['R2']:.4f}</div>
    </div>
    """, unsafe_allow_html=True)

# Advanced evaluation option
if st.checkbox('🔬 Show Advanced Metrics'):
    adv_metrics = comprehensive_evaluation(y_test, y_pred, y_train)
    st.markdown('<h3>Advanced Performance Metrics</h3>', unsafe_allow_html=True)

    for key, value in adv_metrics.items():
        if isinstance(value, dict):
            st.markdown(f"**{key}:**")
            for sub_key, sub_value in value.items():
                st.write(f" - {sub_key}: {sub_value}")
        else:
            st.write(f"**{key}:** {value}")

# Visualize
st.markdown('<h2>📈 Visualizations</h2>', unsafe_allow_html=True)

# Interactive Plot
if st.checkbox('📊 Show Interactive Plot'):
    interactive_fig = plot_forecast_interactive(y_test, y_pred)
    st.plotly_chart(interactive_fig)

# Static plots
st.markdown('<h3>Basic Analysis</h3>', unsafe_allow_html=True)

col1, col2 = st.columns(2)

with col1:
    fig1, ax1 = plt.subplots(figsize=(8, 5))
    ax1.plot(y_test.index, y_test, label='Observed', color='blue', linewidth=2)
    ax1.plot(y_test.index, y_pred, label='Predicted', color='red', linewidth=2, linestyle='--')
    ax1.set_xlabel('Date', fontsize=12)
    ax1.set_ylabel('Value', fontsize=12)
    ax1.set_title('Forecast vs Actual', fontsize=14, fontweight='bold')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    st.pyplot(fig1)

with col2:
    residuals = y_test - y_pred
    fig2, ax2 = plt.subplots(figsize=(8, 5))
    ax2.plot(residuals.index, residuals, color='green', linewidth=2)
    ax2.axhline(0, color='black', linestyle='--', linewidth=1)
    ax2.set_xlabel('Date', fontsize=12)
    ax2.set_ylabel('Residuals', fontsize=12)
    ax2.set_title('Residuals Over Time', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3)
    st.pyplot(fig2)

# Residuals distribution
st.markdown('<h3>Residuals Distribution</h3>', unsafe_allow_html=True)

fig3, ax3 = plt.subplots(figsize=(10, 6))
ax3.hist(residuals, bins=20, alpha=0.7, color='purple', edgecolor='black')
ax3.set_title('Residuals Distribution', fontsize=14, fontweight='bold')
ax3.set_xlabel('Residual Value', fontsize=12)
ax3.set_ylabel('Frequency', fontsize=12)
ax3.grid(True, alpha=0.3)
st.pyplot(fig3)

# Advanced visualizations
if st.checkbox('🔬 Show Advanced Visualizations'):
    st.markdown('<h3>Comprehensive Residuals Analysis</h3>', unsafe_allow_html=True)

    res_fig = plot_residuals_analysis(y_test, y_pred)
    st.pyplot(res_fig)

    st.markdown('<h3>Seasonal Decomposition</h3>', unsafe_allow_html=True)

    try:
        seasonal_fig = plot_seasonal_decomposition(df[DATA_SETTINGS['target_col']])
        st.pyplot(seasonal_fig)
    except Exception:
        st.warning("⚠️ Seasonal decomposition requires more data points (minimum 24 observations recommended).")

# Advanced Features Section

st.markdown('<h2>🚀 Advanced AI Features</h2>', unsafe_allow_html=True)

# SHAP Analysis
if enable_shap and SHAP_AVAILABLE and hasattr(model, 'predict'):
    try:
        st.markdown('<h3>SHAP Feature Importance</h3>', unsafe_allow_html=True)

        # Sample data for SHAP (use smaller sample for performance)
        background = X_train.sample(min(100, len(X_train)), random_state=42)
        test_sample = X_test.sample(min(50, len(X_test)), random_state=42)

        if model_choice in ['XGBoost', 'Random Forest', 'Gradient Boosting']:
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(test_sample)

            # Summary plot
            fig, ax = plt.subplots(figsize=(10, 6))
            shap.summary_plot(shap_values, test_sample, show=False)
            st.pyplot(fig)

            # Waterfall plot for first prediction
            st.markdown("**SHAP Waterfall Plot (First Prediction):**")
            fig, ax = plt.subplots(figsize=(10, 6))
            # shap.plots.waterfall expects a single-row Explanation object,
            # so wrap the first sample's values before plotting
            explanation = shap.Explanation(
                values=shap_values[0],
                base_values=explainer.expected_value,
                data=test_sample.iloc[0].values,
                feature_names=list(test_sample.columns),
            )
            shap.plots.waterfall(explanation, show=False)
            st.pyplot(fig)

        else:
            st.info("SHAP analysis is most effective for tree-based models (XGBoost, Random Forest, Gradient Boosting).")

    except Exception as e:
        st.error(f"SHAP analysis failed: {str(e)}")
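# For non-tree models, a model-agnostic explainer could be substituted
# (illustrative sketch; shap.KernelExplainer re-evaluates the model many
# times, hence the very small sample):
#
#   explainer = shap.KernelExplainer(model.predict, background)
#   shap_values = explainer.shap_values(test_sample.iloc[:10])
#   shap.summary_plot(shap_values, test_sample.iloc[:10], show=False)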

# Feature Importance Analysis
if enable_feature_importance and hasattr(model, 'feature_importances_'):
    st.markdown('<h3>Feature Importance Analysis</h3>', unsafe_allow_html=True)

    try:
        importance = model.feature_importances_
        feature_names = [f'lag_{i+1}' for i in range(len(importance))]

        fig, ax = plt.subplots(figsize=(10, 6))
        indices = np.argsort(importance)[::-1][:min(20, len(importance))]
        ax.bar(range(len(indices)), importance[indices], align='center')
        ax.set_xticks(range(len(indices)))
        ax.set_xticklabels([feature_names[i] for i in indices], rotation=45)
        ax.set_title('Feature Importances')
        ax.set_xlabel('Features')
        ax.set_ylabel('Importance')
        plt.tight_layout()
        st.pyplot(fig)

    except Exception as e:
        st.error(f"Feature importance analysis failed: {str(e)}")

# Cross-validation
if enable_cross_validation:
    st.markdown('<h3>Time Series Cross-Validation</h3>', unsafe_allow_html=True)

    try:
        from sklearn.model_selection import cross_val_score
        from sklearn.metrics import make_scorer, mean_absolute_error

        # Time series split
        tscv = TimeSeriesSplit(n_splits=5)

        if model_choice not in ['ARIMA', 'SARIMA', 'Ensemble (Stacking)']:
            # make_scorer defaults to greater_is_better=True, so cross_val_score
            # returns raw (positive) MAE values here rather than negated scores
            cv_scores = cross_val_score(model, X_train, y_train, cv=tscv,
                                        scoring=make_scorer(mean_absolute_error))

            st.write(f"Cross-validation MAE scores: {cv_scores}")
            st.write(f"Mean CV MAE: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

            # Plot CV scores
            fig, ax = plt.subplots(figsize=(8, 5))
            ax.plot(range(1, len(cv_scores) + 1), cv_scores, 'o-', linewidth=2, markersize=8)
            ax.set_title('Cross-Validation Scores')
            ax.set_xlabel('Fold')
            ax.set_ylabel('MAE')
            ax.grid(True, alpha=0.3)
            st.pyplot(fig)
        else:
            st.info("Cross-validation is not applicable for ARIMA/SARIMA or stacking models in this implementation.")

    except Exception as e:
        st.error(f"Cross-validation failed: {str(e)}")
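# For intuition: TimeSeriesSplit yields expanding-window folds, e.g. with 12
# samples and n_splits=5 the (train, test) index ranges are
#   [0..1]->[2..3], [0..3]->[4..5], [0..5]->[6..7], [0..7]->[8..9], [0..9]->[10..11]
# so later observations are never used to predict earlier ones.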

# Model Comparison Dashboard
if st.checkbox('📊 Create Model Comparison Dashboard'):
    st.markdown('<h3>Model Comparison Dashboard</h3>', unsafe_allow_html=True)

    try:
        dashboard_fig = create_dashboard(y_test, y_pred, metrics, model_choice)
        st.plotly_chart(dashboard_fig, use_container_width=True)
    except Exception as e:
        st.error(f"Dashboard creation failed: {str(e)}")

# Export Results
st.markdown('<h2>💾 Export Results</h2>', unsafe_allow_html=True)

col1, col2, col3 = st.columns(3)

with col1:
    if st.button('📄 Export Metrics Report'):
        report = print_evaluation_report(metrics, model_choice)
        st.download_button(
            label="Download Report",
            data=report,
            file_name=f"{model_choice}_report.txt",
            mime="text/plain"
        )

with col2:
    if st.button('📊 Export Predictions CSV'):
        results_df = pd.DataFrame({
            'Date': y_test.index,
            'Actual': y_test.values,
            'Predicted': y_pred,
            'Residual': y_test.values - y_pred
        })
        csv = results_df.to_csv(index=False)
        st.download_button(
            label="Download CSV",
            data=csv,
            file_name=f"{model_choice}_predictions.csv",
            mime="text/csv"
        )

with col3:
    if st.button('💾 Save Model'):
        try:
            if model_choice not in ['ARIMA', 'SARIMA', 'Ensemble (Stacking)']:
                save_model(model, f"{model_choice.lower().replace(' ', '_')}_model.pkl")
                st.success(f"✅ Model saved as {model_choice.lower().replace(' ', '_')}_model.pkl")
            else:
                st.warning("Model saving not implemented for this model type.")
        except Exception as e:
            st.error(f"Failed to save model: {str(e)}")

# Footer
st.markdown("---")

st.markdown('<p>🚀 Advanced Time Series Forecasting Pro - Powered by AI & Machine Learning</p>', unsafe_allow_html=True)
--------------------------------------------------------------------------------