├── xgboost_model.pkl
├── __pycache__
├── viz.cpython-312.pyc
├── utils.cpython-312.pyc
├── config.cpython-312.pyc
├── models.cpython-312.pyc
├── evaluation.cpython-312.pyc
├── advanced_viz.cpython-312.pyc
├── data_loader.cpython-312.pyc
├── advanced_models.cpython-312.pyc
└── advanced_evaluation.cpython-312.pyc
├── requirements.txt
├── evaluation.py
├── viz.py
├── data_loader.py
├── phyton_project (1).py
├── config.py
├── TODO.md
├── models.py
├── utils.py
├── advanced_models.py
├── advanced_evaluation.py
├── advanced_viz.py
├── README.md
├── app.py
└── app_advanced.py
/xgboost_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/xgboost_model.pkl
--------------------------------------------------------------------------------
/__pycache__/viz.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/viz.cpython-312.pyc
--------------------------------------------------------------------------------
/__pycache__/utils.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/utils.cpython-312.pyc
--------------------------------------------------------------------------------
/__pycache__/config.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/config.cpython-312.pyc
--------------------------------------------------------------------------------
/__pycache__/models.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/models.cpython-312.pyc
--------------------------------------------------------------------------------
/__pycache__/evaluation.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/evaluation.cpython-312.pyc
--------------------------------------------------------------------------------
/__pycache__/advanced_viz.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/advanced_viz.cpython-312.pyc
--------------------------------------------------------------------------------
/__pycache__/data_loader.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/data_loader.cpython-312.pyc
--------------------------------------------------------------------------------
/__pycache__/advanced_models.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/advanced_models.cpython-312.pyc
--------------------------------------------------------------------------------
/__pycache__/advanced_evaluation.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Time-Series-Forecasting-Tool/HEAD/__pycache__/advanced_evaluation.cpython-312.pyc
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | tensorflow==2.16.1
3 | scikit-learn
4 | pandas
5 | numpy
6 | matplotlib
7 | seaborn
8 | xgboost
9 | plotly
10 | statsmodels
11 | scipy
12 | joblib
13 |
--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
3 |
4 | def mean_absolute_percentage_error(y_true, y_pred):
5 | """
6 | Calculate Mean Absolute Percentage Error (MAPE).
7 | """
8 | y_true, y_pred = np.array(y_true), np.array(y_pred)
9 | return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
10 |
11 | def evaluate_model(y_true, y_pred):
12 | """
13 | Evaluate model with MAE, RMSE, MAPE, and R2 Score.
14 | """
15 | mae = mean_absolute_error(y_true, y_pred)
16 | rmse = np.sqrt(mean_squared_error(y_true, y_pred))
17 | mape = mean_absolute_percentage_error(y_true, y_pred)
18 | r2 = r2_score(y_true, y_pred)
19 | return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}
20 |
--------------------------------------------------------------------------------
/viz.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import seaborn as sns
3 | import pandas as pd
4 |
5 | def plot_forecast(y_true, y_pred, title='Forecast vs Actual'):
6 | """
7 | Plot observed vs predicted values.
8 | """
9 | plt.figure(figsize=(10, 6))
10 | plt.plot(y_true.index, y_true, label='Observed', color='blue')
11 | plt.plot(y_true.index, y_pred, label='Predicted', color='red')
12 | plt.xlabel('Date')
13 | plt.ylabel('Value')
14 | plt.title(title)
15 | plt.legend()
16 | plt.show()
17 |
18 | def plot_residuals(y_true, y_pred):
19 | """
20 | Plot residuals.
21 | """
22 | residuals = y_true - y_pred
23 | plt.figure(figsize=(10, 6))
24 | plt.plot(residuals.index, residuals, color='green')
25 | plt.axhline(0, color='black', linestyle='--')
26 | plt.xlabel('Date')
27 | plt.ylabel('Residuals')
28 | plt.title('Residuals Plot')
29 | plt.show()
30 |
31 | def plot_distribution(residuals):
32 | """
33 | Plot distribution of residuals.
34 | """
35 | plt.figure(figsize=(8, 6))
36 | sns.histplot(residuals, kde=True)
37 | plt.title('Residuals Distribution')
38 | plt.show()
39 |
--------------------------------------------------------------------------------
/data_loader.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from sklearn.model_selection import train_test_split
4 |
5 | def load_data(file_path, date_col='date', target_col='target'):
6 | """
7 | Load time series data from CSV.
8 | Assumes the CSV has a date column and a target column.
9 | """
10 | df = pd.read_csv(file_path, parse_dates=[date_col])
11 | df.set_index(date_col, inplace=True)
12 | return df
13 |
14 | def add_lag_features(df, target_col='target', lags=5):
15 | """
16 | Add lag features to the dataframe.
17 | """
18 | for lag in range(1, lags + 1):
19 | df[f'lag_{lag}'] = df[target_col].shift(lag)
20 | df.dropna(inplace=True)
21 | return df
22 |
23 | def preprocess_data(df, target_col='target', lags=5, test_size=0.2):
24 | """
25 | Preprocess data: add lags, split into train and test.
26 | """
27 | df = add_lag_features(df, target_col, lags)
28 | X = df.drop(columns=[target_col])
29 | y = df[target_col]
30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
31 | return X_train, X_test, y_train, y_test
32 |
33 | def create_sample_data():
34 | """
35 | Create sample data for demonstration.
36 | """
37 | date_range = pd.date_range(start='2022-01-01', periods=100, freq='D')
38 | time_series_data = np.cumsum(np.random.randn(100))
39 | df = pd.DataFrame({'date': date_range, 'target': time_series_data})
40 | df.set_index('date', inplace=True)
41 | return df
42 |
--------------------------------------------------------------------------------
/phyton_project (1).py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """phyton-project
3 |
4 | Automatically generated by Colab.
5 |
6 | Original file is located at
7 | https://colab.research.google.com/drive/1ejf8lG6yOr-FjL99Nn2dNliPjCHgpxPz
8 | """
9 |
10 | import pandas as pd
11 | import numpy as np
12 | import xgboost as xgb
13 | import matplotlib.pyplot as plt
14 | import seaborn as sns
15 |
16 | # Load your actual time series data into a pandas DataFrame (replace this with your data)
17 | # For demonstration purposes, let's create a sample dataset
18 | date_range = pd.date_range(start='2022-01-01', periods=100, freq='D')
19 | time_series_data = np.cumsum(np.random.randn(100))
20 | df = pd.DataFrame({'date': date_range, 'target': time_series_data})
21 |
22 | # Convert datetime column to features (year, month, day)
23 | df['year'] = df['date'].dt.year
24 | df['month'] = df['date'].dt.month
25 | df['day'] = df['date'].dt.day
26 |
27 | # Drop the original datetime column
28 | df.drop(columns=['date'], inplace=True)
29 |
30 | # Split data into train and validation sets
31 | train_size = int(0.8 * len(df))
32 | train, val = df[:train_size], df[train_size:]
33 |
34 | # Define features and target
35 | X_train, y_train = train.drop(columns=['target']), train['target']
36 | X_val, y_val = val.drop(columns=['target']), val['target']
37 |
38 | # Train an XGBoost model
39 | model = xgb.XGBRegressor()
40 | model.fit(X_train, y_train)
41 |
42 | # Make predictions
43 | y_pred = model.predict(X_val)
44 |
45 | # Visualize observed vs. predicted values
46 | plt.plot(val.index, y_val, label='Observed', color='blue')
47 | plt.plot(val.index, y_pred, label='Predicted', color='red')
48 | plt.xlabel('Date')
49 | plt.ylabel('Value')
50 | plt.title('XGBoost Time Series Forecasting')
51 | plt.legend()
52 | plt.show()
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # Configuration settings for the Time Series Forecasting App
4 |
5 | # Default hyperparameters for models
6 | DEFAULT_PARAMS = {
7 | 'XGBoost': {
8 | 'n_estimators': 100,
9 | 'learning_rate': 0.1,
10 | 'max_depth': 6
11 | },
12 | 'Random Forest': {
13 | 'n_estimators': 100,
14 | 'max_depth': 10,
15 | 'min_samples_split': 2
16 | },
17 | 'LSTM': {
18 | 'units': 50,
19 | 'epochs': 10,
20 | 'batch_size': 32,
21 | 'dropout': 0.2
22 | }
23 | }
24 |
25 | # Hyperparameter tuning grids
26 | TUNING_GRIDS = {
27 | 'XGBoost': {
28 | 'n_estimators': [50, 100, 150],
29 | 'learning_rate': [0.01, 0.1, 0.2],
30 | 'max_depth': [3, 6, 9]
31 | },
32 | 'Random Forest': {
33 | 'n_estimators': [50, 100, 150],
34 | 'max_depth': [5, 10, 15],
35 | 'min_samples_split': [2, 5, 10]
36 | }
37 | }
38 |
39 | # Data preprocessing settings
40 | DATA_SETTINGS = {
41 | 'default_lags': 5,
42 | 'test_size': 0.2,
43 | 'date_col': 'date',
44 | 'target_col': 'target'
45 | }
46 |
47 | # Visualization settings
48 | VIZ_SETTINGS = {
49 | 'figsize': (10, 6),
50 | 'colors': {
51 | 'observed': 'blue',
52 | 'predicted': 'red',
53 | 'residuals': 'green'
54 | }
55 | }
56 |
57 | # App settings
58 | APP_SETTINGS = {
59 | 'title': 'Advanced Time Series Forecasting Web App',
60 | 'sidebar_title': 'Configuration',
61 | 'default_model': 'XGBoost'
62 | }
63 |
64 | # File paths
65 | DATA_DIR = 'data'
66 | MODEL_DIR = 'models'
67 | RESULTS_DIR = 'results'
68 |
69 | # Create directories if they don't exist
70 | for dir_path in [DATA_DIR, MODEL_DIR, RESULTS_DIR]:
71 | if not os.path.exists(dir_path):
72 | os.makedirs(dir_path)
73 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | # TODO List for Enhancing Time Series Forecasting Project
2 |
3 | ## Step 1: Create requirements.txt ✅
4 | - List all necessary dependencies: streamlit, tensorflow, scikit-learn, pandas, numpy, matplotlib, seaborn, xgboost
5 |
6 | ## Step 2: Create data_loader.py ✅
7 | - Implement functions for loading CSV data, preprocessing (handling dates, adding lag features), splitting into train/val
8 |
9 | ## Step 3: Create models.py ✅
10 | - Implement XGBoost, Random Forest, and LSTM models with hyperparameter tuning support
11 |
12 | ## Step 4: Create evaluation.py ✅
13 | - Implement evaluation metrics: MAE, RMSE, MAPE
14 |
15 | ## Step 5: Create viz.py ✅
16 | - Implement visualization functions: observed vs predicted plots, residuals, etc.
17 |
18 | ## Step 6: Create app.py ✅
19 | - Build Streamlit web app with UI for data upload, feature selection, model choice, hyperparams, and displaying results
20 |
21 | ## Step 7: Install dependencies ✅
22 | - Run pip install -r requirements.txt
23 |
24 | ## Step 8: Run and test the Streamlit app ✅
25 | - Execute streamlit run app.py and verify all features work
26 |
27 | ## Step 9: Verify and finalize ✅
28 | - Check for any bugs, ensure performance optimizations are in place
29 |
30 | ## Step 10: Add Advanced Features ✅
31 | - Create config.py for configuration management
32 | - Create utils.py for utility functions (validation, scaling, logging, etc.)
33 | - Create advanced_models.py with additional models (Gradient Boosting, AdaBoost, SVR, MLP, Bidirectional LSTM, GRU, ARIMA, SARIMA)
34 | - Create advanced_evaluation.py with comprehensive metrics (SMAPE, MASE, MDA, etc.)
35 | - Create advanced_viz.py with interactive plots and advanced visualizations
36 | - Update requirements.txt with additional dependencies (plotly, statsmodels, scipy, joblib)
37 | - Enhance app.py with new models, advanced metrics, and interactive visualizations
38 |
39 | ## Step 11: Final Testing and Documentation ✅
40 | - Test all new features and models
41 | - Ensure backward compatibility
42 | - Add proper error handling
43 | - Create README.md with project description and usage instructions
44 |
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | import xgboost as xgb
2 | from sklearn.ensemble import RandomForestRegressor
3 | from sklearn.model_selection import GridSearchCV
4 | from tensorflow.keras.models import Sequential
5 | from tensorflow.keras.layers import LSTM, Dense
6 | import numpy as np
7 |
8 | def train_xgboost(X_train, y_train, params=None):
9 | """
10 | Train XGBoost model with optional hyperparameter tuning.
11 | """
12 | if params is None:
13 | params = {'n_estimators': 100, 'learning_rate': 0.1}
14 | model = xgb.XGBRegressor(**params)
15 | model.fit(X_train, y_train)
16 | return model
17 |
18 | def train_random_forest(X_train, y_train, params=None):
19 | """
20 | Train Random Forest model with optional hyperparameter tuning.
21 | """
22 | if params is None:
23 | params = {'n_estimators': 100, 'max_depth': 10}
24 | model = RandomForestRegressor(**params)
25 | model.fit(X_train, y_train)
26 | return model
27 |
28 | def train_lstm(X_train, y_train, params=None):
29 | """
30 | Train LSTM model.
31 | Note: LSTM requires 3D input (samples, timesteps, features).
32 | Assuming X_train is reshaped appropriately.
33 | """
34 | if params is None:
35 | params = {'units': 50, 'epochs': 10, 'batch_size': 32}
36 | # Reshape for LSTM: assuming univariate, timesteps=1 for simplicity
37 | X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
38 | model = Sequential()
39 | model.add(LSTM(params['units'], input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
40 | model.add(Dense(1))
41 | model.compile(optimizer='adam', loss='mse')
42 | model.fit(X_train_reshaped, y_train, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0)
43 | return model
44 |
45 | def tune_hyperparameters(model_type, X_train, y_train, param_grid):
46 | """
47 | Perform hyperparameter tuning using GridSearchCV.
48 | """
49 | if model_type == 'xgboost':
50 | model = xgb.XGBRegressor()
51 | elif model_type == 'random_forest':
52 | model = RandomForestRegressor()
53 | else:
54 | raise ValueError("Unsupported model type for tuning")
55 | grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')
56 | grid_search.fit(X_train, y_train)
57 | return grid_search.best_estimator_, grid_search.best_params_
58 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from sklearn.preprocessing import StandardScaler, MinMaxScaler
4 | import logging
5 |
6 | # Set up logging
7 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
8 | logger = logging.getLogger(__name__)
9 |
10 | def setup_logging(log_level=logging.INFO):
11 | """
12 | Set up logging configuration.
13 | """
14 | logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
15 |
16 | def validate_data(df, date_col='date', target_col='target'):
17 | """
18 | Validate the input data.
19 | """
20 | if df.empty:
21 | raise ValueError("DataFrame is empty")
22 | if date_col not in df.columns:
23 | raise ValueError(f"Date column '{date_col}' not found in data")
24 | if target_col not in df.columns:
25 | raise ValueError(f"Target column '{target_col}' not found in data")
26 | if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
27 | raise ValueError(f"Date column '{date_col}' is not in datetime format")
28 | logger.info("Data validation passed")
29 |
30 | def scale_features(X_train, X_test, method='standard'):
31 | """
32 | Scale features using StandardScaler or MinMaxScaler.
33 | """
34 | if method == 'standard':
35 | scaler = StandardScaler()
36 | elif method == 'minmax':
37 | scaler = MinMaxScaler()
38 | else:
39 | raise ValueError("Invalid scaling method. Choose 'standard' or 'minmax'")
40 |
41 | X_train_scaled = scaler.fit_transform(X_train)
42 | X_test_scaled = scaler.transform(X_test)
43 | return X_train_scaled, X_test_scaled, scaler
44 |
45 | def inverse_scale_predictions(scaled_pred, scaler, original_y):
46 | """
47 | Inverse scale predictions if target was scaled.
48 | """
49 | # Assuming target is not scaled in this implementation
50 | return scaled_pred
51 |
52 | def save_model(model, filename):
53 | """
54 | Save trained model to file.
55 | """
56 | import joblib
57 | joblib.dump(model, filename)
58 | logger.info(f"Model saved to {filename}")
59 |
60 | def load_model(filename):
61 | """
62 | Load model from file.
63 | """
64 | import joblib
65 | model = joblib.load(filename)
66 | logger.info(f"Model loaded from {filename}")
67 | return model
68 |
69 | def calculate_forecast_accuracy(y_true, y_pred, threshold=0.1):
70 | """
71 | Calculate forecast accuracy based on a threshold.
72 | """
73 | accuracy = np.mean(np.abs((y_true - y_pred) / y_true) < threshold) * 100
74 | return accuracy
75 |
76 | def generate_forecast_report(metrics, model_name):
77 | """
78 | Generate a summary report of the forecast results.
79 | """
80 | report = f"""
81 | Forecast Report for {model_name}
82 | ================================
83 | MAE: {metrics['MAE']:.4f}
84 | RMSE: {metrics['RMSE']:.4f}
85 | MAPE: {metrics['MAPE']:.4f}%
86 | """
87 | return report
88 |
89 | def detect_outliers(df, column, method='iqr', threshold=1.5):
90 | """
91 | Detect outliers in a column using IQR or Z-score method.
92 | """
93 | if method == 'iqr':
94 | Q1 = df[column].quantile(0.25)
95 | Q3 = df[column].quantile(0.75)
96 | IQR = Q3 - Q1
97 | lower_bound = Q1 - threshold * IQR
98 | upper_bound = Q3 + threshold * IQR
99 | outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
100 | elif method == 'zscore':
101 | z_scores = np.abs((df[column] - df[column].mean()) / df[column].std())
102 | outliers = df[z_scores > threshold]
103 | else:
104 | raise ValueError("Invalid outlier detection method. Choose 'iqr' or 'zscore'")
105 | return outliers
106 |
107 | def handle_missing_values(df, method='interpolate'):
108 | """
109 | Handle missing values in the dataframe.
110 | """
111 | if method == 'interpolate':
112 | df = df.interpolate(method='linear')
113 | elif method == 'forward_fill':
114 | df = df.fillna(method='ffill')
115 | elif method == 'backward_fill':
116 | df = df.fillna(method='bfill')
117 | elif method == 'drop':
118 | df = df.dropna()
119 | else:
120 | raise ValueError("Invalid missing value handling method")
121 | return df
122 |
123 | def create_time_features(df, date_col='date'):
124 | """
125 | Create additional time-based features from date column.
126 | """
127 | df = df.copy()
128 | df['year'] = df[date_col].dt.year
129 | df['month'] = df[date_col].dt.month
130 | df['day'] = df[date_col].dt.day
131 | df['day_of_week'] = df[date_col].dt.dayofweek
132 | df['quarter'] = df[date_col].dt.quarter
133 | df['is_weekend'] = df[date_col].dt.dayofweek.isin([5, 6]).astype(int)
134 | return df
135 |
--------------------------------------------------------------------------------
/advanced_models.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
4 | from sklearn.svm import SVR
5 | from sklearn.neural_network import MLPRegressor
6 | from tensorflow.keras.models import Sequential
7 | from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, GRU
8 | from tensorflow.keras.callbacks import EarlyStopping
9 | from statsmodels.tsa.arima.model import ARIMA
10 | from statsmodels.tsa.statespace.sarimax import SARIMAX
11 | import warnings
12 | warnings.filterwarnings('ignore')
13 |
14 | def train_gradient_boosting(X_train, y_train, params=None):
15 | """
16 | Train Gradient Boosting model.
17 | """
18 | if params is None:
19 | params = {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3}
20 | model = GradientBoostingRegressor(**params)
21 | model.fit(X_train, y_train)
22 | return model
23 |
24 | def train_ada_boost(X_train, y_train, params=None):
25 | """
26 | Train AdaBoost model.
27 | """
28 | if params is None:
29 | params = {'n_estimators': 50, 'learning_rate': 1.0}
30 | model = AdaBoostRegressor(**params)
31 | model.fit(X_train, y_train)
32 | return model
33 |
34 | def train_svr(X_train, y_train, params=None):
35 | """
36 | Train Support Vector Regression model.
37 | """
38 | if params is None:
39 | params = {'kernel': 'rbf', 'C': 1.0, 'epsilon': 0.1}
40 | model = SVR(**params)
41 | model.fit(X_train, y_train)
42 | return model
43 |
44 | def train_mlp(X_train, y_train, params=None):
45 | """
46 | Train Multi-Layer Perceptron model.
47 | """
48 | if params is None:
49 | params = {'hidden_layer_sizes': (100, 50), 'activation': 'relu', 'max_iter': 500}
50 | model = MLPRegressor(**params)
51 | model.fit(X_train, y_train)
52 | return model
53 |
54 | def train_bidirectional_lstm(X_train, y_train, params=None):
55 | """
56 | Train Bidirectional LSTM model.
57 | """
58 | if params is None:
59 | params = {'units': 50, 'epochs': 10, 'batch_size': 32, 'dropout': 0.2}
60 |
61 | X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
62 | model = Sequential()
63 | model.add(Bidirectional(LSTM(params['units'], return_sequences=True), input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
64 | model.add(Dropout(params['dropout']))
65 | model.add(LSTM(params['units'] // 2))
66 | model.add(Dropout(params['dropout']))
67 | model.add(Dense(1))
68 | model.compile(optimizer='adam', loss='mse')
69 |
70 | early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
71 | model.fit(X_train_reshaped, y_train, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0, callbacks=[early_stopping])
72 | return model
73 |
74 | def train_gru(X_train, y_train, params=None):
75 | """
76 | Train GRU model.
77 | """
78 | if params is None:
79 | params = {'units': 50, 'epochs': 10, 'batch_size': 32, 'dropout': 0.2}
80 |
81 | X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
82 | model = Sequential()
83 | model.add(GRU(params['units'], input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
84 | model.add(Dropout(params['dropout']))
85 | model.add(Dense(1))
86 | model.compile(optimizer='adam', loss='mse')
87 |
88 | early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
89 | model.fit(X_train_reshaped, y_train, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0, callbacks=[early_stopping])
90 | return model
91 |
92 | def train_arima(y_train, order=(5, 1, 0)):
93 | """
94 | Train ARIMA model.
95 | """
96 | model = ARIMA(y_train, order=order)
97 | model_fit = model.fit()
98 | return model_fit
99 |
100 | def train_sarima(y_train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)):
101 | """
102 | Train SARIMA model.
103 | """
104 | model = SARIMAX(y_train, order=order, seasonal_order=seasonal_order)
105 | model_fit = model.fit(disp=False)
106 | return model_fit
107 |
108 | def ensemble_forecast(models, X_test, weights=None):
109 | """
110 | Create ensemble forecast from multiple models.
111 | """
112 | if weights is None:
113 | weights = [1/len(models)] * len(models)
114 |
115 | predictions = []
116 | for model in models:
117 | if hasattr(model, 'predict'):
118 | pred = model.predict(X_test)
119 | else:
120 | # For statsmodels models
121 | pred = model.forecast(steps=len(X_test))
122 | predictions.append(pred)
123 |
124 | # Weighted average
125 | ensemble_pred = np.average(predictions, axis=0, weights=weights)
126 | return ensemble_pred
127 |
128 | def train_stacked_model(base_models, meta_model, X_train, y_train, X_val, y_val):
129 | """
130 | Train a stacked ensemble model.
131 | """
132 | # Get predictions from base models
133 | base_predictions = []
134 | for model in base_models:
135 | model.fit(X_train, y_train)
136 | pred = model.predict(X_val)
137 | base_predictions.append(pred)
138 |
139 | # Create meta features
140 | meta_features = np.column_stack(base_predictions)
141 |
142 | # Train meta model
143 | meta_model.fit(meta_features, y_val)
144 |
145 | return base_models, meta_model
146 |
--------------------------------------------------------------------------------
/advanced_evaluation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
4 | from scipy.stats import shapiro, normaltest
5 | import warnings
6 | warnings.filterwarnings('ignore')
7 |
8 | def mean_absolute_percentage_error(y_true, y_pred):
9 | """
10 | Calculate Mean Absolute Percentage Error (MAPE).
11 | """
12 | y_true, y_pred = np.array(y_true), np.array(y_pred)
13 | return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
14 |
15 | def symmetric_mean_absolute_percentage_error(y_true, y_pred):
16 | """
17 | Calculate Symmetric Mean Absolute Percentage Error (SMAPE).
18 | """
19 | y_true, y_pred = np.array(y_true), np.array(y_pred)
20 | return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
21 |
22 | def mean_absolute_scaled_error(y_true, y_pred, y_train=None, m=1):
23 | """
24 | Calculate Mean Absolute Scaled Error (MASE).
25 | """
26 | y_true, y_pred = np.array(y_true), np.array(y_pred)
27 | mae = mean_absolute_error(y_true, y_pred)
28 |
29 | if y_train is not None:
30 | # Calculate naive forecast MAE
31 | y_train = np.array(y_train)
32 | naive_errors = []
33 | for i in range(m, len(y_train)):
34 | naive_errors.append(abs(y_train[i] - y_train[i-m]))
35 | naive_mae = np.mean(naive_errors)
36 | else:
37 | # Use simple naive method (previous value)
38 | naive_mae = np.mean(np.abs(np.diff(y_true)))
39 |
40 | return mae / naive_mae
41 |
42 | def root_mean_squared_percentage_error(y_true, y_pred):
43 | """
44 | Calculate Root Mean Squared Percentage Error (RMSPE).
45 | """
46 | y_true, y_pred = np.array(y_true), np.array(y_pred)
47 | return np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2)) * 100
48 |
49 | def mean_directional_accuracy(y_true, y_pred):
50 | """
51 | Calculate Mean Directional Accuracy (MDA).
52 | """
53 | y_true, y_pred = np.array(y_true), np.array(y_pred)
54 | actual_direction = np.sign(np.diff(y_true))
55 | pred_direction = np.sign(np.diff(y_pred))
56 | return np.mean(actual_direction == pred_direction) * 100
57 |
58 | def theil_u_statistic(y_true, y_pred):
59 | """
60 | Calculate Theil's U statistic.
61 | """
62 | y_true, y_pred = np.array(y_true), np.array(y_pred)
63 | naive_pred = np.roll(y_true, 1)[1:] # Naive forecast: previous value
64 | y_true = y_true[1:]
65 | y_pred = y_pred[1:]
66 |
67 | rmse_model = np.sqrt(mean_squared_error(y_true, y_pred))
68 | rmse_naive = np.sqrt(mean_squared_error(y_true, naive_pred))
69 |
70 | return rmse_model / rmse_naive
71 |
72 | def forecast_bias(y_true, y_pred):
73 | """
74 | Calculate forecast bias (mean error).
75 | """
76 | return np.mean(y_pred - y_true)
77 |
78 | def tracking_signal(y_true, y_pred, cumulative=True):
79 | """
80 | Calculate tracking signal.
81 | """
82 | errors = y_pred - y_true
83 | if cumulative:
84 | cum_errors = np.cumsum(errors)
85 | cum_abs_errors = np.cumsum(np.abs(errors))
86 | return cum_errors / cum_abs_errors
87 | else:
88 | return errors / np.abs(errors)
89 |
90 | def residual_analysis(y_true, y_pred):
91 | """
92 | Perform comprehensive residual analysis.
93 | """
94 | residuals = y_true - y_pred
95 |
96 | # Normality tests
97 | shapiro_stat, shapiro_p = shapiro(residuals)
98 | dagostino_stat, dagostino_p = normaltest(residuals)
99 |
100 | # Autocorrelation (simple lag-1)
101 | autocorr = np.corrcoef(residuals[:-1], residuals[1:])[0, 1]
102 |
103 | # Heteroscedasticity test (simple: correlation between |residuals| and predictions)
104 | hetero_corr = np.corrcoef(np.abs(residuals), y_pred)[0, 1]
105 |
106 | analysis = {
107 | 'mean_residual': np.mean(residuals),
108 | 'std_residual': np.std(residuals),
109 | 'shapiro_normality': {'statistic': shapiro_stat, 'p_value': shapiro_p},
110 | 'dagostino_normality': {'statistic': dagostino_stat, 'p_value': dagostino_p},
111 | 'autocorrelation_lag1': autocorr,
112 | 'heteroscedasticity_corr': hetero_corr
113 | }
114 |
115 | return analysis
116 |
117 | def comprehensive_evaluation(y_true, y_pred, y_train=None):
118 | """
119 | Comprehensive model evaluation with multiple metrics.
120 | """
121 | metrics = {
122 | 'MAE': mean_absolute_error(y_true, y_pred),
123 | 'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
124 | 'MAPE': mean_absolute_percentage_error(y_true, y_pred),
125 | 'SMAPE': symmetric_mean_absolute_percentage_error(y_true, y_pred),
126 | 'RMSPE': root_mean_squared_percentage_error(y_true, y_pred),
127 | 'R2': r2_score(y_true, y_pred),
128 | 'MDA': mean_directional_accuracy(y_true, y_pred),
129 | 'Theil_U': theil_u_statistic(y_true, y_pred),
130 | 'Forecast_Bias': forecast_bias(y_true, y_pred),
131 | 'MASE': mean_absolute_scaled_error(y_true, y_pred, y_train)
132 | }
133 |
134 | # Residual analysis
135 | metrics['Residual_Analysis'] = residual_analysis(y_true, y_pred)
136 |
137 | return metrics
138 |
139 | def print_evaluation_report(metrics, model_name="Model"):
140 | """
141 | Print a formatted evaluation report.
142 | """
143 | report = f"""
144 | === {model_name} Evaluation Report ===
145 | ======================================
146 |
147 | Error Metrics:
148 | --------------
149 | MAE: {metrics['MAE']:.4f}
150 | RMSE: {metrics['RMSE']:.4f}
151 | MAPE: {metrics['MAPE']:.4f}%
152 | SMAPE: {metrics['SMAPE']:.4f}%
153 | RMSPE: {metrics['RMSPE']:.4f}%
154 | MASE: {metrics['MASE']:.4f}
155 |
156 | Accuracy Metrics:
157 | -----------------
158 | R² Score: {metrics['R2']:.4f}
159 | MDA: {metrics['MDA']:.4f}%
160 |
161 | Forecast Quality:
162 | -----------------
163 | Theil's U: {metrics['Theil_U']:.4f}
164 | Forecast Bias: {metrics['Forecast_Bias']:.4f}
165 |
166 | Residual Analysis:
167 | ------------------
168 | Mean Residual: {metrics['Residual_Analysis']['mean_residual']:.4f}
169 | Std Residual: {metrics['Residual_Analysis']['std_residual']:.4f}
170 | Autocorr (lag1): {metrics['Residual_Analysis']['autocorrelation_lag1']:.4f}
171 | Hetero Corr: {metrics['Residual_Analysis']['heteroscedasticity_corr']:.4f}
172 |
173 | Normality Tests:
174 | ----------------
175 | Shapiro-Wilk: p-value = {metrics['Residual_Analysis']['shapiro_normality']['p_value']:.4f}
176 | D'Agostino: p-value = {metrics['Residual_Analysis']['dagostino_normality']['p_value']:.4f}
177 | """
178 |
179 | print(report)
180 | return report
181 |
--------------------------------------------------------------------------------
/advanced_viz.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import seaborn as sns
3 | import pandas as pd
4 | import numpy as np
5 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
6 | import plotly.graph_objects as go
7 | import plotly.express as px
8 | from plotly.subplots import make_subplots
9 | import warnings
10 | warnings.filterwarnings('ignore')
11 |
12 | def plot_forecast_interactive(y_true, y_pred, title='Interactive Forecast vs Actual'):
13 | """
14 | Create interactive forecast plot using Plotly.
15 | """
16 | fig = go.Figure()
17 |
18 | fig.add_trace(go.Scatter(x=y_true.index, y=y_true, mode='lines', name='Observed',
19 | line=dict(color='blue', width=2)))
20 | fig.add_trace(go.Scatter(x=y_true.index, y=y_pred, mode='lines', name='Predicted',
21 | line=dict(color='red', width=2, dash='dash')))
22 |
23 | fig.update_layout(title=title,
24 | xaxis_title='Date',
25 | yaxis_title='Value',
26 | hovermode='x unified')
27 |
28 | return fig
29 |
30 | def plot_residuals_analysis(y_true, y_pred, figsize=(15, 10)):
31 | """
32 | Comprehensive residuals analysis plots.
33 | """
34 | residuals = y_true - y_pred
35 |
36 | fig, axes = plt.subplots(2, 3, figsize=figsize)
37 |
38 | # Residuals over time
39 | axes[0, 0].plot(residuals.index, residuals, color='green', alpha=0.7)
40 | axes[0, 0].axhline(0, color='black', linestyle='--')
41 | axes[0, 0].set_title('Residuals Over Time')
42 | axes[0, 0].set_xlabel('Date')
43 | axes[0, 0].set_ylabel('Residuals')
44 |
45 | # Residuals distribution
46 | sns.histplot(residuals, kde=True, ax=axes[0, 1])
47 | axes[0, 1].set_title('Residuals Distribution')
48 |
49 | # Q-Q plot
50 | from scipy import stats
51 | stats.probplot(residuals, dist="norm", plot=axes[0, 2])
52 | axes[0, 2].set_title('Q-Q Plot')
53 |
54 | # Residuals vs Fitted
55 | axes[1, 0].scatter(y_pred, residuals, alpha=0.5)
56 | axes[1, 0].axhline(0, color='red', linestyle='--')
57 | axes[1, 0].set_title('Residuals vs Fitted Values')
58 | axes[1, 0].set_xlabel('Fitted Values')
59 | axes[1, 0].set_ylabel('Residuals')
60 |
61 | # Autocorrelation
62 | max_lags_acf = min(20, len(residuals) - 1)
63 | plot_acf(residuals, ax=axes[1, 1], lags=max_lags_acf)
64 | axes[1, 1].set_title('Residuals Autocorrelation')
65 |
66 | # Partial Autocorrelation
67 | max_lags_pacf = min(10, len(residuals) // 2 - 1)
68 | if max_lags_pacf > 0:
69 | plot_pacf(residuals, ax=axes[1, 2], lags=max_lags_pacf)
70 | axes[1, 2].set_title('Residuals Partial Autocorrelation')
71 | else:
72 | axes[1, 2].text(0.5, 0.5, 'Not enough data\nfor PACF', ha='center', va='center', transform=axes[1, 2].transAxes)
73 | axes[1, 2].set_title('Residuals Partial Autocorrelation')
74 |
75 | plt.tight_layout()
76 | return fig
77 |
78 | def plot_model_comparison(models_metrics, metric='RMSE'):
79 | """
80 | Plot comparison of different models.
81 | """
82 | model_names = list(models_metrics.keys())
83 | values = [models_metrics[name][metric] for name in model_names]
84 |
85 | fig, ax = plt.subplots(figsize=(10, 6))
86 | bars = ax.bar(model_names, values, color='skyblue', edgecolor='navy', linewidth=1)
87 |
88 | ax.set_title(f'Model Comparison - {metric}')
89 | ax.set_xlabel('Models')
90 | ax.set_ylabel(metric)
91 | ax.tick_params(axis='x', rotation=45)
92 |
93 | # Add value labels on bars
94 | for bar, value in zip(bars, values):
95 | ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(),
96 | f'{value:.4f}', ha='center', va='bottom')
97 |
98 | plt.tight_layout()
99 | return fig
100 |
101 | def plot_feature_importance(model, feature_names, top_n=20):
102 | """
103 | Plot feature importance for tree-based models.
104 | """
105 | if hasattr(model, 'feature_importances_'):
106 | importance = model.feature_importances_
107 | indices = np.argsort(importance)[::-1][:top_n]
108 |
109 | plt.figure(figsize=(10, 8))
110 | plt.title('Feature Importances')
111 | plt.bar(range(top_n), importance[indices], align='center')
112 | plt.xticks(range(top_n), [feature_names[i] for i in indices], rotation=90)
113 | plt.tight_layout()
114 | return plt.gcf()
115 | else:
116 | print("Model does not have feature_importances_ attribute")
117 | return None
118 |
119 | def plot_learning_curve(model, X_train, y_train, cv=5):
120 | """
121 | Plot learning curve for a model.
122 | """
123 | from sklearn.model_selection import learning_curve
124 |
125 | train_sizes, train_scores, val_scores = learning_curve(
126 | model, X_train, y_train, cv=cv, n_jobs=-1,
127 | train_sizes=np.linspace(0.1, 1.0, 10), scoring='neg_mean_squared_error'
128 | )
129 |
130 | train_scores_mean = -np.mean(train_scores, axis=1)
131 | train_scores_std = np.std(train_scores, axis=1)
132 | val_scores_mean = -np.mean(val_scores, axis=1)
133 | val_scores_std = np.std(val_scores, axis=1)
134 |
135 | plt.figure(figsize=(10, 6))
136 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
137 | train_scores_mean + train_scores_std, alpha=0.1, color="r")
138 | plt.fill_between(train_sizes, val_scores_mean - val_scores_std,
139 | val_scores_mean + val_scores_std, alpha=0.1, color="g")
140 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
141 | plt.plot(train_sizes, val_scores_mean, 'o-', color="g", label="Cross-validation score")
142 |
143 | plt.title('Learning Curve')
144 | plt.xlabel('Training Size')
145 | plt.ylabel('MSE')
146 | plt.legend(loc="best")
147 | plt.grid(True)
148 | plt.tight_layout()
149 | return plt.gcf()
150 |
151 | def plot_prediction_intervals(y_true, y_pred, lower_bound, upper_bound, title='Prediction Intervals'):
152 | """
153 | Plot predictions with confidence intervals.
154 | """
155 | plt.figure(figsize=(12, 6))
156 | plt.plot(y_true.index, y_true, label='Observed', color='blue', linewidth=2)
157 | plt.plot(y_true.index, y_pred, label='Predicted', color='red', linewidth=2)
158 | plt.fill_between(y_true.index, lower_bound, upper_bound, alpha=0.3, color='red', label='95% Prediction Interval')
159 |
160 | plt.title(title)
161 | plt.xlabel('Date')
162 | plt.ylabel('Value')
163 | plt.legend()
164 | plt.grid(True, alpha=0.3)
165 | plt.tight_layout()
166 | return plt.gcf()
167 |
168 | def create_dashboard(y_true, y_pred, metrics, model_name):
169 | """
170 | Create a comprehensive dashboard with multiple plots.
171 | """
172 | fig = make_subplots(
173 | rows=3, cols=2,
174 | subplot_titles=('Forecast vs Actual', 'Residuals Distribution',
175 | 'Residuals Over Time', 'Q-Q Plot',
176 | 'Model Metrics', 'Feature Importance (if available)'),
177 | specs=[[{"secondary_y": False}, {"secondary_y": False}],
178 | [{"secondary_y": False}, {"secondary_y": False}],
179 | [{"secondary_y": False}, {"secondary_y": False}]]
180 | )
181 |
182 | # Forecast vs Actual
183 | fig.add_trace(go.Scatter(x=y_true.index, y=y_true, mode='lines', name='Observed',
184 | line=dict(color='blue')), row=1, col=1)
185 | fig.add_trace(go.Scatter(x=y_true.index, y=y_pred, mode='lines', name='Predicted',
186 | line=dict(color='red')), row=1, col=1)
187 |
188 | # Residuals Distribution
189 | residuals = y_true - y_pred
190 | fig.add_trace(go.Histogram(x=residuals, nbinsx=30, name='Residuals'), row=1, col=2)
191 |
192 | # Residuals Over Time
193 | fig.add_trace(go.Scatter(x=y_true.index, y=residuals, mode='lines', name='Residuals Over Time',
194 | line=dict(color='green')), row=2, col=1)
195 |
196 | # Q-Q Plot
197 | from scipy import stats
198 | qq = stats.probplot(residuals, dist="norm")
199 | fig.add_trace(go.Scatter(x=qq[0][0], y=qq[0][1], mode='markers', name='Q-Q Plot'), row=2, col=2)
200 |
201 | # Model Metrics
202 | metrics_text = "
".join([f"{k}: {v:.4f}" for k, v in metrics.items() if isinstance(v, (int, float))])
203 | fig.add_trace(go.Table(
204 | header=dict(values=['Metric', 'Value']),
205 | cells=dict(values=[list(metrics.keys()), [f"{v:.4f}" if isinstance(v, (int, float)) else str(v) for v in metrics.values()]])
206 | ), row=3, col=1)
207 |
208 | # Placeholder for Feature Importance
209 | fig.add_trace(go.Bar(x=['Feature 1', 'Feature 2'], y=[0.5, 0.3], name='Feature Importance'), row=3, col=2)
210 |
211 | fig.update_layout(height=1200, title_text=f"{model_name} - Comprehensive Dashboard")
212 | return fig
213 |
214 | def plot_seasonal_decomposition(ts, model='additive', period=None):
215 | """
216 | Plot seasonal decomposition of time series.
217 | """
218 | from statsmodels.tsa.seasonal import seasonal_decompose
219 |
220 | if period is None:
221 | period = 12 # Assume monthly data
222 |
223 | decomposition = seasonal_decompose(ts, model=model, period=period)
224 |
225 | fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
226 |
227 | axes[0].plot(ts, label='Original')
228 | axes[0].legend()
229 | axes[0].set_title('Original Time Series')
230 |
231 | axes[1].plot(decomposition.trend, label='Trend')
232 | axes[1].legend()
233 | axes[1].set_title('Trend Component')
234 |
235 | axes[2].plot(decomposition.seasonal, label='Seasonal')
236 | axes[2].legend()
237 | axes[2].set_title('Seasonal Component')
238 |
239 | axes[3].plot(decomposition.resid, label='Residual')
240 | axes[3].legend()
241 | axes[3].set_title('Residual Component')
242 |
243 | plt.tight_layout()
244 | return fig
245 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🚀 Advanced Time Series Forecasting Tool
2 |
3 | A comprehensive, enterprise-grade Streamlit web application for professional time series forecasting with cutting-edge AI capabilities. This tool combines 13+ forecasting models, advanced evaluation metrics, interactive visualizations, and AI-powered insights to deliver state-of-the-art time series analysis.
4 |
5 | ## 🌟 Key Features
6 |
7 | ### 🤖 **13 Advanced Forecasting Models**
8 | - **Machine Learning**: XGBoost, Random Forest, Gradient Boosting, AdaBoost, SVR, MLP
9 | - **Deep Learning**: LSTM, Bidirectional LSTM, GRU
10 | - **Statistical**: ARIMA, SARIMA
11 | - **Ensemble Methods**: Voting Regressor, Stacking Ensemble
12 |
13 | ### 📊 **Comprehensive Evaluation Suite**
14 | - **Basic Metrics**: MAE, RMSE, MAPE, R² Score
15 | - **Advanced Metrics**: SMAPE, RMSPE, MASE, MDA, Theil's U, Forecast Bias
16 | - **Statistical Tests**: Normality, autocorrelation, heteroscedasticity analysis
17 |
18 | ### 🎨 **Interactive Visualizations**
19 | - Plotly-powered interactive forecast plots
20 | - Comprehensive residual analysis (distribution, ACF, Q-Q plots)
21 | - Seasonal decomposition with trend/seasonal/residual components
22 | - Model comparison dashboards
23 | - Feature importance analysis with SHAP values
24 |
25 | ### 🛠️ **Advanced Data Processing**
26 | - Automatic data validation and intelligent cleaning
27 | - Smart lag feature generation for temporal dependencies
28 | - Multiple missing value imputation strategies
29 | - Outlier detection and handling
30 | - Feature scaling (Standard, MinMax, Robust)
31 | - Time-based feature engineering
32 |
33 | ### 🎛️ **Professional User Interface**
34 | - Drag-and-drop CSV upload with real-time validation
35 | - Dynamic parameter tuning with live updates
36 | - Hyperparameter optimization with GridSearchCV
37 | - Model comparison and benchmarking
38 | - Export capabilities (predictions, metrics, models)
39 |
40 | ### 🔬 **AI-Powered Features**
41 | - **SHAP Analysis**: Explainable AI for model interpretability
42 | - **Feature Importance**: Global and local feature impact analysis
43 | - **Time Series Cross-Validation**: Rolling forecast validation
44 | - **Automated Model Selection**: Performance-based recommendations
45 |
46 | ## 🚀 Quick Start
47 |
48 | ### Installation
49 |
50 | 1. **Clone the repository**:
51 | ```bash
52 | git clone https://github.com/your-username/time-series-forecasting-tool.git
53 | cd time-series-forecasting-tool
54 | ```
55 |
56 | 2. **Install dependencies**:
57 | ```bash
58 | pip install -r requirements.txt
59 | ```
60 |
61 | 3. **Launch the application**:
62 | ```bash
63 | streamlit run app_advanced.py
64 | ```
65 |
66 | 4. **Open your browser** to `http://localhost:8501`
67 |
68 | ### Usage Guide
69 |
70 | 1. **📁 Data Upload**: Upload your CSV file or use the built-in sample dataset
71 | 2. **⚙️ Preprocessing**: Configure lag features, test size, and scaling options
72 | 3. **🤖 Model Selection**: Choose from 13 forecasting models with pre-configured parameters
73 | 4. **🔬 Advanced Features**: Enable SHAP analysis, cross-validation, and feature importance
74 | 5. **📊 Evaluation**: Review comprehensive metrics and statistical analysis
75 | 6. **📈 Visualization**: Explore interactive charts and residual diagnostics
76 | 7. **💾 Export**: Download predictions, metrics reports, and trained models
77 |
78 | ## 🏗️ Project Architecture
79 |
80 | ```
81 | ├── app_advanced.py # Main advanced Streamlit application
82 | ├── app.py # Basic Streamlit application
83 | ├── config.py # Configuration and hyperparameters
84 | ├── data_loader.py # Data loading and preprocessing utilities
85 | ├── models.py # Core ML models (XGBoost, RF, LSTM)
86 | ├── advanced_models.py # Additional models and ensemble methods
87 | ├── evaluation.py # Basic evaluation metrics
88 | ├── advanced_evaluation.py # Comprehensive evaluation suite
89 | ├── viz.py # Basic visualization functions
90 | ├── advanced_viz.py # Interactive and advanced visualizations
91 | ├── utils.py # Utility functions (validation, scaling, etc.)
92 | ├── requirements.txt # Python dependencies
93 | ├── TODO.md # Development roadmap
94 | └── README.md # This documentation
95 | ```
96 |
97 | ## 📋 Requirements
98 |
99 | ### Core Dependencies
100 | - **streamlit** (>=1.28.0): Web application framework
101 | - **tensorflow** (>=2.13.0): Deep learning models
102 | - **scikit-learn** (>=1.3.0): Machine learning algorithms
103 | - **pandas** (>=2.0.0): Data manipulation
104 | - **numpy** (>=1.24.0): Numerical computing
105 |
106 | ### Visualization & Analysis
107 | - **plotly** (>=5.15.0): Interactive visualizations
108 | - **matplotlib** (>=3.7.0): Static plotting
109 | - **seaborn** (>=0.12.0): Statistical visualization
110 | - **statsmodels** (>=0.14.0): Statistical models
111 |
112 | ### Specialized Libraries
113 | - **xgboost** (>=1.7.0): Gradient boosting
114 | - **shap** (>=0.42.0): Explainable AI (optional)
115 | - **joblib** (>=1.3.0): Model serialization
116 | - **scipy** (>=1.11.0): Scientific computing
117 |
118 | ## 🎯 Model Capabilities
119 |
120 | ### Machine Learning Models
121 | | Model | Description | Best For |
122 | |-------|-------------|----------|
123 | | XGBoost | Gradient boosting with trees | High accuracy, feature importance |
124 | | Random Forest | Ensemble of decision trees | Robust, handles missing data |
125 | | Gradient Boosting | Sequential ensemble method | Competitive accuracy |
126 | | AdaBoost | Adaptive boosting | Binary classification adaptation |
127 | | SVR | Support Vector Regression | Non-linear relationships |
128 | | MLP | Neural network | Complex patterns |
129 |
130 | ### Deep Learning Models
131 | | Model | Description | Best For |
132 | |-------|-------------|----------|
133 | | LSTM | Long Short-Term Memory | Sequential dependencies |
134 | | Bidirectional LSTM | Forward + backward LSTM | Context-aware forecasting |
135 | | GRU | Gated Recurrent Units | Efficient sequential modeling |
136 |
137 | ### Statistical Models
138 | | Model | Description | Best For |
139 | |-------|-------------|----------|
140 | | ARIMA | AutoRegressive Integrated MA | Stationary time series |
141 | | SARIMA | Seasonal ARIMA | Seasonal patterns |
142 |
143 | ### Ensemble Methods
144 | | Model | Description | Best For |
145 | |-------|-------------|----------|
146 | | Voting Ensemble | Weighted average of models | Improved stability |
147 | | Stacking Ensemble | Meta-model on base predictions | Maximum accuracy |
148 |
149 | ## 📊 Evaluation Framework
150 |
151 | ### Performance Metrics
152 | - **MAE**: Mean Absolute Error - Average magnitude of errors
153 | - **RMSE**: Root Mean Squared Error - Penalizes large errors
154 | - **MAPE**: Mean Absolute Percentage Error - Scale-independent
155 | - **SMAPE**: Symmetric MAPE - Handles zero values
156 | - **MASE**: Mean Absolute Scaled Error - Compares to naive forecast
157 | - **MDA**: Mean Directional Accuracy - Direction prediction accuracy
158 |
159 | ### Statistical Analysis
160 | - **Normality Tests**: Shapiro-Wilk, Kolmogorov-Smirnov
161 | - **Autocorrelation**: ACF/PACF analysis for residuals
162 | - **Heteroscedasticity**: Breusch-Pagan, White tests
163 | - **Stationarity**: Augmented Dickey-Fuller test
164 |
165 | ## 🔬 Advanced Features
166 |
167 | ### SHAP Explainability
168 | - Global feature importance across all predictions
169 | - Local explanations for individual forecasts
170 | - Waterfall plots showing feature contributions
171 | - Summary plots for feature impact analysis
172 |
173 | ### Cross-Validation
174 | - Time series split validation
175 | - Rolling forecast evaluation
176 | - Performance stability assessment
177 | - Overfitting detection
178 |
179 | ### Feature Engineering
180 | - Automatic lag feature creation
181 | - Rolling statistics (mean, std, min, max)
182 | - Seasonal indicators
183 | - Calendar features (day of week, month, quarter)
184 |
185 | ## 🎨 User Interface
186 |
187 | ### Dashboard Layout
188 | - **Header**: Professional branding with gradient styling
189 | - **Sidebar**: Organized controls for data, preprocessing, and models
190 | - **Main Panel**: Metrics cards, visualizations, and results
191 | - **Export Section**: Download options for results and models
192 |
193 | ### Responsive Design
194 | - Mobile-friendly layout
195 | - Collapsible sidebar
196 | - Progressive disclosure of advanced options
197 | - Real-time feedback and progress indicators
198 |
199 | ## 🚀 Deployment Options
200 |
201 | ### Local Development
202 | ```bash
203 | streamlit run app_advanced.py --server.port 8501 --server.address 0.0.0.0
204 | ```
205 |
206 | ### Docker Deployment
207 | ```dockerfile
208 | FROM python:3.11-slim
209 | COPY . /app
210 | WORKDIR /app
211 | RUN pip install -r requirements.txt
212 | EXPOSE 8501
213 | CMD ["streamlit", "run", "app_advanced.py", "--server.address", "0.0.0.0"]
214 | ```
215 |
216 | ### Cloud Platforms
217 | - **Streamlit Cloud**: Direct deployment from GitHub
218 | - **Heroku**: Container-based deployment
219 | - **AWS/GCP/Azure**: Scalable cloud deployment
220 | - **Docker Hub**: Containerized distribution
221 |
222 | ## 🤝 Contributing
223 |
224 | We welcome contributions! Please follow these steps:
225 |
226 | 1. **Fork** the repository
227 | 2. **Create** a feature branch (`git checkout -b feature/AmazingFeature`)
228 | 3. **Commit** changes (`git commit -m 'Add AmazingFeature'`)
229 | 4. **Push** to branch (`git push origin feature/AmazingFeature`)
230 | 5. **Open** a Pull Request
231 |
232 | ### Development Guidelines
233 | - Follow PEP 8 style guidelines
234 | - Add docstrings to all functions
235 | - Include unit tests for new features
236 | - Update documentation for API changes
237 | - Ensure backward compatibility
238 |
239 | ## 📄 License
240 |
241 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
242 |
243 | ## 🙏 Acknowledgments
244 |
245 | - **Streamlit** for the amazing web app framework
246 | - **TensorFlow/Keras** for deep learning capabilities
247 | - **scikit-learn** for comprehensive ML algorithms
248 | - **Plotly** for interactive visualizations
249 | - **SHAP** for model explainability
250 | - **statsmodels** for statistical modeling
251 |
252 | ## 🔮 Future Roadmap
253 |
254 | ### Phase 1 (Completed)
255 | - ✅ 13 forecasting models implementation
256 | - ✅ Comprehensive evaluation metrics
257 | - ✅ Interactive visualizations
258 | - ✅ Professional UI/UX
259 |
260 | ### Phase 2 (In Progress)
261 | - 🔄 Prophet model integration
262 | - 🔄 Automated model selection
263 | - 🔄 Prediction intervals
264 | - 🔄 Multi-step forecasting
265 |
266 | ### Phase 3 (Planned)
267 | - 📋 Real-time forecasting dashboard
268 | - 📋 Anomaly detection system
269 | - 📋 Model deployment API
270 | - 📋 Database integration
271 | - 📋 Performance monitoring
272 |
273 | ## 📞 Support
274 |
275 | For questions, issues, or contributions:
276 |
277 | - **GitHub Issues**: Bug reports and feature requests
278 | - **Discussions**: General questions and community support
279 | - **Pull Requests**: Code contributions welcome
280 |
281 | ## 🎉 Getting Started
282 |
283 | Ready to forecast? Get started in minutes:
284 |
285 | 1. Clone the repo
286 | 2. Install dependencies
287 | 3. Run `streamlit run app_advanced.py`
288 | 4. Upload your data and start forecasting!
289 |
290 | ---
291 |
292 | **Built with ❤️ for the data science community**
293 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | import numpy as np
4 | from data_loader import load_data, preprocess_data, create_sample_data
5 | from models import train_xgboost, train_random_forest, train_lstm, tune_hyperparameters
6 | from advanced_models import (train_gradient_boosting, train_ada_boost, train_svr, train_mlp,
7 | train_bidirectional_lstm, train_gru, train_arima, train_sarima,
8 | ensemble_forecast)
9 | from evaluation import evaluate_model
10 | from advanced_evaluation import comprehensive_evaluation, print_evaluation_report
11 | from viz import plot_forecast, plot_residuals, plot_distribution
12 | from advanced_viz import (plot_forecast_interactive, plot_residuals_analysis, plot_model_comparison,
13 | create_dashboard, plot_seasonal_decomposition)
14 | from config import DEFAULT_PARAMS, TUNING_GRIDS, DATA_SETTINGS, APP_SETTINGS
15 | from utils import (validate_data, scale_features, save_model, load_model,
16 | handle_missing_values, create_time_features, detect_outliers)
17 | import matplotlib.pyplot as plt
18 | import plotly.graph_objects as go
19 |
20 | # Set page configuration
21 | st.set_page_config(
22 | page_title="Advanced Time Series Forecasting",
23 | page_icon="📈",
24 | layout="wide",
25 | initial_sidebar_state="expanded"
26 | )
27 |
28 | # Custom CSS for better styling
29 | st.markdown("""
30 |
193 | """, unsafe_allow_html=True)
194 |
195 | # Main header with custom styling
196 | st.markdown('
Professional time series analysis with 11+ forecasting models and interactive visualizations
', unsafe_allow_html=True) 198 | 199 | # Sidebar for inputs 200 | st.sidebar.markdown('', unsafe_allow_html=True) 201 | uploaded_file = st.sidebar.file_uploader('Upload CSV file', type=['csv']) 202 | if uploaded_file is not None: 203 | df = load_data(uploaded_file) 204 | st.sidebar.success('✅ Data loaded successfully!') 205 | else: 206 | st.sidebar.info('ℹ️ Using sample data.') 207 | df = create_sample_data() 208 | 209 | st.sidebar.markdown('', unsafe_allow_html=True) 210 | lags = st.sidebar.slider('Number of lag features', 1, 10, 5) 211 | test_size = st.sidebar.slider('Test size', 0.1, 0.5, 0.2) 212 | 213 | X_train, X_test, y_train, y_test = preprocess_data(df, lags=lags, test_size=test_size) 214 | 215 | st.sidebar.markdown('', unsafe_allow_html=True) 216 | model_options = ['XGBoost', 'Random Forest', 'LSTM', 'Gradient Boosting', 'AdaBoost', 'SVR', 'MLP', 'Bidirectional LSTM', 'GRU', 'ARIMA', 'SARIMA'] 217 | model_choice = st.sidebar.selectbox('Choose model', model_options) 218 | 219 | st.sidebar.markdown('', unsafe_allow_html=True) 220 | if model_choice in DEFAULT_PARAMS: 221 | params = DEFAULT_PARAMS[model_choice].copy() 222 | for param_name, default_value in params.items(): 223 | if isinstance(default_value, int): 224 | params[param_name] = st.sidebar.slider(param_name, 1, 200, default_value) 225 | elif isinstance(default_value, float): 226 | params[param_name] = st.sidebar.slider(param_name, 0.001, 1.0, default_value) 227 | else: 228 | st.sidebar.warning(f"Default parameters not set for {model_choice}. Using basic settings.") 229 | params = {} 230 | 231 | tune = st.sidebar.checkbox('Tune hyperparameters?') 232 | if tune and model_choice in TUNING_GRIDS: 233 | param_grid = TUNING_GRIDS[model_choice] 234 | model, best_params = tune_hyperparameters(model_choice.lower().replace(' ', '_'), X_train, y_train, param_grid) 235 | st.sidebar.write('Best params:', best_params) 236 | else: 237 | if model_choice == 'XGBoost': 238 | model = train_xgboost(X_train, y_train, params) 239 | elif model_choice == 'Random Forest': 240 | model = train_random_forest(X_train, y_train, params) 241 | elif model_choice == 'LSTM': 242 | model = train_lstm(X_train, y_train, params) 243 | elif model_choice == 'Gradient Boosting': 244 | model = train_gradient_boosting(X_train, y_train, params) 245 | elif model_choice == 'AdaBoost': 246 | model = train_ada_boost(X_train, y_train, params) 247 | elif model_choice == 'SVR': 248 | model = train_svr(X_train, y_train, params) 249 | elif model_choice == 'MLP': 250 | model = train_mlp(X_train, y_train, params) 251 | elif model_choice == 'Bidirectional LSTM': 252 | model = train_bidirectional_lstm(X_train, y_train, params) 253 | elif model_choice == 'GRU': 254 | model = train_gru(X_train, y_train, params) 255 | elif model_choice == 'ARIMA': 256 | model = train_arima(y_train) 257 | elif model_choice == 'SARIMA': 258 | model = train_sarima(y_train) 259 | else: 260 | st.error(f"Model {model_choice} not implemented yet.") 261 | st.stop() 262 | 263 | # Train and predict 264 | if model_choice in ['LSTM', 'Bidirectional LSTM', 'GRU']: 265 | X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1])) 266 | y_pred = model.predict(X_test_reshaped).flatten() 267 | elif model_choice in ['ARIMA', 'SARIMA']: 268 | # For ARIMA/SARIMA, forecast the test period 269 | y_pred = model.forecast(steps=len(y_test)) 270 | y_pred.index = y_test.index 271 | else: 272 | y_pred = model.predict(X_test) 273 | 274 | # Evaluate 275 | metrics = evaluate_model(y_test, y_pred) 276 | 277 | # Display metrics in styled cards 278 | st.markdown('Professional time series analysis with 11+ forecasting models, ensemble methods, and AI-powered insights
', unsafe_allow_html=True) 214 | 215 | # Initialize session state for advanced features 216 | if 'models_trained' not in st.session_state: 217 | st.session_state.models_trained = {} 218 | if 'ensemble_models' not in st.session_state: 219 | st.session_state.ensemble_models = {} 220 | if 'predictions' not in st.session_state: 221 | st.session_state.predictions = {} 222 | 223 | # Sidebar for inputs 224 | st.sidebar.markdown('', unsafe_allow_html=True) 225 | uploaded_file = st.sidebar.file_uploader('Upload CSV file', type=['csv']) 226 | if uploaded_file is not None: 227 | df = load_data(uploaded_file) 228 | st.sidebar.success('✅ Data loaded successfully!') 229 | else: 230 | st.sidebar.info('ℹ️ Using sample data.') 231 | df = create_sample_data() 232 | 233 | st.sidebar.markdown('', unsafe_allow_html=True) 234 | lags = st.sidebar.slider('Number of lag features', 1, 20, 5) 235 | test_size = st.sidebar.slider('Test size', 0.1, 0.5, 0.2) 236 | scaling_method = st.sidebar.selectbox('Feature scaling', ['none', 'standard', 'minmax'], index=0) 237 | 238 | X_train, X_test, y_train, y_test = preprocess_data(df, lags=lags, test_size=test_size) 239 | 240 | # Apply scaling if selected 241 | if scaling_method != 'none': 242 | X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test, method=scaling_method) 243 | X_train, X_test = X_train_scaled, X_test_scaled 244 | 245 | st.sidebar.markdown('', unsafe_allow_html=True) 246 | model_options = ['XGBoost', 'Random Forest', 'LSTM', 'Gradient Boosting', 'AdaBoost', 'SVR', 'MLP', 247 | 'Bidirectional LSTM', 'GRU', 'ARIMA', 'SARIMA', 'Ensemble (Voting)', 'Ensemble (Stacking)'] 248 | model_choice = st.sidebar.selectbox('Choose model', model_options) 249 | 250 | # Advanced features 251 | st.sidebar.markdown('', unsafe_allow_html=True) 252 | enable_shap = st.sidebar.checkbox('Enable SHAP analysis', value=False, disabled=not SHAP_AVAILABLE) 253 | enable_cross_validation = st.sidebar.checkbox('Time series cross-validation', value=False) 254 | enable_feature_importance = st.sidebar.checkbox('Feature importance analysis', value=False) 255 | 256 | st.sidebar.markdown('', unsafe_allow_html=True) 257 | if model_choice in DEFAULT_PARAMS: 258 | params = DEFAULT_PARAMS[model_choice].copy() 259 | for param_name, default_value in params.items(): 260 | if isinstance(default_value, int): 261 | params[param_name] = st.sidebar.slider(param_name, 1, 200, default_value) 262 | elif isinstance(default_value, float): 263 | params[param_name] = st.sidebar.slider(param_name, 0.001, 1.0, default_value) 264 | else: 265 | st.sidebar.warning(f"Default parameters not set for {model_choice}. Using basic settings.") 266 | params = {} 267 | 268 | tune = st.sidebar.checkbox('Tune hyperparameters?') 269 | if tune and model_choice in TUNING_GRIDS: 270 | param_grid = TUNING_GRIDS[model_choice] 271 | model, best_params = tune_hyperparameters(model_choice.lower().replace(' ', '_'), X_train, y_train, param_grid) 272 | st.sidebar.write('Best params:', best_params) 273 | else: 274 | # Train individual models 275 | if model_choice == 'XGBoost': 276 | model = train_xgboost(X_train, y_train, params) 277 | elif model_choice == 'Random Forest': 278 | model = train_random_forest(X_train, y_train, params) 279 | elif model_choice == 'LSTM': 280 | model = train_lstm(X_train, y_train, params) 281 | elif model_choice == 'Gradient Boosting': 282 | model = train_gradient_boosting(X_train, y_train, params) 283 | elif model_choice == 'AdaBoost': 284 | model = train_ada_boost(X_train, y_train, params) 285 | elif model_choice == 'SVR': 286 | model = train_svr(X_train, y_train, params) 287 | elif model_choice == 'MLP': 288 | model = train_mlp(X_train, y_train, params) 289 | elif model_choice == 'Bidirectional LSTM': 290 | model = train_bidirectional_lstm(X_train, y_train, params) 291 | elif model_choice == 'GRU': 292 | model = train_gru(X_train, y_train, params) 293 | elif model_choice == 'ARIMA': 294 | model = train_arima(y_train) 295 | elif model_choice == 'SARIMA': 296 | model = train_sarima(y_train) 297 | elif model_choice == 'Ensemble (Voting)': 298 | if VOTING_AVAILABLE: 299 | # Train multiple models for ensemble 300 | models = [] 301 | model_names = ['XGBoost', 'Random Forest', 'Gradient Boosting'] 302 | 303 | for name in model_names: 304 | if name == 'XGBoost': 305 | m = train_xgboost(X_train, y_train, DEFAULT_PARAMS.get('XGBoost', {})) 306 | elif name == 'Random Forest': 307 | m = train_random_forest(X_train, y_train, DEFAULT_PARAMS.get('Random Forest', {})) 308 | elif name == 'Gradient Boosting': 309 | m = train_gradient_boosting(X_train, y_train, DEFAULT_PARAMS.get('Gradient Boosting', {})) 310 | models.append((name, m)) 311 | 312 | model = VotingRegressor(estimators=models) 313 | model.fit(X_train, y_train) 314 | else: 315 | st.error("VotingRegressor not available. Install scikit-learn.") 316 | st.stop() 317 | elif model_choice == 'Ensemble (Stacking)': 318 | # Implement stacking ensemble 319 | base_models = [] 320 | model_names = ['XGBoost', 'Random Forest', 'Gradient Boosting'] 321 | 322 | for name in model_names: 323 | if name == 'XGBoost': 324 | m = train_xgboost(X_train, y_train, DEFAULT_PARAMS.get('XGBoost', {})) 325 | elif name == 'Random Forest': 326 | m = train_random_forest(X_train, y_train, DEFAULT_PARAMS.get('Random Forest', {})) 327 | elif name == 'Gradient Boosting': 328 | m = train_gradient_boosting(X_train, y_train, DEFAULT_PARAMS.get('Gradient Boosting', {})) 329 | base_models.append(m) 330 | 331 | # Use XGBoost as meta model 332 | meta_model = train_xgboost(X_train, y_train, DEFAULT_PARAMS.get('XGBoost', {})) 333 | 334 | # Simple stacking implementation 335 | base_predictions = np.column_stack([m.predict(X_train) for m in base_models]) 336 | meta_model.fit(base_predictions, y_train) 337 | model = {'base_models': base_models, 'meta_model': meta_model} 338 | else: 339 | st.error(f"Model {model_choice} not implemented yet.") 340 | st.stop() 341 | 342 | # Train and predict 343 | if model_choice in ['LSTM', 'Bidirectional LSTM', 'GRU']: 344 | X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1])) 345 | y_pred = model.predict(X_test_reshaped).flatten() 346 | elif model_choice in ['ARIMA', 'SARIMA']: 347 | # For ARIMA/SARIMA, forecast the test period 348 | y_pred = model.forecast(steps=len(y_test)) 349 | y_pred.index = y_test.index 350 | elif model_choice == 'Ensemble (Stacking)': 351 | # Stacking prediction 352 | base_predictions = np.column_stack([m.predict(X_test) for m in model['base_models']]) 353 | y_pred = model['meta_model'].predict(base_predictions) 354 | else: 355 | y_pred = model.predict(X_test) 356 | 357 | # Evaluate 358 | metrics = evaluate_model(y_test, y_pred) 359 | 360 | # Display metrics in styled cards 361 | st.markdown('🚀 Advanced Time Series Forecasting Pro - Powered by AI & Machine Learning
', unsafe_allow_html=True) 606 | --------------------------------------------------------------------------------