├── requirements.txt
├── README.md
├── .gitignore
└── data.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# Dependencies inferred from the imports in data.py
numpy
pandas
torch
scikit-learn>=1.2  # OneHotEncoder(sparse_output=...) requires 1.2+

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# PyTorch Dataset for multivariate time series


Custom PyTorch Dataset object for multivariate time series forecasting.
It splits and preprocesses the provided pandas.DataFrame, frames it into
fixed-length sequences, and returns two DataLoader objects: one for training
and one for testing.
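A minimal usage sketch (the DataFrame and column names below are illustrative
assumptions, not part of the module):

```python
import pandas as pd
from data import TimeSeriesDataset

df = pd.read_csv("my_series.csv")      # hypothetical time-ordered data

dataset = TimeSeriesDataset(
    data=df,
    categorical_cols=["weekday"],      # illustrative column name
    target_col="sales",                # illustrative column name
    seq_length=24,                     # look-back window
    prediction_window=1,               # forecast horizon
)
train_iter, test_iter = dataset.get_loaders(batch_size=32)

for features, target, y_hist in train_iter:
    # features: (batch, seq_length, n_features) after scaling / one-hot encoding
    # target, y_hist: (batch, prediction_window)
    ...
```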

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.idea/

--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit
from typing import List, Optional, Tuple


class TimeSeriesDataset:
    """
    A class for preprocessing and loading time series data for PyTorch models.

    Attributes:
        data (pd.DataFrame): The input time series data.
        categorical_cols (List[str]): List of categorical column names.
        target_col (str): Name of the target column.
        seq_length (int): Length of the input sequence.
        prediction_window (int): Length of the prediction window.
        numerical_cols (List[str]): List of numerical column names.
        preprocessor (ColumnTransformer): sklearn preprocessor for data transformation.
    """

    def __init__(self,
                 data: pd.DataFrame,
                 categorical_cols: List[str],
                 target_col: str,
                 seq_length: int,
                 prediction_window: int = 1):
        """
        Initialize the TimeSeriesDataset.

        Args:
            data (pd.DataFrame): The input time series data.
            categorical_cols (List[str]): List of categorical column names.
            target_col (str): Name of the target column.
            seq_length (int): Length of the input sequence.
            prediction_window (int): Length of the prediction window.
40 | """ 41 | self.data = data 42 | self.categorical_cols = categorical_cols 43 | self.numerical_cols = list(set(data.columns) - set(categorical_cols) - {target_col}) 44 | self.target_col = target_col 45 | self.seq_length = seq_length 46 | self.prediction_window = prediction_window 47 | self.preprocessor = None 48 | 49 | def preprocess_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: 50 | """ 51 | Preprocess the data using sklearn ColumnTransformer. 52 | 53 | returns: 54 | Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: preprocessed training and testing data. 55 | """ 56 | X = self.data.drop(self.target_col, axis=1) 57 | y = self.data[self.target_col] 58 | 59 | self.preprocessor = ColumnTransformer( 60 | [("scaler", StandardScaler(), self.numerical_cols), 61 | ("encoder", OneHotEncoder(sparse=False, handle_unknown='ignore'), self.categorical_cols)], 62 | remainder="passthrough" 63 | ) 64 | 65 | # use timeseriessplit for time series data :cite[c4] 66 | tscv = TimeSeriesSplit(n_splits=5) 67 | for train_index, test_index in tscv.split(X): 68 | X_train, X_test = X.iloc[train_index], X.iloc[test_index] 69 | y_train, y_test = y.iloc[train_index], y.iloc[test_index] 70 | 71 | X_train = self.preprocessor.fit_transform(X_train) 72 | X_test = self.preprocessor.transform(X_test) 73 | 74 | return X_train, X_test, y_train.values, y_test.values 75 | 76 | def frame_series(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> TensorDataset: 77 | """ 78 | Create a TensorDataset from the input data. 79 | 80 | args: 81 | X (np.ndarray): input features. 82 | y (Optional[np.ndarray]): target values. 83 | 84 | returns: 85 | TensorDataset: dataset containing the framed series. 86 | """ 87 | nb_obs, nb_features = X.shape 88 | features, target, y_hist = [], [], [] 89 | 90 | for i in range(nb_obs - self.seq_length - self.prediction_window + 1): 91 | features.append(torch.FloatTensor(X[i:i + self.seq_length, :])) 92 | 93 | features_var = torch.stack(features) 94 | 95 | if y is not None: 96 | for i in range(nb_obs - self.seq_length - self.prediction_window + 1): 97 | target.append(torch.FloatTensor(y[i + self.seq_length:i + self.seq_length + self.prediction_window])) 98 | y_hist.append( 99 | torch.FloatTensor(y[i + self.seq_length - 1:i + self.seq_length + self.prediction_window - 1])) 100 | 101 | target_var, y_hist_var = torch.stack(target), torch.stack(y_hist) 102 | return TensorDataset(features_var, target_var, y_hist_var) 103 | 104 | return TensorDataset(features_var) 105 | 106 | def get_loaders(self, batch_size: int) -> Tuple[DataLoader, DataLoader]: 107 | """ 108 | Create DataLoader objects for training and testing data. 109 | 110 | args: 111 | batch_size (int): size of each batch. 112 | 113 | returns: 114 | Tuple[DataLoader, DataLoader]: DataLoader objects for training and testing data. 115 | """ 116 | X_train, X_test, y_train, y_test = self.preprocess_data() 117 | train_dataset = self.frame_series(X_train, y_train) 118 | test_dataset = self.frame_series(X_test, y_test) 119 | 120 | train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True) 121 | test_iter = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True) 122 | 123 | return train_iter, test_iter 124 | --------------------------------------------------------------------------------