PyTorch Dataset for multivariate time series

├── requirements.txt
├── README.md
├── .gitignore
└── data.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | <h1 align="center">PyTorch Dataset for multivariate time series</h1>
2 | <p>
3 | Custom PyTorch <code>Dataset</code> object for multivariate time series forecasting purposes.
4 |     It splits, preprocesses and frames the provided <code>pandas.DataFrame</code> and returns two <code>DataLoader</code> for training and testing.
5 | </p>
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | .idea/
106 | 


--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import torch
  4 | from torch.utils.data import TensorDataset, DataLoader
  5 | from sklearn.preprocessing import StandardScaler, OneHotEncoder
  6 | from sklearn.compose import ColumnTransformer
  7 | from sklearn.model_selection import TimeSeriesSplit
  8 | from typing import List, Optional, Tuple
  9 | 
 10 | 
 11 | class TimeSeriesDataset:
 12 |     """
 13 |     A class for preprocessing and loading time series data for pytorch models.
 14 | 
 15 |     attributes:
 16 |         data (pd.DataFrame): the input time series data.
 17 |         categorical_cols (List[str]): list of categorical column names.
 18 |         target_col (str): name of the target column.
 19 |         seq_length (int): length of the input sequence.
 20 |         prediction_window (int): length of the prediction window.
 21 |         numerical_cols (List[str]): list of numerical column names.
 22 |         preprocessor (ColumnTransformer): sklearn preprocessor for data transformation.
 23 |     """
 24 | 
 25 |     def __init__(self,
 26 |                  data: pd.DataFrame,
 27 |                  categorical_cols: List[str],
 28 |                  target_col: str,
 29 |                  seq_length: int,
 30 |                  prediction_window: int = 1):
 31 |         """
 32 |         Initialize the TimeSeriesDataset.
 33 | 
 34 |         args:
 35 |             data (pd.DataFrame): the input time series data.
 36 |             categorical_cols (List[str]): list of categorical column names.
 37 |             target_col (str): name of the target column.
 38 |             seq_length (int): length of the input sequence.
 39 |             prediction_window (int): length of the prediction window.
 40 |         """
 41 |         self.data = data
 42 |         self.categorical_cols = categorical_cols
 43 |         self.numerical_cols = list(set(data.columns) - set(categorical_cols) - {target_col})
 44 |         self.target_col = target_col
 45 |         self.seq_length = seq_length
 46 |         self.prediction_window = prediction_window
 47 |         self.preprocessor = None
 48 | 
 49 |     def preprocess_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
 50 |         """
 51 |         Preprocess the data using sklearn ColumnTransformer.
 52 | 
 53 |         returns:
 54 |             Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: preprocessed training and testing data.
 55 |         """
 56 |         X = self.data.drop(self.target_col, axis=1)
 57 |         y = self.data[self.target_col]
 58 | 
 59 |         self.preprocessor = ColumnTransformer(
 60 |             [("scaler", StandardScaler(), self.numerical_cols),
 61 |              ("encoder", OneHotEncoder(sparse=False, handle_unknown='ignore'), self.categorical_cols)],
 62 |             remainder="passthrough"
 63 |         )
 64 | 
 65 |         # use timeseriessplit for time series data :cite[c4]
 66 |         tscv = TimeSeriesSplit(n_splits=5)
 67 |         for train_index, test_index in tscv.split(X):
 68 |             X_train, X_test = X.iloc[train_index], X.iloc[test_index]
 69 |             y_train, y_test = y.iloc[train_index], y.iloc[test_index]
 70 | 
 71 |         X_train = self.preprocessor.fit_transform(X_train)
 72 |         X_test = self.preprocessor.transform(X_test)
 73 | 
 74 |         return X_train, X_test, y_train.values, y_test.values
 75 | 
 76 |     def frame_series(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> TensorDataset:
 77 |         """
 78 |         Create a TensorDataset from the input data.
 79 | 
 80 |         args:
 81 |             X (np.ndarray): input features.
 82 |             y (Optional[np.ndarray]): target values.
 83 | 
 84 |         returns:
 85 |             TensorDataset: dataset containing the framed series.
 86 |         """
 87 |         nb_obs, nb_features = X.shape
 88 |         features, target, y_hist = [], [], []
 89 | 
 90 |         for i in range(nb_obs - self.seq_length - self.prediction_window + 1):
 91 |             features.append(torch.FloatTensor(X[i:i + self.seq_length, :]))
 92 | 
 93 |         features_var = torch.stack(features)
 94 | 
 95 |         if y is not None:
 96 |             for i in range(nb_obs - self.seq_length - self.prediction_window + 1):
 97 |                 target.append(torch.FloatTensor(y[i + self.seq_length:i + self.seq_length + self.prediction_window]))
 98 |                 y_hist.append(
 99 |                     torch.FloatTensor(y[i + self.seq_length - 1:i + self.seq_length + self.prediction_window - 1]))
100 | 
101 |             target_var, y_hist_var = torch.stack(target), torch.stack(y_hist)
102 |             return TensorDataset(features_var, target_var, y_hist_var)
103 | 
104 |         return TensorDataset(features_var)
105 | 
106 |     def get_loaders(self, batch_size: int) -> Tuple[DataLoader, DataLoader]:
107 |         """
108 |         Create DataLoader objects for training and testing data.
109 | 
110 |         args:
111 |             batch_size (int): size of each batch.
112 | 
113 |         returns:
114 |             Tuple[DataLoader, DataLoader]: DataLoader objects for training and testing data.
115 |         """
116 |         X_train, X_test, y_train, y_test = self.preprocess_data()
117 |         train_dataset = self.frame_series(X_train, y_train)
118 |         test_dataset = self.frame_series(X_test, y_test)
119 | 
120 |         train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
121 |         test_iter = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
122 | 
123 |         return train_iter, test_iter
124 | 


--------------------------------------------------------------------------------