├── requirements.txt
├── README.md
├── .gitignore
└── data.py
/requirements.txt:
--------------------------------------------------------------------------------
# core dependencies (inferred from data.py imports)
numpy
pandas
torch
scikit-learn>=1.2  # OneHotEncoder's sparse_output kwarg requires >= 1.2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PyTorch Dataset for multivariate time series

A custom PyTorch Dataset builder for multivariate time series forecasting.
It splits, preprocesses, and frames the provided `pandas.DataFrame`, and returns two `DataLoader`s, one for training and one for testing.
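## Usage

A minimal usage sketch. The DataFrame and its column names below are illustrative; `TimeSeriesDataset` and `get_loaders` are defined in `data.py`:

```python
import numpy as np
import pandas as pd
from data import TimeSeriesDataset

# Illustrative frame: two numerical features, one categorical, one target.
df = pd.DataFrame({
    "temperature": np.random.randn(500),
    "humidity": np.random.randn(500),
    "weekday": np.random.choice(["mon", "tue", "wed"], size=500),
    "load": np.random.randn(500),
})

dataset = TimeSeriesDataset(
    data=df,
    categorical_cols=["weekday"],
    target_col="load",
    seq_length=24,
    prediction_window=1,
)
train_iter, test_iter = dataset.get_loaders(batch_size=32)

# Each training batch yields (features, target, y_hist):
#   features: (batch, seq_length, n_features) after scaling/one-hot encoding
#   target:   (batch, prediction_window)
#   y_hist:   (batch, prediction_window), the targets lagged by one step
for features, target, y_hist in train_iter:
    print(features.shape, target.shape, y_hist.shape)
    break
```

Batches are served in chronological order (`shuffle=False`), which is the usual choice for forecasting data.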
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# PyCharm / IntelliJ
.idea/
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from torch.utils.data import DataLoader, TensorDataset


class TimeSeriesDataset:
    """
    Preprocess and load multivariate time series data for PyTorch models.

    Attributes:
        data (pd.DataFrame): The input time series data.
        categorical_cols (List[str]): List of categorical column names.
        numerical_cols (List[str]): List of numerical column names.
        target_col (str): Name of the target column.
        seq_length (int): Length of the input sequence.
        prediction_window (int): Length of the prediction window.
        preprocessor (ColumnTransformer): sklearn preprocessor for data transformation.
    """

    def __init__(self,
                 data: pd.DataFrame,
                 categorical_cols: List[str],
                 target_col: str,
                 seq_length: int,
                 prediction_window: int = 1):
        """
        Initialize the TimeSeriesDataset.

        Args:
            data (pd.DataFrame): The input time series data.
            categorical_cols (List[str]): List of categorical column names.
            target_col (str): Name of the target column.
            seq_length (int): Length of the input sequence.
            prediction_window (int): Length of the prediction window. Defaults to 1.
        """
        self.data = data
        self.categorical_cols = categorical_cols
        # Sort for a deterministic column order (a raw set difference is unordered).
        self.numerical_cols = sorted(set(data.columns) - set(categorical_cols) - {target_col})
        self.target_col = target_col
        self.seq_length = seq_length
        self.prediction_window = prediction_window
        self.preprocessor = None

    def preprocess_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Preprocess the data using an sklearn ColumnTransformer.

        Numerical columns are standardized and categorical columns are
        one-hot encoded. The transformer is fitted on the training split
        only, to avoid leaking test-set statistics.

        Returns:
            Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: X_train, X_test, y_train, y_test.
        """
        X = self.data.drop(self.target_col, axis=1)
        y = self.data[self.target_col]

        self.preprocessor = ColumnTransformer(
            [("scaler", StandardScaler(), self.numerical_cols),
             # sparse_output replaces the removed `sparse` kwarg (scikit-learn >= 1.2)
             ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), self.categorical_cols)],
            remainder="passthrough"
        )

        # Use TimeSeriesSplit so the test set always follows the training set
        # chronologically; only the final (largest) split is kept.
        tscv = TimeSeriesSplit(n_splits=5)
        train_index, test_index = list(tscv.split(X))[-1]
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        X_train = self.preprocessor.fit_transform(X_train)
        X_test = self.preprocessor.transform(X_test)

        return X_train, X_test, y_train.values, y_test.values

    def frame_series(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> TensorDataset:
        """
        Frame the series into overlapping sliding windows.

        Args:
            X (np.ndarray): Input features of shape (n_obs, n_features).
            y (Optional[np.ndarray]): Target values of shape (n_obs,).

        Returns:
            TensorDataset: (features, target, y_hist) if y is given, else (features,).
                features has shape (n_windows, seq_length, n_features); target and
                y_hist have shape (n_windows, prediction_window).
        """
        nb_obs, nb_features = X.shape
        features, target, y_hist = [], [], []

        for i in range(nb_obs - self.seq_length - self.prediction_window + 1):
            features.append(torch.FloatTensor(X[i:i + self.seq_length, :]))

        features_var = torch.stack(features)

        if y is not None:
            for i in range(nb_obs - self.seq_length - self.prediction_window + 1):
                # Target: the prediction_window values right after the input window.
                target.append(torch.FloatTensor(y[i + self.seq_length:i + self.seq_length + self.prediction_window]))
                # History: the same window lagged by one step (previous target values).
                y_hist.append(
                    torch.FloatTensor(y[i + self.seq_length - 1:i + self.seq_length + self.prediction_window - 1]))

            target_var, y_hist_var = torch.stack(target), torch.stack(y_hist)
            return TensorDataset(features_var, target_var, y_hist_var)

        return TensorDataset(features_var)

    def get_loaders(self, batch_size: int) -> Tuple[DataLoader, DataLoader]:
        """
        Create DataLoader objects for the training and testing data.

        Args:
            batch_size (int): Size of each batch.

        Returns:
            Tuple[DataLoader, DataLoader]: DataLoaders for training and testing.
        """
        X_train, X_test, y_train, y_test = self.preprocess_data()
        train_dataset = self.frame_series(X_train, y_train)
        test_dataset = self.frame_series(X_test, y_test)

        # shuffle=False preserves temporal order; drop_last keeps batch shapes fixed.
        train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
        test_iter = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

        return train_iter, test_iter
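
if __name__ == "__main__":
    # Minimal smoke test on synthetic data (column names here are illustrative
    # only): build the loaders and print the shape of one training batch.
    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        "feat_a": rng.normal(size=300),
        "feat_b": rng.normal(size=300),
        "category": rng.choice(["x", "y"], size=300),
        "target": rng.normal(size=300),
    })
    ds = TimeSeriesDataset(df, categorical_cols=["category"], target_col="target",
                           seq_length=12, prediction_window=2)
    train_iter, test_iter = ds.get_loaders(batch_size=16)
    features, target, y_hist = next(iter(train_iter))
    # Expected: torch.Size([16, 12, 4]) torch.Size([16, 2]) torch.Size([16, 2])
    # (4 features = 2 scaled numerical columns + 2 one-hot category columns)
    print(features.shape, target.shape, y_hist.shape)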
--------------------------------------------------------------------------------