├── .gitignore ├── README.md ├── conf ├── __init__.py ├── conf.py ├── electricity.yaml ├── traffic.yaml └── volatility.yaml ├── data_formatters ├── __init__.py ├── base.py ├── electricity.py ├── favorita.py ├── traffic.py ├── utils.py └── volatility.py ├── dataset ├── __init__.py └── ts_dataset.py ├── env.yml ├── inference.py ├── main.py ├── models ├── temporal_fusion_t │ ├── __init__.py │ ├── add_and_norm.py │ ├── base.py │ ├── gated_linear_unit.py │ ├── gated_residual_network.py │ ├── interpretable_multi_head_attention.py │ ├── linear_layer.py │ ├── lstm_combine_and_mask.py │ ├── scaled_dot_product_attention.py │ ├── static_combine_and_mask.py │ ├── tft_model.py │ └── time_distributed.py ├── transformer │ ├── __init__.py │ ├── decoder.py │ ├── encoder.py │ ├── loss.py │ ├── multiHeadAttention.py │ ├── positionwiseFeedForward.py │ ├── transformer.py │ └── utils.py └── transformer_grn │ ├── __init__.py │ ├── decoder.py │ ├── encoder.py │ ├── loss.py │ ├── multiHeadAttention.py │ ├── positionwiseFeedForward.py │ ├── transformer.py │ └── utils.py ├── progress_bar.py ├── requirements.txt ├── scheduler.py ├── slurm.py ├── slurm └── Traffic_5TR.sh ├── trainer.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | ### Pycharm stuffs 10 | .idea/ 11 | log/* 12 | *__pycache__/ 13 | 14 | ### Pytorch model weights 15 | *.pth 16 | 17 | ### Dataset 18 | data/* 19 | 20 | ### Temp files 21 | tmp/* 22 | 23 | ### images 24 | *.jpg 25 | *.jpeg 26 | *.png 27 | *.tif 28 | *.tiff 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | pip-wheel-metadata/ 45 | share/python-wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | MANIFEST 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .nox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *.cover 71 | *.py,cover 72 | .hypothesis/ 73 | .pytest_cache/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | 95 | # PyBuilder 96 | target/ 97 | 98 | # Jupyter Notebook 99 | .ipynb_checkpoints 100 | 101 | # IPython 102 | profile_default/ 103 | ipython_config.py 104 | 105 | # pyenv 106 | .python-version 107 | 108 | # pipenv 109 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 110 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 111 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 112 | # install all needed dependencies. 113 | #Pipfile.lock 114 | 115 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🐐 Time Series Forecasting 2 | Time series forecasting models 3 | 4 | ## Set-up 5 | 6 | - Install the required packages (pip or conda) 7 | - `pip install -r requirements.txt` 8 | - `conda env create -f env.yml` 9 | 10 | - Download data 11 | - https://drive.google.com/file/d/1Na7e2yJy1Oix8-HcKQS97u1VZodpZ-OZ/view?usp=sharing 12 | 13 | - Train and test on the electricity dataset 14 | - `python ./main.py --exp_name electricity --conf_file_path ./conf/electricity.yaml` 15 | 16 | - Plot predictions on the test set 17 | - `python ./main.py --exp_name electricity --conf_file_path ./conf/electricity.yaml --inference=True` 18 | 19 | 20 | ## Models 21 | 22 | - Temporal fusion transformer
23 | https://arxiv.org/pdf/1912.09363.pdf 24 |
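All three quantiles in the configs (0.1, 0.5, 0.9) are trained with the pinball (quantile) loss from the TFT paper; the repo's implementation is `pytorch_quantile_loss` in `data_formatters/utils.py`. A minimal sketch of the idea (the function name here is illustrative):

```python
import torch

def quantile_loss(y, y_pred, quantile):
    # Under-prediction is penalised by `quantile`,
    # over-prediction by (1 - quantile).
    diff = y - y_pred
    return torch.sum(quantile * torch.clamp(diff, min=0.)
                     + (1. - quantile) * torch.clamp(-diff, min=0.), dim=-1)
```

Usage: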
26 | - `model: tf_transformer` 27 | 28 | - Transformer
29 | https://arxiv.org/pdf/1706.03762.pdf
30 | https://pytorch.org/tutorials/beginner/transformer_tutorial.html 31 |
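The `pe: original` option in the configs corresponds to the fixed sinusoidal encoding from the paper above. A minimal sketch, assuming an even `d_model` (the configs use 64):

```python
import math
import torch

def sinusoidal_pe(seq_len, d_model):
    # Sine on even dimensions, cosine on odd ones.
    pos = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)
    div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float)
                    * (-math.log(10000.0) / d_model))
    pe = torch.zeros(seq_len, d_model)
    pe[:, 0::2] = torch.sin(pos * div)
    pe[:, 1::2] = torch.cos(pos * div)
    return pe  # (seq_len, d_model), added to the input embeddings
```

Usage: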
33 | - `model: transformer` 34 | 35 | - GRN-Transformer
36 | Uses a GRN block after multi-head attention to encode static variables; a condensed sketch of the block follows. 37 |
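A GRN is a dense layer plus ELU, a second dense layer, and a GLU gate, wrapped in a residual connection with layer normalisation; the static context enters as an extra additive input. A condensed sketch (names and sizes are illustrative; the repo keeps its version in `models/temporal_fusion_t/gated_residual_network.py`):

```python
import torch.nn as nn
import torch.nn.functional as F

class GRN(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_model)
        self.ctx = nn.Linear(d_model, d_model, bias=False)  # static context projection
        self.fc2 = nn.Linear(d_model, 2 * d_model)  # doubled width for the GLU split
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, a, c=None):
        # Optional static context is added before the nonlinearity.
        h = self.fc1(a) if c is None else self.fc1(a) + self.ctx(c)
        h = self.dropout(self.fc2(F.elu(h)))
        # The GLU halves the width back to d_model and gates the update.
        return self.norm(a + F.glu(h, dim=-1))
```

Usage: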
- `model: grn_transformer` -------------------------------------------------------------------------------- /conf/__init__.py: -------------------------------------------------------------------------------- 1 | from conf.conf import Conf -------------------------------------------------------------------------------- /conf/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # --------------------- 3 | 4 | import os 5 | 6 | PYTHONPATH = '..:.' 7 | if os.environ.get('PYTHONPATH', default=None) is None: 8 | os.environ['PYTHONPATH'] = PYTHONPATH 9 | else: 10 | os.environ['PYTHONPATH'] += (':' + PYTHONPATH) 11 | 12 | import yaml 13 | import socket 14 | import random 15 | import torch 16 | import numpy as np 17 | from path import Path 18 | from typing import Optional 19 | import termcolor 20 | from datetime import datetime 21 | 22 | 23 | def set_seed(seed=None): 24 | # type: (Optional[int]) -> int 25 | """ 26 | set the random seed using the required value (`seed`) 27 | or a random value if `seed` is `None` 28 | :return: the newly set seed 29 | """ 30 | if seed is None: 31 | seed = random.randint(1, 10000) 32 | random.seed(seed) 33 | torch.manual_seed(seed) 34 | np.random.seed(seed) 35 | return seed 36 | 37 | 38 | class Conf(object): 39 | HOSTNAME = socket.gethostname() 40 | LOG_PATH = Path('./logs/') 41 | 42 | def __init__(self, conf_file_path=None, seed=None, exp_name=None, log=True): 43 | # type: (str, int, str, bool) -> None 44 | """ 45 | :param conf_file_path: optional path of the configuration file 46 | :param seed: desired seed for the RNG; if `None`, it will be chosen randomly 47 | :param exp_name: name of the experiment 48 | :param log: `True` if you want to log each step; `False` otherwise 49 | """ 50 | self.exp_name = exp_name 51 | self.log_each_step = log 52 | 53 | # print project name and host name 54 | self.project_name = Path(__file__).parent.parent.basename() 55 | m_str = f'┃ {self.project_name}@{Conf.HOSTNAME} ┃' 56 | u_str = '┏' + '━' * (len(m_str) - 2) + '┓' 57 | b_str = '┗' + '━' * (len(m_str) - 2) + '┛' 58 | print(u_str + '\n' + m_str + '\n' + b_str) 59 | 60 | # define output paths 61 | self.project_log_path = Path('./log') 62 | 63 | # set random seed 64 | self.seed = set_seed(seed) # type: int 65 | 66 | self.keys_to_hide = list(self.__dict__.keys()) + ['keys_to_hide'] 67 | 68 | # if the configuration file is not specified 69 | # try to load a configuration file based on the experiment name 70 | tmp = Path(__file__).parent / (self.exp_name + '.yaml') 71 | if conf_file_path is None and tmp.exists(): 72 | conf_file_path = tmp 73 | 74 | # read the YAML configuration file 75 | if conf_file_path is None: 76 | y = {} 77 | else: 78 | with open(conf_file_path, 'r') as conf_file: 79 | y = yaml.load(conf_file, Loader=yaml.Loader) 80 | 81 | # read configuration parameters from YAML file 82 | # or set their default value 83 | self.lr = y.get('lr', 0.0001) # type: float 84 | self.epochs = y.get('num_epochs', 100) # type: int 85 | self.n_workers = y.get('n_workers', 1) # type: int 86 | self.batch_size = y.get('batch_size', 64) # type: int 87 | self.quantiles = y.get('quantiles', [0.1, 0.5, 0.9]) # type: list 88 | self.ds_name = y.get('ds_name', "electricity") # type: str 89 | self.all_params = y # type: dict 90 | 91 | self.exp_log_path = self.project_log_path / self.all_params["model"] / exp_name / datetime.now().strftime( 92 | "%m-%d-%Y - %H-%M-%S") 93 | 94 | default_device = 'cuda' if torch.cuda.is_available() else
'cpu' 95 | self.device = y.get('DEVICE', default_device) # type: str 96 | 97 | def write_to_file(self, out_file_path): 98 | # type: (str) -> None 99 | """ 100 | Writes configuration parameters to `out_file_path` 101 | :param out_file_path: path of the output file 102 | """ 103 | import re 104 | 105 | ansi_escape = re.compile(r'\x1B\[[0-?]*[ -/]*[@-~]') 106 | text = ansi_escape.sub('', str(self)) 107 | with open(out_file_path, 'w') as out_file: 108 | print(text, file=out_file) 109 | 110 | def __str__(self): 111 | # type: () -> str 112 | out_str = '' 113 | for key in self.__dict__: 114 | if key in self.keys_to_hide: 115 | continue 116 | value = self.__dict__[key] 117 | if type(value) is Path or type(value) is str: 118 | value = value.replace(Conf.LOG_PATH, '$LOG_PATH') 119 | value = termcolor.colored(value, 'yellow') 120 | else: 121 | value = termcolor.colored(f'{value}', 'magenta') 122 | out_str += termcolor.colored(f'{key.upper()}', 'blue') 123 | out_str += termcolor.colored(': ', 'red') 124 | out_str += value 125 | out_str += '\n' 126 | return out_str[:-1] 127 | 128 | def no_color_str(self): 129 | # type: () -> str 130 | out_str = '' 131 | for key in self.__dict__: 132 | value = self.__dict__[key] 133 | if type(value) is Path or type(value) is str: 134 | value = value.replace(Conf.LOG_PATH, '$LOG_PATH') 135 | out_str += f'{key.upper()}: {value}\n' 136 | return out_str[:-1] 137 | 138 | 139 | def show_default_params(): 140 | """ 141 | Print default configuration parameters 142 | """ 143 | cnf = Conf(exp_name='default') 144 | print(f'\nDefault configuration parameters: \n{cnf}') 145 | 146 | 147 | if __name__ == '__main__': 148 | show_default_params() 149 | -------------------------------------------------------------------------------- /conf/electricity.yaml: -------------------------------------------------------------------------------- 1 | #Hyper Params 2 | batch_size: 64 3 | device: cuda 4 | lr: 0.001 5 | num_epochs: 20 6 | n_workers: 0 7 | model: transformer 8 | loader: base 9 | 10 | # Dataset 11 | ds_name: electricity 12 | train_samples: 450000 13 | test_samples: 50000 14 | val_samples: 50000 15 | input_size: 5 16 | output_size: 1 17 | total_time_steps: 192 18 | num_encoder_steps: 168 19 | static_input_loc: 20 | - 4 21 | input_obs_loc: 22 | - 0 23 | known_categorical_inputs: 24 | - 0 25 | known_regular_inputs: 26 | - 1 27 | - 2 28 | - 3 29 | category_counts: 30 | - 369 31 | 32 | # Model Temporal Fusion Transformer 33 | quantiles: 34 | - 0.1 35 | - 0.5 36 | - 0.9 37 | batch_first: true 38 | early_stopping_patience: 5 39 | hidden_layer_size: 160 40 | stack_size: 1 41 | dropout_rate: 0.1 42 | max_gradient_norm: 0.01 43 | num_heads: 4 44 | 45 | # Model Transformer 46 | d_model: 64 47 | q: 16 48 | v: 16 49 | h: 4 50 | N: 2 51 | attention_size: 0 52 | dropout: 0.1 53 | pe: original 54 | chunk_mode: None 55 | d_input: 5 56 | d_output: 3 57 | -------------------------------------------------------------------------------- /conf/traffic.yaml: -------------------------------------------------------------------------------- 1 | # Hyper Params 2 | batch_size: 128 3 | device: cuda 4 | lr: 0.001 5 | num_epochs: 100 6 | n_workers: 0 7 | model: tf_transformer 8 | 9 | # Dataset 10 | ds_name: traffic 11 | train_samples: 10000 12 | test_samples: 1000 13 | val_samples: 1000 14 | input_size: 5 15 | output_size: 1 16 | total_time_steps: 192 17 | num_encoder_steps: 168 18 | static_input_loc: 19 | - 4 20 | input_obs_loc: 21 | - 0 22 | known_categorical_inputs: 23 | - 0 24 | known_regular_inputs: 25 | - 1 26 | 
- 2 27 | - 3 28 | category_counts: 29 | - 963 30 | 31 | # Model Temporal Fusion Transformer 32 | quantiles: 33 | - 0.1 34 | - 0.5 35 | - 0.9 36 | batch_first: true 37 | early_stopping_patience: 5 38 | hidden_layer_size: 320 39 | stack_size: 1 40 | dropout_rate: 0.3 41 | max_gradient_norm: 100.0 42 | num_heads: 4 43 | multiprocessing_workers: 5 44 | 45 | # Model Transformer 46 | d_model: 64 47 | q: 16 48 | v: 16 49 | h: 4 50 | N: 2 51 | attention_size: 0 52 | dropout: 0.1 53 | pe: original 54 | chunk_mode: None 55 | d_input: 5 56 | d_output: 3 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /conf/volatility.yaml: -------------------------------------------------------------------------------- 1 | batch_first: true 2 | batch_size: 64 3 | train_samples: 10000 4 | test_samples: 1000 5 | val_samples: 1000 6 | category_counts: 7 | - 7 8 | - 31 9 | - 53 10 | - 12 11 | - 4 12 | device: cuda 13 | dropout_rate: 0.3 14 | ds_name: volatility 15 | early_stopping_patience: 5 16 | hidden_layer_size: 160 17 | input_obs_loc: 18 | - 0 19 | input_size: 8 20 | known_categorical_inputs: 21 | - 0 22 | - 1 23 | - 2 24 | - 3 25 | - 4 26 | known_regular_inputs: 27 | - 2 28 | lr: 0.0001 29 | max_gradient_norm: 0.01 30 | n_workers: 0 31 | num_encoder_steps: 252 32 | num_epochs: 100 33 | num_heads: 1 34 | output_size: 1 35 | quantiles: 36 | - 0.1 37 | - 0.5 38 | - 0.9 39 | stack_size: 1 40 | static_input_loc: 41 | - 7 42 | total_time_steps: 257 43 | -------------------------------------------------------------------------------- /data_formatters/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from data_formatters.utils import make_data_formatter, csv_path_to_folder 17 | from data_formatters import volatility, electricity, favorita, traffic 18 | -------------------------------------------------------------------------------- /data_formatters/base.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python3 17 | """Default data formatting functions for experiments. 
18 | 19 | For new datasets, inherit from GenericDataFormatter and implement 20 | all abstract functions. 21 | 22 | These dataset-specific methods: 23 | 1) Define the column and input types for tabular dataframes used by the model 24 | 2) Perform the necessary input feature engineering & normalisation steps 25 | 3) Revert the normalisation for predictions 26 | 4) Are responsible for train, validation and test splits 27 | 28 | 29 | """ 30 | 31 | import abc 32 | import enum 33 | 34 | 35 | # Type definitions 36 | class DataTypes(enum.IntEnum): 37 | """Defines numerical types of each column.""" 38 | REAL_VALUED = 0 39 | CATEGORICAL = 1 40 | DATE = 2 41 | 42 | 43 | class InputTypes(enum.IntEnum): 44 | """Defines input types of each column.""" 45 | TARGET = 0 46 | OBSERVED_INPUT = 1 47 | KNOWN_INPUT = 2 48 | STATIC_INPUT = 3 49 | ID = 4 # Single column used as an entity identifier 50 | TIME = 5 # Single column exclusively used as a time index 51 | 52 | 53 | class GenericDataFormatter(abc.ABC): 54 | """Abstract base class for all data formatters. 55 | 56 | User can implement the abstract methods below to perform dataset-specific 57 | manipulations. 58 | 59 | """ 60 | 61 | @abc.abstractmethod 62 | def set_scalers(self, df): 63 | """Calibrates scalers using the data supplied.""" 64 | raise NotImplementedError() 65 | 66 | @abc.abstractmethod 67 | def transform_inputs(self, df): 68 | """Performs feature transformation.""" 69 | raise NotImplementedError() 70 | 71 | @abc.abstractmethod 72 | def format_predictions(self, df): 73 | """Reverts any normalisation to give predictions in original scale.""" 74 | raise NotImplementedError() 75 | 76 | @abc.abstractmethod 77 | def split_data(self, df): 78 | """Performs the default train, validation and test splits.""" 79 | raise NotImplementedError() 80 | 81 | @property 82 | @abc.abstractmethod 83 | def _column_definition(self): 84 | """Defines order, input type and data type of each column.""" 85 | raise NotImplementedError() 86 | 87 | @abc.abstractmethod 88 | def get_fixed_params(self): 89 | """Defines the fixed parameters used by the model for training. 90 | 91 | Requires the following keys: 92 | 'total_time_steps': Defines the total number of time steps used by TFT 93 | 'num_encoder_steps': Determines length of LSTM encoder (i.e. history) 94 | 'num_epochs': Maximum number of epochs for training 95 | 'early_stopping_patience': Early stopping param 96 | 'multiprocessing_workers': # of cpus for data processing 97 | 98 | 99 | Returns: 100 | A dictionary of fixed parameters, e.g.: 101 | 102 | fixed_params = { 103 | 'total_time_steps': 252 + 5, 104 | 'num_encoder_steps': 252, 105 | 'num_epochs': 100, 106 | 'early_stopping_patience': 5, 107 | 'multiprocessing_workers': 5, 108 | } 109 | """ 110 | raise NotImplementedError 111 | 112 | # Shared functions across data-formatters 113 | @property 114 | def num_classes_per_cat_input(self): 115 | """Returns number of categories per relevant input. 116 | 117 | This is subsequently required for the embedding layers. 118 | """ 119 | return self._num_classes_per_cat_input 120 | 121 | def get_num_samples_for_calibration(self): 122 | """Gets the default number of training and validation samples. 123 | 124 | Use to sub-sample the data for network calibration and a value of -1 uses 125 | all available samples.
126 | 127 | Returns: 128 | Tuple of (training samples, validation samples) 129 | """ 130 | return -1, -1 131 | 132 | def get_column_definition(self): 133 | """"Returns formatted column definition in order expected by the TFT.""" 134 | 135 | column_definition = self._column_definition 136 | 137 | # Sanity checks first. 138 | # Ensure only one ID and time column exist 139 | def _check_single_column(input_type): 140 | 141 | length = len([tup for tup in column_definition if tup[2] == input_type]) 142 | 143 | if length != 1: 144 | raise ValueError('Illegal number of inputs ({}) of type {}'.format( 145 | length, input_type)) 146 | 147 | _check_single_column(InputTypes.ID) 148 | _check_single_column(InputTypes.TIME) 149 | 150 | identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID] 151 | time = [tup for tup in column_definition if tup[2] == InputTypes.TIME] 152 | real_inputs = [ 153 | tup for tup in column_definition if tup[1] == DataTypes.REAL_VALUED and 154 | tup[2] not in {InputTypes.ID, InputTypes.TIME} 155 | ] 156 | categorical_inputs = [ 157 | tup for tup in column_definition if tup[1] == DataTypes.CATEGORICAL and 158 | tup[2] not in {InputTypes.ID, InputTypes.TIME} 159 | ] 160 | 161 | return identifier + time + real_inputs + categorical_inputs 162 | 163 | def _get_input_columns(self): 164 | """Returns names of all input columns.""" 165 | return [ 166 | tup[0] 167 | for tup in self.get_column_definition() 168 | if tup[2] not in {InputTypes.ID, InputTypes.TIME} 169 | ] 170 | 171 | def _get_tft_input_indices(self): 172 | """Returns the relevant indexes and input sizes required by TFT.""" 173 | 174 | # Functions 175 | def _extract_tuples_from_data_type(data_type, defn): 176 | return [ 177 | tup for tup in defn if tup[1] == data_type and 178 | tup[2] not in {InputTypes.ID, InputTypes.TIME} 179 | ] 180 | 181 | def _get_locations(input_types, defn): 182 | return [i for i, tup in enumerate(defn) if tup[2] in input_types] 183 | 184 | # Start extraction 185 | column_definition = [ 186 | tup for tup in self.get_column_definition() 187 | if tup[2] not in {InputTypes.ID, InputTypes.TIME} 188 | ] 189 | 190 | categorical_inputs = _extract_tuples_from_data_type(DataTypes.CATEGORICAL, 191 | column_definition) 192 | real_inputs = _extract_tuples_from_data_type(DataTypes.REAL_VALUED, 193 | column_definition) 194 | 195 | locations = { 196 | 'input_size': 197 | len(self._get_input_columns()), 198 | 'output_size': 199 | len(_get_locations({InputTypes.TARGET}, column_definition)), 200 | 'category_counts': 201 | self.num_classes_per_cat_input, 202 | 'input_obs_loc': 203 | _get_locations({InputTypes.TARGET}, column_definition), 204 | 'static_input_loc': 205 | _get_locations({InputTypes.STATIC_INPUT}, column_definition), 206 | 'known_regular_inputs': 207 | _get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, 208 | real_inputs), 209 | 'known_categorical_inputs': 210 | _get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, 211 | categorical_inputs), 212 | } 213 | 214 | return locations 215 | 216 | def get_experiment_params(self): 217 | """Returns fixed model parameters for experiments.""" 218 | 219 | required_keys = [ 220 | 'total_time_steps', 'num_encoder_steps', 'num_epochs', 221 | 'early_stopping_patience', 'multiprocessing_workers' 222 | ] 223 | 224 | fixed_params = self.get_fixed_params() 225 | 226 | for k in required_keys: 227 | if k not in fixed_params: 228 | raise ValueError('Field {}'.format(k) + 229 | ' missing from fixed parameter definitions!') 230 | 231 | 
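# All required keys are present at this point; the ordered column definition
# and the TFT input-index map are folded in below so that callers receive a
# single flat parameter dictionary.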
fixed_params['column_definition'] = self.get_column_definition() 232 | 233 | fixed_params.update(self._get_tft_input_indices()) 234 | 235 | return fixed_params 236 | 237 | -------------------------------------------------------------------------------- /data_formatters/electricity.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python3 17 | """Custom formatting functions for Electricity dataset. 18 | 19 | Defines dataset specific column definitions and data transformations. Uses 20 | entity specific z-score normalization. 21 | """ 22 | 23 | import data_formatters.base 24 | import data_formatters.utils as utils 25 | import pandas as pd 26 | import sklearn.preprocessing 27 | 28 | GenericDataFormatter = data_formatters.base.GenericDataFormatter 29 | DataTypes = data_formatters.base.DataTypes 30 | InputTypes = data_formatters.base.InputTypes 31 | 32 | 33 | class ElectricityFormatter(GenericDataFormatter): 34 | """Defines and formats data for the electricity dataset. 35 | 36 | Note that per-entity z-score normalization is used here, and is implemented 37 | across functions. 38 | 39 | Attributes: 40 | column_definition: Defines input and data type of column used in the 41 | experiment. 42 | identifiers: Entity identifiers used in experiments. 43 | """ 44 | 45 | _column_definition = [ 46 | ('id', DataTypes.REAL_VALUED, InputTypes.ID), 47 | ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME), 48 | ('power_usage', DataTypes.REAL_VALUED, InputTypes.TARGET), 49 | ('hour', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 50 | ('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 51 | ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 52 | ('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 53 | ] 54 | 55 | def __init__(self): 56 | """Initialises formatter.""" 57 | 58 | self.identifiers = None 59 | self._real_scalers = None 60 | self._cat_scalers = None 61 | self._target_scaler = None 62 | self._num_classes_per_cat_input = None 63 | self._time_steps = self.get_fixed_params()['total_time_steps'] 64 | 65 | def split_data(self, df, valid_boundary=1315, test_boundary=1339): 66 | """Splits data frame into training-validation-test data frames. 67 | 68 | This also calibrates scaling object, and transforms data for each split. 69 | 70 | Args: 71 | df: Source data frame to split. 72 | valid_boundary: Starting day (days_from_start) for validation data 73 | test_boundary: Starting day (days_from_start) for test data 74 | 75 | Returns: 76 | Tuple of transformed (train, valid, test) data.
77 | """ 78 | 79 | print('Formatting train-valid-test splits.') 80 | 81 | index = df['days_from_start'] 82 | train = df.loc[index < valid_boundary] 83 | valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)] 84 | test = df.loc[index >= test_boundary - 7] 85 | 86 | self.set_scalers(train) 87 | 88 | return (self.transform_inputs(data) for data in [train, valid, test]) 89 | 90 | def set_scalers(self, df): 91 | """Calibrates scalers using the data supplied. 92 | 93 | Args: 94 | df: Data to use to calibrate scalers. 95 | """ 96 | print('Setting scalers with training data...') 97 | 98 | column_definitions = self.get_column_definition() 99 | id_column = utils.get_single_col_by_input_type(InputTypes.ID, 100 | column_definitions) 101 | target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, 102 | column_definitions) 103 | 104 | # Format real scalers 105 | real_inputs = utils.extract_cols_from_data_type( 106 | DataTypes.REAL_VALUED, column_definitions, 107 | {InputTypes.ID, InputTypes.TIME}) 108 | 109 | # Initialise scaler caches 110 | self._real_scalers = {} 111 | self._target_scaler = {} 112 | identifiers = [] 113 | for identifier, sliced in df.groupby(id_column): 114 | 115 | if len(sliced) >= self._time_steps: 116 | 117 | data = sliced[real_inputs].values 118 | targets = sliced[[target_column]].values 119 | self._real_scalers[identifier] \ 120 | = sklearn.preprocessing.StandardScaler().fit(data) 121 | 122 | self._target_scaler[identifier] \ 123 | = sklearn.preprocessing.StandardScaler().fit(targets) 124 | identifiers.append(identifier) 125 | 126 | # Format categorical scalers 127 | categorical_inputs = utils.extract_cols_from_data_type( 128 | DataTypes.CATEGORICAL, column_definitions, 129 | {InputTypes.ID, InputTypes.TIME}) 130 | 131 | categorical_scalers = {} 132 | num_classes = [] 133 | for col in categorical_inputs: 134 | # Set all to str so that we don't have mixed integer/string columns 135 | srs = df[col].apply(str) 136 | categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit( 137 | srs.values) 138 | num_classes.append(srs.nunique()) 139 | 140 | # Set categorical scaler outputs 141 | self._cat_scalers = categorical_scalers 142 | self._num_classes_per_cat_input = num_classes 143 | 144 | # Extract identifiers in case required 145 | self.identifiers = identifiers 146 | 147 | def transform_inputs(self, df): 148 | """Performs feature transformations. 149 | 150 | This includes both feature engineering, preprocessing and normalisation. 151 | 152 | Args: 153 | df: Data frame to transform. 154 | 155 | Returns: 156 | Transformed data frame. 
157 | 158 | """ 159 | 160 | if self._real_scalers is None and self._cat_scalers is None: 161 | raise ValueError('Scalers have not been set!') 162 | 163 | # Extract relevant columns 164 | column_definitions = self.get_column_definition() 165 | id_col = utils.get_single_col_by_input_type(InputTypes.ID, 166 | column_definitions) 167 | real_inputs = utils.extract_cols_from_data_type( 168 | DataTypes.REAL_VALUED, column_definitions, 169 | {InputTypes.ID, InputTypes.TIME}) 170 | categorical_inputs = utils.extract_cols_from_data_type( 171 | DataTypes.CATEGORICAL, column_definitions, 172 | {InputTypes.ID, InputTypes.TIME}) 173 | 174 | # Transform real inputs per entity 175 | df_list = [] 176 | for identifier, sliced in df.groupby(id_col): 177 | 178 | # Filter out any trajectories that are too short 179 | if len(sliced) >= self._time_steps: 180 | sliced_copy = sliced.copy() 181 | sliced_copy[real_inputs] = self._real_scalers[identifier].transform( 182 | sliced_copy[real_inputs].values) 183 | df_list.append(sliced_copy) 184 | 185 | output = pd.concat(df_list, axis=0) 186 | 187 | # Format categorical inputs 188 | for col in categorical_inputs: 189 | string_df = df[col].apply(str) 190 | output[col] = self._cat_scalers[col].transform(string_df) 191 | 192 | return output 193 | 194 | def format_predictions(self, predictions): 195 | """Reverts any normalisation to give predictions in original scale. 196 | 197 | Args: 198 | predictions: Dataframe of model predictions. 199 | 200 | Returns: 201 | Data frame of unnormalised predictions. 202 | """ 203 | 204 | if self._target_scaler is None: 205 | raise ValueError('Scalers have not been set!') 206 | 207 | column_names = predictions.columns 208 | 209 | df_list = [] 210 | for identifier, sliced in predictions.groupby('identifier'): 211 | sliced_copy = sliced.copy() 212 | target_scaler = self._target_scaler[identifier] 213 | 214 | for col in column_names: 215 | if col not in {'forecast_time', 'identifier'}: 216 | sliced_copy[col] = target_scaler.inverse_transform(sliced_copy[col].values.reshape(-1,1)) 217 | df_list.append(sliced_copy) 218 | 219 | output = pd.concat(df_list, axis=0) 220 | 221 | return output 222 | 223 | # Default params 224 | def get_fixed_params(self): 225 | """Returns fixed model parameters for experiments.""" 226 | 227 | fixed_params = { 228 | 'total_time_steps': 8 * 24, 229 | 'num_encoder_steps': 7 * 24, 230 | 'num_epochs': 100, 231 | 'early_stopping_patience': 5, 232 | 'multiprocessing_workers': 5 233 | } 234 | 235 | return fixed_params 236 | 237 | def get_default_model_params(self): 238 | """Returns default optimised model parameters.""" 239 | 240 | model_params = { 241 | 'dropout_rate': 0.1, 242 | 'hidden_layer_size': 160, 243 | 'learning_rate': 0.001, 244 | 'minibatch_size': 64, 245 | 'max_gradient_norm': 0.01, 246 | 'num_heads': 4, 247 | 'stack_size': 1 248 | } 249 | 250 | return model_params 251 | 252 | def get_num_samples_for_calibration(self): 253 | """Gets the default number of training and validation samples. 254 | 255 | Use to sub-sample the data for network calibration and a value of -1 uses 256 | all available samples. 257 | 258 | Returns: 259 | Tuple of (training samples, validation samples) 260 | """ 261 | return 450000, 50000 262 | -------------------------------------------------------------------------------- /data_formatters/favorita.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The Google Research Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python3 17 | """Custom formatting functions for Favorita dataset. 18 | 19 | Defines dataset specific column definitions and data transformations. 20 | """ 21 | 22 | import data_formatters.base 23 | import data_formatters.utils as utils 24 | import pandas as pd 25 | import sklearn.preprocessing 26 | 27 | DataTypes = data_formatters.base.DataTypes 28 | InputTypes = data_formatters.base.InputTypes 29 | 30 | 31 | class FavoritaFormatter(data_formatters.base.GenericDataFormatter): 32 | """Defines and formats data for the Favorita dataset. 33 | 34 | Attributes: 35 | column_definition: Defines input and data type of column used in the 36 | experiment. 37 | identifiers: Entity identifiers used in experiments. 38 | """ 39 | 40 | _column_definition = [ 41 | ('traj_id', DataTypes.REAL_VALUED, InputTypes.ID), 42 | ('date', DataTypes.DATE, InputTypes.TIME), 43 | ('log_sales', DataTypes.REAL_VALUED, InputTypes.TARGET), 44 | ('onpromotion', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 45 | ('transactions', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), 46 | ('oil', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), 47 | ('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 48 | ('day_of_month', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 49 | ('month', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 50 | ('national_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 51 | ('regional_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 52 | ('local_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 53 | ('open', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 54 | ('item_nbr', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 55 | ('store_nbr', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 56 | ('city', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 57 | ('state', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 58 | ('type', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 59 | ('cluster', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 60 | ('family', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 61 | ('class', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 62 | ('perishable', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT) 63 | ] 64 | 65 | def __init__(self): 66 | """Initialises formatter.""" 67 | 68 | self.identifiers = None 69 | self._real_scalers = None 70 | self._cat_scalers = None 71 | self._target_scaler = None 72 | self._num_classes_per_cat_input = None 73 | 74 | def split_data(self, df, valid_boundary=None, test_boundary=None): 75 | """Splits data frame into training-validation-test data frames. 76 | 77 | This also calibrates scaling object, and transforms data for each split. 78 | 79 | Args: 80 | df: Source data frame to split. 81 | valid_boundary: Starting date for validation data 82 | test_boundary: Starting date for test data 83 | 84 | Returns: 85 | Tuple of transformed (train, valid, test) data.
86 | """ 87 | 88 | print('Formatting train-valid-test splits.') 89 | 90 | if valid_boundary is None: 91 | valid_boundary = pd.datetime(2015, 12, 1) 92 | 93 | fixed_params = self.get_fixed_params() 94 | time_steps = fixed_params['total_time_steps'] 95 | lookback = fixed_params['num_encoder_steps'] 96 | forecast_horizon = time_steps - lookback 97 | 98 | df['date'] = pd.to_datetime(df['date']) 99 | df_lists = {'train': [], 'valid': [], 'test': []} 100 | for _, sliced in df.groupby('traj_id'): 101 | index = sliced['date'] 102 | train = sliced.loc[index < valid_boundary] 103 | train_len = len(train) 104 | valid_len = train_len + forecast_horizon 105 | valid = sliced.iloc[train_len - lookback:valid_len, :] 106 | test = sliced.iloc[valid_len - lookback:valid_len + forecast_horizon, :] 107 | 108 | sliced_map = {'train': train, 'valid': valid, 'test': test} 109 | 110 | for k in sliced_map: 111 | item = sliced_map[k] 112 | 113 | if len(item) >= time_steps: 114 | df_lists[k].append(item) 115 | 116 | dfs = {k: pd.concat(df_lists[k], axis=0) for k in df_lists} 117 | 118 | train = dfs['train'] 119 | self.set_scalers(train, set_real=True) 120 | 121 | # Use all data for label encoding to handle labels not present in training. 122 | self.set_scalers(df, set_real=False) 123 | 124 | # Filter out identifiers not present in training (i.e. cold-started items). 125 | def filter_ids(frame): 126 | identifiers = set(self.identifiers) 127 | index = frame['traj_id'] 128 | return frame.loc[index.apply(lambda x: x in identifiers)] 129 | 130 | valid = filter_ids(dfs['valid']) 131 | test = filter_ids(dfs['test']) 132 | 133 | return (self.transform_inputs(data) for data in [train, valid, test]) 134 | 135 | def set_scalers(self, df, set_real=True): 136 | """Calibrates scalers using the data supplied. 137 | 138 | Label encoding is applied to the entire dataset (i.e. including test), 139 | so that unseen labels can be handled at run-time. 140 | 141 | Args: 142 | df: Data to use to calibrate scalers. 
143 | set_real: Whether to fit real-valued or categorical scalers 144 | """ 145 | print('Setting scalers with training data...') 146 | 147 | column_definitions = self.get_column_definition() 148 | id_column = utils.get_single_col_by_input_type(InputTypes.ID, 149 | column_definitions) 150 | target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, 151 | column_definitions) 152 | 153 | if set_real: 154 | 155 | # Extract identifiers in case required 156 | self.identifiers = list(df[id_column].unique()) 157 | 158 | # Format real scalers 159 | self._real_scalers = {} 160 | #for col in ['oil', 'transactions', 'log_sales']: 161 | # self._real_scalers[col] = (df[col].mean(), df[col].std()) 162 | 163 | self._target_scaler = (df[target_column].mean(), df[target_column].std()) 164 | 165 | else: 166 | # Format categorical scalers 167 | categorical_inputs = utils.extract_cols_from_data_type( 168 | DataTypes.CATEGORICAL, column_definitions, 169 | {InputTypes.ID, InputTypes.TIME}) 170 | 171 | categorical_scalers = {} 172 | num_classes = [] 173 | if self.identifiers is None: 174 | raise ValueError('Scale real-valued inputs first!') 175 | id_set = set(self.identifiers) 176 | valid_idx = df['traj_id'].apply(lambda x: x in id_set) 177 | for col in categorical_inputs: 178 | # Set all to str so that we don't have mixed integer/string columns 179 | srs = df[col].apply(str).loc[valid_idx] 180 | categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit( 181 | srs.values) 182 | 183 | num_classes.append(srs.nunique()) 184 | 185 | # Set categorical scaler outputs 186 | self._cat_scalers = categorical_scalers 187 | self._num_classes_per_cat_input = num_classes 188 | 189 | def transform_inputs(self, df): 190 | """Performs feature transformations. 191 | 192 | This includes both feature engineering, preprocessing and normalisation. 193 | 194 | Args: 195 | df: Data frame to transform. 196 | 197 | Returns: 198 | Transformed data frame. 199 | 200 | """ 201 | output = df.copy() 202 | 203 | if self._real_scalers is None and self._cat_scalers is None: 204 | raise ValueError('Scalers have not been set!') 205 | 206 | column_definitions = self.get_column_definition() 207 | 208 | categorical_inputs = utils.extract_cols_from_data_type( 209 | DataTypes.CATEGORICAL, column_definitions, 210 | {InputTypes.ID, InputTypes.TIME}) 211 | 212 | # Format real inputs 213 | #for col in ['log_sales', 'oil', 'transactions']: 214 | # mean, std = self._real_scalers[col] 215 | # output[col] = (df[col] - mean) / std 216 | 217 | 218 | output['log_sales'] = output['log_sales'].fillna(0.) # zero imputation for missing sales 219 | 220 | # Format categorical inputs 221 | for col in categorical_inputs: 222 | string_df = df[col].apply(str) 223 | output[col] = self._cat_scalers[col].transform(string_df) 224 | 225 | return output 226 | 227 | def format_predictions(self, predictions): 228 | """Reverts any normalisation to give predictions in original scale. 229 | 230 | Args: 231 | predictions: Dataframe of model predictions. 232 | 233 | Returns: 234 | Data frame of unnormalised predictions.
235 | """ 236 | output = predictions.copy() 237 | 238 | column_names = predictions.columns 239 | mean, std = self._target_scaler 240 | for col in column_names: 241 | if col not in {'forecast_time', 'identifier'}: 242 | output[col] = (predictions[col] * std) + mean 243 | 244 | return output 245 | 246 | # Default params 247 | def get_fixed_params(self): 248 | """Returns fixed model parameters for experiments.""" 249 | 250 | fixed_params = { 251 | 'total_time_steps': 120, 252 | 'num_encoder_steps': 30, 253 | 'num_epochs': 100, 254 | 'early_stopping_patience': 5, 255 | 'multiprocessing_workers': 5 256 | } 257 | 258 | return fixed_params 259 | 260 | def get_default_model_params(self): 261 | """Returns default optimised model parameters.""" 262 | 263 | model_params = { 264 | 'dropout_rate': 0.1, 265 | 'hidden_layer_size': 240, 266 | 'learning_rate': 0.001, 267 | 'minibatch_size': 128, 268 | 'max_gradient_norm': 100., 269 | 'num_heads': 4, 270 | 'stack_size': 1 271 | } 272 | 273 | return model_params 274 | 275 | def get_num_samples_for_calibration(self): 276 | """Gets the default number of training and validation samples. 277 | 278 | Use to sub-sample the data for network calibration and a value of -1 uses 279 | all available samples. 280 | 281 | Returns: 282 | Tuple of (training samples, validation samples) 283 | """ 284 | return 450000, 50000 285 | 286 | def get_column_definition(self): 287 | """"Formats column definition in order expected by the TFT. 288 | 289 | Modified for Favorita to match column order of original experiment. 290 | 291 | Returns: 292 | Favorita-specific column definition 293 | """ 294 | 295 | column_definition = self._column_definition 296 | 297 | # Sanity checks first. 298 | # Ensure only one ID and time column exist 299 | def _check_single_column(input_type): 300 | 301 | length = len([tup for tup in column_definition if tup[2] == input_type]) 302 | 303 | if length != 1: 304 | raise ValueError('Illegal number of inputs ({}) of type {}'.format( 305 | length, input_type)) 306 | 307 | _check_single_column(InputTypes.ID) 308 | _check_single_column(InputTypes.TIME) 309 | 310 | identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID] 311 | time = [tup for tup in column_definition if tup[2] == InputTypes.TIME] 312 | real_inputs = [ 313 | tup for tup in column_definition if tup[1] == DataTypes.REAL_VALUED and 314 | tup[2] not in {InputTypes.ID, InputTypes.TIME} 315 | ] 316 | 317 | col_definition_map = {tup[0]: tup for tup in column_definition} 318 | col_order = [ 319 | 'item_nbr', 'store_nbr', 'city', 'state', 'type', 'cluster', 'family', 320 | 'class', 'perishable', 'onpromotion', 'day_of_week', 'national_hol', 321 | 'regional_hol', 'local_hol' 322 | ] 323 | categorical_inputs = [ 324 | col_definition_map[k] for k in col_order if k in col_definition_map 325 | ] 326 | 327 | return identifier + time + real_inputs + categorical_inputs 328 | -------------------------------------------------------------------------------- /data_formatters/traffic.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python3 17 | """Custom formatting functions for Traffic dataset. 18 | 19 | Defines dataset specific column definitions and data transformations. This also 20 | performs z-score normalization across the entire dataset, hence re-uses most of 21 | the same functions as volatility. 22 | """ 23 | 24 | import data_formatters.base 25 | import data_formatters.volatility 26 | import data_formatters.utils 27 | 28 | VolatilityFormatter = data_formatters.volatility.VolatilityFormatter 29 | DataTypes = data_formatters.base.DataTypes 30 | InputTypes = data_formatters.base.InputTypes 31 | 32 | 33 | class TrafficFormatter(VolatilityFormatter): 34 | """Defines and formats data for the traffic dataset. 35 | 36 | This also performs z-score normalization across the entire dataset, hence 37 | re-uses most of the same functions as volatility. 38 | 39 | Attributes: 40 | column_definition: Defines input and data type of column used in the 41 | experiment. 42 | identifiers: Entity identifiers used in experiments. 43 | """ 44 | 45 | _column_definition = [ 46 | ('id', DataTypes.REAL_VALUED, InputTypes.ID), 47 | ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME), 48 | ('values', DataTypes.REAL_VALUED, InputTypes.TARGET), 49 | ('time_on_day', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 50 | ('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 51 | ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 52 | ('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 53 | ] 54 | 55 | def split_data(self, df, valid_boundary=151, test_boundary=166): 56 | """Splits data frame into training-validation-test data frames. 57 | 58 | This also calibrates scaling object, and transforms data for each split. 59 | 60 | Args: 61 | df: Source data frame to split. 62 | valid_boundary: Starting day (sensor_day) for validation data 63 | test_boundary: Starting day (sensor_day) for test data 64 | 65 | Returns: 66 | Tuple of transformed (train, valid, test) data.
67 | """ 68 | 69 | print('Formatting train-valid-test splits.') 70 | 71 | index = df['sensor_day'] 72 | train = df.loc[index < valid_boundary] 73 | valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)] 74 | test = df.loc[index >= test_boundary - 7] 75 | 76 | self.set_scalers(train) 77 | 78 | return (self.transform_inputs(data) for data in [train, valid, test]) 79 | 80 | # Default params 81 | def get_fixed_params(self): 82 | """Returns fixed model parameters for experiments.""" 83 | 84 | fixed_params = { 85 | 'total_time_steps': 8 * 24, 86 | 'num_encoder_steps': 7 * 24, 87 | 'num_epochs': 100, 88 | 'early_stopping_patience': 5, 89 | 'multiprocessing_workers': 5 90 | } 91 | 92 | return fixed_params 93 | 94 | def get_default_model_params(self): 95 | """Returns default optimised model parameters.""" 96 | 97 | model_params = { 98 | 'dropout_rate': 0.3, 99 | 'hidden_layer_size': 320, 100 | 'learning_rate': 0.001, 101 | 'minibatch_size': 128, 102 | 'max_gradient_norm': 100., 103 | 'num_heads': 4, 104 | 'stack_size': 1 105 | } 106 | 107 | return model_params 108 | 109 | def get_num_samples_for_calibration(self): 110 | """Gets the default number of training and validation samples. 111 | 112 | Use to sub-sample the data for network calibration and a value of -1 uses 113 | all available samples. 114 | 115 | Returns: 116 | Tuple of (training samples, validation samples) 117 | """ 118 | return 450000, 50000 119 | -------------------------------------------------------------------------------- /data_formatters/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python3 17 | """Generic helper functions used across codebase.""" 18 | 19 | import os 20 | import pathlib 21 | import torch 22 | import numpy as np 23 | import data_formatters 24 | 25 | 26 | # Loss functions. 27 | def pytorch_quantile_loss(y, y_pred, quantile): 28 | """Computes quantile loss for tensorflow. 29 | 30 | Standard quantile loss as defined in the "Training Procedure" section of 31 | the main TFT paper 32 | 33 | Args: 34 | y: Targets 35 | y_pred: Predictions 36 | quantile: Quantile to use for loss calculations (between 0 & 1) 37 | 38 | Returns: 39 | Tensor for quantile loss. 40 | """ 41 | 42 | # Checks quantile 43 | if quantile < 0 or quantile > 1: 44 | raise ValueError( 45 | 'Illegal quantile value={}! Values should be between 0 and 1.'.format( 46 | quantile)) 47 | 48 | prediction_underflow = y - y_pred 49 | q_loss = quantile * torch.max(prediction_underflow, torch.zeros_like(prediction_underflow)) + ( 50 | 1. - quantile) * torch.max(-prediction_underflow, torch.zeros_like(prediction_underflow)) 51 | 52 | return torch.sum(q_loss, axis=-1) 53 | 54 | 55 | 56 | # Generic. 57 | def get_single_col_by_input_type(input_type, column_definition): 58 | """Returns name of single column. 
59 | 60 | Args: 61 | input_type: Input type of column to extract 62 | column_definition: Column definition list for experiment 63 | """ 64 | 65 | l = [tup[0] for tup in column_definition if tup[2] == input_type] 66 | 67 | if len(l) != 1: 68 | raise ValueError('Invalid number of columns for {}'.format(input_type)) 69 | 70 | return l[0] 71 | 72 | 73 | def extract_cols_from_data_type(data_type, column_definition, 74 | excluded_input_types): 75 | """Extracts the names of columns that correspond to a defined data_type. 76 | 77 | Args: 78 | data_type: DataType of columns to extract. 79 | column_definition: Column definition to use. 80 | excluded_input_types: Set of input types to exclude 81 | 82 | Returns: 83 | List of names for columns with data type specified. 84 | """ 85 | return [ 86 | tup[0] 87 | for tup in column_definition 88 | if tup[1] == data_type and tup[2] not in excluded_input_types 89 | ] 90 | 91 | 92 | def numpy_normalised_quantile_loss(y, y_pred, quantile): 93 | """Computes normalised quantile loss for numpy arrays. 94 | 95 | Uses the q-Risk metric as defined in the "Training Procedure" section of the 96 | main TFT paper. 97 | 98 | Args: 99 | y: Targets 100 | y_pred: Predictions 101 | quantile: Quantile to use for loss calculations (between 0 & 1) 102 | 103 | Returns: 104 | Float for normalised quantile loss. 105 | """ 106 | prediction_underflow = y - y_pred 107 | weighted_errors = quantile * np.maximum(prediction_underflow, 0.) \ 108 | + (1. - quantile) * np.maximum(-prediction_underflow, 0.) 109 | 110 | quantile_loss = weighted_errors.mean() 111 | normaliser = np.abs(y).mean() 112 | 113 | return 2 * quantile_loss / normaliser 114 | 115 | 116 | # OS related functions. 117 | def create_folder_if_not_exist(directory): 118 | """Creates folder if it doesn't exist. 119 | 120 | Args: 121 | directory: Folder path to create. 122 | """ 123 | # Also creates directories recursively 124 | pathlib.Path(directory).mkdir(parents=True, exist_ok=True) 125 | 126 | 127 | def make_data_formatter(exp_name): 128 | """Gets a data formatter object for experiment. 129 | 130 | Returns: 131 | Default DataFormatter per experiment. 132 | """ 133 | 134 | data_formatter_class = { 135 | 'volatility': data_formatters.volatility.VolatilityFormatter, 136 | 'electricity': data_formatters.electricity.ElectricityFormatter, 137 | 'traffic': data_formatters.traffic.TrafficFormatter, 138 | 'favorita': data_formatters.favorita.FavoritaFormatter, 139 | } 140 | 141 | return data_formatter_class[exp_name]() 142 | 143 | 144 | def csv_path_to_folder(path: str): 145 | return "/".join(path.split('/')[:-1]) + "/" 146 | 147 | 148 | def data_csv_path(exp_name): 149 | csv_map = { 150 | 'volatility': './data/volatility/formatted_omi_vol.csv', 151 | 'electricity': './data/electricity/hourly_electricity.csv', 152 | 'traffic': './data/traffic/hourly_data.csv', 153 | 'favorita': './data/favorita/favorita_consolidated.csv', 154 | } 155 | 156 | return csv_map[exp_name] 157 | -------------------------------------------------------------------------------- /data_formatters/volatility.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python3 17 | """Custom formatting functions for Volatility dataset. 18 | 19 | Defines dataset specific column definitions and data transformations. 20 | """ 21 | 22 | import data_formatters.base 23 | import data_formatters.utils 24 | import sklearn.preprocessing 25 | from data_formatters import utils 26 | 27 | GenericDataFormatter = data_formatters.base.GenericDataFormatter 28 | DataTypes = data_formatters.base.DataTypes 29 | InputTypes = data_formatters.base.InputTypes 30 | 31 | 32 | class VolatilityFormatter(GenericDataFormatter): 33 | """Defines and formats data for the volatility dataset. 34 | 35 | Attributes: 36 | column_definition: Defines input and data type of column used in the 37 | experiment. 38 | identifiers: Entity identifiers used in experiments. 39 | """ 40 | 41 | _column_definition = [ 42 | ('Symbol', DataTypes.CATEGORICAL, InputTypes.ID), 43 | ('date', DataTypes.DATE, InputTypes.TIME), 44 | ('log_vol', DataTypes.REAL_VALUED, InputTypes.TARGET), 45 | ('open_to_close', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), 46 | ('days_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT), 47 | ('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 48 | ('day_of_month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 49 | ('week_of_year', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 50 | ('month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), 51 | ('Region', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), 52 | ] 53 | 54 | def __init__(self): 55 | """Initialises formatter.""" 56 | 57 | self.identifiers = None 58 | self._real_scalers = None 59 | self._cat_scalers = None 60 | self._target_scaler = None 61 | self._num_classes_per_cat_input = None 62 | 63 | def split_data(self, df, valid_boundary=2016, test_boundary=2018): 64 | """Splits data frame into training-validation-test data frames. 65 | 66 | This also calibrates scaling object, and transforms data for each split. 67 | 68 | Args: 69 | df: Source data frame to split. 70 | valid_boundary: Starting year for validation data 71 | test_boundary: Starting year for test data 72 | 73 | Returns: 74 | Tuple of transformed (train, valid, test) data. 75 | """ 76 | 77 | print('Formatting train-valid-test splits.') 78 | 79 | index = df['year'] 80 | train = df.loc[index < valid_boundary] 81 | valid = df.loc[(index >= valid_boundary) & (index < test_boundary)] 82 | test = df.loc[index >= test_boundary] 83 | 84 | self.set_scalers(train) 85 | 86 | return (self.transform_inputs(data) for data in [train, valid, test]) 87 | 88 | def set_scalers(self, df): 89 | """Calibrates scalers using the data supplied. 90 | 91 | Args: 92 | df: Data to use to calibrate scalers. 
93 | """ 94 | print('Setting scalers with training data...') 95 | 96 | column_definitions = self.get_column_definition() 97 | id_column = utils.get_single_col_by_input_type(InputTypes.ID, 98 | column_definitions) 99 | target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, 100 | column_definitions) 101 | 102 | # Extract identifiers in case required 103 | self.identifiers = list(df[id_column].unique()) 104 | 105 | # Format real scalers 106 | real_inputs = utils.extract_cols_from_data_type( 107 | DataTypes.REAL_VALUED, column_definitions, 108 | {InputTypes.ID, InputTypes.TIME}) 109 | 110 | data = df[real_inputs].values 111 | self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data) 112 | self._target_scaler = sklearn.preprocessing.StandardScaler().fit( 113 | df[[target_column]].values) # used for predictions 114 | 115 | # Format categorical scalers 116 | categorical_inputs = utils.extract_cols_from_data_type( 117 | DataTypes.CATEGORICAL, column_definitions, 118 | {InputTypes.ID, InputTypes.TIME}) 119 | 120 | categorical_scalers = {} 121 | num_classes = [] 122 | for col in categorical_inputs: 123 | # Set all to str so that we don't have mixed integer/string columns 124 | srs = df[col].apply(str) 125 | categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit( 126 | srs.values) 127 | num_classes.append(srs.nunique()) 128 | 129 | # Set categorical scaler outputs 130 | self._cat_scalers = categorical_scalers 131 | self._num_classes_per_cat_input = num_classes 132 | 133 | def transform_inputs(self, df): 134 | """Performs feature transformations. 135 | 136 | This includes both feature engineering, preprocessing and normalisation. 137 | 138 | Args: 139 | df: Data frame to transform. 140 | 141 | Returns: 142 | Transformed data frame. 143 | 144 | """ 145 | output = df.copy() 146 | 147 | if self._real_scalers is None and self._cat_scalers is None: 148 | raise ValueError('Scalers have not been set!') 149 | 150 | column_definitions = self.get_column_definition() 151 | 152 | real_inputs = utils.extract_cols_from_data_type( 153 | DataTypes.REAL_VALUED, column_definitions, 154 | {InputTypes.ID, InputTypes.TIME}) 155 | categorical_inputs = utils.extract_cols_from_data_type( 156 | DataTypes.CATEGORICAL, column_definitions, 157 | {InputTypes.ID, InputTypes.TIME}) 158 | 159 | # Format real inputs 160 | output[real_inputs] = self._real_scalers.transform(df[real_inputs].values) 161 | 162 | # Format categorical inputs 163 | for col in categorical_inputs: 164 | string_df = df[col].apply(str) 165 | output[col] = self._cat_scalers[col].transform(string_df) 166 | 167 | return output 168 | 169 | def format_predictions(self, predictions): 170 | """Reverts any normalisation to give predictions in original scale. 171 | 172 | Args: 173 | predictions: Dataframe of model predictions. 174 | 175 | Returns: 176 | Data frame of unnormalised predictions. 
177 | """ 178 | output = predictions.copy() 179 | 180 | column_names = predictions.columns 181 | 182 | for col in column_names: 183 | if col not in {'forecast_time', 'identifier'}: 184 | output[col] = self._target_scaler.inverse_transform(predictions[col].values.reshape(-1,1)) 185 | 186 | return output 187 | 188 | # Default params 189 | def get_fixed_params(self): 190 | """Returns fixed model parameters for experiments.""" 191 | 192 | fixed_params = { 193 | 'total_time_steps': 252 + 5, 194 | 'num_encoder_steps': 252, 195 | 'num_epochs': 100, 196 | 'early_stopping_patience': 5, 197 | 'multiprocessing_workers': 5, 198 | } 199 | 200 | return fixed_params 201 | 202 | def get_default_model_params(self): 203 | """Returns default optimised model parameters.""" 204 | 205 | model_params = { 206 | 'dropout_rate': 0.3, 207 | 'hidden_layer_size': 160, 208 | 'learning_rate': 0.01, 209 | 'minibatch_size': 64, 210 | 'max_gradient_norm': 0.01, 211 | 'num_heads': 1, 212 | 'stack_size': 1 213 | } 214 | 215 | return model_params 216 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevinc/Transformer_Timeseries/c3705aa9ece058dca98ec7af74f9a9a1a325f7a1/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/ts_dataset.py: -------------------------------------------------------------------------------- 1 | from torch import from_numpy 2 | import pandas as pd 3 | import data_formatters.utils as utils 4 | from data_formatters.base import InputTypes 5 | from torch.utils.data import Dataset 6 | import numpy as np 7 | import click 8 | from os import path 9 | 10 | class TSDataset(Dataset): 11 | ## Mostly adapted from original TFT Github, data_formatters 12 | def __init__(self, cnf, data_formatter): 13 | 14 | self.params = cnf.all_params 15 | 16 | self.csv = utils.data_csv_path(cnf.ds_name) 17 | self.data = pd.read_csv(self.csv, index_col=0, na_filter=False) 18 | 19 | self.train_set, self.valid_set, self.test_set = data_formatter.split_data(self.data) 20 | self.params['column_definition'] = data_formatter.get_column_definition() 21 | 22 | self.inputs = None 23 | self.outputs = None 24 | self.time = None 25 | self.identifiers = None 26 | 27 | def train(self): 28 | max_samples = self.params['train_samples'] 29 | if path.exists(utils.csv_path_to_folder(self.csv) + "processed_traindata.npz"): 30 | f = np.load(utils.csv_path_to_folder(self.csv) + "processed_traindata.npz", allow_pickle=True) 31 | self.inputs, self.outputs, self.time, self.identifiers = f[f.files[0]], f[f.files[1]], f[f.files[2]], f[ 32 | f.files[3]] 33 | else: 34 | self.preprocess(self.train_set, max_samples) 35 | np.savez(utils.csv_path_to_folder(self.csv) + "processed_traindata.npz", self.inputs, self.outputs, 36 | self.time, 37 | self.identifiers) 38 | 39 | def test(self): 40 | max_samples = self.params['test_samples'] 41 | if path.exists(utils.csv_path_to_folder(self.csv) + "processed_testdata.npz"): 42 | f = np.load(utils.csv_path_to_folder(self.csv) + "processed_testdata.npz", allow_pickle=True) 43 | self.inputs, self.outputs, self.time, self.identifiers = f[f.files[0]], f[f.files[1]], f[f.files[2]], f[ 44 | f.files[3]] 45 | else: 46 | self.preprocess(self.test_set, max_samples) 47 | np.savez(utils.csv_path_to_folder(self.csv) + "processed_testdata.npz", self.inputs, self.outputs, 48 | self.time, 49 | self.identifiers) 50 | 51 | 
def val(self): 52 | max_samples = self.params['val_samples'] 53 | if path.exists(utils.csv_path_to_folder(self.csv) + "processed_validdata.npz"): 54 | f = np.load(utils.csv_path_to_folder(self.csv) + "processed_validdata.npz", allow_pickle=True) 55 | self.inputs, self.outputs, self.time, self.identifiers = f[f.files[0]], f[f.files[1]], f[f.files[2]], f[ 56 | f.files[3]] 57 | else: 58 | self.preprocess(self.valid_set, max_samples) 59 | np.savez(utils.csv_path_to_folder(self.csv) + "processed_validdata.npz", self.inputs, self.outputs, 60 | self.time, 61 | self.identifiers) 62 | 63 | def preprocess(self, data, max_samples): 64 | time_steps = int(self.params['total_time_steps']) 65 | input_size = int(self.params['input_size']) 66 | output_size = int(self.params['output_size']) 67 | column_definition = self.params['column_definition'] 68 | 69 | id_col = self._get_single_col_by_type(InputTypes.ID) 70 | time_col = self._get_single_col_by_type(InputTypes.TIME) 71 | 72 | data.sort_values(by=[id_col, time_col], inplace=True) 73 | print('Getting valid sampling locations.') 74 | valid_sampling_locations = [] 75 | split_data_map = {} 76 | for identifier, df in data.groupby(id_col): 77 | # print('Getting locations for {}'.format(identifier)) 78 | num_entries = len(df) 79 | if num_entries >= time_steps: 80 | valid_sampling_locations += [ 81 | (identifier, time_steps + i) 82 | for i in range(num_entries - time_steps + 1) 83 | ] 84 | split_data_map[identifier] = df 85 | 86 | self.inputs = np.zeros((max_samples, time_steps, input_size)) 87 | self.outputs = np.zeros((max_samples, time_steps, output_size)) 88 | self.time = np.empty((max_samples, time_steps, 1), dtype=object) 89 | self.identifiers = np.empty((max_samples, time_steps, 1), dtype=object) 90 | print('# available segments={}'.format(len(valid_sampling_locations))) 91 | 92 | if max_samples > 0 and len(valid_sampling_locations) > max_samples: 93 | print('Extracting {} samples...'.format(max_samples)) 94 | ranges = [ 95 | valid_sampling_locations[i] for i in np.random.choice( 96 | len(valid_sampling_locations), max_samples, replace=False) 97 | ] 98 | else: 99 | print('Max samples={} exceeds # available segments={}'.format( 100 | max_samples, len(valid_sampling_locations))) 101 | ranges = valid_sampling_locations 102 | 103 | id_col = self._get_single_col_by_type(InputTypes.ID) 104 | time_col = self._get_single_col_by_type(InputTypes.TIME) 105 | target_col = self._get_single_col_by_type(InputTypes.TARGET) 106 | input_cols = [ 107 | tup[0] 108 | for tup in column_definition 109 | if tup[2] not in {InputTypes.ID, InputTypes.TIME} 110 | ] 111 | 112 | for i, tup in enumerate(ranges): 113 | if ((i + 1) % 1000) == 0: 114 | print(i + 1, 'of', max_samples, 'samples done...') 115 | identifier, start_idx = tup 116 | sliced = split_data_map[identifier].iloc[start_idx - time_steps:start_idx] 117 | 118 | self.inputs[i, :, :] = sliced[input_cols] 119 | self.outputs[i, :, :] = sliced[[target_col]] 120 | self.time[i, :, 0] = sliced[time_col] 121 | self.identifiers[i, :, 0] = sliced[id_col] 122 | 123 | def __getitem__(self, index): 124 | 125 | num_encoder_steps = int(self.params['num_encoder_steps']) 126 | s = { 127 | 'inputs': self.inputs[index].astype(float), 128 | 'outputs': self.outputs[index, num_encoder_steps:, :], 129 | 'active_entries': np.ones_like(self.outputs[index, num_encoder_steps:, :]), 130 | 'time': self.time[index].tolist(), 131 | 'identifier': self.identifiers[index].tolist() 132 | } 133 | 134 | return s 135 | 136 | def __len__(self): 137 | return 
self.inputs.shape[0] 138 | 139 | def _get_single_col_by_type(self, input_type): 140 | """Returns name of single column for input type.""" 141 | return utils.get_single_col_by_input_type(input_type, self.params['column_definition']) 142 | 143 | 144 | @click.command() 145 | @click.option('--conf_file_path', type=str, default="./conf/electricity.yaml") 146 | def main(conf_file_path): 147 | import data_formatters.utils as utils 148 | from conf import Conf 149 | 150 | cnf = Conf(conf_file_path=conf_file_path, seed=15, exp_name="test", log=False) 151 | data_formatter = utils.make_data_formatter(cnf.ds_name) 152 | dataset_train = TSDataset(cnf, data_formatter) 153 | dataset_train.train() 154 | 155 | for i in range(10): 156 | # 192 x ['power_usage', 'hour', 'day_of_week', 'hours_from_start', 'categorical_id'] 157 | x = dataset_train[i]['inputs'] 158 | # 24 x ['power_usage'] 159 | y = dataset_train[i]['outputs'] 160 | print(f'Example #{i}: x.shape={x.shape}, y.shape={y.shape}') 161 | 162 | 163 | if __name__ == "__main__": 164 | main() -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | name: tft 2 | channels: 3 | - anaconda 4 | - pytorch 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - setuptools=58.0.4=py37h06a4308_0 9 | - _libgcc_mutex=0.1=conda_forge 10 | - _openmp_mutex=4.5=2_kmp_llvm 11 | - absl-py=1.0.0=pyhd8ed1ab_0 12 | - aiohttp=3.8.1=py37h540881e_1 13 | - aiosignal=1.2.0=pyhd8ed1ab_0 14 | - alsa-lib=1.2.3=h516909a_0 15 | - async-timeout=4.0.2=pyhd8ed1ab_0 16 | - asynctest=0.13.0=py_0 17 | - attrs=21.4.0=pyhd8ed1ab_0 18 | - azure-core=1.23.1=pyhd8ed1ab_0 19 | - azure-storage-blob=12.11.0=pyhd8ed1ab_0 20 | - backcall=0.2.0=pyh9f0ad1d_0 21 | - backports=1.0=py_2 22 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 23 | - blinker=1.4=py_1 24 | - brotli=1.0.9=h166bdaf_7 25 | - brotli-bin=1.0.9=h166bdaf_7 26 | - brotlipy=0.7.0=py37h540881e_1004 27 | - c-ares=1.18.1=h7f98852_0 28 | - ca-certificates=2021.10.8=ha878542_0 29 | - cachetools=5.0.0=pyhd8ed1ab_0 30 | - certifi=2021.10.8=py37h89c1867_2 31 | - cffi=1.15.0=py37h036bc23_0 32 | - charset-normalizer=2.0.12=pyhd8ed1ab_0 33 | - click=8.1.3=py37h89c1867_0 34 | - codecov=2.1.11=pyhd3deb0d_0 35 | - colorama=0.4.4=pyh9f0ad1d_0 36 | - coverage=6.3.2=py37h540881e_2 37 | - cryptography=36.0.2=py37h38fbfac_1 38 | - cycler=0.11.0=pyhd8ed1ab_0 39 | - dbus=1.13.6=h5008d03_3 40 | - decorator=5.1.1=pyhd8ed1ab_0 41 | - expat=2.4.8=h27087fc_0 42 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 43 | - font-ttf-inconsolata=3.000=h77eed37_0 44 | - font-ttf-source-code-pro=2.038=h77eed37_0 45 | - font-ttf-ubuntu=0.83=hab24e00_0 46 | - fontconfig=2.14.0=h8e229c2_0 47 | - fonts-conda-ecosystem=1=0 48 | - fonts-conda-forge=1=0 49 | - fonttools=4.33.3=py37h540881e_0 50 | - freetype=2.10.4=h0708190_1 51 | - frozenlist=1.3.0=py37h540881e_1 52 | - future=0.18.2=py37h89c1867_5 53 | - gettext=0.19.8.1=h73d1719_1008 54 | - giflib=5.2.1=h36c2ea0_2 55 | - google-auth=2.6.6=pyh6c4a22f_0 56 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 57 | - grpcio=1.45.0=py37he500948_0 58 | - gst-plugins-base=1.20.1=hcf0ee16_1 59 | - gstreamer=1.20.1=hd4edc92_1 60 | - icu=69.1=h9c3ff4c_0 61 | - idna=3.3=pyhd8ed1ab_0 62 | - importlib-metadata=4.11.3=py37h89c1867_1 63 | - ipdb=0.13.9=pyhd8ed1ab_0 64 | - ipython=7.33.0=py37h89c1867_0 65 | - isodate=0.6.1=pyhd8ed1ab_0 66 | - jbig=2.1=h7f98852_2003 67 | - jedi=0.18.1=py37h89c1867_1 68 | - joblib=1.1.0=pyhd8ed1ab_0 69 
| - jpeg=9e=h166bdaf_1 70 | - keyutils=1.6.1=h166bdaf_0 71 | - kiwisolver=1.4.2=py37h7cecad7_1 72 | - krb5=1.19.3=h3790be6_0 73 | - lcms2=2.12=hddcbb42_0 74 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 75 | - lerc=3.0=h9c3ff4c_0 76 | - libblas=3.9.0=14_linux64_openblas 77 | - libbrotlicommon=1.0.9=h166bdaf_7 78 | - libbrotlidec=1.0.9=h166bdaf_7 79 | - libbrotlienc=1.0.9=h166bdaf_7 80 | - libcblas=3.9.0=14_linux64_openblas 81 | - libclang=13.0.1=default_hc23dcda_0 82 | - libdeflate=1.10=h7f98852_0 83 | - libedit=3.1.20191231=he28a2e2_2 84 | - libevent=2.1.10=h9b69904_4 85 | - libffi=3.4.2=h7f98852_5 86 | - libgcc-ng=11.2.0=h1d223b6_16 87 | - libgfortran-ng=11.2.0=h69a702a_16 88 | - libgfortran5=11.2.0=h5c6108e_16 89 | - libglib=2.70.2=h174f98d_4 90 | - libiconv=1.16=h516909a_0 91 | - liblapack=3.9.0=14_linux64_openblas 92 | - libllvm13=13.0.1=hf817b99_2 93 | - libnsl=2.0.0=h7f98852_0 94 | - libogg=1.3.4=h7f98852_1 95 | - libopenblas=0.3.20=pthreads_h78a6416_0 96 | - libopus=1.3.1=h7f98852_1 97 | - libpng=1.6.37=h21135ba_2 98 | - libpq=14.2=hd57d9b9_0 99 | - libprotobuf=3.20.0=h6239696_0 100 | - libstdcxx-ng=11.2.0=he4da1e4_16 101 | - libtiff=4.3.0=h542a066_3 102 | - libuuid=2.32.1=h7f98852_1000 103 | - libvorbis=1.3.7=h9c3ff4c_0 104 | - libwebp=1.2.2=h3452ae3_0 105 | - libwebp-base=1.2.2=h7f98852_1 106 | - libxcb=1.13=h7f98852_1004 107 | - libxkbcommon=1.0.3=he3ba5ed_0 108 | - libxml2=2.9.12=h885dcf4_1 109 | - libzlib=1.2.11=h166bdaf_1014 110 | - llvm-openmp=14.0.3=he0ac6c6_0 111 | - lz4-c=1.9.3=h9c3ff4c_1 112 | - markdown=3.3.6=pyhd8ed1ab_0 113 | - matplotlib=3.5.1=py37h89c1867_0 114 | - matplotlib-base=3.5.1=py37h1058ff1_0 115 | - matplotlib-inline=0.1.3=pyhd8ed1ab_0 116 | - mkl=2021.4.0=h8d4b97c_729 117 | - mkl-service=2.4.0=py37h402132d_0 118 | - msrest=0.6.21=pyh44b312d_0 119 | - multidict=6.0.2=py37h540881e_1 120 | - munkres=1.1.4=pyh9f0ad1d_0 121 | - mysql-common=8.0.29=haf5c9bc_0 122 | - mysql-libs=8.0.29=h28c427c_0 123 | - ncurses=6.3=h27087fc_1 124 | - ninja=1.10.2=h4bd325d_1 125 | - nspr=4.32=h9c3ff4c_1 126 | - nss=3.77=h2350873_0 127 | - numpy=1.21.6=py37h976b520_0 128 | - oauthlib=3.2.0=pyhd8ed1ab_0 129 | - openjpeg=2.4.0=hb52868f_1 130 | - openssl=1.1.1n=h166bdaf_0 131 | - packaging=21.3=pyhd8ed1ab_0 132 | - pandas=1.3.5=py37he8f5f7f_0 133 | - parso=0.8.3=pyhd8ed1ab_0 134 | - path=16.4.0=py37h89c1867_1 135 | - pcre=8.45=h9c3ff4c_0 136 | - pexpect=4.8.0=pyh9f0ad1d_2 137 | - pickleshare=0.7.5=py_1003 138 | - pillow=9.1.0=py37h44f0d7a_2 139 | - pip=22.0.4=pyhd8ed1ab_0 140 | - prompt-toolkit=3.0.29=pyha770c72_0 141 | - protobuf=3.20.0=py37hd23a5d3_4 142 | - pthread-stubs=0.4=h36c2ea0_1001 143 | - ptyprocess=0.7.0=pyhd3deb0d_0 144 | - pyasn1=0.4.8=py_0 145 | - pyasn1-modules=0.2.7=py_0 146 | - pycparser=2.21=pyhd8ed1ab_0 147 | - pygments=2.12.0=pyhd8ed1ab_0 148 | - pyjwt=2.3.0=pyhd8ed1ab_1 149 | - pyopenssl=22.0.0=pyhd8ed1ab_0 150 | - pyparsing=3.0.8=pyhd8ed1ab_0 151 | - pyqt=5.12.3=py37h89c1867_8 152 | - pyqt-impl=5.12.3=py37hac37412_8 153 | - pyqt5-sip=4.19.18=py37hcd2ae1e_8 154 | - pyqtchart=5.12=py37he336c9b_8 155 | - pyqtwebengine=5.12.1=py37he336c9b_8 156 | - pysocks=1.7.1=py37h89c1867_5 157 | - python=3.7.12=hb7a2778_100_cpython 158 | - python-dateutil=2.8.2=pyhd8ed1ab_0 159 | - python_abi=3.7=2_cp37m 160 | - pytorch-model-summary=0.1.1=py_0 161 | - pytz=2022.1=pyhd8ed1ab_0 162 | - pyu2f=0.1.5=pyhd8ed1ab_0 163 | - pyyaml=6.0=py37h540881e_4 164 | - qt=5.12.9=h1304e3e_6 165 | - readline=8.1=h46c0cb4_0 166 | - requests=2.27.1=pyhd8ed1ab_0 167 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 
168 | - rsa=4.8=pyhd8ed1ab_0
169 | - scikit-learn=1.0.2=py37hf9e9bfc_0
170 | - scipy=1.7.3=py37hf2a6cf1_0
171 | - six=1.16.0=pyh6c4a22f_0
172 | - sqlite=3.38.3=h4ff8645_0
173 | - tbb=2021.5.0=h924138e_1
174 | - tensorboard=2.8.0=pyhd8ed1ab_1
175 | - tensorboard-data-server=0.6.0=py37h38fbfac_2
176 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
177 | - termcolor=1.1.0=py_2
178 | - threadpoolctl=3.1.0=pyh8a188c0_0
179 | - tk=8.6.12=h27826a3_0
180 | - tomli=2.0.1=pyhd8ed1ab_0
181 | - tornado=6.1=py37h540881e_3
182 | - tqdm=4.64.0=pyhd8ed1ab_0
183 | - traitlets=5.1.1=pyhd8ed1ab_0
184 | - typing=3.10.0.0=pyhd8ed1ab_0
185 | - typing-extensions=4.2.0=hd8ed1ab_1
186 | - typing_extensions=4.2.0=pyha770c72_1
187 | - unicodedata2=14.0.0=py37h540881e_1
188 | - urllib3=1.26.9=pyhd8ed1ab_0
189 | - wcwidth=0.2.5=pyh9f0ad1d_2
190 | - werkzeug=2.1.2=pyhd8ed1ab_0
191 | - wheel=0.37.1=pyhd8ed1ab_0
192 | - xorg-libxau=1.0.9=h7f98852_0
193 | - xorg-libxdmcp=1.1.3=h7f98852_0
194 | - xz=5.2.5=h516909a_1
195 | - yaml=0.2.5=h7f98852_2
196 | - yarl=1.7.2=py37h540881e_2
197 | - zipp=3.8.0=pyhd8ed1ab_0
198 | - zlib=1.2.11=h166bdaf_1014
199 | - zstd=1.5.2=ha95c52a_0
200 | - blas=1.0=mkl
201 | - bzip2=1.0.8=h7b6447c_0
202 | - cudatoolkit=10.2.89=hfd86e86_1
203 | - gmp=6.2.1=h2531618_2
204 | - gnutls=3.6.15=he1e5248_0
205 | - lame=3.100=h7b6447c_0
206 | - libidn2=2.3.2=h7f8727e_0
207 | - libtasn1=4.16.0=h27cfd23_0
208 | - libunistring=0.9.10=h27cfd23_0
209 | - libuv=1.40.0=h7b6447c_0
210 | - nettle=3.7.3=hbbd107a_1
211 | - openh264=2.1.1=h4ff587b_0
212 | - ffmpeg=4.3=hf484d3e_0
213 | - pytorch=1.11.0=py3.7_cuda10.2_cudnn7.6.5_0
214 | - pytorch-mutex=1.0=cuda
215 | - torchaudio=0.11.0=py37_cu102
216 | - torchvision=0.12.0=py37_cu102
217 | prefix: /home/grads/m/mrsergazinov/.conda/envs/tft
218 | 
-------------------------------------------------------------------------------- /inference.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # ---------------------
3 | 
4 | from time import time
5 | import numpy as np
6 | import torch
7 | from torch import optim
8 | from torch.utils.data import DataLoader
9 | from conf import Conf
10 | from dataset.ts_dataset import TSDataset
11 | from models.temporal_fusion_t import tft_model
12 | from progress_bar import ProgressBar
13 | from utils import QuantileLoss, symmetric_mean_absolute_percentage_error, unnormalize_tensor, plot_temporal_serie
14 | import data_formatters.utils as utils
15 | from models.transformer import Transformer
16 | from models.transformer_grn.transformer import Transformer as GRNTransformer
17 | 
18 | 
19 | 
20 | class TS(object):
21 |     """
22 |     Class for loading and testing the pre-trained model
23 |     """
24 | 
25 |     def __init__(self, cnf):
26 |         # type: (Conf) -> None
27 | 
28 |         self.cnf = cnf
29 |         self.data_formatter = utils.make_data_formatter(cnf.ds_name)
30 | 
31 |         loader = TSDataset
32 |         dataset_test = loader(self.cnf, self.data_formatter)
33 |         dataset_test.test()
34 | 
35 |         # init model
36 |         model_choice = self.cnf.all_params["model"]
37 |         if model_choice == "transformer":
38 |             # Baseline transformer
39 |             self.model = Transformer(self.cnf.all_params)
40 |         elif model_choice == "tf_transformer":
41 |             # Temporal fusion transformer
42 |             self.model = tft_model.TFT(self.cnf.all_params)
43 |         elif model_choice == "grn_transformer":
44 |             # Transformer + GRN to encode static vars
45 |             self.model = GRNTransformer(self.cnf.all_params)
46 |         else:
47 |             raise NameError(model_choice)
48 | 
49 |         self.model = 
self.model.to(cnf.device)
50 | 
51 |         # init optimizer
52 |         self.optimizer = optim.Adam(params=self.model.parameters(), lr=cnf.lr)
53 |         self.loss = QuantileLoss(cnf.quantiles)
54 | 
55 |         # init test loader
56 |         self.test_loader = DataLoader(
57 |             dataset=dataset_test, batch_size=cnf.batch_size,
58 |             num_workers=cnf.n_workers, shuffle=False, pin_memory=True,
59 |         )
60 | 
61 |         # init logging stuff
62 |         self.log_path = cnf.exp_log_path
63 |         self.log_freq = len(self.test_loader)
64 |         self.train_losses = []
65 |         self.test_loss = []
66 |         self.test_losses = {'p10': [], 'p50': [], 'p90': []}
67 |         self.test_smape = []
68 | 
69 |         # starting values
70 |         self.epoch = 0
71 |         self.best_test_loss = None
72 | 
73 |         # init progress bar
74 |         self.progress_bar = ProgressBar(max_step=self.log_freq, max_epoch=self.cnf.epochs)
75 | 
76 |         # possibly load checkpoint
77 |         self.load_ck()
78 | 
79 |         print("Finished preparing datasets.")
80 | 
81 |     def load_ck(self):
82 |         """
83 |         load training checkpoint
84 |         """
85 |         ck_path = self.log_path / self.cnf.exp_name + '_best.pth'
86 |         if ck_path.exists():
87 |             ck = torch.load(ck_path)
88 |             print(f'[loading checkpoint \'{ck_path}\']')
89 |             self.model.load_state_dict(ck)
90 | 
91 |     def test(self):
92 |         """
93 |         Quick test that plots predictions without saving or logging anything to tensorboard
94 |         """
95 |         with torch.no_grad():
96 |             self.model.eval()
97 |             p10_forecast, p50_forecast, p90_forecast, target = None, None, None, None
98 | 
99 |             t = time()
100 |             for step, sample in enumerate(self.test_loader):
101 | 
102 |                 # Hide future target values from the input vector: set the target channel to 1 where timestep >= encoder_steps
103 |                 steps = self.cnf.all_params['num_encoder_steps']
104 |                 pred_len = sample['outputs'].shape[1]
105 |                 x = sample['inputs'].float().to(self.cnf.device)
106 |                 x[:, steps:, 0] = 1
107 | 
108 |                 # Feed input to the model
109 |                 if self.cnf.all_params["model"] == "transformer" or self.cnf.all_params["model"] == "grn_transformer":
110 | 
111 |                     # Auto-regressive prediction
112 |                     for i in range(pred_len):
113 |                         output = self.model.forward(x)
114 |                         x[:, steps + i, 0] = output[:, i, 1]
115 |                     output = self.model.forward(x)
116 | 
117 |                 elif self.cnf.all_params["model"] == "tf_transformer":
118 |                     output, _, _ = self.model.forward(x)
119 |                 else:
120 |                     raise NameError(self.cnf.all_params["model"])
121 | 
122 |                 output = output.squeeze()
123 |                 y, y_pred = sample['outputs'].squeeze().float().to(self.cnf.device), output
124 | 
125 |                 # Compute loss
126 |                 loss, _ = self.loss(y_pred, y)
127 |                 smape = symmetric_mean_absolute_percentage_error(output[:, :, 1].detach().cpu().numpy(),
128 |                                                                  sample['outputs'][:, :, 0].detach().cpu().numpy())
129 | 
130 |                 # De-Normalize to compute metrics
131 |                 target = unnormalize_tensor(self.data_formatter, y, sample['identifier'][0][0])
132 |                 p10_forecast = unnormalize_tensor(self.data_formatter, y_pred[..., 0], sample['identifier'][0][0])
133 |                 p50_forecast = unnormalize_tensor(self.data_formatter, y_pred[..., 1], sample['identifier'][0][0])
134 |                 p90_forecast = unnormalize_tensor(self.data_formatter, y_pred[..., 2], sample['identifier'][0][0])
135 | 
136 |                 # Compute metrics
137 |                 self.test_losses['p10'].append(self.loss.numpy_normalised_quantile_loss(p10_forecast, target, 0.1))
138 |                 self.test_losses['p50'].append(self.loss.numpy_normalised_quantile_loss(p50_forecast, target, 0.5))
139 |                 self.test_losses['p90'].append(self.loss.numpy_normalised_quantile_loss(p90_forecast, target, 0.9))
140 | 
141 |                 self.test_loss.append(loss.item())
142 |                 self.test_smape.append(smape)
143 | 
144 |             # Plot series prediction
145 | p1, p2, p3, target = np.expand_dims(p10_forecast, axis=-1), np.expand_dims(p50_forecast, axis=-1), \ 146 | np.expand_dims(p90_forecast, axis=-1), np.expand_dims(target, axis=-1) 147 | p = np.concatenate((p1, p2, p3), axis=-1) 148 | plot_temporal_serie(p, target) 149 | 150 | # Log stuff 151 | for k in self.test_losses.keys(): 152 | mean_test_loss = np.mean(self.test_losses[k]) 153 | print(f'\t● AVG {k} Loss on TEST-set: {mean_test_loss:.6f} │ T: {time() - t:.2f} s') 154 | 155 | # log log log 156 | mean_test_loss = np.mean(self.test_loss) 157 | mean_smape = np.mean(self.test_smape) 158 | print(f'\t● AVG Loss on TEST-set: {mean_test_loss:.6f} │ T: {time() - t:.2f} s') 159 | print(f'\t● AVG SMAPE on TEST-set: {mean_smape:.6f} │ T: {time() - t:.2f} s') 160 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # --------------------- 3 | 4 | import click 5 | import torch.backends.cudnn as cudnn 6 | 7 | from conf import Conf 8 | from trainer import Trainer 9 | from inference import TS 10 | 11 | cudnn.benchmark = True 12 | 13 | 14 | @click.command() 15 | @click.option('--exp_name', type=str, default=None) 16 | @click.option('--conf_file_path', type=str, default=None) 17 | @click.option('--seed', type=int, default=None) 18 | @click.option('--inference', type=bool, default=False) 19 | def main(exp_name, conf_file_path, seed, inference): 20 | # type: (str, str, int, bool) -> None 21 | 22 | # if `exp_name` is None, 23 | # ask the user to enter it 24 | if exp_name is None: 25 | exp_name = click.prompt('▶ experiment name', default='default') 26 | 27 | # if `exp_name` contains '!', 28 | # `log_each_step` becomes `False` 29 | log_each_step = True 30 | if '!' 
in exp_name: 31 | exp_name = exp_name.replace('!', '') 32 | log_each_step = False 33 | 34 | # if `exp_name` contains a '@' character, 35 | # the number following '@' is considered as 36 | # the desired random seed for the experiment 37 | split = exp_name.split('@') 38 | if len(split) == 2: 39 | seed = int(split[1]) 40 | exp_name = split[0] 41 | 42 | cnf = Conf(conf_file_path=conf_file_path, seed=seed, exp_name=exp_name, log=log_each_step) 43 | print(f'\n{cnf}') 44 | 45 | print(f'\n▶ Starting Experiment \'{exp_name}\' [seed: {cnf.seed}]') 46 | 47 | if inference: 48 | ts_model = TS(cnf=cnf) 49 | ts_model.test() 50 | else: 51 | trainer = Trainer(cnf=cnf) 52 | trainer.run() 53 | 54 | 55 | if __name__ == '__main__': 56 | main() 57 | -------------------------------------------------------------------------------- /models/temporal_fusion_t/__init__.py: -------------------------------------------------------------------------------- 1 | from models.temporal_fusion_t.base import BaseModel 2 | from models.temporal_fusion_t.tft_model import TFT -------------------------------------------------------------------------------- /models/temporal_fusion_t/add_and_norm.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | 4 | class AddAndNorm(nn.Module): 5 | def __init__(self, hidden_layer_size): 6 | super(AddAndNorm, self).__init__() 7 | 8 | self.normalize = nn.LayerNorm(hidden_layer_size) 9 | 10 | def forward(self, x1, x2): 11 | x = torch.add(x1, x2) 12 | return self.normalize(x) 13 | -------------------------------------------------------------------------------- /models/temporal_fusion_t/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # --------------------- 3 | 4 | from abc import ABCMeta 5 | from abc import abstractmethod 6 | from typing import Union 7 | 8 | import torch 9 | from path import Path 10 | from torch import nn 11 | 12 | 13 | class BaseModel(nn.Module, metaclass=ABCMeta): 14 | 15 | def __init__(self): 16 | super().__init__() 17 | 18 | 19 | def kaiming_init(self, activation): 20 | # type: (str) -> () 21 | """ 22 | Apply "Kaiming-Normal" initialization to all Conv2D(s) of the model. 23 | :param activation: activation function after conv; values in {'relu', 'leaky_relu'} 24 | :return: 25 | """ 26 | assert activation in ['ReLU', 'LeakyReLU', 'leaky_relu'], \ 27 | '`activation` must be \'ReLU\' or \'LeakyReLU\'' 28 | 29 | if activation == 'LeakyReLU': 30 | activation = 'leaky_relu' 31 | activation = activation.lower() 32 | 33 | for m in self.modules(): 34 | if isinstance(m, nn.Conv2d): 35 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity=activation) 36 | if m.bias is not None: 37 | nn.init.constant_(m.bias, 0) 38 | 39 | 40 | @abstractmethod 41 | def forward(self, x): 42 | # type: (torch.Tensor) -> torch.Tensor 43 | """ 44 | Defines the computation performed at every call. 45 | Should be overridden by all subclasses. 46 | """ 47 | ... 48 | 49 | 50 | @property 51 | def n_param(self): 52 | # type: (BaseModel) -> int 53 | """ 54 | :return: number of parameters 55 | """ 56 | return sum(p.numel() for p in self.parameters() if p.requires_grad) 57 | 58 | 59 | @property 60 | def current_device(self): 61 | # type: () -> str 62 | """ 63 | :return: string that represents the device on which the model is currently located 64 | >> e.g.: 'cpu', 'cuda', 'cuda:0', 'cuda:1', ... 
65 | """ 66 | return str(next(self.parameters()).device) 67 | 68 | 69 | @property 70 | def is_cuda(self): 71 | # type: () -> bool 72 | """ 73 | :return: `True` if the model is on Cuda; `False` otherwise 74 | """ 75 | return 'cuda' in self.current_device 76 | 77 | 78 | def save_w(self, path): 79 | # type: (Union[str, Path]) -> None 80 | """ 81 | save model weights in the specified path 82 | """ 83 | torch.save(self.state_dict(), path) 84 | 85 | 86 | def load_w(self, path): 87 | # type: (Union[str, Path]) -> None 88 | """ 89 | load model weights from the specified path 90 | """ 91 | self.load_state_dict(torch.load(path)) 92 | 93 | 94 | def requires_grad(self, flag): 95 | # type: (bool) -> None 96 | """ 97 | :param flag: True if the model requires gradient, False otherwise 98 | """ 99 | for p in self.parameters(): 100 | p.requires_grad = flag 101 | -------------------------------------------------------------------------------- /models/temporal_fusion_t/gated_linear_unit.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from models.temporal_fusion_t.linear_layer import LinearLayer 4 | 5 | class GLU(nn.Module): 6 | #Gated Linear Unit 7 | def __init__(self, 8 | input_size, 9 | hidden_layer_size, 10 | dropout_rate=None, 11 | use_time_distributed=True, 12 | batch_first=False 13 | ): 14 | super(GLU, self).__init__() 15 | self.hidden_layer_size = hidden_layer_size 16 | self.dropout_rate = dropout_rate 17 | self.use_time_distributed = use_time_distributed 18 | 19 | if dropout_rate is not None: 20 | self.dropout = nn.Dropout(self.dropout_rate) 21 | 22 | self.activation_layer = LinearLayer(input_size, hidden_layer_size, use_time_distributed, batch_first) 23 | self.gated_layer = LinearLayer(input_size, hidden_layer_size, use_time_distributed, batch_first) 24 | 25 | self.sigmoid = nn.Sigmoid() 26 | 27 | def forward(self, x): 28 | if self.dropout_rate is not None: 29 | x = self.dropout(x) 30 | 31 | activation = self.activation_layer(x) 32 | gated = self.sigmoid(self.gated_layer(x)) 33 | 34 | return torch.mul(activation, gated), gated 35 | -------------------------------------------------------------------------------- /models/temporal_fusion_t/gated_residual_network.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import math 3 | import torch 4 | from models.temporal_fusion_t.linear_layer import LinearLayer 5 | from models.temporal_fusion_t.add_and_norm import AddAndNorm 6 | from models.temporal_fusion_t.gated_linear_unit import GLU 7 | 8 | class GatedResidualNetwork(nn.Module): 9 | def __init__(self, 10 | input_size, 11 | hidden_layer_size, 12 | output_size=None, 13 | dropout_rate=None, 14 | use_time_distributed=True, 15 | return_gate=False, 16 | batch_first=False 17 | ): 18 | 19 | super(GatedResidualNetwork, self).__init__() 20 | if output_size is None: 21 | output = hidden_layer_size 22 | else: 23 | output = output_size 24 | 25 | self.output = output 26 | self.input_size = input_size 27 | self.output_size = output_size 28 | self.hidden_layer_size = hidden_layer_size 29 | self.return_gate = return_gate 30 | 31 | self.linear_layer = LinearLayer(input_size, output, use_time_distributed, batch_first) 32 | 33 | self.hidden_linear_layer1 = LinearLayer(input_size, hidden_layer_size, use_time_distributed, batch_first) 34 | self.hidden_context_layer = LinearLayer(hidden_layer_size, hidden_layer_size, use_time_distributed, batch_first) 35 | self.hidden_linear_layer2 = 
LinearLayer(hidden_layer_size, hidden_layer_size, use_time_distributed, batch_first)
36 | 
37 |         self.elu1 = nn.ELU()
38 |         self.glu = GLU(hidden_layer_size, output, dropout_rate, use_time_distributed, batch_first)
39 |         self.add_and_norm = AddAndNorm(hidden_layer_size=output)
40 | 
41 |     def forward(self, x, context=None):
42 |         # Setup skip connection
43 |         if self.output_size is None:
44 |             skip = x
45 |         else:
46 |             skip = self.linear_layer(x)
47 | 
48 |         # Apply feedforward network
49 |         hidden = self.hidden_linear_layer1(x)
50 |         if context is not None:
51 |             hidden = hidden + self.hidden_context_layer(context)
52 |         hidden = self.elu1(hidden)
53 |         hidden = self.hidden_linear_layer2(hidden)
54 | 
55 |         gating_layer, gate = self.glu(hidden)
56 |         if self.return_gate:
57 |             return self.add_and_norm(skip, gating_layer), gate
58 |         else:
59 |             return self.add_and_norm(skip, gating_layer)
-------------------------------------------------------------------------------- /models/temporal_fusion_t/interpretable_multi_head_attention.py: --------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from models.temporal_fusion_t.scaled_dot_product_attention import ScaledDotProductAttention
4 | 
5 | class InterpretableMultiHeadAttention(nn.Module):
6 |     """Defines interpretable multi-head attention layer.
7 | 
8 |     Attributes:
9 |       n_head: Number of heads
10 |       d_k: Key/query dimensionality per head
11 |       d_v: Value dimensionality
12 |       dropout: Dropout rate to apply
13 |       qs_layers: List of queries across heads
14 |       ks_layers: List of keys across heads
15 |       vs_layers: List of values across heads
16 |       attention: Scaled dot product attention layer
17 |       w_o: Output weight matrix to project internal state to the original TFT
18 |         state size
19 |     """
20 | 
21 |     def __init__(self, n_head, d_model, dropout_rate):
22 |         """Initialises layer.
23 | 
24 |         Args:
25 |           n_head: Number of heads
26 |           d_model: TFT state dimensionality
27 |           dropout_rate: Dropout discard rate
28 |         """
29 |         super(InterpretableMultiHeadAttention, self).__init__()
30 | 
31 |         self.n_head = n_head
32 |         self.d_k = self.d_v = d_k = d_v = d_model // n_head
33 |         self.dropout = nn.Dropout(dropout_rate)
34 | 
35 |         self.qs_layers = nn.ModuleList()
36 |         self.ks_layers = nn.ModuleList()
37 |         self.vs_layers = nn.ModuleList()
38 | 
39 |         # Use the same value layer across heads to facilitate interpretability;
40 |         vs_layer = nn.Linear(d_model, d_v, bias=False)
41 |         # each head gets its own query and key projections (the reference TF
42 |         # implementation likewise creates a fresh layer per head for q and k).
43 | 
44 |         for _ in range(n_head):
45 |             self.qs_layers.append(nn.Linear(d_model, d_k, bias=False))
46 |             self.ks_layers.append(nn.Linear(d_model, d_k, bias=False))
47 |             self.vs_layers.append(vs_layer)  # use same vs_layer
48 | 
49 |         self.attention = ScaledDotProductAttention()
50 |         self.w_o = nn.Linear(self.d_k, d_model, bias=False)
51 | 
52 |     def forward(self, q, k, v, mask=None):
53 |         """Applies interpretable multihead attention.
54 | 
55 |         Using T to denote the number of time steps fed into the transformer.
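        For example, with ``d_model = 160`` and ``n_head = 4`` each head
        projects queries and keys down to ``d_k = d_v = 160 // 4 = 40``, and
        the per-head outputs are averaged (rather than concatenated) before
        the final ``w_o`` projection.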
56 | 
57 |         Args:
58 |           q: Query tensor of shape=(?, T, d_model)
59 |           k: Key of shape=(?, T, d_model)
60 |           v: Values of shape=(?, T, d_model)
61 |           mask: Masking if required with shape=(?, T, T)
62 | 
63 |         Returns:
64 |           Tuple of (layer outputs, attention weights)
65 |         """
66 |         n_head = self.n_head
67 |         heads = []
68 |         attns = []
69 |         for i in range(n_head):
70 |             qs = self.qs_layers[i](q)
71 |             ks = self.ks_layers[i](k)
72 |             vs = self.vs_layers[i](v)
73 |             head, attn = self.attention(qs, ks, vs, mask)
74 | 
75 |             head_dropout = self.dropout(head)
76 |             heads.append(head_dropout)
77 |             attns.append(attn)
78 |         head = torch.stack(heads) if n_head > 1 else heads[0]
79 |         attn = torch.stack(attns)
80 | 
81 |         outputs = torch.mean(head, axis=0) if n_head > 1 else head
82 |         outputs = self.w_o(outputs)
83 |         outputs = self.dropout(outputs)  # output dropout
84 | 
85 |         return outputs, attn
86 | 
-------------------------------------------------------------------------------- /models/temporal_fusion_t/linear_layer.py: --------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from models.temporal_fusion_t.time_distributed import TimeDistributed
4 | 
5 | class LinearLayer(nn.Module):
6 |     def __init__(self,
7 |                  input_size,
8 |                  size,
9 |                  use_time_distributed=True,
10 |                  batch_first=False):
11 |         super(LinearLayer, self).__init__()
12 | 
13 |         self.use_time_distributed = use_time_distributed
14 |         self.input_size = input_size
15 |         self.size = size
16 |         if use_time_distributed:
17 |             self.layer = TimeDistributed(nn.Linear(input_size, size), batch_first=batch_first)
18 |         else:
19 |             self.layer = nn.Linear(input_size, size)
20 | 
21 |     def forward(self, x):
22 |         return self.layer(x)
23 | 
-------------------------------------------------------------------------------- /models/temporal_fusion_t/lstm_combine_and_mask.py: --------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from models.temporal_fusion_t.gated_residual_network import GatedResidualNetwork
4 | 
5 | 
6 | class LSTMCombineAndMask(nn.Module):
7 |     def __init__(self, input_size, num_inputs, hidden_layer_size, dropout_rate, use_time_distributed=False, batch_first=True):
8 |         super(LSTMCombineAndMask, self).__init__()
9 | 
10 |         self.hidden_layer_size = hidden_layer_size
11 |         self.input_size = input_size
12 |         self.num_inputs = num_inputs
13 |         self.dropout_rate = dropout_rate
14 | 
15 |         self.flattened_grn = GatedResidualNetwork(self.num_inputs*self.hidden_layer_size, self.hidden_layer_size, self.num_inputs, self.dropout_rate, use_time_distributed=use_time_distributed, return_gate=True, batch_first=batch_first)
16 | 
17 |         self.single_variable_grns = nn.ModuleList()
18 |         for i in range(self.num_inputs):
19 |             self.single_variable_grns.append(GatedResidualNetwork(self.hidden_layer_size, self.hidden_layer_size, None, self.dropout_rate, use_time_distributed=use_time_distributed, return_gate=False, batch_first=batch_first))
20 | 
21 |         self.softmax = nn.Softmax(dim=2)
22 | 
23 |     def forward(self, embedding, additional_context=None):
24 |         # Add temporal features
25 |         _, time_steps, embedding_dim, num_inputs = list(embedding.shape)
26 | 
27 |         flattened_embedding = torch.reshape(embedding,
28 |                                             [-1, time_steps, embedding_dim * num_inputs])
29 | 
30 |         # Only unsqueeze the static context when one is actually provided;
31 |         # flattened_grn was built with return_gate=True, so it returns a gate in both branches.
32 |         if additional_context is not None:
33 |             expanded_static_context = additional_context.unsqueeze(1)
34 |             sparse_weights, static_gate = self.flattened_grn(flattened_embedding, expanded_static_context)
35 |         else:
36 |             sparse_weights, static_gate = self.flattened_grn(flattened_embedding)
37 | 
38 |         sparse_weights = self.softmax(sparse_weights).unsqueeze(2)
39 | 
40 |         trans_emb_list = []
41 |         for i in range(self.num_inputs):
42 |             # select slice of embedding belonging to a single input
43 |             trans_emb_list.append(
44 |                 self.single_variable_grns[i](embedding[Ellipsis, i])
45 |             )
46 | 
47 |         transformed_embedding = torch.stack(trans_emb_list, dim=-1)
48 | 
49 |         combined = transformed_embedding*sparse_weights
50 | 
51 |         temporal_ctx = combined.sum(dim=-1)
52 | 
53 |         return temporal_ctx, sparse_weights, static_gate
54 | 
-------------------------------------------------------------------------------- /models/temporal_fusion_t/scaled_dot_product_attention.py: --------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | 
4 | class ScaledDotProductAttention(nn.Module):
5 |     """Defines scaled dot product attention layer.
6 | 
7 |     Attributes:
8 |       dropout: Dropout rate to use
9 |       activation: Normalisation function for scaled dot product attention (e.g.
10 |         softmax by default)
11 |     """
12 | 
13 |     def __init__(self, attn_dropout=0.0):
14 |         super(ScaledDotProductAttention, self).__init__()
15 | 
16 |         self.dropout = nn.Dropout(attn_dropout)
17 |         self.activation = nn.Softmax(dim=-1)
18 |         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
19 | 
20 |     def forward(self, q, k, v, mask):
21 |         """Applies scaled dot product attention.
22 | 
23 |         Args:
24 |           q: Queries
25 |           k: Keys
26 |           v: Values
27 |           mask: Optional 0/1 mask whose ones mark the positions that may be attended to
28 | 
29 |         Returns:
30 |           Tuple of (layer outputs, attention weights)
31 |         """
32 |         attn = torch.bmm(q, k.permute(0, 2, 1))  # shape=(batch, q, k)
33 |         if mask is not None:
34 |             # get_decoder_mask() marks *visible* positions with ones, so block the
35 |             # complement (the future) with a large negative value before the softmax
36 |             attn = attn.masked_fill(~mask.bool().to(attn.device), -1e9)
37 | 
38 |         attn = self.activation(attn)
39 |         attn = self.dropout(attn)
40 |         output = torch.bmm(attn, v)
41 |         return output, attn
42 | 
-------------------------------------------------------------------------------- /models/temporal_fusion_t/static_combine_and_mask.py: --------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from models.temporal_fusion_t.gated_residual_network import GatedResidualNetwork
4 | 
5 | class StaticCombineAndMask(nn.Module):
6 |     def __init__(self, input_size, num_static, hidden_layer_size, dropout_rate, additional_context=None, use_time_distributed=False, batch_first=True):
7 |         super(StaticCombineAndMask, self).__init__()
8 | 
9 |         self.hidden_layer_size = hidden_layer_size
10 |         self.input_size = input_size
11 |         self.num_static = num_static
12 |         self.dropout_rate = dropout_rate
13 |         self.additional_context = additional_context
14 | 
15 |         if self.additional_context is not None:
16 |             self.flattened_grn = GatedResidualNetwork(self.num_static*self.hidden_layer_size, self.hidden_layer_size, self.num_static, self.dropout_rate, use_time_distributed=False, return_gate=False, batch_first=batch_first)
17 |         else:
18 |             self.flattened_grn = GatedResidualNetwork(self.num_static*self.hidden_layer_size, self.hidden_layer_size, self.num_static, self.dropout_rate, use_time_distributed=False, return_gate=False, batch_first=batch_first)
19 | 
20 | 
21 |         self.single_variable_grns = nn.ModuleList()
22 |         for i in range(self.num_static):
23 |             self.single_variable_grns.append(GatedResidualNetwork(self.hidden_layer_size, self.hidden_layer_size, None, self.dropout_rate, use_time_distributed=False, return_gate=False, 
batch_first=batch_first)) 24 | 25 | self.softmax = nn.Softmax(dim=1) 26 | 27 | def forward(self, embedding, additional_context=None): 28 | # Add temporal features 29 | _, num_static, _ = list(embedding.shape) 30 | flattened_embedding = torch.flatten(embedding, start_dim=1) 31 | if additional_context is not None: 32 | sparse_weights = self.flattened_grn(flattened_embedding, additional_context) 33 | else: 34 | sparse_weights = self.flattened_grn(flattened_embedding) 35 | 36 | sparse_weights = self.softmax(sparse_weights).unsqueeze(2) 37 | 38 | trans_emb_list = [] 39 | for i in range(self.num_static): 40 | ##select slice of embedding belonging to a single input 41 | trans_emb_list.append( 42 | self.single_variable_grns[i](torch.flatten(embedding[:, i:i + 1, :], start_dim=1)) 43 | ) 44 | 45 | transformed_embedding = torch.stack(trans_emb_list, dim=1) 46 | 47 | combined = transformed_embedding*sparse_weights 48 | 49 | static_vec = combined.sum(dim=1) 50 | 51 | return static_vec, sparse_weights 52 | -------------------------------------------------------------------------------- /models/temporal_fusion_t/tft_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Temporal Fusion Transformers: https://arxiv.org/abs/1912.09363 3 | """ 4 | 5 | import math 6 | import torch 7 | import ipdb 8 | import json 9 | from torch import nn 10 | from models.temporal_fusion_t.base import BaseModel 11 | from models.temporal_fusion_t.add_and_norm import AddAndNorm 12 | from models.temporal_fusion_t.gated_residual_network import GatedResidualNetwork 13 | from models.temporal_fusion_t.gated_linear_unit import GLU 14 | from models.temporal_fusion_t.linear_layer import LinearLayer 15 | from models.temporal_fusion_t.lstm_combine_and_mask import LSTMCombineAndMask 16 | from models.temporal_fusion_t.static_combine_and_mask import StaticCombineAndMask 17 | from models.temporal_fusion_t.time_distributed import TimeDistributed 18 | from models.temporal_fusion_t.interpretable_multi_head_attention import InterpretableMultiHeadAttention 19 | 20 | 21 | class TFT(BaseModel): 22 | def __init__(self, raw_params): 23 | super(TFT, self).__init__() 24 | 25 | params = dict(raw_params) # copy locally 26 | print(params) 27 | 28 | # Data parameters 29 | self.time_steps = int(params['total_time_steps']) 30 | self.input_size = int(params['input_size']) 31 | self.output_size = int(params['output_size']) 32 | self.category_counts = json.loads(str(params['category_counts'])) 33 | self.n_multiprocessing_workers = int(params['n_workers']) 34 | 35 | # Relevant indices for TFT 36 | self._input_obs_loc = json.loads(str(params['input_obs_loc'])) 37 | self._static_input_loc = json.loads(str(params['static_input_loc'])) 38 | self._known_regular_input_idx = json.loads( 39 | str(params['known_regular_inputs'])) 40 | self._known_categorical_input_idx = json.loads( 41 | str(params['known_categorical_inputs'])) 42 | 43 | # Network params 44 | self.quantiles = list(params['quantiles']) 45 | self.device = str(params['device']) 46 | self.hidden_layer_size = int(params['hidden_layer_size']) 47 | self.dropout_rate = float(params['dropout_rate']) 48 | self.max_gradient_norm = float(params['max_gradient_norm']) 49 | self.learning_rate = float(params['lr']) 50 | self.minibatch_size = int(params['batch_size']) 51 | self.num_epochs = int(params['num_epochs']) 52 | self.early_stopping_patience = int(params['early_stopping_patience']) 53 | 54 | self.num_encoder_steps = int(params['num_encoder_steps']) 
55 | self.num_stacks = int(params['stack_size']) 56 | self.num_heads = int(params['num_heads']) 57 | self.batch_first = True 58 | self.num_static = len(self._static_input_loc) 59 | self.num_inputs = len(self._known_regular_input_idx) + self.output_size 60 | self.num_inputs_decoder = len(self._known_regular_input_idx) 61 | 62 | # Serialisation options 63 | # self._temp_folder = os.path.join(params['model_folder'], 'tmp') 64 | # self.reset_temp_folder() 65 | 66 | # Extra components to store Tensorflow nodes for attention computations 67 | self._input_placeholder = None 68 | self._attention_components = None 69 | self._prediction_parts = None 70 | 71 | # print('*** params ***') 72 | # for k in params: 73 | # print('# {} = {}'.format(k, params[k])) 74 | 75 | ####### 76 | time_steps = self.time_steps 77 | num_categorical_variables = len(self.category_counts) 78 | num_regular_variables = self.input_size - num_categorical_variables 79 | 80 | embedding_sizes = [ 81 | self.hidden_layer_size for i, size in enumerate(self.category_counts) 82 | ] 83 | 84 | print("num_categorical_variables") 85 | print(num_categorical_variables) 86 | self.embeddings = nn.ModuleList() 87 | for i in range(num_categorical_variables): 88 | embedding = nn.Embedding(self.category_counts[i], embedding_sizes[i]) 89 | self.embeddings.append(embedding) 90 | 91 | self.static_input_layer = nn.Linear(self.hidden_layer_size, self.hidden_layer_size) 92 | self.time_varying_embedding_layer = LinearLayer(input_size=1, size=self.hidden_layer_size, use_time_distributed=True, batch_first=self.batch_first) 93 | 94 | self.static_combine_and_mask = StaticCombineAndMask( 95 | input_size=self.input_size, 96 | num_static=self.num_static, 97 | hidden_layer_size=self.hidden_layer_size, 98 | dropout_rate=self.dropout_rate, 99 | additional_context=None, 100 | use_time_distributed=False, 101 | batch_first=self.batch_first) 102 | self.static_context_variable_selection_grn = GatedResidualNetwork( 103 | input_size=self.hidden_layer_size, 104 | hidden_layer_size=self.hidden_layer_size, 105 | output_size=None, 106 | dropout_rate=self.dropout_rate, 107 | use_time_distributed=False, 108 | return_gate=False, 109 | batch_first=self.batch_first) 110 | self.static_context_enrichment_grn = GatedResidualNetwork( 111 | input_size=self.hidden_layer_size, 112 | hidden_layer_size=self.hidden_layer_size, 113 | output_size=None, 114 | dropout_rate=self.dropout_rate, 115 | use_time_distributed=False, 116 | return_gate=False, 117 | batch_first=self.batch_first) 118 | self.static_context_state_h_grn = GatedResidualNetwork( 119 | input_size=self.hidden_layer_size, 120 | hidden_layer_size=self.hidden_layer_size, 121 | output_size=None, 122 | dropout_rate=self.dropout_rate, 123 | use_time_distributed=False, 124 | return_gate=False, 125 | batch_first=self.batch_first) 126 | self.static_context_state_c_grn = GatedResidualNetwork( 127 | input_size=self.hidden_layer_size, 128 | hidden_layer_size=self.hidden_layer_size, 129 | output_size=None, 130 | dropout_rate=self.dropout_rate, 131 | use_time_distributed=False, 132 | return_gate=False, 133 | batch_first=self.batch_first) 134 | self.historical_lstm_combine_and_mask = LSTMCombineAndMask( 135 | input_size=self.num_encoder_steps, 136 | num_inputs=self.num_inputs, 137 | hidden_layer_size=self.hidden_layer_size, 138 | dropout_rate=self.dropout_rate, 139 | use_time_distributed=True, 140 | batch_first=self.batch_first) 141 | self.future_lstm_combine_and_mask = LSTMCombineAndMask( 142 | input_size=self.num_encoder_steps, 143 | 
num_inputs=self.num_inputs_decoder, 144 | hidden_layer_size=self.hidden_layer_size, 145 | dropout_rate=self.dropout_rate, 146 | use_time_distributed=True, 147 | batch_first=self.batch_first) 148 | 149 | self.lstm_encoder = nn.LSTM(input_size=self.hidden_layer_size, hidden_size=self.hidden_layer_size, batch_first=self.batch_first) 150 | self.lstm_decoder = nn.LSTM(input_size=self.hidden_layer_size, hidden_size=self.hidden_layer_size, batch_first=self.batch_first) 151 | 152 | self.lstm_glu = GLU( 153 | input_size=self.hidden_layer_size, 154 | hidden_layer_size=self.hidden_layer_size, 155 | dropout_rate=self.dropout_rate, 156 | use_time_distributed=True, 157 | batch_first=self.batch_first) 158 | self.lstm_glu_add_and_norm = AddAndNorm(hidden_layer_size=self.hidden_layer_size) 159 | 160 | self.static_enrichment_grn = GatedResidualNetwork( 161 | input_size=self.hidden_layer_size, 162 | hidden_layer_size=self.hidden_layer_size, 163 | output_size=None, 164 | dropout_rate=self.dropout_rate, 165 | use_time_distributed=True, 166 | return_gate=True, 167 | batch_first=self.batch_first) 168 | 169 | self.self_attn_layer = InterpretableMultiHeadAttention(self.num_heads, self.hidden_layer_size, dropout_rate=self.dropout_rate) 170 | 171 | self.self_attention_glu = GLU( 172 | input_size=self.hidden_layer_size, 173 | hidden_layer_size=self.hidden_layer_size, 174 | dropout_rate=self.dropout_rate, 175 | use_time_distributed=True, 176 | batch_first=self.batch_first) 177 | self.self_attention_glu_add_and_norm = AddAndNorm(hidden_layer_size=self.hidden_layer_size) 178 | 179 | self.decoder_grn = GatedResidualNetwork( 180 | input_size=self.hidden_layer_size, 181 | hidden_layer_size=self.hidden_layer_size, 182 | output_size=None, 183 | dropout_rate=self.dropout_rate, 184 | use_time_distributed=True, 185 | return_gate=False, 186 | batch_first=self.batch_first) 187 | 188 | self.final_glu = GLU( 189 | input_size=self.hidden_layer_size, 190 | hidden_layer_size=self.hidden_layer_size, 191 | dropout_rate=self.dropout_rate, 192 | use_time_distributed=True, 193 | batch_first=self.batch_first) 194 | self.final_glu_add_and_norm = AddAndNorm(hidden_layer_size=self.hidden_layer_size) 195 | 196 | self.output_layer = LinearLayer( 197 | input_size=self.hidden_layer_size, 198 | size=self.output_size * len(self.quantiles), 199 | use_time_distributed=True, 200 | batch_first=self.batch_first) 201 | 202 | def get_decoder_mask(self, self_attn_inputs): 203 | """Returns causal mask to apply for self-attention layer. 
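        The row-wise cumulative sum of an identity matrix gives a
        lower-triangular matrix of ones, e.g. for ``len_s = 3``:

            [[1, 0, 0],
             [1, 1, 0],
             [1, 1, 1]]

        so ones mark the visible positions: step ``i`` may attend to steps ``j <= i``.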
204 | 205 | Args: 206 | self_attn_inputs: Inputs to self attention layer to determine mask shape 207 | """ 208 | len_s = self_attn_inputs.shape[1] # 192 209 | bs = self_attn_inputs.shape[:1][0] # [64] 210 | # create batch_size identity matrices 211 | mask = torch.cumsum(torch.eye(len_s).reshape((1, len_s, len_s)).repeat(bs, 1, 1), 1) 212 | return mask 213 | 214 | def get_tft_embeddings(self, all_inputs): 215 | time_steps = self.time_steps 216 | 217 | num_categorical_variables = len(self.category_counts) 218 | num_regular_variables = self.input_size - num_categorical_variables 219 | 220 | embedding_sizes = [ 221 | self.hidden_layer_size for i, size in enumerate(self.category_counts) 222 | ] 223 | 224 | regular_inputs, categorical_inputs \ 225 | = all_inputs[:, :, :num_regular_variables], \ 226 | all_inputs[:, :, num_regular_variables:] 227 | 228 | embedded_inputs = [ 229 | self.embeddings[i](categorical_inputs[:,:, i].long()) 230 | for i in range(num_categorical_variables) 231 | ] 232 | 233 | # Static inputs 234 | if self._static_input_loc: 235 | static_inputs = [] 236 | for i in range(num_regular_variables): 237 | if i in self._static_input_loc: 238 | reg_i = self.static_input_layer(regular_inputs[:, 0, i:i + 1]) 239 | static_inputs.append(reg_i) 240 | 241 | emb_inputs = [] 242 | for i in range(num_categorical_variables): 243 | if i + num_regular_variables in self._static_input_loc: 244 | emb_inputs.append(embedded_inputs[i][:, 0, :]) 245 | 246 | static_inputs += emb_inputs 247 | static_inputs = torch.stack(static_inputs, dim=1) 248 | 249 | else: 250 | static_inputs = None 251 | 252 | # Targets 253 | obs_inputs = torch.stack([ 254 | self.time_varying_embedding_layer(regular_inputs[Ellipsis, i:i + 1].float()) 255 | for i in self._input_obs_loc 256 | ], dim=-1) 257 | 258 | 259 | # Observed (a prioir unknown) inputs 260 | wired_embeddings = [] 261 | for i in range(num_categorical_variables): 262 | if i not in self._known_categorical_input_idx and i not in self._input_obs_loc: 263 | e = self.embeddings[i](categorical_inputs[:, :, i]) 264 | wired_embeddings.append(e) 265 | 266 | unknown_inputs = [] 267 | for i in range(regular_inputs.shape[-1]): 268 | if i not in self._known_regular_input_idx and i not in self._input_obs_loc: 269 | e = self.time_varying_embedding_layer(regular_inputs[Ellipsis, i:i + 1]) 270 | unknown_inputs.append(e) 271 | 272 | if unknown_inputs + wired_embeddings: 273 | unknown_inputs = torch.stack(unknown_inputs + wired_embeddings, dim=-1) 274 | else: 275 | unknown_inputs = None 276 | 277 | # A priori known inputs 278 | known_regular_inputs = [] 279 | for i in self._known_regular_input_idx: 280 | if i not in self._static_input_loc: 281 | known_regular_inputs.append(self.time_varying_embedding_layer(regular_inputs[Ellipsis, i:i + 1].float())) 282 | 283 | known_categorical_inputs = [] 284 | for i in self._known_categorical_input_idx: 285 | if i + num_regular_variables not in self._static_input_loc: 286 | known_categorical_inputs.append(embedded_inputs[i]) 287 | 288 | known_combined_layer = torch.stack(known_regular_inputs + known_categorical_inputs, dim=-1) 289 | 290 | return unknown_inputs, known_combined_layer, obs_inputs, static_inputs 291 | 292 | def forward(self, x): 293 | # Size definitions. 
294 | time_steps = self.time_steps 295 | combined_input_size = self.input_size 296 | encoder_steps = self.num_encoder_steps 297 | all_inputs = x.to(self.device) 298 | 299 | unknown_inputs, known_combined_layer, obs_inputs, static_inputs \ 300 | = self.get_tft_embeddings(all_inputs) 301 | 302 | # Isolate known and observed historical inputs. 303 | if unknown_inputs is not None: 304 | historical_inputs = torch.cat([ 305 | unknown_inputs[:, :encoder_steps, :], 306 | known_combined_layer[:, :encoder_steps, :], 307 | obs_inputs[:, :encoder_steps, :] 308 | ], dim=-1) 309 | else: 310 | historical_inputs = torch.cat([ 311 | known_combined_layer[:, :encoder_steps, :], 312 | obs_inputs[:, :encoder_steps, :] 313 | ], dim=-1) 314 | 315 | # Isolate only known future inputs. 316 | future_inputs = known_combined_layer[:, encoder_steps:, :] 317 | 318 | static_encoder, static_weights = self.static_combine_and_mask(static_inputs) 319 | static_context_variable_selection = self.static_context_variable_selection_grn(static_encoder) 320 | static_context_enrichment = self.static_context_enrichment_grn(static_encoder) 321 | static_context_state_h = self.static_context_state_h_grn(static_encoder) 322 | static_context_state_c = self.static_context_state_c_grn(static_encoder) 323 | historical_features, historical_flags, _ = self.historical_lstm_combine_and_mask(historical_inputs, static_context_variable_selection) 324 | future_features, future_flags, _ = self.future_lstm_combine_and_mask(future_inputs, static_context_variable_selection) 325 | 326 | history_lstm, (state_h, state_c) = self.lstm_encoder(historical_features, (static_context_state_h.unsqueeze(0), static_context_state_c.unsqueeze(0))) 327 | future_lstm, _ = self.lstm_decoder(future_features, (state_h, state_c)) 328 | 329 | lstm_layer = torch.cat([history_lstm, future_lstm], dim=1) 330 | # Apply gated skip connection 331 | input_embeddings = torch.cat([historical_features, future_features], dim=1) 332 | 333 | lstm_layer, _ = self.lstm_glu(lstm_layer) 334 | temporal_feature_layer = self.lstm_glu_add_and_norm(lstm_layer, input_embeddings) 335 | 336 | # Static enrichment layers 337 | expanded_static_context = static_context_enrichment.unsqueeze(1) 338 | enriched, _ = self.static_enrichment_grn(temporal_feature_layer, expanded_static_context) 339 | 340 | # Decoder self attention 341 | mask = self.get_decoder_mask(enriched) 342 | x, self_att = self.self_attn_layer(enriched, enriched, enriched, mask)#, attn_mask=mask.repeat(self.num_heads, 1, 1)) 343 | 344 | x, _ = self.self_attention_glu(x) 345 | x = self.self_attention_glu_add_and_norm(x, enriched) 346 | 347 | # Nonlinear processing on outputs 348 | decoder = self.decoder_grn(x) 349 | # Final skip connection 350 | decoder, _ = self.final_glu(decoder) 351 | transformer_layer = self.final_glu_add_and_norm(decoder, temporal_feature_layer) 352 | # Attention components for explainability 353 | attention_components = { 354 | # Temporal attention weights 355 | 'decoder_self_attn': self_att, 356 | # Static variable selection weights 357 | 'static_flags': static_weights[Ellipsis, 0], 358 | # Variable selection weights of past inputs 359 | 'historical_flags': historical_flags[Ellipsis, 0, :], 360 | # Variable selection weights of future inputs 361 | 'future_flags': future_flags[Ellipsis, 0, :] 362 | } 363 | 364 | outputs = self.output_layer(transformer_layer[:, self.num_encoder_steps:, :]) 365 | return outputs, all_inputs, attention_components 366 | 
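Editor's note: the snippet below is a usage sketch, not part of the repository. It smoke-tests TFT.forward with random data; every parameter value is an illustrative assumption (the shapes mirror the electricity setup hinted at in dataset/ts_dataset.py: 192 total steps, 168 encoder steps, 5 input columns, one categorical id column). In the real pipeline this dict is assembled by Conf from the yaml files in conf/.

    import torch
    from models.temporal_fusion_t.tft_model import TFT

    # All values below are assumptions for illustration only.
    params = {
        'total_time_steps': 192, 'num_encoder_steps': 168,
        'input_size': 5, 'output_size': 1,
        'category_counts': '[369]',            # one categorical column with 369 ids
        'input_obs_loc': '[0]',                # target (power_usage) column index
        'static_input_loc': '[4]',             # the categorical id acts as static input
        'known_regular_inputs': '[1, 2, 3]',   # hour, day_of_week, hours_from_start
        'known_categorical_inputs': '[0]',
        'quantiles': [0.1, 0.5, 0.9], 'device': 'cpu',
        'hidden_layer_size': 160, 'dropout_rate': 0.1,
        'max_gradient_norm': 0.01, 'lr': 0.001, 'batch_size': 64,
        'num_epochs': 1, 'early_stopping_patience': 5,
        'stack_size': 1, 'num_heads': 4, 'n_workers': 1,
    }

    model = TFT(params)
    x = torch.rand(64, 192, 5)                             # (batch, time, input_size)
    x[..., 4] = torch.randint(0, 369, (64, 192)).float()   # categorical ids in the last column
    outputs, all_inputs, attention_components = model(x)
    print(outputs.shape)  # expected (64, 24, 3): (batch, decoder steps, quantiles)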
-------------------------------------------------------------------------------- /models/temporal_fusion_t/time_distributed.py: --------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | 
4 | class TimeDistributed(nn.Module):
5 |     ## Takes any module and stacks the time dimension with the batch dimension of its inputs before applying the module
6 |     ## From: https://discuss.pytorch.org/t/any-pytorch-function-can-work-as-keras-timedistributed/1346/4
7 |     def __init__(self, module, batch_first=False):
8 |         super(TimeDistributed, self).__init__()
9 |         self.module = module
10 |         self.batch_first = batch_first
11 | 
12 |     def forward(self, x):
13 | 
14 |         if len(x.size()) <= 2:
15 |             return self.module(x)
16 | 
17 |         # Squash samples and timesteps into a single axis
18 |         x_reshape = x.contiguous().view(-1, x.size(-1))  # (samples * timesteps, input_size)
19 | 
20 |         if x_reshape.dtype != torch.float32:
21 |             x_reshape = x_reshape.float()
22 | 
23 |         y = self.module(x_reshape)
24 | 
25 |         # We have to reshape Y
26 |         if self.batch_first:
27 |             y = y.contiguous().view(x.size(0), -1, y.size(-1))  # (samples, timesteps, output_size)
28 |         else:
29 |             y = y.view(-1, x.size(1), y.size(-1))  # (timesteps, samples, output_size)
30 | 
31 |         return y
32 | 
-------------------------------------------------------------------------------- /models/transformer/__init__.py: --------------------------------------------------------------------------------
1 | from models.transformer.transformer import Transformer
2 | 
-------------------------------------------------------------------------------- /models/transformer/decoder.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | 
6 | from models.transformer.multiHeadAttention import MultiHeadAttention, MultiHeadAttentionChunk, MultiHeadAttentionWindow
7 | from models.transformer.positionwiseFeedForward import PositionwiseFeedForward
8 | 
9 | 
10 | class Decoder(nn.Module):
11 |     """Decoder block from Attention is All You Need.
12 | 
13 |     Apply two Multi Head Attention blocks followed by a Point-wise Feed Forward block.
14 |     Residual sum and normalization are applied at each step.
15 | 
16 |     Parameters
17 |     ----------
18 |     d_model:
19 |         Dimension of the input vector.
20 |     q:
21 |         Dimension of all query matrices.
22 |     v:
23 |         Dimension of all value matrices.
24 |     h:
25 |         Number of heads.
26 |     attention_size:
27 |         Number of backward elements to apply attention.
28 |         Deactivated if ``None``. Default is ``None``.
29 |     dropout:
30 |         Dropout probability after each MHA or PFF block.
31 |         Default is ``0.3``.
32 |     chunk_mode:
33 |         Switch between different MultiHeadAttention blocks.
34 |         One of ``'chunk'``, ``'window'`` or ``None``. Default is ``'chunk'``.
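    Example (illustrative sketch; shapes and hyper-parameters are assumed,
    following the constructor arguments listed above):

        decoder = Decoder(d_model=64, q=8, v=8, h=4, chunk_mode=None)
        x = torch.rand(16, 100, 64)       # (batch_size, K, d_model)
        memory = torch.rand(16, 100, 64)  # encoder output
        y = decoder(x, memory)            # (16, 100, 64)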
35 |     """
36 | 
37 |     def __init__(self,
38 |                  d_model: int,
39 |                  q: int,
40 |                  v: int,
41 |                  h: int,
42 |                  attention_size: int = None,
43 |                  dropout: float = 0.3,
44 |                  chunk_mode: str = 'chunk'):
45 |         """Initialize the Decoder block"""
46 |         super().__init__()
47 | 
48 |         chunk_mode_modules = {
49 |             'chunk': MultiHeadAttentionChunk,
50 |             'window': MultiHeadAttentionWindow,
51 |         }
52 | 
53 |         if chunk_mode in chunk_mode_modules.keys():
54 |             MHA = chunk_mode_modules[chunk_mode]
55 |         else:
56 |             MHA = MultiHeadAttention
57 | 
58 |         self._selfAttention = MHA(d_model, q, v, h, attention_size=attention_size)
59 |         self._encoderDecoderAttention = MHA(d_model, q, v, h, attention_size=attention_size)
60 |         self._feedForward = PositionwiseFeedForward(d_model)
61 | 
62 |         self._layerNorm1 = nn.LayerNorm(d_model)
63 |         self._layerNorm2 = nn.LayerNorm(d_model)
64 |         self._layerNorm3 = nn.LayerNorm(d_model)
65 | 
66 |         self._dropout = nn.Dropout(p=dropout)
67 | 
68 |     def forward(self, x: torch.Tensor, memory: torch.Tensor) -> torch.Tensor:
69 |         """Propagate the input through the Decoder block.
70 | 
71 |         Apply the self attention block, add residual and normalize.
72 |         Apply the encoder-decoder attention block, add residual and normalize.
73 |         Apply the feed forward network, add residual and normalize.
74 | 
75 |         Parameters
76 |         ----------
77 |         x:
78 |             Input tensor with shape (batch_size, K, d_model).
79 |         memory:
80 |             Memory tensor with shape (batch_size, K, d_model)
81 |             from encoder output.
82 | 
83 |         Returns
84 |         -------
85 |         x:
86 |             Output tensor with shape (batch_size, K, d_model).
87 |         """
88 |         # Self attention
89 |         residual = x
90 |         x = self._selfAttention(query=x, key=x, value=x, mask="future")
91 |         x = self._dropout(x)
92 |         x = self._layerNorm1(x + residual)
93 | 
94 |         # Encoder-decoder attention
95 |         residual = x
96 |         x = self._encoderDecoderAttention(query=x, key=memory, value=memory)
97 |         x = self._dropout(x)
98 |         x = self._layerNorm2(x + residual)
99 | 
100 |         # Feed forward
101 |         residual = x
102 |         x = self._feedForward(x)
103 |         x = self._dropout(x)
104 |         x = self._layerNorm3(x + residual)
105 | 
106 |         return x
107 | 
--------------------------------------------------------------------------------
/models/transformer/encoder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | 
6 | from models.transformer.multiHeadAttention import MultiHeadAttention, MultiHeadAttentionChunk, MultiHeadAttentionWindow
7 | from models.transformer.positionwiseFeedForward import PositionwiseFeedForward
8 | 
9 | 
10 | class Encoder(nn.Module):
11 |     """Encoder block from Attention is All You Need.
12 | 
13 |     Apply Multi Head Attention block followed by a Point-wise Feed Forward block.
14 |     Residual sum and normalization are applied at each step.
15 | 
16 |     Parameters
17 |     ----------
18 |     d_model:
19 |         Dimension of the input vector.
20 |     q:
21 |         Dimension of all query matrix.
22 |     v:
23 |         Dimension of all value matrix.
24 |     h:
25 |         Number of heads.
26 |     attention_size:
27 |         Number of backward elements to apply attention.
28 |         Deactivated if ``None``. Default is ``None``.
29 |     dropout:
30 |         Dropout probability after each MHA or PFF block.
31 |         Default is ``0.3``.
32 |     chunk_mode:
33 |         Switch between different MultiHeadAttention blocks.
34 |         One of ``'chunk'``, ``'window'`` or ``None``. Default is ``'chunk'``.
35 |     """
36 | 
37 |     def __init__(self,
38 |                  d_model: int,
39 |                  q: int,
40 |                  v: int,
41 |                  h: int,
42 |                  attention_size: int = None,
43 |                  dropout: float = 0.3,
44 |                  chunk_mode: str = 'chunk'):
45 |         """Initialize the Encoder block"""
46 |         super().__init__()
47 | 
48 |         chunk_mode_modules = {
49 |             'chunk': MultiHeadAttentionChunk,
50 |             'window': MultiHeadAttentionWindow,
51 |         }
52 | 
53 |         if chunk_mode in chunk_mode_modules.keys():
54 |             MHA = chunk_mode_modules[chunk_mode]
55 |         else:
56 |             MHA = MultiHeadAttention
57 | 
58 |         self._selfAttention = MHA(d_model, q, v, h, attention_size=attention_size)
59 |         self._feedForward = PositionwiseFeedForward(d_model)
60 | 
61 |         self._layerNorm1 = nn.LayerNorm(d_model)
62 |         self._layerNorm2 = nn.LayerNorm(d_model)
63 | 
64 |         self._dropout = nn.Dropout(p=dropout)
65 | 
66 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
67 |         """Propagate the input through the Encoder block.
68 | 
69 |         Apply the Multi Head Attention block, add residual and normalize.
70 |         Apply the Point-wise Feed Forward block, add residual and normalize.
71 | 
72 |         Parameters
73 |         ----------
74 |         x:
75 |             Input tensor with shape (batch_size, K, d_model).
76 | 
77 |         Returns
78 |         -------
79 |         Output tensor with shape (batch_size, K, d_model).
80 |         """
81 |         # Self attention
82 |         residual = x
83 |         x = self._selfAttention(query=x, key=x, value=x)
84 |         x = self._dropout(x)
85 |         x = self._layerNorm1(x + residual)
86 | 
87 |         # Feed forward
88 |         residual = x
89 |         x = self._feedForward(x)
90 |         x = self._dropout(x)
91 |         x = self._layerNorm2(x + residual)
92 | 
93 |         return x
94 | 
95 |     @property
96 |     def attention_map(self) -> torch.Tensor:
97 |         """Attention map after a forward propagation,
98 |         variable `score` in the original paper.
99 |         """
100 |         return self._selfAttention.attention_map
101 | 
--------------------------------------------------------------------------------
/models/transformer/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | 
5 | class OZELoss(nn.Module):
6 |     """Custom loss for TRNSys metamodel.
7 | 
8 |     Compute, for temperature and consumption, the integral of the squared differences
9 |     over time. Sum the logs with a coefficient ``alpha``.
10 | 
11 |     .. math::
12 |         \Delta_T = \sqrt{\int (y_{est}^T - y^T)^2}
13 | 
14 |         \Delta_Q = \sqrt{\int (y_{est}^Q - y^Q)^2}
15 | 
16 |         loss = log(1 + \Delta_T) + \\alpha \cdot log(1 + \Delta_Q)
17 | 
18 |     Parameters:
19 |     -----------
20 |     alpha:
21 |         Coefficient for consumption. Default is ``0.3``.
22 |     """
23 | 
24 |     def __init__(self, reduction: str = 'mean', alpha: float = 0.3):
25 |         super().__init__()
26 | 
27 |         self.alpha = alpha
28 |         self.reduction = reduction
29 | 
30 |         self.base_loss = nn.MSELoss(reduction=self.reduction)
31 | 
32 |     def forward(self,
33 |                 y_true: torch.Tensor,
34 |                 y_pred: torch.Tensor) -> torch.Tensor:
35 |         """Compute the loss between a target value and a prediction.
36 | 
37 |         Parameters
38 |         ----------
39 |         y_true:
40 |             Target value.
41 |         y_pred:
42 |             Estimated value.
43 | 
44 |         Returns
45 |         -------
46 |         Loss as a tensor with gradient attached.
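47 | 
48 |         Example
49 |         -------
50 |         A minimal sketch with illustrative shapes (the feature layout --
51 |         consumptions first, temperature last -- is an assumption):
52 | 
53 |         >>> criterion = OZELoss(alpha=0.3)
54 |         >>> y_true = torch.rand((8, 24, 3))
55 |         >>> y_pred = torch.rand((8, 24, 3))
56 |         >>> loss = criterion(y_true, y_pred)  # scalar tensor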
57 |         """
58 |         delta_Q = self.base_loss(y_pred[..., :-1], y_true[..., :-1])
59 |         delta_T = self.base_loss(y_pred[..., -1], y_true[..., -1])
60 | 
61 |         if self.reduction == 'none':
62 |             delta_Q = delta_Q.mean(dim=(1, 2))
63 |             delta_T = delta_T.mean(dim=1)
64 | 
65 |         return torch.log(1 + delta_T) + self.alpha * torch.log(1 + delta_Q)
66 | 
--------------------------------------------------------------------------------
/models/transformer/multiHeadAttention.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | 
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | 
8 | from models.transformer.utils import generate_local_map_mask
9 | 
10 | 
11 | class MultiHeadAttention(nn.Module):
12 |     """Multi Head Attention block from Attention is All You Need.
13 | 
14 |     Given 3 inputs of shape (batch_size, K, d_model), that will be used
15 |     to compute query, keys and values, we output a self attention
16 |     tensor of shape (batch_size, K, d_model).
17 | 
18 |     Parameters
19 |     ----------
20 |     d_model:
21 |         Dimension of the input vector.
22 |     q:
23 |         Dimension of all query matrix.
24 |     v:
25 |         Dimension of all value matrix.
26 |     h:
27 |         Number of heads.
28 |     attention_size:
29 |         Number of backward elements to apply attention.
30 |         Deactivated if ``None``. Default is ``None``.
31 |     """
32 | 
33 |     def __init__(self,
34 |                  d_model: int,
35 |                  q: int,
36 |                  v: int,
37 |                  h: int,
38 |                  attention_size: int = None):
39 |         """Initialize the Multi Head Block."""
40 |         super().__init__()
41 | 
42 |         self._h = h
43 |         self._attention_size = attention_size
44 | 
45 |         # Query, keys and value matrices
46 |         self._W_q = nn.Linear(d_model, q*self._h)
47 |         self._W_k = nn.Linear(d_model, q*self._h)
48 |         self._W_v = nn.Linear(d_model, v*self._h)
49 | 
50 |         # Output linear function
51 |         self._W_o = nn.Linear(self._h*v, d_model)
52 | 
53 |         # Score placeholder
54 |         self._scores = None
55 | 
56 |     def forward(self,
57 |                 query: torch.Tensor,
58 |                 key: torch.Tensor,
59 |                 value: torch.Tensor,
60 |                 mask: Optional[str] = None) -> torch.Tensor:
61 |         """Propagate forward the input through the MHB.
62 | 
63 |         We compute for each head the queries, keys and values matrices,
64 |         followed by the Scaled Dot-Product. The result is concatenated
65 |         and returned with shape (batch_size, K, d_model).
66 | 
67 |         Parameters
68 |         ----------
69 |         query:
70 |             Input tensor with shape (batch_size, K, d_model) used to compute queries.
71 |         key:
72 |             Input tensor with shape (batch_size, K, d_model) used to compute keys.
73 |         value:
74 |             Input tensor with shape (batch_size, K, d_model) used to compute values.
75 |         mask:
76 |             Mask to apply on scores before computing attention.
77 |             One of ``'future'`` or ``None``. Default is ``None``.
78 | 
79 |         Returns
80 |         -------
81 |         Self attention tensor with shape (batch_size, K, d_model).
82 |         """
83 |         rows = query.shape[1]
84 |         cols = key.shape[1]
85 | 
86 |         # Compute Q, K and V, concatenate heads on batch dimension
87 |         queries = torch.cat(self._W_q(query).chunk(self._h, dim=-1), dim=0)
88 |         keys = torch.cat(self._W_k(key).chunk(self._h, dim=-1), dim=0)
89 |         values = torch.cat(self._W_v(value).chunk(self._h, dim=-1), dim=0)
90 | 
91 |         # Scaled Dot Product
92 |         self._scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(cols)
93 | 
94 |         # Compute local map mask (deactivated when attention_size is None)
95 |         if self._attention_size is not None:
96 |             attention_mask = generate_local_map_mask(rows, cols, self._attention_size, mask_future=False, device=self._scores.device)
97 |             self._scores = self._scores.masked_fill(attention_mask, float('-inf'))
98 | 
99 |         # Compute future mask
100 |         if mask == "future":
101 |             future_mask = torch.triu(torch.ones((rows, cols)), diagonal=1).bool()
102 |             future_mask = future_mask.to(self._scores.device)
103 |             self._scores = self._scores.masked_fill(future_mask, float('-inf'))
104 | 
105 |         # Apply softmax
106 |         self._scores = F.softmax(self._scores, dim=-1)
107 | 
108 |         attention = torch.bmm(self._scores, values)
109 | 
110 |         # Concatenate the heads
111 |         attention_heads = torch.cat(attention.chunk(self._h, dim=0), dim=-1)
112 | 
113 |         # Apply linear transformation W^O
114 |         self_attention = self._W_o(attention_heads)
115 | 
116 |         return self_attention
117 | 
118 |     @property
119 |     def attention_map(self) -> torch.Tensor:
120 |         """Attention map after a forward propagation,
121 |         variable `score` in the original paper.
122 |         """
123 |         if self._scores is None:
124 |             raise RuntimeError(
125 |                 "Evaluate the model once to generate attention map")
126 |         return self._scores
127 | 
128 | 
129 | class MultiHeadAttentionChunk(MultiHeadAttention):
130 |     """Multi Head Attention block with chunk.
131 | 
132 |     Given 3 inputs of shape (batch_size, K, d_model), that will be used
133 |     to compute query, keys and values, we output a self attention
134 |     tensor of shape (batch_size, K, d_model).
135 |     Queries, keys and values are divided in chunks of constant size.
136 | 
137 |     Parameters
138 |     ----------
139 |     d_model:
140 |         Dimension of the input vector.
141 |     q:
142 |         Dimension of all query matrix.
143 |     v:
144 |         Dimension of all value matrix.
145 |     h:
146 |         Number of heads.
147 |     attention_size:
148 |         Number of backward elements to apply attention.
149 |         Deactivated if ``None``. Default is ``None``.
150 |     chunk_size:
151 |         Size of chunks to apply attention on. Last one may be smaller (see :class:`torch.Tensor.chunk`).
152 |         Default is 168.
153 |     """
154 | 
155 |     def __init__(self,
156 |                  d_model: int,
157 |                  q: int,
158 |                  v: int,
159 |                  h: int,
160 |                  attention_size: int = None,
161 |                  chunk_size: Optional[int] = 168,
162 |                  **kwargs):
163 |         """Initialize the Multi Head Block."""
164 |         super().__init__(d_model, q, v, h, attention_size, **kwargs)
165 | 
166 |         self._chunk_size = chunk_size
167 | 
168 |         # Score mask for decoder
169 |         self._future_mask = nn.Parameter(torch.triu(torch.ones((self._chunk_size, self._chunk_size)), diagonal=1).bool(),
170 |                                          requires_grad=False)
171 | 
172 |         if self._attention_size is not None:
173 |             self._attention_mask = nn.Parameter(generate_local_map_mask(self._chunk_size, self._chunk_size, self._attention_size),
174 |                                                 requires_grad=False)
175 | 
176 |     def forward(self,
177 |                 query: torch.Tensor,
178 |                 key: torch.Tensor,
179 |                 value: torch.Tensor,
180 |                 mask: Optional[str] = None) -> torch.Tensor:
181 |         """Propagate forward the input through the MHB.
182 | 183 | We compute for each head the queries, keys and values matrices, 184 | followed by the Scaled Dot-Product. The result is concatenated 185 | and returned with shape (batch_size, K, d_model). 186 | 187 | Parameters 188 | ---------- 189 | query: 190 | Input tensor with shape (batch_size, K, d_model) used to compute queries. 191 | key: 192 | Input tensor with shape (batch_size, K, d_model) used to compute keys. 193 | value: 194 | Input tensor with shape (batch_size, K, d_model) used to compute values. 195 | mask: 196 | Mask to apply on scores before computing attention. 197 | One of ``'subsequent'``, None. Default is None. 198 | 199 | Returns 200 | ------- 201 | Self attention tensor with shape (batch_size, K, d_model). 202 | """ 203 | K = query.shape[1] 204 | n_chunk = K // self._chunk_size 205 | 206 | # Compute Q, K and V, concatenate heads on batch dimension 207 | queries = torch.cat(torch.cat(self._W_q(query).chunk(self._h, dim=-1), dim=0).chunk(n_chunk, dim=1), dim=0) 208 | keys = torch.cat(torch.cat(self._W_k(key).chunk(self._h, dim=-1), dim=0).chunk(n_chunk, dim=1), dim=0) 209 | values = torch.cat(torch.cat(self._W_v(value).chunk(self._h, dim=-1), dim=0).chunk(n_chunk, dim=1), dim=0) 210 | 211 | # Scaled Dot Product 212 | self._scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(self._chunk_size) 213 | 214 | # Compute local map mask 215 | if self._attention_size is not None: 216 | self._scores = self._scores.masked_fill(self._attention_mask, float('-inf')) 217 | 218 | # Compute future mask 219 | if mask == "subsequent": 220 | self._scores = self._scores.masked_fill(self._future_mask, float('-inf')) 221 | 222 | # Apply softmax 223 | self._scores = F.softmax(self._scores, dim=-1) 224 | 225 | attention = torch.bmm(self._scores, values) 226 | 227 | # Concatenat the heads 228 | attention_heads = torch.cat(torch.cat(attention.chunk( 229 | n_chunk, dim=0), dim=1).chunk(self._h, dim=0), dim=-1) 230 | 231 | # Apply linear transformation W^O 232 | self_attention = self._W_o(attention_heads) 233 | 234 | return self_attention 235 | 236 | 237 | class MultiHeadAttentionWindow(MultiHeadAttention): 238 | """Multi Head Attention block with moving window. 239 | 240 | Given 3 inputs of shape (batch_size, K, d_model), that will be used 241 | to compute query, keys and values, we output a self attention 242 | tensor of shape (batch_size, K, d_model). 243 | Queries, keys and values are divided in chunks using a moving window. 244 | 245 | Parameters 246 | ---------- 247 | d_model: 248 | Dimension of the input vector. 249 | q: 250 | Dimension of all query matrix. 251 | v: 252 | Dimension of all value matrix. 253 | h: 254 | Number of heads. 255 | attention_size: 256 | Number of backward elements to apply attention. 257 | Deactivated if ``None``. Default is ``None``. 258 | window_size: 259 | Size of the window used to extract chunks. 260 | Default is 168 261 | padding: 262 | Padding around each window. Padding will be applied to input sequence. 263 | Default is 168 // 4 = 42. 
264 | """ 265 | 266 | def __init__(self, 267 | d_model: int, 268 | q: int, 269 | v: int, 270 | h: int, 271 | attention_size: int = None, 272 | window_size: Optional[int] = 168, 273 | padding: Optional[int] = 168 // 4, 274 | **kwargs): 275 | """Initialize the Multi Head Block.""" 276 | super().__init__(d_model, q, v, h, attention_size, **kwargs) 277 | 278 | self._window_size = window_size 279 | self._padding = padding 280 | self._q = q 281 | self._v = v 282 | 283 | # Step size for the moving window 284 | self._step = self._window_size - 2 * self._padding 285 | 286 | # Score mask for decoder 287 | self._future_mask = nn.Parameter(torch.triu(torch.ones((self._window_size, self._window_size)), diagonal=1).bool(), 288 | requires_grad=False) 289 | 290 | if self._attention_size is not None: 291 | self._attention_mask = nn.Parameter(generate_local_map_mask(self._window_size, self._window_size, self._attention_size), 292 | requires_grad=False) 293 | 294 | def forward(self, 295 | query: torch.Tensor, 296 | key: torch.Tensor, 297 | value: torch.Tensor, 298 | mask: Optional[str] = None) -> torch.Tensor: 299 | """Propagate forward the input through the MHB. 300 | 301 | We compute for each head the queries, keys and values matrices, 302 | followed by the Scaled Dot-Product. The result is concatenated 303 | and returned with shape (batch_size, K, d_model). 304 | 305 | Parameters 306 | ---------- 307 | query: 308 | Input tensor with shape (batch_size, K, d_model) used to compute queries. 309 | key: 310 | Input tensor with shape (batch_size, K, d_model) used to compute keys. 311 | value: 312 | Input tensor with shape (batch_size, K, d_model) used to compute values. 313 | mask: 314 | Mask to apply on scores before computing attention. 315 | One of ``'subsequent'``, None. Default is None. 316 | 317 | Returns 318 | ------- 319 | Self attention tensor with shape (batch_size, K, d_model). 
320 |         """
321 |         batch_size = query.shape[0]
322 | 
323 |         # Apply padding to input sequence
324 |         query = F.pad(query.transpose(1, 2), (self._padding, self._padding), 'replicate').transpose(1, 2)
325 |         key = F.pad(key.transpose(1, 2), (self._padding, self._padding), 'replicate').transpose(1, 2)
326 |         value = F.pad(value.transpose(1, 2), (self._padding, self._padding), 'replicate').transpose(1, 2)
327 | 
328 |         # Compute Q, K and V, concatenate heads on batch dimension
329 |         queries = torch.cat(self._W_q(query).chunk(self._h, dim=-1), dim=0)
330 |         keys = torch.cat(self._W_k(key).chunk(self._h, dim=-1), dim=0)
331 |         values = torch.cat(self._W_v(value).chunk(self._h, dim=-1), dim=0)
332 | 
333 |         # Divide Q, K and V using a moving window
334 |         queries = queries.unfold(dimension=1, size=self._window_size, step=self._step).reshape((-1, self._q, self._window_size)).transpose(1, 2)
335 |         keys = keys.unfold(dimension=1, size=self._window_size, step=self._step).reshape((-1, self._q, self._window_size)).transpose(1, 2)
336 |         values = values.unfold(dimension=1, size=self._window_size, step=self._step).reshape((-1, self._v, self._window_size)).transpose(1, 2)
337 | 
338 |         # Scaled Dot Product
339 |         self._scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(self._window_size)
340 | 
341 |         # Compute local map mask
342 |         if self._attention_size is not None:
343 |             self._scores = self._scores.masked_fill(self._attention_mask, float('-inf'))
344 | 
345 |         # Compute future mask
346 |         if mask == "subsequent":
347 |             self._scores = self._scores.masked_fill(self._future_mask, float('-inf'))
348 | 
349 |         # Apply softmax
350 |         self._scores = F.softmax(self._scores, dim=-1)
351 | 
352 |         attention = torch.bmm(self._scores, values)
353 | 
354 |         # Fold chunks back
355 |         attention = attention.reshape((batch_size*self._h, -1, self._window_size, self._v))
356 |         attention = attention[:, :, self._padding:-self._padding, :]
357 |         attention = attention.reshape((batch_size*self._h, -1, self._v))
358 | 
359 |         # Concatenate the heads
360 |         attention_heads = torch.cat(attention.chunk(self._h, dim=0), dim=-1)
361 | 
362 |         # Apply linear transformation W^O
363 |         self_attention = self._W_o(attention_heads)
364 | 
365 |         return self_attention
366 | 
367 | 
368 | def main():
369 |     mhd = MultiHeadAttention(128, 5, 5, 4, attention_size=0)
370 |     k = torch.rand((64, 5, 128))
371 |     q = torch.rand((64, 5, 128))
372 |     v = torch.rand((64, 5, 128))
373 | 
374 |     attention = mhd.forward(q, k, v, mask='future')
375 |     return attention
376 | 
377 | 
378 | if __name__ == '__main__':
379 |     main()
380 | 
--------------------------------------------------------------------------------
/models/transformer/positionwiseFeedForward.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | 
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | 
7 | 
8 | class PositionwiseFeedForward(nn.Module):
9 |     """Position-wise Feed Forward Network block from Attention is All You Need.
10 | 
11 |     Apply two linear transformations to each input, separately but identically. We
12 |     implement them as linear layers. Input and output have a shape (batch_size, K, d_model).
13 | 
14 |     Parameters
15 |     ----------
16 |     d_model:
17 |         Dimension of input tensor.
18 |     d_ff:
19 |         Dimension of hidden layer, default is 128.
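20 | 
21 |     Example
22 |     -------
23 |     A minimal sketch with illustrative shapes (the values below are
24 |     assumptions for the example, not defaults used elsewhere in this repo):
25 | 
26 |     >>> pff = PositionwiseFeedForward(d_model=64, d_ff=128)
27 |     >>> out = pff(torch.rand((32, 24, 64)))  # -> (32, 24, 64)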
28 |     """
29 | 
30 |     def __init__(self,
31 |                  d_model: int,
32 |                  d_ff: Optional[int] = 128):
33 |         """Initialize the PFF block."""
34 |         super().__init__()
35 | 
36 |         self._linear1 = nn.Linear(d_model, d_ff)
37 |         self._linear2 = nn.Linear(d_ff, d_model)
38 | 
39 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
40 |         """Propagate forward the input through the PFF block.
41 | 
42 |         Apply the first linear transformation, then a ReLU activation,
43 |         and the second linear transformation.
44 | 
45 |         Parameters
46 |         ----------
47 |         x:
48 |             Input tensor with shape (batch_size, K, d_model).
49 | 
50 |         Returns
51 |         -------
52 |         Output tensor with shape (batch_size, K, d_model).
53 |         """
54 |         return self._linear2(F.relu(self._linear1(x)))
55 | 
--------------------------------------------------------------------------------
/models/transformer/transformer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | from models.transformer.encoder import Encoder
5 | from models.transformer.decoder import Decoder
6 | from models.transformer.utils import generate_original_PE, generate_regular_PE
7 | 
8 | 
9 | class Transformer(nn.Module):
10 |     """Transformer model from Attention is All You Need.
11 | 
12 |     A classic transformer model adapted for sequential data.
13 |     Embedding has been replaced with a fully connected layer,
14 |     and the final softmax has been replaced with a linear output layer.
15 | 
16 |     Attributes
17 |     ----------
18 |     layers_encoding: :py:class:`list` of :class:`Encoder.Encoder`
19 |         stack of Encoder layers.
20 |     layers_decoding: :py:class:`list` of :class:`Decoder.Decoder`
21 |         stack of Decoder layers.
22 | 
23 |     Parameters
24 |     ----------
25 |     d_input:
26 |         Model input dimension.
27 |     d_model:
28 |         Dimension of the input vector.
29 |     d_output:
30 |         Model output dimension.
31 |     q:
32 |         Dimension of queries and keys.
33 |     v:
34 |         Dimension of values.
35 |     h:
36 |         Number of heads.
37 |     N:
38 |         Number of encoder and decoder layers to stack.
39 |     attention_size:
40 |         Number of backward elements to apply attention.
41 |         Deactivated if ``None``. Default is ``None``.
42 |     dropout:
43 |         Dropout probability after each MHA or PFF block.
44 |         Default is ``0.3``.
45 |     chunk_mode:
46 |         Switch between different MultiHeadAttention blocks.
47 |         One of ``'chunk'``, ``'window'`` or ``None``. Default is ``'chunk'``.
48 |     pe:
49 |         Type of positional encoding to add.
50 |         Must be one of ``'original'``, ``'regular'`` or ``None``. Default is ``None``.
51 | """ 52 | 53 | def __init__(self, cnf: dict): 54 | """Create transformer structure from Encoder and Decoder blocks.""" 55 | super().__init__() 56 | 57 | d_model = cnf["d_model"] 58 | q = cnf["q"] 59 | v = cnf["v"] 60 | h = cnf["h"] 61 | N = cnf["N"] 62 | attention_size = cnf["attention_size"] 63 | dropout = cnf["dropout"] 64 | pe = cnf["pe"] 65 | chunk_mode = cnf["chunk_mode"] 66 | d_input = cnf["d_input"] 67 | d_output = cnf["d_output"] 68 | self.time_steps = cnf["num_encoder_steps"] 69 | 70 | self._d_model = d_model 71 | 72 | self.layers_encoding = nn.ModuleList([Encoder(d_model, 73 | q, 74 | v, 75 | h, 76 | attention_size=attention_size, 77 | dropout=dropout, 78 | chunk_mode=chunk_mode) for _ in range(N)]) 79 | self.layers_decoding = nn.ModuleList([Decoder(d_model, 80 | q, 81 | v, 82 | h, 83 | attention_size=attention_size, 84 | dropout=dropout, 85 | chunk_mode=chunk_mode) for _ in range(N)]) 86 | 87 | self._embedding_input = nn.Linear(d_input, d_model) 88 | self._embedding_output = nn.Linear(d_input, d_model) 89 | 90 | self._linear = nn.Linear(d_model, d_output) 91 | 92 | pe_functions = { 93 | 'original': generate_original_PE, 94 | 'regular': generate_regular_PE, 95 | } 96 | 97 | if pe in pe_functions.keys(): 98 | self._generate_PE = pe_functions[pe] 99 | else: 100 | self._generate_PE = None 101 | 102 | self.name = 'transformer' 103 | 104 | def forward(self, xy: torch.Tensor) -> torch.Tensor: 105 | """Propagate input through transformer 106 | 107 | Forward input through an embedding module, 108 | the encoder then decoder stacks, and an output module. 109 | 110 | Parameters 111 | ---------- 112 | x: 113 | :class:`torch.Tensor` of shape (batch_size, K, d_input). 114 | 115 | Returns 116 | ------- 117 | Output tensor with shape (batch_size, K, d_output). 118 | """ 119 | x = xy[:, :self.time_steps] 120 | y = xy[:, self.time_steps:] 121 | 122 | # Shift tensor add start token 123 | pad = torch.ones((y.shape[0], 1, y.shape[2])).to(y.device) 124 | y = torch.cat((pad, y), dim=1)[:, :-1, :] 125 | 126 | # Embeddin module 127 | encoding_x = self._embedding_input(x) 128 | encoding_y = self._embedding_output(y) 129 | 130 | # Add position encoding 131 | if self._generate_PE is not None: 132 | positional_encoding = self._generate_PE(x.shape[1], self._d_model) 133 | positional_encoding = positional_encoding.to(encoding_x.device) 134 | encoding_x.add_(positional_encoding) 135 | 136 | # Encoding stack 137 | for layer in self.layers_encoding: 138 | encoding_x = layer(encoding_x) 139 | 140 | # Decoding stack 141 | decoding = encoding_y 142 | 143 | # Add position encoding 144 | if self._generate_PE is not None: 145 | positional_encoding = self._generate_PE(y.shape[1], self._d_model) 146 | positional_encoding = positional_encoding.to(decoding.device) 147 | decoding.add_(positional_encoding) 148 | 149 | for layer in self.layers_decoding: 150 | decoding = layer(decoding, encoding_x) 151 | 152 | # Output module 153 | output = self._linear(decoding) 154 | return output 155 | -------------------------------------------------------------------------------- /models/transformer/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def generate_original_PE(length: int, d_model: int) -> torch.Tensor: 8 | """Generate positional encoding as described in original paper. :class:`torch.Tensor` 9 | 10 | Parameters 11 | ---------- 12 | length: 13 | Time window length, i.e. K. 
14 |     d_model:
15 |         Dimension of the model vector.
16 | 
17 |     Returns
18 |     -------
19 |         Tensor of shape (K, d_model).
20 |     """
21 |     PE = torch.zeros((length, d_model))
22 | 
23 |     pos = torch.arange(length).unsqueeze(1)
24 |     PE[:, 0::2] = torch.sin(
25 |         pos / torch.pow(10000, torch.arange(0, d_model, 2, dtype=torch.float32)/d_model))
26 |     PE[:, 1::2] = torch.cos(
27 |         pos / torch.pow(10000, torch.arange(1, d_model, 2, dtype=torch.float32)/d_model))
28 | 
29 |     return PE
30 | 
31 | 
32 | def generate_regular_PE(length: int, d_model: int, period: Optional[int] = 24) -> torch.Tensor:
33 |     """Generate positional encoding with a given period.
34 | 
35 |     Parameters
36 |     ----------
37 |     length:
38 |         Time window length, i.e. K.
39 |     d_model:
40 |         Dimension of the model vector.
41 |     period:
42 |         Size of the pattern to repeat.
43 |         Default is 24.
44 | 
45 |     Returns
46 |     -------
47 |         Tensor of shape (K, d_model).
48 |     """
49 |     PE = torch.zeros((length, d_model))
50 | 
51 |     pos = torch.arange(length, dtype=torch.float32).unsqueeze(1)
52 |     PE = torch.sin(pos * 2 * np.pi / period)
53 |     PE = PE.repeat((1, d_model))
54 | 
55 |     return PE
56 | 
57 | 
58 | def generate_local_map_mask(row: int,
59 |                             col: int,
60 |                             attention_size: int,
61 |                             mask_future=False,
62 |                             device: torch.device = 'cpu') -> torch.BoolTensor:
63 |     """Compute attention mask as attention_size wide diagonal.
64 | 
65 |     Parameters
66 |     ----------
67 |     row:
68 |         Time dimension size v1
69 |     col:
70 |         Time dimension size v2
71 |     attention_size:
72 |         Number of backward elements to apply attention.
73 |     mask_future:
74 |         If ``True``, also mask future positions. Default is ``False``.
75 |     device:
76 |         torch device. Default is ``'cpu'``.
77 | 
78 |     Returns
79 |     -------
80 |         Mask as a boolean tensor.
81 |     """
82 |     local_map = np.empty((row, col))
83 |     i, j = np.indices(local_map.shape)
84 | 
85 |     if mask_future:
86 |         local_map[i, j] = (i - j > attention_size) ^ (j - i > 0)
87 |     else:
88 |         local_map[i, j] = np.abs(i - j) > attention_size
89 | 
90 |     return torch.BoolTensor(local_map).to(device)
91 | 
--------------------------------------------------------------------------------
/models/transformer_grn/__init__.py:
--------------------------------------------------------------------------------
1 | from models.transformer_grn.transformer import Transformer
2 | 
--------------------------------------------------------------------------------
/models/transformer_grn/decoder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | 
6 | from models.transformer.multiHeadAttention import MultiHeadAttention, MultiHeadAttentionChunk, MultiHeadAttentionWindow
7 | from models.transformer.positionwiseFeedForward import PositionwiseFeedForward
8 | from models.temporal_fusion_t.gated_residual_network import GatedResidualNetwork
9 | 
10 | 
11 | class Decoder(nn.Module):
12 |     """Decoder block from Attention is All You Need.
13 | 
14 |     Apply two Multi Head Attention blocks followed by a Point-wise Feed Forward block.
15 |     Residual sum and normalization are applied at each step.
16 | 
17 |     Parameters
18 |     ----------
19 |     d_model:
20 |         Dimension of the input vector.
21 |     q:
22 |         Dimension of all query matrix.
23 |     v:
24 |         Dimension of all value matrix.
25 |     h:
26 |         Number of heads.
27 |     attention_size:
28 |         Number of backward elements to apply attention.
29 |         Deactivated if ``None``. Default is ``None``.
30 |     dropout:
31 |         Dropout probability after each MHA or PFF block.
32 |         Default is ``0.3``.
33 |     chunk_mode:
34 |         Switch between different MultiHeadAttention blocks.
35 |         One of ``'chunk'``, ``'window'`` or ``None``. Default is ``'chunk'``.
36 |     """
37 | 
38 |     def __init__(self,
39 |                  d_model: int,
40 |                  q: int,
41 |                  v: int,
42 |                  h: int,
43 |                  attention_size: int = None,
44 |                  dropout: float = 0.3,
45 |                  chunk_mode: str = 'chunk'):
46 |         """Initialize the Decoder block"""
47 |         super().__init__()
48 | 
49 |         chunk_mode_modules = {
50 |             'chunk': MultiHeadAttentionChunk,
51 |             'window': MultiHeadAttentionWindow,
52 |         }
53 | 
54 |         if chunk_mode in chunk_mode_modules.keys():
55 |             MHA = chunk_mode_modules[chunk_mode]
56 |         else:
57 |             MHA = MultiHeadAttention
58 | 
59 |         self._selfAttention = MHA(d_model, q, v, h, attention_size=attention_size)
60 |         self._encoderDecoderAttention = MHA(d_model, q, v, h, attention_size=attention_size)
61 |         self._feedForward = PositionwiseFeedForward(d_model)
62 | 
63 |         self._layerNorm1 = nn.LayerNorm(d_model)
64 |         self._layerNorm2 = nn.LayerNorm(d_model)
65 |         self._layerNorm3 = nn.LayerNorm(d_model)
66 | 
67 |         self._dropout = nn.Dropout(p=dropout)
68 |         self.grn = GatedResidualNetwork(
69 |             input_size=d_model,
70 |             hidden_layer_size=d_model,
71 |             output_size=None,
72 |             dropout_rate=0.1,
73 |             use_time_distributed=True,
74 |             return_gate=True,
75 |             batch_first=True)
76 | 
77 |     def forward(self, x: torch.Tensor, memory: torch.Tensor, context=None) -> torch.Tensor:
78 |         """Propagate the input through the Decoder block.
79 | 
80 |         Apply the self attention block, add residual and normalize.
81 |         Inject the static context through a GRN, if provided.
82 |         Apply the encoder-decoder attention block, add residual and normalize.
83 |         Apply the feed forward network, add residual and normalize.
84 | 
85 |         Parameters
86 |         ----------
87 |         x:
88 |             Input tensor with shape (batch_size, K, d_model).
89 |         memory:
90 |             Memory tensor with shape (batch_size, K, d_model)
91 |             from encoder output.
92 |         context:
93 |             Optional static context tensor used to condition the
94 |             decoder output through the gated residual network.
95 | 
96 |         Returns
97 |         -------
98 |         x:
99 |             Output tensor with shape (batch_size, K, d_model).
100 |         """
101 |         # Self attention
102 |         residual = x
103 |         x = self._selfAttention(query=x, key=x, value=x, mask="future")
104 |         x = self._dropout(x)
105 |         x = self._layerNorm1(x + residual)
106 | 
107 |         # Inject static vars
108 |         if context is not None:
109 |             x, _ = self.grn(x, context)
110 | 
111 |         # Encoder-decoder attention
112 |         residual = x
113 |         x = self._encoderDecoderAttention(query=x, key=memory, value=memory)
114 |         x = self._dropout(x)
115 |         x = self._layerNorm2(x + residual)
116 | 
117 |         # Feed forward
118 |         residual = x
119 |         x = self._feedForward(x)
120 |         x = self._dropout(x)
121 |         x = self._layerNorm3(x + residual)
122 | 
123 |         return x
124 | 
--------------------------------------------------------------------------------
/models/transformer_grn/encoder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | 
6 | from models.transformer.multiHeadAttention import MultiHeadAttention, MultiHeadAttentionChunk, MultiHeadAttentionWindow
7 | from models.transformer.positionwiseFeedForward import PositionwiseFeedForward
8 | from models.temporal_fusion_t.gated_residual_network import GatedResidualNetwork
9 | 
10 | 
11 | class Encoder(nn.Module):
12 |     """Encoder block from Attention is All You Need.
13 | 
14 |     Apply Multi Head Attention block followed by a Point-wise Feed Forward block.
15 |     Residual sum and normalization are applied at each step.
16 | 
17 |     Parameters
18 |     ----------
19 |     d_model:
20 |         Dimension of the input vector.
21 |     q:
22 |         Dimension of all query matrix.
23 |     v:
24 |         Dimension of all value matrix.
25 |     h:
26 |         Number of heads.
27 |     attention_size:
28 |         Number of backward elements to apply attention.
29 |         Deactivated if ``None``. Default is ``None``.
30 |     dropout:
31 |         Dropout probability after each MHA or PFF block.
32 |         Default is ``0.3``.
33 |     chunk_mode:
34 |         Switch between different MultiHeadAttention blocks.
35 |         One of ``'chunk'``, ``'window'`` or ``None``. Default is ``'chunk'``.
36 |     """
37 | 
38 |     def __init__(self,
39 |                  d_model: int,
40 |                  q: int,
41 |                  v: int,
42 |                  h: int,
43 |                  attention_size: int = None,
44 |                  dropout: float = 0.3,
45 |                  chunk_mode: str = 'chunk'):
46 |         """Initialize the Encoder block"""
47 |         super().__init__()
48 | 
49 |         chunk_mode_modules = {
50 |             'chunk': MultiHeadAttentionChunk,
51 |             'window': MultiHeadAttentionWindow,
52 |         }
53 | 
54 |         if chunk_mode in chunk_mode_modules.keys():
55 |             MHA = chunk_mode_modules[chunk_mode]
56 |         else:
57 |             MHA = MultiHeadAttention
58 | 
59 |         self._selfAttention = MHA(d_model, q, v, h, attention_size=attention_size)
60 |         self._feedForward = PositionwiseFeedForward(d_model)
61 | 
62 |         self._layerNorm1 = nn.LayerNorm(d_model)
63 |         self._layerNorm2 = nn.LayerNorm(d_model)
64 | 
65 |         self._dropout = nn.Dropout(p=dropout)
66 | 
67 |         self.grn = GatedResidualNetwork(
68 |             input_size=d_model,
69 |             hidden_layer_size=d_model,
70 |             output_size=None,
71 |             dropout_rate=0.1,
72 |             use_time_distributed=True,
73 |             return_gate=True,
74 |             batch_first=True)
75 | 
76 |     def forward(self, x: torch.Tensor, context=None) -> torch.Tensor:
77 |         """Propagate the input through the Encoder block.
78 | 
79 |         Apply the Multi Head Attention block, add residual and normalize.
80 |         Inject the static context through a GRN, if provided.
81 |         Apply the Point-wise Feed Forward block, add residual and normalize.
82 | 
83 |         Parameters
84 |         ----------
85 |         x:
86 |             Input tensor with shape (batch_size, K, d_model).
87 |         context:
88 |             Optional static context tensor used to condition the
89 |             encoder output through the gated residual network.
90 | 
91 |         Returns
92 |         -------
93 |         Output tensor with shape (batch_size, K, d_model).
94 |         """
95 |         # Self attention
96 |         residual = x
97 |         x = self._selfAttention(query=x, key=x, value=x)
98 |         x = self._dropout(x)
99 |         x = self._layerNorm1(x + residual)
100 | 
101 |         # Inject static vars
102 |         if context is not None:
103 |             x, _ = self.grn(x, context)
104 | 
105 |         # Feed forward
106 |         residual = x
107 |         x = self._feedForward(x)
108 |         x = self._dropout(x)
109 |         x = self._layerNorm2(x + residual)
110 | 
111 |         return x
112 | 
113 |     @property
114 |     def attention_map(self) -> torch.Tensor:
115 |         """Attention map after a forward propagation,
116 |         variable `score` in the original paper.
117 |         """
118 |         return self._selfAttention.attention_map
119 | 
--------------------------------------------------------------------------------
/models/transformer_grn/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | 
5 | class OZELoss(nn.Module):
6 |     """Custom loss for TRNSys metamodel.
7 | 
8 |     Compute, for temperature and consumption, the integral of the squared differences
9 |     over time. Sum the logs with a coefficient ``alpha``.
10 | 
11 |     .. math::
12 |         \Delta_T = \sqrt{\int (y_{est}^T - y^T)^2}
13 | 
14 |         \Delta_Q = \sqrt{\int (y_{est}^Q - y^Q)^2}
15 | 
16 |         loss = log(1 + \Delta_T) + \\alpha \cdot log(1 + \Delta_Q)
17 | 
18 |     Parameters:
19 |     -----------
20 |     alpha:
21 |         Coefficient for consumption. Default is ``0.3``.
22 | """ 23 | 24 | def __init__(self, reduction: str = 'mean', alpha: float = 0.3): 25 | super().__init__() 26 | 27 | self.alpha = alpha 28 | self.reduction = reduction 29 | 30 | self.base_loss = nn.MSELoss(reduction=self.reduction) 31 | 32 | def forward(self, 33 | y_true: torch.Tensor, 34 | y_pred: torch.Tensor) -> torch.Tensor: 35 | """Compute the loss between a target value and a prediction. 36 | 37 | Parameters 38 | ---------- 39 | y_true: 40 | Target value. 41 | y_pred: 42 | Estimated value. 43 | 44 | Returns 45 | ------- 46 | Loss as a tensor with gradient attached. 47 | """ 48 | delta_Q = self.base_loss(y_pred[..., :-1], y_true[..., :-1]) 49 | delta_T = self.base_loss(y_pred[..., -1], y_true[..., -1]) 50 | 51 | if self.reduction == 'none': 52 | delta_Q = delta_Q.mean(dim=(1, 2)) 53 | delta_T = delta_T.mean(dim=(1)) 54 | 55 | return torch.log(1 + delta_T) + self.alpha * torch.log(1 + delta_Q) 56 | -------------------------------------------------------------------------------- /models/transformer_grn/multiHeadAttention.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from models.transformer.utils import generate_local_map_mask 9 | 10 | 11 | class MultiHeadAttention(nn.Module): 12 | """Multi Head Attention block from Attention is All You Need. 13 | 14 | Given 3 inputs of shape (batch_size, K, d_model), that will be used 15 | to compute query, keys and values, we output a self attention 16 | tensor of shape (batch_size, K, d_model). 17 | 18 | Parameters 19 | ---------- 20 | d_model: 21 | Dimension of the input vector. 22 | q: 23 | Dimension of all query matrix. 24 | v: 25 | Dimension of all value matrix. 26 | h: 27 | Number of heads. 28 | attention_size: 29 | Number of backward elements to apply attention. 30 | Deactivated if ``None``. Default is ``None``. 31 | """ 32 | 33 | def __init__(self, 34 | d_model: int, 35 | q: int, 36 | v: int, 37 | h: int, 38 | attention_size: int = None): 39 | """Initialize the Multi Head Block.""" 40 | super().__init__() 41 | 42 | self._h = h 43 | self._attention_size = attention_size 44 | 45 | # Query, keys and value matrices 46 | self._W_q = nn.Linear(d_model, q*self._h) 47 | self._W_k = nn.Linear(d_model, q*self._h) 48 | self._W_v = nn.Linear(d_model, v*self._h) 49 | 50 | # Output linear function 51 | self._W_o = nn.Linear(self._h*v, d_model) 52 | 53 | # Score placeholder 54 | self._scores = None 55 | 56 | def forward(self, 57 | query: torch.Tensor, 58 | key: torch.Tensor, 59 | value: torch.Tensor, 60 | mask: Optional[str] = None) -> torch.Tensor: 61 | """Propagate forward the input through the MHB. 62 | 63 | We compute for each head the queries, keys and values matrices, 64 | followed by the Scaled Dot-Product. The result is concatenated 65 | and returned with shape (batch_size, K, d_model). 66 | 67 | Parameters 68 | ---------- 69 | query: 70 | Input tensor with shape (batch_size, K, d_model) used to compute queries. 71 | key: 72 | Input tensor with shape (batch_size, K, d_model) used to compute keys. 73 | value: 74 | Input tensor with shape (batch_size, K, d_model) used to compute values. 75 | mask: 76 | Mask to apply on scores before computing attention. 77 | One of ``'subsequent'``, None. Default is None. 78 | 79 | Returns 80 | ------- 81 | Self attention tensor with shape (batch_size, K, d_model). 
82 |         """
83 |         rows = query.shape[1]
84 |         cols = key.shape[1]
85 | 
86 |         # Compute Q, K and V, concatenate heads on batch dimension
87 |         queries = torch.cat(self._W_q(query).chunk(self._h, dim=-1), dim=0)
88 |         keys = torch.cat(self._W_k(key).chunk(self._h, dim=-1), dim=0)
89 |         values = torch.cat(self._W_v(value).chunk(self._h, dim=-1), dim=0)
90 | 
91 |         # Scaled Dot Product
92 |         self._scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(cols)
93 | 
94 |         # Compute local map mask (deactivated when attention_size is None)
95 |         if self._attention_size is not None:
96 |             attention_mask = generate_local_map_mask(rows, cols, self._attention_size, mask_future=False, device=self._scores.device)
97 |             self._scores = self._scores.masked_fill(attention_mask, float('-inf'))
98 | 
99 |         # Compute future mask
100 |         if mask == "future":
101 |             future_mask = torch.triu(torch.ones((rows, cols)), diagonal=1).bool()
102 |             future_mask = future_mask.to(self._scores.device)
103 |             self._scores = self._scores.masked_fill(future_mask, float('-inf'))
104 | 
105 |         # Apply softmax
106 |         self._scores = F.softmax(self._scores, dim=-1)
107 | 
108 |         attention = torch.bmm(self._scores, values)
109 | 
110 |         # Concatenate the heads
111 |         attention_heads = torch.cat(attention.chunk(self._h, dim=0), dim=-1)
112 | 
113 |         # Apply linear transformation W^O
114 |         self_attention = self._W_o(attention_heads)
115 | 
116 |         return self_attention
117 | 
118 |     @property
119 |     def attention_map(self) -> torch.Tensor:
120 |         """Attention map after a forward propagation,
121 |         variable `score` in the original paper.
122 |         """
123 |         if self._scores is None:
124 |             raise RuntimeError(
125 |                 "Evaluate the model once to generate attention map")
126 |         return self._scores
127 | 
128 | 
129 | class MultiHeadAttentionChunk(MultiHeadAttention):
130 |     """Multi Head Attention block with chunk.
131 | 
132 |     Given 3 inputs of shape (batch_size, K, d_model), that will be used
133 |     to compute query, keys and values, we output a self attention
134 |     tensor of shape (batch_size, K, d_model).
135 |     Queries, keys and values are divided in chunks of constant size.
136 | 
137 |     Parameters
138 |     ----------
139 |     d_model:
140 |         Dimension of the input vector.
141 |     q:
142 |         Dimension of all query matrix.
143 |     v:
144 |         Dimension of all value matrix.
145 |     h:
146 |         Number of heads.
147 |     attention_size:
148 |         Number of backward elements to apply attention.
149 |         Deactivated if ``None``. Default is ``None``.
150 |     chunk_size:
151 |         Size of chunks to apply attention on. Last one may be smaller (see :class:`torch.Tensor.chunk`).
152 |         Default is 168.
153 |     """
154 | 
155 |     def __init__(self,
156 |                  d_model: int,
157 |                  q: int,
158 |                  v: int,
159 |                  h: int,
160 |                  attention_size: int = None,
161 |                  chunk_size: Optional[int] = 168,
162 |                  **kwargs):
163 |         """Initialize the Multi Head Block."""
164 |         super().__init__(d_model, q, v, h, attention_size, **kwargs)
165 | 
166 |         self._chunk_size = chunk_size
167 | 
168 |         # Score mask for decoder
169 |         self._future_mask = nn.Parameter(torch.triu(torch.ones((self._chunk_size, self._chunk_size)), diagonal=1).bool(),
170 |                                          requires_grad=False)
171 | 
172 |         if self._attention_size is not None:
173 |             self._attention_mask = nn.Parameter(generate_local_map_mask(self._chunk_size, self._chunk_size, self._attention_size),
174 |                                                 requires_grad=False)
175 | 
176 |     def forward(self,
177 |                 query: torch.Tensor,
178 |                 key: torch.Tensor,
179 |                 value: torch.Tensor,
180 |                 mask: Optional[str] = None) -> torch.Tensor:
181 |         """Propagate forward the input through the MHB.
182 | 183 | We compute for each head the queries, keys and values matrices, 184 | followed by the Scaled Dot-Product. The result is concatenated 185 | and returned with shape (batch_size, K, d_model). 186 | 187 | Parameters 188 | ---------- 189 | query: 190 | Input tensor with shape (batch_size, K, d_model) used to compute queries. 191 | key: 192 | Input tensor with shape (batch_size, K, d_model) used to compute keys. 193 | value: 194 | Input tensor with shape (batch_size, K, d_model) used to compute values. 195 | mask: 196 | Mask to apply on scores before computing attention. 197 | One of ``'subsequent'``, None. Default is None. 198 | 199 | Returns 200 | ------- 201 | Self attention tensor with shape (batch_size, K, d_model). 202 | """ 203 | K = query.shape[1] 204 | n_chunk = K // self._chunk_size 205 | 206 | # Compute Q, K and V, concatenate heads on batch dimension 207 | queries = torch.cat(torch.cat(self._W_q(query).chunk(self._h, dim=-1), dim=0).chunk(n_chunk, dim=1), dim=0) 208 | keys = torch.cat(torch.cat(self._W_k(key).chunk(self._h, dim=-1), dim=0).chunk(n_chunk, dim=1), dim=0) 209 | values = torch.cat(torch.cat(self._W_v(value).chunk(self._h, dim=-1), dim=0).chunk(n_chunk, dim=1), dim=0) 210 | 211 | # Scaled Dot Product 212 | self._scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(self._chunk_size) 213 | 214 | # Compute local map mask 215 | if self._attention_size is not None: 216 | self._scores = self._scores.masked_fill(self._attention_mask, float('-inf')) 217 | 218 | # Compute future mask 219 | if mask == "subsequent": 220 | self._scores = self._scores.masked_fill(self._future_mask, float('-inf')) 221 | 222 | # Apply softmax 223 | self._scores = F.softmax(self._scores, dim=-1) 224 | 225 | attention = torch.bmm(self._scores, values) 226 | 227 | # Concatenat the heads 228 | attention_heads = torch.cat(torch.cat(attention.chunk( 229 | n_chunk, dim=0), dim=1).chunk(self._h, dim=0), dim=-1) 230 | 231 | # Apply linear transformation W^O 232 | self_attention = self._W_o(attention_heads) 233 | 234 | return self_attention 235 | 236 | 237 | class MultiHeadAttentionWindow(MultiHeadAttention): 238 | """Multi Head Attention block with moving window. 239 | 240 | Given 3 inputs of shape (batch_size, K, d_model), that will be used 241 | to compute query, keys and values, we output a self attention 242 | tensor of shape (batch_size, K, d_model). 243 | Queries, keys and values are divided in chunks using a moving window. 244 | 245 | Parameters 246 | ---------- 247 | d_model: 248 | Dimension of the input vector. 249 | q: 250 | Dimension of all query matrix. 251 | v: 252 | Dimension of all value matrix. 253 | h: 254 | Number of heads. 255 | attention_size: 256 | Number of backward elements to apply attention. 257 | Deactivated if ``None``. Default is ``None``. 258 | window_size: 259 | Size of the window used to extract chunks. 260 | Default is 168 261 | padding: 262 | Padding around each window. Padding will be applied to input sequence. 263 | Default is 168 // 4 = 42. 
264 | """ 265 | 266 | def __init__(self, 267 | d_model: int, 268 | q: int, 269 | v: int, 270 | h: int, 271 | attention_size: int = None, 272 | window_size: Optional[int] = 168, 273 | padding: Optional[int] = 168 // 4, 274 | **kwargs): 275 | """Initialize the Multi Head Block.""" 276 | super().__init__(d_model, q, v, h, attention_size, **kwargs) 277 | 278 | self._window_size = window_size 279 | self._padding = padding 280 | self._q = q 281 | self._v = v 282 | 283 | # Step size for the moving window 284 | self._step = self._window_size - 2 * self._padding 285 | 286 | # Score mask for decoder 287 | self._future_mask = nn.Parameter(torch.triu(torch.ones((self._window_size, self._window_size)), diagonal=1).bool(), 288 | requires_grad=False) 289 | 290 | if self._attention_size is not None: 291 | self._attention_mask = nn.Parameter(generate_local_map_mask(self._window_size, self._window_size, self._attention_size), 292 | requires_grad=False) 293 | 294 | def forward(self, 295 | query: torch.Tensor, 296 | key: torch.Tensor, 297 | value: torch.Tensor, 298 | mask: Optional[str] = None) -> torch.Tensor: 299 | """Propagate forward the input through the MHB. 300 | 301 | We compute for each head the queries, keys and values matrices, 302 | followed by the Scaled Dot-Product. The result is concatenated 303 | and returned with shape (batch_size, K, d_model). 304 | 305 | Parameters 306 | ---------- 307 | query: 308 | Input tensor with shape (batch_size, K, d_model) used to compute queries. 309 | key: 310 | Input tensor with shape (batch_size, K, d_model) used to compute keys. 311 | value: 312 | Input tensor with shape (batch_size, K, d_model) used to compute values. 313 | mask: 314 | Mask to apply on scores before computing attention. 315 | One of ``'subsequent'``, None. Default is None. 316 | 317 | Returns 318 | ------- 319 | Self attention tensor with shape (batch_size, K, d_model). 
320 |         """
321 |         batch_size = query.shape[0]
322 | 
323 |         # Apply padding to input sequence
324 |         query = F.pad(query.transpose(1, 2), (self._padding, self._padding), 'replicate').transpose(1, 2)
325 |         key = F.pad(key.transpose(1, 2), (self._padding, self._padding), 'replicate').transpose(1, 2)
326 |         value = F.pad(value.transpose(1, 2), (self._padding, self._padding), 'replicate').transpose(1, 2)
327 | 
328 |         # Compute Q, K and V, concatenate heads on batch dimension
329 |         queries = torch.cat(self._W_q(query).chunk(self._h, dim=-1), dim=0)
330 |         keys = torch.cat(self._W_k(key).chunk(self._h, dim=-1), dim=0)
331 |         values = torch.cat(self._W_v(value).chunk(self._h, dim=-1), dim=0)
332 | 
333 |         # Divide Q, K and V using a moving window
334 |         queries = queries.unfold(dimension=1, size=self._window_size, step=self._step).reshape((-1, self._q, self._window_size)).transpose(1, 2)
335 |         keys = keys.unfold(dimension=1, size=self._window_size, step=self._step).reshape((-1, self._q, self._window_size)).transpose(1, 2)
336 |         values = values.unfold(dimension=1, size=self._window_size, step=self._step).reshape((-1, self._v, self._window_size)).transpose(1, 2)
337 | 
338 |         # Scaled Dot Product
339 |         self._scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(self._window_size)
340 | 
341 |         # Compute local map mask
342 |         if self._attention_size is not None:
343 |             self._scores = self._scores.masked_fill(self._attention_mask, float('-inf'))
344 | 
345 |         # Compute future mask
346 |         if mask == "subsequent":
347 |             self._scores = self._scores.masked_fill(self._future_mask, float('-inf'))
348 | 
349 |         # Apply softmax
350 |         self._scores = F.softmax(self._scores, dim=-1)
351 | 
352 |         attention = torch.bmm(self._scores, values)
353 | 
354 |         # Fold chunks back
355 |         attention = attention.reshape((batch_size*self._h, -1, self._window_size, self._v))
356 |         attention = attention[:, :, self._padding:-self._padding, :]
357 |         attention = attention.reshape((batch_size*self._h, -1, self._v))
358 | 
359 |         # Concatenate the heads
360 |         attention_heads = torch.cat(attention.chunk(self._h, dim=0), dim=-1)
361 | 
362 |         # Apply linear transformation W^O
363 |         self_attention = self._W_o(attention_heads)
364 | 
365 |         return self_attention
366 | 
--------------------------------------------------------------------------------
/models/transformer_grn/positionwiseFeedForward.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | 
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | 
7 | 
8 | class PositionwiseFeedForward(nn.Module):
9 |     """Position-wise Feed Forward Network block from Attention is All You Need.
10 | 
11 |     Apply two linear transformations to each input, separately but identically. We
12 |     implement them as linear layers. Input and output have a shape (batch_size, K, d_model).
13 | 
14 |     Parameters
15 |     ----------
16 |     d_model:
17 |         Dimension of input tensor.
18 |     d_ff:
19 |         Dimension of hidden layer, default is 128.
20 |     """
21 | 
22 |     def __init__(self,
23 |                  d_model: int,
24 |                  d_ff: Optional[int] = 128):
25 |         """Initialize the PFF block."""
26 |         super().__init__()
27 | 
28 |         self._linear1 = nn.Linear(d_model, d_ff)
29 |         self._linear2 = nn.Linear(d_ff, d_model)
30 | 
31 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
32 |         """Propagate forward the input through the PFF block.
33 | 
34 |         Apply the first linear transformation, then a ReLU activation,
35 |         and the second linear transformation.
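36 | 
37 |         In the paper's notation, this computes (a restatement of the code
38 |         below, not an additional assumption):
39 | 
40 |         .. math:: FFN(x) = W_2 \cdot max(0, W_1 x + b_1) + b_2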
41 | 
42 |         Parameters
43 |         ----------
44 |         x:
45 |             Input tensor with shape (batch_size, K, d_model).
46 | 
47 |         Returns
48 |         -------
49 |         Output tensor with shape (batch_size, K, d_model).
50 |         """
51 |         return self._linear2(F.relu(self._linear1(x)))
52 | 
--------------------------------------------------------------------------------
/models/transformer_grn/transformer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | from models.transformer_grn.encoder import Encoder
5 | from models.transformer_grn.decoder import Decoder
6 | from models.transformer.utils import generate_original_PE, generate_regular_PE
7 | from models.temporal_fusion_t.linear_layer import LinearLayer
8 | 
9 | 
10 | class Transformer(nn.Module):
11 |     """Transformer model from Attention is All You Need.
12 | 
13 |     A classic transformer model adapted for sequential data.
14 |     Embedding has been replaced with a fully connected layer,
15 |     and the final softmax has been replaced with a linear output layer.
16 | 
17 |     Attributes
18 |     ----------
19 |     layers_encoding: :py:class:`list` of :class:`Encoder.Encoder`
20 |         stack of Encoder layers.
21 |     layers_decoding: :py:class:`list` of :class:`Decoder.Decoder`
22 |         stack of Decoder layers.
23 | 
24 |     Parameters
25 |     ----------
26 |     d_input:
27 |         Model input dimension.
28 |     d_model:
29 |         Dimension of the input vector.
30 |     d_output:
31 |         Model output dimension.
32 |     q:
33 |         Dimension of queries and keys.
34 |     v:
35 |         Dimension of values.
36 |     h:
37 |         Number of heads.
38 |     N:
39 |         Number of encoder and decoder layers to stack.
40 |     attention_size:
41 |         Number of backward elements to apply attention.
42 |         Deactivated if ``None``. Default is ``None``.
43 |     dropout:
44 |         Dropout probability after each MHA or PFF block.
45 |         Default is ``0.3``.
46 |     chunk_mode:
47 |         Switch between different MultiHeadAttention blocks.
48 |         One of ``'chunk'``, ``'window'`` or ``None``. Default is ``'chunk'``.
49 |     pe:
50 |         Type of positional encoding to add.
51 |         Must be one of ``'original'``, ``'regular'`` or ``None``. Default is ``None``.
52 | """ 53 | 54 | def __init__(self, cnf: dict): 55 | """Create transformer structure from Encoder and Decoder blocks.""" 56 | super().__init__() 57 | 58 | d_model = cnf["d_model"] 59 | q = cnf["q"] 60 | v = cnf["v"] 61 | h = cnf["h"] 62 | N = cnf["N"] 63 | attention_size = cnf["attention_size"] 64 | dropout = cnf["dropout"] 65 | pe = cnf["pe"] 66 | chunk_mode = cnf["chunk_mode"] 67 | d_input = cnf["d_input"] 68 | d_output = cnf["d_output"] 69 | self.time_steps = cnf["num_encoder_steps"] 70 | self.static_vars = cnf['static_input_loc'] 71 | self.regular_vars = cnf['known_regular_inputs'] + cnf['input_obs_loc'] 72 | 73 | self._d_model = d_model 74 | 75 | self.layers_encoding = nn.ModuleList([Encoder(d_model, 76 | q, 77 | v, 78 | h, 79 | attention_size=attention_size, 80 | dropout=dropout, 81 | chunk_mode=chunk_mode) for _ in range(N)]) 82 | self.layers_decoding = nn.ModuleList([Decoder(d_model, 83 | q, 84 | v, 85 | h, 86 | attention_size=attention_size, 87 | dropout=dropout, 88 | chunk_mode=chunk_mode) for _ in range(N)]) 89 | 90 | self._embedding_categorical = nn.ModuleList() 91 | for i in range(len(self.static_vars)): 92 | embedding = nn.Embedding(cnf['category_counts'][i], d_model) 93 | self._embedding_categorical.append(embedding) 94 | 95 | self._time_varying_embedding_layer = LinearLayer(input_size=len(self.regular_vars), size=d_model, 96 | use_time_distributed=True, batch_first=True) 97 | 98 | self._linear = nn.Linear(d_model, d_output) 99 | 100 | pe_functions = { 101 | 'original': generate_original_PE, 102 | 'regular': generate_regular_PE, 103 | } 104 | 105 | if pe in pe_functions.keys(): 106 | self._generate_PE = pe_functions[pe] 107 | else: 108 | self._generate_PE = None 109 | 110 | self.name = 'transformer' 111 | 112 | def split_features(self, x): 113 | x_static = torch.stack([ 114 | self._embedding_categorical[i](x[..., ix].long()) 115 | for i, ix in enumerate(self.static_vars) 116 | ], dim=-1) 117 | 118 | x_static = x_static[:, 0:1, :].squeeze(-1) 119 | x_input = self._time_varying_embedding_layer(x[..., self.regular_vars]) 120 | 121 | return x_input, x_static 122 | 123 | def forward(self, xy: torch.Tensor) -> torch.Tensor: 124 | """Propagate input through transformer 125 | 126 | Forward input through an embedding module, 127 | the encoder then decoder stacks, and an output module. 128 | 129 | Parameters 130 | ---------- 131 | x: 132 | :class:`torch.Tensor` of shape (batch_size, K, d_input). 133 | 134 | Returns 135 | ------- 136 | Output tensor with shape (batch_size, K, d_output). 
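137 | 
138 |         Note
139 |         ----
140 |         The decoder is fed the target sequence shifted right by one step
141 |         (teacher forcing). At inference time the future part of ``xy``
142 |         would have to be produced autoregressively instead.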
        """
        x = xy[:, :self.time_steps]
        y = xy[:, self.time_steps:]

        # Shift the decoder input right by one step, prepending a start token
        pad = torch.ones((y.shape[0], 1, y.shape[2])).to(y.device)
        y = torch.cat((pad, y), dim=1)[:, :-1, :]

        x_input, x_static = self.split_features(x)
        y_input, y_static = self.split_features(y)

        # Add position encoding
        if self._generate_PE is not None:
            positional_encoding = self._generate_PE(x_input.shape[1], self._d_model)
            positional_encoding = positional_encoding.to(x_input.device)
            x_input.add_(positional_encoding)

        # Encoding stack: feed each layer with the output of the previous one
        encoding_x = x_input
        for layer in self.layers_encoding:
            encoding_x = layer(encoding_x, context=x_static)

        # Decoding stack
        decoding = y_input

        # Add position encoding
        if self._generate_PE is not None:
            positional_encoding = self._generate_PE(y.shape[1], self._d_model)
            positional_encoding = positional_encoding.to(decoding.device)
            decoding.add_(positional_encoding)

        for layer in self.layers_decoding:
            decoding = layer(decoding, encoding_x, context=y_static)

        # Output module
        output = self._linear(decoding)
        return output
--------------------------------------------------------------------------------
/models/transformer_grn/utils.py:
--------------------------------------------------------------------------------
from typing import Optional

import numpy as np
import torch


def generate_original_PE(length: int, d_model: int) -> torch.Tensor:
    """Generate a sine/cosine positional encoding in the style of the original
    Transformer paper (with base 1000 instead of 10000 here).

    Parameters
    ----------
    length:
        Time window length, i.e. K.
    d_model:
        Dimension of the model vector.

    Returns
    -------
    Tensor of shape (K, d_model).
    """
    PE = torch.zeros((length, d_model))

    pos = torch.arange(length).unsqueeze(1)
    PE[:, 0::2] = torch.sin(
        pos / torch.pow(1000, torch.arange(0, d_model, 2, dtype=torch.float32)/d_model))
    PE[:, 1::2] = torch.cos(
        pos / torch.pow(1000, torch.arange(1, d_model, 2, dtype=torch.float32)/d_model))

    return PE


def generate_regular_PE(length: int, d_model: int, period: Optional[int] = 24) -> torch.Tensor:
    """Generate a periodic (sine-only) positional encoding with a given period.

    Parameters
    ----------
    length:
        Time window length, i.e. K.
    d_model:
        Dimension of the model vector.
    period:
        Size of the pattern to repeat.
        Default is 24.

    Returns
    -------
    Tensor of shape (K, d_model).
    """
    pos = torch.arange(length, dtype=torch.float32).unsqueeze(1)
    PE = torch.sin(pos * 2 * np.pi / period)
    PE = PE.repeat((1, d_model))

    return PE


def generate_local_map_mask(row: int,
                            col: int,
                            attention_size: int,
                            mask_future=False,
                            device: torch.device = 'cpu') -> torch.BoolTensor:
    """Compute attention mask as an attention_size wide diagonal band.

    Parameters
    ----------
    row:
        Time dimension size v1.
    col:
        Time dimension size v2.
    attention_size:
        Number of backward elements to apply attention to.
    mask_future:
        If ``True``, also mask positions in the future (causal mask).
        Default is ``False``.
    device:
        torch device. Default is ``'cpu'``.

    Returns
    -------
    Mask as a boolean tensor (``True`` marks masked positions).
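
    Example
    -------
    A small worked example: with ``row = col = 4``, ``attention_size = 1`` and
    ``mask_future=False``, positions more than one step apart are masked
    (``True`` = masked)::

        [[False, False,  True,  True],
         [False, False, False,  True],
         [ True, False, False, False],
         [ True,  True, False, False]]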
    """
    local_map = np.empty((row, col))
    i, j = np.indices(local_map.shape)

    if mask_future:
        local_map[i, j] = (i - j > attention_size) ^ (j - i > 0)
    else:
        local_map[i, j] = np.abs(i - j) > attention_size

    return torch.BoolTensor(local_map).to(device)
--------------------------------------------------------------------------------
/progress_bar.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# ---------------------

import math
from datetime import datetime


class ProgressBar(object):
    """
    Utility class for the management of progress bars showing training progress in the form
    "[<date>] Epoch <epoch>.<step>: |<bar>| <percentage>%"
    """

    @property
    def progress(self):
        # type: () -> float
        return (self.step + 1) / self.max_step

    def __init__(self, max_step, max_epoch, current_epoch=0):
        # type: (int, int, int) -> None
        self.max_step = max_step
        self.max_epoch = max_epoch
        self.current_epoch = current_epoch
        self.step = 0

    def inc(self):
        # type: () -> None
        """
        Increase the progress bar value by one unit
        """
        self.step = self.step + 1
        if self.step == self.max_step:
            self.step = 0
            self.current_epoch = self.current_epoch + 1

    def __str__(self):
        # type: () -> str
        value = int(round(self.progress * 50))
        date = datetime.now().strftime("%b-%d@%H:%M").lower()
        progress_bar = ('█' * value + ('┈' * (50 - value)))
        return '\r[{}] Epoch {:0{e}d}.{:0{s}d}: │{}│ {:6.2f}%'.format(
            date, self.current_epoch, self.step + 1,
            progress_bar, 100 * self.progress,
            e=math.ceil(math.log10(self.max_epoch)),
            s=math.ceil(math.log10(self.max_step + 1)),
        )
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
azure-storage-blob
typing>=3.7
Click>=7.0
numpy>=1.17
torchsummary>=1.5
matplotlib>=3.1
torch>=1.3
termcolor>=1.1
torchvision>=0.2
Pillow>=6.2
tensorboardX>=1.9
PyYAML>=5.1.2
path.py>=12.0
pandas
scikit-learn
--------------------------------------------------------------------------------
/scheduler.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# ---------------------

import torch.backends.cudnn as cudnn
from conf import Conf
from trainer import Trainer
import time

import glob
from pathlib import Path
from retry import retry
import click

cudnn.benchmark = True


@click.command()
@click.option('--exp_path', type=str, default="./conf/experiments/")
@retry(tries=2, delay=2)
def scheduler(exp_path):
    """
    Run every experiment whose YAML configuration lives in `exp_path`, one after the other.
    """
    for file in sorted(glob.glob(exp_path + "*.yaml")):
        time.sleep(5)
        exp_name = Path(file).stem
        cnf = Conf(conf_file_path=file, exp_name=exp_name, seed=666, log=False)
        print("\n Starting experiment: " + exp_name + "\n")
        trainer = Trainer(cnf=cnf)
        try:
            trainer.run()
        except Exception as e:
            print(e)
        del trainer
        print("\n Starting next experiment...\n")


if __name__ == '__main__':
    scheduler()
--------------------------------------------------------------------------------
/slurm.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# ---------------------

import subprocess

import click
from path import Path


# -----------------------------
# Template of the Slurm script
# -----------------------------
TEMPLATE = '''#!/bin/bash
#SBATCH --job-name=**exp**
#SBATCH --output=**project**/slurm/log/out.**exp**.txt
#SBATCH --error=**project**/slurm/log/err.**exp**.txt
#SBATCH --open-mode=append
#SBATCH --partition=prod
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --gres=gpu:1

source activate python3

cd **project**
srun python -u main.py --exp_name '**exp**!' --conf_file_path '**cnf**'
'''


@click.command()
def main():
    """
    (1) creates the Slurm script
    (2) saves it in 'slurm/<exp_name>.sh'
    (3) submits it with `sbatch` if the user confirms
    """

    out_err_log_dir_path = Path('slurm/log')
    if not out_err_log_dir_path.exists():
        out_err_log_dir_path.makedirs()

    exp_name = click.prompt('▶ experiment name', type=str)
    if Path(f'conf/{exp_name}.yaml').exists():
        conf_file_name = click.prompt('▶ conf file name', default=f'{exp_name}.yaml')
    else:
        conf_file_name = click.prompt('▶ conf file name', default='default.yaml')

    if '/' in conf_file_name:
        conf_file_path = conf_file_name
    else:
        conf_file_path = f'conf/{conf_file_name}'
    project_dir_path = Path('.').abspath()

    text = TEMPLATE
    text = text.replace('**exp**', exp_name)
    text = text.replace('**cnf**', conf_file_path)
    text = text.replace('**project**', project_dir_path)
    if 'flanzi' in project_dir_path:
        text = text.replace('source activate python3', '#source activate python3')

    print('\n-------------------------------------\n')
    print(text)

    out_file_path = Path('slurm') / exp_name + '.sh'
    out_file_path.write_text(text=text)

    print('-------------------------------------\n')
    if click.confirm('▶ sbatch now?', default=True):
        print('\n-------------------------------------\n')
        command = f'sbatch {out_file_path}'
        process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
        print('▶', output.decode())
        if error:
            print('▶ [ERROR] - ', error.decode())


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/slurm/Traffic_5TR.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH --job-name=TF_Traffic
#SBATCH --output=/homes/svincenzi/TIME_SERIES/slurm/log/out.TF_Traffic.txt
#SBATCH --error=/homes/svincenzi/TIME_SERIES/slurm/log/err.TF_Traffic.txt
#SBATCH --open-mode=append
#SBATCH --partition=prod
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --gres=gpu:1

source activate py_env2
module load cuda/10.0

export PYTHONPATH=/homes/svincenzi/TIME_SERIES


cd /homes/svincenzi/TIME_SERIES
srun python -u main.py --exp_name 'TF_Traffic!' --conf_file_path 'conf/traffic.yaml'
--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# ---------------------

from time import time
import numpy as np
import torch
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from conf import Conf
from dataset.ts_dataset import TSDataset
from models.temporal_fusion_t import tft_model
from progress_bar import ProgressBar
from utils import QuantileLoss, symmetric_mean_absolute_percentage_error, unnormalize_tensor, plot_temporal_serie
import data_formatters.utils as utils
from models.transformer import Transformer
from models.transformer_grn.transformer import Transformer as GRNTransformer


class Trainer(object):
    """
    Class for training and testing the model
    """

    def __init__(self, cnf):
        # type: (Conf) -> None

        torch.set_num_threads(3)

        self.cnf = cnf
        self.data_formatter = utils.make_data_formatter(cnf.ds_name)

        loader = TSDataset

        # init dataset
        dataset_train = loader(self.cnf, self.data_formatter)
        dataset_train.train()
        dataset_test = loader(self.cnf, self.data_formatter)
        dataset_test.test()

        # init model
        model_choice = self.cnf.all_params["model"]
        if model_choice == "transformer":
            # Baseline transformer
            self.model = Transformer(self.cnf.all_params)
        elif model_choice == "tf_transformer":
            # Temporal fusion transformer
            self.model = tft_model.TFT(self.cnf.all_params)
        elif model_choice == "grn_transformer":
            # Transformer + GRN to encode static vars
            self.model = GRNTransformer(self.cnf.all_params)
        else:
            raise NameError(model_choice)

        self.model = self.model.to(cnf.device)

        # init optimizer
        self.optimizer = optim.Adam(params=self.model.parameters(), lr=cnf.lr)
        self.loss = QuantileLoss(cnf.quantiles)

        # init train loader
        self.train_loader = DataLoader(
            dataset=dataset_train, batch_size=cnf.batch_size,
            num_workers=cnf.n_workers, shuffle=True, pin_memory=True,
        )

        # init test loader
        self.test_loader = DataLoader(
            dataset=dataset_test, batch_size=cnf.batch_size,
            num_workers=cnf.n_workers, shuffle=False, pin_memory=True,
        )

        # init logging stuffs
        self.log_path = cnf.exp_log_path
        print(f'tensorboard --logdir={cnf.project_log_path.abspath()}\n')
        self.sw = SummaryWriter(self.log_path)
        self.log_freq = len(self.train_loader)
        self.train_losses = []
        self.test_loss = []
        self.test_losses = {'p10': [], 'p50': [], 'p90': []}
        self.test_smape = []

        # starting values
        self.epoch = 0
        self.best_test_loss = None

        # init progress bar
        self.progress_bar = ProgressBar(max_step=self.log_freq, max_epoch=self.cnf.epochs)

        # possibly load checkpoint
        self.load_ck()

        print("Finished preparing datasets.")

    def load_ck(self):
        """
        load training checkpoint
        """
        ck_path = self.log_path / 'training.ck'
        if ck_path.exists():
            ck = torch.load(ck_path)
            print(f'[loading checkpoint \'{ck_path}\']')
            self.epoch = ck['epoch']
            self.progress_bar.current_epoch = self.epoch
            self.model.load_state_dict(ck['model'])
            self.optimizer.load_state_dict(ck['optimizer'])
            self.best_test_loss = ck['best_test_loss']

    def save_ck(self):
        """
        save training checkpoint
        """
        ck = {
            'epoch': self.epoch,
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'best_test_loss': self.best_test_loss
        }
        torch.save(ck, self.log_path / 'training.ck')

    def train(self):
        """
        train model for one epoch on the Training-Set.
        """
        start_time = time()
        self.model.train()

        times = []
        for step, sample in enumerate(self.train_loader):
            t = time()
            self.optimizer.zero_grad()
            # Feed input to the model
            x = sample['inputs'].float().to(self.cnf.device)
            if self.cnf.all_params["model"] == "tf_transformer":
                output, _, _ = self.model.forward(x)
            else:
                output = self.model.forward(x)

            # Compute Loss
            loss, _ = self.loss(output.squeeze(), sample['outputs'].squeeze().float().to(self.cnf.device))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cnf.all_params['max_gradient_norm'])
            self.train_losses.append(loss.item())
            self.optimizer.step()

            # print an incredible progress bar
            times.append(time() - t)
            if self.cnf.log_each_step or (not self.cnf.log_each_step and self.progress_bar.progress == 1):
                print(f'\r{self.progress_bar} '
                      f'│ Loss: {np.mean(self.train_losses):.6f} '
                      f'│ ↯: {1 / np.mean(times):5.2f} step/s', end='')
            self.progress_bar.inc()

        # log average loss of this epoch
        mean_epoch_loss = np.mean(self.train_losses)
        self.sw.add_scalar(tag='train_loss', scalar_value=mean_epoch_loss, global_step=self.epoch)
        self.train_losses = []

        # log epoch duration
        print(f' │ T: {time() - start_time:.2f} s')

    def test(self):
        """
        test model on the Test-Set
        """
        self.model.eval()
        output, sample = None, None

        t = time()
        for step, sample in enumerate(self.test_loader):

            # Hide future target values from the input vector: the observed target
            # (column 0) is set to a constant (1) for every timestep >= num_encoder_steps
            steps = self.cnf.all_params['num_encoder_steps']
            pred_len = sample['outputs'].shape[1]
            x = sample['inputs'].float().to(self.cnf.device)
            x[:, steps:, 0] = 1

            # Feed input to the model
            if self.cnf.all_params["model"] == "transformer" or self.cnf.all_params["model"] == "grn_transformer":

                # Auto-regressive prediction: feed each p50 prediction back
                # as input for the next decoding step
                for i in range(pred_len):
                    output = self.model.forward(x)
                    x[:, steps + i, 0] = output[:, i, 1]
                output = self.model.forward(x)

            elif self.cnf.all_params["model"] == "tf_transformer":
                output, _, _ = self.model.forward(x)
            else:
                raise NameError(self.cnf.all_params["model"])

            output = output.squeeze()
            y, y_pred = sample['outputs'].squeeze().float().to(self.cnf.device), output

            # Compute loss
            loss, _ = self.loss(y_pred, y)
            smape = symmetric_mean_absolute_percentage_error(output[:, :, 1].detach().cpu().numpy(),
                                                             sample['outputs'][:, :, 0].detach().cpu().numpy())

            # De-Normalize to compute metrics
            target = unnormalize_tensor(self.data_formatter, y, sample['identifier'][0][0])
            p10_forecast = unnormalize_tensor(self.data_formatter, y_pred[..., 0], sample['identifier'][0][0])
            p50_forecast = unnormalize_tensor(self.data_formatter, y_pred[..., 1], sample['identifier'][0][0])
            p90_forecast = unnormalize_tensor(self.data_formatter, y_pred[..., 2], sample['identifier'][0][0])

            # Compute metrics
            self.test_losses['p10'].append(self.loss.numpy_normalised_quantile_loss(p10_forecast, target, 0.1))
            self.test_losses['p50'].append(self.loss.numpy_normalised_quantile_loss(p50_forecast, target, 0.5))
            self.test_losses['p90'].append(self.loss.numpy_normalised_quantile_loss(p90_forecast, target, 0.9))

            self.test_loss.append(loss.item())
            self.test_smape.append(smape)

        # Log stuff
        for k in self.test_losses.keys():
            mean_test_loss = np.mean(self.test_losses[k])
            print(f'\t● AVG {k} Loss on TEST-set: {mean_test_loss:.6f} │ T: {time() - t:.2f} s')
            self.sw.add_scalar(tag=k + '_test_loss', scalar_value=mean_test_loss, global_step=self.epoch)

        # log log log
        mean_test_loss = np.mean(self.test_loss)
        mean_smape = np.mean(self.test_smape)
        print(f'\t● AVG Loss on TEST-set: {mean_test_loss:.6f} │ T: {time() - t:.2f} s')
        print(f'\t● AVG SMAPE on TEST-set: {mean_smape:.6f} │ T: {time() - t:.2f} s')
        self.sw.add_scalar(tag='test_smape', scalar_value=mean_smape, global_step=self.epoch)
        self.sw.add_scalar(tag='test_loss', scalar_value=mean_test_loss, global_step=self.epoch)

        # save best model
        if self.best_test_loss is None or mean_test_loss < self.best_test_loss:
            self.best_test_loss = mean_test_loss
            torch.save(self.model.state_dict(), self.log_path / self.cnf.exp_name + '_best.pth')

    def run(self):
        """
        start model training procedure (train > test > checkpoint > repeat)
        """
        for _ in range(self.epoch, self.cnf.epochs):
            self.train()

            with torch.no_grad():
                self.test()

            self.epoch += 1
            self.save_ck()
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# ---------------------

import json
import os
from datetime import datetime
from enum import Enum
from typing import *

import PIL
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from PIL.Image import Image
from matplotlib import cm
from matplotlib import figure
from pathlib import Path
from torch import Tensor
from torch import nn
from torchvision.transforms import ToTensor


class QuantileLoss(nn.Module):
    ## From: https://medium.com/the-artificial-impostor/quantile-regression-part-2-6fdbc26b2629

    def __init__(self, quantiles):
        ## takes a list of quantiles
        super().__init__()
        self.quantiles = quantiles

    def numpy_normalised_quantile_loss(self, y_pred, y, quantile):
        """Computes normalised quantile loss for numpy arrays.
        Uses the q-Risk metric as defined in the "Training Procedure" section of the
        main TFT paper.
        Args:
            y_pred: Predictions
            y: Targets
            quantile: Quantile to use for loss calculations (between 0 & 1)
        Returns:
            Float for normalised quantile loss.
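
        Computed here, matching the q-Risk definition in the TFT paper, as::

            q-Risk = 2 * mean(q * max(y - y_pred, 0) + (1 - q) * max(y_pred - y, 0)) / mean(|y|)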
        """
        if isinstance(y_pred, torch.Tensor):
            y_pred = y_pred.detach().cpu().numpy()

        if len(y_pred.shape) == 3:
            ix = self.quantiles.index(quantile)
            y_pred = y_pred[..., ix]

        if isinstance(y, torch.Tensor):
            y = y.detach().cpu().numpy()

        prediction_underflow = y - y_pred
        weighted_errors = quantile * np.maximum(prediction_underflow, 0.) \
                          + (1. - quantile) * np.maximum(-prediction_underflow, 0.)

        quantile_loss = weighted_errors.mean()
        normaliser = np.abs(y).mean()

        return 2 * quantile_loss / normaliser

    def forward(self, preds, target, ret_losses=True):
        assert not target.requires_grad
        assert preds.size(0) == target.size(0)
        losses = []

        for i, q in enumerate(self.quantiles):
            errors = target - preds[:, :, i]
            losses.append(
                torch.max(
                    (q - 1) * errors,
                    q * errors
                ).unsqueeze(1))
        loss = torch.mean(
            torch.sum(torch.cat(losses, dim=1), dim=1))
        if ret_losses:
            return loss, losses
        return loss


def unnormalize_tensor(data_formatter, data, identifier):
    data = pd.DataFrame(
        data.detach().cpu().numpy(),
        columns=[
            't+{}'.format(i)
            for i in range(data.shape[1])
        ])

    data['identifier'] = np.array(identifier)
    data = data_formatter.format_predictions(data)

    return data.drop(columns=['identifier']).values


def symmetric_mean_absolute_percentage_error(forecast, actual):
    # Symmetric Mean Absolute Percentage Error (SMAPE)
    sequence_length = forecast.shape[1]
    sumf = np.sum(np.abs(forecast - actual) / (np.abs(actual) + np.abs(forecast)), axis=1)
    return np.mean((2 * sumf) / sequence_length)


def plot_temporal_serie(y_pred, y_true):
    if isinstance(y_pred, Tensor):
        y_pred = y_pred.detach().cpu().numpy()

    if isinstance(y_true, Tensor):
        y_true = y_true.detach().cpu().numpy()

    # plot the three predicted quantiles against the ground truth
    # for a randomly chosen series of the batch
    ind = np.random.choice(y_pred.shape[0])
    plt.plot(y_pred[ind, :, 0], label='pred_p10')
    plt.plot(y_pred[ind, :, 1], label='pred_p50')
    plt.plot(y_pred[ind, :, 2], label='pred_p90')

    plt.plot(y_true[ind, :, 0], label='true')
    plt.legend()
    plt.show()


def imread(path):
    # type: (Union[Path, str]) -> Image
    """
    Reads the image located in `path`
    :param path: path of the image to read
    :return: the image as a PIL Image, converted to RGB
    """
    with open(path, 'rb') as f:
        with PIL.Image.open(f) as img:
            return img.convert('RGB')


def pyplot_to_numpy(pyplot_figure):
    # type: (figure.Figure) -> np.ndarray
    """
    Converts a PyPlot figure into a NumPy array
    :param pyplot_figure: figure you want to convert
    :return: converted NumPy array
    """
    pyplot_figure.canvas.draw()
    x = np.frombuffer(pyplot_figure.canvas.tostring_rgb(), dtype=np.uint8)
    x = x.reshape(pyplot_figure.canvas.get_width_height()[::-1] + (3,))
    return x


def pyplot_to_tensor(pyplot_figure):
    # type: (figure.Figure) -> Tensor
    """
    Converts a PyPlot figure into a PyTorch tensor
    :param pyplot_figure: figure you want to convert
    :return: converted PyTorch tensor
    """
    x = pyplot_to_numpy(pyplot_figure=pyplot_figure)
    x = ToTensor()(x)
    return x


def apply_colormap_to_tensor(x, cmap='jet', range=(None, None)):
    # type: (Tensor, str, Optional[Tuple[float, float]]) -> Tensor
    """
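    Applies a matplotlib colormap to a single-channel tensor and returns the
    resulting RGB tensor.
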
    :param x: Tensor with shape (1, H, W)
    :param cmap: name of the color map you want to apply
    :param range: tuple of (minimum possible value in x, maximum possible value in x)
    :return: Tensor with shape (3, H, W)
    """
    cmap = cm.ScalarMappable(cmap=cmap)
    cmap.set_clim(vmin=range[0], vmax=range[1])
    x = x.detach().cpu().numpy()
    x = x.squeeze()
    x = cmap.to_rgba(x)[:, :, :-1]
    return ToTensor()(x)
--------------------------------------------------------------------------------