├── .gitignore ├── LICENSE ├── README.md ├── agents ├── __init__.py ├── base.py └── rnn_autoencoder.py ├── configs ├── __init__.py └── config_rnn_ae.json ├── data └── .keep ├── datasets ├── __init__.py └── ecg5000.py ├── experiments ├── .keep └── checkpoints │ └── .keep ├── graphs ├── __init__.py ├── losses │ ├── MAEAUCLoss.py │ ├── MAELoss.py │ ├── MSEAUCLoss.py │ └── MSELoss.py └── models │ ├── __init__.py │ └── recurrent_autoencoder.py ├── main.py ├── notebooks └── .keep ├── requirements.txt └── utils ├── __init__.py ├── assets ├── dec_eq.PNG ├── decoder.png ├── enc_eq.PNG └── encoder.png ├── checkpoints.py ├── config.py ├── create_config.py ├── data_preparation.py ├── metrics.py └── samplers.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Jupyter Notebook 7 | .ipynb_checkpoints 8 | 9 | # Pycharm 10 | .idea/ 11 | 12 | # Data 13 | data/* 14 | !data/.keep 15 | 16 | # Experiments 17 | experiments/checkpoints/* 18 | !experiments/checkpoints/.keep 19 | 20 | # Personal files 21 | Usefull_Link.py 22 | create_config.py 23 | ecg5000_data_preparation.py 24 | fpr 25 | roc_auc 26 | tpr 27 | train_loss_all 28 | valid_MAE_all 29 | valid_loss_all 30 | utils/data_preparation_old.py 31 | 32 | # Env 33 | rnn_ae_env/* 34 | demo_env/* 35 | 36 | # Notebooks 37 | notebooks/* 38 | !notebooks/.keep 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Francesco 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recurrent Neural Networks-based Autoencoders 2 | A PyTorch implementation of [LSTM-based Encoder-Decoder for Multi-sensor Anomaly Detection](https://arxiv.org/pdf/1607.00148.pdf) 3 | 4 | 5 | ## Table of Contents: 6 | 7 | - [Recurrent Neural Networks-based Autoencoders](#recurrent-neural-networks-based-autoencoders) 8 | - [Project Structure](#project-structure) 9 | - [Model](#model) 10 | - [Data](#data) 11 | - [Requirements](#requirements) 12 | - [Usage](#usage) 13 | 14 | 15 | ### Project Structure: 16 | The project structure is based on the following [PyTorch Project Template](https://github.com/moemen95/PyTorch-Project-Template) 17 | ``` 18 | ├── agents 19 | | └── rnn_autoencoder.py # the main training agent for the recurrent NN-based AE 20 | ├── graphs 21 | | └── models 22 | | | └── recurrent_autoencoder.py # recurrent NN-based AE model definition 23 | | └── losses 24 | | | └── MAELoss.py # contains the Mean Absolute Error (MAE) loss 25 | | | └── MSELoss.py # contains the Mean Squared Error (MSE) loss 26 | | | └── MAEAUCLoss.py / MSEAUCLoss.py # MAE/MSE losses with an AUC penalty (experimental) 27 | ├── datasets # contains all dataloaders for the project 28 | | └── ecg5000.py # dataloader for the ECG5000 dataset 29 | ├── data 30 | | └── ECG5000 # contains all ECG time series 31 | ├── utils # utilities folder containing metrics, checkpoints and arg parsing (configs). 32 | | └── assets 33 | | └── checkpoints.py 34 | | └── config.py 35 | | └── metrics.py 36 | | └── create_config.py 37 | | └── data_preparation.py 38 | ├── notebooks # folder for your notebooks 39 | ├── experiments # folder where the results of your experiments are saved 40 | ├── main.py 41 | 42 | ``` 43 | 44 | ### Model 45 | #### Encoder 46 | 47 | ![alt text](./utils/assets/encoder.png "Encoder") 48 | 49 | 50 | In the encoder, each vector $x_t$ of a time window of length $L$ is fed into a recurrent unit to perform the following computation: 51 | 52 |
$$h^{(E)}_t = f\left(h^{(E)}_{t-1},\, x_t\right), \qquad t = 1, \dots, L,$$ where $f$ is the recurrent unit (RNN, GRU or LSTM) and the final hidden state $h^{(E)}_L$ is used as the latent representation of the whole window.
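A minimal sketch of this step, mirroring `RecurrentEncoder` in `graphs/models/recurrent_autoencoder.py` (a GRU with `n_features = 1` and `latent_dim = 8`, as in the reference config; batch size and sequence length below are illustrative):

```python
import torch
import torch.nn as nn

# The latent code is simply the final hidden state of a recurrent layer.
enc = nn.GRU(input_size=1, hidden_size=8, batch_first=True)
x = torch.randn(32, 140, 1)   # (batch, seq_len, n_features), e.g. ECG5000 heartbeats
_, h_n = enc(x)               # h_n: (num_layers, batch, latent_dim)
z = h_n.squeeze(0)            # latent code z = h_L, one vector per sequence
```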
53 | 54 | 55 | #### Decoder 56 | ![alt text](./utils/assets/decoder.png "Decoder") 57 | 58 | In the decoder we reconstruct the time series in reverse order: 59 | 60 |
$$\hat{x}_t = W\, h^{(D)}_t + b, \qquad h^{(D)}_L = h^{(E)}_L,$$
61 | 62 | 63 |
$$h^{(D)}_{t-1} = f\left(h^{(D)}_t,\, \hat{x}_t\right),$$ so that the window is reconstructed from $t = L$ down to $t = 1$, starting from the encoder's final state.
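A compact sketch of this unrolling, following `RecurrentDecoder` (again with a GRU cell and the dimensions of the reference config; shapes are illustrative):

```python
import torch
import torch.nn as nn

# Unroll a recurrent cell from the encoder state, emitting one
# reconstruction per step; the sequence comes out in reverse order.
cell, dense = nn.GRUCell(1, 8), nn.Linear(8, 1)
h = torch.randn(32, 8)            # h_L^(D) = h_L^(E), the encoder's final state
x_t, outs = dense(h), []
for _ in range(140):              # seq_len steps
    h = cell(x_t, h)              # h_{t-1} = f(h_t, x_hat_t)
    x_t = dense(h)                # x_hat_{t-1} = W h_{t-1} + b
    outs.append(x_t)
x_hat = torch.stack(outs, dim=1).flip(1)   # flip back to t = 1, ..., L
```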
64 | 65 | ### Data 66 | 67 | #### Description 68 | The [ECG5000 dataset](http://www.timeseriesclassification.com/description.php?Dataset=ECG5000) contains 5000 ElectroCardioGram (ECG) univariate time series of length 140. Each sequence corresponds to a heartbeat. Five classes are annotated, corresponding to the following labels: Normal (N), R-on-T Premature Ventricular Contraction (R-on-T PVC), Premature Ventricular Contraction (PVC), Supra-ventricular Premature or Ectopic Beat (SP or EB) and Unclassified Beat (UB). The number of instances per class is reported in the following table: 69 | 70 | | Class | #Instances | 71 | | --- | --- | 72 | | N | 2919 | 73 | | R-on-T PVC | 1767 | 74 | | PVC | 194 | 75 | | SP or EB | 96 | 76 | | UB | 24 | 77 | 78 | Since the main task here is anomaly detection rather than classification, all instances that do not belong to class N have been merged into a single class, referred to as Anomalous (AN). 79 | 80 | #### Download and data partitioning 81 | You can download the ECG5000 dataset directly from [here](http://www.timeseriesclassification.com/description.php?Dataset=ECG5000) or by running the script ```utils/data_preparation.py```. This script also performs data partitioning, i.e., splitting the data into training, validation and test sets. For more details, run the following: ``` python utils/data_preparation.py -h``` 82 | 83 | 84 | ### Requirements 85 | Check [requirements.txt](https://github.com/PyLink88/Recurrent-Autoencoder/blob/main/requirements.txt). 86 | 87 | ### Usage 88 | - Before running the project, you need to add your configuration to the folder ```configs/```, as in [this example](https://github.com/PyLink88/Recurrent-Autoencoder/blob/main/configs/config_rnn_ae.json). To this end, you can simply modify the script ```utils/create_config.py``` and then run ``` python utils/create_config.py```. 89 | - Finally, to run the project: ``` python main.py configs/config_rnn_ae.json``` 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /agents/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | path = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:  # auto-import every module in this package 7 | mod = __import__('.'.join([__name__, py]), fromlist=[py]) 8 | classes = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)] 9 | for cls in classes: 10 | setattr(sys.modules[__name__], cls.__name__, cls)  # re-export each class at package level, so main.py can look agents up by name -------------------------------------------------------------------------------- /agents/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | The Base Agent class that all other agents inherit from; it defines the functions every agent must implement 3 | """ 4 | import logging 5 | 6 | class BaseAgent: 7 | """ 8 | This base class contains the base functions to be overridden by any agent you implement.
9 | """ 10 | 11 | def __init__(self, config): 12 | self.config = config 13 | 14 | def load_checkpoint(self, file_name): 15 | """ 16 | Latest checkpoint loader 17 | :param file_name: name of the checkpoint file 18 | :return: 19 | """ 20 | raise NotImplementedError 21 | 22 | def save_checkpoint(self, file_name="checkpoint.pth.tar", is_best=0): 23 | """ 24 | Checkpoint saver 25 | :param file_name: name of the checkpoint file 26 | :param is_best: boolean flag to indicate whether current checkpoint's metric is the best so far 27 | :return: 28 | """ 29 | raise NotImplementedError 30 | 31 | def run(self): 32 | """ 33 | The main operator 34 | :return: 35 | """ 36 | raise NotImplementedError 37 | 38 | def train(self): 39 | """ 40 | Main training loop 41 | :return: 42 | """ 43 | raise NotImplementedError 44 | 45 | def train_one_epoch(self): 46 | """ 47 | One epoch of training 48 | :return: 49 | """ 50 | raise NotImplementedError 51 | 52 | def validate_one_epoch(self): 53 | """ 54 | One cycle of model validation 55 | :return: 56 | """ 57 | raise NotImplementedError 58 | 59 | def finalize(self): 60 | """ 61 | Finalizes all the operations of the two main classes of the process, the operator and the data loader 62 | :return: 63 | """ 64 | raise NotImplementedError 65 | -------------------------------------------------------------------------------- /agents/rnn_autoencoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | The RNN autoencoder agent class 3 | """ 4 | 5 | import torch 6 | from torch import nn 7 | 8 | import numpy as np 9 | from tqdm import tqdm 10 | import shutil 11 | import os 12 | 13 | from agents.base import BaseAgent 14 | from utils.metrics import AverageMeter 15 | from utils.checkpoints import checkpoints_folder 16 | from utils.config import save_config 17 | from datasets.ecg5000 import ECG500DataLoader 18 | from graphs.models.recurrent_autoencoder import RecurrentAE 19 | from graphs.losses.MAEAUCLoss import MAEAUCLoss 20 | from graphs.losses.MSEAUCLoss import MSEAUCLoss 21 | from graphs.losses.MAELoss import MAELoss 22 | from graphs.losses.MSELoss import MSELoss 23 | 24 | class RecurrentAEAgent(BaseAgent): 25 | 26 | def __init__(self, config): 27 | super().__init__(config) 28 | 29 | # Create an instance of the model 30 | self.model = RecurrentAE(self.config) 31 | 32 | # Create an instance of the data loader 33 | self.data_loader = ECG500DataLoader(self.config) 34 | 35 | # Create an instance of the loss selected in the config 36 | self.loss = {'MSE': MSELoss(), 37 | 'MAE': MAELoss(), 38 | 'MSEAUC': MSEAUCLoss(), 39 | 'MAEAUC': MAEAUCLoss()}[self.config.loss] 40 | 41 | # Create an instance of the optimizer 42 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr = self.config.learning_rate) 43 | 44 | # Training info 45 | self.current_epoch = 0 46 | 47 | # Creating the folder where checkpoints are saved 48 | self.checkpoints_path = checkpoints_folder(self.config) 49 | 50 | # Initialize the remaining counters 51 | self.best_valid = 10e+16 # a very large initial value 52 | self.train_loss = np.array([], dtype = np.float64) 53 | self.train_loss_parz = np.array([], dtype = np.float64) 54 | self.valid_loss = np.array([], dtype = np.float64) 55 | 56 | 57 | # Check if CUDA is available 58 | self.is_cuda = torch.cuda.is_available() 59 | # Construct the flag, making sure that CUDA is both available and requested 60 | self.cuda = self.is_cuda & self.config.cuda 61 | 62 | if self.cuda: 63 | self.device = torch.device("cuda") 64 |
torch.cuda.manual_seed_all(self.config.seed) 65 | torch.cuda.set_device(self.config.gpu_device) 66 | print("Operation will be on *****GPU-CUDA***** ") 67 | 68 | else: 69 | self.device = torch.device("cpu") 70 | torch.manual_seed(self.config.seed) 71 | print("Operation will be on *****CPU***** ") 72 | 73 | self.model = self.model.to(self.device) 74 | self.loss = self.loss.to(self.device) 75 | 76 | # Loading checkpoint (if enabled in the config) 77 | self.load_checkpoint(self.config.checkpoint_file) 78 | 79 | def train(self): 80 | 81 | for epoch in range(self.current_epoch, self.config.max_epoch): 82 | 83 | self.current_epoch = epoch 84 | 85 | # Training epoch 86 | if self.config.training_type == "one_class": 87 | perf_train = self.train_one_epoch() 88 | self.train_loss = np.append(self.train_loss, perf_train[0].avg) 89 | print('Training loss at epoch ' + str(self.current_epoch) + ' is ' + str(perf_train[0].avg)) 90 | else: 91 | perf_train, perf_train_parz = self.train_one_epoch() 92 | self.train_loss = np.append(self.train_loss, perf_train.avg) 93 | self.train_loss_parz = np.append(self.train_loss_parz, perf_train_parz.avg) 94 | print('Training loss at epoch ' + str(self.current_epoch) + ' is ' + str(perf_train.avg)) 95 | print('Training reconstruction loss (no AUC penalty) at epoch ' + str(self.current_epoch) + ' is ' + str(perf_train_parz.avg)) 96 | 97 | # Validation 98 | perf_valid = self.validate_one_epoch() 99 | self.valid_loss = np.append(self.valid_loss, perf_valid.avg) 100 | print('Validation loss at epoch ' + str(self.current_epoch) + ' is ' + str(perf_valid.avg)) 101 | 102 | 103 | # Saving 104 | is_best = perf_valid.sum < self.best_valid  # total validation loss over the epoch 105 | if is_best: 106 | self.best_valid = perf_valid.sum 107 | self.save_checkpoint(is_best=is_best) 108 | 109 | def train_one_epoch(self): 110 | """ One epoch training step """ 111 | 112 | # Initialize tqdm 113 | tqdm_batch = tqdm(self.data_loader.train_loader, total = self.data_loader.train_iterations, 114 | desc ="Epoch-{}-".format(self.current_epoch)) 115 | 116 | # Set the model to be in training mode 117 | self.model.train() 118 | 119 | # Initialize your average meters 120 | epoch_loss = AverageMeter() 121 | epoch_loss_parz = AverageMeter() 122 | 123 | # One epoch of training 124 | for x, y in tqdm_batch: 125 | if self.cuda: 126 | x, y = x.cuda(), y.cuda() 127 | 128 | # Model 129 | x_hat = self.model(x) 130 | 131 | # Current training loss 132 | if self.config.training_type == "one_class": 133 | cur_tr_loss = self.loss(x, x_hat) 134 | else: 135 | cur_tr_loss, cur_tr_parz_loss = self.loss(x, x_hat, y, self.config.lambda_auc) 136 | 137 | if np.isnan(float(cur_tr_loss.item())): 138 | raise ValueError('Loss is nan during training...') 139 | 140 | # Backpropagation and optimizer step 141 | self.optimizer.zero_grad() 142 | cur_tr_loss.backward() 143 | self.optimizer.step() 144 | 145 | # Updating loss 146 | if self.config.training_type == "one_class": 147 | epoch_loss.update(cur_tr_loss.item()) 148 | else: 149 | epoch_loss.update(cur_tr_loss.item()) 150 | epoch_loss_parz.update(cur_tr_parz_loss.item()) 151 | 152 | tqdm_batch.close() 153 | 154 | return epoch_loss, epoch_loss_parz 155 | 156 | def validate_one_epoch(self): 157 | """ One epoch validation step """ 158 | # Initialize tqdm 159 | tqdm_batch = tqdm(self.data_loader.valid_loader, total = self.data_loader.valid_iterations, 160 | desc = "Validation at epoch -{}-".format(self.current_epoch)) 161 | 162 | # Set the model to be in evaluation mode 163 | self.model.eval() 164 | 165 | # Initialize your average meters 166 | epoch_loss = AverageMeter() 167 |
168 | with torch.no_grad(): 169 | 170 | for x, y in tqdm_batch: 171 | if self.cuda: 172 | x, y = x.cuda(), y.cuda() 173 | 174 | # Model 175 | x_hat = self.model(x) 176 | 177 | # Current validation loss 178 | if self.config.training_type == "one_class": 179 | cur_val_loss = self.loss(x, x_hat) 180 | else: 181 | cur_val_loss = self.loss(x, x_hat, y, self.config.lambda_auc) 182 | 183 | if np.isnan(float(cur_val_loss.item())): 184 | raise ValueError('Loss is nan during validation...') 185 | 186 | # Updating loss 187 | epoch_loss.update(cur_val_loss.item()) 188 | 189 | tqdm_batch.close() 190 | return epoch_loss 191 | 192 | def save_checkpoint(self, filename ='checkpoint.pth.tar', is_best = 0): 193 | """ 194 | Saving the latest checkpoint of the training 195 | :param filename: filename which will contain the state 196 | :param is_best: flag indicating whether this is the best model so far 197 | :return: 198 | """ 199 | state = { 200 | 'epoch': self.current_epoch, 201 | 'state_dict': self.model.state_dict(), 202 | 'optimizer': self.optimizer.state_dict(), 203 | 'valid_loss': self.valid_loss, 204 | 'train_loss': self.train_loss, 205 | 'train_loss_parz': self.train_loss_parz 206 | } 207 | 208 | # Save the state 209 | torch.save(state, self.checkpoints_path + filename) 210 | 211 | # If it is the best, copy it to another file 'model_best.pth.tar' 212 | if is_best: 213 | shutil.copyfile(self.checkpoints_path + filename, 214 | self.checkpoints_path + 'model_best.pth.tar') 215 | print('Saving the best model') 216 | 217 | def load_checkpoint(self, filename): 218 | 219 | if self.config.load_checkpoint: 220 | filename = self.checkpoints_path + filename 221 | try: 222 | checkpoint = torch.load(filename) 223 | self.current_epoch = checkpoint['epoch'] 224 | self.model.load_state_dict(checkpoint['state_dict']) 225 | self.optimizer.load_state_dict(checkpoint['optimizer']) 226 | self.valid_loss = checkpoint['valid_loss'] 227 | self.train_loss = checkpoint['train_loss'] 228 | self.train_loss_parz = checkpoint['train_loss_parz'] 229 | 230 | print("Checkpoint loaded successfully from '{}' at (epoch {}) \n" 231 | .format(self.checkpoints_path, checkpoint['epoch'])) 232 | except OSError: 233 | print("No checkpoint exists from '{}'.
Skipping...".format(self.config.checkpoint_dir)) 234 | else: 235 | print('Training a new model from scratch') 236 | 237 | def run(self): 238 | """ 239 | The main operator 240 | :return: 241 | """ 242 | # Saving config 243 | save_config(self.config, self.checkpoints_path) 244 | 245 | # Model training 246 | self.train() 247 | 248 | def finalize(self): 249 | """ 250 | Finalizes all the operations of the 2 Main classes of the process, the operator and the data loader 251 | :return: 252 | """ 253 | self.save_checkpoint() 254 | self.data_loader.finalize() 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /configs/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | path = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 7 | mod = __import__('.'.join([__name__, py]), fromlist=[py]) 8 | classes = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)] 9 | for cls in classes: 10 | setattr(sys.modules[__name__], cls.__name__, cls) -------------------------------------------------------------------------------- /configs/config_rnn_ae.json: -------------------------------------------------------------------------------- 1 | {"exp_name": "rnn_ae_ECG5000_exp_0", "agent": "RecurrentAEAgent", "rnn_type": "GRU", "rnn_act": "None", "n_layers": 1, "latent_dim": 8, "n_features": 1, "learning_rate": 0.001, "batch_size": 128, "batch_size_val": 256, "max_epoch": 2000, "loss": "MAE", "lambda_auc": 0.1, "sampler_random_state": 88, "data_folder": "./data/ECG5000/numpy/", "X_train": "X_train.npy", "y_train": "y_train.npy", "X_train_p": "X_train_p.npy", "y_train_p": "y_train_p.npy", "X_val": "X_val.npy", "y_val": "y_val.npy", "X_test": "X_test.npy", "y_test": "y_test.npy", "X_val_p": "X_val_p.npy", "y_val_p": "y_val_p.npy", "training_type": "one_class", "validation_type": "one_class", "checkpoint_file": "checkpoint.pth.tar", "checkpoint_dir": "./experiments/checkpoints/", "load_checkpoint": false, "cuda": false, "device": "cpu", "gpu_device": 0, "seed": 58} -------------------------------------------------------------------------------- /data/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PyLink88/Recurrent-Autoencoder/e04c84775f2396bee44843c990ea6b91908f0f3a/data/.keep -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | path = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 7 | mod = __import__('.'.join([__name__, py]), fromlist=[py]) 8 | classes = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)] 9 | for cls in classes: 10 | setattr(sys.modules[__name__], cls.__name__, cls) -------------------------------------------------------------------------------- /datasets/ecg5000.py: -------------------------------------------------------------------------------- 1 | """ 2 | ECG5000 Dataloader implementation, used in RNN_Autoencoder 3 | """ 4 | 5 | import numpy as np 6 | from utils.samplers import StratifiedSampler 7 | 8 | import torch 9 | from torch.utils.data import DataLoader, TensorDataset, Dataset 10 | 11 | 12 | 13 | 
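# The loader below exposes two regimes, driven by config.training_type:
#   - 'one_class': train on normal heartbeats only (X_train / y_train);
#   - otherwise:   train on normal + a few anomalous heartbeats (X_train_p /
#     y_train_p), batched with StratifiedSampler so that every batch contains
#     anomalies (as required by the AUC-penalized losses).
# Usage sketch (hypothetical values, assuming utils/data_preparation.py has run):
#   config = process_config('configs/config_rnn_ae.json')
#   loader = ECG500DataLoader(config)
#   x, y = next(iter(loader.train_loader))   # x: (batch, 140, 1), y: (batch,)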
class ECG500DataLoader: 14 | def __init__(self, config): 15 | self.config = config 16 | 17 | # Loading training data 18 | if self.config.training_type == 'one_class': 19 | # If loss without AUC penalty is used 20 | X_train = np.load(self.config.data_folder + self.config.X_train).astype(np.float32) 21 | y_train = np.load(self.config.data_folder + self.config.y_train).astype(np.float32) 22 | else: 23 | # If loss with AUC penalty is used 24 | X_train = np.load(self.config.data_folder + self.config.X_train_p).astype(np.float32) 25 | y_train = np.load(self.config.data_folder + self.config.y_train_p).astype(np.float32) 26 | 27 | # Loading validation data to control model training 28 | X_val = np.load(self.config.data_folder + self.config.X_val).astype(np.float32) 29 | y_val = np.load(self.config.data_folder + self.config.y_val).astype(np.float32) 30 | 31 | # From numpy to torch 32 | if X_train.ndim < 3: 33 | X_train = torch.from_numpy(X_train).unsqueeze(2) 34 | X_val = torch.from_numpy(X_val).unsqueeze(2) 35 | else: 36 | X_train = torch.from_numpy(X_train) 37 | X_val = torch.from_numpy(X_val) 38 | 39 | y_train = torch.from_numpy(y_train) 40 | y_val = torch.from_numpy(y_val) 41 | 42 | # Tensordataset 43 | training = TensorDataset(X_train, y_train) 44 | validation = TensorDataset(X_val, y_val) 45 | 46 | # Dataloader 47 | if self.config.training_type == 'one_class': 48 | 49 | self.train_loader = DataLoader(training, batch_size = self.config.batch_size, shuffle = True) 50 | else: 51 | 52 | sampler = StratifiedSampler(y_train, 53 | batch_size =self.config.batch_size, 54 | random_state =self.config.sampler_random_state) 55 | self.train_loader = DataLoader(training, batch_sampler = sampler) 56 | 57 | self.valid_loader = DataLoader(validation, batch_size = self.config.batch_size_val, shuffle = False) 58 | 59 | # Number of batches 60 | self.train_iterations = len(self.train_loader) 61 | self.valid_iterations = len(self.valid_loader) 62 | 63 | def finalize(self): 64 | pass 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /experiments/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PyLink88/Recurrent-Autoencoder/e04c84775f2396bee44843c990ea6b91908f0f3a/experiments/.keep -------------------------------------------------------------------------------- /experiments/checkpoints/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PyLink88/Recurrent-Autoencoder/e04c84775f2396bee44843c990ea6b91908f0f3a/experiments/checkpoints/.keep -------------------------------------------------------------------------------- /graphs/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | path = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 7 | mod = __import__('.'.join([__name__, py]), fromlist=[py]) 8 | classes = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)] 9 | for cls in classes: 10 | setattr(sys.modules[__name__], cls.__name__, cls) -------------------------------------------------------------------------------- /graphs/losses/MAEAUCLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | def MAEAUC_approx(x, x_hat, y, lambda_auc): 5 | 6 | # Computing 
error for each row 7 | err = torch.abs(x - x_hat).mean(axis = (1, 2)) 8 | 9 | # Selecting errors of normal (y == 1) and anomalous (y > 1) examples 10 | err_n = err[y == 1] 11 | err_a = err[y > 1] 12 | n_a = (err_a.shape)[0] 13 | n_n = (err_n.shape)[0] 14 | 15 | # If the batch contains anomalous examples, compute the approximate AUC penalty 16 | if n_a > 0: 17 | diff = err_a.view(-1, 1).unsqueeze(1) - err_n.view(-1, 1)  # pairwise differences err_a - err_n 18 | exp = torch.sigmoid(diff).sum()  # smooth surrogate of the number of correctly ranked pairs 19 | auc = lambda_auc * exp / (n_a * n_n)  # scaled differentiable AUC estimate 20 | mean_loss = err.mean() 21 | penalized_loss = mean_loss - auc  # subtracting rewards a high AUC while minimizing the error 22 | return penalized_loss, mean_loss 23 | else: 24 | mean_loss = err.mean()  # no anomalies in the batch: plain reconstruction loss 25 | return mean_loss 26 | 27 | class MAEAUCLoss(nn.Module): 28 | def __init__(self): 29 | super().__init__() 30 | self.loss = MAEAUC_approx 31 | 32 | def forward(self, x, x_hat, y, lambda_auc): 33 | loss = self.loss(x, x_hat, y, lambda_auc) 34 | return loss 35 | 36 | if __name__ == '__main__': 37 | 38 | # lambda_auc in {0,0.1,1,10,100,1000,10000} with MAE error 39 | x = torch.rand([10,2,3]) 40 | x_hat = torch.rand([10,2,3]) +.2 41 | y = torch.tensor([1,1,1,1,1,1,2,2,2,2])  # labels: 1 = normal, > 1 = anomalous 42 | 43 | loss = MAEAUCLoss() 44 | print(loss(x, x_hat, y, 10)) 45 | print(MAEAUC_approx(x, x_hat, y, 10)) 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /graphs/losses/MAELoss.py: -------------------------------------------------------------------------------- 1 | """ 2 | MAE loss 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | class MAELoss(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | self.loss = nn.L1Loss(reduction = 'mean') 12 | 13 | def forward(self, y_hat, y_true): 14 | loss = self.loss(y_hat, y_true) 15 | return loss 16 | -------------------------------------------------------------------------------- /graphs/losses/MSEAUCLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | def MSEAUC_approx(x, x_hat, y, lambda_auc): 5 | 6 | # Computing error for each row 7 | err = torch.pow(x - x_hat, 2).mean(axis = (1, 2)) 8 | 9 | # Selecting errors of normal (y == 1) and anomalous (y > 1) examples 10 | err_n = err[y == 1] 11 | err_a = err[y > 1] 12 | n_a = (err_a.shape)[0] 13 | n_n = (err_n.shape)[0] 14 | 15 | # If the batch contains anomalous examples, compute the approximate AUC penalty 16 | if n_a > 0: 17 | diff = err_a.view(-1, 1).unsqueeze(1) - err_n.view(-1, 1)  # pairwise differences err_a - err_n 18 | exp = torch.sigmoid(diff).sum()  # smooth surrogate of the number of correctly ranked pairs 19 | auc = lambda_auc * exp / (n_a * n_n)  # scaled differentiable AUC estimate 20 | mean_loss = err.mean() 21 | penalized_loss = mean_loss - auc  # subtract, consistently with MAEAUCLoss, so that a higher AUC lowers the loss 22 | return penalized_loss, mean_loss 23 | else: 24 | mean_loss = err.mean()  # no anomalies in the batch: plain reconstruction loss 25 | return mean_loss 26 | 27 | class MSEAUCLoss(nn.Module): 28 | def __init__(self): 29 | super().__init__() 30 | self.loss = MSEAUC_approx 31 | 32 | def forward(self, x, x_hat, y, lambda_auc): 33 | loss = self.loss(x, x_hat, y, lambda_auc) 34 | return loss 35 | 36 | if __name__ == '__main__': 37 | 38 | # lambda_auc in {0,0.1,1,10,100,1000,10000} with MSE error 39 | x = torch.rand([10,2,3]) 40 | x_hat = torch.rand([10,2,3]) +.2 41 | y = torch.tensor([1,1,1,1,1,1,2,2,2,2])  # labels: 1 = normal, > 1 = anomalous 42 | 43 | loss = MSEAUCLoss() 44 | print(loss(x, x_hat, y, 10)) 45 | print(MSEAUC_approx(x, x_hat, y, 10)) 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /graphs/losses/MSELoss.py: -------------------------------------------------------------------------------- 1 | """ 2 | MSE loss 3 | 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | class MSELoss(nn.Module): 9
| def __init__(self): 10 | super().__init__() 11 | self.loss = nn.MSELoss(reduction = 'mean') 12 | 13 | def forward(self, y_hat, y_true): 14 | loss = self.loss(y_hat, y_true) 15 | return loss -------------------------------------------------------------------------------- /graphs/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | path = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 7 | mod = __import__('.'.join([__name__, py]), fromlist=[py]) 8 | classes = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)] 9 | for cls in classes: 10 | setattr(sys.modules[__name__], cls.__name__, cls) -------------------------------------------------------------------------------- /graphs/models/recurrent_autoencoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Recurrent Autoencoder PyTorch implementation 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | from easydict import EasyDict as edict 8 | from functools import partial 9 | 10 | class RecurrentEncoder(nn.Module): 11 | """Recurrent encoder""" 12 | 13 | def __init__(self, n_features, latent_dim, rnn): 14 | super().__init__() 15 | 16 | self.rec_enc1 = rnn(n_features, latent_dim, batch_first=True) 17 | 18 | def forward(self, x): 19 | _, h_n = self.rec_enc1(x) 20 | 21 | return h_n 22 | 23 | class RecurrentDecoder(nn.Module): 24 | """Recurrent decoder for RNN and GRU""" 25 | 26 | def __init__(self, latent_dim, n_features, rnn_cell, device): 27 | super().__init__() 28 | 29 | self.n_features = n_features 30 | self.device = device 31 | self.rec_dec1 = rnn_cell(n_features, latent_dim) 32 | self.dense_dec1 = nn.Linear(latent_dim, n_features) 33 | 34 | def forward(self, h_0, seq_len): 35 | # Initialize output 36 | x = torch.tensor([], device = self.device) 37 | 38 | # Squeezing the layer dimension: (1, batch, latent_dim) -> (batch, latent_dim) 39 | h_i = h_0.squeeze(0) 40 | 41 | # First output, computed from the encoder state; it seeds the unrolling loop below 42 | x_i = self.dense_dec1(h_i) 43 | 44 | # Unroll the decoder for seq_len steps, emitting one reconstruction per step 45 | for i in range(0, seq_len): 46 | h_i = self.rec_dec1(x_i, h_i) 47 | x_i = self.dense_dec1(h_i) 48 | x = torch.cat([x, x_i], axis=1) 49 | 50 | return x.view(-1, seq_len, self.n_features) 51 | 52 | 53 | class RecurrentDecoderLSTM(nn.Module): 54 | """Recurrent decoder LSTM""" 55 | 56 | def __init__(self, latent_dim, n_features, rnn_cell, device): 57 | super().__init__() 58 | 59 | self.n_features = n_features 60 | self.device = device 61 | self.rec_dec1 = rnn_cell(n_features, latent_dim) 62 | self.dense_dec1 = nn.Linear(latent_dim, n_features) 63 | 64 | def forward(self, h_0, seq_len): 65 | # Initialize output 66 | x = torch.tensor([], device = self.device) 67 | 68 | # Squeezing the layer dimension of the (h, c) pair 69 | h_i = tuple(h.squeeze(0) for h in h_0) 70 | 71 | # First output, computed from the encoder hidden state; it seeds the unrolling loop below 72 | x_i = self.dense_dec1(h_i[0]) 73 | 74 | # Unroll the decoder for seq_len steps, emitting one reconstruction per step 75 | for i in range(0, seq_len): 76 | h_i = self.rec_dec1(x_i, h_i) 77 | x_i = self.dense_dec1(h_i[0]) 78 | x = torch.cat([x, x_i], axis = 1) 79 | 80 | return x.view(-1, seq_len, self.n_features) 81 | 82 | 83 | class RecurrentAE(nn.Module): 84 | """Recurrent autoencoder""" 85 | 86 | def __init__(self, config): 87 | super().__init__() 88 | 89 | # Encoder and decoder configuration 90 | self.config = config 91 | self.rnn, self.rnn_cell = self.get_rnn_type(self.config.rnn_type, self.config.rnn_act) 92 | self.decoder =
self.get_decoder(self.config.rnn_type) 93 | self.latent_dim = self.config.latent_dim 94 | self.n_features = self.config.n_features 95 | self.device = self.config.device 96 | 97 | # Encoder and decoder 98 | self.encoder = RecurrentEncoder(self.n_features, self.latent_dim, self.rnn) 99 | self.decoder = self.decoder(self.latent_dim, self.n_features, self.rnn_cell, self.device) 100 | 101 | def forward(self, x): 102 | seq_len = x.shape[1] 103 | h_n = self.encoder(x) 104 | out = self.decoder(h_n, seq_len) 105 | 106 | return torch.flip(out, [1]) 107 | 108 | @staticmethod 109 | def get_rnn_type(rnn_type, rnn_act=None): 110 | """Get recurrent layer and cell type""" 111 | if rnn_type == 'RNN': 112 | rnn = partial(nn.RNN, nonlinearity=rnn_act) 113 | rnn_cell = partial(nn.RNNCell, nonlinearity=rnn_act) 114 | 115 | else: 116 | rnn = getattr(nn, rnn_type) 117 | rnn_cell = getattr(nn, rnn_type + 'Cell') 118 | 119 | return rnn, rnn_cell 120 | 121 | @staticmethod 122 | def get_decoder(rnn_type): 123 | """Get recurrent decoder type""" 124 | if rnn_type == 'LSTM': 125 | decoder = RecurrentDecoderLSTM 126 | else: 127 | decoder = RecurrentDecoder 128 | return decoder 129 | 130 | if __name__ == '__main__': 131 | 132 | # Configuration 133 | config = {} 134 | config['n_features'] = 1 135 | config['latent_dim'] = 4 136 | config['rnn_type'] = 'GRU' 137 | config['rnn_act'] = 'relu'  # only used when rnn_type == 'RNN' 138 | config['device'] = 'cpu' 139 | config = edict(config) 140 | 141 | # Toy input data: three identical ramp sequences of length 10 142 | X = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 143 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 144 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], dtype=torch.float32).unsqueeze(2) 145 | 146 | # Model 147 | model = RecurrentAE(config) 148 | 149 | # Encoder 150 | h = model.encoder(X) 151 | out = model.decoder(h, seq_len = 10) 152 | out = torch.flip(out, [1]) 153 | 154 | # Loss 155 | loss = nn.L1Loss(reduction = 'mean') 156 | l = loss(X, out) 157 | 158 | 159 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """ 2 | __author__ = "Francesco Cannarile" 3 | Main 4 | -Capture the config file 5 | -Process the json config passed 6 | -Create an agent instance 7 | -Run the agent 8 | -Finalize 9 | """ 10 | 11 | import argparse 12 | from utils.config import * 13 | from agents import * 14 | 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | 19 | def main(): 20 | arg_parser = argparse.ArgumentParser(description = 'Configuration path') 21 | arg_parser.add_argument('config', help = 'The Configuration file in json format') 22 | args = arg_parser.parse_args() 23 | config = process_config(args.config) 24 | print(config.agent) 25 | agent_class = globals()[config.agent] 26 | agent = agent_class(config) 27 | agent.run() 28 | agent.finalize() 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /notebooks/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PyLink88/Recurrent-Autoencoder/e04c84775f2396bee44843c990ea6b91908f0f3a/notebooks/.keep -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | easydict==1.9 2 | matplotlib==3.3.4 3 | numpy==1.19.5 4 | pandas==1.1.5 5 | requests==2.25.1 6 | scikit-learn==0.24.1 7 | seaborn==0.11.1 8 | torch==1.8.1 9 |
tqdm==4.60.0 10 | zipfile36==0.1.3 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | path = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 7 | mod = __import__('.'.join([__name__, py]), fromlist=[py]) 8 | classes = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)] 9 | for cls in classes: 10 | setattr(sys.modules[__name__], cls.__name__, cls) -------------------------------------------------------------------------------- /utils/assets/dec_eq.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PyLink88/Recurrent-Autoencoder/e04c84775f2396bee44843c990ea6b91908f0f3a/utils/assets/dec_eq.PNG -------------------------------------------------------------------------------- /utils/assets/decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PyLink88/Recurrent-Autoencoder/e04c84775f2396bee44843c990ea6b91908f0f3a/utils/assets/decoder.png -------------------------------------------------------------------------------- /utils/assets/enc_eq.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PyLink88/Recurrent-Autoencoder/e04c84775f2396bee44843c990ea6b91908f0f3a/utils/assets/enc_eq.PNG -------------------------------------------------------------------------------- /utils/assets/encoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PyLink88/Recurrent-Autoencoder/e04c84775f2396bee44843c990ea6b91908f0f3a/utils/assets/encoder.png -------------------------------------------------------------------------------- /utils/checkpoints.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def checkpoints_folder(config): 4 | """Create the checkpoints folder""" 5 | 6 | # Path of the folder where checkpoints are saved 7 | path = config.checkpoint_dir + config.exp_name 8 | 9 | if not os.path.exists(path): 10 | os.mkdir(path) 11 | print('Created checkpoints folder: ' + path) 12 | else: 13 | print('Checkpoints folder ' + path + ' already exists') 14 | return path + "/" 15 | 16 | -------------------------------------------------------------------------------- /utils/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from easydict import EasyDict 3 | 4 | def get_config_from_json(json_file): 5 | """ 6 | Get the config from a json file 7 | """ 8 | # parse the configurations from the config json file provided 9 | with open(json_file, 'r') as config_file: 10 | try: 11 | config_dict = json.load(config_file) 12 | # EasyDict allows accessing dict values as attributes (works recursively). 13 | config = EasyDict(config_dict) 14 | return config, config_dict 15 | except ValueError: 16 | print("Invalid JSON file format:
please provide a valid JSON file") 17 | exit(-1) 18 | 19 | def process_config(json_file): 20 | """ 21 | Get the json file and
22 | process it with EasyDict 23 | so that its values are accessible as attributes, 24 | then make sure that an experiment name (exp_name) 25 | has been provided 26 | """ 27 | config, _ = get_config_from_json(json_file) 28 | 29 | # making sure that you have provided the exp_name. 30 | try: 31 | print(" *************************************** ") 32 | print("Experiment name: {}".format(config.exp_name)) 33 | print(" *************************************** ") 34 | except AttributeError: 35 | print("ERROR! Please provide the exp_name in the json file.") 36 | exit(-1) 37 | return config 38 | 39 | def save_config(dict_file, save_path): 40 | """Save the configuration""" 41 | 42 | myJSON = json.dumps(dict_file) 43 | with open(save_path + "exp_config.json", "w") as jsonfile: 44 | jsonfile.write(myJSON) 45 | print('Saved configuration in ' + save_path) 46 | 47 | 48 | -------------------------------------------------------------------------------- /utils/create_config.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | -Create config file 4 | """ 5 | 6 | import json 7 | 8 | config_rnn_ae = { 9 | 10 | # Experiment information 11 | "exp_name": "rnn_ae_ECG5000_exp_0", 12 | "agent": "RecurrentAEAgent", 13 | 14 | # Architecture hyperparameters 15 | "rnn_type": "GRU", 16 | "rnn_act": "None", 17 | "n_layers": 1, 18 | "latent_dim": 8, 19 | "n_features": 1, 20 | 21 | # Optimization hyperparameters 22 | "learning_rate": 0.001, 23 | "batch_size": 128, 24 | "batch_size_val": 256, 25 | "max_epoch": 2000, 26 | 27 | # Loss function 28 | 'loss': 'MAE', 29 | 30 | # AUC hyperparameters 31 | 'lambda_auc': 0.1, 32 | 'sampler_random_state': 88, 33 | 34 | # Folder from which the data are retrieved, and the corresponding file names 35 | "data_folder": "./data/ECG5000/numpy/", 36 | "X_train": "X_train.npy", 37 | "y_train": "y_train.npy", 38 | "X_train_p": "X_train_p.npy", 39 | "y_train_p": "y_train_p.npy", 40 | "X_val": "X_val.npy", 41 | "y_val": "y_val.npy", 42 | "X_test": "X_test.npy", 43 | "y_test": "y_test.npy", 44 | "X_val_p": "X_val_p.npy", 45 | "y_val_p": "y_val_p.npy", 46 | 47 | # Training type: for now set equal to "one_class" 48 | "training_type": "one_class", 49 | "validation_type": "one_class", 50 | 51 | # Checkpoints 52 | "checkpoint_file": "checkpoint.pth.tar", 53 | "checkpoint_dir": "./experiments/checkpoints/", 54 | "load_checkpoint": False, 55 | 56 | # GPU settings 57 | "cuda": False, 58 | "device": "cpu", 59 | "gpu_device": 0, 60 | "seed": 58 61 | } 62 | 63 | if __name__ == '__main__': 64 | myJSON = json.dumps(config_rnn_ae) 65 | with open("./configs/config_rnn_ae.json", "w") as jsonfile: 66 | jsonfile.write(myJSON) 67 | 68 | print("Config successfully written") 69 | 70 | 71 | -------------------------------------------------------------------------------- /utils/data_preparation.py: -------------------------------------------------------------------------------- 1 | import os 2 | from zipfile import ZipFile 3 | import numpy as np 4 | import pandas as pd 5 | import requests 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.model_selection import StratifiedShuffleSplit 8 | import argparse 9 | 10 | # For reproducibility 11 | np.random.seed(88) 12 | 13 | def download_url(url, save_path, chunk_size = 128): 14 | """ Download data util function""" 15 | r = requests.get(url,
stream=True) 16 | with open(save_path, 'wb') as fd: 17 | for chunk in r.iter_content(chunk_size = chunk_size): 18 | fd.write(chunk) 19 | 20 | def data_preparation(): 21 | """Download, unzip and partition the ECG5000 dataset""" 22 | 23 | # Parsing input arguments 24 | desc_str = "Data downloading and partitioning" 25 | parser = argparse.ArgumentParser(description = desc_str) 26 | 27 | # Arguments 28 | down_str = "Download data: 1 to download, 0 otherwise" 29 | tr_str = "Percentage of normal instances to be placed in the training set." 30 | val_str_n = ("Percentage of normal instances to be placed in the validation set: " 31 | "half of these are used to control model training, " 32 | "the remaining ones for model selection.") 33 | val_str_a = ("Percentage of anomalous instances " 34 | "(w.r.t. the normal instances in the training set) " 35 | "to be used for model selection " 36 | "(e.g., if the training set contains 95 normal instances " 37 | "and you set this parameter to 0.05, " 38 | "then 5 anomalous instances will be selected). " 39 | "The remaining anomalous instances are placed in the test set.") 40 | parser.add_argument("download", type = int, help = down_str) 41 | parser.add_argument("perc_tr_n", type = float, help = tr_str) 42 | parser.add_argument("perc_val_n", type = float, help = val_str_n) 43 | parser.add_argument("perc_val_an", type = float, help = val_str_a) 44 | args = parser.parse_args() 45 | 46 | # Creating folder 47 | if args.download: 48 | data_path = "data/ECG5000" 49 | if not os.path.exists(data_path): 50 | os.mkdir(data_path) 51 | print('Created ECG5000 folder') 52 | 53 | # Data downloading 54 | url = 'http://www.timeseriesclassification.com/Downloads/ECG5000.zip' 55 | save_path = 'data/ECG5000.zip' 56 | print('### Starting downloading ECG5000 data ###') 57 | download_url(url, save_path) 58 | print('### Download done!
###') 59 | 60 | # Unzipping 61 | file_name = "data/ECG5000.zip" 62 | save_path = "data/ECG5000" 63 | with ZipFile(file_name, 'r') as zip: 64 | print('Extracting all the files now...') 65 | zip.extractall(save_path) 66 | print('Extraction done!') 67 | 68 | # Removing useless files 69 | os.remove('data/ECG5000.zip') 70 | os.remove('data/ECG5000/ECG5000_TRAIN.arff') 71 | os.remove('data/ECG5000/ECG5000_TRAIN.ts') 72 | os.remove('data/ECG5000/ECG5000_TEST.ts') 73 | os.remove('data/ECG5000/ECG5000_TEST.arff') 74 | os.remove('data/ECG5000/ECG5000.txt') 75 | 76 | # Creating the folder where numpy data are saved 77 | data_path = "data/ECG5000/numpy" 78 | if not os.path.exists(data_path): 79 | os.mkdir(data_path) 80 | print('Created ECG5000/numpy folder') 81 | 82 | # Loading data 83 | train = pd.read_table('data/ECG5000/ECG5000_TRAIN.txt', sep=r'\s{2,}', engine='python', header=None) 84 | test = pd.read_table('data/ECG5000/ECG5000_TEST.txt', sep=r'\s{2,}', engine='python', header=None) 85 | 86 | # Concatenating 87 | df = pd.concat([train, test]) 88 | new_columns = list(df.columns) 89 | new_columns[0] = 'Class' 90 | df.columns = new_columns 91 | 92 | # Dividing into normal and anomalous data 93 | normal = df.loc[df.Class == 1] 94 | anomaly = df.loc[df.Class != 1] 95 | 96 | # Splitting normal data into training, validation and test sets 97 | X_train_n, X_val_n = train_test_split(normal, random_state = 88, test_size = 1 - args.perc_tr_n) 98 | X_val_n, X_test_n = train_test_split(X_val_n, random_state = 88, test_size = 1 - args.perc_val_n) 99 | 100 | # Splitting validation data into two folds: the former to control model training, the latter 101 | # for model selection 102 | 103 | X_val_nA, X_val_nB = train_test_split(X_val_n, random_state=88, test_size = 0.5) 104 | 105 | # Splitting anomalous data into validation and test sets 106 | perc_anol_all = args.perc_val_an 107 | n_anol = len(X_train_n) * perc_anol_all / (1 - perc_anol_all) 108 | perc_anol_val_a = n_anol / len(anomaly) 109 | perc_anol_test_a = 1 - perc_anol_val_a 110 | X_val_a, X_test_a = train_test_split(anomaly, 111 | random_state = 88, 112 | test_size = perc_anol_test_a, 113 | stratify = anomaly.Class) 114 | 115 | # Splitting anomalous validation data into two folds: the former is added to the training set 116 | # (for the AUC-penalized losses), the latter is used for model selection 117 | X_val_aA, X_val_aB = train_test_split(X_val_a, random_state = 88, test_size = 0.5) 118 | 119 | 120 | # Training data: ONLY NORMAL instances 121 | X_train = X_train_n.iloc[:, 1:].values 122 | y_train = X_train_n.iloc[:, 0].values 123 | 124 | # Training data: NORMAL + ANOMALOUS instances 125 | X_train_p = pd.concat([X_train_n.iloc[:, 1:], X_val_aA.iloc[:, 1:]]).values 126 | y_train_p = pd.concat([X_train_n.iloc[:, 0], X_val_aA.iloc[:, 0]]).values 127 | 128 | # Validation data (normal only) to control model training 129 | X_val = X_val_n.iloc[:, 1:].values 130 | y_val = X_val_n.iloc[:, 0].values 131 | 132 | # Validation data: both normal and anomalous instances for model selection (AUC loss) 133 | X_val_p = pd.concat([X_val_nB.iloc[:, 1:], X_val_aB.iloc[:, 1:]]).values 134 | y_val_p = pd.concat([X_val_nB.iloc[:, 0], X_val_aB.iloc[:, 0]]).values 135 | 136 | # Validation data: both normal and anomalous instances for model selection (no AUC loss) 137 | X_val_p_full = pd.concat([X_val_nB.iloc[:, 1:], X_val_a.iloc[:, 1:]]).values 138 | y_val_p_full = pd.concat([X_val_nB.iloc[:, 0], X_val_a.iloc[:, 0]]).values 139 | 140 | # Test data 141 | X_test =
pd.concat([X_test_n.iloc[:, 1:], X_test_a.iloc[:, 1:]]).values 142 | y_test = pd.concat([X_test_n.iloc[:, 0], X_test_a.iloc[:, 0]]).values 143 | 144 | # Saving training data (only normal instances) 145 | np.save('./data/ECG5000/numpy/X_train.npy', X_train) 146 | np.save('./data/ECG5000/numpy/y_train.npy', y_train) 147 | 148 | # Saving training data (normal + anomalous instances) 149 | np.save('./data/ECG5000/numpy/X_train_p.npy', X_train_p) 150 | np.save('./data/ECG5000/numpy/y_train_p.npy', y_train_p) 151 | 152 | # Saving validation data (only normal instances, to control model training) 153 | np.save('./data/ECG5000/numpy/X_val.npy', X_val) 154 | np.save('./data/ECG5000/numpy/y_val.npy', y_val) 155 | 156 | # Saving validation data (normal + anomalous instances for model selection, with the AUC penalty) 157 | np.save('./data/ECG5000/numpy/X_val_p.npy', X_val_p) 158 | np.save('./data/ECG5000/numpy/y_val_p.npy', y_val_p) 159 | 160 | # Saving validation data (normal + anomalous instances for model selection, without the AUC penalty) 161 | np.save('./data/ECG5000/numpy/X_val_p_full.npy', X_val_p_full) 162 | np.save('./data/ECG5000/numpy/y_val_p_full.npy', y_val_p_full) 163 | 164 | # Saving test data (normal + anomalous instances) 165 | np.save('./data/ECG5000/numpy/X_test.npy', X_test) 166 | np.save('./data/ECG5000/numpy/y_test.npy', y_test) 167 | 168 | print('Saved data in numpy format') 169 | 170 | if __name__ == '__main__': 171 | data_preparation() 172 | print('Data preparation done!') 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- 1 | class AverageMeter: 2 | """ 3 | Average meter for any running metric, e.g. loss or accuracy. 4 | """ 5 | 6 | def __init__(self): 7 | self.value = 0 8 | self.avg = 0 9 | self.sum = 0 10 | self.count = 0 11 | self.reset() 12 | 13 | def reset(self): 14 | self.value = 0 15 | self.avg = 0 16 | self.sum = 0 17 | self.count = 0 18 | 19 | def update(self, val, n=1): 20 | self.value = val 21 | self.sum += val * n 22 | self.count += n 23 | self.avg = self.sum / self.count 24 | 25 | @property 26 | def val(self): 27 | return self.avg  # alias for the running average -------------------------------------------------------------------------------- /utils/samplers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.model_selection import StratifiedKFold 3 | 4 | class Sampler(object): 5 | """Base class for all Samplers. 6 | Every Sampler subclass has to provide an __iter__ method, providing a way 7 | to iterate over indices of dataset elements, and a __len__ method that 8 | returns the length of the returned iterators.
9 | """ 10 | 11 | def __init__(self, data_source): 12 | pass 13 | 14 | 15 | def __iter__(self): 16 | raise NotImplementedError 17 | 18 | def __len__(self): 19 | raise NotImplementedError 20 | 21 | 22 | class StratifiedSampler(Sampler): 23 | """Stratified batch sampling""" 24 | 25 | def __init__(self, y, batch_size, random_state, shuffle=True): 26 | 27 | if torch.is_tensor(y): 28 | y = y.numpy() 29 | assert len(y.shape) == 1, 'label array must be 1D' 30 | 31 | self.X = torch.randn(len(y), 1).numpy()  # dummy features; StratifiedKFold only uses the labels 32 | self.y = y 33 | self.shuffle = shuffle 34 | self.rnd_state = random_state 35 | self.n_batches = int(len(y) / batch_size) 36 | 37 | 38 | def __iter__(self): 39 | skf = StratifiedKFold(n_splits = self.n_batches, 40 | shuffle = self.shuffle, 41 | random_state = self.rnd_state) 42 | for train_idx, test_idx in skf.split(self.X, self.y): 43 | yield test_idx  # each test fold becomes one (stratified) batch 44 | 45 | def __len__(self): 46 | return self.n_batches 47 | 48 | 49 | if __name__ == '__main__': 50 | 51 | from torch.utils.data import DataLoader, TensorDataset 52 | 53 | # Creating a toy dataset 54 | y1 = torch.zeros([50,1]) 55 | y2 = torch.ones([12, 1]) 56 | y = torch.cat([y1, y2], axis = 0).squeeze() 57 | x = torch.ones([len(y), 2]) 58 | x[10,:] = 3.14 59 | x[-21,:] = 392 60 | 61 | # Using StratifiedSampler 62 | dataset = TensorDataset(x, y) 63 | my_sampler = StratifiedSampler(y, 10, 10)  # batch_size=10, random_state=10 64 | loader = DataLoader(dataset, batch_sampler = my_sampler) 65 | 66 | # Checking results 67 | for x, y in loader: 68 | print(x) 69 | 70 | 71 | 72 | 73 | --------------------------------------------------------------------------------