├── DATA
│   └── column.pkl
├── README.md
├── extract.py
├── img
│   └── image.png
├── model
│   ├── dataset.py
│   ├── dynamic_vae.py
│   └── tasks.py
├── params.json
├── requirements.txt
├── train.py
└── utils.py
--------------------------------------------------------------------------------
/DATA/column.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thinkenergy/dynamic_vae/998255e717c54589de95c694a848ee7bb9946168/DATA/column.pkl
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Dynamic VAE framework
2 | 
3 | 1. Automatic feature extraction via the probability distribution of battery data
4 | 
5 | > Data-science methods have seen limited use for anomaly detection in time series, mainly because anomaly labels are usually few in number, low in quality, mislabeled, or missing. To address this, we process large amounts of time-series data that have not been manually filtered, using a coding and interpretation model grounded in information theory and large-scale networks (such as a Transformer) to parameterize the probability distribution of the data and the correlation function over each feature.
6 | > The learned parameters and network structure capture highly nonlinear features that are difficult to extract by hand, which helps existing models achieve better performance on anomaly detection and health prediction tasks.
7 | 
8 | 2. A simple model detects anomalies in the extracted features
9 | ## Purpose of the model
10 | 1. Feature extraction from battery data
11 | ![image](img/image.png)
12 | 2. An anomaly detection model is learned from the extracted features.
13 | 
14 | ## How to get data
15 | ```bash
16 | cd DATA
17 | wget http://82.156.209.173/s/6Saazbbxq92iez7/download
18 | unzip download
19 | cd dataset/dahu
20 | tar -xf test_mulmileage.tar
21 | ```
22 | Download and decompress the files from the data link above into the DATA/ directory, so that the data folder sits in the DATA subdirectory.
23 | ### How to run:
24 | ```bash
25 | python train.py
26 | ```
27 | > Switch to the root directory of the project and run `python train.py`. The network extracts features of the dataset into a directory named after the current time, storing the features in the feature folder, the trained network in the model folder, and the loss curves from training in the loss folder.
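28 | 
29 | ### Where results are stored
30 | Each run of `train.py` creates `PRETRAIN/<timestamp>/` (the `save_model_path` from `params.json`) containing `model/model.torch`, `model/norm.pkl` and `model/model_params.json`, and writes one `<iteration>_npy.npy` array of latent means plus one `<iteration>_label.file` metadata dict (including the reconstruction error) per batch into the feature folder. Running `python extract.py --current_model_path PRETRAIN/<timestamp>/model/` repeats the extraction with the saved model, provided the saved parameters also contain a `test_path` entry. Below is a minimal sketch of reading one saved batch back; the run folder name is a placeholder to replace with your own.
31 | ```python
32 | import numpy as np
33 | import torch
34 | 
35 | feature_dir = "PRETRAIN/<timestamp>/feature"         # placeholder: folder written by train.py
36 | means = np.load(feature_dir + "/0_npy.npy")           # latent means, one row per sequence in the batch
37 | labels = torch.load(feature_dir + "/0_label.file")    # metadata dict: car, mileage, rec_error, ...
38 | print(means.shape, list(labels.keys()))
39 | ```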
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2021/11/13 14:29
3 | # @Author : huangshaobo,liujiachang,zhangyang
4 | # @Email : sdk.eval@thinkenergy.net.cn
5 | # @File : extract.py
6 | import json
7 | import os
8 | import sys
9 | import time
10 | import torch
11 | from torch.utils.data import DataLoader
12 | from tqdm import tqdm
13 | from utils import collate
14 | from model import dataset
15 | from train import extract
16 | from utils import to_var, collate, Normalizer, PreprocessNormalizer
17 | from model import tasks
18 | import pickle
19 | 
20 | 
21 | class Extraction:
22 | 
23 |     def __init__(self, args):
24 |         self.args = args
25 | 
26 |     def main(self):
27 |         """
28 |         Used for feature extraction
29 |         test: normalized test data, built the same way as train in train.py
30 |         task: Task (e.g. EvTask or JeveTask) used to select the feature dimensions to extract
31 |         model: the trained model, identical to the model saved by train.py
32 |         """
33 |         model_params_path = os.path.join(self.args.current_model_path, "model_params.json")
34 |         with open(model_params_path, 'r') as load_f:
35 |             params_dict = json.load(load_f)
36 |         model_params = params_dict['args']
37 |         start_time = time.time()
38 |         data_pre = dataset.Dataset(model_params["test_path"])
39 |         self.normalizer = pickle.load(open(os.path.join(self.args.current_model_path, "norm.pkl"), 'rb'))
40 |         test = PreprocessNormalizer(data_pre, normalizer_fn=self.normalizer.norm_func)
41 | 
42 |         task = tasks.Task(task_name=model_params["task"], columns=model_params["columns"])
43 | 
44 |         # Open the saved model file
45 |         model_torch = os.path.join(model_params["current_model_path"], "model.torch")
46 |         model = to_var(torch.load(model_torch)).float()
47 |         model.encoder_filter = task.encoder_filter
48 |         model.decoder_filter = task.decoder_filter
49 |         model.noise_scale = model_params["noise_scale"]
50 |         data_loader = DataLoader(dataset=test, batch_size=model_params["batch_size"], shuffle=True,
51 |                                  num_workers=model_params["jobs"], drop_last=False,
52 |                                  pin_memory=torch.cuda.is_available(),
53 |                                  collate_fn=collate if model_params["variable_length"] else None)
54 | 
55 |         print("sliding windows dataset length is: ", len(test))
56 |         print("model", model)
57 | 
58 |         # Start extracting features using the trained model
59 |         model.eval()
60 |         p_bar = tqdm(total=len(data_loader), desc='saving', ncols=100, mininterval=1, maxinterval=10, miniters=1)
61 |         extract(data_loader, model, task, model_params["save_feature_path"], p_bar, model_params["noise_scale"],
62 |                 model_params["variable_length"])
63 |         p_bar.close()
64 |         print("Feature extraction of all test data saved at", model_params["save_feature_path"])
65 |         print("The total time consuming:", time.time() - start_time)
66 | 
67 | 
68 | if __name__ == '__main__':
69 |     import argparse
70 | 
71 |     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
72 |     parser = argparse.ArgumentParser(description='Extract Example')
73 |     parser.add_argument('--current_model_path', type=str,
74 |                         default='2021-12-04-15-19-38/model/')
75 |     args = parser.parse_args()
76 |     Extraction(args).main()
77 | 
--------------------------------------------------------------------------------
/img/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thinkenergy/dynamic_vae/998255e717c54589de95c694a848ee7bb9946168/img/image.png
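The `Dataset` class in `model/dataset.py` (next file) simply `torch.load`s every file found under the data directory, and train.py expects each loaded item to behave like a `(data, metadata)` pair: `item[0]` is an m x N array whose columns follow `DATA/column.pkl`, and `item[1]` is a dict with keys such as label, car, charge_segment, mileage and timestamp. A hypothetical sketch of writing one such sample; the file name and metadata values are made up for illustration:

```python
import numpy as np
import torch

columns = torch.load("DATA/column.pkl")             # feature column names expected by the tasks
window = np.random.rand(128, len(columns))           # one charge segment: time steps x feature columns
metadata = {"label": 0, "car": "car_001", "charge_segment": "0",
            "mileage": 12345.0, "timestamp": 1636790000}

# Hypothetical file name; any file placed under the train_path/test_path folder is loaded.
torch.save((window, metadata), "DATA/dataset/dahu/test_mulmileage/sample_0.pt")
```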
-------------------------------------------------------------------------------- /model/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2021/10/9 14:53 3 | # @Author : huangshaobo,liujiachang,zhangyang 4 | # @Email : sdk.eval@thinkenergy.net.cn 5 | # @File : dataset.py 6 | 7 | import os 8 | import torch 9 | 10 | class Dataset: 11 | def __init__(self, data_path): 12 | 13 | self.data_path = data_path 14 | self.battery_dataset = [] 15 | self.data_lst = os.listdir(data_path) 16 | for i in range(len(self.data_lst)): 17 | single_path=os.path.join(self.data_path, self.data_lst[i]) 18 | train1 =torch.load(single_path) 19 | self.battery_dataset.append(train1) 20 | def __len__(self): 21 | return len(self.battery_dataset) 22 | 23 | def __getitem__(self, idx): 24 | file = self.battery_dataset[idx] 25 | return file 26 | -------------------------------------------------------------------------------- /model/dynamic_vae.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2021/10/9 14:53 3 | # @Author : huangshaobo,liujiachang,zhangyang 4 | # @Email : sdk.eval@thinkenergy.net.cn 5 | # @File : dynamic_vae.py 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 10 | 11 | from utils import to_var 12 | 13 | 14 | class DynamicVAE(nn.Module): 15 | def __init__(self, rnn_type, hidden_size, latent_size, encoder_embedding_size, output_embedding_size, 16 | decoder_embedding_size, num_layers=1, bidirectional=False, variable_length=False, **params): 17 | super().__init__() 18 | self.latent_size = latent_size 19 | self.bidirectional = bidirectional 20 | self.num_layers = num_layers 21 | self.hidden_size = hidden_size 22 | self.variable_length = variable_length 23 | rnn = eval('nn.' 
+ rnn_type.upper()) 24 | 25 | self.encoder_rnn = rnn(encoder_embedding_size, hidden_size, num_layers=num_layers, 26 | bidirectional=self.bidirectional, batch_first=True) 27 | self.decoder_rnn = rnn(decoder_embedding_size, hidden_size, num_layers=num_layers, 28 | bidirectional=self.bidirectional, batch_first=True) 29 | 30 | self.hidden_factor = (2 if bidirectional else 1) * num_layers 31 | 32 | self.hidden2mean = nn.Linear(hidden_size * self.hidden_factor, latent_size) 33 | self.hidden2log_v = nn.Linear(hidden_size * self.hidden_factor, latent_size) 34 | self.latent2hidden = nn.Linear(latent_size, hidden_size * self.hidden_factor) 35 | self.outputs2embedding = nn.Linear(hidden_size * (2 if bidirectional else 1), output_embedding_size) 36 | self.mean2latent = nn.Sequential(nn.Linear(latent_size, int(hidden_size / 2)), nn.ReLU(), 37 | nn.Linear(int(hidden_size / 2), 1)) 38 | 39 | def forward(self, input_sequence, encoder_filter, decoder_filter, seq_lengths, noise_scale=1.0): 40 | batch_size = input_sequence.size(0) 41 | en_input_sequence = encoder_filter(input_sequence) 42 | en_input_embedding = en_input_sequence.to(torch.float32) 43 | if self.variable_length: 44 | en_input_embedding = pack_padded_sequence(en_input_embedding, seq_lengths, batch_first=True) 45 | output, hidden = self.encoder_rnn(en_input_embedding) 46 | if self.bidirectional or self.num_layers > 1: 47 | hidden = hidden.view(batch_size, self.hidden_size * self.hidden_factor) 48 | else: 49 | hidden = hidden.squeeze() 50 | 51 | mean = self.hidden2mean(hidden) 52 | log_v = self.hidden2log_v(hidden) 53 | std = torch.exp(0.5 * log_v) 54 | mean_pred = self.mean2latent(mean) 55 | 56 | z = to_var(torch.randn([batch_size, self.latent_size])) 57 | if self.training: 58 | z = z * std * noise_scale + mean 59 | else: 60 | z = mean 61 | hidden = self.latent2hidden(z) 62 | 63 | if self.bidirectional or self.num_layers > 1: 64 | hidden = hidden.view(self.hidden_factor, batch_size, self.hidden_size) 65 | else: 66 | hidden = hidden.unsqueeze(0) 67 | 68 | de_input_sequence = decoder_filter(input_sequence) 69 | de_input_embedding = de_input_sequence.to(torch.float32) 70 | if self.variable_length: 71 | de_input_embedding = pack_padded_sequence(de_input_embedding, seq_lengths, batch_first=True) 72 | 73 | outputs, _ = self.decoder_rnn(de_input_embedding, hidden) 74 | outputs, _ = pad_packed_sequence(outputs, batch_first=True) 75 | else: 76 | outputs, _ = self.decoder_rnn(de_input_embedding, hidden) 77 | log_p = self.outputs2embedding(outputs) 78 | return log_p, mean, log_v, z, mean_pred 79 | -------------------------------------------------------------------------------- /model/tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2021/10/11 16:48 3 | # @Author : huangshaobo,liujiachang,zhangyang 4 | # @Email : sdk.eval@thinkenergy.net.cn 5 | # @File : tasks.py 6 | import numpy as np 7 | import torch 8 | 9 | from utils import to_var 10 | 11 | 12 | class Label: 13 | def __init__(self, column_name, training_set, sample_length=50): 14 | self.label = column_name 15 | self.sample_length = sample_length 16 | self.sample_mileage = [training_set[i][1][self.label] for i in range(self.sample_length)] 17 | self.max_mileage = max(self.sample_mileage) 18 | self.min_mileage = min(self.sample_mileage) 19 | 20 | def loss(self, batch, mean_pred, is_mse=True): 21 | label_data = [] 22 | for i in batch[1][self.label]: 23 | norm_label = (i - self.min_mileage) / (self.max_mileage - 
self.min_mileage) 24 | label_data.append(norm_label) 25 | label = torch.tensor(label_data) 26 | x = mean_pred.squeeze().to("cuda") 27 | y = label.float().to("cuda") 28 | mse = torch.nn.MSELoss(reduction='mean') 29 | loss = 0 30 | if is_mse: 31 | loss = mse(x, y) 32 | return loss 33 | 34 | 35 | class Task: 36 | def __init__(self, columns, encoder_dimension=122, decoder_dimension=122, 37 | output_dimension=122, task_name='ev'): 38 | self.encoder_dimension = encoder_dimension 39 | self.decoder_dimension = decoder_dimension 40 | self.output_dimension = output_dimension 41 | self.task_name = task_name 42 | self.columns = columns 43 | self.encoder = [] 44 | self.decoder = [] 45 | self.target = [] 46 | eval(self.task_name.capitalize() + 'Task.set_params')(self) 47 | eval(self.task_name.capitalize() + 'Task.get_task_idx')(self, columns) 48 | 49 | def encoder_filter(self, input_embedding): 50 | # Extracting input dimensions 51 | return eval(self.task_name.capitalize() + 'Task.task_encoder')(self, input_embedding, self.columns) 52 | 53 | def decoder_filter(self, input_embedding): 54 | # Escape device 55 | self.decoder = self.encoder[:self.decoder_dimension] 56 | return to_tensor(to_array(input_embedding)[:, :, self.decoder]) 57 | 58 | def target_filter(self, input_embedding): 59 | # Output target dimension 60 | self.target = self.encoder[self.decoder_dimension:] 61 | return to_tensor(to_array(input_embedding)[:, :, self.target]) 62 | 63 | def task_encoder(self, input_embedding, columns): 64 | return to_tensor(to_array(input_embedding)[:, :, self.encoder]) 65 | 66 | 67 | class EvTask(Task): 68 | def set_params(self): 69 | """ 70 | The number of dimensions 71 | """ 72 | self.encoder_dimension = 6 73 | self.decoder_dimension = 2 74 | self.output_dimension = 4 75 | 76 | def get_task_idx(self, columns): 77 | """ 78 | Extract the specified column 79 | Args: 80 | columns 81 | Return: 82 | Specifies the corresponding subscript for the column 83 | """ 84 | self.encoder = np.array( 85 | [columns.index("soc"), columns.index("current"), 86 | columns.index("max_temp"), columns.index("max_single_volt"), 87 | columns.index("min_single_volt"), columns.index("volt")]).astype(int) 88 | return self.encoder 89 | 90 | def to_tensor(input_embedding): 91 | return to_var(torch.from_numpy(np.array(input_embedding))) 92 | 93 | 94 | def to_array(input_embedding): 95 | return input_embedding.cpu().numpy() 96 | -------------------------------------------------------------------------------- /params.json: -------------------------------------------------------------------------------- 1 | { 2 | "task": "ev", 3 | "epochs":3, 4 | "jobs": 32, 5 | "batch_size": 128, 6 | "learning_rate": 0.001, 7 | "cosine_factor": 1.0, 8 | "rnn_type": "gru", 9 | "model_type": "rnn", 10 | "kernel_size": 3, 11 | "nhead": 2, 12 | "dim_feedforward": 2048, 13 | "hidden_size": 32, 14 | "num_layers": 3, 15 | "bidirectional": true, 16 | "smoothing": false, 17 | "latent_size": 32, 18 | "noise_scale": 0.01, 19 | "anneal_function": "linear", 20 | "x0": 500, 21 | "anneal0": 0.5, 22 | "variable_length": false, 23 | "min_length": 30, 24 | "nll_weight": 0.05, 25 | "latent_label_weight": 0.01, 26 | "save_model_path": "PRETRAIN", 27 | "granularity_all": 1000, 28 | "num_granularity_all": 100, 29 | "granularity_car": 1000, 30 | "num_granularity_car": 200, 31 | "use_flag": "rec_error", 32 | "train_path": "DATA/dataset/dahu/test_mulmileage" 33 | } 34 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy==1.19.5 2 | pandas==1.1.5 3 | matplotlib==3.3.4 4 | torch==1.9.1+cu111 -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2021/10/11 9:59 3 | # @Author : huangshaobo,liujiachang,zhangyang 4 | # @Email : sdk.eval@thinkenergy.net.cn 5 | # @File : train.py 6 | import json 7 | import os 8 | import pickle 9 | import sys 10 | import time 11 | from collections import OrderedDict 12 | 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | import pandas as pd 16 | import torch 17 | from torch.optim.lr_scheduler import CosineAnnealingLR 18 | from torch.utils.data import DataLoader 19 | from tqdm import tqdm 20 | from model import tasks 21 | from model import dynamic_vae 22 | from utils import to_var, collate, Normalizer, PreprocessNormalizer 23 | from model import dataset 24 | torch.cuda.current_device() 25 | torch.cuda._initialized = True 26 | 27 | class Train: 28 | 29 | def __init__(self, args): 30 | """ 31 | Training module initialization, loading project parameters and creating model save path 32 | 33 | Args: 34 | args: 35 | Default parameters,class Namespace, 36 | normalizer: 37 | Normalization method, class utils.Normalizer or None 38 | data: 39 | Training data, class dataset.SlidingWindowBattery or None 40 | label_data: 41 | Data mileage label, class tasks.Label or None 42 | data_task: 43 | The model of task ,class tasks.Task or None 44 | **kwargs etc. 45 | """ 46 | self.args= args 47 | time_now = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) 48 | current_path = os.path.join(self.args.save_model_path, time_now) 49 | self.mkdir(current_path) 50 | self.current_path = current_path 51 | self.current_epoch = 1 52 | self.step = 1 53 | self.loss_dict = OrderedDict() 54 | 55 | loss_picture_path = os.path.join(current_path, "loss") 56 | feature_path = os.path.join(current_path, "feature") 57 | current_model_path = os.path.join(current_path, "model") 58 | save_feature_path = os.path.join(current_path, "mean") 59 | result_path = os.path.join(current_path, "result") 60 | # Create a model save path and add the associated path to the ARGS for subsequent calls 61 | self.mkdir(loss_picture_path) 62 | self.mkdir(feature_path) 63 | self.mkdir(current_model_path) 64 | self.mkdir(result_path) 65 | self.mkdir(save_feature_path) 66 | 67 | self.args.loss_picture_path = loss_picture_path 68 | self.args.feature_path = feature_path 69 | self.args.result_path = result_path 70 | self.args.save_feature_path = save_feature_path 71 | self.args.current_path = current_path 72 | self.args.current_model_path = current_model_path 73 | 74 | @staticmethod 75 | def mkdir(path): 76 | """ 77 | Creating a folder 78 | 79 | Args: 80 | path: String. folder path. 81 | 82 | """ 83 | if os.path.exists(path): 84 | print('%s is exist' % path) 85 | else: 86 | os.makedirs(path) 87 | 88 | def main(self): 89 | """ 90 | Training main program ,Load training data, preprocess, create model and train according to parameters, save model results and related parameters 91 | train: normalized data used for model input.train[i][0]is an array with shape m * N. 
92 | train[i][1]is a dict containing label, CAR, charge_segment, Mileage, and timestamp 93 | model: models used for training, including DynamicVAE 94 | loss: nll kl label 95 | rec_error: Reconstruction error, calculated by calculating the MSE of log_p and target 96 | 97 | """ 98 | print("Loading data to memory. This may take a few minutes...") 99 | data_pre = dataset.Dataset(self.args.train_path) 100 | self.normalizer = Normalizer(dfs=[data_pre[i][0] for i in range(20)], variable_length=self.args.variable_length) 101 | train = PreprocessNormalizer(data_pre, normalizer_fn=self.normalizer.norm_func) 102 | print("Data loaded successfully.") 103 | 104 | self.args.columns=torch.load(os.path.join("DATA/","column.pkl")) 105 | self.data_task = tasks.Task(task_name=self.args.task, columns=self.args.columns) 106 | params = dict( 107 | rnn_type=self.args.rnn_type, 108 | hidden_size=self.args.hidden_size, 109 | latent_size=self.args.latent_size, 110 | num_layers=self.args.num_layers, 111 | bidirectional=self.args.bidirectional, 112 | kernel_size=self.args.kernel_size, 113 | nhead=self.args.nhead, 114 | dim_feedforward=self.args.dim_feedforward, 115 | variable_length=self.args.variable_length, 116 | encoder_embedding_size=self.data_task.encoder_dimension, 117 | decoder_embedding_size=self.data_task.decoder_dimension, 118 | output_embedding_size=self.data_task.output_dimension) 119 | # Specify the model 120 | if self.args.model_type == "rnn": 121 | model = to_var(dynamic_vae.DynamicVAE(**params)).float() 122 | else: 123 | model = None 124 | 125 | print("model", model) 126 | # A way to specify the optimizer and update the learning rate 127 | optimizer = torch.optim.AdamW(model.parameters(), lr=self.args.learning_rate, weight_decay=1e-6) 128 | scheduler = CosineAnnealingLR(optimizer, T_max=self.args.epochs, 129 | eta_min=self.args.cosine_factor * self.args.learning_rate) 130 | # Load the data with the DataLoader 131 | data_loader = DataLoader(dataset=train, batch_size=self.args.batch_size, shuffle=True, 132 | num_workers=self.args.jobs, drop_last=False, pin_memory=torch.cuda.is_available(), 133 | collate_fn=collate if self.args.variable_length else None) 134 | time_start = time.time() 135 | try: 136 | p_bar = tqdm(total=len(data_loader) * self.args.epochs, desc='training', ncols=160, mininterval=1, 137 | maxinterval=10, miniters=1) 138 | while self.current_epoch <= self.args.epochs: 139 | model.train() 140 | total_loss, total_nll, total_label, total_kl, iteration = 0, 0, 0, 0, 0 141 | for batch in data_loader: 142 | batch_ = to_var(batch[0]).float() 143 | seq_lengths = batch[1]['seq_lengths'] if self.args.variable_length else None 144 | log_p, mean, log_v, z, mean_pred = model(batch_, 145 | encoder_filter=self.data_task.encoder_filter, 146 | decoder_filter=self.data_task.decoder_filter, 147 | seq_lengths=seq_lengths, noise_scale=self.args.noise_scale) 148 | target = self.data_task.target_filter(batch_) 149 | 150 | nll_loss, kl_loss, kl_weight = self.loss_fn(log_p, target, mean, log_v) 151 | self.label_data = tasks.Label(column_name="mileage", training_set=train) 152 | label_loss = self.label_data.loss(batch, mean_pred, is_mse=True) 153 | loss = (self.args.nll_weight * nll_loss + self.args.latent_label_weight * label_loss + kl_weight * 154 | kl_loss / batch_.shape[0]) 155 | 156 | # Update parameter 157 | optimizer.zero_grad() 158 | loss.backward() 159 | optimizer.step() 160 | 161 | # Calculates and displays loss 162 | total_loss += loss.item() 163 | total_nll += nll_loss.item() 164 | total_label += 
label_loss.item() 165 | total_kl += kl_loss.item() / batch_.shape[0] 166 | loss_info = {'mean_loss': total_loss / (1 + iteration), 'nll_loss': total_nll / (1 + iteration), 167 | "label_loss": total_label / (1 + iteration), "kl_loss": total_kl / (1 + iteration)} 168 | p_bar.set_postfix(loss_info) 169 | p_bar.set_description('training - Epoch %d/%i' % (self.current_epoch, self.args.epochs)) 170 | 171 | # save loss 172 | if iteration == len(data_loader) - 1: 173 | self.save_loss(loss_info, log_p, target) 174 | 175 | self.step += 1 176 | p_bar.update(1) 177 | iteration += 1 178 | 179 | scheduler.step() 180 | self.current_epoch += 1 181 | p_bar.close() 182 | 183 | except KeyboardInterrupt: 184 | print("Caught keyboard interrupt; quit training.") 185 | pass 186 | 187 | print("Train completed, save information") 188 | # Save the model and related parameters 189 | model.eval() 190 | p_bar = tqdm(total=len(data_loader), desc='saving', ncols=100, mininterval=1, maxinterval=10, miniters=1) 191 | extract(data_loader, model, self.data_task, self.args.feature_path, p_bar, self.args.noise_scale, 192 | self.args.variable_length) 193 | p_bar.close() 194 | print("The total time consuming:", time.time() - time_start) 195 | self.model_result_save(model) 196 | self.loss_visual() 197 | print("All parameters have been saved at", self.args.feature_path) 198 | 199 | def model_result_save(self, model): 200 | """ 201 | To save pretrain model ,normalization pkl and training parameters 202 | args: 203 | model: training of vae model 204 | """ 205 | model_params = {'train_time_start': self.current_path, 206 | 'train_time_end': time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())), 207 | 'args': vars(self.args), 208 | 'loss': self.loss_dict} 209 | with open(os.path.join(self.args.current_model_path, 'model_params.json'), 'w') as f: 210 | json.dump(model_params, f, indent=4) 211 | model_path = os.path.join(self.args.current_model_path, "model.torch") 212 | torch.save(model, model_path) 213 | norm_path = os.path.join(self.args.current_model_path, "norm.pkl") 214 | with open(norm_path, "wb") as f: 215 | pickle.dump(self.normalizer, f) 216 | 217 | 218 | def loss_fn(self, log_p, target, mean, log_v): 219 | """ 220 | Calculate NLL_loss, KL_loss, and KL_weight of KL_loss. 
221 | 222 | args: 223 | log_p: training output of model 224 | target: training target selected by task 225 | mean: mean of vae model 226 | log_v: variance of training characteristics 227 | 228 | return: 229 | nll_loss: Negative Log Likelihood loss 230 | kl_loss: KL Divergence 231 | kl_weight: float, anneal0, changed by anneal_function 232 | 233 | """ 234 | nll = torch.nn.SmoothL1Loss(reduction='mean') 235 | nll_loss = nll(log_p, target) 236 | kl_loss = -0.5 * torch.sum(1 + log_v - mean.pow(2) - log_v.exp()) 237 | kl_weight = self.kl_anneal_function() 238 | return nll_loss, kl_loss, kl_weight 239 | 240 | def kl_anneal_function(self): 241 | """ 242 | Anneal update function 243 | """ 244 | if self.args.anneal_function == 'logistic': 245 | return self.args.anneal0 * float(1 / (1 + np.exp(-self.args.k * (self.step - self.args.x0)))) 246 | elif self.args.anneal_function == 'linear': 247 | return self.args.anneal0 * min(1, self.step / self.args.x0) 248 | else: 249 | return self.args.anneal0 250 | 251 | def loss_visual(self): 252 | """ 253 | Visualization of each loss in the training process 254 | """ 255 | if self.args.epochs == 0: 256 | return 257 | x = list(self.loss_dict.keys()) 258 | df_loss = pd.DataFrame(dict(self.loss_dict)).T.sort_index() 259 | mean_loss = df_loss['mean_loss'].values.astype(float) 260 | nll_loss = df_loss['nll_loss'].values.astype(float) 261 | label_loss = df_loss['label_loss'].values.astype(float) 262 | kl_loss = df_loss['kl_loss'].values.astype(float) 263 | 264 | plt.figure() 265 | plt.subplot(2, 1, 1) 266 | plt.plot(x, mean_loss, 'r.-', label='mean_loss') 267 | plt.legend() 268 | 269 | plt.subplot(2, 3, 4) 270 | plt.plot(x, nll_loss, 'bo-', label='nll_loss') 271 | plt.legend() 272 | 273 | plt.subplot(2, 3, 5) 274 | plt.plot(x, label_loss, 'bo-', label='label_loss') 275 | plt.legend() 276 | 277 | plt.subplot(2, 3, 6) 278 | plt.plot(x, kl_loss, 'bo-', label='kl_loss') 279 | plt.legend() 280 | plt.savefig(self.args.loss_picture_path + '/' + 'loss.png') 281 | plt.close('all') 282 | 283 | def save_loss(self, loss_info, log_p, target): 284 | """ 285 | To save loss 286 | """ 287 | self.loss_dict[str(self.current_epoch)] = loss_info 288 | n_image = log_p.shape[-1] 289 | for i in range(n_image): 290 | plt.subplot(n_image, 1, i + 1) 291 | plt.plot(log_p[0, :, i].cpu().detach().numpy(), 'y', 292 | label='lp-' + str(self.current_epoch)) 293 | plt.plot(target[0, :, i].cpu().detach().numpy(), 'c', 294 | label='tg-' + str(self.current_epoch)) 295 | plt.legend() 296 | loss_path = os.path.join(self.args.loss_picture_path, "%i_epoch.jpg" % self.current_epoch) 297 | plt.savefig(loss_path) 298 | plt.close('all') 299 | 300 | def getmodelparams(self): 301 | return os.path.join(self.args.current_model_path, 'model_params.json') 302 | 303 | def save_features_info(feature_path, batch, iteration, log_p, mean, target): 304 | """ 305 | Feature preservation function 306 | """ 307 | mse = torch.nn.MSELoss(reduction='mean') 308 | dict_path = os.path.join(feature_path, "%i_label.file" % iteration) 309 | with open(dict_path, "wb") as f: 310 | rec_error = [float(mse(log_p[i], target[i])) for i in range(batch[0].shape[0])] 311 | batch[1].update({'rec_error': rec_error}) 312 | torch.save(batch[1], f) 313 | mean_path = os.path.join(feature_path, "%i_npy.npy" % iteration) 314 | np_mean = mean.data.cpu().numpy() 315 | np.save(mean_path, np_mean) 316 | 317 | 318 | def extract(data_loader, model, data_task, feature_path, p_bar, noise_scale, variable_length): 319 | """ 320 | Feature extraction function, the 
process is similar to the process in train.py 321 | """ 322 | iteration = 0 323 | for batch in data_loader: 324 | batch_ = to_var(batch[0]).float() 325 | seq_lengths = batch[1]['seq_lengths'] if variable_length else None 326 | log_p, mean, log_v, z, mean_pred = model(batch_, encoder_filter=data_task.encoder_filter, 327 | decoder_filter=data_task.decoder_filter, 328 | seq_lengths=seq_lengths, noise_scale=noise_scale) 329 | target = data_task.target_filter(batch_) 330 | save_features_info(feature_path, batch, iteration, log_p, mean, target) 331 | p_bar.update(1) 332 | iteration += 1 333 | 334 | 335 | if __name__ == '__main__': 336 | import argparse 337 | 338 | #from anomaly_detection.model import projects 339 | 340 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 341 | parser = argparse.ArgumentParser(description='Train Example') 342 | parser.add_argument('--config_path', type=str, 343 | default=os.path.join( 'params.json')) 344 | 345 | args = parser.parse_args() 346 | 347 | with open(args.config_path, 'r') as file: 348 | p_args = argparse.Namespace() 349 | p_args.__dict__.update(json.load(file)) 350 | args = parser.parse_args(namespace=p_args) 351 | print("Loaded configs at %s" % args.config_path) 352 | print("args", args) 353 | Train(args).main() 354 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2021/11/3 17:21 3 | # @Author : huangshaobo,liujiachang,zhangyang 4 | # @Email : sdk.eval@thinkenergy.net.cn 5 | # @File : utils.py 6 | import traceback 7 | 8 | import torch 9 | from collections import OrderedDict 10 | 11 | import numpy as np 12 | import pandas as pd 13 | from scipy import interpolate as ip 14 | 15 | def config_valid(config): 16 | config = vars(config) 17 | keys = config.keys() 18 | try: 19 | assert 'anneal0' in keys and type(config['anneal0']) == float 20 | assert 'anneal_function' in keys and type(config['anneal_function']) == str and config['anneal_function'] in [ 21 | 'logistic', 'linear'] 22 | assert 'batch_size' in keys and type(config['batch_size']) == int 23 | assert 'bidirectional' in keys and type(config['bidirectional']) == bool 24 | assert 'cell_level' in keys and type(config['cell_level']) == bool 25 | assert 'config_path' in keys and type(config['config_path']) == str 26 | assert 'cosine_factor' in keys and type(config['cosine_factor']) == float 27 | assert 'dim_feedforward' in keys and type(config['dim_feedforward']) == int 28 | assert 'epochs' in keys and type(config['epochs']) == int 29 | assert 'evaluation_path' in keys and type(config['evaluation_path']) == str 30 | assert 'hidden_size' in keys and type(config['hidden_size']) == int 31 | assert 'interpolate' in keys and type(config['interpolate']) == int 32 | assert 'interval' in keys and type(config['interval']) == int 33 | assert 'jobs' in keys and type(config['jobs']) == int 34 | assert 'k' in keys and type(config['k']) == float 35 | assert 'kernel_size' in keys and type(config['kernel_size']) == int 36 | assert 'latent_label_weight' in keys and isinstance(config['latent_label_weight'], (int, float)) 37 | assert 'latent_size' in keys and type(config['latent_size']) == int 38 | assert 'learning_rate' in keys and type(config['learning_rate']) == float 39 | assert 'model_type' in keys and type(config['model_type']) == str and config['model_type'] in ["rnn", 40 | "transformer"] 41 | assert 'nhead' in keys and type(config['nhead']) == int 42 | assert 
'nll_weight' in keys and isinstance(config['nll_weight'], (int, float)) 43 | assert 'noise_scale' in keys and type(config['noise_scale']) == float 44 | assert 'norm' in keys and type(config['norm']) == str 45 | assert 'num_layers' in keys and type(config['num_layers']) == int 46 | assert 'project' in keys and type(config['project']) == str 47 | assert 'ram' in keys and type(config['ram']) == bool 48 | assert 'rnn_type' in keys and type(config['rnn_type']) == str and config['rnn_type'] in ['rnn', 'lstm', 'gru'] 49 | assert 'save_model_path' in keys and type(config['save_model_path']) == str 50 | assert 'smoothing' in keys and type(config['smoothing']) == bool 51 | assert 'task' in keys and type(config['task']) == str 52 | assert 'test_path' in keys and type(config['test_path']) == str 53 | assert 'train_path' in keys and type(config['train_path']) == str 54 | assert 'use_flag' in keys and type(config['use_flag']) == str and config['use_flag'] in ["rec_error", "l2norm", 55 | "copod_score"] 56 | assert 'x0' in keys and type(config['x0']) == int 57 | assert 'variable_length' in keys and type(config['variable_length']) == bool 58 | assert 'min_length' in keys and type(config['min_length']) == int 59 | assert 'granularity_all' in keys and type(config['granularity_all']) == int 60 | assert 'num_granularity_all' in keys and type(config['num_granularity_all']) == int 61 | assert 'granularity_car' in keys and type(config['granularity_car']) == int 62 | assert 'num_granularity_car' in keys and type(config['num_granularity_car']) == int 63 | print('The config effective') 64 | return True 65 | except AssertionError as _: 66 | print('The config is invalid') 67 | traceback.print_exc() 68 | return False 69 | 70 | 71 | def to_var(x): 72 | """ 73 | If there is a GPU that puts X in CUDA 74 | """ 75 | if torch.cuda.is_available(): 76 | x = x.cuda() 77 | return x 78 | 79 | def collate(batch_data): 80 | """ 81 | Collate is used to determine how the Dataloader generates batch, which is used to sort the padding of sequences of different lengths 82 | args: 83 | batch_data - list of (tensor, metadata) 84 | 85 | return: 86 | (padded_sent_seq, data_lengths), metadata 87 | 88 | """ 89 | 90 | batch_data.sort(key=lambda xi: len(xi[0]), reverse=True) 91 | seq_lengths = [len(xi[0]) for xi in batch_data] 92 | max_len = max(seq_lengths) 93 | 94 | sent_seq = [torch.FloatTensor(v[0]) for v in batch_data] 95 | # Processing metadata, merging dict 96 | metadata_list = [xi[1] for xi in batch_data] 97 | metadata = OrderedDict([('label', []), ('car', []), ('charge_segment', []), ('mileage', []), ('timestamp', [])]) 98 | for i in range(len(metadata_list)): 99 | for key, value in metadata_list[i].items(): 100 | metadata[key].append(value) 101 | 102 | padded_sent_seq = torch.FloatTensor([pad_tensor(v, max_len) for v in sent_seq]) 103 | metadata['seq_lengths'] = seq_lengths 104 | return padded_sent_seq, metadata 105 | 106 | 107 | 108 | class Normalizer: 109 | def __init__(self, dfs=None, variable_length=False): 110 | """ 111 | Normalizer 112 | Args: 113 | dfs: The list contains each dataframe 114 | variable_length: Is it variable length data 115 | """ 116 | self.max_norm = 0 117 | self.min_norm = 0 118 | self.std = 0 119 | self.mean = 0 120 | res = [] 121 | if dfs is not None: 122 | if variable_length: 123 | norm_length = min([len(df) for df in dfs]) # Given the variable length data, take the shortest data and make norm 124 | dfs = [df[0:norm_length] for df in dfs] 125 | res.extend(dfs) 126 | res = np.array(res) 127 | 
self.compute_min_max(res)
128 |         else:
129 |             raise Exception("df list not specified")
130 | 
131 |     def compute_min_max(self, res):
132 |         """
133 |         Calculate maximum, minimum, mean and standard deviation
134 |         """
135 |         column_max_all = np.max(res, axis=1)
136 |         column_min_all = np.min(res, axis=1)
137 |         column_std_all = np.std(res, axis=1)
138 |         column_mean_all = np.mean(res, axis=1)
139 |         self.max_norm = np.max(column_max_all, axis=0)
140 |         self.min_norm = np.min(column_min_all, axis=0)
141 |         self.std = np.mean(column_std_all, axis=0)
142 |         self.mean = np.mean(column_mean_all, axis=0)
143 | 
144 |     def std_norm_df(self, df):
145 |         return (df - self.mean) / np.maximum(1e-4, self.std)
146 | 
147 |     def norm_func(self, df):
148 |         df_norm = df.copy()
149 |         df_norm = (df_norm - self.mean) / np.maximum(np.maximum(1e-4, self.std), 0.1 * (self.max_norm - self.min_norm))
150 |         return df_norm
151 | 
152 | 
153 | class PreprocessNormalizer:
154 |     """
155 |     Data normalization class
156 |     """
157 |     def __init__(self, dataset, normalizer_fn=None):
158 |         self.dataset = dataset
159 |         self.normalizer_fn = normalizer_fn
160 | 
161 |     def __len__(self):
162 |         return len(self.dataset)
163 | 
164 |     def __getitem__(self, idx):
165 |         df, metadata = self.dataset[idx][0], self.dataset[idx][1]
166 |         if self.normalizer_fn is not None:
167 |             df = self.normalizer_fn(df)
168 |         return df, metadata
169 | 
170 | 
171 | 
172 | 
173 | 
174 | 
175 | 
176 | 
177 | 
--------------------------------------------------------------------------------
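For reference, a minimal sketch of using the `Normalizer` and its `norm_func` defined above on plain NumPy arrays; the real pipeline passes the per-segment data loaded by `model/dataset.py`, and the shapes here are made up:

```python
import numpy as np
from utils import Normalizer

# Two synthetic charge segments: 100 time steps x 6 feature columns each.
dfs = [np.random.rand(100, 6) for _ in range(2)]
norm = Normalizer(dfs=dfs, variable_length=False)

# norm_func standardizes a segment with the statistics computed from dfs.
segment = np.random.rand(100, 6)
normalized = norm.norm_func(segment)
print(normalized.mean(axis=0), normalized.std(axis=0))
```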