├── time2graph ├── __init__.py ├── core │ ├── __init__.py │ ├── __pycache__ │ │ ├── Optimize.cpython-37.pyc │ │ ├── Optimize.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── model_gat.cpython-37.pyc │ │ ├── model_gat.cpython-38.pyc │ │ ├── model_gin.cpython-37.pyc │ │ ├── model_gin.cpython-38.pyc │ │ ├── model_utils.cpython-37.pyc │ │ ├── model_utils.cpython-38.pyc │ │ ├── distance_utils.cpython-37.pyc │ │ ├── distance_utils.cpython-38.pyc │ │ ├── shapelet_utils.cpython-37.pyc │ │ ├── shapelet_utils.cpython-38.pyc │ │ ├── static_shapelets.cpython-37.pyc │ │ ├── static_shapelets.cpython-38.pyc │ │ ├── time_aware_shapelets.cpython-37.pyc │ │ └── time_aware_shapelets.cpython-38.pyc │ ├── model_utils.py │ ├── static_shapelets.py │ ├── distance_utils.py │ ├── Optimize.py │ ├── shapelet_embedding.py │ ├── model_embeds.py │ ├── shapelet_utils.py │ ├── time_aware_shapelets.py │ └── model_gin.py ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── gat.cpython-37.pyc │ │ ├── gat.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── mp_utils.cpython-37.pyc │ │ ├── mp_utils.cpython-38.pyc │ │ ├── base_utils.cpython-37.pyc │ │ ├── base_utils.cpython-38.pyc │ │ ├── gat_utils.cpython-37.pyc │ │ └── gat_utils.cpython-38.pyc │ ├── gat.py │ ├── gat_utils.py │ ├── deep_utils.py │ ├── mp_utils.py │ ├── deep_models.py │ └── base_utils.py └── __pycache__ │ ├── __init__.cpython-37.pyc │ └── __init__.cpython-38.pyc ├── requirements.txt ├── config.py ├── README.md ├── data_load.py └── my_train.py /time2graph/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /time2graph/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /time2graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /time2graph/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/gat.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/gat.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/gat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/gat.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/Optimize.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/Optimize.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/Optimize.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/Optimize.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_gat.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_gat.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_gat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_gat.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_gin.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_gin.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_gin.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_gin.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/mp_utils.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/mp_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/mp_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/mp_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/base_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/base_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/base_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/base_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/gat_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/gat_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/gat_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/gat_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/distance_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/distance_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/distance_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/distance_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/shapelet_utils.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/shapelet_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/shapelet_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/shapelet_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/static_shapelets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/static_shapelets.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/static_shapelets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/static_shapelets.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/time_aware_shapelets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/time_aware_shapelets.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/time_aware_shapelets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/time_aware_shapelets.cpython-38.pyc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dill>=0.2.5 2 | six>=1.10.0 3 | scipy>=1.3.0 4 | numpy>=1.16.0 5 | scikit_learn>=0.19.1 6 | pandas>=0.23 7 | xgboost>=0.80 8 | torch>=0.4.1 9 | networkx>=2.1 10 | tslearn>=0.2.5 11 | pathos>=0.2 -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from os import path, cpu_count 3 | from time2graph.utils.base_utils import Debugger 4 | 5 | module_path = path.dirname(path.abspath(__file__)) 6 | njobs = cpu_count() 7 | if njobs >= 40: 8 | njobs = int(njobs / 2) 9 | 10 | 11 | 12 | 13 | 14 | __all__ = [ 15 | 'np', 16 | 'path', 17 | 'Debugger', 18 | 'module_path', 19 | 'njobs' 20 | ] 21 | -------------------------------------------------------------------------------- /time2graph/core/model_utils.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from torch.utils.data.sampler import WeightedRandomSampler 3 | 4 | 5 | class NumpyDataset(Dataset): 6 | """ Dataset wrapping numpy ndarrays 7 | Each sample will be retrieved by indexing numpy-arrays along the first dimension. 8 | 9 | Arguments: 10 | *ndarrays (numpy-ndarray): ndarrays that have the same size of the first dimension. 
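    Illustrative usage (a minimal sketch; the arrays below are made-up examples):
        >>> import numpy as np
        >>> dataset = NumpyDataset(np.zeros((8, 5)), np.arange(8))
        >>> x0, y0 = dataset[0]   # one slice from each wrapped array
        >>> len(dataset)
        8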
11 |     """
12 |     def __init__(self, *ndarrays):
13 |         assert all(ndarrays[0].shape[0] == ndarray.shape[0] for ndarray in ndarrays)
14 |         self.ndarrays = ndarrays
15 |
16 |     def __getitem__(self, idx):
17 |         return tuple(ndarray[idx] for ndarray in self.ndarrays)
18 |
19 |     def __len__(self):
20 |         return self.ndarrays[0].shape[0]
21 |
22 |
23 | class StratifiedSampler(WeightedRandomSampler):
24 |     def __init__(self, label, num_class):
25 |         self.num_class = num_class
26 |         weights = self.__get_weight(label=label)
27 |         super(StratifiedSampler, self).__init__(weights=weights, num_samples=len(weights))
28 |
29 |     def __get_weight(self, label):
30 |         num_class = self.num_class
31 |         cnt = [0] * num_class
32 |         for lb in label:
33 |             cnt[lb] += 1
34 |         weight_per_class, total = [0.0] * num_class, float(sum(cnt))
35 |         for k in range(num_class):
36 |             weight_per_class[k] = total / float(cnt[k])
37 |         ret = [0.0] * len(label)
38 |         for idx, val in enumerate(label):
39 |             ret[idx] = weight_per_class[val]
40 |         return ret
41 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Quick Links
2 |
3 | - [Building and Testing](#building-and-testing)
4 | - [Usage](#usage)
5 | - [Performance](#performance)
6 | - [Reference](#reference)
7 |
8 | ## Building and Testing
9 |
10 | This project is implemented primarily in Python 3.7, with several dependencies listed below. We have tested the framework on Ubuntu 16.04.5 LTS with kernel 4.4.0, and it is expected to build and run easily on any regular Unix-like system.
11 |
12 | ### Dependencies
13 |
14 | - [Python 3.7](https://www.python.org).
15 | Version 3.7.0 has been tested. Higher versions are expected to be compatible with the current implementation, while there may be syntax errors or conflicts under Python 2.x.
16 |
17 | - [PyTorch](https://pytorch.org).
18 |
19 | Version 1.7.0 has been tested. You can find installation instructions [here](https://pytorch.org/get-started/locally/). Note that GPU support is **ENCOURAGED**, as it greatly boosts training efficiency.
20 |
21 |
22 | - [Other Python modules](https://pypi.python.org). Some other Python module dependencies are listed in ```requirements.txt```, which can be easily installed with pip:
23 |
24 | ```bash
25 | pip install -r requirements.txt
26 | ```
27 |
28 | ## Reference
29 | [1] R. Wang, Y. Zhang, L. Peng, G. Fortino and P. -H. Ho, "Time-Varying-Aware Network Traffic Prediction Via Deep Learning in IIoT," in IEEE Transactions on Industrial Informatics, vol. 18, no. 11, pp. 8129-8137, Nov. 2022, doi: 10.1109/TII.2022.3163558.
30 | 31 | ``` 32 | @ARTICLE{9745370, 33 | author={Wang, Ranran and Zhang, Yin and Peng, Limei and Fortino, Giancarlo and Ho, Pin-Han}, 34 | journal={IEEE Transactions on Industrial Informatics}, 35 | title={Time-Varying-Aware Network Traffic Prediction Via Deep Learning in IIoT}, 36 | year={2022}, 37 | volume={18}, 38 | number={11}, 39 | pages={8129-8137}, 40 | doi={10.1109/TII.2022.3163558}} 41 | ```# NetworkTrafficPrediction -------------------------------------------------------------------------------- /time2graph/utils/gat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .gat_utils import GraphAttentionLayer 5 | 6 | 7 | class GAT(nn.Module): 8 | def __init__(self, nfeat, nhid, nnodes, nclass, dropout, alpha, nheads, aggregate): 9 | """Dense version of GAT.""" 10 | super(GAT, self).__init__() 11 | self.dropout = dropout 12 | # sum or aggregate flag 13 | self.aggregate = aggregate 14 | self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads)] 15 | for i, attention in enumerate(self.attentions): 16 | self.add_module('attention_{}'.format(i), attention) 17 | 18 | # self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, 19 | # alpha=alpha, concat=False, reshape=True) 20 | # self.add_module('attention_out', self.out_att) 21 | 22 | self.hidden_size = nnodes * nhid * nheads if self.aggregate else nnodes * nhid 23 | self.output = nn.Sequential( 24 | nn.Linear(self.hidden_size, self.hidden_size * 2), 25 | nn.ReLU(), 26 | nn.Linear(self.hidden_size * 2, nclass) 27 | ) 28 | 29 | def forward(self, x, adj, feat_flag=False): 30 | x = F.dropout(x, self.dropout, training=self.training) 31 | x_head = [att(x, adj) for att in self.attentions] 32 | if self.aggregate: 33 | x = torch.cat(x_head, dim=2).view(x.size()[0], -1) 34 | else: 35 | x = torch.sum(torch.stack(x_head, dim=2), dim=2).view(x.size()[0], -1) 36 | x = F.dropout(x, self.dropout, training=self.training) 37 | if feat_flag: 38 | return F.elu(x) 39 | else: 40 | x = self.output(F.elu(x)) 41 | return F.log_softmax(x, dim=1) 42 | 43 | 44 | def accuracy_torch(output, labels): 45 | preds = output.max(1)[1].type_as(labels) 46 | correct = preds.eq(labels).double() 47 | correct = correct.sum() 48 | return correct / len(labels) 49 | 50 | 51 | def label_np(output, cuda): 52 | if cuda: 53 | return output.max(1)[1].cpu().numpy() 54 | else: 55 | return output.max(1)[1].numpy() 56 | 57 | 58 | def output_np(output, cuda): 59 | if cuda: 60 | return output.detach().cpu().numpy() 61 | else: 62 | return output.detach().numpy() 63 | -------------------------------------------------------------------------------- /time2graph/utils/gat_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.utils.data import DataLoader 5 | 6 | 7 | class GraphAttentionLayer(nn.Module): 8 | """ 9 | Simple GAT layer, similar to https://arxiv.org/abs/1710.10903 10 | """ 11 | 12 | def __init__(self, in_features, out_features, dropout, alpha, concat=True, reshape=False): 13 | super(GraphAttentionLayer, self).__init__() 14 | self.dropout = dropout 15 | self.in_features = in_features 16 | self.out_features = out_features 17 | self.alpha = alpha 18 | self.concat = concat 19 | self.reshape = reshape 20 | 21 | self.W = nn.Parameter(torch.zeros(size=(in_features, 
out_features)), requires_grad=True) 22 | nn.init.xavier_uniform_(self.W.data, gain=1.414) 23 | self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)), requires_grad=True) 24 | nn.init.xavier_uniform_(self.a.data, gain=1.414) 25 | 26 | self.leakyrelu = nn.LeakyReLU(self.alpha) 27 | 28 | def forward(self, input, adj): 29 | h = torch.matmul(input, self.W) 30 | nbatch, N = h.size()[0], h.size()[1] 31 | a_input = torch.cat([h.repeat(1, 1, N).view(nbatch, N * N, -1), 32 | h.repeat(1, N, 1)], dim=2).view(nbatch, N, -1, 2 * self.out_features) 33 | e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(3)) 34 | 35 | zero_vec = -9e15 * torch.ones_like(e) 36 | attention = torch.where(adj > 0, e, zero_vec) 37 | attention = F.softmax(attention, dim=2) 38 | attention = F.dropout(attention, self.dropout, training=self.training) 39 | h_prime = torch.matmul(attention, h) 40 | if self.reshape: 41 | h_prime = h_prime.view(nbatch, -1) 42 | 43 | if self.concat: 44 | return F.elu(h_prime) 45 | else: 46 | return h_prime 47 | 48 | def __repr__(self): 49 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' 50 | 51 | 52 | class GATDataloader(DataLoader): 53 | def __init__(self, *args, **kwargs): 54 | super(GATDataloader, self).__init__(*args, **kwargs) 55 | 56 | 57 | class GATDataset(object): 58 | def __init__(self, feat, adj, y=None): 59 | if y is not None: 60 | self.data = [(feat[k], adj[k], y[k]) for k in range(len(y))] 61 | else: 62 | self.data = [(feat[k], adj[k]) for k in range(len(adj))] 63 | 64 | def __getitem__(self, item): 65 | return self.data[item] 66 | 67 | def __len__(self): 68 | return len(self.data) 69 | -------------------------------------------------------------------------------- /time2graph/core/static_shapelets.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scipy.stats import entropy 3 | from ..utils.base_utils import Queue 4 | from .model_utils import * 5 | from .shapelet_utils import * 6 | from .distance_utils import * 7 | 8 | 9 | def __static_shapelet_candidate_loss(cand, time_series_set, label, warp, num_segment, seg_length, measurement, **kwargs): 10 | assert seg_length == cand.shape[0] and num_segment * seg_length == time_series_set.shape[1] 11 | distances = np.zeros(time_series_set.shape[0], dtype=float) 12 | for i in range(time_series_set.shape[0]): 13 | distances[i] = pattern_distance_no_timing(pattern=cand, time_series=time_series_set[i], warp=warp, measurement=measurement) 14 | positive_distance = distances[label == 1] 15 | negative_distance = distances[label == 0] 16 | max_val, min_val = np.max(distances), np.min(distances) 17 | num_bins = int(max_val - min_val) + 1 18 | positive_norm = np.histogram(a=positive_distance, bins=num_bins, range=(min_val, max_val), density=True)[0] 19 | negative_norm = np.histogram(a=negative_distance, bins=num_bins, range=(min_val, max_val), density=True)[0] 20 | positive_norm[positive_norm == 0] = 1e-3 21 | negative_norm[negative_norm == 0] = 1e-3 22 | return -(entropy(negative_norm, positive_norm) + entropy(positive_norm, negative_norm)) 23 | 24 | 25 | def __static_shapelet_candidate_loss_factory(time_series_set, label, warp, num_segment, seg_length, measurement, **kwargs): 26 | def __main__(pid, args, queue): 27 | ret = [] 28 | for cand in args: 29 | loss = __static_shapelet_candidate_loss( 30 | cand=cand, time_series_set=time_series_set, label=label, warp=warp, num_segment=num_segment, 31 | seg_length=seg_length, 
measurement=measurement, **kwargs 32 | ) 33 | ret.append((cand, loss)) 34 | queue.put(0) 35 | return ret 36 | return __main__ 37 | 38 | 39 | def learn_static_shapelets(time_series_set, label, K, C, warp, num_segment, seg_length, measurement, **kwargs): 40 | cands = generate_shapelet_candidate(time_series_set=time_series_set, num_segment=num_segment, 41 | seg_length=seg_length, candidate_size=C, **kwargs) 42 | parmap = ParMap( 43 | work=__static_shapelet_candidate_loss_factory( 44 | time_series_set=time_series_set, label=label, warp=warp, num_segment=num_segment, seg_length=seg_length, 45 | measurement=measurement, **kwargs 46 | ), 47 | monitor=parallel_monitor(msg='learning static shapelets', size=len(cands), 48 | debug=kwargs.get('debug', True)), 49 | njobs=kwargs.get('njobs', NJOBS) 50 | ) 51 | return sorted(parmap.run(data=cands), key=lambda x: x[-1])[:K] 52 | -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Oct 24 16:07:22 2021 4 | 5 | @author: Administrator 6 | """ 7 | import numpy as np 8 | import pandas as pd 9 | import random 10 | import pickle 11 | import os 12 | from tqdm import tqdm 13 | def getsingleGroup(pro_data,group,src_len,tar_len,step): 14 | ''' 15 | pro_data 全部数据 16 | group 单组 key 17 | 单组生成序列 18 | 19 | ''' 20 | curent_df=pro_data.loc[(pro_data['hostname']==group[0]) & (pro_data['series']==group[1])] 21 | tw=src_len+tar_len#总的采样窗口大小,前面是X,后面部分的Mean是Y 22 | step=step 23 | X=[] 24 | Y=[] 25 | 26 | L=len(curent_df) 27 | #按时间排序 28 | curent_df['time'] = pd.to_datetime(curent_df['time_window']) 29 | curent_df.sort_values('time', inplace=True) 30 | useful_column=[ 'Mean', 'SD', 'Open', 'High','Low', 'Close', 'Volume']#取特征列 31 | 32 | for i in range(0,L-tw,step): 33 | # train_seq = df_tmp[features].values[i:i+tw] 34 | train_seq =curent_df[i:i+tw][useful_column]# 35 | X.append(train_seq.values[i:i+src_len]) 36 | Y.append(train_seq[i+src_len:]['Mean'].values) 37 | if i>L-tw and i1000:#控制内存 43 | X=X[-1000:] 44 | Y=Y[-1000:] 45 | return np.array(X),np.array(Y) 46 | 47 | def get_dataset(inputdir,src_len,tar_len,step=5,train_probility=0.8,sample_pro=10000): 48 | 49 | if os.path.exists("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len)): 50 | train=pickle.load(open("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'rb'))#生成样本集 51 | valid=pickle.load(open("valid_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'rb')) 52 | print("数据:",train['X'].shape) 53 | return train['X'],train['Y'],valid['X'],valid['Y'] 54 | else: 55 | pro_data=pd.read_csv(inputdir) 56 | all_sample=[] 57 | for k1,k2 in pro_data.groupby(by=['hostname','series']): 58 | all_sample.append(k1) 59 | all_sample=all_sample[:sample_pro]#少搞点试试 60 | random.shuffle(all_sample) 61 | print('总采样点数:',len(all_sample))#19005 62 | train_all_sample=all_sample[:int(len(all_sample)*train_probility)] 63 | test_all_sample=list(filter(lambda x: x not in train_all_sample, all_sample)) 64 | print('训练样本',len(train_all_sample),'测试样本:',len(test_all_sample)) 65 | print('生成训练样本...') 66 | train_x,train_y=[],[] 67 | for id_ in tqdm(train_all_sample): 68 | x_i,y_i=getsingleGroup(pro_data,id_,src_len,tar_len,step)#一组样本 69 | train_x.extend(x_i) 70 | train_y.extend(y_i) 71 | 72 | with open("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'wb') as f: 73 | pickle.dump({'X':np.array(train_x),'Y':np.array(train_y)},f) 74 | 75 
| print('生成测试样本...') 76 | valid_x,valid_y=[],[] 77 | for id_ in tqdm(test_all_sample): 78 | x_i,y_i=getsingleGroup(pro_data,id_,src_len,tar_len,step)#一组样本 79 | valid_x.extend(x_i) 80 | valid_y.extend(y_i) 81 | 82 | with open("valid_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'wb') as f: 83 | pickle.dump({'X':np.array(valid_x),'Y':np.array(valid_y)},f) 84 | return np.array(train_x),np.array(train_y),np.array(valid_x),np.array(valid_y) -------------------------------------------------------------------------------- /time2graph/utils/deep_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.autograd import Variable 4 | import torch.nn.functional as F 5 | from torch.utils.data import DataLoader 6 | from .base_utils import Debugger, evaluate_performance 7 | 8 | 9 | def latent_loss(z_mean, z_std): 10 | mean_2 = z_mean * z_mean 11 | std_2 = z_std * z_std 12 | return 0.5 * torch.mean(mean_2 + std_2 - torch.log(std_2) - 1) 13 | 14 | 15 | class DeepDataloader(DataLoader): 16 | def __init__(self, *args, **kwargs): 17 | super(DeepDataloader, self).__init__(*args, **kwargs) 18 | 19 | 20 | class DeepDataset(object): 21 | def __init__(self, x, y): 22 | self.x = x 23 | self.y = y 24 | 25 | def __getitem__(self, item): 26 | return self.x[item], self.y[item] 27 | 28 | def __len__(self): 29 | return len(self.y) 30 | 31 | 32 | def train_RNNs(epoch, dataloader, rnn, criterion, optimizer, debug, gpu_enable): 33 | rnn.train() 34 | for i, (sequences, target) in enumerate(dataloader, 0): 35 | sequences = sequences.double() 36 | if gpu_enable: 37 | sequences = sequences.cuda() 38 | target = target.cuda() 39 | sequences = Variable(sequences) 40 | target = Variable(target) 41 | output = rnn(sequences) 42 | loss = criterion(output, target) 43 | optimizer.zero_grad() 44 | loss.backward() 45 | optimizer.step() 46 | 47 | if i % int(len(dataloader) / 10 + 1) == 0: 48 | Debugger.debug_print('[{}][{}][{}], Loss: {}'.format( 49 | epoch, i, len(dataloader), loss.item()), debug=debug) 50 | 51 | 52 | def train_VAE(epoch, dataloader, vae, criterion, optimizer, debug, gpu_enable): 53 | vae.train() 54 | for i, (sequences, target) in enumerate(dataloader, 0): 55 | optimizer.zero_grad() 56 | sequences = sequences.double() 57 | if gpu_enable: 58 | sequences = sequences.cuda() 59 | target = target.cuda() 60 | sequences = Variable(sequences) 61 | output = vae(sequences) 62 | loss = criterion(output, sequences) + latent_loss(vae.z_mean, vae.z_sigma) 63 | loss.backward() 64 | optimizer.step() 65 | Debugger.debug_print('[{}][{}][{}], Loss: {}'.format( 66 | epoch, i, len(dataloader), loss.item(), debug=debug)) 67 | 68 | 69 | def test_DeepModels(dataloader, rnn, criterion, debug, gpu_enable): 70 | for th in range(5, 20, 1): 71 | test_loss = 0 72 | correct = 0 73 | rnn.eval() 74 | y_pred, y_test = [], [] 75 | th = th / 20 76 | for i, (sequences, target) in enumerate(dataloader, 0): 77 | rnn.zero_grad() 78 | sequences = sequences.double() 79 | if gpu_enable: 80 | sequences = sequences.cuda() 81 | target = target.cuda() 82 | sequences = Variable(sequences) 83 | target = Variable(target) 84 | output = rnn(sequences) 85 | test_loss += criterion(output, target).item() 86 | pred = F.softmax(output, dim=1)[:, 1].data.cpu().numpy() 87 | tmp = np.zeros(len(pred)) 88 | tmp[pred >= th] = 1 89 | y_pred += list(tmp) 90 | y_test += list(target.cpu().numpy()) 91 | test_loss /= len(dataloader.dataset) 92 | y_pred, y_test = np.array(y_pred, 
dtype=np.int).reshape(-1), np.array(y_test, dtype=np.int).reshape(-1) 93 | accu, prec, recall, f1 = evaluate_performance(y_pred=y_pred, y_true=y_test) 94 | Debugger.info_print('res: accu {:.4f}, prec {:.4f}, recall {:.4f}, f1 {:.4f}'.format( 95 | accu, prec, recall, f1 96 | )) 97 | Debugger.debug_print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format( 98 | test_loss, correct, len(dataloader.dataset), 99 | 100. * correct / len(dataloader.dataset)), debug=debug) 100 | -------------------------------------------------------------------------------- /time2graph/utils/mp_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import itertools 3 | import sys 4 | import dill 5 | import contextlib 6 | import math 7 | from pathos.helpers import mp 8 | import numpy as np 9 | 10 | NJOBS = mp.cpu_count() 11 | if NJOBS >= 20: 12 | NJOBS = 20 13 | 14 | __all__ = [ 15 | 'NJOBS', 16 | 'ParMap', 17 | 'parallel_monitor' 18 | ] 19 | 20 | 21 | class ParMap(object): 22 | def __init__(self, work, monitor=None, njobs=NJOBS, maxtasksperchild=100): 23 | self.work_func = work 24 | self.monitor_func = monitor 25 | self.__njobs = njobs 26 | self.__mtpc = maxtasksperchild 27 | 28 | self.__pool = None 29 | 30 | def close(self): 31 | if self.__pool is not None: 32 | self.__pool.close() 33 | self.__pool.join() 34 | self.__pool = None 35 | 36 | def __del__(self): 37 | self.close() 38 | 39 | @property 40 | def njobs(self): 41 | return self.__njobs 42 | 43 | @njobs.setter 44 | def njobs(self, n): 45 | self.__njobs = n 46 | self.close() 47 | 48 | def default_chunk(self, dlen): 49 | return int(math.ceil(float(dlen) / self.njobs)) 50 | 51 | def run(self, data, chunk=None, shuffle=False): 52 | if chunk is None: 53 | chunk = self.default_chunk(len(data)) 54 | 55 | if shuffle: 56 | data, order, invorder = shuffle_sample(data) 57 | else: 58 | invorder = None 59 | 60 | slices = slice_sample(data, chunk=chunk) 61 | res = self.run_slices(slices) 62 | 63 | if shuffle: 64 | res = apply_order(res, invorder) 65 | 66 | return res 67 | 68 | def run_slices(self, slices): 69 | mgr = mp.Manager() 70 | report_queue = mgr.Queue() 71 | if self.monitor_func is not None: 72 | monitor = mp.Process(target=self.monitor_func, args=(report_queue,)) 73 | monitor.start() 74 | else: 75 | monitor = None 76 | 77 | if self.njobs == 1: 78 | res = [] 79 | for slc in slices: 80 | res.append(self.work_func(None, slc, report_queue)) 81 | else: 82 | dill_work_func = dill.dumps(self.work_func) 83 | with contextlib.closing(mp.Pool(self.njobs, maxtasksperchild=self.__mtpc)) as pool: 84 | res = pool.map(func_wrapper, [[dill_work_func, slc, report_queue] for slc in slices]) 85 | res = list(itertools.chain.from_iterable(res)) 86 | 87 | report_queue.put(StopIteration()) 88 | if monitor is not None: 89 | monitor.join() 90 | 91 | return res 92 | 93 | 94 | def func_wrapper(args): 95 | func = dill.loads(args[0]) 96 | return func(mp.current_process().ident, *args[1:]) 97 | 98 | 99 | def apply_order(sample, order): 100 | return [sample[o] for o in order] 101 | 102 | 103 | def shuffle_sample(sample): 104 | order = np.random.permutation(np.arange(len(sample))) 105 | invorder = np.zeros((len(sample), ), dtype='int32') 106 | invorder[order] = np.arange(len(sample)) 107 | 108 | return apply_order(sample, order), order, invorder 109 | 110 | 111 | def slice_sample(sample, chunk=None, nslice=None): 112 | slices = [] 113 | if chunk is None: 114 | chunk = int(len(sample) / nslice) 115 | 
else: 116 | if nslice is not None: 117 | raise RuntimeError("chunk ({}) and slice ({}) should not be specified simultaneously".format(chunk, nslice)) 118 | 119 | curstart = 0 120 | while True: 121 | if curstart >= len(sample): 122 | break 123 | slices.append(sample[curstart:min(curstart + chunk, len(sample))]) 124 | curstart += chunk 125 | 126 | return slices 127 | 128 | 129 | def parallel_monitor(msg, size, debug): 130 | def monitor(queue): 131 | cnt = 0 132 | while True: 133 | obj = queue.get() 134 | if isinstance(obj, StopIteration): 135 | break 136 | if isinstance(obj, int): 137 | if obj != 0: 138 | cnt += obj 139 | else: 140 | cnt += 1 141 | else: 142 | cnt += 1 143 | if debug: 144 | print('[debug]' + '{} executed by {:.2f}%'.format(msg, float(cnt) / size * 100) + '\r', end='') 145 | sys.stdout.flush() 146 | # Debugger.debug_print(msg='{} executed by {:.2f}%'.format(msg, float(cnt) / size * 100), 147 | # debug=debug) 148 | return monitor 149 | -------------------------------------------------------------------------------- /time2graph/utils/deep_models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | 8 | class LSTMClassifier(nn.Module): 9 | def __init__(self, data_size, hidden_size, output_size, 10 | dropout, hidden_dim=128, gpu_enable=False): 11 | super(LSTMClassifier, self).__init__() 12 | self.data_size = data_size 13 | self.hidden_size = hidden_size 14 | self.output_size = output_size 15 | self.gpu_enable = gpu_enable 16 | self.model = nn.LSTM(data_size, hidden_size, batch_first=True).double() 17 | self.hidden2out = nn.Sequential( 18 | nn.Linear(hidden_size, hidden_dim), 19 | nn.ReLU(), 20 | nn.Linear(hidden_dim, output_size) 21 | ) 22 | self.dropout = nn.Dropout(p=dropout) 23 | 24 | def init_hidden(self, batch_size): 25 | if self.gpu_enable: 26 | return ( 27 | Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()), 28 | Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()) 29 | ) 30 | else: 31 | return ( 32 | Variable(torch.zeros(1, batch_size, self.hidden_size).double()), 33 | Variable(torch.zeros(1, batch_size, self.hidden_size).double()) 34 | ) 35 | 36 | def forward(self, X): 37 | hidden = self.init_hidden(batch_size=len(X)) 38 | outputs, (h_n, c_n) = self.model(X.double(), hidden) 39 | # return self.softmax(self.hidden2out(outputs)) 40 | return self.hidden2out(h_n[0]) 41 | 42 | 43 | class GRUClassifier(nn.Module): 44 | def __init__(self, data_size, hidden_size, output_size, dropout, 45 | gpu_enable=False): 46 | super(GRUClassifier, self).__init__() 47 | self.data_size = data_size 48 | self.hidden_size = hidden_size 49 | self.output_size = output_size 50 | self.gpu_enable = gpu_enable 51 | self.model = nn.GRU(data_size, hidden_size, batch_first=True).double() 52 | self.hidden2out = nn.Linear(hidden_size, output_size) 53 | 54 | def init_hidden(self, batch_size): 55 | if self.gpu_enable: 56 | return Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()) 57 | else: 58 | return Variable(torch.zeros(1, batch_size, self.hidden_size).double()) 59 | 60 | def forward(self, X): 61 | hidden = self.init_hidden(batch_size=len(X)) 62 | outputs, (h_n, c_n) = self.model(X.double(), hidden) 63 | return self.hidden2out(h_n[0]) 64 | 65 | 66 | class EnDecoder(nn.Module): 67 | def __init__(self, D_in, H, D_out): 68 | super(EnDecoder, self).__init__() 69 | 
self.linear_1 = nn.Linear(D_in, H) 70 | self.linear_2 = nn.Linear(H, D_out) 71 | 72 | def forward(self, x): 73 | x = F.relu(self.linear_1(x)) 74 | return F.relu(self.linear_2(x)) 75 | 76 | 77 | class VAE(nn.Module): 78 | def __init__(self, encoder, decoder, encode_dim, latent_dim): 79 | super(VAE, self).__init__() 80 | self.encoder = encoder 81 | self.decoder = decoder 82 | self.encode_dim = encode_dim 83 | self.latent_dim = latent_dim 84 | self.__enc_mu = nn.Linear(encode_dim, latent_dim) 85 | self.__enc_log_sigma = nn.Linear(encode_dim, latent_dim) 86 | 87 | def __sample_latent(self, h_enc): 88 | mu = self.__enc_mu(h_enc) 89 | log_sigma = self.__enc_log_sigma(h_enc) 90 | sigma = torch.exp(log_sigma) 91 | std_z = torch.from_numpy(np.random.normal(0, 1, size=sigma.size())).double() 92 | self.z_mean = mu 93 | self.z_sigma = sigma 94 | return mu + sigma * Variable(std_z, requires_grad=False) 95 | 96 | def forward(self, state): 97 | h_enc = self.encoder(state) 98 | z = self.__sample_latent(h_enc=h_enc) 99 | return self.decoder(z) 100 | 101 | 102 | class MLP(nn.Module): 103 | def __init__(self, data_size, hidden_size, output_size, n_class=2): 104 | super(MLP, self).__init__() 105 | self.data_size = data_size 106 | self.hidden_size = hidden_size 107 | self.output_size = output_size 108 | self.hidden_layer = nn.Linear(data_size, hidden_size) 109 | self.output_layer = nn.Linear(hidden_size, output_size) 110 | self.out = nn.Linear(output_size, n_class) 111 | 112 | def forward(self, x): 113 | x = x.view(self.batch_size, self.data_size) 114 | return self.out(F.relu(self.output_layer(F.relu(self.hidden_layer(x))))) 115 | -------------------------------------------------------------------------------- /time2graph/core/distance_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def greedy_dtw_path(x, y, warp, dist=lambda x, y: np.linalg.norm(x - y)): 5 | if np.ndim(x) == 1: 6 | x = x.reshape(-1, 1) 7 | if np.ndim(y) == 1: 8 | y = y.reshape(-1, 1) 9 | nrows, ncols = x.shape[0], y.shape[0] 10 | ridx, cidx, rpath, cpath = 0, 0, [0], [0] 11 | while ridx < nrows - 1 and cidx < ncols - 1: 12 | rdist = dist(x[ridx + 1], y[cidx]) 13 | cdist = dist(x[ridx], y[cidx + 1]) 14 | ddist = dist(x[ridx + 1], y[cidx + 1]) 15 | if ddist < rdist and ddist < cdist: 16 | ridx += 1 17 | cidx += 1 18 | elif rdist < cdist: 19 | if ridx < cidx + warp: 20 | ridx += 1 21 | else: 22 | cidx += 1 23 | else: 24 | if cidx < ridx + warp: 25 | cidx += 1 26 | else: 27 | ridx += 1 28 | rpath.append(ridx) 29 | cpath.append(cidx) 30 | for k in range(ridx + 1, nrows): 31 | rpath.append(k) 32 | cpath.append(ncols - 1) 33 | for k in range(cidx + 1, ncols): 34 | cpath.append(k) 35 | rpath.append(nrows - 1) 36 | return np.array(rpath), np.array(cpath) 37 | 38 | 39 | def parameterized_gdtw_npy(x, y, w, warp, dist=lambda x, y: np.linalg.norm(x - y)): 40 | if np.ndim(x) == 1: 41 | x = x.reshape(-1, 1) 42 | if np.ndim(y) == 1: 43 | y = y.reshape(-1, 1) 44 | dpath = greedy_dtw_path(x=x, y=y, dist=dist, warp=warp) 45 | return dist((x * np.abs(w).reshape(len(w), -1))[dpath[0]], y[dpath[1]]) 46 | 47 | 48 | def expand_array(y, warp): 49 | size = y.shape[0] 50 | tmp_y = np.concatenate((y[size - warp: size, :], y, y[: warp, :]), axis=0) 51 | return np.array([tmp_y[k: (k+2 * warp + 1)] for k in range(size)], dtype=np.float32) 52 | 53 | 54 | def softmax(x): 55 | """Compute softmax values for each sets of scores in x.""" 56 | # def __errcall(type, msg): 57 | # 
Debugger.info_print(msg='RuntimeWarning: {}-{}, {}'.format(type, msg, x)) 58 | # np.seterrcall(__errcall) 59 | # np.seterr(divide='call') 60 | x -= np.max(x) 61 | return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True) 62 | 63 | 64 | def softmax_1d(x): 65 | x -= np.max(x) 66 | return np.exp(x) / np.sum(np.exp(x), keepdims=True) 67 | 68 | 69 | def parameterized_gw_npy(x, y, w, warp): 70 | distance = np.sum((x.reshape(x.shape[0], -1, x.shape[1]) - expand_array(y=y, warp=warp)) ** 2, 71 | axis=1) 72 | ''' 73 | TODO 74 | 这里是可以改进的点,原文通过求离shapelet最近的片段的距离来代表当前shapelet和序列的距离 75 | ''' 76 | 77 | softmin_distance = np.sum(softmax(-distance.astype(np.float64)).astype(np.float32) * distance, 78 | axis=1) 79 | return np.sqrt(np.sum(softmin_distance * np.abs(w))) 80 | 81 | 82 | def pattern_distance_time_aware(pattern, time_series, local_factor, global_factor, warp, 83 | init, measurement): 84 | if measurement == 'gw': 85 | dist = parameterized_gw_npy 86 | elif measurement == 'gdtw': 87 | dist = parameterized_gdtw_npy 88 | else: 89 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 90 | num_segment = int(time_series.shape[0] / pattern.shape[0]) 91 | seg_length = pattern.shape[0] 92 | assert init + num_segment <= len(global_factor) 93 | time_series = time_series.reshape(num_segment, seg_length, -1) 94 | ret = np.zeros(num_segment, np.float32).reshape(-1) 95 | for k in range(num_segment): 96 | ret[k] = dist(x=pattern, y=time_series[k], w=local_factor, warp=warp) 97 | return np.sum(softmax_1d(-ret * np.abs(global_factor[init: init + num_segment])) 98 | * ret * np.abs(global_factor[init: init + num_segment])) 99 | 100 | 101 | def pattern_distance_no_timing(pattern, time_series, warp, measurement): 102 | if measurement == 'gw': 103 | dist = parameterized_gw_npy 104 | elif measurement == 'gdtw': 105 | dist = parameterized_gdtw_npy 106 | else: 107 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 108 | num_segment = int(time_series.shape[0] / pattern.shape[0]) 109 | seg_length = pattern.shape[0] 110 | w = np.ones(seg_length, dtype=np.float32).reshape(-1) 111 | assert time_series.shape[0] == num_segment * pattern.shape[0] 112 | time_series = time_series.reshape(num_segment, pattern.shape[0], -1) 113 | ret = np.zeros(num_segment, np.float32).reshape(-1) 114 | for k in range(num_segment): 115 | ret[k] = dist(x=pattern, y=time_series[k], w=w, warp=warp) 116 | return np.sum(softmax_1d(-ret) * ret) 117 | -------------------------------------------------------------------------------- /time2graph/core/Optimize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 4 15:53:35 2021 4 | 5 | @author: Administrator 6 | """ 7 | import torch 8 | from collections import OrderedDict 9 | from torch.optim import Optimizer 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from typing import Callable, Iterable, Tuple 12 | import math 13 | import numpy as np 14 | import torch.nn as nn 15 | 16 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 17 | """ 18 | Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after 19 | a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. 20 | Args: 21 | optimizer (:class:`~torch.optim.Optimizer`): 22 | The optimizer for which to schedule the learning rate. 
23 |         num_warmup_steps (:obj:`int`):
24 |             The number of steps for the warmup phase.
25 |         num_training_steps (:obj:`int`):
26 |             The total number of training steps.
27 |         last_epoch (:obj:`int`, `optional`, defaults to -1):
28 |             The index of the last epoch when resuming training.
29 |     Return:
30 |         :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
31 |     """
32 |
33 |     def lr_lambda(current_step: int):
34 |         if current_step < num_warmup_steps:
35 |             return float(current_step) / float(max(1, num_warmup_steps))
36 |         return max(
37 |             0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
38 |         )
39 |
40 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
41 |
42 | class AdamW(Optimizer):
43 |     """
44 |     Implements Adam algorithm with weight decay fix as introduced in `Decoupled Weight Decay Regularization
45 |     <https://arxiv.org/abs/1711.05101>`__.
46 |     Parameters:
47 |         params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
48 |             Iterable of parameters to optimize or dictionaries defining parameter groups.
49 |         lr (:obj:`float`, `optional`, defaults to 1e-3):
50 |             The learning rate to use.
51 |         betas (:obj:`Tuple[float,float]`, `optional`, defaults to (0.9, 0.999)):
52 |             Adam's betas parameters (b1, b2).
53 |         eps (:obj:`float`, `optional`, defaults to 1e-6):
54 |             Adam's epsilon for numerical stability.
55 |         weight_decay (:obj:`float`, `optional`, defaults to 0):
56 |             Decoupled weight decay to apply.
57 |         correct_bias (:obj:`bool`, `optional`, defaults to `True`):
58 |             Whether or not to correct bias in Adam (for instance, in Bert TF repository they use :obj:`False`).
59 |     """
60 |
61 |     def __init__(
62 |         self,
63 |         params: Iterable[torch.nn.parameter.Parameter],
64 |         lr: float = 1e-3,
65 |         betas: Tuple[float, float] = (0.9, 0.999),
66 |         eps: float = 1e-6,
67 |         weight_decay: float = 0.0,
68 |         correct_bias: bool = True,
69 |     ):
70 |         if lr < 0.0:
71 |             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
72 |         if not 0.0 <= betas[0] < 1.0:
73 |             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
74 |         if not 0.0 <= betas[1] < 1.0:
75 |             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
76 |         if not 0.0 <= eps:
77 |             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
78 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
79 |         super().__init__(params, defaults)
80 |
81 |     def step(self, closure: Callable = None):
82 |         """
83 |         Performs a single optimization step.
84 |         Arguments:
85 |             closure (:obj:`Callable`, `optional`): A closure that reevaluates the model and returns the loss.
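        Return:
            The loss re-evaluated by ``closure`` when one is supplied; otherwise ``None``.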
86 | """ 87 | loss = None 88 | if closure is not None: 89 | loss = closure() 90 | 91 | for group in self.param_groups: 92 | for p in group["params"]: 93 | if p.grad is None: 94 | continue 95 | grad = p.grad.data 96 | if grad.is_sparse: 97 | raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") 98 | 99 | state = self.state[p] 100 | 101 | # State initialization 102 | if len(state) == 0: 103 | state["step"] = 0 104 | # Exponential moving average of gradient values 105 | state["exp_avg"] = torch.zeros_like(p.data) 106 | # Exponential moving average of squared gradient values 107 | state["exp_avg_sq"] = torch.zeros_like(p.data) 108 | 109 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 110 | beta1, beta2 = group["betas"] 111 | 112 | state["step"] += 1 113 | 114 | # Decay the first and second moment running average coefficient 115 | # In-place operations to update the averages at the same time 116 | exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1) 117 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) 118 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 119 | 120 | step_size = group["lr"] 121 | if group["correct_bias"]: # No bias correction for Bert 122 | bias_correction1 = 1.0 - beta1 ** state["step"] 123 | bias_correction2 = 1.0 - beta2 ** state["step"] 124 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 125 | 126 | p.data.addcdiv_(exp_avg, denom, value=-step_size) 127 | 128 | # Just adding the square of the weights to the loss function is *not* 129 | # the correct way of using L2 regularization/weight decay with Adam, 130 | # since that will interact with the m and v parameters in strange ways. 131 | # 132 | # Instead we want to decay the weights in a manner that doesn't interact 133 | # with the m/v parameters. This is equivalent to adding the square 134 | # of the weights to the loss with plain (non-momentum) SGD. 
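                # As a reading aid (a sketch in update-rule form, using the variable names above):
                #     p <- p - step_size * exp_avg / (sqrt(exp_avg_sq) + eps)    # Adam step, applied above
                #     p <- p - lr * weight_decay * p                             # decoupled weight decay, applied below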
135 | # Add weight decay at the end (fixed version) 136 | if group["weight_decay"] > 0.0: 137 | p.data.add_(p.data, alpha=-group["lr"] * group["weight_decay"]) 138 | 139 | return loss 140 | -------------------------------------------------------------------------------- /time2graph/core/shapelet_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .shapelet_utils import * 3 | embed_number = 5 4 | 5 | 6 | def time_series_embeds_factory__(embed_size, embeddings, threshold, 7 | multi_graph, debug, mode): 8 | def __concate__(pid, args, queue): 9 | ret = [] 10 | for sdist in args: 11 | tmp = np.zeros(len(sdist) * embed_size * embed_number, dtype=np.float32).reshape(-1) 12 | for sidx in range(len(sdist)): 13 | dist = sdist[sidx, :] 14 | target = np.argsort(np.argwhere(dist <= threshold).reshape(-1))[:embed_number] 15 | if len(target) == 0: 16 | continue 17 | weight = 1.0 - minmax_scale(dist[target]) 18 | if np.sum(weight) == 0: 19 | Debugger.warn_print(msg='dist {}, weight {}'.format(dist, weight), debug=debug) 20 | else: 21 | weight /= np.sum(weight) 22 | target_number = len(weight) 23 | for k in range(target_number): 24 | src, dst = (sidx * embed_number + k) * embed_size, (sidx * embed_number + k + 1) * embed_size 25 | if multi_graph: 26 | if sidx == 0: 27 | tmp[src: dst] = weight[k] * embeddings[sidx, target[k]].reshape(-1) 28 | elif sidx == len(sdist) - 1: 29 | tmp[src: dst] = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 30 | else: 31 | former = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 32 | latter = weight[k] * embeddings[sidx, target[k]].reshape(-1) 33 | tmp[src: dst] = (former + latter) 34 | else: 35 | tmp[src: dst] = weight[k] * embeddings[0, target[k]].reshape(-1) 36 | ret.append(tmp) 37 | queue.put(0) 38 | return ret 39 | 40 | def __aggregate__(pid, args, queue): 41 | ret = [] 42 | for sdist in args: 43 | tmp = np.zeros(len(sdist) * embed_size, dtype=np.float32).reshape(-1) 44 | for sidx in range(len(sdist)): 45 | dist = sdist[sidx, :] 46 | target = np.argsort(np.argwhere(dist <= threshold).reshape(-1))[:embed_number] 47 | if len(target) == 0: 48 | continue 49 | weight = 1.0 - minmax_scale(dist[target]) 50 | if np.sum(weight) == 0: 51 | Debugger.warn_print(msg='dist {}, weight {}'.format(dist, weight), debug=debug) 52 | else: 53 | weight /= np.sum(weight) 54 | src, dst = sidx * embed_size, (sidx + 1) * embed_size 55 | for k in range(len(weight)): 56 | if multi_graph: 57 | if sidx == 0: 58 | tmp[src: dst] += weight[k] * embeddings[sidx, target[k]].reshape(-1) 59 | elif sidx == len(sdist) - 1: 60 | tmp[src: dst] += weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 61 | else: 62 | former = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 63 | latter = weight[k] * embeddings[sidx, target[k]].reshape(-1) 64 | tmp[src: dst] += (former + latter) 65 | else: 66 | tmp[src: dst] += weight[k] * embeddings[0, target[k]].reshape(-1) 67 | ret.append(tmp) 68 | queue.put(0) 69 | return ret 70 | 71 | if mode == 'concate': 72 | return __concate__ 73 | elif mode == 'aggregate': 74 | return __aggregate__ 75 | else: 76 | raise NotImplementedError('unsupported mode {}'.format(mode)) 77 | 78 | 79 | class ShapeletEmbedding(object): 80 | def __init__(self, seg_length, tflag, multi_graph, cache_dir, 81 | percentile, tanh, debug, measurement, mode, 82 | **deepwalk_args): 83 | self.seg_length = seg_length 84 | self.tflag = tflag 85 | self.multi_graph = multi_graph 86 | self.cache_dir = cache_dir 87 
| self.tanh = tanh 88 | self.debug = debug 89 | self.percentile = percentile 90 | self.dist_threshold = -1 91 | self.measurement = measurement 92 | self.mode = mode 93 | self.deepwalk_args = deepwalk_args 94 | self.embed_size = self.deepwalk_args.get('representation_size', 256) 95 | self.embeddings = None 96 | 97 | def fit(self, time_series_set, shapelets, warp, init=0): 98 | Debugger.info_print('fit shape: {}'.format(time_series_set.shape)) 99 | tmat, sdist, dist_threshold = transition_matrix( 100 | time_series_set=time_series_set, shapelets=shapelets, seg_length=self.seg_length, 101 | tflag=self.tflag, multi_graph=self.multi_graph, tanh=self.tanh, debug=self.debug, 102 | init=init, warp=warp, percentile=self.percentile, threshold=self.dist_threshold, 103 | measurement=self.measurement) 104 | self.dist_threshold = dist_threshold 105 | self.embeddings = graph_embedding( 106 | tmat=tmat, num_shapelet=len(shapelets), embed_size=self.embed_size, 107 | cache_dir=self.cache_dir, **self.deepwalk_args) 108 | 109 | def time_series_embedding(self, time_series_set, shapelets, warp, init=0): 110 | if self.embeddings is None: 111 | self.fit(time_series_set=time_series_set, shapelets=shapelets, warp=warp) 112 | sdist = shapelet_distance(time_series_set=time_series_set, shapelets=shapelets, 113 | seg_length=self.seg_length, tflag=self.tflag, tanh=self.tanh, 114 | debug=self.debug, init=init, warp=warp, 115 | measurement=self.measurement) 116 | Debugger.info_print('embedding threshold {}'.format(self.dist_threshold)) 117 | Debugger.info_print('sdist size {}'.format(sdist.shape)) 118 | parmap = ParMap( 119 | work=time_series_embeds_factory__( 120 | embed_size=self.embed_size, embeddings=self.embeddings, threshold=self.dist_threshold, 121 | multi_graph=self.multi_graph, debug=self.debug, mode=self.mode), 122 | monitor=parallel_monitor(msg='time series embedding', size=sdist.shape[0], debug=self.debug), 123 | njobs=NJOBS 124 | ) 125 | if self.mode == 'concate': 126 | size = sdist.shape[1] * self.embed_size * embed_number 127 | elif self.mode == 'aggregate': 128 | size = sdist.shape[1] * self.embed_size 129 | else: 130 | raise NotImplementedError('unsupported mode {}'.format(self.mode)) 131 | return np.array(parmap.run(data=list(sdist)), dtype=np.float32).reshape(sdist.shape[0], size) 132 | -------------------------------------------------------------------------------- /my_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Oct 24 18:12:54 2021 4 | 5 | @author: Administrator 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Mon Oct 11 09:25:35 2021 11 | 用存好的训练数据测试数据测试模型 12 | @author: Administrator 13 | """ 14 | 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Variable 19 | import torch.nn as nn 20 | import pandas as pd 21 | import numpy as np 22 | import os 23 | from os import path, cpu_count 24 | import math 25 | from tqdm import tqdm 26 | import random 27 | from time2graph.core.model_gin import Flow2Graph 28 | from time2graph.utils.gat import GAT, accuracy_torch 29 | from pathos.helpers import mp 30 | import logging 31 | 32 | from pathos.helpers import mp 33 | import pickle 34 | import warnings 35 | warnings.filterwarnings("ignore") 36 | from torch.nn import MSELoss 37 | logger = logging.getLogger(__name__) 38 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 39 | datefmt='%m/%d/%Y %H:%M:%S', 40 | level=logging.INFO) 41 | 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42 | #############model########################
43 |
44 | def time2graphnet(K=30):
45 |     '''
46 |     K: number of shapelets to learn
47 |     seg_length: length of each sub-segment
48 |     num_segment: number of segments a series is split into
49 |     gpu_enable: whether to use the GPU
50 |     optimizer: optimization method used while learning shapelets
51 |     device: which device (GPU card) to use
52 |     dropout
53 |     lk_relu
54 |     data_size: feature dimension of the time series
55 |     softmax: whether to apply softmax when computing distances
56 |     percentile: distance threshold (a percentile) used in graph construction; the p% of edges with the smallest weights are dropped
57 |     dataset: name of the dataset
58 |     append: whether to append the raw segment itself when computing features
59 |     diff: whether to take first-order differences
60 |
61 |     '''
62 |     general_options = {
63 |         'init': 0,
64 |         'warp': 2,
65 |         'tflag': True,
66 |         'mode': 'embedding',
67 |         'candidate_method': 'greedy'
68 |
69 |     }
70 |     model = Flow2Graph(
71 |         K, seg_length=10, num_segment=20, gpu_enable=False, optimizer='Adam', device=device, dropout=0.2, lk_relu=0.2, data_size=7,
72 |         softmax=False, percentile=10,dataset='Unspecified', append=False, sort=False, feat_flag=True,
73 |         feat_norm=True, aggregate=True, standard_scale=False, diff=False,reg=True,**general_options
74 |     )
75 |     return model
76 | #########ourmodel###############
77 |
78 |
79 |
80 | #########ourmodel ending###############
81 |
82 | def getsingleGroup(pro_data,group,src_len,tar_len,step):
83 |     '''
84 |     pro_data: the full data frame
85 |     group: key of a single group
86 |     generate sequences for a single group
87 |
88 |     '''
89 |     curent_df=pro_data.loc[(pro_data['hostname']==group[0]) & (pro_data['series']==group[1])]
90 |     tw=src_len+tar_len#total sampling window size: the front part is X, the Mean of the remaining part is Y
91 |     step=step
92 |     X=[]
93 |     Y=[]
94 |
95 |     L=len(curent_df)
96 |     #sort by time
97 |     curent_df['time'] = pd.to_datetime(curent_df['time_window'])
98 |     curent_df.sort_values('time', inplace=True)
99 |     useful_column=[ 'Mean', 'SD', 'Open', 'High','Low', 'Close', 'Volume']#select feature columns
100 |
101 |     for i in range(0,L-tw,step):
102 |         # train_seq = df_tmp[features].values[i:i+tw]
103 |         if i>L-tw:#handle the tail of the series
104 |             train_seq =curent_df[-tw:][useful_column]
105 |             X.append(train_seq.values[:-src_len])
106 |             Y.append(train_seq[-src_len:]['Mean'].values)
107 |             break
108 |         train_seq =curent_df[i:i+tw][useful_column]#
109 |         X.append(train_seq.values[:src_len])
110 |         Y.append(train_seq[src_len:]['Mean'].values)
111 |
112 |
113 |         if len(X)>100:#limit memory usage
114 |             X=X[-50:]
115 |             Y=Y[-50:]
116 |             break
117 |     return np.array(X),np.array(Y)
118 |
119 | def get_dataset(inputdir,src_len,tar_len,step=5,train_probility=0.8,sample_pro=10000):
120 |
121 |     if os.path.exists("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len)):
122 |         train=pickle.load(open("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'rb'))#[:10000,:,:]#the generated sample set
123 |         valid=pickle.load(open("valid_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'rb'))#[:1000]
124 |         print("data:",type(train['X']),train['Y'].shape)
125 |
126 |         return train['X'][:sample_pro*3],train['Y'][:sample_pro*3],valid['X'][:int(sample_pro*0.1)],valid['Y'][:int(sample_pro*0.1)]
127 |     else:
128 |         pro_data=pd.read_csv(inputdir+'above1900_data.csv')
129 |         all_sample=[]
130 |         for k1,k2 in pro_data.groupby(by=['hostname','series']):
131 |             all_sample.append(k1)
132 |         all_sample=all_sample[:sample_pro]#use fewer groups for a quick trial
133 |         random.shuffle(all_sample)
134 |         print('total number of sampled groups:',len(all_sample))#19005
135 |         train_all_sample=all_sample[:int(len(all_sample)*train_probility)]
136 |         test_all_sample=list(filter(lambda x: x not in train_all_sample, all_sample))
137 |         print('training samples',len(train_all_sample),'test samples:',len(test_all_sample))
138 |         print('generating training samples...')
139 |         train_x,train_y=[],[]
140 |         for id_ in tqdm(train_all_sample):
141 |             x_i,y_i=getsingleGroup(pro_data,id_,src_len,tar_len,step)#one group of samples
142
train_x.extend(x_i) 143 | train_y.extend(y_i) 144 | 145 | with open("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'wb') as f: 146 | pickle.dump({'X':np.array(train_x),'Y':np.array(train_y)},f) 147 | 148 | print('生成测试样本...') 149 | valid_x,valid_y=[],[] 150 | for id_ in tqdm(test_all_sample): 151 | x_i,y_i=getsingleGroup(pro_data,id_,src_len,tar_len,step)#一组样本 152 | valid_x.extend(x_i) 153 | valid_y.extend(y_i) 154 | 155 | with open("valid_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'wb') as f: 156 | pickle.dump({'X':np.array(valid_x),'Y':np.array(valid_y)},f) 157 | return np.array(train_x),np.array(train_y),np.array(valid_x),np.array(valid_y) 158 | 159 | def get_label_forshapelets(y): 160 | ''' 161 | if y_n+1-y_n >0 : 162 | label=1 163 | else: 164 | label=0 165 | y 10000,24 166 | ''' 167 | #简单的使用差分,后续上升趋势为1,下降为0 168 | y2=((y[:,1]-y[:,0])>0).astype(int) 169 | return y2 170 | 171 | 172 | def main(Istest): 173 | hidden_size = 512 174 | embed_size = 7#输入X的 175 | de_size=24*1#输出的序列长度也就是要预测未来多少个小时的 176 | en_size=200 #输入的序列长度,采样窗口的长度(采好样的那个就是200) 177 | epoch=10#训练轮数 178 | train_batch_size=512#训练集一批的大小 179 | K=70 180 | model = time2graphnet(K) 181 | inputdir='../' 182 | train_x,train_y,valid_x,valid_y_1=get_dataset(inputdir,en_size,de_size)#拿到所有的序列 183 | # print(np.isnan(train_x).all()) 184 | # print(np.isnan(train_y).all()) 185 | # print(np.isnan(valid_x).all()) 186 | # print(np.isnan(valid_y).all()) 187 | print(train_x.shape) 188 | #数据归一化 189 | train_x=(train_x-train_x.mean())/train_x.std() 190 | train_y=(train_y-train_y.mean())/train_y.std() 191 | # #进行数据归一化处理 192 | valid_x=(valid_x-valid_x.mean())/valid_x.std() 193 | valid_y=(valid_y_1-valid_y_1.mean())/valid_y_1.std() 194 | all_X=np.vstack((train_x,valid_x)) 195 | for_rescale=(valid_x.mean(),valid_x.std(),valid_y_1) 196 | print("“*****",for_rescale[0],for_rescale[1]) 197 | model.data_size = embed_size 198 | shapelets_path = './cache/shapelets_%d_%d.cache'%(K,de_size/24) 199 | if path.isfile(shapelets_path): 200 | model.load_shapelets(shapelets_path) 201 | print('shapelets加载完成') 202 | else: 203 | print('开始提取shapelets ...') 204 | model.learn_shapelets(all_X, get_label_forshapelets(np.vstack((train_y,valid_y))), 20, 7) 205 | model.save_shapelets(shapelets_path) 206 | print('shapelets已保存') 207 | s=open('Flim_flow2graph_traing_log_shapelets_num%d_%d_%d.txt'% (K,en_size,de_size),'w') 208 | model.fit(for_rescale,train_x, train_y,valid_x,valid_y,get_label_forshapelets,epoch=epoch,train_batch_size=train_batch_size,de_size=de_size,logprintfile=s) 209 | s.close() 210 | 211 | logger.info("训练结束 "+"*"*20) 212 | 213 | if __name__ == "__main__": 214 | main(False) 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /time2graph/core/model_embeds.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pickle 4 | import torch 5 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.preprocessing import normalize 8 | from .time_aware_shapelets import learn_time_aware_shapelets 9 | from .static_shapelets import learn_static_shapelets 10 | from .shapelet_embedding import ShapeletEmbedding 11 | from ..utils.base_utils import ModelUtils, Debugger 12 | 13 | 14 | class Time2GraphEmbed(ModelUtils): 15 | """ 16 | Time2Graph model 17 | Hyper-parameters: 18 | K: number of learned 
shapelets 19 | C: number of candidates 20 | A: number of shapelets assigned to each segment 21 | tflag: timing flag 22 | opt_metric: optimal metric using in outside-classifier 23 | """ 24 | def __init__(self, kernel, K=100, C=1000, seg_length=30, warp=2, tflag=True, 25 | gpu_enable=True, percentile=15, opt_metric='f1', mode='aggregate', 26 | batch_size=100, **kwargs): 27 | super(Time2GraphEmbed, self).__init__(kernel=kernel, **kwargs) 28 | self.K = K 29 | self.C = C 30 | self.seg_length = seg_length 31 | self.warp = warp 32 | self.tflag = tflag 33 | self.opt_metric = opt_metric 34 | self.mode = mode 35 | self.batch_size = batch_size 36 | self.gpu_enable = gpu_enable 37 | self.percentile = percentile 38 | self.shapelets = None 39 | self.sembeds = None 40 | self.clf = None 41 | self.lr = kwargs.pop('lr', 1e-2) 42 | self.p = kwargs.pop('p', 2) 43 | self.alpha = kwargs.pop('alpha', 10.0) 44 | self.beta = kwargs.pop('beta', 5.0) 45 | self.multi_graph = kwargs.pop('multi_graph', False) 46 | self.debug = kwargs.pop('debug', True) 47 | self.measurement = kwargs.pop('measurement', 'gdtw') 48 | self.kwargs = kwargs 49 | Debugger.info_print('initialize t2g model with {}'.format(self.__dict__)) 50 | 51 | def learn_shapelets(self, x, y, num_segment, data_size, num_batch): 52 | assert x.shape[1] == num_segment * self.seg_length 53 | if self.tflag: 54 | self.shapelets = learn_time_aware_shapelets( 55 | time_series_set=x, label=y, K=self.K, C=self.C, p=self.p, 56 | num_segment=num_segment, seg_length=self.seg_length, data_size=data_size, 57 | lr=self.lr, alpha=self.alpha, beta=self.beta, num_batch=num_batch, 58 | measurement=self.measurement, gpu_enable=self.gpu_enable, **self.kwargs) 59 | else: 60 | self.shapelets = learn_static_shapelets(time_series_set=x, label=y, K=self.K, C=self.C, 61 | warp=self.warp, num_segment=num_segment, seg_length=self.seg_length, measurement=self.measurement, **self.kwargs) 62 | 63 | def fit_embedding_model(self, x, y, cache_dir, init=0): 64 | assert self.shapelets is not None, 'shapelets has not been learnt yet' 65 | self.sembeds = ShapeletEmbedding( 66 | seg_length=self.seg_length, tflag=self.tflag, multi_graph=self.multi_graph, 67 | cache_dir=cache_dir, tanh=self.kwargs.get('tanh', False), debug=self.debug, 68 | percentile=self.percentile, measurement=self.measurement, mode=self.mode, 69 | **self.kwargs) 70 | self.sembeds.fit(time_series_set=x[np.argwhere(y == 0).reshape(-1), :, :], 71 | shapelets=self.shapelets, warp=self.warp, init=init) 72 | 73 | def embed(self, x, init=0): 74 | assert self.sembeds is not None, 'shapelet-embedding model has not been learnt yet' 75 | return self.sembeds.time_series_embedding( 76 | time_series_set=x, shapelets=self.shapelets, warp=self.warp, init=init) 77 | 78 | def set_deepwalk_args(self, **dw_args): 79 | for key, val in dw_args.items(): 80 | self.kwargs[key] = val 81 | 82 | def fit(self, x, y, n_splits=5, init=0, reset=True, balanced=True, norm=False, cache_dir='.'): 83 | num_segment = int(x.shape[1] / self.seg_length) 84 | data_size = x.shape[-1] 85 | if reset or self.shapelets is None: 86 | self.learn_shapelets( 87 | x=x, y=y, num_segment=num_segment, data_size=data_size, num_batch=x.shape[0] // self.batch_size) 88 | if reset or self.sembeds is None: 89 | Debugger.info_print('fit embedding model...') 90 | self.fit_embedding_model(x=x, y=y, cache_dir=cache_dir, init=init) 91 | max_clf_args, max_metric, clf = None, -1, self.clf__() 92 | embeds = self.sembeds.time_series_embedding( 93 | time_series_set=x, shapelets=self.shapelets, 94 | 
warp=self.warp, init=init) 95 | if norm: 96 | embeds = normalize(embeds, axis=0) 97 | Debugger.info_print('{} paras to be tuned'.format(self.para_len(balanced=balanced))) 98 | arguments = self.clf_paras(balanced=balanced) 99 | arg_size, cnt = self.para_len(balanced=balanced), 0.0 100 | metric_method = self.return_metric_method(opt_metric=self.opt_metric) 101 | Debugger.info_print('running parameter tuning for fit...') 102 | max_accu, max_prec, max_recall, max_f1 = -1, -1, -1, -1 103 | __max_clf_model = './.clf_model.pickle' 104 | for args in arguments: 105 | clf.set_params(**args) 106 | Debugger.debug_print(msg='{:.2f}% inner args tuned; args: {}'.format(cnt * 100.0 / arg_size, args), 107 | debug=self.debug) 108 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True) 109 | tmp, accu, prec, recall, f1 = 0, 0, 0, 0, 0 110 | for train_idx, test_idx in skf.split(embeds, y): 111 | clf.fit(embeds[train_idx], y[train_idx]) 112 | y_pred = clf.predict(embeds[test_idx]) 113 | tmp += metric_method(y_true=y[test_idx], y_pred=y_pred) 114 | accu += accuracy_score(y_true=y[test_idx], y_pred=y_pred) 115 | prec += precision_score(y_true=y[test_idx], y_pred=y_pred) 116 | recall += recall_score(y_true=y[test_idx], y_pred=y_pred) 117 | f1 += f1_score(y_true=y[test_idx], y_pred=y_pred) 118 | tmp /= n_splits 119 | accu /= n_splits 120 | prec /= n_splits 121 | recall /= n_splits 122 | f1 /= n_splits 123 | if max_metric < tmp: 124 | max_metric = tmp 125 | max_clf_args = args 126 | max_accu, max_prec, max_recall, max_f1 = accu, prec, recall, f1 127 | pickle.dump(clf, open(__max_clf_model, 'wb')) 128 | cnt += 1.0 129 | Debugger.info_print('args {} for clf {}-{}, performance: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format( 130 | max_clf_args, self.kernel, self.opt_metric, max_accu, max_prec, max_recall, max_f1)) 131 | self.clf = { 132 | 'clf': pickle.load(open(__max_clf_model, 'rb')), 133 | 'clf-args': max_clf_args, 134 | } 135 | 136 | def predict(self, x, norm=False): 137 | assert self.shapelets is not None, 'shapelets has not been learnt yet...' 138 | assert self.clf is not 'classifier has not been learnt yet...' 
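        # Note: the assert above compares `self.clf` with a string literal via `is not`,
        # so it never fails in practice; the intent is presumably
        # `assert self.clf is not None, 'classifier has not been learnt yet...'`.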
139 | if norm: 140 | embeds = normalize(self.embed(x=x), axis=0) 141 | else: 142 | embeds = self.embed(x=x) 143 | return self.clf['clf'].predict(embeds) 144 | 145 | def save_model(self, fpath): 146 | ret = {} 147 | for key, val in self.__dict__.items(): 148 | if key != 'xgb': 149 | ret[key] = val 150 | self.clf['clf'].save_model('{}.xgboost'.format(fpath)) 151 | torch.save(ret, fpath) 152 | 153 | def load_model(self, fpath, map_location='cuda:0'): 154 | # @TODO: specify map_location 155 | cache = torch.load(fpath, map_location=map_location) 156 | for key, val in cache.items(): 157 | self.__dict__[key] = val 158 | self.clf['clf'].load_model('{}.xgboost'.format(fpath)) 159 | 160 | def save_shapelets(self, fpath): 161 | torch.save(self.shapelets, fpath) 162 | 163 | def load_shapelets(self, fpath, map_location='cuda:0'): 164 | self.shapelets = torch.load(fpath, map_location=map_location) 165 | -------------------------------------------------------------------------------- /time2graph/core/shapelet_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.cluster import KMeans 3 | from sklearn.preprocessing import minmax_scale 4 | from .distance_utils import * 5 | from ..utils.base_utils import Debugger, syscmd 6 | from ..utils.mp_utils import ParMap, parallel_monitor, NJOBS 7 | __tmat_threshold = 1e-2 8 | 9 | 10 | def softmax_np(x): 11 | e_x = np.exp(x - np.max(x)) 12 | return e_x / e_x.sum(axis=0) 13 | 14 | 15 | def __candidate_cluster_factory(n_clusters, seg_length): 16 | def __main__(pid, args, queue): 17 | ret = [] 18 | for time_series_segments in args: 19 | kmeans = KMeans(n_clusters=n_clusters).fit(time_series_segments) 20 | ret.append(kmeans.cluster_centers_.reshape(n_clusters, seg_length, -1)) 21 | queue.put(0) 22 | return ret 23 | return __main__ 24 | 25 | 26 | def __candidate_greedy_factory(n_candiates, seg_length): 27 | def __main__(pid, args, queue): 28 | ret = [] 29 | for time_series_segments in args: 30 | size = time_series_segments.shape[0] 31 | center_segment = np.mean(time_series_segments, axis=0) 32 | cand_dist = np.linalg.norm( 33 | time_series_segments.reshape(size, -1) - center_segment.reshape(1, -1), axis=1) 34 | tmp = [] 35 | for cnt in range(n_candiates): 36 | idx = np.argmax(cand_dist) 37 | cand_dist[idx] = -1 38 | update_idx = cand_dist >= 0 39 | dims = np.sum(update_idx) 40 | cand_dist[update_idx] += np.linalg.norm( 41 | time_series_segments[update_idx].reshape(dims, -1) - time_series_segments[idx].reshape(1, -1), 42 | axis=1 43 | ) 44 | tmp.append(time_series_segments[idx].reshape(seg_length, -1)) 45 | ret.append(tmp) 46 | queue.put(0) 47 | return ret 48 | return __main__ 49 | 50 | 51 | def generate_shapelet_candidate(time_series_set, num_segment, seg_length, candidate_size, **kwargs): 52 | __method, __debug = kwargs.get('candidate_method', 'greedy'), kwargs.get('debug', True) 53 | njobs = kwargs.get('njobs', NJOBS) 54 | Debugger.debug_print('begin to generate shapelet candidates...', __debug) 55 | num_time_series = time_series_set.shape[0] 56 | time_series_set = time_series_set.reshape(num_time_series, num_segment, seg_length, -1) 57 | assert candidate_size >= num_segment, 'candidate-size {} should be larger ' \ 58 | 'than n_segments {}'.format(candidate_size, num_segment) 59 | args, n_clusters = [], candidate_size // num_segment 60 | for idx in range(num_segment): 61 | args.append(time_series_set[:, idx, :, :].reshape(num_time_series, -1)) 62 | if __method == 'cluster': 63 | work_func = 
__candidate_cluster_factory 64 | elif __method == 'greedy': 65 | work_func = __candidate_greedy_factory 66 | else: 67 | raise NotImplementedError('unsupported candidate generating method {}'.format(__method)) 68 | parmap = ParMap( 69 | work=work_func(n_clusters, seg_length), 70 | monitor=parallel_monitor(msg='generate candidate by {}'.format(__method), 71 | size=num_segment, debug=__debug), 72 | njobs=njobs 73 | ) 74 | ret = np.concatenate(parmap.run(data=args), axis=0) 75 | Debugger.info_print('candidates with length {} sampling done...'.format(seg_length)) 76 | Debugger.info_print('totally {} candidates with shape {}'.format(len(ret), ret.shape)) 77 | return ret 78 | 79 | 80 | def __shapelet_distance_factory(shapelets, num_segment, seg_length, tflag, 81 | init, warp, dist, global_flag=False): 82 | def __main__(pid, args, queue): 83 | ret = [] 84 | for time_series in args: 85 | time_series = time_series.reshape(num_segment, seg_length, -1) 86 | tmp = np.zeros((num_segment, len(shapelets)), dtype=np.float32) 87 | if tflag and global_flag: 88 | for idx, (pattern, local_factor, global_factor, _) in enumerate(shapelets): 89 | for k in range(num_segment): 90 | tmp[k, idx] = dist(x=pattern, y=time_series[k], 91 | w=local_factor, warp=warp) * np.abs(global_factor[init + k]) 92 | elif tflag and not global_flag: 93 | for idx, (pattern, local_factor, global_factor, _) in enumerate(shapelets): 94 | for k in range(num_segment): 95 | tmp[k, idx] = dist(x=pattern, y=time_series[k], w=local_factor, warp=warp) 96 | else: 97 | for idx, (pattern, _) in enumerate(shapelets): 98 | for k in range(num_segment): 99 | tmp[k, idx] = dist(x=pattern, y=time_series[k], 100 | w=np.ones(pattern.shape[0]), warp=warp) 101 | ret.append(tmp) 102 | queue.put(0) 103 | return ret 104 | return __main__ 105 | 106 | 107 | def shapelet_distance(time_series_set, shapelets, seg_length, tflag, tanh, debug, init, warp, measurement): 108 | """ 109 | returns: 110 | np.array, N x m x K 111 | num_time_series, num_segment, num_shapelet 112 | """ 113 | num_time_series = time_series_set.shape[0] 114 | num_segment = int(time_series_set.shape[1] / seg_length) 115 | num_shapelet = len(shapelets) 116 | assert num_segment * seg_length == time_series_set.shape[1] 117 | if measurement == 'gw': 118 | dist = parameterized_gw_npy 119 | elif measurement == 'gdtw': 120 | dist = parameterized_gdtw_npy 121 | else: 122 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 123 | parmap = ParMap( 124 | work=__shapelet_distance_factory( 125 | shapelets=shapelets, num_segment=num_segment, seg_length=seg_length, 126 | tflag=tflag, init=init, warp=warp, dist=dist), 127 | monitor=parallel_monitor(msg='shapelet distance', size=num_time_series, debug=debug), 128 | njobs=NJOBS 129 | ) 130 | sdist = np.array(parmap.run(data=list(time_series_set)), dtype=np.float32).reshape( 131 | time_series_set.shape[0], num_segment, num_shapelet 132 | ) 133 | if tanh: 134 | sdist = np.tanh(sdist) 135 | # for tidx in range(num_time_series): 136 | # for sidx in range(num_segment): 137 | # min_val = np.min(sdist[tidx, sidx, :]) 138 | # max_val = np.max(sdist[tidx, sidx, :]) 139 | # assert max_val > min_val, '{}-{}: {}'.format(tidx, sidx, sdist[tidx, sidx, :]) 140 | # sdist[tidx, sidx, :] -= min_val 141 | # sdist[tidx, sidx, :] /= (max_val - min_val) 142 | # assert np.max(sdist) <= 1 and np.min(sdist) >= 0 143 | # import pickle 144 | # pickle.dump(sdist, open('./.sdist.pickle', 'wb')) 145 | # Debugger.info_print('dump sdist done for debug') 146 | return sdist 147 
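# Illustrative sketch (assumption: this helper is not part of the original pipeline).
# It shows how the (N, num_segment, num_shapelet) distance tensor returned above is
# thresholded into shapelet assignments, the same percentile rule that
# transition_matrix() and adjacent_matrix() below use to decide graph edges.
def _assignment_mask_sketch(time_series_set, shapelets, seg_length, percentile=10):
    sdist = shapelet_distance(time_series_set=time_series_set, shapelets=shapelets,
                              seg_length=seg_length, tflag=True, tanh=False, debug=False,
                              init=0, warp=2, measurement='gdtw')
    threshold = np.percentile(sdist, percentile)
    # True where a segment is considered close enough to a shapelet to form an edge.
    return sdist <= threshold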
| 148 | 149 | def transition_matrix(time_series_set, shapelets, seg_length, tflag, multi_graph, 150 | percentile, threshold, tanh, debug, init, warp, measurement): 151 | num_time_series = time_series_set.shape[0] 152 | num_segment = int(time_series_set.shape[1] / seg_length) 153 | num_shapelet = len(shapelets) 154 | if multi_graph: 155 | gcnt = num_segment - 1 156 | else: 157 | gcnt = 1 158 | tmat = np.zeros((gcnt, num_shapelet, num_shapelet), dtype=np.float32) 159 | sdist = shapelet_distance( 160 | time_series_set=time_series_set, shapelets=shapelets, seg_length=seg_length, 161 | tflag=tflag, tanh=tanh, debug=debug, init=init, warp=warp, measurement=measurement) 162 | if percentile is not None: 163 | dist_threshold = np.percentile(sdist, percentile) 164 | Debugger.debug_print('threshold({}) {}, mean {}'.format(percentile, dist_threshold, np.mean(sdist)), debug=debug) 165 | else: 166 | dist_threshold = threshold 167 | Debugger.debug_print('threshold {}, mean {}'.format(dist_threshold, np.mean(sdist)), debug=debug) 168 | 169 | n_edges = 0 170 | for tidx in range(num_time_series): 171 | for sidx in range(num_segment - 1): 172 | src_dist = sdist[tidx, sidx, :] 173 | dst_dist = sdist[tidx, sidx + 1, :] 174 | src_idx = np.argwhere(src_dist <= dist_threshold).reshape(-1) 175 | dst_idx = np.argwhere(dst_dist <= dist_threshold).reshape(-1) 176 | if len(src_idx) == 0 or len(dst_idx) == 0: 177 | continue 178 | n_edges += len(src_idx) * len(dst_idx) 179 | src_dist[src_idx] = 1.0 - minmax_scale(src_dist[src_idx]) 180 | dst_dist[dst_idx] = 1.0 - minmax_scale(dst_dist[dst_idx]) 181 | # assert len(src_idx) == num_shapelets 182 | for src in src_idx: 183 | if multi_graph: 184 | tmat[sidx, src, dst_idx] += (src_dist[src] * dst_dist[dst_idx]) 185 | else: 186 | tmat[0, src, dst_idx] += (src_dist[src] * dst_dist[dst_idx]) 187 | Debugger.debug_print( 188 | '{:.2f}% transition matrix computed...'.format(float(tidx + 1) * 100 / num_time_series), 189 | debug=debug 190 | ) 191 | Debugger.debug_print('{} edges involved in shapelets graph'.format(n_edges), debug=debug) 192 | tmat[tmat <= __tmat_threshold] = 0.0 193 | for k in range(gcnt): 194 | for i in range(num_shapelet): 195 | norms = np.sum(tmat[k, i, :]) 196 | if norms == 0: 197 | tmat[k, i, i] = 1.0 198 | else: 199 | tmat[k, i, :] /= np.sum(tmat[k, i, :]) 200 | return tmat, sdist, dist_threshold 201 | 202 | 203 | def adjacent_matrix(sdist, num_time_series, num_segment, num_shapelet, percentile, threshold, debug): 204 | tmat = np.zeros((num_time_series, num_shapelet, num_shapelet), dtype=np.float32) 205 | for tidx in range(num_time_series): 206 | for sidx in range(num_segment - 1): 207 | src_dist = sdist[tidx, sidx, :] 208 | dst_dist = sdist[tidx, sidx + 1, :] 209 | src_dist = 1.0 - minmax_scale(src_dist) 210 | dst_dist = 1.0 - minmax_scale(dst_dist) 211 | # assert len(src_idx) == num_shapelets 212 | for src in range(num_shapelet): 213 | tmat[tidx, src, :] += (src_dist[src] * dst_dist) 214 | Debugger.debug_print( 215 | '{:.2f}% adjacent matrix computed...'.format(float(tidx + 1) * 100 / num_time_series), 216 | debug=debug 217 | ) 218 | if threshold is None: 219 | threshold = np.percentile(tmat, percentile) 220 | Debugger.debug_print('threshold({}%): {:.6f}, mean-value: {:.6f}'.format(percentile, threshold, np.mean(tmat)), debug=debug) 221 | else: 222 | Debugger.debug_print('threshold: {:.6f}, mean-value {:.6f}'.format(threshold, np.mean(tmat)), debug=debug) 223 | edge_idx = tmat >= threshold 224 | tmat[edge_idx] = 1 225 | tmat[~edge_idx] = 0 226 | num_edges = 
np.sum(np.sum(tmat, axis=2), axis=1) 227 | Debugger.debug_print('{:.2f}(std: {:.2f}) edges involved in average for each shapelet graph'.format( 228 | np.mean(num_edges), np.std(num_edges)), debug=debug) 229 | return tmat, threshold 230 | 231 | 232 | def __mat2edgelist(tmat, fpath): 233 | mat_shape = tmat.shape 234 | with open(fpath, 'w') as f: 235 | for src in range(mat_shape[0]): 236 | flag = False 237 | for dst in range(mat_shape[1]): 238 | if tmat[src, dst] <= 1e-5: 239 | continue 240 | f.write('{} {} {:.5f}\n'.format(src, dst, tmat[src, dst])) 241 | flag = True 242 | if not flag: 243 | f.write('{} {} 1.0000\n'.format(src, src)) 244 | f.close() 245 | 246 | 247 | def __embedding2mat(fpath, num_vertices, embed_size): 248 | mat = np.zeros((num_vertices, embed_size), dtype=np.float32) 249 | with open(fpath, 'r') as f: 250 | cnt = -1 251 | for line in f: 252 | if cnt < 0: 253 | cnt += 1 254 | continue 255 | line = line.split(' ') 256 | idx = int(line[0]) 257 | for k in range(embed_size): 258 | mat[idx, k] = float(line[k + 1]) 259 | f.close() 260 | return mat 261 | 262 | 263 | def graph_embedding(tmat, num_shapelet, embed_size, cache_dir, **deepwalk_paras): 264 | __deepwalk_args__ = [] 265 | Debugger.info_print('embed_size: {}'.format(embed_size)) 266 | ret = [] 267 | Debugger.info_print('transition matrix size {}'.format(tmat.shape)) 268 | for idx in range(tmat.shape[0]): 269 | edgelist_path = '{}/edgelist/{}.edgelist'.format(cache_dir, idx) 270 | embedding_path = '{}/embeds/{}.embeddings'.format(cache_dir, idx) 271 | __mat2edgelist(tmat=tmat[idx, :, :], fpath=edgelist_path) 272 | deepwalk_cmd = [ 273 | 'deepwalk --input {} --format weighted_edgelist --output {} --representation-size {}'.format( 274 | edgelist_path, embedding_path, embed_size) 275 | ] 276 | for key, val in deepwalk_paras.items(): 277 | if key in __deepwalk_args__: 278 | deepwalk_cmd.append('--{} {}'.format(key, val)) 279 | deepwalk_cmd = ' '.join(deepwalk_cmd) 280 | Debugger.info_print('run deepwalk with: {}'.format(deepwalk_cmd)) 281 | _ = syscmd(deepwalk_cmd) 282 | ret.append(__embedding2mat(fpath=embedding_path, num_vertices=num_shapelet, 283 | embed_size=embed_size)) 284 | return np.array(ret, dtype=np.float32).reshape(tmat.shape[0], num_shapelet, embed_size) 285 | -------------------------------------------------------------------------------- /time2graph/core/time_aware_shapelets.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | from torch.autograd import * 4 | from torch import optim 5 | from torch.nn import functional as F 6 | from torch.distributions.normal import Normal 7 | from torch.utils.data import DataLoader 8 | from ..utils.base_utils import Queue 9 | from .model_utils import * 10 | from .shapelet_utils import * 11 | from .distance_utils import * 12 | 13 | 14 | def parameterized_gw_torch(x, y, w, torch_dtype, warp=2): 15 | """ 16 | gw distance in torch with timing factors. 
17 | :param x: 18 | :param y: 19 | :param w: 20 | :param torch_dtype: 21 | :param warp: 22 | :return: 23 | """ 24 | distance = np.sum((x.reshape(x.shape[0], -1, x.shape[1]) - expand_array(y=y, warp=warp)) ** 2, 25 | axis=1) 26 | assert not torch.any(torch.isnan(w)), 'local: {}'.format(w) 27 | softmin_distance = np.sum(softmax(-distance.astype(np.float64)).astype(np.float32) * distance, 28 | axis=1) 29 | return torch.sqrt(torch.sum(torch.from_numpy(softmin_distance).type(torch_dtype) * torch.abs(w))) 30 | 31 | 32 | def parameterized_gdtw_torch(x, y, w, torch_dtype, warp=2): 33 | """ 34 | greedy-dtw distance in torch with timing factors. 35 | :param x: 36 | :param y: 37 | :param w: 38 | :param torch_dtype: 39 | :param warp: 40 | :return: 41 | """ 42 | dpath = greedy_dtw_path(x=x, y=y, warp=warp) 43 | return torch.norm((torch.from_numpy(x).type(torch_dtype) * w.reshape(x.shape[0], -1))[dpath[0]] - 44 | torch.from_numpy(y[dpath[1]]).type(torch_dtype)) 45 | 46 | 47 | def pattern_distance_torch(pattern, time_series, num_segment, seg_length, 48 | local_factor, global_factor, torch_dtype, measurement): 49 | """ 50 | compute distances between a pattern and a given time series. 51 | :param pattern: 52 | :param time_series: 53 | :param num_segment: 54 | :param seg_length: 55 | :param local_factor: 56 | :param global_factor: 57 | :param torch_dtype: 58 | :param measurement: 59 | :return: 60 | """ 61 | if measurement == 'gw': 62 | dist_torch = parameterized_gw_torch 63 | elif measurement == 'gdtw': 64 | dist_torch = parameterized_gdtw_torch 65 | else: 66 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 67 | assert isinstance(time_series, np.ndarray) and isinstance(pattern, np.ndarray) 68 | time_series = time_series.reshape(num_segment, seg_length, -1) 69 | distance = Variable(torch.zeros(num_segment)).type(torch_dtype) 70 | for k in range(num_segment): 71 | distance[k] = dist_torch(x=pattern, y=time_series[k], w=local_factor, torch_dtype=torch_dtype) 72 | return torch.sum(F.softmax(-distance * torch.abs(global_factor), dim=0) 73 | * (distance * torch.abs(global_factor))) 74 | 75 | 76 | def __shapelet_candidate_loss(cand, time_series_set, label, num_segment, seg_length, 77 | data_size, p, lr, alpha, beta, num_batch, gpu_enable, 78 | measurement, **kwargs): 79 | """ 80 | loss for learning time-aware shapelets. 81 | :param cand: 82 | :param time_series_set: 83 | :param label: 84 | :param num_segment: 85 | :param seg_length: 86 | :param data_size:数据的维度 87 | :param p: 88 | normalizing parameter (0, 1, or 2). 89 | :param lr: 90 | learning rate. 91 | :param alpha: 92 | penalty weight for local timing factor. 93 | :param beta: 94 | penalty weight for global timing factor. 
95 | :param num_batch: 96 | :param gpu_enable: 97 | :param measurement: 98 | :param kwargs: 99 | :return: 100 | """ 101 | if gpu_enable: 102 | torch_dtype = torch.cuda.FloatTensor 103 | else: 104 | torch_dtype = torch.FloatTensor 105 | dataset_numpy = NumpyDataset(time_series_set, label) 106 | num_class = len(np.unique(label).reshape(-1)) 107 | batch_size = int(len(dataset_numpy) // num_batch) 108 | local_factor_variable = Variable(torch.ones(seg_length).type(torch_dtype) / seg_length, requires_grad=True) 109 | global_factor_variable = Variable(torch.ones(num_segment).type(torch_dtype) / num_segment, requires_grad=True) 110 | current_loss, loss_queue, cnt, nan_cnt = 0.0, Queue(max_size=int(num_batch * 0.2)), 0, 0 111 | current_main_loss, current_penalty_loss = 0.0, 0.0 112 | max_iters, optimizer = kwargs.get('max_iters', 1), kwargs.get('optimizer', 'Adam') 113 | if optimizer == 'Adam': 114 | optimizer = optim.Adam 115 | elif optimizer == 'Adadelta': 116 | optimizer = optim.Adadelta 117 | elif optimizer == 'Adamax': 118 | optimizer = optim.Adamax 119 | else: 120 | raise NotImplementedError('unsupported optimizer {} for time-aware shapelets learning'.format(optimizer)) 121 | optimizer = optimizer([local_factor_variable, global_factor_variable], lr=lr) 122 | 123 | while cnt < max_iters: 124 | sampler = StratifiedSampler(label=label, num_class=num_class) 125 | dataloader = DataLoader(dataset=dataset_numpy, batch_size=batch_size, sampler=sampler) 126 | batch_cnt = 0 127 | for x, y in dataloader: 128 | x = np.array(x, dtype=np.float32).reshape(len(x), -1, data_size) 129 | y = np.array(y, dtype=np.float32).reshape(-1) 130 | assert not np.any(np.isnan(x)), 'original time series data with nan' 131 | lb_idx, sample_flag = [], True 132 | for k in range(num_class): 133 | tmp_idx = np.argwhere(y == k).reshape(-1) 134 | if k >= 1 and len(tmp_idx) > 0: 135 | sample_flag = False 136 | lb_idx.append(tmp_idx) 137 | if len(lb_idx[0]) == 0 or sample_flag: 138 | Debugger.debug_print('weighted sampling exception, positive {:.2f}/{}'.format(np.sum(y)/len(y), len(y))) 139 | continue 140 | loss = torch.Tensor([0.0]).type(torch_dtype) 141 | main_loss = torch.Tensor([0.0]).type(torch_dtype) 142 | penalty_loss = torch.Tensor([0.0]).type(torch_dtype) 143 | dist_tensor = torch.zeros(x.shape[0]).type(torch_dtype) 144 | for k in range(x.shape[0]): 145 | dist_tensor[k] = pattern_distance_torch( 146 | pattern=cand, time_series=x[k, :, :], num_segment=num_segment, 147 | seg_length=seg_length, local_factor=local_factor_variable, 148 | global_factor=global_factor_variable, torch_dtype=torch_dtype, 149 | measurement=measurement 150 | # ignore the warning of reshape/view for local_factor_variable 151 | ) 152 | assert not torch.isnan(dist_tensor).any(), 'dist: {}\nlocal: {}\nglobal: {}'.format( 153 | dist_tensor, local_factor_variable, global_factor_variable) 154 | mean, std = torch.mean(dist_tensor), torch.std(dist_tensor) 155 | dist_tensor = (dist_tensor - mean) / std 156 | # Debugger.info_print('transform: {}, {}'.format(torch.max(dist_tensor), torch.min(dist_tensor))) 157 | # Debugger.time_print(msg='pattern distance', begin=begin, profiling=True) 158 | for k in range(1, len(lb_idx)): 159 | src = dist_tensor[lb_idx[0]] 160 | dst = dist_tensor[lb_idx[k]] 161 | # src =torch.where(torch.isnan(src), torch.full_like(src, 0),src) 162 | # dst =torch.where(torch.isnan(dst), torch.full_like(dst, 0),dst) 163 | if len(src)!=len(dst): 164 | break 165 | # print(src) 166 | # print(dst) 167 | #填补其中的空值 168 | loss -= 
torch.abs(torch.distributions.kl.kl_divergence( 169 | Normal(torch.mean(src), torch.std(src)), 170 | Normal(torch.mean(dst), torch.std(dst)))) 171 | main_loss -= torch.abs(torch.distributions.kl.kl_divergence( 172 | Normal(torch.mean(src), torch.std(src)), 173 | Normal(torch.mean(dst), torch.std(dst)))) 174 | # Debugger.info_print('KL-loss: {}'.format(loss)) 175 | loss += (alpha * torch.norm(local_factor_variable, p=p) / seg_length) 176 | loss += (beta * torch.norm(global_factor_variable, p=p) / num_segment) 177 | 178 | penalty_loss += (alpha * torch.norm(local_factor_variable, p=p) / seg_length) 179 | penalty_loss += (beta * torch.norm(global_factor_variable, p=p) / num_segment) 180 | 181 | optimizer.zero_grad() 182 | loss.backward() 183 | optimizer.step() 184 | if gpu_enable: 185 | current_loss = float(loss.cpu().data.numpy()) 186 | current_main_loss = float(main_loss.cpu().data) 187 | current_penalty_loss = float(penalty_loss.cpu().data) 188 | else: 189 | current_loss = float(loss.data.numpy()) 190 | current_main_loss = float(main_loss.data) 191 | current_penalty_loss = float(penalty_loss.data) 192 | loss_queue.enqueue(current_loss) 193 | if np.isnan(current_loss) or torch.any(torch.isnan(local_factor_variable))\ 194 | or torch.any(torch.isnan(global_factor_variable)): 195 | local_factor_variable = Variable(torch.ones(seg_length).type(torch_dtype) / seg_length, requires_grad=True) 196 | global_factor_variable = Variable(torch.ones(num_segment).type(torch_dtype) / num_segment, requires_grad=True) 197 | current_loss = 1e5 198 | nan_cnt += 1 199 | if nan_cnt >= max_iters: 200 | break 201 | else: 202 | Debugger.debug_print('{:.2f}% steps, loss {:.6f} with {:.6f} and penalty {:.6f}'.format( 203 | batch_cnt * 100 / num_batch, current_loss, current_main_loss, current_penalty_loss)) 204 | batch_cnt += 1 205 | cnt += 1 206 | if nan_cnt >= max_iters: 207 | break 208 | else: 209 | avg_loss = np.mean(loss_queue.queue[1:]) 210 | if abs(current_loss - avg_loss) < kwargs.get('epsilon', 1e-2): 211 | break 212 | local_factor_variable = torch.abs(local_factor_variable) 213 | global_factor_variable = torch.abs(global_factor_variable) 214 | if gpu_enable: 215 | local_factor = local_factor_variable.cpu().data.numpy() 216 | global_factor = global_factor_variable.cpu().data.numpy() 217 | else: 218 | local_factor = local_factor_variable.data.numpy() 219 | global_factor = global_factor_variable.data.numpy() 220 | return local_factor, global_factor, current_loss, current_main_loss, current_penalty_loss 221 | 222 | 223 | def __shapelet_candidate_loss_factory(time_series_set, label, num_segment, 224 | seg_length, data_size, p, lr, alpha, beta, num_batch, 225 | gpu_enable, measurement, **kwargs): 226 | """ 227 | paralleling compute shapelet losses. 
228 | :param time_series_set: 229 | :param label: 230 | :param num_segment: 231 | :param seg_length: 232 | :param data_size: 233 | :param p: 234 | :param lr: 235 | :param alpha: 236 | :param beta: 237 | :param num_batch: 238 | :param gpu_enable: 239 | :param measurement: 240 | :param kwargs: 241 | :return: 242 | """ 243 | def __main__(pid, args, queue): 244 | ret = [] 245 | for cand in args: 246 | local_factor, global_factor, loss, main_loss, penalty = __shapelet_candidate_loss( 247 | cand=cand, time_series_set=time_series_set, label=label, num_segment=num_segment, 248 | seg_length=seg_length, data_size=data_size, p=p, lr=lr, 249 | alpha=alpha, beta=beta, num_batch=num_batch, gpu_enable=gpu_enable, 250 | measurement=measurement, **kwargs 251 | ) 252 | ret.append((cand, local_factor, global_factor, loss, main_loss, penalty)) 253 | queue.put(0) 254 | return ret 255 | return __main__ 256 | 257 | 258 | def learn_time_aware_shapelets(time_series_set, label, K, C, num_segment, seg_length, data_size, 259 | p, lr, alpha, beta, num_batch, gpu_enable, measurement, **kwargs): 260 | """ 261 | learn time-aware shapelets. 262 | :param time_series_set: 263 | input time series data. 264 | :param label: 265 | input label. 266 | :param K: 267 | number of shapelets that finally learned. 268 | :param C: 269 | number of shapelet candidates in learning procedure. 270 | :param num_segment: 271 | :param seg_length: 272 | :param data_size: 273 | :param p: 274 | :param lr: 275 | :param alpha: 276 | :param beta: 277 | :param num_batch: 278 | :param gpu_enable: 279 | :param measurement: 280 | :param kwargs: 281 | :return: 282 | """ 283 | cands = generate_shapelet_candidate(time_series_set=time_series_set, num_segment=num_segment, 284 | seg_length=seg_length, candidate_size=C, **kwargs) 285 | parmap = ParMap( 286 | work=__shapelet_candidate_loss_factory( 287 | time_series_set=time_series_set, label=label, num_segment=num_segment, seg_length=seg_length, 288 | data_size=data_size, p=p, lr=lr, alpha=alpha, beta=beta, num_batch=num_batch, 289 | gpu_enable=gpu_enable, measurement=measurement, **kwargs 290 | ), 291 | monitor=parallel_monitor(msg='learning time-aware shapelets', size=len(cands), 292 | debug=kwargs.get('debug', True)), 293 | njobs=kwargs.get('njobs', NJOBS) 294 | ) 295 | result = sorted(parmap.run(data=cands), key=lambda x: x[3]) 296 | ret = [] 297 | for (cand, local_factor, global_factor, loss, main_loss, penalty) in result: 298 | ret.append((cand, local_factor, global_factor, loss)) 299 | return sorted(ret, key=lambda x: x[-1])[:K] 300 | -------------------------------------------------------------------------------- /time2graph/core/model_gin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.optim as optim 4 | import torch.nn.functional as F 5 | from config import * 6 | from torch.utils.data import Dataset,DataLoader,RandomSampler,SequentialSampler 7 | from scipy.special import softmax 8 | from .shapelet_utils import shapelet_distance, adjacent_matrix 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.preprocessing import MinMaxScaler 11 | from .time_aware_shapelets import learn_time_aware_shapelets 12 | from .static_shapelets import learn_static_shapelets 13 | from .Optimize import AdamW, get_linear_schedule_with_warmup 14 | from ..utils.base_utils import myNetwork 15 | from torch.utils.data import DataLoader, Dataset 16 | import torch.nn as nn 17 | import os 18 | import gc 19 | import 
logging 20 | logger = logging.getLogger(__name__) 21 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 22 | datefmt='%m/%d/%Y %H:%M:%S', 23 | level=logging.INFO) 24 | class myDataset(Dataset): 25 | def __init__(self, x,m, y=None,istrain=True): 26 | super(myDataset, self).__init__() 27 | self.x = torch.tensor(x,dtype=torch.float32) 28 | self.m = torch.tensor(m,dtype=torch.float32) 29 | if istrain: 30 | self.y =torch.tensor(y,dtype=torch.float32) 31 | self.istrain=istrain 32 | 33 | def __len__(self): 34 | return len(self.x) 35 | 36 | def __getitem__(self, idx): 37 | if self.istrain: 38 | return self.x[idx],self.m[idx], self.y[idx] 39 | else: 40 | return self.x[idx],self.m[idx] 41 | def torch_R2(pre,label): 42 | ''' 43 | pre 预测的流量序列 b,len 44 | label 实际的流量序列 b,len 45 | 46 | ''' 47 | r2=0 48 | mse=0 49 | for i in range(len(pre)): 50 | RSS= torch.sum((pre-label)** 2) 51 | TSS=torch.sum((label-torch.mean(label))** 2) 52 | r2+=1-RSS/TSS 53 | mse+=torch.mean((pre-label)** 2) 54 | 55 | return mse/len(pre),r2/len(pre) 56 | 57 | class Flow2Graph(nn.Module): 58 | """ 59 | Time2GraphGAT model 60 | Hyper-parameters: 61 | K: number of learned shapelets 62 | C: number of candidates 63 | A: number of shapelets assigned to each segment 64 | tflag: timing flag 65 | """ 66 | def __init__(self, K, seg_length, num_segment, warp=2, tflag=True, gpu_enable=False,device='cpu', optimizer='Adam', dropout=0.2, lk_relu=0.2, data_size=7, softmax=False, percentile=10, 67 | dataset='Unspecified', append=False, sort=False, 68 | feat_norm=True, aggregate=True, standard_scale=False, diff=False, **kwargs): 69 | super(Flow2Graph, self).__init__() 70 | self.K = K 71 | self.C = kwargs.pop('C', K * 10) 72 | self.seg_length = seg_length 73 | self.num_segment = num_segment 74 | self.data_size = data_size 75 | self.device=device 76 | self.warp = warp 77 | self.tflag = tflag 78 | self.gpu_enable = gpu_enable 79 | self.cuda = self.gpu_enable and torch.cuda.is_available() 80 | # Debugger.info_print('torch.cuda: {}, self.cuda: {}'.format(torch.cuda.is_available(), self.cuda)) 81 | self.shapelets = None 82 | self.append = append 83 | self.percentile = percentile 84 | self.threshold = None 85 | self.clf=None 86 | self.sort = sort 87 | self.aggregate = aggregate 88 | self.dropout = dropout 89 | self.lk_relu = lk_relu 90 | self.softmax = softmax 91 | self.dataset = dataset 92 | self.diff = diff 93 | self.standard_scale = standard_scale 94 | 95 | self.feat_norm = feat_norm 96 | self.pretrain = kwargs.pop('pretrain', None) 97 | 98 | self.lr = kwargs.pop('lr', 1e-3) 99 | self.p = kwargs.pop('p', 2) 100 | self.alpha = kwargs.pop('alpha', 0.1) 101 | self.beta = kwargs.pop('beta', 0.05) 102 | self.debug = kwargs.pop('debug', False) 103 | self.optimizer = optimizer 104 | self.measurement = kwargs.pop('measurement', 'gdtw') 105 | self.batch_size = kwargs.pop('batch_size', 200) 106 | self.init = kwargs.pop('init', 0) 107 | self.niter = kwargs.pop('niter', 1000) 108 | self.fastmode = kwargs.pop('fastmode', False) 109 | self.tol = kwargs.pop('tol', 1e-4) 110 | self.cuda = self.gpu_enable and torch.cuda.is_available() 111 | self.kwargs = kwargs 112 | Debugger.info_print('initialize our Flow2Graph with {}'.format(self.__dict__)) 113 | 114 | def learn_shapelets(self, x, y, num_segment, data_size): 115 | assert x.shape[1] == num_segment * self.seg_length 116 | Debugger.info_print('basic statistics before learn shapelets: max {:.4f}, min {:.4f}'.format(np.max(x), np.min(x))) 117 | if self.tflag: 118 | self.shapelets = 
learn_time_aware_shapelets( 119 | time_series_set=x, label=y, K=self.K, C=self.C, p=self.p, 120 | num_segment=num_segment, seg_length=self.seg_length, data_size=data_size, 121 | lr=self.lr, alpha=self.alpha, beta=self.beta, num_batch=int(x.shape[0] / self.batch_size), 122 | measurement=self.measurement, gpu_enable=self.gpu_enable, **self.kwargs) 123 | else: 124 | self.shapelets = learn_static_shapelets( 125 | time_series_set=x, label=y, K=self.K, C=self.C, warp=self.warp, 126 | num_segment=num_segment, seg_length=self.seg_length, measurement=self.measurement, **self.kwargs) 127 | 128 | def __gat_features__(self, X, train=False): 129 | __shapelet_distance = shapelet_distance( 130 | time_series_set=X, shapelets=self.shapelets, seg_length=self.seg_length, 131 | tflag=self.tflag, tanh=self.kwargs.get('tanh', False), debug=self.debug, 132 | init=self.init, warp=self.warp, measurement=self.measurement) 133 | threshold = None if train else self.threshold 134 | adj_matrix, self.threshold = adjacent_matrix( 135 | sdist=__shapelet_distance, num_time_series=X.shape[0], num_segment=int(X.shape[1] / self.seg_length), 136 | num_shapelet=self.K, percentile=self.percentile, threshold=threshold, debug=self.debug) 137 | __shapelet_distance = np.transpose(__shapelet_distance, axes=(0, 2, 1)) 138 | if self.sort: 139 | __shapelet_distance = softmax(-1 * np.sort(__shapelet_distance, axis=1), axis=1) 140 | if self.softmax and not self.sort: 141 | __shapelet_distance = softmax(__shapelet_distance, axis=1) 142 | if self.append: 143 | origin = np.array([v[0].reshape(-1) for v in self.shapelets], dtype=np.float).reshape(1, self.K, -1) 144 | return np.concatenate((__shapelet_distance, np.tile(origin, (__shapelet_distance.shape[0], 1, 1))), 145 | axis=2).astype(np.float), adj_matrix 146 | else: 147 | return __shapelet_distance.astype(np.float), adj_matrix 148 | 149 | 150 | def __preprocess_input_data(self, X): 151 | X_scale = X.copy() 152 | if self.diff: 153 | X_scale[:, : -1, :] = X[:, 1:, :] - X[:, :-1, :] 154 | X_scale[:, -1, :] = 0 155 | Debugger.debug_print('conduct time differing...') 156 | if self.standard_scale: 157 | for i in range(self.data_size): 158 | X_std = np.std(X_scale[:, :, i], axis=1).reshape(X.shape[0], -1) 159 | X_std[X_std == 0] = 1.0 160 | X_scale[:, :, i] = (X_scale[:, :, i] - np.mean(X_scale[:, :, i], axis=1).reshape(X.shape[0], -1)) / X_std 161 | Debugger.debug_print('conduct standard scaling on data-{}, with mean {:.2f} and var {:.2f}'.format(i, np.mean(X_scale[0, :, i]), np.std(X_scale[0, :, i]))) 162 | return X_scale 163 | def transfer(self,sps): 164 | ''' 165 | 把shapelets的列表转换成矩阵 166 | ''' 167 | ss=np.zeros((len(sps),self.seg_length)) 168 | for idx, (pattern, _, _, _) in enumerate(sps): 169 | ss[idx]=pattern[:,0]#取均值那一列 170 | return nn.Embedding.from_pretrained(torch.tensor(ss,dtype=torch.float32))# num_shapelets,seg_length 171 | def fit(self,for_rescale, X_scale, Y,valid_x_scale,valid_y,clf_func, reset=False,train_batch_size=256,de_size=24*1,epoch=100, 172 | display_steps=2,eval_steps=2,max_grad_norm=1.0,lr=0.3,l2_alpha=0.01,hidden_size=512,output_dir='model',logprintfile=None,): 173 | ''' 174 | X_scale, 175 | Y, 176 | valid_x_scale, 177 | valid_y 178 | clf_func :获取分类标签的函数 179 | ''' 180 | num_segment, data_size = int(X_scale.shape[1] / self.seg_length), X_scale.shape[-1] 181 | assert self.data_size == X_scale.shape[-1] 182 | X_scale = self.__preprocess_input_data(X_scale)#归一化 啥也没干其实 183 | valid_x_scale=self.__preprocess_input_data(valid_x_scale)#归一化 这里也是啥也没干其实 184 | 185 | if reset or 
self.shapelets is None: 186 | self.learn_shapelets(x=np.vstack((X_scale,valid_x_scale)), y=clf_func(np.vstack((Y,valid_y))), num_segment=num_segment, data_size=data_size) 187 | # self.__fit_gat(X=X_scale, Y=Y) 188 | print("获取数据的特征表示:") 189 | import pickle 190 | if os.path.exists('feauture_%d_%d.plk'%(self.K,de_size/24)): 191 | X_savel=pickle.load(open('feauture_%d_%d.plk'%(self.K,de_size/24), 'rb')) 192 | X_feat, X_adj =X_savel['X_feat'],X_savel['X_adj'] 193 | del X_savel 194 | gc.collect() 195 | else: 196 | X_feat, X_adj = self.__gat_features__(X_scale)#获得初始化的节点特征和邻接矩阵 197 | with open('feauture_%d_%d.plk'%(self.K,de_size/24), 'wb') as f: 198 | pickle.dump({'X_feat':np.array(X_feat),'X_adj':X_adj},f) 199 | 200 | print("开始训练!") 201 | #开始训练 202 | dataset = myDataset(X_feat, X_adj,Y) 203 | train_sampler = RandomSampler(dataset) 204 | train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=train_batch_size,num_workers=0) 205 | loss_fn = nn.MSELoss().to(self.device) 206 | en_size=X_scale.shape[1] 207 | nfeat=X_feat.shape[-1]#shapelets初始维度 208 | tar_len=Y.shape[-1]#预测目标的长度 209 | 210 | self.clf=myNetwork(en_size,data_size,nfeat,hidden_size,tar_len,self.seg_length,n_layers=2,dropout=0.1,modelname="Flim-GNN") 211 | ''' 212 | modelname:使用的模型名称 213 | nfeat:输入x特征矩阵维度 214 | nhid:中间层维度 215 | nclass:输出特征维度 216 | dropout:dropout的比例 217 | ''' 218 | optimizer = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 219 | # optimizer = torch.optim.Adam(self.parameters(), lr=lr) 220 | #设置优化器 221 | 222 | # optimizer = AdamW(self.parameters(), lr=lr, eps=1e-8,weight_decay=0.00001) 223 | # optimizer =torch.optim.SGD(self.model.parameters(), lr=args.lr, momentum=0.9) 224 | # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(len(train_dataloader)*epoch*0.2),num_training_steps=int(len(train_dataloader)*epoch)) 225 | 226 | self.clf.to(self.device) 227 | all_shapelets=self.transfer(self.shapelets)#拿到所有shapelet id到真实片段的映射 228 | self.clf.zero_grad() 229 | tr_loss,best_R2,avg_loss = 0.0, -10,0.0 230 | global_step=0 231 | for idx in range(epoch): 232 | tr_num=0 233 | train_loss=0 234 | self.clf.train() 235 | for step, batch in enumerate(train_dataloader): 236 | 237 | feat,adj,Y =(x.to(self.device) for x in batch) 238 | out =self.clf(feat,adj,all_shapelets.to(self.device))#state_orginal ,b,n,2 239 | # print(out.device) 240 | del batch,feat,adj 241 | gc.collect() 242 | loss =loss_fn(out,Y) 243 | optimizer.zero_grad() 244 | 245 | loss.backward()#先写到这里,后续再补充!! 
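                # The gradients produced by loss.backward() are clipped to `max_grad_norm`
                # just below, and the parameters are only updated at the optimizer.step()
                # call further down in this loop; the optimizer.zero_grad() above prevents
                # gradient accumulation across batches.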
246 | torch.nn.utils.clip_grad_norm_(self.parameters(),max_grad_norm) 247 | tr_loss += loss.item() 248 | tr_num+=1 249 | train_loss+=loss.item() 250 | #输出log 251 | if avg_loss==0: 252 | avg_loss=tr_loss 253 | avg_loss=round(train_loss/tr_num,8) 254 | 255 | if (step+1) % display_steps == 0: 256 | Debugger.info_print("At Training: epoch {} step {} loss {}".format(idx,step+1,avg_loss)) 257 | print("At Training: epoch {} step {} loss {}".format(idx,step+1,avg_loss),"\n",file=logprintfile) 258 | 259 | #update梯度 260 | optimizer.step() 261 | optimizer.zero_grad() 262 | # scheduler.step() 263 | global_step += 1 264 | 265 | #测试验证结果 266 | if (step+1) % eval_steps == 0: 267 | #输出验证集预测的结果 268 | out= self.infer(self.clf,valid_x_scale) 269 | #输出预测的f1和error distance 270 | results=self.eval(out,torch.tensor(valid_y,dtype=torch.float32).to(self.device)) 271 | with open('flow2graph_casestudy.pkl','wb')as fff: 272 | pickle.dump({'Y_label':for_rescale[2],'Y_pre':out.cpu().numpy()},fff) 273 | # pickle.dump({'Y_label':valid_y*for_rescale[1]+for_rescale[0],'Y_pre':out.cpu().numpy()*for_rescale[1]+for_rescale[0]},fff) 274 | 275 | #打印结果 276 | for key, value in results.items(): 277 | logger.info("测试结果 %s = %s", key, round(value,8)) 278 | #保存最好的年龄结果和模型 279 | if results['eval_R2']>best_R2: 280 | best_R2=results['eval_R2'] 281 | print(" "+"*"*20) 282 | print(" "+"*"*20,"\n",file=logprintfile) 283 | for key, value in results.items(): 284 | logger.info("测试结果 %s = %s", key, round(value,8)) 285 | print("测试结果 {} = {}".format(key, round(value,8)),"\n",file=logprintfile) 286 | logger.info(" Best R2:%s",round(best_R2,8)) 287 | logger.info(" Best mse:%s",round(results['eval_loss'],8)) 288 | print(" Best f1:",round(best_R2,8)," Best MSE:",round(results['eval_loss'],8),"\n",file=logprintfile) 289 | print(" "+"*"*20,"\n",file=logprintfile) 290 | logger.info(" "+"*"*20) 291 | 292 | model_to_save = self.clf.module if hasattr(self.clf, 'module') else self.clf # Only save the model it-self 293 | output_model_file = os.path.join(output_dir, "pytorch_time2graph_gcn_{}_{}.bin".format(en_size,tar_len)) 294 | torch.save(model_to_save.state_dict(), output_model_file) 295 | print(" Best R2:",round(best_R2,8)," Best MSE:",round(results['eval_loss'],8)) 296 | 297 | def infer(self,model,valid_x_scale,eval_batch_size=32): 298 | assert self.shapelets is not None, 'shapelets has not been learnt yet...' 
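        # Note: inference below runs under torch.no_grad(), but `model` is not switched
        # to eval() here, so dropout and BatchNorm layers inside the GIN still behave as
        # in training mode when infer() is called from fit().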
299 | X_feat, X_adj = self.__gat_features__(valid_x_scale)#获得初始化的节点特征和邻接矩阵 300 | eval_dataset=myDataset(X_feat, X_adj,istrain=False) 301 | eval_sampler = SequentialSampler(eval_dataset) 302 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size,num_workers=4) 303 | all_shapelets=self.transfer(self.shapelets)#拿到所有shapelet id到真实片段的映射 304 | predict=[] 305 | for step, batch in enumerate(eval_dataloader): 306 | feat,adj =(x.to(self.device) for x in batch) 307 | del batch 308 | with torch.no_grad(): 309 | pred =model(feat,adj,all_shapelets.to(self.device))#b,tar_len 310 | # predict.append(pred.cpu().numpy())# b,tar_len 311 | predict.append(pred) 312 | del feat,adj,pred 313 | gc.collect() 314 | # predict=np.concatenate(predict,0) 315 | predict=torch.cat(predict,dim=0)#torch.stack(predict) 316 | return predict 317 | def eval(self,predict,Groudth): 318 | ''' 319 | predict sample_len ,de_size,1 320 | Groudth sample_len ,de_size,1 321 | ''' 322 | results={} 323 | m,r2=torch_R2(predict,Groudth) 324 | results['eval_loss']=m.cpu().item() 325 | results['eval_R2']=r2.cpu().item() 326 | # from sklearn.metrics import r2_score,mean_squared_error 327 | # results={} 328 | # results['eval_R2']=r2_score(Groudth,predict) 329 | # 330 | # results['eval_loss']=mean_squared_error(Groudth,predict) 331 | return results 332 | 333 | def reload(self,model,output_dir,en_size,tar_len): 334 | #读取在验证集结果最好的模型 335 | load_model_path=os.path.join(output_dir, "pytorch_time2graph_gcn_{}_{}.bin".format(en_size,tar_len)) 336 | logger.info("Load model from %s",load_model_path) 337 | model_to_load = model.module if hasattr(model, 'module') else model # Only save the model it-self 338 | model_to_load.load_state_dict(torch.load(load_model_path)) 339 | return model 340 | 341 | def save_shapelets(self, fpath): 342 | torch.save(self.shapelets, fpath) 343 | 344 | def load_shapelets(self, fpath, map_location='cuda:0'): 345 | self.shapelets = torch.load(fpath, map_location=map_location) 346 | -------------------------------------------------------------------------------- /time2graph/utils/base_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import time 4 | import itertools 5 | import psutil 6 | import torch.nn.functional as F 7 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 8 | from subprocess import * 9 | import torch 10 | import torch.nn as nn 11 | from torch.utils.data import DataLoader, Dataset 12 | from sklearn.metrics import r2_score,mean_squared_error 13 | 14 | class myDataset(Dataset): 15 | def __init__(self, x, y): 16 | super(myDataset, self).__init__() 17 | self.x = x 18 | self.y = y 19 | 20 | def __len__(self): 21 | return len(self.x) 22 | 23 | def __getitem__(self, idx): 24 | return self.x[idx], self.y[idx] 25 | 26 | 27 | class myMlp(nn.Module): 28 | def __init__(self, in_len=1448, out_len=24,gpu_enable=False): 29 | super(myMlp, self).__init__() 30 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | self.net = nn.Sequential( 32 | nn.Linear(in_len, 2048), 33 | nn.BatchNorm1d(2048), 34 | nn.ReLU(), 35 | nn.Linear(2048,1024), 36 | nn.ReLU(), 37 | nn.Linear(1024,out_len) 38 | ) 39 | self.gpu_enable = gpu_enable 40 | 41 | def forward(self,x): 42 | if self.gpu_enable ==False: 43 | x = x.cpu() 44 | x = self.net(x) 45 | return x 46 | 47 | def fit(self, x, y): 48 | x = torch.tensor(x,dtype=torch.float32) 49 | y = 
torch.tensor(y,dtype=torch.float32) 50 | dataset = myDataset(x, y) 51 | data_len = len(dataset) 52 | batchsize = 128 53 | dataloader = DataLoader(dataset,batch_size=batchsize,shuffle=True,num_workers=4) 54 | epochs = 100 55 | loss_fn = nn.MSELoss().to(self.device) 56 | optimizer = torch.optim.SGD(self.parameters(), lr=10e-5, momentum=0.9) 57 | self.net.to(self.device) 58 | for epoch in range(epochs): 59 | i = 0 60 | for samples, labels in dataloader: 61 | samples = samples.to(self.device) 62 | labels = labels.to(self.device) 63 | out = self.forward(samples) 64 | loss = loss_fn(out, labels) 65 | r2 = r2_score(labels.cpu().detach().numpy(), out.cpu().detach().numpy()) 66 | optimizer.zero_grad() 67 | loss.backward() 68 | optimizer.step() 69 | i = i + len(samples) 70 | print( 71 | 'epoch:{}/{} iter:{}/{} loss:{} r2:{} '.format(epoch + 1, epochs, i, data_len, 72 | loss, r2)) 73 | from torch_geometric.nn import GINConv ,GCNConv,GATConv,GraphConv,FiLMConv 74 | class GIN(nn.Module): 75 | def __init__(self,modelname, nfeat, dropout,n_layer=5, JK="last", residual=False): 76 | ''' 77 | nfeat:输入x特征矩阵维度 78 | 79 | n_layer: GIN的层数 80 | dropout:dropout的比例 81 | ''' 82 | super(GIN, self).__init__() 83 | self.num_layers = n_layer 84 | self.JK = JK 85 | # add residual connection or not 86 | self.residual = residual 87 | self.dropout = dropout 88 | # List of GNNs 89 | 90 | self.convs = torch.nn.ModuleList() 91 | self.batch_norms = torch.nn.ModuleList() 92 | 93 | for layer in range(n_layer): 94 | if modelname=='GIN': 95 | self.convs.append(GINConv(self.MLP(nfeat,nfeat))) 96 | elif modelname=='GCN': 97 | self.convs.append(GCNConv(nfeat,nfeat)) 98 | elif modelname=='GAT': 99 | self.convs.append(GATConv(nfeat,nfeat)) 100 | elif modelname=='GNN': 101 | self.convs.append(GraphConv(nfeat,nfeat)) 102 | 103 | elif modelname=='Flim-GNN': 104 | self.convs.append(FiLMConv(nfeat,nfeat)) 105 | 106 | self.batch_norms.append(torch.nn.BatchNorm1d(nfeat)) 107 | @staticmethod 108 | def MLP(in_channels: int, out_channels: int) -> torch.nn.Module: 109 | return nn.Sequential( 110 | nn.Linear(in_channels, out_channels), 111 | nn.BatchNorm1d(out_channels), 112 | nn.ReLU(inplace=True), 113 | nn.Linear(out_channels, out_channels), 114 | ) 115 | def forward(self, x, adj): 116 | h_list=[x] 117 | 118 | for layer in range(self.num_layers): 119 | h = self.convs[layer](x, adj) 120 | h = self.batch_norms[layer](h) 121 | if layer == self.num_layers - 1: 122 | # remove relu for the last layer 123 | h = F.dropout(h, self.dropout, training=self.training) 124 | else: 125 | h = F.dropout(F.relu(h), self.dropout, training=self.training) 126 | 127 | if self.residual: 128 | h += h_list[layer] 129 | 130 | h_list.append(h) 131 | 132 | 133 | return torch.stack(h_list[-self.num_layers:]) 134 | 135 | 136 | 137 | 138 | 139 | 140 | class myNetwork(nn.Module): 141 | def __init__(self, en_size,embed_size,nfeat,hidden_size,tar_len,segment_len,n_layers=2,dropout=0.02,modelname="GIN"): 142 | ''' 143 | en_size 输入序列的长度 144 | embed_size 输入样本的特征维度 145 | nfeat 输入特征矩阵X的特征维度 146 | hidden_size 隐藏层的维度 147 | tar_len 输出序列的长度 148 | segment_len shapelet的长度 149 | n_layers GIN的层数 150 | ''' 151 | super(myNetwork, self).__init__() 152 | self.gin_layers=n_layers 153 | self.k=tar_len//segment_len 154 | self.gIn = GIN(modelname,hidden_size, dropout,n_layer=n_layers) 155 | 156 | 157 | self.net = nn.Sequential( 158 | nn.Linear(segment_len, 512), 159 | nn.ReLU(), 160 | nn.Flatten(), 161 | nn.Linear(512*self.k,512), 162 | nn.BatchNorm1d(512), 163 | nn.ReLU(), 164 | 
nn.Linear(512,tar_len) 165 | ) 166 | self.mlp1 =nn.Linear(nfeat,hidden_size) 167 | self.mlp2 =nn.Linear(self.gin_layers*hidden_size,hidden_size) 168 | def foroneGraph(self,x,adj): 169 | ''' 170 | 对于单个时间序列,单个图 171 | 172 | 返回其对应的关键shapelet index 173 | ''' 174 | a1=(adj >0).nonzero().t() 175 | 176 | x=F.relu(self.mlp1(x))#len,hidden_size 177 | h_=self.gIn(x,a1)#5,num_shapelets,hidden_size h_[-1]为所有shapelets的表示 178 | # print('隐藏层的维度:',h_.size())#torch.Size([2, 30, 256]) 179 | #node level Embedding 180 | node_E=x+h_[-1] 181 | 182 | 183 | #compute graph level EMbedding 184 | # Sum+CONCAT 185 | graph_Re=torch.sum(h_,dim=1).view(1,-1)#1, n_layer*hidden_size 186 | # print("%%%%%",graph_Re.size()) 187 | 188 | graph_Re=self.mlp2(graph_Re)#1, hidden_size 189 | graph_Re=F.relu(graph_Re)#1, hidden_size 190 | dis=-torch.mm(graph_Re,node_E.t())#1,num_shapelets 191 | #取top 192 | shapelet_index=torch.topk(dis,self.k,1)[1]#1,k 193 | return shapelet_index.squeeze(0) 194 | 195 | 196 | def forward(self,x,adj,embedding): 197 | ''' 198 | x:batch_size,num_shapelets, nfeat 199 | adj: batch_size,num_shapelets, num_shapelets 200 | embedding 前面获取到的shapelet字典 num_shapelets,segments_length 201 | ''' 202 | key_shapelets=[]#bacth_size, 203 | for o_x,o_adj in zip(x,adj): 204 | key_shapelets.append(self.foroneGraph(o_x,o_adj)) 205 | key_shapelets=torch.stack(key_shapelets) #bacth_size,k 206 | out=embedding(key_shapelets) #bacth_size,k,segments_length 207 | # print("out.size",out.size()) 208 | out=self.net(out)#bacth_size,tar_len 209 | return out 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | class ModelUtils(object): 220 | """ 221 | model utils for basic classifiers. 222 | kwargs list: 223 | lr paras 224 | penalty: list of str, candidate: l1, l2; 225 | c: list of float 226 | inter_scale: list of float 227 | rf and dts paras: 228 | criteria: list of str, candidate: gini, entropy 229 | max_features: list of str(including None), candidate: auto, log2 or None 230 | max_depth: list of int 231 | max_split: list of int 232 | min_leaf: list of int 233 | xgb paras: 234 | max_depth: list of int 235 | learning_rate: list of float 236 | n_jobs: int 237 | class_weight: list of int 238 | booster: list of str, candidate: gblinear, gbtree, dart 239 | svm paras: 240 | c: list of float 241 | svm_kernel: list of str, candidate: rbf, poly, sigmoid 242 | deepwalk paras: 243 | num_walks: list of int 244 | representation_size: list of int 245 | window_size: list of int 246 | workers: int 247 | undirected: bool 248 | """ 249 | def __init__(self, kernel, **kwargs): 250 | self.kernel = kernel 251 | self.kwargs = kwargs 252 | 253 | @property 254 | def clf__(self): 255 | if self.kernel == 'lr': 256 | from sklearn.linear_model import LogisticRegression 257 | return LogisticRegression 258 | elif self.kernel == 'svm': 259 | from sklearn.svm import SVC 260 | return SVC 261 | elif self.kernel == 'dts': 262 | from sklearn.tree import DecisionTreeClassifier 263 | return DecisionTreeClassifier 264 | elif self.kernel == 'rf': 265 | from sklearn.ensemble import RandomForestClassifier 266 | return RandomForestClassifier 267 | elif self.kernel == 'mlpReg': 268 | return myMlp 269 | 270 | # elif self.kernel == 'xgb': 271 | # from xgboost import XGBClassifier 272 | # return XGBClassifier 273 | else: 274 | raise NotImplementedError('unsupported kernel {}'.format(self.kernel)) 275 | 276 | def para_len(self, balanced): 277 | cnt = 0 278 | for args in self.clf_paras(balanced=balanced): 279 | cnt += 1 280 | return cnt 281 | 282 | def clf_paras(self, 
class ModelUtils(object):
    """
    Model utils for basic classifiers.
    kwargs list:
        lr paras:
            penalty: list of str, candidates: l1, l2
            c: list of float
            inter_scale: list of float
        rf and dts paras:
            criteria: list of str, candidates: gini, entropy
            max_feature: list of str (including None), candidates: auto, log2, None
            max_depth: list of int
            max_split: list of int
            min_leaf: list of int
        xgb paras:
            max_depth: list of int
            learning_rate: list of float
            n_jobs: int
            class_weight: list of int
            booster: list of str, candidates: gblinear, gbtree, dart
        svm paras:
            c: list of float
            svm_kernel: list of str, candidates: rbf, poly, sigmoid
        deepwalk paras:
            num_walks: list of int
            representation_size: list of int
            window_size: list of int
            workers: int
            undirected: bool
    """
    def __init__(self, kernel, **kwargs):
        self.kernel = kernel
        self.kwargs = kwargs

    @property
    def clf__(self):
        if self.kernel == 'lr':
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression
        elif self.kernel == 'svm':
            from sklearn.svm import SVC
            return SVC
        elif self.kernel == 'dts':
            from sklearn.tree import DecisionTreeClassifier
            return DecisionTreeClassifier
        elif self.kernel == 'rf':
            from sklearn.ensemble import RandomForestClassifier
            return RandomForestClassifier
        elif self.kernel == 'mlpReg':
            return myMlp
        # elif self.kernel == 'xgb':
        #     from xgboost import XGBClassifier
        #     return XGBClassifier
        else:
            raise NotImplementedError('unsupported kernel {}'.format(self.kernel))

    def para_len(self, balanced):
        cnt = 0
        for args in self.clf_paras(balanced=balanced):
            cnt += 1
        return cnt

    def clf_paras(self, balanced):
        class_weight = 'balanced' if balanced else None
        if self.kernel == 'lr':
            penalty = self.kwargs.get('penalty', ['l1', 'l2'])
            c = self.kwargs.get('c', [pow(5, i) for i in range(-3, 3)])
            intercept_scaling = self.kwargs.get('inter_scale', [pow(5, i) for i in range(-3, 3)])
            for (p1, p2, p3) in itertools.product(penalty, c, intercept_scaling):
                yield {
                    'penalty': p1,
                    'C': p2,
                    'intercept_scaling': p3,
                    'class_weight': class_weight
                }
        elif self.kernel == 'rf' or self.kernel == 'dts':
            criteria = self.kwargs.get('criteria', ['gini', 'entropy'])
            max_features = self.kwargs.get('max_feature', ['auto', 'log2', None])
            max_depth = self.kwargs.get('max_depth', [10, 25, 50])
            min_samples_split = self.kwargs.get('max_split', [2, 4, 8])
            min_samples_leaf = self.kwargs.get('min_leaf', [1, 3, 5])
            for (p1, p2, p3, p4, p5) in itertools.product(
                criteria, max_features, max_depth, min_samples_split, min_samples_leaf
            ):
                yield {
                    'criterion': p1,
                    'max_features': p2,
                    'max_depth': p3,
                    'min_samples_split': p4,
                    'min_samples_leaf': p5,
                    'class_weight': class_weight
                }
        elif self.kernel == 'xgb':
            max_depth = self.kwargs.get('max_depth', [1, 4, 8, 12])
            learning_rate = self.kwargs.get('learning_rate', [0.1, 0.2])
            n_jobs = [self.kwargs.get('n_jobs', psutil.cpu_count())]
            class_weight = self.kwargs.get('class_weight', [1, 10, 50])
            booster = self.kwargs.get('booster', ['gblinear', 'gbtree', 'dart'])
            n_estimators = self.kwargs.get('n_estimators', [10, 50, 100, 150])
            for (p1, p2, p3, p4, p5, p6) in itertools.product(
                max_depth, learning_rate, booster, n_jobs, class_weight, n_estimators
            ):
                yield {
                    'max_depth': p1,
                    'learning_rate': p2,
                    'booster': p3,
                    'n_jobs': p4,
                    'scale_pos_weight': p5,
                    'n_estimators': p6
                }
        elif self.kernel == 'svm':
            c = self.kwargs.get('c', [pow(2, i) for i in range(-2, 2)])
            svm_kernel = self.kwargs.get('svm_kernel', ['rbf', 'poly', 'sigmoid'])
            for (p1, p2) in itertools.product(c, svm_kernel):
                yield {
                    'C': p1,
                    'kernel': p2,
                    'class_weight': class_weight
                }
        else:
            raise NotImplementedError()

    @staticmethod
    def partition_data__(data, ratio, shuffle=True, multi=True):
        import random
        if not multi:
            size = len(data)
            if shuffle:
                idx = random.sample(range(size), int(size * ratio))
            else:
                idx, step, cnt, init = [], 1.0 / ratio, 0, 0
                while cnt < int(size * ratio):
                    idx.append(int(init))
                    init += step
                    cnt += 1  # advance the counter; otherwise this loop never terminates
            return data[idx]
        else:
            num, size = len(data), len(data[0])
            if shuffle:
                idx = random.sample(range(size), int(size * ratio))
            else:
                idx, step, cnt, init = [], 1.0 / ratio, 0, 0
                while cnt < int(size * ratio):
                    idx.append(int(init))
                    init += step
                    cnt += 1  # advance the counter; otherwise this loop never terminates
            return [data[k][idx] for k in range(num)]

    def deepwalk_paras(self):
        num_walks = self.kwargs.get('num_walks', [10, 20])
        representation_size = self.kwargs.get('representation_size', [32, 64, 128, 256])
        walk_length = self.kwargs.get('walk_length', [32, 64, 128])
        window_size = self.kwargs.get('window_size', [5, 10])
        workers = self.kwargs.get('workers', psutil.cpu_count())
        undirected = self.kwargs.get('undirected', False)
        for (p1, p2, p3, p4) in itertools.product(
            num_walks, representation_size, walk_length, window_size
        ):
            yield {
                'number-walks': p1,
                'representation-size': p2,
                'walk-length': p3,
                'window-size': p4,
                'workers': workers,
                'undirected': undirected
            }

    def return_metric_method(self, opt_metric):
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
        if opt_metric == 'accuracy':
            return accuracy_score
        elif opt_metric == 'precision':
            return precision_score
        elif opt_metric == 'recall':
            return recall_score
        elif opt_metric == 'f1':
            return f1_score
        else:
            raise NotImplementedError('unsupported metric {}'.format(opt_metric))

    def save_model(self, fpath):
        pass

    def load_model(self, fpath, map_location='cuda:0'):
        pass

    def save_shapelets(self, fpath):
        pass

    def load_shapelets(self, fpath, map_location='cuda:0'):
        pass

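# --- Editorial usage sketch (not part of the original file) ---
# A tiny grid search over the parameter generator above, assuming
# scikit-learn and numpy are available. The toy data and the narrowed
# kwargs (criteria, max_feature, ...) are illustrative only.
def _demo_model_utils():
    import numpy as np
    mu = ModelUtils(kernel='rf', criteria=['gini'], max_feature=['log2', None],
                    max_depth=[5], max_split=[2], min_leaf=[1])
    X = np.random.rand(100, 8)
    y = (X[:, 0] > 0.5).astype(int)
    metric = mu.return_metric_method('accuracy')
    best_score, best_paras = -1.0, None
    for paras in mu.clf_paras(balanced=True):
        clf = mu.clf__(**paras).fit(X, y)   # clf__ returns the classifier class itself
        score = metric(y, clf.predict(X))
        if score > best_score:
            best_score, best_paras = score, paras
    print('best accuracy {:.3f} with {}'.format(best_score, best_paras))
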
class Debugger(object):
    """
    Class for debug printing.
    """
    def __init__(self):
        pass

    @staticmethod
    def error_print(msg, debug=True):
        if debug:
            print('[error]' + msg)

    @staticmethod
    def warn_print(msg, debug=True):
        if debug:
            print('[warning]' + msg)

    @staticmethod
    def debug_print(msg, debug=True):
        if debug:
            print('[debug]' + msg + '\r', end='')
            sys.stdout.flush()

    @staticmethod
    def info_print(msg):
        print('[info]' + msg)

    @staticmethod
    def time_print(msg, begin, profiling=False):
        if profiling:
            # time.time() returns a float timestamp
            assert isinstance(begin, float), 'invalid begin time {}'.format(begin)
            print('[info]{}, elapsed for {:.2f}s'.format(msg, time.time() - begin))


class Queue:
    def __init__(self, max_size):
        self.queue = []
        self.max_size = max_size

    def enqueue(self, val):
        # drop the oldest element when the queue is full
        if self.size() == self.max_size:
            self.dequeue()
        self.queue.insert(0, val)

    def dequeue(self):
        if self.is_empty():
            return None
        else:
            return self.queue.pop()

    def size(self):
        return len(self.queue)

    def is_empty(self):
        return self.size() == 0

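# --- Editorial usage sketch (not part of the original file) ---
# Shows the sliding-window behaviour of the bounded Queue above: once
# max_size items are held, enqueue() silently drops the oldest one.
def _demo_queue():
    q = Queue(max_size=3)
    for v in [1, 2, 3, 4, 5]:
        q.enqueue(v)
    # 1 and 2 have been evicted; dequeue() returns the oldest remaining item
    print(q.size(), q.dequeue(), q.dequeue(), q.dequeue())  # 3 3 4 5
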
def convert_string(string, val, cvt_type='float'):
    """
    Convert a string to the given type.
    :param string: input string
    :param val: default return value if conversion fails
    :param cvt_type: conversion type
    :return: value with given type
    """
    try:
        return eval(cvt_type)(string)
    except NameError as _:
        Debugger.warn_print('invalid convert type {}; use float() by default'.format(cvt_type))
        return float(string)
    except ValueError as _:
        Debugger.warn_print('invalid convert value {}; return {} by default'.format(string, val))
        return val


def syscmd(cmd, encoding=''):
    """
    Runs a command on the system, waits for the command to finish, and then
    returns the text output of the command. If the command produces no text
    output, the command's return code will be returned instead.

    :param cmd: command, str
    :param encoding: encoding method, str (utf8, unicode, etc.)
    :return: return code or text output
    """
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE,
              stderr=STDOUT, close_fds=True)
    p.wait()
    output = p.stdout.read()
    if len(output) > 1:
        if encoding:
            return output.decode(encoding)
        else:
            return output
    return p.returncode


--------------------------------------------------------------------------------