├── time2graph ├── __init__.py ├── core │ ├── __init__.py │ ├── __pycache__ │ │ ├── Optimize.cpython-37.pyc │ │ ├── Optimize.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── model_gat.cpython-37.pyc │ │ ├── model_gat.cpython-38.pyc │ │ ├── model_gin.cpython-37.pyc │ │ ├── model_gin.cpython-38.pyc │ │ ├── model_utils.cpython-37.pyc │ │ ├── model_utils.cpython-38.pyc │ │ ├── distance_utils.cpython-37.pyc │ │ ├── distance_utils.cpython-38.pyc │ │ ├── shapelet_utils.cpython-37.pyc │ │ ├── shapelet_utils.cpython-38.pyc │ │ ├── static_shapelets.cpython-37.pyc │ │ ├── static_shapelets.cpython-38.pyc │ │ ├── time_aware_shapelets.cpython-37.pyc │ │ └── time_aware_shapelets.cpython-38.pyc │ ├── model_utils.py │ ├── static_shapelets.py │ ├── distance_utils.py │ ├── Optimize.py │ ├── shapelet_embedding.py │ ├── model_embeds.py │ ├── shapelet_utils.py │ ├── time_aware_shapelets.py │ └── model_gin.py ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── gat.cpython-37.pyc │ │ ├── gat.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── mp_utils.cpython-37.pyc │ │ ├── mp_utils.cpython-38.pyc │ │ ├── base_utils.cpython-37.pyc │ │ ├── base_utils.cpython-38.pyc │ │ ├── gat_utils.cpython-37.pyc │ │ └── gat_utils.cpython-38.pyc │ ├── gat.py │ ├── gat_utils.py │ ├── deep_utils.py │ ├── mp_utils.py │ ├── deep_models.py │ └── base_utils.py └── __pycache__ │ ├── __init__.cpython-37.pyc │ └── __init__.cpython-38.pyc ├── requirements.txt ├── config.py ├── README.md ├── data_load.py └── my_train.py /time2graph/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /time2graph/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /time2graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /time2graph/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/gat.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/gat.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/gat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/gat.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/Optimize.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/Optimize.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/Optimize.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/Optimize.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_gat.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_gat.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_gat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_gat.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_gin.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_gin.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_gin.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_gin.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/mp_utils.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/mp_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/mp_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/mp_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/model_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/model_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/base_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/base_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/base_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/base_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/gat_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/gat_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/utils/__pycache__/gat_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/utils/__pycache__/gat_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/distance_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/distance_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/distance_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/distance_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/shapelet_utils.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/shapelet_utils.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/shapelet_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/shapelet_utils.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/static_shapelets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/static_shapelets.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/static_shapelets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/static_shapelets.cpython-38.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/time_aware_shapelets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/time_aware_shapelets.cpython-37.pyc -------------------------------------------------------------------------------- /time2graph/core/__pycache__/time_aware_shapelets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mliwang/NetworkTrafficPrediction/HEAD/time2graph/core/__pycache__/time_aware_shapelets.cpython-38.pyc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dill>=0.2.5 2 | six>=1.10.0 3 | scipy>=1.3.0 4 | numpy>=1.16.0 5 | scikit_learn>=0.19.1 6 | pandas>=0.23 7 | xgboost>=0.80 8 | torch>=0.4.1 9 | networkx>=2.1 10 | tslearn>=0.2.5 11 | pathos>=0.2 -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from os import path, cpu_count 3 | from time2graph.utils.base_utils import Debugger 4 | 5 | module_path = path.dirname(path.abspath(__file__)) 6 | njobs = cpu_count() 7 | if njobs >= 40: 8 | njobs = int(njobs / 2) 9 | 10 | 11 | 12 | 13 | 14 | __all__ = [ 15 | 'np', 16 | 'path', 17 | 'Debugger', 18 | 'module_path', 19 | 'njobs' 20 | ] 21 | -------------------------------------------------------------------------------- /time2graph/core/model_utils.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from torch.utils.data.sampler import WeightedRandomSampler 3 | 4 | 5 | class NumpyDataset(Dataset): 6 | """ Dataset wrapping numpy ndarrays 7 | Each sample will be retrieved by indexing numpy-arrays along the first dimension. 8 | 9 | Arguments: 10 | *ndarrays (numpy-ndarray): ndarrays that have the same size of the first dimension. 
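    Illustrative usage (a minimal sketch; the arrays below are made-up examples):
        >>> import numpy as np
        >>> dataset = NumpyDataset(np.zeros((8, 5)), np.arange(8))
        >>> x0, y0 = dataset[0]   # one slice from each wrapped array
        >>> len(dataset)
        8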
11 |     """
12 |     def __init__(self, *ndarrays):
13 |         assert all(ndarrays[0].shape[0] == ndarray.shape[0] for ndarray in ndarrays)
14 |         self.ndarrays = ndarrays
15 |
16 |     def __getitem__(self, idx):
17 |         return tuple(ndarray[idx] for ndarray in self.ndarrays)
18 |
19 |     def __len__(self):
20 |         return self.ndarrays[0].shape[0]
21 |
22 |
23 | class StratifiedSampler(WeightedRandomSampler):
24 |     def __init__(self, label, num_class):
25 |         self.num_class = num_class
26 |         weights = self.__get_weight(label=label)
27 |         super(StratifiedSampler, self).__init__(weights=weights, num_samples=len(weights))
28 |
29 |     def __get_weight(self, label):
30 |         num_class = self.num_class
31 |         cnt = [0] * num_class
32 |         for lb in label:
33 |             cnt[lb] += 1
34 |         weight_per_class, total = [0.0] * num_class, float(sum(cnt))
35 |         for k in range(num_class):
36 |             weight_per_class[k] = total / float(cnt[k])
37 |         ret = [0.0] * len(label)
38 |         for idx, val in enumerate(label):
39 |             ret[idx] = weight_per_class[val]
40 |         return ret
41 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Quick Links
2 |
3 | - [Building and Testing](#building-and-testing)
4 | - [Usage](#usage)
5 | - [Performance](#performance)
6 | - [Reference](#reference)
7 |
8 | ## Building and Testing
9 |
10 | This project is implemented primarily in Python 3.7, with several dependencies listed below. We have tested the framework on Ubuntu 16.04.5 LTS with kernel 4.4.0, and it is expected to build and run easily on any regular Unix-like system.
11 |
12 | ### Dependencies
13 |
14 | - [Python 3.7](https://www.python.org).
15 | Version 3.7.0 has been tested. Higher versions are expected to be compatible with the current implementation, while there may be syntax errors or conflicts under Python 2.x.
16 |
17 | - [PyTorch](https://pytorch.org).
18 |
19 | Version 1.7.0 has been tested. You can find installation instructions [here](https://pytorch.org/get-started/locally/). Note that GPU support is **ENCOURAGED**, as it greatly boosts training efficiency.
20 |
21 |
22 | - [Other Python modules](https://pypi.python.org). Some other Python module dependencies are listed in ```requirements.txt```, which can be easily installed with pip:
23 |
24 | ```bash
25 | pip install -r requirements.txt
26 | ```
27 |
28 | ## Reference
29 | [1] R. Wang, Y. Zhang, L. Peng, G. Fortino and P. -H. Ho, "Time-Varying-Aware Network Traffic Prediction Via Deep Learning in IIoT," in IEEE Transactions on Industrial Informatics, vol. 18, no. 11, pp. 8129-8137, Nov. 2022, doi: 10.1109/TII.2022.3163558.
30 | 31 | ``` 32 | @ARTICLE{9745370, 33 | author={Wang, Ranran and Zhang, Yin and Peng, Limei and Fortino, Giancarlo and Ho, Pin-Han}, 34 | journal={IEEE Transactions on Industrial Informatics}, 35 | title={Time-Varying-Aware Network Traffic Prediction Via Deep Learning in IIoT}, 36 | year={2022}, 37 | volume={18}, 38 | number={11}, 39 | pages={8129-8137}, 40 | doi={10.1109/TII.2022.3163558}} 41 | ```# NetworkTrafficPrediction -------------------------------------------------------------------------------- /time2graph/utils/gat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .gat_utils import GraphAttentionLayer 5 | 6 | 7 | class GAT(nn.Module): 8 | def __init__(self, nfeat, nhid, nnodes, nclass, dropout, alpha, nheads, aggregate): 9 | """Dense version of GAT.""" 10 | super(GAT, self).__init__() 11 | self.dropout = dropout 12 | # sum or aggregate flag 13 | self.aggregate = aggregate 14 | self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads)] 15 | for i, attention in enumerate(self.attentions): 16 | self.add_module('attention_{}'.format(i), attention) 17 | 18 | # self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, 19 | # alpha=alpha, concat=False, reshape=True) 20 | # self.add_module('attention_out', self.out_att) 21 | 22 | self.hidden_size = nnodes * nhid * nheads if self.aggregate else nnodes * nhid 23 | self.output = nn.Sequential( 24 | nn.Linear(self.hidden_size, self.hidden_size * 2), 25 | nn.ReLU(), 26 | nn.Linear(self.hidden_size * 2, nclass) 27 | ) 28 | 29 | def forward(self, x, adj, feat_flag=False): 30 | x = F.dropout(x, self.dropout, training=self.training) 31 | x_head = [att(x, adj) for att in self.attentions] 32 | if self.aggregate: 33 | x = torch.cat(x_head, dim=2).view(x.size()[0], -1) 34 | else: 35 | x = torch.sum(torch.stack(x_head, dim=2), dim=2).view(x.size()[0], -1) 36 | x = F.dropout(x, self.dropout, training=self.training) 37 | if feat_flag: 38 | return F.elu(x) 39 | else: 40 | x = self.output(F.elu(x)) 41 | return F.log_softmax(x, dim=1) 42 | 43 | 44 | def accuracy_torch(output, labels): 45 | preds = output.max(1)[1].type_as(labels) 46 | correct = preds.eq(labels).double() 47 | correct = correct.sum() 48 | return correct / len(labels) 49 | 50 | 51 | def label_np(output, cuda): 52 | if cuda: 53 | return output.max(1)[1].cpu().numpy() 54 | else: 55 | return output.max(1)[1].numpy() 56 | 57 | 58 | def output_np(output, cuda): 59 | if cuda: 60 | return output.detach().cpu().numpy() 61 | else: 62 | return output.detach().numpy() 63 | -------------------------------------------------------------------------------- /time2graph/utils/gat_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.utils.data import DataLoader 5 | 6 | 7 | class GraphAttentionLayer(nn.Module): 8 | """ 9 | Simple GAT layer, similar to https://arxiv.org/abs/1710.10903 10 | """ 11 | 12 | def __init__(self, in_features, out_features, dropout, alpha, concat=True, reshape=False): 13 | super(GraphAttentionLayer, self).__init__() 14 | self.dropout = dropout 15 | self.in_features = in_features 16 | self.out_features = out_features 17 | self.alpha = alpha 18 | self.concat = concat 19 | self.reshape = reshape 20 | 21 | self.W = nn.Parameter(torch.zeros(size=(in_features, 
out_features)), requires_grad=True) 22 | nn.init.xavier_uniform_(self.W.data, gain=1.414) 23 | self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)), requires_grad=True) 24 | nn.init.xavier_uniform_(self.a.data, gain=1.414) 25 | 26 | self.leakyrelu = nn.LeakyReLU(self.alpha) 27 | 28 | def forward(self, input, adj): 29 | h = torch.matmul(input, self.W) 30 | nbatch, N = h.size()[0], h.size()[1] 31 | a_input = torch.cat([h.repeat(1, 1, N).view(nbatch, N * N, -1), 32 | h.repeat(1, N, 1)], dim=2).view(nbatch, N, -1, 2 * self.out_features) 33 | e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(3)) 34 | 35 | zero_vec = -9e15 * torch.ones_like(e) 36 | attention = torch.where(adj > 0, e, zero_vec) 37 | attention = F.softmax(attention, dim=2) 38 | attention = F.dropout(attention, self.dropout, training=self.training) 39 | h_prime = torch.matmul(attention, h) 40 | if self.reshape: 41 | h_prime = h_prime.view(nbatch, -1) 42 | 43 | if self.concat: 44 | return F.elu(h_prime) 45 | else: 46 | return h_prime 47 | 48 | def __repr__(self): 49 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' 50 | 51 | 52 | class GATDataloader(DataLoader): 53 | def __init__(self, *args, **kwargs): 54 | super(GATDataloader, self).__init__(*args, **kwargs) 55 | 56 | 57 | class GATDataset(object): 58 | def __init__(self, feat, adj, y=None): 59 | if y is not None: 60 | self.data = [(feat[k], adj[k], y[k]) for k in range(len(y))] 61 | else: 62 | self.data = [(feat[k], adj[k]) for k in range(len(adj))] 63 | 64 | def __getitem__(self, item): 65 | return self.data[item] 66 | 67 | def __len__(self): 68 | return len(self.data) 69 | -------------------------------------------------------------------------------- /time2graph/core/static_shapelets.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scipy.stats import entropy 3 | from ..utils.base_utils import Queue 4 | from .model_utils import * 5 | from .shapelet_utils import * 6 | from .distance_utils import * 7 | 8 | 9 | def __static_shapelet_candidate_loss(cand, time_series_set, label, warp, num_segment, seg_length, measurement, **kwargs): 10 | assert seg_length == cand.shape[0] and num_segment * seg_length == time_series_set.shape[1] 11 | distances = np.zeros(time_series_set.shape[0], dtype=float) 12 | for i in range(time_series_set.shape[0]): 13 | distances[i] = pattern_distance_no_timing(pattern=cand, time_series=time_series_set[i], warp=warp, measurement=measurement) 14 | positive_distance = distances[label == 1] 15 | negative_distance = distances[label == 0] 16 | max_val, min_val = np.max(distances), np.min(distances) 17 | num_bins = int(max_val - min_val) + 1 18 | positive_norm = np.histogram(a=positive_distance, bins=num_bins, range=(min_val, max_val), density=True)[0] 19 | negative_norm = np.histogram(a=negative_distance, bins=num_bins, range=(min_val, max_val), density=True)[0] 20 | positive_norm[positive_norm == 0] = 1e-3 21 | negative_norm[negative_norm == 0] = 1e-3 22 | return -(entropy(negative_norm, positive_norm) + entropy(positive_norm, negative_norm)) 23 | 24 | 25 | def __static_shapelet_candidate_loss_factory(time_series_set, label, warp, num_segment, seg_length, measurement, **kwargs): 26 | def __main__(pid, args, queue): 27 | ret = [] 28 | for cand in args: 29 | loss = __static_shapelet_candidate_loss( 30 | cand=cand, time_series_set=time_series_set, label=label, warp=warp, num_segment=num_segment, 31 | seg_length=seg_length, 
measurement=measurement, **kwargs 32 | ) 33 | ret.append((cand, loss)) 34 | queue.put(0) 35 | return ret 36 | return __main__ 37 | 38 | 39 | def learn_static_shapelets(time_series_set, label, K, C, warp, num_segment, seg_length, measurement, **kwargs): 40 | cands = generate_shapelet_candidate(time_series_set=time_series_set, num_segment=num_segment, 41 | seg_length=seg_length, candidate_size=C, **kwargs) 42 | parmap = ParMap( 43 | work=__static_shapelet_candidate_loss_factory( 44 | time_series_set=time_series_set, label=label, warp=warp, num_segment=num_segment, seg_length=seg_length, 45 | measurement=measurement, **kwargs 46 | ), 47 | monitor=parallel_monitor(msg='learning static shapelets', size=len(cands), 48 | debug=kwargs.get('debug', True)), 49 | njobs=kwargs.get('njobs', NJOBS) 50 | ) 51 | return sorted(parmap.run(data=cands), key=lambda x: x[-1])[:K] 52 | -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Oct 24 16:07:22 2021 4 | 5 | @author: Administrator 6 | """ 7 | import numpy as np 8 | import pandas as pd 9 | import random 10 | import pickle 11 | import os 12 | from tqdm import tqdm 13 | def getsingleGroup(pro_data,group,src_len,tar_len,step): 14 | ''' 15 | pro_data 全部数据 16 | group 单组 key 17 | 单组生成序列 18 | 19 | ''' 20 | curent_df=pro_data.loc[(pro_data['hostname']==group[0]) & (pro_data['series']==group[1])] 21 | tw=src_len+tar_len#总的采样窗口大小,前面是X,后面部分的Mean是Y 22 | step=step 23 | X=[] 24 | Y=[] 25 | 26 | L=len(curent_df) 27 | #按时间排序 28 | curent_df['time'] = pd.to_datetime(curent_df['time_window']) 29 | curent_df.sort_values('time', inplace=True) 30 | useful_column=[ 'Mean', 'SD', 'Open', 'High','Low', 'Close', 'Volume']#取特征列 31 | 32 | for i in range(0,L-tw,step): 33 | # train_seq = df_tmp[features].values[i:i+tw] 34 | train_seq =curent_df[i:i+tw][useful_column]# 35 | X.append(train_seq.values[i:i+src_len]) 36 | Y.append(train_seq[i+src_len:]['Mean'].values) 37 | if i>L-tw and i1000:#控制内存 43 | X=X[-1000:] 44 | Y=Y[-1000:] 45 | return np.array(X),np.array(Y) 46 | 47 | def get_dataset(inputdir,src_len,tar_len,step=5,train_probility=0.8,sample_pro=10000): 48 | 49 | if os.path.exists("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len)): 50 | train=pickle.load(open("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'rb'))#生成样本集 51 | valid=pickle.load(open("valid_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'rb')) 52 | print("数据:",train['X'].shape) 53 | return train['X'],train['Y'],valid['X'],valid['Y'] 54 | else: 55 | pro_data=pd.read_csv(inputdir) 56 | all_sample=[] 57 | for k1,k2 in pro_data.groupby(by=['hostname','series']): 58 | all_sample.append(k1) 59 | all_sample=all_sample[:sample_pro]#少搞点试试 60 | random.shuffle(all_sample) 61 | print('总采样点数:',len(all_sample))#19005 62 | train_all_sample=all_sample[:int(len(all_sample)*train_probility)] 63 | test_all_sample=list(filter(lambda x: x not in train_all_sample, all_sample)) 64 | print('训练样本',len(train_all_sample),'测试样本:',len(test_all_sample)) 65 | print('生成训练样本...') 66 | train_x,train_y=[],[] 67 | for id_ in tqdm(train_all_sample): 68 | x_i,y_i=getsingleGroup(pro_data,id_,src_len,tar_len,step)#一组样本 69 | train_x.extend(x_i) 70 | train_y.extend(y_i) 71 | 72 | with open("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'wb') as f: 73 | pickle.dump({'X':np.array(train_x),'Y':np.array(train_y)},f) 74 | 75 
| print('生成测试样本...') 76 | valid_x,valid_y=[],[] 77 | for id_ in tqdm(test_all_sample): 78 | x_i,y_i=getsingleGroup(pro_data,id_,src_len,tar_len,step)#一组样本 79 | valid_x.extend(x_i) 80 | valid_y.extend(y_i) 81 | 82 | with open("valid_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'wb') as f: 83 | pickle.dump({'X':np.array(valid_x),'Y':np.array(valid_y)},f) 84 | return np.array(train_x),np.array(train_y),np.array(valid_x),np.array(valid_y) -------------------------------------------------------------------------------- /time2graph/utils/deep_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.autograd import Variable 4 | import torch.nn.functional as F 5 | from torch.utils.data import DataLoader 6 | from .base_utils import Debugger, evaluate_performance 7 | 8 | 9 | def latent_loss(z_mean, z_std): 10 | mean_2 = z_mean * z_mean 11 | std_2 = z_std * z_std 12 | return 0.5 * torch.mean(mean_2 + std_2 - torch.log(std_2) - 1) 13 | 14 | 15 | class DeepDataloader(DataLoader): 16 | def __init__(self, *args, **kwargs): 17 | super(DeepDataloader, self).__init__(*args, **kwargs) 18 | 19 | 20 | class DeepDataset(object): 21 | def __init__(self, x, y): 22 | self.x = x 23 | self.y = y 24 | 25 | def __getitem__(self, item): 26 | return self.x[item], self.y[item] 27 | 28 | def __len__(self): 29 | return len(self.y) 30 | 31 | 32 | def train_RNNs(epoch, dataloader, rnn, criterion, optimizer, debug, gpu_enable): 33 | rnn.train() 34 | for i, (sequences, target) in enumerate(dataloader, 0): 35 | sequences = sequences.double() 36 | if gpu_enable: 37 | sequences = sequences.cuda() 38 | target = target.cuda() 39 | sequences = Variable(sequences) 40 | target = Variable(target) 41 | output = rnn(sequences) 42 | loss = criterion(output, target) 43 | optimizer.zero_grad() 44 | loss.backward() 45 | optimizer.step() 46 | 47 | if i % int(len(dataloader) / 10 + 1) == 0: 48 | Debugger.debug_print('[{}][{}][{}], Loss: {}'.format( 49 | epoch, i, len(dataloader), loss.item()), debug=debug) 50 | 51 | 52 | def train_VAE(epoch, dataloader, vae, criterion, optimizer, debug, gpu_enable): 53 | vae.train() 54 | for i, (sequences, target) in enumerate(dataloader, 0): 55 | optimizer.zero_grad() 56 | sequences = sequences.double() 57 | if gpu_enable: 58 | sequences = sequences.cuda() 59 | target = target.cuda() 60 | sequences = Variable(sequences) 61 | output = vae(sequences) 62 | loss = criterion(output, sequences) + latent_loss(vae.z_mean, vae.z_sigma) 63 | loss.backward() 64 | optimizer.step() 65 | Debugger.debug_print('[{}][{}][{}], Loss: {}'.format( 66 | epoch, i, len(dataloader), loss.item(), debug=debug)) 67 | 68 | 69 | def test_DeepModels(dataloader, rnn, criterion, debug, gpu_enable): 70 | for th in range(5, 20, 1): 71 | test_loss = 0 72 | correct = 0 73 | rnn.eval() 74 | y_pred, y_test = [], [] 75 | th = th / 20 76 | for i, (sequences, target) in enumerate(dataloader, 0): 77 | rnn.zero_grad() 78 | sequences = sequences.double() 79 | if gpu_enable: 80 | sequences = sequences.cuda() 81 | target = target.cuda() 82 | sequences = Variable(sequences) 83 | target = Variable(target) 84 | output = rnn(sequences) 85 | test_loss += criterion(output, target).item() 86 | pred = F.softmax(output, dim=1)[:, 1].data.cpu().numpy() 87 | tmp = np.zeros(len(pred)) 88 | tmp[pred >= th] = 1 89 | y_pred += list(tmp) 90 | y_test += list(target.cpu().numpy()) 91 | test_loss /= len(dataloader.dataset) 92 | y_pred, y_test = np.array(y_pred, 
dtype=np.int).reshape(-1), np.array(y_test, dtype=np.int).reshape(-1) 93 | accu, prec, recall, f1 = evaluate_performance(y_pred=y_pred, y_true=y_test) 94 | Debugger.info_print('res: accu {:.4f}, prec {:.4f}, recall {:.4f}, f1 {:.4f}'.format( 95 | accu, prec, recall, f1 96 | )) 97 | Debugger.debug_print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format( 98 | test_loss, correct, len(dataloader.dataset), 99 | 100. * correct / len(dataloader.dataset)), debug=debug) 100 | -------------------------------------------------------------------------------- /time2graph/utils/mp_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import itertools 3 | import sys 4 | import dill 5 | import contextlib 6 | import math 7 | from pathos.helpers import mp 8 | import numpy as np 9 | 10 | NJOBS = mp.cpu_count() 11 | if NJOBS >= 20: 12 | NJOBS = 20 13 | 14 | __all__ = [ 15 | 'NJOBS', 16 | 'ParMap', 17 | 'parallel_monitor' 18 | ] 19 | 20 | 21 | class ParMap(object): 22 | def __init__(self, work, monitor=None, njobs=NJOBS, maxtasksperchild=100): 23 | self.work_func = work 24 | self.monitor_func = monitor 25 | self.__njobs = njobs 26 | self.__mtpc = maxtasksperchild 27 | 28 | self.__pool = None 29 | 30 | def close(self): 31 | if self.__pool is not None: 32 | self.__pool.close() 33 | self.__pool.join() 34 | self.__pool = None 35 | 36 | def __del__(self): 37 | self.close() 38 | 39 | @property 40 | def njobs(self): 41 | return self.__njobs 42 | 43 | @njobs.setter 44 | def njobs(self, n): 45 | self.__njobs = n 46 | self.close() 47 | 48 | def default_chunk(self, dlen): 49 | return int(math.ceil(float(dlen) / self.njobs)) 50 | 51 | def run(self, data, chunk=None, shuffle=False): 52 | if chunk is None: 53 | chunk = self.default_chunk(len(data)) 54 | 55 | if shuffle: 56 | data, order, invorder = shuffle_sample(data) 57 | else: 58 | invorder = None 59 | 60 | slices = slice_sample(data, chunk=chunk) 61 | res = self.run_slices(slices) 62 | 63 | if shuffle: 64 | res = apply_order(res, invorder) 65 | 66 | return res 67 | 68 | def run_slices(self, slices): 69 | mgr = mp.Manager() 70 | report_queue = mgr.Queue() 71 | if self.monitor_func is not None: 72 | monitor = mp.Process(target=self.monitor_func, args=(report_queue,)) 73 | monitor.start() 74 | else: 75 | monitor = None 76 | 77 | if self.njobs == 1: 78 | res = [] 79 | for slc in slices: 80 | res.append(self.work_func(None, slc, report_queue)) 81 | else: 82 | dill_work_func = dill.dumps(self.work_func) 83 | with contextlib.closing(mp.Pool(self.njobs, maxtasksperchild=self.__mtpc)) as pool: 84 | res = pool.map(func_wrapper, [[dill_work_func, slc, report_queue] for slc in slices]) 85 | res = list(itertools.chain.from_iterable(res)) 86 | 87 | report_queue.put(StopIteration()) 88 | if monitor is not None: 89 | monitor.join() 90 | 91 | return res 92 | 93 | 94 | def func_wrapper(args): 95 | func = dill.loads(args[0]) 96 | return func(mp.current_process().ident, *args[1:]) 97 | 98 | 99 | def apply_order(sample, order): 100 | return [sample[o] for o in order] 101 | 102 | 103 | def shuffle_sample(sample): 104 | order = np.random.permutation(np.arange(len(sample))) 105 | invorder = np.zeros((len(sample), ), dtype='int32') 106 | invorder[order] = np.arange(len(sample)) 107 | 108 | return apply_order(sample, order), order, invorder 109 | 110 | 111 | def slice_sample(sample, chunk=None, nslice=None): 112 | slices = [] 113 | if chunk is None: 114 | chunk = int(len(sample) / nslice) 115 | 
else: 116 | if nslice is not None: 117 | raise RuntimeError("chunk ({}) and slice ({}) should not be specified simultaneously".format(chunk, nslice)) 118 | 119 | curstart = 0 120 | while True: 121 | if curstart >= len(sample): 122 | break 123 | slices.append(sample[curstart:min(curstart + chunk, len(sample))]) 124 | curstart += chunk 125 | 126 | return slices 127 | 128 | 129 | def parallel_monitor(msg, size, debug): 130 | def monitor(queue): 131 | cnt = 0 132 | while True: 133 | obj = queue.get() 134 | if isinstance(obj, StopIteration): 135 | break 136 | if isinstance(obj, int): 137 | if obj != 0: 138 | cnt += obj 139 | else: 140 | cnt += 1 141 | else: 142 | cnt += 1 143 | if debug: 144 | print('[debug]' + '{} executed by {:.2f}%'.format(msg, float(cnt) / size * 100) + '\r', end='') 145 | sys.stdout.flush() 146 | # Debugger.debug_print(msg='{} executed by {:.2f}%'.format(msg, float(cnt) / size * 100), 147 | # debug=debug) 148 | return monitor 149 | -------------------------------------------------------------------------------- /time2graph/utils/deep_models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | 8 | class LSTMClassifier(nn.Module): 9 | def __init__(self, data_size, hidden_size, output_size, 10 | dropout, hidden_dim=128, gpu_enable=False): 11 | super(LSTMClassifier, self).__init__() 12 | self.data_size = data_size 13 | self.hidden_size = hidden_size 14 | self.output_size = output_size 15 | self.gpu_enable = gpu_enable 16 | self.model = nn.LSTM(data_size, hidden_size, batch_first=True).double() 17 | self.hidden2out = nn.Sequential( 18 | nn.Linear(hidden_size, hidden_dim), 19 | nn.ReLU(), 20 | nn.Linear(hidden_dim, output_size) 21 | ) 22 | self.dropout = nn.Dropout(p=dropout) 23 | 24 | def init_hidden(self, batch_size): 25 | if self.gpu_enable: 26 | return ( 27 | Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()), 28 | Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()) 29 | ) 30 | else: 31 | return ( 32 | Variable(torch.zeros(1, batch_size, self.hidden_size).double()), 33 | Variable(torch.zeros(1, batch_size, self.hidden_size).double()) 34 | ) 35 | 36 | def forward(self, X): 37 | hidden = self.init_hidden(batch_size=len(X)) 38 | outputs, (h_n, c_n) = self.model(X.double(), hidden) 39 | # return self.softmax(self.hidden2out(outputs)) 40 | return self.hidden2out(h_n[0]) 41 | 42 | 43 | class GRUClassifier(nn.Module): 44 | def __init__(self, data_size, hidden_size, output_size, dropout, 45 | gpu_enable=False): 46 | super(GRUClassifier, self).__init__() 47 | self.data_size = data_size 48 | self.hidden_size = hidden_size 49 | self.output_size = output_size 50 | self.gpu_enable = gpu_enable 51 | self.model = nn.GRU(data_size, hidden_size, batch_first=True).double() 52 | self.hidden2out = nn.Linear(hidden_size, output_size) 53 | 54 | def init_hidden(self, batch_size): 55 | if self.gpu_enable: 56 | return Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()) 57 | else: 58 | return Variable(torch.zeros(1, batch_size, self.hidden_size).double()) 59 | 60 | def forward(self, X): 61 | hidden = self.init_hidden(batch_size=len(X)) 62 | outputs, (h_n, c_n) = self.model(X.double(), hidden) 63 | return self.hidden2out(h_n[0]) 64 | 65 | 66 | class EnDecoder(nn.Module): 67 | def __init__(self, D_in, H, D_out): 68 | super(EnDecoder, self).__init__() 69 | 
self.linear_1 = nn.Linear(D_in, H) 70 | self.linear_2 = nn.Linear(H, D_out) 71 | 72 | def forward(self, x): 73 | x = F.relu(self.linear_1(x)) 74 | return F.relu(self.linear_2(x)) 75 | 76 | 77 | class VAE(nn.Module): 78 | def __init__(self, encoder, decoder, encode_dim, latent_dim): 79 | super(VAE, self).__init__() 80 | self.encoder = encoder 81 | self.decoder = decoder 82 | self.encode_dim = encode_dim 83 | self.latent_dim = latent_dim 84 | self.__enc_mu = nn.Linear(encode_dim, latent_dim) 85 | self.__enc_log_sigma = nn.Linear(encode_dim, latent_dim) 86 | 87 | def __sample_latent(self, h_enc): 88 | mu = self.__enc_mu(h_enc) 89 | log_sigma = self.__enc_log_sigma(h_enc) 90 | sigma = torch.exp(log_sigma) 91 | std_z = torch.from_numpy(np.random.normal(0, 1, size=sigma.size())).double() 92 | self.z_mean = mu 93 | self.z_sigma = sigma 94 | return mu + sigma * Variable(std_z, requires_grad=False) 95 | 96 | def forward(self, state): 97 | h_enc = self.encoder(state) 98 | z = self.__sample_latent(h_enc=h_enc) 99 | return self.decoder(z) 100 | 101 | 102 | class MLP(nn.Module): 103 | def __init__(self, data_size, hidden_size, output_size, n_class=2): 104 | super(MLP, self).__init__() 105 | self.data_size = data_size 106 | self.hidden_size = hidden_size 107 | self.output_size = output_size 108 | self.hidden_layer = nn.Linear(data_size, hidden_size) 109 | self.output_layer = nn.Linear(hidden_size, output_size) 110 | self.out = nn.Linear(output_size, n_class) 111 | 112 | def forward(self, x): 113 | x = x.view(self.batch_size, self.data_size) 114 | return self.out(F.relu(self.output_layer(F.relu(self.hidden_layer(x))))) 115 | -------------------------------------------------------------------------------- /time2graph/core/distance_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def greedy_dtw_path(x, y, warp, dist=lambda x, y: np.linalg.norm(x - y)): 5 | if np.ndim(x) == 1: 6 | x = x.reshape(-1, 1) 7 | if np.ndim(y) == 1: 8 | y = y.reshape(-1, 1) 9 | nrows, ncols = x.shape[0], y.shape[0] 10 | ridx, cidx, rpath, cpath = 0, 0, [0], [0] 11 | while ridx < nrows - 1 and cidx < ncols - 1: 12 | rdist = dist(x[ridx + 1], y[cidx]) 13 | cdist = dist(x[ridx], y[cidx + 1]) 14 | ddist = dist(x[ridx + 1], y[cidx + 1]) 15 | if ddist < rdist and ddist < cdist: 16 | ridx += 1 17 | cidx += 1 18 | elif rdist < cdist: 19 | if ridx < cidx + warp: 20 | ridx += 1 21 | else: 22 | cidx += 1 23 | else: 24 | if cidx < ridx + warp: 25 | cidx += 1 26 | else: 27 | ridx += 1 28 | rpath.append(ridx) 29 | cpath.append(cidx) 30 | for k in range(ridx + 1, nrows): 31 | rpath.append(k) 32 | cpath.append(ncols - 1) 33 | for k in range(cidx + 1, ncols): 34 | cpath.append(k) 35 | rpath.append(nrows - 1) 36 | return np.array(rpath), np.array(cpath) 37 | 38 | 39 | def parameterized_gdtw_npy(x, y, w, warp, dist=lambda x, y: np.linalg.norm(x - y)): 40 | if np.ndim(x) == 1: 41 | x = x.reshape(-1, 1) 42 | if np.ndim(y) == 1: 43 | y = y.reshape(-1, 1) 44 | dpath = greedy_dtw_path(x=x, y=y, dist=dist, warp=warp) 45 | return dist((x * np.abs(w).reshape(len(w), -1))[dpath[0]], y[dpath[1]]) 46 | 47 | 48 | def expand_array(y, warp): 49 | size = y.shape[0] 50 | tmp_y = np.concatenate((y[size - warp: size, :], y, y[: warp, :]), axis=0) 51 | return np.array([tmp_y[k: (k+2 * warp + 1)] for k in range(size)], dtype=np.float32) 52 | 53 | 54 | def softmax(x): 55 | """Compute softmax values for each sets of scores in x.""" 56 | # def __errcall(type, msg): 57 | # 
Debugger.info_print(msg='RuntimeWarning: {}-{}, {}'.format(type, msg, x)) 58 | # np.seterrcall(__errcall) 59 | # np.seterr(divide='call') 60 | x -= np.max(x) 61 | return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True) 62 | 63 | 64 | def softmax_1d(x): 65 | x -= np.max(x) 66 | return np.exp(x) / np.sum(np.exp(x), keepdims=True) 67 | 68 | 69 | def parameterized_gw_npy(x, y, w, warp): 70 | distance = np.sum((x.reshape(x.shape[0], -1, x.shape[1]) - expand_array(y=y, warp=warp)) ** 2, 71 | axis=1) 72 | ''' 73 | TODO 74 | 这里是可以改进的点,原文通过求离shapelet最近的片段的距离来代表当前shapelet和序列的距离 75 | ''' 76 | 77 | softmin_distance = np.sum(softmax(-distance.astype(np.float64)).astype(np.float32) * distance, 78 | axis=1) 79 | return np.sqrt(np.sum(softmin_distance * np.abs(w))) 80 | 81 | 82 | def pattern_distance_time_aware(pattern, time_series, local_factor, global_factor, warp, 83 | init, measurement): 84 | if measurement == 'gw': 85 | dist = parameterized_gw_npy 86 | elif measurement == 'gdtw': 87 | dist = parameterized_gdtw_npy 88 | else: 89 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 90 | num_segment = int(time_series.shape[0] / pattern.shape[0]) 91 | seg_length = pattern.shape[0] 92 | assert init + num_segment <= len(global_factor) 93 | time_series = time_series.reshape(num_segment, seg_length, -1) 94 | ret = np.zeros(num_segment, np.float32).reshape(-1) 95 | for k in range(num_segment): 96 | ret[k] = dist(x=pattern, y=time_series[k], w=local_factor, warp=warp) 97 | return np.sum(softmax_1d(-ret * np.abs(global_factor[init: init + num_segment])) 98 | * ret * np.abs(global_factor[init: init + num_segment])) 99 | 100 | 101 | def pattern_distance_no_timing(pattern, time_series, warp, measurement): 102 | if measurement == 'gw': 103 | dist = parameterized_gw_npy 104 | elif measurement == 'gdtw': 105 | dist = parameterized_gdtw_npy 106 | else: 107 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 108 | num_segment = int(time_series.shape[0] / pattern.shape[0]) 109 | seg_length = pattern.shape[0] 110 | w = np.ones(seg_length, dtype=np.float32).reshape(-1) 111 | assert time_series.shape[0] == num_segment * pattern.shape[0] 112 | time_series = time_series.reshape(num_segment, pattern.shape[0], -1) 113 | ret = np.zeros(num_segment, np.float32).reshape(-1) 114 | for k in range(num_segment): 115 | ret[k] = dist(x=pattern, y=time_series[k], w=w, warp=warp) 116 | return np.sum(softmax_1d(-ret) * ret) 117 | -------------------------------------------------------------------------------- /time2graph/core/Optimize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 4 15:53:35 2021 4 | 5 | @author: Administrator 6 | """ 7 | import torch 8 | from collections import OrderedDict 9 | from torch.optim import Optimizer 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from typing import Callable, Iterable, Tuple 12 | import math 13 | import numpy as np 14 | import torch.nn as nn 15 | 16 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 17 | """ 18 | Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after 19 | a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. 20 | Args: 21 | optimizer (:class:`~torch.optim.Optimizer`): 22 | The optimizer for which to schedule the learning rate. 
23 |         num_warmup_steps (:obj:`int`):
24 |             The number of steps for the warmup phase.
25 |         num_training_steps (:obj:`int`):
26 |             The total number of training steps.
27 |         last_epoch (:obj:`int`, `optional`, defaults to -1):
28 |             The index of the last epoch when resuming training.
29 |     Return:
30 |         :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
31 |     """
32 |
33 |     def lr_lambda(current_step: int):
34 |         if current_step < num_warmup_steps:
35 |             return float(current_step) / float(max(1, num_warmup_steps))
36 |         return max(
37 |             0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
38 |         )
39 |
40 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
41 |
42 | class AdamW(Optimizer):
43 |     """
44 |     Implements Adam algorithm with weight decay fix as introduced in `Decoupled Weight Decay Regularization
45 |     <https://arxiv.org/abs/1711.05101>`__.
46 |     Parameters:
47 |         params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
48 |             Iterable of parameters to optimize or dictionaries defining parameter groups.
49 |         lr (:obj:`float`, `optional`, defaults to 1e-3):
50 |             The learning rate to use.
51 |         betas (:obj:`Tuple[float,float]`, `optional`, defaults to (0.9, 0.999)):
52 |             Adam's betas parameters (b1, b2).
53 |         eps (:obj:`float`, `optional`, defaults to 1e-6):
54 |             Adam's epsilon for numerical stability.
55 |         weight_decay (:obj:`float`, `optional`, defaults to 0):
56 |             Decoupled weight decay to apply.
57 |         correct_bias (:obj:`bool`, `optional`, defaults to `True`):
58 |             Whether or not to correct bias in Adam (for instance, in Bert TF repository they use :obj:`False`).
59 |     """
60 |
61 |     def __init__(
62 |         self,
63 |         params: Iterable[torch.nn.parameter.Parameter],
64 |         lr: float = 1e-3,
65 |         betas: Tuple[float, float] = (0.9, 0.999),
66 |         eps: float = 1e-6,
67 |         weight_decay: float = 0.0,
68 |         correct_bias: bool = True,
69 |     ):
70 |         if lr < 0.0:
71 |             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
72 |         if not 0.0 <= betas[0] < 1.0:
73 |             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
74 |         if not 0.0 <= betas[1] < 1.0:
75 |             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
76 |         if not 0.0 <= eps:
77 |             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
78 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
79 |         super().__init__(params, defaults)
80 |
81 |     def step(self, closure: Callable = None):
82 |         """
83 |         Performs a single optimization step.
84 |         Arguments:
85 |             closure (:obj:`Callable`, `optional`): A closure that reevaluates the model and returns the loss.
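        Return:
            The loss re-evaluated by ``closure`` when one is supplied; otherwise ``None``.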
86 | """ 87 | loss = None 88 | if closure is not None: 89 | loss = closure() 90 | 91 | for group in self.param_groups: 92 | for p in group["params"]: 93 | if p.grad is None: 94 | continue 95 | grad = p.grad.data 96 | if grad.is_sparse: 97 | raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") 98 | 99 | state = self.state[p] 100 | 101 | # State initialization 102 | if len(state) == 0: 103 | state["step"] = 0 104 | # Exponential moving average of gradient values 105 | state["exp_avg"] = torch.zeros_like(p.data) 106 | # Exponential moving average of squared gradient values 107 | state["exp_avg_sq"] = torch.zeros_like(p.data) 108 | 109 | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] 110 | beta1, beta2 = group["betas"] 111 | 112 | state["step"] += 1 113 | 114 | # Decay the first and second moment running average coefficient 115 | # In-place operations to update the averages at the same time 116 | exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1) 117 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) 118 | denom = exp_avg_sq.sqrt().add_(group["eps"]) 119 | 120 | step_size = group["lr"] 121 | if group["correct_bias"]: # No bias correction for Bert 122 | bias_correction1 = 1.0 - beta1 ** state["step"] 123 | bias_correction2 = 1.0 - beta2 ** state["step"] 124 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 125 | 126 | p.data.addcdiv_(exp_avg, denom, value=-step_size) 127 | 128 | # Just adding the square of the weights to the loss function is *not* 129 | # the correct way of using L2 regularization/weight decay with Adam, 130 | # since that will interact with the m and v parameters in strange ways. 131 | # 132 | # Instead we want to decay the weights in a manner that doesn't interact 133 | # with the m/v parameters. This is equivalent to adding the square 134 | # of the weights to the loss with plain (non-momentum) SGD. 
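                # As a reading aid (a sketch in update-rule form, using the variable names above):
                #     p <- p - step_size * exp_avg / (sqrt(exp_avg_sq) + eps)    # Adam step, applied above
                #     p <- p - lr * weight_decay * p                             # decoupled weight decay, applied below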
135 | # Add weight decay at the end (fixed version) 136 | if group["weight_decay"] > 0.0: 137 | p.data.add_(p.data, alpha=-group["lr"] * group["weight_decay"]) 138 | 139 | return loss 140 | -------------------------------------------------------------------------------- /time2graph/core/shapelet_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .shapelet_utils import * 3 | embed_number = 5 4 | 5 | 6 | def time_series_embeds_factory__(embed_size, embeddings, threshold, 7 | multi_graph, debug, mode): 8 | def __concate__(pid, args, queue): 9 | ret = [] 10 | for sdist in args: 11 | tmp = np.zeros(len(sdist) * embed_size * embed_number, dtype=np.float32).reshape(-1) 12 | for sidx in range(len(sdist)): 13 | dist = sdist[sidx, :] 14 | target = np.argsort(np.argwhere(dist <= threshold).reshape(-1))[:embed_number] 15 | if len(target) == 0: 16 | continue 17 | weight = 1.0 - minmax_scale(dist[target]) 18 | if np.sum(weight) == 0: 19 | Debugger.warn_print(msg='dist {}, weight {}'.format(dist, weight), debug=debug) 20 | else: 21 | weight /= np.sum(weight) 22 | target_number = len(weight) 23 | for k in range(target_number): 24 | src, dst = (sidx * embed_number + k) * embed_size, (sidx * embed_number + k + 1) * embed_size 25 | if multi_graph: 26 | if sidx == 0: 27 | tmp[src: dst] = weight[k] * embeddings[sidx, target[k]].reshape(-1) 28 | elif sidx == len(sdist) - 1: 29 | tmp[src: dst] = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 30 | else: 31 | former = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 32 | latter = weight[k] * embeddings[sidx, target[k]].reshape(-1) 33 | tmp[src: dst] = (former + latter) 34 | else: 35 | tmp[src: dst] = weight[k] * embeddings[0, target[k]].reshape(-1) 36 | ret.append(tmp) 37 | queue.put(0) 38 | return ret 39 | 40 | def __aggregate__(pid, args, queue): 41 | ret = [] 42 | for sdist in args: 43 | tmp = np.zeros(len(sdist) * embed_size, dtype=np.float32).reshape(-1) 44 | for sidx in range(len(sdist)): 45 | dist = sdist[sidx, :] 46 | target = np.argsort(np.argwhere(dist <= threshold).reshape(-1))[:embed_number] 47 | if len(target) == 0: 48 | continue 49 | weight = 1.0 - minmax_scale(dist[target]) 50 | if np.sum(weight) == 0: 51 | Debugger.warn_print(msg='dist {}, weight {}'.format(dist, weight), debug=debug) 52 | else: 53 | weight /= np.sum(weight) 54 | src, dst = sidx * embed_size, (sidx + 1) * embed_size 55 | for k in range(len(weight)): 56 | if multi_graph: 57 | if sidx == 0: 58 | tmp[src: dst] += weight[k] * embeddings[sidx, target[k]].reshape(-1) 59 | elif sidx == len(sdist) - 1: 60 | tmp[src: dst] += weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 61 | else: 62 | former = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 63 | latter = weight[k] * embeddings[sidx, target[k]].reshape(-1) 64 | tmp[src: dst] += (former + latter) 65 | else: 66 | tmp[src: dst] += weight[k] * embeddings[0, target[k]].reshape(-1) 67 | ret.append(tmp) 68 | queue.put(0) 69 | return ret 70 | 71 | if mode == 'concate': 72 | return __concate__ 73 | elif mode == 'aggregate': 74 | return __aggregate__ 75 | else: 76 | raise NotImplementedError('unsupported mode {}'.format(mode)) 77 | 78 | 79 | class ShapeletEmbedding(object): 80 | def __init__(self, seg_length, tflag, multi_graph, cache_dir, 81 | percentile, tanh, debug, measurement, mode, 82 | **deepwalk_args): 83 | self.seg_length = seg_length 84 | self.tflag = tflag 85 | self.multi_graph = multi_graph 86 | self.cache_dir = cache_dir 87 
| self.tanh = tanh 88 | self.debug = debug 89 | self.percentile = percentile 90 | self.dist_threshold = -1 91 | self.measurement = measurement 92 | self.mode = mode 93 | self.deepwalk_args = deepwalk_args 94 | self.embed_size = self.deepwalk_args.get('representation_size', 256) 95 | self.embeddings = None 96 | 97 | def fit(self, time_series_set, shapelets, warp, init=0): 98 | Debugger.info_print('fit shape: {}'.format(time_series_set.shape)) 99 | tmat, sdist, dist_threshold = transition_matrix( 100 | time_series_set=time_series_set, shapelets=shapelets, seg_length=self.seg_length, 101 | tflag=self.tflag, multi_graph=self.multi_graph, tanh=self.tanh, debug=self.debug, 102 | init=init, warp=warp, percentile=self.percentile, threshold=self.dist_threshold, 103 | measurement=self.measurement) 104 | self.dist_threshold = dist_threshold 105 | self.embeddings = graph_embedding( 106 | tmat=tmat, num_shapelet=len(shapelets), embed_size=self.embed_size, 107 | cache_dir=self.cache_dir, **self.deepwalk_args) 108 | 109 | def time_series_embedding(self, time_series_set, shapelets, warp, init=0): 110 | if self.embeddings is None: 111 | self.fit(time_series_set=time_series_set, shapelets=shapelets, warp=warp) 112 | sdist = shapelet_distance(time_series_set=time_series_set, shapelets=shapelets, 113 | seg_length=self.seg_length, tflag=self.tflag, tanh=self.tanh, 114 | debug=self.debug, init=init, warp=warp, 115 | measurement=self.measurement) 116 | Debugger.info_print('embedding threshold {}'.format(self.dist_threshold)) 117 | Debugger.info_print('sdist size {}'.format(sdist.shape)) 118 | parmap = ParMap( 119 | work=time_series_embeds_factory__( 120 | embed_size=self.embed_size, embeddings=self.embeddings, threshold=self.dist_threshold, 121 | multi_graph=self.multi_graph, debug=self.debug, mode=self.mode), 122 | monitor=parallel_monitor(msg='time series embedding', size=sdist.shape[0], debug=self.debug), 123 | njobs=NJOBS 124 | ) 125 | if self.mode == 'concate': 126 | size = sdist.shape[1] * self.embed_size * embed_number 127 | elif self.mode == 'aggregate': 128 | size = sdist.shape[1] * self.embed_size 129 | else: 130 | raise NotImplementedError('unsupported mode {}'.format(self.mode)) 131 | return np.array(parmap.run(data=list(sdist)), dtype=np.float32).reshape(sdist.shape[0], size) 132 | -------------------------------------------------------------------------------- /my_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Oct 24 18:12:54 2021 4 | 5 | @author: Administrator 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Mon Oct 11 09:25:35 2021 11 | 用存好的训练数据测试数据测试模型 12 | @author: Administrator 13 | """ 14 | 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Variable 19 | import torch.nn as nn 20 | import pandas as pd 21 | import numpy as np 22 | import os 23 | from os import path, cpu_count 24 | import math 25 | from tqdm import tqdm 26 | import random 27 | from time2graph.core.model_gin import Flow2Graph 28 | from time2graph.utils.gat import GAT, accuracy_torch 29 | from pathos.helpers import mp 30 | import logging 31 | 32 | from pathos.helpers import mp 33 | import pickle 34 | import warnings 35 | warnings.filterwarnings("ignore") 36 | from torch.nn import MSELoss 37 | logger = logging.getLogger(__name__) 38 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 39 | datefmt='%m/%d/%Y %H:%M:%S', 40 | level=logging.INFO) 41 | 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42 | #############model########################
43 |
44 | def time2graphnet(K=30):
45 |     '''
46 |     K: number of shapelets to learn
47 |     seg_length: length of each sub-segment
48 |     num_segment: number of segments a series is split into
49 |     gpu_enable: whether to use the GPU
50 |     optimizer: optimization method used while learning shapelets
51 |     device: which device (GPU card) to use
52 |     dropout
53 |     lk_relu
54 |     data_size: feature dimension of the time series
55 |     softmax: whether to apply softmax when computing distances
56 |     percentile: distance threshold (a percentile) used in graph construction; the p% of edges with the smallest weights are dropped
57 |     dataset: name of the dataset
58 |     append: whether to append the raw segment itself when computing features
59 |     diff: whether to take first-order differences
60 |
61 |     '''
62 |     general_options = {
63 |         'init': 0,
64 |         'warp': 2,
65 |         'tflag': True,
66 |         'mode': 'embedding',
67 |         'candidate_method': 'greedy'
68 |
69 |     }
70 |     model = Flow2Graph(
71 |         K, seg_length=10, num_segment=20, gpu_enable=False, optimizer='Adam', device=device, dropout=0.2, lk_relu=0.2, data_size=7,
72 |         softmax=False, percentile=10,dataset='Unspecified', append=False, sort=False, feat_flag=True,
73 |         feat_norm=True, aggregate=True, standard_scale=False, diff=False,reg=True,**general_options
74 |     )
75 |     return model
76 | #########ourmodel###############
77 |
78 |
79 |
80 | #########ourmodel ending###############
81 |
82 | def getsingleGroup(pro_data,group,src_len,tar_len,step):
83 |     '''
84 |     pro_data: the full data frame
85 |     group: key of a single group
86 |     generate sequences for a single group
87 |
88 |     '''
89 |     curent_df=pro_data.loc[(pro_data['hostname']==group[0]) & (pro_data['series']==group[1])]
90 |     tw=src_len+tar_len#total sampling window size: the front part is X, the Mean of the remaining part is Y
91 |     step=step
92 |     X=[]
93 |     Y=[]
94 |
95 |     L=len(curent_df)
96 |     #sort by time
97 |     curent_df['time'] = pd.to_datetime(curent_df['time_window'])
98 |     curent_df.sort_values('time', inplace=True)
99 |     useful_column=[ 'Mean', 'SD', 'Open', 'High','Low', 'Close', 'Volume']#select feature columns
100 |
101 |     for i in range(0,L-tw,step):
102 |         # train_seq = df_tmp[features].values[i:i+tw]
103 |         if i>L-tw:#handle the tail of the series
104 |             train_seq =curent_df[-tw:][useful_column]
105 |             X.append(train_seq.values[:-src_len])
106 |             Y.append(train_seq[-src_len:]['Mean'].values)
107 |             break
108 |         train_seq =curent_df[i:i+tw][useful_column]#
109 |         X.append(train_seq.values[:src_len])
110 |         Y.append(train_seq[src_len:]['Mean'].values)
111 |
112 |
113 |         if len(X)>100:#limit memory usage
114 |             X=X[-50:]
115 |             Y=Y[-50:]
116 |             break
117 |     return np.array(X),np.array(Y)
118 |
119 | def get_dataset(inputdir,src_len,tar_len,step=5,train_probility=0.8,sample_pro=10000):
120 |
121 |     if os.path.exists("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len)):
122 |         train=pickle.load(open("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'rb'))#[:10000,:,:]#the generated sample set
123 |         valid=pickle.load(open("valid_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'rb'))#[:1000]
124 |         print("data:",type(train['X']),train['Y'].shape)
125 |
126 |         return train['X'][:sample_pro*3],train['Y'][:sample_pro*3],valid['X'][:int(sample_pro*0.1)],valid['Y'][:int(sample_pro*0.1)]
127 |     else:
128 |         pro_data=pd.read_csv(inputdir+'above1900_data.csv')
129 |         all_sample=[]
130 |         for k1,k2 in pro_data.groupby(by=['hostname','series']):
131 |             all_sample.append(k1)
132 |         all_sample=all_sample[:sample_pro]#use fewer groups for a quick trial
133 |         random.shuffle(all_sample)
134 |         print('total number of sampled groups:',len(all_sample))#19005
135 |         train_all_sample=all_sample[:int(len(all_sample)*train_probility)]
136 |         test_all_sample=list(filter(lambda x: x not in train_all_sample, all_sample))
137 |         print('training samples',len(train_all_sample),'test samples:',len(test_all_sample))
138 |         print('generating training samples...')
139 |         train_x,train_y=[],[]
140 |         for id_ in tqdm(train_all_sample):
141 |             x_i,y_i=getsingleGroup(pro_data,id_,src_len,tar_len,step)#one group of samples
142
train_x.extend(x_i) 143 | train_y.extend(y_i) 144 | 145 | with open("train_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'wb') as f: 146 | pickle.dump({'X':np.array(train_x),'Y':np.array(train_y)},f) 147 | 148 | print('生成测试样本...') 149 | valid_x,valid_y=[],[] 150 | for id_ in tqdm(test_all_sample): 151 | x_i,y_i=getsingleGroup(pro_data,id_,src_len,tar_len,step)#一组样本 152 | valid_x.extend(x_i) 153 | valid_y.extend(y_i) 154 | 155 | with open("valid_numpy_samplePro%d_%d_%d.pkl"%(sample_pro,src_len,tar_len), 'wb') as f: 156 | pickle.dump({'X':np.array(valid_x),'Y':np.array(valid_y)},f) 157 | return np.array(train_x),np.array(train_y),np.array(valid_x),np.array(valid_y) 158 | 159 | def get_label_forshapelets(y): 160 | ''' 161 | if y_n+1-y_n >0 : 162 | label=1 163 | else: 164 | label=0 165 | y 10000,24 166 | ''' 167 | #简单的使用差分,后续上升趋势为1,下降为0 168 | y2=((y[:,1]-y[:,0])>0).astype(int) 169 | return y2 170 | 171 | 172 | def main(Istest): 173 | hidden_size = 512 174 | embed_size = 7#输入X的 175 | de_size=24*1#输出的序列长度也就是要预测未来多少个小时的 176 | en_size=200 #输入的序列长度,采样窗口的长度(采好样的那个就是200) 177 | epoch=10#训练轮数 178 | train_batch_size=512#训练集一批的大小 179 | K=70 180 | model = time2graphnet(K) 181 | inputdir='../' 182 | train_x,train_y,valid_x,valid_y_1=get_dataset(inputdir,en_size,de_size)#拿到所有的序列 183 | # print(np.isnan(train_x).all()) 184 | # print(np.isnan(train_y).all()) 185 | # print(np.isnan(valid_x).all()) 186 | # print(np.isnan(valid_y).all()) 187 | print(train_x.shape) 188 | #数据归一化 189 | train_x=(train_x-train_x.mean())/train_x.std() 190 | train_y=(train_y-train_y.mean())/train_y.std() 191 | # #进行数据归一化处理 192 | valid_x=(valid_x-valid_x.mean())/valid_x.std() 193 | valid_y=(valid_y_1-valid_y_1.mean())/valid_y_1.std() 194 | all_X=np.vstack((train_x,valid_x)) 195 | for_rescale=(valid_x.mean(),valid_x.std(),valid_y_1) 196 | print("“*****",for_rescale[0],for_rescale[1]) 197 | model.data_size = embed_size 198 | shapelets_path = './cache/shapelets_%d_%d.cache'%(K,de_size/24) 199 | if path.isfile(shapelets_path): 200 | model.load_shapelets(shapelets_path) 201 | print('shapelets加载完成') 202 | else: 203 | print('开始提取shapelets ...') 204 | model.learn_shapelets(all_X, get_label_forshapelets(np.vstack((train_y,valid_y))), 20, 7) 205 | model.save_shapelets(shapelets_path) 206 | print('shapelets已保存') 207 | s=open('Flim_flow2graph_traing_log_shapelets_num%d_%d_%d.txt'% (K,en_size,de_size),'w') 208 | model.fit(for_rescale,train_x, train_y,valid_x,valid_y,get_label_forshapelets,epoch=epoch,train_batch_size=train_batch_size,de_size=de_size,logprintfile=s) 209 | s.close() 210 | 211 | logger.info("训练结束 "+"*"*20) 212 | 213 | if __name__ == "__main__": 214 | main(False) 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /time2graph/core/model_embeds.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pickle 4 | import torch 5 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.preprocessing import normalize 8 | from .time_aware_shapelets import learn_time_aware_shapelets 9 | from .static_shapelets import learn_static_shapelets 10 | from .shapelet_embedding import ShapeletEmbedding 11 | from ..utils.base_utils import ModelUtils, Debugger 12 | 13 | 14 | class Time2GraphEmbed(ModelUtils): 15 | """ 16 | Time2Graph model 17 | Hyper-parameters: 18 | K: number of learned 
shapelets 19 | C: number of candidates 20 | A: number of shapelets assigned to each segment 21 | tflag: timing flag 22 | opt_metric: optimal metric using in outside-classifier 23 | """ 24 | def __init__(self, kernel, K=100, C=1000, seg_length=30, warp=2, tflag=True, 25 | gpu_enable=True, percentile=15, opt_metric='f1', mode='aggregate', 26 | batch_size=100, **kwargs): 27 | super(Time2GraphEmbed, self).__init__(kernel=kernel, **kwargs) 28 | self.K = K 29 | self.C = C 30 | self.seg_length = seg_length 31 | self.warp = warp 32 | self.tflag = tflag 33 | self.opt_metric = opt_metric 34 | self.mode = mode 35 | self.batch_size = batch_size 36 | self.gpu_enable = gpu_enable 37 | self.percentile = percentile 38 | self.shapelets = None 39 | self.sembeds = None 40 | self.clf = None 41 | self.lr = kwargs.pop('lr', 1e-2) 42 | self.p = kwargs.pop('p', 2) 43 | self.alpha = kwargs.pop('alpha', 10.0) 44 | self.beta = kwargs.pop('beta', 5.0) 45 | self.multi_graph = kwargs.pop('multi_graph', False) 46 | self.debug = kwargs.pop('debug', True) 47 | self.measurement = kwargs.pop('measurement', 'gdtw') 48 | self.kwargs = kwargs 49 | Debugger.info_print('initialize t2g model with {}'.format(self.__dict__)) 50 | 51 | def learn_shapelets(self, x, y, num_segment, data_size, num_batch): 52 | assert x.shape[1] == num_segment * self.seg_length 53 | if self.tflag: 54 | self.shapelets = learn_time_aware_shapelets( 55 | time_series_set=x, label=y, K=self.K, C=self.C, p=self.p, 56 | num_segment=num_segment, seg_length=self.seg_length, data_size=data_size, 57 | lr=self.lr, alpha=self.alpha, beta=self.beta, num_batch=num_batch, 58 | measurement=self.measurement, gpu_enable=self.gpu_enable, **self.kwargs) 59 | else: 60 | self.shapelets = learn_static_shapelets(time_series_set=x, label=y, K=self.K, C=self.C, 61 | warp=self.warp, num_segment=num_segment, seg_length=self.seg_length, measurement=self.measurement, **self.kwargs) 62 | 63 | def fit_embedding_model(self, x, y, cache_dir, init=0): 64 | assert self.shapelets is not None, 'shapelets has not been learnt yet' 65 | self.sembeds = ShapeletEmbedding( 66 | seg_length=self.seg_length, tflag=self.tflag, multi_graph=self.multi_graph, 67 | cache_dir=cache_dir, tanh=self.kwargs.get('tanh', False), debug=self.debug, 68 | percentile=self.percentile, measurement=self.measurement, mode=self.mode, 69 | **self.kwargs) 70 | self.sembeds.fit(time_series_set=x[np.argwhere(y == 0).reshape(-1), :, :], 71 | shapelets=self.shapelets, warp=self.warp, init=init) 72 | 73 | def embed(self, x, init=0): 74 | assert self.sembeds is not None, 'shapelet-embedding model has not been learnt yet' 75 | return self.sembeds.time_series_embedding( 76 | time_series_set=x, shapelets=self.shapelets, warp=self.warp, init=init) 77 | 78 | def set_deepwalk_args(self, **dw_args): 79 | for key, val in dw_args.items(): 80 | self.kwargs[key] = val 81 | 82 | def fit(self, x, y, n_splits=5, init=0, reset=True, balanced=True, norm=False, cache_dir='.'): 83 | num_segment = int(x.shape[1] / self.seg_length) 84 | data_size = x.shape[-1] 85 | if reset or self.shapelets is None: 86 | self.learn_shapelets( 87 | x=x, y=y, num_segment=num_segment, data_size=data_size, num_batch=x.shape[0] // self.batch_size) 88 | if reset or self.sembeds is None: 89 | Debugger.info_print('fit embedding model...') 90 | self.fit_embedding_model(x=x, y=y, cache_dir=cache_dir, init=init) 91 | max_clf_args, max_metric, clf = None, -1, self.clf__() 92 | embeds = self.sembeds.time_series_embedding( 93 | time_series_set=x, shapelets=self.shapelets, 94 | 
warp=self.warp, init=init) 95 | if norm: 96 | embeds = normalize(embeds, axis=0) 97 | Debugger.info_print('{} paras to be tuned'.format(self.para_len(balanced=balanced))) 98 | arguments = self.clf_paras(balanced=balanced) 99 | arg_size, cnt = self.para_len(balanced=balanced), 0.0 100 | metric_method = self.return_metric_method(opt_metric=self.opt_metric) 101 | Debugger.info_print('running parameter tuning for fit...') 102 | max_accu, max_prec, max_recall, max_f1 = -1, -1, -1, -1 103 | __max_clf_model = './.clf_model.pickle' 104 | for args in arguments: 105 | clf.set_params(**args) 106 | Debugger.debug_print(msg='{:.2f}% inner args tuned; args: {}'.format(cnt * 100.0 / arg_size, args), 107 | debug=self.debug) 108 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True) 109 | tmp, accu, prec, recall, f1 = 0, 0, 0, 0, 0 110 | for train_idx, test_idx in skf.split(embeds, y): 111 | clf.fit(embeds[train_idx], y[train_idx]) 112 | y_pred = clf.predict(embeds[test_idx]) 113 | tmp += metric_method(y_true=y[test_idx], y_pred=y_pred) 114 | accu += accuracy_score(y_true=y[test_idx], y_pred=y_pred) 115 | prec += precision_score(y_true=y[test_idx], y_pred=y_pred) 116 | recall += recall_score(y_true=y[test_idx], y_pred=y_pred) 117 | f1 += f1_score(y_true=y[test_idx], y_pred=y_pred) 118 | tmp /= n_splits 119 | accu /= n_splits 120 | prec /= n_splits 121 | recall /= n_splits 122 | f1 /= n_splits 123 | if max_metric < tmp: 124 | max_metric = tmp 125 | max_clf_args = args 126 | max_accu, max_prec, max_recall, max_f1 = accu, prec, recall, f1 127 | pickle.dump(clf, open(__max_clf_model, 'wb')) 128 | cnt += 1.0 129 | Debugger.info_print('args {} for clf {}-{}, performance: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format( 130 | max_clf_args, self.kernel, self.opt_metric, max_accu, max_prec, max_recall, max_f1)) 131 | self.clf = { 132 | 'clf': pickle.load(open(__max_clf_model, 'rb')), 133 | 'clf-args': max_clf_args, 134 | } 135 | 136 | def predict(self, x, norm=False): 137 | assert self.shapelets is not None, 'shapelets has not been learnt yet...' 138 | assert self.clf is not 'classifier has not been learnt yet...' 
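        # Note: the assert above compares `self.clf` with a string literal via `is not`,
        # so it never fails in practice; the intent is presumably
        # `assert self.clf is not None, 'classifier has not been learnt yet...'`.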
139 | if norm: 140 | embeds = normalize(self.embed(x=x), axis=0) 141 | else: 142 | embeds = self.embed(x=x) 143 | return self.clf['clf'].predict(embeds) 144 | 145 | def save_model(self, fpath): 146 | ret = {} 147 | for key, val in self.__dict__.items(): 148 | if key != 'xgb': 149 | ret[key] = val 150 | self.clf['clf'].save_model('{}.xgboost'.format(fpath)) 151 | torch.save(ret, fpath) 152 | 153 | def load_model(self, fpath, map_location='cuda:0'): 154 | # @TODO: specify map_location 155 | cache = torch.load(fpath, map_location=map_location) 156 | for key, val in cache.items(): 157 | self.__dict__[key] = val 158 | self.clf['clf'].load_model('{}.xgboost'.format(fpath)) 159 | 160 | def save_shapelets(self, fpath): 161 | torch.save(self.shapelets, fpath) 162 | 163 | def load_shapelets(self, fpath, map_location='cuda:0'): 164 | self.shapelets = torch.load(fpath, map_location=map_location) 165 | -------------------------------------------------------------------------------- /time2graph/core/shapelet_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.cluster import KMeans 3 | from sklearn.preprocessing import minmax_scale 4 | from .distance_utils import * 5 | from ..utils.base_utils import Debugger, syscmd 6 | from ..utils.mp_utils import ParMap, parallel_monitor, NJOBS 7 | __tmat_threshold = 1e-2 8 | 9 | 10 | def softmax_np(x): 11 | e_x = np.exp(x - np.max(x)) 12 | return e_x / e_x.sum(axis=0) 13 | 14 | 15 | def __candidate_cluster_factory(n_clusters, seg_length): 16 | def __main__(pid, args, queue): 17 | ret = [] 18 | for time_series_segments in args: 19 | kmeans = KMeans(n_clusters=n_clusters).fit(time_series_segments) 20 | ret.append(kmeans.cluster_centers_.reshape(n_clusters, seg_length, -1)) 21 | queue.put(0) 22 | return ret 23 | return __main__ 24 | 25 | 26 | def __candidate_greedy_factory(n_candiates, seg_length): 27 | def __main__(pid, args, queue): 28 | ret = [] 29 | for time_series_segments in args: 30 | size = time_series_segments.shape[0] 31 | center_segment = np.mean(time_series_segments, axis=0) 32 | cand_dist = np.linalg.norm( 33 | time_series_segments.reshape(size, -1) - center_segment.reshape(1, -1), axis=1) 34 | tmp = [] 35 | for cnt in range(n_candiates): 36 | idx = np.argmax(cand_dist) 37 | cand_dist[idx] = -1 38 | update_idx = cand_dist >= 0 39 | dims = np.sum(update_idx) 40 | cand_dist[update_idx] += np.linalg.norm( 41 | time_series_segments[update_idx].reshape(dims, -1) - time_series_segments[idx].reshape(1, -1), 42 | axis=1 43 | ) 44 | tmp.append(time_series_segments[idx].reshape(seg_length, -1)) 45 | ret.append(tmp) 46 | queue.put(0) 47 | return ret 48 | return __main__ 49 | 50 | 51 | def generate_shapelet_candidate(time_series_set, num_segment, seg_length, candidate_size, **kwargs): 52 | __method, __debug = kwargs.get('candidate_method', 'greedy'), kwargs.get('debug', True) 53 | njobs = kwargs.get('njobs', NJOBS) 54 | Debugger.debug_print('begin to generate shapelet candidates...', __debug) 55 | num_time_series = time_series_set.shape[0] 56 | time_series_set = time_series_set.reshape(num_time_series, num_segment, seg_length, -1) 57 | assert candidate_size >= num_segment, 'candidate-size {} should be larger ' \ 58 | 'than n_segments {}'.format(candidate_size, num_segment) 59 | args, n_clusters = [], candidate_size // num_segment 60 | for idx in range(num_segment): 61 | args.append(time_series_set[:, idx, :, :].reshape(num_time_series, -1)) 62 | if __method == 'cluster': 63 | work_func = 
__candidate_cluster_factory 64 | elif __method == 'greedy': 65 | work_func = __candidate_greedy_factory 66 | else: 67 | raise NotImplementedError('unsupported candidate generating method {}'.format(__method)) 68 | parmap = ParMap( 69 | work=work_func(n_clusters, seg_length), 70 | monitor=parallel_monitor(msg='generate candidate by {}'.format(__method), 71 | size=num_segment, debug=__debug), 72 | njobs=njobs 73 | ) 74 | ret = np.concatenate(parmap.run(data=args), axis=0) 75 | Debugger.info_print('candidates with length {} sampling done...'.format(seg_length)) 76 | Debugger.info_print('totally {} candidates with shape {}'.format(len(ret), ret.shape)) 77 | return ret 78 | 79 | 80 | def __shapelet_distance_factory(shapelets, num_segment, seg_length, tflag, 81 | init, warp, dist, global_flag=False): 82 | def __main__(pid, args, queue): 83 | ret = [] 84 | for time_series in args: 85 | time_series = time_series.reshape(num_segment, seg_length, -1) 86 | tmp = np.zeros((num_segment, len(shapelets)), dtype=np.float32) 87 | if tflag and global_flag: 88 | for idx, (pattern, local_factor, global_factor, _) in enumerate(shapelets): 89 | for k in range(num_segment): 90 | tmp[k, idx] = dist(x=pattern, y=time_series[k], 91 | w=local_factor, warp=warp) * np.abs(global_factor[init + k]) 92 | elif tflag and not global_flag: 93 | for idx, (pattern, local_factor, global_factor, _) in enumerate(shapelets): 94 | for k in range(num_segment): 95 | tmp[k, idx] = dist(x=pattern, y=time_series[k], w=local_factor, warp=warp) 96 | else: 97 | for idx, (pattern, _) in enumerate(shapelets): 98 | for k in range(num_segment): 99 | tmp[k, idx] = dist(x=pattern, y=time_series[k], 100 | w=np.ones(pattern.shape[0]), warp=warp) 101 | ret.append(tmp) 102 | queue.put(0) 103 | return ret 104 | return __main__ 105 | 106 | 107 | def shapelet_distance(time_series_set, shapelets, seg_length, tflag, tanh, debug, init, warp, measurement): 108 | """ 109 | returns: 110 | np.array, N x m x K 111 | num_time_series, num_segment, num_shapelet 112 | """ 113 | num_time_series = time_series_set.shape[0] 114 | num_segment = int(time_series_set.shape[1] / seg_length) 115 | num_shapelet = len(shapelets) 116 | assert num_segment * seg_length == time_series_set.shape[1] 117 | if measurement == 'gw': 118 | dist = parameterized_gw_npy 119 | elif measurement == 'gdtw': 120 | dist = parameterized_gdtw_npy 121 | else: 122 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 123 | parmap = ParMap( 124 | work=__shapelet_distance_factory( 125 | shapelets=shapelets, num_segment=num_segment, seg_length=seg_length, 126 | tflag=tflag, init=init, warp=warp, dist=dist), 127 | monitor=parallel_monitor(msg='shapelet distance', size=num_time_series, debug=debug), 128 | njobs=NJOBS 129 | ) 130 | sdist = np.array(parmap.run(data=list(time_series_set)), dtype=np.float32).reshape( 131 | time_series_set.shape[0], num_segment, num_shapelet 132 | ) 133 | if tanh: 134 | sdist = np.tanh(sdist) 135 | # for tidx in range(num_time_series): 136 | # for sidx in range(num_segment): 137 | # min_val = np.min(sdist[tidx, sidx, :]) 138 | # max_val = np.max(sdist[tidx, sidx, :]) 139 | # assert max_val > min_val, '{}-{}: {}'.format(tidx, sidx, sdist[tidx, sidx, :]) 140 | # sdist[tidx, sidx, :] -= min_val 141 | # sdist[tidx, sidx, :] /= (max_val - min_val) 142 | # assert np.max(sdist) <= 1 and np.min(sdist) >= 0 143 | # import pickle 144 | # pickle.dump(sdist, open('./.sdist.pickle', 'wb')) 145 | # Debugger.info_print('dump sdist done for debug') 146 | return sdist 147 
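# Illustrative sketch (assumption: this helper is not part of the original pipeline).
# It shows how the (N, num_segment, num_shapelet) distance tensor returned above is
# thresholded into shapelet assignments, the same percentile rule that
# transition_matrix() and adjacent_matrix() below use to decide graph edges.
def _assignment_mask_sketch(time_series_set, shapelets, seg_length, percentile=10):
    sdist = shapelet_distance(time_series_set=time_series_set, shapelets=shapelets,
                              seg_length=seg_length, tflag=True, tanh=False, debug=False,
                              init=0, warp=2, measurement='gdtw')
    threshold = np.percentile(sdist, percentile)
    # True where a segment is considered close enough to a shapelet to form an edge.
    return sdist <= threshold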
| 148 | 149 | def transition_matrix(time_series_set, shapelets, seg_length, tflag, multi_graph, 150 | percentile, threshold, tanh, debug, init, warp, measurement): 151 | num_time_series = time_series_set.shape[0] 152 | num_segment = int(time_series_set.shape[1] / seg_length) 153 | num_shapelet = len(shapelets) 154 | if multi_graph: 155 | gcnt = num_segment - 1 156 | else: 157 | gcnt = 1 158 | tmat = np.zeros((gcnt, num_shapelet, num_shapelet), dtype=np.float32) 159 | sdist = shapelet_distance( 160 | time_series_set=time_series_set, shapelets=shapelets, seg_length=seg_length, 161 | tflag=tflag, tanh=tanh, debug=debug, init=init, warp=warp, measurement=measurement) 162 | if percentile is not None: 163 | dist_threshold = np.percentile(sdist, percentile) 164 | Debugger.debug_print('threshold({}) {}, mean {}'.format(percentile, dist_threshold, np.mean(sdist)), debug=debug) 165 | else: 166 | dist_threshold = threshold 167 | Debugger.debug_print('threshold {}, mean {}'.format(dist_threshold, np.mean(sdist)), debug=debug) 168 | 169 | n_edges = 0 170 | for tidx in range(num_time_series): 171 | for sidx in range(num_segment - 1): 172 | src_dist = sdist[tidx, sidx, :] 173 | dst_dist = sdist[tidx, sidx + 1, :] 174 | src_idx = np.argwhere(src_dist <= dist_threshold).reshape(-1) 175 | dst_idx = np.argwhere(dst_dist <= dist_threshold).reshape(-1) 176 | if len(src_idx) == 0 or len(dst_idx) == 0: 177 | continue 178 | n_edges += len(src_idx) * len(dst_idx) 179 | src_dist[src_idx] = 1.0 - minmax_scale(src_dist[src_idx]) 180 | dst_dist[dst_idx] = 1.0 - minmax_scale(dst_dist[dst_idx]) 181 | # assert len(src_idx) == num_shapelets 182 | for src in src_idx: 183 | if multi_graph: 184 | tmat[sidx, src, dst_idx] += (src_dist[src] * dst_dist[dst_idx]) 185 | else: 186 | tmat[0, src, dst_idx] += (src_dist[src] * dst_dist[dst_idx]) 187 | Debugger.debug_print( 188 | '{:.2f}% transition matrix computed...'.format(float(tidx + 1) * 100 / num_time_series), 189 | debug=debug 190 | ) 191 | Debugger.debug_print('{} edges involved in shapelets graph'.format(n_edges), debug=debug) 192 | tmat[tmat <= __tmat_threshold] = 0.0 193 | for k in range(gcnt): 194 | for i in range(num_shapelet): 195 | norms = np.sum(tmat[k, i, :]) 196 | if norms == 0: 197 | tmat[k, i, i] = 1.0 198 | else: 199 | tmat[k, i, :] /= np.sum(tmat[k, i, :]) 200 | return tmat, sdist, dist_threshold 201 | 202 | 203 | def adjacent_matrix(sdist, num_time_series, num_segment, num_shapelet, percentile, threshold, debug): 204 | tmat = np.zeros((num_time_series, num_shapelet, num_shapelet), dtype=np.float32) 205 | for tidx in range(num_time_series): 206 | for sidx in range(num_segment - 1): 207 | src_dist = sdist[tidx, sidx, :] 208 | dst_dist = sdist[tidx, sidx + 1, :] 209 | src_dist = 1.0 - minmax_scale(src_dist) 210 | dst_dist = 1.0 - minmax_scale(dst_dist) 211 | # assert len(src_idx) == num_shapelets 212 | for src in range(num_shapelet): 213 | tmat[tidx, src, :] += (src_dist[src] * dst_dist) 214 | Debugger.debug_print( 215 | '{:.2f}% adjacent matrix computed...'.format(float(tidx + 1) * 100 / num_time_series), 216 | debug=debug 217 | ) 218 | if threshold is None: 219 | threshold = np.percentile(tmat, percentile) 220 | Debugger.debug_print('threshold({}%): {:.6f}, mean-value: {:.6f}'.format(percentile, threshold, np.mean(tmat)), debug=debug) 221 | else: 222 | Debugger.debug_print('threshold: {:.6f}, mean-value {:.6f}'.format(threshold, np.mean(tmat)), debug=debug) 223 | edge_idx = tmat >= threshold 224 | tmat[edge_idx] = 1 225 | tmat[~edge_idx] = 0 226 | num_edges = 
np.sum(np.sum(tmat, axis=2), axis=1) 227 | Debugger.debug_print('{:.2f}(std: {:.2f}) edges involved in average for each shapelet graph'.format( 228 | np.mean(num_edges), np.std(num_edges)), debug=debug) 229 | return tmat, threshold 230 | 231 | 232 | def __mat2edgelist(tmat, fpath): 233 | mat_shape = tmat.shape 234 | with open(fpath, 'w') as f: 235 | for src in range(mat_shape[0]): 236 | flag = False 237 | for dst in range(mat_shape[1]): 238 | if tmat[src, dst] <= 1e-5: 239 | continue 240 | f.write('{} {} {:.5f}\n'.format(src, dst, tmat[src, dst])) 241 | flag = True 242 | if not flag: 243 | f.write('{} {} 1.0000\n'.format(src, src)) 244 | f.close() 245 | 246 | 247 | def __embedding2mat(fpath, num_vertices, embed_size): 248 | mat = np.zeros((num_vertices, embed_size), dtype=np.float32) 249 | with open(fpath, 'r') as f: 250 | cnt = -1 251 | for line in f: 252 | if cnt < 0: 253 | cnt += 1 254 | continue 255 | line = line.split(' ') 256 | idx = int(line[0]) 257 | for k in range(embed_size): 258 | mat[idx, k] = float(line[k + 1]) 259 | f.close() 260 | return mat 261 | 262 | 263 | def graph_embedding(tmat, num_shapelet, embed_size, cache_dir, **deepwalk_paras): 264 | __deepwalk_args__ = [] 265 | Debugger.info_print('embed_size: {}'.format(embed_size)) 266 | ret = [] 267 | Debugger.info_print('transition matrix size {}'.format(tmat.shape)) 268 | for idx in range(tmat.shape[0]): 269 | edgelist_path = '{}/edgelist/{}.edgelist'.format(cache_dir, idx) 270 | embedding_path = '{}/embeds/{}.embeddings'.format(cache_dir, idx) 271 | __mat2edgelist(tmat=tmat[idx, :, :], fpath=edgelist_path) 272 | deepwalk_cmd = [ 273 | 'deepwalk --input {} --format weighted_edgelist --output {} --representation-size {}'.format( 274 | edgelist_path, embedding_path, embed_size) 275 | ] 276 | for key, val in deepwalk_paras.items(): 277 | if key in __deepwalk_args__: 278 | deepwalk_cmd.append('--{} {}'.format(key, val)) 279 | deepwalk_cmd = ' '.join(deepwalk_cmd) 280 | Debugger.info_print('run deepwalk with: {}'.format(deepwalk_cmd)) 281 | _ = syscmd(deepwalk_cmd) 282 | ret.append(__embedding2mat(fpath=embedding_path, num_vertices=num_shapelet, 283 | embed_size=embed_size)) 284 | return np.array(ret, dtype=np.float32).reshape(tmat.shape[0], num_shapelet, embed_size) 285 | -------------------------------------------------------------------------------- /time2graph/core/time_aware_shapelets.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | from torch.autograd import * 4 | from torch import optim 5 | from torch.nn import functional as F 6 | from torch.distributions.normal import Normal 7 | from torch.utils.data import DataLoader 8 | from ..utils.base_utils import Queue 9 | from .model_utils import * 10 | from .shapelet_utils import * 11 | from .distance_utils import * 12 | 13 | 14 | def parameterized_gw_torch(x, y, w, torch_dtype, warp=2): 15 | """ 16 | gw distance in torch with timing factors. 
17 | :param x: 18 | :param y: 19 | :param w: 20 | :param torch_dtype: 21 | :param warp: 22 | :return: 23 | """ 24 | distance = np.sum((x.reshape(x.shape[0], -1, x.shape[1]) - expand_array(y=y, warp=warp)) ** 2, 25 | axis=1) 26 | assert not torch.any(torch.isnan(w)), 'local: {}'.format(w) 27 | softmin_distance = np.sum(softmax(-distance.astype(np.float64)).astype(np.float32) * distance, 28 | axis=1) 29 | return torch.sqrt(torch.sum(torch.from_numpy(softmin_distance).type(torch_dtype) * torch.abs(w))) 30 | 31 | 32 | def parameterized_gdtw_torch(x, y, w, torch_dtype, warp=2): 33 | """ 34 | greedy-dtw distance in torch with timing factors. 35 | :param x: 36 | :param y: 37 | :param w: 38 | :param torch_dtype: 39 | :param warp: 40 | :return: 41 | """ 42 | dpath = greedy_dtw_path(x=x, y=y, warp=warp) 43 | return torch.norm((torch.from_numpy(x).type(torch_dtype) * w.reshape(x.shape[0], -1))[dpath[0]] - 44 | torch.from_numpy(y[dpath[1]]).type(torch_dtype)) 45 | 46 | 47 | def pattern_distance_torch(pattern, time_series, num_segment, seg_length, 48 | local_factor, global_factor, torch_dtype, measurement): 49 | """ 50 | compute distances between a pattern and a given time series. 51 | :param pattern: 52 | :param time_series: 53 | :param num_segment: 54 | :param seg_length: 55 | :param local_factor: 56 | :param global_factor: 57 | :param torch_dtype: 58 | :param measurement: 59 | :return: 60 | """ 61 | if measurement == 'gw': 62 | dist_torch = parameterized_gw_torch 63 | elif measurement == 'gdtw': 64 | dist_torch = parameterized_gdtw_torch 65 | else: 66 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 67 | assert isinstance(time_series, np.ndarray) and isinstance(pattern, np.ndarray) 68 | time_series = time_series.reshape(num_segment, seg_length, -1) 69 | distance = Variable(torch.zeros(num_segment)).type(torch_dtype) 70 | for k in range(num_segment): 71 | distance[k] = dist_torch(x=pattern, y=time_series[k], w=local_factor, torch_dtype=torch_dtype) 72 | return torch.sum(F.softmax(-distance * torch.abs(global_factor), dim=0) 73 | * (distance * torch.abs(global_factor))) 74 | 75 | 76 | def __shapelet_candidate_loss(cand, time_series_set, label, num_segment, seg_length, 77 | data_size, p, lr, alpha, beta, num_batch, gpu_enable, 78 | measurement, **kwargs): 79 | """ 80 | loss for learning time-aware shapelets. 81 | :param cand: 82 | :param time_series_set: 83 | :param label: 84 | :param num_segment: 85 | :param seg_length: 86 | :param data_size:数据的维度 87 | :param p: 88 | normalizing parameter (0, 1, or 2). 89 | :param lr: 90 | learning rate. 91 | :param alpha: 92 | penalty weight for local timing factor. 93 | :param beta: 94 | penalty weight for global timing factor. 
95 | :param num_batch: 96 | :param gpu_enable: 97 | :param measurement: 98 | :param kwargs: 99 | :return: 100 | """ 101 | if gpu_enable: 102 | torch_dtype = torch.cuda.FloatTensor 103 | else: 104 | torch_dtype = torch.FloatTensor 105 | dataset_numpy = NumpyDataset(time_series_set, label) 106 | num_class = len(np.unique(label).reshape(-1)) 107 | batch_size = int(len(dataset_numpy) // num_batch) 108 | local_factor_variable = Variable(torch.ones(seg_length).type(torch_dtype) / seg_length, requires_grad=True) 109 | global_factor_variable = Variable(torch.ones(num_segment).type(torch_dtype) / num_segment, requires_grad=True) 110 | current_loss, loss_queue, cnt, nan_cnt = 0.0, Queue(max_size=int(num_batch * 0.2)), 0, 0 111 | current_main_loss, current_penalty_loss = 0.0, 0.0 112 | max_iters, optimizer = kwargs.get('max_iters', 1), kwargs.get('optimizer', 'Adam') 113 | if optimizer == 'Adam': 114 | optimizer = optim.Adam 115 | elif optimizer == 'Adadelta': 116 | optimizer = optim.Adadelta 117 | elif optimizer == 'Adamax': 118 | optimizer = optim.Adamax 119 | else: 120 | raise NotImplementedError('unsupported optimizer {} for time-aware shapelets learning'.format(optimizer)) 121 | optimizer = optimizer([local_factor_variable, global_factor_variable], lr=lr) 122 | 123 | while cnt < max_iters: 124 | sampler = StratifiedSampler(label=label, num_class=num_class) 125 | dataloader = DataLoader(dataset=dataset_numpy, batch_size=batch_size, sampler=sampler) 126 | batch_cnt = 0 127 | for x, y in dataloader: 128 | x = np.array(x, dtype=np.float32).reshape(len(x), -1, data_size) 129 | y = np.array(y, dtype=np.float32).reshape(-1) 130 | assert not np.any(np.isnan(x)), 'original time series data with nan' 131 | lb_idx, sample_flag = [], True 132 | for k in range(num_class): 133 | tmp_idx = np.argwhere(y == k).reshape(-1) 134 | if k >= 1 and len(tmp_idx) > 0: 135 | sample_flag = False 136 | lb_idx.append(tmp_idx) 137 | if len(lb_idx[0]) == 0 or sample_flag: 138 | Debugger.debug_print('weighted sampling exception, positive {:.2f}/{}'.format(np.sum(y)/len(y), len(y))) 139 | continue 140 | loss = torch.Tensor([0.0]).type(torch_dtype) 141 | main_loss = torch.Tensor([0.0]).type(torch_dtype) 142 | penalty_loss = torch.Tensor([0.0]).type(torch_dtype) 143 | dist_tensor = torch.zeros(x.shape[0]).type(torch_dtype) 144 | for k in range(x.shape[0]): 145 | dist_tensor[k] = pattern_distance_torch( 146 | pattern=cand, time_series=x[k, :, :], num_segment=num_segment, 147 | seg_length=seg_length, local_factor=local_factor_variable, 148 | global_factor=global_factor_variable, torch_dtype=torch_dtype, 149 | measurement=measurement 150 | # ignore the warning of reshape/view for local_factor_variable 151 | ) 152 | assert not torch.isnan(dist_tensor).any(), 'dist: {}\nlocal: {}\nglobal: {}'.format( 153 | dist_tensor, local_factor_variable, global_factor_variable) 154 | mean, std = torch.mean(dist_tensor), torch.std(dist_tensor) 155 | dist_tensor = (dist_tensor - mean) / std 156 | # Debugger.info_print('transform: {}, {}'.format(torch.max(dist_tensor), torch.min(dist_tensor))) 157 | # Debugger.time_print(msg='pattern distance', begin=begin, profiling=True) 158 | for k in range(1, len(lb_idx)): 159 | src = dist_tensor[lb_idx[0]] 160 | dst = dist_tensor[lb_idx[k]] 161 | # src =torch.where(torch.isnan(src), torch.full_like(src, 0),src) 162 | # dst =torch.where(torch.isnan(dst), torch.full_like(dst, 0),dst) 163 | if len(src)!=len(dst): 164 | break 165 | # print(src) 166 | # print(dst) 167 | #填补其中的空值 168 | loss -= 
torch.abs(torch.distributions.kl.kl_divergence( 169 | Normal(torch.mean(src), torch.std(src)), 170 | Normal(torch.mean(dst), torch.std(dst)))) 171 | main_loss -= torch.abs(torch.distributions.kl.kl_divergence( 172 | Normal(torch.mean(src), torch.std(src)), 173 | Normal(torch.mean(dst), torch.std(dst)))) 174 | # Debugger.info_print('KL-loss: {}'.format(loss)) 175 | loss += (alpha * torch.norm(local_factor_variable, p=p) / seg_length) 176 | loss += (beta * torch.norm(global_factor_variable, p=p) / num_segment) 177 | 178 | penalty_loss += (alpha * torch.norm(local_factor_variable, p=p) / seg_length) 179 | penalty_loss += (beta * torch.norm(global_factor_variable, p=p) / num_segment) 180 | 181 | optimizer.zero_grad() 182 | loss.backward() 183 | optimizer.step() 184 | if gpu_enable: 185 | current_loss = float(loss.cpu().data.numpy()) 186 | current_main_loss = float(main_loss.cpu().data) 187 | current_penalty_loss = float(penalty_loss.cpu().data) 188 | else: 189 | current_loss = float(loss.data.numpy()) 190 | current_main_loss = float(main_loss.data) 191 | current_penalty_loss = float(penalty_loss.data) 192 | loss_queue.enqueue(current_loss) 193 | if np.isnan(current_loss) or torch.any(torch.isnan(local_factor_variable))\ 194 | or torch.any(torch.isnan(global_factor_variable)): 195 | local_factor_variable = Variable(torch.ones(seg_length).type(torch_dtype) / seg_length, requires_grad=True) 196 | global_factor_variable = Variable(torch.ones(num_segment).type(torch_dtype) / num_segment, requires_grad=True) 197 | current_loss = 1e5 198 | nan_cnt += 1 199 | if nan_cnt >= max_iters: 200 | break 201 | else: 202 | Debugger.debug_print('{:.2f}% steps, loss {:.6f} with {:.6f} and penalty {:.6f}'.format( 203 | batch_cnt * 100 / num_batch, current_loss, current_main_loss, current_penalty_loss)) 204 | batch_cnt += 1 205 | cnt += 1 206 | if nan_cnt >= max_iters: 207 | break 208 | else: 209 | avg_loss = np.mean(loss_queue.queue[1:]) 210 | if abs(current_loss - avg_loss) < kwargs.get('epsilon', 1e-2): 211 | break 212 | local_factor_variable = torch.abs(local_factor_variable) 213 | global_factor_variable = torch.abs(global_factor_variable) 214 | if gpu_enable: 215 | local_factor = local_factor_variable.cpu().data.numpy() 216 | global_factor = global_factor_variable.cpu().data.numpy() 217 | else: 218 | local_factor = local_factor_variable.data.numpy() 219 | global_factor = global_factor_variable.data.numpy() 220 | return local_factor, global_factor, current_loss, current_main_loss, current_penalty_loss 221 | 222 | 223 | def __shapelet_candidate_loss_factory(time_series_set, label, num_segment, 224 | seg_length, data_size, p, lr, alpha, beta, num_batch, 225 | gpu_enable, measurement, **kwargs): 226 | """ 227 | paralleling compute shapelet losses. 
228 | :param time_series_set: 229 | :param label: 230 | :param num_segment: 231 | :param seg_length: 232 | :param data_size: 233 | :param p: 234 | :param lr: 235 | :param alpha: 236 | :param beta: 237 | :param num_batch: 238 | :param gpu_enable: 239 | :param measurement: 240 | :param kwargs: 241 | :return: 242 | """ 243 | def __main__(pid, args, queue): 244 | ret = [] 245 | for cand in args: 246 | local_factor, global_factor, loss, main_loss, penalty = __shapelet_candidate_loss( 247 | cand=cand, time_series_set=time_series_set, label=label, num_segment=num_segment, 248 | seg_length=seg_length, data_size=data_size, p=p, lr=lr, 249 | alpha=alpha, beta=beta, num_batch=num_batch, gpu_enable=gpu_enable, 250 | measurement=measurement, **kwargs 251 | ) 252 | ret.append((cand, local_factor, global_factor, loss, main_loss, penalty)) 253 | queue.put(0) 254 | return ret 255 | return __main__ 256 | 257 | 258 | def learn_time_aware_shapelets(time_series_set, label, K, C, num_segment, seg_length, data_size, 259 | p, lr, alpha, beta, num_batch, gpu_enable, measurement, **kwargs): 260 | """ 261 | learn time-aware shapelets. 262 | :param time_series_set: 263 | input time series data. 264 | :param label: 265 | input label. 266 | :param K: 267 | number of shapelets that finally learned. 268 | :param C: 269 | number of shapelet candidates in learning procedure. 270 | :param num_segment: 271 | :param seg_length: 272 | :param data_size: 273 | :param p: 274 | :param lr: 275 | :param alpha: 276 | :param beta: 277 | :param num_batch: 278 | :param gpu_enable: 279 | :param measurement: 280 | :param kwargs: 281 | :return: 282 | """ 283 | cands = generate_shapelet_candidate(time_series_set=time_series_set, num_segment=num_segment, 284 | seg_length=seg_length, candidate_size=C, **kwargs) 285 | parmap = ParMap( 286 | work=__shapelet_candidate_loss_factory( 287 | time_series_set=time_series_set, label=label, num_segment=num_segment, seg_length=seg_length, 288 | data_size=data_size, p=p, lr=lr, alpha=alpha, beta=beta, num_batch=num_batch, 289 | gpu_enable=gpu_enable, measurement=measurement, **kwargs 290 | ), 291 | monitor=parallel_monitor(msg='learning time-aware shapelets', size=len(cands), 292 | debug=kwargs.get('debug', True)), 293 | njobs=kwargs.get('njobs', NJOBS) 294 | ) 295 | result = sorted(parmap.run(data=cands), key=lambda x: x[3]) 296 | ret = [] 297 | for (cand, local_factor, global_factor, loss, main_loss, penalty) in result: 298 | ret.append((cand, local_factor, global_factor, loss)) 299 | return sorted(ret, key=lambda x: x[-1])[:K] 300 | -------------------------------------------------------------------------------- /time2graph/core/model_gin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.optim as optim 4 | import torch.nn.functional as F 5 | from config import * 6 | from torch.utils.data import Dataset,DataLoader,RandomSampler,SequentialSampler 7 | from scipy.special import softmax 8 | from .shapelet_utils import shapelet_distance, adjacent_matrix 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.preprocessing import MinMaxScaler 11 | from .time_aware_shapelets import learn_time_aware_shapelets 12 | from .static_shapelets import learn_static_shapelets 13 | from .Optimize import AdamW, get_linear_schedule_with_warmup 14 | from ..utils.base_utils import myNetwork 15 | from torch.utils.data import DataLoader, Dataset 16 | import torch.nn as nn 17 | import os 18 | import gc 19 | import 
logging 20 | logger = logging.getLogger(__name__) 21 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 22 | datefmt='%m/%d/%Y %H:%M:%S', 23 | level=logging.INFO) 24 | class myDataset(Dataset): 25 | def __init__(self, x,m, y=None,istrain=True): 26 | super(myDataset, self).__init__() 27 | self.x = torch.tensor(x,dtype=torch.float32) 28 | self.m = torch.tensor(m,dtype=torch.float32) 29 | if istrain: 30 | self.y =torch.tensor(y,dtype=torch.float32) 31 | self.istrain=istrain 32 | 33 | def __len__(self): 34 | return len(self.x) 35 | 36 | def __getitem__(self, idx): 37 | if self.istrain: 38 | return self.x[idx],self.m[idx], self.y[idx] 39 | else: 40 | return self.x[idx],self.m[idx] 41 | def torch_R2(pre,label): 42 | ''' 43 | pre 预测的流量序列 b,len 44 | label 实际的流量序列 b,len 45 | 46 | ''' 47 | r2=0 48 | mse=0 49 | for i in range(len(pre)): 50 | RSS= torch.sum((pre-label)** 2) 51 | TSS=torch.sum((label-torch.mean(label))** 2) 52 | r2+=1-RSS/TSS 53 | mse+=torch.mean((pre-label)** 2) 54 | 55 | return mse/len(pre),r2/len(pre) 56 | 57 | class Flow2Graph(nn.Module): 58 | """ 59 | Time2GraphGAT model 60 | Hyper-parameters: 61 | K: number of learned shapelets 62 | C: number of candidates 63 | A: number of shapelets assigned to each segment 64 | tflag: timing flag 65 | """ 66 | def __init__(self, K, seg_length, num_segment, warp=2, tflag=True, gpu_enable=False,device='cpu', optimizer='Adam', dropout=0.2, lk_relu=0.2, data_size=7, softmax=False, percentile=10, 67 | dataset='Unspecified', append=False, sort=False, 68 | feat_norm=True, aggregate=True, standard_scale=False, diff=False, **kwargs): 69 | super(Flow2Graph, self).__init__() 70 | self.K = K 71 | self.C = kwargs.pop('C', K * 10) 72 | self.seg_length = seg_length 73 | self.num_segment = num_segment 74 | self.data_size = data_size 75 | self.device=device 76 | self.warp = warp 77 | self.tflag = tflag 78 | self.gpu_enable = gpu_enable 79 | self.cuda = self.gpu_enable and torch.cuda.is_available() 80 | # Debugger.info_print('torch.cuda: {}, self.cuda: {}'.format(torch.cuda.is_available(), self.cuda)) 81 | self.shapelets = None 82 | self.append = append 83 | self.percentile = percentile 84 | self.threshold = None 85 | self.clf=None 86 | self.sort = sort 87 | self.aggregate = aggregate 88 | self.dropout = dropout 89 | self.lk_relu = lk_relu 90 | self.softmax = softmax 91 | self.dataset = dataset 92 | self.diff = diff 93 | self.standard_scale = standard_scale 94 | 95 | self.feat_norm = feat_norm 96 | self.pretrain = kwargs.pop('pretrain', None) 97 | 98 | self.lr = kwargs.pop('lr', 1e-3) 99 | self.p = kwargs.pop('p', 2) 100 | self.alpha = kwargs.pop('alpha', 0.1) 101 | self.beta = kwargs.pop('beta', 0.05) 102 | self.debug = kwargs.pop('debug', False) 103 | self.optimizer = optimizer 104 | self.measurement = kwargs.pop('measurement', 'gdtw') 105 | self.batch_size = kwargs.pop('batch_size', 200) 106 | self.init = kwargs.pop('init', 0) 107 | self.niter = kwargs.pop('niter', 1000) 108 | self.fastmode = kwargs.pop('fastmode', False) 109 | self.tol = kwargs.pop('tol', 1e-4) 110 | self.cuda = self.gpu_enable and torch.cuda.is_available() 111 | self.kwargs = kwargs 112 | Debugger.info_print('initialize our Flow2Graph with {}'.format(self.__dict__)) 113 | 114 | def learn_shapelets(self, x, y, num_segment, data_size): 115 | assert x.shape[1] == num_segment * self.seg_length 116 | Debugger.info_print('basic statistics before learn shapelets: max {:.4f}, min {:.4f}'.format(np.max(x), np.min(x))) 117 | if self.tflag: 118 | self.shapelets = 
learn_time_aware_shapelets( 119 | time_series_set=x, label=y, K=self.K, C=self.C, p=self.p, 120 | num_segment=num_segment, seg_length=self.seg_length, data_size=data_size, 121 | lr=self.lr, alpha=self.alpha, beta=self.beta, num_batch=int(x.shape[0] / self.batch_size), 122 | measurement=self.measurement, gpu_enable=self.gpu_enable, **self.kwargs) 123 | else: 124 | self.shapelets = learn_static_shapelets( 125 | time_series_set=x, label=y, K=self.K, C=self.C, warp=self.warp, 126 | num_segment=num_segment, seg_length=self.seg_length, measurement=self.measurement, **self.kwargs) 127 | 128 | def __gat_features__(self, X, train=False): 129 | __shapelet_distance = shapelet_distance( 130 | time_series_set=X, shapelets=self.shapelets, seg_length=self.seg_length, 131 | tflag=self.tflag, tanh=self.kwargs.get('tanh', False), debug=self.debug, 132 | init=self.init, warp=self.warp, measurement=self.measurement) 133 | threshold = None if train else self.threshold 134 | adj_matrix, self.threshold = adjacent_matrix( 135 | sdist=__shapelet_distance, num_time_series=X.shape[0], num_segment=int(X.shape[1] / self.seg_length), 136 | num_shapelet=self.K, percentile=self.percentile, threshold=threshold, debug=self.debug) 137 | __shapelet_distance = np.transpose(__shapelet_distance, axes=(0, 2, 1)) 138 | if self.sort: 139 | __shapelet_distance = softmax(-1 * np.sort(__shapelet_distance, axis=1), axis=1) 140 | if self.softmax and not self.sort: 141 | __shapelet_distance = softmax(__shapelet_distance, axis=1) 142 | if self.append: 143 | origin = np.array([v[0].reshape(-1) for v in self.shapelets], dtype=np.float).reshape(1, self.K, -1) 144 | return np.concatenate((__shapelet_distance, np.tile(origin, (__shapelet_distance.shape[0], 1, 1))), 145 | axis=2).astype(np.float), adj_matrix 146 | else: 147 | return __shapelet_distance.astype(np.float), adj_matrix 148 | 149 | 150 | def __preprocess_input_data(self, X): 151 | X_scale = X.copy() 152 | if self.diff: 153 | X_scale[:, : -1, :] = X[:, 1:, :] - X[:, :-1, :] 154 | X_scale[:, -1, :] = 0 155 | Debugger.debug_print('conduct time differing...') 156 | if self.standard_scale: 157 | for i in range(self.data_size): 158 | X_std = np.std(X_scale[:, :, i], axis=1).reshape(X.shape[0], -1) 159 | X_std[X_std == 0] = 1.0 160 | X_scale[:, :, i] = (X_scale[:, :, i] - np.mean(X_scale[:, :, i], axis=1).reshape(X.shape[0], -1)) / X_std 161 | Debugger.debug_print('conduct standard scaling on data-{}, with mean {:.2f} and var {:.2f}'.format(i, np.mean(X_scale[0, :, i]), np.std(X_scale[0, :, i]))) 162 | return X_scale 163 | def transfer(self,sps): 164 | ''' 165 | 把shapelets的列表转换成矩阵 166 | ''' 167 | ss=np.zeros((len(sps),self.seg_length)) 168 | for idx, (pattern, _, _, _) in enumerate(sps): 169 | ss[idx]=pattern[:,0]#取均值那一列 170 | return nn.Embedding.from_pretrained(torch.tensor(ss,dtype=torch.float32))# num_shapelets,seg_length 171 | def fit(self,for_rescale, X_scale, Y,valid_x_scale,valid_y,clf_func, reset=False,train_batch_size=256,de_size=24*1,epoch=100, 172 | display_steps=2,eval_steps=2,max_grad_norm=1.0,lr=0.3,l2_alpha=0.01,hidden_size=512,output_dir='model',logprintfile=None,): 173 | ''' 174 | X_scale, 175 | Y, 176 | valid_x_scale, 177 | valid_y 178 | clf_func :获取分类标签的函数 179 | ''' 180 | num_segment, data_size = int(X_scale.shape[1] / self.seg_length), X_scale.shape[-1] 181 | assert self.data_size == X_scale.shape[-1] 182 | X_scale = self.__preprocess_input_data(X_scale)#归一化 啥也没干其实 183 | valid_x_scale=self.__preprocess_input_data(valid_x_scale)#归一化 这里也是啥也没干其实 184 | 185 | if reset or 
self.shapelets is None: 186 | self.learn_shapelets(x=np.vstack((X_scale,valid_x_scale)), y=clf_func(np.vstack((Y,valid_y))), num_segment=num_segment, data_size=data_size) 187 | # self.__fit_gat(X=X_scale, Y=Y) 188 | print("获取数据的特征表示:") 189 | import pickle 190 | if os.path.exists('feauture_%d_%d.plk'%(self.K,de_size/24)): 191 | X_savel=pickle.load(open('feauture_%d_%d.plk'%(self.K,de_size/24), 'rb')) 192 | X_feat, X_adj =X_savel['X_feat'],X_savel['X_adj'] 193 | del X_savel 194 | gc.collect() 195 | else: 196 | X_feat, X_adj = self.__gat_features__(X_scale)#获得初始化的节点特征和邻接矩阵 197 | with open('feauture_%d_%d.plk'%(self.K,de_size/24), 'wb') as f: 198 | pickle.dump({'X_feat':np.array(X_feat),'X_adj':X_adj},f) 199 | 200 | print("开始训练!") 201 | #开始训练 202 | dataset = myDataset(X_feat, X_adj,Y) 203 | train_sampler = RandomSampler(dataset) 204 | train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=train_batch_size,num_workers=0) 205 | loss_fn = nn.MSELoss().to(self.device) 206 | en_size=X_scale.shape[1] 207 | nfeat=X_feat.shape[-1]#shapelets初始维度 208 | tar_len=Y.shape[-1]#预测目标的长度 209 | 210 | self.clf=myNetwork(en_size,data_size,nfeat,hidden_size,tar_len,self.seg_length,n_layers=2,dropout=0.1,modelname="Flim-GNN") 211 | ''' 212 | modelname:使用的模型名称 213 | nfeat:输入x特征矩阵维度 214 | nhid:中间层维度 215 | nclass:输出特征维度 216 | dropout:dropout的比例 217 | ''' 218 | optimizer = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9) 219 | # optimizer = torch.optim.Adam(self.parameters(), lr=lr) 220 | #设置优化器 221 | 222 | # optimizer = AdamW(self.parameters(), lr=lr, eps=1e-8,weight_decay=0.00001) 223 | # optimizer =torch.optim.SGD(self.model.parameters(), lr=args.lr, momentum=0.9) 224 | # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(len(train_dataloader)*epoch*0.2),num_training_steps=int(len(train_dataloader)*epoch)) 225 | 226 | self.clf.to(self.device) 227 | all_shapelets=self.transfer(self.shapelets)#拿到所有shapelet id到真实片段的映射 228 | self.clf.zero_grad() 229 | tr_loss,best_R2,avg_loss = 0.0, -10,0.0 230 | global_step=0 231 | for idx in range(epoch): 232 | tr_num=0 233 | train_loss=0 234 | self.clf.train() 235 | for step, batch in enumerate(train_dataloader): 236 | 237 | feat,adj,Y =(x.to(self.device) for x in batch) 238 | out =self.clf(feat,adj,all_shapelets.to(self.device))#state_orginal ,b,n,2 239 | # print(out.device) 240 | del batch,feat,adj 241 | gc.collect() 242 | loss =loss_fn(out,Y) 243 | optimizer.zero_grad() 244 | 245 | loss.backward()#先写到这里,后续再补充!! 
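                # The gradients produced by loss.backward() are clipped to `max_grad_norm`
                # just below, and the parameters are only updated at the optimizer.step()
                # call further down in this loop; the optimizer.zero_grad() above prevents
                # gradient accumulation across batches.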
246 | torch.nn.utils.clip_grad_norm_(self.parameters(),max_grad_norm) 247 | tr_loss += loss.item() 248 | tr_num+=1 249 | train_loss+=loss.item() 250 | #输出log 251 | if avg_loss==0: 252 | avg_loss=tr_loss 253 | avg_loss=round(train_loss/tr_num,8) 254 | 255 | if (step+1) % display_steps == 0: 256 | Debugger.info_print("At Training: epoch {} step {} loss {}".format(idx,step+1,avg_loss)) 257 | print("At Training: epoch {} step {} loss {}".format(idx,step+1,avg_loss),"\n",file=logprintfile) 258 | 259 | #update梯度 260 | optimizer.step() 261 | optimizer.zero_grad() 262 | # scheduler.step() 263 | global_step += 1 264 | 265 | #测试验证结果 266 | if (step+1) % eval_steps == 0: 267 | #输出验证集预测的结果 268 | out= self.infer(self.clf,valid_x_scale) 269 | #输出预测的f1和error distance 270 | results=self.eval(out,torch.tensor(valid_y,dtype=torch.float32).to(self.device)) 271 | with open('flow2graph_casestudy.pkl','wb')as fff: 272 | pickle.dump({'Y_label':for_rescale[2],'Y_pre':out.cpu().numpy()},fff) 273 | # pickle.dump({'Y_label':valid_y*for_rescale[1]+for_rescale[0],'Y_pre':out.cpu().numpy()*for_rescale[1]+for_rescale[0]},fff) 274 | 275 | #打印结果 276 | for key, value in results.items(): 277 | logger.info("测试结果 %s = %s", key, round(value,8)) 278 | #保存最好的年龄结果和模型 279 | if results['eval_R2']>best_R2: 280 | best_R2=results['eval_R2'] 281 | print(" "+"*"*20) 282 | print(" "+"*"*20,"\n",file=logprintfile) 283 | for key, value in results.items(): 284 | logger.info("测试结果 %s = %s", key, round(value,8)) 285 | print("测试结果 {} = {}".format(key, round(value,8)),"\n",file=logprintfile) 286 | logger.info(" Best R2:%s",round(best_R2,8)) 287 | logger.info(" Best mse:%s",round(results['eval_loss'],8)) 288 | print(" Best f1:",round(best_R2,8)," Best MSE:",round(results['eval_loss'],8),"\n",file=logprintfile) 289 | print(" "+"*"*20,"\n",file=logprintfile) 290 | logger.info(" "+"*"*20) 291 | 292 | model_to_save = self.clf.module if hasattr(self.clf, 'module') else self.clf # Only save the model it-self 293 | output_model_file = os.path.join(output_dir, "pytorch_time2graph_gcn_{}_{}.bin".format(en_size,tar_len)) 294 | torch.save(model_to_save.state_dict(), output_model_file) 295 | print(" Best R2:",round(best_R2,8)," Best MSE:",round(results['eval_loss'],8)) 296 | 297 | def infer(self,model,valid_x_scale,eval_batch_size=32): 298 | assert self.shapelets is not None, 'shapelets has not been learnt yet...' 
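        # Note: inference below runs under torch.no_grad(), but `model` is not switched
        # to eval() here, so dropout and BatchNorm layers inside the GIN still behave as
        # in training mode when infer() is called from fit().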
299 | X_feat, X_adj = self.__gat_features__(valid_x_scale)#获得初始化的节点特征和邻接矩阵 300 | eval_dataset=myDataset(X_feat, X_adj,istrain=False) 301 | eval_sampler = SequentialSampler(eval_dataset) 302 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size,num_workers=4) 303 | all_shapelets=self.transfer(self.shapelets)#拿到所有shapelet id到真实片段的映射 304 | predict=[] 305 | for step, batch in enumerate(eval_dataloader): 306 | feat,adj =(x.to(self.device) for x in batch) 307 | del batch 308 | with torch.no_grad(): 309 | pred =model(feat,adj,all_shapelets.to(self.device))#b,tar_len 310 | # predict.append(pred.cpu().numpy())# b,tar_len 311 | predict.append(pred) 312 | del feat,adj,pred 313 | gc.collect() 314 | # predict=np.concatenate(predict,0) 315 | predict=torch.cat(predict,dim=0)#torch.stack(predict) 316 | return predict 317 | def eval(self,predict,Groudth): 318 | ''' 319 | predict sample_len ,de_size,1 320 | Groudth sample_len ,de_size,1 321 | ''' 322 | results={} 323 | m,r2=torch_R2(predict,Groudth) 324 | results['eval_loss']=m.cpu().item() 325 | results['eval_R2']=r2.cpu().item() 326 | # from sklearn.metrics import r2_score,mean_squared_error 327 | # results={} 328 | # results['eval_R2']=r2_score(Groudth,predict) 329 | # 330 | # results['eval_loss']=mean_squared_error(Groudth,predict) 331 | return results 332 | 333 | def reload(self,model,output_dir,en_size,tar_len): 334 | #读取在验证集结果最好的模型 335 | load_model_path=os.path.join(output_dir, "pytorch_time2graph_gcn_{}_{}.bin".format(en_size,tar_len)) 336 | logger.info("Load model from %s",load_model_path) 337 | model_to_load = model.module if hasattr(model, 'module') else model # Only save the model it-self 338 | model_to_load.load_state_dict(torch.load(load_model_path)) 339 | return model 340 | 341 | def save_shapelets(self, fpath): 342 | torch.save(self.shapelets, fpath) 343 | 344 | def load_shapelets(self, fpath, map_location='cuda:0'): 345 | self.shapelets = torch.load(fpath, map_location=map_location) 346 | -------------------------------------------------------------------------------- /time2graph/utils/base_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import time 4 | import itertools 5 | import psutil 6 | import torch.nn.functional as F 7 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 8 | from subprocess import * 9 | import torch 10 | import torch.nn as nn 11 | from torch.utils.data import DataLoader, Dataset 12 | from sklearn.metrics import r2_score,mean_squared_error 13 | 14 | class myDataset(Dataset): 15 | def __init__(self, x, y): 16 | super(myDataset, self).__init__() 17 | self.x = x 18 | self.y = y 19 | 20 | def __len__(self): 21 | return len(self.x) 22 | 23 | def __getitem__(self, idx): 24 | return self.x[idx], self.y[idx] 25 | 26 | 27 | class myMlp(nn.Module): 28 | def __init__(self, in_len=1448, out_len=24,gpu_enable=False): 29 | super(myMlp, self).__init__() 30 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | self.net = nn.Sequential( 32 | nn.Linear(in_len, 2048), 33 | nn.BatchNorm1d(2048), 34 | nn.ReLU(), 35 | nn.Linear(2048,1024), 36 | nn.ReLU(), 37 | nn.Linear(1024,out_len) 38 | ) 39 | self.gpu_enable = gpu_enable 40 | 41 | def forward(self,x): 42 | if self.gpu_enable ==False: 43 | x = x.cpu() 44 | x = self.net(x) 45 | return x 46 | 47 | def fit(self, x, y): 48 | x = torch.tensor(x,dtype=torch.float32) 49 | y = 
torch.tensor(y,dtype=torch.float32) 50 | dataset = myDataset(x, y) 51 | data_len = len(dataset) 52 | batchsize = 128 53 | dataloader = DataLoader(dataset,batch_size=batchsize,shuffle=True,num_workers=4) 54 | epochs = 100 55 | loss_fn = nn.MSELoss().to(self.device) 56 | optimizer = torch.optim.SGD(self.parameters(), lr=10e-5, momentum=0.9) 57 | self.net.to(self.device) 58 | for epoch in range(epochs): 59 | i = 0 60 | for samples, labels in dataloader: 61 | samples = samples.to(self.device) 62 | labels = labels.to(self.device) 63 | out = self.forward(samples) 64 | loss = loss_fn(out, labels) 65 | r2 = r2_score(labels.cpu().detach().numpy(), out.cpu().detach().numpy()) 66 | optimizer.zero_grad() 67 | loss.backward() 68 | optimizer.step() 69 | i = i + len(samples) 70 | print( 71 | 'epoch:{}/{} iter:{}/{} loss:{} r2:{} '.format(epoch + 1, epochs, i, data_len, 72 | loss, r2)) 73 | from torch_geometric.nn import GINConv ,GCNConv,GATConv,GraphConv,FiLMConv 74 | class GIN(nn.Module): 75 | def __init__(self,modelname, nfeat, dropout,n_layer=5, JK="last", residual=False): 76 | ''' 77 | nfeat:输入x特征矩阵维度 78 | 79 | n_layer: GIN的层数 80 | dropout:dropout的比例 81 | ''' 82 | super(GIN, self).__init__() 83 | self.num_layers = n_layer 84 | self.JK = JK 85 | # add residual connection or not 86 | self.residual = residual 87 | self.dropout = dropout 88 | # List of GNNs 89 | 90 | self.convs = torch.nn.ModuleList() 91 | self.batch_norms = torch.nn.ModuleList() 92 | 93 | for layer in range(n_layer): 94 | if modelname=='GIN': 95 | self.convs.append(GINConv(self.MLP(nfeat,nfeat))) 96 | elif modelname=='GCN': 97 | self.convs.append(GCNConv(nfeat,nfeat)) 98 | elif modelname=='GAT': 99 | self.convs.append(GATConv(nfeat,nfeat)) 100 | elif modelname=='GNN': 101 | self.convs.append(GraphConv(nfeat,nfeat)) 102 | 103 | elif modelname=='Flim-GNN': 104 | self.convs.append(FiLMConv(nfeat,nfeat)) 105 | 106 | self.batch_norms.append(torch.nn.BatchNorm1d(nfeat)) 107 | @staticmethod 108 | def MLP(in_channels: int, out_channels: int) -> torch.nn.Module: 109 | return nn.Sequential( 110 | nn.Linear(in_channels, out_channels), 111 | nn.BatchNorm1d(out_channels), 112 | nn.ReLU(inplace=True), 113 | nn.Linear(out_channels, out_channels), 114 | ) 115 | def forward(self, x, adj): 116 | h_list=[x] 117 | 118 | for layer in range(self.num_layers): 119 | h = self.convs[layer](x, adj) 120 | h = self.batch_norms[layer](h) 121 | if layer == self.num_layers - 1: 122 | # remove relu for the last layer 123 | h = F.dropout(h, self.dropout, training=self.training) 124 | else: 125 | h = F.dropout(F.relu(h), self.dropout, training=self.training) 126 | 127 | if self.residual: 128 | h += h_list[layer] 129 | 130 | h_list.append(h) 131 | 132 | 133 | return torch.stack(h_list[-self.num_layers:]) 134 | 135 | 136 | 137 | 138 | 139 | 140 | class myNetwork(nn.Module): 141 | def __init__(self, en_size,embed_size,nfeat,hidden_size,tar_len,segment_len,n_layers=2,dropout=0.02,modelname="GIN"): 142 | ''' 143 | en_size 输入序列的长度 144 | embed_size 输入样本的特征维度 145 | nfeat 输入特征矩阵X的特征维度 146 | hidden_size 隐藏层的维度 147 | tar_len 输出序列的长度 148 | segment_len shapelet的长度 149 | n_layers GIN的层数 150 | ''' 151 | super(myNetwork, self).__init__() 152 | self.gin_layers=n_layers 153 | self.k=tar_len//segment_len 154 | self.gIn = GIN(modelname,hidden_size, dropout,n_layer=n_layers) 155 | 156 | 157 | self.net = nn.Sequential( 158 | nn.Linear(segment_len, 512), 159 | nn.ReLU(), 160 | nn.Flatten(), 161 | nn.Linear(512*self.k,512), 162 | nn.BatchNorm1d(512), 163 | nn.ReLU(), 164 | 
nn.Linear(512,tar_len) 165 | ) 166 | self.mlp1 =nn.Linear(nfeat,hidden_size) 167 | self.mlp2 =nn.Linear(self.gin_layers*hidden_size,hidden_size) 168 | def foroneGraph(self,x,adj): 169 | ''' 170 | 对于单个时间序列,单个图 171 | 172 | 返回其对应的关键shapelet index 173 | ''' 174 | a1=(adj >0).nonzero().t() 175 | 176 | x=F.relu(self.mlp1(x))#len,hidden_size 177 | h_=self.gIn(x,a1)#5,num_shapelets,hidden_size h_[-1]为所有shapelets的表示 178 | # print('隐藏层的维度:',h_.size())#torch.Size([2, 30, 256]) 179 | #node level Embedding 180 | node_E=x+h_[-1] 181 | 182 | 183 | #compute graph level EMbedding 184 | # Sum+CONCAT 185 | graph_Re=torch.sum(h_,dim=1).view(1,-1)#1, n_layer*hidden_size 186 | # print("%%%%%",graph_Re.size()) 187 | 188 | graph_Re=self.mlp2(graph_Re)#1, hidden_size 189 | graph_Re=F.relu(graph_Re)#1, hidden_size 190 | dis=-torch.mm(graph_Re,node_E.t())#1,num_shapelets 191 | #取top 192 | shapelet_index=torch.topk(dis,self.k,1)[1]#1,k 193 | return shapelet_index.squeeze(0) 194 | 195 | 196 | def forward(self,x,adj,embedding): 197 | ''' 198 | x:batch_size,num_shapelets, nfeat 199 | adj: batch_size,num_shapelets, num_shapelets 200 | embedding 前面获取到的shapelet字典 num_shapelets,segments_length 201 | ''' 202 | key_shapelets=[]#bacth_size, 203 | for o_x,o_adj in zip(x,adj): 204 | key_shapelets.append(self.foroneGraph(o_x,o_adj)) 205 | key_shapelets=torch.stack(key_shapelets) #bacth_size,k 206 | out=embedding(key_shapelets) #bacth_size,k,segments_length 207 | # print("out.size",out.size()) 208 | out=self.net(out)#bacth_size,tar_len 209 | return out 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | class ModelUtils(object): 220 | """ 221 | model utils for basic classifiers. 222 | kwargs list: 223 | lr paras 224 | penalty: list of str, candidate: l1, l2; 225 | c: list of float 226 | inter_scale: list of float 227 | rf and dts paras: 228 | criteria: list of str, candidate: gini, entropy 229 | max_features: list of str(including None), candidate: auto, log2 or None 230 | max_depth: list of int 231 | max_split: list of int 232 | min_leaf: list of int 233 | xgb paras: 234 | max_depth: list of int 235 | learning_rate: list of float 236 | n_jobs: int 237 | class_weight: list of int 238 | booster: list of str, candidate: gblinear, gbtree, dart 239 | svm paras: 240 | c: list of float 241 | svm_kernel: list of str, candidate: rbf, poly, sigmoid 242 | deepwalk paras: 243 | num_walks: list of int 244 | representation_size: list of int 245 | window_size: list of int 246 | workers: int 247 | undirected: bool 248 | """ 249 | def __init__(self, kernel, **kwargs): 250 | self.kernel = kernel 251 | self.kwargs = kwargs 252 | 253 | @property 254 | def clf__(self): 255 | if self.kernel == 'lr': 256 | from sklearn.linear_model import LogisticRegression 257 | return LogisticRegression 258 | elif self.kernel == 'svm': 259 | from sklearn.svm import SVC 260 | return SVC 261 | elif self.kernel == 'dts': 262 | from sklearn.tree import DecisionTreeClassifier 263 | return DecisionTreeClassifier 264 | elif self.kernel == 'rf': 265 | from sklearn.ensemble import RandomForestClassifier 266 | return RandomForestClassifier 267 | elif self.kernel == 'mlpReg': 268 | return myMlp 269 | 270 | # elif self.kernel == 'xgb': 271 | # from xgboost import XGBClassifier 272 | # return XGBClassifier 273 | else: 274 | raise NotImplementedError('unsupported kernel {}'.format(self.kernel)) 275 | 276 | def para_len(self, balanced): 277 | cnt = 0 278 | for args in self.clf_paras(balanced=balanced): 279 | cnt += 1 280 | return cnt 281 | 282 | def clf_paras(self, 
class ModelUtils(object):
    """
    Model utils for basic classifiers.
    kwargs list:
        lr paras:
            penalty: list of str, candidates: l1, l2
            c: list of float
            inter_scale: list of float
        rf and dts paras:
            criteria: list of str, candidates: gini, entropy
            max_feature: list of str (including None), candidates: auto, log2, None
            max_depth: list of int
            max_split: list of int
            min_leaf: list of int
        xgb paras:
            max_depth: list of int
            learning_rate: list of float
            n_jobs: int
            class_weight: list of int
            booster: list of str, candidates: gblinear, gbtree, dart
        svm paras:
            c: list of float
            svm_kernel: list of str, candidates: rbf, poly, sigmoid
        deepwalk paras:
            num_walks: list of int
            representation_size: list of int
            window_size: list of int
            workers: int
            undirected: bool
    """
    def __init__(self, kernel, **kwargs):
        self.kernel = kernel
        self.kwargs = kwargs

    @property
    def clf__(self):
        if self.kernel == 'lr':
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression
        elif self.kernel == 'svm':
            from sklearn.svm import SVC
            return SVC
        elif self.kernel == 'dts':
            from sklearn.tree import DecisionTreeClassifier
            return DecisionTreeClassifier
        elif self.kernel == 'rf':
            from sklearn.ensemble import RandomForestClassifier
            return RandomForestClassifier
        elif self.kernel == 'mlpReg':
            return myMlp
        # elif self.kernel == 'xgb':
        #     from xgboost import XGBClassifier
        #     return XGBClassifier
        else:
            raise NotImplementedError('unsupported kernel {}'.format(self.kernel))

    def para_len(self, balanced):
        cnt = 0
        for args in self.clf_paras(balanced=balanced):
            cnt += 1
        return cnt

    def clf_paras(self, balanced):
        class_weight = 'balanced' if balanced else None
        if self.kernel == 'lr':
            penalty = self.kwargs.get('penalty', ['l1', 'l2'])
            c = self.kwargs.get('c', [pow(5, i) for i in range(-3, 3)])
            intercept_scaling = self.kwargs.get('inter_scale', [pow(5, i) for i in range(-3, 3)])
            for (p1, p2, p3) in itertools.product(penalty, c, intercept_scaling):
                yield {
                    'penalty': p1,
                    'C': p2,
                    'intercept_scaling': p3,
                    'class_weight': class_weight
                }
        elif self.kernel == 'rf' or self.kernel == 'dts':
            criteria = self.kwargs.get('criteria', ['gini', 'entropy'])
            max_features = self.kwargs.get('max_feature', ['auto', 'log2', None])
            max_depth = self.kwargs.get('max_depth', [10, 25, 50])
            min_samples_split = self.kwargs.get('max_split', [2, 4, 8])
            min_samples_leaf = self.kwargs.get('min_leaf', [1, 3, 5])
            for (p1, p2, p3, p4, p5) in itertools.product(
                criteria, max_features, max_depth, min_samples_split, min_samples_leaf
            ):
                yield {
                    'criterion': p1,
                    'max_features': p2,
                    'max_depth': p3,
                    'min_samples_split': p4,
                    'min_samples_leaf': p5,
                    'class_weight': class_weight
                }
        elif self.kernel == 'xgb':
            max_depth = self.kwargs.get('max_depth', [1, 4, 8, 12])
            learning_rate = self.kwargs.get('learning_rate', [0.1, 0.2])
            n_jobs = [self.kwargs.get('n_jobs', psutil.cpu_count())]
            class_weight = self.kwargs.get('class_weight', [1, 10, 50])
            booster = self.kwargs.get('booster', ['gblinear', 'gbtree', 'dart'])
            n_estimators = self.kwargs.get('n_estimators', [10, 50, 100, 150])
            for (p1, p2, p3, p4, p5, p6) in itertools.product(
                max_depth, learning_rate, booster, n_jobs, class_weight, n_estimators
            ):
                yield {
                    'max_depth': p1,
                    'learning_rate': p2,
                    'booster': p3,
                    'n_jobs': p4,
                    'scale_pos_weight': p5,
                    'n_estimators': p6
                }
        elif self.kernel == 'svm':
            c = self.kwargs.get('c', [pow(2, i) for i in range(-2, 2)])
            svm_kernel = self.kwargs.get('svm_kernel', ['rbf', 'poly', 'sigmoid'])
            for (p1, p2) in itertools.product(c, svm_kernel):
                yield {
                    'C': p1,
                    'kernel': p2,
                    'class_weight': class_weight
                }
        else:
            raise NotImplementedError()

    @staticmethod
    def partition_data__(data, ratio, shuffle=True, multi=True):
        import random
        if not multi:
            size = len(data)
            if shuffle:
                idx = random.sample(range(size), int(size * ratio))
            else:
                idx, step, cnt, init = [], 1.0 / ratio, 0, 0
                while cnt < int(size * ratio):
                    idx.append(int(init))
                    init += step
                    cnt += 1  # advance the counter; otherwise this loop never terminates
            return data[idx]
        else:
            num, size = len(data), len(data[0])
            if shuffle:
                idx = random.sample(range(size), int(size * ratio))
            else:
                idx, step, cnt, init = [], 1.0 / ratio, 0, 0
                while cnt < int(size * ratio):
                    idx.append(int(init))
                    init += step
                    cnt += 1  # advance the counter; otherwise this loop never terminates
            return [data[k][idx] for k in range(num)]

    def deepwalk_paras(self):
        num_walks = self.kwargs.get('num_walks', [10, 20])
        representation_size = self.kwargs.get('representation_size', [32, 64, 128, 256])
        walk_length = self.kwargs.get('walk_length', [32, 64, 128])
        window_size = self.kwargs.get('window_size', [5, 10])
        workers = self.kwargs.get('workers', psutil.cpu_count())
        undirected = self.kwargs.get('undirected', False)
        for (p1, p2, p3, p4) in itertools.product(
            num_walks, representation_size, walk_length, window_size
        ):
            yield {
                'number-walks': p1,
                'representation-size': p2,
                'walk-length': p3,
                'window-size': p4,
                'workers': workers,
                'undirected': undirected
            }

    def return_metric_method(self, opt_metric):
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
        if opt_metric == 'accuracy':
            return accuracy_score
        elif opt_metric == 'precision':
            return precision_score
        elif opt_metric == 'recall':
            return recall_score
        elif opt_metric == 'f1':
            return f1_score
        else:
            raise NotImplementedError('unsupported metric {}'.format(opt_metric))

    def save_model(self, fpath):
        pass

    def load_model(self, fpath, map_location='cuda:0'):
        pass

    def save_shapelets(self, fpath):
        pass

    def load_shapelets(self, fpath, map_location='cuda:0'):
        pass

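# --- Editorial usage sketch (not part of the original file) ---
# A tiny grid search over the parameter generator above, assuming
# scikit-learn and numpy are available. The toy data and the narrowed
# kwargs (criteria, max_feature, ...) are illustrative only.
def _demo_model_utils():
    import numpy as np
    mu = ModelUtils(kernel='rf', criteria=['gini'], max_feature=['log2', None],
                    max_depth=[5], max_split=[2], min_leaf=[1])
    X = np.random.rand(100, 8)
    y = (X[:, 0] > 0.5).astype(int)
    metric = mu.return_metric_method('accuracy')
    best_score, best_paras = -1.0, None
    for paras in mu.clf_paras(balanced=True):
        clf = mu.clf__(**paras).fit(X, y)   # clf__ returns the classifier class itself
        score = metric(y, clf.predict(X))
        if score > best_score:
            best_score, best_paras = score, paras
    print('best accuracy {:.3f} with {}'.format(best_score, best_paras))
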
class Debugger(object):
    """
    Class for debug printing.
    """
    def __init__(self):
        pass

    @staticmethod
    def error_print(msg, debug=True):
        if debug:
            print('[error]' + msg)

    @staticmethod
    def warn_print(msg, debug=True):
        if debug:
            print('[warning]' + msg)

    @staticmethod
    def debug_print(msg, debug=True):
        if debug:
            print('[debug]' + msg + '\r', end='')
            sys.stdout.flush()

    @staticmethod
    def info_print(msg):
        print('[info]' + msg)

    @staticmethod
    def time_print(msg, begin, profiling=False):
        if profiling:
            # time.time() returns a float timestamp
            assert isinstance(begin, float), 'invalid begin time {}'.format(begin)
            print('[info]{}, elapsed for {:.2f}s'.format(msg, time.time() - begin))


class Queue:
    def __init__(self, max_size):
        self.queue = []
        self.max_size = max_size

    def enqueue(self, val):
        # drop the oldest element when the queue is full
        if self.size() == self.max_size:
            self.dequeue()
        self.queue.insert(0, val)

    def dequeue(self):
        if self.is_empty():
            return None
        else:
            return self.queue.pop()

    def size(self):
        return len(self.queue)

    def is_empty(self):
        return self.size() == 0

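# --- Editorial usage sketch (not part of the original file) ---
# Shows the sliding-window behaviour of the bounded Queue above: once
# max_size items are held, enqueue() silently drops the oldest one.
def _demo_queue():
    q = Queue(max_size=3)
    for v in [1, 2, 3, 4, 5]:
        q.enqueue(v)
    # 1 and 2 have been evicted; dequeue() returns the oldest remaining item
    print(q.size(), q.dequeue(), q.dequeue(), q.dequeue())  # 3 3 4 5
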
def convert_string(string, val, cvt_type='float'):
    """
    Convert a string to the given type.
    :param string: input string
    :param val: default return value if conversion fails
    :param cvt_type: conversion type
    :return: value with given type
    """
    try:
        return eval(cvt_type)(string)
    except NameError as _:
        Debugger.warn_print('invalid convert type {}; use float() by default'.format(cvt_type))
        return float(string)
    except ValueError as _:
        Debugger.warn_print('invalid convert value {}; return {} by default'.format(string, val))
        return val


def syscmd(cmd, encoding=''):
    """
    Runs a command on the system, waits for the command to finish, and then
    returns the text output of the command. If the command produces no text
    output, the command's return code will be returned instead.

    :param cmd: command, str
    :param encoding: encoding method, str (utf8, unicode, etc.)
    :return: return code or text output
    """
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE,
              stderr=STDOUT, close_fds=True)
    p.wait()
    output = p.stdout.read()
    if len(output) > 1:
        if encoding:
            return output.decode(encoding)
        else:
            return output
    return p.returncode


--------------------------------------------------------------------------------