├── .gitignore ├── README.md ├── __init__.py ├── archive ├── __init__.py ├── io_utils.py └── load_usr_dataset.py ├── baselines ├── __init__.py ├── evaluate_baselines.py ├── feature_based.py └── show_baseline_results.py ├── config.py ├── dataset └── UCRArchive_2018 │ ├── Earthquakes │ ├── Earthquakes_TEST.arff │ ├── Earthquakes_TEST.tsv │ ├── Earthquakes_TRAIN.arff │ ├── Earthquakes_TRAIN.tsv │ ├── README.md │ └── desktop.ini │ ├── Strawberry │ ├── README.md │ ├── Strawberry_TEST.tsv │ ├── Strawberry_TRAIN.tsv │ └── desktop.ini │ └── WormsTwoClass │ ├── README.md │ ├── WormsTwoClass_TEST.tsv │ ├── WormsTwoClass_TRAIN.tsv │ └── desktop.ini ├── docs ├── README.md ├── _config.yml ├── _layouts │ └── default.html ├── exp.jpg ├── motiv.jpg └── vis.jpg ├── evaluate_paras.py ├── requirements.txt ├── scripts ├── cache │ ├── ucr-Earthquakes_embedding_t2g_model.cache │ ├── ucr-Earthquakes_greedy_50_24_shapelets.cache │ ├── ucr-Strawberry_embedding_t2g_model.cache │ ├── ucr-Strawberry_greedy_50_15_shapelets.cache │ ├── ucr-WormsTwoClass_embedding_t2g_model.cache │ └── ucr-WormsTwoClass_greedy_20_30_shapelets.cache ├── run.py └── std_test.py ├── setup.py └── time2graph ├── __init__.py ├── core ├── __init__.py ├── distance_utils.py ├── model.py ├── model_embeds.py ├── model_sequence.py ├── model_utils.py ├── rnn │ ├── __init__.py │ ├── deep_models.py │ └── deep_utils.py ├── shapelet_embedding.py ├── shapelet_utils.py └── time_aware_shapelets.py └── utils ├── __init__.py ├── base_utils.py └── mp_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | env/ 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *.cover 44 | .hypothesis/ 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | local_settings.py 53 | 54 | # Flask stuff: 55 | instance/ 56 | .webassets-cache 57 | 58 | # Scrapy stuff: 59 | .scrapy 60 | 61 | # Sphinx documentation 62 | docs/_build/ 63 | 64 | # Jupyter Notebook 65 | .ipynb_checkpoints 66 | 67 | # pyenv 68 | .python-version 69 | 70 | # celery beat schedule file 71 | celerybeat-schedule 72 | 73 | # dotenv 74 | .env 75 | 76 | # virtualenv 77 | .venv 78 | venv/ 79 | ENV/ 80 | 81 | # Spyder project settings 82 | .spyderproject 83 | .spyproject 84 | 85 | # Rope project settings 86 | .ropeproject 87 | 88 | # mkdocs documentation 89 | /site -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Time2Graph 2 | This project implements the Time2Graph model[1], which focuses on time series modeling with dynamic shapelets. 
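
For a quick programmatic overview, the sketch below condenses the pipeline from `scripts/run.py`. The constructor and `fit` arguments mirror that script; the final `predict` call is an assumption based on the usual classifier interface and is not shown in this README, so treat the sketch as illustrative rather than authoritative.

```python
from archive.load_usr_dataset import load_usr_dataset_by_name
from time2graph.core.model import Time2Graph

# one of the bundled UCR benchmarks: 21 segments of length 24 (see config.py)
x_train, y_train, x_test, y_test = load_usr_dataset_by_name(
    fname='Earthquakes', length=24 * 21)

m = Time2Graph(kernel='xgb', K=50, C=500, seg_length=24, data_size=1,
               candidate_method='greedy', percentile=5, gpu_enable=False)
m.fit(X=x_train, Y=y_train, n_splits=5,
      cache_dir='scripts/cache/ucr-Earthquakes/')  # directory must exist
y_pred = m.predict(X=x_test)  # assumed sklearn-style interface; see scripts/run.py
```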
3 | 4 | ## Quick Links 5 | 6 | - [Building and Testing](#building-and-testing) 7 | - [Usage](#usage) 8 | - [Performance](#performance) 9 | - [Reference](#reference) 10 | 11 | ## Building and Testing 12 | 13 | This project is implemented primarily in Python 3.6, with several dependencies listed below. We have tested the whole framework on Ubuntu 16.04.5 LTS with kernel 4.4.0, and it is expected to easily build and run under a regular Unix-like system. 14 | 15 | ### Dependencies 16 | 17 | - [Python 3.6](https://www.python.org). 18 | Version 3.6.5 has been tested. Higher versions are expected be compatible with current implementation, while there may be syntax errors or conflicts under python 2.x. 19 | 20 | - [DeepWalk](https://github.com/phanein/deepwalk) 21 | We use a modified version of the original implementation of *deepwalk* to satisfy the support for directed and weighted graphs. The source codes with minor modifications can be found on [weighted_deepwalk](https://github.com/petecheng/weighted_deepwalk). 22 | 23 | - [PyTorch](https://pytorch.org). 24 | 25 | Version 0.4.1 has been tested. You can find installation instructions [here](https://pytorch.org/get-started/locally/). Note that the GPU support is **ENCOURAGED** as it greatly boosts training efficiency. 26 | 27 | - [XGBoost](https://github.com/dmlc/xgboost) 28 | 29 | Version 0.80 has been tested. You can find installation instructions [here](https://xgboost.readthedocs.io/en/latest/build.html). 30 | 31 | - [Other Python modules](https://pypi.python.org). Some other Python module dependencies are listed in ```requirements.txt```, which can be easily installed with pip: 32 | 33 | ```bash 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | Although not all dependencies are mentioned in the installation instruction links above, you can find most of the libraries in the package repository of a regular Linux distribution. 38 | 39 | ### Building the Project 40 | 41 | Before building the project, we recommend switching the working directory to the project root directory. Assume the project root is at ````, then run command 42 | 43 | ```bash 44 | cd 45 | ``` 46 | 47 | Note that we assume ```` as your working directory in all the commands presented in the rest of this documentation. Then make sure that the environment variable `` PYTHONPATH`` is properly set, by running the following command (on a Linux distribution): 48 | 49 | ```bash 50 | export PYTHONPATH=`readlink -f ./` 51 | ``` 52 | 53 | ### Testing the Project (Reproducibility) 54 | 55 | A test script ```scripts/std_test.py``` is available for reproducibility on the benchmark datasets: 56 | 57 | ```markdown 58 | python . -h 59 | 60 | usage: . [-h] [--dataset] [--n_splits] [--model_cache] [--shapelet_cache] [--gpu_enable] 61 | 62 | optional arguments: 63 | -h, --help show this help message and exit 64 | --dataset str, one of `ucr-Earthquakes`, `ucr-WormsTwoClass` and `ucr-Strawberry`, 65 | which we have set the optimal parameters after fine-tuning. 66 | (default: `ucr-Earthquakes`) 67 | --n_splits int, number of splits in cross-validation. (default: 5) 68 | --model_cache bool, whether to use a pretrained model.(default: False) 69 | --shapelet_cache bool, whether to use a pretrained shapelets set.(default: False) 70 | --gpu_enable bool, whether to enable GPU usage. 
(default: False) 71 | ``` 72 | 73 | To quickly and exactly reproduce the results that reported in the paper, we highly **RECOMMEND** that set ``model_cache`` as True, since there are unavoidable randomness in the process of shapelets learning and graph embedding. And if only `shapelet_cache` is True, it will learn a new set of shapelet embeddings, which may bring some small fluctuations on the performance. So the easiest way for reproducibility and project testing is to run the following command: 74 | 75 | ```bash 76 | python scripts/std_test.py --model_cache --dataset *OPTION* --gpu_enable 77 | ``` 78 | 79 | ## Usage 80 | 81 | Given a set of time series data and the corresponding labels, the **Time2Graph** framework aims to learn the representations of original time series, and conduct time series classifications under the setting of supervised learning. 82 | 83 | ### Input Format 84 | 85 | The input time series data and labels are expected to be ```numpy.ndarray```: 86 | 87 | ```markdown 88 | Time_Series X: 89 | numpy.ndarray with shape (N x L x data_size), 90 | where N is the number of time series, L is the time series length, 91 | and data_size is the data dimension. 92 | Labels Y: 93 | numpy.ndarray with shape (N x 1), with 0 as negative, and 1 as positive samples. 94 | ``` 95 | 96 | We organize the preprocessing codes that load the *UCR* dataset in the `archive/` repo, and if you want to utilize the framework on other datasets, just preprocess the original data as the abovementioned format. Note that the time series data is not needed to be normalized or scaled, since you can set the parameter `scaled` as True when initializing **Time2Graph** model. 97 | 98 | ### Main Script 99 | 100 | Now that the input data is ready, the main script `scripts/run.py` is a pipeline example to train and test the whole framework. Firstly you need to modify the codes in the following block (*line 46-51*) to load your datasets, by reassigning `x_train, y_train, x_test, y_test` respectively. 101 | 102 | ```python 103 | if args.dataset.startswith('ucr'): 104 | dataset = args.dataset.rstrip('\n\r').split('-')[-1] 105 | x_train, y_train, x_test, y_test = load_usr_dataset_by_name( 106 | fname=dataset, length=args.seg_length * args.num_segment) 107 | else: 108 | raise NotImplementedError() 109 | ``` 110 | 111 | The help information of the main script `scripts/run.py` is listed as follows: 112 | 113 | ```markdown 114 | python . -h 115 | 116 | usage: .[-h] [-- dataset] [--K] [--C] [--num_segment] [--seg_length] [--data_size] 117 | [--n_splits] [--njobs] [--optimizer] [--alpha] [--beta] [--init] 118 | [--gpu_enable] [--opt_metric] [--cache] [--embed] [--embed_size] [--warp] 119 | [--cmethod] [--kernel] [--percentile] [--measurement] [--batch_size] 120 | [--tflag] [--scaled] [--norm] [--no_global] 121 | 122 | optional arguments: 123 | -h, --help show this help message and exit 124 | --dataset str, indicate which dataset to load; 125 | need to modify the codes in line 46-51. 126 | --K int, number of shapelets that try to learn 127 | --C int, number of shapelet candidates used for learning shapelets 128 | --num_segment int, number of segment that a time series have 129 | --seg_length int, the segment length, 130 | so the length of a time series is num_segment * seg_length 131 | --data_size int, the dimension of time series data 132 | --n_splits int, number of cross-validation, default 5. 133 | --njobs int, number of threads if using multiprocessing. 
134 | --optimizer str, optimizer used for learning shapelets, default `Adam`. 135 | --alpha float, penalty for local timing factor, default 0.1. 136 | --beta float, penalty for global timing factor, default 0.05. 137 | --init int, init offset for time series, default 0. 138 | --gpu_enable bool, whether to use GPU, default False. 139 | --opt_metric str, metric for optimizing out-classifier, default `accuracy`. 140 | --cache bool, whether to save model cache, defualt False. 141 | --embed str, embedding mode, one of `aggregate` and `concate`. 142 | --embed_size int, embedding size in deepwalk, default 256. 143 | --wrap int, warp size in greedy-dtw, default 2. 144 | --cmethod str, candidate generation method, one of `cluster` and `greedy` 145 | --kernel str, choice of outer-classifer, default `xgb`. 146 | --percentile int, distance threshold (percentile) in graph construction, default 10 147 | --measurement str, distance measurement,default `gdtw`. 148 | --batch_size int, batch size, default 50 149 | --tflag bool, whether to use timing factors, default True. 150 | --scaled bool, whether to scale time seriee by z-normalize, default False. 151 | --norm bool, whether to normalize handcraft-features, default False. 152 | --no_global bool, whether to use global timing factor 153 | when constructing shapelet evolution graph, default False. 154 | ``` 155 | 156 | Some of the arguments may require further explanation: 157 | 158 | - ``--K/--C``: the number of shapelets should be carefully selected, and it is highly related with intrinsic properties of the dataset. And in our extensive experiments, `C` is often set 10 or 20 times of `K` to ensure that we can learn from a large pool of candidates. 159 | - ``--percentile`` , ``--alpha`` and `--beta`: we have conduct fine-tuning on several datasets, and in most cases we recommend the default settings, although modifying them may bring performance increment, as well as drop. 160 | 161 | ### Demo 162 | 163 | We include all three benchmark *UCR* datasets in the ``dataset`` directory, which is a subset of *UCR-Archive* time series dataset. See [Data Sets](#data-sets) for more details. Then a demo script is available by calling `scripts/run.py`, as the following: 164 | 165 | ```shell 166 | python scripts/run.py --dataset ucr-Earthquakes --K 50 --C 500 167 | --num_segment 21 --seg_length 24 --data_size 1 --embed concate --percentile 5 --gpu_enable 168 | ``` 169 | 170 | ## Evaluation 171 | 172 | ### Data Sets 173 | 174 | The three benchmark datasets reported in [1] was made public by [UCR](https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/), which consists of many time series datasets. we select several *UCR* datasets from many candidates by the following reasons that: 1) to maintain the consistency of evaluation metrics between the real-world and public datasets, we only consider binary-label ones in *UCR*; 2) we have to make sure that there are enough training cases because we need sufficient samples to capture the normal transitions between shapelets (many binary-label datasets in *UCR* only have less than 100 training samples), and 3) we omit all datasets categorized as “image”, because the proposed intuition (timing factor, shapelet evolutions) may not be appropriate for time series transformed from images. After filtering based on the abovementioned criterion, and due to space limitation, we only present those three in [1]. 
We have tested some others such as *Ham* and *Computers*, etc., and also achieved competitive results compared with baseline methods. 175 | 176 | Furthermore, we apply the proposed *Time2Graph* model on two real-world scenarios: Electricity Consumption Records (**ECR**) provided by State Grid of China, and Network Traffic Flow (**NTF**) from China Telecom. Detailed dataset descriptions can be found in our paper. The performance increment compared with existing models clearly demonstrate the effectiveness of the framework, and below we list the final results along with several popular baselines. 177 | 178 | ### Performance 179 | 180 | | Accuracy on UCR(%) | Earthquakes | WormsTwoClass | Strawberry | 181 | | :----------------: | :---------: | :-----------: | :--------: | 182 | | NN-DTW | 70.31 | 68.16 | 95.53 | 183 | | TSF | 74.67 | 68.51 | 96.27 | 184 | | FS | 74.66 | 70.58 | 91.66 | 185 | | Time2Graph | **79.14** | **72.73** | **96.76** | 186 | 187 | | Performance on ECR(%) | Precision | Recall | F1 | 188 | | :-------------------: | :-------: | :-------: | :-------: | 189 | | NN-DTW | 15.52 | 18.15 | 16.73 | 190 | | TSF | 26.32 | 2.02 | 3.75 | 191 | | FS | 10.45 | 79.84* | 18.48 | 192 | | Time2Graph | **30.10** | **40.26** | **34.44** | 193 | 194 | | Performance on NTF(%) | Precision | Recall | F1 | 195 | | :-------------------: | :-------: | :-------: | :-------: | 196 | | NN-DTW | 33.20 | 43.75 | 37.75 | 197 | | TSF | 57.52 | 33.85 | 42.62 | 198 | | FS | 63.55 | 35.42 | 45.49 | 199 | | Time2Graph | **71.52** | **56.25** | **62.97** | 200 | 201 | Please refer to our paper [1] for detailed information about the experimental settings, the description of unpublished data sets, the full results of our experiments, along with ablation and observational studies. 202 | 203 | ## Reference 204 | 205 | [1] Cheng, Z; Yang, Y; Wang, W; Hu, W; Zhuang, Y and Song, G, 2020, Time2Graph: Revisiting Time Series Modeling with Dynamic Shapelets, In AAAI, 2020 206 | 207 | ``` 208 | @inproceedings{cheng2020time2graph, 209 | title = "{Time2Graph: Revisiting Time Series Modeling with Dynamic Shapelets}", 210 | author = {{Cheng}, Z. and {Yang}, Y. and {Wang}, W. and {Hu}, W. and {Zhuang}, Y. 
and {Song}, G.}, 211 | booktitle={Proceedings of Association for the Advancement of Artificial Intelligence (AAAI)}, 212 | year = 2020, 213 | } 214 | ``` -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/__init__.py -------------------------------------------------------------------------------- /archive/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/archive/__init__.py -------------------------------------------------------------------------------- /archive/io_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import datetime 4 | 5 | 6 | def convert_str2float(strr): 7 | if strr == '': 8 | return -1.0 9 | else: 10 | return float(strr) 11 | 12 | 13 | def convert_str2int(strr): 14 | if strr == '': 15 | return -1 16 | else: 17 | return int(strr) 18 | 19 | 20 | def get_month_in_year(timestamp): 21 | return int(time.localtime(timestamp).tm_mon) - 1 22 | 23 | 24 | def get_day_in_month(timestamp): 25 | return int(time.localtime(timestamp).tm_mday) - 1 26 | 27 | 28 | def get_day_in_year(timestamp): 29 | return int(time.localtime(timestamp).tm_yday) - 1 30 | 31 | 32 | def get_year(timestamp): 33 | return int(time.localtime(timestamp).tm_year) 34 | 35 | 36 | def format_time_from_str(time_str, tfmt): 37 | return int(time.mktime(time.strptime(time_str, tfmt))) 38 | 39 | 40 | def generate_time_series_time(begin, end, tfmt, duration): 41 | ret = [] 42 | d_begin = datetime.datetime.fromtimestamp(format_time_from_str(time_str=begin, tfmt=tfmt)) 43 | d_end = datetime.datetime.fromtimestamp(format_time_from_str(time_str=end, tfmt=tfmt)) 44 | while d_begin <= d_end: 45 | ret.append(d_begin.strftime(tfmt)) 46 | if duration == 'day': 47 | d_begin += datetime.timedelta(days=1) 48 | elif duration == 'hour': 49 | d_begin += datetime.timedelta(hours=1) 50 | else: 51 | raise NotImplementedError() 52 | return ret 53 | 54 | 55 | def generate_time_index(begin, end, tfmt, duration): 56 | ret = {} 57 | d_begin = datetime.datetime.fromtimestamp(format_time_from_str(time_str=begin, tfmt=tfmt)) 58 | d_end = datetime.datetime.fromtimestamp(format_time_from_str(time_str=end, tfmt=tfmt)) 59 | cnt = 0 60 | while d_begin <= d_end: 61 | ret[d_begin.strftime(tfmt)] = cnt 62 | cnt += 1 63 | if duration == 'day': 64 | d_begin += datetime.timedelta(days=1) 65 | elif duration == 'hour': 66 | d_begin += datetime.timedelta(hours=1) 67 | else: 68 | raise NotImplementedError() 69 | return ret 70 | 71 | 72 | def transform_np2tsv(x, y, fpath): 73 | output = open(fpath, 'w') 74 | for k in range(len(y)): 75 | data = x[k] 76 | output.write('{}'.format(y[k])) 77 | for i in range(len(data)): 78 | for j in range(len(data[i])): 79 | output.write('\t{}'.format(data[i, j])) 80 | output.write('\n') 81 | -------------------------------------------------------------------------------- /archive/load_usr_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas 3 | from config import * 4 | 5 | 6 | def load_usr_dataset_by_name(fname, length): 7 | """ 8 | load UCR dataset given dataset name. 
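    Files are read from `dataset/UCRArchive_2018/<fname>/<fname>_TRAIN.tsv` and `..._TEST.tsv`;
    column 0 is the label (remapped to 0..n_classes-1) and the last `length` value columns
    are returned reshaped to (N, length, 1).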
9 | :param fname: 10 | dataset name, e.g., Earthquakes. 11 | :param length: 12 | time series length that want to load in. 13 | :return: 14 | """ 15 | dir_path = '{}/dataset/UCRArchive_2018'.format(module_path) 16 | assert path.isfile('{}/{}/{}_TEST.tsv'.format(dir_path, fname, fname)), '{} NOT EXIST in UCR!'.format(fname) 17 | train_data = pandas.read_csv('{}/{}/{}_TRAIN.tsv'.format(dir_path, fname, fname), sep='\t', header=None) 18 | test_data = pandas.read_csv('{}/{}/{}_TEST.tsv'.format(dir_path, fname, fname), sep='\t', header=None) 19 | init = train_data.shape[1] - length 20 | x_train, y_train = train_data.values[:, init:].astype(np.float).reshape(-1, length, 1), \ 21 | train_data[0].values.astype(np.int) 22 | x_test, y_test = test_data.values[:, init:].astype(np.float).reshape(-1, length, 1), \ 23 | test_data[0].values.astype(np.int) 24 | lbs = np.unique(y_train) 25 | y_train_return, y_test_return = np.copy(y_train), np.copy(y_test) 26 | for idx, val in enumerate(lbs): 27 | y_train_return[y_train == val] = idx 28 | y_test_return[y_test == val] = idx 29 | return x_train, y_train_return, x_test, y_test_return 30 | 31 | 32 | -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | SAXVSM: pyts.Classification 3 | LS: tslearn.Shapelet 4 | """ -------------------------------------------------------------------------------- /baselines/evaluate_baselines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | import warnings 4 | import os 5 | from config import * 6 | from time2graph.utils.base_utils import Debugger 7 | """ 8 | scripts for generating java-cmd that conduct baseline algorithms. 
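usage example (writes evaluate_baselines_<dataset>_<clf>.sh under the --top directory):
    python baselines/evaluate_baselines.py --dataset <name> --clf all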
9 | """ 10 | 11 | if __name__ == '__main__': 12 | warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning) 13 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 14 | parser.add_argument('--dataset', type=str, default='stealing') 15 | parser.add_argument('--classpath', type=str, 16 | default='{}/baselines/TimeSeriesClassification/'.format(module_path)) 17 | parser.add_argument('--input', type=str, default='{}/dataset/'.format(module_path)) 18 | parser.add_argument('--output', type=str, default='{}/dataset/'.format(module_path)) 19 | parser.add_argument('--top', type=str, default='{}/baselines/TimeSeriesClassification/' 20 | 'out/production/TimeSeriesClassification'.format(module_path)) 21 | parser.add_argument('--gpu_number', type=int, default=0) 22 | parser.add_argument('--clf', type=str, required=True) 23 | 24 | opt = parser.parse_args() 25 | all_clf = [ 26 | 'CID_DTW', 'DD_DTW', 'WDTW', 'ED', 'DTW', 27 | 'LearnShapelets', 'FastShapelets', 'BagOfPatterns', 28 | 'TSF', 'TSBF', 'LPS', 'ST', 'COTE' 29 | ] 30 | 31 | classpath = [] 32 | for dirpath, dirnames, fnamesList in os.walk(opt.classpath): 33 | Debugger.info_print('{}'.format(dirpath)) 34 | for fname in fnamesList: 35 | if fname.endswith('.jar'): 36 | classpath.append('{}{}'.format(dirpath, fname)) 37 | break 38 | Debugger.info_print('{}'.format(classpath)) 39 | 40 | cmd = 'CUDA_VISIBLE_DEVICES={} java -classpath {}'.format(opt.gpu_number, opt.top) 41 | if opt.clf != 'all': 42 | for p in classpath: 43 | cmd += ':{}'.format(p) 44 | dataset_cmd = cmd + ' development.DataSets -i {} -o {} -t {}'.format(opt.input, opt.output, opt.dataset) 45 | predict_cmd = cmd + ' timeseriesweka.examples.ClassificationExamples -i {} -o {} -t {} -c {}'.format( 46 | opt.input, opt.output, opt.dataset, opt.clf 47 | ) 48 | output = open('{}/evaluate_baselines_{}_{}.sh'.format(opt.top, opt.dataset, opt.clf), 'w') 49 | output.write('#!/usr/bin/env bash\n{}\n{}\n'.format(dataset_cmd, predict_cmd)) 50 | output.close() 51 | else: 52 | for p in classpath: 53 | cmd += ':{}'.format(p) 54 | dataset_cmd = cmd + ' development.DataSets -i {} -o {} -t {}'.format(opt.input, opt.output, opt.dataset) 55 | output = open('{}/evaluate_baselines_{}_{}.sh'.format(opt.top, opt.dataset, opt.clf), 'w') 56 | output.write('#!/usr/bin/env bash\n{}\n'.format(dataset_cmd)) 57 | for clf in all_clf: 58 | predict_cmd = cmd + ' timeseriesweka.examples.ClassificationExamples -i {} -o {} -t {} -c {}'.format( 59 | opt.input, opt.output, opt.dataset, clf 60 | ) 61 | output.write('{}\n'.format(predict_cmd)) 62 | output.close() 63 | -------------------------------------------------------------------------------- /baselines/feature_based.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from time2graph.utils.base_utils import ModelUtils 4 | from sklearn.model_selection import StratifiedKFold 5 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 6 | 7 | 8 | class FeatureModel(ModelUtils): 9 | """ 10 | Class for Handcraft-feature Model for time series classification. 11 | Feature list: 12 | a) mean, std of whole time series. 13 | b) mean, std of each segments. 14 | c) mean of the std of segments. 15 | d) std of the mean of segments. 
16 | """ 17 | def __init__(self, seg_length, kernel='xgb', opt_metric='f1', **kwargs): 18 | super(FeatureModel, self).__init__(kernel=kernel, **kwargs) 19 | self.clf = None 20 | self.seg_length = seg_length 21 | self.opt_metric = opt_metric 22 | 23 | def extract_features(self, samples): 24 | num_samples, data_size = samples.shape[0], samples.shape[-1] 25 | samples = samples.reshape(num_samples, -1, self.seg_length, data_size) 26 | series_mean = np.mean(samples.reshape(num_samples, -1, data_size), axis=1).reshape(num_samples, -1) 27 | series_std = np.std(samples.reshape(num_samples, -1, data_size), axis=1).reshape(num_samples, -1) 28 | seg_mean, seg_std = np.mean(samples, axis=2), np.mean(samples, axis=2) 29 | seg_mean_std, seg_std_mean = np.std(seg_mean, axis=1), np.mean(seg_std, axis=1) 30 | seg_mean = seg_mean.reshape(num_samples, -1) 31 | seg_std = seg_std.reshape(num_samples, -1) 32 | seg_mean_std = seg_mean_std.reshape(num_samples, -1) 33 | seg_std_mean = seg_std_mean.reshape(num_samples, -1) 34 | return np.concatenate((series_mean, series_std, seg_mean, seg_std, seg_mean_std, seg_std_mean), axis=1) 35 | 36 | def fit(self, X, Y, n_splits=5, balanced=True): 37 | x = self.extract_features(samples=X) 38 | max_accu, max_prec, max_recall, max_f1, max_metric = -1, -1, -1, -1, -1 39 | arguments, opt_args = self.clf_paras(balanced=balanced), None 40 | metric_measure = self.return_metric_method(opt_metric=self.opt_metric) 41 | for args in arguments: 42 | self.clf.set_params(**args) 43 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True) 44 | tmp = np.zeros(5, dtype=np.float32).reshape(-1) 45 | measure_vector = [metric_measure, accuracy_score, precision_score, recall_score, f1_score] 46 | for train_idx, test_idx in skf.split(x, Y): 47 | self.clf.fit(x[train_idx], Y[train_idx]) 48 | y_pred, y_true = self.clf.predict(x[test_idx]), Y[test_idx] 49 | for k in range(5): 50 | tmp[k] += measure_vector[k](y_true=y_true, y_pred=y_pred) 51 | tmp /= n_splits 52 | if max_metric < tmp[0]: 53 | max_metric = tmp 54 | opt_args = args 55 | max_accu, max_prec, max_recall, max_f1 = tmp[1:] 56 | Debugger.info_print('args {} for clf {}, performance: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format( 57 | opt_args, self.kernel, max_accu, max_prec, max_recall, max_f1)) 58 | self.clf.set_params(**opt_args) 59 | 60 | def predict(self, X, **kwargs): 61 | x = self.extract_features(samples=X) 62 | return self.clf.predict(x) 63 | -------------------------------------------------------------------------------- /baselines/show_baseline_results.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | import warnings 4 | from config import * 5 | from time2graph.utils.base_utils import Debugger 6 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 7 | """ 8 | scripts for show baseline results generated by java-package provided in UCR. 
9 | """ 10 | 11 | 12 | def load_baseline_results(fpath): 13 | y_pred, y_test = [], [] 14 | with open(fpath, 'r') as f: 15 | cnt = 0 16 | for line in f: 17 | if cnt < 3: 18 | cnt += 1 19 | continue 20 | line = line.rstrip('\n').split(',') 21 | if len(line) <= 4: 22 | continue 23 | y_test.append(int(line[0])) 24 | y_pred.append(int(line[1])) 25 | f.close() 26 | return y_pred, y_test 27 | 28 | 29 | if __name__ == '__main__': 30 | warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning) 31 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 32 | parser.add_argument('--dataset', type=str, default='stealing') 33 | parser.add_argument('--clf', type=str, required=True) 34 | 35 | opt = parser.parse_args() 36 | all_clf = [ 37 | 'CID_DTW', 'DD_DTW', 'WDTW', 'ED', 'DTW', 38 | 'LearnShapelets', 'FastShapelets', 'BagOfPatterns', 39 | 'TSF', 'TSBF', 'LPS', 'SAX', 'ST', 'COTE', 'EE' 40 | ] 41 | assert opt.clf in all_clf 42 | fpath = '{}/dataset/{}/Predictions/{}/testFold0.csv'.format(module_path, opt.clf, opt.dataset) 43 | y_pred, y_test = load_baseline_results(fpath=fpath) 44 | Debugger.info_print('{} test samples with {:.4f} positive'.format(len(y_test), sum(y_test) / len(y_test))) 45 | accu = accuracy_score(y_true=y_test, y_pred=y_pred) 46 | prec = precision_score(y_true=y_test, y_pred=y_pred) 47 | recall = recall_score(y_true=y_test, y_pred=y_pred) 48 | f1 = f1_score(y_true=y_test, y_pred=y_pred) 49 | Debugger.info_print('res: accu {:.4f}, prec {:.4f}, recall {:.4f}, f1 {:.4f}'.format( 50 | accu, prec, recall, f1 51 | )) 52 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from os import path 4 | from time2graph.utils.base_utils import Debugger 5 | """ 6 | configuration file for benchmark datasets from UCR. 7 | Earthquakes (EQS). 8 | WormsTwoClass (WTC). 9 | StrawBerry (STB). 10 | including hyper-parameters and optimal arguments in xgboost. 
11 | """ 12 | 13 | module_path = path.dirname(path.abspath(__file__)) 14 | 15 | 16 | EQS = { 17 | 'K': 50, 18 | 'C': 800, 19 | 'seg_length': 24, 20 | 'num_segment': 21, 21 | 'percentile': 5 22 | } 23 | 24 | WTC = { 25 | 'K': 20, 26 | 'C': 400, 27 | 'seg_length': 30, 28 | 'num_segment': 30, 29 | 'percentile': 5, 30 | 'global_flag': False 31 | } 32 | 33 | STB = { 34 | 'K': 50, 35 | 'C': 800, 36 | 'seg_length': 15, 37 | 'num_segment': 15, 38 | 'percentile': 10, 39 | 'embed': 'aggregate' 40 | } 41 | 42 | model_args = { 43 | 'ucr-Earthquakes': EQS, 44 | 'ucr-WormsTwoClass': WTC, 45 | 'ucr-Strawberry': STB 46 | } 47 | 48 | xgb_args = { 49 | 'ucr-Earthquakes': { 50 | 'max_depth': 16, 51 | 'learning_rate': 0.2, 52 | 'scale_pos_weight': 1, 53 | 'booster': 'gbtree' 54 | }, 55 | 'ucr-WormsTwoClass': { 56 | 'max_depth': 2, 57 | 'learning_rate': 0.2, 58 | 'scale_pos_weight': 1, 59 | 'booster': 'gbtree' 60 | }, 61 | 'ucr-Strawberry': { 62 | 'max_depth': 8, 63 | 'learning_rate': 0.2, 64 | 'scale_pos_weight': 1, 65 | 'booster': 'gbtree' 66 | } 67 | } 68 | 69 | __all__ = [ 70 | 'np', 71 | 'path', 72 | 'Debugger', 73 | 'module_path', 74 | 'model_args', 75 | 'xgb_args' 76 | ] 77 | -------------------------------------------------------------------------------- /dataset/UCRArchive_2018/Earthquakes/README.md: -------------------------------------------------------------------------------- 1 | # Earthquakes 2 | 3 | The earthquake classification problem involves predicting whether a major event is about to occur based on the most recent readings in the surrounding area. The data are taken from Northern California Earthquake Data Center and each data point is an averaged reading for one hour, with the first reading taken on Dec 1st 1967 and the last in 2003. This single time series are then turned into a classification problem of differentiating between a positive and negative major earthquake event. 4 | 5 | A major event is defined as any reading of over 5 on the Rictor scale. Major events are often followed by aftershocks. (The physics of these are well understood and their detection is not the objective of this dataset.) A positive case is defined a major event which is not preceded by another major event for at least 512 hours. 6 | 7 | Negative cases are instances where there is a reading below 4 (to avoid blurring of the boundaries between major and non-major events) that is preceded by at least 20 non-zero readings in the previous 512 hours (to avoid trivial negative cases). 8 | 9 | In total, 368 negative and 93 positive cases were extracted from 86,066 hourly readings. None of the cases overlap in time (i.e. a segmentation is used instead of a sliding window). 10 | 11 | Train size: 322 12 | 13 | Test size: 139 14 | 15 | Missing value: No 16 | 17 | Number of classses: 2 18 | 19 | Time series length: 512 20 | 21 | Data donated by Anthony Bagnall (see [1]). 22 | 23 | [1] http://www.timeseriesclassification.com/description.php?Dataset=Earthquakes 24 | -------------------------------------------------------------------------------- /dataset/UCRArchive_2018/Earthquakes/desktop.ini: -------------------------------------------------------------------------------- 1 | [.ShellClassInfo] 2 | InfoTip=This folder is shared online. 
3 | IconFile=C:\Program Files\Google\Drive\googledrivesync.exe 4 | IconIndex=16 5 | -------------------------------------------------------------------------------- /dataset/UCRArchive_2018/Strawberry/README.md: -------------------------------------------------------------------------------- 1 | # Strawberry 2 | 3 | Food spectrographs are used in chemometrics to classify food types, a task that has obvious applications in food safety and quality assurance. This data was processed using Fourier transform infrared (FTIR) spectroscopy with attenuated total reflectance (ATR) sampling. More details are provided in [1][2]. 4 | 5 | The classes are strawberry purees (authentic samples) and non-strawberry purees (adulterated strawberries and other fruits). 6 | 7 | Train size: 613 8 | 9 | Test size: 370 10 | 11 | Missing value: No 12 | 13 | Number of classses: 2 14 | 15 | Time series length: 235 16 | 17 | Data donated by Katherine Kemsley and Anthony Bagnall (see [1], [2], [3]). 18 | 19 | [1] Holland, J. K., E. K. Kemsley, and R. H. Wilson. "Use of Fourier transform infrared spectroscopy and partial least squares regression for the detection of adulteration of strawberry purees." Journal of the Science of Food and Agriculture 76.2 (1998): 263-269. 20 | 21 | [2] https://csr.quadram.ac.uk/example-datasets-for-download/ 22 | 23 | [3] http://www.timeseriesclassification.com/description.php?Dataset=Strawberry -------------------------------------------------------------------------------- /dataset/UCRArchive_2018/Strawberry/desktop.ini: -------------------------------------------------------------------------------- 1 | [.ShellClassInfo] 2 | InfoTip=This folder is shared online. 3 | IconFile=C:\Program Files\Google\Drive\googledrivesync.exe 4 | IconIndex=16 5 | -------------------------------------------------------------------------------- /dataset/UCRArchive_2018/WormsTwoClass/README.md: -------------------------------------------------------------------------------- 1 | # WormTwoClass 2 | 3 | Caenorhabditis elegans (C. elegans) is a roundworm commonly used as a model organism in genetics study. The movement of these worms is known to be a useful indicator for understanding behavioural genetics. 4 | 5 | The data were original from [1][2], in which the authors described a system for recording the motion of worms on an agar plate and measuring a range of human-defined features. 6 | 7 | It has been shown that the space of shapes Caenorhabditis elegans adopts on an agar plate can be represented by combinations of four base shapes, or eigenworms. Once the worm outline is extracted, each frame of worm motion can be captured by four scalars representing the amplitudes along each dimension when the shape is projected onto the four eigenworms. 8 | 9 | The data were formatted for time series classification task and used in [3]. Each case is a series of the first eigenworm only, down-sampled to second-long intervals and averaged down so that all series are of length 900. There are 258 cases in total; each belongs to one of five types: one wild-type (the N2 reference strain - 109 cases) and four mutants: goa-1 (44 cases), unc-1 (35 cases), unc-38 (45 cases) and unc-63 (25 cases). 10 | 11 | In case of the *WormsTwoClass* dataset, the task is to classify worms of wild-type or mutant-type. 12 | 13 | In case of the *Worms* dataset, the task is to classify worms into one of the five categories. 
14 | 15 | Train size: 181 16 | 17 | Test size: 77 18 | 19 | Missing value: No 20 | 21 | Number of classses: 2 22 | 23 | Time series length: 900 24 | 25 | Data donated by Andre Brown and Anthony Bagnall (see [1], [3]). 26 | 27 | [1] Brown, André EX, et al. "A dictionary of behavioral motifs reveals clusters of genes affecting Caenorhabditis elegans locomotion." Proceedings of the National Academy of Sciences 110.2 (2013): 791-796. 28 | 29 | [2] Yemini, Eviatar, et al. "A database of Caenorhabditis elegans behavioral phenotypes." Nature methods 10.9 (2013): 877. 30 | 31 | [3] Bagnall, Anthony, et al. "Time-series classification with COTE: the collective of transformation-based ensembles." IEEE Transactions on Knowledge and Data Engineering 27.9 (2015): 2522-2535. 32 | 33 | [4] http://www.timeseriesclassification.com/description.php?Dataset=Worms -------------------------------------------------------------------------------- /dataset/UCRArchive_2018/WormsTwoClass/desktop.ini: -------------------------------------------------------------------------------- 1 | [.ShellClassInfo] 2 | InfoTip=This folder is shared online. 3 | IconFile=C:\Program Files\Google\Drive\googledrivesync.exe 4 | IconIndex=16 5 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ### Time Series Modeling 2 | 3 | Time series modeling aims to discover the temporal relationships within chronologically arranged data. It has attracted extensive research over a wide range of fields, such as image alignment [2], speech recognition [3], etc. The key issue here is how to extract the representative features of a time series. A large part of previous frameworks range from classical feature engineering and representation learning to deep learning based models. While these methods have achieved good performance [4, 5], they have also been subject to criticism regarding their lack of interpretability. 4 | 5 | ### Intuition: Shapelet Dynamics 6 | 7 | ***Shapelets***, the time series subsequences that are representative of a class [6], can offer directly interpretable and explanatory insights in the classification scenario, and shapelet-based models have proven to be promising in various practical domains [7,8,9]. 8 | 9 | Existing efforts have mainly considered shapelets as static. However, in the real world, shapelets are often dynamic, which is reflected in two respects: 10 | 11 | * First, the same shapelet appearing at different time slices may have a range of different impacts. For instance, in the scenario of detecting electricity theft, low electricity consumption in summer or winter is more suspicious than it is in spring, as refrigeration or heating equipments costs more electrical power. 12 | * Second, determining the ways in which shapelets evolve is vital to a full understanding of a time series. In fact, shapelets with small values at a particular time can hardly distinguish an electricity thief from a normal user who indeed consumes a low level of electricity. An alternative method would involve identifying users who once had high electricity consumption shapelets but suddenly consumes very few electrical power. In other words, an important clue here is how shapelets evolve over time. 13 | 14 | We refer to the subsequences of a time series that are able to reflect its representativeness at different time slices as *time-aware shapelets*. 
Furthermore, to mine the dynamics and correlations of shapelets more deeply, we propose a novel approach that learns representations of a time series by extracting time-aware shapelets and constructing a shapelet evolution graph, as described in our AAAI 2020 paper [1]. 15 | 16 |
17 | [figure: motiv.jpg]
18 |
19 | 20 | Above shows an concrete example from real-world electricity consumption record data, which may better explain our motivations: Fig. a demonstrates the one-year electricity usage of a user who has stolen electrical power from January to May while using electrical power normally in the remaining months. We assign each month the most representative shapelet at that time and present the shapelets *#72* and *#67*, along with their timing factors in Fig. b, where dark areas indicate that the corresponding shapelet is more discriminative relative to light areas. The shapelet evolution graph is presented in Fig. c, illustrating how a shapelet would transfer from one to another *in a normal case*: for the normal electricity consumption record, there is a clear path for its shapelet transition (*#90* → *#67* → *#85*) in the graph. For the abnormal data, however, the path (*#85* → *#72* → *#7*) does not exist, indicating that the connectivity of the shapelet transition path provides an evidential basis for detecting an abnormal time series. Finally, we translate the problem of learning representations of shapelets and time series into a graph embedding problem. 21 | 22 | ### Extracting Time-aware Shapelets 23 | 24 | Formally, a shapelet $$v$$ is a segment that is representative of a certain class. More precisely, it can separate $$T$$ into two smaller sets, one that is close to $$v$$ and another far from $$v$$ by some specific criteria, such that for a time series classification task, positive and negative samples can be put into different groups. The criteria can be formalized as 25 | 26 | $$\mathcal{L} = -g(S_{pos}(v, T), S_{neg}(v, T))$$ 27 | 28 | where $$S_{*}(v, T)$$ denotes the set of distances with respect to a specific group $$T_{*}$$, and the function $$g$$ takes two finite sets as input, returns a scalar value to indicate how far these two sets are, and it can be *information gain*, or some dissimilarity measurements on sets, i.e., *KL* divergence. 29 | 30 | To capture the shapelet dynamics, We define two factors for quantitatively measuring the timing effects of shapelets at different levels. Specifically, we introduce the *local factor* $$w_n$$​ to denote the inner importance of the *n-th* element of a particular shapelet, then the distance between a shapelet $$v$$ and a segment $$s$$ is redefined as 31 | 32 | $$\hat{d}(v, s|w) = \tau(v, s | a^*, w) = (\sum\nolimits_{k=1}^{p}\ w_{a^*_1(k)} \cdot (v_{a^*_1(k)} - s_{a^*_2(k)})^2)^{\frac{1}{2}}$$ 33 | 34 | where $$a^*$$ refers to the best alignment for DTW distance. On the other hand, at a *global level*, we aim to measure the timing effects across segments on the discriminatory power of shapelets. It is inspired from the intuition that shapelets may represent totally different meaning at different time steps, and it is straightforward to measure such deviations by adding segment-level weights. Formally, we set a *global factor* $$u_m$$ to capture the cross-segments influence, then the distance between a shapelet $$v$$ and a time series $$t$$ can be rewritten as 35 | 36 | $$\hat{D}(v, t | w, u) = \min\nolimits_{1\le k \le m} u_k \cdot \hat{d}(v, s_k | w)$$ 37 | 38 | Then given a classification task, we establish a supervised learning method to select the most important time-aware shapelets and learn corresponding timing factors $$w_i$$ and $$u_i$$ for each shapelet $$v_i$$. In particular, we have a pool of segments as shapelet candidates that selected from all subsequences, and a set of time series $$T$$ with labels. 
For each candidate $$v$$, we have the following objective function: 39 | 40 | $$\hat{\mathcal{L}} = -g(S_{pos}(v, T), S_{neg}(v, T)) + \lambda ||w|| + \epsilon ||u||$$ 41 | 42 | and after learning the timing factors from shapelet candidates separately, we select the top *K* shapelets with minimal loss as our final time-aware shapelets. 43 | 44 | ### Constructing Shapelet Evolution Graph 45 | 46 | A ***Shapelet Evolution Graph*** is a directed and weighted graph $$G = (V,E)$$ in which $$V$$ consists of $$K$$ vertices, each denoting a shapelet, and each directed edge $$e_{i, j} \in E$$ is associated with a weight $$w_{i, j}$$, indicating the occurrence probability of shapelet $$v_i \in V$$ followed by another shapelet $$v_j \in V$$ in the same time series. The key idea here is that the shapelet evolution and transition patterns can be naturally reflected from the paths in the graph, then graph embedding mythologies can be applied to learn shapelet, as well as the time series representations. 47 | 48 | We first assign each segment $$s_i$$ of each time series to several shapelets that have the closest distances to $$s_i$$ according to the time-aware dissimilarity. In detail, we standardize the shapelet assignment probability as 49 | 50 | $$p_{i, j} = \frac{ 51 | \max(\hat{d_{i,*}}(v_{i, *}, s_i)) - \hat{d_{i,j}}(v_{i, j}, s_i) 52 | }{ 53 | \max(\hat{d_{i,*}}(v_{i, *}, s_i)) - \min(\hat{d_{i,*}}(v_{i, *}, s_i)) 54 | }$$ 55 | 56 | where 57 | 58 | $$\hat{d_{i,*}}(v_{i, *}, s_i) = u_*[i] * \hat{d}(v_{i, *}, s_i | w_*)$$ 59 | 60 | with a predefined constraint that $$\hat{d_{i, *}} \le \delta$$. Then, for each pair $$(j, k)$$, we create a weighted edge from shapelet $$v_{i, j}$$ to $$v_{i+1, k}$$ with weight $$p_{i, j} \cdot p_{i+1, k}$$ , and merge all duplicated edges as one by summing up their weights. Finally, we normalize the edge weights sourced from each node as 1, which naturally interprets the edge weight between each pair of nodes, i.e., $$v_i$$ and $$v_j$$ into the conditional probability that shapelet $$v_i$$ being transformed into $$v_j$$ in an adjacent time step. 61 | 62 | ### Time Series Representation Learning 63 | 64 | Finally, we learn the representations for both the shapelets and the given time series by using the shapelet evolution graph constructed as above. We first employ an existing graph embedding algorithm DeepWalk [10] to obtain vertex (shapelet) representation vectors $$\mu \in \mathbb{R}^B$$. Then, for each segment $$s_i$$ in a time series, we retrieve the embeddings of its assigned shapelets that have discussed above, and sum them up weighted by assignment probability, denoted as 65 | 66 | $$\Phi_i=(\sum\nolimits_{j}p_{i,j}\cdot\mu(v_{i,j})), \ 1 \le i \le m$$ 67 | 68 | and finally concatenate or aggregate all those $$m$$ segment embedding vectors to obtain the representation vector for original time series $$t$$. The time series embeddings can then be applied to various down streaming tasks, referred to the experiment section in our paper [1]. 69 | 70 | ### Evaluation Results 71 | 72 | We conduct time series classification tasks on three public benchmarks datasets from *UCR-Archive* [11] and two real-world datasets from State Grid of China and China Telecom. Experimental results are shown in the following table: 73 | 74 |
75 | [figure: exp.jpg]
76 |
77 | 78 | We have also conducted extensive ablation and observational studies to validate our proposed framework. Here we construct the shapelet evolution graphs at different time steps for a deeper understanding of shapelet dynamics, as seen in the figure below. It shows two graphs, one for January and another for July. In January, shapelet *#45* has large in/out degrees, and its corresponding timing factor is highlighted in January and February (dark areas). It indicates that shapelet *#45* is likely to be a common pattern at the beginning of a year. As for July, shapelet *#45* is no longer as important as it was in January. Meanwhile, shapelet *#42*, which is almost an isolated point in January, becomes very important in July. Although we do not explicitly take seasonal information into consideration when constructing shapelet evolution graphs, the inclusion of the timing factors means that they are already incorporated into the process of the graph generation. 79 | 80 |
81 | [figure: vis.jpg]
82 |
83 | 84 | 85 | 86 | ### Reference 87 | 88 | [1] Cheng, Z; Yang, Y; Wang, W; Hu, W; Zhuang, Y and Song, G, 2020, Time2Graph: Revisiting Time Series Modeling with Dynamic Shapelets, In AAAI, 2020 89 | 90 | [2] Peng, X.; Huang, J.; Hu, Q.; Zhang, S.; and Metaxas, D. N. 2014. Head pose estimation by instance parameterization. In *ICPR’14*, 1800–1805. 91 | 92 | [3] Shimodaira, H.; Noma, K.-i.; Nakai, M.; and Sagayama, S. 2002. Dynamic time-alignment kernel in support vector machine. In *NIPS’02*, 921–928. 93 | 94 | [4] Malhotra, P.; Ramakrishnan, A.; Anand, G.; Vig, L.; Agar- wal, P.; and Shroff, G. 2016. Lstm-based encoder- decoder for multi-sensor anomaly detection. *arXiv preprint arXiv:1607.00148*. 95 | 96 | [5] Johnson, M.; Duvenaud, D. K.; Wiltschko, A.; Adams, R. P.; and Datta, S. R. 2016. Composing graphical models with neu- ral networks for structured representations and fast inference. In *NIPS’16*, 2946–2954. 97 | 98 | [6] Ye, L., and Keogh, E. 2011. Time series shapelets: a novel technique that allows accurate, interpretable and fast classifi- cation. *DMKD.* 22(1):149–182. 99 | 100 | [7] Bostrom, A., and Bagnall, A. 2017. Binary shapelet trans- form for multiclass time series classification. In *TLSD- KCS’17.* 24–46. 101 | 102 | [8] Hills, J.; Lines, J.; Baranauskas, E.; Mapp, J.; and Bagnall, A. 2014. Classification of time series by shapelet transformation. *DMKD.* 28(4):851–881 103 | 104 | [9] Lines, J.; Davis, L. M.; Hills, J.; and Bagnall, A. 2012. A shapelet transform for time series classification. In *KDD’12*, 289–297. 105 | 106 | [10] Perozzi, B.; Al-Rfou, R.; and Skiena, S. 2014. Deepwalk: Online learning of social representations. In *KDD*, 701–710. 107 | 108 | [11] Dau, H. A.; Keogh, E.; Kamgar, K.; Yeh, C.-C. M.; Zhu, Y.; Gharghabi, S.; Ratanamahatana, C. A.; Yanping; Hu, B.; Begum, N.; Bagnall, A.; Mueen, A.; and Batista, G. 2018. The ucr time series classification archive. https://www.cs.ucr.edu/~eamonn/time_series_data_2018/. -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate 2 | title: Time2Graph 3 | description: "Time2Graph: Revisting Time Series Modeling with Dynamic Shapelets" 4 | -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | {% seo %} 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | View on GitHub 21 | 22 |

{{ site.title | default: site.github.repository_name }}

23 |

{{ site.description | default: site.github.project_tagline }}

24 | 25 | {% if site.show_downloads %} 26 |
27 | Download this project as a .zip file 28 | Download this project as a tar.gz file 29 |
30 | {% endif %} 31 |
32 |
33 | 34 | 35 |
36 |
37 | {{ content }} 38 |
39 |
40 | 41 | 42 | 50 | 51 | {% if site.google_analytics %} 52 | 60 | {% endif %} 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/exp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/docs/exp.jpg -------------------------------------------------------------------------------- /docs/motiv.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/docs/motiv.jpg -------------------------------------------------------------------------------- /docs/vis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/docs/vis.jpg -------------------------------------------------------------------------------- /evaluate_paras.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | import warnings 4 | import os 5 | from time2graph.utils.base_utils import Debugger 6 | 7 | if __name__ == '__main__': 8 | warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning) 9 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 10 | parser.add_argument('--dataset', type=str, default='ucr-Earthquakes') 11 | parser.add_argument('--mode', type=str, default='embedding') 12 | parser.add_argument('--embed', type=str, default='aggregate') 13 | parser.add_argument('--target', type=str, required=True) 14 | parser.add_argument('--paras', type=str, required=True) 15 | parser.add_argument('--gpu_number', type=int, default=0) 16 | parser.add_argument('--embed_size', type=int, default=256) 17 | parser.add_argument('--n_splits', type=int, default=5) 18 | parser.add_argument('--K', type=int, default=100) 19 | parser.add_argument('--C', type=int, default=800) 20 | parser.add_argument('--num_segment', type=int, default=12) 21 | parser.add_argument('--seg_length', type=int, default=30) 22 | parser.add_argument('--total_length', type=int, default=-1) 23 | parser.add_argument('--batch_size', type=int, default=50) 24 | parser.add_argument('--warp', type=int, default=2) 25 | parser.add_argument('--njobs', type=int, default=20) 26 | parser.add_argument('--percentile', type=int, default=10) 27 | parser.add_argument('--init', type=int, default=0) 28 | 29 | opt = parser.parse_args() 30 | cmd = 'CUDA_VISIBLE_DEVICES={} python scripts/run.py --njobs {} ' \ 31 | '--init {} --gpu_enable --dataset {} --mode embedding ' \ 32 | '--percentile {} --batch_size {} --cmethod greedy --kernel xgb ' \ 33 | '--embed {} --opt_metric accuracy'.format( 34 | opt.gpu_number, opt.njobs, opt.init, opt.dataset, opt.percentile, opt.batch_size, opt.embed) 35 | paras = { 36 | 'K': opt.K, 37 | 'seg_length': opt.seg_length, 38 | 'num_segment': opt.num_segment, 39 | 'embed_size': opt.embed_size, 40 | 'warp': opt.warp 41 | } 42 | assert opt.target in paras 43 | if opt.target == 'seg_length': 44 | assert opt.total_length != -1 45 | paras.pop(opt.target) 46 | paras.pop('num_segment') 47 | else: 48 | paras.pop(opt.target) 49 | for key, val in paras.items(): 50 | cmd += ' --{} {}'.format(key, val) 51 | 52 | output = open('evaluate_paras_{}.sh'.format(opt.target), 'w') 53 | output.write('#!/usr/bin/env bash\n') 
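    # emit one run.py command per candidate value of the target hyper-parameter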
54 | for p in opt.paras.split(','): 55 | if opt.target == 'K': 56 | tmp = '{} --{} {} --C {}'.format(cmd, opt.target, p, int(p) * 10) 57 | Debugger.info_print('running: {}'.format(tmp)) 58 | output.write('{}\n'.format(tmp)) 59 | elif opt.target == 'seg_length': 60 | tmp = '{} --{} {} --{} {} --C {}'.format( 61 | cmd, opt.target, p, 'num_segment', 62 | int(opt.total_length // int(p)), int(paras['K'] * 10)) 63 | Debugger.info_print('running: {}'.format(tmp)) 64 | output.write('{}\n'.format(tmp)) 65 | else: 66 | tmp = '{} --{} {} --C {}'.format(cmd, opt.target, p, int(paras['K'] * 10)) 67 | Debugger.info_print('running: {}'.format(tmp)) 68 | output.write('{}\n'.format(tmp)) 69 | os.system('chmod u+x evaluate_paras_{}.sh'.format(opt.target)) 70 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dill>=0.2.5 2 | six>=1.10.0 3 | scipy>=1.3.0 4 | numpy>=1.16.0 5 | scikit_learn>=0.19.1 6 | pandas>=0.23 7 | xgboost>=0.80 8 | torch>=0.4.1 9 | networkx>=2.1 -------------------------------------------------------------------------------- /scripts/cache/ucr-Earthquakes_embedding_t2g_model.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/scripts/cache/ucr-Earthquakes_embedding_t2g_model.cache -------------------------------------------------------------------------------- /scripts/cache/ucr-Earthquakes_greedy_50_24_shapelets.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/scripts/cache/ucr-Earthquakes_greedy_50_24_shapelets.cache -------------------------------------------------------------------------------- /scripts/cache/ucr-Strawberry_embedding_t2g_model.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/scripts/cache/ucr-Strawberry_embedding_t2g_model.cache -------------------------------------------------------------------------------- /scripts/cache/ucr-Strawberry_greedy_50_15_shapelets.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/scripts/cache/ucr-Strawberry_greedy_50_15_shapelets.cache -------------------------------------------------------------------------------- /scripts/cache/ucr-WormsTwoClass_embedding_t2g_model.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/scripts/cache/ucr-WormsTwoClass_embedding_t2g_model.cache -------------------------------------------------------------------------------- /scripts/cache/ucr-WormsTwoClass_greedy_20_30_shapelets.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/scripts/cache/ucr-WormsTwoClass_greedy_20_30_shapelets.cache -------------------------------------------------------------------------------- /scripts/run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | import argparse 3 | import warnings 4 | import os 5 | from config import * 6 | from archive.load_usr_dataset import load_usr_dataset_by_name 7 | from time2graph.utils.base_utils import Debugger 8 | from time2graph.core.model import Time2Graph 9 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 10 | """ 11 | scripts for running test. 12 | running command: 13 | 1. set PYTHONPATH environment; 14 | 2. python scripts/run.py **options 15 | 3. option list: 16 | --dataset, ucr-Earthquakes/WormsTwoClass/Strawberry 17 | --K, number of shapelets extracted 18 | --C, number of shapelet candidates 19 | --n_splits, number of splits in cross-validation 20 | --num_segment, number of segment a time series is divided into 21 | --seg_length, segment length 22 | --njobs, number of threads in parallel 23 | --data_size, data dimension of time series 24 | --optimizer, optimizer used in time-aware shapelets learning 25 | --alpha, penalty parameter of local timing factor 26 | --beta, penalty parameter of global timing factor 27 | --init, init index of time series data 28 | --gpu_enable, bool, whether to use GPU 29 | --opt_metric, which metric to optimize in prediction 30 | --cache, whether to dump model to local file 31 | --embed, which embed strategy to use (aggregate/concatenate) 32 | --embed_size, embedding size of shapelets 33 | --warp, warping size in greedy-dtw 34 | --cmethod, which algorithm to use in candidate generation (cluster/greedy) 35 | --kernel, specify outer-classifier (default xgboost) 36 | --percentile, percentile for distance threshold in constructing graph 37 | --measurement, which distance metric to use (default greedy-dtw) 38 | --batch_size, batch size in each training step 39 | --tflag, flag that whether to use timing factors 40 | --scaled, flag that whether to rescale time series data 41 | --norm, flag that whether to normalize extracted representations 42 | --no_global, whether to use global timing factors 43 | """ 44 | 45 | if __name__ == '__main__': 46 | warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning) 47 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 48 | parser.add_argument('--dataset', type=str, default='ucr-Earthquakes', 49 | help='ucr-Earthquakes/WormsTwoClass/Strawberry') 50 | parser.add_argument('--K', type=int, default=100, help='number of shapelets extracted') 51 | parser.add_argument('--C', type=int, default=800, help='number of shapelet candidates') 52 | parser.add_argument('--n_splits', type=int, default=5, help='number of splits in cross-validation') 53 | parser.add_argument('--num_segment', type=int, default=12, help='number of segment a time series is divided into') 54 | parser.add_argument('--seg_length', type=int, default=30, help='segment length') 55 | parser.add_argument('--njobs', type=int, default=8, help='number of threads in parallel') 56 | parser.add_argument('--data_size', type=int, default=1, help='data dimension of time series') 57 | parser.add_argument('--optimizer', type=str, default='Adam', help='optimizer used in time-aware shapelets learning') 58 | parser.add_argument('--alpha', type=float, default=0.1, help='penalty parameter of local timing factor') 59 | parser.add_argument('--beta', type=float, default=0.05, help='penalty parameter of global timing factor') 60 | parser.add_argument('--init', type=int, default=0, help='init index of time series data') 61 | parser.add_argument('--gpu_enable', action='store_true', default=False, 
help='bool, whether to use GPU') 62 | parser.add_argument('--opt_metric', type=str, default='accuracy', help='which metric to optimize in prediction') 63 | parser.add_argument('--cache', action='store_true', default=False, help='whether to dump model to local file') 64 | parser.add_argument('--embed', type=str, default='aggregate', 65 | help='which embed strategy to use (aggregate/concatenate)') 66 | parser.add_argument('--embed_size', type=int, default=256, help='embedding size of shapelets') 67 | parser.add_argument('--warp', type=int, default=2, help='warping size in greedy-dtw') 68 | parser.add_argument('--cmethod', type=str, default='greedy', 69 | help='which algorithm to use in candidate generation (cluster/greedy)') 70 | parser.add_argument('--kernel', type=str, default='xgb', help='specify outer-classifier (default xgboost)') 71 | parser.add_argument('--percentile', type=int, default=10, 72 | help='percentile for distance threshold in constructing graph') 73 | parser.add_argument('--measurement', type=str, default='gdtw', 74 | help='which distance metric to use (default greedy-dtw)') 75 | parser.add_argument('--batch_size', type=int, default=50, 76 | help='batch size in each training step') 77 | parser.add_argument('--tflag', action='store_false', default=True, help='flag that whether to use timing factors') 78 | parser.add_argument('--scaled', action='store_true', default=False, 79 | help='flag that whether to rescale time series data') 80 | parser.add_argument('--norm', action='store_true', default=False, 81 | help='flag that whether to normalize extracted representations') 82 | parser.add_argument('--no_global', action='store_false', default=True, 83 | help='whether to use global timing factors') 84 | 85 | args = parser.parse_args() 86 | Debugger.info_print('running with {}'.format(args.__dict__)) 87 | 88 | if args.dataset.startswith('ucr'): 89 | dataset = args.dataset.rstrip('\n\r').split('-')[-1] 90 | x_train, y_train, x_test, y_test = load_usr_dataset_by_name( 91 | fname=dataset, length=args.seg_length * args.num_segment) 92 | else: 93 | raise NotImplementedError() 94 | Debugger.info_print('training: {:.2f} positive ratio with {}'.format(float(sum(y_train) / len(y_train)), 95 | len(y_train))) 96 | Debugger.info_print('test: {:.2f} positive ratio with {}'.format(float(sum(y_test) / len(y_test)), 97 | len(y_test))) 98 | m = Time2Graph(kernel=args.kernel, K=args.K, C=args.C, seg_length=args.seg_length, 99 | opt_metric=args.opt_metric, init=args.init, gpu_enable=args.gpu_enable, 100 | warp=args.warp, tflag=args.tflag, mode=args.embed, 101 | percentile=args.percentile, candidate_method=args.cmethod, 102 | batch_size=args.batch_size, njobs=args.njobs, 103 | optimizer=args.optimizer, alpha=args.alpha, 104 | beta=args.beta, measurement=args.measurement, 105 | representation_size=args.embed_size, data_size=args.data_size, 106 | scaled=args.scaled, norm=args.norm, global_flag=args.no_global, 107 | shapelets_cache='{}/scripts/cache/{}_{}_{}_{}_shapelets.cache'.format( 108 | module_path, args.dataset, args.cmethod, args.K, args.seg_length) 109 | ) 110 | 111 | res = np.zeros(4, dtype=np.float32) 112 | Debugger.info_print('training {}_mixed_model ...'.format(args.dataset)) 113 | cache_dir = '{}/scripts/cache/{}/'.format(module_path, args.dataset) 114 | if not path.isdir(cache_dir): 115 | os.mkdir(cache_dir) 116 | m.fit(X=x_train, Y=y_train, cache_dir=cache_dir, n_splits=args.n_splits) 117 | if args.cache: 118 | 
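        # the dump path below follows the same naming convention that
        # scripts/std_test.py expects when it is run with --model_cache,
        # i.e. scripts/cache/<dataset>_embedding_t2g_model.cache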
m.save_model(fpath='{}/scripts/cache/{}_embedding_t2g_model.cache'.format(module_path, args.dataset)) 119 | y_pred = m.predict(X=x_test)[0] 120 | Debugger.info_print('result: accu {:.4f}, prec {:.4f}, recall {:.4f}, f1 {:.4f}'.format( 121 | accuracy_score(y_true=y_test, y_pred=y_pred), 122 | precision_score(y_true=y_test, y_pred=y_pred), 123 | recall_score(y_true=y_test, y_pred=y_pred), 124 | f1_score(y_true=y_test, y_pred=y_pred) 125 | )) 126 | -------------------------------------------------------------------------------- /scripts/std_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | test scripts on three benchmark datasets: EQS, WTC, STB 4 | """ 5 | import argparse 6 | import warnings 7 | import os 8 | from config import * 9 | from archive.load_usr_dataset import load_usr_dataset_by_name 10 | from time2graph.utils.base_utils import Debugger 11 | from time2graph.core.model import Time2Graph 12 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 13 | 14 | 15 | if __name__ == '__main__': 16 | warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning) 17 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 18 | parser.add_argument('--dataset', type=str, default='ucr-Earthquakes') 19 | parser.add_argument('--n_splits', type=int, default=5) 20 | parser.add_argument('--model_cache', action='store_true', default=False) 21 | parser.add_argument('--shapelet_cache', action='store_true', default=False) 22 | parser.add_argument('--gpu_enable', action='store_true', default=False) 23 | args = parser.parse_args() 24 | Debugger.info_print('running with {}'.format(args.__dict__)) 25 | 26 | # set default options 27 | general_options = { 28 | 'kernel': 'xgb', 29 | 'opt_metric': 'accuracy', 30 | 'init': 0, 31 | 'warp': 2, 32 | 'tflag': True, 33 | 'mode': 'embedding', 34 | 'candidate_method': 'greedy' 35 | } 36 | model_options = model_args[args.dataset] 37 | xgb_options = xgb_args[args.dataset] 38 | 39 | # load benchmark dataset 40 | if args.dataset.startswith('ucr'): 41 | dataset = args.dataset.rstrip('\n\r').split('-')[-1] 42 | x_train, y_train, x_test, y_test = load_usr_dataset_by_name( 43 | fname=dataset, length=model_options['seg_length'] * model_options['num_segment']) 44 | else: 45 | raise NotImplementedError() 46 | Debugger.info_print('training: {:.2f} positive ratio with {}'.format( 47 | float(sum(y_train) / len(y_train)), len(y_train))) 48 | Debugger.info_print('test: {:.2f} positive ratio with {}'.format( 49 | float(sum(y_test) / len(y_test)), len(y_test))) 50 | 51 | # initialize Time2Graph model 52 | m = Time2Graph(gpu_enable=args.gpu_enable, **model_options, **general_options, 53 | shapelets_cache='{}/scripts/cache/{}_{}_{}_{}_shapelets.cache'.format( 54 | module_path, args.dataset, general_options['candidate_method'], 55 | model_options['K'], model_options['seg_length'])) 56 | if args.model_cache: 57 | m.load_model(fpath='{}/scripts/cache/{}_embedding_t2g_model.cache'.format(module_path, args.dataset)) 58 | if args.shapelet_cache: 59 | m.t2g.load_shapelets(fpath=m.shapelets_cache) 60 | res = np.zeros(4, dtype=np.float32) 61 | 62 | Debugger.info_print('training {}_tim2graph_model ...'.format(args.dataset)) 63 | cache_dir = '{}/scripts/cache/{}'.format(module_path, args.dataset) 64 | 65 | if not path.isdir(cache_dir): 66 | os.mkdir(cache_dir) 67 | m.fit(X=x_train, Y=y_train, n_splits=args.n_splits, tuning=False, 
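          # tuning=False skips the parameter search; the xgboost parameters
          # pre-tuned for this dataset are taken from config.xgb_args instead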
opt_args=xgb_options) 68 | y_pred = m.predict(X=x_test)[0] 69 | Debugger.info_print('classification result: accuracy {:.4f}, precision {:.4f}, recall {:.4f}, F1 {:.4f}'.format( 70 | accuracy_score(y_true=y_test, y_pred=y_pred), 71 | precision_score(y_true=y_test, y_pred=y_pred), 72 | recall_score(y_true=y_test, y_pred=y_pred), 73 | f1_score(y_true=y_test, y_pred=y_pred) 74 | )) 75 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import setuptools 3 | 4 | setuptools.setup( 5 | name='time2graph', 6 | version='0.1', 7 | packages=['archive', 8 | 'time2graph.core', 9 | 'time2graph.utils' 10 | ], 11 | author='petecheng', 12 | author_email='petecheng@zju.edu.cn', 13 | description='time2graph model', 14 | ) 15 | -------------------------------------------------------------------------------- /time2graph/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/time2graph/__init__.py -------------------------------------------------------------------------------- /time2graph/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/time2graph/core/__init__.py -------------------------------------------------------------------------------- /time2graph/core/distance_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | def greedy_dtw_path(x, y, warp, dist=lambda x, y: np.linalg.norm(x - y)): 6 | """ 7 | generate dtw-path greedily. 
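    Rather than filling the full DTW cost matrix, the alignment path is grown one
    step at a time: at every position the cheapest of the right / down / diagonal
    moves is taken, and `warp` bounds how far the row and column indices may drift
    apart, so the cost stays linear in the series lengths.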
8 | :param x: 9 | :param y: 10 | :param warp: 11 | :param dist: 12 | :return: 13 | """ 14 | if np.ndim(x) == 1: 15 | x = x.reshape(-1, 1) 16 | if np.ndim(y) == 1: 17 | y = y.reshape(-1, 1) 18 | nrows, ncols = x.shape[0], y.shape[0] 19 | ridx, cidx, rpath, cpath = 0, 0, [0], [0] 20 | while ridx < nrows - 1 and cidx < ncols - 1: 21 | rdist = dist(x[ridx + 1], y[cidx]) 22 | cdist = dist(x[ridx], y[cidx + 1]) 23 | ddist = dist(x[ridx + 1], y[cidx + 1]) 24 | if ddist < rdist and ddist < cdist: 25 | ridx += 1 26 | cidx += 1 27 | elif rdist < cdist: 28 | if ridx < cidx + warp: 29 | ridx += 1 30 | else: 31 | cidx += 1 32 | else: 33 | if cidx < ridx + warp: 34 | cidx += 1 35 | else: 36 | ridx += 1 37 | rpath.append(ridx) 38 | cpath.append(cidx) 39 | for k in range(ridx + 1, nrows): 40 | rpath.append(k) 41 | cpath.append(ncols - 1) 42 | for k in range(cidx + 1, ncols): 43 | cpath.append(k) 44 | rpath.append(nrows - 1) 45 | return np.array(rpath), np.array(cpath) 46 | 47 | 48 | def parameterized_gdtw_npy(x, y, w, warp, dist=lambda x, y: np.linalg.norm(x - y)): 49 | if np.ndim(x) == 1: 50 | x = x.reshape(-1, 1) 51 | if np.ndim(y) == 1: 52 | y = y.reshape(-1, 1) 53 | dpath = greedy_dtw_path(x=x, y=y, dist=dist, warp=warp) 54 | return dist((x * np.abs(w).reshape(len(w), -1))[dpath[0]], y[dpath[1]]) 55 | 56 | 57 | def expand_array(y, warp): 58 | size = y.shape[0] 59 | tmp_y = np.concatenate((y[size - warp: size, :], y, y[: warp, :]), axis=0) 60 | return np.array([tmp_y[k: (k+2 * warp + 1)] for k in range(size)], dtype=np.float32) 61 | 62 | 63 | def softmax(x): 64 | """Compute softmax values for each sets of scores in x.""" 65 | return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True) 66 | 67 | 68 | def softmax_1d(x): 69 | return np.exp(x) / np.sum(np.exp(x), keepdims=True) 70 | 71 | 72 | def parameterized_gw_npy(x, y, w, warp): 73 | distance = np.sum((x.reshape(x.shape[0], -1, x.shape[1]) - expand_array(y=y, warp=warp)) ** 2, 74 | axis=1) 75 | softmin_distance = np.sum(softmax(-distance.astype(np.float64)).astype(np.float32) * distance, 76 | axis=1) 77 | return np.sqrt(np.sum(softmin_distance * np.abs(w))) 78 | 79 | 80 | def pattern_distance_time_aware(pattern, time_series, local_factor, global_factor, warp, 81 | init, measurement): 82 | """ 83 | pattern distance with timing factors in numpy. 84 | :param pattern: 85 | :param time_series: 86 | :param local_factor: 87 | :param global_factor: 88 | :param warp: 89 | :param init: 90 | :param measurement: 91 | :return: 92 | """ 93 | if measurement == 'gw': 94 | dist = parameterized_gw_npy 95 | elif measurement == 'gdtw': 96 | dist = parameterized_gdtw_npy 97 | else: 98 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 99 | num_segment = int(time_series.shape[0] / pattern.shape[0]) 100 | seg_length = pattern.shape[0] 101 | assert init + num_segment <= len(global_factor) 102 | time_series = time_series.reshape(num_segment, seg_length, -1) 103 | ret = np.zeros(num_segment, np.float32).reshape(-1) 104 | for k in range(num_segment): 105 | ret[k] = dist(x=pattern, y=time_series[k], w=local_factor, warp=warp) 106 | return np.sum(softmax_1d(-ret * np.abs(global_factor[init: init + num_segment])) 107 | * ret * np.abs(global_factor[init: init + num_segment])) 108 | 109 | 110 | def pattern_distance_no_timing(pattern, time_series, warp, measurement): 111 | """ 112 | pattern distance without timing factor in numpy. 
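    The time series is split into consecutive segments of the pattern's length,
    the configured distance (`gw` or `gdtw`) is computed between the pattern and
    every segment with uniform weights, and the per-segment distances are combined
    by a softmin weighting so that the closest segments dominate the result.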
113 | :param pattern: 114 | :param time_series: 115 | :param warp: 116 | :param measurement: 117 | :return: 118 | """ 119 | if measurement == 'gw': 120 | dist = parameterized_gw_npy 121 | elif measurement == 'gdtw': 122 | dist = parameterized_gdtw_npy 123 | else: 124 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 125 | num_segment = int(time_series.shape[0] / pattern.shape[0]) 126 | seg_length = pattern.shape[0] 127 | w = np.ones(seg_length, dtype=np.float32).reshape(-1) 128 | assert time_series.shape[0] == num_segment * pattern.shape[0] 129 | time_series = time_series.reshape(num_segment, pattern.shape[0], -1) 130 | ret = np.zeros(num_segment, np.float32).reshape(-1) 131 | for k in range(num_segment): 132 | ret[k] = dist(x=pattern, y=time_series[k], w=w, warp=warp) 133 | return np.sum(softmax(-ret) * ret) 134 | -------------------------------------------------------------------------------- /time2graph/core/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle 3 | from config import * 4 | from time2graph.utils.base_utils import ModelUtils 5 | from time2graph.core.model_embeds import Time2GraphEmbed 6 | from baselines.feature_based import FeatureModel 7 | from sklearn.model_selection import StratifiedKFold 8 | from sklearn.preprocessing import StandardScaler, MinMaxScaler 9 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 10 | 11 | 12 | class Time2Graph(ModelUtils): 13 | """ 14 | Main Class of Time2Graph Model. 15 | """ 16 | def __init__(self, kernel, shapelets_cache, K, C, seg_length, init, opt_metric, 17 | warp=2, tflag=True, gpu_enable=True, percentile=15, mode='concate', 18 | batch_size=100, data_size=1, scaled=False, norm=False, **kwargs): 19 | """ 20 | @param kernel: 21 | str, choice of outer-classifier; recommend using xgb, while valid candidates can be found in ModelUtils. 22 | @param shapelets_cache: 23 | str, the path of cache of shapelets. 24 | @param K: 25 | int, number of shapelets that try to learn. 26 | @param C: 27 | int, number of shapelet candidates when learning shapelets. 28 | @param seg_length: 29 | int, the length of a segment. 30 | @param init: 31 | int, initial offset in the original time series, default as 0. 32 | @param opt_metric: 33 | str, one of 'accuracy', 'precision', 'recall' and 'f1', on which to conduct fine-tuning. 34 | @param warp: 35 | int, warp step in greedy-dtw, default as 2. 36 | @param tflag: 37 | bool, flag that whether to add timing factors, default is True. 38 | That is it is set as False, it will learn static shapelets. 39 | @param gpu_enable: 40 | bool, whether to use gpu during computation. 41 | @param percentile: 42 | int, percentile that use to determine distance threshold when constructing shapelet evolution graph. 43 | @param mode: 44 | str, 'concate' or 'aggregate', the way to generate time series embeddings. 45 | That is, concate weighted segment embeddings or aggregate them as one. 46 | @param batch_size: 47 | int, batch size during training. 48 | @param data_size: 49 | int, the dimension of time series data, 50 | where we can denote time series shape as (N x L x data_size). 
51 | @param scaled: 52 | bool, whether to scale time series by z-normalize 53 | @param norm: 54 | bool, whether to conduct min-max normalization when extract time series features 55 | @param kwargs: 56 | other candidate options, i.e., 57 | model_cache: bool, whether to load model from cache 58 | other options in Time2GraphEmbed. 59 | """ 60 | super(Time2Graph, self).__init__(kernel=kernel, **kwargs) 61 | self.shapelets_cache = shapelets_cache 62 | self.K = K 63 | self.C = C 64 | self.seg_length = seg_length 65 | self.init = init 66 | self.opt_metric = opt_metric 67 | self.warp = warp 68 | self.tflag = tflag 69 | self.gpu_enable = gpu_enable 70 | self.percentile = percentile 71 | self.mode = mode 72 | self.batch_size = batch_size 73 | self.data_size = data_size 74 | self.scaled = scaled 75 | self.norm = norm 76 | self.data_scaler = [StandardScaler() for _ in range(self.data_size)] 77 | self.feature_scaler = MinMaxScaler() 78 | model_cache = kwargs.get('model_cache', None) 79 | self.verbose = kwargs.get('verbose', False) 80 | if model_cache is not None: 81 | self.load_model(fpath=model_cache) 82 | Debugger.info_print('load time2graph model from cache {}...'.format(model_cache)) 83 | else: 84 | self.t2g = Time2GraphEmbed(kernel=kernel, K=K, C=C, seg_length=seg_length, 85 | opt_metric=opt_metric, warp=warp, tflag=tflag, 86 | gpu_enable=gpu_enable, percentile=percentile, mode=mode, 87 | batch_size=batch_size, **kwargs) 88 | if path.isfile(self.shapelets_cache): 89 | self.t2g.load_shapelets(fpath=self.shapelets_cache) 90 | self.fm = FeatureModel(seg_length=self.t2g.seg_length, kernel=kernel) 91 | self.clf = self.clf__() 92 | 93 | def extract_features(self, X, init=0, train=False): 94 | """ 95 | @param X: 96 | ndarray with shape (N x L x data_size), input time series 97 | @param init: 98 | int, the same as self.init 99 | @param train: 100 | bool, flag for training or not. 101 | @return: 102 | time series features (embeddings) 103 | """ 104 | feat = self.fm.extract_features(samples=X) 105 | if self.scaled: 106 | X_scaled = np.zeros(X.shape, dtype=np.float) 107 | for k in range(self.data_size): 108 | X_scaled[:, :, k] = self.data_scaler[k].fit_transform(X[:, :, k]) 109 | embed = self.t2g.embed(x=X_scaled, init=init) 110 | else: 111 | embed = self.t2g.embed(x=X, init=init) 112 | if self.norm: 113 | if train: 114 | feat = self.feature_scaler.fit_transform(X=feat) 115 | else: 116 | feat = self.feature_scaler.transform(X=feat) 117 | return np.concatenate((embed, feat), axis=1) 118 | 119 | def fit(self, X, Y, n_splits=5, balanced=True, cache_dir='{}/scripts/cache/'.format(module_path), **kwargs): 120 | """ 121 | @param X: 122 | ndarray with shape (N x L x data_size), input time series. 123 | @param Y: 124 | ndarray with shape (N x 1), labels. 125 | @param n_splits: 126 | int, number of splits in cross-validation. 127 | @param balanced: 128 | bool, whether to balance the pos/neg during fitting classifier. 129 | @param cache_dir: 130 | str, cache dir for graph embeddings. 131 | @param kwargs: 132 | tuning: bool, whether to tune the parameters of outer-classifier(xgb). 133 | opt_args: dict, if tuning is False, opt_args must be given that 134 | the optimal parameters of outer-classifier should be pre-defined. 
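        Example (a minimal sketch; the path, shapes and xgboost parameters below
        are placeholders rather than values used in the paper):
            m = Time2Graph(kernel='xgb', shapelets_cache='scripts/cache/demo_shapelets.cache',
                           K=50, C=500, seg_length=24, init=0, opt_metric='accuracy')
            # X_train: ndarray of shape (N, num_segment * seg_length, data_size); Y_train: (N,)
            m.fit(X=X_train, Y=Y_train, n_splits=5, tuning=False,
                  opt_args={'max_depth': 8, 'learning_rate': 0.1})
            y_pred, y_prob = m.predict(X=X_test)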
135 | """ 136 | # fit data scaler 137 | for k in range(self.data_size): 138 | self.data_scaler[k].fit(X[:, :, k]) 139 | X_scaled = np.zeros(X.shape, dtype=np.float) 140 | for k in range(self.data_size): 141 | X_scaled[:, :, k] = self.data_scaler[k].fit_transform(X[:, :, k]) 142 | if self.t2g.shapelets is None: 143 | if self.scaled: 144 | self.t2g.learn_shapelets( 145 | x=X_scaled, y=Y, num_segment=int(X_scaled.shape[1] / self.seg_length), 146 | data_size=self.data_size, num_batch=int(X_scaled.shape[0] // self.batch_size)) 147 | else: 148 | self.t2g.learn_shapelets( 149 | x=X, y=Y, num_segment=int(X.shape[1] / self.seg_length), 150 | data_size=self.data_size, num_batch=int(X.shape[0] // self.batch_size)) 151 | self.t2g.save_shapelets(fpath=self.shapelets_cache) 152 | Debugger.info_print('saving shapelets cache to {}'.format(self.shapelets_cache)) 153 | if self.t2g.sembeds is None: 154 | Debugger.info_print('training embedding model...') 155 | if self.scaled: 156 | self.t2g.fit_embedding_model(x=X_scaled, y=Y, cache_dir=cache_dir) 157 | else: 158 | self.t2g.fit_embedding_model(x=X, y=Y, cache_dir=cache_dir) 159 | x = self.extract_features(X=X, init=self.init) 160 | Debugger.info_print('extract mixed features done...') 161 | max_accu, max_prec, max_recall, max_f1, max_metric = -1, -1, -1, -1, -1 162 | metric_measure = self.return_metric_method(opt_metric=self.t2g.opt_metric) 163 | tuning, opt_args = kwargs.get('tuning', True), kwargs.get('opt_args', None) 164 | 165 | ################################################### 166 | # fine-tuning to find optimal classifier parameters 167 | if tuning: 168 | arguments = self.clf_paras(balanced=balanced) 169 | for args in arguments: 170 | self.clf.set_params(**args) 171 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True) 172 | tmp = np.zeros(5, dtype=np.float32).reshape(-1) 173 | measure_vector = [metric_measure, accuracy_score, precision_score, recall_score, f1_score] 174 | for train_idx, test_idx in skf.split(x, Y): 175 | self.clf.fit(x[train_idx], Y[train_idx]) 176 | y_pred, y_true = self.clf.predict(x[test_idx]), Y[test_idx] 177 | for k in range(5): 178 | tmp[k] += measure_vector[k](y_true=y_true, y_pred=y_pred) 179 | tmp /= n_splits 180 | Debugger.debug_print('args tuning: accu {:.4f}, prec {:.4f}, recall {:.4f}, f1 {:.4f}'.format( 181 | tmp[1], tmp[2], tmp[3], tmp[4] 182 | ), debug=self.verbose) 183 | if max_metric < tmp[0]: 184 | max_metric = tmp[0] 185 | opt_args = args 186 | max_accu, max_prec, max_recall, max_f1 = tmp[1:] 187 | if self.verbose: 188 | Debugger.info_print('args {} for clf {}, performance: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format( 189 | opt_args, self.kernel, max_accu, max_prec, max_recall, max_f1)) 190 | self.clf.set_params(**opt_args) 191 | 192 | ################################################### 193 | # load optimal parameters predefined before. 
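        # (opt_args is taken from kwargs in this branch; e.g. scripts/std_test.py
        #  passes opt_args=xgb_args[dataset] loaded from config.py)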
194 | else: 195 | assert opt_args is not None, 'missing opt args specified' 196 | self.clf.set_params(**opt_args) 197 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True) 198 | tmp = np.zeros(5, dtype=np.float32).reshape(-1) 199 | measure_vector = [metric_measure, accuracy_score, precision_score, recall_score, f1_score] 200 | for train_idx, test_idx in skf.split(x, Y): 201 | self.clf.fit(x[train_idx], Y[train_idx]) 202 | y_pred, y_true = self.clf.predict(x[test_idx]), Y[test_idx] 203 | for k in range(5): 204 | tmp[k] += measure_vector[k](y_true=y_true, y_pred=y_pred) 205 | tmp /= n_splits 206 | if self.verbose: 207 | Debugger.info_print('args {} for clf {}, performance: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format( 208 | opt_args, self.kernel, tmp[1], tmp[2], tmp[3], tmp[4])) 209 | self.clf.fit(x, Y) 210 | 211 | def predict(self, X, **kwargs): 212 | """ 213 | :param X: 214 | input, with shape [N, T, data_size]. 215 | :param kwargs: 216 | ignore. 217 | :return: 218 | predicted label, predicted probability. 219 | """ 220 | x = self.extract_features(X=X, init=self.init) 221 | return self.clf.predict(x), self.clf.predict_proba(x)[:, 1] 222 | 223 | def save_model(self, fpath, **kwargs): 224 | """ 225 | dump model to a specific path. 226 | :param fpath: 227 | saving path. 228 | :param kwargs: 229 | ignore. 230 | :return: 231 | """ 232 | pickle.dump(self.__dict__, open(fpath, 'wb')) 233 | 234 | def load_model(self, fpath, **kwargs): 235 | """ 236 | save model from a given cache file. 237 | :param fpath: 238 | loading path. 239 | :param kwargs: 240 | ignore. 241 | :return: 242 | """ 243 | paras = pickle.load(open(fpath, 'rb')) 244 | for key, val in paras.items(): 245 | self.__dict__[key] = val 246 | -------------------------------------------------------------------------------- /time2graph/core/model_embeds.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pickle 4 | from copy import deepcopy 5 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.preprocessing import normalize 8 | from .time_aware_shapelets import learn_time_aware_shapelets 9 | from .shapelet_embedding import ShapeletEmbedding 10 | from ..utils.base_utils import ModelUtils, Debugger 11 | 12 | 13 | class Time2GraphEmbed(ModelUtils): 14 | """ 15 | Time2Graph model 16 | Hyper-parameters: 17 | K: number of learned shapelets 18 | C: number of candidates 19 | A: number of shapelets assigned to each segment 20 | tflag: timing flag 21 | opt_metric: optimal metric using in outside-classifier 22 | """ 23 | def __init__(self, kernel, K=100, C=1000, seg_length=30, warp=2, tflag=True, 24 | gpu_enable=True, percentile=15, opt_metric='f1', mode='aggregate', 25 | batch_size=100, **kwargs): 26 | super(Time2GraphEmbed, self).__init__(kernel=kernel, **kwargs) 27 | self.K = K 28 | self.C = C 29 | self.seg_length = seg_length 30 | self.warp = warp 31 | self.tflag = tflag 32 | self.opt_metric = opt_metric 33 | self.mode = mode 34 | self.batch_size = batch_size 35 | self.gpu_enable = gpu_enable 36 | self.percentile = percentile 37 | self.shapelets = None 38 | self.sembeds = None 39 | self.clf = None 40 | self.lr = kwargs.pop('lr', 1e-2) 41 | self.p = kwargs.pop('p', 2) 42 | self.alpha = kwargs.pop('alpha', 0.1) 43 | self.beta = kwargs.pop('beta', 0.05) 44 | self.multi_graph = kwargs.pop('multi_graph', False) 45 | self.debug = kwargs.pop('debug', True) 46 
| self.measurement = kwargs.pop('measurement', 'gdtw') 47 | self.verbose = kwargs.pop('verbose', False) 48 | self.global_flag = kwargs.pop('global_flag', True) 49 | self.kwargs = kwargs 50 | Debugger.info_print('initialize t2g model with {}'.format(self.__dict__)) 51 | 52 | def learn_shapelets(self, x, y, num_segment, data_size, num_batch): 53 | """ 54 | learn time-aware shapelets. 55 | :param x: 56 | input time series data. 57 | :param y: 58 | input label. 59 | :param num_segment: 60 | number of segments that time series are divided into. 61 | :param data_size: 62 | data dimension of time series. 63 | :param num_batch: 64 | number of batch in training. 65 | :return: 66 | """ 67 | assert x.shape[1] == num_segment * self.seg_length 68 | if self.tflag: 69 | self.shapelets = learn_time_aware_shapelets( 70 | time_series_set=x, label=y, K=self.K, C=self.C, p=self.p, 71 | num_segment=num_segment, seg_length=self.seg_length, data_size=data_size, 72 | lr=self.lr, alpha=self.alpha, beta=self.beta, num_batch=num_batch, 73 | measurement=self.measurement, gpu_enable=self.gpu_enable, **self.kwargs) 74 | else: 75 | raise NotImplementedError() 76 | 77 | def fit_embedding_model(self, x, y, cache_dir, init=0): 78 | """ 79 | fit embedding model (learn shapelet embeddings). 80 | :param x: 81 | input time series data. 82 | :param y: 83 | input label. 84 | :param cache_dir: 85 | cache directory that saving edgelist and embedding results. 86 | :param init: 87 | init index of time series for processing. default as 0. 88 | :return: 89 | """ 90 | assert self.shapelets is not None, 'shapelets has not been learnt yet' 91 | self.sembeds = ShapeletEmbedding( 92 | seg_length=self.seg_length, tflag=self.tflag, multi_graph=self.multi_graph, 93 | cache_dir=cache_dir, tanh=self.kwargs.get('tanh', False), debug=self.debug, 94 | percentile=self.percentile, measurement=self.measurement, mode=self.mode, 95 | global_flag=self.global_flag, **self.kwargs) 96 | self.sembeds.fit(time_series_set=x[np.argwhere(y == 0).reshape(-1), :, :], 97 | shapelets=self.shapelets, warp=self.warp, init=init) 98 | 99 | def embed(self, x, init=0): 100 | assert self.sembeds is not None, 'shapelet-embedding model has not been learnt yet' 101 | return self.sembeds.time_series_embedding(time_series_set=x, shapelets=self.shapelets, warp=self.warp, init=init) 102 | 103 | def set_deepwalk_args(self, **dw_args): 104 | for key, val in dw_args.items(): 105 | self.kwargs[key] = val 106 | 107 | def fit(self, X, Y, n_splits=5, init=0, reset=True, balanced=True, norm=False, 108 | cache_dir='./', **kwargs): 109 | """ 110 | fit the whole embeds model. 111 | :param X: 112 | input time series data. 113 | :param Y: 114 | input label. 115 | :param n_splits: 116 | number of splits in cross validation. 117 | :param init: 118 | init index. default as 0. 119 | :param reset: 120 | bool, whether to reset shapelets or embedding cache. 121 | if True, re-learn shapelets and their embeddings. 122 | :param balanced: 123 | bool, whether to balance the pos/neg during fitting classifier. 124 | :param norm: 125 | whether to norm the embeddings. 126 | :param cache_dir: 127 | cache directory for edge-list and embeddings. 128 | :param kwargs: 129 | tuning: bool, whether to tune the parameters of outer-classifier(xgb). 130 | opt_args: dict, if tuning is False, opt_args must be given that 131 | the optimal parameters of outer-classifier should be pre-defined. 
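        The overall flow here is: learn time-aware shapelets (unless they are
        already loaded and reset is False), fit the shapelet-embedding model on the
        samples with label 0, embed every series, and finally tune or fit the outer
        classifier on those embeddings.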
132 | :return: 133 | """ 134 | num_segment = int(X.shape[1] / self.seg_length) 135 | data_size = X.shape[-1] 136 | if reset or self.shapelets is None: 137 | self.learn_shapelets( 138 | x=X, y=Y, num_segment=num_segment, data_size=data_size, num_batch=X.shape[0] // self.batch_size) 139 | if reset or self.sembeds is None: 140 | Debugger.info_print('fit embedding model...') 141 | self.fit_embedding_model(x=X, y=Y, cache_dir=cache_dir, init=init) 142 | max_clf_args, max_metric, clf = None, -1, self.clf__() 143 | embeds = self.sembeds.time_series_embedding( 144 | time_series_set=X, shapelets=self.shapelets, 145 | warp=self.warp, init=init) 146 | if norm: 147 | embeds = normalize(embeds, axis=0) 148 | Debugger.info_print('{} paras to be tuned'.format(self.para_len(balanced=balanced))) 149 | arguments = self.clf_paras(balanced=balanced) 150 | arg_size, cnt = self.para_len(balanced=balanced), 0.0 151 | metric_method = self.return_metric_method(opt_metric=self.opt_metric) 152 | 153 | tuning, opt_args = kwargs.get('tuning', True), kwargs.get('opt_args', None) 154 | if tuning: 155 | Debugger.info_print('running parameter tuning for fit...') 156 | max_accu, max_prec, max_recall, max_f1, max_clf_model = -1, -1, -1, -1, None 157 | for args in arguments: 158 | clf.set_params(**args) 159 | Debugger.debug_print(msg='{:.2f}% inner args tuned; args: {}'.format(cnt * 100.0 / arg_size, args), 160 | debug=self.debug) 161 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True) 162 | tmp, accu, prec, recall, f1 = 0, 0, 0, 0, 0 163 | for train_idx, test_idx in skf.split(embeds, Y): 164 | clf.fit(embeds[train_idx], Y[train_idx]) 165 | y_true, y_pred = Y[test_idx], clf.predict(embeds[test_idx]) 166 | tmp += metric_method(y_true=y_true, y_pred=y_pred) 167 | accu += accuracy_score(y_true=y_true, y_pred=y_pred) 168 | prec += precision_score(y_true=y_true, y_pred=y_pred) 169 | recall += recall_score(y_true=y_true, y_pred=y_pred) 170 | f1 += f1_score(y_true=y_true, y_pred=y_pred) 171 | tmp /= n_splits 172 | accu /= n_splits 173 | prec /= n_splits 174 | recall /= n_splits 175 | f1 /= n_splits 176 | if max_metric < tmp: 177 | max_metric, max_clf_args, max_clf_model = tmp, args, deepcopy(clf) 178 | max_accu, max_prec, max_recall, max_f1 = accu, prec, recall, f1 179 | cnt += 1.0 180 | if self.verbose: 181 | Debugger.info_print('args {} for clf {}-{}, performance: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format( 182 | max_clf_args, self.kernel, self.opt_metric, max_accu, max_prec, max_recall, max_f1)) 183 | self.clf = {'clf': max_clf_model, 'clf-args': max_clf_args} 184 | else: 185 | assert opt_args is not None, 'missing opt args specified' 186 | clf.set_params(**opt_args) 187 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True) 188 | tmp = np.zeros(5, dtype=np.float32).reshape(-1) 189 | measure_vector = [metric_method, accuracy_score, precision_score, recall_score, f1_score] 190 | for train_idx, test_idx in skf.split(embeds, Y): 191 | clf.fit(embeds[train_idx], Y[train_idx]) 192 | y_pred, y_true = clf.predict(embeds[test_idx]), Y[test_idx] 193 | for k in range(5): 194 | tmp[k] += measure_vector[k](y_true=y_true, y_pred=y_pred) 195 | tmp /= n_splits 196 | if self.verbose: 197 | Debugger.info_print('args {} for clf {}, performance: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format( 198 | opt_args, self.kernel, tmp[1], tmp[2], tmp[3], tmp[4])) 199 | self.clf = {'clf': clf, 'clf-args': opt_args} 200 | self.clf['clf'].fit(X, Y) 201 | 202 | def predict(self, X, norm=False): 203 | assert self.shapelets is not None, 'shapelets has not been learnt 
yet...' 204 | assert self.clf is not 'classifier has not been learnt yet...' 205 | if norm: 206 | embeds = normalize(self.embed(x=X), axis=0) 207 | else: 208 | embeds = self.embed(x=X) 209 | return self.clf['clf'].predict(embeds) 210 | 211 | def save_model(self, fpath, **kwargs): 212 | pickle.dump(self.__dict__, open(fpath, 'wb')) 213 | 214 | def load_model(self, fpath, **kwargs): 215 | paras = pickle.load(open(fpath, 'rb')) 216 | for key, val in paras.items(): 217 | self.__dict__[key] = val 218 | 219 | def save_shapelets(self, fpath): 220 | pickle.dump(self.shapelets, open(fpath, 'wb')) 221 | 222 | def load_shapelets(self, fpath): 223 | self.shapelets = pickle.load(open(fpath, 'rb')) 224 | -------------------------------------------------------------------------------- /time2graph/core/model_sequence.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pickle 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | from .rnn.deep_models import LSTMClassifier, GRUClassifier 7 | from .rnn.deep_utils import DeepDataloader, DeepDataset, train_RNNs 8 | from .shapelet_utils import shapelet_distance 9 | from .time_aware_shapelets import learn_time_aware_shapelets 10 | from ..utils.base_utils import Debugger 11 | 12 | 13 | class Time2GraphSequence(object): 14 | """ 15 | Time2Sequence Model: 16 | that is, using shapelet sequence as the input of a Sequence Model. 17 | using as a baseline in Time2Graph paper. 18 | """ 19 | def __init__(self, K=100, C=1000, seg_length=30, warp=2, tflag=True, 20 | hidden_size=64, output_size=64, dropout=0.1, gpu_enable=True, 21 | model='lstm', batch_size=100, **kwargs): 22 | super(Time2GraphSequence, self).__init__() 23 | self.K = K 24 | self.C = C 25 | self.seg_length = seg_length 26 | self.warp = warp 27 | self.tflag = tflag 28 | self.model = model 29 | self.batch_size = batch_size 30 | self.gpu_enable = gpu_enable 31 | self.shapelets = None 32 | self.rnns = None 33 | self.hidden_size = hidden_size 34 | self.output_size = output_size 35 | self.dropout = dropout 36 | self.lr = kwargs.pop('lr', 1e-2) 37 | self.p = kwargs.pop('p', 2) 38 | self.alpha = kwargs.pop('alpha', 10.0) 39 | self.beta = kwargs.pop('beta', 5.0) 40 | self.debug = kwargs.pop('debug', True) 41 | self.measurement = kwargs.pop('measurement', 'gdtw') 42 | self.niter = kwargs.pop('niter', 10) 43 | self.n_sequences = kwargs.pop('n_sequences', 1) 44 | self.kwargs = kwargs 45 | assert self.n_sequences == 1 46 | Debugger.info_print('initialize t2g model with {}'.format(self.__dict__)) 47 | 48 | def learn_shapelets(self, x, y, num_segment, data_size, num_batch): 49 | assert x.shape[1] == num_segment * self.seg_length 50 | if self.tflag: 51 | self.shapelets = learn_time_aware_shapelets( 52 | time_series_set=x, label=y, K=self.K, C=self.C, p=self.p, 53 | num_segment=num_segment, seg_length=self.seg_length, data_size=data_size, 54 | lr=self.lr, alpha=self.alpha, beta=self.beta, num_batch=num_batch, 55 | measurement=self.measurement, gpu_enable=self.gpu_enable, **self.kwargs) 56 | else: 57 | raise NotImplementedError() 58 | 59 | def retrieve_sequence(self, x, init): 60 | """ 61 | generate shapelet sequence for input time series data x. 
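        Each segment of x is mapped to its i-th closest learned shapelet
        (i < n_sequences) under the configured distance measurement, and the chosen
        shapelets are concatenated back into full-length sequences, giving an array
        of shape (N, n_sequences, data_length, data_size).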
62 | :param x: 63 | :param init: 64 | :return: 65 | """ 66 | assert self.shapelets is not None 67 | if len(x.shape) == 2: 68 | x = x.reshape(x.shape[0], x.shape[1], 1) 69 | data_length = x.shape[1] 70 | shapelet_dist = shapelet_distance( 71 | time_series_set=x, shapelets=self.shapelets, seg_length=self.seg_length, tflag=self.tflag, 72 | tanh=self.kwargs.get('tanh', False), debug=self.debug, init=init, warp=self.warp, 73 | measurement=self.measurement) 74 | ret = [] 75 | for k in range(shapelet_dist.shape[0]): 76 | sdist, sequences = shapelet_dist[k], [] 77 | for i in range(self.n_sequences): 78 | tmp = [] 79 | for j in range(sdist.shape[0]): 80 | min_s = np.argsort(sdist[j, :]).reshape(-1)[i] 81 | tmp.append(self.shapelets[min_s][0]) 82 | sequences.append(np.concatenate(tmp, axis=0)) 83 | ret.append(np.array(sequences).reshape(self.n_sequences, data_length, -1)) 84 | return np.array(ret) 85 | 86 | def fit(self, X, Y, init): 87 | sequences = self.retrieve_sequence(x=X, init=init).reshape(X.shape[0], X.shape[1], -1) 88 | if self.model == 'lstm': 89 | self.rnns = LSTMClassifier(data_size=X.shape[-1], hidden_size=self.hidden_size, 90 | output_size=self.output_size, dropout=self.dropout) 91 | elif self.model == 'gru': 92 | self.rnns = GRUClassifier(data_size=X.shape[-1], hidden_size=self.hidden_size, 93 | output_size=self.output_size, dropout=self.dropout) 94 | else: 95 | raise NotImplementedError() 96 | self.rnns.double() 97 | criterion = nn.CrossEntropyLoss() 98 | optimizer = optim.Adam(self.rnns.parameters(), lr=self.lr) 99 | if self.gpu_enable: 100 | self.rnns.cuda() 101 | criterion.cuda() 102 | train_dataset = DeepDataset(x=sequences, y=Y) 103 | train_dataloader = DeepDataloader(train_dataset, batch_size=self.batch_size, shuffle=True, 104 | num_workers=2) 105 | for epoch in range(self.niter): 106 | train_RNNs(epoch=epoch, dataloader=train_dataloader, rnn=self.rnns, criterion=criterion, 107 | optimizer=optimizer, debug=self.debug, gpu_enable=self.gpu_enable) 108 | 109 | def predict(self, X, init): 110 | assert self.shapelets is not None, 'shapelets has not been learnt yet...' 111 | assert self.rnns is not None, 'classifier has not been learnt yet...' 112 | return self.rnns(self.retrieve_sequence(x=X, init=init), len(X)) 113 | 114 | def dump_shapelets(self, fpath): 115 | pickle.dump(self.shapelets, open(fpath, 'wb')) 116 | 117 | def load_shapelets(self, fpath): 118 | self.shapelets = pickle.load(open(fpath, 'rb')) 119 | 120 | def save_model(self, fpath, **kwargs): 121 | pickle.dump(self.__dict__, open(fpath, 'wb')) 122 | 123 | def load_model(self, fpath, **kwargs): 124 | paras = pickle.load(open(fpath, 'rb')) 125 | for key, val in paras.items(): 126 | self.__dict__[key] = val 127 | -------------------------------------------------------------------------------- /time2graph/core/model_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from torch.utils.data import Dataset 3 | from torch.utils.data.sampler import WeightedRandomSampler 4 | 5 | 6 | class NumpyDataset(Dataset): 7 | """ Dataset wrapping numpy ndarrays 8 | Each sample will be retrieved by indexing numpy-arrays along the first dimension. 9 | 10 | Arguments: 11 | *ndarrays (numpy-ndarray): ndarrays that have the same size of the first dimension. 
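    Example (a minimal sketch):
        dataset = NumpyDataset(x, y)    # x, y: numpy arrays with equal first dimension
        features, label = dataset[0]    # one aligned sample from each array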
12 | """ 13 | def __init__(self, *ndarrays): 14 | assert all(ndarrays[0].shape[0] == ndarray.shape[0] for ndarray in ndarrays) 15 | self.ndarrays = ndarrays 16 | 17 | def __getitem__(self, idx): 18 | return tuple(ndarray[idx] for ndarray in self.ndarrays) 19 | 20 | def __len__(self): 21 | return self.ndarrays[0].shape[0] 22 | 23 | 24 | class StratifiedSampler(WeightedRandomSampler): 25 | """ 26 | Stratified Sampler in torch. 27 | """ 28 | def __init__(self, label, num_class): 29 | self.num_class = num_class 30 | weights = self.__get_weight(label=label) 31 | super(StratifiedSampler, self).__init__(weights=weights, num_samples=len(weights)) 32 | 33 | def __get_weight(self, label): 34 | num_class = self.num_class 35 | cnt = [0] * num_class 36 | for lb in label: 37 | cnt[lb] += 1 38 | weight_per_class, total = [0.0] * num_class, float(sum(cnt)) 39 | for k in range(num_class): 40 | weight_per_class[k] = total / float(cnt[k]) 41 | ret = [0.0] * len(label) 42 | for idx, val in enumerate(label): 43 | ret[idx] = weight_per_class[val] 44 | return ret 45 | -------------------------------------------------------------------------------- /time2graph/core/rnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/time2graph/core/rnn/__init__.py -------------------------------------------------------------------------------- /time2graph/core/rnn/deep_models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | """ 8 | implement the LSTM, GRU, VAE and MLP as baselines. 
9 | """ 10 | 11 | 12 | class LSTMClassifier(nn.Module): 13 | def __init__(self, data_size, hidden_size, output_size, 14 | dropout, hidden_dim=128, gpu_enable=False): 15 | super(LSTMClassifier, self).__init__() 16 | self.data_size = data_size 17 | self.hidden_size = hidden_size 18 | self.output_size = output_size 19 | self.gpu_enable = gpu_enable 20 | self.model = nn.LSTM(data_size, hidden_size, batch_first=True).double() 21 | self.hidden2out = nn.Sequential( 22 | nn.Linear(hidden_size, hidden_dim), 23 | nn.ReLU(), 24 | nn.Linear(hidden_dim, output_size) 25 | ) 26 | self.dropout = nn.Dropout(p=dropout) 27 | 28 | def init_hidden(self, batch_size): 29 | if self.gpu_enable: 30 | return ( 31 | Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()), 32 | Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()) 33 | ) 34 | else: 35 | return ( 36 | Variable(torch.zeros(1, batch_size, self.hidden_size).double()), 37 | Variable(torch.zeros(1, batch_size, self.hidden_size).double()) 38 | ) 39 | 40 | def forward(self, X): 41 | hidden = self.init_hidden(batch_size=len(X)) 42 | outputs, (h_n, c_n) = self.model(X.double(), hidden) 43 | # return self.softmax(self.hidden2out(outputs)) 44 | return self.hidden2out(h_n[0]) 45 | 46 | 47 | class GRUClassifier(nn.Module): 48 | def __init__(self, data_size, hidden_size, output_size, dropout, 49 | gpu_enable=False): 50 | super(GRUClassifier, self).__init__() 51 | self.data_size = data_size 52 | self.hidden_size = hidden_size 53 | self.output_size = output_size 54 | self.gpu_enable = gpu_enable 55 | self.model = nn.GRU(data_size, hidden_size, batch_first=True).double() 56 | self.hidden2out = nn.Linear(hidden_size, output_size) 57 | 58 | def init_hidden(self, batch_size): 59 | if self.gpu_enable: 60 | return Variable(torch.zeros(1, batch_size, self.hidden_size).double().cuda()) 61 | else: 62 | return Variable(torch.zeros(1, batch_size, self.hidden_size).double()) 63 | 64 | def forward(self, X): 65 | hidden = self.init_hidden(batch_size=len(X)) 66 | outputs, (h_n, c_n) = self.model(X.double(), hidden) 67 | return self.hidden2out(h_n[0]) 68 | 69 | 70 | class EnDecoder(nn.Module): 71 | def __init__(self, D_in, H, D_out): 72 | super(EnDecoder, self).__init__() 73 | self.linear_1 = nn.Linear(D_in, H) 74 | self.linear_2 = nn.Linear(H, D_out) 75 | 76 | def forward(self, x): 77 | x = F.relu(self.linear_1(x)) 78 | return F.relu(self.linear_2(x)) 79 | 80 | 81 | class VAE(nn.Module): 82 | def __init__(self, encoder, decoder, encode_dim, latent_dim): 83 | super(VAE, self).__init__() 84 | self.encoder = encoder 85 | self.decoder = decoder 86 | self.encode_dim = encode_dim 87 | self.latent_dim = latent_dim 88 | self.__enc_mu = nn.Linear(encode_dim, latent_dim) 89 | self.__enc_log_sigma = nn.Linear(encode_dim, latent_dim) 90 | 91 | def __sample_latent(self, h_enc): 92 | mu = self.__enc_mu(h_enc) 93 | log_sigma = self.__enc_log_sigma(h_enc) 94 | sigma = torch.exp(log_sigma) 95 | std_z = torch.from_numpy(np.random.normal(0, 1, size=sigma.size())).double() 96 | self.z_mean = mu 97 | self.z_sigma = sigma 98 | return mu + sigma * Variable(std_z, requires_grad=False) 99 | 100 | def forward(self, state): 101 | h_enc = self.encoder(state) 102 | z = self.__sample_latent(h_enc=h_enc) 103 | return self.decoder(z) 104 | 105 | 106 | class MLP(nn.Module): 107 | def __init__(self, data_size, hidden_size, output_size, n_class=2): 108 | super(MLP, self).__init__() 109 | self.data_size = data_size 110 | self.hidden_size = hidden_size 111 | self.output_size = 
output_size 112 | self.hidden_layer = nn.Linear(data_size, hidden_size) 113 | self.output_layer = nn.Linear(hidden_size, output_size) 114 | self.out = nn.Linear(output_size, n_class) 115 | 116 | def forward(self, x): 117 | x = x.view(self.batch_size, self.data_size) 118 | return self.out(F.relu(self.output_layer(F.relu(self.hidden_layer(x))))) 119 | -------------------------------------------------------------------------------- /time2graph/core/rnn/deep_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import numpy as np 4 | from torch.autograd import Variable 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 8 | from ...utils.base_utils import Debugger 9 | """ 10 | utils for deep models. 11 | """ 12 | 13 | 14 | def latent_loss(z_mean, z_std): 15 | mean_2 = z_mean * z_mean 16 | std_2 = z_std * z_std 17 | return 0.5 * torch.mean(mean_2 + std_2 - torch.log(std_2) - 1) 18 | 19 | 20 | class DeepDataloader(DataLoader): 21 | def __init__(self, *args, **kwargs): 22 | super(DeepDataloader, self).__init__(*args, **kwargs) 23 | 24 | 25 | class DeepDataset(object): 26 | def __init__(self, x, y): 27 | self.x = x 28 | self.y = y 29 | 30 | def __getitem__(self, item): 31 | return self.x[item], self.y[item] 32 | 33 | def __len__(self): 34 | return len(self.y) 35 | 36 | 37 | def train_RNNs(epoch, dataloader, rnn, criterion, optimizer, debug, gpu_enable): 38 | rnn.train() 39 | for i, (sequences, target) in enumerate(dataloader, 0): 40 | sequences = sequences.double() 41 | if gpu_enable: 42 | sequences = sequences.cuda() 43 | target = target.cuda() 44 | sequences = Variable(sequences) 45 | target = Variable(target) 46 | output = rnn(sequences) 47 | loss = criterion(output, target) 48 | optimizer.zero_grad() 49 | loss.backward() 50 | optimizer.step() 51 | 52 | if i % int(len(dataloader) / 10 + 1) == 0: 53 | Debugger.debug_print('[{}][{}][{}], Loss: {}'.format( 54 | epoch, i, len(dataloader), loss.item()), debug=debug) 55 | 56 | 57 | def train_VAE(epoch, dataloader, vae, criterion, optimizer, debug, gpu_enable): 58 | vae.train() 59 | for i, (sequences, target) in enumerate(dataloader, 0): 60 | optimizer.zero_grad() 61 | sequences = sequences.double() 62 | if gpu_enable: 63 | sequences = sequences.cuda() 64 | target = target.cuda() 65 | sequences = Variable(sequences) 66 | output = vae(sequences) 67 | loss = criterion(output, target) + latent_loss(vae.z_mean, vae.z_sigma) 68 | loss.backward() 69 | optimizer.step() 70 | if i % int(len(dataloader) / 10 + 1) == 0: 71 | Debugger.debug_print('[{}][{}][{}], Loss: {}'.format( 72 | epoch, i, len(dataloader), loss.item()), debug=debug) 73 | 74 | 75 | def test_DeepModels(dataloader, rnn, criterion, debug, gpu_enable): 76 | for th in range(5, 20, 1): 77 | test_loss = 0 78 | correct = 0 79 | rnn.eval() 80 | y_pred, y_test = [], [] 81 | th = th / 20 82 | for i, (sequences, target) in enumerate(dataloader, 0): 83 | rnn.zero_grad() 84 | sequences = sequences.double() 85 | if gpu_enable: 86 | sequences = sequences.cuda() 87 | target = target.cuda() 88 | sequences = Variable(sequences) 89 | target = Variable(target) 90 | output = rnn(sequences) 91 | test_loss += criterion(output, target).item() 92 | # pred = F.softmax(output, dim=1).data.max(1, keepdim=True)[1] 93 | pred = F.softmax(output, dim=1)[:, 1].data.cpu().numpy() 94 | print(pred) 95 | tmp = np.zeros(len(pred)) 96 
| tmp[pred >= th] = 1 97 | # y_pred += list(pred.cpu().numpy()) 98 | y_pred += list(tmp) 99 | y_test += list(target.cpu().numpy()) 100 | # correct += pred.eq(target.data.view_as(pred)).cpu().sum() 101 | test_loss /= len(dataloader.dataset) 102 | y_pred, y_test = np.array(y_pred, dtype=np.int).reshape(-1), np.array(y_test, dtype=np.int).reshape(-1) 103 | accu = accuracy_score(y_true=y_test, y_pred=y_pred) 104 | prec = precision_score(y_true=y_test, y_pred=y_pred) 105 | recall = recall_score(y_true=y_test, y_pred=y_pred) 106 | f1 = f1_score(y_true=y_test, y_pred=y_pred) 107 | Debugger.info_print('res: accu {:.4f}, prec {:.4f}, recall {:.4f}, f1 {:.4f}'.format( 108 | accu, prec, recall, f1 109 | )) 110 | Debugger.debug_print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format( 111 | test_loss, correct, len(dataloader.dataset), 112 | 100. * correct / len(dataloader.dataset)), debug=debug) 113 | -------------------------------------------------------------------------------- /time2graph/core/shapelet_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .shapelet_utils import * 3 | embed_number = 5 4 | 5 | 6 | def time_series_embeds_factory__(embed_size, embeddings, threshold, 7 | multi_graph, debug, mode): 8 | def __concate__(pid, args, queue): 9 | ret = [] 10 | for sdist in args: 11 | tmp = np.zeros(len(sdist) * embed_size * embed_number, dtype=np.float32).reshape(-1) 12 | for sidx in range(len(sdist)): 13 | dist = sdist[sidx, :] 14 | target = np.argsort(np.argwhere(dist <= threshold).reshape(-1))[:embed_number] 15 | if len(target) == 0: 16 | continue 17 | weight = 1.0 - minmax_scale(dist[target]) 18 | if np.sum(weight) == 0: 19 | Debugger.warn_print(msg='dist {}, weight {}'.format(dist, weight), debug=debug) 20 | else: 21 | weight /= np.sum(weight) 22 | target_number = len(weight) 23 | for k in range(target_number): 24 | src, dst = (sidx * embed_number + k) * embed_size, (sidx * embed_number + k + 1) * embed_size 25 | if multi_graph: 26 | if sidx == 0: 27 | tmp[src: dst] = weight[k] * embeddings[sidx, target[k]].reshape(-1) 28 | elif sidx == len(sdist) - 1: 29 | tmp[src: dst] = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 30 | else: 31 | former = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 32 | latter = weight[k] * embeddings[sidx, target[k]].reshape(-1) 33 | tmp[src: dst] = (former + latter) 34 | else: 35 | tmp[src: dst] = weight[k] * embeddings[0, target[k]].reshape(-1) 36 | ret.append(tmp) 37 | queue.put(0) 38 | return ret 39 | 40 | def __aggregate__(pid, args, queue): 41 | ret = [] 42 | for sdist in args: 43 | tmp = np.zeros(len(sdist) * embed_size, dtype=np.float32).reshape(-1) 44 | for sidx in range(len(sdist)): 45 | dist = sdist[sidx, :] 46 | target = np.argsort(np.argwhere(dist <= threshold).reshape(-1))[:embed_number] 47 | if len(target) == 0: 48 | continue 49 | weight = 1.0 - minmax_scale(dist[target]) 50 | if np.sum(weight) == 0: 51 | Debugger.warn_print(msg='dist {}, weight {}'.format(dist, weight), debug=debug) 52 | else: 53 | weight /= np.sum(weight) 54 | src, dst = sidx * embed_size, (sidx + 1) * embed_size 55 | for k in range(len(weight)): 56 | if multi_graph: 57 | if sidx == 0: 58 | tmp[src: dst] += weight[k] * embeddings[sidx, target[k]].reshape(-1) 59 | elif sidx == len(sdist) - 1: 60 | tmp[src: dst] += weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 61 | else: 62 | former = weight[k] * embeddings[sidx - 1, target[k]].reshape(-1) 63 | latter = 
weight[k] * embeddings[sidx, target[k]].reshape(-1) 64 | tmp[src: dst] += (former + latter) 65 | else: 66 | tmp[src: dst] += weight[k] * embeddings[0, target[k]].reshape(-1) 67 | ret.append(tmp) 68 | queue.put(0) 69 | return ret 70 | 71 | if mode == 'concate': 72 | return __concate__ 73 | elif mode == 'aggregate': 74 | return __aggregate__ 75 | else: 76 | raise NotImplementedError('unsupported mode {}'.format(mode)) 77 | 78 | 79 | class ShapeletEmbedding(object): 80 | """ 81 | class for shapelet embeddings using weighted Deepwalk. 82 | """ 83 | def __init__(self, seg_length, tflag, multi_graph, cache_dir, 84 | percentile, tanh, debug, measurement, mode, global_flag, 85 | **deepwalk_args): 86 | """ 87 | :param seg_length: 88 | segment length of time series. 89 | :param tflag: 90 | whether to use timing factors when computing distances between shapelets. 91 | :param multi_graph: 92 | whether to learn embeddings for each time step. default False. 93 | :param cache_dir: 94 | cache directory for edge-list and embeddings. 95 | :param percentile: 96 | percentile for distance threshold when constructing shapelet evolution graph. 97 | scale: 0~100, usually setting 5 or 10. 98 | :param tanh: 99 | whether to conduct tanh transformation on distance matrix (default False). 100 | :param debug: 101 | verbose flag. 102 | :param measurement: 103 | which distance measurement to use, default: greedy-dtw. 104 | :param mode: 105 | mode of generating time series embeddings. 106 | options: 107 | 'concate': concatenate embeddings of all segments. 108 | 'aggregate': weighted-sum up embeddings of all segments. 109 | :param global_flag: 110 | whether to use global timing factors (default False). 111 | :param deepwalk_args: 112 | parameters for deepwalk. 113 | e.g., representation-size, default 256. 114 | """ 115 | self.seg_length = seg_length 116 | self.tflag = tflag 117 | self.multi_graph = multi_graph 118 | self.cache_dir = cache_dir 119 | self.tanh = tanh 120 | self.debug = debug 121 | self.percentile = percentile 122 | self.dist_threshold = -1 123 | self.measurement = measurement 124 | self.mode = mode 125 | self.global_flag = global_flag 126 | self.deepwalk_args = deepwalk_args 127 | self.embed_size = self.deepwalk_args.get('representation_size', 256) 128 | self.embeddings = None 129 | Debugger.info_print('initialize ShapeletEmbedding model with ops: {}'.format(self.__dict__)) 130 | 131 | def fit(self, time_series_set, shapelets, warp, init=0): 132 | """ 133 | generate shapelet embeddings. 134 | :param time_series_set: 135 | input time series. 136 | :param shapelets: 137 | corresponding shapelets learned from time series set. 138 | :param warp: 139 | warping for greedy-dtw. 140 | :param init: 141 | init index of time series. default 0. 
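        Internally, a shapelet evolution graph is built from the transition matrix
        over shapelet assignments (using the distance threshold derived from
        `percentile`), and (weighted) DeepWalk is then run on that graph via
        graph_embedding() to obtain one embedding vector per shapelet
        (per shapelet and time step when multi_graph is set).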
142 | :return: 143 | """ 144 | Debugger.info_print('fit shape: {}'.format(time_series_set.shape)) 145 | tmat, sdist, dist_threshold = transition_matrix( 146 | time_series_set=time_series_set, shapelets=shapelets, seg_length=self.seg_length, 147 | tflag=self.tflag, multi_graph=self.multi_graph, tanh=self.tanh, debug=self.debug, 148 | init=init, warp=warp, percentile=self.percentile, threshold=self.dist_threshold, 149 | measurement=self.measurement, global_flag=self.global_flag) 150 | self.dist_threshold = dist_threshold 151 | self.embeddings = graph_embedding( 152 | tmat=tmat, num_shapelet=len(shapelets), embed_size=self.embed_size, 153 | cache_dir=self.cache_dir, **self.deepwalk_args) 154 | 155 | def time_series_embedding(self, time_series_set, shapelets, warp, init=0): 156 | """ 157 | generate time series embeddings. 158 | :param time_series_set: 159 | time series data. 160 | :param shapelets: 161 | corresponding shapelets learned from time series set. 162 | :param warp: 163 | warping for greedy-dtw. 164 | :param init: 165 | init index of time series. default 0. 166 | :return: 167 | """ 168 | if self.embeddings is None: 169 | self.fit(time_series_set=time_series_set, shapelets=shapelets, warp=warp) 170 | sdist = shapelet_distance(time_series_set=time_series_set, shapelets=shapelets, 171 | seg_length=self.seg_length, tflag=self.tflag, tanh=self.tanh, 172 | debug=self.debug, init=init, warp=warp, 173 | measurement=self.measurement, global_flag=self.global_flag) 174 | Debugger.info_print('embedding threshold {}'.format(self.dist_threshold)) 175 | Debugger.info_print('sdist size {}'.format(sdist.shape)) 176 | parmap = ParMap( 177 | work=time_series_embeds_factory__( 178 | embed_size=self.embed_size, embeddings=self.embeddings, threshold=self.dist_threshold, 179 | multi_graph=self.multi_graph, debug=self.debug, mode=self.mode), 180 | monitor=parallel_monitor(msg='time series embedding', size=sdist.shape[0], debug=self.debug), 181 | njobs=NJOBS 182 | ) 183 | if self.mode == 'concate': 184 | size = sdist.shape[1] * self.embed_size * embed_number 185 | elif self.mode == 'aggregate': 186 | size = sdist.shape[1] * self.embed_size 187 | else: 188 | raise NotImplementedError('unsupported mode {}'.format(self.mode)) 189 | return np.array(parmap.run(data=list(sdist)), dtype=np.float32).reshape(sdist.shape[0], size) 190 | -------------------------------------------------------------------------------- /time2graph/core/shapelet_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.cluster import KMeans 3 | from sklearn.preprocessing import minmax_scale 4 | from .distance_utils import * 5 | from ..utils.base_utils import Debugger, syscmd 6 | from ..utils.mp_utils import ParMap, parallel_monitor, NJOBS 7 | __tmat_threshold = 1e-2 8 | 9 | 10 | def softmax_np(x): 11 | """ 12 | softmax for numpy array on axis 0. 13 | :param x: 14 | :return: 15 | """ 16 | e_x = np.exp(x - np.max(x)) 17 | return e_x / e_x.sum(axis=0) 18 | 19 | 20 | def __candidate_cluster_factory(n_clusters, seg_length): 21 | """ 22 | generate shapelets candidates by clustering. 
23 | :param n_clusters: 24 | :param seg_length: 25 | :return: 26 | """ 27 | def __main__(pid, args, queue): 28 | ret = [] 29 | for time_series_segments in args: 30 | kmeans = KMeans(n_clusters=n_clusters).fit(time_series_segments) 31 | ret.append(kmeans.cluster_centers_.reshape(n_clusters, seg_length, -1)) 32 | queue.put(0) 33 | return ret 34 | return __main__ 35 | 36 | 37 | def __candidate_greedy_factory(n_candiates, seg_length): 38 | """ 39 | generate shapelets candidates by greedy algorithms. 40 | :param n_candiates: 41 | :param seg_length: 42 | :return: 43 | """ 44 | def __main__(pid, args, queue): 45 | ret = [] 46 | for time_series_segments in args: 47 | size = time_series_segments.shape[0] 48 | center_segment = np.mean(time_series_segments, axis=0) 49 | cand_dist = np.linalg.norm( 50 | time_series_segments.reshape(size, -1) - center_segment.reshape(1, -1), axis=1) 51 | tmp = [] 52 | for cnt in range(n_candiates): 53 | idx = np.argmax(cand_dist) 54 | cand_dist[idx] = -1 55 | update_idx = cand_dist >= 0 56 | dims = np.sum(update_idx) 57 | cand_dist[update_idx] += np.linalg.norm( 58 | time_series_segments[update_idx].reshape(dims, -1) - time_series_segments[idx].reshape(1, -1), 59 | axis=1 60 | ) 61 | tmp.append(time_series_segments[idx].reshape(seg_length, -1)) 62 | ret.append(tmp) 63 | queue.put(0) 64 | return ret 65 | return __main__ 66 | 67 | 68 | def generate_shapelet_candidate(time_series_set, num_segment, seg_length, candidate_size, **kwargs): 69 | """ 70 | generate shapelet candidates. 71 | :param time_series_set: 72 | :param num_segment: 73 | :param seg_length: 74 | :param candidate_size: 75 | :param kwargs: 76 | candidate_method: 'greedy' or 'cluster'. 77 | debug: bool. 78 | :return: 79 | """ 80 | __method, __debug = kwargs.get('candidate_method', 'greedy'), kwargs.get('debug', True) 81 | njobs = kwargs.get('njobs', NJOBS) 82 | Debugger.debug_print('begin to generate shapelet candidates...', __debug) 83 | num_time_series = time_series_set.shape[0] 84 | time_series_set = time_series_set.reshape(num_time_series, num_segment, seg_length, -1) 85 | assert candidate_size >= num_segment, 'candidate-size {} should be larger ' \ 86 | 'than n_segments {}'.format(candidate_size, num_segment) 87 | args, n_clusters = [], candidate_size // num_segment 88 | for idx in range(num_segment): 89 | args.append(time_series_set[:, idx, :, :].reshape(num_time_series, -1)) 90 | if __method == 'cluster': 91 | work_func = __candidate_cluster_factory 92 | elif __method == 'greedy': 93 | work_func = __candidate_greedy_factory 94 | else: 95 | raise NotImplementedError('unsupported candidate generating method {}'.format(__method)) 96 | parmap = ParMap( 97 | work=work_func(n_clusters, seg_length), 98 | monitor=parallel_monitor(msg='generate candidate by {}'.format(__method), 99 | size=num_segment, debug=__debug), 100 | njobs=njobs 101 | ) 102 | ret = np.concatenate(parmap.run(data=args), axis=0) 103 | Debugger.info_print('candidates with length {} sampling done...'.format(seg_length)) 104 | Debugger.info_print('totally {} candidates with shape {}'.format(len(ret), ret.shape)) 105 | return ret 106 | 107 | 108 | def __shapelet_distance_factory(shapelets, num_segment, seg_length, tflag, 109 | init, warp, dist, global_flag): 110 | """ 111 | factory for computing distances between shapelet and time series. 112 | :param shapelets: 113 | learned time-aware shapelets. 
114 | :param num_segment: 115 | :param seg_length: 116 | :param tflag: 117 | :param init: 118 | :param warp: 119 | :param dist: 120 | metric for computing distance. 121 | :param global_flag: 122 | whether to use global timing factors. 123 | :return: 124 | """ 125 | def __main__(pid, args, queue): 126 | ret = [] 127 | for time_series in args: 128 | time_series = time_series.reshape(num_segment, seg_length, -1) 129 | tmp = np.zeros((num_segment, len(shapelets)), dtype=np.float32) 130 | if tflag and global_flag: 131 | for idx, (pattern, local_factor, global_factor, _) in enumerate(shapelets): 132 | for k in range(num_segment): 133 | tmp[k, idx] = dist(x=pattern, y=time_series[k], 134 | w=local_factor, warp=warp) * np.abs(global_factor[init + k]) 135 | elif tflag and not global_flag: 136 | for idx, (pattern, local_factor, global_factor, _) in enumerate(shapelets): 137 | for k in range(num_segment): 138 | tmp[k, idx] = dist(x=pattern, y=time_series[k], w=local_factor, warp=warp) 139 | else: 140 | for idx, (pattern, _) in enumerate(shapelets): 141 | for k in range(num_segment): 142 | tmp[k, idx] = dist(x=pattern, y=time_series[k], 143 | w=np.ones(pattern.shape[0]), warp=warp) 144 | ret.append(tmp) 145 | queue.put(0) 146 | return ret 147 | return __main__ 148 | 149 | 150 | def shapelet_distance(time_series_set, shapelets, seg_length, tflag, tanh, debug, init, 151 | warp, measurement, global_flag): 152 | """ 153 | paralleling compute distances between time series and shapelets. 154 | :param time_series_set: 155 | :param shapelets: 156 | :param seg_length: 157 | :param tflag: 158 | :param tanh: 159 | :param debug: 160 | :param init: 161 | :param warp: 162 | :param measurement: 163 | :param global_flag: 164 | :return: 165 | distance matrix. 166 | """ 167 | num_time_series = time_series_set.shape[0] 168 | num_segment = int(time_series_set.shape[1] / seg_length) 169 | num_shapelet = len(shapelets) 170 | assert num_segment * seg_length == time_series_set.shape[1] 171 | if measurement == 'gw': 172 | dist = parameterized_gw_npy 173 | elif measurement == 'gdtw': 174 | dist = parameterized_gdtw_npy 175 | else: 176 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 177 | parmap = ParMap( 178 | work=__shapelet_distance_factory( 179 | shapelets=shapelets, num_segment=num_segment, seg_length=seg_length, 180 | tflag=tflag, init=init, warp=warp, dist=dist, global_flag=global_flag), 181 | monitor=parallel_monitor(msg='shapelet distance', size=num_time_series, debug=debug), 182 | njobs=NJOBS 183 | ) 184 | sdist = np.array(parmap.run(data=list(time_series_set)), dtype=np.float32).reshape( 185 | time_series_set.shape[0], num_segment, num_shapelet 186 | ) 187 | if tanh: 188 | sdist = np.tanh(sdist) 189 | return sdist 190 | 191 | 192 | def transition_matrix(time_series_set, shapelets, seg_length, tflag, multi_graph, 193 | percentile, threshold, tanh, debug, init, warp, measurement, global_flag): 194 | """ 195 | compute shapelet transition matrix. 196 | :param time_series_set: 197 | :param shapelets: 198 | :param seg_length: 199 | :param tflag: 200 | :param multi_graph: 201 | :param percentile: 202 | percentile for distance threshold. 203 | :param threshold: 204 | distance threshold. 205 | only work when percentile is None. 
206 | :param tanh: 207 | :param debug: 208 | :param init: 209 | :param warp: 210 | :param measurement: 211 | :param global_flag: 212 | :return: 213 | """ 214 | num_time_series = time_series_set.shape[0] 215 | num_segment = int(time_series_set.shape[1] / seg_length) 216 | num_shapelet = len(shapelets) 217 | if multi_graph: 218 | gcnt = num_segment - 1 219 | else: 220 | gcnt = 1 221 | tmat = np.zeros((gcnt, num_shapelet, num_shapelet), dtype=np.float32) 222 | sdist = shapelet_distance( 223 | time_series_set=time_series_set, shapelets=shapelets, seg_length=seg_length, tflag=tflag, 224 | tanh=tanh, debug=debug, init=init, warp=warp, measurement=measurement, global_flag=global_flag 225 | ) 226 | if percentile is not None: 227 | dist_threshold = np.percentile(sdist, percentile) 228 | Debugger.info_print('threshold({}) {}, mean {}'.format(percentile, dist_threshold, np.mean(sdist))) 229 | else: 230 | dist_threshold = threshold 231 | Debugger.info_print('threshold {}, mean {}'.format(dist_threshold, np.mean(sdist))) 232 | 233 | n_edges = 0 234 | for tidx in range(num_time_series): 235 | for sidx in range(num_segment - 1): 236 | src_dist = sdist[tidx, sidx, :] 237 | dst_dist = sdist[tidx, sidx + 1, :] 238 | src_idx = np.argwhere(src_dist <= dist_threshold).reshape(-1) 239 | dst_idx = np.argwhere(dst_dist <= dist_threshold).reshape(-1) 240 | if len(src_idx) == 0 or len(dst_idx) == 0: 241 | continue 242 | n_edges += len(src_idx) * len(dst_idx) 243 | src_dist[src_idx] = 1.0 - minmax_scale(src_dist[src_idx]) 244 | dst_dist[dst_idx] = 1.0 - minmax_scale(dst_dist[dst_idx]) 245 | for src in src_idx: 246 | if multi_graph: 247 | tmat[sidx, src, dst_idx] += (src_dist[src] * dst_dist[dst_idx]) 248 | else: 249 | tmat[0, src, dst_idx] += (src_dist[src] * dst_dist[dst_idx]) 250 | Debugger.debug_print( 251 | '{:.2f}% transition matrix computed...'.format(float(tidx + 1) * 100 / num_time_series), 252 | debug=debug 253 | ) 254 | Debugger.info_print('{} edges involved in shapelets graph'.format(n_edges)) 255 | tmat[tmat <= __tmat_threshold] = 0.0 256 | for k in range(gcnt): 257 | for i in range(num_shapelet): 258 | norms = np.sum(tmat[k, i, :]) 259 | if norms == 0: 260 | tmat[k, i, i] = 1.0 261 | else: 262 | tmat[k, i, :] /= np.sum(tmat[k, i, :]) 263 | return tmat, sdist, dist_threshold 264 | 265 | 266 | def __mat2edgelist(tmat, fpath): 267 | """ 268 | transform matrix to edge-list format that Deepwalk needs. 269 | :param tmat: 270 | :param fpath: 271 | :return: 272 | """ 273 | mat_shape = tmat.shape 274 | with open(fpath, 'w') as f: 275 | for src in range(mat_shape[0]): 276 | flag = False 277 | for dst in range(mat_shape[1]): 278 | if tmat[src, dst] <= 1e-5: 279 | continue 280 | f.write('{} {} {:.5f}\n'.format(src, dst, tmat[src, dst])) 281 | flag = True 282 | if not flag: 283 | f.write('{} {} 1.0000\n'.format(src, src)) 284 | f.close() 285 | 286 | 287 | def __embedding2mat(fpath, num_vertices, embed_size): 288 | """ 289 | loading embeddings from cache file into a numpy array. 
290 | :param fpath: 291 | :param num_vertices: 292 | :param embed_size: 293 | :return: 294 | """ 295 | mat = np.zeros((num_vertices, embed_size), dtype=np.float32) 296 | with open(fpath, 'r') as f: 297 | cnt = -1 298 | for line in f: 299 | if cnt < 0: 300 | cnt += 1 301 | continue 302 | line = line.split(' ') 303 | idx = int(line[0]) 304 | for k in range(embed_size): 305 | mat[idx, k] = float(line[k + 1]) 306 | f.close() 307 | return mat 308 | 309 | 310 | def graph_embedding(tmat, num_shapelet, embed_size, cache_dir, **deepwalk_paras): 311 | """ 312 | conduct Deepwalk to generate shapelet embeddings. 313 | :param tmat: 314 | :param num_shapelet: 315 | :param embed_size: 316 | :param cache_dir: 317 | :param deepwalk_paras: 318 | optional deepwalk parameters. 319 | :return: 320 | """ 321 | __deepwalk_args__ = [] 322 | Debugger.info_print('embed_size: {}'.format(embed_size)) 323 | ret = [] 324 | Debugger.info_print('transition matrix size {}'.format(tmat.shape)) 325 | for idx in range(tmat.shape[0]): 326 | edgelist_path = '{}/{}.edgelist'.format(cache_dir, idx) 327 | embedding_path = '{}/{}.embeddings'.format(cache_dir, idx) 328 | __mat2edgelist(tmat=tmat[idx, :, :], fpath=edgelist_path) 329 | deepwalk_cmd = [ 330 | 'deepwalk --input {} --format weighted_edgelist --output {} --representation-size {}'.format( 331 | edgelist_path, embedding_path, embed_size) 332 | ] 333 | for key, val in deepwalk_paras.items(): 334 | if key in __deepwalk_args__: 335 | deepwalk_cmd.append('--{} {}'.format(key, val)) 336 | deepwalk_cmd = ' '.join(deepwalk_cmd) 337 | Debugger.info_print('run deepwalk with: {}'.format(deepwalk_cmd)) 338 | _ = syscmd(deepwalk_cmd) 339 | ret.append(__embedding2mat(fpath=embedding_path, num_vertices=num_shapelet, 340 | embed_size=embed_size)) 341 | return np.array(ret, dtype=np.float32).reshape(tmat.shape[0], num_shapelet, embed_size) 342 | -------------------------------------------------------------------------------- /time2graph/core/time_aware_shapelets.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | from torch.autograd import * 4 | from torch import optim 5 | from torch.nn import functional as F 6 | from torch.distributions.normal import Normal 7 | from torch.utils.data import DataLoader 8 | from ..utils.base_utils import Queue 9 | from .model_utils import * 10 | from .shapelet_utils import * 11 | from .distance_utils import * 12 | 13 | 14 | def parameterized_gw_torch(x, y, w, torch_dtype, warp=2): 15 | """ 16 | gw distance in torch with timing factors. 17 | :param x: 18 | :param y: 19 | :param w: 20 | :param torch_dtype: 21 | :param warp: 22 | :return: 23 | """ 24 | distance = np.sum((x.reshape(x.shape[0], -1, x.shape[1]) - expand_array(y=y, warp=warp)) ** 2, 25 | axis=1) 26 | assert not torch.any(torch.isnan(w)), 'local: {}'.format(w) 27 | softmin_distance = np.sum(softmax(-distance.astype(np.float64)).astype(np.float32) * distance, 28 | axis=1) 29 | return torch.sqrt(torch.sum(torch.from_numpy(softmin_distance).type(torch_dtype) * torch.abs(w))) 30 | 31 | 32 | def parameterized_gdtw_torch(x, y, w, torch_dtype, warp=2): 33 | """ 34 | greedy-dtw distance in torch with timing factors. 
35 | :param x: 36 | :param y: 37 | :param w: 38 | :param torch_dtype: 39 | :param warp: 40 | :return: 41 | """ 42 | dpath = greedy_dtw_path(x=x, y=y, warp=warp) 43 | return torch.norm((torch.from_numpy(x).type(torch_dtype) * w.reshape(x.shape[0], -1))[dpath[0]] - 44 | torch.from_numpy(y[dpath[1]]).type(torch_dtype)) 45 | 46 | 47 | def pattern_distance_torch(pattern, time_series, num_segment, seg_length, 48 | local_factor, global_factor, torch_dtype, measurement): 49 | """ 50 | compute distances between a pattern and a given time series. 51 | :param pattern: 52 | :param time_series: 53 | :param num_segment: 54 | :param seg_length: 55 | :param local_factor: 56 | :param global_factor: 57 | :param torch_dtype: 58 | :param measurement: 59 | :return: 60 | """ 61 | if measurement == 'gw': 62 | dist_torch = parameterized_gw_torch 63 | elif measurement == 'gdtw': 64 | dist_torch = parameterized_gdtw_torch 65 | else: 66 | raise NotImplementedError('unsupported distance {}'.format(measurement)) 67 | assert isinstance(time_series, np.ndarray) and isinstance(pattern, np.ndarray) 68 | time_series = time_series.reshape(num_segment, seg_length, -1) 69 | distance = Variable(torch.zeros(num_segment)).type(torch_dtype) 70 | for k in range(num_segment): 71 | distance[k] = dist_torch(x=pattern, y=time_series[k], w=local_factor, torch_dtype=torch_dtype) 72 | return torch.sum(F.softmax(-distance * torch.abs(global_factor), dim=0) 73 | * (distance * torch.abs(global_factor))) 74 | 75 | 76 | def __shapelet_candidate_loss(cand, time_series_set, label, num_segment, seg_length, 77 | data_size, p, lr, alpha, beta, num_batch, gpu_enable, 78 | measurement, **kwargs): 79 | """ 80 | loss for learning time-aware shapelets. 81 | :param cand: 82 | :param time_series_set: 83 | :param label: 84 | :param num_segment: 85 | :param seg_length: 86 | :param data_size: 87 | :param p: 88 | normalizing parameter (0, 1, or 2). 89 | :param lr: 90 | learning rate. 91 | :param alpha: 92 | penalty weight for local timing factor. 93 | :param beta: 94 | penalty weight for global timing factor. 
95 | :param num_batch: 96 | :param gpu_enable: 97 | :param measurement: 98 | :param kwargs: 99 | :return: 100 | """ 101 | if gpu_enable: 102 | torch_dtype = torch.cuda.FloatTensor 103 | else: 104 | torch_dtype = torch.FloatTensor 105 | dataset_numpy = NumpyDataset(time_series_set, label) 106 | num_class = len(np.unique(label).reshape(-1)) 107 | batch_size = int(len(dataset_numpy) // num_batch) 108 | local_factor_variable = Variable(torch.ones(seg_length).type(torch_dtype) / seg_length, requires_grad=True) 109 | global_factor_variable = Variable(torch.ones(num_segment).type(torch_dtype) / num_segment, requires_grad=True) 110 | current_loss, loss_queue, cnt, nan_cnt = 0.0, Queue(max_size=int(num_batch * 0.2)), 0, 0 111 | current_main_loss, current_penalty_loss = 0.0, 0.0 112 | max_iters, optimizer = kwargs.get('max_iters', 1), kwargs.get('optimizer', 'Adam') 113 | if optimizer == 'Adam': 114 | optimizer = optim.Adam 115 | elif optimizer == 'Adadelta': 116 | optimizer = optim.Adadelta 117 | elif optimizer == 'Adamax': 118 | optimizer = optim.Adamax 119 | else: 120 | raise NotImplementedError('unsupported optimizer {} for time-aware shapelets learning'.format(optimizer)) 121 | optimizer = optimizer([local_factor_variable, global_factor_variable], lr=lr) 122 | 123 | while cnt < max_iters: 124 | sampler = StratifiedSampler(label=label, num_class=num_class) 125 | dataloader = DataLoader(dataset=dataset_numpy, batch_size=batch_size, sampler=sampler) 126 | batch_cnt = 0 127 | for x, y in dataloader: 128 | x = np.array(x, dtype=np.float32).reshape(len(x), -1, data_size) 129 | y = np.array(y, dtype=np.float32).reshape(-1) 130 | assert not np.any(np.isnan(x)), 'original time series data with nan' 131 | lb_idx, sample_flag = [], True 132 | for k in range(num_class): 133 | tmp_idx = np.argwhere(y == k).reshape(-1) 134 | if k >= 1 and len(tmp_idx) > 0: 135 | sample_flag = False 136 | lb_idx.append(tmp_idx) 137 | if len(lb_idx[0]) == 0 or sample_flag: 138 | Debugger.debug_print('weighted sampling exception, positive {:.2f}/{}'.format(np.sum(y)/len(y), len(y))) 139 | continue 140 | loss = torch.Tensor([0.0]).type(torch_dtype) 141 | main_loss = torch.Tensor([0.0]).type(torch_dtype) 142 | penalty_loss = torch.Tensor([0.0]).type(torch_dtype) 143 | dist_tensor = torch.zeros(x.shape[0]).type(torch_dtype) 144 | for k in range(x.shape[0]): 145 | dist_tensor[k] = pattern_distance_torch( 146 | pattern=cand, time_series=x[k, :, :], num_segment=num_segment, 147 | seg_length=seg_length, local_factor=local_factor_variable, 148 | global_factor=global_factor_variable, torch_dtype=torch_dtype, 149 | measurement=measurement 150 | # ignore the warning of reshape/view for local_factor_variable 151 | ) 152 | assert not torch.isnan(dist_tensor).any(), 'dist: {}\nlocal: {}\nglobal: {}'.format( 153 | dist_tensor, local_factor_variable, global_factor_variable) 154 | mean, std = torch.mean(dist_tensor), torch.std(dist_tensor) 155 | dist_tensor = (dist_tensor - mean) / std 156 | # Debugger.info_print('transform: {}, {}'.format(torch.max(dist_tensor), torch.min(dist_tensor))) 157 | # Debugger.time_print(msg='pattern distance', begin=begin, profiling=True) 158 | for k in range(1, len(lb_idx)): 159 | src = dist_tensor[lb_idx[0]] 160 | dst = dist_tensor[lb_idx[k]] 161 | loss -= torch.abs(torch.distributions.kl.kl_divergence( 162 | Normal(torch.mean(src), torch.std(src)), 163 | Normal(torch.mean(dst), torch.std(dst)))) 164 | main_loss -= torch.abs(torch.distributions.kl.kl_divergence( 165 | Normal(torch.mean(src), torch.std(src)), 166 | 
Normal(torch.mean(dst), torch.std(dst)))) 167 | # Debugger.info_print('KL-loss: {}'.format(loss)) 168 | loss += (alpha * torch.norm(local_factor_variable, p=p) / seg_length) 169 | loss += (beta * torch.norm(global_factor_variable, p=p) / num_segment) 170 | 171 | penalty_loss += (alpha * torch.norm(local_factor_variable, p=p) / seg_length) 172 | penalty_loss += (beta * torch.norm(global_factor_variable, p=p) / num_segment) 173 | 174 | optimizer.zero_grad() 175 | loss.backward() 176 | optimizer.step() 177 | if gpu_enable: 178 | current_loss = float(loss.cpu().data.numpy()) 179 | current_main_loss = float(main_loss.cpu().data) 180 | current_penalty_loss = float(penalty_loss.cpu().data) 181 | else: 182 | current_loss = float(loss.data.numpy()) 183 | current_main_loss = float(main_loss.data) 184 | current_penalty_loss = float(penalty_loss.data) 185 | loss_queue.enqueue(current_loss) 186 | if np.isnan(current_loss) or torch.any(torch.isnan(local_factor_variable))\ 187 | or torch.any(torch.isnan(global_factor_variable)): 188 | local_factor_variable = Variable(torch.ones(seg_length).type(torch_dtype) / seg_length, requires_grad=True) 189 | global_factor_variable = Variable(torch.ones(num_segment).type(torch_dtype) / num_segment, requires_grad=True) 190 | current_loss = 1e5 191 | nan_cnt += 1 192 | if nan_cnt >= max_iters: 193 | break 194 | else: 195 | Debugger.debug_print('{:.2f}% steps, loss {:.6f} with {:.6f} and penalty {:.6f}'.format( 196 | batch_cnt * 100 / num_batch, current_loss, current_main_loss, current_penalty_loss)) 197 | batch_cnt += 1 198 | cnt += 1 199 | if nan_cnt >= max_iters: 200 | break 201 | else: 202 | avg_loss = np.mean(loss_queue.queue[1:]) 203 | if abs(current_loss - avg_loss) < kwargs.get('epsilon', 1e-2): 204 | break 205 | local_factor_variable = torch.abs(local_factor_variable) 206 | global_factor_variable = torch.abs(global_factor_variable) 207 | if gpu_enable: 208 | local_factor = local_factor_variable.cpu().data.numpy() 209 | global_factor = global_factor_variable.cpu().data.numpy() 210 | else: 211 | local_factor = local_factor_variable.data.numpy() 212 | global_factor = global_factor_variable.data.numpy() 213 | return local_factor, global_factor, current_loss, current_main_loss, current_penalty_loss 214 | 215 | 216 | def __shapelet_candidate_loss_factory(time_series_set, label, num_segment, 217 | seg_length, data_size, p, lr, alpha, beta, num_batch, 218 | gpu_enable, measurement, **kwargs): 219 | """ 220 | paralleling compute shapelet losses. 
221 | :param time_series_set: 222 | :param label: 223 | :param num_segment: 224 | :param seg_length: 225 | :param data_size: 226 | :param p: 227 | :param lr: 228 | :param alpha: 229 | :param beta: 230 | :param num_batch: 231 | :param gpu_enable: 232 | :param measurement: 233 | :param kwargs: 234 | :return: 235 | """ 236 | def __main__(pid, args, queue): 237 | ret = [] 238 | for cand in args: 239 | local_factor, global_factor, loss, main_loss, penalty = __shapelet_candidate_loss( 240 | cand=cand, time_series_set=time_series_set, label=label, num_segment=num_segment, 241 | seg_length=seg_length, data_size=data_size, p=p, lr=lr, 242 | alpha=alpha, beta=beta, num_batch=num_batch, gpu_enable=gpu_enable, 243 | measurement=measurement, **kwargs 244 | ) 245 | ret.append((cand, local_factor, global_factor, loss, main_loss, penalty)) 246 | queue.put(0) 247 | return ret 248 | return __main__ 249 | 250 | 251 | def learn_time_aware_shapelets(time_series_set, label, K, C, num_segment, seg_length, data_size, 252 | p, lr, alpha, beta, num_batch, gpu_enable, measurement, **kwargs): 253 | """ 254 | learn time-aware shapelets. 255 | :param time_series_set: 256 | input time series data. 257 | :param label: 258 | input label. 259 | :param K: 260 | number of shapelets that finally learned. 261 | :param C: 262 | number of shapelet candidates in learning procedure. 263 | :param num_segment: 264 | :param seg_length: 265 | :param data_size: 266 | :param p: 267 | :param lr: 268 | :param alpha: 269 | :param beta: 270 | :param num_batch: 271 | :param gpu_enable: 272 | :param measurement: 273 | :param kwargs: 274 | :return: 275 | """ 276 | cands = generate_shapelet_candidate(time_series_set=time_series_set, num_segment=num_segment, 277 | seg_length=seg_length, candidate_size=C, **kwargs) 278 | parmap = ParMap( 279 | work=__shapelet_candidate_loss_factory( 280 | time_series_set=time_series_set, label=label, num_segment=num_segment, seg_length=seg_length, 281 | data_size=data_size, p=p, lr=lr, alpha=alpha, beta=beta, num_batch=num_batch, 282 | gpu_enable=gpu_enable, measurement=measurement, **kwargs 283 | ), 284 | monitor=parallel_monitor(msg='learning time-aware shapelets', size=len(cands), 285 | debug=kwargs.get('debug', True)), 286 | njobs=kwargs.get('njobs', NJOBS) 287 | ) 288 | result = sorted(parmap.run(data=cands), key=lambda x: x[3]) 289 | ret = [] 290 | for (cand, local_factor, global_factor, loss, main_loss, penalty) in result: 291 | ret.append((cand, local_factor, global_factor, loss)) 292 | return sorted(ret, key=lambda x: x[-1])[:K] 293 | -------------------------------------------------------------------------------- /time2graph/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petecheng/Time2Graph/f3a7387d04869f2388bdda4b900c50149b57698e/time2graph/utils/__init__.py -------------------------------------------------------------------------------- /time2graph/utils/base_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import time 4 | import itertools 5 | import psutil 6 | from subprocess import * 7 | 8 | 9 | class ModelUtils(object): 10 | """ 11 | model utils for basic classifiers. 
12 | kwargs list: 13 | lr paras 14 | penalty: list of str, candidate: l1, l2; 15 | c: list of float 16 | inter_scale: list of float 17 | rf and dts paras: 18 | criteria: list of str, candidate: gini, entropy 19 | max_features: list of str(including None), candidate: auto, log2 or None 20 | max_depth: list of int 21 | max_split: list of int 22 | min_leaf: list of int 23 | xgb paras: 24 | max_depth: list of int 25 | learning_rate: list of float 26 | n_jobs: int 27 | class_weight: list of int 28 | booster: list of str, candidate: gblinear, gbtree, dart 29 | svm paras: 30 | c: list of float 31 | svm_kernel: list of str, candidate: rbf, poly, sigmoid 32 | deepwalk paras: 33 | num_walks: list of int 34 | representation_size: list of int 35 | window_size: list of int 36 | workers: int 37 | undirected: bool 38 | """ 39 | def __init__(self, kernel, **kwargs): 40 | self.kernel = kernel 41 | self.kwargs = kwargs 42 | 43 | @property 44 | def clf__(self): 45 | if self.kernel == 'lr': 46 | from sklearn.linear_model import LogisticRegression 47 | return LogisticRegression 48 | elif self.kernel == 'svm': 49 | from sklearn.svm import SVC 50 | return SVC 51 | elif self.kernel == 'dts': 52 | from sklearn.tree import DecisionTreeClassifier 53 | return DecisionTreeClassifier 54 | elif self.kernel == 'rf': 55 | from sklearn.ensemble import RandomForestClassifier 56 | return RandomForestClassifier 57 | elif self.kernel == 'xgb': 58 | from xgboost import XGBClassifier 59 | return XGBClassifier 60 | else: 61 | raise NotImplementedError('unsupported kernel {}'.format(self.kernel)) 62 | 63 | def para_len(self, balanced): 64 | cnt = 0 65 | for _ in self.clf_paras(balanced=balanced): 66 | cnt += 1 67 | return cnt 68 | 69 | def clf_paras(self, balanced): 70 | class_weight = 'balanced' if balanced else None 71 | if self.kernel == 'lr': 72 | penalty = self.kwargs.get('penalty', ['l1', 'l2']) 73 | c = self.kwargs.get('c', [pow(5, i) for i in range(-3, 3)]) 74 | intercept_scaling = self.kwargs.get('inter_scale', [pow(5, i) for i in range(-3, 3)]) 75 | for (p1, p2, p3) in itertools.product(penalty, c, intercept_scaling): 76 | yield { 77 | 'penalty': p1, 78 | 'C': p2, 79 | 'intercept_scaling': p3, 80 | 'class_weight': class_weight 81 | } 82 | elif self.kernel == 'rf' or self.kernel == 'dts': 83 | criteria = self.kwargs.get('criteria', ['gini', 'entropy']) 84 | max_features = self.kwargs.get('max_feature', ['auto', 'log2', None]) 85 | max_depth = self.kwargs.get('max_depth', [10, 25, 50]) 86 | min_samples_split = self.kwargs.get('max_split', [2, 4, 8]) 87 | min_samples_leaf = self.kwargs.get('min_leaf', [1, 3, 5]) 88 | for (p1, p2, p3, p4, p5) in itertools.product( 89 | criteria, max_features, max_depth, min_samples_split, min_samples_leaf 90 | ): 91 | yield { 92 | 'criterion': p1, 93 | 'max_features': p2, 94 | 'max_depth': p3, 95 | 'min_samples_split': p4, 96 | 'min_samples_leaf': p5, 97 | 'class_weight': class_weight 98 | } 99 | elif self.kernel == 'xgb': 100 | max_depth = self.kwargs.get('max_depth', [1, 2, 4, 8, 12, 16]) 101 | learning_rate = self.kwargs.get('learning_rate', [0.1, 0.2, 0.3]) 102 | n_jobs = [self.kwargs.get('n_jobs', psutil.cpu_count())] 103 | class_weight = self.kwargs.get('class_weight', [1, 10, 50, 100]) 104 | booster = self.kwargs.get('booster', ['gblinear', 'gbtree', 'dart']) 105 | for (p1, p2, p3, p4, p5) in itertools.product( 106 | max_depth, learning_rate, booster, n_jobs, class_weight 107 | ): 108 | yield { 109 | 'max_depth': p1, 110 | 'learning_rate': p2, 111 | 'booster': p3, 112 | 'n_jobs': p4, 
113 | 'scale_pos_weight': p5 114 | } 115 | elif self.kernel == 'svm': 116 | c = self.kwargs.get('c', [pow(2, i) for i in range(-2, 2)]) 117 | svm_kernel = self.kwargs.get('svm_kernel', ['rbf', 'poly', 'sigmoid']) 118 | for (p1, p2) in itertools.product(c, svm_kernel): 119 | yield { 120 | 'C': p1, 121 | 'kernel': p2, 122 | 'class_weight': class_weight 123 | } 124 | else: 125 | raise NotImplementedError() 126 | 127 | @staticmethod 128 | def partition_data__(data, ratio, shuffle=True, multi=True): 129 | import random 130 | if not multi: 131 | size = len(data) 132 | if shuffle: 133 | idx = random.sample(range(size), int(size * ratio)) 134 | else: 135 | idx, step, cnt, init = [], 1.0 / ratio, 0, 0 136 | while cnt < int(size * ratio): 137 | idx.append(int(init)) 138 | init += step 139 | return data[idx] 140 | else: 141 | num, size = len(data), len(data[0]) 142 | if shuffle: 143 | idx = random.sample(range(size), int(size * ratio)) 144 | else: 145 | idx, step, cnt, init = [], 1.0 / ratio, 0, 0 146 | while cnt < int(size * ratio): 147 | idx.append(int(init)) 148 | init += step 149 | return [data[k][idx] for k in range(num)] 150 | 151 | def deepwalk_paras(self): 152 | num_walks = self.kwargs.get('num_walks', [10, 20]) 153 | representation_size = self.kwargs.get('representation_size', [32, 64, 128, 256]) 154 | walk_length = self.kwargs.get('walk_length', [32, 64, 128]) 155 | window_size = self.kwargs.get('window_size', [5, 10]) 156 | workers = self.kwargs.get('workers', psutil.cpu_count()) 157 | undirected = self.kwargs.get('undirected', False) 158 | for (p1, p2, p3, p4) in itertools.product( 159 | num_walks, representation_size, walk_length, window_size 160 | ): 161 | yield { 162 | 'number-walks': p1, 163 | 'representation-size': p2, 164 | 'walk-length': p3, 165 | 'window-size': p4, 166 | 'workers': workers, 167 | 'undirected': undirected 168 | } 169 | 170 | def return_metric_method(self, opt_metric): 171 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 172 | if opt_metric == 'accuracy': 173 | return accuracy_score 174 | elif opt_metric == 'precision': 175 | return precision_score 176 | elif opt_metric == 'recall': 177 | return recall_score 178 | elif opt_metric == 'f1': 179 | return f1_score 180 | else: 181 | raise NotImplementedError('unsupported metric {}'.format(opt_metric)) 182 | 183 | def load_model(self, fpath, **kwargs): 184 | pass 185 | 186 | def save_model(self, fpath, **kwargs): 187 | pass 188 | 189 | def fit(self, X, Y, **kwargs): 190 | pass 191 | 192 | def predict(self, X, **kwargs): 193 | pass 194 | 195 | 196 | class Debugger(object): 197 | """ 198 | Class for debugger print 199 | """ 200 | def __init__(self): 201 | pass 202 | 203 | @staticmethod 204 | def error_print(msg, debug=True): 205 | if debug: 206 | print('[error]' + msg) 207 | 208 | @staticmethod 209 | def warn_print(msg, debug=True): 210 | if debug: 211 | print('[warning]' + msg) 212 | 213 | @staticmethod 214 | def debug_print(msg, debug=True): 215 | if debug: 216 | print('[debug]' + msg + '\r', end='') 217 | sys.stdout.flush() 218 | 219 | @staticmethod 220 | def info_print(msg): 221 | print('[info]' + msg) 222 | 223 | @staticmethod 224 | def time_print(msg, begin, profiling=False): 225 | if profiling: 226 | assert isinstance(begin, type(time.time())), 'invalid begin time {}'.format(begin) 227 | print('[info]{}, elapsed for {:.2f}s'.format(msg, time.time() - begin)) 228 | 229 | 230 | class Queue: 231 | def __init__(self, max_size): 232 | self.queue = [] 233 | self.max_size = max_size 234 
| 235 | def enqueue(self, val): 236 | if self.size() == self.max_size: 237 | self.dequeue() 238 | self.queue.insert(0, val) 239 | 240 | def dequeue(self): 241 | if self.is_empty(): 242 | return None 243 | else: 244 | return self.queue.pop() 245 | 246 | def size(self): 247 | return len(self.queue) 248 | 249 | def is_empty(self): 250 | return self.size() == 0 251 | 252 | 253 | def convert_string(string, val, cvt_type='float'): 254 | """ 255 | Convert a string as given type. 256 | :param string: input string 257 | :param val: default return value if conversion fails 258 | :param cvt_type: conversion type 259 | :return: value with given type 260 | """ 261 | try: 262 | return eval(cvt_type)(string) 263 | except NameError as _: 264 | Debugger.warn_print('invalid convert type {}; use float() by default'.format(cvt_type)) 265 | return float(string) 266 | except ValueError as _: 267 | Debugger.warn_print('invalid convert value {}; return {} by default'.format(string, val)) 268 | return val 269 | 270 | 271 | def syscmd(cmd, encoding=''): 272 | """ 273 | Runs a command on the system, waits for the command to finish, and then 274 | returns the text output of the command. If the command produces no text 275 | output, the command's return code will be returned instead. 276 | 277 | :param cmd: command, str 278 | :param encoding: encoding method, str(utf8, unicode, etc) 279 | :return: return code or text output 280 | """ 281 | p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, 282 | stderr=STDOUT, close_fds=True) 283 | p.wait() 284 | output = p.stdout.read() 285 | if len(output) > 1: 286 | if encoding: 287 | return output.decode(encoding) 288 | else: 289 | return output 290 | return p.returncode 291 | -------------------------------------------------------------------------------- /time2graph/utils/mp_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | import itertools 4 | import dill 5 | import contextlib 6 | import math 7 | import multiprocessing as mp 8 | import numpy as np 9 | from .base_utils import Debugger 10 | """ 11 | paralleling utils. 
12 | modified on the version implemented by Lekui Zhou (luckiezhou@zju.edu.cn) 13 | """ 14 | 15 | 16 | NJOBS = mp.cpu_count() 17 | if NJOBS >= 20: 18 | NJOBS = 20 19 | 20 | __all__ = [ 21 | 'NJOBS', 22 | 'ParMap', 23 | 'parallel_monitor' 24 | ] 25 | 26 | 27 | class ParMap(object): 28 | def __init__(self, work, monitor=None, njobs=NJOBS, maxtasksperchild=100): 29 | self.work_func = work 30 | self.monitor_func = monitor 31 | self.__njobs = njobs 32 | self.__mtpc = maxtasksperchild 33 | 34 | self.__pool = None 35 | 36 | def close(self): 37 | if self.__pool is not None: 38 | self.__pool.close() 39 | self.__pool.join() 40 | self.__pool = None 41 | 42 | def __del__(self): 43 | self.close() 44 | 45 | @property 46 | def njobs(self): 47 | return self.__njobs 48 | 49 | @njobs.setter 50 | def njobs(self, n): 51 | self.__njobs = n 52 | self.close() 53 | 54 | def default_chunk(self, dlen): 55 | return int(math.ceil(float(dlen) / self.njobs)) 56 | 57 | def run(self, data, chunk=None, shuffle=False): 58 | if chunk is None: 59 | chunk = self.default_chunk(len(data)) 60 | 61 | if shuffle: 62 | data, order, invorder = shuffle_sample(data) 63 | else: 64 | invorder = None 65 | 66 | slices = slice_sample(data, chunk=chunk) 67 | res = self.run_slices(slices) 68 | 69 | if shuffle: 70 | res = apply_order(res, invorder) 71 | 72 | return res 73 | 74 | def run_slices(self, slices): 75 | mgr = mp.Manager() 76 | report_queue = mgr.Queue() 77 | if self.monitor_func is not None: 78 | monitor = mp.Process(target=self.monitor_func, args=(report_queue,)) 79 | monitor.start() 80 | else: 81 | monitor = None 82 | 83 | if self.njobs == 1: 84 | res = [] 85 | for slc in slices: 86 | res.append(self.work_func(None, slc, report_queue)) 87 | else: 88 | dill_work_func = dill.dumps(self.work_func) 89 | with contextlib.closing(mp.Pool(self.njobs, maxtasksperchild=self.__mtpc)) as pool: 90 | res = pool.map(func_wrapper, [[dill_work_func, slc, report_queue] for slc in slices]) 91 | res = list(itertools.chain.from_iterable(res)) 92 | 93 | report_queue.put(StopIteration()) 94 | if monitor is not None: 95 | monitor.join() 96 | 97 | return res 98 | 99 | 100 | def func_wrapper(args): 101 | func = dill.loads(args[0]) 102 | return func(mp.current_process().ident, *args[1:]) 103 | 104 | 105 | def apply_order(sample, order): 106 | return [sample[o] for o in order] 107 | 108 | 109 | def shuffle_sample(sample): 110 | order = np.random.permutation(np.arange(len(sample))) 111 | invorder = np.zeros((len(sample), ), dtype='int32') 112 | invorder[order] = np.arange(len(sample)) 113 | 114 | return apply_order(sample, order), order, invorder 115 | 116 | 117 | def slice_sample(sample, chunk=None, nslice=None): 118 | slices = [] 119 | if chunk is None: 120 | chunk = int(len(sample) / nslice) 121 | else: 122 | if nslice is not None: 123 | raise RuntimeError("chunk ({}) and slice ({}) should not be specified simultaneously".format(chunk, nslice)) 124 | 125 | curstart = 0 126 | while True: 127 | if curstart >= len(sample): 128 | break 129 | slices.append(sample[curstart:min(curstart + chunk, len(sample))]) 130 | curstart += chunk 131 | 132 | return slices 133 | 134 | 135 | def parallel_monitor(msg, size, debug): 136 | def monitor(queue): 137 | cnt = 0 138 | while True: 139 | obj = queue.get() 140 | if isinstance(obj, StopIteration): 141 | break 142 | if isinstance(obj, int): 143 | if obj != 0: 144 | cnt += obj 145 | else: 146 | cnt += 1 147 | else: 148 | cnt += 1 149 | Debugger.debug_print(msg='{} executed by {:.2f}%'.format(msg, float(cnt) / size * 100), 150 
| debug=debug) 151 | return monitor 152 | --------------------------------------------------------------------------------
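The `ParMap` / `parallel_monitor` pair defined in `time2graph/utils/mp_utils.py` above expects a work function with the signature `(pid, args, queue)`: it processes one slice of the input data, reports progress by putting `0` on the queue once per item, and returns a list of per-item results that `run()` reassembles in input order. The following toy sketch illustrates that protocol; the squaring task and the `njobs` value are invented purely for illustration and are not part of the repository.

```python
import numpy as np
from time2graph.utils.mp_utils import ParMap, parallel_monitor, NJOBS


def square_work(pid, args, queue):
    # `args` is one slice of the data passed to run(); put 0 once per item
    # so that parallel_monitor can report the progress percentage.
    ret = []
    for x in args:
        ret.append(x * x)
        queue.put(0)
    return ret


if __name__ == '__main__':
    data = list(np.arange(10))
    parmap = ParMap(
        work=square_work,
        monitor=parallel_monitor(msg='toy squaring', size=len(data), debug=True),
        njobs=min(4, NJOBS)
    )
    # slices are dispatched to a multiprocessing pool; results come back
    # flattened and in the same order as the input data
    print(parmap.run(data=data))
```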
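For reference, `__mat2edgelist` in `shapelet_utils.py` serializes each (row-normalized) transition matrix as a weighted edge list, one `src dst weight` triple per line, and writes a unit self-loop for any row with no outgoing mass so that every shapelet node appears in the graph consumed by `deepwalk --format weighted_edgelist`. A file in `cache_dir` would therefore look roughly like the snippet below; the specific node ids and weights are invented for illustration.

```text
0 3 0.41250
0 7 0.58750
1 1 1.00000
2 5 0.23310
2 9 0.76690
```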
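Taken together, the modules above form the Time2Graph pipeline: learn time-aware shapelets, build the shapelet evolution graph, embed it with the weighted DeepWalk fork, and map every series to a fixed-size feature vector. The sketch below only strings together the public functions shown above under illustrative assumptions: the toy data, segment sizes and hyper-parameter values are made up rather than the tuned settings used by `scripts/std_test.py`, and it requires the `deepwalk` command from the weighted_deepwalk fork to be on the PATH, since `graph_embedding` shells out to it.

```python
import os
import numpy as np
from time2graph.core.time_aware_shapelets import learn_time_aware_shapelets
from time2graph.core.shapelet_embedding import ShapeletEmbedding

# toy data (hypothetical): 60 univariate series of length 120,
# i.e. 24 segments of length 5, with binary labels
N, seg_length, num_segment = 60, 5, 24
X = np.random.randn(N, num_segment * seg_length, 1).astype(np.float32)
y = np.random.randint(0, 2, size=N)

# 1) learn K time-aware shapelets (pattern, local factor, global factor, loss)
#    from C greedily generated candidates
shapelets = learn_time_aware_shapelets(
    time_series_set=X, label=y, K=20, C=48,
    num_segment=num_segment, seg_length=seg_length, data_size=1,
    p=2, lr=1e-2, alpha=0.1, beta=0.05, num_batch=10,
    gpu_enable=False, measurement='gdtw')

# 2) build the shapelet evolution graph and run weighted DeepWalk on it;
#    edge lists and .embeddings files are written into cache_dir
cache_dir = './cache_demo'
os.makedirs(cache_dir, exist_ok=True)
sembeds = ShapeletEmbedding(
    seg_length=seg_length, tflag=True, multi_graph=False, cache_dir=cache_dir,
    percentile=10, tanh=False, debug=True, measurement='gdtw',
    mode='aggregate', global_flag=True, representation_size=64)
sembeds.fit(time_series_set=X, shapelets=shapelets, warp=2)

# 3) map every series to a fixed-size vector; in 'aggregate' mode the result
#    has shape (N, num_segment * representation_size), ready for a downstream
#    classifier such as XGBoost
features = sembeds.time_series_embedding(time_series_set=X, shapelets=shapelets, warp=2)
print(features.shape)
```

As the factory functions above show, `mode='aggregate'` weight-sums the embeddings of the nearest shapelets into one vector per segment, whereas `mode='concate'` concatenates up to `embed_number` (5) nearest-shapelet embeddings per segment, so the feature dimension grows by that factor.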