├── .gitignore
├── README.md
└── vectorization
    ├── __init__.py
    └── bow.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TimeSeriesVectorization
This toolbox provides time series vectorization methods that can give a better representation for classification, clustering, or other analysis.

## BoWSp
Bag of words (BoW) is a common technique in text mining for document representation. Since it also gives good results in computer vision, it can be applied to time series data as well. Subsequences are extracted from the raw series as local patterns for learning a codebook; each time series instance is then encoded against that codebook, which describes the different local patterns in the data. With the learned codebook, every original time series instance can be represented by a BoW histogram.

Requires:
+ Numpy == 1.8
+ SPArse Modeling Software (http://spams-devel.gforge.inria.fr/downloads.html)
--------------------------------------------------------------------------------
/vectorization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evan176/TimeSeriesVectorization/2ca552ebf3a5af34c8dc80f3bcc3d356e070a770/vectorization/__init__.py
--------------------------------------------------------------------------------
/vectorization/bow.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
import math
import numpy
import spams
import logging


logger = logging.getLogger(__name__)


class BoWSp():
    """BoWSp Class
    This class vectorizes time series with a bag-of-words (BoW) model.
    Bag of words is a common technique in text mining that works well for
    document representation, and we apply it here to time series data. The
    raw series are first preprocessed into subsequences. The second
    technique we use is sparse coding: the "spams" package provides very
    efficient dictionary learning routines. After converting the
    subsequences to tokens with the learned dictionary, the BoW model can
    be applied.
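
    A hypothetical usage sketch (toy values for illustration only; assumes
    numpy and spams are installed):

        bow = BoWSp(w_len=4, k=8)
        histograms = bow.fit_transform([[1, 2, 3, 0, 3, 2, 4, 5],
                                        [0, 3, 2, 1, 5, 3, 1]])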

    Attributes:
        w_len(int): length of the sliding window
        k(int): dictionary size in sparse coding
        lambda1(float): lambda coefficient in sparse coding,
            |X - D*a|_2^2 + lambda1 * |a|_1
            For more information, see the spams package:
            http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
        interval(int): sliding window interval
        batch(bool): online learning or batch learning for sparse coding
            For more information, see the spams package:
            http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
        iter1(int): number of learning iterations
            For more information, see the spams package:
            http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
    """

    def __init__(self, w_len=20, k=100, lambda1=None,
                 interval=1, batch=False, iter1=-5):
        self.w_len = w_len
        self.k = k
        self.interval = interval
        self.batch = batch
        self.iter1 = iter1
        self.D = None
        if lambda1 is None:
            self.lambda1 = 1.0 / math.sqrt(w_len)
        else:
            self.lambda1 = lambda1
        # Log object information
        log_msg = ("Initialized BoWSp object: window length=%d, dictionary "
                   "size=%d, lambda=%f, interval=%d" % (self.w_len, self.k,
                                                        self.lambda1,
                                                        self.interval))
        logger.info(log_msg)

    def fit(self, X):
        """Learn dictionary from data
        Args:
            X(list): Each item is a list which contains a time series instance.
                ex: [[1, 2, 3, 0, 3, 2, 4, 5],
                     [0, 3, 2, 1, 5, 3, 1],
                     [9, 0, 4, 1, 8, 4, 9, 3, 1, 2],
                     [8, 5, 2, 1, 8]]
        Returns:
            D(numpy 2d-array): dictionary learned from the given time series
                data; dimensions are w_len * k
        """
        segment_list = self.segment(X, self.w_len, self.interval)
        self.D = self.learn_D(segment_list, self.k, self.lambda1,
                              self.batch, self.iter1)
        return self.D

    def fit_transform(self, X):
        """Learn dictionary from data and represent the data with the BoW model
        Args:
            X(list): Each item is a list which contains a time series instance.
                ex: [[1, 2, 3, 0, 3, 2, 4, 5],
                     [0, 3, 2, 1, 5, 3, 1],
                     [9, 0, 4, 1, 8, 4, 9, 3, 1, 2],
                     [8, 5, 2, 1, 8]]
        Returns:
            bow_representation(numpy 2d-array): normalized BoW histogram for
                each instance; dimensions are len(X) * k
        """
        segment_list = self.segment(X, self.w_len, self.interval)
        self.D = self.learn_D(segment_list, self.k, self.lambda1,
                              self.batch, self.iter1)
        bow_representation = self.coding_series(segment_list, self.D,
                                                self.lambda1)
        return bow_representation

    def transform(self, X):
        """Represent data with the BoW model using the learned dictionary
        Args:
            X(list): Each item is a list which contains a time series instance.
                ex: [[1, 2, 3, 0, 3, 2, 4, 5],
                     [0, 3, 2, 1, 5, 3, 1],
                     [9, 0, 4, 1, 8, 4, 9, 3, 1, 2],
                     [8, 5, 2, 1, 8]]
        Returns:
            bow_representation(numpy 2d-array): normalized BoW histogram for
                each instance; dimensions are len(X) * k
        """
        segment_list = self.segment(X, self.w_len, self.interval)
        bow_representation = self.coding_series(segment_list, self.D,
                                                self.lambda1)
        return bow_representation

    @staticmethod
    def get_index(data_len, w_len, interval):
        """Retrieve window start positions for the given data length and window size
        Args:
            data_len(int): length of the time series instance
            w_len(int): length of the sliding window
            interval(int): sliding window interval (step size)
        Returns:
            stamp_index(list): start positions of the sliding windows in the
                given series
            len_index(int): the number of windows
        """
        # Initial timestamp index
        stamp_index = range(0, data_len - w_len, interval)
        # Get length of index
        len_index = len(stamp_index)
        # Log len_index for debug
        log_msg = "stamp index length: %d" % (len_index)
        logger.debug(log_msg)
        return stamp_index, len_index

    @staticmethod
    def segment(data, w_len=20, interval=1):
        """Segment series data into subsequences with a sliding window of the
        given length and interval
        Args:
            data(list): each item is a time series instance
            w_len(int): length of window
            interval(int): size of interval
        Returns:
            segment_list(list): each item is a w_len * m array whose columns
                are subsequences sliced from one original time series instance
        """
        segment_list = list()
        # Slice each series into windowed segments
        for i in range(len(data)):
            stamp_index, len_index = BoWSp.get_index(len(data[i]),
                                                     w_len, interval)
            temp = numpy.zeros([w_len, len_index])
            for count, j in enumerate(stamp_index):
                temp[:, count] = data[i][j:j + w_len]
            segment_list.append(temp)

        return segment_list

    @staticmethod
    def learn_D(segment_list, k, lambda1=None, batch=False, iter1=-5):
        """Learn dictionary from given series with input parameters
        Args:
            segment_list(list): each item is a w_len * m array whose columns
                are subsequences sliced from one original time series instance
            k(int): size of dictionary
            lambda1(float): lambda coefficient in sparse coding,
                |X - D*a|_2^2 + lambda1 * |a|_1
                For more information, see the spams package:
                http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
            batch(bool): online learning or batch learning for sparse coding
                For more information, see the spams package:
                http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
            iter1(int): number of learning iterations
                For more information, see the spams package:
                http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
        Returns:
            D(numpy 2d-array): learned dictionary
        """
        # Horizontally stack all segments into one w_len * N matrix
        temp = numpy.hstack(segment_list)
        if lambda1 is None:
            lambda1 = 1.0 / math.sqrt(temp.shape[0])
        # Log learning information
        log_msg = "learning dictionary with lambda: %f" % (lambda1)
        logger.info(log_msg)
        # Learn dictionary
        D = spams.trainDL(
            numpy.asfortranarray(temp),
            K=k, lambda1=lambda1, batch=batch,
            iter=iter1, posAlpha=True
        )
        return D

    @staticmethod
    def coding_series(segment_list, D, lambda1):
        """Represent given series with the learned dictionary
        Args:
            segment_list(list): each item is a w_len * m array whose columns
                are subsequences sliced from one original time series instance
            D(numpy 2d-array): learned dictionary
            lambda1(float): lambda coefficient in sparse coding,
                |X - D*a|_2^2 + lambda1 * |a|_1
                For more information, see the spams package:
                http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams005.html#sec15
        Returns:
            bow_data(numpy 2d-array): transformed data; each row is the
                normalized BoW histogram of one time series instance
        """
        k = D.shape[1]
        bow_data = numpy.zeros([len(segment_list), k])

        # BoW for data
        for index, item in enumerate(segment_list):
            # Log lasso information
            log_msg = "Solving lasso problem for series %d" % (index)
            logger.info(log_msg)
            code = spams.lasso(numpy.asfortranarray(item), D,
                               lambda1=lambda1, pos=True)
            code = numpy.sum(code.todense(), axis=1)
            bow_data[index:index + 1, :] += code.reshape([1, k])
            div = numpy.linalg.norm(bow_data[index, :])
            if div > 0:
                bow_data[index, :] = bow_data[index, :] / div
            # Log bow result for debug
            log_msg = "%d series: " % (index)
            logger.debug(log_msg + str(bow_data[index, :]))
        return bow_data


if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
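
A minimal sketch of the segmentation step described above (hypothetical toy values; assumes numpy and spams are installed, since bow.py imports spams at module level):

    from vectorization.bow import BoWSp

    # One toy series of length 8; with w_len=4 and interval=1, get_index
    # yields start positions range(0, 8 - 4, 1) == [0, 1, 2, 3].
    series = [[1, 2, 3, 0, 3, 2, 4, 5]]
    segments = BoWSp.segment(series, w_len=4, interval=1)
    print(segments[0].shape)   # (4, 4): one column per window
    print(segments[0][:, 0])   # first window: values 1, 2, 3, 0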