├── .gitignore
├── README.md
└── vectorization
    ├── __init__.py
    └── bow.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TimeSeriesVectorization
This toolbox provides time series vectorization methods that can give a better representation for classification, clustering, or other analysis.

## BoWSp
Bag of words (BoW) is a common technique in text mining for document representation. Since it also gives good results in computer vision, it can be applied to time series data as well. Subsequences are extracted from the raw series as local patterns for learning a codebook; each time series instance is then encoded against that codebook, which describes the different local patterns in the data. With the learned codebook, every original time series instance can be represented by a BoW histogram.

Requires:
+ Numpy == 1.8
+ SPArse Modeling Software (http://spams-devel.gforge.inria.fr/downloads.html)
--------------------------------------------------------------------------------
/vectorization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evan176/TimeSeriesVectorization/2ca552ebf3a5af34c8dc80f3bcc3d356e070a770/vectorization/__init__.py
--------------------------------------------------------------------------------
/vectorization/bow.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
import math
import numpy
import spams
import logging


logger = logging.getLogger(__name__)


class BoWSp():
    """BoWSp Class
    This class vectorizes time series with a bag-of-words (BoW) model.
    Bag of words is a common technique in text mining that works well for
    document representation, and we apply it here to time series data. The
    raw series are first preprocessed into subsequences. The second
    technique we use is sparse coding: the "spams" package provides very
    efficient dictionary learning routines. After converting the
    subsequences to tokens with the learned dictionary, the BoW model can
    be applied.
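
    A hypothetical usage sketch (toy values for illustration only; assumes
    numpy and spams are installed):

        bow = BoWSp(w_len=4, k=8)
        histograms = bow.fit_transform([[1, 2, 3, 0, 3, 2, 4, 5],
                                        [0, 3, 2, 1, 5, 3, 1]])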

    Attributes:
        w_len(int): length of the sliding window
        k(int): dictionary size in sparse coding
        lambda1(float): lambda coefficient in sparse coding,
            |X - D*a|_2^2 + lambda1 * |a|_1
            For more information, see the spams package:
            http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
        interval(int): sliding window interval
        batch(bool): online learning or batch learning for sparse coding
            For more information, see the spams package:
            http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
        iter1(int): number of learning iterations
            For more information, see the spams package:
            http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
    """

    def __init__(self, w_len=20, k=100, lambda1=None,
                 interval=1, batch=False, iter1=-5):
        self.w_len = w_len
        self.k = k
        self.interval = interval
        self.batch = batch
        self.iter1 = iter1
        self.D = None
        if lambda1 is None:
            self.lambda1 = 1.0 / math.sqrt(w_len)
        else:
            self.lambda1 = lambda1
        # Log object information
        log_msg = ("Initialized BoWSp object: window length=%d, dictionary "
                   "size=%d, lambda=%f, interval=%d" % (self.w_len, self.k,
                                                        self.lambda1,
                                                        self.interval))
        logger.info(log_msg)

    def fit(self, X):
        """Learn dictionary from data
        Args:
            X(list): Each item is a list which contains a time series instance.
                ex: [[1, 2, 3, 0, 3, 2, 4, 5],
                     [0, 3, 2, 1, 5, 3, 1],
                     [9, 0, 4, 1, 8, 4, 9, 3, 1, 2],
                     [8, 5, 2, 1, 8]]
        Returns:
            D(numpy 2d-array): dictionary learned from the given time series
                data; dimensions are w_len * k
        """
        segment_list = self.segment(X, self.w_len, self.interval)
        self.D = self.learn_D(segment_list, self.k, self.lambda1,
                              self.batch, self.iter1)
        return self.D

    def fit_transform(self, X):
        """Learn dictionary from data and represent the data with the BoW model
        Args:
            X(list): Each item is a list which contains a time series instance.
                ex: [[1, 2, 3, 0, 3, 2, 4, 5],
                     [0, 3, 2, 1, 5, 3, 1],
                     [9, 0, 4, 1, 8, 4, 9, 3, 1, 2],
                     [8, 5, 2, 1, 8]]
        Returns:
            bow_representation(numpy 2d-array): normalized BoW histogram for
                each instance; dimensions are len(X) * k
        """
        segment_list = self.segment(X, self.w_len, self.interval)
        self.D = self.learn_D(segment_list, self.k, self.lambda1,
                              self.batch, self.iter1)
        bow_representation = self.coding_series(segment_list, self.D,
                                                self.lambda1)
        return bow_representation

    def transform(self, X):
        """Represent data with the BoW model using the learned dictionary
        Args:
            X(list): Each item is a list which contains a time series instance.
                ex: [[1, 2, 3, 0, 3, 2, 4, 5],
                     [0, 3, 2, 1, 5, 3, 1],
                     [9, 0, 4, 1, 8, 4, 9, 3, 1, 2],
                     [8, 5, 2, 1, 8]]
        Returns:
            bow_representation(numpy 2d-array): normalized BoW histogram for
                each instance; dimensions are len(X) * k
        """
        segment_list = self.segment(X, self.w_len, self.interval)
        bow_representation = self.coding_series(segment_list, self.D,
                                                self.lambda1)
        return bow_representation

    @staticmethod
    def get_index(data_len, w_len, interval):
        """Retrieve window start positions for the given data length and window size
        Args:
            data_len(int): length of the time series instance
            w_len(int): length of the sliding window
            interval(int): sliding window interval (step size)
        Returns:
            stamp_index(list): start positions of the sliding windows in the
                given series
            len_index(int): the number of windows
        """
        # Initial timestamp index
        stamp_index = range(0, data_len - w_len, interval)
        # Get length of index
        len_index = len(stamp_index)
        # Log len_index for debug
        log_msg = "stamp index length: %d" % (len_index)
        logger.debug(log_msg)
        return stamp_index, len_index

    @staticmethod
    def segment(data, w_len=20, interval=1):
        """Segment series data into subsequences with a sliding window of the
        given length and interval
        Args:
            data(list): each item is a time series instance
            w_len(int): length of window
            interval(int): size of interval
        Returns:
            segment_list(list): each item is a w_len * m array whose columns
                are subsequences sliced from one original time series instance
        """
        segment_list = list()
        # Slice each series into windowed segments
        for i in range(len(data)):
            stamp_index, len_index = BoWSp.get_index(len(data[i]),
                                                     w_len, interval)
            temp = numpy.zeros([w_len, len_index])
            for count, j in enumerate(stamp_index):
                temp[:, count] = data[i][j:j + w_len]
            segment_list.append(temp)

        return segment_list

    @staticmethod
    def learn_D(segment_list, k, lambda1=None, batch=False, iter1=-5):
        """Learn dictionary from given series with input parameters
        Args:
            segment_list(list): each item is a w_len * m array whose columns
                are subsequences sliced from one original time series instance
            k(int): size of dictionary
            lambda1(float): lambda coefficient in sparse coding,
                |X - D*a|_2^2 + lambda1 * |a|_1
                For more information, see the spams package:
                http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
            batch(bool): online learning or batch learning for sparse coding
                For more information, see the spams package:
                http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
            iter1(int): number of learning iterations
                For more information, see the spams package:
                http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams004.html#sec5
        Returns:
            D(numpy 2d-array): learned dictionary
        """
        # Horizontally stack all segments into one w_len * N matrix
        temp = numpy.hstack(segment_list)
        if lambda1 is None:
            lambda1 = 1.0 / math.sqrt(temp.shape[0])
        # Log learning information
        log_msg = "learning dictionary with lambda: %f" % (lambda1)
        logger.info(log_msg)
        # Learn dictionary
        D = spams.trainDL(
            numpy.asfortranarray(temp),
            K=k, lambda1=lambda1, batch=batch,
            iter=iter1, posAlpha=True
        )
        return D

    @staticmethod
    def coding_series(segment_list, D, lambda1):
        """Represent given series with the learned dictionary
        Args:
            segment_list(list): each item is a w_len * m array whose columns
                are subsequences sliced from one original time series instance
            D(numpy 2d-array): learned dictionary
            lambda1(float): lambda coefficient in sparse coding,
                |X - D*a|_2^2 + lambda1 * |a|_1
                For more information, see the spams package:
                http://spams-devel.gforge.inria.fr/doc-python/html/doc_spams005.html#sec15
        Returns:
            bow_data(numpy 2d-array): transformed data; each row is the
                normalized BoW histogram of one time series instance
        """
        k = D.shape[1]
        bow_data = numpy.zeros([len(segment_list), k])

        # BoW for data
        for index, item in enumerate(segment_list):
            # Log lasso information
            log_msg = "Solving lasso problem for series %d" % (index)
            logger.info(log_msg)
            code = spams.lasso(numpy.asfortranarray(item), D,
                               lambda1=lambda1, pos=True)
            code = numpy.sum(code.todense(), axis=1)
            bow_data[index:index + 1, :] += code.reshape([1, k])
            div = numpy.linalg.norm(bow_data[index, :])
            if div > 0:
                bow_data[index, :] = bow_data[index, :] / div
            # Log bow result for debug
            log_msg = "%d series: " % (index)
            logger.debug(log_msg + str(bow_data[index, :]))
        return bow_data


if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
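
A minimal sketch of the segmentation step described above (hypothetical toy values; assumes numpy and spams are installed, since bow.py imports spams at module level):

    from vectorization.bow import BoWSp

    # One toy series of length 8; with w_len=4 and interval=1, get_index
    # yields start positions range(0, 8 - 4, 1) == [0, 1, 2, 3].
    series = [[1, 2, 3, 0, 3, 2, 4, 5]]
    segments = BoWSp.segment(series, w_len=4, interval=1)
    print(segments[0].shape)   # (4, 4): one column per window
    print(segments[0][:, 0])   # first window: values 1, 2, 3, 0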