├── .gitignore ├── README.md ├── kNN_DTW.py ├── requirements.txt └── test_kNN_DTW.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .idea 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast-Parallel-DTW-kNN-Python 2 | 3 | A k-Nearest-Neighbour Search under the Dynamic Time Warping Metric is often 4 | reported in the literature to achieve the highest accuracies. 5 | 6 | However, the runtime costs are quite high, so an efficient implementation is key. 7 | 8 | I compared different setups and implementations that can be used from Python. 9 | This repository contains the best combination that I came up with. 10 | It is based on an enhanced DTW C implementation and the kNN algorithm from sklearn, which runs in parallel. 11 | 12 | It is only tested for python 2.7 so far. -------------------------------------------------------------------------------- /kNN_DTW.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Maximilian Christ (max.christ@me.com) 3 | 4 | """ 5 | Finally I found a fast dtw implementation in C with correct python bindings and not a hack with the ucr time series subsequence search. 6 | 7 | pip install git+https://github.com/lukauskas/mlpy-plus-dtw 8 | or 9 | pip install git+https://github.com/MaxBenChrist/mlpy-plus-dtw 10 | 11 | This is an improved version of the DTW metric implemented in the mlpy package by User Saulius Lukauskas. 12 | Unfortunately, it seems that the mlpy package is not actively developed anymore. 
13 | (the latest version 3.5.0 was released in 2012) 14 | 15 | """ 16 | 17 | import time 18 | import pandas as pd 19 | import numpy as np 20 | 21 | from itertools import product 22 | 23 | from mlpy.dtw import dtw_std 24 | from scipy.stats import randint 25 | 26 | from sklearn.neighbors import KNeighborsRegressor 27 | from sklearn.model_selection import RandomizedSearchCV 28 | from sklearn.pipeline import Pipeline 29 | 30 | EOTS = -9999 31 | # EOTS stands for "end of time series", which is our faked np.NaN. sklearn will not pass arrays with np.NaN because of 32 | # its check_array method, but those those np.NaNs are needed for variable sized time series. 33 | # So, for the usage of the sklearn nearest neighbour search we need a faked NaN 34 | 35 | 36 | def _finite_of(x): 37 | """ 38 | Removes all values from x that are not equal to EOTS 39 | 40 | :param x: the input 41 | :type x: iterable 42 | :return: the cleaned version of x 43 | :rtype: numpy.array 44 | 45 | """ 46 | x = np.asarray(x) 47 | return x[x != EOTS] 48 | 49 | def construct_kNN_Regressor(k, warping_penalty, constraint="slanted_band"): 50 | """ 51 | Constructs the kNN Regressor under a DTW metric 52 | """ 53 | dtw_metric = lambda x, y: dtw_std(x, y, 54 | dist_only=True, 55 | constraint=constraint, 56 | k=k, 57 | warping_penalty=warping_penalty) 58 | 59 | reg = KNeighborsRegressor(n_neighbours=5, 60 | metric=dtw_metric, 61 | n_jobs=1) 62 | 63 | return reg 64 | 65 | def construct_X_from_tsfresh_container(df, column_id="id", column_sort="sort", column_value="value", 66 | all_possible_timestamps=None): 67 | """ 68 | Constructs the feature matrix for the kNN Regressor under a DTW metric. The time series container should be in flat 69 | format 70 | 71 | You want to call this method differently for train and test set. However, it could be that for some time stamps, 72 | only readings are available in one of the sets. For this, we have the all_possible_timestamps iterable. 
73 | Just collect all possible time stamps for sensor recordings from both train and test set and pass it as this 74 | parameter. 75 | 76 | """ 77 | 78 | X = df.pivot(index=column_id, columns=column_sort, values=column_value) 79 | 80 | if all_possible_timestamps is not None: 81 | new_cols = list(set(all_possible_timestamps) - set(X.columns)) 82 | X = pd.concat([df, pd.DataFrame(columns=new_cols, index=X.index)], axis=1) 83 | 84 | X = X.fillna(EOTS) 85 | 86 | return X 87 | 88 | 89 | 90 | # todo: clean and refactor the following code 91 | 92 | # 93 | # def predict_kNNdtwReg(est, df, index, timestamps=None): 94 | # df = df.pivot(index="id", columns="sort", values="value") 95 | # 96 | # if timestamps is not None: 97 | # new_cols = list(set(timestamps) - set(df.columns)) 98 | # df = pd.concat([df, pd.DataFrame(columns=new_cols, index=df.index)], axis=1) 99 | # 100 | # df = df.fillna(EOTS) 101 | # 102 | # # make sure predictions are in right order 103 | # df = df.loc[index, :] 104 | # return est.predict(df.values) 105 | # 106 | # 107 | # def random_gridsearch_kNNdtwReg(df, y, n_iter=5, timestamps=None): 108 | # """ 109 | # df should be time series in tsfresh format 110 | # y the target vector 111 | # """ 112 | # 113 | # df = df.pivot(index="id", columns="sort", values="value") 114 | # 115 | # if timestamps is not None: 116 | # new_cols = list(set(timestamps) - set(df.columns)) 117 | # df = pd.concat([df, pd.DataFrame(columns=new_cols, index=df.index)], axis=1) 118 | # 119 | # df = df.fillna(EOTS) 120 | # 121 | # # specify parameters and distributions to sample from 122 | # param_dist = {"n_neighbors": randint(1, 10), 123 | # "weights": ["uniform", "distance"], 124 | # "metric_params": [{"k": k, "warping_penalty": wp} for k, wp in product([1, 3, 5, 10, 15, 20], 125 | # [0, .1, .25, .5, .75])] 126 | # } 127 | # 128 | # reg = KNeighborsRegressor(metric=fdtw) 129 | # random_search = RandomizedSearchCV(reg, 130 | # param_distributions=param_dist, 131 | # n_iter=n_iter, 132 | 
# verbose=2, 133 | #                                        n_jobs=30, 134 | #                                        error_score=9999) 135 | # 136 | #     start = time.time() 137 | #     random_search.fit(df.values, y.loc[df.index].values) 138 | #     end = time.time() 139 | # 140 | #     random_search.fitting_time = end-start 141 | # 142 | #     return random_search 143 | # 144 | # 145 | # def fit_dtw_pipe(df, y, timestamps=None): 146 | # 147 | #     df = df.pivot(index="id", columns="sort", values="value") 148 | #     if timestamps is not None: 149 | #         new_cols = list(set(timestamps) - set(df.columns)) 150 | #         df = pd.concat([df, pd.DataFrame(columns=new_cols, index=df.index)], axis=1) 151 | #     df = df.fillna(EOTS) 152 | # 153 | #     pipe = Pipeline([("kNN_dtw", KNeighborsRegressor(n_neighbors=3, 154 | #                                                      weights="distance", 155 | #                                                      metric=fdtw, 156 | #                                                      n_jobs=8, 157 | #                                                      metric_params={"k": 10, 158 | #                                                                     "warping_penalty": 0.1} 159 | #                                                      ))]) 160 | # 161 | #     start = time.time() 162 | #     pipe.fit(df.values, y.loc[df.index].values) 163 | #     end = time.time() 164 | # 165 | #     pipe.fitting_time = end - start 166 | # 167 | #     return pipe 168 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | pytest 4 | scikit-learn 5 | git+https://github.com/MaxBenChrist/mlpy-plus-dtw # you have to install the other packages before! 
-------------------------------------------------------------------------------- /test_kNN_DTW.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Maximilian Christ (max.christ@me.com) 3 | 4 | import Unittest 5 | from kNN_DTW import _finite_of, EOTS 6 | import numpy as np 7 | 8 | 9 | class Test_kNN_DTW(Unittest): 10 | 11 | def test__finite_of(self): 12 | x = np.random.normal(size=100) 13 | self.assertEqual(_finite_of(x), x) 14 | self.assertEqual(_finite_of([0, 0, EOTS]), np.array([0, 0])) 15 | self.assertEqual(_finite_of([EOTS, EOTS]), np.array([])) 16 | --------------------------------------------------------------------------------