├── .gitignore ├── README.md ├── kNN_DTW.py ├── requirements.txt └── test_kNN_DTW.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .idea 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast-Parallel-DTW-kNN-Python 2 | 3 | A k-Nearest-Neighbour Search under the Dynamic Time Warping Metric is often 4 | reported in the literature to achieve the highest accuracies. 5 | 6 | However, the runtime costs are quite high, so an efficient implementation is key. 7 | 8 | I compared different setups and implementations that can be used from Python. 9 | This repository contains the best combination that I came up with. 10 | It is based on an enhanced DTW C implementation and the kNN algorithm from sklearn, which runs in parallel. 11 | 12 | It is only tested for python 2.7 so far. -------------------------------------------------------------------------------- /kNN_DTW.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Maximilian Christ (max.christ@me.com) 3 | 4 | """ 5 | Finally I found a fast dtw implementation in C with correct python bindings and not a hack with the ucr time series subsequence search. 6 | 7 | pip install git+https://github.com/lukauskas/mlpy-plus-dtw 8 | or 9 | pip install git+https://github.com/MaxBenChrist/mlpy-plus-dtw 10 | 11 | This is an improved version of the DTW metric implemented in the mlpy package by User Saulius Lukauskas. 12 | Unfortunately, it seems that the mlpy package is not actively developed anymore. 
13 | (the latest version 3.5.0 was released in 2012) 14 | 15 | """ 16 | 17 | import time 18 | import pandas as pd 19 | import numpy as np 20 | 21 | from itertools import product 22 | 23 | from mlpy.dtw import dtw_std 24 | from scipy.stats import randint 25 | 26 | from sklearn.neighbors import KNeighborsRegressor 27 | from sklearn.model_selection import RandomizedSearchCV 28 | from sklearn.pipeline import Pipeline 29 | 30 | EOTS = -9999 31 | # EOTS stands for "end of time series", which is our faked np.NaN. sklearn will not pass arrays with np.NaN because of 32 | # its check_array method, but those those np.NaNs are needed for variable sized time series. 33 | # So, for the usage of the sklearn nearest neighbour search we need a faked NaN 34 | 35 | 36 | def _finite_of(x): 37 | """ 38 | Removes all values from x that are not equal to EOTS 39 | 40 | :param x: the input 41 | :type x: iterable 42 | :return: the cleaned version of x 43 | :rtype: numpy.array 44 | 45 | """ 46 | x = np.asarray(x) 47 | return x[x != EOTS] 48 | 49 | def construct_kNN_Regressor(k, warping_penalty, constraint="slanted_band"): 50 | """ 51 | Constructs the kNN Regressor under a DTW metric 52 | """ 53 | dtw_metric = lambda x, y: dtw_std(x, y, 54 | dist_only=True, 55 | constraint=constraint, 56 | k=k, 57 | warping_penalty=warping_penalty) 58 | 59 | reg = KNeighborsRegressor(n_neighbours=5, 60 | metric=dtw_metric, 61 | n_jobs=1) 62 | 63 | return reg 64 | 65 | def construct_X_from_tsfresh_container(df, column_id="id", column_sort="sort", column_value="value", 66 | all_possible_timestamps=None): 67 | """ 68 | Constructs the feature matrix for the kNN Regressor under a DTW metric. The time series container should be in flat 69 | format 70 | 71 | You want to call this method differently for train and test set. However, it could be that for some time stamps, 72 | only readings are available in one of the sets. For this, we have the all_possible_timestamps iterable. 
73 | Just collect all possible time stamps for sensor recordings from both train and test set and pass it as this 74 | parameter. 75 | 76 | """ 77 | 78 | X = df.pivot(index=column_id, columns=column_sort, values=column_value) 79 | 80 | if all_possible_timestamps is not None: 81 | new_cols = list(set(all_possible_timestamps) - set(X.columns)) 82 | X = pd.concat([df, pd.DataFrame(columns=new_cols, index=X.index)], axis=1) 83 | 84 | X = X.fillna(EOTS) 85 | 86 | return X 87 | 88 | 89 | 90 | # todo: clean and refactor the following code 91 | 92 | # 93 | # def predict_kNNdtwReg(est, df, index, timestamps=None): 94 | # df = df.pivot(index="id", columns="sort", values="value") 95 | # 96 | # if timestamps is not None: 97 | # new_cols = list(set(timestamps) - set(df.columns)) 98 | # df = pd.concat([df, pd.DataFrame(columns=new_cols, index=df.index)], axis=1) 99 | # 100 | # df = df.fillna(EOTS) 101 | # 102 | # # make sure predictions are in right order 103 | # df = df.loc[index, :] 104 | # return est.predict(df.values) 105 | # 106 | # 107 | # def random_gridsearch_kNNdtwReg(df, y, n_iter=5, timestamps=None): 108 | # """ 109 | # df should be time series in tsfresh format 110 | # y the target vector 111 | # """ 112 | # 113 | # df = df.pivot(index="id", columns="sort", values="value") 114 | # 115 | # if timestamps is not None: 116 | # new_cols = list(set(timestamps) - set(df.columns)) 117 | # df = pd.concat([df, pd.DataFrame(columns=new_cols, index=df.index)], axis=1) 118 | # 119 | # df = df.fillna(EOTS) 120 | # 121 | # # specify parameters and distributions to sample from 122 | # param_dist = {"n_neighbors": randint(1, 10), 123 | # "weights": ["uniform", "distance"], 124 | # "metric_params": [{"k": k, "warping_penalty": wp} for k, wp in product([1, 3, 5, 10, 15, 20], 125 | # [0, .1, .25, .5, .75])] 126 | # } 127 | # 128 | # reg = KNeighborsRegressor(metric=fdtw) 129 | # random_search = RandomizedSearchCV(reg, 130 | # param_distributions=param_dist, 131 | # n_iter=n_iter, 132 | 
# verbose=2, 133 | #                                        n_jobs=30, 134 | #                                        error_score=9999) 135 | # 136 | #     start = time.time() 137 | #     random_search.fit(df.values, y.loc[df.index].values) 138 | #     end = time.time() 139 | # 140 | #     random_search.fitting_time = end-start 141 | # 142 | #     return random_search 143 | # 144 | # 145 | # def fit_dtw_pipe(df, y, timestamps=None): 146 | # 147 | #     df = df.pivot(index="id", columns="sort", values="value") 148 | #     if timestamps is not None: 149 | #         new_cols = list(set(timestamps) - set(df.columns)) 150 | #         df = pd.concat([df, pd.DataFrame(columns=new_cols, index=df.index)], axis=1) 151 | #     df = df.fillna(EOTS) 152 | # 153 | #     pipe = Pipeline([("kNN_dtw", KNeighborsRegressor(n_neighbors=3, 154 | #                                                      weights="distance", 155 | #                                                      metric=fdtw, 156 | #                                                      n_jobs=8, 157 | #                                                      metric_params={"k": 10, 158 | #                                                                     "warping_penalty": 0.1} 159 | #                                                      ))]) 160 | # 161 | #     start = time.time() 162 | #     pipe.fit(df.values, y.loc[df.index].values) 163 | #     end = time.time() 164 | # 165 | #     pipe.fitting_time = end - start 166 | # 167 | #     return pipe 168 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | pytest 4 | scikit-learn 5 | git+https://github.com/MaxBenChrist/mlpy-plus-dtw # you have to install the other packages before! 
-------------------------------------------------------------------------------- /test_kNN_DTW.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Maximilian Christ (max.christ@me.com) 3 | 4 | import Unittest 5 | from kNN_DTW import _finite_of, EOTS 6 | import numpy as np 7 | 8 | 9 | class Test_kNN_DTW(Unittest): 10 | 11 | def test__finite_of(self): 12 | x = np.random.normal(size=100) 13 | self.assertEqual(_finite_of(x), x) 14 | self.assertEqual(_finite_of([0, 0, EOTS]), np.array([0, 0])) 15 | self.assertEqual(_finite_of([EOTS, EOTS]), np.array([])) 16 | --------------------------------------------------------------------------------