├── .gitignore ├── AlgorithmsComparator.py ├── data_management ├── DataLoader.py ├── StreamGenerator.py └── __init__.py ├── demo.py ├── drift_detection_methods ├── __init__.py └── spc.py ├── ensemble_methods ├── DWM.py ├── SEA.py ├── __init__.py ├── ddd.py └── online_bagging.py ├── offline_methods ├── OfflineAlgorithmsWrapper.py └── __init__.py └── training_windows_methods ├── AdaptiveSVC.py ├── __init__.py └── test_AdaptiveSVC.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Bibliography of the project 2 | bibliography/ 3 | 4 | # Datasets 5 | data/ 6 | 7 | # Figures 8 | figures/ 9 | 10 | # Jupyter notebooks 11 | notebooks/ 12 | .ipynb_checkpoints/ 13 | *.ipynb 14 | 15 | # PyCharm directory 16 | .idea/ 17 | 18 | # Pycache 19 | __pycache__/* 20 | *.pyc 21 | -------------------------------------------------------------------------------- /AlgorithmsComparator.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | class AlgorithmsComparator: 10 | def __init__(self, algorithms, stream_generator): 11 | """ Constructor of AlgorithmsComparator 12 | 13 | :param algorithms: iterable of tuples (algorithm_name, algorithm) 14 | :param stream_generator: instance of StreamGenerator 15 | """ 16 | 17 | self.algorithms = algorithms 18 | self.stream_generator = stream_generator 19 | 20 | self.predictions = dict() 21 | self.accuracies = defaultdict(list) 22 | self.precisions = defaultdict(list) 23 | self.recalls = defaultdict(list) 24 | self.f1_scores = defaultdict(list) 25 | self.X = None 26 | self.y = None 27 | 28 | self.time_to_update = defaultdict(list) 29 | self.time_to_predict = defaultdict(list) 30 | 31 | def _set_batch(self, X, y): 32 | """ Set X and y for current batch""" 33 | self.X = X 34 | self.y = y 35 | 36 | def _update_algorithms(self): 37 | """ Update algorithms with self.X and self.y """ 38 | for algorithm_name, algorithm in self.algorithms: 39 | print("\t\tAlgorithm {} ... ".format(algorithm_name), end="", flush=True) 40 | start_timer = time.clock() 41 | algorithm.update(self.X, self.y) 42 | end_timer = time.clock() 43 | time_to_update = end_timer - start_timer 44 | self.time_to_update[algorithm_name].append(time_to_update) 45 | print("OK. Time to update on this batch: {0:.3f} seconds".format(time_to_update)) 46 | 47 | def _predict_algorithms(self): 48 | """ Make the predictions for each algorithm on self.X""" 49 | for algorithm_name, algorithm in self.algorithms: 50 | print("\t\tAlgorithm {} ... ".format(algorithm_name), end="", flush=True) 51 | start_timer = time.clock() 52 | self.predictions[algorithm_name] = algorithm.predict(self.X) 53 | end_timer = time.clock() 54 | time_to_predict = end_timer - start_timer 55 | self.time_to_predict[algorithm_name].append(time_to_predict) 56 | print("OK. 
Time to predict on this batch: {0:.3f} seconds".format(time_to_predict)) 57 | 58 | def _evaluate_algorithms(self): 59 | """ Evaluate the performance of the algorithms on current batch""" 60 | for algorithm_name, algorithm in self.algorithms: 61 | # compute metrics 62 | accuracy = accuracy_score(self.y, self.predictions[algorithm_name]) 63 | precision = precision_score(self.y, self.predictions[algorithm_name]) 64 | recall = recall_score(self.y, self.predictions[algorithm_name]) 65 | f1 = f1_score(self.y, self.predictions[algorithm_name]) 66 | 67 | # add scores to dictionaries 68 | self.accuracies[algorithm_name].append(accuracy) 69 | self.precisions[algorithm_name].append(precision) 70 | self.recalls[algorithm_name].append(recall) 71 | self.f1_scores[algorithm_name].append(f1) 72 | 73 | def _plot(self, show_plot): 74 | """ Create the different plots """ 75 | # create 2*2 subplots 76 | fig, ax = plt.subplots(2, 2, figsize=(15, 10)) 77 | accuracy_fig = ax[0, 0] 78 | precision_fig = ax[0, 1] 79 | recall_fig = ax[1, 0] 80 | f1_fig = ax[1, 1] 81 | 82 | for algorithm_name, algorithm in self.algorithms: 83 | accuracy_fig.plot(self.accuracies[algorithm_name], label=algorithm_name) 84 | precision_fig.plot(self.precisions[algorithm_name], label=algorithm_name) 85 | recall_fig.plot(self.recalls[algorithm_name], label=algorithm_name) 86 | f1_fig.plot(self.f1_scores[algorithm_name], label=algorithm_name) 87 | 88 | # set title 89 | accuracy_fig.set_title("Accuracies over time") 90 | precision_fig.set_title("Precisions over time") 91 | recall_fig.set_title("Recalls over time") 92 | f1_fig.set_title("F1 scores over time") 93 | 94 | # locate legend 95 | accuracy_fig.legend(loc="best") 96 | precision_fig.legend(loc="best") 97 | recall_fig.legend(loc="best") 98 | f1_fig.legend(loc="best") 99 | 100 | # set figures' limits 101 | # accuracy_fig.set_ylim(0.7, 1.0) 102 | # precision_fig.set_ylim(0.7, 1.0) 103 | # recall_fig.set_ylim(0.7, 1.0) 104 | # f1_fig.set_ylim(0.7, 1.0) 105 | 106 | # set x-axis labels 107 | accuracy_fig.set_xlabel("Batch number") 108 | precision_fig.set_xlabel("Batch number") 109 | recall_fig.set_xlabel("Batch number") 110 | f1_fig.set_xlabel("Batch number") 111 | 112 | # save figure 113 | plt.savefig("figures/plots_{}.png".format(time.strftime("%Y%m%d_%H%M%S")), format="png") 114 | 115 | # show plot if needed 116 | if show_plot: 117 | plt.show() 118 | 119 | def plot_comparison(self, batch_size, stream_length=1e8, show_plot=True): 120 | """ Main method of AlgorithmsComparator: Simulate data stream and plot the performances of each algorithm""" 121 | print("\nLet is begin plot_comparison", end="\n" * 2) 122 | # first training on historical data 123 | X_train, y_train = self.stream_generator.get_historical_data() 124 | print("Historical Data") 125 | self._set_batch(X_train, y_train) 126 | print("\tUpdate on historical data") 127 | self._update_algorithms() 128 | 129 | # simulate data streaming 130 | for batch_nb, (X, y) in enumerate( 131 | self.stream_generator.generate(batch_size=batch_size, stream_length=stream_length)): 132 | print("Batch #{}".format(batch_nb)) 133 | # set current batch's X and y 134 | self._set_batch(X, y) 135 | 136 | # predict current batch, evaluate the performances and update the algorithms 137 | print("\tPrediction #{}".format(batch_nb)) 138 | self._predict_algorithms() 139 | print("\tEvaluation #{}".format(batch_nb)) 140 | self._evaluate_algorithms() 141 | print("\tUpdate #{}".format(batch_nb)) 142 | self._update_algorithms() 143 | 144 | print("Mean time to update") 
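        # The loop above follows a prequential (test-then-train) protocol: each
        # batch is first predicted and scored, and only then fed to every
        # algorithm's update(). The figures printed below are per-batch times
        # averaged over the whole run.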
145 | for algorithm_name, _ in self.algorithms: 146 | print( 147 | "\t{0}: {1:.3f} seconds".format(algorithm_name, np.mean(np.array(self.time_to_update[algorithm_name])))) 148 | 149 | print("Mean time to predict") 150 | for algorithm_name, _ in self.algorithms: 151 | print("\t{0}: {1:.3f} seconds".format(algorithm_name, 152 | np.mean(np.array(self.time_to_predict[algorithm_name])))) 153 | 154 | # make the plots 155 | self._plot(show_plot) 156 | -------------------------------------------------------------------------------- /data_management/DataLoader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 8 | 9 | HEADER_NAMES = { 10 | 'SEA': [ 11 | 'attribute_1', 12 | 'attribute_2', 13 | 'attribute_3', 14 | 'label' 15 | ], 16 | 'KDD': [ 17 | 'duration', 18 | 'protocol_type', 19 | 'service', 20 | 'flag', 21 | 'src_bytes', 22 | 'dst_bytes', 23 | 'land', 24 | 'wrong_fragment', 25 | 'urgent', 26 | 'hot', 27 | 'num_failed_logins', 28 | 'logged_in', 29 | 'num_compromised', 30 | 'root_shell', 31 | 'su_attempted', 32 | 'num_root', 33 | 'num_file_creations', 34 | 'num_shells', 35 | 'num_access_files', 36 | 'num_outbound_cmds', 37 | 'is_host_login', 38 | 'is_guest_login', 39 | 'count', 40 | 'srv_count', 41 | 'serror_rate', 42 | 'srv_serror_rate', 43 | 'rerror_rate', 44 | 'srv_rerror_rate', 45 | 'same_srv_rate', 46 | 'diff_srv_rate', 47 | 'srv_diff_host_rate', 48 | 'dst_host_count', 49 | 'dst_host_srv_count', 50 | 'dst_host_same_srv_rate', 51 | 'dst_host_diff_srv_rate', 52 | 'dst_host_same_src_port_rate', 53 | 'dst_host_srv_diff_host_rate', 54 | 'dst_host_serror_rate', 55 | 'dst_host_srv_serror_rate', 56 | 'dst_host_rerror_rate', 57 | 'dst_host_srv_rerror_rate', 58 | 'label' 59 | ], 60 | } 61 | 62 | 63 | class DataLoader: 64 | def __init__(self, data_path, percentage_historical_data=0.2): 65 | self.data_path = data_path 66 | self.percentage_historical_data = percentage_historical_data 67 | self.X = None 68 | self.y = None 69 | self.X_historical = None 70 | self.y_historical = None 71 | self.list_classes = None 72 | 73 | def return_data(self): 74 | """ 75 | The data which is used for the streaming part emulation. 76 | :return: Tuple X and y 77 | """ 78 | return self.X, self.y 79 | 80 | def return_historical_data(self): 81 | """ 82 | The historical used for training the model before going online. 83 | :return: 84 | """ 85 | return self.X_historical, self.y_historical 86 | 87 | def split_data(self): 88 | """ 89 | Split the dataset based on the percentage given in argument (percentage_historical_data) 90 | """ 91 | number_histocal_data = int(self.percentage_historical_data * len(self.X)) 92 | self.X_historical = self.X[:number_histocal_data] 93 | self.y_historical = self.y[:number_histocal_data] 94 | self.X = self.X[number_histocal_data + 1:] 95 | self.y = self.y[number_histocal_data + 1:] 96 | 97 | def normalization(self): 98 | """ 99 | Normalized the data based on the historical data. Since we study concept drift we prefer to use a MinMax 100 | normalisation. 
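        The scaler is fit on the historical split only and then applied with the
        same parameters to the streaming split, so no information from later
        (possibly drifted) batches leaks into the offline training data.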
101 | """ 102 | mms = MinMaxScaler() 103 | self.X_historical = mms.fit_transform(self.X_historical) 104 | self.X = mms.transform(self.X) 105 | 106 | def save_data(self, path): 107 | if not os.path.exists(path): 108 | with open(self.data_path, 'wb') as data_file: 109 | data = {'X': self.X, 'y': self.y, 'X_historical': self.X_historical, 'y_historical': self.y_historical} 110 | pickle.dump(data, data_file, protocol=pickle.HIGHEST_PROTOCOL) 111 | 112 | def load_from_pickle(self): 113 | with open(self.data_path, 'rb') as data_file: 114 | data = pickle.load(data_file) 115 | self.X = data['X'] 116 | self.y = data['y'] 117 | self.X_historical = data['X_historical'] 118 | self.y_historical = data['y_historical'] 119 | 120 | def get_classes(self): 121 | return self.list_classes 122 | 123 | 124 | class SEALoader(DataLoader): 125 | def __init__(self, sea_data_path, use_pickle_for_loading=False, percentage_historical_data=0.2): 126 | DataLoader.__init__(self, sea_data_path, percentage_historical_data=percentage_historical_data) 127 | if use_pickle_for_loading: 128 | self.load_from_pickle() 129 | else: 130 | sea_df = pd.read_csv(self.data_path, header=None, names=HEADER_NAMES['SEA']) 131 | sea_data = sea_df.values 132 | self.X = sea_data[:, 1:3] 133 | self.y = sea_data[:, -1] 134 | self.list_classes = np.unique(self.y) 135 | DataLoader.split_data(self) 136 | DataLoader.normalization(self) 137 | # Normalization 138 | mms = MinMaxScaler() 139 | self.X = mms.fit_transform(self.X) 140 | 141 | 142 | class KDDCupLoader(DataLoader): 143 | """ 144 | This data set was used in KDD Cup 1999 Competition (Frank and Asuncion, 2010). The full dataset has about five 145 | million connection records, this is a set with only 10 % of the size. The original task has 24 training attack 146 | types. The original labels of attack types are changed to label abnormal in our experiments and we keep the label 147 | normal for normal connection. This way we simplify the set to two class problem. 148 | """ 149 | def __init__(self, kdd_data_path, use_pickle_for_loading=False, percentage_historical_data=0.2, dummies=True): 150 | ''' 151 | 152 | :param kdd_data_path: 153 | :param use_pickle_for_loading: You have registered a pickle file 154 | :param percentage_historical_data: Percentage of data to use for the historical training. 155 | :param dummies: If true convert categorical variable into dummy/indicator variables (one-hot encoded). 156 | Use dummies equal false when your learning algorithm is DecisionTree. 
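        For instance, KDDCupLoader('data/kddcup.data_10_percent', dummies=False)
        keeps the categorical columns label-encoded rather than one-hot encoded,
        which suits tree-based learners.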
157 | :return: 158 | ''' 159 | DataLoader.__init__(self, kdd_data_path, percentage_historical_data=percentage_historical_data) 160 | if use_pickle_for_loading: 161 | self.load_from_pickle() 162 | else: # TODO shorten the following lines of code 163 | kdd_df = pd.read_csv( 164 | self.data_path, 165 | index_col=False, 166 | delimiter=',', 167 | header=None, 168 | names=HEADER_NAMES['KDD'] 169 | ) 170 | # TODO (minor) : Do not load these 2 columns at first 171 | useless_features = ["num_outbound_cmds", "is_host_login"] 172 | kdd_df = kdd_df.drop(useless_features, axis=1) 173 | 174 | # Handle symbolic data 175 | symbolic = [ 176 | "protocol_type", 177 | "service", 178 | "flag", 179 | "label" 180 | ] 181 | 182 | self.symbolic_df = kdd_df[symbolic] 183 | if dummies: 184 | symbolic_df_without_label = self.symbolic_df[self.symbolic_df.columns.difference(['label'])] 185 | dummies_df = pd.get_dummies(symbolic_df_without_label) 186 | non_categorical = kdd_df[kdd_df.columns.difference(symbolic)].values 187 | # Create X 188 | self.X = np.concatenate((non_categorical, dummies_df.values), axis=1) 189 | # Create y 190 | label = self.symbolic_df['label'].values 191 | self.y = LabelEncoder().fit_transform(label) 192 | self.list_classes = np.unique(self.y) 193 | DataLoader.split_data(self) 194 | DataLoader.normalization(self) 195 | else: 196 | self.__encode_symbolic_df() 197 | kdd_df[symbolic] = self.symbolic_df 198 | self.X = kdd_df[kdd_df.columns.difference(['label'])].values 199 | self.y = kdd_df['label'].values 200 | self.list_classes = np.unique(self.y) 201 | DataLoader.split_data(self) 202 | 203 | def __encode_symbolic_df(self): 204 | self.symbolic_encoder = defaultdict(LabelEncoder) 205 | # Encode the symbolic variables 206 | self.symbolic_df = self.symbolic_df.apply(lambda x: self.symbolic_encoder[x.name].fit_transform(x)) 207 | 208 | def inverse_encode_symbolic_df(self): 209 | self.symbolic_df.apply(lambda x: self.symbolic_encoder[x.name].inverse_transform(x)) 210 | 211 | 212 | class UsenetLoader(DataLoader): 213 | ''' 214 | Text dataset, inspired by Katakis et al. (2010), is a simulation of news filtering with a concept drift related to 215 | the change of interest of a user over time. For this purpose we use the data from 20 Newsgroups (Rennie, 2008) and 216 | handle it as follows. There are six topics chosen and the simulated user in each concept is subscribed to mailing 217 | list of four of them being interested only in two. Over time the virtual user decides to unsubscribe from those 218 | groups that he was not interested in and subscribe for two new ones that he becomes interested in. The previously 219 | interesting topics become out of his main interest. The Table 1 summarizes the concepts. Note that the topics of 220 | interest are repeated to simulate recurring concepts. The original dataset is divided into train and test. Data from 221 | train appears in the first three concepts whereas data from test is in the last three (recurring) concepts. 222 | The data is preprocessed with tm (Feinerer, 2010) package for R keeping only attributes (words) longer than three 223 | letters and with minimal document frequency greater than three. Moreover, from the remaining only those that are 224 | informative are kept (entropy > 75 x 10-5 ). Attribute values are binary indicating the presence or absence of the 225 | respective word. At the end the set has 659 attributes and 5,931 examples. 
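    When the CSV is read, the textual attribute values are mapped to numbers
    (yes/t -> 1, no/f -> 0; the stray 'tt' value present in the file is treated
    as 1) before the historical/stream split is made.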
226 | ''' 227 | 228 | def __init__(self, sea_data_path, use_pickle_for_loading=False, percentage_historical_data=0.2): 229 | DataLoader.__init__(self, sea_data_path, percentage_historical_data=percentage_historical_data) 230 | if use_pickle_for_loading: 231 | self.load_from_pickle() 232 | else: 233 | usenet_df = pd.read_csv(self.data_path, header=None) 234 | d = {'no': 0., 'yes': 1., 't': 1., 'f': 0., 'tt': 1} # tt = error in the df 235 | usenet_data = usenet_df.replace(d).values 236 | self.X = usenet_data[:, :-1] 237 | self.y = usenet_data[:, -1] 238 | self.list_classes = np.unique(self.y) 239 | DataLoader.split_data(self) 240 | 241 | -------------------------------------------------------------------------------- /data_management/StreamGenerator.py: -------------------------------------------------------------------------------- 1 | class StreamGenerator: 2 | """ 3 | Emulate a stream of data for online learning algorithm 4 | """ 5 | def __init__(self, data_loader): 6 | """ 7 | Constructor of the StreamGenerator 8 | :param data_loader: Loader which inherits DataLoader 9 | """ 10 | self.data_loader = data_loader 11 | 12 | def get_historical_data(self): 13 | """ 14 | :return: A tuple X_historical_data and y_historical_data 15 | """ 16 | return self.data_loader.return_historical_data() 17 | 18 | def generate(self, stream_length=1e8, batch_size=1): 19 | """ 20 | Generator of streaming data 21 | :param stream_length: How many example do you want to see 22 | :param batch_size: batch size is one by default you can fixed it or randomized it. 23 | :return: A tuple X,y 24 | """ 25 | X, y = self.data_loader.return_data() 26 | X_length = len(X) 27 | if stream_length > X_length: 28 | stream_length = X_length 29 | 30 | for i in range(0, stream_length, batch_size): 31 | yield X[i:i + batch_size], y[i:i + batch_size] 32 | -------------------------------------------------------------------------------- /data_management/__init__.py: -------------------------------------------------------------------------------- 1 | from .DataLoader import SEALoader, KDDCupLoader, UsenetLoader 2 | from .StreamGenerator import StreamGenerator 3 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression, SGDClassifier 2 | from sklearn.svm import SVC 3 | from sklearn.tree import DecisionTreeClassifier 4 | 5 | from AlgorithmsComparator import AlgorithmsComparator 6 | from data_management.DataLoader import UsenetLoader, SEALoader, KDDCupLoader 7 | from data_management.StreamGenerator import StreamGenerator 8 | from drift_detection_methods.spc import DDM 9 | from ensemble_methods import SEA, DWM, OnlineBagging, DDD, DiversityWrapper 10 | from offline_methods import OfflineAlgorithmsWrapper 11 | from training_windows_methods import AdaptiveSVC 12 | 13 | # generate SEA concepts data 14 | sea_loader = SEALoader('data/sea.data', percentage_historical_data=0.2) 15 | list_classes = sea_loader.get_classes() 16 | sea_generator = StreamGenerator(sea_loader) 17 | 18 | # generate KDD data 19 | kdd_loader = KDDCupLoader('data/kddcup.data_10_percent', percentage_historical_data=0.2) 20 | list_classes = kdd_loader.get_classes() 21 | kdd_generator = StreamGenerator(kdd_loader) 22 | 23 | usenet_loader = UsenetLoader('data/usenet_recurrent3.3.data', percentage_historical_data=0.1) 24 | list_classes = usenet_loader.get_classes() 25 | usenet_generator = 
StreamGenerator(usenet_loader) 26 | 27 | # models 28 | SEA_decision_trees = SEA(10, list_classes=list_classes, 29 | base_estimator=OfflineAlgorithmsWrapper(DecisionTreeClassifier())) 30 | SEA_SVC = SEA(10, base_estimator=OfflineAlgorithmsWrapper(SVC())) 31 | adaptive_SVC = AdaptiveSVC(C=100, memory_limit=465) 32 | decision_tree = OfflineAlgorithmsWrapper(base_estimator=DecisionTreeClassifier()) 33 | 34 | # Online Bagging 35 | bagging_high_diversity = OnlineBagging(lambda_diversity=0.1, n_classes=list_classes, n_estimators=25) 36 | bagging_low_diversity = OnlineBagging(lambda_diversity=1, n_classes=list_classes, n_estimators=25) 37 | 38 | # DDD with Sea 39 | PARAM_LOG_REG = {'solver': 'sag', 'tol': 1e-1, 'C': 1e4} 40 | log_high_diversity = DiversityWrapper(lambda_diversity=0.1, 41 | list_classes=list_classes, 42 | base_estimator=LogisticRegression(**PARAM_LOG_REG)) 43 | log_low_diversity = DiversityWrapper(lambda_diversity=1, 44 | list_classes=list_classes, 45 | base_estimator=LogisticRegression(**PARAM_LOG_REG)) 46 | ddd_sea_log_reg = SEA 47 | p_sea_high_div = { 48 | 'base_estimator': log_high_diversity, 49 | 'n_estimators': 25, 50 | 'list_classes': list_classes 51 | } 52 | p_sea_low_div = { 53 | 'base_estimator': log_low_diversity, 54 | 'n_estimators': 25, 55 | 'list_classes': list_classes 56 | } 57 | ddd = DDD(ensemble_method=ddd_sea_log_reg, drift_detector=DDM, pl=p_sea_high_div, ph=p_sea_low_div) 58 | 59 | # DDD with online Bagging 60 | clf = OnlineBagging 61 | p_clf_high = {'lambda_diversity': 0.1, 62 | 'n_classes': list_classes, 63 | 'n_estimators': 25, 64 | 'base_estimator': SGDClassifier, 65 | 'p_estimators': {'loss': 'log'} # We cannot predict_proba with the hinge loss 66 | } 67 | p_clf_low = {'lambda_diversity': 1, 68 | 'n_classes': list_classes, 69 | 'n_estimators': 25, 70 | 'base_estimator': SGDClassifier, 71 | 'p_estimators': {'loss': 'log'} # We cannot predict_proba with the hinge loss 72 | } 73 | ddd_online_bagging = DDD(ensemble_method=clf, drift_detector=DDM, pl=p_clf_low, ph=p_clf_high) 74 | 75 | # DWM 76 | beta = 0.8 77 | theta = 0.01 78 | period = 5 79 | DWM_decision_trees = DWM(beta, theta, period, OfflineAlgorithmsWrapper(DecisionTreeClassifier())) 80 | DWM_SVC = DWM(beta, theta, period, base_estimator=OfflineAlgorithmsWrapper(SVC(probability=True))) 81 | 82 | dwm_log_high_diversity = DiversityWrapper(lambda_diversity=0.1, 83 | list_classes=list_classes, 84 | base_estimator=LogisticRegression(**PARAM_LOG_REG)) 85 | dwm_log_low_diversity = DiversityWrapper(lambda_diversity=1, 86 | list_classes=list_classes, 87 | base_estimator=LogisticRegression(**PARAM_LOG_REG)) 88 | dwm_log_reg = DWM 89 | p_dwm_high_div = { 90 | 'beta': 0.8, 91 | 'theta': 0.01, 92 | 'period': 5, 93 | 'base_estimator': dwm_log_high_diversity, 94 | 'list_classes': list_classes 95 | } 96 | p_dwm_low_div = { 97 | 'beta': 0.8, 98 | 'theta': 0.01, 99 | 'period': 5, 100 | 'base_estimator': dwm_log_low_diversity, 101 | 'list_classes': list_classes 102 | } 103 | 104 | ddd_dwm_log_reg = DDD(ensemble_method=dwm_log_reg, drift_detector=DDM, pl=p_dwm_high_div, ph=p_dwm_low_div) 105 | 106 | algorithms = [ 107 | # ("SEA (Decision Tree)", SEA_decision_trees), 108 | # ("Offline decision tree", decision_tree), 109 | # ("SEA (SVC)", SEA_SVC), 110 | # ("Adaptive SVC", adaptive_SVC), 111 | # ("Bagging low div (LogReg)", bagging_low_diversity), 112 | # ("Bagging high div (LogReg)", bagging_high_diversity), 113 | ("DDD (Online bagging)", ddd_online_bagging), 114 | ("DDD (DWM logreg)", ddd_dwm_log_reg), 115 | # ("DDD (SEA 
LogReg)", ddd) 116 | # ("DWM (Decision Tree)", DWM_decision_trees), 117 | # ("DWM (SVC)", DWM_SVC), 118 | ] 119 | 120 | #comparison of algorithms on SEA concepts 121 | # print("\nDataset: SEA concepts") 122 | # comparator = AlgorithmsComparator(algorithms, sea_generator) 123 | # comparator.plot_comparison(batch_size=3000, stream_length=48000) 124 | 125 | #comparison of algorithms on KDD dataset 126 | print("\nDataset: KDD") 127 | comparator = AlgorithmsComparator(algorithms, kdd_generator) 128 | comparator.plot_comparison(batch_size=3000, stream_length=480000) 129 | 130 | # print("\n Dataset: Usenet") 131 | # comparator = AlgorithmsComparator(algorithms, usenet_generator) 132 | # comparator.plot_comparison(batch_size=50, stream_length=6000) 133 | -------------------------------------------------------------------------------- /drift_detection_methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanRxl/concept_drift/de08fa7e72b16c0620d6a548e6cee32fe445c17d/drift_detection_methods/__init__.py -------------------------------------------------------------------------------- /drift_detection_methods/spc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | ''' 4 | Detectors based on statistical process control. 5 | SPC considers learning as a process and monitors the evolution of this process. 6 | ''' 7 | 8 | 9 | class DDM: 10 | ''' 11 | This class follows the article: 12 | Gama, J., Medas, P., Castillo, G., Rodrigues, P.: Learning with drift detection. 13 | Lecture Notes in Computer Science 3171 (2004) 14 | ''' 15 | 16 | def __init__(self, verbose=False): 17 | self.verbose = verbose 18 | self.pmin = 10e8 19 | self.smin = 10e8 20 | self.t = 0 # number of examples seen 21 | self.pi = 1 # error-rate 22 | self.si = 0 # standard deviation 23 | self.psi = 10e8 24 | self.ctr = 0 25 | 26 | def reset_after_drift(self): 27 | self.pmin = 10e8 28 | self.smin = 10e8 29 | self.t = 0 # number of examples seen 30 | self.pi = 1 # error-rate 31 | self.si = 0 # standard deviation 32 | self.psi = 10e8 33 | 34 | def __update(self, y_true, y_pred): 35 | self.t += 1 # update the number of items seen 36 | good_predictions = y_pred == y_true 37 | error_rate = 1 - good_predictions 38 | self.pi += (error_rate - self.pi) / self.t 39 | self.si = np.sqrt(self.pi * (1 - self.pi) / self.t) 40 | 41 | if self.t > 30 and self.pi + self.si <= self.psi: 42 | self.pmin = self.pi 43 | self.smin = self.si 44 | self.psi = self.si + self.pi 45 | 46 | def drift_detection(self, y_true, y_pred): 47 | self.ctr += len(y_true) 48 | for yt, yp in zip(y_true, y_pred): 49 | drift = self.__drift_detection_lonely_example(yt, yp) 50 | if drift: 51 | return True 52 | return False 53 | 54 | def __drift_detection_lonely_example(self, y_true, y_pred): 55 | self.__update(y_true, y_pred) 56 | if self.t > 30 and self.pi + self.si >= self.pmin + 3 * self.smin: 57 | if self.verbose: 58 | print('Drift detected: time_step={0}'.format(self.ctr)) 59 | self.reset_after_drift() 60 | return True 61 | elif self.pmin + 2 * self.smin <= self.pi + self.si < self.pmin + 3 * self.smin: 62 | if self.verbose: 63 | print('Warning a drift may happens: time_step={0}'.format(self.ctr)) 64 | return False 65 | else: 66 | return False 67 | 68 | 69 | # TODO implement EDDM 70 | class EDDM(DDM): 71 | ''' 72 | This class is an implementation of the following algorithm: 73 | BAENA-GARCIA, Manuel, DEL CAMPO-ÁVILA, José, FIDALGO, Raúl, et 
al. 74 | Early drift detection method 75 | In : Fourth international workshop on knowledge discovery from data streams. 2006. p. 77-86. 76 | http://www.cs.upc.edu/~abifet/EDDM.pdf 77 | ''' 78 | 79 | def __init__(self, verbose=False): 80 | DDM.__init__(self, verbose=verbose) 81 | 82 | def drift_detection(self, y_true, y_pred): 83 | pass 84 | -------------------------------------------------------------------------------- /ensemble_methods/DWM.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from sklearn.metrics.classification import accuracy_score 6 | from sklearn.tree import DecisionTreeClassifier 7 | 8 | 9 | class DWM: 10 | """ This class implements the DWM algorithm based on the article "Dynamic Weighted Majority: A New Ensemble Method for Tracking Concept Drift" by Jeremy Z. Kolter and Marcus A. Maloof """ 11 | 12 | def __init__(self, beta, theta, period, base_estimator=None, list_classes=None): 13 | # For noisy problems, a period parameter can be added 14 | """ Constructor of DWM 15 | 16 | :param base_estimator: instance of a classifier class (by default sklearn.tree.DecisionTreeClassifier()) 17 | :param beta : multiplier affecting the weight every time a classifier get the prediction wrong 18 | :param theta: threshold to remove the classifier from the list 19 | """ 20 | if base_estimator is None: 21 | self.base_estimator = DecisionTreeClassifier() 22 | else: 23 | self.base_estimator = base_estimator 24 | 25 | self.list_classifiers = [] 26 | self.new_classifier = None 27 | self.classifier_to_evaluate = None 28 | self.list_classes = list_classes 29 | self.weights = [] 30 | self.theta = theta 31 | self.beta = beta 32 | self.period = period 33 | self.step = 0 34 | 35 | def update(self, X, y): 36 | """ Update the ensemble of models 37 | 38 | :param X: new batch X 39 | :param y: array of labels 40 | """ 41 | self.step += 1 42 | 43 | # retrieve list of different classes if it is the first time we fit data 44 | if self.list_classes is None: 45 | self.list_classes = np.unique(y) 46 | 47 | # train new classifier 48 | self.new_classifier = deepcopy(self.base_estimator) 49 | self.new_classifier.update(X, y) 50 | 51 | # if there is not enough classifiers, add the new classfier in the ensemble 52 | if len(self.list_classifiers) == 0: 53 | self.list_classifiers.append(self.new_classifier) 54 | self.weights.append(1) 55 | # Otherwise, we lower the weights on the lower classifiers, multiplying them by beta 56 | # Once the weights are lowered, we remove the classifiers with weights under the threshold theta 57 | elif self.step > 0 and self.step % self.period == 0: 58 | # On each update, we'll use two empty lists to store the classifiers/weighs that pass the tests 59 | # Once the tests are ran on all classifiers/weights, that new list become the main one 60 | self.newlist_classifiers = [] 61 | self.newWeights = [] 62 | 63 | for clf, weight in zip(self.list_classifiers, self.weights): 64 | # If the prediction is untrue but the classifier still has enough weight, we'll keep him 65 | if np.sum(clf.predict(X) != y) > 250: 66 | if weight * (self.beta) > self.theta: 67 | self.newWeights.append(round(weight * (self.beta), 2)) 68 | self.newlist_classifiers.append(clf) 69 | else: 70 | self.newWeights.append(round(weight, 2)) 71 | self.newlist_classifiers.append(clf) 72 | 73 | self.weights = deepcopy(self.newWeights) 74 | 75 | self.list_classifiers = 
deepcopy(self.newlist_classifiers) 76 | # The step is finished by normalizing the weight vector 77 | norm = np.max(self.weights) 78 | self.weights = [weight / norm for weight in self.weights] 79 | # Now let's vote with the new weights 80 | # If the decision is still not correct, then we'll add a new classifier 81 | 82 | 83 | """ 84 | # make the prediction for each classifier 85 | predictions = np.array([clf.predict(X).tolist() for clf in self.list_classifiers]) 86 | 87 | # for each class, count the number of times the class is predicted 88 | nb_votes_by_class = [] 89 | for i in range(self.list_classes): 90 | nb_votes_by_class.append(0) 91 | for j in range(len(self.list_classes)): 92 | if predictions[j] == self.list_classes[i]: 93 | nb_votes_by_class[i] += self.weights[j] 94 | """ 95 | # for each example, return the class which was predicted the most 96 | # If the prediction is incorrect, then add a new classifier 97 | 98 | if np.any(self.predict(X) != y): 99 | # Train and add the new classifier 100 | self.new_classifier = deepcopy(self.base_estimator) 101 | self.new_classifier.update(X, y) 102 | self.list_classifiers.append(self.new_classifier) 103 | # Add the matching weight 104 | self.weights.append(1) 105 | """ 106 | for clf, weight in zip(self.list_classifiers, self.weights): 107 | # If the prediction is untrue but the classifier still has enough weight, we'll keep him 108 | print(self.weights, (self.beta) ** np.sum(clf.predict(X) != y), 109 | weight * ((self.beta) ** np.sum(clf.predict(X) != y))) 110 | """ 111 | 112 | """ 113 | def predict(self, X): 114 | # Make the prediction 115 | 116 | # :param X: examples to predict 117 | # :return: the prediction y_predict 118 | # make the prediction for each classifier 119 | predictions = np.array([clf.predict(X).tolist() for clf in self.list_classifiers]) 120 | 121 | # for each class, count the number of times the class is predicted 122 | nb_votes_by_class = [] 123 | for c in self.list_classes: 124 | nb_votes_by_class.append(0) 125 | for prediction, weight in zip(predictions, self.weights): 126 | print(prediction, c) 127 | if prediction == c: 128 | nb_votes_by_class[len(nb_votes_by_class)] += weight 129 | 130 | # for each example, return the class which was predicted the most 131 | return self.list_classes[np.argmax(nb_votes_by_class, axis=0)] 132 | """ 133 | 134 | def predict(self, X): 135 | """ Compute the probability of belonging to each class 136 | :param X: examples to predict 137 | :return: the probabilities, array of shape (n_examples, n_classes) 138 | """ 139 | # create empty array to retrieve 140 | array_probas = np.zeros((len(X), len(self.list_classes), len(self.list_classifiers))) 141 | 142 | # iterate over the classifiers and add the probabilities to the previous array 143 | for i, clf in enumerate(self.list_classifiers): 144 | array_probas[:, :, i] = clf.predict_proba(X) 145 | 146 | # compute and return the mean of probas computed by each classifier 147 | probs = np.average(array_probas, axis=2, weights=self.weights) 148 | return self.list_classes[np.argmax(probs, axis=1)] 149 | 150 | def predict_proba(self, X): 151 | """ Compute the probability of belonging to each class 152 | 153 | :param X: examples to predict 154 | :return: the probabilities, array of shape (n_examples, n_classes) 155 | """ 156 | # create empty array to retrieve 157 | array_probas = np.zeros((len(X), len(self.list_classes), len(self.list_classifiers))) 158 | 159 | # iterate over the classifiers and add the probabilities to the previous array 160 | for i, clf in 
enumerate(self.list_classifiers): 161 | array_probas[:, :, i] = clf.predict_proba(X) 162 | 163 | # compute and return the mean of probas computed by each classifier 164 | return np.average(array_probas, axis=2, weights=self.weights) 165 | 166 | 167 | if __name__ == "__main__": 168 | from data_management import SEALoader, StreamGenerator 169 | from sklearn.svm import SVC 170 | from offline_methods import OfflineAlgorithmsWrapper 171 | 172 | # generate data 173 | loader = SEALoader('../data/sea.data') 174 | generator = StreamGenerator(loader) 175 | 176 | # model 177 | beta = 0.5 178 | theta = 0.1 179 | period = 3 180 | clf = DWM(base_estimator=OfflineAlgorithmsWrapper(SVC(probability=True)), beta=beta, theta=theta, period=period) 181 | 182 | # record scores 183 | accuracy_results = [] 184 | 185 | for i, (X, y) in enumerate(generator.generate(batch_size=3000)): 186 | print("Batch #%d:" % i) 187 | print("update model\n") 188 | delete = i % period != 0 189 | clf.update(X, y) 190 | # predict 191 | print("predict for current X") 192 | y_predict = clf.predict(X) 193 | print("Accuracy score: %0.2f" % accuracy_score(y, y_predict)) 194 | accuracy_results.append(accuracy_score(y, y_predict)) 195 | 196 | plt.plot(accuracy_results) 197 | plt.ylabel('Accuracy Results') 198 | plt.show() 199 | -------------------------------------------------------------------------------- /ensemble_methods/SEA.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | from sklearn.metrics.classification import accuracy_score 5 | from sklearn.tree import DecisionTreeClassifier 6 | from offline_methods import OfflineAlgorithmsWrapper 7 | 8 | 9 | class SEA: 10 | """ 11 | This class implements the SEA algorithm based on the article 12 | "A Streaming Ensemble Algorithm (SEA) for Large-Scale Classification" by W Nick Street and YongSeog Kim 13 | """ 14 | 15 | def __init__(self, n_estimators, base_estimator=None, scoring_method=None, list_classes=None): 16 | """ Constructor of SEA 17 | 18 | :param n_estimators: number of estimators in the ensemble 19 | :param base_estimator: instance of a classifier class (by default sklearn.tree.DecisionTreeClassifier()) 20 | 21 | """ 22 | if base_estimator is None: 23 | self.base_estimator = OfflineAlgorithmsWrapper(DecisionTreeClassifier()) 24 | else: 25 | self.base_estimator = base_estimator 26 | 27 | if scoring_method is None: 28 | self.scoring_method = accuracy_score 29 | else: 30 | self.scoring_method = scoring_method 31 | 32 | self.n_estimators = n_estimators 33 | 34 | self.list_classifiers = [] 35 | self.new_classifier = None 36 | self.classifier_to_evaluate = None 37 | self.list_classes = list_classes 38 | 39 | def update(self, X, y): 40 | """ Update the ensemble of models 41 | 42 | :param X: new batch X 43 | :param y: array of labels 44 | """ 45 | # retrieve list of different classes if it is the first time we fit data 46 | if self.list_classes is None: 47 | self.list_classes = np.unique(y) 48 | 49 | # train new classifier 50 | self.new_classifier = deepcopy(self.base_estimator) 51 | self.new_classifier.update(X, y) 52 | 53 | # if there is not enough classifiers, add the new classfier in the ensemble 54 | if len(self.list_classifiers) < self.n_estimators: 55 | self.list_classifiers.append(self.new_classifier) 56 | # otherwise, evaluate the (n_estimators + 1) estimators and remove the worst performing one 57 | else: 58 | if self.classifier_to_evaluate is None: 59 | self.classifier_to_evaluate = 
self.new_classifier 60 | else: 61 | # evaluate (n_estimators + 1) classifiers 62 | self.list_classifiers.append(self.classifier_to_evaluate) 63 | scores = [self.scoring_method(y, clf.predict(X)) for clf in self.list_classifiers] 64 | 65 | # remove the worst performing one 66 | self.list_classifiers.pop(int(np.argmin(scores))) 67 | 68 | def predict(self, X): 69 | """ Make the prediction 70 | 71 | :param X: examples to predict 72 | :return: the prediction y_predict 73 | """ 74 | # make the prediction for each classifier 75 | predictions = np.array([clf.predict(X).tolist() for clf in self.list_classifiers]) 76 | 77 | # for each class, count the number of times the class is predicted 78 | nb_votes_by_class = [] 79 | for c in self.list_classes: 80 | nb_votes_by_class.append(np.sum(predictions == c, axis=0)) 81 | 82 | # for each example, return the class which was predicted the most 83 | return self.list_classes[np.argmax(nb_votes_by_class, axis=0)] 84 | 85 | def predict_proba(self, X): 86 | """ Compute the probability of belonging to each class 87 | 88 | :param X: examples to predict 89 | :return: the probabilities, array of shape (n_examples, n_classes) 90 | """ 91 | # create empty array to retrieve 92 | array_probas = np.zeros((len(X), len(self.list_classes), len(self.list_classifiers))) 93 | 94 | # iterate over the classifiers and add the probabilities to the previous array 95 | for i, clf in enumerate(self.list_classifiers): 96 | array_probas[:, :, i] = clf.predict_proba(X) 97 | 98 | # compute and return the mean of probas computed by each classifier 99 | return array_probas.mean(axis=2) 100 | 101 | 102 | if __name__ == "__main__": 103 | from data_management import SEALoader, StreamGenerator 104 | from sklearn.svm import SVC 105 | from offline_methods import OfflineAlgorithmsWrapper 106 | # generate data 107 | loader = SEALoader('../data/sea.data') 108 | generator = StreamGenerator(loader) 109 | 110 | # model 111 | n_estimators = 5 112 | svc = OfflineAlgorithmsWrapper(SVC(probability=True)) 113 | clf = SEA(base_estimator=svc, n_estimators=n_estimators) 114 | 115 | for i, (X, y) in enumerate(generator.generate(batch_size=2000)): 116 | print("Batch #%d:" % i) 117 | # for the first batches, only update the model 118 | if i < n_estimators: 119 | print("update model\n") 120 | clf.update(X, y) 121 | else: 122 | # predict 123 | print("predict for current X") 124 | y_predict = clf.predict(X) 125 | # probas = clf.predict_proba(X) 126 | print("Accuracy score: %0.2f" % accuracy_score(y, y_predict)) 127 | # after some time, labels are available 128 | print("update model\n") 129 | clf.update(X, y) 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /ensemble_methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .SEA import SEA 2 | from .ddd import DDD, DiversityWrapper 3 | from .online_bagging import OnlineBagging 4 | from .DWM import DWM 5 | -------------------------------------------------------------------------------- /ensemble_methods/ddd.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | from sklearn.metrics import accuracy_score 4 | from sklearn.linear_model import LogisticRegression 5 | from drift_detection_methods.spc import DDM 6 | from ensemble_methods.online_bagging import OnlineBagging 7 | 8 | PARAM_LOG_REG = {'solver': 'sag', 'tol': 1e-1, 'C': 1.e4} 9 | 10 | 11 | class 
DiversityWrapper: 12 | """ 13 | This is a wrapper for the learning algorithm used in ensemble methods by DDD. 14 | It allows to introduce high/low diversity during the training. 15 | Low diversity => lambda = 1 16 | High diversity => lambda =0.1 17 | """ 18 | def __init__(self, lambda_diversity=0.1, base_estimator=None, list_classes=None): 19 | """ 20 | 21 | :param lambda_diversity: Parameters of the Poisson distribution which introduce high/low diversity 22 | :param base_estimator: Estimators which is going to be used by this wrapper. 23 | :param list_classes: Number of classes to predict 24 | """ 25 | self.lambda_diversity = lambda_diversity 26 | if base_estimator is None: 27 | self.base_estimator = LogisticRegression(**PARAM_LOG_REG) 28 | else: 29 | self.base_estimator = base_estimator 30 | self.fitted = False # boolean which is True if base_estimator has been fit 31 | self.list_classes = list_classes 32 | 33 | def __create_diversity(self, X, y, lambda_diversity): 34 | """ 35 | :param X: 36 | :param y: 37 | :param lambda_diversity: 38 | :return: 39 | """ 40 | # Generate the number of time I want my classifier see the example 41 | X_training = None 42 | y_training = None 43 | while X_training is None and y_training is None: 44 | X_training = None 45 | y_training = None 46 | k = np.random.poisson(lambda_diversity, len(X)) 47 | while np.sum(k > 0): 48 | pos = np.where(k > 0) 49 | if X_training is None and y_training is None: 50 | X_training = X[pos] 51 | y_training = y[pos] 52 | else: 53 | X_pos = X[pos] 54 | y_pos = y[pos] 55 | if X_pos.shape[0] == 1: 56 | X_training = np.concatenate((X_training, X[pos].reshape((1, X[pos].shape[1]))), axis=0) 57 | else: 58 | X_training = np.concatenate((X_training, X[pos]), axis=0) 59 | y_training = np.vstack((y_training.reshape((-1, 1)), y_pos.reshape((-1, 1)))) 60 | # check if there is all classes pass to the fit methods 61 | k -= 1 62 | return X_training, y_training 63 | 64 | def __preprocess_X_and_y_fit(self, X, y): 65 | #TODO used only online algorithm with fit_partial method. 66 | """ 67 | Check if we have all the labels in the batch. 
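        If some class is absent from the Poisson-resampled batch, a dummy all-zero
        row is appended for each missing label so that the wrapped estimator sees
        every class during fit; otherwise X and y are returned unchanged (y is
        simply flattened to one dimension).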
68 | :param X: 69 | :param y: 70 | :return: 71 | """ 72 | y_values = np.unique(y) 73 | if len(y_values) == len(self.list_classes): 74 | return X, y.reshape((y.shape[0],)) 75 | else: 76 | for val in self.list_classes: 77 | if val not in y_values: 78 | X = np.concatenate((X, np.zeros((1, X.shape[1]))), axis=0) 79 | y = np.vstack((y.reshape((-1, 1)), val)) 80 | return X, y.reshape((y.shape[0],)) 81 | 82 | def update(self, X, y): 83 | """Fit the base_estimator, only if it has not been fitted already""" 84 | X_with_diversity, y_with_diversity = self.__create_diversity(X, y, self.lambda_diversity) 85 | X_with_diversity, y_with_diversity = self.__preprocess_X_and_y_fit(X_with_diversity, y_with_diversity) 86 | self.base_estimator.fit(X_with_diversity, y_with_diversity) 87 | 88 | def predict(self, X): 89 | return self.base_estimator.predict(X) 90 | 91 | def predict_proba(self, X): 92 | return self.base_estimator.predict_proba(X) 93 | 94 | class PrequentialMetrics: 95 | def __init__(self): 96 | self.acc = 1 97 | self.var = 0 98 | self.std = 0 99 | self.t = 0 # time_step 100 | self.t_drift = 0 # time step of the previous drift 101 | 102 | def update(self, y_pred, y_true, drift): 103 | """ 104 | Update the Prequential accuracy according to the section 5 of the DDD publication 105 | if drift 106 | acc(t) = acc_ex(t) 107 | else 108 | acc(t) = acc(t-1) + acc_ex(t)-acc(t-1)/(t -t_drift+1) 109 | :param y_pred: predicted labels 110 | :param y_true: real labels 111 | :param drift: A drift has been detected 112 | :return: 113 | """ 114 | 115 | number_of_time_steps = len(y_pred) # number of time steps in the batch 116 | self.t += number_of_time_steps # update the number of items seen 117 | good_predictions = np.sum(y_pred == y_true) 118 | batch_accuracy = good_predictions / number_of_time_steps 119 | 120 | if drift: 121 | self.acc = batch_accuracy 122 | self.var = self.acc * (1 - self.acc) / number_of_time_steps 123 | self.t_drift = self.t 124 | else: 125 | self.acc += (batch_accuracy - self.acc) / (self.t - self.t_drift + 1) 126 | self.var = self.acc * (1 - self.acc) / (self.t - self.t_drift + 1) 127 | 128 | self.std = np.sqrt(self.var) 129 | 130 | 131 | class DDD: 132 | def __init__(self, drift_detector=None, ensemble_method=None, W=0.1, pl=None, ph=None): 133 | ''' 134 | This class implements the DDD algorithms based on the article: 135 | MINKU, Leandro L. et YAO, Xin. DDD: A new ensemble approach for dealing with concept drift. IEEE transactions on 136 | knowledge and data engineering, 2012, vol. 24, no 4, p. 619-633. 
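        DDD maintains a low-diversity and a high-diversity ensemble. Before a drift
        is detected only the low-diversity ensemble makes the system's predictions;
        once the detector fires, the current ensembles are kept as "old" ensembles,
        fresh ones are created, and predictions become a weighted majority of the
        new low-diversity, old low-diversity and old high-diversity ensembles, each
        weighted by its prequential accuracy (the old low-diversity weight is
        further multiplied by W).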
137 | :param ensemble_method: online ensemble algorithm (LogisticRegression by default) 138 | :param drift_detector: drift detection method to use 139 | :param stream: data stream 140 | :param W: multiplier constant W for the weight of the old low diversity ensemble 141 | :param pl: parameters for ensemble learning with low diversity 142 | :param ph: parameters for ensemble learning with high diversity 143 | :param pd: parameters for drift detection method 144 | :return: 145 | ''' 146 | 147 | if drift_detector is None: 148 | self.drift_detector = DDM 149 | else: 150 | self.drift_detector = drift_detector 151 | if ensemble_method is None: 152 | self.ensemble_method = OnlineBagging 153 | else: 154 | self.ensemble_method = ensemble_method 155 | self.drift_detector = drift_detector() 156 | self.W = W 157 | self.pl = pl 158 | self.ph = ph 159 | 160 | # Parameters 161 | self.mode_before_drift = True # before drift 162 | self.drift = False 163 | self.low_diversity_learner, self.high_diversity_learner = self.__init_ensemble() 164 | self.old_low_diversity_learner = self.old_high_diversity_learner = None 165 | self.metric_ol, self.metric_oh, self.metric_nl, self.metric_nh = self.__init_metrics() 166 | self.woh = self.wol = self.wnl = 0 167 | self.y_pred = None 168 | 169 | def __weighted_majority(self, X, hnl, hol, hoh, wnl, wol, woh): 170 | ''' 171 | Weighted majority between all the learning algorithms. 172 | The new high diversity learning algorithm is not considered because it is likely to have low accuracy 173 | on the new concept. 174 | :param hnl: new low diversity learning algorithm 175 | :param hol: old low diversity learning algorithm 176 | :param hoh: old high diversity learning algorithm 177 | :param wnl: weights 178 | :param wol: weights 179 | :param woh: weights 180 | :return: 181 | ''' 182 | y_hnl = hnl.predict_proba(X) 183 | y_hol = hol.predict_proba(X) 184 | y_hoh = hoh.predict_proba(X) 185 | return self.__scores_to_single_label(wnl * y_hnl + wol * y_hol + woh * y_hoh) 186 | 187 | @staticmethod 188 | def __init_metrics(): 189 | metric_ol = PrequentialMetrics() 190 | metric_oh = PrequentialMetrics() 191 | metric_nl = PrequentialMetrics() 192 | metric_nh = PrequentialMetrics() 193 | return metric_ol, metric_oh, metric_nl, metric_nh 194 | 195 | def __init_ensemble(self): 196 | hnl = self.ensemble_method(**self.pl) # ensemble low diversity 197 | hnh = self.ensemble_method(**self.ph) # ensemble high diversity 198 | return hnl, hnh 199 | 200 | @staticmethod 201 | def __scores_to_single_label(scores): 202 | if len(scores.shape) == 1: 203 | return (scores > 0).astype(np.int) 204 | else: 205 | return scores.argmax(axis=1) 206 | 207 | def predict(self, X): 208 | # Before a drift is detected only the low ensemble is used for system prediction 209 | if self.mode_before_drift: 210 | y_pred = self.low_diversity_learner.predict(X) 211 | else: 212 | sum_acc = self.metric_nl.acc + self.metric_ol.acc * self.W + self.metric_oh.acc 213 | self.wnl = self.metric_nl.acc / sum_acc 214 | self.wol = self.metric_ol.acc * self.W / sum_acc 215 | self.woh = self.metric_oh.acc / sum_acc 216 | y_pred = self.__weighted_majority(X, self.low_diversity_learner, self.old_low_diversity_learner, 217 | self.old_high_diversity_learner, self.wnl, self.wol, self.woh) 218 | self.y_pred = y_pred 219 | return y_pred 220 | 221 | def __drift_detection(self, X, y_true): 222 | # Not done in the paper but seems to be the proper position for the update 223 | self.metric_nl.update(self.y_pred, y_true, self.drift) 224 | 
self.metric_nh.update(self.high_diversity_learner.predict(X), y_true, self.drift) 225 | if not self.mode_before_drift: 226 | self.metric_oh.update(self.old_high_diversity_learner.predict(X), y_true, self.drift) 227 | self.metric_ol.update(self.old_low_diversity_learner.predict(X), y_true, self.drift) 228 | 229 | # Boolean == True if drift detect 230 | self.drift = self.drift_detector.drift_detection(y_true, self.y_pred) 231 | 232 | if self.drift: 233 | # The old low diversity ensemble after the second drift detection can be either 234 | # the same as the old high diversity learning with low diversity 235 | # after the first detection or the ensemble corresponding 236 | # to the new low diversity after the first drift detection depending 237 | # on which of them is the most accurate. 238 | if self.mode_before_drift or (not self.mode_before_drift and self.metric_nl.acc > self.metric_oh.acc): 239 | self.old_low_diversity_learner = self.low_diversity_learner 240 | self.metric_ol = self.metric_nl # Not said in the paper but make sense. 241 | else: 242 | self.old_low_diversity_learner = self.old_high_diversity_learner 243 | self.metric_ol = self.metric_oh # Not said in the paper but make sense. 244 | 245 | # The ensemble corresponding to the high diversity is registered as old 246 | self.old_high_diversity_learner = self.high_diversity_learner 247 | self.metric_oh = self.metric_nh # Not said in the paper but make sense. 248 | 249 | # After a drift is detected new low and high diversity ensemble are created 250 | self.low_diversity_learner, self.high_diversity_learner = self.__init_ensemble() 251 | # In the paper all the metrics are set to zero. Which is impossible in the predict method we divide 252 | # by 0. 253 | _, _, self.metric_nl, self.metric_nh = self.__init_metrics() 254 | self.mode_before_drift = False # After drift 255 | # if after drift 256 | if not self.mode_before_drift: 257 | if self.metric_nl.acc > self.metric_oh.acc and self.metric_nl.acc > self.metric_ol.acc: 258 | self.mode_before_drift = True 259 | elif self.metric_oh.acc - self.metric_oh.std > self.metric_nl.acc + self.metric_nl.std \ 260 | and self.metric_oh.acc - self.metric_oh.std > self.metric_ol.acc + self.metric_ol.std: 261 | self.low_diversity_learner = deepcopy(self.old_high_diversity_learner) 262 | self.metric_nl = deepcopy(self.metric_oh) 263 | self.mode_before_drift = True 264 | 265 | def update(self, X, y_true): 266 | # If we have never done predictions we cannot detect if there was a drift. 
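        # The detector compares the labels of this batch with the predictions made
        # for it in predict(), and the prequential metrics of every active ensemble
        # are refreshed at the same time. Whatever the outcome, both current
        # ensembles are then trained on the batch; the old ensembles keep learning
        # as well while the system is in "after drift" mode.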
267 | if self.y_pred is not None: 268 | self.__drift_detection(X, y_true) 269 | self.low_diversity_learner.update(X, y_true) 270 | self.high_diversity_learner.update(X, y_true) 271 | if not self.mode_before_drift: 272 | self.old_low_diversity_learner.update(X, y_true) 273 | self.old_high_diversity_learner.update(X, y_true) 274 | 275 | if __name__ == "__main__": 276 | from data_management.StreamGenerator import StreamGenerator 277 | from data_management.DataLoader import KDDCupLoader, SEALoader 278 | from sklearn.linear_model import SGDClassifier 279 | 280 | # generate data 281 | loader = SEALoader('../data/sea.data', percentage_historical_data=0.1) 282 | generator = StreamGenerator(loader) 283 | # kdd_data_loader = KDDCupLoader('../data/kddcup.data_10_percent') 284 | # generator = StreamGenerator(kdd_data_loader) 285 | 286 | # model 287 | clf = OnlineBagging 288 | p_estimators = None 289 | n_classes = np.array(range(0, 2)) 290 | p_clf_high = {'lambda_diversity': 0.1, 291 | 'n_classes': n_classes, 292 | 'n_estimators': 25, 293 | 'base_estimator': SGDClassifier, 294 | } 295 | p_clf_low = {'lambda_diversity': 1, 296 | 'n_classes': n_classes, 297 | 'n_estimators': 25, 298 | 'base_estimator': SGDClassifier, 299 | } 300 | ddd = DDD(ensemble_method=clf, drift_detector=DDM, pl=p_clf_low, ph=p_clf_high) 301 | batch = 3000 302 | X_historical, y_historical = generator.get_historical_data() 303 | ddd.update(X_historical, y_historical) 304 | for i, (X, y_true) in enumerate(generator.generate(batch_size=batch)): 305 | y_pred = ddd.predict(X) 306 | print("Accuracy score: %0.2f" % accuracy_score(y_true, y_pred)) 307 | # after some time, labels are available 308 | print("update model\n") 309 | ddd.update(X, y_true) 310 | 311 | -------------------------------------------------------------------------------- /ensemble_methods/online_bagging.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import SGDClassifier 3 | 4 | PARAM_LOG_REG = {'solver': 'sag', 'tol': 1e-1, 'C': 1.e4} 5 | 6 | class OnlineBagging: 7 | def __init__(self, lambda_diversity=0.1, n_estimators=25, base_estimator=None, p_estimators=None, 8 | n_classes=None): 9 | ''' 10 | Online Bagging similar to offline Bagging but introduce the low and high diversity during the bagging 11 | :param lambda_diversity: low lambda diversity allows high diversity ensemble whereas high lambda_diversity 12 | induces low diversity. 13 | :param n_estimators: number of estimators for the bagging 14 | :param base_estimator: Online learning algorithm it should implements partial fit. 15 | The default value is SGDClassifer. 
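        Each of the n_estimators copies is trained with partial_fit on a
        Poisson(lambda_diversity)-resampled view of the incoming batch, which is
        what makes the ensemble more (small lambda) or less (large lambda) diverse.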
16 | :param p_estimators: Parameters of the online learning algorithms 17 | :param n_classes: number of classes you need to pass a list of classes 18 | ''' 19 | if base_estimator is None: 20 | self.base_estimator = SGDClassifier 21 | else: 22 | self.base_estimator = base_estimator 23 | 24 | self.lambda_diversity = lambda_diversity 25 | if p_estimators is not None: 26 | self.list_classifiers = [self.base_estimator(**p_estimators) for _ in range(n_estimators)] 27 | else: 28 | self.list_classifiers = [self.base_estimator() for _ in range(n_estimators)] 29 | 30 | self.list_classes = n_classes 31 | 32 | def update(self, X, y): 33 | """ Update the ensemble of models 34 | 35 | :param X: new single X 36 | :param y: new single y 37 | """ 38 | # retrieve list of different classes if it is the first time we fit data 39 | if self.list_classes is None: 40 | self.list_classes = np.unique(y) 41 | for classifier in self.list_classifiers: 42 | # Generate the number of time I want my classifier see the example 43 | k = np.random.poisson(self.lambda_diversity, len(X)) 44 | X_training = None 45 | y_training = None 46 | while np.sum(k > 0): 47 | pos = np.where(k > 0) 48 | if X_training is None and y_training is None: 49 | X_training = X[pos] 50 | y_training = y[pos] 51 | else: 52 | X_pos = X[pos] 53 | y_pos = y[pos] 54 | if X_pos.shape[0] == 1: 55 | X_training = np.concatenate((X_training, X[pos].reshape((1, X[pos].shape[1]))), axis=0) 56 | else: 57 | X_training = np.concatenate((X_training, X[pos]), axis=0) 58 | y_training = np.vstack((y_training.reshape((-1, 1)), y_pos.reshape((-1, 1)))) 59 | 60 | # check if there is all classes pass to the fit methods 61 | k -= 1 62 | if X_training is not None and y_training is not None: 63 | y_training = y_training.reshape((y_training.shape[0],)) 64 | classifier.partial_fit(X_training, y_training, self.list_classes) 65 | 66 | def predict(self, X): 67 | """ Make the prediction 68 | 69 | :param X: examples to predict 70 | :return: the prediction y_predict 71 | """ 72 | # make the prediction for each classifier 73 | predictions = np.array([clf.predict(X).tolist() for clf in self.list_classifiers]) 74 | 75 | # for each class, count the number of times the class is predicted 76 | nb_votes_by_class = [] 77 | for c in self.list_classes: 78 | nb_votes_by_class.append(np.sum(predictions == c, axis=0)) 79 | 80 | # for each example, return the class which was predicted the most 81 | return self.list_classes[np.argmax(nb_votes_by_class, axis=0)] 82 | 83 | def predict_proba(self, X): 84 | """ Compute the probability of belonging to each class 85 | 86 | :param X: examples to predict 87 | :return: the probabilities, array of shape (n_examples, n_classes) 88 | """ 89 | # create empty array to retrieve 90 | array_probas = np.zeros((len(X), len(self.list_classes), len(self.list_classifiers))) 91 | 92 | # iterate over the classifiers and add the probabilities to the previous array 93 | for i, clf in enumerate(self.list_classifiers): 94 | array_probas[:, :, i] = clf.predict_proba(X) 95 | 96 | # compute and return the mean of probas computed by each classifier 97 | return array_probas.mean(axis=2) 98 | 99 | 100 | if __name__ == "__main__": 101 | from data_management.StreamGenerator import StreamGenerator 102 | from data_management.DataLoader import SEALoader 103 | from sklearn.metrics import accuracy_score 104 | np.random.seed(3) 105 | # generate data 106 | loader = SEALoader('../data/sea.data') 107 | generator = StreamGenerator(loader) 108 | 109 | # model 110 | n_classes = np.array(range(0, 
2)) 111 | clf = OnlineBagging(base_estimator=SGDClassifier, lambda_diversity=0.1, n_classes=n_classes, n_estimators=25, 112 | p_estimators=None) 113 | X_histo, y_histo = generator.get_historical_data() 114 | clf.update(X_histo, y_histo) 115 | for i, (X, y) in enumerate(generator.generate(batch_size=50)): 116 | print("Batch #%d:" % i) 117 | # predict 118 | print("predict for current X") 119 | y_predict = clf.predict(X) 120 | # probas = clf.predict_proba(X) 121 | print("Accuracy score: %0.2f" % accuracy_score(y, y_predict)) 122 | # after some time, labels are available 123 | print("update model\n") 124 | clf.update(X, y) 125 | -------------------------------------------------------------------------------- /offline_methods/OfflineAlgorithmsWrapper.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import ClassifierMixin 2 | 3 | 4 | class OfflineAlgorithmsWrapper: 5 | """ Wrapper on Scikit-learn classifiers, to use offline algorithms inside the project """ 6 | 7 | def __init__(self, base_estimator): 8 | """ Constructor of OfflineAlgorithmsWrapper 9 | 10 | :param base_estimator: instance of a classifier of scikit-learn (must be an instance of a subclass of sklearn.base.ClassifierMixin) 11 | """ 12 | self.base_estimator = base_estimator 13 | self._check_base_estimator() 14 | 15 | self.fitted = False # boolean which is True if base_estimator has been fit 16 | 17 | def _check_base_estimator(self): 18 | """ Raise a ValueError if base_estimator is not suitable for the project """ 19 | if not isinstance(self.base_estimator, ClassifierMixin): 20 | raise ValueError( 21 | "In constructor of OfflineAlgorithmsWrapper, base_estimator should be an instance of a subclass of sklearn.base.ClassifierMixin") 22 | 23 | def update(self, X, y): 24 | """ Fit the base_estimator, only if it has not been fitted already""" 25 | if not self.fitted: 26 | self.base_estimator.fit(X, y) 27 | self.fitted = True 28 | 29 | def predict(self, X): 30 | return self.base_estimator.predict(X) 31 | 32 | def predict_proba(self, X): 33 | return self.base_estimator.predict_proba(X) 34 | 35 | 36 | # Code example to test OfflineAlgorithmsWrapper 37 | if __name__ == "__main__": 38 | import numpy as np 39 | from data_management import SEALoader, StreamGenerator 40 | from sklearn.ensemble import RandomForestClassifier 41 | 42 | # generate data 43 | print("Get data...") 44 | loader = SEALoader('../data/sea.data') 45 | generator = StreamGenerator(loader) 46 | X_train, y_train = generator.get_historical_data() 47 | X_test, y_test = generator.generate(batch_size=1000).__next__() 48 | 49 | # create some models 50 | clf = OfflineAlgorithmsWrapper(RandomForestClassifier(n_estimators=100)) 51 | 52 | # fit models 53 | print("\nValue of self.fitted: %d" % clf.fitted) 54 | print("First call of update()...") 55 | clf.update(X_train, y_train) 56 | print("Value of self.fitted: %d" % clf.fitted) 57 | 58 | # predict 59 | print("\nPrediction of classes...") 60 | y_predict1 = clf.predict(X_test) 61 | 62 | # predict_proba 63 | print("\nPrediction of probabilities...") 64 | y_probas1 = clf.predict_proba(X_test) 65 | 66 | # try to update on X_test, y_test 67 | print("\nSecond call of update()...") 68 | clf.update(X_test, y_test) 69 | 70 | # second prediction 71 | print("\nSecond prediction of probabilities...") 72 | y_probas2 = clf.predict_proba(X_test) 73 | 74 | # comparison of probabilities computed after first and second call to update 75 | probs_are_equal = np.all(y_probas1 == y_probas2) 76 | if 
probs_are_equal: 77 | print("\nProbabilities are equal after first and second call to update") 78 | print(" -> OfflineAlgorithmsWrapper works as intended") 79 | else: 80 | print("\nProbabilities are different after first and second call to update") 81 | print(" -> OfflineAlgorithmsWrapper doesn't work as intended") 82 | -------------------------------------------------------------------------------- /offline_methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .OfflineAlgorithmsWrapper import OfflineAlgorithmsWrapper -------------------------------------------------------------------------------- /training_windows_methods/AdaptiveSVC.py: -------------------------------------------------------------------------------- 1 | """ Adaptive SVC algorithm """ 2 | 3 | import numpy as np 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.svm import SVC 6 | 7 | 8 | class AdaptiveSVC: 9 | """ 10 | This class implements the adaptive SVM algorithm based on the article 11 | "Detecting Concept Drift with Support Vector Machines" by Ralf Klinkenberg and Thorsten Joachims 12 | """ 13 | def __init__(self, memory_limit=10000, **svc_kwargs): 14 | """ 15 | Adaptive SVC constructor 16 | :parameter memory_limit: maximum number of rows of data kept in memory, integer 17 | :parameter svc_kwargs: kwargs to give to the SVC classifiers, kwargs 18 | /!\ Only the linear kernel is currently supported, so kernel-related kwargs will not be handled as expected 19 | """ 20 | self.memory = {'X': list(), 'y': list()} 21 | self.memory_limit = memory_limit 22 | self.memory_current_size = 0 23 | self.windows_in_memory = 0 24 | self.previous_best_window = 0 25 | self.svc_kwargs = svc_kwargs 26 | self.classifiers = list() 27 | self.training_set_sizes = list() 28 | self.xi_alpha_estimators = list() 29 | self.predicting_classifier = None 30 | 31 | def _add_new_batch_to_memory(self, X, y, batch_size): 32 | """ 33 | Add a new batch of data to the memory 34 | :parameter X: batch of unlabelled data, numpy.array, shape (n,m) 35 | :parameter y: labels of the batch of data, numpy.array, shape (n,1) 36 | :parameter batch_size: size of the batch, or len(y), integer == n 37 | """ 38 | 39 | if self.memory_current_size + batch_size > self.memory_limit: 40 | # It is not possible to store the last batch without exceeding the memory limit 41 | # So let us forget some of the oldest data 42 | number_of_data_to_forget = self.memory_current_size + batch_size - self.memory_limit 43 | oldest_batch_size = len(self.memory['X'][0]) 44 | if number_of_data_to_forget > oldest_batch_size: 45 | # More than the oldest batch has to be forgotten 46 | # Let us pop this oldest batch and remove the necessary amount of data from the second oldest batch 47 | self.memory['X'].pop(0) 48 | self.memory['y'].pop(0) 49 | self.memory_current_size -= oldest_batch_size 50 | number_of_data_to_forget -= oldest_batch_size 51 | self.windows_in_memory -= 1 52 | self.memory['X'][0] = self.memory['X'][0][number_of_data_to_forget:] 53 | self.memory['y'][0] = self.memory['y'][0][number_of_data_to_forget:] 54 | self.memory_current_size -= number_of_data_to_forget 55 | 56 | if self.memory['X'][0].size == 0: 57 | # The entire oldest batch has been forgotten and is now empty 58 | # Let us remove it from the memory 59 | self.memory['X'].pop(0) 60 | self.memory['y'].pop(0) 61 | self.windows_in_memory -= 1 62 | 63 | assert(self.memory_current_size + batch_size <= self.memory_limit) 64 | 65 | # Add the entire new batch of data 
to memory 66 | self.memory['X'].append(X) 67 | self.memory['y'].append(y) 68 | self.memory_current_size += batch_size 69 | self.windows_in_memory += 1 70 | 71 | def _svc_fit_on_window(self, window): 72 | """ 73 | Return an SVC classifier fitted on the given window, together with the size of its training set 74 | :parameter window: number of batches to use from memory for learning, 0 < integer <= self.windows_in_memory 75 | """ 76 | if window is not None: 77 | X_train = np.concatenate(self.memory['X'][-window:], axis=0) 78 | y_train = np.concatenate(self.memory['y'][-window:], axis=0) 79 | return SVC(**self.svc_kwargs, kernel='linear').fit(X_train, y_train), len(y_train) 80 | else: 81 | return None 82 | 83 | def _compute_xi_alpha_estimators(self, X, y, batch_size): 84 | xi_alpha_estimators = list() 85 | # Compute R 86 | gram_X = X.dot(X.T) 87 | diag_gram_X = np.diag(gram_X).reshape(len(gram_X), 1) 88 | R = np.max(diag_gram_X - gram_X) 89 | for classifier, training_set_size in zip(self.classifiers, self.training_set_sizes): 90 | # Compute Xi 91 | w_opt = classifier.coef_ 92 | b_opt = classifier.intercept_ 93 | xi = np.zeros(shape=(batch_size, 1)) 94 | for data_index, (X_data_index, y_data_index) in enumerate(zip(X, y)): 95 | xi[data_index] = max(1 - float(y_data_index * (w_opt.dot(X_data_index) + b_opt)), 0) 96 | # Compute alpha (the current batch occupies the last batch_size rows of the training set) 97 | alpha = np.zeros(shape=(batch_size, 1)) 98 | for support_vector_idx, alpha_coef in zip(classifier.support_, np.abs(classifier.dual_coef_[0])): 99 | alpha_idx = support_vector_idx - (training_set_size - batch_size) 100 | if alpha_idx >= 0: 101 | alpha[alpha_idx] = alpha_coef 102 | # Compute xi-alpha estimator 103 | xi_alpha_estimators.append(np.sum((2 * alpha * R + xi) >= 1).astype(int) / batch_size) 104 | return xi_alpha_estimators 105 | 106 | def _update_memory_according_to_best_window(self, window, batch_size): 107 | """ 108 | Remove unnecessary data from memory according to the chosen best window for learning 109 | :parameter window: chosen best window for learning, 0 < integer <= self.windows_in_memory 110 | :parameter batch_size: size of the current batch, integer 111 | """ 112 | if window < self.windows_in_memory: 113 | oldest_batch_size = len(self.memory['X'][0]) 114 | self.memory['X'] = self.memory['X'][-window:] 115 | self.memory['y'] = self.memory['y'][-window:] 116 | self.memory_current_size -= oldest_batch_size 117 | if window < self.windows_in_memory - 1: 118 | self.memory_current_size -= (self.windows_in_memory - 1 - window) * batch_size 119 | self.windows_in_memory = window 120 | 121 | def update(self, X, y): 122 | """ 123 | Update the model with the batch given as argument 124 | :parameter X: batch of unlabelled data, numpy.array, shape (n,m) 125 | :parameter y: labels of the batch of data, numpy.array, shape (n,1) 126 | """ 127 | # Add this new batch to the memory 128 | batch_size = len(y) 129 | self._add_new_batch_to_memory(X, y, batch_size) 130 | 131 | # Learn on the different windows 132 | self.classifiers = list() 133 | self.training_set_sizes = list() 134 | training_windows = range(1, (self.previous_best_window + 1) + 1) 135 | 136 | for training_window in training_windows: 137 | window_is_admissible = 0 < training_window <= self.windows_in_memory 138 | if window_is_admissible: 139 | clf, training_set_size = self._svc_fit_on_window(training_window) 140 | self.classifiers.append(clf) 141 | self.training_set_sizes.append(training_set_size) 142 | 143 | # Compute the xi alpha estimators 144 | self.xi_alpha_estimators = self._compute_xi_alpha_estimators(X, y, batch_size) 145 | 146 | # Keep 
the classifier which has the best xi-alpha estimator 147 | best_classifier_index = np.argmin(self.xi_alpha_estimators) 148 | 149 | # Update self.previous_best_window 150 | assert(0 < training_windows[best_classifier_index] <= self.windows_in_memory) 151 | best_window = training_windows[best_classifier_index] 152 | self.previous_best_window = best_window 153 | 154 | # Update memory 155 | self._update_memory_according_to_best_window(best_window, batch_size) 156 | 157 | # Update predicting classifier 158 | self.predicting_classifier = self.classifiers[best_classifier_index] 159 | 160 | def predict(self, X): 161 | """ 162 | Predict the labels associated with the data given as argument 163 | :parameter X: batch of unlabelled data, numpy.array, shape (n,m) 164 | """ 165 | # Predict with the predicting classifier 166 | if self.predicting_classifier is not None: 167 | return self.predicting_classifier.predict(X) 168 | else: 169 | return np.zeros(shape=(len(X), 1)) 170 | 171 | 172 | if __name__ == "__main__": 173 | from data_management import SEALoader, StreamGenerator 174 | 175 | # generate data 176 | sea_loader = SEALoader('data/sea.data') 177 | sea_generator = StreamGenerator(sea_loader) 178 | 179 | # model 180 | clf = AdaptiveSVC(memory_limit=5000, C=100) 181 | 182 | for i, (X, y) in enumerate(sea_generator.generate(batch_size=2000)): 183 | print("\nBatch #%d:" % i) 184 | print("Update model") 185 | clf.update(X, y) 186 | print("clf.previous_best_window:", clf.previous_best_window) 187 | print("clf.training_set_sizes:", clf.training_set_sizes) 188 | print("clf.xi_alpha_estimators:", clf.xi_alpha_estimators) 189 | print("clf.windows_in_memory:", clf.windows_in_memory) 190 | print("clf.memory_current_size:", clf.memory_current_size) 191 | print("clf.memory_limit:", clf.memory_limit) 192 | # predict 193 | print("Predict for current X") 194 | y_predict = clf.predict(X) 195 | print("Accuracy score: %0.2f" % accuracy_score(y, y_predict)) 196 | if i > 9: 197 | break 198 | -------------------------------------------------------------------------------- /training_windows_methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .AdaptiveSVC import AdaptiveSVC -------------------------------------------------------------------------------- /training_windows_methods/test_AdaptiveSVC.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_management.DataLoader import SEALoader 4 | from data_management.StreamGenerator import StreamGenerator 5 | from training_windows_methods.AdaptiveSVC import AdaptiveSVC 6 | 7 | 8 | class TestAdaptiveSVC(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.sea_loader = SEALoader('data/sea.data') 12 | self.sea_generator = StreamGenerator(self.sea_loader) 13 | 14 | def test_memory_manager(self): 15 | # model 16 | clf = AdaptiveSVC(memory_limit=500, C=100) 17 | 18 | for i, (X, y) in enumerate(self.sea_generator.generate(batch_size=200)): 19 | if i < 5: 20 | clf.update(X, y) 21 | if i == 0: 22 | self.assertEqual(clf.previous_best_window, 1) 23 | self.assertEqual(clf.memory_current_size, 200) 24 | self.assertEqual(len(clf.memory['y']), 1) 25 | if i == 1: 26 | self.assertEqual(clf.previous_best_window, 2) 27 | self.assertEqual(clf.memory_current_size, 400) 28 | self.assertEqual(len(clf.memory['y']), 2) 29 | if i == 2: 30 | self.assertEqual(clf.previous_best_window, 3) 31 | self.assertEqual(clf.memory_current_size, 500) 32 | self.assertEqual(len(clf.memory['y']), 3) 33 | if 
i == 3: 34 | self.assertEqual(clf.previous_best_window, 3) 35 | self.assertEqual(clf.memory_current_size, 500) 36 | self.assertEqual(len(clf.memory['y']), 3) 37 | if i == 4: 38 | self.assertEqual(clf.previous_best_window, 3) 39 | self.assertEqual(clf.memory_current_size, 500) 40 | self.assertEqual(len(clf.memory['y']), 3) 41 | else: 42 | break 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | --------------------------------------------------------------------------------
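A minimal sketch of the Poisson resampling idea behind OnlineBagging's lambda_diversity parameter: every base learner sees each incoming example k times, with k drawn from Poisson(lambda_diversity), so a small lambda_diversity gives each learner a sparse, dissimilar view of the stream (high ensemble diversity) while a large lambda_diversity makes the learners' training sets more alike (low diversity). The helper name and the toy data below are illustrative only, not part of the repository.

import numpy as np
from sklearn.linear_model import SGDClassifier

def poisson_resample(X, y, lam, rng):
    """Repeat each example k times, with k drawn independently from Poisson(lam)."""
    k = rng.poisson(lam, size=len(X))      # one count per example
    idx = np.repeat(np.arange(len(X)), k)  # duplicate index i exactly k[i] times
    return X[idx], y[idx]

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

clf = SGDClassifier()
X_rs, y_rs = poisson_resample(X, y, lam=0.1, rng=rng)
if len(y_rs) > 0:  # with lam=0.1 the resampled batch can be empty
    clf.partial_fit(X_rs, y_rs, classes=np.array([0, 1]))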
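AdaptiveSVC selects its training window with Joachims' xi-alpha estimator of the leave-one-out error: a training example is counted as a potential leave-one-out error when 2*alpha_i*R^2 + xi_i >= 1, where alpha_i is its dual coefficient, xi_i its slack, and R^2 bounds the differences of kernel values. The sketch below computes that quantity for a fitted linear SVC under the assumption that the labels are in {-1, +1}; it is an illustration of the estimator, not the project's implementation.

import numpy as np
from sklearn.svm import SVC

def xi_alpha_error_estimate(clf, X, y):
    """Joachims' xi-alpha bound on the leave-one-out error of a fitted linear SVC (y in {-1, +1})."""
    gram = X.dot(X.T)
    R2 = np.max(np.diag(gram)[:, None] - gram)                 # bound on K(x, x) - K(x, x')
    xi = np.maximum(0.0, 1.0 - y * clf.decision_function(X))   # slack variables
    alpha = np.zeros(len(X))
    alpha[clf.support_] = np.abs(clf.dual_coef_[0])            # dual variables of the support vectors
    return np.mean(2.0 * alpha * R2 + xi >= 1.0)

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 2))
y = np.where(X[:, 0] + 0.3 * rng.normal(size=100) > 0, 1, -1)
clf = SVC(kernel='linear', C=100).fit(X, y)
print("xi-alpha error estimate: %.3f" % xi_alpha_error_estimate(clf, X, y))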
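All of the methods above expose the same two-method interface, update(X, y) and predict(X), which is what lets them be compared on a common stream. A minimal test-then-train loop over that interface might look like the sketch below; synthetic_batches is a hypothetical stand-in for the project's SEALoader/StreamGenerator pair, and the SEA-like labelling rule is only there to make the example self-contained.

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from ensemble_methods.online_bagging import OnlineBagging

rng = np.random.default_rng(2)

def synthetic_batches(n_batches=10, batch_size=500):
    """Hypothetical stand-in for StreamGenerator.generate(): yields (X, y) batches."""
    for _ in range(n_batches):
        X = rng.uniform(0, 10, size=(batch_size, 3))
        y = (X[:, 0] + X[:, 1] > 8).astype(int)  # SEA-like concept
        yield X, y

model = OnlineBagging(lambda_diversity=1.0, n_estimators=10,
                      base_estimator=SGDClassifier, n_classes=np.array([0, 1]))

for i, (X, y) in enumerate(synthetic_batches()):
    if i > 0:  # test-then-train: evaluate on the new batch before updating on it
        print("Batch #%d accuracy: %.2f" % (i, accuracy_score(y, model.predict(X))))
    model.update(X, y)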
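The memory bookkeeping in AdaptiveSVC keeps only the most recent batches so that no more than memory_limit rows are ever stored, dropping the oldest rows first. A simplified sketch of that forgetting policy on plain Python lists of arrays is shown below; trim_to_limit is an illustrative helper, not a method of the class.

import numpy as np

def trim_to_limit(batches_X, batches_y, memory_limit):
    """Drop the oldest rows until the total number of stored rows fits within memory_limit."""
    total = sum(len(y) for y in batches_y)
    while total > memory_limit and batches_y:
        overflow = total - memory_limit
        oldest = len(batches_y[0])
        if oldest <= overflow:
            batches_X.pop(0)                        # forget the whole oldest batch
            batches_y.pop(0)
            total -= oldest
        else:
            batches_X[0] = batches_X[0][overflow:]  # forget only the front of the oldest batch
            batches_y[0] = batches_y[0][overflow:]
            total -= overflow
    return batches_X, batches_y

# e.g. three batches of 200 rows with a limit of 500 keeps only the most recent 500 rows
batches_X = [np.zeros((200, 3)) for _ in range(3)]
batches_y = [np.zeros(200) for _ in range(3)]
trim_to_limit(batches_X, batches_y, memory_limit=500)
print(sum(len(y) for y in batches_y))  # 500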